def __init__(self, user_list, cookie_list=None):
    """Build the fetchers and initialise the crawl state.

    When ``cookie_list`` is supplied it is stored on ``self.cookies`` and
    ``self.get_fetchers_by_cookie()`` is called; ``user_list`` is not used in
    that case. Otherwise ``user_list`` is stored on ``self.users`` and
    ``self.get_fetchers_by_user()`` is called (login by users).

    :param user_list: a list of users
    :param cookie_list: a list of cookies, ``None`` by default
    """
    if cookie_list is None:
        # No cookies available: need login by users.
        self.users = user_list
        build_fetchers = self.get_fetchers_by_user
    else:
        self.cookies = cookie_list
        build_fetchers = self.get_fetchers_by_cookie

    # Steps shared by both modes, kept in the original order: empty fetcher
    # list first, then the mode-specific builder, then the parser.
    self.fetchers = []
    build_fetchers()
    self.parser = HtmlParser()

    self.database = Database()
    self.main_fetcher = 0  # current fetcher index

    # Result stores.
    self.follower_list = []  # followers
    self.followee_list = []  # followees
    self.timeline_list = []  # timelines
    self.profile_list = []  # profiles

    self.start_time = datetime.now()
    self.end_time = None
def run(key):
    """Download the page for *key*, parse its "fund" data and print it.

    The URL is built with ``set_url(host, key)`` and fetched through
    ``Spider`` using the shared ``BASEHEADERS`` mapping. If ``verify``
    rejects the response, the string returned by
    ``Cookies.cookie_str(["acw_tc", "PHPSESSID"])`` is appended to
    ``BASEHEADERS["Cookie"]`` and ``proxieser.proxies()`` is called.

    :param key: value handed to ``set_url`` to build the target URL
    :return: None; the parsed data is printed
    """
    target = set_url(host, key)
    Cookies()  # return value is discarded
    page = Spider(target).spider(BASEHEADERS)

    # NOTE(review): this assumes the conditional covers only the cookie update
    # and the proxies() call; the page is not fetched again afterwards, so the
    # response that failed verification is what gets parsed - confirm.
    if not verify(page):
        current_cookie = BASEHEADERS["Cookie"]
        BASEHEADERS["Cookie"] = current_cookie + Cookies.cookie_str(
            ["acw_tc", "PHPSESSID"])
        proxieser.proxies()

    print(HtmlParser(page).parser("fund"))
def __init__(self):
    """Set up the connection of a distributed-process worker node.

    Registers the queue getter names on ``BaseManager``, connects to the
    manager at 127.0.0.1:8001, fetches the task, result, page and data
    queues, and creates the downloader and parser used by this node.
    """
    # Step one: register on BaseManager the method names used to obtain the
    # queues.
    for getter_name in ('get_task_queue', 'get_result_queue',
                        'get_page_queue', 'get_data_queue'):
        BaseManager.register(getter_name)

    host = '127.0.0.1'
    print('Connect to sever %s...' % host)
    manager = BaseManager(address=(host, 8001),
                          authkey='yuan'.encode('utf-8'))
    self.m = manager
    manager.connect()

    # Queue proxies exposed by the manager, fetched in registration order.
    self.task = manager.get_task_queue()
    self.result = manager.get_result_queue()
    self.page = manager.get_page_queue()
    self.data = manager.get_data_queue()

    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()

    banner = '*--------------------------------------------*'
    print(banner)
    print('初始化完成')
    print(banner)
def get_title(response):
    """Return the result of parsing "title" out of *response*.

    :param response: object handed to the ``HtmlParser`` constructor
    :return: whatever ``HtmlParser.parser("title")`` yields for it
    """
    html_parser = HtmlParser(response)
    return html_parser.parser("title")
def __init__(self):
    """Create the collaborators this object works with.

    Instantiates, in this order, an ``HtmlDownloader``, an ``HtmlParser``
    and a ``DataOutput``, each with no arguments, and stores them on the
    instance.
    """
    self.downloader = HtmlDownloader()
    self.parser = HtmlParser()
    self.output = DataOutput()