'''
@summary: common helper methods for database operations
---------
@author: Boris
'''
import sys
sys.path.append('..')

import init
import base.constance as Constance
from base.url_manager import UrlManager
import utils.tools as tools
# from db.mongodb import MongoDB
import random

mongodb = None  # MongoDB()
url_manager = UrlManager('news:news_urls')  # the url table name must be passed in here
url_manager.start()


def get_contained_key(title, content, key1, key2, key3):
    text = title + content
    # filter out texts that match key3
    if tools.get_info(text, key3):
        return '', 0

    # collect the keywords contained in the text
    contained_key = []
    contained_key_count = 0


def get_weigth(text, keys, key_weigth):
    weigth = 0
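# The two helpers above are truncated in this snippet. Purely as an illustrative
# sketch of the weighting idea (not the project's actual implementation), the
# hypothetical function below assumes `keys` is a list of keyword strings and
# `key_weigth` is the score added for each occurrence of a keyword in the text.
import re


def get_weigth_sketch(text, keys, key_weigth):
    weigth = 0
    for key in keys:
        # count every occurrence of the keyword and weight it
        weigth += len(re.findall(re.escape(key), text)) * key_weigth
    return weigth

# example: get_weigth_sketch('economy news about the economy', ['economy'], 3) -> 6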
class Spider(threading.Thread):
    def __init__(self, tab_urls, tab_site='', tab_content='', parser_count=None, depth=None,
                 parser_params={}, begin_callback=None, end_callback=None,
                 content_unique_key='url', delete_tab_urls=False, process_num=None):
        '''
        @summary:
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params: parameters passed to the parsers
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()

        self._tab_urls = tab_urls

        self._url_manager = UrlManager(tab_urls)
        if delete_tab_urls:
            self._url_manager.clear_url()

        # self._db = MongoDB()
        # if delete_tab_urls: self._db.delete(tab_urls)
        # self._db.set_unique_key(tab_urls, 'url')
        # if tab_site: self._db.set_unique_key(tab_site, 'site_id')
        # if tab_content: self._db.set_unique_key(tab_content, content_unique_key)
        # # set indexes to speed up queries
        # self._db.set_ensure_index(tab_urls, 'depth')
        # self._db.set_ensure_index(tab_urls, 'status')
        # if tab_site: self._db.set_ensure_index(tab_site, 'read_status')
        # if tab_content: self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls, depth, process_num)
        self._parsers = []
        self._parser_params = parser_params

        self._begin_callback = begin_callback

        # wrap end_callback so extra cleanup can be hooked in here
        def _end_callback():
            # self._url_manager.stop()
            if end_callback:
                end_callback()
        self._end_callback = _end_callback

        self._parser_count = int(tools.get_conf_value('config.conf', 'parser', 'parser_count')) if not parser_count else parser_count

        self._spider_site_name = tools.get_conf_value('config.conf', "spider_site", "spider_site_name").split(',')
        self._except_site_name = tools.get_conf_value('config.conf', "spider_site", "except_site_name").split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            # crawl every site except the ones listed in except_site_name
            if all(parser.NAME != name.strip() for name in self._except_site_name):
                self._parsers.append(parser)
        else:
            # only crawl the sites listed in spider_site_name
            for spider_site_name in self._spider_site_name:
                if parser.NAME == spider_site_name.strip():
                    self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        if not self._parsers:
            if self._end_callback:
                self._end_callback()
            return

        # let each parser add its site info and root urls
        # print(self._parser_params)
        for parser in self._parsers:
            # parser.add_site_info()
            parser.add_root_url(self._parser_params)

        # start the collector
        self._collector.add_finished_callback(self._end_callback)
        self._collector.start()

        # start the parser controls
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)
            for parser in self._parsers:
                parser_control.add_parser(parser)
            parser_control.start()
            self._parser_count -= 1
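# Minimal usage sketch for the Spider class above. The parser interface
# (a NAME attribute plus an add_root_url(parser_params) method) is taken from
# the code; the NewsParser class and the callbacks are hypothetical, and the
# sketch assumes Spider and its dependencies (tools, UrlManager, Collector,
# PaserControl, config.conf) are available from the project.

class NewsParser:
    NAME = 'news'  # matched against spider_site_name / except_site_name from config.conf

    def add_root_url(self, parser_params):
        # seed the url table with start urls here (omitted in this sketch)
        pass


def on_begin():
    print('spider started')


def on_end():
    print('spider finished')


spider = Spider('news:news_urls',
                parser_count=2,
                begin_callback=on_begin,
                end_callback=on_end)
spider.add_parser(NewsParser())
spider.start()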
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth, process_num=None):
        '''
        @summary:
        ---------
        @param tab_urls:
        @param depth:
        @param process_num: process number
        ---------
        @result:
        '''
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

        self._tab_worker_status = 'news:worker_status'
        self._worker_mark = LOCAL_HOST_IP + ('_%s' % process_num if process_num else '')

    def run(self):
        while not self._thread_stop:
            try:
                self.__input_data()
            except Exception as e:
                log.error(e)

            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True

        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls still being processed, not fetching more; url count = %s' % len(self._urls))
            return

        # report this worker's status: 0 = idle
        self._db.zadd(self._tab_worker_status, self._worker_mark, 0)

        url_count = self._url_count  # default value

        # allocate urls dynamically based on the number of waiting workers
        worker_wait_count = self._db.zget_count(self._tab_worker_status, priority_min=0, priority_max=0)
        if worker_wait_count:
            # number of pending tasks
            task_count = self._db.zget_count(self._tab_urls)
            # allocated count = task count / number of idle workers
            url_count = task_count // worker_wait_count
            url_count = url_count if url_count <= self._url_count else self._url_count

        urls_list = self._db.zget(self._tab_urls, count=url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # # record url count, for testing
            # url_count_record = tools.read_file('url_count.txt')
            # url_count_record = url_count_record and int(url_count_record) or 0
            # url_count_record += len(urls_list)
            # tools.write_file('url_count.txt', str(url_count_record))

            # report this worker's status: 1 = working
            self._db.zadd(self._tab_worker_status, self._worker_mark, 1)

            # store the urls
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no urls left to process
    def is_all_have_done(self):
        # log.debug('checking for unfinished urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                # url_info is stored as the repr of a dict; eval it back into a dict
                url_info = eval(url_info)
            except Exception:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
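# Standalone sketch of the dynamic allocation rule used in __input_data above:
# each idle worker takes an equal share of the pending tasks, capped at the
# configured url_count. Function and parameter names here are illustrative only.

def allocate_url_count(task_count, idle_worker_count, max_url_count):
    if not idle_worker_count:
        return max_url_count
    share = task_count // idle_worker_count
    return share if share <= max_url_count else max_url_count

# examples:
# allocate_url_count(1000, 4, 100) -> 100  (share of 250 is capped at url_count)
# allocate_url_count(120, 4, 100)  -> 30   (each of the 4 idle workers takes 30)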
class Collector(threading.Thread):
    def __init__(self, tab_urls, depth):
        super(Collector, self).__init__()
        self._db = RedisDB()
        self._thread_stop = False
        self._urls = collections.deque()
        self._null_times = 0
        self._tab_urls = tab_urls
        self._depth = depth  # or int(tools.get_conf_value('config.conf', "collector", "depth"))
        self._interval = int(
            tools.get_conf_value('config.conf', "collector", "sleep_time"))
        self._allowed_null_times = int(
            tools.get_conf_value('config.conf', "collector", 'allowed_null_times'))
        self._url_count = int(
            tools.get_conf_value('config.conf', "collector", "url_count"))

        self._url_manager = UrlManager(tab_urls)

        self._finished_callback = None

        self._is_show_wait = False

    def run(self):
        while not self._thread_stop:
            self.__input_data()
            time.sleep(self._interval)

    def stop(self):
        self._thread_stop = True

        if self._finished_callback:
            self._finished_callback()

    # @tools.log_function_time
    def __input_data(self):
        if self._urls:
            log.debug('urls still being processed, not fetching more; url count = %s' % len(self._urls))
            return

        urls_list = self._db.zget(self._tab_urls, count=self._url_count)

        if not urls_list:
            if not self._is_show_wait:
                log.info('waiting for tasks...')
                self._is_show_wait = True
        else:
            # store the urls
            self.put_urls(urls_list)
            self._is_show_wait = False

        # if self.is_all_have_done():
        #     log.debug('is_all_have_done end')
        #     self.stop()

    def is_finished(self):
        return self._thread_stop

    def add_finished_callback(self, callback):
        self._finished_callback = callback

    # no urls left to process
    def is_all_have_done(self):
        # log.debug('checking for unfinished urls: collector url size = %s | url_manager size = %s' % (len(self._urls), self._url_manager.get_urls_count()))
        if len(self._urls) == 0:
            self._null_times += 1
            if self._null_times >= self._allowed_null_times and self._url_manager.get_urls_count() == 0:
                return True
            else:
                return False
        else:
            self._null_times = 0
            return False

    # @tools.log_function_time
    def put_urls(self, urls_list):
        for url_info in urls_list:
            try:
                # url_info is stored as the repr of a dict; eval it back into a dict
                url_info = eval(url_info)
            except Exception:
                url_info = None

            if url_info:
                self._urls.append(url_info)

    # @tools.log_function_time
    def get_urls(self, count):
        urls = []
        count = count if count <= len(self._urls) else len(self._urls)
        while count:
            urls.append(self._urls.popleft())
            count -= 1

        return urls
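# Minimal usage sketch for this Collector: start it so it keeps pulling batches
# from Redis, and drain urls from a consumer (in the project this is done by the
# parser control). The table name comes from the snippets above; the depth value
# and the callback are illustrative, and the project's RedisDB / UrlManager /
# config.conf are assumed to be available.

collector = Collector('news:news_urls', depth=0)
collector.add_finished_callback(lambda: print('collector finished'))
collector.start()

# a consumer would periodically take a batch of parsed url_info dicts:
batch = collector.get_urls(10)
for url_info in batch:
    print(url_info)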