def collect(args):
    # When --verbose is set, write debug output to a timestamped log file
    if args.verbose:
        logging.basicConfig(
            filename='dbg_' + args.parser + '_' + str(time.time()) + '.log',
            filemode='w',
            level=logging.DEBUG)

    parser_params = process_param_string(args.parser_params)

    collector = Collector(args.parser, args.use_proxy, parser_params)
    try:
        collector.start()
    finally:
        # Always release the proxy client, even if the collector raises
        if collector.proxy_client is not None:
            collector.proxy_client.stop()
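# Usage sketch (not from the original source): collect() reads args.verbose,
# args.parser, args.use_proxy and args.parser_params, so it is presumably driven
# by an argparse namespace. The flag names below are assumptions chosen to match
# those attributes, and the "key1=a,key2=b" format for --parser_params is only a
# guess at what process_param_string() expects.
import argparse

if __name__ == '__main__':
    arg_parser = argparse.ArgumentParser(description='run a single site collector')
    arg_parser.add_argument('parser', help='name of the site parser to run')
    arg_parser.add_argument('--use_proxy', action='store_true',
                            help='route requests through the proxy client')
    arg_parser.add_argument('--parser_params', default='',
                            help='extra parser parameters, e.g. "key1=a,key2=b" (assumed format)')
    arg_parser.add_argument('--verbose', action='store_true',
                            help='write a timestamped debug log file')
    collect(arg_parser.parse_args())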
class Spider(threading.Thread):
    def __init__(self, tab_urls, tab_site, tab_content, parser_count=None,
                 parser_params=None, begin_callback=None, end_callback=None,
                 content_unique_key='url'):
        '''
        @summary: ties the collector, the parsers and the parser-control threads together
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param tab_content: name of the content table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param parser_params: parameters passed to the parsers
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        @param content_unique_key: unique key of the content table, 'url' by default
        ---------
        @result:
        '''
        super(Spider, self).__init__()
        self._tab_urls = tab_urls

        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(tab_content, content_unique_key)
        # Secondary indexes to speed up queries
        self._db.set_ensure_index(tab_urls, 'depth')
        self._db.set_ensure_index(tab_urls, 'status')
        self._db.set_ensure_index(tab_site, 'read_status')
        self._db.set_ensure_index(tab_content, 'read_status')

        self._collector = Collector(tab_urls)
        self._parsers = []
        self._parser_params = parser_params or {}
        self._begin_callback = begin_callback
        self._end_callback = end_callback
        self._parser_count = int(tools.get_conf_value(
            'config.conf', 'parser', 'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', 'spider_site', 'spider_site_name').split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', 'spider_site', 'except_site_name').split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            # Crawl every site except those listed in except_site_name
            if parser.NAME not in [name.strip() for name in self._except_site_name]:
                self._parsers.append(parser)
        else:
            # Only crawl sites whitelisted in spider_site_name
            if parser.NAME in [name.strip() for name in self._spider_site_name]:
                self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        # Nothing to crawl: finish immediately
        if not self._parsers:
            if self._end_callback:
                self._end_callback()
            return

        # Start the collector
        self._collector.add_finished_callback(self._end_callback)
        self._collector.start()

        # Run each parser's add_site_info and add_root_url in their own threads
        for parser in self._parsers:
            threading.Thread(target=parser.add_site_info).start()
            threading.Thread(target=parser.add_root_url,
                             args=(self._parser_params, )).start()

        # Start the parser-control threads
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)
            for parser in self._parsers:
                parser_control.add_parser(parser)
            parser_control.start()
            self._parser_count -= 1
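# Usage sketch (not from the original source): wiring this Spider variant with a
# stand-in parser. Real parsers in the framework are expected to expose NAME plus
# add_site_info()/add_root_url() and the parse entry points used by PaserControl
# (omitted here); the table names and DemoParser are placeholders, and a reachable
# MongoDB plus config.conf are assumed.
class DemoParser:
    NAME = 'demo_site'

    def add_site_info(self):
        # would insert the site record into the site table
        pass

    def add_root_url(self, parser_params):
        # would seed the url table with the site's entry pages
        pass


def on_end():
    print('spider finished')


spider = Spider(tab_urls='demo_urls', tab_site='demo_site_info', tab_content='demo_content',
                parser_count=2, parser_params={'keyword': 'example'}, end_callback=on_end)
spider.add_parser(DemoParser())
spider.start()   # Thread.start() -> run() -> __start()
spider.join()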
class Spider(threading.Thread):
    def __init__(self, tab_list, tab_unique_key_list, tab_ensure_index_list,
                 parser_count=None, site_parsers=None, parser_params=None,
                 begin_callback=None, end_callback=None):
        '''
        @summary: ties the collector, the parsers and the parser-control threads together
        ---------
        @param tab_list: list of table names; tab_list[0] is the url table
        @param tab_unique_key_list: unique key of each table in tab_list
        @param tab_ensure_index_list: list of secondary indexes for each table in tab_list
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param site_parsers: site parsers handed to the collector
        @param parser_params: parameters passed to the parsers
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        ---------
        @result:
        '''
        super(Spider, self).__init__()
        self._tab_urls = tab_list[0]
        self._site_parsers = site_parsers

        self._db = MongoDB()
        for tab_index in range(len(tab_list)):
            self._db.set_unique_key(tab_list[tab_index],
                                    tab_unique_key_list[tab_index])
            # Secondary indexes to speed up queries
            for ensure_index in tab_ensure_index_list[tab_index]:
                self._db.set_ensure_index(tab_list[tab_index], ensure_index)

        self._collector = Collector(self._tab_urls, self._site_parsers)
        self._parsers = []
        self._parser_params = parser_params or {}
        self._begin_callback = begin_callback
        self._end_callback = end_callback
        self._parser_count = int(tools.get_conf_value(
            'config.conf', 'parser', 'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', 'spider_site', 'spider_site_name').split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', 'spider_site', 'except_site_name').split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            # Crawl every site except those listed in except_site_name
            if parser.NAME not in [name.strip() for name in self._except_site_name]:
                self._parsers.append(parser)
        else:
            # Only crawl sites whitelisted in spider_site_name
            if parser.NAME in [name.strip() for name in self._spider_site_name]:
                self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        # Nothing to crawl: finish immediately
        if not self._parsers:
            if self._end_callback:
                self._end_callback()
            return

        # Add site info and seed the root urls before the collector starts
        for parser in self._parsers:
            parser.add_site_info()
            parser.add_root_url(self._parser_params)
        print('finished adding root urls')

        # Start the collector
        self._collector.add_finished_callback(self._end_callback)
        self._collector.start()

        # Start the parser-control threads
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)
            for parser in self._parsers:
                parser_control.add_parser(parser)
            parser_control.start()
            self._parser_count -= 1
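# Usage sketch (not from the original source): with this refactor, element i of
# each list describes the same table, so every table carries its own unique key
# and secondary indexes. Table names are placeholders; the index names mirror the
# fixed-table version above.
tab_list = ['demo_urls', 'demo_site_info', 'demo_content']
tab_unique_key_list = ['url', 'site_id', 'url']
tab_ensure_index_list = [
    ['depth', 'status'],   # url table
    ['read_status'],       # site table
    ['read_status'],       # content table
]

spider = Spider(tab_list, tab_unique_key_list, tab_ensure_index_list,
                parser_count=2, parser_params={'keyword': 'example'})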
sys.path.append("..") import utils.tools as tools from utils.log import log from base.collector import Collector from base.root_url import AddRootUrl from html_parser.parser_control import PaserControl def init(): db = tools.getConnectedDB() # 设唯一索引 db.urls.ensure_index('url', unique=True) db.text_info.ensure_index('url', unique=True) if __name__ == '__main__': log.info("--------begin--------") init() addRootUrl = AddRootUrl() addRootUrl.start() coll = Collector() coll.start() paserCount = int(tools.getConfValue("html_parser", "parser_count")) while paserCount: paser = PaserControl() paser.start() paserCount = paserCount - 1
class Spider(threading.Thread):
    def __init__(self, tab_urls, tab_site, tab_content, parser_count=None,
                 search_keyword1=None, search_keyword2=None, search_keyword3=None,
                 begin_callback=None, end_callback=None, content_unique_key=None):
        '''
        @summary: ties the collector, the parsers and the parser-control threads together
        ---------
        @param tab_urls: name of the url table
        @param tab_site: name of the site table
        @param tab_content: name of the content table
        @param parser_count: number of parser threads; falls back to the config file when empty
        @param search_keyword1: search keywords (list), all of which must be contained
        @param search_keyword2: search keywords (list), at least one of which must be contained
        @param search_keyword3: search keywords (list), none of which may be contained
        @param begin_callback: callback invoked when the spider starts
        @param end_callback: callback invoked when the spider finishes
        @param content_unique_key: unique key of the content table, 'url' by default
        ---------
        @result:
        '''
        super(Spider, self).__init__()
        self._tab_urls = tab_urls

        self._db = MongoDB()
        self._db.set_unique_key(tab_urls, 'url')
        self._db.set_unique_key(tab_site, 'site_id')
        self._db.set_unique_key(
            tab_content, 'url' if not content_unique_key else content_unique_key)

        self._collector = Collector(tab_urls)
        self._parsers = []
        self._search_keyword1 = search_keyword1 or []
        self._search_keyword2 = search_keyword2 or []
        self._search_keyword3 = search_keyword3 or []
        self._begin_callback = begin_callback
        self._end_callback = end_callback
        self._parser_count = int(tools.get_conf_value(
            'config.conf', 'parser', 'parser_count')) if not parser_count else parser_count
        self._spider_site_name = tools.get_conf_value(
            'config.conf', 'spider_site', 'spider_site_name').split(',')
        self._except_site_name = tools.get_conf_value(
            'config.conf', 'spider_site', 'except_site_name').split(',')

    def add_parser(self, parser):
        if self._spider_site_name[0] == 'all':
            # Crawl every site except those listed in except_site_name
            if parser.NAME not in [name.strip() for name in self._except_site_name]:
                self._parsers.append(parser)
        else:
            # Only crawl sites whitelisted in spider_site_name
            if parser.NAME in [name.strip() for name in self._spider_site_name]:
                self._parsers.append(parser)

    def run(self):
        self.__start()

    def __start(self):
        if self._begin_callback:
            self._begin_callback()

        # Nothing to crawl: finish immediately
        if not self._parsers:
            if self._end_callback:
                self._end_callback()
            return

        # Start the collector
        self._collector.add_finished_callback(self._end_callback)
        self._collector.start()

        # Run each parser's add_site_info and add_root_url in their own threads
        for parser in self._parsers:
            threading.Thread(target=parser.add_site_info).start()
            threading.Thread(target=parser.add_root_url,
                             args=(self._search_keyword1, self._search_keyword2,
                                   self._search_keyword3)).start()

        # Start the parser-control threads
        while self._parser_count:
            parser_control = PaserControl(self._collector, self._tab_urls)
            for parser in self._parsers:
                parser_control.add_parser(parser)
            parser_control.start()
            self._parser_count -= 1