def main(spider_name, thread_number=None, slave=False, force_url=None,
         settings='settings', *args, **kwargs):
    """Set up and run the spider registered under ``spider_name``.

    Args:
        spider_name: key used by ``load_spider_class`` to resolve the
            spider class; also names the crawl lock file.
        thread_number: worker thread count; when ``None`` it is read from
            the ``GRAB_THREAD_NUMBER`` config option.
        slave: when True, skip acquiring the exclusive crawl lock.
        force_url: accepted for interface compatibility; not used here.
        settings: settings module name passed to ``build_global_config``.
        **kwargs: may carry ``propagate_network_logger`` for logging setup.

    Side effects: may create ``var/run/*.lock``, ``var/fatal-<pid>.txt``,
    ``var/task-add-error-<pid>.txt`` and ``var/stats-<pid>.txt``.
    """
    # Was kwargs['propagate_network_logger'], which raised KeyError whenever
    # the caller did not supply the flag; default to False instead.
    default_logging(
        propagate_network_logger=kwargs.get('propagate_network_logger',
                                            False))
    # Only a non-slave process holds the exclusive crawl lock.
    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)
    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)
    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    # Optional subsystems, each driven by its own config section.
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl but still lets reports below be written.
        pass
    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)
    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)
    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added',
                      'var/task-add-error-%d.txt' % pid)
    if config.get('GRAB_SAVE_FINAL_STATS'):
        # Use a context manager so the handle is closed deterministically
        # (the original left the file object dangling).
        with open('var/stats-%d.txt' % pid, 'wb') as out:
            out.write(stats)
def grab_control_api(request, command):
    """Dispatch one control-API request to the spider's redis interface.

    ``command`` selects the action: ``put_command`` enqueues a named command
    and returns its result id; ``pop_result`` fetches the outcome of a
    previously enqueued command; anything else yields an error dict.
    """
    params = request.GET
    spider_cls = load_spider_class(build_global_config(), params['spider'])
    interface = spider_cls().controller.add_interface('redis')
    if command == 'put_command':
        return {'result_id': interface.put_command(
            {'name': params['command']})}
    if command == 'pop_result':
        outcome = interface.pop_result(params['result_id'])
        if outcome is None:
            return {'status': 'not-ready'}
        return {'data': outcome.get('data', ''),
                'error': outcome.get('error', '')}
    return {'error': 'unknown-command'}
def grab_control_api(request, command):
    """Handle a control-API request against the spider's redis interface.

    Supports two commands: ``put_command`` (enqueue, answer with result id)
    and ``pop_result`` (poll for a finished command's data/error); any other
    value produces an ``unknown-command`` error payload.
    """
    query = request.GET
    spider = load_spider_class(build_root_config(), query['spider'])()
    redis_iface = spider.controller.add_interface('redis')
    if command == 'put_command':
        cmd_id = redis_iface.put_command({'name': query['command']})
        response = {'result_id': cmd_id}
    elif command == 'pop_result':
        result = redis_iface.pop_result(query['result_id'])
        if result is None:
            # Command accepted but not yet processed.
            response = {'status': 'not-ready'}
        else:
            response = {
                'data': result.get('data', ''),
                'error': result.get('error', ''),
            }
    else:
        response = {'error': 'unknown-command'}
    return response
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, *args, **kwargs):
    """Set up and run the spider registered under ``spider_name``.

    Args:
        spider_name: key resolved to a spider class via ``load_spider_class``.
        thread_number: worker thread count; falls back to the
            ``thread_number`` config option (deprecated ``GRAB_THREAD_NUMBER``).
        slave: passed through to the spider constructor.
        settings: settings module name for ``build_root_config``.
        network_logs: propagate the network logger in ``default_logging``.
        disable_proxy: skip loading the configured proxy list.
        ignore_lock: accepted for interface compatibility; not used here.
        disable_report: skip writing the ``var/<pid>`` / ``var/last`` report.

    Returns:
        dict with ``spider_stats`` and ``spider_timing`` render results.
    """
    default_logging(propagate_network_logger=network_logs)
    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        # Let the spider register extra CLI options; unknown args ignored.
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)
    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))
    # Dropped dead assignment: stat_task_object was read from kwargs but
    # never used anywhere in this function.
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(
            spider_config.get('network_try_limit',
                              deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(
            spider_config.get('task_try_limit',
                              deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list',
                                       deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces',
                                   deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)
    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')
    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C interrupts the crawl; reporting below still runs.
        pass
    stats = bot.render_stats(timing=spider_config.get(
        'display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))
    if spider_config.get('display_stats',
                         deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)
    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if not disable_report:
        if spider_config.get('save_report',
                             deprecated_key='GRAB_SAVE_REPORT'):
            # Write the report twice: under the pid and under 'last'.
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                # Only the key is needed; the original unpacked the unused
                # values via iteritems().
                for key in bot.items:
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)
    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def main(spider_name, thread_number=None, settings_module='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, api_port=None, parser_pool_size=2,
         grab_log_file=None, network_log_file=None, network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    """Set up and run the spider registered under ``spider_name``.

    Builds the root/spider configuration, instantiates the spider class
    with the command-line supplied services (HTTP API port, parser pool,
    network service, grab transport), wires optional queue/cache/proxy/
    command-interface subsystems, runs the crawl, then optionally writes
    a per-pid report under ``var/``.

    Returns a dict with a single ``spider_stats`` entry (rendered stats).
    Note: ``ignore_lock`` is accepted but not referenced in this body.
    """
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        # Give the spider a chance to declare its own CLI options;
        # unrecognized argv entries are ignored via parse_known_args.
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)
    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    # Optional subsystems, each enabled by its config section.
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            # Config requests proxies, but the CLI flag wins.
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)
    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl; stats/report handling below still runs.
        pass
    stats = bot.render_stats()
    if spider_config.get('display_stats'):
        logger.debug(stats)
    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)
    if not disable_report:
        if spider_config.get('save_report'):
            # The report is written twice: under the pid and under 'last'.
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    # Reuse the existing directory, emptied first.
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    # File is binary; make_str presumably yields bytes
                    # suitable for 'wb' — TODO confirm against its impl.
                    out.write(make_str(stats))
    return {
        'spider_stats': bot.render_stats(),
    }
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, *args, **kwargs):
    """Configure and run the spider named by ``spider_name``.

    Reads limits from the spider config (new keys with deprecated
    ``GRAB_*`` fallbacks), wires optional queue/cache/proxy/command
    interfaces, runs the crawl, and optionally writes a per-pid report.

    Args:
        spider_name: registry key of the spider class.
        thread_number: worker count; config-driven when ``None``.
        slave: forwarded to the spider constructor.
        settings: settings module name for ``build_root_config``.
        network_logs: propagate the network logger.
        disable_proxy: ignore any configured proxy list.
        ignore_lock: accepted for interface compatibility; unused here.
        disable_report: suppress report writing.

    Returns:
        dict with ``spider_stats`` and ``spider_timing``.
    """
    default_logging(propagate_network_logger=network_logs)
    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        # Spider-specific CLI options; unknown argv entries are ignored.
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)
    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))
    # Removed: stat_task_object = kwargs.get(...) — the value was never used.
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get(
        'proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get(
        'command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)
    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')
    try:
        bot.run()
    except KeyboardInterrupt:
        # Graceful stop; still render stats and the report below.
        pass
    stats = bot.render_stats(
        timing=spider_config.get('display_timing',
                                 deprecated_key='GRAB_DISPLAY_TIMING'))
    if spider_config.get('display_stats',
                         deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)
    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if not disable_report:
        if spider_config.get('save_report',
                             deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                # Iterate keys directly; the original unpacked an unused
                # value from iteritems().
                for key in bot.items:
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)
    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def test_load_spider_class(self):
    """load_spider_class resolves 'first_spider' to FirstSpider."""
    config = self.build_config(['test.util_module'])
    # Reset the global registry before performing the lookup.
    SPIDER_REGISTRY.clear()
    loaded_cls = load_spider_class(config, 'first_spider')
    self.assertEqual(loaded_cls, FirstSpider)
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, *args, **kwargs):
    """Set up and run the spider registered under ``spider_name``.

    Acquires a crawl lock (unless ``slave``), builds the config, lets the
    spider register extra CLI args via ``setup_extra_args``, wires queue/
    cache/proxy/command interfaces, runs the crawl, and saves error lists
    and stats according to ``GRAB_SAVE_*`` options.

    Returns:
        dict with ``spider_stats`` and ``spider_timing``.
    """
    default_logging(propagate_network_logger=network_logs)
    # Only a non-slave process holds the exclusive crawl lock.
    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)
    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)
    if hasattr(spider_class, 'setup_extra_args'):
        # Spider-declared CLI options; unknown argv entries are ignored.
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, _ = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)
    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')
    # Removed dead assignment of kwargs['stat_task_object'] — never used.
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)
    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')
    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl; stats handling below still runs.
        pass
    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)
    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)
    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added',
                      'var/task-add-error-%d.txt' % pid)
    if config.get('GRAB_SAVE_FINAL_STATS'):
        # Context manager closes the handle; the original leaked it via
        # open(...).write(stats).
        with open('var/stats-%d.txt' % pid, 'wb') as out:
            out.write(stats)
    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False, disable_proxy=False,
         ignore_lock=False, disable_report=False, disable_default_logs=False,
         *args, **kwargs):
    """Set up and run the spider registered under ``spider_name``.

    Args:
        spider_name: registry key of the spider class.
        thread_number: worker count, forwarded to the constructor as-is.
        slave: forwarded to the spider constructor.
        settings_module: settings module name for ``build_root_config``.
        network_logs: propagate the network logger.
        disable_proxy: ignore any configured proxy list.
        ignore_lock: accepted for interface compatibility; unused here.
        disable_report: suppress the ``var/<pid>`` / ``var/last`` report.
        disable_default_logs: pass ``grab_log=None, network_log=None`` to
            ``default_logging`` instead of its defaults.

    Returns:
        dict with ``spider_stats`` and ``spider_timing``.
    """
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        # Spider-specific CLI options; unknown argv entries are ignored.
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)
    try:
        bot.run()
    except KeyboardInterrupt:
        # Graceful stop; stats/report handling below still runs.
        pass
    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    # The on-disk report always includes timing, regardless of the
    # display_timing option used for the logged stats.
    stats_with_time = bot.render_stats(timing=True)
    if spider_config.get('display_stats'):
        logger.debug(stats)
    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                # Iterate keys directly; the original unpacked an unused
                # value from items().
                for key in bot.items:
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))
    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, *args, **kwargs):
    """Set up and run the spider registered under ``spider_name``.

    Acquires a crawl lock (unless ``slave``), builds the config, lets the
    spider register extra CLI args, wires queue/cache/proxy/command
    interfaces, runs the crawl, and writes error lists plus a report under
    ``var/<pid>`` and ``var/last`` when ``GRAB_SAVE_REPORT`` is set.

    Returns:
        dict with ``spider_stats`` and ``spider_timing``.
    """
    default_logging(propagate_network_logger=network_logs)
    # Only a non-slave process holds the exclusive crawl lock.
    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)
    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)
    if hasattr(spider_class, 'setup_extra_args'):
        # Spider-declared CLI options; unknown argv entries are ignored.
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, _ = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)
    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')
    # Removed dead assignment of kwargs['stat_task_object'] — never used.
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)
    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')
    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl; stats/report handling below still runs.
        pass
    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    # NOTE(review): the display flag is read from spider_config but the
    # timing flag above comes from the global config — confirm intended.
    if spider_config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)
    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_REPORT'):
        # The report is written twice: under the pid and under 'last'.
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            bot.save_list('fatal', '%s/fatal.txt' % dir_)
            bot.save_list('task-count-rejected',
                          '%s/task_count_rejected.txt' % dir_)
            bot.save_list('network-count-rejected',
                          '%s/network_count_rejected.txt' % dir_)
            bot.save_list('task-with-invalid-url',
                          '%s/task_with_invalid_url.txt' % dir_)
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)
    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }