def test_build_root_config_overwrite(self):
    """A settings module may overwrite individual default global options."""
    cfg = build_root_config('tests.files.settings_overwrite')
    for key, default_value in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
        # Only spider_modules is overridden by the settings module.
        expected = ['zzz'] if key == 'spider_modules' else default_value
        self.assertEqual(cfg['global'][key], expected)
def test_build_spider_config_empty(self):
    """With minimal settings the spider config equals the defaults."""
    class TestSpider(Spider):
        pass

    root_cfg = build_root_config('test.files.settings_minimal')
    spider_cfg = build_spider_config(TestSpider, root_cfg)
    self.assertEqual(spider_cfg, DEFAULT_SPIDER_GLOBAL_CONFIG)
def test_build_spider_config_empty(self):
    """A spider with no custom settings gets the default global config."""
    class TestSpider(Spider):
        pass

    config = build_spider_config(
        TestSpider, build_root_config('tests.files.settings_minimal'))
    self.assertEqual(config, DEFAULT_SPIDER_GLOBAL_CONFIG)
def test_build_root_config_overwrite(self):
    """Settings module overrides show up in the root global config."""
    cfg = build_root_config('test.files.settings_overwrite')
    global_cfg = cfg['global']
    for key, default_value in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
        if key == 'spider_modules':
            # Overridden by the settings module under test.
            self.assertEqual(global_cfg[key], ['zzz'])
        else:
            self.assertEqual(global_cfg[key], default_value)
def test_build_spider_config_overwrite(self):
    """Spider-specific settings override the global defaults."""
    class TestSpider(Spider):
        pass

    root_cfg = build_root_config('test.files.settings_test_spider')
    cfg = build_spider_config(TestSpider, root_cfg)
    # Keys overridden by the spider-specific settings module.
    overrides = {'spider_modules': ['zzz'], 'thread_number': 777}
    for key, default_value in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
        self.assertEqual(cfg[key], overrides.get(key, default_value))
def test_setup_spider_config(self):
    """The setup_spider_config hook may inject extra keys into the config."""
    class TestSpider(Spider):
        @classmethod
        def setup_spider_config(cls, config):
            config['foo'] = 'bar'

    root_cfg = build_root_config('tests.files.settings_minimal')
    cfg = build_spider_config(TestSpider, root_cfg)
    for key, default_value in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
        expected = 'bar' if key == 'foo' else default_value
        self.assertEqual(cfg[key], expected)
def test_setup_spider_config(self):
    """Keys added by setup_spider_config survive into the built config."""
    class TestSpider(Spider):
        @classmethod
        def setup_spider_config(cls, config):
            config['foo'] = 'bar'

    cfg = build_spider_config(
        TestSpider, build_root_config('test.files.settings_minimal'))
    for key, default_value in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
        if key == 'foo':
            self.assertEqual(cfg[key], 'bar')
        else:
            self.assertEqual(cfg[key], default_value)
def grab_control(request):
    """Render the spider control form with spider and command choices."""
    form = ControlForm(request.GET or None)

    # Available spiders come from the project's spider registry.
    registry = build_spider_registry(build_root_config())
    spider_choices = [(name, name) for name in registry.keys()]
    form.fields['spider'].choices = spider_choices
    form.fields['spider'].widget.choices = spider_choices

    # Commands the spider controller understands.
    command_choices = [(name, name)
                       for name in Spider.get_available_command_names()]
    form.fields['command'].choices = command_choices
    form.fields['command'].widget.choices = command_choices

    return render(request, 'grabstat/control_form.html', {'form': form})
def grab_control_api(request, command):
    """Dispatch a control-API call (put_command / pop_result) via redis."""
    args = request.GET
    spider_cls = load_spider_class(build_root_config(), args['spider'])
    spider = spider_cls()
    iface = spider.controller.add_interface('redis')

    if command == 'put_command':
        return {'result_id': iface.put_command({'name': args['command']})}

    if command == 'pop_result':
        result = iface.pop_result(args['result_id'])
        if result is None:
            return {'status': 'not-ready'}
        return {
            'data': result.get('data', ''),
            'error': result.get('error', ''),
        }

    return {'error': 'unknown-command'}
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, *args, **kwargs):
    """Build and run the spider named *spider_name*.

    Loads root/spider configuration from the *settings* module, wires up
    queue/cache/proxy/command-interface options, runs the spider and
    optionally saves a report under var/.

    Returns a dict with rendered spider stats and timing.
    """
    default_logging(propagate_network_logger=network_logs)
    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    # Let the spider class declare extra CLI options; unknown args are
    # tolerated so other parsers may consume them.
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = int(spider_config.get(
            'thread_number', deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    # Optional subsystems, enabled only when configured.
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list',
                                       deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces',
                                   deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    # Ctrl-C stops the crawl but still renders stats / report below.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get(
        'display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))
    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            # Write the report into a per-pid directory and into var/last.
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                # BUG FIX: dict.iteritems() does not exist on Python 3;
                # .items() works on both Python 2 and 3.
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                # BUG FIX: the file is opened in binary mode, so the
                # textual stats must be encoded before writing (py3-safe).
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats.encode('utf-8'))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def process_command_line():
    """Parse the CLI, locate the requested action module and execute it.

    The action script is first looked up inside the grab package
    (grab.script.<action>), then inside the current project's ``script``
    package. Exits with status 1 when neither import succeeds.
    """
    # Add current directory to python path
    cur_dir = os.path.realpath(os.getcwd())
    sys.path.insert(0, cur_dir)

    parser = ArgumentParser()
    parser.add_argument('action', type=str)
    parser.add_argument('--logging-level', default='debug')
    parser.add_argument('--lock-key')
    # TODO: restore --ignore-lock option once lock-file processing returns
    parser.add_argument('--settings', type=str, default='settings')
    parser.add_argument('--env', type=str)
    parser.add_argument('--profile', action='store_true', default=False)
    args, _ = parser.parse_known_args()

    config = build_root_config()
    # ROBUSTNESS: .get() so a root config missing the key does not raise
    # KeyError (the original indexed the key directly).
    if config and config.get('GRAB_DJANGO_SETTINGS'):
        os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
        # Turn off DEBUG to prevent memory leaks
        from django.conf import settings
        settings.DEBUG = False

    # Setup logging
    logging_level = getattr(logging, args.logging_level.upper())
    # TODO: enable logs
    setup_logging(args.action, logging_level, clear_handlers=True)

    # Setup action handler
    action_name = args.action
    try:
        # First, try to import script from the grab package
        action_mod = __import__('grab.script.%s' % action_name,
                                None, None, ['foo'])
    except ImportError as ex:
        # BUG FIX: `unicode` does not exist on Python 3; str() yields the
        # same message text on both Python 2 and 3.
        if (str(ex).startswith('No module named')
                and action_name in str(ex)):
            pass
        else:
            logging.error('', exc_info=ex)
        # If grab does not provide the script,
        # try to import it from the current project
        try:
            action_mod = __import__('script.%s' % action_name,
                                    None, None, ['foo'])
        except ImportError as ex:
            logging.error('', exc_info=ex)
            sys.stderr.write('Could not import %s script' % action_name)
            sys.exit(1)

    # Let the action module extend the CLI and re-parse.
    if hasattr(action_mod, 'setup_arg_parser'):
        action_mod.setup_arg_parser(parser)
        args, _ = parser.parse_known_args()

    # TODO: enable lock-file processing (lock key derived from the action
    # name or --lock-key, asserted via var/run/<key>.lock)

    try:
        if args.profile:
            import cProfile
            import pyprof2calltree
            import pstats

            profile_file = 'var/%s.prof' % action_name
            profile_tree_file = 'var/%s.prof.out' % action_name
            prof = cProfile.Profile()
            prof.runctx('action_mod.main(**vars(args))', globals(), locals())
            stats = pstats.Stats(prof)
            stats.strip_dirs()
            pyprof2calltree.convert(stats, profile_tree_file)
        else:
            action_mod.main(**vars(args))
    except Exception as ex:
        logging.error('Unexpected exception from action handler:',
                      exc_info=ex)
def build_spider_instance(cls, settings_module):
    """Instantiate spider class *cls* configured from *settings_module*."""
    root_cfg = build_root_config(settings_module)
    return cls(config=build_spider_config(cls, root_cfg))
def main(spider_name, thread_number=None, settings_module='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, api_port=None, parser_pool_size=2,
         grab_log_file=None, network_log_file=None,
         network_service=None, grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    """Build and run the spider named *spider_name*.

    Loads root/spider configuration from *settings_module*, wires up
    queue/cache/proxy/command-interface options from the spider config,
    runs the spider and optionally saves a report under var/.

    Returns a dict with the rendered spider stats.
    """
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    # Let the spider class declare extra CLI options; unknown args are
    # tolerated so other parsers may consume them.
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    # Optional subsystems, enabled only when present in the spider config.
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Ctrl-C stops the crawl but still renders stats / report below.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()
    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            # Write the report into a per-pid directory and into var/last.
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                # Report file is binary; make_str yields bytes for it.
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False, disable_report=False,
         disable_default_logs=False,
         *args, **kwargs):
    """Build and run the spider named *spider_name*.

    Loads root/spider configuration from *settings_module*, wires up
    queue/cache/proxy/command-interface options from the spider config,
    runs the spider and optionally saves a report under var/.

    Returns a dict with rendered spider stats and timing.
    """
    # NOTE(review): when disable_default_logs is True, default logging is
    # still configured, only with file logs suppressed — TODO confirm
    # the flag name matches the intent.
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    # Let the spider class declare extra CLI options; unknown args are
    # tolerated so other parsers may consume them.
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    # Optional subsystems, enabled only when present in the spider config.
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Ctrl-C stops the crawl but still renders stats / report below.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    # Timed variant goes into the saved report file below.
    stats_with_time = bot.render_stats(timing=True)
    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report'):
            # Write the report into a per-pid directory and into var/last.
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                # Report file is binary; make_str yields bytes for it.
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def build_spider_instance(cls, settings_module, **kwargs):
    """Create an instance of spider class *cls* from *settings_module*.

    Extra keyword arguments are accepted for interface compatibility but
    are not used here.
    """
    config = build_spider_config(cls, build_root_config(settings_module))
    return cls(config=config)
def test_build_root_config_minimal_settings(self):
    """Minimal settings leave the global config equal to the defaults."""
    root_cfg = build_root_config('tests.files.settings_minimal')
    self.assertEqual(root_cfg['global'], DEFAULT_SPIDER_GLOBAL_CONFIG)
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         disable_report=False, *args, **kwargs):
    """Build and run the spider named *spider_name*.

    Loads root/spider configuration from the *settings* module, wires up
    queue/cache/proxy/command-interface options, runs the spider and
    optionally saves a report under var/.

    Returns a dict with rendered spider stats and timing.
    """
    default_logging(propagate_network_logger=network_logs)
    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    # Let the spider class declare extra CLI options; unknown args are
    # tolerated so other parsers may consume them.
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = int(spider_config.get(
            'thread_number', deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    # Optional subsystems, enabled only when configured.
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)
    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)
    opt_proxy_list = spider_config.get(
        'proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)
    opt_ifaces = spider_config.get(
        'command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    # Ctrl-C stops the crawl but still renders stats / report below.
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(
        timing=spider_config.get('display_timing',
                                 deprecated_key='GRAB_DISPLAY_TIMING'))
    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            # Write the report into a per-pid directory and into var/last.
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                # BUG FIX: dict.iteritems() does not exist on Python 3;
                # .items() works on both Python 2 and 3.
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                # BUG FIX: the file is opened in binary mode, so the
                # textual stats must be encoded before writing (py3-safe).
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats.encode('utf-8'))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def test_build_root_config_minimal_settings(self):
    """With minimal settings the root global config equals the defaults."""
    cfg = build_root_config('test.files.settings_minimal')
    self.assertEqual(DEFAULT_SPIDER_GLOBAL_CONFIG, cfg['global'])