def test_build_spider_config1(self):
    """A spider with no per-spider section inherits the global CACHE config.

    BUG FIX: the original built ``spider_config`` and then asserted on the
    *global* ``config``, so ``build_spider_config`` was never actually
    exercised; assert on ``spider_config`` instead.
    """
    modname = setup_settings_file({})
    default_config.default_config = {
        'CACHE': {'backend': 'mysql'},
        'VAR1': 'val1',
    }
    config = build_global_config(modname)
    spider_config = build_spider_config('foo', config)
    # No SPIDER_CONFIG_FOO override exists, so the spider-level CACHE
    # must equal the global default.
    self.assertEqual(spider_config['CACHE'], {'backend': 'mysql'})
def test_build_spider_config1(self):
    """A spider with no per-spider section inherits the global CACHE config.

    BUG FIX: the original asserted on the global ``config`` instead of the
    freshly built ``spider_config``, leaving ``build_spider_config``
    untested; assert on ``spider_config``.
    """
    modname = setup_settings_file({})
    default_config.default_config = {
        'CACHE': {'backend': 'mysql'},
        'VAR1': 'val1',
    }
    config = build_global_config(modname)
    spider_config = build_spider_config('foo', config)
    self.assertEqual(spider_config['CACHE'], {'backend': 'mysql'})
def test_build_spider_config2(self):
    """A SPIDER_CONFIG_FOO section overrides the global CACHE for spider 'foo'."""
    modname = setup_settings_file({})
    default_config.default_config = {
        'CACHE': {'backend': 'mysql'},
        'SPIDER_CONFIG_FOO': {
            'CACHE': {'backend': 'tokyo'},
        },
    }
    global_conf = build_global_config(modname)
    spider_conf = build_spider_config('foo', global_conf)
    # The per-spider override must win over the global default.
    self.assertEqual(spider_conf['CACHE'], {'backend': 'tokyo'})
def main(spider_name, thread_number=None, slave=False, force_url=None,
         settings='settings', *args, **kwargs):
    """Load, configure and run the spider registered as *spider_name*.

    Args:
        spider_name: registry name of the spider class to run.
        thread_number: number of network threads; falls back to the
            GRAB_THREAD_NUMBER setting when None.
        slave: when True the process does not take the crawl lock file.
        force_url: accepted for CLI compatibility; not used in this body.
        settings: name of the project settings module.

    Returns:
        None.
    """
    # BUG FIX: the original used kwargs['propagate_network_logger'], which
    # raises KeyError when the option is not passed; default to False.
    default_logging(
        propagate_network_logger=kwargs.get('propagate_network_logger', False))

    # Only a master process takes the crawl lock.
    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])

    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl but still lets the stats code below run.
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)
    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added',
                      'var/task-add-error-%d.txt' % pid)
    if config.get('GRAB_SAVE_FINAL_STATS'):
        # BUG FIX: open(...).write(...) leaked the file handle; use a
        # context manager so the file is always closed and flushed.
        with open('var/stats-%d.txt' % pid, 'wb') as out:
            out.write(stats)
def grab_control(request):
    """Render the spider control form, populating spider and command choices."""
    form = ControlForm(request.GET or None)

    # Spiders known to the registry become the <select> options.
    registry = build_spider_registry(build_global_config())
    spider_options = [(name, name) for name in registry.keys()]
    form.fields['spider'].choices = spider_options
    form.fields['spider'].widget.choices = spider_options

    # Same treatment for the commands a Spider can accept.
    command_options = [(name, name)
                       for name in Spider.get_available_command_names()]
    form.fields['command'].choices = command_options
    form.fields['command'].widget.choices = command_options

    return render(request, 'grabstat/control_form.html', {'form': form})
def test_build_spider_config2(self):
    """Per-spider CACHE settings shadow the global CACHE settings."""
    modname = setup_settings_file({})
    overrides = {
        'CACHE': {'backend': 'mysql'},
        'SPIDER_CONFIG_FOO': {
            'CACHE': {'backend': 'tokyo'},
        },
    }
    default_config.default_config = overrides
    config = build_global_config(modname)
    spider_config = build_spider_config('foo', config)
    self.assertEqual(spider_config['CACHE'], {'backend': 'tokyo'})
def grab_control_api(request, command):
    """Dispatch a spider control *command* over the redis command interface.

    Supported commands: 'put_command' (queue a command, return its id) and
    'pop_result' (fetch the result of a previously queued command).
    """
    params = request.GET
    spider_cls = load_spider_class(build_global_config(), params['spider'])
    iface = spider_cls().controller.add_interface('redis')

    if command == 'put_command':
        result_id = iface.put_command({'name': params['command']})
        return {'result_id': result_id}
    if command == 'pop_result':
        result = iface.pop_result(params['result_id'])
        if result is None:
            return {'status': 'not-ready'}
        return {
            'data': result.get('data', ''),
            'error': result.get('error', ''),
        }
    return {'error': 'unknown-command'}
def grab_control_api(request, command):
    """Handle a spider control API request via the redis command interface."""
    query = request.GET
    cls = load_spider_class(build_global_config(), query['spider'])
    spider = cls()
    iface = spider.controller.add_interface('redis')

    if command == 'put_command':
        # Queue the command and hand the caller an id to poll with.
        return {'result_id': iface.put_command({'name': query['command']})}
    elif command == 'pop_result':
        result = iface.pop_result(query['result_id'])
        if result is None:
            return {'status': 'not-ready'}
        payload = {
            'data': result.get('data', ''),
            'error': result.get('error', ''),
        }
        return payload
    else:
        return {'error': 'unknown-command'}
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, *args, **kwargs):
    """Load, configure and run the spider registered as *spider_name*.

    Args:
        spider_name: registry name of the spider class to run.
        thread_number: number of network threads; falls back to the
            GRAB_THREAD_NUMBER setting when None.
        slave: when True the process does not take the crawl lock file.
        settings: name of the project settings module.
        network_logs: propagate network-level log records when True.
        disable_proxy: skip loading the configured proxy list.

    Returns:
        dict with 'spider_stats' and 'spider_timing' report strings.
    """
    default_logging(propagate_network_logger=network_logs)

    # Only a master process takes the crawl lock.
    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    # Let the spider class register its own extra CLI arguments.
    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, trash = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    # CLEANUP: removed unused local `stat_task_object` (was assigned from
    # kwargs but never read).
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl but still lets the report code below run.
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if spider_config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_REPORT'):
        # Write the report into a pid-specific dir and into var/last.
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            bot.save_list('fatal', '%s/fatal.txt' % dir_)
            bot.save_list('task-count-rejected',
                          '%s/task_count_rejected.txt' % dir_)
            bot.save_list('network-count-rejected',
                          '%s/network_count_rejected.txt' % dir_)
            bot.save_list('task-with-invalid-url',
                          '%s/task_with_invalid_url.txt' % dir_)
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
import os
from argparse import ArgumentParser
import logging
from grab.tools.lock import assert_lock
from grab.tools.logs import default_logging
import sys
from grab.util.config import build_global_config
from grab.util.py3k_support import *

logger = logging.getLogger('grab.cli')

# NOTE(review): the global config is built at import time, so any error in
# the settings module surfaces as soon as this module is imported.
config = build_global_config()


def activate_env(env_path):
    """Activate a virtualenv by executing its activate_this.py script.

    NOTE(review): the *env_path* argument is ignored; the virtualenv path is
    read from the GRAB_ACTIVATE_VIRTUALENV setting instead -- confirm this
    is intended.
    """
    activate_script = os.path.join(config['GRAB_ACTIVATE_VIRTUALENV'],
                                   'bin/activate_this.py')
    # py3 hack: execfile() does not exist on Python 3, so compile+exec there.
    if PY3K:
        exec(compile(open(activate_script).read(), activate_script, 'exec'),
             dict(__file__=activate_script))
    else:
        execfile(activate_script, dict(__file__=activate_script))


def setup_logging(action, level):
    """Configure the root logger.

    NOTE(review): both *action* and *level* are unused in the visible body;
    the root logger is forced to DEBUG unconditionally.
    """
    root = logging.getLogger()
    root.setLevel(logging.DEBUG)
    #for hdl in root.handlers:
    #    root.removeHandler(hdl)
def test_build_global_config3(self):
    """The global config exposes CACHE from the default config unchanged."""
    modname = setup_settings_file({})
    default_config.default_config = {'CACHE': {'backend': 'mysql'}}
    conf = build_global_config(modname)
    self.assertEqual(conf['CACHE'], {'backend': 'mysql'})
def process_command_line():
    """Parse CLI arguments and dispatch to the requested action script.

    The action module is looked up first inside the grab package
    (grab.script.<action>), then inside the current project (script.<action>).
    Exits with status 1 when neither import succeeds.
    """
    # Add current directory to python path
    cur_dir = os.path.realpath(os.getcwd())
    sys.path.insert(0, cur_dir)

    process_env_option()

    parser = ArgumentParser()
    parser.add_argument('action', type=str)
    #parser.add_argument('positional_args', nargs='*')
    #parser.add_argument('-t', '--thread-number',
    #                    help='Number of network threads',
    #                    default=1, type=int)
    parser.add_argument('--logging-level', default='debug')
    #parser.add_argument('--slave', action='store_true', default=False)
    parser.add_argument('--lock-key')
    parser.add_argument('--ignore-lock', action='store_true', default=False)
    parser.add_argument('--settings', type=str, default='settings')
    parser.add_argument('--env', type=str)
    args, trash = parser.parse_known_args()

    config = build_global_config()

    if config and config['GRAB_DJANGO_SETTINGS']:
        os.environ['DJANGO_SETTINGS_MODULE'] = 'settings'
        # Turn off DEBUG to prevent memory leaks
        from django.conf import settings
        settings.DEBUG = False

    # Setup logging
    logging_level = getattr(logging, args.logging_level.upper())
    #if args.positional_args:
        #command_key = '_'.join([args.action] + args.positional_args)
    #else:
        #command_key = args.action
    # TODO: enable logs
    setup_logging(args.action, logging_level, clear_handlers=True)

    # Setup action handler
    action_name = args.action
    try:
        # First, try to import script from the grab package
        action_mod = __import__('grab.script.%s' % action_name,
                                None, None, ['foo'])
    except ImportError as ex:
        # BUG FIX: ImportError.message does not exist on Python 3 (and was
        # deprecated on Python 2.6+); use str(ex) to inspect the message.
        if (str(ex).startswith('No module named')
                and action_name in str(ex)):
            pass
        else:
            logging.error('', exc_info=ex)
        # If grab does not provide the script,
        # try to import it from the current project
        try:
            action_mod = __import__('script.%s' % action_name,
                                    None, None, ['foo'])
        except ImportError as ex:
            logging.error('', exc_info=ex)
            sys.stderr.write('Could not import %s script' % action_name)
            sys.exit(1)

    # Let the action module extend the parser, then re-parse strictly.
    if hasattr(action_mod, 'setup_arg_parser'):
        action_mod.setup_arg_parser(parser)
    args = parser.parse_args()

    # TODO: enable lock-file processing
    #lock_key = None
    #if not args.slave:
        #if not args.ignore_lock:
            #if not args.lock_key:
                #if hasattr(action_mod, 'setup_lock_key'):
                    #lock_key = action_mod.setup_lock_key(action_name, args)
                #else:
                    #lock_key = command_key
            #else:
                #lock_key = args.lock_key
    #if lock_key is not None:
        #lock_path = 'var/run/%s.lock' % lock_key
        #print 'Trying to lock file: %s' % lock_path
        #assert_lock(lock_path)

    logger.debug('Executing %s action' % action_name)
    try:
        action_mod.main(**vars(args))
    except Exception as ex:
        # Top-level boundary: log the failure instead of crashing the CLI.
        logging.error('Unexpected exception from action handler:',
                      exc_info=ex)
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, *args, **kwargs):
    """Load, configure and run the spider registered as *spider_name*.

    Args:
        spider_name: registry name of the spider class to run.
        thread_number: number of network threads; falls back to the
            GRAB_THREAD_NUMBER setting when None.
        slave: when True the process does not take the crawl lock file.
        settings: name of the project settings module.
        network_logs: propagate network-level log records when True.

    Returns:
        dict with 'spider_stats' and 'spider_timing' report strings.
    """
    default_logging(propagate_network_logger=network_logs)

    # Only a master process takes the crawl lock.
    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    # Let the spider class register its own extra CLI arguments.
    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, trash = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    # CLEANUP: removed unused local `stat_task_object` (assigned, never read).
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl but still lets the stats code below run.
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))
    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)
    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added',
                      'var/task-add-error-%d.txt' % pid)
    if config.get('GRAB_SAVE_FINAL_STATS'):
        # BUG FIX: open(...).write(...) leaked the file handle; use a
        # context manager so the file is always closed and flushed.
        with open('var/stats-%d.txt' % pid, 'wb') as out:
            out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
def main(spider_name, thread_number=None, slave=False, settings='settings',
         network_logs=False, disable_proxy=False, ignore_lock=False,
         *args, **kwargs):
    """Load, configure and run the spider registered as *spider_name*.

    Args:
        spider_name: registry name of the spider class to run.
        thread_number: number of network threads; falls back to the
            thread_number / GRAB_THREAD_NUMBER setting when None.
        slave: when True the process does not take the crawl lock file.
        settings: name of the project settings module.
        network_logs: propagate network-level log records when True.
        disable_proxy: skip loading the configured proxy list.
        ignore_lock: skip lock-file handling entirely.

    Returns:
        dict with 'spider_stats' and 'spider_timing' report strings.
    """
    default_logging(propagate_network_logger=network_logs)

    if not ignore_lock:
        # Only a master process takes the crawl lock.
        lock_key = None
        if not slave:
            lock_key = 'crawl.%s' % spider_name
        if lock_key is not None:
            lock_path = 'var/run/%s.lock' % lock_key
            logger.debug('Trying to lock file: %s' % lock_path)
            assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    # Let the spider class register its own extra CLI arguments.
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = int(spider_config.get(
            'thread_number', deprecated_key='GRAB_THREAD_NUMBER'))

    # CLEANUP: removed unused local `stat_task_object` (assigned, never read).
    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )

    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list',
                                       deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces',
                                   deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        # Ctrl-C stops the crawl but still lets the report code below run.
        pass

    stats = bot.render_stats(timing=spider_config.get(
        'display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))
    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)
    if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
        # Write the report into a pid-specific dir and into var/last.
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            # BUG FIX: dict.iteritems() is Python 2 only and this codebase
            # ships py3k support; .items() iterates identically on both.
            for key, lst in bot.items.items():
                fname_key = key.replace('-', '_')
                bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
import os from argparse import ArgumentParser import logging from grab.tools.lock import assert_lock from grab.tools.logs import default_logging import sys from grab.util.config import build_global_config from grab.util.py3k_support import * logger = logging.getLogger('grab.cli') config = build_global_config() def activate_env(env_path): activate_script = os.path.join(config['GRAB_ACTIVATE_VIRTUALENV'], 'bin/activate_this.py') # py3 hack if PY3K: exec(compile(open(activate_script).read(), activate_script, 'exec'), dict(__file__=activate_script)) else: execfile(activate_script, dict(__file__=activate_script)) def setup_logging(action, level): root = logging.getLogger() root.setLevel(logging.DEBUG) #for hdl in root.handlers: