# Assumption: default_logging comes from weblib.logs, as in the other
# snippets here; LOG_LEVEL is a module-level constant.
from weblib.logs import default_logging


def setup_logging():
    # Set up default logging: append ('a' mode) to the grab and network
    # logs under var/, keep the network logger from propagating to root.
    default_logging(grab_log='var/grab.log',
                    level=LOG_LEVEL,
                    mode='a',
                    propagate_network_logger=False,
                    network_log='var/grab.network.log')
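# Usage sketch (assumptions: the var/ directory must exist before the file
# handlers open, and LOG_LEVEL is normally defined by the real module --
# the value below is illustrative).
import logging
import os

LOG_LEVEL = logging.DEBUG  # assumed value for this sketch

os.makedirs('var', exist_ok=True)
setup_logging()
logging.getLogger('grab').debug('recorded in var/grab.log')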
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False, disable_report=False,
         disable_default_logs=False, *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)

    # Resolve the spider class and its config from the settings module.
    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    # Let the spider contribute its own command-line options, if any.
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )

    # Optional subsystems are enabled only when present in the config.
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    # Write the report into var/<pid>/ and var/last/.
    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
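# Invocation sketch (assumptions: a settings.py module on the import path
# provides the spider registry, and 'example' names a spider that
# load_spider_class() can resolve).
if __name__ == '__main__':
    result = main('example', thread_number=2, disable_report=True)
    print(result['spider_stats'])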
def hideme_parse():
    default_logging(grab_log=grab_settings.GRAB_LOG,
                    network_log=grab_settings.NETWORK_LOG)
    bot = HidemeProxies(thread_number=1)
    bot.run()
def foxtools_parse():
    default_logging(grab_log=grab_settings.GRAB_LOG,
                    network_log=grab_settings.NETWORK_LOG)
    bot = FoxToolsProxy(thread_number=1)
    bot.run()
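# Entry-point sketch: both runners configure the same log files through
# grab_settings, so they can be chained in one process.
if __name__ == '__main__':
    hideme_parse()
    foxtools_parse()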
task_help = "task to run (active marked with '*'):\n{}".format( ", ".join(tasks_colored)) parser.add_argument('-T', '--task', type=str, help=task_help) parser.add_argument('-c', '--celery', action="store_true", default=False, help='run as celery task') args = parser.parse_args() return args if __name__ == '__main__': # noqa colorama_init() default_logging(grab_log='var/grab.log', level=LOG_LEVEL, mode='w', propagate_network_logger=False, network_log='var/grab.network.log') parser = argparse.ArgumentParser(description='command line interface') args = command_line_interface(parser) if args.task: if args.task == 'all': for task in ACTIVE_TASKS: scraper = ScrapersRunInterface.crawl(task) logger.info(scraper.render_stats()) if args.task not in TASKS.keys(): logger.critical( u"Can't find crawler in list: {}".format( TASKS.keys())) if args.celery:
# -*- coding: utf-8 -*-
from __future__ import unicode_literals  # py2

import logging

from grab import Grab
from grab.spider import Task
from weblib.logs import default_logging

from spiders import BaseSpider

logging.basicConfig(level=logging.INFO)
logging.getLogger().addHandler(logging.FileHandler('/tmp/parse.log'))
default_logging()


class InitialSpider(BaseSpider):
    """Spider for initial data grabbing"""

    BASE_STREAM_URL = BaseSpider.BASE_URL + '/ru/post/stream.json'
    initial_urls = (BASE_STREAM_URL,)

    def task_initial(self, grab, task):
        # Posts sorted ascending by numeric id; the first element is used
        # as the starting offset.
        sorted_posts = sorted(grab.response.json['posts'],
                              key=lambda x: int(x['id']))
        past_jump = max_offset = int(sorted_posts[0]['id'])
        g = self.create_grab_instance()
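# Run sketch, following the bot-and-run idiom used by the other spiders in
# this collection; the thread count is illustrative, and BaseSpider is
# assumed to accept the standard grab.spider constructor arguments.
if __name__ == '__main__':
    bot = InitialSpider(thread_number=2)
    bot.run()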