Example 1
    def test_build_spider_config_empty(self):
        class TestSpider(Spider):
            pass

        root_cfg = build_root_config('tests.files.settings_minimal')
        cfg = build_spider_config(TestSpider, root_cfg)
        self.assertEqual(cfg, DEFAULT_SPIDER_GLOBAL_CONFIG)
Example 2
    def test_build_spider_config_empty(self):
        class TestSpider(Spider):
            pass

        root_cfg = build_root_config('test.files.settings_minimal')
        cfg = build_spider_config(TestSpider, root_cfg)
        self.assertEqual(cfg, DEFAULT_SPIDER_GLOBAL_CONFIG)
Example 3
 def test_build_spider_config1(self):
     modname = setup_settings_file({})
     default_config.default_config = {
         'CACHE': {'backend': 'mysql'},
         'VAR1': 'val1',
     }
     config = build_global_config(modname)
     spider_config = build_spider_config('foo', config)
     self.assertEqual(config['CACHE'], {'backend': 'mysql'})
Example 4
 def test_build_spider_config1(self):
     modname = setup_settings_file({})
     default_config.default_config = {
         'CACHE': {
             'backend': 'mysql'
         },
         'VAR1': 'val1',
     }
     config = build_global_config(modname)
     spider_config = build_spider_config('foo', config)
     self.assertEqual(config['CACHE'], {'backend': 'mysql'})
Example 5
 def test_build_spider_config_overwrite(self):
     class TestSpider(Spider):
         pass

     root_cfg = build_root_config('test.files.settings_test_spider')
     cfg = build_spider_config(TestSpider, root_cfg)
     for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
         if key == 'spider_modules':
             self.assertEqual(cfg[key], ['zzz'])
         elif key == 'thread_number':
             self.assertEqual(cfg[key], 777)
         else:
             self.assertEqual(cfg[key], val)
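
The overrides asserted here come from the 'test.files.settings_test_spider' module, whose contents the test does not show. A settings file consistent with the assertions might look like the sketch below; the SPIDER_CONFIG variable and its 'global' section are assumed names, since the test only proves that spider_modules and thread_number end up overridden.

# Hypothetical test/files/settings_test_spider.py. Only the two
# overridden values are pinned down by the test above; SPIDER_CONFIG
# and the 'global' section are assumptions about the settings layout.
SPIDER_CONFIG = {
    'global': {
        'spider_modules': ['zzz'],
        'thread_number': 777,
    },
}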
Example 6
 def test_build_spider_config2(self):
     modname = setup_settings_file({})
     default_config.default_config = {
         'CACHE': {'backend': 'mysql'},
         'SPIDER_CONFIG_FOO': {
             'CACHE': {'backend': 'tokyo'},
         },
     }
     config = build_global_config(modname)
     spider_config = build_spider_config('foo', config)
     self.assertEqual(spider_config['CACHE'], {'backend': 'tokyo'})
Example 7
def main(spider_name, thread_number=None, slave=False, force_url=None,
         settings='settings', *args, **kwargs):
    default_logging(propagate_network_logger=kwargs.get(
        'propagate_network_logger', False))

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)

    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added',
                      'var/task-add-error-%d.txt' % pid)

    if config.get('GRAB_SAVE_FINAL_STATS'):
        with open('var/stats-%d.txt' % pid, 'w') as out:
            out.write(stats)
Example 8
    def test_setup_spider_config(self):
        class TestSpider(Spider):
            @classmethod
            def setup_spider_config(cls, config):
                config['foo'] = 'bar'

        root_cfg = build_root_config('tests.files.settings_minimal')
        cfg = build_spider_config(TestSpider, root_cfg)
        for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
            if key == 'foo':
                self.assertEqual(cfg[key], 'bar')
            else:
                self.assertEqual(cfg[key], val)
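
Examples 1, 5 and 8 together pin down the observable contract of build_spider_config: start from DEFAULT_SPIDER_GLOBAL_CONFIG, apply overrides from the root config, then give the spider class the last word. The following is a minimal sketch of logic consistent with those tests, not Grab's actual implementation; the 'global' key is an assumption.

from copy import deepcopy

def build_spider_config(spider_class, root_config):
    # DEFAULT_SPIDER_GLOBAL_CONFIG is the defaults dict the tests
    # compare against; it is assumed to be in scope here.
    # Defaults survive untouched under a minimal settings module
    # (Example 1).
    config = deepcopy(DEFAULT_SPIDER_GLOBAL_CONFIG)
    # Settings-module overrides win over the defaults (Example 5:
    # spider_modules and thread_number are replaced).
    config.update(root_config.get('global', {}))
    # The spider class may mutate the final result (this example:
    # the setup_spider_config hook injects the 'foo' key).
    if hasattr(spider_class, 'setup_spider_config'):
        spider_class.setup_spider_config(config)
    return config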
Example 9
    def test_setup_spider_config(self):
        class TestSpider(Spider):
            @classmethod
            def setup_spider_config(cls, config):
                config['foo'] = 'bar'

        root_cfg = build_root_config('test.files.settings_minimal')
        cfg = build_spider_config(TestSpider, root_cfg)
        for key, val in DEFAULT_SPIDER_GLOBAL_CONFIG.items():
            if key == 'foo':
                self.assertEqual(cfg[key], 'bar')
            else:
                self.assertEqual(cfg[key], val)
Example 10
 def test_build_spider_config2(self):
     modname = setup_settings_file({})
     default_config.default_config = {
         'CACHE': {
             'backend': 'mysql'
         },
         'SPIDER_CONFIG_FOO': {
             'CACHE': {
                 'backend': 'tokyo'
             },
         },
     }
     config = build_global_config(modname)
     spider_config = build_spider_config('foo', config)
     self.assertEqual(spider_config['CACHE'], {'backend': 'tokyo'})
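
Examples 3/4 and 6/10 document the older, name-based merge: the global config keeps its own 'CACHE' value, while the per-spider view is patched from a SPIDER_CONFIG_<NAME> section. Below is a sketch of merge logic that would satisfy both assertions; it is an assumption, not the library's actual code.

from copy import deepcopy

def build_spider_config(spider_name, global_config):
    # Copy everything except the per-spider sections, leaving the
    # global config untouched (Examples 3 and 4 assert that
    # config['CACHE'] still reads 'mysql' afterwards).
    spider_config = deepcopy({
        key: val for key, val in global_config.items()
        if not key.startswith('SPIDER_CONFIG_')
    })
    # Then patch in the section named after the spider (Examples 6
    # and 10: SPIDER_CONFIG_FOO switches the backend to 'tokyo').
    section = 'SPIDER_CONFIG_%s' % spider_name.upper()
    spider_config.update(deepcopy(global_config.get(section, {})))
    return spider_config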
Example 11
def build_spider_instance(cls, settings_module):
    root_config = build_root_config(settings_module)
    spider_config = build_spider_config(cls, root_config)
    return cls(config=spider_config)
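
A call site for this helper might look like the following; MySpider and 'myproject.settings' are placeholder names.

# Hypothetical usage of build_spider_instance.
bot = build_spider_instance(MySpider, 'myproject.settings')
bot.run()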
Example 12
def main(spider_name,
         thread_number=None,
         settings_module='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         api_port=None,
         parser_pool_size=2,
         grab_log_file=None,
         network_log_file=None,
         network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
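
This main() is clearly meant to be driven from the command line. A minimal, assumed wiring is sketched below; the real Grab runner may expose different flag names. parse_known_args leaves unrecognised options alone so that main() can re-parse them for the spider via setup_arg_parser.

# Hypothetical CLI wrapper around main(); flag names mirror its
# keyword parameters but are not taken from the actual Grab runner.
from argparse import ArgumentParser

def cli():
    parser = ArgumentParser()
    parser.add_argument('spider_name')
    parser.add_argument('--thread-number', type=int, default=None)
    parser.add_argument('--settings-module', default='settings')
    # Spider-specific flags are intentionally left unparsed here.
    opts, _ = parser.parse_known_args()
    main(**vars(opts))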
Example 13
File: crawl.py Project: bodja/grab
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False, 
         disable_report=False,
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get(
        'proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get(
        'command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(
        timing=spider_config.get('display_timing',
                                 deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'w') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
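
The deprecated_key arguments in this example bridge the legacy GRAB_* settings names to the newer lowercase keys. A lookup behaving the way these calls suggest might read as follows; this is an assumed sketch, not the config class's actual method.

import warnings

def get_with_deprecated(config, key, deprecated_key=None, default=None):
    # Prefer the new-style lowercase key; fall back to the legacy
    # GRAB_* name with a deprecation warning.
    if key in config:
        return config[key]
    if deprecated_key is not None and deprecated_key in config:
        warnings.warn('%s is deprecated, use %s instead'
                      % (deprecated_key, key), DeprecationWarning)
        return config[deprecated_key]
    return default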
Example 14
def build_spider_instance(cls, settings_module, **kwargs):
    root_config = build_root_config(settings_module)
    spider_config = build_spider_config(cls, root_config)
    return cls(config=spider_config)
Example 15
File: crawl.py Project: abael/grab
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         disable_report=False,
         disable_default_logs=False,
         *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Example 16
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, _ = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)

    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added',
                      'var/task-add-error-%d.txt' % pid)

    if config.get('GRAB_SAVE_FINAL_STATS'):
        with open('var/stats-%d.txt' % pid, 'w') as out:
            out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Example 17
def main(spider_name,
         thread_number=None,
         slave=False,
         settings='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         *args,
         **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(
            spider_config.get('network_try_limit',
                              deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(
            spider_config.get('task_try_limit',
                              deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list',
                                       deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces',
                                   deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get(
        'display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'w') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Example 18
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False, 
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, _ = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if spider_config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_REPORT'):
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
            bot.save_list('fatal', '%s/fatal.txt' % dir_)
            bot.save_list('task-count-rejected',
                          '%s/task_count_rejected.txt' % dir_)
            bot.save_list('network-count-rejected',
                          '%s/network_count_rejected.txt' % dir_)
            bot.save_list('task-with-invalid-url',
                          '%s/task_with_invalid_url.txt' % dir_)
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }