Code example #1
File: crawl.py Project: vasia123/grab
def main(spider_name, thread_number=None, slave=False, force_url=None,
         settings='settings', *args, **kwargs):
    default_logging(propagate_network_logger=kwargs.get(
        'propagate_network_logger', False))

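    # Non-slave runs take a per-spider lock file so that two instances
    # cannot crawl the same spider at once.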
    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

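    # Load the settings module, resolve the spider class by name and
    # build its merged configuration.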
    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
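    # Optional components (task queue, cache, proxy list) are wired up
    # only when present in the spider config.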
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
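    # A manual Ctrl-C stops the crawl but still lets the stats and
    # report code below run.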
    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)

    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added', 'var/task-add-error-%d.txt' % pid)

    if config.get('GRAB_SAVE_FINAL_STATS'):
        with open('var/stats-%d.txt' % pid, 'w') as out:
            out.write(stats)
Code example #2
File: views.py Project: ArturFis/grab
def grab_control_api(request, command):
    args = request.GET
    cls = load_spider_class(build_global_config(), args['spider'])
    spider = cls()
    iface = spider.controller.add_interface('redis')
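    # 'put_command' queues a command for the running spider over the
    # redis interface; 'pop_result' polls for its result.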
    if command == 'put_command':
        result_id = iface.put_command({'name': args['command']})
        return {'result_id': result_id}
    elif command == 'pop_result':
        result = iface.pop_result(args['result_id'])
        if result is None:
            return {'status': 'not-ready'}
        else:
            return {'data': result.get('data', ''),
                    'error': result.get('error', ''),
                    }
    else:
        return {'error': 'unknown-command'}
Code example #3
File: views.py Project: sergithon/grab
def grab_control_api(request, command):
    args = request.GET
    cls = load_spider_class(build_root_config(), args['spider'])
    spider = cls()
    iface = spider.controller.add_interface('redis')
    if command == 'put_command':
        result_id = iface.put_command({'name': args['command']})
        return {'result_id': result_id}
    elif command == 'pop_result':
        result = iface.pop_result(args['result_id'])
        if result is None:
            return {'status': 'not-ready'}
        else:
            return {'data': result.get('data', ''),
                    'error': result.get('error', ''),
                    }
    else:
        return {'error': 'unknown-command'}
Code example #4
def main(spider_name,
         thread_number=None,
         slave=False,
         settings='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         *args,
         **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

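    # A spider may declare its own command-line options; parse_known_args()
    # is used so that unrelated arguments do not raise an error.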
    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

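    # deprecated_key appears to provide a fallback to the old
    # GRAB_*-style option names.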
    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(
            spider_config.get('network_try_limit',
                              deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(
            spider_config.get('task_try_limit',
                              deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list',
                                       deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces',
                                   deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get(
        'display_timing', deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.iteritems():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Code example #5
File: crawl.py Project: webdeveloper0012/Grab
def main(spider_name,
         thread_number=None,
         settings_module='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         api_port=None,
         parser_pool_size=2,
         grab_log_file=None,
         network_log_file=None,
         network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )
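    # This newer variant adds per-file log routing, an optional HTTP API
    # port and pluggable network-service/transport backends.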

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

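    # Reports are written to both var/<pid> and var/last; an existing
    # directory is emptied before writing.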
    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
Code example #6
File: crawl.py Project: bodja/grab
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         disable_report=False,
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    if thread_number is None:
        thread_number = \
            int(spider_config.get('thread_number',
                                  deprecated_key='GRAB_THREAD_NUMBER'))

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=int(spider_config.get(
            'network_try_limit', deprecated_key='GRAB_NETWORK_TRY_LIMIT')),
        task_try_limit=int(spider_config.get(
            'task_try_limit', deprecated_key='GRAB_TASK_TRY_LIMIT')),
        args=spider_args,
    )
    opt_queue = spider_config.get('queue', deprecated_key='GRAB_QUEUE')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache', deprecated_key='GRAB_CACHE')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get(
        'proxy_list', deprecated_key='GRAB_PROXY_LIST')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get(
        'command_interfaces', deprecated_key='GRAB_COMMAND_INTERFACES')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(
        timing=spider_config.get('display_timing',
                                 deprecated_key='GRAB_DISPLAY_TIMING'))

    if spider_config.get('display_stats', deprecated_key='GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report', deprecated_key='GRAB_SAVE_REPORT'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.iteritems():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Code example #7
File: util_module.py Project: wyrover/grab
def test_load_spider_class(self):
    cfg = self.build_config(['test.util_module'])
    SPIDER_REGISTRY.clear()
    cls = load_spider_class(cfg, 'first_spider')
    self.assertEqual(cls, FirstSpider)
Code example #8
File: crawl.py Project: boooka/GeoPowerOff
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, trash = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_FATAL_ERRORS'):
        bot.save_list('fatal', 'var/fatal-%d.txt' % pid)

    if config.get('GRAB_SAVE_TASK_ADD_ERRORS'):
        bot.save_list('task-could-not-be-added', 'var/task-add-error-%d.txt' % pid)

    if config.get('GRAB_SAVE_FINAL_STATS'):
        with open('var/stats-%d.txt' % pid, 'w') as out:
            out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Code example #9
File: crawl.py Project: abael/grab
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         disable_report=False,
         disable_default_logs=False,
         *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)
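    # grab_log=None / network_log=None presumably suppresses the default
    # log files while keeping the rest of the logging setup.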

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, trash = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.items.items():
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
Code example #10
File: util_module.py Project: abael/grab
def test_load_spider_class(self):
    cfg = self.build_config(['test.util_module'])
    SPIDER_REGISTRY.clear()
    cls = load_spider_class(cfg, 'first_spider')
    self.assertEqual(cls, FirstSpider)
Code example #11
def main(spider_name, thread_number=None, slave=False,
         settings='settings', network_logs=False,
         disable_proxy=False,
         *args, **kwargs):
    default_logging(propagate_network_logger=network_logs)

    lock_key = None
    if not slave:
        lock_key = 'crawl.%s' % spider_name
    if lock_key is not None:
        lock_path = 'var/run/%s.lock' % lock_key
        logger.debug('Trying to lock file: %s' % lock_path)
        assert_lock(lock_path)

    config = build_global_config(settings)
    spider_class = load_spider_class(config, spider_name)
    spider_config = build_spider_config(spider_class, config)

    if hasattr(spider_class, 'setup_extra_args'):
        parser = ArgumentParser()
        spider_class.setup_extra_args(parser)
        extra_args, trash = parser.parse_known_args()
        spider_config['extra_args'] = vars(extra_args)

    if thread_number is None:
        thread_number = spider_config.getint('GRAB_THREAD_NUMBER')

    stat_task_object = kwargs.get('stat_task_object', None)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=spider_config.getint('GRAB_NETWORK_TRY_LIMIT'),
        task_try_limit=spider_config.getint('GRAB_TASK_TRY_LIMIT'),
    )
    if spider_config.get('GRAB_QUEUE'):
        bot.setup_queue(**spider_config['GRAB_QUEUE'])
    if spider_config.get('GRAB_CACHE'):
        bot.setup_cache(**spider_config['GRAB_CACHE'])
    if spider_config.get('GRAB_PROXY_LIST'):
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**spider_config['GRAB_PROXY_LIST'])
    if spider_config.get('GRAB_COMMAND_INTERFACES'):
        for iface_config in spider_config['GRAB_COMMAND_INTERFACES']:
            bot.controller.add_interface(**iface_config)

    # Dirty hack
    # FIXIT: REMOVE
    bot.dump_spider_stats = kwargs.get('dump_spider_stats')
    bot.stats_object = kwargs.get('stats_object')

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=config.get('GRAB_DISPLAY_TIMING'))

    if spider_config.get('GRAB_DISPLAY_STATS'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d' % pid)

    if config.get('GRAB_SAVE_REPORT'):
        for subdir in (str(pid), 'last'):
            dir_ = 'var/%s' % subdir
            if not os.path.exists(dir_):
                os.mkdir(dir_)
            else:
                clear_directory(dir_)
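            # Dump the error collections gathered during the crawl.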
            bot.save_list('fatal', '%s/fatal.txt' % dir_)
            bot.save_list('task-count-rejected', '%s/task_count_rejected.txt' % dir_)
            bot.save_list('network-count-rejected', '%s/network_count_rejected.txt' % dir_)
            bot.save_list('task-with-invalid-url', '%s/task_with_invalid_url.txt' % dir_)
            with open('%s/report.txt' % dir_, 'wb') as out:
                out.write(stats)

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }