Code example #1
File: script_start_project.py  Project: abael/grab
    def test_explicit_template_name(self):
        old_dir = os.getcwd()
        os.chdir(TMP_DIR)
        test_dir = os.path.dirname(__file__)
        project_sample_path = os.path.join(test_dir, 'files/project_sample')
        start_project.main(project_name='test_project',
                           template=project_sample_path)
        self.assertTrue(
            os.path.exists(os.path.join(TMP_DIR, 'test_project/foo.py')))
        os.chdir(old_dir)

        dir_ = os.path.join(TMP_DIR, 'test_project')
        clear_directory(dir_)
        os.rmdir(dir_)
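For reference, the same entry point can be driven outside the test harness. A minimal sketch, assuming start_project is importable as grab.script.start_project (the project name and template directory below are placeholders):

import os
from grab.script import start_project  # assumed import path

# Generate a project skeleton from a custom template directory,
# mirroring the keyword arguments used in the test above.
template_dir = os.path.join(os.path.dirname(__file__), 'files/project_sample')
start_project.main(project_name='my_project', template=template_dir)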
Code example #2
File: script_start_project.py  Project: abael/grab
    def test_start_project(self):
        old_dir = os.getcwd()
        os.chdir(TMP_DIR)
        dir_ = os.path.join(TMP_DIR, 'test_project')
        start_project.main(project_name='test_project', template=None)
        os.chdir(old_dir)
        self.assertTrue(os.path.exists(os.path.join(TMP_DIR,
                                                    'test_project/var')))
        self.assertTrue(os.path.exists(os.path.join(TMP_DIR,
                                                    'test_project/var/log')))
        self.assertTrue(os.path.exists(os.path.join(TMP_DIR,
                                                    'test_project/var/run')))
        path = os.path.join(TMP_DIR, 'test_project/spider.py')
        with open(path) as inp:
            self.assertTrue('TestProjectSpider' in inp.read())
        clear_directory(dir_)
        os.rmdir(dir_)
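The assertion on 'TestProjectSpider' implies that the generator derives the spider class name by camel-casing the underscore-separated project name and appending 'Spider'. A minimal sketch of that convention (the helper name is hypothetical):

def make_spider_class_name(project_name):
    # 'test_project' -> 'TestProjectSpider' (assumed naming convention)
    return ''.join(part.capitalize()
                   for part in project_name.split('_')) + 'Spider'

assert make_spider_class_name('test_project') == 'TestProjectSpider'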
Code example #3
File: crawl.py  Project: webdeveloper0012/Grab
def main(spider_name,
         thread_number=None,
         settings_module='settings',
         network_logs=False,
         disable_proxy=False,
         ignore_lock=False,
         disable_report=False,
         api_port=None,
         parser_pool_size=2,
         grab_log_file=None,
         network_log_file=None,
         network_service=None,
         grab_transport=None,
         **kwargs):  # pylint: disable=unused-argument
    default_logging(
        grab_log=grab_log_file,
        network_log=network_log_file,
        propagate_network_logger=network_logs,
    )

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
        http_api_port=api_port,
        parser_pool_size=parser_pool_size,
        network_service=network_service,
        grab_transport=grab_transport,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats()

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.makedirs(dir_)
                else:
                    clear_directory(dir_)
                for key, lst in bot.stat.collections.items():
                    fname_key = key.replace('-', '_')
                    save_list(lst, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats))

    return {
        'spider_stats': bot.render_stats(),
    }
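Called programmatically, main only requires the spider name; every other keyword argument falls back to the defaults in the signature above. A minimal invocation sketch (the import path and spider name are assumptions):

from grab.script import crawl  # assumed import path

result = crawl.main('example_spider', thread_number=4,
                    settings_module='settings')
print(result['spider_stats'])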
Code example #4
File: crawl.py  Project: abael/grab
def main(spider_name, thread_number=None, slave=False,
         settings_module='settings', network_logs=False,
         disable_proxy=False, ignore_lock=False,
         disable_report=False,
         disable_default_logs=False,
         *args, **kwargs):
    if disable_default_logs:
        default_logging(propagate_network_logger=network_logs,
                        grab_log=None, network_log=None)
    else:
        default_logging(propagate_network_logger=network_logs)

    root_config = build_root_config(settings_module)
    spider_class = load_spider_class(root_config, spider_name)
    spider_config = build_spider_config(spider_class, root_config)

    spider_args = None
    if hasattr(spider_class, 'setup_arg_parser'):
        parser = ArgumentParser()
        spider_class.setup_arg_parser(parser)
        opts, _ = parser.parse_known_args()
        spider_args = vars(opts)

    bot = spider_class(
        thread_number=thread_number,
        slave=slave,
        config=spider_config,
        network_try_limit=None,
        task_try_limit=None,
        args=spider_args,
    )
    opt_queue = spider_config.get('queue')
    if opt_queue:
        bot.setup_queue(**opt_queue)

    opt_cache = spider_config.get('cache')
    if opt_cache:
        bot.setup_cache(**opt_cache)

    opt_proxy_list = spider_config.get('proxy_list')
    if opt_proxy_list:
        if disable_proxy:
            logger.debug('Proxy servers disabled via command line')
        else:
            bot.load_proxylist(**opt_proxy_list)

    opt_ifaces = spider_config.get('command_interfaces')
    if opt_ifaces:
        for iface_config in opt_ifaces:
            bot.controller.add_interface(**iface_config)

    try:
        bot.run()
    except KeyboardInterrupt:
        pass

    stats = bot.render_stats(timing=spider_config.get('display_timing'))
    stats_with_time = bot.render_stats(timing=True)

    if spider_config.get('display_stats'):
        logger.debug(stats)

    pid = os.getpid()
    logger.debug('Spider pid is %d', pid)

    if not disable_report:
        if spider_config.get('save_report'):
            for subdir in (str(pid), 'last'):
                dir_ = 'var/%s' % subdir
                if not os.path.exists(dir_):
                    os.mkdir(dir_)
                else:
                    clear_directory(dir_)
                for key in bot.items:
                    fname_key = key.replace('-', '_')
                    bot.save_list(key, '%s/%s.txt' % (dir_, fname_key))
                with open('%s/report.txt' % dir_, 'wb') as out:
                    out.write(make_str(stats_with_time))

    return {
        'spider_stats': bot.render_stats(timing=False),
        'spider_timing': bot.render_timing(),
    }
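Both versions probe the spider class for an optional setup_arg_parser hook before instantiating it, so a spider can expose extra command-line options that come back through args=spider_args. A sketch of such a hook (the class name and option are hypothetical):

from grab.spider import Spider  # assumed import path

class ExampleSpider(Spider):
    @classmethod
    def setup_arg_parser(cls, parser):
        # Parsed by main() via parse_known_args() and handed to the
        # spider constructor as args=spider_args.
        parser.add_argument('--start-url', default='https://example.com')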