Example #1
def get_start_requests(project_path, spider_name):
    """
    get start requests
    :param project_path: project path
    :param spider_name: spider name
    :return: dict with a 'finished' flag and the processed start requests
    """
    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        # load settings
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        # add crawler
        spider_cls = runner.spider_loader.load(spider_name)
        runner.crawl(spider_cls)
        # get crawler
        crawler = list(runner.crawlers)[0]
        # get spider by crawler
        spider = crawler.spider
        # get start requests
        requests = list(spider.start_requests())
        if not requests and hasattr(spider, 'start'):
            requests = list(spider.start())
        requests = [process_request(r) for r in requests]
        return {'finished': True, 'requests': requests}
    finally:
        os.chdir(work_cwd)
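A minimal usage sketch for the function above, assuming a hypothetical project path and spider name (both are placeholders, not taken from the original):

# Hypothetical invocation of get_start_requests; the path and spider name are placeholders.
# The function returns a dict such as {'finished': True, 'requests': [...]}.
result = get_start_requests('/path/to/project', 'quotes')
for req in result['requests']:
    print(req)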
Example #2
def execute(url, project, spider, callback, **kwargs):
    argv = sys.argv
    
    print(argv)
    
    argv.append(url)
    if spider:
        argv.append('--spider')
        argv.append(spider)
    if callback:
        argv.append('--callback')
        argv.append(callback)
    
    print(argv)
    
    work_cwd = os.getcwd()
    print(work_cwd)
    try:
        os.chdir(project)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        cmd = Parser()
        parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), conflict_handler='resolve')
        settings.setdict(cmd.default_settings, priority='command')
        cmd.settings = settings
        cmd.add_options(parser)
        opts, args = parser.parse_args(args=argv[1:])
        print('opt, args', opts, args)
        cmd.process_options(args, opts)
        cmd.crawler_process = CrawlerProcess(settings)
        cmd.run(args, opts)
    finally:
        os.chdir(work_cwd)
Example #3
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # cmds contains every command found under the scrapy.commands package
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    # the real entry point that runs CrawlerProcess is here
    cmd.crawler_process = CrawlerProcess(settings)
    # run the CrawlerProcess
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #4
def execute(argv=None, settings=None):
    print("in cmdline line 98 : i am in eclipes")
    print("in cmdline line 98 argv %s" % argv)
    print("in cmdline line 99 settings %s" % settings)
    if argv is None:
        argv = sys.argv

    print("in cmdline line 101 argv %s" % argv)
    print("in cmdline line 102 settings %s" % settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()

    print("in cmdline line 115 settings %s" % settings)
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    # execution actually starts from this step
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #5
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #6
def execute(argv=None, settings=None):
    # if the argv parameter is None, read the arguments from the command line
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # check whether settings was passed in and whether 'scrapy.conf' is among the loaded modules
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        # check whether the object has an attribute with the given name (hasattr works by calling getattr(object, name) and catching the exception)
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    # if settings is None, fall back to the project settings
    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #7
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and "scrapy.conf" in sys.modules:
        from scrapy import conf

        if hasattr(conf, "settings"):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf

        conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), conflict_handler="resolve")
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #8
def execute(url, project_path, spider_name, callback, result, *arg, **kwargs):
    """
    execute parsing
    :param url: url
    :param project_path: project path
    :param spider_name: spider name
    :param callback: callback
    :param result: results generated by multiprocessing
    :return: 
    """
    argv = sys.argv
    argv.append(url)
    if spider_name:
        argv.append('--spider')
        argv.append(spider_name)
    if callback:
        argv.append('--callback')
        argv.append(callback)

    work_cwd = os.getcwd()
    try:
        # change work dir
        os.chdir(project_path)
        print('Move to ', project_path)
        # get settings of project
        settings = get_project_settings()
        check_deprecated_settings(settings)
        # get args by optparse
        parser = optparse.OptionParser(
            formatter=optparse.TitledHelpFormatter(),
            conflict_handler='resolve')
        # init SpiderParser
        spider_parser = SpiderParser()
        settings.setdict(spider_parser.default_settings, priority='command')
        spider_parser.settings = settings
        spider_parser.add_options(parser)
        opts, _ = parser.parse_args(args=argv[1:])
        args = [url]
        spider_parser.process_options(args, opts)
        # use CrawlerRunner instead of CrawlerProcess
        spider_parser.crawler_process = CrawlerRunner(settings)
        # spider_parser.crawler_process = CrawlerProcess(settings)
        spider_parser.run(args, opts)
        # get follow requests, items, response
        requests, items, response = (spider_parser.get_requests(),
                                     spider_parser.get_items(),
                                     spider_parser.get_response())
        result['requests'] = requests
        result['items'] = items
        result['response'] = response
    finally:
        os.chdir(work_cwd)
Example #9
def execute(argv=None, settings=None):
    """
    解析命令; 利用CrawlerProcess构建 crawler; 
    """
    # 1. 初始化执行环境; 读取用户自定义配置; 读取默认配置; 检查弃用配置项
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)  # check for deprecated settings

    # 2. Check whether the execution environment is inside a project: i.e. whether scrapy.cfg exists in (or near) the directory where the scrapy command is run
    inproject = inside_project()

    # 3. Load all commands currently supported by scrapy (the commands folder) and parse the command name
    cmds = _get_commands_dict(settings, inproject)  # load all commands
    cmdname = _pop_command_name(argv)  # parse the command name
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # 4. Get the command instance, apply its default settings at 'command' priority, and add its parsing rules
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])  # parse the command line
    _run_print_help(parser, cmd.process_options, args, opts)

    # 5. Initialize CrawlerProcess and execute the command instance's run method (e.g. scrapy/commands/crawl.py)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)  # run the corresponding command's run method
    sys.exit(cmd.exitcode)
Example #10
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    from scrapy import conf
    conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #11
def get_follow_requests_and_items(project_path, spider_name, args):
    """
    get follows
    :param project_path:
    :param spider_name:
    :param args:
    :return:
    """
    work_cwd = os.getcwd()
    try:
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        sp = SpiderParser(settings, spider_name, args)
        results = sp.run()
        return results
    finally:
        os.chdir(work_cwd)
Example #12
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)  # check for deprecated settings

    inproject = inside_project()  # check whether we are inside a project; a command's availability depends on the environment
    cmds = _get_commands_dict(settings, inproject)  # get all commands as a dict keyed by command name, with command objects as values
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]  # get the command object instance for the given command name
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings,
                     priority='command')  # merge cmd.default_settings (empty here) into the settings object's attributes
    cmd.settings = settings  # attach the settings object to the command object
    cmd.add_options(parser)  # add the command's option-parsing rules
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)  # initialize a CrawlerProcess instance and attach it to the current command
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #13
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Example #14
def detect_project_spiders(project_path):
    """
    检测爬虫工程下所有爬虫脚本及其文件路径
    :param project_path:
    :return:
    """
    work_cwd = os.getcwd()
    try:
        os.chdir(project_path)
        settings = get_project_settings()
        check_deprecated_settings(settings)
        runner = CrawlerRunner(settings=settings)
        #
        # easy_spiders = runner.spiders
        #
        spiders = dict()
        spider_dict = runner.spider_loader._spiders
        for spider_name, spider_cls in spider_dict.items():
            spiders[spider_name] = spider_cls.__module__ + ".py"
    finally:
        ENVVAR = 'SCRAPY_SETTINGS_MODULE'
        del os.environ[ENVVAR]
        os.chdir(work_cwd)
    return spiders
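A minimal usage sketch for detect_project_spiders, assuming a hypothetical project path; the returned dict maps each spider name to the module file that defines it:

# Hypothetical invocation; the project path is a placeholder, not from the original.
spiders = detect_project_spiders('/path/to/project')
for name, module_path in spiders.items():
    print(name, '->', module_path)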
Example #15
def run():
    # read the configuration file
    setting = get_project_settings()
    check_deprecated_settings(setting)
Example #16
def execute(argv=None, settings=None):
    ## Assume this function is being invoked as execute(['scrapy', 'crawl', 'spidername'])

    if argv is None:
        argv = sys.argv

    # --- backward compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        ## Get the project settings
        ## Initialize the environment from environment variables and scrapy.cfg, ultimately producing a Settings instance
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    ## Check for deprecated settings
    check_deprecated_settings(settings)

    # --- backward compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    ## Check whether we are running inside a project, mainly by looking for the scrapy.cfg config file
    inproject = inside_project()
    ## Read the commands folder and turn every command class into a {cmd_name: cmd_instance, ...} dict
    cmds = _get_commands_dict(settings, inproject)
    ## Work out which subcommand is being executed from the command-line arguments
    ## For example, if the command line is `scrapy crawl xxx`, cmdname here is 'crawl'
    cmdname = _pop_command_name(argv)
    ## The optparse module makes it easy to produce clear, easy-to-use, standard
    ## Unix-style command-line help output
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    ## If cmdname is empty, print help for all commands and exit the Python program
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    ## If cmdname is non-empty but not a key of the cmds dict, print an unknown-command error and exit with an error status
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    ## Look up the command instance by its name
    cmd = cmds[cmdname]
    ## cmd.syntax() returns the command's usage string
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    ## cmd.long_desc() returns the command's description
    parser.description = cmd.long_desc()
    ## Apply the command instance's default settings with priority 'command'
    settings.setdict(cmd.default_settings, priority='command')
    ## Attach the settings to the command instance
    cmd.settings = settings
    ## Add the command instance's option-parsing rules
    cmd.add_options(parser)
    ## Parse the command arguments and hand them over to the Scrapy command instance
    ## For `scrapy crawl xxx`, argv[1:] is ['crawl', 'xxx']
    ## opts = {'logfile': xxx, 'loglevel': xxx, ...}
    ## args = ['xxx']
    opts, args = parser.parse_args(args=argv[1:])
    ## Run the command instance's process_options method; if it errors, print the relevant help and then exit
    ## Before the command runs, optional flags can update the corresponding settings from the command line
    _run_print_help(parser, cmd.process_options, args, opts)

    ## Initialize a CrawlerProcess instance from the settings
    ## and assign it to the command instance's crawler_process attribute
    cmd.crawler_process = CrawlerProcess(settings)
    ## Run the command instance's run method
    ## If the command is scrapy crawl <spider_name>, this executes the
    ## run method of commands/crawl.py
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
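Following the assumption stated at the top of this example, a minimal way to drive execute() programmatically (the spider name is a placeholder); note that execute() ends with sys.exit(), so nothing after the call runs:

# Equivalent to running `scrapy crawl spidername` from inside the project directory.
from scrapy.cmdline import execute
execute(['scrapy', 'crawl', 'spidername'])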
Example #17
def execute(argv=None, settings=None):
    # get the command-line input arguments
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # backward compatibility with the scrapy.conf singleton; in practice this just warns that this configuration style is no longer supported
    if settings is None and 'scrapy.conf' in sys.modules:
        from . import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    # get the project settings
    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    # check for deprecated settings (warns which settings are no longer valid; this looks like a bolted-on patch)
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings  # attach the project settings to the conf singleton
    # ------------------------------------------------------------------
    # Decide whether we are in a project directory by looking for the cfg file; compared later against each command's requires_project attribute to decide whether the command is available
    inproject = inside_project()
    # get the command dict (key = command name, value = command instance)
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)  # parse and return the current command name
    # command-line parser object
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        # no command given: print the command overview (essentially the same header --version prints)
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)  # an unknown command makes the program exit with an error code
    # get the command object
    cmd = cmds[cmdname]
    # command syntax
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    # command description
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings,
                     priority='command')  # merge the command's settings; duplicate keys override the earlier global settings
    cmd.settings = settings  # after the override, assign the settings back to the command's settings attribute
    cmd.add_options(parser)  # add the command's option-parsing rules
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)  # process the command options (prints help on error)

    # create the crawler process object that starts and runs the spider (note: this step can fail; reinstalling cryptography may fix it)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    '''
    Run a Scrapy command. The original command exits the program when crawling finishes, so we modified it.
    :param argv: command arguments
    :param settings: settings
    :return:
    '''
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = cmdline._get_commands_dict(settings, inproject)
    cmdname = cmdline._pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        cmdline._print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        cmdline._print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    cmdline._run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    try:
        cmdline._run_print_help(parser, cmdline._run_command, cmd, args, opts)
    except Exception as e:
        print('Done')
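A minimal sketch of how the modified execute() above might be driven from a wrapper script (the spider name is a placeholder); per its docstring it is meant to return rather than exit when crawling finishes, so code after the call still runs:

# Hypothetical wrapper usage; the modified execute() above omits the final sys.exit(),
# so the wrapper script continues after the crawl is done.
execute(['scrapy', 'crawl', 'spidername'])
print('Crawl finished, continuing with the rest of the script')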