Code Example #1
File: test_crawler.py  Project: pferdwurst/parkyou
    def setUp(self):
        crawler = CrawlerProcess(settings)
        crawler.install()
        # inside_project() reports whether we are running inside a Scrapy
        # project (it checks the settings module / scrapy.cfg); the return
        # value is ignored here.
        inside_project()
        self.items = []

        self.crawl_cmd = scrapy.commands.crawl.Command() 
        self.crawl_cmd.set_crawler(crawler)

        self.parser = optparse.OptionParser()
        self.crawl_cmd.add_options(self.parser)
        dispatcher.connect(self._item_passed, signals.item_passed)
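
Note: this example appears to target the old (pre-1.0) Scrapy API, where CrawlerProcess had an install() step and commands received the crawler via set_crawler(). For comparison, a minimal sketch of the modern equivalent ('myspider' is a hypothetical spider name registered in the project):

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# Instantiate CrawlerProcess directly; no install()/set_crawler() step is needed.
process = CrawlerProcess(get_project_settings())
process.crawl('myspider')  # hypothetical spider name
process.start()  # blocks until the crawl finishes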
Code Example #2
File: cmdline.py  Project: reenvs/self-summary
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    _check_deprecated_scrapy_ctl(argv, inproject) # TODO: remove for Scrapy 0.11
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #3
File: cmdline.py  Project: Root-nix/scrapy
def execute(argv=None):
    if argv is None:
        argv = sys.argv
    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #4
File: cmdline.py  Project: wanglikang/RetardedSpider
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # cmds contains every command found under the scrapy.commands package
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    # This is where the CrawlerProcess is actually created
    cmd.crawler_process = CrawlerProcess(settings)
    # run the command through the CrawlerProcess
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #5
def get_project_root():
    """
    Returns the absolute path of the root of the project, and raise an exception
    if the current directory is not inside a project path
    """
    os.path.abspath('.')
    if inside_project():
        return os.path.dirname(closest_scrapy_cfg())
    raise Exception(os.getcwd(), " does not belong to a Scrapy project")
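
A hypothetical usage sketch for the helper above: resolving an output path relative to the project root (it raises when run outside a Scrapy project).

import os

root = get_project_root()
output_dir = os.path.join(root, 'exports')  # hypothetical subdirectory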
Code Example #6
def execute(argv=None, settings=None):
    print("in cmdline line 98 : i am in eclipes")
    print("in cmdline line 98 argv %s" % argv)
    print("in cmdline line 99 settings %s" % settings)
    if argv is None:
        argv = sys.argv

    print("in cmdline line 101 argv %s" % argv)
    print("in cmdline line 102 settings %s" % settings)
    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()

    print("in cmdline line 115 settings %s" % settings)
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    # Startup really begins from this step
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #7
def get_scrapy_data_path(createdir=True):
    """ Return a path to a folder where Scrapy is storing data.
    Usually that's a .scrapy folder inside the project.
    """
    # This code is extracted from scrapy.utils.project.data_path function,
    # which does too many things.
    path = project_data_dir() if inside_project() else ".scrapy"
    if createdir:
        os.makedirs(path, exist_ok=True)
    return path
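
A hypothetical usage sketch for the helper above: placing a cache file inside Scrapy's data directory (the .scrapy folder when inside a project).

import os

cache_file = os.path.join(get_scrapy_data_path(), 'my_cache.json')  # hypothetical file name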
Code Example #8
File: cmdline.py  Project: JohnDoes95/project_parser
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError: pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #9
File: cmdline.py  Project: xtmhm2000/scrapy-0.22
def execute(argv=None, settings=None):
    # If the argv parameter is None, read the arguments from the command line
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # Check whether settings was passed in and whether 'scrapy.conf' is among the loaded modules
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        # Check whether the object has an attribute with the given name
        # (hasattr works by checking whether getattr(object, name) raises)
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    # If settings is still None, fall back to the project settings
    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #10
def main():
    opts, args = parse_opts()
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)

    install_opener(
        build_opener(HTTPRedirectHandler)
    )

    if opts.list_targets:
        for name, target in _get_targets().items():
            print("%-20s %s" % (name, target['url']))
        return

    if opts.list_projects:
        target = _get_target(opts.list_projects)
        req = Request(_url(target, 'listprojects.json'))
        _add_auth_header(req, target)
        f = urlopen(req)
        projects = json.loads(f.read())['projects']
        print(os.linesep.join(projects))
        return

    tmpdir = None

    if opts.build_egg:  # build egg only
        egg, tmpdir = _build_egg()
        dest = opts.build_egg
        if(os.path.isdir(dest)):
            dest = os.path.join(dest, str(int(time.time())) + '.egg')
        _log("Writing egg to %s" % dest)
        shutil.copyfile(egg, dest)
    elif opts.deploy_all_targets:
        version = None
        for name, target in _get_targets().items():
            if version is None:
                version = _get_version(target, opts)
            _build_egg_and_deploy_target(target, version, opts)
    else:  # build egg and deploy
        target_name = _get_target_name(args)
        target = _get_target(target_name)
        version = _get_version(target, opts)
        exitcode, tmpdir = _build_egg_and_deploy_target(target, version, opts)

    if tmpdir:
        if opts.debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)

    sys.exit(exitcode)
Code Example #11
File: cmdline.py  Project: bf96163/scrapy
def execute(argv=None, settings=None):
    # If no argv was passed in explicitly, fall back to the command-line arguments
    if argv is None:
        argv = sys.argv  # the first element of sys.argv is always the script name, so an emptiness check would not work here

    if settings is None:
        # No settings were given, so load the defaults:
        # read settings from scrapy.cfg, then overlay Scrapy-related settings from the environment
        settings = get_project_settings()
        # set EDITOR from environment if available (used for editing files)
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor

    # Check whether we are inside a project (first try importing the settings
    # module, otherwise look for a scrapy.cfg file)
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)  # all commands available in the current context
    cmdname = _pop_command_name(argv)  # which command was requested on the command line
    # Build an optparse parser (optparse has since been superseded by argparse)
    parser = optparse.OptionParser(
        formatter=optparse.TitledHelpFormatter(),
        conflict_handler='resolve')

    if not cmdname:  # no command name could be parsed
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:  # the parsed name is not an available command
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]  # look up the command object
    ## Parse the command
    parser.usage = f"scrapy {cmdname} {cmd.syntax()}"
    parser.description = cmd.long_desc()
    # Merge the command's default settings into the settings object
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    ## Command parsing done

    # Run the command: let it process the options parsed above
    _run_print_help(parser, cmd.process_options, args, opts)
    # Create the CrawlerProcess
    cmd.crawler_process = CrawlerProcess(settings)
    # Run the command (its run() method) with the parsed args and opts
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
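
The comments in the example above describe the check itself: inside_project() first tries to import the settings module named in the environment, and otherwise looks for a scrapy.cfg file. A minimal sketch of that logic (not the exact scrapy.utils.project source; the names _closest_scrapy_cfg and inside_project_sketch are placeholders):

import os
from importlib import import_module

def _closest_scrapy_cfg(path='.'):
    # Walk upward from `path` looking for a scrapy.cfg file; return '' if none is found.
    path = os.path.abspath(path)
    cfg = os.path.join(path, 'scrapy.cfg')
    if os.path.exists(cfg):
        return cfg
    parent = os.path.dirname(path)
    return '' if parent == path else _closest_scrapy_cfg(parent)

def inside_project_sketch():
    # A project is detected either via an importable SCRAPY_SETTINGS_MODULE
    # or via a scrapy.cfg somewhere above the working directory.
    settings_module = os.environ.get('SCRAPY_SETTINGS_MODULE')
    if settings_module:
        try:
            import_module(settings_module)
            return True
        except ImportError:
            pass
    return bool(_closest_scrapy_cfg())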
Code Example #12
File: cmdline.py  Project: ZhaiQiliang/scrapy
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and "scrapy.conf" in sys.modules:
        from scrapy import conf

        if hasattr(conf, "settings"):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning

    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf

        conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), conflict_handler="resolve")
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #13
File: scrapyd.py  Project: AugustLONG/sportslab
def main():
    opts, args = parse_opts()
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)

    urllib2.install_opener(urllib2.build_opener(HTTPRedirectHandler))

    if opts.list_targets:
        for name, target in _get_targets().items():
            print "%-20s %s" % (name, target['url'])
        return

    if opts.list_projects:
        target = _get_target(opts.list_projects)
        req = urllib2.Request(_url(target, 'listprojects.json'))
        _add_auth_header(req, target)
        f = urllib2.urlopen(req)
        projects = json.loads(f.read())['projects']
        print os.linesep.join(projects)
        return

    tmpdir = None

    if opts.build_egg:  # build egg only
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % opts.build_egg)
        shutil.copyfile(egg, opts.build_egg)
    else:  # build egg and deploy
        target_name = _get_target_name(args)
        target = _get_target(target_name)
        project = _get_project(target, opts)
        version = _get_version(target, opts)
        if opts.egg:
            _log("Using egg: %s" % opts.egg)
            egg = opts.egg
        else:
            _log("Packing version %s" % version)
            egg, tmpdir = _build_egg()
        if not _upload_egg(target, egg, project, version):
            exitcode = 1

    if tmpdir:
        if opts.debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)

    sys.exit(exitcode)
Code Example #14
File: scrapyd.py  Project: mathhomework/sportslab
def main():
    opts, args = parse_opts()
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)

    urllib2.install_opener(urllib2.build_opener(HTTPRedirectHandler))

    if opts.list_targets:
        for name, target in _get_targets().items():
            print "%-20s %s" % (name, target['url'])
        return

    if opts.list_projects:
        target = _get_target(opts.list_projects)
        req = urllib2.Request(_url(target, 'listprojects.json'))
        _add_auth_header(req, target)
        f = urllib2.urlopen(req)
        projects = json.loads(f.read())['projects']
        print os.linesep.join(projects)
        return

    tmpdir = None

    if opts.build_egg: # build egg only
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % opts.build_egg)
        shutil.copyfile(egg, opts.build_egg)
    else: # build egg and deploy
        target_name = _get_target_name(args)
        target = _get_target(target_name)
        project = _get_project(target, opts)
        version = _get_version(target, opts)
        if opts.egg:
            _log("Using egg: %s" % opts.egg)
            egg = opts.egg
        else:
            _log("Packing version %s" % version)
            egg, tmpdir = _build_egg()
        if not _upload_egg(target, egg, project, version):
            exitcode = 1

    if tmpdir:
        if opts.debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)

    sys.exit(exitcode)
Code Example #15
def deploy():
    opts, args = parse_opts()
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)

    _delete_old_package()

    urllib2.install_opener(urllib2.build_opener(HTTPRedirectHandler))

    if opts.list_targets:
        for name, target in _get_targets().items():
            print "%-20s %s" % (name, target['url'])
        return

    if opts.list_projects:
        target = _get_target(opts.list_projects)
        req = urllib2.Request(_url(target, 'listprojects.json'))
        _add_auth_header(req, target)
        f = urllib2.urlopen(req)
        projects = json.loads(f.read())['projects']
        print os.linesep.join(projects)
        return

    tmpdir = None

    # build egg only
    if opts.build_egg:
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % opts.build_egg)
        shutil.copyfile(egg, opts.build_egg)
    elif opts.deploy_all_targets:
        version = None
        for name, target in _get_targets().items():
            if version is None:
                version = _get_version(target, opts)
            _build_egg_and_deploy_target(target, version, opts)
    else: # build egg and deploy
        target_name = _get_target_name(args)
        target = _get_target(target_name)
        version = _get_version(target, opts)
        exitcode, tmpdir = _build_egg_and_deploy_target(target, version, opts)

    if tmpdir:
        if opts.debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)
            _delete_old_package()
Code Example #16
def execute(argv=None, settings=None):
    """
    解析命令; 利用CrawlerProcess构建 crawler; 
    """
    # 1. 初始化执行环境; 读取用户自定义配置; 读取默认配置; 检查弃用配置项
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)  # check for deprecated settings

    # 2. Check whether we are running inside a project, i.e. whether a scrapy.cfg can be found near the directory the scrapy command is executed from
    inproject = inside_project()

    # 3. Load all commands Scrapy supports (the commands package) and resolve the requested one
    cmds = _get_commands_dict(settings, inproject)  # load all commands
    cmdname = _pop_command_name(argv)  # parse the command name
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    # 4. Get the command instance, apply its defaults at 'command' priority, and add its option parsing rules
    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])  # parse the command line
    _run_print_help(parser, cmd.process_options, args, opts)

    # 5. Initialize CrawlerProcess and execute the command instance's run method (e.g. scrapy/commands/crawl.py)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)  # run the corresponding command's run() method
    sys.exit(cmd.exitcode)
Code Example #17
File: cmdline.py  Project: tskylee/scrapy
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    from scrapy import conf
    conf.settings = settings
    # ------------------------------------------------------------------

    crawler = CrawlerProcess(settings)
    crawler.install()
    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.defaults.update(cmd.default_settings)
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)
    cmd.set_crawler(crawler)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #18
File: cli.py  Project: ryonlife/scrapy-autounit
    def __init__(self, parser):
        self.parser = parser
        self.args = parser.parse_args()

        if not inside_project():
            self.error("No active Scrapy project")

        self.command = self.args.command

        self.spider = sanitize_module_name(self.args.spider)
        self.callback = self.args.callback
        self.fixture = self.args.fixture

        self.project_dir = get_project_dir()
        sys.path.append(self.project_dir)

        self.settings = get_project_settings()

        base_path = self.settings.get(
            'AUTOUNIT_BASE_PATH',
            default=os.path.join(self.project_dir, 'autounit'))
        self.tests_dir = os.path.join(base_path, 'tests')

        self.spider_dir = os.path.join(self.tests_dir, self.spider)

        if not os.path.isdir(self.spider_dir):
            self.error(
                "No recorded data found "
                "for spider '{}'".format(self.spider))

        extra_path = self.settings.get('AUTOUNIT_EXTRA_PATH') or ''
        self.callback_dir = os.path.join(
            self.spider_dir, extra_path, self.callback)

        if not os.path.isdir(self.callback_dir):
            self.error(
                "No recorded data found for callback "
                "'{}' from '{}' spider".format(self.callback, self.spider))

        if self.fixture:
            self.fixture_path = os.path.join(
                self.callback_dir, self.parse_fixture_arg())
            if not os.path.isfile(self.fixture_path):
                self.error("Fixture '{}' not found".format(self.fixture_path))
Code Example #19
    def __init__(self, parser):
        self.parser = parser
        self.args = parser.parse_args()

        if not inside_project():
            self._error("No active Scrapy project")

        self.command = self.args.command

        self.spider = self.args.spider
        self.callback = self.args.callback
        self.fixture = self.args.fixture

        self.project_dir = get_project_dir()
        sys.path.append(self.project_dir)

        self.settings = get_project_settings()

        base_path = get_base_path(self.settings)
        self.tests_dir = os.path.join(base_path, 'tests')

        if self.spider:
            self.spider = sanitize_module_name(self.spider)
            self.callbacks_dir = self._get_callbacks_dir(self.spider)
            if not os.path.isdir(self.callbacks_dir):
                self._error("No recorded data found for spider '{}'".format(
                    self.spider))

            if self.callback:
                self.callback_dir = os.path.join(self.callbacks_dir,
                                                 self.callback)
                if not os.path.isdir(self.callback_dir):
                    self._error("No recorded data found for callback "
                                "'{}' from '{}' spider".format(
                                    self.callback, self.spider))

                if self.fixture:
                    self.fixture_path = os.path.join(self.callback_dir,
                                                     self.parse_fixture_arg())
                    if not os.path.isfile(self.fixture_path):
                        self._error("Fixture '{}' not found".format(
                            self.fixture_path))
Code Example #20
def cli(target, project, version, list_targets, debug, egg, build_egg):
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)

    if list_targets:
        for name, target in _get_targets().items():
            click.echo(name)
        return

    tmpdir = None

    if build_egg:  # build egg only
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % build_egg)
        shutil.copyfile(egg, build_egg)
    else:  # build egg and deploy
        target = _get_target(target)
        project = _get_project(target, project)
        version = _get_version(target, version)
        if egg:
            _log("Using egg: %s" % egg)
            egg = egg
        else:
            _log("Packing version %s" % version)
            egg, tmpdir = _build_egg()
        if _upload_egg(target, egg, project, version):
            click.echo(
                "Run your spiders at: https://dash.scrapinghub.com/p/%s/" %
                project)
        else:
            exitcode = 1

    if tmpdir:
        if debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)

    sys.exit(exitcode)
Code Example #21
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)  # check for deprecated settings

    inproject = inside_project()  # check whether we are inside a project; commands may behave differently depending on this
    cmds = _get_commands_dict(settings, inproject)  # get all commands as a dict of command name -> command object
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]  # get the command object instance for the requested command
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')  # merge cmd.default_settings (empty here) into the settings object
    cmd.settings = settings  # attach the settings object to the command
    cmd.add_options(parser)  # add the command's option parsing rules
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)  # create the CrawlerProcess instance and attach it to the command
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #22
def execute(argv=None, settings=None):
    if argv is None:
        argv = sys.argv

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    inproject = inside_project()
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
Code Example #23
File: deploy.py  Project: imclab/shub
def cli(target, project, version, list_targets, debug, egg, build_egg):
    exitcode = 0
    if not inside_project():
        _log("Error: no Scrapy project found in this location")
        sys.exit(1)

    if list_targets:
        for name, target in _get_targets().items():
            click.echo(name)
        return

    tmpdir = None

    if build_egg:
        egg, tmpdir = _build_egg()
        _log("Writing egg to %s" % build_egg)
        shutil.copyfile(egg, build_egg)
    else:
        target = _get_target(target)
        project = _get_project(target, project)
        version = _get_version(target, version)
        if egg:
            _log("Using egg: %s" % egg)
            egg = egg
        else:
            _log("Packing version %s" % version)
            egg, tmpdir = _build_egg()
        if _upload_egg(target, egg, project, version):
            click.echo("Run your spiders at: https://dash.scrapinghub.com/p/%s/" % project)
        else:
            exitcode = 1

    if tmpdir:
        if debug:
            _log("Output dir not removed: %s" % tmpdir)
        else:
            shutil.rmtree(tmpdir)

    sys.exit(exitcode)
Code Example #24
File: deploy.py  Project: umrashrf/shub
def cli(target, project, version, list_targets, debug, egg, build_egg):
    if not inside_project():
        log("Error: no Scrapy project found in this location")
        sys.exit(1)

    if list_targets:
        for name, target in scrapycfg.get_targets().items():
            click.echo(name)
        return

    tmpdir = None

    try:
        if build_egg:
            egg, tmpdir = _build_egg()
            log("Writing egg to %s" % build_egg)
            shutil.copyfile(egg, build_egg)
        else:
            target = scrapycfg.get_target(target)
            project = scrapycfg.get_project(target, project)
            version = scrapycfg.get_version(target, version)
            apikey = target.get('username') or find_api_key()
            auth = (apikey, '')

            if egg:
                log("Using egg: %s" % egg)
                egg = egg
            else:
                log("Packing version %s" % version)
                egg, tmpdir = _build_egg()

            _upload_egg(target, egg, project, version, auth)
            click.echo("Run your spiders at: https://dash.scrapinghub.com/p/%s/" % project)
    finally:
        if tmpdir:
            if debug:
                log("Output dir not removed: %s" % tmpdir)
            else:
                shutil.rmtree(tmpdir, ignore_errors=True)
Code Example #25
File: cmdline.py  Project: pointworld/scrapy-analysis
def execute(argv=None, settings=None):
    ## Assume the function is invoked as execute(['scrapy', 'crawl', 'spidername'])

    if argv is None:
        argv = sys.argv

    # --- backward compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        ## Load the project settings:
        ## initialize the environment from environment variables and scrapy.cfg, producing a Settings instance
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    ## Check for deprecated settings
    check_deprecated_settings(settings)

    # --- backward compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    ## Whether we are running inside a project, mainly determined by whether a scrapy.cfg file exists
    inproject = inside_project()
    ## Read the commands package and turn all command classes into a {cmd_name: cmd_instance, ...} dict
    cmds = _get_commands_dict(settings, inproject)
    ## Parse which subcommand is being executed from the command-line arguments
    ## e.g. if the command line is `scrapy crawl xxx`, cmdname here is 'crawl'
    cmdname = _pop_command_name(argv)
    ## The optparse module makes it easy to build clear, standard Unix-style
    ## command-line help
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    ## If cmdname is empty, print help for all commands and exit
    if not cmdname:
        _print_commands(settings, inproject)
        sys.exit(0)
    ## If cmdname is set but not a key of the cmds dict, print an unknown-command error and exit with a non-zero code
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    ## Look up the command instance by name
    cmd = cmds[cmdname]
    ## cmd.syntax() returns the command's usage string
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    ## cmd.long_desc() returns the command's description
    parser.description = cmd.long_desc()
    ## Apply the command's default settings at 'command' priority
    settings.setdict(cmd.default_settings, priority='command')
    ## Attach the settings to the command instance
    cmd.settings = settings
    ## Add the command's option parsing rules
    cmd.add_options(parser)
    ## Parse the command-line arguments and hand them to the Scrapy command instance
    ## For `scrapy crawl xxx`, argv[1:] is ['crawl', 'xxx']
    ## opts = {'logfile': xxx, 'loglevel': xxx, ...}
    ## args = ['xxx']
    opts, args = parser.parse_args(args=argv[1:])
    ## Run the command's process_options method; if it fails, print the relevant help before exiting
    ## Before running the command, extra options may update the corresponding settings at the command-line level
    _run_print_help(parser, cmd.process_options, args, opts)

    ## Based on the settings, create a CrawlerProcess (crawler process) instance
    ## and assign it to the command instance's crawler_process attribute
    cmd.crawler_process = CrawlerProcess(settings)
    ## Run the command instance's run method
    ## e.g. for `scrapy crawl <spider_name>` this is the run method
    ## of commands/crawl.py
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
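
The walkthrough above assumes a call of the form execute(['scrapy', 'crawl', 'spidername']). A minimal sketch of invoking it programmatically from inside a project ('myspider' is a hypothetical spider name; note that execute() ends with sys.exit, so it terminates the calling process unless the exit is caught):

from scrapy.cmdline import execute

try:
    # Equivalent to running `scrapy crawl myspider` from a project directory.
    execute(['scrapy', 'crawl', 'myspider'])
except SystemExit as exc:
    # execute() always calls sys.exit(); capture the exit code instead of exiting.
    print('scrapy exited with code', exc.code)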
Code Example #26
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-s',
                        '--spider',
                        help='The spider where to look fixtures for')
    parser.add_argument(
        '-c',
        '--callback',
        help='The callback where to look fixtures for (requires spider)')
    parser.add_argument(
        '-f',
        '--fixture',
        help=('The fixture number to inspect (requires spider and callback). '
              'It can be an integer indicating the fixture number or a string '
              'indicating the fixture name'))
    parser.add_argument('-p',
                        '--path',
                        help='The full path for the fixture to inspect')

    args = parser.parse_args()

    if args.path:
        retcode = handle_path(args.path)
        sys.exit(retcode)

    if not inside_project():
        print('No active Scrapy project')
        sys.exit(1)

    if not args.spider:
        print('Must specify a spider')
        parser.print_help()
        sys.exit(1)

    if not args.callback:
        print('Must specify a callback')
        parser.print_help()
        sys.exit(1)

    if not args.fixture:
        print('Must specify a fixture')
        parser.print_help()
        sys.exit(1)

    settings = get_project_settings()
    base_path = settings.get('AUTOUNIT_BASE_PATH',
                             default=os.path.join(get_project_dir(),
                                                  'autounit'))

    tests_dir = os.path.join(base_path, 'tests')

    if not os.path.isdir(tests_dir):
        print('Autounit tests directory not found\n')
        sys.exit(1)

    args.fixture = parse_fixture_arg(args.fixture)

    extra_path = settings.get('AUTOUNIT_EXTRA_PATH') or ''
    path = os.path.join(tests_dir, args.spider, extra_path, args.callback,
                        args.fixture)

    retcode = handle_path(path)
    sys.exit(retcode)
Code Example #27
File: cli.py  Project: ThomasAitken/Scrapy-Testmaster
    def __init__(self, parser):
        self.parser = parser
        self.args = parser.parse_args()

        if not inside_project():
            self.error("No active Scrapy project")

        self.command = self.args.command

        self.spider = sanitize_module_name(self.args.spider) if \
            self.args.spider else None
        try:
            self.callback = self.args.callback
        except AttributeError:
            self.callback = None
        try:
            self.fixture = self.args.fixture
        except AttributeError:
            self.fixture = None

        if self.command == 'update':
            try:
                self.new = self.args.new
            except AttributeError:
                self.new = None
            try:
                self.dynamic = self.args.dynamic
            except AttributeError:
                self.dynamic = None

        if self.command == 'clear':
            self.fixtures = self.args.fixtures.split(',')

        if self.fixture and not self.callback:
            self.error("Can't specify a fixture without a callback")

        self.project_dir, self.project_name = get_project_dirs()
        sys.path.append(self.project_dir)

        self.settings = get_project_settings()

        if self.command == "parse":
            url_list = [url.strip() for url in self.args.urls.split('|')]
            for url in url_list:
                if not is_url(url):
                    self.error("Something went wrong with your urls arg! "
                               "Note that as of version 1.0, the character for separating "
                               "multiple urls is '|', as opposed to ','")

            self.args = process_options(self.args)
            crawler_process = CrawlerProcess(self.settings)
            run_command(crawler_process, url_list, self.args)

        else:
            self.base_path = self.settings.get(
                'TESTMASTER_BASE_PATH',
                default=os.path.join(self.project_dir, 'testmaster'))
            self.tests_dir = os.path.join(self.base_path, 'tests')

            self.spider_dir = os.path.join(self.tests_dir, self.spider)

            if not os.path.isdir(self.spider_dir) and self.command != "establish":
                self.error(
                    "No recorded data found "
                    "for spider '{}'".format(self.spider))

            self.extra_path = self.settings.get('TESTMASTER_EXTRA_PATH') or ''
            if self.callback:
                self.callback_dir = os.path.join(
                    self.spider_dir, self.extra_path, self.callback)

                if self.command == 'establish':
                    if os.path.isdir(self.callback_dir):
                        self.error(
                            "Can't use 'establish' with callback arg "
                            "if callback dir for spider '{}' "
                            "exists already".format(self.spider))
            else:
                if self.command == 'inspect':
                    self.error(
                        "No recorded data found for callback "
                        "'{}' from '{}' spider".format(self.callback, self.spider))

            if self.fixture:
                self.fixture_path = os.path.join(self.callback_dir,
                                                 self.parse_fixture_arg())
                if not os.path.isfile(self.fixture_path):
                    self.error("Fixture '{}' not found".format(self.fixture_path))
Code Example #28
def execute(argv=None, settings=None):
    # Get the command-line arguments
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    # Backward compatibility with the scrapy.conf singleton (this configuration style is reported as deprecated)
    if settings is None and 'scrapy.conf' in sys.modules:
        from . import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    # Load the project settings
    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    # Check for deprecated settings (warns about settings that are no longer supported)
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings  # keep the legacy scrapy.conf.settings singleton in sync
    # ------------------------------------------------------------------
    # Determine whether we are inside a project by looking for a scrapy.cfg file;
    # this is later compared against each command's requires_project attribute to decide whether the command is available
    inproject = inside_project()
    # Get the command dict (key = command name, value = command instance)
    cmds = _get_commands_dict(settings, inproject)
    cmdname = _pop_command_name(argv)  # parse and return the current command name
    # Option parser for the command
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(), \
        conflict_handler='resolve')
    if not cmdname:
        # No command given: print the list of available commands
        _print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        _print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)  # unknown command: exit with a non-zero status
    # Get the command object
    cmd = cmds[cmdname]
    # Command usage/syntax
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    # Command description
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings,
                     priority='command')  # merge the command's settings; duplicate keys override the earlier global values
    cmd.settings = settings  # assign the merged settings back to the command's settings attribute
    cmd.add_options(parser)  # add the command's option parsing rules
    opts, args = parser.parse_args(args=argv[1:])
    _run_print_help(parser, cmd.process_options, args, opts)  # process the common command options

    # Create the crawler process object that starts and runs the spiders (note: this step may fail at runtime; reinstalling cryptography has been reported to help)
    cmd.crawler_process = CrawlerProcess(settings)
    _run_print_help(parser, _run_command, cmd, args, opts)
    sys.exit(cmd.exitcode)
def execute(argv=None, settings=None):
    '''
    Run a Scrapy command. The original command exits the program when crawling finishes, so we modified it.
    :param argv: the command to run
    :param settings: the settings to use
    :return:
    '''
    if argv is None:
        argv = sys.argv

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    if settings is None and 'scrapy.conf' in sys.modules:
        from scrapy import conf
        if hasattr(conf, 'settings'):
            settings = conf.settings
    # ------------------------------------------------------------------

    if settings is None:
        settings = get_project_settings()
        # set EDITOR from environment if available
        try:
            editor = os.environ['EDITOR']
        except KeyError:
            pass
        else:
            settings['EDITOR'] = editor
    check_deprecated_settings(settings)

    # --- backwards compatibility for scrapy.conf.settings singleton ---
    import warnings
    from scrapy.exceptions import ScrapyDeprecationWarning
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", ScrapyDeprecationWarning)
        from scrapy import conf
        conf.settings = settings
    # ------------------------------------------------------------------

    inproject = inside_project()
    cmds = cmdline._get_commands_dict(settings, inproject)
    cmdname = cmdline._pop_command_name(argv)
    parser = optparse.OptionParser(formatter=optparse.TitledHelpFormatter(),
                                   conflict_handler='resolve')
    if not cmdname:
        cmdline._print_commands(settings, inproject)
        sys.exit(0)
    elif cmdname not in cmds:
        cmdline._print_unknown_command(settings, cmdname, inproject)
        sys.exit(2)

    cmd = cmds[cmdname]
    parser.usage = "scrapy %s %s" % (cmdname, cmd.syntax())
    parser.description = cmd.long_desc()
    settings.setdict(cmd.default_settings, priority='command')
    cmd.settings = settings
    cmd.add_options(parser)
    opts, args = parser.parse_args(args=argv[1:])
    cmdline._run_print_help(parser, cmd.process_options, args, opts)

    cmd.crawler_process = CrawlerProcess(settings)
    try:
        cmdline._run_print_help(parser, cmdline._run_command, cmd, args, opts)
    except Exception as e:
        print('Done')