Example #1
0
    def run(self, args, opts):
        """Create a new project directory from the template directory.

        ``args`` holds the project name and, optionally, the destination
        directory; the destination defaults to the project name. When the
        destination already contains a ``scrapy.cfg`` file, or the name is
        rejected by ``_is_valid_name``, ``self.exitcode`` is set to 1 and
        nothing is created.

        Raises:
            UsageError: if ``args`` does not hold one or two values.
        """
        if not 1 <= len(args) <= 2:
            raise UsageError()

        project_name = args[0]
        # Without an explicit destination the project name doubles as the
        # directory name.
        project_dir = args[1] if len(args) == 2 else project_name

        if exists(join(project_dir, 'scrapy.cfg')):
            self.exitcode = 1
            print(
                f'Error: scrapy.cfg already exists in {abspath(project_dir)}')
            return

        if not self._is_valid_name(project_name):
            self.exitcode = 1
            return

        # Copy the skeleton, then rename its placeholder package.
        self._copytree(self.templates_dir, abspath(project_dir))
        move(join(project_dir, 'module'), join(project_dir, project_name))

        # Each entry is a tuple of path components that may contain
        # ``${project_name}`` placeholders.
        for template_parts in TEMPLATES_TO_RENDER:
            relative_template = string.Template(join(*template_parts))
            template_file = join(
                project_dir,
                relative_template.substitute(project_name=project_name))
            render_templatefile(template_file,
                                project_name=project_name,
                                ProjectName=string_camelcase(project_name))

        summary_lines = (
            f"New Scrapy project '{project_name}', using template directory "
            f"'{self.templates_dir}', created in:",
            f"    {abspath(project_dir)}\n",
            "You can start your first spider with:",
            f"    cd {project_dir}",
            "    scrapy genspider example example.com",
        )
        for line in summary_lines:
            print(line)
Example #2
0
    def run(self, args, opts):
        """Fetch one URL and pass the response to ``_print_response``.

        The spider class is loaded from ``opts.spider`` when given;
        otherwise one is looked up for the request, with ``DefaultSpider``
        as the fallback.

        Raises:
            UsageError: unless exactly one argument is given and it is a URL.
        """
        if len(args) != 1:
            raise UsageError()
        url = args[0]
        if not is_url(url):
            raise UsageError()

        request = Request(url,
                          callback=self._print_response,
                          cb_kwargs={"opts": opts},
                          dont_filter=True)

        # By default the framework follows redirects itself, i.e. the
        # command handles every status code except 3xx.
        if opts.no_redirect:
            request.meta['handle_httpstatus_all'] = True
        else:
            request.meta['handle_httpstatus_list'] = SequenceExclude(
                range(300, 400))

        spider_loader = self.crawler_process.spider_loader
        if opts.spider:
            spidercls = spider_loader.load(opts.spider)
        else:
            spidercls = spidercls_for_request(spider_loader, request,
                                              DefaultSpider)

        self.crawler_process.crawl(spidercls, start_requests=lambda: [request])
        self.crawler_process.start()
Example #3
0
    def process_options(self, args, opts):
        """Apply the generic command-line options to ``self.settings``.

        Handles ``-s NAME=VALUE`` overrides, the log file / log level /
        no-log switches, writing the current PID to ``opts.pidfile`` and
        enabling failure debug mode for ``opts.pdb``.

        Raises:
            UsageError: if an ``opts.set`` entry cannot be parsed.
        """
        try:
            cli_settings = arglist_to_dict(opts.set)
            self.settings.setdict(cli_settings, priority='cmdline')
        except ValueError:
            raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)

        # Either logging option implicitly switches logging on ...
        logging_options = (
            (opts.logfile, 'LOG_FILE'),
            (opts.loglevel, 'LOG_LEVEL'),
        )
        for option_value, setting_name in logging_options:
            if option_value:
                self.settings.set('LOG_ENABLED', True, priority='cmdline')
                self.settings.set(setting_name, option_value,
                                  priority='cmdline')

        # ... but an explicit no-log request is applied last and wins.
        if opts.nolog:
            self.settings.set('LOG_ENABLED', False, priority='cmdline')

        if opts.pidfile:
            with open(opts.pidfile, "w") as pid_file:
                pid_file.write(f"{os.getpid()}{os.linesep}")

        if opts.pdb:
            failure.startDebugMode()
Example #4
0
    def run(self, args, opts):
        """Create a project through cookiecutter when ``opts.package`` is set.

        With a falsy ``opts.package`` the call is delegated to the parent
        command and its result returned. Otherwise ``args`` holds the
        project name and, optionally, the destination directory (defaulting
        to the project name). On any failure ``self.exitcode`` is set to 1.

        Raises:
            UsageError: if ``args`` does not hold one or two values.
        """
        if not opts.package:
            return super(Command, self).run(args, opts)

        if not 1 <= len(args) <= 2:
            raise UsageError()

        project_name = args[0]
        project_dir = args[1] if len(args) == 2 else project_name

        # Refuse to generate into an existing project.
        if exists(join(project_dir, "scrapy.cfg")):
            self.exitcode = 1
            print("Error: scrapy.cfg already exists in %s" %
                  abspath(project_dir))
            return

        if not self._is_valid_name(project_name):
            self.exitcode = 1
            return

        template_context = {
            "project_name": project_name,
            "project_dir": project_dir,
        }
        try:
            cookiecutter(
                os_scrapy_cookiecutter.TEMPLATE_DIR,
                no_input=True,
                extra_context=template_context,
            )
        except Exception as e:
            # Any generation failure is reported, not propagated.
            self.exitcode = 1
            print(f"Error: create project with cookiecutter {e}")
            return
Example #5
0
    def process_options(self, args, opts):
        """Copy the generic command-line options into ``settings.overrides``.

        Handles ``-s NAME=VALUE`` overrides, the log file / log level /
        no-log switches, writing the current PID to ``opts.pidfile`` and
        enabling failure debug mode for ``opts.pdb``.

        Raises:
            UsageError: if an ``opts.set`` entry cannot be parsed.
        """
        overrides = self.settings.overrides

        try:
            overrides.update(arglist_to_dict(opts.set))
        except ValueError:
            raise UsageError("Invalid -s value, use -s NAME=VALUE",
                             print_help=False)

        # Asking for a log file or a log level implies logging is wanted.
        if opts.logfile:
            overrides['LOG_ENABLED'] = True
            overrides['LOG_FILE'] = opts.logfile

        if opts.loglevel:
            overrides['LOG_ENABLED'] = True
            overrides['LOG_LEVEL'] = opts.loglevel

        # Applied after the two options above, so it takes precedence.
        if opts.nolog:
            overrides['LOG_ENABLED'] = False

        if opts.pidfile:
            with open(opts.pidfile, "w") as pid_file:
                pid_file.write(f"{os.getpid()}{os.linesep}")

        if opts.pdb:
            failure.startDebugMode()
Example #6
0
def _get_project(target, opts):
    project = opts.project or target.get('project')
    if not project:
        raise UsageError("Missing project")
    return project
Example #7
0
    def run(self, args: list, opts: list) -> None:
        """Render a code template into the project tree.

        ``args[0]`` selects the template type and ``args[1]`` is the name
        from which the class, file, spider, table and logger names are
        derived. ``opts`` is read by attribute (``custom_templates_dir``,
        ``item_class``, ``use_rabbit``, ``debug``, ``priority``,
        ``filename``, ``priority_terminal``) despite its ``list``
        annotation.

        The process exits with status 1 on an unsupported template type.
        Nothing is written when the user declines to overwrite an existing
        file.

        Raises:
            UsageError: on fewer than two arguments, when custom templates
                are requested but the settings file is missing, when the
                class name is empty or starts with a digit, or when a file
                listed in ``opts.filename`` cannot be found.
        """
        # Local import: only this method invokes the external formatter.
        import subprocess

        if len(args) < 2:
            raise UsageError()

        templates_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "templates")

        # A template type is the file name up to its first dot.
        SUPPORTED_TEMPLATE_TYPES = [
            name.split(".")[0] for name in os.listdir(templates_dir)
        ]

        # Destination directory components for each built-in template type.
        DEST_PREFIXES = {
            "command": ["commands"],
            "extension": ["extensions"],
            "item": ["items"],
            "middleware": ["middlewares"],
            "model": ["database", "models"],
            "pipeline": ["pipelines"],
            "spider_middleware": ["middlewares"],
            "spider": ["spiders"],
            "helper": ["helpers"],
            "rabbitmq": ["rabbitmq"],
            "pm2": ["pm2"],
            "loader": ["loaders"],
        }

        template_type = args[0]

        if opts.custom_templates_dir:
            # feature for adding custom templates
            # uses TEMPLATES_MODULE setting in settings.py
            if os.path.exists(self.default_settings_filename):
                from settings import TEMPLATES_MODULE  # isort:skip

                tmp = os.path.join(TEMPLATES_MODULE,
                                   "{}.py.mako".format(template_type))
                # Switch to the custom directory only when it actually
                # provides the requested template.
                if os.path.exists(tmp):
                    templates_dir = TEMPLATES_MODULE
                    SUPPORTED_TEMPLATE_TYPES.extend(
                        name.split(".")[0]
                        for name in os.listdir(templates_dir))
            else:
                raise UsageError("No settings.py in project!")

        if template_type not in SUPPORTED_TEMPLATE_TYPES:
            print(f"ERROR: unsupported template type: {template_type}")
            print("supported types: {}".format(repr(SUPPORTED_TEMPLATE_TYPES)))
            sys.exit(1)

        template_name = os.path.join(templates_dir,
                                     "{}.py.mako".format(template_type))
        template = Template(filename=template_name)

        class_name = inflection.camelize(args[1])
        command_name = inflection.underscore(class_name)
        spider_name = inflection.underscore(class_name).replace("_spider", "")
        table_name = inflection.pluralize(inflection.underscore(class_name))
        logger_name = inflection.underscore(class_name).upper()
        item_class = inflection.camelize(
            opts.item_class) if opts.item_class else None

        # An empty name used to crash with IndexError on ``class_name[0]``;
        # report it like any other invalid class name.
        if not class_name or class_name[0].isdigit():
            raise UsageError(f"Class name violation in '{class_name}'")

        # Template types without a known prefix are written to the current
        # directory.
        file_prefix = DEST_PREFIXES.get(template_type, [])
        file_name = command_name
        file_path = os.path.join(*file_prefix, f"{file_name}.py")

        if os.path.exists(file_path):
            print("WARNING: file already exists")
            do_overwrite = input("overwrite? [y/N] ")

            if do_overwrite.lower() not in ["y", "yes"]:
                print("aborted")
                return

        # ``dirname`` is '' for a prefix-less file; os.makedirs('') raises,
        # so only create a directory when there is one to create.
        dest_dir = os.path.dirname(file_path)
        if dest_dir:
            os.makedirs(dest_dir, exist_ok=True)

        rendered_code = template.render(
            class_name=class_name,
            command_name=command_name,
            spider_name=spider_name,
            table_name=table_name,
            logger_name=logger_name,
            use_rabbit=opts.use_rabbit,
            item_class=item_class,
        )

        if opts.debug:
            print(rendered_code)

        # A priority without an explicit target file applies to the default
        # settings file.
        if opts.priority and not opts.filename:
            opts.filename = self.default_settings_filename

        if template_type in self.SETTINGS_NAMES and opts.filename:
            setting_name = self.SETTINGS_NAMES[template_type]
            for filename in opts.filename.split(","):
                if not os.path.exists(filename):
                    # try find spider by class name
                    spider_prefix = DEST_PREFIXES.get("spider", [])
                    spider_file_name = inflection.underscore(filename)
                    filename = os.path.join(*spider_prefix,
                                            f"{spider_file_name}.py")
                    if not os.path.exists(filename):
                        raise UsageError(
                            f"Could not find specified file name: {filename}")
                dotted_path = f"{file_prefix[0]}.{class_name}"
                self._add_to_settings(
                    filename,
                    setting_name,
                    dotted_path,
                    opts.priority,
                )
                if opts.priority_terminal:
                    self._print_to_terminal(
                        setting_name,
                        dotted_path,
                        opts.priority_terminal,
                    )
                # there will be error in eval if started
                # with not formatted code next time
                # TODO disable when not using settings
                # An argument list (no shell) keeps file names containing
                # spaces or shell metacharacters from being interpreted.
                try:
                    subprocess.run(["black", filename], check=False)
                except OSError:
                    # Formatting is best-effort: black may not be installed.
                    print(f"WARNING: could not run black on {filename}")

        with open(file_path, "w") as out_file:
            out_file.write(rendered_code)

        self.add_init_import(file_prefix, file_name, class_name)

        print(f"Created {template_type} '{file_name}'")
Example #8
0
 def process_options(self, args, opts):
     """Run the base option handling, then parse ``-a`` values into a dict.

     ``opts.spargs`` is replaced by the result of ``arglist_to_dict``.

     Raises:
         UsageError: if an ``opts.spargs`` entry cannot be parsed.
     """
     ScrapyCommand.process_options(self, args, opts)
     try:
         spider_arguments = arglist_to_dict(opts.spargs)
     except ValueError:
         raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
     opts.spargs = spider_arguments
    def run(self, args, opts):
        """Configure the 'pantip' crawl from the CLI and saved config, then run it.

        ``args`` may hold up to two values: the board name (default
        ``cfg.config['DEFAULT_TIEBA']``) and the database name (default:
        the ``MYSQL_DBNAME`` entry matching the board name). MySQL
        connection values are copied from the saved config into
        ``self.settings``, the database is initialised, the crawl is run
        and the config is saved afterwards.

        Raises:
            UsageError: if ``opts.filter`` cannot be resolved, more than two
                arguments are given, or no database name can be determined.
        """
        self.set_pages(opts.pages)
        if opts.filter:
            try:
                # SECURITY: eval() executes whatever expression follows
                # "filter." on the command line; acceptable only because the
                # value comes from the local operator. Consider
                # getattr(filter, name) if untrusted input can ever reach it.
                opts.filter = eval('filter.' + opts.filter)
            except Exception:
                # Not a bare ``except``: Ctrl-C / SystemExit must propagate.
                raise UsageError("Invalid filter function name!")
        self.settings.set("FILTER", opts.filter)
        cfg = config.config()
        if len(args) >= 3:
            raise UsageError("Too many arguments!")

        self.settings.set('MYSQL_HOST', cfg.config['MYSQL_HOST'])
        self.settings.set('MYSQL_USER', cfg.config['MYSQL_USER'])
        self.settings.set('MYSQL_PASSWD', cfg.config['MYSQL_PASSWD'])
        self.settings.set('MYSQL_USE_SSL', cfg.config['MYSQL_USE_SSL'])
        self.settings.set('MYSQL_SSL_CHECK_HOSTNAME',
                          cfg.config['MYSQL_SSL_CHECK_HOSTNAME'])
        self.settings.set('MYSQL_SSL_CA_PATH', cfg.config['MYSQL_SSL_CA_PATH'])

        tbname = cfg.config['DEFAULT_TIEBA']
        if len(args) >= 1:
            tbname = args[0]
        # From here on the board name is handled as UTF-8 bytes.
        if isinstance(tbname, str):
            tbname = tbname.encode('utf8')

        # Look up the database configured for this board, if any.
        dbname = None
        for key in cfg.config['MYSQL_DBNAME'].keys():
            if key.encode('utf8') == tbname:
                dbname = cfg.config['MYSQL_DBNAME'][key]
        if len(args) >= 2:
            # An explicit database name wins and is recorded in the config
            # that gets saved at the end of the run.
            dbname = args[1]
            cfg.config['MYSQL_DBNAME'][tbname.decode('utf8')] = dbname
        if not dbname:
            raise UsageError("Please input database name!")

        self.settings.set('TIEBA_NAME', tbname, priority='cmdline')
        self.settings.set('MYSQL_DBNAME', dbname, priority='cmdline')

        # The config stores booleans as strings: SSL is used only for the
        # exact string 'True'; hostname checking is on unless the value is
        # exactly 'False'.
        use_ssl = cfg.config['MYSQL_USE_SSL'] == 'True'
        ssl_check_hostname = cfg.config['MYSQL_SSL_CHECK_HOSTNAME'] != 'False'

        config.init_database(
            cfg.config['MYSQL_HOST'],
            cfg.config['MYSQL_USER'],
            cfg.config['MYSQL_PASSWD'],
            dbname,
            use_ssl=use_ssl,
            ssl_check_hostname=ssl_check_hostname,
            ssl_ca=cfg.config['MYSQL_SSL_CA_PATH'],
            spider_type='pantip',
        )

        log = config.log(tbname, dbname, self.settings['BEGIN_PAGE'])
        self.settings.set('SIMPLE_LOG', log)

        self.crawler_process.crawl('pantip', **opts.spargs)
        self.crawler_process.start()

        cfg.save()
Example #10
0
    def set_city(self, city):
        """Store ``city`` in the ``CITY`` setting at 'cmdline' priority.

        Raises:
            UsageError: if ``city`` is not exactly three characters long.
        """
        has_expected_length = len(city) == 3
        if not has_expected_length:
            raise UsageError('必须是三个字,比如:北京市')

        self.settings.set('CITY', city, priority='cmdline')
Example #11
0
def feed_process_params_from_cli(settings,
                                 output,
                                 output_format=None,
                                 overwrite_output=None):
    """
    Receives feed export params (from the 'crawl' or 'runspider' commands),
    checks for inconsistencies in their quantities and returns a dictionary
    suitable to be used as the FEEDS setting.

    ``output`` and ``overwrite_output`` are iterables of output URIs, each
    optionally followed by ``:<FORMAT>``; without that suffix the format is
    taken from the file extension. Entries from the FEEDS setting override
    entries built from the command line for the same URI.

    Raises UsageError when both ``output`` and ``overwrite_output`` are
    given, when ``output_format`` is combined with anything other than
    exactly one output URI, or when a format is not a known feed exporter.
    """
    valid_output_formats = without_none_values(
        settings.getwithbase('FEED_EXPORTERS')).keys()

    def check_valid_format(output_format):
        if output_format not in valid_output_formats:
            raise UsageError(
                f"Unrecognized output format '{output_format}'. "
                f"Set a supported one ({tuple(valid_output_formats)}) "
                "after a colon at the end of the output URI (i.e. -o/-O "
                "<URI>:<FORMAT>) or as a file extension.")

    overwrite = False
    if overwrite_output:
        if output:
            raise UsageError(
                "Please use only one of -o/--output and -O/--overwrite-output")
        output = overwrite_output
        overwrite = True

    if output_format:
        if len(output) == 1:
            check_valid_format(output_format)
            # Plain string: a trailing comma here used to turn the message
            # into a one-element tuple, which was shown as a tuple repr and
            # broke message-based warning filters.
            message = (
                'The -t command line option is deprecated in favor of '
                'specifying the output format within the output URI. See the '
                'documentation of the -o and -O options for more information.'
            )
            warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
            return {output[0]: {'format': output_format}}
        else:
            raise UsageError(
                'The -t command-line option cannot be used if multiple output '
                'URIs are specified')

    result = {}
    for element in output:
        try:
            # "<URI>:<FORMAT>" - split on the last colon only.
            feed_uri, feed_format = element.rsplit(':', 1)
        except ValueError:
            # No colon at all: derive the format from the file extension.
            feed_uri = element
            feed_format = os.path.splitext(element)[1].replace('.', '')
        else:
            # "-:<FORMAT>" means standard output.
            if feed_uri == '-':
                feed_uri = 'stdout:'
        check_valid_format(feed_format)
        result[feed_uri] = {'format': feed_format}
        if overwrite:
            result[feed_uri]['overwrite'] = True

    # FEEDS setting should take precedence over the matching CLI options
    result.update(settings.getdict('FEEDS'))

    return result
Example #12
0
 def run(self, args, opts):
     """Hand control over to scrapyd's script entry point.

     Raises:
         UsageError: if an ImportError occurs while importing or running
             the scrapyd entry point.
     """
     try:
         # Imported lazily so the command module loads without scrapyd.
         from scrapyd.script import execute as launch_scrapyd
         launch_scrapyd()
     except ImportError:
         raise UsageError("Scrapyd is not available in this system")
Example #13
0
 def check_valid_format(output_format):
     """Reject ``output_format`` unless it is in ``valid_output_formats``.

     Raises:
         UsageError: naming the rejected format and the supported ones.
     """
     if output_format in valid_output_formats:
         return
     supported_formats = tuple(valid_output_formats)
     raise UsageError("Unrecognized output format '%s', set one after a"
                      " colon using the -o option (i.e. -o <URI>:<FORMAT>)"
                      " or as a file extension, from the supported list %s" %
                      (output_format, supported_formats))
Example #14
0
 def process_options(self, args, opts):
     """Run the parent option handling, then require a Google Cloud API key.

     Raises:
         UsageError: with the formatted ``usage_info`` text when the
             ``GOOGLE_CLOUD_API_KEY`` setting is missing or falsy.
     """
     super(Command, self).process_options(args, opts)
     if self.settings.get('GOOGLE_CLOUD_API_KEY'):
         return
     raise UsageError(usage_info.format(spider=args[0]))
Example #15
0
def _get_target(name):
    """Return the entry called ``name`` from ``_get_targets()``.

    Raises:
        UsageError: if a KeyError occurs during the lookup.
    """
    try:
        target = _get_targets()[name]
    except KeyError:
        raise UsageError("Unknown target: %s" % name)
    return target
Example #16
0
 def process_spider_arguments(self, opts):
     """Replace ``opts.spargs`` with the dict parsed from its ``-a`` values.

     Raises:
         UsageError: if an entry cannot be parsed.
     """
     try:
         spider_arguments = arglist_to_dict(opts.spargs)
     except ValueError:
         raise UsageError("Invalid -a value, use -a NAME=VALUE",
                          print_help=False)
     opts.spargs = spider_arguments
Example #17
0
 def run(self, args, opts):
     """Interactively classify the unreviewed items read from one file.

     ``args`` must hold exactly one file name. For every item, starting at
     index ``opts.i_no``, the user picks one classification per usable
     classifier. A classifier is usable when its 'settings' info is truthy
     and it appears exactly once in ``opts.classifiers``; an empty
     ``opts.classifiers`` selects all of them. Confirmed items are written
     as JSON into each classifier's data directory and into the upload
     directory. The process exits once the last item has been stored.

     NOTE: this is Python 2 code (``raw_input``, binary-mode writes of
     ``json.dumps`` output).

     Raises:
         UsageError: if ``args`` does not hold exactly one value.
     """
     if len(args) < 1:
         raise UsageError()
     elif len(args) > 1:
         raise UsageError(
             "running 'scrapy review' with more than one argument is not supported"
         )
     file_name = args[0]
     status = Status()
     if len(opts.classifiers) == 0:
         # No explicit selection: all classifiers are to be used.
         opts.classifiers = list(status.classifiers.keys())

     # Classifiers that can be used in this session, each with its sorted
     # classifications.
     valid_classifiers = defaultdict(dict)
     for classifier_name in status.classifiers.keys():
         if (status.classifiers[classifier_name]['info']['settings']
                 and opts.classifiers.count(classifier_name) == 1):
             valid_classifiers[classifier_name]['classifications'] = \
                 sorted(status.classifiers[classifier_name]['classifications'])

     # Number of already reviewed files per classification; used to number
     # the files written below.
     no_files = {}
     for classifier in valid_classifiers.keys():
         reviewed = status.classifiers[classifier]['reviewed']
         for classification in valid_classifiers[classifier][
                 'classifications']:
             no_files[classification] = len([
                 x for x in reviewed if x.find(os.sep + classification) >= 0
             ])
     items = Reader.read_unreviewed(file_name)

     # Confirmation mode: ask until the answer is a number no greater than 2.
     confirmation_mode = False
     conf_input = 3
     while conf_input > 2:
         try:
             conf_input = int(
                 raw_input(
                     "1. Keep the same\n2. Turn on confirmation mode"))
         except ValueError:
             # Only non-numeric input is retried; EOF / Ctrl-C propagate
             # instead of looping forever.
             print("Wrong input")
         if conf_input == 2:
             confirmation_mode = True

     # Review of items
     n = opts.i_no
     while n < len(items):
         print("ITEM {0}/{1}".format(n, len(items)))
         print(no_files)
         item = items[n]
         status.item.review(item)
         to_write = {}
         for classifier in valid_classifiers.keys():
             # Numbered menu of this classifier's classifications, three
             # entries per line.
             prompt = "Pick classification\n"
             choices = {}
             for i, classification in enumerate(
                     valid_classifiers[classifier]['classifications'], 1):
                 choices[i] = classification
                 prompt += "{0}. {1}\t".format(i, classification)
                 if i % 3 == 0:
                     prompt += "\n"
             # Loop to ensure a choice. ``choice`` starts unset for every
             # classifier so an invalid answer can neither hit an unbound
             # name nor silently reuse the previous classifier's answer.
             choice = None
             while choice not in choices:
                 try:
                     choice = int(raw_input(prompt))
                 except ValueError:
                     print("Wrong input")
             to_write[classifier] = choices[choice]

         confirmed = True
         if confirmation_mode:
             print("Choices: {0}".format("\t".join(
                 to_write[classifier]
                 for classifier in to_write.keys())))
             try:
                 confirmed = int(
                     raw_input("1. Confirm \n 2. Reclassify")) == 1
             except ValueError:
                 # An unreadable answer is treated as "reclassify", never
                 # as a confirmation.
                 print("Wrong input")
                 confirmed = False

         if confirmed:
             for classifier in to_write.keys():
                 classifier_dir = os.path.join(status.data_dir,
                                               classifier)
                 no_files[to_write[classifier]] += 1
                 new_f_name = "{0}0{1}.json".format(
                     to_write[classifier],
                     no_files[to_write[classifier]])
                 with open(os.path.join(classifier_dir, new_f_name),
                           "wb") as new_f:
                     new_f.write(json.dumps(item))
             item['classifications'] = to_write
             with open(
                     os.path.join(status.to_upload_dir,
                                  "{0}.json".format(str(uuid.uuid4()))),
                     "wb") as upload_f:
                 upload_f.write(json.dumps(item))
             # Advance only after a confirmed write; otherwise the same
             # item is reviewed again.
             n += 1
         if n == len(items):
             sys.exit()