def run(self, args, opts):
    """Create a new Scrapy project from the bundled template directory.

    ``args[0]`` is the project name; an optional ``args[1]`` overrides the
    destination directory (defaults to the project name).
    """
    if len(args) not in (1, 2):
        raise UsageError()
    project_name = args[0]
    project_dir = args[1] if len(args) == 2 else args[0]

    # Refuse to clobber an existing project.
    if exists(join(project_dir, 'scrapy.cfg')):
        self.exitcode = 1
        print(f'Error: scrapy.cfg already exists in {abspath(project_dir)}')
        return
    if not self._is_valid_name(project_name):
        self.exitcode = 1
        return

    # Copy the template tree, then rename the generic 'module' package
    # to the chosen project name.
    self._copytree(self.templates_dir, abspath(project_dir))
    move(join(project_dir, 'module'), join(project_dir, project_name))

    # Render each templated file, substituting the project name into
    # both the file path and the file contents.
    for path_parts in TEMPLATES_TO_RENDER:
        relative_path = string.Template(join(*path_parts)).substitute(
            project_name=project_name)
        render_templatefile(
            join(project_dir, relative_path),
            project_name=project_name,
            ProjectName=string_camelcase(project_name),
        )

    print(f"New Scrapy project '{project_name}', using template directory "
          f"'{self.templates_dir}', created in:")
    print(f"    {abspath(project_dir)}\n")
    print("You can start your first spider with:")
    print(f"    cd {project_dir}")
    print("    scrapy genspider example example.com")
def run(self, args, opts):
    """Fetch a single URL and hand the response to ``self._print_response``."""
    if len(args) != 1 or not is_url(args[0]):
        raise UsageError()

    request = Request(
        args[0],
        callback=self._print_response,
        cb_kwargs={"opts": opts},
        dont_filter=True,
    )
    # By default let the framework follow redirects, i.e. this command
    # handles every status code except 3xx itself.
    if opts.no_redirect:
        request.meta['handle_httpstatus_all'] = True
    else:
        request.meta['handle_httpstatus_list'] = SequenceExclude(range(300, 400))

    # Resolve a spider: an explicit --spider wins, otherwise pick one that
    # matches the request URL, falling back to DefaultSpider.
    spider_loader = self.crawler_process.spider_loader
    if opts.spider:
        chosen_spider = spider_loader.load(opts.spider)
    else:
        chosen_spider = spidercls_for_request(spider_loader, request, DefaultSpider)

    self.crawler_process.crawl(chosen_spider, start_requests=lambda: [request])
    self.crawler_process.start()
def process_options(self, args, opts):
    """Apply generic CLI options: -s settings, logging flags, pidfile, pdb."""
    try:
        self.settings.setdict(arglist_to_dict(opts.set), priority='cmdline')
    except ValueError:
        raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)

    if opts.logfile:
        self.settings.set('LOG_ENABLED', True, priority='cmdline')
        self.settings.set('LOG_FILE', opts.logfile, priority='cmdline')

    if opts.loglevel:
        self.settings.set('LOG_ENABLED', True, priority='cmdline')
        self.settings.set('LOG_LEVEL', opts.loglevel, priority='cmdline')

    if opts.nolog:
        self.settings.set('LOG_ENABLED', False, priority='cmdline')

    if opts.pidfile:
        # Record the current PID so external tooling can manage the process.
        with open(opts.pidfile, "w") as pid_file:
            pid_file.write(str(os.getpid()) + os.linesep)

    if opts.pdb:
        failure.startDebugMode()
def run(self, args, opts):
    """Create a project as an installable package via cookiecutter.

    Without ``--package`` this defers to the stock startproject behaviour.
    """
    if not opts.package:
        return super(Command, self).run(args, opts)
    if len(args) not in (1, 2):
        raise UsageError()

    project_name = args[0]
    project_dir = args[1] if len(args) == 2 else args[0]

    # Guard against overwriting an existing project.
    if exists(join(project_dir, "scrapy.cfg")):
        self.exitcode = 1
        print("Error: scrapy.cfg already exists in %s" % abspath(project_dir))
        return
    if not self._is_valid_name(project_name):
        self.exitcode = 1
        return

    try:
        cookiecutter(
            os_scrapy_cookiecutter.TEMPLATE_DIR,
            no_input=True,
            extra_context={
                "project_name": project_name,
                "project_dir": project_dir,
            },
        )
    except Exception as exc:
        # Surface the cookiecutter failure and signal a non-zero exit.
        self.exitcode = 1
        print(f"Error: create project with cookiecutter {exc}")
        return
def process_options(self, args, opts):
    """Apply CLI options using the legacy ``settings.overrides`` mapping."""
    try:
        self.settings.overrides.update(arglist_to_dict(opts.set))
    except ValueError:
        raise UsageError("Invalid -s value, use -s NAME=VALUE", print_help=False)

    overrides = self.settings.overrides
    if opts.logfile:
        overrides.update({'LOG_ENABLED': True, 'LOG_FILE': opts.logfile})
    if opts.loglevel:
        overrides.update({'LOG_ENABLED': True, 'LOG_LEVEL': opts.loglevel})
    if opts.nolog:
        overrides['LOG_ENABLED'] = False

    if opts.pidfile:
        # Record the current PID so external tooling can manage the process.
        with open(opts.pidfile, "w") as pid_file:
            pid_file.write(str(os.getpid()) + os.linesep)

    if opts.pdb:
        failure.startDebugMode()
def _get_project(target, opts): project = opts.project or target.get('project') if not project: raise UsageError("Missing project") return project
def run(self, args: list, opts: list) -> None:
    """Generate a project file (command/spider/pipeline/...) from a Mako template.

    ``args[0]`` is the template type, ``args[1]`` the class name to generate.
    Raises UsageError on missing arguments, when a custom templates dir is
    requested without a settings.py, or on an invalid class name.

    NOTE(review): reconstructed from a whitespace-collapsed source; block
    nesting follows the most plausible reading — verify against VCS history.
    """
    if len(args) < 2:
        raise UsageError()
    # Built-in templates live in a "templates" dir next to this module.
    templates_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "templates")
    # One supported type per "<type>.py.mako" file found there.
    SUPPORTED_TEMPLATE_TYPES = [
        name.split(".")[0] for name in os.listdir(templates_dir)
    ]
    # Destination package path (relative to the project root) per template type.
    DEST_PREFIXES = {
        "command": ["commands"],
        "extension": ["extensions"],
        "item": ["items"],
        "middleware": ["middlewares"],
        "model": ["database", "models"],
        "pipeline": ["pipelines"],
        "spider_middleware": ["middlewares"],
        "spider": ["spiders"],
        "helper": ["helpers"],
        "rabbitmq": ["rabbitmq"],
        "pm2": ["pm2"],
        "loader": ["loaders"],
    }
    template_type = args[0]
    if opts.custom_templates_dir:
        # feature for adding custom templates
        # uses TEMPLATES_MODULE setting in settings.py
        if os.path.exists(self.default_settings_filename):
            from settings import TEMPLATES_MODULE  # isort:skip
            tmp = os.path.join(TEMPLATES_MODULE,
                               "{}.py.mako".format(template_type))
            if os.path.exists(tmp):
                # Custom template found: switch dirs and extend the type list.
                templates_dir = TEMPLATES_MODULE
                SUPPORTED_TEMPLATE_TYPES.extend(
                    name.split(".")[0] for name in os.listdir(templates_dir))
        else:
            raise UsageError(f"No settings.py in project!")
    if template_type not in SUPPORTED_TEMPLATE_TYPES:
        print(f"ERROR: unsupported template type: {template_type}")
        print("supported types: {}".format(repr(SUPPORTED_TEMPLATE_TYPES)))
        sys.exit(1)
    template_name = os.path.join(templates_dir,
                                 "{}.py.mako".format(template_type))
    template = Template(filename=template_name)
    # Derive all naming variants fed to the template from the class name.
    class_name = inflection.camelize(args[1])
    command_name = inflection.underscore(class_name)
    spider_name = inflection.underscore(class_name).replace("_spider", "")
    table_name = inflection.pluralize(inflection.underscore(class_name))
    logger_name = inflection.underscore(class_name).upper()
    item_class = inflection.camelize(
        opts.item_class) if opts.item_class else None
    # Python identifiers cannot start with a digit.
    if class_name[0].isdigit():
        raise UsageError(f"Class name violation in '{class_name}'")
    file_prefix = DEST_PREFIXES.get(template_type, [])
    file_name = command_name
    file_path = os.path.join(*file_prefix, f"{file_name}.py")
    if os.path.exists(file_path):
        # Interactive overwrite confirmation; default is to abort.
        print("WARNING: file already exists")
        do_overwrite = input("overwrite? [y/N] ")
        if do_overwrite.lower() not in ["y", "yes"]:
            print("aborted")
            return
    if not os.path.isdir(os.path.dirname(file_path)):
        os.makedirs(os.path.dirname(file_path))
    rendered_code = template.render(
        class_name=class_name,
        command_name=command_name,
        spider_name=spider_name,
        table_name=table_name,
        logger_name=logger_name,
        use_rabbit=opts.use_rabbit,
        item_class=item_class,
    )
    if opts.debug:
        print(rendered_code)
    # --priority without an explicit file targets the default settings file.
    if opts.priority and not opts.filename:
        opts.filename = self.default_settings_filename
    if template_type in self.SETTINGS_NAMES and opts.filename:
        filenames = opts.filename.split(",")
        for filename in filenames:
            if not path.exists(filename):
                # try find spider by class name
                spider_prefix = DEST_PREFIXES.get("spider", [])
                spider_file_name = inflection.underscore(filename)
                filename = os.path.join(*spider_prefix,
                                        f"{spider_file_name}.py")
                if not path.exists(filename):
                    # NOTE(review): "(unknown)" looks like a redacted
                    # placeholder (likely the original interpolated the
                    # filename here) — confirm against upstream.
                    raise UsageError(
                        f"Could not find specified file name: (unknown)")
            self._add_to_settings(
                filename,
                self.SETTINGS_NAMES[template_type],
                f"{file_prefix[0]}.{class_name}",
                opts.priority,
            )
    if opts.priority_terminal:
        self._print_to_terminal(
            self.SETTINGS_NAMES[template_type],
            f"{file_prefix[0]}.{class_name}",
            opts.priority_terminal,
        )
    # there will be error in eval if started
    # with not formatted code next time
    # TODO disable when not using settings
    # NOTE(review): "(unknown)" again looks like a redacted placeholder
    # (presumably the settings filename); also note this builds a shell
    # command from data — a command-injection hazard if it interpolates
    # user-controlled paths. Confirm intended argument.
    os.system(f"black (unknown)")
    with open(file_path, "w") as out_file:
        out_file.write(rendered_code)
    self.add_init_import(file_prefix, file_name, class_name)
    print(f"Created {template_type} '{file_name}'")
def process_options(self, args, opts):
    """Run base option processing, then parse -a NAME=VALUE spider args."""
    ScrapyCommand.process_options(self, args, opts)
    raw_spargs = opts.spargs
    try:
        parsed = arglist_to_dict(raw_spargs)
    except ValueError:
        raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
    opts.spargs = parsed
def run(self, args, opts):
    """Configure MySQL/SSL settings from the config file and start the crawl.

    Positional args: optional forum name (args[0]) and database name (args[1]).

    NOTE(review): reconstructed from a whitespace-collapsed source; statement
    nesting follows the most plausible reading — verify against VCS history.
    """
    self.set_pages(opts.pages)
    if opts.filter:
        try:
            # NOTE(review): eval() of a CLI-supplied name is a code-execution
            # hazard, and the bare except hides unrelated errors — flagging
            # for follow-up rather than changing behavior here.
            opts.filter = eval('filter.' + opts.filter)
        except:
            raise UsageError("Invalid filter function name!")
        self.settings.set("FILTER", opts.filter)
    cfg = config.config()
    if len(args) >= 3:
        raise UsageError("Too many arguments!")
    # Mirror the MySQL connection options from the config into settings.
    self.settings.set('MYSQL_HOST', cfg.config['MYSQL_HOST'])
    self.settings.set('MYSQL_USER', cfg.config['MYSQL_USER'])
    self.settings.set('MYSQL_PASSWD', cfg.config['MYSQL_PASSWD'])
    self.settings.set('MYSQL_USE_SSL', cfg.config['MYSQL_USE_SSL'])
    self.settings.set('MYSQL_SSL_CHECK_HOSTNAME',
                      cfg.config['MYSQL_SSL_CHECK_HOSTNAME'])
    self.settings.set('MYSQL_SSL_CA_PATH', cfg.config['MYSQL_SSL_CA_PATH'])
    # Forum name: first positional arg, else the configured default.
    tbname = cfg.config['DEFAULT_TIEBA']
    if len(args) >= 1:
        tbname = args[0]
    # NOTE(review): encode-to-bytes plus key.encode comparisons below are
    # Python-2-era idioms; on Python 3 this always encodes — confirm the
    # intended interpreter version.
    if isinstance(tbname, str):
        tbname = tbname.encode('utf8')
    # Look up the database mapped to this forum name in the config.
    dbname = None
    for key in cfg.config['MYSQL_DBNAME'].keys():
        if key.encode('utf8') == tbname:
            dbname = cfg.config['MYSQL_DBNAME'][key]
    if len(args) >= 2:
        # An explicit second arg overrides and is persisted back into cfg.
        dbname = args[1]
        cfg.config['MYSQL_DBNAME'][tbname.decode('utf8')] = dbname
    if not dbname:
        raise UsageError("Please input database name!")
    self.settings.set('TIEBA_NAME', tbname, priority='cmdline')
    self.settings.set('MYSQL_DBNAME', dbname, priority='cmdline')
    # SSL flags are stored as the strings 'True'/'False' in the config.
    use_ssl = False
    ssl_check_hostname = False
    if cfg.config['MYSQL_USE_SSL'] == 'True':
        use_ssl = True
    if cfg.config['MYSQL_SSL_CHECK_HOSTNAME'] == 'False':
        ssl_check_hostname = False
    else:
        ssl_check_hostname = True
    config.init_database(cfg.config['MYSQL_HOST'],
                         cfg.config['MYSQL_USER'], cfg.config['MYSQL_PASSWD'],
                         dbname, use_ssl=use_ssl,
                         ssl_check_hostname=ssl_check_hostname,
                         ssl_ca=cfg.config['MYSQL_SSL_CA_PATH'],
                         spider_type='pantip')
    log = config.log(tbname, dbname, self.settings['BEGIN_PAGE'])
    self.settings.set('SIMPLE_LOG', log)
    # Kick off the 'pantip' spider and persist any config changes afterwards.
    self.crawler_process.crawl('pantip', **opts.spargs)
    self.crawler_process.start()
    cfg.save()
def set_city(self, city):
    """Store the crawl city in settings; the name must be exactly three characters."""
    if len(city) == 3:
        self.settings.set('CITY', city, priority='cmdline')
        return
    raise UsageError('必须是三个字,比如:北京市')
def feed_process_params_from_cli(settings, output, output_format=None,
                                 overwrite_output=None):
    """
    Receives feed export params (from the 'crawl' or 'runspider' commands),
    checks for inconsistencies in their quantities and returns a dictionary
    suitable to be used as the FEEDS setting.

    :param settings: the project Settings (used for FEED_EXPORTERS lookup)
    :param output: list of output URIs from -o/--output
    :param output_format: deprecated -t format override
    :param overwrite_output: list of output URIs from -O/--overwrite-output
    :raises UsageError: on unknown formats or conflicting option combinations
    """
    valid_output_formats = without_none_values(
        settings.getwithbase('FEED_EXPORTERS')).keys()

    def check_valid_format(output_format):
        # Reject any format that has no configured exporter.
        if output_format not in valid_output_formats:
            raise UsageError(
                f"Unrecognized output format '{output_format}'. "
                f"Set a supported one ({tuple(valid_output_formats)}) "
                "after a colon at the end of the output URI (i.e. -o/-O "
                "<URI>:<FORMAT>) or as a file extension.")

    overwrite = False
    if overwrite_output:
        if output:
            raise UsageError(
                "Please use only one of -o/--output and -O/--overwrite-output")
        output = overwrite_output
        overwrite = True

    if output_format:
        if len(output) == 1:
            check_valid_format(output_format)
            # BUG FIX: the parenthesized literal previously ended with a
            # trailing comma, which made `message` a 1-tuple, so the emitted
            # deprecation warning rendered as a tuple repr instead of text.
            message = (
                'The -t command line option is deprecated in favor of '
                'specifying the output format within the output URI. See the '
                'documentation of the -o and -O options for more information.'
            )
            warnings.warn(message, ScrapyDeprecationWarning, stacklevel=2)
            return {output[0]: {'format': output_format}}
        else:
            raise UsageError(
                'The -t command-line option cannot be used if multiple output '
                'URIs are specified')

    result = {}
    for element in output:
        try:
            # URI may carry an explicit format after the last colon.
            feed_uri, feed_format = element.rsplit(':', 1)
        except ValueError:
            # No colon: infer the format from the file extension.
            feed_uri = element
            feed_format = os.path.splitext(element)[1].replace('.', '')
        else:
            if feed_uri == '-':
                feed_uri = 'stdout:'
        check_valid_format(feed_format)
        result[feed_uri] = {'format': feed_format}
        if overwrite:
            result[feed_uri]['overwrite'] = True

    # FEEDS setting should take precedence over the matching CLI options
    result.update(settings.getdict('FEEDS'))
    return result
def run(self, args, opts):
    """Launch the Scrapyd service.

    :raises UsageError: if the ``scrapyd`` package is not installed.
    """
    # Keep the try body minimal: only the import should signal that Scrapyd
    # is missing. Previously execute() ran inside the try, so an ImportError
    # raised from within Scrapyd's own startup was misreported as
    # "Scrapyd is not available in this system".
    try:
        from scrapyd.script import execute
    except ImportError:
        raise UsageError("Scrapyd is not available in this system")
    execute()
def check_valid_format(output_format):
    """Raise UsageError unless *output_format* has a configured exporter.

    Reads ``valid_output_formats`` from the enclosing scope.
    """
    if output_format in valid_output_formats:
        return
    supported = tuple(valid_output_formats)
    raise UsageError("Unrecognized output format '%s', set one after a"
                     " colon using the -o option (i.e. -o <URI>:<FORMAT>)"
                     " or as a file extension, from the supported list %s"
                     % (output_format, supported))
def process_options(self, args, opts):
    """Run base option processing, then require a Google Cloud API key."""
    super(Command, self).process_options(args, opts)
    if not self.settings.get('GOOGLE_CLOUD_API_KEY'):
        # Without the key the command cannot work; show usage for the spider.
        raise UsageError(usage_info.format(spider=args[0]))
def _get_target(name):
    """Return the deploy target registered under *name*.

    Raises UsageError when no such target is configured.
    """
    try:
        target = _get_targets()[name]
    except KeyError:
        raise UsageError("Unknown target: %s" % name)
    return target
def process_spider_arguments(self, opts):
    """Convert the raw -a NAME=VALUE argument list on *opts* into a dict."""
    raw_args = opts.spargs
    try:
        parsed = arglist_to_dict(raw_args)
    except ValueError:
        raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
    opts.spargs = parsed
def run(self, args, opts):
    """Interactively review unreviewed items and file them per classifier.

    ``args[0]`` is the file of unreviewed items. This is Python 2 code
    (print statements, raw_input, dict.has_key).

    NOTE(review): reconstructed from a whitespace-collapsed source; block
    nesting follows the most plausible reading — verify against VCS history.
    """
    if len(args) < 1:
        raise UsageError()
    elif len(args) > 1:
        raise UsageError(
            "running 'scrapy review' with more than one argument is not supported"
        )
    file_name = args[0]
    status = Status()
    if len(opts.classifiers) == 0:
        opts.classifiers = status.classifiers.keys(
        )  # If all classifiers are to be used
    # Setting up classifiers which are possible
    valid_classifiers = defaultdict(
        dict)  # Dictionary for currently feasible classifiers only
    for classifier_name in status.classifiers.keys():
        classifications = []
        # Keep a classifier only when it has settings info and was requested
        # exactly once on the command line.
        if status.classifiers[classifier_name]['info'][
                'settings'] and opts.classifiers.count(
                    classifier_name) == 1:
            valid_classifiers[classifier_name]['classifications'] = \
                sorted(status.classifiers[classifier_name]['classifications'])
    # Counting files for valid classifiers
    no_files = {}
    classifiers = valid_classifiers.keys()
    for classifier in valid_classifiers.keys():
        reviewed = status.classifiers[classifier]['reviewed']
        for classification in list(
                valid_classifiers[classifier]['classifications']):
            # Count already-reviewed files whose path contains this
            # classification segment.
            no_files[classification] = len([
                x for x in reviewed if x.find(os.sep + classification) >= 0
            ])
    items = Reader.read_unreviewed(file_name)
    # Confirmation mode
    confirmation_mode = False
    conf_input = 3
    # Re-prompt until the user enters 1 or 2.
    while conf_input > 2:
        try:
            conf_input = int(
                raw_input(
                    "1. Keep the same\n2. Turn on confirmation mode"))
        except:
            print "Wrong input"
    if conf_input == 2:
        confirmation_mode = True
    # Review of items
    n = opts.i_no  # resume index supplied on the command line
    while n < len(items):
        print "ITEM {0}/{1}".format(n, len(items))
        print no_files
        item = items[n]
        status.item.review(item)
        if n >= opts.i_no:
            to_write = {}
            for classifier in valid_classifiers.keys():
                # Loop to ensure a choice
                is_a_choice = False
                while is_a_choice == False:
                    # Build a numbered prompt, three choices per line.
                    prompt = "Pick classification\n"
                    choices = {}
                    i = 0
                    for classification in valid_classifiers[classifier][
                            'classifications']:
                        i += 1
                        choices[i] = classification
                        # NOTE(review): this literal was wrapped mid-string in
                        # the collapsed source; reconstructed as one line.
                        prompt += "{0}. {1}\t".format(i, classification)
                        if i % 3 == 0:
                            prompt += "\n"
                    try:
                        choice = int(raw_input(prompt))
                    except:
                        print "Wrong input"
                    if choices.has_key(choice):
                        is_a_choice = True
                        to_write[classifier] = choices[choice]
            confirmed = True
            if confirmation_mode:
                # In confirmation mode the user must explicitly accept the
                # chosen classifications, otherwise the item is re-reviewed.
                confirmed = False
                print "Choices: {0}".format("\t".join(
                    to_write[classifier]
                    for classifier in to_write.keys()))
                try:
                    choice = int(raw_input("1. Confirm \n 2. Reclassify"))
                except:
                    print "Wrong input"
                if choice == 1:
                    confirmed = True
            if confirmed:
                for classifier in to_write.keys():
                    # NOTE(review): bare name expression below is a no-op at
                    # best and a NameError at worst — looks like leftover
                    # debris; confirm against VCS before removing.
                    classifications
                    classifier_dir = os.path.join(status.data_dir,
                                                  classifier)
                    no_files[to_write[classifier]] += 1
                    new_f_name = "{0}0{1}.json".format(
                        to_write[classifier],
                        no_files[to_write[classifier]])
                    with open(os.path.join(classifier_dir, new_f_name),
                              "wb") as new_f:
                        new_f.write(json.dumps(item))
                item['classifications'] = to_write
                # Queue the classified item for upload under a fresh UUID.
                with open(
                        os.path.join(status.to_upload_dir,
                                     "{0}.json".format(str(uuid.uuid4()))),
                        "wb") as upload_f:
                    upload_f.write(json.dumps(item))
                n += 1
        if n == len(items):
            sys.exit()