def __init__(
    self,
    allow=(),
    deny=(),
    allow_domains=(),
    deny_domains=(),
    restrict_xpaths=(),
    tags=("a", "area"),
    attrs=("href",),
    canonicalize=True,
    unique=True,
    process_value=None,
    deny_extensions=None,
    restrict_css=(),
):
    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func, unique=unique, process=process_value)
    super(LxmlLinkExtractor, self).__init__(
        lx,
        allow=allow,
        deny=deny,
        allow_domains=allow_domains,
        deny_domains=deny_domains,
        restrict_xpaths=restrict_xpaths,
        restrict_css=restrict_css,
        canonicalize=canonicalize,
        deny_extensions=deny_extensions,
    )
def __init__(
    self,
    allow=(),
    deny=(),
    allow_domains=(),
    deny_domains=(),
    restrict_xpaths=(),
    tags=("a", "area"),
    attrs=("href",),  # note the trailing comma: ("href") would be a plain string, not a tuple
    canonicalize=True,
    unique=True,
    process_value=None,
    deny_extensions=None,
):
    self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
    self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
    self.allow_domains = set(arg_to_iter(allow_domains))
    self.deny_domains = set(arg_to_iter(deny_domains))
    self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
    self.canonicalize = canonicalize
    if deny_extensions is None:
        deny_extensions = IGNORED_EXTENSIONS
    self.deny_extensions = set(["." + e for e in deny_extensions])
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func, unique=unique,
                                   process_value=process_value)
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
             process_value=None, deny_extensions=None, restrict_css=()):
    warnings.warn(
        "SgmlLinkExtractor is deprecated and will be removed in future releases. "
        "Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning
    )
    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    with warnings.catch_warnings(record=True):
        lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
                                   unique=unique, process_value=process_value)

    super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
                                            allow_domains=allow_domains, deny_domains=deny_domains,
                                            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
                                            canonicalize=canonicalize, deny_extensions=deny_extensions)

    # FIXME: was added to fix a RegexLinkExtractor testcase
    self.base_url = None
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True,
             process_value=None, deny_extensions=None, restrict_css=(), strip=True,
             restrict_text=()):
    warnings.warn(
        "SgmlLinkExtractor is deprecated and will be removed in future releases. "
        "Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning,
        stacklevel=2,
    )

    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func, unique=unique,
                                   process_value=process_value, strip=strip,
                                   canonicalized=canonicalize)

    super(SgmlLinkExtractor, self).__init__(
        lx, allow=allow, deny=deny, allow_domains=allow_domains,
        deny_domains=deny_domains, restrict_xpaths=restrict_xpaths,
        restrict_css=restrict_css, canonicalize=canonicalize,
        deny_extensions=deny_extensions, restrict_text=restrict_text)
def __init__(self, allow=(), deny=()):
    """Initialize allow/deny attributes"""
    _re_type = type(re.compile('', 0))
    self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
    self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
def wrapper(self, response):
    if not hasattr(self, 'debug'):
        self.debug = False
    if self.debug:
        if not hasattr(self, 'failed_urls'):
            self.failed_urls = []
        if not hasattr(self, 'missing_fields'):
            self.missing_fields = {}
        self.crawler.stats.set_value('DEBUG MODE', 'TRUE')

    result = func(self, response)
    for result in arg_to_iter(func(self, response)):
        if isinstance(result, Item):
            exception = DropItem
        else:
            exception = CloseSpider

        if response.status >= 400 or any(error in response.body for error in errorstrings):
            if self.debug:
                self.crawler.stats.inc_value('DEBUG: failed_urls_count')
                self.failed_urls.append(response.url)
                self.crawler.stats.set_value('DEBUG: failed_urls', self.failed_urls)
            if exception == DropItem:
                self.crawler.stats.inc_value('items_dropped_count')
            if response.status >= 400:
                raise exception('Status Code Error: %s\nURL: %s'
                                % (response.status, response.url))
            else:
                errors = [error for error in errorstrings if error in response.body]
                raise exception('Response Body Error: %s\nURL: %s'
                                % (', '.join(errors), response.url))
            yield None

        if isinstance(result, Item):
            if self.debug:
                job_misses_a_required_field = False
                for field in arg_to_iter(fields_to_check):
                    if not result.get(field):
                        if field not in self.missing_fields:
                            self.missing_fields[field] = []
                        self.missing_fields[field].append(response.url)
                        job_misses_a_required_field = True
                for key in self.missing_fields.keys():
                    self.crawler.stats.set_value('DEBUG: missing_%s_field' % key,
                                                 self.missing_fields[key])
                if job_misses_a_required_field:
                    self.crawler.stats.inc_value('DEBUG: jobs_missing_required_field_count')
            else:
                if not result.get('referencenumber'):
                    self.crawler.stats.inc_value('items_dropped_count')
                    raise MissingJobField('referencenumber', response.url)
        yield result
def test_arg_to_iter(self):
    assert hasattr(arg_to_iter(None), '__iter__')
    assert hasattr(arg_to_iter(100), '__iter__')
    assert hasattr(arg_to_iter('lala'), '__iter__')
    assert hasattr(arg_to_iter([1, 2, 3]), '__iter__')
    assert hasattr(arg_to_iter(l for l in 'abcd'), '__iter__')

    self.assertEqual(list(arg_to_iter(None)), [])
    self.assertEqual(list(arg_to_iter('lala')), ['lala'])
    self.assertEqual(list(arg_to_iter(100)), [100])
    self.assertEqual(list(arg_to_iter(l for l in 'abc')), ['a', 'b', 'c'])
    self.assertEqual(list(arg_to_iter([1, 2, 3])), [1, 2, 3])
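The assertions above pin down the contract relied on throughout these snippets: None becomes an empty iterable, strings and other scalars are wrapped in a single-element list, and lists or generators pass through untouched. A minimal sketch of a helper with that contract (assuming, as Scrapy's scrapy.utils.misc.arg_to_iter does, that dicts and strings count as single values) might look like this:

def arg_to_iter(arg):
    # Sketch only -- the real implementation lives in scrapy.utils.misc.
    if arg is None:
        return []                 # None -> empty iterable
    if hasattr(arg, '__iter__') and not isinstance(arg, (dict, str, bytes)):
        return arg                # lists, tuples, generators pass through
    return [arg]                  # scalars (ints, strings, ...) get wrapped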
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area', 'script', 'link'), attrs=('href', 'src'), canonicalize=True,
             unique=True, process_value=None, deny_extensions=None):
    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    lx = SayParserLinkExtractor(tag=tag_func, attr=attr_func,
                                unique=unique, process=process_value)

    super(SayLinkExtractor, self).__init__(lx, allow, deny, allow_domains, deny_domains,
                                           restrict_xpaths, canonicalize, deny_extensions)
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
             restrict_xpaths=(), tags=('a', 'area'),
             attrs=('href',),  # note the trailing comma: ('href') would be a plain string, not a tuple
             canonicalize=True, unique=True, process_value=None):
    self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
    self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
    self.allow_domains = set(arg_to_iter(allow_domains))
    self.deny_domains = set(arg_to_iter(deny_domains))
    self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
    self.canonicalize = canonicalize
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func, unique=unique,
                                   process_value=process_value)
def __call__(self, value, loader_context=None):
    values = arg_to_iter(value)
    if loader_context:
        context = MergeDict(loader_context, self.default_loader_context)
    else:
        context = self.default_loader_context
    wrapped_funcs = [wrap_loader_context(f, context) for f in self.functions]
    for func in wrapped_funcs:
        next_values = []
        for v in values:
            next_values += arg_to_iter(func(v))
        values = next_values
    return values
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
             process_value=None, deny_extensions=None, restrict_css=()):
    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    lx = LxmlParserLinkExtractor(tag=tag_func, attr=attr_func,
                                 unique=unique, process=process_value)
    self.crawledPagesPerSite = {}
    self.maximumPagesPerSite = 10000  # maximum number of pages to crawl per site

    super(CustomLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
                                              allow_domains=allow_domains, deny_domains=deny_domains,
                                              restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
                                              canonicalize=canonicalize, deny_extensions=deny_extensions)
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
             process_value=None, deny_extensions=None):
    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
                               unique=unique, process_value=process_value)

    super(SgmlLinkExtractor, self).__init__(lx, allow, deny, allow_domains, deny_domains,
                                            restrict_xpaths, canonicalize, deny_extensions)

    # FIXME: was added to fix a RegexLinkExtractor testcase
    self.base_url = None
def send(self, to, subject, body, cc=None, attachs=(), mimetype='text/plain',
         charset=None, _callback=None):
    if attachs:
        msg = MIMEMultipart()
    else:
        msg = MIMENonMultipart(*mimetype.split('/', 1))

    to = list(arg_to_iter(to))
    cc = list(arg_to_iter(cc))

    msg['From'] = self.mailfrom
    msg['To'] = COMMASPACE.join(to)
    msg['Date'] = formatdate(localtime=True)
    msg['Subject'] = subject
    rcpts = to[:]
    if cc:
        rcpts.extend(cc)
        msg['Cc'] = COMMASPACE.join(cc)

    if charset:
        msg.set_charset(charset)

    if attachs:
        msg.attach(MIMEText(body, 'plain', charset or 'us-ascii'))
        for attach_name, mimetype, f in attachs:
            part = MIMEBase(*mimetype.split('/'))
            part.set_payload(f.read())
            Encoders.encode_base64(part)
            part.add_header('Content-Disposition', 'attachment; filename="%s"' % attach_name)
            msg.attach(part)
    else:
        msg.set_payload(body)

    if _callback:
        _callback(to=to, subject=subject, body=body, cc=cc, attach=attachs, msg=msg)

    if self.debug:
        logger.debug('Debug mail sent OK: To=%(mailto)s Cc=%(mailcc)s '
                     'Subject="%(mailsubject)s" Attachs=%(mailattachs)d',
                     {'mailto': to, 'mailcc': cc, 'mailsubject': subject,
                      'mailattachs': len(attachs)})
        return

    dfd = self._sendmail(rcpts, msg.as_string().encode(charset or 'utf-8'))
    dfd.addCallbacks(self._sent_ok, self._sent_failed,
                     callbackArgs=[to, cc, subject, len(attachs)],
                     errbackArgs=[to, cc, subject, len(attachs)])
    reactor.addSystemEventTrigger('before', 'shutdown', lambda: dfd)
    return dfd
def _do_extract_items_from(self, htmlpage, extractor, response=None):
    # Try to predict template to use
    template_cluster, pref_template_id = self._cluster_page(htmlpage)
    extracted, template = extractor.extract(htmlpage, pref_template_id)
    extracted = extracted or []
    link_regions = []
    for ddict in extracted:
        link_regions.extend(arg_to_iter(ddict.pop("_links", [])))
    descriptor = None
    unprocessed = False
    if template is not None and hasattr(template, 'descriptor'):
        descriptor = template.descriptor()
        if hasattr(descriptor, 'name'):
            item_cls_name = descriptor.name
        elif hasattr(descriptor, 'get'):
            item_cls_name = descriptor.get('name', descriptor.get('display_name'))
        else:
            item_cls_name = ''
    else:
        unprocessed = True
        try:
            descriptor = self.schema_descriptors[template.id]
            item_cls_name = self.template_scrapes[template.id]
        except (AttributeError, KeyError):
            try:
                descriptor = sorted(self.schema_descriptors.items())[0][1]
                item_cls_name = sorted(self.template_scrapes.items())[0][1]
            except IndexError:
                descriptor, item_cls_name = None, None
    item_cls = self.item_classes.get(item_cls_name)
    items = []
    for processed_attributes in extracted:
        if processed_attributes.get('_type') in self.item_classes:
            _type = processed_attributes['_type']
            item = self.item_classes[_type](processed_attributes)
            item['_type'] = item.display_name()
        elif unprocessed:
            item = self._process_attributes(processed_attributes, descriptor, htmlpage)
            if item_cls:
                item = item_cls(item)
        elif item_cls:
            item = item_cls(processed_attributes)
        else:
            item = dict(processed_attributes)
        item[u'url'] = htmlpage.url
        item[u'_template'] = str(template.id)
        item.setdefault('_type', item_cls_name)
        if not isinstance(item, SlybotItem):
            default_meta = {'type': 'text', 'required': False, 'vary': False}
            item_cls = SlybotItem.create_iblitem_class(
                {'fields': {k: default_meta for k in item}}
            )
            item = item_cls(**item)
        if self.clustering:
            item['_template_cluster'] = template_cluster
        items.append(item)
    return items, link_regions
def _compose(values, wrapped_funcs):
    for func in wrapped_funcs:
        next_values = []
        for v in values:
            next_values += arg_to_iter(func(v))
        values = next_values
    return values
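As a usage note for _compose above: because every intermediate result goes through arg_to_iter, a step may return a single value, a list of values, or None, and the next step still receives a flat list. A small illustration with made-up helpers (split_words and drop_empty are hypothetical, not from the original code):

def split_words(value):
    return value.split()        # one input value can expand into several

def drop_empty(value):
    return value or None        # returning None drops the value (arg_to_iter(None) == [])

# _compose(["  hello world ", ""], [split_words, drop_empty])
# -> ["hello", "world"]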
def _get_spider_requests(*args):
    """Collect requests and spiders from the given arguments. Returns a dict of
    spider -> list of requests
    """
    spider_requests = defaultdict(list)
    for arg in args:
        if isinstance(arg, tuple):
            request, spider = arg
            spider_requests[spider] = request
        elif isinstance(arg, Request):
            spider = spiders.fromurl(arg.url) or BaseSpider('default')
            if spider:
                spider_requests[spider] += [arg]
            else:
                log.msg('Could not find spider for request: %s' % arg, log.ERROR)
        elif isinstance(arg, BaseSpider):
            spider_requests[arg] += arg.start_requests()
        elif is_url(arg):
            spider = spiders.fromurl(arg) or BaseSpider('default')
            if spider:
                for req in arg_to_iter(spider.make_requests_from_url(arg)):
                    spider_requests[spider] += [req]
            else:
                log.msg('Could not find spider for url: %s' % arg, log.ERROR)
        elif isinstance(arg, basestring):
            spider = spiders.fromdomain(arg)
            if spider:
                spider_requests[spider] += spider.start_requests()
            else:
                log.msg('Could not find spider for domain: %s' % arg, log.ERROR)
        else:
            raise TypeError("Unsupported argument: %r" % arg)
    return spider_requests
def process_item(self, item, spider):
    obj_map = self.spider_objs[spider]
    rel_fields = item._model_rel_fields

    # If there are no related fields to resolve just save and return.
    if not rel_fields:
        return self.save_item(None, item, spider)

    # Build a list of outstanding requests.
    req_ids = sum([arg_to_iter(item.get(f.name, [])) for f in rel_fields], [])
    req_ids = [u for u in req_ids
               if (u not in obj_map or isinstance(obj_map[u], Deferred))]

    # If there are no requests to perform, fill, save and return.
    if not req_ids:
        return self.save_item(None, item, spider)

    # Defer?
    dlist = []
    for id in req_ids:
        if id not in obj_map:
            obj_map[id] = Deferred()
        dfd = obj_map[id]
        assert dfd is not None
        dlist.append(dfd)
    return DeferredList(dlist, consumeErrors=1).addCallback(self.save_item, item, spider)
def _request_callback(self, spider, original_callback, response):
    """
    Close the page (lose the reference to it so it is garbage collected) when
    the callback returns.

    The original callback may prevent page closing by setting the
    should_close_webpage attribute in responses. This is useful for example if
    the page is stored somewhere else (e.g. request meta) to be used later.
    The page then needs to be closed manually at some point by calling its
    close_page() function, which is created here.
    """
    if isinstance(original_callback, basestring):
        original_callback = getattr(spider, original_callback)
    webpage = response.webpage
    response.should_close_webpage = True
    try:
        returnValue(arg_to_iter((yield maybeDeferred(original_callback, response))))
    finally:
        # FIXME: sometimes this section is reached before the wrapped
        # callback finishes, when it returns a Deferred.
        if response.should_close_webpage:
            self._close_page(webpage)
        else:
            webpage.close_page = partial(self._close_page, webpage)
            webpage.close_page.__doc__ = ("Lose the reference to the "
                                          "webpage object and allow it "
                                          "to be garbage collected.")
def __call__(self, value, loader_context=None):
    value = arg_to_iter(value)[0]
    if 'http://' not in value:
        #import pdb; pdb.set_trace()
        value = urljoin_rfc(get_base_url(loader_context['response']), value)
    return value
def __call__(self, values):
    out = []
    for value in [x for x in arg_to_iter(values) if isinstance(x, basestring)]:
        for m in self.latin_html_map:
            value = value.replace(m, self.latin_html_map[m])
        out.append(value)
    return out
def __call__(self, values):
    new_values = []
    for v in arg_to_iter(values):
        if isinstance(v, (str, unicode)):
            v = remove_entities(v).strip()
        new_values.append(int(v))
    return new_values
def __call__(self, value, loader_context=None):
    #value = TakeFirst(value)
    #value = value.Clean()
    values = arg_to_iter(value)
    for i, value in enumerate(values):
        if value:
            return value.replace(',', '.')
def __call__(self, values):
    # first we convert the strings to Element objects
    list_of_elements = []
    for x in arg_to_iter(values):
        if not x.isspace():
            try:
                list_of_elements.append(html.fromstring(x))
            except:
                # invalid html (eg: comment tags)
                pass

    # then, we go ahead and remove every higher-level 'bad' tag from the list
    processed_list = [x for x in list_of_elements if x.tag not in self.bad]

    # then we check each element for sub-elements, and
    # remove the 'bad' tags under each. this is because
    # sometimes the strings we get are full-on trees in their
    # own right, so we need to go and pick off the children
    # one by one.
    for elem in processed_list:
        # if the top level element is bad
        # we just remove it and all the children
        # should follow
        if elem.tag in self.bad:
            elem.getparent().remove(elem)
        else:
            # otherwise, we use the xpath
            # method to check if there are any
            # bad children using xpath.
            self.remove_children(elem, self.bad)

    return [html.tostring(x, encoding='unicode') for x in processed_list]
def remove_extra_words(self, value, remove_words=[]):
    """Removing all the extra words"""
    for temp in remove_words:
        temp = temp.lower()
        value = [v.lower().replace(temp, '') for v in arg_to_iter(value)]
    return value
def __call__(self, value, loader_context=None):
    values = arg_to_iter(value)
    if loader_context:
        context = MergeDict(loader_context, self.default_loader_context)
    else:
        context = self.default_loader_context
    wrapped_funcs = (wrap_loader_context(f, context) for f in self.functions)
    return self._compose(values, wrapped_funcs)
def _get_jmes_values(self, jmes_paths):
    if self.json_obj is None:
        raise RuntimeError("no JSON object found")
    jmes_paths = arg_to_iter(jmes_paths)
    return flatten(
        jmespath.search(jmes_path, self.json_obj)
        for jmes_path in jmes_paths)
def test_url_has_any_extension(self):
    deny_extensions = {'.' + e for e in arg_to_iter(IGNORED_EXTENSIONS)}
    self.assertTrue(url_has_any_extension("http://www.example.com/archive.tar.gz", deny_extensions))
    self.assertTrue(url_has_any_extension("http://www.example.com/page.doc", deny_extensions))
    self.assertTrue(url_has_any_extension("http://www.example.com/page.pdf", deny_extensions))
    self.assertFalse(url_has_any_extension("http://www.example.com/page.htm", deny_extensions))
    self.assertFalse(url_has_any_extension("http://www.example.com/", deny_extensions))
    self.assertFalse(url_has_any_extension("http://www.example.com/page.doc.html", deny_extensions))
def map_final_data(self, value):
    industries_codes = []
    for x in arg_to_iter(value):
        if x.lower() in self.map.keys():
            industries_codes.extend([self.map[x.lower()]])
        else:
            industries_codes.extend([x])
    return industries_codes
def __call__(self, values):
    rvals = []
    for v in arg_to_iter(values):
        try:
            rvals.append(datetime.strptime(v.strip(), self.informat).strftime(self.outformat))
        except ValueError:
            pass
    return rvals
def _get_cssvalues(self, csss, **kw):
    self._check_selector_method()
    csss = arg_to_iter(csss)
    ret = self._extract_hier_csss(self.selector, csss, **kw)
    if ret is None or not flatten(ret):
        return None
    else:
        return ret
def __call__(self, value, loader_context=None):
    if not value:
        value.append(" ")
    values = arg_to_iter(value)
    if loader_context:
        context = MergeDict(loader_context, self.default_loader_context)
    else:
        context = self.default_loader_context
    wrapped_funcs = [
        wrap_loader_context(f, context) for f in self.functions
    ]
    for func in wrapped_funcs:
        next_values = []
        for v in values:
            next_values += arg_to_iter(func(v))
        values = next_values
    return values
def process_item(self, item, spider):
    if 'meta' not in spider.name:
        return item
    info = self.spiderinfo
    requests = arg_to_iter(self.get_media_requests(item, info))
    dlist = [self._process_request(r, info) for r in requests]
    dfd = DeferredList(dlist, consumeErrors=1)
    return dfd.addCallback(self.item_completed, item, info)
def __init__(self, settings):
    if not settings.getbool('PROXYMESH_ENABLED', True):
        raise NotConfigured
    self.proxies = itertools.cycle(
        arg_to_iter(
            settings.get('PROXYMESH_URL', 'http://us-il.proxymesh.com:31280')))
    self.timeout = settings.getint('PROXYMESH_TIMEOUT', 0)
def assertReMatch(self, regex, actual, msg=None):  # {{{
    actuals = arg_to_iter(actual)
    for actual in actuals:
        match = re.search(regex, actual)
        errmsg = "%s not match %s" % (actual, regex)
        if msg:
            errmsg = "%s\n%s" % (msg, errmsg)
        self.assertTrue(match, errmsg)
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True,
             process_value=None, deny_extensions=None, restrict_css=(), strip=True,
             restrict_text=()):
    warnings.warn(
        "SgmlLinkExtractor is deprecated and will be removed in future releases. "
        "Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning,
        stacklevel=2,
    )

    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    with warnings.catch_warnings():
        warnings.simplefilter('ignore', ScrapyDeprecationWarning)
        lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func, unique=unique,
                                   process_value=process_value, strip=strip,
                                   canonicalized=canonicalize)

    super(SgmlLinkExtractor, self).__init__(
        lx, allow=allow, deny=deny, allow_domains=allow_domains,
        deny_domains=deny_domains, restrict_xpaths=restrict_xpaths,
        restrict_css=restrict_css, canonicalize=canonicalize,
        deny_extensions=deny_extensions, restrict_text=restrict_text)
def process_item(self, item, spider):
    info = self.spiderinfo
    requests = arg_to_iter(self.get_media_requests(item, info))
    dlist = [self._process_request(r, info) for r in requests]
    dfd = DeferredList(dlist, consumeErrors=1)
    # https://github.com/scrapy/scrapy/issues/4228
    item_copied = deepcopy(item)
def __init__(self, url, fields, lang_priorities=None):
    self.url = url
    self.fields = fields
    self.lang_priorities = {
        lang: prio for prio, lang in enumerate(arg_to_iter(lang_priorities))
    }
    self.labels = {}
    self.logger = LOGGER
def process_item(self, item, spider):
    # add names
    self.file_paths.update(item["file_paths"])

    info = self.spiderinfo
    requests = arg_to_iter(self.get_media_requests(item, info))
    dlist = [self._process_request(r, info) for r in requests]
    dfd = DeferredList(dlist, consumeErrors=1)
    return dfd.addCallback(self.item_completed, item, info)
def _get_jsonpathvalues(self, jsonpaths, **kw):
    self._check_selector_method()
    jsonpaths = arg_to_iter(jsonpaths)
    ret = self._extract_hier_jsonpaths(self.selector.json, jsonpaths, **kw)
    if not flatten(ret):
        return None
    else:
        return ret
def __call__(self, values):
    output = []
    for val in arg_to_iter(values):
        val = self.remove_empty_spaces(val)
        val = self.remove_special_chars(val)
        output.extend(val)
    return output
def __call__(self, value, loader_context=None):
    values = list(arg_to_iter(value))  # materialize so the sequence can be reversed in place
    values.reverse()
    secs = 0
    for i, v in enumerate(values):
        secs += int(v.strip(':')) * (60 ** i)
    return secs
def __init__(self, link_extractor, allow, deny, allow_domains, deny_domains,
             restrict_xpaths, canonicalize, deny_extensions, restrict_css):
    self.link_extractor = link_extractor

    self.allow_res = [
        x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)
    ]
    self.deny_res = [
        x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)
    ]

    self.allow_domains = set(arg_to_iter(allow_domains))
    self.deny_domains = set(arg_to_iter(deny_domains))

    self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
    self.restrict_xpaths += tuple(
        map(self._csstranslator.css_to_xpath, arg_to_iter(restrict_css)))

    self.canonicalize = canonicalize
    if deny_extensions is None:
        deny_extensions = IGNORED_EXTENSIONS
    self.deny_extensions = {'.' + e for e in arg_to_iter(deny_extensions)}
def process_item(self, item, spider):
    """
    custom process_item func, so it will manage the Request result.
    """
    info = self.spiderinfo
    requests = arg_to_iter(self.get_media_requests(item, info))
    dlist = [self._process_request(r, info) for r in requests]
    dfd = DeferredList(dlist, consumeErrors=1)
    return dfd.addCallback(self.item_completed, item, info)
def iterate_spider_output(result):
    if inspect.isasyncgen(result):
        return result
    elif inspect.iscoroutine(result):
        d = deferred_from_coro(result)
        d.addCallback(iterate_spider_output)
        return d
    else:
        return arg_to_iter(deferred_from_coro(result))
def process_item(self, item, spider):
    if item.get('image_urls'):
        info = self.spiderinfo
        requests = arg_to_iter(self.get_media_requests(item, info))
        dlist = [self._process_request(r, info) for r in requests]
        dfd = DeferredList(dlist, consumeErrors=True)
        return dfd.addCallback(self.item_completed, item, info)
    else:
        return item
def start_requests(self):
    print('self.queries: ', self.queries)
    for query in arg_to_iter(self.queries):
        url = self.make_google_search_request(COUNTRIES[self.region], query)
        print('url: ', url)
        yield scrapy.Request(url=url, meta={'query': query}, callback=self.parse)
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=True, unique=True,
             process_value=None, deny_extensions=None, restrict_css=()):
    warnings.warn(
        "SgmlLinkExtractor is deprecated and will be removed in future releases. "
        "Please use scrapy.linkextractors.LinkExtractor",
        ScrapyDeprecationWarning,
        stacklevel=2,
    )
    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    with warnings.catch_warnings(record=True):
        lx = BaseSgmlLinkExtractor(tag=tag_func, attr=attr_func,
                                   unique=unique, process_value=process_value)

    super(SgmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
                                            allow_domains=allow_domains, deny_domains=deny_domains,
                                            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
                                            canonicalize=canonicalize, deny_extensions=deny_extensions)

    # FIXME: was added to fix a RegexLinkExtractor testcase
    self.base_url = None
def __call__(self, values):
    new_values = []
    for v in arg_to_iter(values):
        if isinstance(v, (str, unicode)):
            v = remove_entities(v).strip()
            v = (v.lower() == 'true')
        else:
            v = bool(v)
        new_values.append(v)
    return new_values
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(),
             restrict_xpaths=(), tags=('a', 'area'),
             attrs=('href',),  # note the trailing comma: ('href') would be a plain string, not a tuple
             canonicalize=True, unique=True, process_value=None, deny_extensions=None):
    self.allow_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(allow)]
    self.deny_res = [x if isinstance(x, _re_type) else re.compile(x) for x in arg_to_iter(deny)]
    self.allow_domains = set(arg_to_iter(allow_domains))
    self.deny_domains = set(arg_to_iter(deny_domains))
    self.restrict_xpaths = tuple(arg_to_iter(restrict_xpaths))
    self.canonicalize = canonicalize
    if deny_extensions is None:
        deny_extensions = IGNORED_EXTENSIONS
    self.deny_extensions = set(['.' + e for e in deny_extensions])
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    BaseSgmlLinkExtractor.__init__(self, tag=tag_func, attr=attr_func, unique=unique,
                                   process_value=process_value)
def add_values(self, values, keys=None):
    if not keys:
        keys = self.default_keys
    elif isinstance(keys, basestring):
        keys = self.keys[keys]
    for k, v in zip(keys, values):
        if k:
            for k in arg_to_iter(k):
                self.add_value(k, v)
def _extract(self, response, option):
    """
    1. extract links/items
    2. filter desired output
    """
    conds = option.get('conds', [])
    ref = option.get('ref')
    if not ref:
        extracts = self.__extract(response, option)
        _ITEM_REFS[response] = list(arg_to_iter(extracts))
    return self.__filter(_ITEM_REFS[response], conds)
def __call__(self, values):
    out_values = []
    values = arg_to_iter(values)
    while values:
        val = values.pop(0)
        if values and val == 'R' and values[0] == 'B':
            values.pop(0)
            out_values.append('R&B')
        elif val:
            out_values.append(val)
    return out_values
def _deferred_field(self, field, item, spider):
    deferreds = [
        self._deferred_value(value, spider)
        for value in arg_to_iter(item.get(field))
    ]
    if not deferreds:
        item[field] = None
        return defer_result(item)

    deferred = DeferredList(deferreds, consumeErrors=True)
    deferred.addBoth(self._add_value, field, item)
    return deferred
def _extract_hier_csss(self, node, csss, **kw):
    csss = arg_to_iter(csss)
    if len(csss) > 1:
        child_csss = csss[1:]
        return [
            self._extract_hier_csss(Selector(text=child_node_html), child_csss, **kw)
            for child_node_html in node.css(csss[0])
        ]
    else:
        return filter_regex(kw.get('regex'), node.css(csss[0]))
def iterate_spider_output(result):
    if collect_asyncgen and hasattr(inspect, 'isasyncgen') and inspect.isasyncgen(result):
        d = deferred_from_coro(collect_asyncgen(result))
        d.addCallback(iterate_spider_output)
        return d
    elif inspect.iscoroutine(result):
        d = deferred_from_coro(result)
        d.addCallback(iterate_spider_output)
        return d
    return arg_to_iter(result)
def process_item(self, item, spider):
    """Copy a limited number of image URLs to be downloaded from source to target."""
    # adding target field would result in error; return item as-is
    if hasattr(item, "fields") and self.target_field not in item.fields:
        return item

    if self.limit is None or self.limit < 0:
        # copy through everything
        item[self.target_field] = list(arg_to_iter(item.get(self.source_field)))
        return item

    if not self.limit:
        # limit is zero
        item[self.target_field] = []
        return item

    # actual limit
    item[self.target_field] = list(islice(arg_to_iter(item.get(self.source_field)), self.limit))
    return item
def process_item(self, item, spider):
    # ensure
    if abs(spider.count) + 1 > spider.max:
        spider.close_down = True
    info = self.spiderinfo
    requests = arg_to_iter(self.get_media_requests(item, info))
    dlist = [self._process_request(r, info) for r in requests]
    dfd = DeferredList(dlist, consumeErrors=1)
    # only update when item is passed to pipeline, ensuring count consistency
    spider.count -= 1
    return dfd.addCallback(self.item_completed, item, info)
def sticky_passthrough(spider, response, func, sticky_args, *args, **kwargs):
    meta_keys = set(list(sticky_args) + list(getattr(spider, 'sticky_meta', [])))
    sticky = {k: v for k, v in response.meta.items() if k in meta_keys}
    f = func(spider, response, *args, **kwargs)
    for r in arg_to_iter(f):
        if sticky and isinstance(r, Request):
            r.meta.update({k: v for k, v in sticky.items() if k not in r.meta.keys()})
        yield r
def __init__(self, allow=(), deny=(), allow_domains=(), deny_domains=(), restrict_xpaths=(),
             tags=('a', 'area'), attrs=('href',), canonicalize=False, unique=True,
             process_value=None, deny_extensions=None, restrict_css=(), strip=True):
    tags, attrs = set(arg_to_iter(tags)), set(arg_to_iter(attrs))
    tag_func = lambda x: x in tags
    attr_func = lambda x: x in attrs
    lx = LxmlParserLinkExtractor(
        tag=tag_func,
        attr=attr_func,
        unique=unique,
        process=process_value,
        strip=strip,
        canonicalized=canonicalize
    )

    super(LxmlLinkExtractor, self).__init__(lx, allow=allow, deny=deny,
                                            allow_domains=allow_domains, deny_domains=deny_domains,
                                            restrict_xpaths=restrict_xpaths, restrict_css=restrict_css,
                                            canonicalize=canonicalize, deny_extensions=deny_extensions)
def get_value(self, value, *processors, **kw):
    regex = kw.get('re', None)
    if regex:
        value = arg_to_iter(value)
        value = flatten(extract_regex(regex, x) for x in value)

    for proc in processors:
        if value is None:
            break
        proc = wrap_loader_context(proc, self.context)
        value = proc(value)
    return value
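For context, get_value above is the ItemLoader entry point that applies the optional re keyword (regex extraction via extract_regex) before running the positional processors in order. A usage sketch, assuming loader is an already-constructed Scrapy ItemLoader and TakeFirst is the stock processor:

# extract with the regex first, then apply the processors left to right
loader.get_value('name: foo', TakeFirst(), str.upper, re='name: (.+)')  # -> 'FOO'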