def post_process(self, output):
    """Verify that every item in ``output`` has all required fields set.

    Each element of ``output`` for which ``is_item`` returns true is wrapped
    in an ``ItemAdapter`` and checked against the field names listed in
    ``self.args``. Elements that are not items are skipped.

    Args:
        output: iterable of objects to check.

    Raises:
        ContractFail: if an item lacks one of the fields in ``self.args`` or
            holds ``None`` for it. The message lists the offending fields
            and the item itself.
    """
    for x in output:
        if not is_item(x):
            continue
        # Wrap the item once instead of once (or twice) per field.
        adapter = ItemAdapter(x)
        # Identity comparison: a field value with a custom ``__eq__`` must
        # not be able to pass itself off as ``None``.
        missing = [
            arg
            for arg in self.args
            if arg not in adapter or adapter[arg] is None
        ]
        if missing:
            missing_str = ", ".join(missing)
            raise ContractFail(
                "Missing or None fields: %s. Item is %s" % (missing_str, x)
            )
def _process_spidermw_output(
    self, output: Any, request: Request, response: Response, spider: Spider
) -> Optional[Deferred]:
    """Dispatch a single value produced by a spider callback.

    * A ``Request`` is handed to ``self.crawler.engine.crawl``.
    * An item (as decided by ``is_item``) bumps ``self.slot.itemproc_size``,
      is passed to ``self.itemproc.process_item`` and gets
      ``self._itemproc_finished`` attached to the returned object through
      ``addBoth``.
    * ``None`` is silently ignored.
    * Anything else is reported through ``logger.error``.

    Returns:
        The object returned by ``self.itemproc.process_item`` when
        ``output`` is an item, otherwise ``None``.
    """
    assert self.slot is not None  # typing

    if isinstance(output, Request):
        self.crawler.engine.crawl(request=output)
        return None

    if is_item(output):
        self.slot.itemproc_size += 1
        deferred = self.itemproc.process_item(output, spider)
        deferred.addBoth(self._itemproc_finished, output, response, spider)
        return deferred

    if output is not None:
        # Unsupported type: report it together with the originating request.
        log_args = {'request': request, 'typename': type(output).__name__}
        logger.error(
            'Spider must return request, item, or None, got %(typename)r in %(request)s',
            log_args,
            extra={'spider': spider},
        )
    return None
def _process_spidermw_output(self, output, request, response, spider):
    """Dispatch a single value produced by a spider callback.

    Requests go to the engine, items go through ``self.itemproc``, ``None``
    is ignored and any other type is logged as an error.

    Returns:
        The object returned by ``self.itemproc.process_item`` when
        ``output`` is an item, otherwise ``None``.
    """
    if isinstance(output, Request):
        # Hand the request over to the engine.
        self.crawler.engine.crawl(request=output, spider=spider)
        return None

    if is_item(output):
        # One more item is now being processed by the slot.
        self.slot.itemproc_size += 1
        # Run the item through self.itemproc (presumably the item pipeline
        # manager, per the original author's note).
        deferred = self.itemproc.process_item(output, spider)
        # Append self._itemproc_finished to the item's callback chain; it is
        # registered with addBoth, i.e. for both outcomes.
        deferred.addBoth(self._itemproc_finished, output, response, spider)
        return deferred

    if output is not None:
        # Unsupported type: report it together with the originating request.
        log_args = {'request': request, 'typename': type(output).__name__}
        logger.error(
            'Spider must return request, item, or None, got %(typename)r in %(request)s',
            log_args,
            extra={'spider': spider},
        )
    return None
def item_scraped(self, item, spider):
    """Export a scraped item and write it out, tagged with its type name.

    Values rejected by ``is_item`` are logged as an error and dropped.
    Accepted items are converted with ``self.exporter.export_item``; the
    class name of the original item is stored under ``"_type"`` unless the
    exported mapping already has that key, and the result is passed to
    ``self._write_item``.

    Args:
        item: the scraped object.
        spider: unused here; part of the handler signature.
    """
    if not is_item(item):
        # Let the logger do the interpolation. ``"..." % item`` raises
        # TypeError when ``item`` is a tuple of any length other than one,
        # and unwraps a 1-tuple, which are plausible inputs on this branch.
        # NOTE(review): assumes self.logger follows the stdlib logging call
        # convention (message, *args) -- confirm.
        self.logger.error("Wrong item type: %s", item)
        return

    # Capture the class name before ``item`` is rebound to the exported form.
    type_ = type(item).__name__
    item = self.exporter.export_item(item)
    item.setdefault("_type", type_)
    self._write_item(item)
def post_process(self, output):
    """Verify that every item in ``output`` contains all required fields.

    Each element of ``output`` for which ``is_item`` returns true is wrapped
    in an ``ItemAdapter`` and checked for the presence of every field name
    in ``self.args``. Elements that are not items are skipped.

    Args:
        output: iterable of objects to check.

    Raises:
        ContractFail: if an item lacks one or more of the fields in
            ``self.args``; the message lists the missing field names.
    """
    for x in output:
        if not is_item(x):
            continue
        # Wrap the item once instead of once per required field.
        adapter = ItemAdapter(x)
        missing = [arg for arg in self.args if arg not in adapter]
        if missing:
            missing_fields = ", ".join(missing)
            raise ContractFail(f"Missing fields: {missing_fields}")
def run_callback(self, response, callback, cb_kwargs=None):
    """Invoke ``callback`` on ``response`` and sort what it produces.

    The callback result is normalised with ``iterate_spider_output``; every
    element accepted by ``is_item`` is collected as an item, every
    ``Request`` instance as a request, and anything else is discarded.

    Args:
        response: passed to ``callback`` as its first argument.
        callback: callable to invoke.
        cb_kwargs: optional keyword arguments for ``callback``; a falsy
            value means no extra arguments.

    Returns:
        A ``(items, requests)`` tuple of two lists.
    """
    if not cb_kwargs:
        cb_kwargs = {}

    items = []
    requests = []
    results = iterate_spider_output(callback(response, **cb_kwargs))
    for result in results:
        if is_item(result):
            items.append(result)
            continue
        if isinstance(result, Request):
            requests.append(result)
    return items, requests
def _serialize_value(self, value):
    """Convert ``value`` into its exported representation.

    * ``_BaseItem`` instances are exported with ``self.export_item``.
    * Other items (per ``is_item``) become a ``dict`` built from
      ``self._serialize_item``.
    * List-like values (per ``is_listlike``) are serialized element by
      element into a list.
    * ``str`` / ``bytes`` values are converted with ``to_bytes`` when
      ``self.binary`` is true and ``to_unicode`` otherwise, using
      ``self.encoding``.
    * Everything else is returned unchanged.
    """
    if isinstance(value, _BaseItem):
        return self.export_item(value)
    if is_item(value):
        return dict(self._serialize_item(value))
    if is_listlike(value):
        return [self._serialize_value(element) for element in value]

    if self.binary:
        encode = to_bytes
    else:
        encode = to_unicode

    if not isinstance(value, (str, bytes)):
        return value
    return encode(value, encoding=self.encoding)
def arg_to_iter(arg):
    """Return ``arg`` in a form that can be iterated over.

    ``None`` yields an empty list. A value is returned as-is only when it
    has ``__iter__``, is not an instance of ``_ITERABLE_SINGLE_VALUES`` and
    is not an item according to ``is_item``. Every other value is treated as
    a single element and returned wrapped in a one-element list.
    """
    if arg is None:
        return []

    # Anything matching one of these conditions counts as a single value,
    # even if it happens to be iterable.
    if (
        not hasattr(arg, '__iter__')
        or isinstance(arg, _ITERABLE_SINGLE_VALUES)
        or is_item(arg)
    ):
        return [arg]
    return arg
async def _process_spidermw_output(self, output, request, response, spider):
    """Handle a single value produced by a spider callback.

    An item (per ``is_item``) bumps ``self.slot.itemproc_size``, is awaited
    through ``self.itemproc.process_item`` and the result is then awaited
    through ``self._itemproc_finished``. ``None`` is ignored. Any other
    value is reported through ``logger.error``.
    """
    if is_item(output):
        self.slot.itemproc_size += 1
        processed = await self.itemproc.process_item(output, spider)
        await self._itemproc_finished(output, processed, response, spider)
        return

    if output is None:
        return

    # Unsupported type: report it together with the originating request.
    log_args = {'request': request, 'typename': type(output).__name__}
    logger.error(
        'Spider must return request, item, or None, got %(typename)r in %(request)s',
        log_args,
        extra={'spider': spider},
    )
def default(self, o):
    """Return a serializable stand-in for ``o``.

    Sets become lists; datetimes, dates and times are formatted with
    ``self.DATE_FORMAT`` / ``self.TIME_FORMAT``; ``Decimal`` and ``Deferred``
    objects become their ``str()``; items become the dict produced by
    ``ItemAdapter(o).asdict()``; ``Request`` and ``Response`` objects become
    a short ``<Type method-or-status url>`` string. Anything else is
    delegated to the parent class.
    """
    if isinstance(o, set):
        return list(o)

    # datetime.datetime must be tested before datetime.date because it is a
    # subclass of date.
    if isinstance(o, datetime.datetime):
        combined_format = "%s %s" % (self.DATE_FORMAT, self.TIME_FORMAT)
        return o.strftime(combined_format)
    if isinstance(o, datetime.date):
        return o.strftime(self.DATE_FORMAT)
    if isinstance(o, datetime.time):
        return o.strftime(self.TIME_FORMAT)

    if isinstance(o, decimal.Decimal):
        return str(o)
    if isinstance(o, defer.Deferred):
        return str(o)

    if is_item(o):
        return ItemAdapter(o).asdict()

    if isinstance(o, Request):
        return "<%s %s %s>" % (type(o).__name__, o.method, o.url)
    if isinstance(o, Response):
        return "<%s %s %s>" % (type(o).__name__, o.status, o.url)

    return super().default(o)
def _serialize_value(self, value, pre=None, field_filter=None): try: if isinstance(value, dict): return dict( self._serialize_dict(value, pre=pre, field_filter=field_filter)) elif isinstance(value, _BaseItem): return self.export_item(value, pre=pre, field_filter=field_filter) elif is_item(value): return dict(self._serialize_item(value)) value = super(TextDictKeyPythonItemExporter, self)._serialize_value(value) except UnicodeDecodeError as e: if self.ensure_base64 and isinstance(value, bytes): value = to_unicode(base64.b64encode(value)) else: raise e return value
def default(self, o):
    """Return a serializable stand-in for ``o``.

    Sets become lists; datetimes, dates and times are formatted with
    ``self.DATE_FORMAT`` / ``self.TIME_FORMAT``; ``Decimal`` and ``Deferred``
    objects become their ``str()``; items become the dict produced by
    ``ItemAdapter(o).asdict()``; ``Request`` and ``Response`` objects become
    a short ``<Type method-or-status url>`` string. Anything else is
    delegated to the parent class.
    """
    if isinstance(o, set):
        return list(o)

    # datetime.datetime must be tested before datetime.date because it is a
    # subclass of date.
    if isinstance(o, datetime.datetime):
        combined_format = f"{self.DATE_FORMAT} {self.TIME_FORMAT}"
        return o.strftime(combined_format)
    if isinstance(o, datetime.date):
        return o.strftime(self.DATE_FORMAT)
    if isinstance(o, datetime.time):
        return o.strftime(self.TIME_FORMAT)

    if isinstance(o, decimal.Decimal):
        return str(o)
    if isinstance(o, defer.Deferred):
        return str(o)

    if is_item(o):
        return ItemAdapter(o).asdict()

    if isinstance(o, Request):
        return f"<{type(o).__name__} {o.method} {o.url}>"
    if isinstance(o, Response):
        return f"<{type(o).__name__} {o.status} {o.url}>"

    return super().default(o)
def _process_spidermw_output(self, output, request, response, spider):
    """Dispatch a single value produced by a spider callback.

    * A ``Request`` is handed to ``self.crawler.engine.crawl``.
    * An item (as decided by ``is_item``) bumps ``self.slot.itemproc_size``,
      is passed to ``self.itemproc.process_item`` and gets
      ``self._itemproc_finished`` attached to the returned object through
      ``addBoth``.
    * ``None`` is silently ignored.
    * Anything else is reported through ``logger.error``.

    Returns:
        The object returned by ``self.itemproc.process_item`` when
        ``output`` is an item, otherwise ``None``.
    """
    if isinstance(output, Request):
        self.crawler.engine.crawl(request=output, spider=spider)
        return None

    if is_item(output):
        self.slot.itemproc_size += 1
        deferred = self.itemproc.process_item(output, spider)
        deferred.addBoth(self._itemproc_finished, output, response, spider)
        return deferred

    if output is not None:
        # Unsupported type: report it together with the originating request.
        log_args = {"request": request, "typename": type(output).__name__}
        logger.error(
            "Spider must return request, item, or None, got %(typename)r in %(request)s",
            log_args,
            extra={"spider": spider},
        )
    return None
def _is_relevant(self, value): return isinstance(value, self.relevant_classes) or is_item(value)