def __init__(self, crawler):
    """Wire this extension up to HubStorage, or opt out entirely.

    Raises NotConfigured when the hsref singleton is disabled, which
    tells Scrapy to skip loading the extension.
    """
    # Guard clause: without an enabled hsref there is nothing to write to.
    if not hsref.enabled:
        raise NotConfigured
    self.crawler = crawler
    # Bind the job's item-write method once so later calls are direct.
    self._write_item = hsref.job.items.write
    self.exporter = PythonItemExporter()
    log.msg("HubStorage: writing items to %s" % hsref.job.items.url)
class HubstorageExtension(object):
    """Scrapy extension that streams scraped items into HubStorage."""

    def __init__(self, crawler):
        """Refuse to load unless hsref is enabled; cache the item writer."""
        if not hsref.enabled:
            raise NotConfigured
        self.crawler = crawler
        # Bind the write method once; item_scraped calls it per item.
        self._write_item = hsref.job.items.write
        self.exporter = PythonItemExporter()
        log.msg("HubStorage: writing items to %s" % hsref.job.items.url)

    @classmethod
    def from_crawler(cls, crawler):
        """Standard Scrapy entry point: build the extension and hook signals."""
        ext = cls(crawler)
        crawler.signals.connect(ext.item_scraped, signals.item_scraped)
        crawler.signals.connect(ext.spider_closed, signals.spider_closed)
        return ext

    def item_scraped(self, item, spider):
        """Export a scraped item and write it to the HubStorage job."""
        # Remember the original class name before exporting flattens it.
        item_type = type(item).__name__
        exported = self.exporter.export_item(item)
        # Tag the record with its source type unless the exporter set one.
        exported.setdefault("_type", item_type)
        self._write_item(exported)

    def spider_closed(self, spider, reason):
        """Flush pending items and record the close reason in job metadata."""
        # flush item writer
        hsref.job.items.flush()
        # update outcome
        hsref.job.metadata.update(close_reason=reason)
        hsref.job.metadata.save()
def _get_exporter(self, **kwargs):
    """Return a new PythonItemExporter configured with *kwargs*.

    Factory hook kept separate so subclasses can swap the exporter.
    """
    exporter_cls = PythonItemExporter
    return exporter_cls(**kwargs)