def process_item(self, item, spider):
    """Drop *item* if an identical version of it was already scraped.

    The version hash is built from the item's version (not vary)
    fields.  Items whose class declares no version fields pass
    through untouched.

    Raises:
        DropItem: when an item with the same version hash was scraped
            before; the message carries both the new and the original
            URL.
    """
    # Bug fix: also skip items whose ``version_fields`` is empty.
    # Hashing zero fields would give every item the same version and
    # drop everything after the first item as a false duplicate.
    if not hasattr(item, 'version_fields') or not item.version_fields:
        return item
    version = create_item_version(item)
    if version in self._itemversion_cache:
        old_url = self._itemversion_cache[version]
        raise DropItem("Duplicate product scraped at <%s>, first one was scraped at <%s>" % (item["url"], old_url))
    # First sighting of this version: remember where it came from.
    self._itemversion_cache[version] = item["url"]
    return item
def process_item(self, item, spider):
    """Detect duplicate scrapy items via the version (not vary)
    fields declared on the item class; pass non-versioned items
    through unchanged."""
    # Items with no declared (or an empty) version_fields cannot be
    # version-hashed — let them through.
    versioned = hasattr(item, 'version_fields') and item.version_fields
    if not versioned:
        return item
    cache = self._itemversion_cache
    version = create_item_version(item)
    if version in cache:
        raise DropItem(
            "Duplicate product scraped at <%s>, first one was scraped at <%s>"
            % (item["url"], cache[version]))
    cache[version] = item["url"]
    return item
def process_item(self, item, spider):
    """Detect duplicate scrapy items, using the version (not vary)
    fields of the item class looked up in the spider's class registry."""
    # Resolve the item class from the spider's registry keyed by the
    # item's '_type' marker.
    item_cls = spider.itemcls_info[item['_type']]['class']
    if not item_cls.version_fields:
        # Nothing to version on — item passes through.
        return item
    cache = self._itemversion_cache
    version = create_item_version(item_cls, item)
    if version not in cache:
        cache[version] = item["url"]
        return item
    raise DropItem(
        "Duplicate product scraped at <%s>, first one was scraped at <%s>"
        % (item["url"], cache[version]))
def _check_not_dupe(self, item_cls, item): """Checks whether a scrapy item is a dupe, based on version (not vary) fields of the item class""" if not item_cls.version_fields: return True version = create_item_version(item_cls, item) if version in self._itemversion_cache: old_url = self._itemversion_cache[version] self.log("Duplicate product scraped at <%s>, first one was scraped at <%s>" % (item["url"], old_url), log.WARNING) return False self._itemversion_cache[version] = item["url"] return True