Ejemplo n.º 1
0
 def process_item(self, item, spider):
     """Drop the item if an item with the same version was already seen.

     The version is computed by ``create_item_version`` from the item.
     Items that have no ``version_fields`` attribute, or whose
     ``version_fields`` is empty, are passed through without any
     duplicate check.

     Args:
         item: The scraped item. When it declares version fields it must
             support ``item["url"]``.
         spider: The spider that produced the item (not used here).

     Returns:
         The item, unchanged, when it is not a duplicate.

     Raises:
         DropItem: If the item's version is already present in
             ``self._itemversion_cache``.
     """
     # A missing and an empty ``version_fields`` are treated alike: there
     # is nothing to derive a version from, so no dupe check is possible.
     # NOTE(review): presumably an empty field list would give every such
     # item the same version -- confirm against create_item_version.
     if not getattr(item, 'version_fields', None):
         return item

     version = create_item_version(item)
     if version in self._itemversion_cache:
         old_url = self._itemversion_cache[version]
         raise DropItem("Duplicate product scraped at <%s>, first one was scraped at <%s>" % (item["url"], old_url))

     # First time this version is seen: remember where it came from.
     self._itemversion_cache[version] = item["url"]
     return item
Ejemplo n.º 2
0
 def process_item(self, item, spider):
     """Reject items whose version has already been recorded.

     An item lacking a ``version_fields`` attribute, or carrying a falsy
     one, is returned immediately. Otherwise its version is obtained
     from ``create_item_version`` and looked up in
     ``self._itemversion_cache``: a hit raises ``DropItem``, a miss
     stores the item's URL under that version and returns the item.
     """
     fields = getattr(item, 'version_fields', None)
     if not fields:
         return item

     fingerprint = create_item_version(item)
     seen = self._itemversion_cache
     if fingerprint not in seen:
         # Record the URL of the first item seen with this version.
         seen[fingerprint] = item["url"]
         return item

     first_url = seen[fingerprint]
     raise DropItem("Duplicate product scraped at <%s>, first one was scraped at <%s>" % (item["url"], first_url))
Ejemplo n.º 3
0
    def process_item(self, item, spider):
        """Reject items whose version has already been recorded.

        The item class is looked up in ``spider.itemcls_info`` under the
        item's ``'_type'`` key. If that class has no version fields the
        item is returned as is. Otherwise the version produced by
        ``create_item_version`` is checked against
        ``self._itemversion_cache``: a hit raises ``DropItem``, a miss
        stores the item's URL under that version and returns the item.
        """
        type_info = spider.itemcls_info[item['_type']]
        item_cls = type_info['class']

        if item_cls.version_fields:
            fingerprint = create_item_version(item_cls, item)
            seen = self._itemversion_cache
            if fingerprint in seen:
                first_url = seen[fingerprint]
                raise DropItem("Duplicate product scraped at <%s>, first one was scraped at <%s>" % (item["url"], first_url))
            # Record the URL of the first item seen with this version.
            seen[fingerprint] = item["url"]

        return item
Ejemplo n.º 4
0
 def _check_not_dupe(self, item_cls, item):
     """Tell whether ``item`` is new, judged by its version.

     Returns ``True`` when ``item_cls`` has no version fields, or when
     the version produced by ``create_item_version`` is not yet in
     ``self._itemversion_cache`` (the item's URL is then stored under
     that version). Returns ``False``, after logging a warning, when the
     version was already recorded.
     """
     if item_cls.version_fields:
         fingerprint = create_item_version(item_cls, item)
         seen = self._itemversion_cache
         if fingerprint in seen:
             first_url = seen[fingerprint]
             self.log(
                 "Duplicate product scraped at <%s>, first one was scraped at <%s>" % (item["url"], first_url),
                 log.WARNING,
             )
             return False
         # Record the URL of the first item seen with this version.
         seen[fingerprint] = item["url"]

     return True