def _get_last_ts(self): """Returns a datetime object""" if os.path.exists(self.last_seen_filename): with open(self.last_seen_filename) as f: f.seek(0) try: return utils.convert_to_datetime(f.read().strip()) except ValueError: return None return None
def process_item(self, item, spider):
    """Tag *item* with its source spider and drop already-crawled items.

    Items dated at or before the spider's last crawl time are dropped;
    otherwise the newest item date seen so far is tracked in
    ``self.last_ts``.  Returns the (possibly annotated) item.
    """
    # Record which spider produced this item.
    item['source'] = spider.name
    item_date = utils.convert_to_datetime(item['date'])

    # Seed last_ts on the first item processed: prefer the spider's
    # recorded last launch time, falling back to this item's date when
    # the crawler is running for the very first time.
    if not self.last_ts:
        self.last_ts = spider.last_ts or item_date

    # First-ever launch: no previous crawl time exists, keep everything.
    if not spider.last_ts:
        return item

    # A previous crawl time exists: anything at or before it was
    # already seen, so discard it.
    if item_date <= spider.last_ts:
        raise exceptions.DropItem(
            "Item %s date is older than last crawled" % item)

    # Posts may be updated, so keep last_ts at the maximum date seen.
    self.last_ts = max(self.last_ts, item_date)
    return item