Ejemplo n.º 1
0
def _get_last_ts(self):
    """Returns a datetime object"""
    if os.path.exists(self.last_seen_filename):
        with open(self.last_seen_filename) as f:
            f.seek(0)
            try:
                return utils.convert_to_datetime(f.read().strip())
            except ValueError:
                return None
    return None
Ejemplo n.º 2
0
 def process_item(self, item, spider):
     """Tag *item* with its source and filter it against the last crawl time.

     ``item['source']`` is set to ``spider.name`` and ``item['date']`` is
     converted with ``utils.convert_to_datetime``. ``self.last_ts`` is kept
     at the newest item date seen so far.

     Args:
         item: mapping-like object with a ``'date'`` entry; it is mutated
             in place (``'source'`` is added).
         spider: object exposing ``name`` and ``last_ts``; a falsy
             ``last_ts`` is treated as "first launch".

     Returns:
         The same *item*, when it is kept.

     Raises:
         exceptions.DropItem: when ``spider.last_ts`` is set and the item's
             date is not later than it.
     """
     # inject source name
     item['source'] = spider.name
     item_date = utils.convert_to_datetime(item['date'])
     previous_crawl_ts = spider.last_ts

     if not previous_crawl_ts:
         # First launch -> every item is saved. Still track the newest date
         # seen (not merely the first item's date), so that items arriving
         # out of order do not leave self.last_ts older than items already
         # saved.
         if item_date is not None and (
                 not self.last_ts or self.last_ts < item_date):
             self.last_ts = item_date
         return item

     # Later launches: start from the previous crawl time.
     if not self.last_ts:
         self.last_ts = previous_crawl_ts
     # Anything not newer than the previous crawl is ignored.
     if item_date <= previous_crawl_ts:
         raise exceptions.DropItem(
             "Item %s date is older than last crawled" % item)
     # in case posts can be updated -> check that last ts is maximum
     if self.last_ts < item_date:
         self.last_ts = item_date
     return item