def get_desired_items_from_feeds(self, remember_end_item=False, case_sensitive=False):
    """Collect the matches for the wanted items across all feeds.

    Iterates over a ``SourceManager`` built from ``self.db`` and
    ``self.prefix`` and asks every feed it yields for its matches against
    a ``WantedItem`` built from the same two values.

    Args:
        remember_end_item: Passed through unchanged to each feed's ``get``.
        case_sensitive: Passed through unchanged to each feed's ``get``.

    Returns:
        ``False`` when ``self._check_prerequisities_for_run()`` is falsy;
        otherwise a list holding the concatenated results of every
        feed's ``get`` call, in iteration order.
    """
    if not self._check_prerequisities_for_run():
        return False

    # Construction order (feeds first, wanted items second) is kept as-is,
    # since both constructors receive the database handle.
    sources = SourceManager(self.db, self.prefix)
    wanted = WantedItem(self.db, self.prefix)

    collected = []
    for source in sources:
        logging.debug('On feed "%s"...', source.caption)
        matches = source.get(wanted, remember_end_item, case_sensitive)
        logging.info('Feed "%s" yielded %d matches.', source.caption, len(matches))
        collected.extend(matches)

    logging.info('Run is complete. Result consists of %d items.', len(collected))
    return collected
def load_and_parse(self, lastpos=None, load_from_cache=True, expiry=3600):
    """Return the entries of this feed, served from cache when possible.

    The cache slot is ``<prefix>/feed/<id>/lastparsed`` in ``self.db``.
    On a cache miss (or when the cache is bypassed) the entries are
    obtained from ``self.load_page(lastpos)``, pickled into the cache slot
    and given a time-to-live via ``self.db.expire``.

    Args:
        lastpos: Handed to ``self.load_page`` unchanged; not used for the
            cache lookup.
        load_from_cache: When false, the cache is not consulted, but the
            freshly loaded entries are still written to it.
        expiry: Value passed to ``self.db.expire`` for the cache slot
            (presumably seconds -- confirm against the db client in use).

    Returns:
        The unpickled cache content, or the value returned by
        ``self.load_page``.

    Raises:
        Exception: Anything raised while loading the page or writing the
            cache is logged and re-raised unchanged.
    """
    key = self.prefix + '/feed/' + self.id + '/lastparsed'

    if load_from_cache:
        rss = self.db.get(key)
        if rss:
            logging.info('Feed "%s" has been found in, and loaded from cache.', self.url)
            # NOTE(review): pickle.loads executes arbitrary code if the
            # cache content can be written by an untrusted party. Only
            # safe while the cache store is fully trusted.
            return pickle.loads(rss)

    try:
        logging.debug('Feed "%s" has not been cached, yet. Or its entry expired. It will be fetched...', self.url)
        entries = self.load_page(lastpos)
        logging.info('Feed "%s" has successfully been retrieved.', self.url)
        # The cache write stays inside the try block on purpose: a failing
        # write is reported and re-raised exactly like a failing fetch.
        self.db.set(key, pickle.dumps(entries))
        self.db.expire(key, expiry)
        return entries
    except Exception as e:
        # "as" form is valid on Python 2.6+ and Python 3, unlike the
        # Python-2-only "except Exception, e".
        logging.error('Fetching feed "%s" failed with this exception: %s', self.url, e)
        raise
def get_latest_items(self, remember_end_item=True):
    """Return the feed entries selected by ``self.sieve_feed_entries``.

    Operates under the assumption that the most recent item is on top
    (index 0) of the parsed feed.

    The last-position marker is read from ``self.db`` under
    ``<prefix>/feed/<id>/lastpos`` and handed to ``self.load_and_parse``.
    The loaded entries are then filtered with
    ``self.sieve_feed_entries(entries, lastpos)``, using the marker value
    read *before* any update made by this call.

    Args:
        remember_end_item: When true and the feed has at least one entry,
            the id of the topmost entry (``self.feed_entry_id(entries[0])``)
            is stored as the new marker if there was no marker or it
            differs from the stored one.

    Returns:
        The result of ``self.sieve_feed_entries``; an empty list when
        loading or parsing raised an exception.
    """
    lastpos_key = self.prefix + "/feed/" + self.id + "/lastpos"
    lastpos = self.db.get(lastpos_key)
    if not lastpos:
        logging.debug('Feed "%s" has no last position marker.', self.url)

    try:
        entries = self.load_and_parse(lastpos)
        logging.info('Loading and parsing of "%s" succeeded.', self.url)
    except Exception:
        # Deliberately best-effort: a broken feed yields no items. Only
        # Exception is caught so KeyboardInterrupt/SystemExit still propagate.
        logging.debug('Something went wrong at loading and parsing of "%s". Result: empty.', self.url)
        return []

    # A None result is treated like an empty feed so that the len() and
    # [0] accesses below cannot fail.
    if entries is None:
        entries = []

    # Only touch the marker when there is a topmost entry to take the id
    # from; an empty feed on the first run must not raise IndexError.
    if remember_end_item and len(entries) > 0:
        newest_id = self.feed_entry_id(entries[0])
        if not lastpos or lastpos != newest_id:
            self.db.set(lastpos_key, newest_id)

    logging.debug('"%s" has %d feed-entries in total.', self.url, len(entries))
    latest = self.sieve_feed_entries(entries, lastpos)
    logging.info('"%s" has %d new feed-entries.', self.url, len(latest))
    return latest