def load_cache(path):
    """Load a cached feedparser result."""
    jsonpath = path.replace('dat', 'json')
    if not os.path.exists(jsonpath):
        return None
    with open(jsonpath) as f:
        data = json.loads(f.read())
    ret = feedparser.FeedParserDict()
    ret.update(data)
    if 'updated_parsed' in data['feed'] and data['feed']['updated_parsed']:
        try:
            data['feed']['updated_parsed'] = time.gmtime(
                data['feed']['updated_parsed'])
        except:
            pass
    ret.feed = feedparser.FeedParserDict(data.get('feed', {}))
    entries = []
    for e in data.get('entries', []):
        if 'updated_parsed' in e and e['updated_parsed']:
            try:
                e['updated_parsed'] = time.gmtime(e['updated_parsed'])
            except:
                pass
        entries.append(feedparser.FeedParserDict(e))
    ret.entries = entries
    return ret
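# A minimal sketch of the write side that load_cache() above expects: the
# cached result is stored as JSON next to the .dat file, with struct_time
# values flattened to epoch seconds so they survive serialization. The
# save_cache name and the calendar.timegm round-trip are assumptions for
# illustration, not part of the original module.
import calendar
import json


def save_cache(path, result):
    jsonpath = path.replace('dat', 'json')
    feed = dict(result.get('feed', {}))
    if feed.get('updated_parsed'):
        # struct_time is not JSON serializable; store epoch seconds instead
        feed['updated_parsed'] = calendar.timegm(feed['updated_parsed'])
    entries = []
    for e in result.get('entries', []):
        e = dict(e)
        if e.get('updated_parsed'):
            e['updated_parsed'] = calendar.timegm(e['updated_parsed'])
        entries.append(e)
    with open(jsonpath, 'w') as f:
        json.dump({'feed': feed, 'entries': entries}, f)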
def parse(document, clean_html=True, unix_timestamp=False, encoding=None):
    """Parse a document and return a feedparser dictionary with attr key access.

    If clean_html is False, the html in the feed will not be cleaned.  If
    clean_html is True, a sane version of lxml.html.clean.Cleaner will be used.
    If it is a Cleaner object, that cleaner will be used.  If unix_timestamp is
    True, the date information will be a numerical unix timestamp rather than a
    struct_time.  If encoding is provided, the encoding of the document will be
    manually set to that."""
    if isinstance(clean_html, bool):
        cleaner = default_cleaner if clean_html else fake_cleaner
    else:
        cleaner = clean_html
    result = feedparser.FeedParserDict()
    result['feed'] = feedparser.FeedParserDict()
    result['entries'] = []
    result['bozo'] = 0
    try:
        parser = SpeedParser(document, cleaner, unix_timestamp, encoding)
        parser.update(result)
    except Exception as e:
        if isinstance(e, UnicodeDecodeError) and encoding is True:
            encoding = chardet.detect(document)['encoding']
            document = document.decode(encoding, 'replace').encode('utf-8')
            return parse(document, clean_html, unix_timestamp, encoding)
        import traceback
        result['bozo'] = 1
        result['bozo_exception'] = e
        result['bozo_tb'] = traceback.format_exc()
    return result
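# A minimal usage sketch for the parse() helper above, assuming the
# speedparser-style module it lives in is importable; the file name
# 'feed.xml' and the variable names are illustrative only.
with open('feed.xml', 'rb') as f:
    raw = f.read()

result = parse(raw, clean_html=True, unix_timestamp=False)
if result['bozo']:
    # parsing failed; the exception and traceback are attached to the result
    print(result['bozo_exception'])
else:
    # FeedParserDict allows attribute-style access to the usual keys
    print(result.feed.get('title'), len(result.entries))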
def test_parse_ensure_proper_return_on_success(self):
    response = feedparser.FeedParserDict()
    response.status = 200
    response['feed'] = feedparser.FeedParserDict()
    response['feed']['title'] = 'Bola'
    f = create_dynamic_parse_func(response)
    scraper = Scraper(f, self.feed)
    self.assertEqual(response, scraper.parse(False))
def _start_newznab_attr(self, attrsD):
    context = self._getContext()
    context.setdefault('newznab', feedparser.FeedParserDict())
    context['newznab'].setdefault('tags', feedparser.FeedParserDict())
    name = attrsD.get('name')
    value = attrsD.get('value')
    if name == 'category':
        context['newznab'].setdefault('categories', []).append(value)
    else:
        context['newznab'][name] = value
def get_date_published(entry: feedparser.FeedParserDict) -> str:
    try:
        return entry['published']
    except Exception as exc:
        logger.exception(exc)
        return ""


def get_rss_link(entry: feedparser.FeedParserDict) -> str:
    try:
        return entry['link']
    except Exception as exc:
        logger.exception(exc)
        return ""


def get_summary(entry: feedparser.FeedParserDict) -> str:
    try:
        return entry['summary']
    except Exception as exc:
        logger.exception(exc)
        return ""
def get_images(self, folder_id, raw=False):
    """ get the contents of a folder """
    response = self._request("%s/%s/images" % (self.folders_url, folder_id),
                             headers=self.accept_header)[1]
    if raw:
        return response
    # the image list uses microformats which are not on the feedparser
    # whitelist, so we'll need some custom parsing
    sanitize_html = fp.SANITIZE_HTML
    fp.SANITIZE_HTML = 0
    parsed = fp.parse(response)
    BeautifulSoup = fp.BeautifulSoup.BeautifulSoup
    for image in parsed.entries:
        # TODO: think of a better way to automate this
        content = image.content[0].value
        image.content = fp.FeedParserDict()
        soup = BeautifulSoup(content)
        for prop in ['filename', 'imageurl', 'height', 'width', 'description']:
            image.content[prop] = getattr(soup, prop).text
    fp.SANITIZE_HTML = sanitize_html
    return parsed
def restore_sort_feedparserdict(feed_items):
    '''
    restore feed items to FeedParserDict - for some reason Django sessions
    converts them to Dict; sort by date

    :arguments: list of news item dicts
    :returns: dictionary of news item dicts
    '''
    # first restore feed_item list to a feedparser dict
    feed_items = [
        feedparser.FeedParserDict(feed_item) for feed_item in feed_items
    ]
    # sort the feed_items list on date
    feed_items_sorted = []
    for feed_item in feed_items:
        feed_items_sorted.append(
            (feed_item, feedparser_time_to_datetime(feed_item)))
    feed_items_sorted.sort(key=lambda k: k[1], reverse=True)
    # now make the new feedparser dict
    new_feed_items = {}
    for i, feed_item in enumerate(feed_items_sorted):
        new_feed_items[i] = feed_item[0]
    return new_feed_items
def bulk_import(parsed_feed):
    match = USERNAME_RE.search(parsed_feed.feed.link)
    username = match.group('name')
    if parsed_feed.feed.link in _cached_video_count:
        count = _cached_video_count[parsed_feed.feed.link]
    else:
        count = video_count(parsed_feed)
    post_url = _post_url(username, match.group('type') or 'videos', 'page=%i')

    parsed_feed = feedparser.FeedParserDict(parsed_feed.copy())
    parsed_feed.entries = []

    for i in range(1, int(math.ceil(count / 20.0)) + 1):
        response = open_url_while_lying_about_agent(post_url % i)
        if response.getcode() != 200:
            break
        data = response.read()
        if not data:
            break
        json_data = simplejson.loads(data)
        for video in json_data:
            parsed_feed.entries.append(
                feedparser_dict(_json_to_feedparser(video)))

    # clean up cache
    if parsed_feed.feed.link in _cached_video_count:
        del _cached_video_count[parsed_feed.feed.link]

    return parsed_feed
def generateFeeds(self, number, date=None, name='feed'):
    feeds = []
    for i in xrange(number):
        if date is None:
            date = time.gmtime()
        if date == self.NO_DATE:
            entries = [feedparser.FeedParserDict()]
        else:
            entries = [feedparser.FeedParserDict(updated_parsed=date)]
        feeds.append(
            feedparser.FeedParserDict(feed='%s-%i' % (name, i),
                                      encoding='UTF-8',
                                      entries=entries))
    return feeds
def test_parse_ensure_temporary_error_on_unknown_status(self):
    response = feedparser.FeedParserDict()
    response.status = 500
    f = create_dynamic_parse_func(response)
    scraper = Scraper(f, self.feed)
    with self.assertRaises(TemporaryFeedError):
        scraper.parse(False)
def httpThread(thread_index, input_queue, output_queue, log):
    import httplib2
    from httplib import BadStatusLine

    h = httplib2.Http(config.http_cache_directory())
    uri, feed_info = input_queue.get(block=True)
    while uri:
        log.info("Fetching %s via %d", uri, thread_index)
        feed = StringIO('')
        setattr(feed, 'url', uri)
        setattr(feed, 'headers', feedparser.FeedParserDict({'status': '500'}))
        try:
            # map IRI => URI
            try:
                if isinstance(uri, unicode):
                    idna = uri.encode('idna')
                else:
                    idna = uri.decode('utf-8').encode('idna')
                if idna != uri:
                    log.info("IRI %s mapped to %s", uri, idna)
            except:
                log.info("unable to map %s to a URI", uri)
                idna = uri

            # cache control headers
            headers = {}
            if feed_info.feed.has_key('planet_http_etag'):
                headers['If-None-Match'] = feed_info.feed['planet_http_etag']
            if feed_info.feed.has_key('planet_http_last_modified'):
                headers['If-Modified-Since'] = \
                    feed_info.feed['planet_http_last_modified']
            headers["user-agent"] = "Venus (+%s)" % config.link()

            # issue request
            (resp, content) = h.request(idna, 'GET', headers=headers)

            # unchanged detection
            resp['-content-hash'] = md5(content or '').hexdigest()
            if resp.status == 200:
                if resp.fromcache:
                    resp.status = 304
                elif feed_info.feed.has_key('planet_content_hash') and \
                        feed_info.feed['planet_content_hash'] == \
                        resp['-content-hash']:
                    resp.status = 304

            # build a file-like object
            feed = StringIO(content)
            setattr(feed, 'url', resp.get('content-location', uri))
            if resp.has_key('content-encoding'):
                del resp['content-encoding']
            setattr(feed, 'headers', resp)
        except BadStatusLine:
            log.error("Bad Status Line received for %s via %d", uri,
                      thread_index)
        except httplib2.HttpLib2Error, e:
            log.error("HttpLib2Error: %s via %d", str(e), thread_index)
        except socket.gaierror, e:
            log.error("socket.gaierror: %s - %s (thread %d)", uri, str(e[1]),
                      thread_index)
def test_parse_ensure_broken_feed_on_404(self):
    response = feedparser.FeedParserDict()
    response.status = 404
    f = create_dynamic_parse_func(response)
    scraper = Scraper(f, self.feed)
    with self.assertRaises(BrokenFeed):
        scraper.parse(False)
def test_issue_328_fallback_behavior(self):
    warnings.filterwarnings('error')

    d = feedparser.FeedParserDict()
    d['published'] = u'pub string'
    d['published_parsed'] = u'pub tuple'
    d['updated'] = u'upd string'
    d['updated_parsed'] = u'upd tuple'
    # Ensure that `updated` doesn't map to `published` when it exists
    self.assertTrue('published' in d)
    self.assertTrue('published_parsed' in d)
    self.assertTrue('updated' in d)
    self.assertTrue('updated_parsed' in d)
    self.assertEqual(d['published'], 'pub string')
    self.assertEqual(d['published_parsed'], 'pub tuple')
    self.assertEqual(d['updated'], 'upd string')
    self.assertEqual(d['updated_parsed'], 'upd tuple')

    d = feedparser.FeedParserDict()
    d['published'] = u'pub string'
    d['published_parsed'] = u'pub tuple'
    # Ensure that `updated` doesn't actually exist
    self.assertTrue('updated' not in d)
    self.assertTrue('updated_parsed' not in d)
    # Ensure that accessing `updated` throws a DeprecationWarning
    try:
        d['updated']
    except DeprecationWarning:
        # Expected behavior
        pass
    else:
        # Wrong behavior
        self.assertEqual(True, False)
    try:
        d['updated_parsed']
    except DeprecationWarning:
        # Expected behavior
        pass
    else:
        # Wrong behavior
        self.assertEqual(True, False)
    # Ensure that `updated` maps to `published`
    warnings.filterwarnings('ignore')
    self.assertEqual(d['updated'], u'pub string')
    self.assertEqual(d['updated_parsed'], u'pub tuple')
    warnings.resetwarnings()
def feedparser_dict(obj):
    if isinstance(obj, dict):
        return feedparser.FeedParserDict(
            dict([(key, feedparser_dict(value))
                  for (key, value) in obj.items()]))
    if isinstance(obj, (list, tuple)):
        return [feedparser_dict(member) for member in obj]
    return obj
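# A quick illustration of feedparser_dict() above: nested plain dicts (for
# example, decoded JSON) become FeedParserDict objects recursively, so values
# can be read with attribute access the way feedparser results usually are.
# The sample payload is made up.
import feedparser

video = {
    'title': 'Some clip',
    'media': {'url': 'http://example.com/clip.mp4', 'duration': 120},
    'tags': [{'term': 'demo'}],
}
entry = feedparser_dict(video)
print(entry.title)          # attribute access on the converted dict
print(entry.media.url)      # nested dicts are converted too
print(entry.tags[0].term)   # dicts inside lists as well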
def getRSSFeed(self, url, params=None):
    try:
        if self.provider.login():
            resp = WebSession().get(url, params=params).text
            return feedparser.parse(resp)
    except Exception as e:
        sickrage.app.log.debug("RSS Error: {}".format(e))

    return feedparser.FeedParserDict()
def get_datetime(fp: feedparser.FeedParserDict) -> str:
    try:
        # feedparser exposes `updated_parsed` as a time.struct_time, so the
        # fields are tm_year, tm_mon, etc.
        tm = fp['feed']['updated_parsed']
        return str(tm.tm_year) + left_pad(tm.tm_mon) + left_pad(tm.tm_mday) + \
            left_pad(tm.tm_hour) + left_pad(tm.tm_min)
    except Exception as exc:
        logger.exception(exc)
        tm = datetime.now()
        return str(tm.year) + left_pad(tm.month) + left_pad(tm.day) + \
            left_pad(tm.hour) + left_pad(tm.minute)
def test_01_load_entry(self):
    # TODO: standalone test ...
    import feedparser
    entry = feedparser.FeedParserDict()
    title = u' xxx'
    name = title.strip()
    content = [{'value': u'yyy', 'language': 'en'}]
    entry.title = title
    entry.content = content
    word = wordm.load_entry(entry)
    model.Session.remove()
    word = model.Word.by_name(name)
    assert word.notes == content[0]['value'], word.notes
def test_has_updated_past(self):
    self.feed.last_updated_at = make_aware(datetime.now())
    past = datetime.now() - timedelta(days=1)
    entry_list = [
        {'updated_parsed': past.timetuple()},
    ]
    feed_dict = feedparser.FeedParserDict()
    feed_dict['entries'] = entry_list
    f = create_dynamic_parse_func(feed_dict)
    scraper = Scraper(f, self.feed)
    self.assertFalse(scraper._has_updated(feed_dict, False))
def _feed_dict(d):
    if isinstance(d, dict):
        for key, value in d.iteritems():
            # parsed time tuple
            if key.endswith('_parsed'):
                utc_time = utc.from_string(value)
                d[key] = utc_time.timetuple()
        inst = feedparser.FeedParserDict(d)
    elif 'error' in d:
        inst = pickle.loads(d['error'])
    else:
        inst = d
    return inst
def __init__(self, root, namespaces={}, encoding='utf-8', type='rss20',
             cleaner=default_cleaner, unix_timestamp=False):
    """A port of SpeedParserFeed that uses far fewer xpath lookups, which
    ends up simplifying parsing and makes it easier to catch the various
    names that different tags might come under."""
    self.root = root
    self.unix_timestamp = unix_timestamp
    nslookup = reverse_namespace_map(namespaces)
    self.cleaner = cleaner
    self.baseurl = base_url(root)

    feed = feedparser.FeedParserDict()
    tag_map = self.tag_map
    channel = xpath(root, self.channel_xpath, namespaces)
    if len(channel) == 1:
        channel = channel[0]

    for child in channel:
        if isinstance(child, etree._Comment):
            continue
        ns, tag = clean_ns(child.tag)
        mapping = tag_map.get(tag, None)
        if mapping:
            getattr(self, 'parse_%s' % mapping)(child, feed,
                                                nslookup.get(ns, ns))
        if not ns:
            continue
        fulltag = '%s:%s' % (nslookup.get(ns, ''), tag)
        mapping = tag_map.get(fulltag, None)
        if mapping:
            getattr(self, 'parse_%s' % mapping)(child, feed, nslookup[ns])

    # this copies feedparser behavior if, say, xml:lang is defined in the
    # root feed element, even though this element tends to have garbage like
    # "utf-8" in it rather than an actual language
    if 'language' not in feed:
        for attr in root.attrib:
            if attr.endswith('lang'):
                feed['language'] = root.attrib[attr]

    if 'id' in feed and 'link' not in feed:
        feed['link'] = feed['id']

    self.feed = feed
def test_01_load_entry(self):
    import feedparser
    entry = feedparser.FeedParserDict()
    title = u'Introduction: Sonnet 18'
    name = title.strip()
    content = [{'value': u'yyy', 'language': 'en'}]
    entry.title = title
    entry.content = content
    loader = feed.WorkIntroductionLoader()
    work = loader.load_entry(entry)
    assert work.name == 'test_sonnet18', work
    model.Session.commit()
    model.Session.remove()
    work = model.Work.by_name(TestData.name)
    assert work.notes == content[0]['value'], work.notes
def test_builds_valid_message_for_post(self):
    search_term = 'some search'
    url = 'http://example.com/item1'
    feedUrl = 'http://example.com/feed'
    title = 'some title'
    content = 'some content'
    date_published = None
    author = 'some guy'
    entry = feedparser.FeedParserDict({'id': feedUrl})
    post = pshb.PostFactory.createPost(url, feedUrl, title, content,
                                       date_published, author, entry)

    message_builder = MessageBuilder()
    message = message_builder.build_message_from_post(post, search_term)
    expected = '[%s] matched post: [%s] with URL: [%s]' % (search_term, title,
                                                           url)
    self.assertEquals(expected, message)
def parse_entry(self, entry):
    """An attempt to parse pieces of an entry out w/o xpath, by looping over
    the entry root's children and slotting them into the right places.  This
    is going to be way messier than SpeedParserEntries, and maybe less
    cleanly usable, but it should be faster."""
    e = feedparser.FeedParserDict()
    tag_map = self.tag_map
    nslookup = self.nslookup
    for child in entry.getchildren():
        if isinstance(child, etree._Comment):
            continue
        ns, tag = clean_ns(child.tag)
        mapping = tag_map.get(tag, None)
        if mapping:
            getattr(self, 'parse_%s' % mapping)(child, e,
                                                nslookup.get(ns, ns))
        if not ns:
            continue
        fulltag = '%s:%s' % (nslookup.get(ns, ''), tag)
        mapping = tag_map.get(fulltag, None)
        if mapping:
            getattr(self, 'parse_%s' % mapping)(child, e, nslookup[ns])

    lacks_summary = 'summary' not in e or e['summary'] is None
    lacks_content = 'content' not in e or not bool(e.get('content', None))

    if not lacks_summary and lacks_content:
        e['content'] = [{'value': e.summary}]

    # feedparser sometimes copies the first content value into the
    # summary field when summary was completely missing; we want
    # to do that as well, but avoid the case where summary was given as ''
    if lacks_summary and not lacks_content:
        e['summary'] = e['content'][0]['value']

    if e.get('summary', False) is None:
        e['summary'] = u''

    # support feed entries that have a guid but no link
    if 'guid' in e and 'link' not in e:
        e['link'] = full_href(e['guid'], self.baseurl)

    return e
def extractTags(x):
    complete = ""
    dic = feedparser.FeedParserDict()
    lis = [1, 2]
    lis_type = type(lis)
    dic_type = type(dic)
    values = x.values()
    for value in values:
        value_type = type(value)
        if value_type == type(dic):
            values.extend(value.values())
        elif value_type == type(lis):
            values.extend(value[0].values())
        else:
            complete = complete + "\n , " + str(value)
    compounds = saveCompounds(str(complete))
    return compounds
class rssManager(object):
    """description of class"""

    rssNewsUrl = main.botConfig.GetRssNewsUrl()
    rssNewsCount = main.botConfig.GetRssNewsCount()
    rssNews = feedparser.FeedParserDict()

    def RssNewsReader(self):
        log.info('Rss News Reader')
        self.rssNews = feedparser.parse(self.rssNewsUrl)
        newsMessage = '<strong>' + self.rssNews.feed.title + '</strong>\n\n'
        for (i, entry) in enumerate(self.rssNews.entries):
            if i == self.rssNewsCount:
                break
            log.info(entry.title)
            newsMessage += '<a href="%s">%d. %s</a>\n' % (entry.link, i + 1,
                                                          entry.title)
        log.info('Rss News : %s', newsMessage)
        return newsMessage
def setUp(self):
    self.d = feedparser.FeedParserDict()
def spiderPlanet(only_if_new=False):
    """ Spider (fetch) an entire planet """
    log = planet.logger

    global index
    index = True

    timeout = config.feed_timeout()
    try:
        socket.setdefaulttimeout(float(timeout))
        log.info("Socket timeout set to %d seconds", timeout)
    except:
        try:
            import timeoutsocket
            timeoutsocket.setDefaultSocketTimeout(float(timeout))
            log.info("Socket timeout set to %d seconds", timeout)
        except:
            log.warning("Timeout set to invalid value '%s', skipping", timeout)

    from Queue import Queue
    from threading import Thread

    fetch_queue = Queue()
    parse_queue = Queue()

    threads = {}
    http_cache = config.http_cache_directory()
    # Should this be done in config?
    if http_cache and not os.path.exists(http_cache):
        os.makedirs(http_cache)

    if int(config.spider_threads()):
        # Start all the worker threads
        for i in range(int(config.spider_threads())):
            threads[i] = Thread(target=httpThread,
                                args=(i, fetch_queue, parse_queue, log))
            threads[i].start()
    else:
        log.info("Building work queue")

    # Load the fetch and parse work queues
    for uri in config.subscriptions():
        # read cached feed info
        sources = config.cache_sources_directory()
        feed_source = filename(sources, uri)
        feed_info = feedparser.parse(feed_source)

        if feed_info.feed and only_if_new:
            log.info("Feed %s already in cache", uri)
            continue
        if feed_info.feed.get('planet_http_status', None) == '410':
            log.info("Feed %s gone", uri)
            continue

        if threads and _is_http_uri(uri):
            fetch_queue.put(item=(uri, feed_info))
        else:
            parse_queue.put(item=(uri, feed_info, uri))

    # Mark the end of the fetch queue
    for thread in threads.keys():
        fetch_queue.put(item=(None, None))

    # Process the results as they arrive
    feeds_seen = {}
    while fetch_queue.qsize() or parse_queue.qsize() or threads:
        while parse_queue.qsize():
            (uri, feed_info, feed) = parse_queue.get(False)
            try:
                if not hasattr(feed, 'headers') or \
                        int(feed.headers.status) < 300:
                    options = {}
                    if hasattr(feed_info, 'feed'):
                        options['etag'] = \
                            feed_info.feed.get('planet_http_etag', None)
                        try:
                            modified = time.strptime(
                                feed_info.feed.get(
                                    'planet_http_last_modified', None))
                        except:
                            pass
                    data = feedparser.parse(feed, **options)
                else:
                    data = feedparser.FeedParserDict({
                        'version': None,
                        'headers': feed.headers,
                        'entries': [],
                        'feed': {},
                        'href': feed.url,
                        'bozo': 0,
                        'status': int(feed.headers.status)
                    })

                # duplicate feed?
                id = data.feed.get('id', None)
                if not id:
                    id = feed_info.feed.get('id', None)

                href = uri
                if data.has_key('href'):
                    href = data.href

                duplicate = None
                if id and id in feeds_seen:
                    duplicate = id
                elif href and href in feeds_seen:
                    duplicate = href

                if duplicate:
                    feed_info.feed['planet_message'] = \
                        'duplicate subscription: ' + feeds_seen[duplicate]
                    log.warn('Duplicate subscription: %s and %s' %
                             (uri, feeds_seen[duplicate]))

                if href:
                    feed_info.feed['planet_http_location'] = href
                if id:
                    feeds_seen[id] = uri
                if href:
                    feeds_seen[href] = uri

                # complete processing for the feed
                writeCache(uri, feed_info, data)
            except Exception, e:
                import sys, traceback
                type, value, tb = sys.exc_info()
                log.error('Error processing %s', uri)
                for line in (traceback.format_exception_only(type, value) +
                             traceback.format_tb(tb)):
                    log.error(line.rstrip())

        time.sleep(0.1)

        for index in threads.keys():
            if not threads[index].isAlive():
                del threads[index]
                if not threads:
                    log.info("Finished threaded part of processing.")
def writeCache(feed_uri, feed_info, data):
    log = planet.logger
    sources = config.cache_sources_directory()
    blacklist = config.cache_blacklist_directory()

    # capture http status
    if not data.has_key("status"):
        if data.has_key("entries") and len(data.entries) > 0:
            data.status = 200
        elif data.bozo and \
                data.bozo_exception.__class__.__name__.lower() == 'timeout':
            data.status = 408
        else:
            data.status = 500

    activity_horizon = \
        time.gmtime(time.time() - 86400 * config.activity_threshold(feed_uri))

    # process based on the HTTP status code
    if data.status == 200 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if data.has_key("entries") and len(data.entries) == 0:
            log.warning("No data %s", feed_uri)
            feed_info.feed['planet_message'] = 'no data'
        elif feed_uri == data.url:
            log.info("Updating feed %s", feed_uri)
        else:
            log.info("Updating feed %s @ %s", feed_uri, data.url)
    elif data.status == 301 and data.has_key("entries") and \
            len(data.entries) > 0:
        log.warning("Feed has moved from <%s> to <%s>", feed_uri, data.url)
        data.feed['planet_http_location'] = data.url
    elif data.status == 304 and data.has_key("url"):
        feed_info.feed['planet_http_location'] = data.url
        if feed_uri == data.url:
            log.info("Feed %s unchanged", feed_uri)
        else:
            log.info("Feed %s unchanged @ %s", feed_uri, data.url)

        if not feed_info.feed.has_key('planet_message'):
            if feed_info.feed.has_key('planet_updated'):
                updated = feed_info.feed.planet_updated
                if feedparser._parse_date_iso8601(updated) >= activity_horizon:
                    return
        else:
            if feed_info.feed.planet_message.startswith("no activity in"):
                return
            if not feed_info.feed.planet_message.startswith("duplicate") and \
                    not feed_info.feed.planet_message.startswith("no data"):
                del feed_info.feed['planet_message']
    elif data.status == 410:
        log.info("Feed %s gone", feed_uri)
    elif data.status == 408:
        log.warning("Feed %s timed out", feed_uri)
    elif data.status >= 400:
        log.error("Error %d while updating feed %s", data.status, feed_uri)
    else:
        log.info("Updating feed %s", feed_uri)

    # if read failed, retain cached information
    if not data.get('version') and feed_info.get('version'):
        data.feed = feed_info.feed
        data.bozo = feed_info.feed.get('planet_bozo', 'true') == 'true'
        data.version = feed_info.feed.get('planet_format')
    data.feed['planet_http_status'] = str(data.status)

    # capture etag and last-modified information
    if data.has_key('headers'):
        if data.has_key('etag') and data.etag:
            data.feed['planet_http_etag'] = data.etag
        elif data.headers.has_key('etag') and data.headers['etag']:
            data.feed['planet_http_etag'] = data.headers['etag']

        if data.headers.has_key('last-modified'):
            data.feed['planet_http_last_modified'] = \
                data.headers['last-modified']
        elif data.has_key('modified') and data.modified:
            data.feed['planet_http_last_modified'] = \
                time.asctime(data.modified)

        if data.headers.has_key('-content-hash'):
            data.feed['planet_content_hash'] = data.headers['-content-hash']

    # capture feed and data from the planet configuration file
    if data.get('version'):
        if not data.feed.has_key('links'):
            data.feed['links'] = list()
        feedtype = 'application/atom+xml'
        if data.version.startswith('rss'):
            feedtype = 'application/rss+xml'
        if data.version in ['rss090', 'rss10']:
            feedtype = 'application/rdf+xml'
        for link in data.feed.links:
            if link.rel == 'self':
                link['type'] = feedtype
                break
        else:
            data.feed.links.append(
                feedparser.FeedParserDict({
                    'rel': 'self',
                    'type': feedtype,
                    'href': feed_uri
                }))
    for name, value in config.feed_options(feed_uri).items():
        if name == "gravatar" and re.match(re_email, value):
            data.feed['planet_' + name] = md5(value.strip()).hexdigest()
        else:
            data.feed['planet_' + name] = value

    # perform user configured scrub operations on the data
    scrub.scrub(feed_uri, data)

    from planet import idindex
    global index
    if index != None:
        index = idindex.open()

    # select latest entry for each unique id
    ids = {}
    for entry in data.entries:
        # generate an id, if none is present
        if not entry.has_key('id') or not entry.id:
            entry['id'] = reconstitute.id(None, entry)
        elif hasattr(entry['id'], 'values'):
            entry['id'] = entry['id'].values()[0]
        if not entry['id']:
            continue

        # determine updated date for purposes of selection
        updated = ''
        if entry.has_key('published'):
            updated = entry.published
        if entry.has_key('updated'):
            updated = entry.updated

        # if not seen or newer than last seen, select it
        if updated >= ids.get(entry.id, ('',))[0]:
            ids[entry.id] = (updated, entry)

    # write each entry to the cache
    cache = config.cache_directory()
    for updated, entry in ids.values():
        # compute blacklist file name based on the id
        blacklist_file = filename(blacklist, entry.id)

        # check if blacklist file exists. If so, skip it.
        if os.path.exists(blacklist_file):
            continue

        # compute cache file name based on the id
        cache_file = filename(cache, entry.id)

        # get updated-date either from the entry or the cache (default to now)
        mtime = None
        if not entry.has_key('updated_parsed') or not entry['updated_parsed']:
            entry['updated_parsed'] = entry.get('published_parsed', None)
        if entry.has_key('updated_parsed'):
            try:
                mtime = calendar.timegm(entry.updated_parsed)
            except:
                pass
        if not mtime:
            try:
                mtime = os.stat(cache_file).st_mtime
            except:
                if data.feed.has_key('updated_parsed'):
                    try:
                        mtime = calendar.timegm(data.feed.updated_parsed)
                    except:
                        pass
        if not mtime:
            mtime = time.time()
        entry['updated_parsed'] = time.gmtime(mtime)

        # apply any filters
        xdoc = reconstitute.reconstitute(data, entry)
        output = xdoc.toxml().encode('utf-8')
        xdoc.unlink()
        for filter in config.filters(feed_uri):
            output = shell.run(filter, output, mode="filter")
            if not output:
                break
        if not output:
            if os.path.exists(cache_file):
                os.remove(cache_file)
            continue

        # write out and timestamp the results
        write(output, cache_file, mtime)

        # optionally index
        if index != None:
            feedid = data.feed.get('id', data.feed.get('link', None))
            if feedid:
                if type(feedid) == unicode:
                    feedid = feedid.encode('utf-8')
                index[filename('', entry.id)] = feedid

    if index:
        index.close()

    # identify inactive feeds
    if config.activity_threshold(feed_uri):
        updated = [
            entry.updated_parsed for entry in data.entries
            if entry.has_key('updated_parsed')
        ]
        updated.sort()

        if updated:
            data.feed['planet_updated'] = \
                time.strftime("%Y-%m-%dT%H:%M:%SZ", updated[-1])
        elif data.feed.has_key('planet_updated'):
            updated = [
                feedparser._parse_date_iso8601(data.feed.planet_updated)
            ]

        if not updated or updated[-1] < activity_horizon:
            msg = "no activity in %d days" % config.activity_threshold(
                feed_uri)
            log.info(msg)
            data.feed['planet_message'] = msg

    # report channel level errors
    if data.status == 226:
        if data.feed.has_key('planet_message'):
            del data.feed['planet_message']
        if feed_info.feed.has_key('planet_updated'):
            data.feed['planet_updated'] = feed_info.feed['planet_updated']
    elif data.status == 403:
        data.feed['planet_message'] = "403: forbidden"
    elif data.status == 404:
        data.feed['planet_message'] = "404: not found"
    elif data.status == 408:
        data.feed['planet_message'] = "408: request timeout"
    elif data.status == 410:
        data.feed['planet_message'] = "410: gone"
    elif data.status == 500:
        data.feed['planet_message'] = "internal server error"
    elif data.status >= 400:
        data.feed['planet_message'] = "http status %s" % data.status

    # write the feed info to the cache
    if not os.path.exists(sources):
        os.makedirs(sources)
    xdoc = minidom.parseString('''<feed xmlns:planet="%s"
      xmlns="http://www.w3.org/2005/Atom"/>\n''' % planet.xmlns)
    reconstitute.source(xdoc.documentElement, data.feed, data.bozo,
                        data.version)
    write(xdoc.toxml().encode('utf-8'), filename(sources, feed_uri))
    xdoc.unlink()