def get_feed(self, url, request_headers=None, **kwargs):

    if not self._check_auth_cookie():
        return

    session = None
    if self.provider and hasattr(self.provider, 'session'):
        session = self.provider.session

    response = helpers.getURL(url, headers=request_headers, session=session,
                              hooks=dict(response=self.cb_response), **kwargs)
    if not response:
        return

    try:
        feed = feedparser.parse(response)
        feed['rq_response'] = self.response
        if feed and 'entries' in feed:
            return feed

        if feed and 'error' in feed.feed:
            err_code = feed.feed['error']['code']
            err_desc = feed.feed['error']['description']
            logger.log(u'RSS ERROR:[%s] CODE:[%s]' % (err_desc, err_code), logger.DEBUG)
        else:
            logger.log(u'RSS error loading url: ' + url, logger.DEBUG)

    except Exception as e:
        logger.log(u'RSS error: ' + ex(e), logger.DEBUG)
def get_news(request, slug, first=0, nresults=3):
    inst = get_object_or_404(Institution, slug__exact=slug)

    if not inst.feed_url:
        raise Http404('There is no feed url for this institution.')

    feed = feedparser.parse(inst.feed_url)

    for news in feed.entries:
        news.updated = datetime.fromtimestamp(mktime(news.updated_parsed))
        news.main_link = news.links[0].href
        match = re.search(r'<\s*img[^>]+>', news.summary_detail.value)
        if match:
            news.main_image = match.group(0)
            news.summary_detail.value = re.sub(r'<\s*img[^>]+>', '',
                                               news.summary_detail.value)

    first = int(first)
    nresults = int(nresults)

    ctx = {
        'entries': feed.entries[first:first + nresults],
    }

    if (first + nresults) <= len(feed.entries):
        ctx['next_url'] = reverse('portal.institutions.views.get_news',
                                  args=[slug, int(first + nresults), int(nresults)])
    if (first - nresults) >= 0:
        ctx['prev_url'] = reverse('portal.institutions.views.get_news',
                                  args=[slug, int(first - nresults), int(nresults)])

    return direct_to_template(request, 'news/list.html', ctx)
def get_news(request, id, first=0, nresults=3):
    disc = get_object_or_404(Discipline, pk=id)

    if not disc.feed_url:
        raise Http404("There is no feed url for this course.")

    feed = feedparser.parse(disc.feed_url)

    for news in feed.entries:
        news.updated = datetime.fromtimestamp(mktime(news.updated_parsed))
        news.main_link = news.links[0].href
        match = re.search(r"<\s*img[^>]+>", news.summary_detail.value)
        if match:
            news.main_image = match.group(0)
            news.summary_detail.value = re.sub(r"<\s*img[^>]+>", "",
                                               news.summary_detail.value)

    first = int(first)
    nresults = int(nresults)

    ctx = {"entries": feed.entries[first:first + nresults]}

    if (first + nresults) <= len(feed.entries):
        ctx["next_url"] = reverse("portal.disciplines.views.get_news",
                                  args=[id, int(first + nresults), int(nresults)])
    if (first - nresults) >= 0:
        ctx["prev_url"] = reverse("portal.disciplines.views.get_news",
                                  args=[id, int(first - nresults), int(nresults)])

    return direct_to_template(request, "news/list.html", ctx)
def get_feed(self, url, request_headers=None):

    if not self._check_auth_cookie():
        return

    session = None
    if self.provider and hasattr(self.provider, 'session'):
        session = self.provider.session

    response = helpers.getURL(url, headers=request_headers, session=session)
    if not response:
        return

    try:
        feed = feedparser.parse(response)
        if feed and 'entries' in feed:
            return feed

        if feed and 'error' in feed.feed:
            err_code = feed.feed['error']['code']
            err_desc = feed.feed['error']['description']
            logger.log(u'RSS ERROR:[%s] CODE:[%s]' % (err_desc, err_code), logger.DEBUG)
        else:
            logger.log(u'RSS error loading url: ' + url, logger.DEBUG)

    except Exception as e:
        logger.log(u'RSS error: ' + ex(e), logger.DEBUG)
def _parse_and_write_entry(self, raw_rss):
    rss = feedparser.parse(raw_rss)
    for entry in rss.entries:
        content = entry.content[0]["value"]
        published = datetime.fromtimestamp(mktime(entry.updated_parsed))
        title = entry.title
        if title is None or title == "":
            title = entry.link
        em = EntryModel.get_or_insert_by_link(
            entry.link,
            link=entry.link,
            content=content,
            description=entry.description,
            published=published,
            title=title,
        )
        # update model
        em.link = entry.link
        em.content = content  # fixed: original assigned em.content to itself, a no-op
        em.description = entry.description
        em.published = published
        em.title = title
        em.put()
def rpc_fetch():
    q = Feed.query()
    results = ndb.get_multi(q.fetch(keys_only=True))
    rpcs = []
    for f in results:
        rpc = urlfetch.create_rpc()
        urlfetch.make_fetch_call(rpc, f.url)
        rpcs.append(rpc)
    for rpc in rpcs:
        rpc.wait()
        result = rpc.get_result()
        d = feedparser.parse(result.content)
        for e in d['entries']:
            dt = parser.parse(e["published"]).replace(tzinfo=None)
            dy = (datetime.datetime.utcnow() -
                  datetime.timedelta(days=COLLECT_DAYS,
                                     seconds=COLLECT_HOURS * 3600)).replace(tzinfo=None)
            if dt > dy:
                obj = EntryCollect.get_or_insert(e["id"])
                # skip entries we have already collected at this timestamp or later
                if not (obj.published and obj.published >= dt):
                    logging.info("new entry : %s" % e["id"])
                    obj.published = dt
                    obj.title = e["title"]
                    obj.link = e["link"]
                    obj.summary = clean_html(e["summary"])
                    obj.feed = d['feed']['title']
                    obj.need_collect_word = True
                    obj.need_notice = True
                    obj.put()
def extract_videourl_for_each_category(rss_category_links):
    category_page_video_url = {}
    for each_rss_path in rss_category_links:
        resp = feedparser.parse('http://www.pyvideo.org' + each_rss_path)
        page_urls, video_urls, page_video_urls = get_page_video_urls(resp)
        py_category = extract_category_from_rsslinks(each_rss_path)
        category_page_video_url[py_category] = [page_urls, video_urls]
    return category_page_video_url
def action(self):
    feed = feedparser.parse("http://s1.dlnws.com/dealnews/rss/editors-choice.xml")
    list_of_products_brands = shared.watch_list.splitlines()
    matches_from_feed = set()
    for entry in feed['entries']:
        for keyword in list_of_products_brands:
            if keyword in entry.title:
                matches_from_feed.add(entry.title)
    shared.deal_list = '\n'.join(matches_from_feed)
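# A hedged variant of the loop above (not from the original source): the
# substring test is case-sensitive, so a watch list containing "iphone" would
# miss "iPhone". A case-insensitive helper over feedparser entries might look
# like this.
def match_watch_list(entries, keywords):
    """Return entry titles containing any watched keyword, ignoring case."""
    matches = set()
    for entry in entries:
        if any(k.lower() in entry.title.lower() for k in keywords if k.strip()):
            matches.add(entry.title)
    return matches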
def handle_single_source(self, src):
    url = src.url
    try:
        data = urlfetch.fetch(url)
    except:
        logging.log(logging.WARN, "Failed to fetch url %s" % url)
        return
    feed = feedparser.parse(data.content)
    current_title = None
    try:
        current_title = src.title
    except:
        pass
    if hasattr(feed.feed, 'title'):
        if feed.feed.title != current_title:
            src.title = feed.feed.title
            ndb.put_multi([src])
    maxpublished = datetime.datetime.fromtimestamp(0)
    logging.log(logging.INFO, "#entries=%s" % len(feed.entries))
    for entry in feed.entries:
        try:
            entry.published_parsed = datetime.datetime(*entry.published_parsed[:6])
            if maxpublished is None:
                maxpublished = entry.published_parsed
            else:
                maxpublished = max(maxpublished, entry.published_parsed)
        except:
            entry.published_parsed = None
        if hasattr(entry, 'pkw_tags'):
            entry.pkw_tags = set(entry.pkw_tags.split(','))
        else:
            entry.pkw_tags = None
        if hasattr(entry, 'pkw_score'):
            entry.pkw_score = float(entry.pkw_score)
        else:
            entry.pkw_score = 1
    logging.log(logging.INFO, "#maxpublished=%r" % maxpublished)
    if maxpublished is None:
        logging.log(logging.WARN, "Could not get published date for feed %s" % url)
        return
    now = datetime.datetime.now()
    subscriptions = Subscription.query(Subscription.next_poll < now,
                                       Subscription.source == src.key)
    for subscription in subscriptions:
        logging.log(logging.DEBUG, "subscription=%r" % subscription)
        self.send_mail(subscription, feed, maxpublished)
def fetchSource(self, source):
    feedUrl = source.rssUrl
    feed = None
    try:
        feed = feedparser.parse(feedUrl)
        #logging.debug("loaded the source from " + feedUrl)
    except Exception as ex:
        logging.error("Error calling " + feedUrl)
        logging.error(ex)
        return None
    return self.parseFeed(feed, source.quality)
def refresh(self):
    for name, url in self.feedlist:
        try:
            if name not in self.feeds:
                # just download the feed
                self.feeds[name] = feedparser.parse(url)
            else:
                of = self.feeds[name]  # back up old feed
                # download new feed, using a conditional GET when possible
                if hasattr(of, 'etag'):
                    nf = feedparser.parse(url, etag=of.etag)
                elif hasattr(of, 'modified'):
                    nf = feedparser.parse(url, modified=of.modified)
                else:
                    nf = feedparser.parse(url)
                # if not all the same
                if nf.status != 304:
                    self.feeds[name] = nf
        except Exception as e:
            print 'feeds.py: Warning: unable to download feed "%s":' % name, e

    # generate new picture list
    pics_new = {}
    for name, feed in self.feeds.iteritems():
        pic_list = self.extract(feed)
        for p in pic_list:
            pics_new[p['url']] = p

    # delete obsolete pics
    for url in self.pics.keys():
        if url not in pics_new:
            self.delete_pic(url)

    # move new pics
    for url in pics_new.keys():
        if url not in self.pics:
            self.pics[url] = pics_new[url]

    self.save()
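# A minimal standalone sketch of the conditional-GET pattern used above,
# assuming a plain module-level dict instead of self.feeds (the cache name
# and function below are illustrative, not from the original source).
# feedparser exposes `etag` and `modified` on results fetched over HTTP and
# reports status 304 when the server says nothing has changed.
import feedparser

_cache = {}  # hypothetical cache: url -> last parsed result

def fetch_if_changed(url):
    old = _cache.get(url)
    if old is None:
        new = feedparser.parse(url)
    else:
        new = feedparser.parse(url,
                               etag=getattr(old, 'etag', None),
                               modified=getattr(old, 'modified', None))
    # `status` is only present for HTTP fetches; 304 means "use the cached copy"
    if getattr(new, 'status', None) == 304:
        return old
    _cache[url] = new
    return new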
def getRss(self):
    # get rss feed
    feed = feedparser.parse(self.rss)
    # guard against an empty or unreachable feed before indexing entries[0]
    if feed.entries:
        # if new feed not equal to last feed
        if self.lastFeedTitle != feed.entries[0].title:
            self.lastFeedTitle = feed.entries[0].title
            # compose message
            outgoingMessage = feed.entries[0].title + "\n" + feed.entries[0].link
            self.sendMessageToChat(outgoingMessage)
    t = Timer(10.0, self.getRss)
    t.start()
    return
def loadSport(sport_id):
    mc.LogDebug("Loading sport %s" % sport_id)
    mc.ShowDialogWait()
    label = mc.GetActiveWindow().GetLabel(9010)
    sport = None
    for SPORT in SPORTS:
        if SPORT['id'] == sport_id:
            sport = SPORT
            break
    else:
        mc.LogError("Sport %s not found." % sport_id)
        mc.ShowDialogOk("Error", "Sport %s not found." % sport_id)
        mc.CloseWindow()
        return  # fixed: without this, the code below dereferences sport=None
    label.SetLabel(sport['title'])
    gamesList = mc.GetActiveWindow().GetList(9000)
    gamesItems = mc.ListItems()
    content = feedparser.parse(urllib.urlopen(FEED % sport_id))
    for item in content['entries']:
        title = str(item.title)
        match = sport['regex_title'].match(title)
        if match:
            day_date = date(int(match.group('year')),
                            MONTHS.index(match.group('month').lower()),
                            int(match.group('day')))
            if day_date < date.today():
                continue
            for game in sport['regex_game'].finditer(str(item.content)):
                game_time = time.strptime('%s %s EST' % (day_date, game.group('est_time')),
                                          '%Y-%m-%d %I:%M %p %Z')
                # Skip games that occurred more than 5 hours ago
                # (fixed: the original compared a struct_time to a float,
                # which never evaluates True under Python 2)
                if time.mktime(game_time) + 5 * 60 * 60 < time.time():
                    continue
                name = '%s @ %s' % (game.group('team_away').strip(),
                                    game.group('team_home').strip())
                streams = []
                for stream in sport['regex_stream'].finditer(game.group('streams')):
                    mc.LogDebug('MYP2P::: "%s" - Stream "%s" (%s)'
                                % (name, stream.group('name'), stream.group('url')))
                    streams.append(stream.groupdict())
                stream_string = '\n'.join(['%s\t%s' % (stream['name'], stream['url'])
                                           for stream in streams])
                gameItem = mc.ListItem(mc.ListItem.MEDIA_UNKNOWN)
                gameItem.SetLabel(name)
                gameItem.SetProperty('streams', stream_string)
                gamesItems.append(gameItem)
    gamesList.SetItems(gamesItems)
    mc.HideDialogWait()
def parse_feed(self, key):
    ok = True
    status = 0
    feedcount = 0
    url = FeedInfo().geturl(key.id)
    rss = feedparser.parse(url)
    if self.verbose:
        print 'parse feed from %s' % url
    try:
        status = rss.status
    except:
        raise PluginConnectionError(
            'Connection Problem',
            'Plugin could not connect',
            'No data received. Abort directory listing..')
    if status not in range(200, 300):
        raise PluginHTTPError(url, status)
    else:
        if self.verbose:
            print 'HTTP response code for rss: %d' % status
    for feed in rss.entries:
        label = feed.title
        url = feed.links[1].href
        path = os.path.dirname(url)
        base = os.path.basename(url)
        image = self.IMG_TPL % (path, base, self.IMG_TAIL_TPL)
        li = self.entry(label, thumbnailImage=image)
        ok = self.add(self.hndl, url=url, listitem=li, isFolder=False)
        feedcount += 1
    if self.verbose:
        print 'added %d entries. success: %s' % (feedcount, repr(ok))
    if ok:
        # tell end of directory
        self.eod(self.hndl)
def get(self):
    for src in NotificationSource.query():
        logging.log(logging.INFO, "src=%s" % src.url)
        url = src.url
        try:
            data = urlfetch.fetch(url)
        except:
            logging.log(logging.WARN, "Failed to fetch url %s" % url)
            continue
        feed = feedparser.parse(data.content)
        if feed.feed.title != src.title:
            src.title = feed.feed.title
            ndb.put_multi([src])
        maxpublished = datetime.datetime.fromtimestamp(0)
        logging.log(logging.INFO, "#entries=%s" % len(feed.entries))
        for entry in feed.entries:
            try:
                entry.published_parsed = datetime.datetime(*entry.published_parsed[:6])
                if maxpublished is None:
                    maxpublished = entry.published_parsed
                else:
                    maxpublished = max(maxpublished, entry.published_parsed)
            except:
                entry.published_parsed = None
            if hasattr(entry, 'pkw_tags'):
                entry.pkw_tags = set(entry.pkw_tags.split(','))
            else:
                entry.pkw_tags = None
        logging.log(logging.INFO, "#maxpublished=%r" % maxpublished)
        if maxpublished is None:
            logging.log(logging.WARN, "Could not get published date for feed %s" % url)
            continue
        now = datetime.datetime.now()
        subscriptions = Subscription.query(Subscription.next_poll < now,
                                           Subscription.source == src.key)
        for subscription in subscriptions:
            logging.log(logging.DEBUG, "subscription=%r" % subscription)
            self.send_mail(subscription, feed, maxpublished)
def parseFeed(self, feed):
    # Grab the gmail atom feed
    mail = {}
    atom = feedparser.parse(feed)
    if atom.bozo != 0:
        if self.debug:
            print "[gmail-plasmoid] Error parsing feed:", atom.bozo_exception
        mail = None
    else:
        mail["fullcount"] = int(atom.feed.fullcount)
        mail["url"] = atom.feed.link
        mail["entries"] = []
        for i in xrange(len(atom.entries)):
            entry = {}
            if "title" in atom.entries[i]:
                entry["subject"] = atom.entries[i].title
            else:
                entry["subject"] = ""
            if "author_detail" in atom.entries[i]:
                if "name" in atom.entries[i].author_detail:
                    entry["authorname"] = atom.entries[i].author_detail.name
                else:
                    entry["authorname"] = ""
                if "email" in atom.entries[i].author_detail:
                    entry["authoremail"] = atom.entries[i].author_detail.email
                else:
                    entry["authoremail"] = ""
            else:
                entry["authorname"] = ""
                entry["authoremail"] = ""
            #if "link" in atom.entries[i]:
            #    entry["link"] = atom.entries[i].link
            #else:
            #    entry["link"] = ""
            mail["entries"].append(entry)
    return mail
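# A hedged usage sketch for the parser above (names assumed, not from the
# original plasmoid): feedparser accepts inline basic-auth URLs, so the
# unread-mail feed could be fetched and summarized along these lines:
#
#   atom_url = "https://USERNAME:PASSWORD@mail.google.com/mail/feed/atom"
#   mail = plasmoid.parseFeed(atom_url)      # `plasmoid` is hypothetical
#   if mail is not None:
#       print "%d unread message(s)" % mail["fullcount"]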
def _retrieveFeed(self):
    """do the actual work and try to retrieve the feed"""
    url = self.url
    if url != '':
        self._last_update_time_in_minutes = time.time() / 60
        self._last_update_time = DateTime()
        d = feedparser.parse(url)
        if getattr(d, 'bozo', 0) == 1 and not isinstance(
                d.get('bozo_exception'), ACCEPTED_FEEDPARSER_EXCEPTIONS):
            self._loaded = True  # we tried at least but have a failed load
            self._failed = True
            return False
        self._title = d.feed.title
        self._siteurl = d.feed.link
        self._items = []
        for item in d['items']:
            try:
                link = item.links[0]['href']
                itemdict = {
                    'title': item.title,
                    'url': link,
                    'summary': item.get('description', ''),
                }
                if hasattr(item, "updated"):
                    try:
                        itemdict['updated'] = DateTime(item.updated)
                    except DateTimeError:
                        # It's okay to drop it because in the
                        # template, this is checked with
                        # ``exists:``
                        pass
            except AttributeError:
                continue
            self._items.append(itemdict)
        self._loaded = True
        self._failed = False
        return True
    self._loaded = True
    self._failed = True  # no url set means failed
    return False  # no url set, although that actually should not really happen
def parse_feed(self, url, image):
    if self.verbose:
        print 'parse feed from %s' % url
    feedcount = 0
    # parse feed
    rss = feedparser.parse(url)
    for f in rss.entries:
        # iterate and add to directory
        link = f.enclosures[0].href
        size = f.enclosures[0].length
        label = f.title.encode('utf-8', 'replace')
        listitem = self.entry(label, thumbnailImage=image, path=link)
        # add to directory
        ok = self.add(self.hndl, url=link, listitem=listitem, isFolder=False)
        feedcount += 1
    if self.verbose:
        print 'added %d feed entries' % feedcount
        print 'success: %s' % repr(feedcount > 0)
    # Tell end of directory listing
    if feedcount > 0:
        self.eod(self.hndl, ok, False, False)
    else:
        # direct call instead of the original's needless exec of a string
        xbmcgui.Dialog().ok('No podcasts received from NRK.no', '')
def fetch(self, url, force_update=False, offline=False,
          request_headers=None, referrer=None):
    """Return the feed at url.

    url - The URL of the feed.

    force_update=False - When True, update the cache whether the current
                         contents have exceeded their time-to-live or not.

    offline=False - When True, only return data from the local cache and
                    never access the remote URL.

    request_headers=None - Add additional request headers to the request.

    referrer=None - Add a referrer to the request.

    If there is data for that feed in the cache already, check the
    expiration date before accessing the server.  If the cached data has
    not expired, return it without accessing the server.

    In cases where the server is accessed, check for updates before
    deciding what to return.  If the server reports a status of 304, the
    previously cached content is returned.

    The cache is only updated if the server returns a status of 200, to
    avoid holding redirected data in the cache.
    """
    logger.debug('url="%s"' % url)

    # Convert the URL to a value we can use
    # as a key for the storage backend.
    key = url
    if isinstance(key, unicode):
        key = key.encode('utf-8')

    modified = None
    etag = None
    now = time.time()

    cached_time, cached_content = self.storage.get(key, (None, None))

    # Offline mode support (no networked requests)
    # so return whatever we found in the storage.
    # If there is nothing in the storage, we'll be returning None.
    if offline:
        logger.debug('offline mode')
        return cached_content

    # Does the storage contain a version of the data
    # which is older than the time-to-live?
    logger.debug('cache modified time: %s' % str(cached_time))
    if cached_time is not None and not force_update:
        if self.time_to_live:
            age = now - cached_time
            if age <= self.time_to_live:
                logger.debug('cache contents still valid')
                return cached_content
            else:
                logger.debug('cache contents older than TTL')
        else:
            logger.debug('no TTL value')

        # The cache is out of date, but we have
        # something.  Try to use the etag and modified_time
        # values from the cached content.
        etag = cached_content.get('etag')
        modified = cached_content.get('modified')
        logger.debug('cached etag=%s' % etag)
        logger.debug('cached modified=%s' % str(modified))
    else:
        logger.debug('nothing in the cache, or forcing update')

    # We know we need to fetch, so go ahead and do it.
    logger.debug('fetching...')
    parsed_result = feedparser.parse(url,
                                     agent=self.user_agent,
                                     modified=modified,
                                     etag=etag,
                                     referrer=referrer,
                                     request_headers=request_headers)

    status = parsed_result.get('status', None)
    logger.debug('HTTP status=%s' % status)
    if status == 304:
        # No new data, based on the etag or modified values.
        # We need to update the modified time in the
        # storage, though, so we know that what we have
        # stored is up to date.
        self.storage[key] = (now, cached_content)

        # Return the data from the cache, since
        # the parsed data will be empty.
        parsed_result = cached_content
    elif status == 200:
        # There is new content, so store it unless there was an error.
        error = parsed_result.get('bozo_exception')
        if not error:
            logger.debug('Updating stored data for %s' % url)
            self.storage[key] = (now, parsed_result)
        else:
            logger.warning('Not storing data with exception: %s', error)
    else:
        logger.warning('Not updating cache with HTTP status %s', status)

    return parsed_result
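# A minimal usage sketch for the cache above, with hypothetical names: the
# method's self.* references imply an enclosing class constructed with a
# dict-like `storage` mapping key -> (timestamp, parsed_feed) plus
# `time_to_live` and `user_agent` attributes; `Cache` and the shelve path
# below are assumptions, not part of the original source.
#
#   import shelve
#   storage = shelve.open('/tmp/feedcache.db')  # hypothetical path
#   try:
#       cache = Cache(storage)                  # hypothetical constructor
#       feed = cache.fetch('http://example.com/feed.atom')
#       for entry in feed.entries:
#           print entry.title
#   finally:
#       storage.close()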
def load(self, data):
    parsed = feedparser.parse(data)
    Newsfeed.__init__(self, parsed, self._none)
__author__ = 'Tramel Jones'
"""
2014
RSS Reader created with the intention of viewing Valve Software blog posts
and other dev blogs with RSS.
"""
import feedparser  # fixed: `import feedparser.feedparser` only worked by side effect

d = feedparser.parse("http://blogs.valvesoftware.com/feed/")
for item in d.entries:
    print(item.link)
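# A hedged extension of the reader above (not part of the original script):
# feedparser flags malformed feeds via `bozo` while still returning any
# entries it could salvage, so it is worth checking before trusting the data.
import feedparser

d = feedparser.parse("http://blogs.valvesoftware.com/feed/")
if d.bozo:
    print("warning: feed not well-formed: %s" % d.bozo_exception)
for item in d.entries:
    # `title` and `link` are standard feedparser entry attributes
    print("%s  %s" % (item.get("title", "(untitled)"), item.link))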
def fetch(self, url, force_update=False, offline=False,
          request_headers=None, referrer=None, handlers=[]):
    """Return the feed at url.

    url - The URL of the feed.

    force_update=False - When True, update the cache whether the current
                         contents have exceeded their time-to-live or not.

    offline=False - When True, only return data from the local cache and
                    never access the remote URL.

    request_headers=None - Add additional request headers to the request.

    referrer=None - Add a referrer to the request.

    handlers=[] - Urllib2 handlers.

    If there is data for that feed in the cache already, check the
    expiration date before accessing the server.  If the cached data has
    not expired, return it without accessing the server.

    In cases where the server is accessed, check for updates before
    deciding what to return.  If the server reports a status of 304, the
    previously cached content is returned.

    The cache is only updated if the server returns a status of 200, to
    avoid holding redirected data in the cache.
    """
    logger.debug('url="%s"' % url)

    # Convert the URL to a value we can use
    # as a key for the storage backend.
    key = url
    if isinstance(key, unicode):
        key = key.encode('utf-8')

    modified = None
    etag = None
    now = time.time()

    cached_time, cached_content = self.storage.get(key, (None, None))

    # Offline mode support (no networked requests)
    # so return whatever we found in the storage.
    # If there is nothing in the storage, we'll be returning None.
    if offline:
        logger.debug('offline mode')
        return cached_content

    # Does the storage contain a version of the data
    # which is older than the time-to-live?
    logger.debug('cache modified time: %s' % str(cached_time))
    if cached_time is not None and not force_update:
        if self.time_to_live:
            age = now - cached_time
            if age <= self.time_to_live:
                logger.debug('cache contents still valid')
                return cached_content
            else:
                logger.debug('cache contents older than TTL')
        else:
            logger.debug('no TTL value')

        # The cache is out of date, but we have
        # something.  Try to use the etag and modified_time
        # values from the cached content.
        etag = cached_content.get('etag')
        modified = cached_content.get('modified')
        logger.debug('cached etag=%s' % etag)
        logger.debug('cached modified=%s' % str(modified))
    else:
        logger.debug('nothing in the cache, or forcing update')

    # We know we need to fetch, so go ahead and do it.
    logger.debug('fetching...')
    parsed_result = feedparser.parse(url,
                                     agent=self.user_agent,
                                     modified=modified,
                                     etag=etag,
                                     referrer=referrer,
                                     request_headers=request_headers,
                                     handlers=handlers)

    status = parsed_result.get('status', None)
    logger.debug('HTTP status=%s' % status)
    if status == 304:
        # No new data, based on the etag or modified values.
        # We need to update the modified time in the
        # storage, though, so we know that what we have
        # stored is up to date.
        self.storage[key] = (now, cached_content)

        # Return the data from the cache, since
        # the parsed data will be empty.
        parsed_result = cached_content
    elif status == 200:
        # There is new content, so store it unless there was an error.
        error = parsed_result.get('bozo_exception')
        if not error:
            logger.debug('Updating stored data for %s' % url)
            self.storage[key] = (now, parsed_result)
        else:
            logger.warning('Not storing data with exception: %s', error)
    else:
        logger.warning('Not updating cache with HTTP status %s', status)

    return parsed_result
def getArticleSource(article):
    return cleanUpText(article['summary_detail']['base'])


def cleanUpText(text):
    """
    Function handles encoding of the xml file.
    XML is usually passed in as UTF-8. We want to return an ASCII
    version for python.
    """
    return text.encode('ascii', 'ignore')


db = Database()
db.connect()

for feed in rssFeeds:
    rss = feedparser.parse(feed)  # RSS object
    numFound += len(rss['entries'])
    print("Found a total of " + str(numFound) + " articles from rss feed: " + str(feed))
    for entry in rss['entries']:
        outputDir = relativePath + buildArticlePath(entry)
        if not os.path.exists(outputDir):
            os.makedirs(outputDir)

        # Download the article
        link = entry['link']
        originalTitle = cleanUpText(entry['title'])
        formattedTitle = originalTitle.replace(' ', '_').replace('/', '-').replace("'", '')
        relFilePath = buildArticlePath(entry) + formattedTitle + ".html"