def build(self):
    """
    Fetch feed data and return JavaScript code usable as an
    include to format the feed as HTML.
    """
    # Fetch and parse the feed
    cache = HTTPCache(self.feed_url)
    feed_data = feedparser.parse(cache.content())

    # Build a list of content strings by populating entry template
    entries_out = [
        self.ENTRY_TMPL % {
            'link'    : entry.get('link', ''),
            'title'   : entry.get('title', ''),
            'summary' : entry.get('summary', ''),
        }
        for entry in feed_data.entries
    ]

    # Build final content by populating the overall shell template
    out = self.INCLUDE_TMPL % {
        'feed.title'   : feed_data.feed.title,
        'feed.entries' : "\n".join(entries_out)
    }

    # Encode the content using the object unicode encoding
    out = out.encode(self.UNICODE_ENC)

    # Return the content wrapped in JavaScript code
    return self.js_format(out)

def main():
    """
    Given a feed URL as an argument, fetch and print the feed.
    """
    feed_uri = sys.argv[1]
    cache = HTTPCache(feed_uri)
    feed_content = cache.content()
    print feed_content

def parse(self, feed_uri):
    """Given a URI to a feed, fetch it and return parsed data."""
    cache = HTTPCache(feed_uri)
    feed_content = cache.content()

    self.reset()
    self.feed(feed_content)

    return {
        'version' : self._version,
        'feed'    : self._feed,
        'entries' : self._entries
    }

def fetch_items(self):
    """
    Grab items for the given wishlist.
    """
    # Construct the list of arguments for the AWS query
    args = {
        'Service'        : 'AWSECommerceService',
        'Operation'      : 'ListLookup',
        'ResponseGroup'  : 'Medium,ListFull,ItemAttributes',
        'Sort'           : 'LastUpdated',
        'ListType'       : 'WishList',
        'ListId'         : self.wishlist_id,
        'SubscriptionId' : self.aws_id,
    }

    # Build the URL for the API call using the base URL and params.
    url = "%s?%s" % (self.AWS_URL, urllib.urlencode(args))

    # Perform the query, fetch and parse the results.
    data = HTTPCache(url).content()
    doc = xmltramp.parse(data)

    # Update the feed link and title from search result metadata
    self.FEED_META['feed.link'] = doc.Lists.List.ListURL
    self.FEED_META['feed.title'] = \
        'Amazon.com wishlist items for "%s"' % \
        doc.Lists.List.CustomerName

    # Fetch first page of items.
    return [x.Item for x in doc.Lists.List if 'ListItem' in x._name]

def main():
    """
    Search for wishlists using command line arguments.
    """
    # Leaving out the program name, grab all space-separated arguments.
    name = " ".join(sys.argv[1:])

    # Construct the list of arguments for the AWS query
    args = {
        'Service'        : 'AWSECommerceService',
        'Operation'      : 'ListSearch',
        'ListType'       : 'WishList',
        'SubscriptionId' : AWS_ID,
        'Name'           : name
    }

    # Build the URL for the API call using the base URL and params.
    url = "%s?%s" % (AWS_URL, urllib.urlencode(args))

    # Perform the query, fetch and parse the results.
    data = HTTPCache(url).content()
    doc = xmltramp.parse(data)

    # Print out the list IDs found.
    lists = [x for x in doc.Lists if 'List' in x._name]
    for wishlist in lists:
        print '%15s: %s' % (wishlist.ListId, wishlist.CustomerName)

def technorati_search(self, query):
    """
    Given a query string, perform a Technorati search.
    """
    # Construct a Technorati search URL and fetch it.
    url = self.SEARCH_URL_TMPL % \
        (self.TECHNORATI_KEY, urllib.quote_plus(query))
    data = HTTPCache(url).content()

    # HACK: I get occasional encoding issues with Technorati, so
    # here's an ugly hack that seems to make things work anyway.
    try:
        return xmltramp.parse(data).document
    except SAXParseException:
        data = data.decode('ascii', 'ignore')
        return xmltramp.parse(data).document

def main():
    """
    Perform iCalendar to hCalendar rendering.
    """
    # Establish the calendar URL and output directory.
    ics_url  = len(sys.argv) > 1 and sys.argv[1] or ICS_URL
    html_dir = len(sys.argv) > 2 and sys.argv[2] or HTML_DIR

    # Get the calendar via URL and parse the data
    cal = Calendar.from_string(HTTPCache(ics_url).content())

    # Create HTML_DIR if it doesn't already exist
    if not os.path.exists(html_dir):
        os.makedirs(html_dir)

    # Process calendar components.
    for event in cal.walk():
        # Skip this calendar component if it's not an event.
        if type(event) is not Event:
            continue

        # Summarize the event data, make a hash, build a filename.
        hash_src = ','.join(['%s=%s' % x for x in event.items()])
        hash     = md5(hash_src).hexdigest()
        hcal_fn  = os.path.join(html_dir, '%s.html' % hash)

        # Build the hCalendar content and write out to file.
        hcal_out = HEVENT_TMPL % ICalTmplWrapper(event)
        open(hcal_fn, 'w').write(hcal_out)

def fetch(self):
    """Fetch the data for the feed, return whether feed has changed."""
    plugin_manager.dispatch("feed_fetch_start", subscription=self)

    # Prepare the URI and initial headers for the fetch.
    feed_uri = self.uri
    headers = {
        'User-Agent' : config.USER_AGENT,
        'Accept'     : config.ACCEPT_HEADER
    }

    # Stolen from feedparser: Handle inline user:password for basic auth
    auth = None
    urltype, rest = urllib.splittype(feed_uri)
    realhost, rest = urllib.splithost(rest)
    if realhost:
        user_passwd, realhost = urllib.splituser(realhost)
        if user_passwd:
            feed_uri = "%s://%s%s" % (urltype, realhost, rest)
            auth = base64.encodestring(user_passwd).strip()
            headers['Authorization'] = "Basic %s" % auth

    # Grab the feed data via HTTPCache
    cache = HTTPCache(feed_uri, headers)
    info = cache.info()
    content = cache.content()
    feed_hash = md5(content).hexdigest()

    # Copy over some HTTP headers as feed metadata
    if 'ETag' in info:
        self.meta.set('scan', 'http_etag', info['ETag'])
    if 'Last-Modified' in info:
        self.meta.set('scan', 'http_last_modified', info['Last-Modified'])

    # The feed has changed if no previous hash is on record, or if the new
    # hash differs from the stored one.  (An alternative would be to use
    # cache.fresh(), but the MD5 comparison is used here instead.)
    changed = (
        not self.meta.has_option('scan', 'last_feed_md5') or
        feed_hash != self.meta.get('scan', 'last_feed_md5')
    )

    if changed:
        # Update the feed hash, write the fetched feed.
        self.meta.set('scan', 'last_feed_md5', feed_hash)
        fout = open(self.feed_fn, 'w')
        fout.write(content)
        fout.close()

    plugin_manager.dispatch("feed_fetch_end", subscription=self,
                            changed=changed)
    return changed

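A minimal, standalone sketch of the MD5-based change detection used in fetch() above; the content_changed helper and the use of hashlib are illustrative assumptions, not part of the original Subscription class.

from hashlib import md5   # assumption: hashlib (Python 2.5+); older code may use the md5 module

def content_changed(content, last_hash):
    """
    Compare the MD5 hex digest of freshly fetched feed content against the
    previously stored digest; return (changed, new_hash).
    """
    new_hash = md5(content).hexdigest()
    changed = (last_hash is None) or (new_hash != last_hash)
    return changed, new_hash

# A first fetch (no stored hash) always counts as changed; identical content does not.
changed, h = content_changed("<rss/>", None)
print changed                            # True
print content_changed("<rss/>", h)[0]    # False
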
def main():
    tmpl = 'http://api.technorati.com/search?key=%s&limit=5&query=%s'
    key = open("technorati-key.txt", "r").read().strip()
    query = (len(sys.argv) > 1) and sys.argv[1] or 'test query'

    url = tmpl % (key, urllib.quote_plus(query))
    data = HTTPCache(url).content()

    # HACK: I get occasional encoding issues with Technorati, so
    # here's an ugly hack that seems to make things work anyway.
    try:
        doc = xmltramp.parse(data)
    except SAXParseException:
        data = data.decode('utf8', 'ignore').encode('utf8')
        doc = xmltramp.parse(data)

    items = [x for x in doc.document if x._name == 'item']
    for i in items:
        print '"%(title)s"\n\t%(permalink)s' % i

def amazon_search(self, query):
    """
    Given a query string, perform an Amazon search.
    """
    # Construct an Amazon search URL and fetch it.
    args = {
        'SubscriptionId' : self.AMAZON_KEY,
        'AssociateTag'   : self.ASSOCIATE_TAG,
        'Service'        : 'AWSECommerceService',
        'Operation'      : 'ItemSearch',
        'ResponseGroup'  : 'Medium,ItemAttributes',
        'SearchIndex'    : 'Books',
        'TextStream'     : query
    }
    url = "http://webservices.amazon.com/onca/xml?%s" % \
        urllib.urlencode(args)

    # Parse and return the results of the search
    data = HTTPCache(url).content()
    doc = xmltramp.parse(data)
    return doc

def produce_entries(self):
    """
    Use FeedNormalizer to get feed entries, then merge
    the lists together.
    """
    entries = []

    # Iterate and gather normalized entries for each feed.
    for feed_uri in self.feed_uris:

        # Grab and parse the feed
        feed_data = feedparser.parse(HTTPCache(feed_uri).content())

        # Append the list of normalized entries onto merged list.
        curr_entries = normalize_entries(feed_data.entries)
        for e in curr_entries:
            if self.INCLUDE_TITLE:
                e['title'] = "[" + feed_data.feed.title + "] " + \
                    e.data['title']
        entries.extend(curr_entries)

    return entries

def produce_entries(self):
    """
    Normalize the source feed, then insert Technorati-related
    links into each entry.
    """
    # Grab and parse the feed
    feed = feedparser.parse(HTTPCache(self.main_feed).content())

    # Normalize feed meta data
    self.FEED_META = normalize_feed_meta(feed, self.date_fmt)
    self.FEED_META['feed.title'] += ' (with related links)'

    # Normalize entries from the feed
    entries = normalize_entries(feed.entries)

    # Run through all the normalized entries...
    for e in entries:

        # Perform a search on the entry title, extract the items
        result = self.technorati_search(e['title'])
        items = [x for x in result if x._name == 'item']

        # Use each search result item to populate the templates.
        insert_items = [
            self.INSERT_ITEM_TMPL % {
                'weblog.name' : i.weblog.name,
                'weblog.url'  : i.weblog.url,
                'title'       : i.title,
                'permalink'   : i.permalink
            }
            for i in items
        ]
        insert_out = self.INSERT_TMPL % '\n'.join(insert_items)

        # Append the rendered search results onto the entry summary.
        e.data['summary'] += insert_out.decode('utf-8', 'ignore')

    return entries

def fetch_items(self):
    """
    Grab search result items for given index and keywords.
    """
    # Construct the list of arguments for the AWS query
    args = {
        'Service'        : 'AWSECommerceService',
        'Operation'      : 'ItemSearch',
        'ResponseGroup'  : 'Medium',
        'SearchIndex'    : self.index,
        'Keywords'       : self.keywords,
        'SubscriptionId' : self.aws_id,
    }

    # Build the URL for the API call using the base URL and params.
    url = "%s?%s" % (self.AWS_URL, urllib.urlencode(args))

    # Perform the query, fetch and parse the results.
    data = HTTPCache(url).content()
    doc = xmltramp.parse(data)

    # Fetch first page of items.
    items = [x for x in doc.Items if 'Item' in x._name]
    return items

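For reference, the query-string construction that fetch_items() relies on can be exercised with only the standard library; the argument values below are placeholders, and the base URL is borrowed from the amazon_search() listing rather than a real AWS_URL constant.

import urllib

# Base URL as used in the amazon_search() listing; SubscriptionId is a placeholder.
AWS_URL = 'http://webservices.amazon.com/onca/xml'
args = {
    'Service'        : 'AWSECommerceService',
    'Operation'      : 'ItemSearch',
    'ResponseGroup'  : 'Medium',
    'SearchIndex'    : 'Books',
    'Keywords'       : 'syndication feeds',
    'SubscriptionId' : 'YOUR-AWS-ID',
}

# Build and print the full query URL, mirroring fetch_items() above.
print "%s?%s" % (AWS_URL, urllib.urlencode(args))
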
def produce_entries(self):
    """
    Normalize the source feed, then insert related Amazon
    items into each entry.
    """
    # Grab and parse the feed
    feed = feedparser.parse(HTTPCache(self.main_feed).content())

    # Normalize feed meta data
    self.FEED_META = normalize_feed_meta(feed, self.date_fmt)
    self.FEED_META['feed.title'] += ' (with Amazon items)'

    # Normalize entries from the feed
    entries = normalize_entries(feed.entries)

    # Run through all the normalized entries...
    for e in entries:

        # Perform a search on the entry summary, extract the items
        result = self.amazon_search(e['summary'])
        items = [x for x in result.Items if 'Item' in x._name]

        # Use each search result item to populate the templates.
        insert_items = [
            self.INSERT_ITEM_TMPL % {
                'title' : i.ItemAttributes.Title,
                'url'   : i.DetailPageURL,
                'img'   : i.SmallImage.URL
            }
            for i in items[:self.MAX_ITEMS]
        ]
        insert_out = self.INSERT_TMPL % '\n'.join(insert_items)

        # Append the rendered search results onto the entry summary.
        e.data['summary'] += insert_out.decode('utf-8', 'ignore')

    return entries

#!/usr/bin/env python

import sys
from httpcache import HTTPCache

feed_uri = sys.argv[1]
cache = HTTPCache(feed_uri)
feed_content = cache.content()

print feed_content

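A small variation on the script above, sketched under the assumption that the same httpcache module is importable: it runs the fetched content through feedparser (as the later listings do) and prints the feed title followed by the entry titles.

#!/usr/bin/env python

import sys
import feedparser
from httpcache import HTTPCache

# Fetch the feed through HTTPCache, then parse the raw content with feedparser.
feed_uri = sys.argv[1]
feed_data = feedparser.parse(HTTPCache(feed_uri).content())

# Print the feed title, then one line per entry title.
print feed_data.feed.get('title', '(untitled feed)')
for entry in feed_data.entries:
    print '-', entry.get('title', '(untitled entry)')
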
def produce_entries(self):
    """
    Normalize the source feed, insert del.icio.us daily link recaps.
    """
    # Grab and parse the feed
    feed = feedparser.parse(HTTPCache(self.main_feed).content())

    # Normalize feed meta data
    self.FEED_META = normalize_feed_meta(feed, self.date_fmt)
    self.FEED_META['feed.title'] += ' (with del.icio.us links)'

    # Normalize entries from the feed
    entries = normalize_entries(feed.entries)

    # Iterate through a number of past days' links
    for n in range(self.NUM_DAYS):

        # Calculate and format date for this query
        post_secs = time.time() - ((n + 1) * 24 * 60 * 60)
        post_time = time.localtime(post_secs)
        post_dt = time.strftime('%Y-%m-%d', post_time)

        # Prepare for Basic Authentication in calling del API
        auth = urllib2.HTTPBasicAuthHandler()
        auth.add_password('del.icio.us API', 'del.icio.us',
                          self.DEL_USER, self.DEL_PASSWD)
        urllib2.install_opener(urllib2.build_opener(auth))

        # Build del API URL, execute the query, and parse response.
        url = self.DEL_API_URL % post_dt
        data = HTTPCache(url).content()
        doc = xmltramp.parse(data)

        # Skip this day if no posts resulted from the query
        if not len(doc) > 0:
            continue

        # Iterate through all posts retrieved, build content for entry.
        post_out = []
        for post in doc:

            # Run through post tags, render links with template.
            tags_out = [
                self.DEL_TAG_TMPL % {
                    'tag'  : t,
                    'href' : 'http://del.icio.us/%s/%s' % (self.DEL_USER, t)
                }
                for t in post("tag").split()
            ]

            # Build content for this link posting using template.
            try:
                extended = post('extended')
            except:
                extended = ''
            post_out.append(self.DEL_LINK_TMPL % {
                'href'        : post('href'),
                'description' : post('description'),
                'extended'    : extended,
                'tags'        : ''.join(tags_out)
            })

        # Construct and append a new feed entry based on the day's links
        new_entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
            'title'    : 'del.icio.us links on %s' % post_dt,
            'issued'   : post_secs,
            'modified' : post_secs,
            'link'     : 'http://del.icio.us/%s#%s' % \
                (self.DEL_USER, post_dt),
            'summary'  : self.DEL_ENTRY_TMPL % "\n".join(post_out)
        })
        entries.append(new_entry)

        # Pause, because http://del.icio.us/doc/api says so.
        time.sleep(1)

    # Return the list of entries built
    return entries

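The "N days ago" date handling in the listing above can be checked in isolation; the days_ago helper here is purely illustrative and not part of the original class.

import time

def days_ago(n):
    """Return (seconds-since-epoch, 'YYYY-MM-DD') for n days ago, local time."""
    secs = time.time() - (n * 24 * 60 * 60)
    return secs, time.strftime('%Y-%m-%d', time.localtime(secs))

# Example: the first loop iteration above uses n + 1 == 1, i.e. yesterday.
print days_ago(1)[1]
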
def parse_uri(self, uri):
    """Parse HTML content at a URI, return items."""
    return self.parse(HTTPCache(uri).content())