def build(self):
    """
    Fetch feed data and return JavaScript code usable as an
    include to format the feed as HTML.
    """
    # Fetch and parse the feed
    cache = HTTPCache(self.feed_url)
    feed_data = feedparser.parse(cache.content())

    # Build a list of content strings by populating entry template
    entries_out = [
        self.ENTRY_TMPL % {
            'link'    : entry.get('link', ''),
            'title'   : entry.get('title', ''),
            'summary' : entry.get('summary', ''),
        }
        for entry in feed_data.entries
    ]

    # Build final content by populating the overall shell template
    out = self.INCLUDE_TMPL % {
        'feed.title'   : feed_data.feed.title,
        'feed.entries' : "\n".join(entries_out)
    }

    # Encode the content using the object unicode encoding
    out = out.encode(self.UNICODE_ENC)

    # Return the content wrapped in JavaScript code
    return self.js_format(out)

def main():
    """
    Given a feed URL as an argument, fetch and print the feed.
    """
    feed_uri = sys.argv[1]
    cache = HTTPCache(feed_uri)
    feed_content = cache.content()
    print feed_content

def parse(self, feed_uri):
    """Given a URI to a feed, fetch it and return parsed data."""
    cache = HTTPCache(feed_uri)
    feed_content = cache.content()

    self.reset()
    self.feed(feed_content)

    return {
        'version' : self._version,
        'feed'    : self._feed,
        'entries' : self._entries
    }

def fetch_items(self):
    """
    Grab items for the given wishlist.
    """
    # Construct the list of arguments for the AWS query
    args = {
        'Service'        : 'AWSECommerceService',
        'Operation'      : 'ListLookup',
        'ResponseGroup'  : 'Medium,ListFull,ItemAttributes',
        'Sort'           : 'LastUpdated',
        'ListType'       : 'WishList',
        'ListId'         : self.wishlist_id,
        'SubscriptionId' : self.aws_id,
    }

    # Build the URL for the API call using the base URL and params.
    url = "%s?%s" % (self.AWS_URL, urllib.urlencode(args))

    # Perform the query, fetch and parse the results.
    data = HTTPCache(url).content()
    doc = xmltramp.parse(data)

    # Update the feed link and title from search result metadata
    self.FEED_META['feed.link'] = doc.Lists.List.ListURL
    self.FEED_META['feed.title'] = \
        'Amazon.com wishlist items for "%s"' % \
        doc.Lists.List.CustomerName

    # Fetch first page of items.
    return [x.Item for x in doc.Lists.List if 'ListItem' in x._name]

def main():
    """
    Search for wishlists using command line arguments.
    """
    # Leaving out the program name, grab all space-separated arguments.
    name = " ".join(sys.argv[1:])

    # Construct the list of arguments for the AWS query
    args = {
        'Service'        : 'AWSECommerceService',
        'Operation'      : 'ListSearch',
        'ListType'       : 'WishList',
        'SubscriptionId' : AWS_ID,
        'Name'           : name
    }

    # Build the URL for the API call using the base URL and params.
    url = "%s?%s" % (AWS_URL, urllib.urlencode(args))

    # Perform the query, fetch and parse the results.
    data = HTTPCache(url).content()
    doc = xmltramp.parse(data)

    # Print out the list IDs found.
    lists = [x for x in doc.Lists if 'List' in x._name]
    for wishlist in lists:
        print '%15s: %s' % (wishlist.ListId, wishlist.CustomerName)

def technorati_search(self, query):
    """
    Given a query string, perform a Technorati search.
    """
    # Construct a Technorati search URL and fetch it.
    url = self.SEARCH_URL_TMPL % \
        (self.TECHNORATI_KEY, urllib.quote_plus(query))
    data = HTTPCache(url).content()

    # HACK: I get occasional encoding issues with Technorati, so
    # here's an ugly hack that seems to make things work anyway.
    try:
        return xmltramp.parse(data).document
    except SAXParseException:
        data = data.decode('ascii', 'ignore')
        return xmltramp.parse(data).document

def main():
    """
    Perform iCalendar to hCalendar rendering.
    """
    # Establish the calendar URL and output directory.
    ics_url  = len(sys.argv) > 1 and sys.argv[1] or ICS_URL
    html_dir = len(sys.argv) > 2 and sys.argv[2] or HTML_DIR

    # Get the calendar via URL and parse the data
    cal = Calendar.from_string(HTTPCache(ics_url).content())

    # Create HTML_DIR if it doesn't already exist
    if not os.path.exists(html_dir):
        os.makedirs(html_dir)

    # Process calendar components.
    for event in cal.walk():
        # Skip this calendar component if it's not an event.
        if type(event) is not Event:
            continue

        # Summarize the event data, make a hash, build a filename.
        hash_src = ','.join(['%s=%s' % x for x in event.items()])
        hash     = md5(hash_src).hexdigest()
        hcal_fn  = os.path.join(html_dir, '%s.html' % hash)

        # Build the hCalendar content and write out to file.
        hcal_out = HEVENT_TMPL % ICalTmplWrapper(event)
        open(hcal_fn, 'w').write(hcal_out)

def fetch(self):
    """Fetch the data for the feed, return whether feed has changed."""
    plugin_manager.dispatch("feed_fetch_start", subscription=self)

    # Prepare the URI and initial headers for the fetch.
    feed_uri = self.uri
    headers = {
        'User-Agent' : config.USER_AGENT,
        'Accept'     : config.ACCEPT_HEADER
    }

    # Stolen from feedparser: Handle inline user:password for basic auth
    auth = None
    urltype, rest = urllib.splittype(feed_uri)
    realhost, rest = urllib.splithost(rest)
    if realhost:
        user_passwd, realhost = urllib.splituser(realhost)
        if user_passwd:
            feed_uri = "%s://%s%s" % (urltype, realhost, rest)
            auth = base64.encodestring(user_passwd).strip()
            headers['Authorization'] = "Basic %s" % auth

    # Grab the feed data via HTTPCache
    cache = HTTPCache(feed_uri, headers)
    info = cache.info()
    content = cache.content()
    feed_hash = md5(content).hexdigest()

    # Copy over some HTTP headers as feed metadata
    if 'ETag' in info:
        self.meta.set('scan', 'http_etag', info['ETag'])
    if 'Last-Modified' in info:
        self.meta.set('scan', 'http_last_modified', info['Last-Modified'])

    # The feed has changed if no previous hash is on record, or if the new
    # hash differs from the stored one.  (An alternative would be to use
    # cache.fresh(), but the MD5 comparison is used here instead.)
    changed = (
        not self.meta.has_option('scan', 'last_feed_md5') or
        feed_hash != self.meta.get('scan', 'last_feed_md5')
    )

    if changed:
        # Update the feed hash, write the fetched feed.
        self.meta.set('scan', 'last_feed_md5', feed_hash)
        fout = open(self.feed_fn, 'w')
        fout.write(content)
        fout.close()

    plugin_manager.dispatch("feed_fetch_end", subscription=self,
                            changed=changed)
    return changed

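A minimal, standalone sketch of the MD5-based change detection used in fetch() above; the content_changed helper and the use of hashlib are illustrative assumptions, not part of the original Subscription class.

from hashlib import md5   # assumption: hashlib (Python 2.5+); older code may use the md5 module

def content_changed(content, last_hash):
    """
    Compare the MD5 hex digest of freshly fetched feed content against the
    previously stored digest; return (changed, new_hash).
    """
    new_hash = md5(content).hexdigest()
    changed = (last_hash is None) or (new_hash != last_hash)
    return changed, new_hash

# A first fetch (no stored hash) always counts as changed; identical content does not.
changed, h = content_changed("<rss/>", None)
print changed                            # True
print content_changed("<rss/>", h)[0]    # False
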
def main():
    tmpl = 'http://api.technorati.com/search?key=%s&limit=5&query=%s'
    key = open("technorati-key.txt", "r").read().strip()
    query = (len(sys.argv) > 1) and sys.argv[1] or 'test query'

    url = tmpl % (key, urllib.quote_plus(query))
    data = HTTPCache(url).content()

    # HACK: I get occasional encoding issues with Technorati, so
    # here's an ugly hack that seems to make things work anyway.
    try:
        doc = xmltramp.parse(data)
    except SAXParseException:
        data = data.decode('utf8', 'ignore').encode('utf8')
        doc = xmltramp.parse(data)

    items = [x for x in doc.document if x._name == 'item']
    for i in items:
        print '"%(title)s"\n\t%(permalink)s' % i

def amazon_search(self, query):
    """
    Given a query string, perform an Amazon search.
    """
    # Construct an Amazon search URL and fetch it.
    args = {
        'SubscriptionId' : self.AMAZON_KEY,
        'AssociateTag'   : self.ASSOCIATE_TAG,
        'Service'        : 'AWSECommerceService',
        'Operation'      : 'ItemSearch',
        'ResponseGroup'  : 'Medium,ItemAttributes',
        'SearchIndex'    : 'Books',
        'TextStream'     : query
    }
    url = "http://webservices.amazon.com/onca/xml?%s" % \
        urllib.urlencode(args)

    # Parse and return the results of the search
    data = HTTPCache(url).content()
    doc = xmltramp.parse(data)
    return doc

def produce_entries(self):
    """
    Use FeedNormalizer to get feed entries, then merge
    the lists together.
    """
    entries = []

    # Iterate and gather normalized entries for each feed.
    for feed_uri in self.feed_uris:

        # Grab and parse the feed
        feed_data = feedparser.parse(HTTPCache(feed_uri).content())

        # Append the list of normalized entries onto merged list.
        curr_entries = normalize_entries(feed_data.entries)
        for e in curr_entries:
            if self.INCLUDE_TITLE:
                e['title'] = "[" + feed_data.feed.title + "] " + \
                    e.data['title']
        entries.extend(curr_entries)

    return entries

def produce_entries(self):
    """
    Normalize the source feed, then insert Technorati-related
    links into each entry.
    """
    # Grab and parse the feed
    feed = feedparser.parse(HTTPCache(self.main_feed).content())

    # Normalize feed meta data
    self.FEED_META = normalize_feed_meta(feed, self.date_fmt)
    self.FEED_META['feed.title'] += ' (with related links)'

    # Normalize entries from the feed
    entries = normalize_entries(feed.entries)

    # Run through all the normalized entries...
    for e in entries:

        # Perform a search on the entry title, extract the items
        result = self.technorati_search(e['title'])
        items = [x for x in result if x._name == 'item']

        # Use each search result item to populate the templates.
        insert_items = [
            self.INSERT_ITEM_TMPL % {
                'weblog.name' : i.weblog.name,
                'weblog.url'  : i.weblog.url,
                'title'       : i.title,
                'permalink'   : i.permalink
            }
            for i in items
        ]
        insert_out = self.INSERT_TMPL % '\n'.join(insert_items)

        # Append the rendered search results onto the entry summary.
        e.data['summary'] += insert_out.decode('utf-8', 'ignore')

    return entries

def fetch_items(self):
    """
    Grab search result items for given index and keywords.
    """
    # Construct the list of arguments for the AWS query
    args = {
        'Service'        : 'AWSECommerceService',
        'Operation'      : 'ItemSearch',
        'ResponseGroup'  : 'Medium',
        'SearchIndex'    : self.index,
        'Keywords'       : self.keywords,
        'SubscriptionId' : self.aws_id,
    }

    # Build the URL for the API call using the base URL and params.
    url = "%s?%s" % (self.AWS_URL, urllib.urlencode(args))

    # Perform the query, fetch and parse the results.
    data = HTTPCache(url).content()
    doc = xmltramp.parse(data)

    # Fetch first page of items.
    items = [x for x in doc.Items if 'Item' in x._name]
    return items

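For reference, the query-string construction that fetch_items() relies on can be exercised with only the standard library; the argument values below are placeholders, and the base URL is borrowed from the amazon_search() listing rather than a real AWS_URL constant.

import urllib

# Base URL as used in the amazon_search() listing; SubscriptionId is a placeholder.
AWS_URL = 'http://webservices.amazon.com/onca/xml'
args = {
    'Service'        : 'AWSECommerceService',
    'Operation'      : 'ItemSearch',
    'ResponseGroup'  : 'Medium',
    'SearchIndex'    : 'Books',
    'Keywords'       : 'syndication feeds',
    'SubscriptionId' : 'YOUR-AWS-ID',
}

# Build and print the full query URL, mirroring fetch_items() above.
print "%s?%s" % (AWS_URL, urllib.urlencode(args))
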
def produce_entries(self):
    """
    Normalize the source feed, then insert related Amazon
    items into each entry.
    """
    # Grab and parse the feed
    feed = feedparser.parse(HTTPCache(self.main_feed).content())

    # Normalize feed meta data
    self.FEED_META = normalize_feed_meta(feed, self.date_fmt)
    self.FEED_META['feed.title'] += ' (with Amazon items)'

    # Normalize entries from the feed
    entries = normalize_entries(feed.entries)

    # Run through all the normalized entries...
    for e in entries:

        # Perform a search on the entry summary, extract the items
        result = self.amazon_search(e['summary'])
        items = [x for x in result.Items if 'Item' in x._name]

        # Use each search result item to populate the templates.
        insert_items = [
            self.INSERT_ITEM_TMPL % {
                'title' : i.ItemAttributes.Title,
                'url'   : i.DetailPageURL,
                'img'   : i.SmallImage.URL
            }
            for i in items[:self.MAX_ITEMS]
        ]
        insert_out = self.INSERT_TMPL % '\n'.join(insert_items)

        # Append the rendered search results onto the entry summary.
        e.data['summary'] += insert_out.decode('utf-8', 'ignore')

    return entries

#!/usr/bin/env python

import sys
from httpcache import HTTPCache

feed_uri = sys.argv[1]
cache = HTTPCache(feed_uri)
feed_content = cache.content()

print feed_content

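A small variation on the script above, sketched under the assumption that the same httpcache module is importable: it runs the fetched content through feedparser (as the later listings do) and prints the feed title followed by the entry titles.

#!/usr/bin/env python

import sys
import feedparser
from httpcache import HTTPCache

# Fetch the feed through HTTPCache, then parse the raw content with feedparser.
feed_uri = sys.argv[1]
feed_data = feedparser.parse(HTTPCache(feed_uri).content())

# Print the feed title, then one line per entry title.
print feed_data.feed.get('title', '(untitled feed)')
for entry in feed_data.entries:
    print '-', entry.get('title', '(untitled entry)')
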
def produce_entries(self):
    """
    Normalize the source feed, insert del.icio.us daily link recaps.
    """
    # Grab and parse the feed
    feed = feedparser.parse(HTTPCache(self.main_feed).content())

    # Normalize feed meta data
    self.FEED_META = normalize_feed_meta(feed, self.date_fmt)
    self.FEED_META['feed.title'] += ' (with del.icio.us links)'

    # Normalize entries from the feed
    entries = normalize_entries(feed.entries)

    # Iterate through a number of past days' links
    for n in range(self.NUM_DAYS):

        # Calculate and format date for this query
        post_secs = time.time() - ((n + 1) * 24 * 60 * 60)
        post_time = time.localtime(post_secs)
        post_dt = time.strftime('%Y-%m-%d', post_time)

        # Prepare for Basic Authentication in calling del API
        auth = urllib2.HTTPBasicAuthHandler()
        auth.add_password('del.icio.us API', 'del.icio.us',
                          self.DEL_USER, self.DEL_PASSWD)
        urllib2.install_opener(urllib2.build_opener(auth))

        # Build del API URL, execute the query, and parse response.
        url = self.DEL_API_URL % post_dt
        data = HTTPCache(url).content()
        doc = xmltramp.parse(data)

        # Skip this day if no posts resulted from the query
        if not len(doc) > 0:
            continue

        # Iterate through all posts retrieved, build content for entry.
        post_out = []
        for post in doc:

            # Run through post tags, render links with template.
            tags_out = [
                self.DEL_TAG_TMPL % {
                    'tag'  : t,
                    'href' : 'http://del.icio.us/%s/%s' % (self.DEL_USER, t)
                }
                for t in post("tag").split()
            ]

            # Build content for this link posting using template.
            try:
                extended = post('extended')
            except:
                extended = ''
            post_out.append(self.DEL_LINK_TMPL % {
                'href'        : post('href'),
                'description' : post('description'),
                'extended'    : extended,
                'tags'        : ''.join(tags_out)
            })

        # Construct and append a new feed entry based on the day's links
        new_entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
            'title'    : 'del.icio.us links on %s' % post_dt,
            'issued'   : post_secs,
            'modified' : post_secs,
            'link'     : 'http://del.icio.us/%s#%s' % \
                (self.DEL_USER, post_dt),
            'summary'  : self.DEL_ENTRY_TMPL % "\n".join(post_out)
        })
        entries.append(new_entry)

        # Pause, because http://del.icio.us/doc/api says so.
        time.sleep(1)

    # Return the list of entries built
    return entries

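The "N days ago" date handling in the listing above can be checked in isolation; the days_ago helper here is purely illustrative and not part of the original class.

import time

def days_ago(n):
    """Return (seconds-since-epoch, 'YYYY-MM-DD') for n days ago, local time."""
    secs = time.time() - (n * 24 * 60 * 60)
    return secs, time.strftime('%Y-%m-%d', time.localtime(secs))

# Example: the first loop iteration above uses n + 1 == 1, i.e. yesterday.
print days_ago(1)[1]
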
def parse_uri(self, uri):
    """Parse HTML content at a URI, return items."""
    return self.parse(HTTPCache(uri).content())