def main():
    """
    Test out LogBufferFeed by maintaining a random number feed.
    """
    # Construct the feed generator
    f = LogBufferFeed('random_feed')
    f.FEED_META['feed.title'] = 'Random Number of the Moment'
    f.FEED_META['feed.tagline'] = 'Serving your random number needs.'
    f.MAX_ENTRIES = 4
    f.MAX_AGE = 30  # 30 seconds for testing (use 10 * 60 for 10 minutes)

    # Construct and append a new entry
    import random
    num = random.random() * 1000
    entry = FeedEntryDict({
        'title'   : 'Random number %s' % num,
        'link'    : '',
        'summary' : 'Here is another random number for you: %s' % num
    })
    f.append_entry(entry)

    # Output the current feed entries
    if len(sys.argv) > 1 and sys.argv[1] == 'rss':
        print f.scrape_rss()
    else:
        print f.scrape_atom()
    def produce_entries(self):
        """
        Produce feed entries from Google web search results.
        """
        # Start off with an empty list for entries.
        entries = []

        # Execute the Google search
        data = google.doGoogleSearch(self.search_query,
                                     license_key=self.license_key)

        # Run through all fetched items, building entries
        for result in data.results:

            # Map the web search result data to feed entry properties
            entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
                'title'   : result.directoryTitle or '(untitled)',
                'link'    : result.URL,
                'summary' : result.snippet,
            })

            # Append completed entry to list
            entries.append(entry)

        return entries
    def produce_entries(self):
        """
        Produce feed entries from Yahoo! news search results.
        """
        # Start off with an empty list for entries.
        entries = []

        # Create a new Yahoo! API web search
        search = NewsSearch(self.app_id, query=self.search_query,
                            sort='date', results=50)
        
        # Run through all fetched items, building entries
        for result in search.parse_results():
            
            # Map the web search result data to feed entry properties
            entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
                'title'       : '[%s] %s' % \
                    (result['NewsSource'], result['Title']),
                'link'        : result['ClickUrl'],
                'summary'     : result['Summary'],
                'author.name' : result['NewsSource'],
                'author.link' : result['NewsSourceUrl'],
                'modified'    : int(result['ModificationDate']),
                'issued'      : int(result['PublishDate']),
            })

            # Append completed entry to list
            entries.append(entry)

        return entries
def main():
    """
    Report new errors found in Apache logs.
    """
    # Construct the feed generator
    f = LogBufferFeed(FEED_DIR)
    f.MAX_AGE = 24 * 60 * 60 # 1 day
    f.FEED_META['feed.title']   = '%s Apache Errors' % SITE_NAME
    f.FEED_META['feed.tagline'] = \
        'New errors from Apache on %s' % SITE_NAME
    
    # If there were new error lines found, insert a new entry.
    new_lines = bookmark_tailgrep(ERROR_LOG, 
                                  max_initial_lines=3000)

    if len(new_lines) > 0:
        # Construct and append a new entry
        esc_lines = [escape(x) for x in new_lines]
        entry = FeedEntryDict({
            'title'   : '%s new lines of errors' % len(new_lines),
            'link'    : '',
            'summary' : """
                <div style="font-family:monospace">
                    %s
                </div>
            """ % "<br />\n".join(esc_lines)
        })
        f.append_entry(entry)

    # Output the current feed entries as both RSS and Atom
    open(FEED_NAME_FN % 'rss', 'w').write(f.scrape_rss())
    open(FEED_NAME_FN % 'atom', 'w').write(f.scrape_atom())
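
# A rough scheduling sketch (the path and interval below are illustrative
# assumptions, not taken from the original source): running this script
# periodically, e.g. from cron, keeps the error feed current:
#
#   */15 * * * * python /path/to/apache_errors_feed.py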
def main():
    """
    Detect login activity changes and report in feed.
    """
    # Construct the feed generator
    f = LogBufferFeed(FEED_DIR)
    f.MAX_AGE = 24 * 60 * 60 # 1 day
    f.FEED_META['feed.title']   = '%s Login Activity' % SITE_NAME
    f.FEED_META['feed.tagline'] = \
        'Summary of login activity on the %s server' % SITE_NAME
   
    # Call the command and capture output
    (sout, sin) = popen4(COMMAND) 
    new_lines   = [ x for x in sout.readlines() 
                    if x.find('reboot') == -1 ]
    
    # Attempt to load output from the previous run.
    old_lines = None
    old_output_fn = os.path.join(FEED_DIR, 'old_output.gz')
    if os.path.exists(old_output_fn):
        old_lines = gzip.open(old_output_fn, "r").readlines()
    
    # If there is previous output, check for changes...
    if old_lines:
        
        # Run a diff on the previous and current program output.
        diff_lines = [ x for x in difflib.ndiff(old_lines, new_lines) ]

        # Extract only the lines that have changed.
        changes_lines = [ x for x in diff_lines 
                          if x.startswith('-') or x.startswith('+') ]
        
        # Construct and append a new entry if there were changes
        if len(changes_lines) > 0:
            esc_changes_lines = [escape(x) for x in changes_lines]
            esc_diff_lines = [escape(x) for x in diff_lines]
            entry = FeedEntryDict({
                'link'    : '',
                'title'   : TITLE_TMPL % { 
                    'changes' : len(changes_lines) 
                },
                'summary' : SUMMARY_TMPL % {
                    'changes_lines' : "<br />".join(esc_changes_lines),
                    'diff_lines'    : "<br />".join(esc_diff_lines)
                }
            })
            f.append_entry(entry)

    # Save output from the current run for use next time.
    gzip.open(old_output_fn, "w").write("".join(new_lines))

    # Output the current feed entries as both RSS and Atom
    open(FEED_NAME_FN % 'rss', 'w').write(f.scrape_rss())
    open(FEED_NAME_FN % 'atom', 'w').write(f.scrape_atom())
def normalize_entries(entries_in, full_content=True):
    """
    Return a list of normalized FeedEntryDict objects, given a
    list of entries from the feedparser.
    """
    entries = []

    # Process incoming feed entries.
    for entry_in in entries_in:

        # Create the empty new output feed entry.
        entry_out = FeedEntryDict()

        entry_out.orig = entry_in

        # Perform a straight copy of a few entry attributes.
        for n in ('id', 'title', 'link'):
            if entry_in.has_key(n):
                entry_out[n] = entry_in[n]

        # Convert feedparser time tuples to seconds and copy over.
        for n in ('modified', 'issued'):
            if entry_in.get('%s_parsed' % n, None):
                entry_out[n] = calendar.timegm(entry_in['%s_parsed' % n])

        # Decide whether to copy only summary or full content.
        if full_content and entry_in.has_key('content'):
            content_list = [
                x.value for x in entry_in.content if 'text' in x.type
            ]
            entry_out['summary'] = ''.join(content_list)
        elif entry_in.has_key('summary'):
            entry_out['summary'] = entry_in.summary

        # Append finished entry to list.
        entries.append(entry_out)

    # Return accumulated output feed entries.
    return entries
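
# A minimal usage sketch, assuming the feedparser module is installed and
# that normalize_entries is importable alongside FeedEntryDict (the URL is
# illustrative):
#
#   import feedparser
#   feed_data = feedparser.parse('http://www.example.com/atom.xml')
#   entries = normalize_entries(feed_data.entries, full_content=False)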
    def produce_entries(self):
        """Use xpaths to extract feed entries and entry attributes."""
        entries = []

        # Iterate through the parts identified as log entry nodes.
        for entry_node in self.svn_log().xpath('//logentry'):

            # Extract a few basic elements from the log entry
            revision = self.xpval(entry_node, './@revision')
            author = self.xpval(entry_node, './author/text()')
            msg = self.xpval(entry_node, './msg/text()')

            # Extract and parse the date for the log entry
            date_str = self.xpval(entry_node, './date/text()')
            date_tup = time.strptime(date_str[:19], '%Y-%m-%dT%H:%M:%S')
            entry_time = calendar.timegm(date_tup)

            # Extract and process the list of affected file paths
            paths_changed = []
            for path_node in entry_node.xpath('./paths/path'):
                action = self.xpval(path_node, './@action')
                path = self.xpval(path_node, './text()')
                paths_changed.append("%s %s" % (action, path))

            entry_id = 'tag:%s%s' % (self.TAG_PREFIX, revision)

            # Build the feed entry based on log entry information
            entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
                'id'       : entry_id,
                'title'    : 'Revision %s by %s' % (revision, author),
                'link'     : self.url,
                'issued'   : entry_time,
                'modified' : entry_time,
                'summary'  : "<pre>%s\n\nFiles affected:\n%s</pre>" % \
                    (msg, '\n'.join(paths_changed))
            })
            entries.append(entry)

        return entries
    def produce_entries(self):
        """
        Produce feed entries from Amazon product item data.
        """
        entries = []

        all_items = self.fetch_items()

        # Run through all fetched items, building entries
        for item in all_items:

            # Wrap the item in a template-friendly object
            tmpl_item = TrampTmplWrapper(item)

            # Build an empty entry object
            entry = FeedEntryDict(date_fmt=self.date_fmt)

            # Generate an ID for this entry based on tracked data
            m = md5.md5()
            for k in self.ITEM_TRACK:
                m.update(tmpl_item[k])
            entry['id'] = state_id = "tag:%s,%s:%s" % \
                (self.TAG_DOMAIN, self.TAG_DATE, m.hexdigest())

            # Use the item detail URL for entry link
            entry['link'] = tmpl_item['DetailPageURL']

            # Use the author, artist, or actor name for item
            # and entry author
            authors = []
            for k in ('Author', 'Artist', 'Actor'):
                v = tmpl_item['ItemAttributes.%s' % k]
                if v: authors.append(v)
            entry['author.name'] = ", ".join(authors)

            # Build entry title and summary from string templates
            entry['title'] = self.TITLE_TMPL % tmpl_item
            entry['summary'] = self.SUMMARY_TMPL % tmpl_item

            # Append completed entry to list
            entries.append(entry)

        return entries
    def produce_entries(self):
        """
        Build feed entries based on queried CVS history events.
        """
        events = self.client.history()

        entries = []
        for event in events[:self.MAX_ENTRIES]:
            # Build a GUID for this entry
            cvs_id = '%(path)s:%(revision)s' % event
            entry_id = 'tag:%s%s' % (self.TAG_PREFIX, quote(cvs_id))

            # Attempt to grab an existing state record for this entry ID.
            if not self.state_db.has_key(entry_id):
                self.state_db[entry_id] = {}
            entry_state = self.state_db[entry_id]

            # If this entry's state doesn't already have a description
            # cached, query CVS for the log entry and grab it.
            if not entry_state.has_key('description'):
                log_entry = self.client.rlog(event.revision, event.path)
                entry_state['description'] = log_entry.description
            description = entry_state['description']

            # Build the feed entry based on the CVS event and log entry
            entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
                'id'          : entry_id,
                'title'       : self.TITLE_TMPL % event,
                'link'        : self.LINK_TMPL % event,
                'author.name' : event.user,
                'modified'    : event.time,
                'issued'      : event.time,
                'summary'     : '<pre>%s</pre>' % description
            })

            # Append the completed entry to the list, and save the
            # entry state.
            entries.append(entry)
            self.state_db[entry_id] = entry_state

        return entries
    def entries_from_messages(self, msgs):
        """
        Given a list of email.Message, attempt to build a list 
        of FeedEntryDict objects
        """
        entries = []

        for msg in msgs:

            entry = FeedEntryDict(date_fmt=self.date_fmt)

            # Set the 'dummy' link for the entry from feed.link
            entry['link'] = self.FEED_META['feed.link']

            # Use message Subject for entry title.
            entry['title'] = msg.get('Subject', '(Untitled)')

            # Use From header for entry author email.
            entry['author.name'] = msg['From']

            # Convert message Date into seconds, use for modified
            # and issued
            msg_time_raw = email.Utils.parsedate(msg['Date'])
            msg_time = time.mktime(msg_time_raw)
            entry.data['modified'] = entry.data['issued'] = msg_time

            # Get a GUID for this entry.
            entry['id'] = self.build_guid_for_message(msg, entry)

            # Summarize the email for the entry.
            entry['summary'] = self.extract_summary_from_message(msg)

            # Stuff the new entry into the running list.
            entries.append(entry)

        # Finally, return what was accumulated
        return entries
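
    # A rough usage sketch (the standard-library mailbox module and the mbox
    # path here are illustrative assumptions):
    #
    #   import mailbox
    #   msgs = list(mailbox.mbox('/var/mail/example'))
    #   entries = self.entries_from_messages(msgs)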
    def produce_entries(self):
        """
        Produce feed entries from Yahoo! web search results.
        """
        # Start off with an empty list for entries.
        entries = []

        # Create a new Yahoo! API web search
        search = WebSearch(self.app_id, query=self.search_query, results=50)

        # Execute the query and gather results.
        results = [r for r in search.parse_results()]

        # Sort the results in reverse-chronological order by
        # modification date
        results.sort(lambda a,b: \
            cmp(b['ModificationDate'], a['ModificationDate']))

        # Run through all fetched items, building entries
        for result in results:

            # Map the web search result data to feed entry properties
            entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
                'title'    : result['Title'],
                'link'     : result['ClickUrl'],
                'summary'  : result['Summary'],
                'modified' : int(result['ModificationDate']),
                'issued'   : int(result['ModificationDate']),
            })

            # Append completed entry to list
            entries.append(entry)

        return entries
    def produce_entries(self):
        """
        Normalize the source feed, insert del.icio.us daily link recaps.
        """
        # Grab and parse the feed
        feed = feedparser.parse(HTTPCache(self.main_feed).content())

        # Normalize feed meta data
        self.FEED_META = normalize_feed_meta(feed, self.date_fmt)
        self.FEED_META['feed.title'] += ' (with del.icio.us links)'

        # Normalize entries from the feed
        entries = normalize_entries(feed.entries)

        # Iterate through a number of past days' links
        for n in range(self.NUM_DAYS):

            # Calculate and format date for this query
            post_secs = time.time() - ((n + 1) * 24 * 60 * 60)
            post_time = time.localtime(post_secs)
            post_dt = time.strftime('%Y-%m-%d', post_time)

            # Prepare for Basic Authentication in calling del API
            auth = urllib2.HTTPBasicAuthHandler()
            auth.add_password('del.icio.us API', 'del.icio.us', self.DEL_USER,
                              self.DEL_PASSWD)
            urllib2.install_opener(urllib2.build_opener(auth))

            # Build del API URL, execute the query, and parse response.
            url = self.DEL_API_URL % post_dt
            data = HTTPCache(url).content()
            doc = xmltramp.parse(data)

            # Skip this day if no posts resulted from the query
            if not len(doc) > 0: continue

            # Iterate through all posts retrieved, build content for entry.
            post_out = []
            for post in doc:

                # Run through post tags, render links with template.
                tags_out = [
                    self.DEL_TAG_TMPL % {
                        'tag': t,
                        'href': 'http://del.icio.us/%s/%s' % (self.DEL_USER, t)
                    } for t in post("tag").split()
                ]

                # Build content for this link posting using template.
                try:
                    extended = post('extended')
                except:
                    extended = ''

                post_out.append(
                    self.DEL_LINK_TMPL % {
                        'href': post('href'),
                        'description': post('description'),
                        'extended': extended,
                        'tags': ''.join(tags_out)
                    })

            # Construct and append a new feed entry based on the day's links
            new_entry = FeedEntryDict(date_fmt=self.date_fmt, init_dict={
                'title'    : 'del.icio.us links on %s' % post_dt,
                'issued'   : post_secs,
                'modified' : post_secs,
                'link'     : 'http://del.icio.us/%s#%s' % \
                             (self.DEL_USER, post_dt),
                'summary'  : self.DEL_ENTRY_TMPL % "\n".join(post_out)
            })
            entries.append(new_entry)

            # Pause, because http://del.icio.us/doc/api says so.
            time.sleep(1)

        # Return the list of entries built
        return entries
def main():
    """
    Scan all feeds and update the feed with a new link popularity
    report entry.
    """
    # Construct the feed generator.
    f = LogBufferFeed(FEED_DIR)
    f.MAX_AGE = 1 * 24 * 60 * 60  # 1 day
    f.FEED_META['feed.title'] = FEED_TITLE
    f.FEED_META['feed.tagline'] = FEED_TAGLINE

    # Load up the list of feeds.
    feed_uris = [x.strip() for x in open(FEEDS_FN, 'r').readlines()]

    # Skim for links from each feed, collect feed and entries in an
    # inverted index using link URLs as top-level keys.
    links = {}
    for feed_uri in feed_uris:
        feed_data = feedparser.parse(feed_uri)

        # Grab the feed metadata from parsed feed.
        feed = feed_data.feed
        feed_link = feed.get('link', '#')

        # Process all entries for their links...
        for curr_entry in feed_data.entries:

            # HACK: Ignore entries without modification dates.
            # Maybe improve this by stashing seen dates in a DB.
            if curr_entry.get('modified_parsed', None) is None:
                continue

            # If the current entry is older than the max allowed age,
            # skip processing it.
            now = time.time()
            entry_time = calendar.timegm(curr_entry.modified_parsed)
            if (now - entry_time) > MAX_ENTRY_AGE:
                continue

            # Build a LinkSkimmer and feed it all summary and HTML
            # content data from the current entry.  Ignore parse
            # errors in the interest of just grabbing what we can.
            skimmer = LinkSkimmer()
            try:
                skimmer.feed(curr_entry.get('summary', ''))
                for c in curr_entry.get('content', []):
                    skimmer.feed(c.value)
            except HTMLParseError:
                pass

            # Process each link by adding the current feed and entry
            # under the link's key in the inverted index.
            for uri, cnt in skimmer.get_links():
                if not links.has_key(uri):
                    links[uri] = {}
                if not links[uri].has_key(feed_link):
                    links[uri][feed_link] = (feed, curr_entry)

    # Turn the inverted index of links into a list of tuples, sort by
    # popularity of links as measured by number of linked entries.
    links_sorted = links.items()
    links_sorted.sort(lambda a, b: cmp(len(b[1].keys()), len(a[1].keys())))

    # Build the overall entry content from all the links.
    links_out = []
    for x in links_sorted:

        # Get the link and the list of linkers, skip this link if there
        # aren't enough linkers counted.
        link, linkers = x
        if len(linkers) < MIN_LINKS: continue

        # Build the list of linkers for this link by populating the
        # LINKER_TMPL string template.
        linkers_out = []
        for feed, entry in linkers.values():
            linkers_out.append(
                LINKER_TMPL % {
                    'feed.title': feed.get('title', 'untitled'),
                    'feed.link': feed.get('link', '#'),
                    'entry.title': entry.get('title', 'untitled'),
                    'entry.link': entry.get('link', '#'),
                })

        # Build the content block for this link by populating the
        # LINK_TMPL string template.
        links_out.append(
            LINK_TMPL % {
                'link': link,
                'link_cnt': len(linkers),
                'linkers': '\n'.join(linkers_out)
            })

    # Complete building the content for this entry by populating the
    # CONTENT_TMPL string template.
    out = CONTENT_TMPL % '\n'.join(links_out)

    # Construct and append a new entry
    entry = FeedEntryDict({
        'title': TITLE_TMPL % {
            'link_cnt': len(links_out),
            'time': time.strftime(TITLE_TIME_FMT)
        },
        'link': '',
        'summary': out
    })
    f.append_entry(entry)

    # Output the current feed entries as both RSS and Atom
    open(FEED_NAME_FN % 'rss', 'w').write(f.scrape_rss())
    open(FEED_NAME_FN % 'atom', 'w').write(f.scrape_atom())
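
if __name__ == '__main__':
    # Assumption: like the other scripts in this listing, this module is
    # meant to be run directly (e.g. from cron); the original source may
    # already include an equivalent guard.
    main()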