# ch15_bayes_filter.py

#!/usr/bin/env python
"""
ch15_bayes_filter.py

Build a new feed out of entries filtered from a source feed
above a given Bayesian classifier threshold.
"""
import sys, os, time, ch15_bayes_agg
from agglib import openDBs, closeDBs, getNewFeedEntries
from scraperlib import Scraper
from ch14_feed_normalizer import normalize_feed_meta, normalize_entries
from reverend.thomas import Bayes

# Title and tagline that main() assigns into the generated feed's metadata.
FEED_TITLE = "Bayes Recommendations"
FEED_TAGLINE = "Entries recommended by Bayesian-derived ratings"

# Output path template; main() fills the %s slot with 'rss' and 'atom'.
FEED_NAME_FN = 'www/www.decafbad.com/docs/private-feeds/bayes-filtered.%s'

# Text file main() reads line by line to get the list of source feeds.
FEEDS_FN = 'bayes_feeds.txt'

# Names handed to openDBs() for the feed and seen-entry databases.
FEED_DB_FN = 'bayes_filter_feeds_db'
ENTRY_DB_FN = 'bayes_filter_entry_seen_db'

# File the Bayes guesser loads its data from.
BAYES_DATA_FN = 'bayesdata.dat'

def main():
    """
    Run the BayesFilter once using the module-level defaults.

    Loads the guesser data from BAYES_DATA_FN, fetches new entries for
    the feeds listed in FEEDS_FN, and writes the filtered feed to
    FEED_NAME_FN in both RSS and Atom form.
    """
    # Create a new Bayes guesser and load its data.
    guesser = Bayes()
    guesser.load(BAYES_DATA_FN)

    # Load the subscriptions, one per line.  The with-block closes the
    # file promptly instead of leaving the handle to the garbage collector.
    with open(FEEDS_FN, "r") as feeds_file:
        feeds = [line.strip() for line in feeds_file]

    # Open up the databases and make sure they get closed again, even
    # if fetching, filtering or writing fails part-way through.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    try:
        entries = getNewFeedEntries(feeds, feed_db, entry_db)

        # Build the feed filter.
        f = BayesFilter(guesser, entries)
        f.FEED_META['feed.title']   = FEED_TITLE
        f.FEED_META['feed.tagline'] = FEED_TAGLINE

        # Output the feed as both RSS and Atom.  Each format is rendered
        # before its file is opened, so a rendering failure does not
        # leave a truncated, empty output file behind.
        for fmt, render in (('rss', f.scrape_rss), ('atom', f.scrape_atom)):
            feed_text = render()
            with open(FEED_NAME_FN % fmt, 'w') as out:
                out.write(feed_text)
    finally:
        # Close the databases.  The guesser is only read by this
        # script, so there is no guesser state to save.
        closeDBs(feed_db, entry_db)
    
class BayesFilter(Scraper):
    """
    Scraper that outputs only those aggregator entries whose score
    from a Bayesian guesser exceeds a minimum threshold.
    """
    LAST_RUN_FN  = 'filter_last_run.txt'
    STATE_FN     = 'bayes_filter_state'

    def __init__(self, guesser, entries, min_score=0.5):
        """
        Remember the guesser, the candidate aggregator entries, and the
        score an entry must exceed to be included in the output.
        """
        self.min_score        = min_score
        self.guesser          = guesser
        self.entries          = entries
        # Filled in by filter_aggregator_entries().
        self.entries_filtered = []

    def produce_entries(self):
        """
        Return the normalized form of the entries that passed the
        score filter, running the filter first if nothing has been
        collected yet.
        """
        # An empty list means filtering has not produced anything so far.
        if not self.entries_filtered:
            self.filter_aggregator_entries()

        # Normalize the survivors and give each one this scraper's
        # date format.
        normalized = normalize_entries(self.entries_filtered)
        for item in normalized:
            item.date_fmt = self.date_fmt

        return normalized

    def filter_aggregator_entries(self):
        """
        Score every aggregator entry and collect the ones rated above
        the minimum score.  Kept separate from produce_entries() so
        the same collected entries can be reused for multiple feed
        output runs.
        """
        for agg_entry in self.entries:
            rating = ch15_bayes_agg.scoreEntry(self.guesser, agg_entry)

            # Skip anything that does not score above the threshold.
            if not (rating > self.min_score):
                continue

            # HACK: Tweak each entry's title to include the score.
            data = agg_entry.entry
            scored_title = u"(%0.3f) %s" % (rating, data.get('title', 'untitled'))
            data['title'] = scored_title
            self.entries_filtered.append(data)
        
# Only run the filter when executed as a script, not on import.
if __name__ == '__main__':
    main()