/
ch15_bayes_filter.py
95 lines (80 loc) · 3.42 KB
/
ch15_bayes_filter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
#!/usr/bin/env python
"""
ch15_bayes_filter.py
Build a new feed out of entries filtered from a source feed
above a given Bayesian classifier threshold.
"""
import sys, os, time, ch15_bayes_agg
from agglib import openDBs, closeDBs, getNewFeedEntries
from scraperlib import Scraper
from ch14_feed_normalizer import normalize_feed_meta, normalize_entries
from reverend.thomas import Bayes
# Metadata written into the generated feed by main().
FEED_TITLE = "Bayes Recommendations"
FEED_TAGLINE = "Entries recommended by Bayesian-derived ratings"

# Output path template; main() fills the %s with 'rss' and 'atom'.
FEED_NAME_FN = "www/www.decafbad.com/docs/private-feeds/bayes-filtered.%s"

# Subscription list that main() reads line by line.
FEEDS_FN = "bayes_feeds.txt"

# Database names that main() hands to openDBs().
FEED_DB_FN = "bayes_filter_feeds_db"
ENTRY_DB_FN = "bayes_filter_entry_seen_db"

# File that main() passes to the Bayes guesser's load().
BAYES_DATA_FN = "bayesdata.dat"
def main():
    """
    Perform a test run of the BayesFilter using defaults.

    Loads the classifier data from BAYES_DATA_FN, fetches new entries for
    the feeds listed in FEEDS_FN, filters them through a BayesFilter and
    writes the result to FEED_NAME_FN in both RSS and Atom formats.

    The databases are closed even if a step in between raises, and every
    file opened here is explicitly closed.
    """
    # Create a new Bayes guesser, attempt to load data
    guesser = Bayes()
    guesser.load(BAYES_DATA_FN)

    # Open up the databases, load the subscriptions, get new entries.
    feed_db, entry_db = openDBs(FEED_DB_FN, ENTRY_DB_FN)
    try:
        feeds_file = open(FEEDS_FN, "r")
        try:
            # Skip blank lines so they are not passed along as empty
            # feed URIs.
            feeds = [line.strip() for line in feeds_file if line.strip()]
        finally:
            feeds_file.close()
        entries = getNewFeedEntries(feeds, feed_db, entry_db)

        # Build the feed filter.
        f = BayesFilter(guesser, entries)
        f.FEED_META['feed.title'] = FEED_TITLE
        f.FEED_META['feed.tagline'] = FEED_TAGLINE

        # Output the feed as both RSS and Atom.
        for fmt, render in (('rss', f.scrape_rss), ('atom', f.scrape_atom)):
            # Render before opening: opening with 'w' truncates the file,
            # so a failure while rendering must not wipe the previously
            # published feed.
            data = render()
            out = open(FEED_NAME_FN % fmt, 'w')
            try:
                out.write(data)
            finally:
                out.close()
    finally:
        # Close the databases. The guesser is only read in this script,
        # so its state is not written back to disk.
        closeDBs(feed_db, entry_db)
class BayesFilter(Scraper):
    """
    Filter feed entries using scores from a Bayesian classifier.

    Every aggregator entry is scored with ch15_bayes_agg.scoreEntry();
    entries scoring strictly above min_score are collected in
    entries_filtered and later run through the feed normalizer.
    """
    LAST_RUN_FN = 'filter_last_run.txt'
    STATE_FN = 'bayes_filter_state'

    # Becomes True on the instance once filter_aggregator_entries() has run.
    # Kept as a class-level default so the flag exists even when __init__
    # is bypassed.
    _entries_scored = False

    def __init__(self, guesser, entries, min_score=0.5):
        """
        Initialize with a classifier and the entries to be filtered.

        guesser   -- classifier handed to ch15_bayes_agg.scoreEntry()
        entries   -- iterable of aggregator entries; each one must expose
                     a dict-like 'entry' attribute
        min_score -- an entry is kept only if its score is strictly
                     greater than this value
        """
        self.guesser = guesser
        self.entries = entries
        self.min_score = min_score
        self.entries_filtered = []
        self._entries_scored = False

    def produce_entries(self):
        """
        Return the normalized entries that passed the score threshold.

        The aggregator entries are filtered on first use; the filtered
        list is then passed through normalize_entries() and each resulting
        entry is given this object's date_fmt.
        """
        # Filter only once.  An empty entries_filtered alone cannot tell
        # "not filtered yet" from "nothing passed the threshold", so the
        # flag prevents re-scoring every entry on each output run.  A
        # pre-populated entries_filtered is still honoured as-is.
        if not self._entries_scored and not self.entries_filtered:
            self.filter_aggregator_entries()

        # Normalize all the filtered entries
        entries = normalize_entries(self.entries_filtered)
        for e in entries:
            # date_fmt is not defined in this class; presumably it is
            # supplied by Scraper -- TODO confirm.
            e.date_fmt = self.date_fmt
        return entries

    def filter_aggregator_entries(self):
        """
        Process new entries from the aggregator for inclusion in the
        output feed.  This is broken out into its own method in order
        to reuse the new entries from the aggregator for multiple feed
        output runs.

        Note that each call scores all entries again, re-prefixes the
        titles of those that pass and appends them to entries_filtered.
        """
        # Get a score for each entry and, for each entry scored above
        # the minimum threshold, include it in the entries for output.
        for e in self.entries:
            score = ch15_bayes_agg.scoreEntry(self.guesser, e)
            if score > self.min_score:
                # HACK: Tweak each entry's title to include the score.
                e.entry['title'] = u"(%0.3f) %s" % \
                    (score, e.entry.get('title', 'untitled'))
                self.entries_filtered.append(e.entry)

        # Record that filtering has happened, even if nothing passed.
        self._entries_scored = True
# Run the filter only when executed as a script, not when imported.
if __name__ == '__main__':
    main()