def run(self):
    # Print every stored cluster followed by the titles of the news items it groups.
    from dbfrontend import DBProxy

    db = DBProxy()
    clusters = db.get_all_clusters()
    for i, cluster in enumerate(clusters):
        print "Cluster " + str(i)
        for n in cluster.newss.all():
            print " " + n.title
        print ""
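# The cluster.newss.all() call above implies Django-style models with a reverse
# relation named "newss" from News to Cluster. A minimal sketch of what those
# models might look like, assuming Django's ORM; every field except title and
# the "newss" related_name is an assumption, not taken from the original code:

from django.db import models

class Cluster(models.Model):
    created_at = models.DateTimeField(auto_now_add=True)

class News(models.Model):
    title = models.CharField(max_length=500)
    url = models.URLField(max_length=500)
    # related_name="newss" is what gives each Cluster the .newss manager used above
    cluster = models.ForeignKey(Cluster, related_name="newss",
                                null=True, on_delete=models.SET_NULL)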
def run(self):
    # Fetch the configured RSS feeds and store any news items not seen before.
    from dbfrontend import DBProxy
    import sanitizer

    rss_entries = self.fetch_rss_entries()

    db = DBProxy()
    db_news = db.get_all_news()
    db_urls = [n.url for n in db_news]

    # Keep only entries whose URL is not already in the database,
    # then drop duplicate URLs within the fetched batch itself.
    yet_unfetched_entries = [rss for rss in rss_entries if rss.url not in db_urls]
    unique_rss_entries = []
    seen_urls = set()
    for entry in yet_unfetched_entries:
        if entry.url not in seen_urls:
            seen_urls.add(entry.url)
            unique_rss_entries.append(entry)
    print "There are " + str(len(unique_rss_entries)) + " new entries to fetch"

    news = self.fetch_and_parse_news(unique_rss_entries)
    print "Fetched " + str(len(news)) + " news items"

    # fetch_and_parse_news may yield empty results for articles that failed
    # to download or parse; drop them before storing.
    news = [n for n in news if n]
    db.add_list(news)
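# A minimal sketch of what fetch_rss_entries could look like, assuming the
# feedparser library and a hypothetical self.feed_urls list of feed URLs;
# RssEntry is a stand-in container, not a class from the original code:

import collections
import feedparser

RssEntry = collections.namedtuple("RssEntry", ["title", "url"])

def fetch_rss_entries(self):
    entries = []
    for feed_url in self.feed_urls:
        feed = feedparser.parse(feed_url)
        for entry in feed.entries:
            # feedparser exposes each item's title and link on the entry dict
            entries.append(RssEntry(title=entry.get("title", ""),
                                    url=entry.get("link", "")))
    return entries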
def run(self, distance_function, reduce_function, threshold):
    # Rebuild the clusters from scratch: turn every news item into a weighted
    # term vector, then group items whose distance falls under the threshold.
    from dbfrontend import DBProxy

    db = DBProxy()
    db.delete_all_clusters()
    news = db.get_all_news()

    tw = NewsGroup.TITLE_WEIGHT
    bw = NewsGroup.BODY_WEIGHT
    for n in news:
        # reduce_function maps a text to a {term: score} dict; body and title
        # terms are combined using their respective weights.
        body_terms = reduce_function(n.clean_body)
        title_terms = reduce_function(n.clean_title)
        n.vector = dict(
            (term, bw * body_terms.get(term, 0) + tw * title_terms.get(term, 0))
            for term in set(body_terms) | set(title_terms)
        )

    clusters = self.group(db, news, threshold, distance_function)
    db.add_list(clusters)
    return clusters
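# The clustering step is parameterised by two callables. A minimal sketch of
# plausible implementations, assuming the vectors are {term: weight} dicts as
# built above: reduce_function turns a text into term counts, and
# distance_function is the cosine distance between two sparse vectors.
# The names, tokenisation, and threshold value below are assumptions,
# not the original project's code:

import math
import re

def word_frequencies(text):
    # reduce_function candidate: lowercase word counts
    counts = {}
    for word in re.findall(r"\w+", text.lower()):
        counts[word] = counts.get(word, 0) + 1
    return counts

def cosine_distance(u, v):
    # distance_function candidate: 1 - cosine similarity of sparse dict vectors
    dot = sum(u[k] * v.get(k, 0) for k in u)
    norm_u = math.sqrt(sum(x * x for x in u.values()))
    norm_v = math.sqrt(sum(x * x for x in v.values()))
    if norm_u == 0 or norm_v == 0:
        return 1.0
    return 1.0 - dot / (norm_u * norm_v)

# Illustrative invocation on a hypothetical command instance:
#     clusters = command.run(cosine_distance, word_frequencies, threshold=0.6)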