def do_it(self, sources):
    # Tokenize each source's headline and summary into a lowercased word list
    # and record its title/URL so cluster members can be reported later.
    for source in sources:
        words = nltk.wordpunct_tokenize(source.headline)
        words.extend(nltk.wordpunct_tokenize(source.summary))
        lowerwords = [x.lower() for x in words if len(x) > 1]
        self.ct += 1
        print self.ct, "TITLE", source.headline
        self.corpus.append(lowerwords)
        self.titles.append(source.headline)
        self.links.append(source.url)

    # Build the global keyword set from the top tf-idf keywords of every document.
    for doc in self.corpus:
        for keyword in self.top_keywords(self.nkeywords, doc, self.corpus):
            self.key_word_list.add(keyword)

    self.ct = -1
    for doc in self.corpus:
        self.ct += 1
        print self.ct, "KEYWORDS", " ".join(self.top_keywords(self.nkeywords, doc, self.corpus))

    # One tf-idf feature vector per document over the global keyword list;
    # keywords absent from a document get a weight of 0.
    for document in self.corpus:
        vec = [self.tfidf(word, document, self.corpus) if word in document else 0
               for word in self.key_word_list]
        self.feature_vectors.append(vec)

    # Pairwise cosine-distance matrix between the feature vectors.
    self.n = len(self.corpus)
    mat = numpy.empty((self.n, self.n))
    for i in xrange(0, self.n):
        for j in xrange(0, self.n):
            mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],
                                                          self.feature_vectors[j])

    # Single-linkage hierarchical clustering; cut the tree at threshold t.
    Z = linkage(mat, 'single')
    dendrogram(Z, color_threshold=self.t)
    clusters = self.extract_clusters(Z, self.t, self.n)

    # Each cluster of document indices becomes one Story built from its sources.
    stories = []
    for key in clusters:
        print "============================================="
        story = Story()
        for doc_id in clusters[key]:
            story.add_source(sources[doc_id])
            print doc_id, self.titles[doc_id], sources[doc_id].url
        stories.append(story)

    return stories
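
# Hedged sketches of the helpers do_it relies on (tfidf, top_keywords,
# extract_clusters). Their real implementations live elsewhere in this class;
# the bodies below are illustrative assumptions only, not the original code.
import math
from collections import defaultdict
from scipy.cluster.hierarchy import fcluster

def tfidf(self, word, document, corpus):
    # term frequency in this document, scaled by inverse document frequency
    tf = document.count(word) / float(len(document))
    df = sum(1 for doc in corpus if word in doc)
    return tf * math.log(len(corpus) / float(df))

def top_keywords(self, n, document, corpus):
    # the n words of this document with the highest tf-idf weight
    scored = [(self.tfidf(word, document, corpus), word) for word in set(document)]
    return [word for score, word in sorted(scored, reverse=True)[:n]]

def extract_clusters(self, Z, threshold, n):
    # cut the single-linkage tree at distance `threshold` and group the
    # original document indices by flat-cluster label
    labels = fcluster(Z, t=threshold, criterion='distance')
    clusters = defaultdict(list)
    for doc_index, label in enumerate(labels):
        clusters[label].append(doc_index)
    return dict(clusters)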
def add_story(self, title, date, category, story, sources):
    #db = sqlite3.connect('model/news.db')
    #c = db.cursor()
    #c.execute("insert into news (title,date,category,story) values (?,?,?,?)", (title,date,category,story))
    #id = c.lastrowid
    #for source in sources:
    #    c.execute("insert into sources (id,source,url,headline,story) values (?,?,?,?,?)", (id,source.name,source.url,source.headline,source.story))
    #db.commit()
    #c.close()
    # Build an in-memory Story from the arguments and keep it on the list.
    story_instance = Story()
    # `id` came from c.lastrowid in the commented-out persistence code above.
    story_instance.set_id(id)
    story_instance.set_title(title)
    story_instance.set_date(date)
    story_instance.set_story(story)
    for source in sources:
        story_instance.add_source(source)
    self.stories.append(story_instance)
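
# Hedged sketch of the Story container used by do_it and add_story above. The
# real class is defined elsewhere; only the setters exercised in this section
# are sketched here, as plain attribute wrappers (an assumption).
class Story(object):
    def __init__(self):
        self.id = None
        self.title = None
        self.date = None
        self.story = None
        self.sources = []

    def set_id(self, id):
        self.id = id

    def set_title(self, title):
        self.title = title

    def set_date(self, date):
        self.date = date

    def set_story(self, story):
        self.story = story

    def add_source(self, source):
        self.sources.append(source)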