Ejemplo n.º 1
0
		def do_it(self, sources):
			for source in sources:
				words = nltk.wordpunct_tokenize(source.headline)
				words.extend(nltk.wordpunct_tokenize(source.summary))
				lowerwords=[x.lower() for x in words if len(x) > 1]
				self.ct += 1
				print self.ct, "TITLE",source.headline
				self.corpus.append(lowerwords)
				self.titles.append(source.headline)
				self.links.append(source.url)


			[[self.key_word_list.add(x) for x in self.top_keywords(self.nkeywords,doc,self.corpus)] for doc in self.corpus]

			self.ct=-1
			for doc in self.corpus:
			   self.ct+=1
			   print self.ct,"KEYWORDS"," ".join(self.top_keywords(self.nkeywords,doc,self.corpus))


			for document in self.corpus:
				vec=[]
				[vec.append(self.tfidf(word, document, self.corpus) if word in document else 0) for word in self.key_word_list]
				self.feature_vectors.append(vec)


			self.n=len(self.corpus)

			mat = numpy.empty((self.n, self.n))
			for i in xrange(0,self.n):
			  for j in xrange(0,self.n):
				mat[i][j] = nltk.cluster.util.cosine_distance(self.feature_vectors[i],self.feature_vectors[j])


			Z = linkage(mat, 'single')

			dendrogram(Z, color_threshold=self.t)


			clusters = self.extract_clusters(Z,self.t,self.n)
			
			stories = []

			for key in clusters:
				print "============================================="
				story = Story()  
				for id in clusters[key]:
					story.add_source(sources[id])
					print id,self.titles[id],sources[id].url
				stories.append(story)


			return stories
Ejemplo n.º 2
0
	def add_story(self, title, date, category, story, sources):
		"""Build a ``Story`` from the given fields and append it to ``self.stories``.

		:param title: value passed to ``Story.set_title``.
		:param date: value passed to ``Story.set_date``.
		:param category: currently unused; only the disabled database insert
			below references it.
		:param story: value passed to ``Story.set_story``.
		:param sources: iterable of source objects, each attached with
			``Story.add_source``.
		"""
		# Database persistence is disabled; the statements are kept for reference.
		#db = sqlite3.connect('model/news.db')
		#c = db.cursor()
		#c.execute("insert into news (title,date,category,story) values (?,?,?,?)", (title,date,category,story))
		#id = c.lastrowid
		#for source in sources:
		#	c.execute("insert into sources (id,source,url,headline,story) values (?,?,?,?,?)", (id,source.name,source.url,source.headline,source.story))
		#db.commit()
		#c.close()

		# With the insert above disabled no local `id` is assigned, so the bare
		# name would pass the builtin id() function to set_id. Use a 1-based
		# in-memory sequence number instead.
		# NOTE(review): assumes self.stories only holds stories added through
		# this method -- confirm ids cannot collide with stories loaded elsewhere.
		story_id = len(self.stories) + 1

		story_instance = Story()
		story_instance.set_id(story_id)
		story_instance.set_title(title)
		story_instance.set_date(date)
		story_instance.set_story(story)
		for source in sources:
			story_instance.add_source(source)
		self.stories.append(story_instance)