Code example #1: fetch_articles() pulls each configured feed, tokenizes new entries with NLTK, scores keywords with TF-IDF, and saves the results.
# Module-level imports assumed by this excerpt: feedparser, urllib2, nltk,
# pprint, time, json, plus the project's own cleanhtml and tf modules and the
# helpers strip_tags, removeNonAscii, filter_words, update_wordsdoc,
# normalize_tfidf and save_articles.
def fetch_articles(feed_list):
	# feed_list maps a feed name to its RSS/Atom URL.
	for (name, url) in feed_list.items():
		print "Fetching from " + name
		doc = feedparser.parse(url)
		print "Fetched " + str(len(doc.entries)) + " items"

		for entry in doc.entries:
			words = []
			article = []
			try:
				#response = urllib2.urlopen(entry.link);
				#data = response.read()
				#content = strip_tags(removeNonAscii(data))

				# The entry link is expected to embed the real article URL
				# after a second "http://" (aggregator-style redirect links).
				links = entry.link.split("http://")
				print links[2]
				response = urllib2.urlopen("http://" + links[2])
				html = response.read()

				cleaned = cleanhtml.cleanhtml(html, False)

				# Skip pages that yield almost no text after cleaning.
				if len(cleaned) > 200:
					#content = strip_tags(cleaned).encode('ascii', 'ignore')
					content = strip_tags(entry.description).encode('ascii', 'ignore')
					sents = nltk.sent_tokenize(content)

					# Collect tokens from the title and from every sentence.
					words.extend(nltk.word_tokenize(entry.title))
					for sent in sents:
						words.extend(nltk.word_tokenize(sent))

					words = filter_words(words)
					update_wordsdoc(words)

					# Score the tokens and keep the top 10 TF-IDF terms.
					t = tf.TFIDF()
					tfidf = t.assign_values(words)
					n_tfidf = normalize_tfidf(tfidf[:10])

					pp = pprint.PrettyPrinter(indent=4)
					pp.pprint(n_tfidf)

					# Store (timestamp, title, URL, JSON-encoded keyword weights).
					article.append((int(time.time()), entry.title, links[2],
							json.JSONEncoder().encode(n_tfidf)))
					save_articles(article)
			except Exception as e:
				print e
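The links[2] indexing above implies the function is fed aggregator feeds whose entry links wrap the real article URL behind a second "http://". A minimal driver might look like the sketch below; the feed name and URL are placeholders, not taken from the original project, and the module-level imports listed above are assumed to be in place.

# Hypothetical driver for fetch_articles(); feed name and URL are placeholders.
feeds = {
	"example aggregator": "http://example.com/feed.rss",
}
fetch_articles(feeds)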
Code example #2: read() is a web endpoint that scores the user's unread recent articles with a Naive Bayes classifier (and an optional k-nearest-neighbor cluster), picks one, and returns it as JSON.
# Module-level imports assumed by this excerpt: time, urllib2, json, the web
# framework's request object (Flask-style), plus the project's own db,
# classifier, cluster and cleanhtml modules.
def read():
	name = request.form['username']
	d = db.Database()
	d.open()
	# Fetch articles from the last 48 hours and reverse their order.
	articles = d.fetch_articles_time_limit(int(time.time()) - 2 * 86400)[::-1]
	# URLs this user has already read.
	read_urls = d.fetch_read_articles_url(name)
	nbc = classifier.NaiveBayesClassifier(name, 0.5, 0.5, 0.0001)
	#cl = cluster.Cluster(25, name, 100)
	#cl.find_neighbours()

	url = ""
	keywords = ""
	p_nbc_like = 0
	p_nbc_dislike = 0
	p_nbc_like_c = 0
	p_nbc_dislike_c = 0
	p_cl_like = 0
	p_cl_dislike = 0

	total_p_like = 0

	# Relative weight of the classifier score vs. the cluster score.
	weight_nbc, weight_cluster = 0.8, 0.2

	for article in articles:
		if article[2] not in read_urls:
			print "---URL---"
			print article[2]

			print "---Naive Bayes Classifier---"
			(nbc_like, nbc_dislike), (nbc_like_c, nbc_dislike_c) = nbc.calculate_probability(article[2])
			print "like/dislike"
			print nbc_like, nbc_dislike

			print "---k Nearest Neighbor Cluster---"
			#cl_like, cl_dislike = cl.get_probability(article[2])
			cl_like, cl_dislike = 0.5, 0.5

			# Chance of serving a random article instead of the best-scored one.
			# Currently forced on, so the first unread article is returned.
			#rnd = (time.time() % 10 == 0)
			rnd = True
			#rnd = False
			if (weight_cluster * cl_like + weight_nbc * nbc_like > total_p_like) or rnd:
				total_p_like = weight_cluster * cl_like + weight_nbc * nbc_like

				url = article[2]
				keywords = article[3]
				p_nbc_like = nbc_like
				p_nbc_dislike = nbc_dislike
				p_nbc_like_c = nbc_like_c
				p_nbc_dislike_c = nbc_dislike_c
				p_cl_like = cl_like
				p_cl_dislike = cl_dislike

				if rnd:
					break

	# Fetch and clean the chosen article page for display.
	response = urllib2.urlopen("http://" + url)
	html = response.read()

	cleaned = cleanhtml.cleanhtml(html, True)

	return json.JSONEncoder().encode({
		'url': url,
		'keywords': keywords,
		'p_like': p_nbc_like,
		'p_dislike': p_nbc_dislike,
		'p_like_c': p_nbc_like_c,
		'p_dislike_c': p_nbc_dislike_c,
		'page': cleaned,
		'cl_like': p_cl_like,
		'cl_dislike': p_cl_dislike,
		'weight_nbc': weight_nbc,
		'weight_cluster': weight_cluster,
	})
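read() pulls the username from request.form, which points at a Flask-style view function. A minimal wiring sketch is below, assuming Flask and that this code lives in the same module as read(); the '/read' route path and the app object are assumptions, not taken from the original project.

# Hypothetical Flask wiring; the '/read' route and app object are assumptions.
from flask import Flask, request

app = Flask(__name__)
# Register the read() view defined above as a POST endpoint.
app.add_url_rule('/read', 'read', read, methods=['POST'])

if __name__ == '__main__':
	app.run()

With this wiring, a client would POST a form field named username (for example, curl -d username=alice http://localhost:5000/read) and receive the JSON document built at the end of read().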