# Pull each RSS feed, extract keywords with tf-idf and persist the scored
# articles. feedparser and nltk are third-party packages; cleanhtml, tf and
# the helpers used below (strip_tags, filter_words, update_wordsdoc,
# normalize_tfidf, save_articles) are local to this project.
import json
import pprint
import time
import urllib2

import feedparser
import nltk

import cleanhtml
import tf


def fetch_articles(feed_list):
    for (name, url) in feed_list.items():
        print "Fetching from " + name
        doc = feedparser.parse(url)
        print "Fetched " + str(len(doc.entries)) + " items"
        for entry in doc.entries:
            words = []
            article = []
            try:
                # Aggregator links embed the target URL after a second
                # "http://"; split on the scheme and take the third part.
                links = entry.link.split("http://")
                print links[2]
                response = urllib2.urlopen("http://" + links[2])
                html = response.read()
                cleaned = cleanhtml.cleanhtml(html, False)
                # Only score articles whose cleaned body is long enough.
                if len(cleaned) > 200:
                    # Keywords come from the feed summary, not the page body.
                    content = strip_tags(entry.description).encode('ascii', 'ignore')
                    sents = nltk.sent_tokenize(content)
                    words.extend(nltk.word_tokenize(entry.title))
                    # Tokenize each sentence and accumulate the words.
                    for sent in sents:
                        words.extend(nltk.word_tokenize(sent))
                    words = filter_words(words)
                    update_wordsdoc(words)
                    # Keep the ten highest tf-idf terms, normalized.
                    t = tf.TFIDF()
                    tfidf = t.assign_values(words)
                    n_tfidf = normalize_tfidf(tfidf[:10])
                    pp = pprint.PrettyPrinter(indent=4)
                    pp.pprint(n_tfidf)
                    article.append((int(time.time()), entry.title, links[2],
                                    json.JSONEncoder().encode(n_tfidf)))
                    save_articles(article)
            except Exception, e:
                print e
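
# A minimal driver sketch for the fetcher. The feed name and URL are
# illustrative placeholders (assumptions), not feeds the project is known
# to poll; fetch_articles expects a dict of display name -> RSS URL.
def run_fetch_demo():
    demo_feeds = {"BBC News": "http://feeds.bbci.co.uk/news/rss.xml"}
    fetch_articles(demo_feeds)
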
# Web endpoint: recommend one unread article for the given user by blending
# the Naive Bayes classifier with a (currently stubbed-out) kNN cluster
# model. db and classifier are local modules; request is assumed to come
# from Flask, given the use of request.form.
from flask import request

import classifier
import db


def read():
    name = request.form['username']
    d = db.Database()
    d.open()
    # Fetch articles from the last 2*24 hours, newest first.
    articles = d.fetch_articles_time_limit(int(time.time()) - 2 * 86400)[::-1]
    read_urls = d.fetch_read_articles_url(name)
    nbc = classifier.NaiveBayesClassifier(name, 0.5, 0.5, 0.0001)
    url = ""
    keywords = ""
    p_nbc_like = 0
    p_nbc_dislike = 0
    p_nbc_like_c = 0
    p_nbc_dislike_c = 0
    p_cl_like = 0
    p_cl_dislike = 0
    total_p_like = 0
    weight_nbc, weight_cluster = 0.8, 0.2
    for article in articles:
        if article[2] not in read_urls:
            print "---URL---"
            print article[2]
            print "---Naive Bayes Classifier---"
            (nbc_like, nbc_dislike), (nbc_like_c, nbc_dislike_c) = \
                nbc.calculate_probability(article[2])
            print "like/dislike"
            print nbc_like, nbc_dislike
            print "---k Nearest Neighbor Cluster---"
            # The cluster model is disabled; fall back to a uniform prior.
            cl_like, cl_dislike = 0.5, 0.5
            # rnd forces the first unread article to be served instead of
            # the highest-scoring one.
            rnd = True
            score = weight_cluster * cl_like + weight_nbc * nbc_like
            if score > total_p_like or rnd:
                total_p_like = score
                url = article[2]
                keywords = article[3]
                p_nbc_like = nbc_like
                p_nbc_dislike = nbc_dislike
                p_nbc_like_c = nbc_like_c
                p_nbc_dislike_c = nbc_dislike_c
                p_cl_like = cl_like
                p_cl_dislike = cl_dislike
                if rnd:
                    break
    response = urllib2.urlopen("http://" + url)
    html = response.read()
    cleaned = cleanhtml.cleanhtml(html, True)
    return json.JSONEncoder().encode({
        'url': url, 'keywords': keywords,
        'p_like': p_nbc_like, 'p_dislike': p_nbc_dislike,
        'p_like_c': p_nbc_like_c, 'p_dislike_c': p_nbc_dislike_c,
        'page': cleaned,
        'cl_like': p_cl_like, 'cl_dislike': p_cl_dislike,
        'weight_nbc': weight_nbc, 'weight_cluster': weight_cluster,
    })
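
# A minimal wiring sketch: expose read() as a POST endpoint. That the
# project uses Flask is an assumption (suggested by request.form); the
# route path and port below are illustrative, not taken from the project.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/read', 'read', read, methods=['POST'])

if __name__ == '__main__':
    app.run(port=5000)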