def send_relevant_entry_updates(self, max_entries=4, decay=.9, context_utts=9, extract_top_n_keywords=10, min_found_keywords=3, min_transcript_utts=2):
    print 'send_relevant_entry_updates called'
    with Timer() as t:
        # Decay the scores of the currently displayed entries.
        # TODO: handle duplicate keywords and updated scores
        for entry in self.displayed_entries:
            entry["score"] *= decay

        # keywords = self.ke.getKeywordsDruid(self.complete_transcript[-1])

        # Take the last context_utts utterances and combine them.
        most_recent_transcript = " ".join(self.complete_transcript[-context_utts:])

        # Extract the top keywords from the recent transcript.
        keywords = self.ke.extract_best_keywords(most_recent_transcript, n_words=extract_top_n_keywords)
        print keywords

        # Abort if we found very few keywords or haven't seen enough utterances yet.
        if len(keywords) < min_found_keywords or len(self.complete_transcript) < min_transcript_utts:
            return

        # Extract the top wiki articles.
        new_relevant_entries = wiki_search_es.extract_best_articles(keywords, n=max_entries)
        print "-> Extracted top", len(new_relevant_entries), "documents", [(entry["title"], entry["score"]) for entry in new_relevant_entries]

        # Index the new entries by title.
        new_relevant_entries = dict((entry["title"], entry) for entry in new_relevant_entries)

        new_relevant_entries_set = set(new_relevant_entries)
        relevant_entries_set = set(self.relevant_entries)

        num_added = 0
        # Add entries that are relevant now but were not before.
        for key in new_relevant_entries_set - relevant_entries_set:
            entry = new_relevant_entries[key]
            if self.addDisplayEntry("wiki", entry):
                num_added += 1
                for category in entry["categories"]:
                    self.categories[category] += 1

        # Now look for changed scores (happens if a keyword got more important and gets mentioned again).
        for key in new_relevant_entries_set & relevant_entries_set:
            entry = new_relevant_entries[key]
            if entry["score"] > self.relevant_entries[key]["score"]:
                print "score change for:", entry["title"], self.relevant_entries[key]["score"], "->", entry["score"]
                found_displayed_entry = False
                for display_entry in self.displayed_entries:
                    # Already displayed; we could delete and re-add it to reflect the new placement.
                    if display_entry["title"] == key:
                        found_displayed_entry = True
                        #self.delDisplayEntry("wiki", entry["title"])
                        #self.addDisplayEntry("wiki", entry)
                        break
                if not found_displayed_entry:
                    # Not displayed; check whether the higher score now makes the document important enough to display.
                    self.addDisplayEntry("wiki", entry)

        for key in new_relevant_entries_set - relevant_entries_set:
            self.relevant_entries[key] = new_relevant_entries[key]

        topCategories_Event = self.topCategories()
        print topCategories_Event

        # TODO: only send something if topCategories actually changed.
        self.keyword_client.sendCategories(topCategories_Event)

    print 'send_relevant_entry_updates finished. Time needed:', t.secs, 'seconds.'
    print 'Displayed entries should now be:', [entry['title'] for entry in self.displayed_entries]
    print 'Added:', num_added
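
# The method above relies on a Timer context manager that records the elapsed
# wall-clock time of the block in a `secs` attribute, read after the block as
# `t.secs`. The class itself is not shown in this snippet; the following is a
# minimal sketch of such a helper (the name and the `secs` attribute come from
# the usage above, the implementation is an assumption).
import time

class Timer(object):
    """Context manager that measures wall-clock time of the enclosed block."""
    def __enter__(self):
        self.start = time.time()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # Store the elapsed time so it can be read as `t.secs` after the block.
        self.secs = time.time() - self.start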
# minimum_should_match_percent=5
print "extract_best_keywords:"
#extracted_num_tokens_like_manual = ke.extract_best_keywords(in_text, n_words=10, tfidf_only=tfidf_only) #was n_words=num_tokens
#print extracted_num_tokens_like_manual
extracted_10_tokens = ke.extract_best_keywords(in_text, n_words=10, tfidf_only=tfidf_only)
#if multiwords:
#    #boost scores
#    extracted_10_tokens = [(elem, score) for (elem, score) in extracted_10_tokens]
print extracted_10_tokens

# Extract the top wiki articles for the extracted keywords.
new_relevant_entries = wiki_search_es.extract_best_articles(extracted_10_tokens, n=10, min_summary_chars=300,
                                                            minimum_should_match_percent=minimum_should_match_percent)
print "-> Extracted top", len(new_relevant_entries), "documents", [(entry["title"], entry["score"]) for entry in new_relevant_entries]

# Write the extracted keywords into a file, one keyword per line.
# Multiword keywords are stored with '_' separators; write them space-separated.
with io.open(os.path.join(keyword_eval_dir, myfile), 'w', encoding='utf-8') as out_file:
    out_file.write(u'\n'.join([' '.join(elem[0].split('_')) for elem in extracted_10_tokens]) + u'\n')

# Write the search query string built from the keywords.
with io.open(os.path.join(query_eval_dir, myfile), 'w', encoding='utf-8') as out_file:
    out_file.write(wiki_search_es.construct_query_string(extracted_10_tokens) + u'\n')

# Dump the retrieved top-10 articles as JSON for the NDCG evaluation.
json_out = {'filename': myfile, 'orig': orig, 'top10': new_relevant_entries}
print json_out
with open(os.path.join(ndcg_eval_dir, myfile[:-4] + '.json'), 'w') as outfile:
    json.dump(json_out, outfile, indent=4)
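
# The JSON files written above (one ranked top-10 article list per input file)
# feed an NDCG evaluation, which is not part of this snippet. Below is a
# minimal sketch of NDCG@k over graded relevance judgments keyed by article
# title; the `judgments` dict and its grading scale are assumptions, only the
# NDCG formula itself is standard.
import math

def ndcg_at_k(ranked_titles, judgments, k=10):
    """NDCG@k: DCG of the given ranking divided by DCG of the ideal ranking."""
    gains = [judgments.get(title, 0) for title in ranked_titles[:k]]
    dcg = sum(g / math.log(i + 2, 2) for i, g in enumerate(gains))
    ideal = sorted(judgments.values(), reverse=True)[:k]
    idcg = sum(g / math.log(i + 2, 2) for i, g in enumerate(ideal))
    return dcg / idcg if idcg > 0 else 0.0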
start_time = time.time()
print 'Extracting keyphrases...'
tokens = ke.habibi_mimic(raw, n=num_tokens)
print 'Done extracting keyphrases. Time needed:', time.time() - start_time

print '==========Text=========='
for sentence in nltk.sent_tokenize(raw):
    print sentence

print '==========Keyphrases=========='
print tokens

start_time = time.time()
print 'Retrieving best articles...'
results = wiki_search_es.extract_best_articles(tokens, n=10)
print 'Done retrieving articles. Time needed:', time.time() - start_time

# Convert the resulting dictionary into a list sorted by score (descending).
sorted_results = sorted(results.itervalues(), key=lambda item: item['score'], reverse=True)

for article in sorted_results:
    print '==========Article=========='
    print 'Title:', article['title']
    #print 'Summary:', article['text']
    #print 'URL:', article['url']
    print 'Categories:', article['categories']
    print 'Score:', article['score']
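
# The module name wiki_search_es suggests extract_best_articles is backed by
# an Elasticsearch index of wiki articles, but the module is not shown here.
# A rough sketch of what such a keyword-based lookup could look like, assuming
# (keyword, score) pairs as in the other snippets; the index name "wiki", the
# "text" field, and the plain match query are all assumptions.
from elasticsearch import Elasticsearch

def search_articles_sketch(keywords, n=10):
    es = Elasticsearch()
    # Join the extracted keywords into one full-text query and let
    # Elasticsearch's relevance scoring rank the articles.
    query = " ".join(word for word, score in keywords)
    response = es.search(index="wiki", body={
        "query": {"match": {"text": query}},
        "size": n,
    })
    return [hit["_source"] for hit in response["hits"]["hits"]]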
def send_relevant_entry_updates(self, max_entries=4, decay=.8):
    print 'send_relevant_entry_updates called'
    with Timer() as t:
        # Decay the scores of the currently displayed entries.
        # TODO: handle duplicate keywords and updated scores
        for entry in self.displayed_entries:
            entry["score"] *= decay

        # keywords = self.ke.getKeywordsDruid(self.complete_transcript[-1])

        # Take the last 10 utterances and combine them.
        most_recent_transcript = " ".join(self.complete_transcript[-10:])

        # Extract the top 9 keywords.
        keywords = self.ke.extract_best_keywords(most_recent_transcript, n_words=9)
        print keywords

        # Extract the top 5 wiki articles.
        new_relevant_entries = wiki_search_es.extract_best_articles(keywords, n=5)

        new_relevant_entries_set = set(new_relevant_entries)
        relevant_entries_set = set(self.relevant_entries)

        # Generate del events for entries that are no longer relevant.
        #for key in relevant_entries_set - new_relevant_entries_set:
        #    entry = self.relevant_entries[key]
        #    self.delDisplayEntry("wiki", entry["title"])

        # Add entries that are relevant now but were not before.
        for key in new_relevant_entries_set - relevant_entries_set:
            entry = new_relevant_entries[key]
            if self.addDisplayEntry("wiki", entry):
                for category in entry["categories"]:
                    self.categories[category] += 1

        # Now look for changed scores (happens if a keyword got more important and gets mentioned again).
        for key in new_relevant_entries_set & relevant_entries_set:
            entry = new_relevant_entries[key]
            if entry["score"] > self.relevant_entries[key]["score"]:
                print "score change for:", entry["title"], self.relevant_entries[key]["score"], "->", entry["score"]
                found_displayed_entry = False
                for display_entry in self.displayed_entries:
                    # Already displayed; we could delete and re-add it to reflect the new placement.
                    if display_entry["title"] == key:
                        found_displayed_entry = True
                        #self.delDisplayEntry("wiki", entry["title"])
                        #self.addDisplayEntry("wiki", entry)
                        break
                if not found_displayed_entry:
                    # Not displayed; check whether the higher score now makes the document important enough to display.
                    self.addDisplayEntry("wiki", entry)

        for key in new_relevant_entries_set - relevant_entries_set:
            self.relevant_entries[key] = new_relevant_entries[key]

        topCategories_Event = self.topCategories()
        print topCategories_Event

        # TODO: only send something if topCategories actually changed.
        self.keyword_client.sendCategories(topCategories_Event)

    print 'send_relevant_entry_updates finished. Time needed:', t.secs, 'seconds.'
    print 'Displayed entries should now be:', [entry['title'] for entry in self.displayed_entries]
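
# With decay=.8, a displayed entry's score shrinks geometrically on every
# update: after k calls it is score * 0.8**k, so an entry that is never
# re-retrieved keeps only about a third of its score after 5 updates
# (0.8**5 ~ 0.33). A quick way to see the falloff for a starting score of 100:
for k in range(6):
    print 'score after', k, 'updates:', 100.0 * .8 ** k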