def generate_query_list(self, topic):
    """
    Builds the candidate queries for a topic from a title "stem" combined with
    the ranked terms of the topic description.

    :param topic: object exposing ``title`` and ``content`` text attributes.
    :return: the value produced by the permutation helper for the topic language
             model, the title stem and the ranked description terms.
    """
    # Stored on the instance but not read in this method; presumably consumed by
    # the stem/permutation helpers -- confirm against those helpers.
    self.__title_stem_length = 2
    self.__description_cutoff = 0

    title_text = topic.title
    description_text = topic.content
    language_model = self._generate_topic_language_model(topic)

    # Title side: extract candidate terms, rank them, then derive the stem.
    title_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    title_terms = title_extractor.extract_queries_from_text(title_text)
    ranked_title_terms = self._rank_terms(title_terms, topic_language_model=language_model)
    stem = self.__get_title_stem(language_model, ranked_title_terms)

    # Description side: same extraction and ranking, using a separate extractor.
    description_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    description_terms = description_extractor.extract_queries_from_text(description_text)
    ranked_description_terms = self._rank_terms(description_terms,
                                                topic_language_model=language_model)

    return self.__generate_permutations(language_model, stem, ranked_description_terms)
def __update_topic_language_model(self, text_list):
    """
    Rebuilds ``self.topic_language_model`` from the topic text plus new text.

    The topic text (title repeated three times, followed by the content) and the
    space-joined ``text_list`` are each run through a term extractor. The two sets
    of term counts are summed, wrapped in a ``LanguageModel`` and smoothed against
    ``self.background_language_model`` with ``self.mu``.

    :param text_list: iterable of strings to fold into the topic model.
    :return: None; the new model is stored on ``self.topic_language_model``.
    """
    topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    # NOTE(review): reusing the extractor assumes each extraction rebinds
    # query_count to a fresh mapping -- confirm in SingleQueryGeneration.
    term_extractor.extract_queries_from_text(snippet_text)
    combined_term_counts = term_extractor.query_count

    # Add the topic's counts on top of the counts taken from the new text.
    for term, count in topic_term_counts.items():
        combined_term_counts[term] = combined_term_counts.get(term, 0) + count

    new_language_model = LanguageModel(term_dict=combined_term_counts)
    self.topic_language_model = SmoothedLanguageModel(new_language_model,
                                                      self.background_language_model,
                                                      self.mu)

    log.debug("Updating topic {0}".format(self._topic.id))
def _update_topic_language_model(self, text_list):
    """
    Rebuilds ``self.topic_language_model`` from the topic text plus new text.

    The topic text comes from ``self._make_topic_text(document_text=text_list)``;
    the new text is ``text_list`` joined with spaces. Term counts are extracted
    from both, summed, and stored as an unsmoothed ``LanguageModel``.

    :param text_list: iterable of strings to fold into the topic model.
    :return: None; the new model is stored on ``self.topic_language_model``.
    """
    topic_text = self._make_topic_text(document_text=text_list)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    # NOTE(review): reusing the extractor assumes each extraction rebinds
    # query_count to a fresh mapping -- confirm in SingleQueryGeneration.
    term_extractor.extract_queries_from_text(snippet_text)
    combined_term_counts = term_extractor.query_count

    # Add the topic's counts on top of the counts taken from the new text.
    for term, count in topic_term_counts.items():
        combined_term_counts[term] = combined_term_counts.get(term, 0) + count

    self.topic_language_model = LanguageModel(term_dict=combined_term_counts)

    log.debug("Updating topic {0}".format(self._topic.id))
def __update_topic_language_model(self, text_list):
    """
    Recomputes the smoothed topic language model after new text has been seen.

    Term counts are extracted from the topic text (title three times, then the
    content) and from the space-joined ``text_list``. The counts are added
    together and the resulting ``LanguageModel`` is smoothed against
    ``self.background_language_model`` using ``self.mu``.

    :param text_list: iterable of strings to fold into the topic model.
    :return: None; the new model is stored on ``self.topic_language_model``.
    """
    topic_text = '{title} {title} {title} {content}'.format(
        **self._topic.__dict__)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    # NOTE(review): reusing the extractor assumes each extraction rebinds
    # query_count to a fresh mapping -- confirm in SingleQueryGeneration.
    term_extractor.extract_queries_from_text(snippet_text)
    merged_term_counts = term_extractor.query_count

    # Fold the topic's counts into the counts taken from the new text.
    for term, count in topic_term_counts.items():
        merged_term_counts[term] = merged_term_counts.get(term, 0) + count

    new_language_model = LanguageModel(term_dict=merged_term_counts)
    self.topic_language_model = SmoothedLanguageModel(
        new_language_model, self.background_language_model, self.mu)

    log.debug("Updating topic {0}".format(self._topic.id))
def generate_query_list(self, search_context):
    """
    Builds the candidate queries for the topic carried by a search context, from
    a title "stem" combined with the ranked terms of the topic description.

    :param search_context: object exposing a ``topic`` attribute; it is also
                           handed to ``_generate_topic_language_model``.
    :return: the value produced by the permutation helper for the topic language
             model, the title stem and the ranked description terms.
    """
    topic = search_context.topic
    title_text = topic.title
    description_text = topic.content
    language_model = self._generate_topic_language_model(search_context)

    # Title side: extract candidate terms, rank them, then derive the stem.
    title_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    title_terms = title_extractor.extract_queries_from_text(title_text)
    ranked_title_terms = self._rank_terms(
        title_terms, topic_language_model=language_model)
    stem = self.__get_title_stem(language_model, ranked_title_terms)

    # Description side: same extraction and ranking, using a separate extractor.
    description_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    description_terms = description_extractor.extract_queries_from_text(
        description_text)
    ranked_description_terms = self._rank_terms(
        description_terms, topic_language_model=language_model)

    return self.__generate_permutations(
        language_model, stem, ranked_description_terms)
def make_topic_lm(self, topic):
    """
    Builds a language model from the term counts of a topic's content text.

    :param topic: object exposing a ``content`` text attribute.
    :return: a ``LanguageModel`` built from the extracted term counts.
    """
    content_text = topic.content

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
    extractor.extract_queries_from_text(content_text)
    term_counts = extractor.query_count

    return LanguageModel(term_dict=term_counts)
def make_topic_lm(self, topic):
    """
    Returns a language model of the terms occurring in the topic's content.

    :param topic: object exposing a ``content`` text attribute.
    :return: a ``LanguageModel`` built from the extractor's term counts.
    """
    body_text = topic.content

    term_source = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
    # The extraction is run for its effect on query_count; its return value is unused.
    term_source.extract_queries_from_text(body_text)
    counts = term_source.query_count

    return LanguageModel(term_dict=counts)
def make_topic_lm(self):
    """
    Builds the smoothed topic language model and stores it on ``self.topicLM``.

    Term counts are extracted from the topic's content and title, wrapped in a
    ``LanguageModel`` and smoothed against ``self.backgroundLM`` with a fixed
    third argument of 100.

    :return: None; the model is stored on ``self.topicLM``.
    """
    # A separating space keeps the last content word and the first title word
    # from being fused into a single term.
    topic_text = ' '.join((self.topic.content, self.topic.title))

    doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
    doc_extractor.extract_queries_from_text(topic_text)
    doc_term_counts = doc_extractor.query_count

    lm = LanguageModel(term_dict=doc_term_counts)
    self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)

    # Single-argument call form parses under both Python 2 and Python 3 and
    # emits the same "making topic <n>" line.
    print("making topic {0}".format(self.topicLM.docLM.total_occurrences))
def make_topic_lm(self):
    """
    Creates ``self.topicLM``, a smoothed language model of the current topic.

    The topic's content and title are combined into one text, term counts are
    extracted from it, and the resulting ``LanguageModel`` is smoothed against
    ``self.backgroundLM`` with a fixed third argument of 100.

    :return: None; the model is stored on ``self.topicLM``.
    """
    # Join with a space so the final content word and the first title word
    # remain two separate terms.
    topic_text = ' '.join((self.topic.content, self.topic.title))

    doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
    doc_extractor.extract_queries_from_text(topic_text)
    doc_term_counts = doc_extractor.query_count

    lm = LanguageModel(term_dict=doc_term_counts)
    self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)

    # Written as a one-argument call so it is valid in Python 2 and Python 3
    # while printing the same "making topic <n>" text.
    print("making topic {0}".format(self.topicLM.docLM.total_occurrences))
def extract_term_dict_from_text(text, stopword_file):
    """
    Parses the given text and counts how many times each term occurs.

    :param text: a string
    :param stopword_file: stopword file handed to the term extractor
    :return: the extractor's ``query_count`` mapping of term to count
    """
    extractor = SingleQueryGeneration(minlen=3, stopwordfile=stopword_file)
    # The extraction is run for its effect on query_count; its return value is unused.
    extractor.extract_queries_from_text(text)
    return extractor.query_count
def make_topic_language_model(self):
    """
    Builds the smoothed topic language model and stores it on
    ``self.topic_language_model``.

    Term counts are extracted from the topic's content and title, wrapped in a
    ``LanguageModel`` and smoothed against ``self.background_language_model``
    with a fixed third argument of 100.

    :return: None; the model is stored on ``self.topic_language_model``.
    """
    # A separating space keeps the last content word and the first title word
    # from being fused into a single term.
    topic_text = ' '.join((self._topic.content, self._topic.title))

    document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    document_extractor.extract_queries_from_text(topic_text)
    document_term_counts = document_extractor.query_count

    language_model = LanguageModel(term_dict=document_term_counts)
    self.topic_language_model = SmoothedLanguageModel(language_model,
                                                      self.background_language_model,
                                                      100)

    # Single-argument call form parses under both Python 2 and Python 3 and
    # emits the same "making topic <n>" line.
    print("making topic {0}".format(self.topic_language_model.docLM.total_occurrences))
def make_topic_language_model(self):
    """
    Builds the smoothed topic language model and stores it on
    ``self.topic_language_model``.

    The source text is the topic title repeated three times followed by the
    topic content. Its term counts are wrapped in a ``LanguageModel`` and
    smoothed against ``self.background_language_model`` using ``self.mu``.

    :return: None; the model is stored on ``self.topic_language_model``.
    """
    weighted_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    extractor.extract_queries_from_text(weighted_text)
    term_counts = extractor.query_count

    unsmoothed_model = LanguageModel(term_dict=term_counts)
    self.topic_language_model = SmoothedLanguageModel(unsmoothed_model,
                                                      self.background_language_model,
                                                      self.mu)

    occurrences = self.topic_language_model.docLM.total_occurrences
    log.debug("Making topic {0}".format(occurrences))
def _generate_naive_topic_language_model(self, topic):
    """
    Returns a language model built only from the topic's content text.

    Inheriting classes may override this method to produce a different model.

    :param topic: object exposing a ``content`` text attribute.
    :return: a ``LanguageModel`` built from the extracted term counts.
    """
    content_text = topic.content

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    extractor.extract_queries_from_text(content_text)
    term_counts = extractor.query_count

    # The model is built directly from the term counts of the content text.
    return LanguageModel(term_dict=term_counts)
def make_topic_language_model(self):
    """
    Creates ``self.topic_language_model``, a smoothed model of the current topic.

    The topic's content and title are combined into one text, term counts are
    extracted from it, and the resulting ``LanguageModel`` is smoothed against
    ``self.background_language_model`` with a fixed third argument of 100.

    :return: None; the model is stored on ``self.topic_language_model``.
    """
    # Join with a space so the final content word and the first title word
    # remain two separate terms.
    topic_text = ' '.join((self._topic.content, self._topic.title))

    document_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    document_extractor.extract_queries_from_text(topic_text)
    document_term_counts = document_extractor.query_count

    language_model = LanguageModel(term_dict=document_term_counts)
    self.topic_language_model = SmoothedLanguageModel(
        language_model, self.background_language_model, 100)

    # Written as a one-argument call so it is valid in Python 2 and Python 3
    # while printing the same "making topic <n>" text.
    print("making topic {0}".format(
        self.topic_language_model.docLM.total_occurrences))
def make_topic_language_model(self):
    """
    Creates ``self.topic_language_model`` from the current topic.

    The title is repeated three times ahead of the content when the source text
    is assembled. Term counts from that text become a ``LanguageModel``, which
    is smoothed against ``self.background_language_model`` using ``self.mu``.

    :return: None; the model is stored on ``self.topic_language_model``.
    """
    source_text = '{title} {title} {title} {content}'.format(
        **self._topic.__dict__)

    term_source = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    term_source.extract_queries_from_text(source_text)
    counts = term_source.query_count

    base_model = LanguageModel(term_dict=counts)
    self.topic_language_model = SmoothedLanguageModel(
        base_model, self.background_language_model, self.mu)

    total = self.topic_language_model.docLM.total_occurrences
    log.debug("Making topic {0}".format(total))
def make_topic_lm(self, topic):
    """
    Builds a Bayes language model from the topic title and the topic content.

    :param topic: object exposing ``title`` and ``content`` text attributes.
    :return: a ``BayesLanguageModel`` of the title model and the content model,
             created with ``beta=10``.
    """
    title_text = topic.title
    content_text = topic.content

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)

    # query_count is read after each extraction.
    # NOTE(review): this assumes every extraction rebinds query_count to a
    # fresh mapping -- confirm in SingleQueryGeneration.
    extractor.extract_queries_from_text(title_text)
    title_counts = extractor.query_count

    extractor.extract_queries_from_text(content_text)
    content_counts = extractor.query_count

    title_model = LanguageModel(term_dict=title_counts)
    content_model = LanguageModel(term_dict=content_counts)

    return BayesLanguageModel(title_model, content_model, beta=10)
def _generate_topic_language_model(self, topic):
    """
    Returns a language model for the given topic, considering both the title
    and the content text.

    :param topic: object exposing ``title`` and ``content`` text attributes.
    :return: a ``BayesLanguageModel`` of the title model and the content model,
             created with ``beta=10``.
    """
    title_text = topic.title
    content_text = topic.content

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    # query_count is read after each extraction.
    # NOTE(review): this assumes every extraction rebinds query_count to a
    # fresh mapping -- confirm in SingleQueryGeneration.
    extractor.extract_queries_from_text(title_text)
    title_counts = extractor.query_count

    extractor.extract_queries_from_text(content_text)
    content_counts = extractor.query_count

    title_model = LanguageModel(term_dict=title_counts)
    content_model = LanguageModel(term_dict=content_counts)

    return BayesLanguageModel(title_model, content_model, beta=10)
def _generate_topic_language_model(self, topic):
    """
    Builds the topic language model from the title and content term counts.

    :param topic: object exposing ``title`` and ``content`` text attributes.
    :return: a ``BayesLanguageModel`` combining a title ``LanguageModel`` with a
             content ``LanguageModel``, created with ``beta=10``.
    """
    foreground_text = topic.title
    background_text = topic.content

    term_source = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    # Counts are captured straight after each extraction.
    # NOTE(review): relies on query_count being rebound per extraction --
    # confirm in SingleQueryGeneration.
    term_source.extract_queries_from_text(foreground_text)
    foreground_counts = term_source.query_count

    term_source.extract_queries_from_text(background_text)
    background_counts = term_source.query_count

    foreground_model = LanguageModel(term_dict=foreground_counts)
    background_model = LanguageModel(term_dict=background_counts)

    return BayesLanguageModel(foreground_model, background_model, beta=10)
def _generate_topic_language_model(self, topic):
    """
    Returns the language model representation of the given topic.

    Inheriting classes may override this method to produce a different model.

    :param topic: object exposing ``title`` and ``content`` text attributes.
    :return: a ``BayesLanguageModel`` of the title model and the content model,
             created with ``beta=10``.
    """
    title_text = topic.title
    content_text = topic.content

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    # query_count is read after each extraction.
    # NOTE(review): this assumes every extraction rebinds query_count to a
    # fresh mapping -- confirm in SingleQueryGeneration.
    extractor.extract_queries_from_text(title_text)
    title_counts = extractor.query_count

    extractor.extract_queries_from_text(content_text)
    content_counts = extractor.query_count

    title_model = LanguageModel(term_dict=title_counts)
    content_model = LanguageModel(term_dict=content_counts)

    return BayesLanguageModel(title_model, content_model, beta=10)
def make_topic_lm(self, topic):
    """
    Returns a Bayes language model combining the topic's title and content.

    :param topic: object exposing ``title`` and ``content`` text attributes.
    :return: a ``BayesLanguageModel`` combining a title ``LanguageModel`` with a
             content ``LanguageModel``, created with ``beta=10``.
    """
    foreground_text = topic.title
    background_text = topic.content

    term_source = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)

    # Counts are captured straight after each extraction.
    # NOTE(review): relies on query_count being rebound per extraction --
    # confirm in SingleQueryGeneration.
    term_source.extract_queries_from_text(foreground_text)
    foreground_counts = term_source.query_count

    term_source.extract_queries_from_text(background_text)
    background_counts = term_source.query_count

    foreground_model = LanguageModel(term_dict=foreground_counts)
    background_model = LanguageModel(term_dict=background_counts)

    return BayesLanguageModel(foreground_model, background_model, beta=10)
def generate_query_list(self, topic):
    """
    Produces the queries a simulated agent could issue for the given topic.

    Candidate queries are extracted from the topic content, scored by a
    ``QueryRanker`` built on the topic language model, and the ranker's top 100
    are returned.

    :param topic: object exposing a ``content`` text attribute; it is also
                  handed to ``_generate_topic_language_model``.
    :return: the result of the ranker's ``get_top_queries(100)``.
    """
    content_text = topic.content
    language_model = self._generate_topic_language_model(topic)

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    candidates = extractor.extract_queries_from_text(content_text)

    ranker = QueryRanker(smoothed_language_model=language_model)
    ranker.calculate_query_list_probabilities(candidates)

    return ranker.get_top_queries(100)
def _update_topic_language_model(self, text_list):
    """
    Recomputes ``self.topic_language_model`` after new text has been seen.

    Term counts are extracted from the text returned by
    ``self._make_topic_text(document_text=text_list)`` and from ``text_list``
    joined with spaces. The two sets of counts are added together and stored as
    an unsmoothed ``LanguageModel``.

    :param text_list: iterable of strings to fold into the topic model.
    :return: None; the new model is stored on ``self.topic_language_model``.
    """
    topic_text = self._make_topic_text(document_text=text_list)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    # NOTE(review): reusing the extractor assumes each extraction rebinds
    # query_count to a fresh mapping -- confirm in SingleQueryGeneration.
    term_extractor.extract_queries_from_text(snippet_text)
    merged_term_counts = term_extractor.query_count

    # Fold the topic's counts into the counts taken from the new text.
    for term, count in topic_term_counts.items():
        merged_term_counts[term] = merged_term_counts.get(term, 0) + count

    self.topic_language_model = LanguageModel(term_dict=merged_term_counts)

    log.debug("Updating topic {0}".format(self._topic.id))
def _generate_topic_language_model(self, topic):
    """
    Builds the language model representation of the given topic.

    Inheriting classes may override this method to produce a different model.

    :param topic: object exposing ``title`` and ``content`` text attributes.
    :return: a ``BayesLanguageModel`` combining a title ``LanguageModel`` with a
             content ``LanguageModel``, created with ``beta=10``.
    """
    foreground_text = topic.title
    background_text = topic.content

    term_source = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    # Counts are captured straight after each extraction.
    # NOTE(review): relies on query_count being rebound per extraction --
    # confirm in SingleQueryGeneration.
    term_source.extract_queries_from_text(foreground_text)
    foreground_counts = term_source.query_count

    term_source.extract_queries_from_text(background_text)
    background_counts = term_source.query_count

    foreground_model = LanguageModel(term_dict=foreground_counts)
    background_model = LanguageModel(term_dict=background_counts)

    return BayesLanguageModel(
        foreground_model, background_model, beta=10)
def generate_query_list(self, topic):
    """
    Returns the top-ranked candidate queries for the given topic.

    Queries are extracted from the topic content and scored by a
    ``QueryRanker`` that uses the topic language model; the ranker's top 100
    are returned.

    :param topic: object exposing a ``content`` text attribute; it is also
                  handed to ``_generate_topic_language_model``.
    :return: the result of the ranker's ``get_top_queries(100)``.
    """
    source_text = topic.content
    topic_model = self._generate_topic_language_model(topic)

    query_source = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    candidate_queries = query_source.extract_queries_from_text(source_text)

    scorer = QueryRanker(smoothed_language_model=topic_model)
    scorer.calculate_query_list_probabilities(candidate_queries)

    return scorer.get_top_queries(100)
def generate_query_list(self, search_context):
    """
    Produces the queries a simulated agent could issue for the topic carried by
    a search context.

    Candidate queries are extracted from the topic title and content (joined by
    a single space), scored by a ``QueryRanker`` built on the topic language
    model, and the ranker's top 100 are returned.

    :param search_context: object exposing a ``topic`` attribute; it is also
                           handed to ``_generate_topic_language_model``.
    :return: the result of the ranker's ``get_top_queries(100)``.
    """
    topic = search_context.topic
    combined_text = "{0} {1}".format(topic.title, topic.content)
    language_model = self._generate_topic_language_model(search_context)

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    candidates = extractor.extract_queries_from_text(combined_text)

    ranker = QueryRanker(smoothed_language_model=language_model)
    ranker.calculate_query_list_probabilities(candidates)

    return ranker.get_top_queries(100)