def __update_topic_language_model(self, text_list):
    """
    Rebuilds the smoothed topic language model from the topic text and the
    snippet texts supplied in text_list.

    :param text_list: list of snippet/document strings observed so far.
    """
    # The title is repeated three times so title terms carry extra weight
    # relative to the content terms.
    topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    # NOTE(review): the same extractor instance is reused; this assumes
    # query_count is rebuilt on each extract call -- confirm in
    # SingleQueryGeneration.
    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Fold the topic term counts into the snippet term counts.
    # (Removed unused local `n = len(text_list)` from the original.)
    for term, count in topic_term_counts.items():
        new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

    new_language_model = LanguageModel(term_dict=new_text_term_counts)
    self.topic_language_model = SmoothedLanguageModel(new_language_model, self.background_language_model, self.mu)
    log.debug("Updating topic {0}".format(self._topic.id))
def generate_query_list(self, topic):
    """
    Given a Topic object, produces a list of query terms that could be
    issued by the simulated agent.
    """
    self.__title_stem_length = 2
    self.__description_cutoff = 0

    language_model = self._generate_topic_language_model(topic)

    # Extract candidate terms from the topic title and rank them.
    title_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    ranked_title_terms = self._rank_terms(
        title_extractor.extract_queries_from_text(topic.title),
        topic_language_model=language_model)

    # Produce the title query "stem".
    title_stem = self.__get_title_stem(language_model, ranked_title_terms)

    # Same extraction and ranking, but over the topic description.
    description_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    ranked_description_terms = self._rank_terms(
        description_extractor.extract_queries_from_text(topic.content),
        topic_language_model=language_model)

    return self.__generate_permutations(language_model, title_stem, ranked_description_terms)
def get_ranked_queries(self, text=''): """ loads the background document model and generates the ranked queries :return: the queries in a list """ if not text: text = self.page_html backgroundfile = 'background.txt' filename = raw_input("enter the filename of the background file, background.txt is default") if filename: backgroundfile = filename print "background file is ", backgroundfile doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=self.stopwordfile) query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile) print "Loading background distribution" colLM = LanguageModel(file=backgroundfile) print "Background loaded, number of terms: ", colLM.get_num_terms() #doc_extractor.extract_queries_from_html(self.page_html) doc_extractor.extract_queries_from_html(text) doc_term_counts = doc_extractor.query_count print "Number of terms in document: %d" % (len(doc_term_counts)) docLM = LanguageModel(term_dict=doc_term_counts) slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500) #query_list = query_generator.extract_queries_from_html(self.page_html) query_list = query_generator.extract_queries_from_html(text) print "Queries generated: ", len(query_list) qr = OddsRatioQueryRanker(smoothed_language_model=slm) scored_queries = qr.calculate_query_list_probabilities(query_list) queries = qr.get_top_queries(self.mq) query_list = [] for query in queries: query_list.append(query[0]) return query_list
def make_topic_lm(self, topic):
    """Build an unsmoothed language model from the topic's content text."""
    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
    extractor.extract_queries_from_text(topic.content)
    return LanguageModel(term_dict=extractor.query_count)
def make_topic_lm(self): topic_text = self.topic.content + self.topic.title doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file) doc_extractor.extract_queries_from_text(topic_text) doc_term_counts = doc_extractor.query_count lm = LanguageModel(term_dict=doc_term_counts) self.topicLM = SmoothedLanguageModel(lm,self.backgroundLM,100) print "making topic", self.topicLM.docLM.total_occurrences
def extract_term_dict_from_text(text, stopword_file):
    """
    Parse the given text and count how many times each term occurs.

    :param text: a string
    :param stopword_file: path to the stopword list used by the extractor
    :return: a dict of {term, count}
    """
    extractor = SingleQueryGeneration(minlen=3, stopwordfile=stopword_file)
    extractor.extract_queries_from_text(text)
    return extractor.query_count
def make_topic_language_model(self): """ """ topic_text = self._topic.content + self._topic.title document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file) document_extractor.extract_queries_from_text(topic_text) document_term_counts = document_extractor.query_count language_model = LanguageModel(term_dict=document_term_counts) self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, 100) print "making topic", self.topic_language_model.docLM.total_occurrences
def _generate_naive_topic_language_model(self, topic):
    """
    Given a Topic object, returns a language model representation for the
    given topic. Override this method in inheriting classes to generate
    and return different language models.
    """
    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    extractor.extract_queries_from_text(topic.content)
    # The language model returned is simply a representation of the number
    # of times terms occur within the topic text.
    return LanguageModel(term_dict=extractor.query_count)
def make_topic_language_model(self):
    """
    Builds the smoothed topic language model; the title is repeated three
    times so that title terms carry extra weight relative to content terms.
    """
    weighted_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    extractor.extract_queries_from_text(weighted_text)
    term_counts = extractor.query_count

    topic_model = LanguageModel(term_dict=term_counts)
    self.topic_language_model = SmoothedLanguageModel(topic_model, self.background_language_model, self.mu)
    log.debug("Making topic {0}".format(self.topic_language_model.docLM.total_occurrences))
def generate_query_list(self, topic):
    """
    Given a Topic object, produces a list of query terms that could be
    issued by the simulated agent.
    """
    language_model = self._generate_topic_language_model(topic)

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    candidate_terms = extractor.extract_queries_from_text(topic.content)

    ranker = QueryRanker(smoothed_language_model=language_model)
    ranker.calculate_query_list_probabilities(candidate_terms)
    return ranker.get_top_queries(100)
def _generate_topic_language_model(self, topic):
    """
    Returns a language model for the given topic, considering both the
    title and content text.
    """
    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    # NOTE(review): the same extractor instance is reused; this assumes
    # query_count is rebuilt per extract call -- confirm in SingleQueryGeneration.
    extractor.extract_queries_from_text(topic.title)
    title_counts = extractor.query_count
    extractor.extract_queries_from_text(topic.content)
    content_counts = extractor.query_count

    title_model = LanguageModel(term_dict=title_counts)
    content_model = LanguageModel(term_dict=content_counts)
    return BayesLanguageModel(title_model, content_model, beta=10)
def generate_query_list(self, search_context):
    """
    Given a Topic object, produces a list of query terms that could be
    issued by the simulated agent.
    """
    topic = search_context.topic
    combined_text = "{0} {1}".format(topic.title, topic.content)

    language_model = self._generate_topic_language_model(search_context)

    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
    candidates = extractor.extract_queries_from_text(combined_text)

    ranker = QueryRanker(smoothed_language_model=language_model)
    ranker.calculate_query_list_probabilities(candidates)
    return ranker.get_top_queries(100)
def make_topic_lm(self, topic):
    """Build a Bayes-smoothed topic model: title terms smoothed by content terms."""
    extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)

    extractor.extract_queries_from_text(topic.title)
    title_counts = extractor.query_count
    extractor.extract_queries_from_text(topic.content)
    content_counts = extractor.query_count

    title_model = LanguageModel(term_dict=title_counts)
    content_model = LanguageModel(term_dict=content_counts)
    return BayesLanguageModel(title_model, content_model, beta=10)
def _generate_topic_language_model(self, topic):
    """
    Given a Topic object, returns a language model representation for the
    given topic. Override this method in inheriting classes to generate
    and return different language models.
    """
    generator = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    generator.extract_queries_from_text(topic.title)
    counts_from_title = generator.query_count
    generator.extract_queries_from_text(topic.content)
    counts_from_content = generator.query_count

    # Title model is smoothed against the content model via Bayes smoothing.
    lm_title = LanguageModel(term_dict=counts_from_title)
    lm_content = LanguageModel(term_dict=counts_from_content)
    return BayesLanguageModel(lm_title, lm_content, beta=10)
def _update_topic_language_model(self, text_list):
    """
    Rebuilds the (unsmoothed) topic language model from the topic text and
    the snippet texts supplied in text_list.

    :param text_list: list of snippet/document strings observed so far.
    """
    topic_text = self._make_topic_text(document_text=text_list)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Fold the topic term counts into the snippet term counts.
    # (Removed unused local `n = len(text_list)` from the original.)
    for term, count in topic_term_counts.items():
        new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

    self.topic_language_model = LanguageModel(term_dict=new_text_term_counts)
    log.debug("Updating topic {0}".format(self._topic.id))
def __update_topic_language_model(self, text_list):
    """
    Rebuilds the smoothed topic language model from the topic text and the
    snippet texts supplied in text_list.

    :param text_list: list of snippet/document strings observed so far.
    """
    # The title is repeated three times so title terms carry extra weight
    # relative to the content terms.
    topic_text = '{title} {title} {title} {content}'.format(
        **self._topic.__dict__)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Fold the topic term counts into the snippet term counts.
    # (Removed unused local `n = len(text_list)` from the original.)
    for term, count in topic_term_counts.items():
        new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

    new_language_model = LanguageModel(term_dict=new_text_term_counts)
    self.topic_language_model = SmoothedLanguageModel(
        new_language_model, self.background_language_model, self.mu)
    log.debug("Updating topic {0}".format(self._topic.id))
def generate_query_list(self, topic):
    """
    Given a Topic object, produces a list of query terms that could be
    issued by the simulated agent.
    """
    self.__description_cutoff = 5

    language_model = self._generate_topic_language_model(topic)

    # Extract candidate terms from the topic title and rank them.
    title_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    ranked_title_terms = self._rank_terms(
        title_extractor.extract_queries_from_text(topic.title),
        topic_language_model=language_model)

    # Same extraction and ranking, but over the topic description.
    description_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    ranked_description_terms = self._rank_terms(
        description_extractor.extract_queries_from_text(topic.content),
        topic_language_model=language_model)

    return self.__generate_permutations(
        language_model, ranked_title_terms, ranked_description_terms)
def _update_topic_language_model(self, text_list):
    """
    Rebuilds the (unsmoothed) topic language model from the topic text and
    the snippet texts supplied in text_list.

    :param text_list: list of snippet/document strings observed so far.
    """
    topic_text = self._make_topic_text(document_text=text_list)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Fold the topic term counts into the snippet term counts.
    # (Removed unused local `n = len(text_list)` from the original.)
    for term, count in topic_term_counts.items():
        new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

    self.topic_language_model = LanguageModel(term_dict=new_text_term_counts)
    log.debug("Updating topic {0}".format(self._topic.id))
def _generate_topic_language_model(self, topic):
    """
    Returns a language model for the given topic, built from the title
    terms Bayes-smoothed against the content terms.
    """
    term_source = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

    term_source.extract_queries_from_text(topic.title)
    title_term_counts = term_source.query_count
    term_source.extract_queries_from_text(topic.content)
    content_term_counts = term_source.query_count

    model_for_title = LanguageModel(term_dict=title_term_counts)
    model_for_content = LanguageModel(term_dict=content_term_counts)
    return BayesLanguageModel(model_for_title, model_for_content, beta=10)
def make_topic_lm(self, topic):
    """Build a Bayes-smoothed topic model: title terms smoothed by content terms."""
    generator = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)

    generator.extract_queries_from_text(topic.title)
    counts_title = generator.query_count
    generator.extract_queries_from_text(topic.content)
    counts_content = generator.query_count

    lm_title = LanguageModel(term_dict=counts_title)
    lm_content = LanguageModel(term_dict=counts_content)
    return BayesLanguageModel(lm_title, lm_content, beta=10)
def _generate_topic_language_model(self, topic):
    """
    Given a Topic object, returns a language model representation for the
    given topic. Override this method in inheriting classes to generate
    and return different language models.
    """
    query_generator = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    query_generator.extract_queries_from_text(topic.title)
    title_counts = query_generator.query_count
    query_generator.extract_queries_from_text(topic.content)
    background_counts = query_generator.query_count

    # Title model is smoothed against the content ("background") model.
    title_lm = LanguageModel(term_dict=title_counts)
    background_lm = LanguageModel(term_dict=background_counts)
    return BayesLanguageModel(title_lm, background_lm, beta=10)
def main(): """ :return: """ parser = argparse.ArgumentParser( description="Page Calculator for pages") parser.add_argument("-u", "--url", type=str, help="url address") parser.add_argument("-e","--engine",type=str, help="Name of search engine: " + ENGINE_LIST.__str__()) parser.add_argument("-k","--key",type=str, help="API Key for search engine (if applicable)") parser.add_argument("-c","--cutoff", type=int, help ="The cutoff value for queries") parser.add_argument("-m","--maxqueries", type=int, help ="The maximum number of queries per page") parser.add_argument("-s","--stopwordfile", type=str, help ="The filename name containing stopwords") parser.add_argument("-b","--backgroundfile", type=str, help ="The filename name containing background term counts") parser.add_argument("-ca", "--cache", action="store_true", default=False, help="use cache") args = parser.parse_args() if not args.url: print "Check your URL argument" parser.print_help() return 2 cache = None if args.cache: cache = 'engine' if args.key: engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache) else: print "cache is ", cache engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1) stopwordfile = None if args.stopwordfile: stopwordfile = args.stopwordfile mq = 50 if args.maxqueries: mq = args.maxqueries backgroundfile = 'background.txt' if args.backgroundfile: backgroundfile = args.backgroundfile doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=stopwordfile) query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile) print "Loading background distribution" colLM = LanguageModel(file=backgroundfile) print "Background loaded, number of terms: ", colLM.get_num_terms() print "Fetching page: %s" % (args.url) pc = PageCapture(args.url) page_html = pc.get_page_sourcecode() print "Page loaded" doc_extractor.extract_queries_from_html(page_html) doc_term_counts = doc_extractor.query_count print "Number of terms in document: %d" % 
(len(doc_term_counts)) docLM = LanguageModel(term_dict=doc_term_counts) slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500) query_list = query_generator.extract_queries_from_html(page_html) print "Queries generated: ", len(query_list) qr = OddsRatioQueryRanker(smoothed_language_model=slm) scored_queries = qr.calculate_query_list_probabilities(query_list) queries = qr.get_top_queries(mq) query_list = [] for query in queries: query_list.append(query[0]) prc = PageRetrievabilityCalculator(engine=engine) prc.score_page(args.url, query_list) print "\nRetrievability Scores for cumulative c=20" prc.calculate_page_retrievability(c=20) prc.report() print "\nRetrievability Scores for gravity beta=1.0" prc.calculate_page_retrievability(c=20, beta=1.0) prc.report() print "Done!" return 0