def get_ranked_queries(self, text=''):
    """
    Loads the background document model and generates the ranked
    queries.

    :param text: HTML to generate queries from; falls back to
        self.page_html when empty.
    :return: the queries in a list
    """
    if not text:
        text = self.page_html
    backgroundfile = 'background.txt'
    # Interactively prompts for the background file; an empty answer
    # keeps the default 'background.txt'.
    filename = raw_input("enter the filename of the background file, background.txt is default")
    if filename:
        backgroundfile = filename
    print "background file is ", backgroundfile
    # Single terms feed the document model; bi-term phrases form the
    # candidate queries to be ranked.
    doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=self.stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()
    doc_extractor.extract_queries_from_html(text)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    # Smooth the document model against the background collection
    # model (beta=500).
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(text)
    print "Queries generated: ", len(query_list)
    # Rank the candidate queries by odds ratio and keep the top
    # self.mq query strings (dropping their scores).
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    scored_queries = qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(self.mq)
    query_list = []
    for query in queries:
        query_list.append(query[0])
    return query_list
def make_topic_language_model(self):
    """
    Builds the topic language model by weighting and merging term
    counts from the topic text and the topic's background terms.
    """
    # Term counts from the TREC topic title/description text.
    title_desc_counts = extract_term_dict_from_text(
        self._make_topic_text(), self._stopword_file)

    # Merge both count sources into one weighted dictionary.
    merged_counts = self._combine_dictionaries(
        {}, title_desc_counts, self.topic_weighting)
    merged_counts = self._combine_dictionaries(
        merged_counts, self._topic.background_terms,
        self.topic_background_weighting)

    # Build the LM from the combined count dictionary.
    self.topic_language_model = LanguageModel(term_dict=merged_counts)
    log.debug("Making topic {0}".format(self._topic.id))
def __update_topic_language_model(self, text_list):
    """
    Rebuilds the smoothed topic language model from the topic text
    (title weighted three times, plus content) combined with the
    snippet/document text in text_list.

    :param text_list: list of text fragments (snippets/documents).
    """
    topic_text = '{title} {title} {title} {content}'.format(
        **self._topic.__dict__)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    # NOTE(review): the same extractor instance is reused; this assumes
    # query_count is reset by each extract call -- verify, otherwise
    # topic_term_counts and new_text_term_counts alias one dictionary.
    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Fold the topic counts into the snippet counts.
    # (The original also computed an unused n = len(text_list).)
    for term, count in topic_term_counts.items():
        new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

    new_language_model = LanguageModel(term_dict=new_text_term_counts)
    self.topic_language_model = SmoothedLanguageModel(
        new_language_model, self.background_language_model, self.mu)
    log.debug("Updating topic {0}".format(self._topic.id))
def _update_topic_language_model(self, text_list):
    """
    Updates the language model for the topic, given snippet/document
    text (text_list) and prior (knowledge) text.
    """
    # Gather the three term-count sources: topic text, the topic's
    # background terms, and the supplied snippet/document text.
    topic_counts = extract_term_dict_from_text(
        self._make_topic_text(), self._stopword_file)
    background_counts = self._topic.background_terms
    document_counts = extract_term_dict_from_text(
        ' '.join(text_list), self._stopword_file)

    # Merge them, in order, with their respective weightings.
    merged = {}
    for counts, weighting in (
            (topic_counts, self.topic_weighting),
            (background_counts, self.topic_background_weighting),
            (document_counts, self.document_weighting)):
        merged = self._combine_dictionaries(merged, counts, weighting)

    # Build the updated language model.
    self.topic_language_model = LanguageModel(term_dict=merged)
    log.debug("Updating topic {0}".format(self._topic.id))
def update_model(self, search_context):
    """
    Rebuilds the topic language model from the topic text plus any
    newly examined snippet text, smoothing against the background
    model when one is available.

    :param search_context: the simulation's search context object.
    :return: True if the model was updated, False when updating is
        disabled or no usable snippet text was found.
    """
    if not self.updating:
        return False

    snippet_text = self._check_terms(self._get_snip_text(search_context))
    if not snippet_text:
        return False

    topic_text = search_context.topic.get_topic_text()
    all_text = '{0} {1}'.format(topic_text, snippet_text)

    term_counts = lm_methods.extract_term_dict_from_text(
        all_text, self._stopword_file)
    language_model = LanguageModel(term_dict=term_counts)
    self.topic_lang_model = language_model

    # Smooth against the background model when one has been loaded.
    if self.background_language_model:
        self.topic_lang_model = SmoothedLanguageModel(
            language_model, self.background_language_model)

    return True
def _update_topic_language_model(self, text_list):
    """
    Rebuilds the (unsmoothed) topic language model from the topic text
    combined with the snippet/document text in text_list.

    :param text_list: list of text fragments (snippets/documents).
    """
    topic_text = self._make_topic_text(document_text=text_list)
    snippet_text = ' '.join(text_list)

    term_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    term_extractor.extract_queries_from_text(topic_text)
    topic_term_counts = term_extractor.query_count

    # NOTE(review): extractor instance reused -- assumes query_count is
    # reset per extract call; verify against SingleQueryGeneration.
    term_extractor.extract_queries_from_text(snippet_text)
    new_text_term_counts = term_extractor.query_count

    # Fold the topic counts into the snippet counts.
    # (The original also computed an unused n = len(text_list).)
    for term, count in topic_term_counts.items():
        new_text_term_counts[term] = new_text_term_counts.get(term, 0) + count

    self.topic_language_model = LanguageModel(
        term_dict=new_text_term_counts)
    log.debug("Updating topic {0}".format(self._topic.id))
def read_in_background(self, vocab_file):
    """
    Reads a comma-separated "term,count" file and builds the
    background language model from it.

    :param vocab_file: path to the vocabulary file.
    """
    vocab = {}
    # 'with' guarantees the file is closed; the original never closed
    # the handle at all.
    with open(vocab_file, 'r') as f:
        for line in f:
            tc = line.split(',')
            vocab[tc[0]] = int(tc[1])
    self.backgroundLM = LanguageModel(term_dict=vocab)
def make_topic_lm(self, topic):
    """
    Builds and returns a language model over the term counts extracted
    from the topic's content text.
    """
    extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self.stopword_file)
    extractor.extract_queries_from_text(topic.content)
    return LanguageModel(term_dict=extractor.query_count)
def make_topic_lm(self):
    """
    Builds the smoothed topic language model from the topic's content
    and title, smoothed against the background model.
    """
    # NOTE(review): plain concatenation -- no separating space between
    # content and title; the last/first words fuse into one token.
    topic_text = self.topic.content + self.topic.title
    doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
    doc_extractor.extract_queries_from_text(topic_text)
    doc_term_counts = doc_extractor.query_count
    lm = LanguageModel(term_dict=doc_term_counts)
    # 100 is the smoothing parameter (presumably mu) -- TODO confirm
    # against SmoothedLanguageModel's signature.
    self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
    print "making topic", self.topicLM.docLM.total_occurrences
def _generate_topic_language_model(self, topic):
    """
    Returns a Bayes-smoothed language model: a model of the topic
    title smoothed against a model of the topic content (beta=10).
    """
    extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    extractor.extract_queries_from_text(topic.title)
    title_counts = extractor.query_count
    extractor.extract_queries_from_text(topic.content)
    content_counts = extractor.query_count

    title_lm = LanguageModel(term_dict=title_counts)
    content_lm = LanguageModel(term_dict=content_counts)
    return BayesLanguageModel(title_lm, content_lm, beta=10)
def make_topic_lm(self, topic):
    """
    Returns a Bayes-smoothed topic language model: the title model is
    smoothed against a model of the topic content, with beta=10.
    """
    extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self.stopword_file)

    extractor.extract_queries_from_text(topic.title)
    title_counts = extractor.query_count
    extractor.extract_queries_from_text(topic.content)
    content_counts = extractor.query_count

    return BayesLanguageModel(
        LanguageModel(term_dict=title_counts),
        LanguageModel(term_dict=content_counts),
        beta=10)
def compute_info_gain(word_list, language_model):
    """
    Computes the information gain of word_list relative to the given
    (collection) language model: the sum over distinct words of
    p(w) * (log p(w|C) - log p(w)), where words with zero collection
    probability contribute nothing.
    """
    # Frequency table over the supplied words.
    counts = {}
    for word in word_list:
        counts[word] = counts.get(word, 0) + 1

    word_lm = LanguageModel(term_dict=counts)

    gain = 0.0
    for word in counts:
        p_word = word_lm.get_term_prob(word)
        p_collection = language_model.get_term_prob(word)
        if p_collection > 0.0:
            gain += p_word * (math.log(p_collection) - math.log(p_word))
    return gain
def read_in_background(self, vocab_file):
    """
    Helper method to read in a file containing terms and construct a
    background language model.

    :param vocab_file: path to a comma-separated "term,count" file.
    """
    vocab = {}
    # 'with' closes the file even if a line fails to parse; the
    # original's explicit close() was skipped on exceptions.
    with open(vocab_file, 'r') as f:
        for line in f:
            tc = line.split(',')
            vocab[tc[0]] = int(tc[1])
    self.background_language_model = LanguageModel(term_dict=vocab)
def make_topic_language_model(self):
    """
    Builds an (unsmoothed) topic language model from the topic text
    and stores it on the instance.
    """
    term_counts = extract_term_dict_from_text(
        self._make_topic_text(), self._stopword_file)
    self.topic_language_model = LanguageModel(term_dict=term_counts)
    log.debug("Making topic {0}".format(self._topic.id))
def _generate_naive_topic_language_model(self, topic):
    """
    Given a Topic object, returns a language model representation for
    the given topic. Override this method in inheriting classes to
    generate and return different language models.

    The returned model is simply a representation of the number of
    times terms occur within the topic content.
    """
    extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    extractor.extract_queries_from_text(topic.content)
    return LanguageModel(term_dict=extractor.query_count)
def _generate_topic_language_model(self, search_context):
    """
    Given a Topic object, returns a language model representation for
    the given topic. Override this method in inheriting classes to
    generate and return different language models.
    """
    topic = search_context.topic
    combined_text = "{0} {1}".format(topic.title, topic.content)

    # The model simply reflects how often each term occurs in the
    # topic's title + content text.
    counts = lm_methods.extract_term_dict_from_text(
        combined_text, self._stopword_file)
    return LanguageModel(term_dict=counts)
def read_in_background(vocab_file):
    """
    Helper method to read in a file containing terms and construct a
    background language model.

    :param vocab_file: path to a comma-separated "term,count" file.
    :return: a LanguageModel instance trained on the vocabulary file.
    """
    vocab = {}
    # 'with' closes the file even if a line fails to parse; the
    # original's explicit close() was skipped on exceptions.
    with open(vocab_file, 'r') as f:
        for line in f:
            tc = line.split(',')
            vocab[tc[0]] = int(tc[1])
    return LanguageModel(term_dict=vocab)
def _generate_topic_language_model(self, topic):
    """
    Given a Topic object, returns a language model representation for
    the given topic: a Bayes-smoothed combination of a title model and
    a content (background) model, beta=10. Override in inheriting
    classes to generate different language models.
    """
    extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)

    extractor.extract_queries_from_text(topic.title)
    title_counts = extractor.query_count
    extractor.extract_queries_from_text(topic.content)
    content_counts = extractor.query_count

    return BayesLanguageModel(
        LanguageModel(term_dict=title_counts),
        LanguageModel(term_dict=content_counts),
        beta=10)
def make_topic_language_model(self):
    """
    Builds the smoothed topic language model from the topic's content
    and title, smoothed against the background language model.
    """
    # NOTE(review): plain concatenation -- no separating space between
    # content and title; the last/first words fuse into one token.
    topic_text = self._topic.content + self._topic.title
    document_extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    document_extractor.extract_queries_from_text(topic_text)
    document_term_counts = document_extractor.query_count
    language_model = LanguageModel(term_dict=document_term_counts)
    # 100 is the smoothing parameter (presumably mu) -- TODO confirm
    # against SmoothedLanguageModel's signature.
    self.topic_language_model = SmoothedLanguageModel(
        language_model, self.background_language_model, 100)
    print "making topic", self.topic_language_model.docLM.total_occurrences
def _generate_topic_language_model(self, search_context):
    """
    Creates an empirical language model based on the search topic, or
    a smoothed language model if a background model has been loaded.
    """
    term_counts = lm_methods.extract_term_dict_from_text(
        self._make_topic_text(search_context), self._stopword_file)
    empirical_model = LanguageModel(term_dict=term_counts)

    # Without a background model, return the raw empirical model.
    if not self.background_language_model:
        return empirical_model
    return SmoothedLanguageModel(
        empirical_model, self.background_language_model)
def make_topic_language_model(self):
    """
    Builds the topic language model from the topic title (weighted
    three times) plus content, smoothed against the background model
    with parameter self.mu.
    """
    weighted_text = '{title} {title} {title} {content}'.format(
        **self._topic.__dict__)

    extractor = SingleQueryGeneration(
        minlen=3, stopwordfile=self._stopword_file)
    extractor.extract_queries_from_text(weighted_text)

    empirical_model = LanguageModel(term_dict=extractor.query_count)
    self.topic_language_model = SmoothedLanguageModel(
        empirical_model, self.background_language_model, self.mu)
    log.debug("Making topic {0}".format(
        self.topic_language_model.docLM.total_occurrences))
def main():
    """
    Command-line entry point: fetches a web page, generates ranked
    queries for it, and reports its retrievability scores.

    :return: exit status (0 on success, 2 on a missing URL argument).
    """
    parser = argparse.ArgumentParser(
        description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str, help="url address")
    parser.add_argument("-e","--engine",type=str, help="Name of search engine: " + ENGINE_LIST.__str__())
    parser.add_argument("-k","--key",type=str, help="API Key for search engine (if applicable)")
    parser.add_argument("-c","--cutoff", type=int, help ="The cutoff value for queries")
    parser.add_argument("-m","--maxqueries", type=int, help ="The maximum number of queries per page")
    parser.add_argument("-s","--stopwordfile", type=str, help ="The filename name containing stopwords")
    parser.add_argument("-b","--backgroundfile", type=str, help ="The filename name containing background term counts")
    parser.add_argument("-ca", "--cache", action="store_true", default=False, help="use cache")
    args = parser.parse_args()
    # A URL is mandatory; bail out with usage information otherwise.
    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2
    cache = None
    if args.cache:
        cache = 'engine'
    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print "cache is ", cache
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)
    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile
    # Default to at most 50 queries per page unless overridden.
    mq = 50
    if args.maxqueries:
        mq = args.maxqueries
    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile
    # Single terms feed the document model; bi-term phrases form the
    # candidate queries.
    doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()
    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print "Page loaded"
    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    # Smooth the document model against the background model.
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(page_html)
    print "Queries generated: ", len(query_list)
    # Rank candidate queries by odds ratio and keep the top mq query
    # strings (dropping their scores).
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    scored_queries = qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = []
    for query in queries:
        query_list.append(query[0])
    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)
    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()
    print "\nRetrievability Scores for gravity beta=1.0"
    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()
    print "Done!"
    return 0