def generate_query_list(self, topic):
    """
    Build the list of candidate queries for a topic from its title and description.

    Terms are extracted separately from ``topic.title`` and ``topic.content``. Each
    list is ranked against the topic language model, a title "stem" is derived from
    the ranked title terms, and the stem is combined with the ranked description terms.

    :param topic: object exposing ``title`` and ``content`` attributes.
    :return: the result of ``__generate_permutations`` for the stem and description terms.
    """
    # Both settings are (re)assigned on every call.
    self.__title_stem_length = 2
    self.__description_cutoff = 0

    title_text, description_text = topic.title, topic.content
    language_model = self._generate_topic_language_model(topic)

    def ranked_terms(text):
        # A fresh extractor is used for each text, as the title and description
        # are processed independently.
        generator = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        extracted = generator.extract_queries_from_text(text)
        return self._rank_terms(extracted, topic_language_model=language_model)

    # The title is processed first and reduced to the query "stem".
    stem = self.__get_title_stem(language_model, ranked_terms(title_text))

    # The description goes through the same extract-and-rank steps.
    return self.__generate_permutations(language_model, stem, ranked_terms(description_text))
Beispiel #2
0
    def __update_topic_language_model(self, text_list):
        """
        Rebuild ``self.topic_language_model`` from the topic text plus newly seen text.

        Term counts extracted from the topic text are added to the term counts
        extracted from the joined ``text_list``; the merged counts become a
        LanguageModel that is wrapped in a SmoothedLanguageModel together with
        ``self.background_language_model`` and ``self.mu``.

        :param text_list: iterable of strings, joined with single spaces before extraction.
        """
        # The title appears three times in the text handed to the extractor
        # (presumably to weight title terms more heavily -- confirm).
        topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        # NOTE(review): this relies on extract_queries_from_text binding a fresh
        # query_count on each call; if the same dict is reused, both names alias
        # one object -- confirm against SingleQueryGeneration.
        term_extractor.extract_queries_from_text(snippet_text)
        merged_term_counts = term_extractor.query_count

        # Fold the topic counts into the snippet counts.
        for term in topic_term_counts:
            merged_term_counts[term] = merged_term_counts.get(term, 0) + topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=merged_term_counts)

        self.topic_language_model = SmoothedLanguageModel(new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))
Beispiel #3
0
    def _update_topic_language_model(self, text_list):
        """
        Rebuild ``self.topic_language_model`` from the topic text plus newly seen text.

        Term counts extracted from the text returned by ``_make_topic_text`` are
        added to the counts extracted from the joined ``text_list``; the merged
        counts are stored as a plain LanguageModel.

        :param text_list: iterable of strings; passed to ``_make_topic_text`` as
            ``document_text`` and also joined with single spaces for extraction.
        """
        topic_text = self._make_topic_text(document_text=text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        # NOTE(review): this relies on extract_queries_from_text binding a fresh
        # query_count on each call; if the same dict is reused, both names alias
        # one object -- confirm against SingleQueryGeneration.
        term_extractor.extract_queries_from_text(snippet_text)
        merged_term_counts = term_extractor.query_count

        # Fold the topic counts into the snippet counts.
        for term in topic_term_counts:
            merged_term_counts[term] = (
                merged_term_counts.get(term, 0) + topic_term_counts[term])

        self.topic_language_model = LanguageModel(term_dict=merged_term_counts)

        log.debug("Updating topic {0}".format(self._topic.id))
Beispiel #4
0
    def __update_topic_language_model(self, text_list):
        """
        Rebuild ``self.topic_language_model`` from the topic text plus newly seen text.

        Term counts extracted from the topic text are added to the term counts
        extracted from the joined ``text_list``; the merged counts become a
        LanguageModel that is wrapped in a SmoothedLanguageModel together with
        ``self.background_language_model`` and ``self.mu``.

        :param text_list: iterable of strings, joined with single spaces before extraction.
        """
        # The title appears three times in the text handed to the extractor
        # (presumably to weight title terms more heavily -- confirm).
        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        # NOTE(review): this relies on extract_queries_from_text binding a fresh
        # query_count on each call; if the same dict is reused, both names alias
        # one object -- confirm against SingleQueryGeneration.
        term_extractor.extract_queries_from_text(snippet_text)
        merged_term_counts = term_extractor.query_count

        # Fold the topic counts into the snippet counts.
        for term in topic_term_counts:
            merged_term_counts[term] = (
                merged_term_counts.get(term, 0) + topic_term_counts[term])

        new_language_model = LanguageModel(term_dict=merged_term_counts)

        self.topic_language_model = SmoothedLanguageModel(
            new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))
Beispiel #5
0
    def generate_query_list(self, search_context):
        """
        Build the list of candidate queries for the topic held by a search context.

        Terms are extracted separately from the topic's title and content, each
        list is ranked against the topic language model, a title "stem" is derived
        from the ranked title terms, and the stem is combined with the ranked
        description terms.

        :param search_context: object exposing a ``topic`` attribute; it is also
            what ``_generate_topic_language_model`` receives.
        :return: the result of ``__generate_permutations``.
        """
        current_topic = search_context.topic
        title_text = current_topic.title
        description_text = current_topic.content
        language_model = self._generate_topic_language_model(search_context)

        # Title terms: extract, then rank.
        title_extractor = SingleQueryGeneration(
            stopwordfile=self._stopword_file, minlen=3)
        ranked_title_terms = self._rank_terms(
            title_extractor.extract_queries_from_text(title_text),
            topic_language_model=language_model)

        # The "stem" is derived from the ranked title terms.
        stem = self.__get_title_stem(language_model, ranked_title_terms)

        # Description terms go through the same extract-and-rank steps.
        description_extractor = SingleQueryGeneration(
            stopwordfile=self._stopword_file, minlen=3)
        ranked_description_terms = self._rank_terms(
            description_extractor.extract_queries_from_text(description_text),
            topic_language_model=language_model)

        return self.__generate_permutations(
            language_model, stem, ranked_description_terms)
Beispiel #6
0
    def make_topic_lm(self, topic):
        """
        Build a language model from the term counts of ``topic.content``.

        :param topic: object exposing a ``content`` attribute.
        :return: a LanguageModel constructed from the extractor's ``query_count``.
        """
        content = topic.content
        extractor = SingleQueryGeneration(stopwordfile=self.stopword_file, minlen=3)
        extractor.extract_queries_from_text(content)
        # The extractor exposes the counts through its query_count attribute.
        return LanguageModel(term_dict=extractor.query_count)
Beispiel #7
0
    def make_topic_lm(self, topic):
        """
        Build a language model from the term counts of ``topic.content``.

        :param topic: object exposing a ``content`` attribute.
        :return: a LanguageModel constructed from the extractor's ``query_count``.
        """
        content = topic.content
        term_counter = SingleQueryGeneration(
            stopwordfile=self.stopword_file, minlen=3)
        term_counter.extract_queries_from_text(content)
        # The counts are read back from the extractor after extraction.
        counts = term_counter.query_count
        return LanguageModel(term_dict=counts)
Beispiel #8
0
    def make_topic_lm(self):
        """
        Build the smoothed topic language model and store it on ``self.topicLM``.

        Terms are extracted from the topic's content and title, and the counts
        become a LanguageModel that is wrapped in a SmoothedLanguageModel
        together with ``self.backgroundLM`` and the fixed third argument 100
        (presumably the smoothing parameter -- confirm).
        """
        # Join with a space: plain concatenation fused the last word of the
        # content with the first word of the title into one bogus term.
        topic_text = ' '.join((self.topic.content, self.topic.title))

        doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        lm = LanguageModel(term_dict=doc_term_counts)
        self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
        # Single-argument call form prints the same line on Python 2 and 3.
        print("making topic {0}".format(self.topicLM.docLM.total_occurrences))
Beispiel #9
0
    def make_topic_lm(self):
        """
        Build the smoothed topic language model and store it on ``self.topicLM``.

        Terms are extracted from the topic's content and title, and the counts
        become a LanguageModel that is wrapped in a SmoothedLanguageModel
        together with ``self.backgroundLM`` and the fixed third argument 100
        (presumably the smoothing parameter -- confirm).
        """
        # Join with a space: plain concatenation fused the last word of the
        # content with the first word of the title into one bogus term.
        topic_text = ' '.join((self.topic.content, self.topic.title))

        doc_extractor = SingleQueryGeneration(minlen=3,
                                              stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        lm = LanguageModel(term_dict=doc_term_counts)
        self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
        # Single-argument call form prints the same line on Python 2 and 3.
        print("making topic {0}".format(
            self.topicLM.docLM.total_occurrences))
Beispiel #10
0
def extract_term_dict_from_text(text, stopword_file):
    """
    Parse ``text`` with a single-term extractor and return the per-term counts.

    :param text: a string
    :param stopword_file: forwarded to the extractor as its ``stopwordfile`` argument
    :return: the extractor's ``query_count`` (a dict of {term: count})
    """
    extractor = SingleQueryGeneration(stopwordfile=stopword_file, minlen=3)
    extractor.extract_queries_from_text(text)
    # The counts are exposed on the extractor once extraction has run.
    return extractor.query_count
Beispiel #11
0
 def make_topic_language_model(self):
     """
     Build the smoothed topic language model and store it on ``self.topic_language_model``.

     Terms are extracted from the topic's content and title, and the counts
     become a LanguageModel that is wrapped in a SmoothedLanguageModel together
     with ``self.background_language_model`` and the fixed third argument 100
     (presumably the smoothing parameter -- confirm).
     """
     # Join with a space: plain concatenation fused the last word of the
     # content with the first word of the title into one bogus term.
     topic_text = ' '.join((self._topic.content, self._topic.title))

     document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     document_extractor.extract_queries_from_text(topic_text)
     document_term_counts = document_extractor.query_count

     language_model = LanguageModel(term_dict=document_term_counts)
     self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, 100)
     # Single-argument call form prints the same line on Python 2 and 3.
     print("making topic {0}".format(self.topic_language_model.docLM.total_occurrences))
Beispiel #12
0
    def make_topic_language_model(self):
        """
        Build the smoothed topic language model and store it on ``self.topic_language_model``.

        The text handed to the extractor contains the topic title three times
        followed by the content. The resulting counts become a LanguageModel
        that is wrapped in a SmoothedLanguageModel together with
        ``self.background_language_model`` and ``self.mu``.
        """
        weighted_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

        extractor = SingleQueryGeneration(stopwordfile=self._stopword_file, minlen=3)
        extractor.extract_queries_from_text(weighted_text)

        unsmoothed_model = LanguageModel(term_dict=extractor.query_count)
        self.topic_language_model = SmoothedLanguageModel(unsmoothed_model, self.background_language_model, self.mu)

        # Log the total occurrence count of the document model.
        occurrences = self.topic_language_model.docLM.total_occurrences
        log.debug("Making topic {0}".format(occurrences))
Beispiel #13
0
 def _generate_naive_topic_language_model(self, topic):
     """
     Return a language model built only from the term counts of ``topic.content``.
     Inheriting classes can override this method to return a different language model.

     :param topic: object exposing a ``content`` attribute.
     :return: a LanguageModel constructed from the extracted term counts.
     """
     content = topic.content

     extractor = SingleQueryGeneration(stopwordfile=self._stopword_file, minlen=3)
     extractor.extract_queries_from_text(content)

     # The returned model simply represents how often each term occurs in the topic text.
     return LanguageModel(term_dict=extractor.query_count)
Beispiel #14
0
    def make_topic_language_model(self):
        """
        Build the smoothed topic language model and store it on ``self.topic_language_model``.

        Terms are extracted from the topic's content and title, and the counts
        become a LanguageModel that is wrapped in a SmoothedLanguageModel
        together with ``self.background_language_model`` and the fixed third
        argument 100 (presumably the smoothing parameter -- confirm).
        """
        # Join with a space: plain concatenation fused the last word of the
        # content with the first word of the title into one bogus term.
        topic_text = ' '.join((self._topic.content, self._topic.title))

        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        language_model = LanguageModel(term_dict=document_term_counts)
        self.topic_language_model = SmoothedLanguageModel(
            language_model, self.background_language_model, 100)
        # Single-argument call form prints the same line on Python 2 and 3.
        print("making topic {0}".format(
            self.topic_language_model.docLM.total_occurrences))
Beispiel #15
0
    def make_topic_language_model(self):
        """
        Build the smoothed topic language model and store it on ``self.topic_language_model``.

        The text handed to the extractor contains the topic title three times
        followed by the content. The resulting counts become a LanguageModel
        that is wrapped in a SmoothedLanguageModel together with
        ``self.background_language_model`` and ``self.mu``.
        """
        topic_fields = self._topic.__dict__
        weighted_text = '{title} {title} {title} {content}'.format(**topic_fields)

        extractor = SingleQueryGeneration(
            stopwordfile=self._stopword_file, minlen=3)
        extractor.extract_queries_from_text(weighted_text)
        term_counts = extractor.query_count

        self.topic_language_model = SmoothedLanguageModel(
            LanguageModel(term_dict=term_counts),
            self.background_language_model,
            self.mu)

        # Log the total occurrence count of the document model.
        occurrences = self.topic_language_model.docLM.total_occurrences
        log.debug("Making topic {0}".format(occurrences))
Beispiel #16
0
    def make_topic_lm(self, topic):
        """
        Build a BayesLanguageModel from the topic's title and content.

        :param topic: object exposing ``title`` and ``content`` attributes.
        :return: ``BayesLanguageModel(title model, content model, beta=10)``.
        """
        title_text, content_text = topic.title, topic.content

        # One extractor is reused for both texts; the counts are read after each run.
        extractor = SingleQueryGeneration(stopwordfile=self.stopword_file, minlen=3)

        extractor.extract_queries_from_text(title_text)
        title_counts = extractor.query_count

        extractor.extract_queries_from_text(content_text)
        content_counts = extractor.query_count

        title_model = LanguageModel(term_dict=title_counts)
        content_model = LanguageModel(term_dict=content_counts)
        return BayesLanguageModel(title_model, content_model, beta=10)
Beispiel #17
0
 def _generate_topic_language_model(self, topic):
     """
     Return a language model for the given topic built from both its title and its content.

     :param topic: object exposing ``title`` and ``content`` attributes.
     :return: ``BayesLanguageModel(title model, content model, beta=10)``.
     """
     title_text, content_text = topic.title, topic.content

     # One extractor is reused for both texts; the counts are read after each run.
     extractor = SingleQueryGeneration(stopwordfile=self._stopword_file, minlen=3)

     extractor.extract_queries_from_text(title_text)
     title_counts = extractor.query_count

     extractor.extract_queries_from_text(content_text)
     content_counts = extractor.query_count

     title_model = LanguageModel(term_dict=title_counts)
     content_model = LanguageModel(term_dict=content_counts)
     return BayesLanguageModel(title_model, content_model, beta=10)
Beispiel #18
0
 def _generate_topic_language_model(self, topic):
     """
     Return a BayesLanguageModel that combines the topic's title and content term counts.

     :param topic: object exposing ``title`` and ``content`` attributes.
     :return: ``BayesLanguageModel(title model, content model, beta=10)``.
     """
     title_text = topic.title
     content_text = topic.content

     term_counter = SingleQueryGeneration(stopwordfile=self._stopword_file, minlen=3)

     # Title pass.
     term_counter.extract_queries_from_text(title_text)
     counts_from_title = term_counter.query_count

     # Content pass, using the same extractor instance.
     term_counter.extract_queries_from_text(content_text)
     counts_from_content = term_counter.query_count

     return BayesLanguageModel(
         LanguageModel(term_dict=counts_from_title),
         LanguageModel(term_dict=counts_from_content),
         beta=10)
    def _generate_topic_language_model(self, topic):
        """
        Return a language model representation of the given topic.
        Inheriting classes can override this method to return a different language model.

        :param topic: object exposing ``title`` and ``content`` attributes.
        :return: ``BayesLanguageModel(title model, content model, beta=10)``.
        """
        title_text, content_text = topic.title, topic.content

        # One extractor is reused for both texts; the counts are read after each run.
        extractor = SingleQueryGeneration(stopwordfile=self._stopword_file, minlen=3)

        extractor.extract_queries_from_text(title_text)
        title_counts = extractor.query_count

        extractor.extract_queries_from_text(content_text)
        content_counts = extractor.query_count

        title_model = LanguageModel(term_dict=title_counts)
        content_model = LanguageModel(term_dict=content_counts)
        return BayesLanguageModel(title_model, content_model, beta=10)
Beispiel #20
0
    def make_topic_lm(self, topic):
        """
        Build a BayesLanguageModel from the topic's title and content.

        :param topic: object exposing ``title`` and ``content`` attributes.
        :return: ``BayesLanguageModel(title model, content model, beta=10)``.
        """
        title_text = topic.title
        content_text = topic.content

        term_counter = SingleQueryGeneration(
            stopwordfile=self.stopword_file, minlen=3)

        # Title pass.
        term_counter.extract_queries_from_text(title_text)
        counts_from_title = term_counter.query_count

        # Content pass, using the same extractor instance.
        term_counter.extract_queries_from_text(content_text)
        counts_from_content = term_counter.query_count

        return BayesLanguageModel(
            LanguageModel(term_dict=counts_from_title),
            LanguageModel(term_dict=counts_from_content),
            beta=10)
    def generate_query_list(self, topic):
        """
        Produce the queries a simulated agent could issue for the given topic.

        Candidate queries are extracted from ``topic.content`` and scored by a
        QueryRanker built on the topic language model.

        :param topic: object exposing a ``content`` attribute; also passed to
            ``_generate_topic_language_model``.
        :return: the result of the ranker's ``get_top_queries(100)``.
        """
        content = topic.content
        language_model = self._generate_topic_language_model(topic)

        extractor = SingleQueryGeneration(stopwordfile=self._stopword_file, minlen=3)
        candidates = extractor.extract_queries_from_text(content)

        # Score the candidates, then hand back the ranker's top queries.
        ranker = QueryRanker(smoothed_language_model=language_model)
        ranker.calculate_query_list_probabilities(candidates)
        return ranker.get_top_queries(100)
Beispiel #22
0
    def _update_topic_language_model(self, text_list):
        """
        Rebuild ``self.topic_language_model`` from the topic text plus newly seen text.

        Term counts extracted from the text returned by ``_make_topic_text`` are
        added to the counts extracted from the joined ``text_list``; the merged
        counts are stored as a plain LanguageModel.

        :param text_list: iterable of strings; passed to ``_make_topic_text`` as
            ``document_text`` and also joined with single spaces for extraction.
        """
        topic_text = self._make_topic_text(document_text=text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        # NOTE(review): this relies on extract_queries_from_text binding a fresh
        # query_count on each call; if the same dict is reused, both names alias
        # one object -- confirm against SingleQueryGeneration.
        term_extractor.extract_queries_from_text(snippet_text)
        merged_term_counts = term_extractor.query_count

        # Fold the topic counts into the snippet counts.
        for term in topic_term_counts:
            merged_term_counts[term] = merged_term_counts.get(term, 0) + topic_term_counts[term]

        self.topic_language_model = LanguageModel(term_dict=merged_term_counts)

        log.debug("Updating topic {0}".format(self._topic.id))
Beispiel #23
0
    def _generate_topic_language_model(self, topic):
        """
        Return a language model representation of the given topic.
        Inheriting classes can override this method to return a different language model.

        :param topic: object exposing ``title`` and ``content`` attributes.
        :return: ``BayesLanguageModel(title model, content model, beta=10)``.
        """
        title_text = topic.title
        content_text = topic.content

        term_counter = SingleQueryGeneration(
            stopwordfile=self._stopword_file, minlen=3)

        # Title pass.
        term_counter.extract_queries_from_text(title_text)
        counts_from_title = term_counter.query_count

        # Content pass, using the same extractor instance.
        term_counter.extract_queries_from_text(content_text)
        counts_from_content = term_counter.query_count

        title_model = LanguageModel(term_dict=counts_from_title)
        content_model = LanguageModel(term_dict=counts_from_content)
        return BayesLanguageModel(title_model, content_model, beta=10)
Beispiel #24
0
    def generate_query_list(self, topic):
        """
        Produce the queries a simulated agent could issue for the given topic.

        Candidate queries are extracted from ``topic.content`` and scored by a
        QueryRanker built on the topic language model.

        :param topic: object exposing a ``content`` attribute; also passed to
            ``_generate_topic_language_model``.
        :return: the result of the ranker's ``get_top_queries(100)``.
        """
        content = topic.content
        language_model = self._generate_topic_language_model(topic)

        extractor = SingleQueryGeneration(
            stopwordfile=self._stopword_file, minlen=3)
        candidates = extractor.extract_queries_from_text(content)

        # Score the candidates, then hand back the ranker's top queries.
        ranker = QueryRanker(smoothed_language_model=language_model)
        ranker.calculate_query_list_probabilities(candidates)
        top_queries = ranker.get_top_queries(100)
        return top_queries
Beispiel #25
0
    def generate_query_list(self, search_context):
        """
        Produce the queries a simulated agent could issue for the topic of a search context.

        Candidate queries are extracted from the topic's title and content
        (joined by a single space) and scored by a QueryRanker built on the
        topic language model.

        :param search_context: object exposing a ``topic`` attribute; it is also
            what ``_generate_topic_language_model`` receives.
        :return: the result of the ranker's ``get_top_queries(100)``.
        """
        current_topic = search_context.topic
        combined_text = "{0} {1}".format(current_topic.title, current_topic.content)

        language_model = self._generate_topic_language_model(search_context)

        extractor = SingleQueryGeneration(stopwordfile=self._stopword_file, minlen=3)
        candidates = extractor.extract_queries_from_text(combined_text)

        # Score the candidates, then hand back the ranker's top queries.
        ranker = QueryRanker(smoothed_language_model=language_model)
        ranker.calculate_query_list_probabilities(candidates)
        return ranker.get_top_queries(100)