Example 1
    def __update_topic_language_model(self, text_list):

        topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

        n = len(text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        for term in topic_term_counts:
            if term in new_text_term_counts:
                new_text_term_counts[term] += topic_term_counts[term]
            else:
                new_text_term_counts[term] = topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = SmoothedLanguageModel(new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))
Example 2
 def generate_query_list(self, topic):
     """
     Given a Topic object, produces a list of query terms that could be issued by the simulated agent.
     """
     self.__title_stem_length = 2
     self.__description_cutoff = 0
     
     topic_title = topic.title
     topic_description = topic.content
     topic_language_model = self._generate_topic_language_model(topic)
     
     # Generate a series of query terms from the title, and then rank the generated terms.
     title_generator = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     title_query_list = title_generator.extract_queries_from_text(topic_title)
     title_query_list = self._rank_terms(title_query_list, topic_language_model=topic_language_model)
     
     # Produce the title query "stem"
     title_stem = self.__get_title_stem(topic_language_model, title_query_list)
     
     # Perform the same steps, but from the description of the topic.
     description_generator = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     description_query_list = description_generator.extract_queries_from_text(topic_description)
     description_query_list = self._rank_terms(description_query_list, topic_language_model=topic_language_model)
     
     return self.__generate_permutations(topic_language_model, title_stem, description_query_list)
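The method above relies on two private helpers that are not shown in this listing (__get_title_stem and __generate_permutations). Purely as an illustration of the idea, the hypothetical sketch below forms a two-term title stem and pairs it with each ranked description term; the real helpers may well behave differently.

def make_title_stem(ranked_title_terms, stem_length=2):
    # Hypothetical sketch only. Assumes ranked terms are (term, score) pairs,
    # highest-scoring first, e.g. [('wildlife', 0.4), ('extinction', 0.3), ...].
    return ' '.join(term for term, score in ranked_title_terms[:stem_length])

def make_permutations(title_stem, ranked_description_terms, cutoff=0):
    # A cutoff of 0 is read here as "use every description term".
    terms = ranked_description_terms if cutoff == 0 else ranked_description_terms[:cutoff]
    # One candidate query per description term, appended to the fixed title stem.
    return ['{0} {1}'.format(title_stem, term) for term, score in terms]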
Example 3
    def get_ranked_queries(self, text=''):
        """
        Loads the background language model and generates the ranked queries.

        :param text: the text to extract queries from; defaults to self.page_html
        :return: the queries in a list
        """
        if not text:
            text = self.page_html
        backgroundfile = 'background.txt'
        filename = raw_input("Enter the filename of the background file (background.txt is the default): ")
        if filename:
            backgroundfile = filename
        print "background file is ", backgroundfile

        doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        print "Loading background distribution"
        colLM = LanguageModel(file=backgroundfile)
        print "Background loaded, number of terms: ", colLM.get_num_terms()
        #doc_extractor.extract_queries_from_html(self.page_html)
        doc_extractor.extract_queries_from_html(text)
        doc_term_counts = doc_extractor.query_count
        print "Number of terms in document: %d" % (len(doc_term_counts))
        docLM = LanguageModel(term_dict=doc_term_counts)
        slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
        #query_list = query_generator.extract_queries_from_html(self.page_html)
        query_list = query_generator.extract_queries_from_html(text)

        print "Queries generated: ", len(query_list)
        qr = OddsRatioQueryRanker(smoothed_language_model=slm)
        scored_queries = qr.calculate_query_list_probabilities(query_list)
        queries = qr.get_top_queries(self.mq)
        query_list = []
        for query in queries:
            query_list.append(query[0])
        return query_list
Example 4
    def make_topic_lm(self, topic):
        topic_text = topic.content
        doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        topicLM = LanguageModel(term_dict=doc_term_counts)

        return topicLM
Example 5
    def make_topic_lm(self):
        # Join with a space so the last content term and the first title term do not merge.
        topic_text = self.topic.content + ' ' + self.topic.title

        doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count
        lm = LanguageModel(term_dict=doc_term_counts)
        self.topicLM = SmoothedLanguageModel(lm, self.backgroundLM, 100)
        print "making topic", self.topicLM.docLM.total_occurrences
Example 6
def extract_term_dict_from_text(text, stopword_file):
    """
    Takes text, parses it, and counts how many times each term occurs.
    :param text: a string
    :param stopword_file: the filename of a stopword list to exclude
    :return: a dict of {term, count}
    """
    single_term_text_extractor = SingleQueryGeneration(minlen=3, stopwordfile=stopword_file)
    single_term_text_extractor.extract_queries_from_text(text)
    term_counts_dict = single_term_text_extractor.query_count

    return term_counts_dict
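A minimal usage sketch for the helper above; the import path, the example text and the stopword filename are assumptions, not something the listing confirms:

# Illustrative only: adjust the import to wherever LanguageModel lives in your codebase.
from ifind.common.language_model import LanguageModel

text = "sea otters use rocks as tools to open shellfish"
term_counts = extract_term_dict_from_text(text, stopword_file='stopwords.txt')

# Build a unigram language model over the counts, as the other examples do.
lm = LanguageModel(term_dict=term_counts)
print "terms in model:", lm.get_num_terms()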
Example 7
 def make_topic_language_model(self):
     """
     
     """
     # Join with a space so the last content term and the first title term do not merge.
     topic_text = self._topic.content + ' ' + self._topic.title
     
     document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     document_extractor.extract_queries_from_text(topic_text)
     document_term_counts = document_extractor.query_count
     
     language_model = LanguageModel(term_dict=document_term_counts)
     self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, 100)
     print "making topic", self.topic_language_model.docLM.total_occurrences
Example 8
 def _generate_naive_topic_language_model(self, topic):
     """
     Given a Topic object, returns a language model representation for the given topic.
     Override this method in inheriting classes to generate and return different language models.
     """
     topic_text = topic.content
     
     document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     document_extractor.extract_queries_from_text(topic_text)
     document_term_counts = document_extractor.query_count
     
     # The language model we return is simply a representation of the number of times terms occur within the topic text.
     topic_language_model = LanguageModel(term_dict=document_term_counts)
     return topic_language_model
Example 9
    def make_topic_language_model(self):
        """
        
        """
        topic_text = '{title} {title} {title} {content}'.format(**self._topic.__dict__)

        document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count
        
        language_model = LanguageModel(term_dict=document_term_counts)

        self.topic_language_model = SmoothedLanguageModel(language_model, self.background_language_model, self.mu)
        log.debug("Making topic {0}".format(self.topic_language_model.docLM.total_occurrences))
    def generate_query_list(self, topic):
        """
        Given a Topic object, produces a list of query terms that could be issued by the simulated agent.
        """
        topic_text = topic.content
        topic_lang_model = self._generate_topic_language_model(topic)

        single_query_generator = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

        single_query_list = single_query_generator.extract_queries_from_text(topic_text)

        query_ranker = QueryRanker(smoothed_language_model=topic_lang_model)
        query_ranker.calculate_query_list_probabilities(single_query_list)
        return query_ranker.get_top_queries(100)
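get_top_queries appears to return (query, score) pairs: Example 3 unpacks them with query[0]. When only the query strings are needed, the ranked output can be flattened the same way; a small sketch under that assumption:

def top_query_strings(query_ranker, how_many=100):
    # Assumes get_top_queries() yields (query, score) pairs, as in Example 3.
    return [query for query, score in query_ranker.get_top_queries(how_many)]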
Example 11
 def _generate_topic_language_model(self, topic):
     """
     Returns a language model for the given topic, considering both the title and content text.
     """
     topic_text = topic.title
     topic_background = topic.content
 
     document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     document_extractor.extract_queries_from_text(topic_text)
     document_term_counts = document_extractor.query_count
 
     document_extractor.extract_queries_from_text(topic_background)
 
     background_term_counts = document_extractor.query_count
 
     title_language_model = LanguageModel(term_dict=document_term_counts)
     background_language_model = LanguageModel(term_dict=background_term_counts)
     topic_language_model = BayesLanguageModel(title_language_model, background_language_model, beta=10)
     return topic_language_model
Example 12
    def generate_query_list(self, search_context):
        """
        Given a search context (from which the topic is taken), produces a list of query terms that could be issued by the simulated agent.
        """
        topic = search_context.topic

        topic_text = "{0} {1}".format(topic.title, topic.content)

        topic_language_model = self._generate_topic_language_model(search_context)
        
        generator = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        query_list = generator.extract_queries_from_text(topic_text)
        
        query_ranker = QueryRanker(smoothed_language_model=topic_language_model)
        query_ranker.calculate_query_list_probabilities(query_list)
        
        generated_queries = query_ranker.get_top_queries(100)

        return generated_queries
Example 13
    def make_topic_lm(self, topic):

        topic_text = topic.title
        topic_bg = topic.content
        doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count

        doc_extractor.extract_queries_from_text(topic_bg)

        bg_term_counts = doc_extractor.query_count

        titleLM = LanguageModel(term_dict=doc_term_counts)

        bgLM = LanguageModel(term_dict=bg_term_counts)

        topicLM = BayesLanguageModel(titleLM, bgLM, beta=10)

        return topicLM
    def _generate_topic_language_model(self, topic):
        """
        Given a Topic object, returns a language model representation for the given topic.
        Override this method in inheriting classes to generate and return different language models.
        """
        topic_text = topic.title
        topic_background = topic.content

        document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        document_extractor.extract_queries_from_text(topic_background)

        background_term_counts = document_extractor.query_count

        title_language_model = LanguageModel(term_dict=document_term_counts)
        background_language_model = LanguageModel(term_dict=background_term_counts)
        topic_language_model = BayesLanguageModel(title_language_model, background_language_model, beta=10)
        return topic_language_model
Example 15
    def _update_topic_language_model(self, text_list):
        topic_text = self._make_topic_text(document_text=text_list)

        n = len(text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        for term in topic_term_counts:
            if term in new_text_term_counts:
                new_text_term_counts[term] += topic_term_counts[term]
            else:
                new_text_term_counts[term] = topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = new_language_model

        log.debug("Updating topic {0}".format(self._topic.id))
Example 16
    def __update_topic_language_model(self, text_list):

        topic_text = '{title} {title} {title} {content}'.format(
            **self._topic.__dict__)

        n = len(text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        for term in topic_term_counts:
            if term in new_text_term_counts:
                new_text_term_counts[term] += topic_term_counts[term]
            else:
                new_text_term_counts[term] = topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = SmoothedLanguageModel(
            new_language_model, self.background_language_model, self.mu)

        log.debug("Updating topic {0}".format(self._topic.id))
Example 17
    def generate_query_list(self, topic):
        """
        Given a Topic object, produces a list of query terms that could be issued by the simulated agent.
        """
        self.__description_cutoff = 5

        topic_title = topic.title
        topic_description = topic.content
        topic_language_model = self._generate_topic_language_model(topic)

        # Generate a series of query terms from the title, and then rank the generated terms.
        title_generator = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        title_query_list = title_generator.extract_queries_from_text(
            topic_title)
        title_query_list = self._rank_terms(
            title_query_list, topic_language_model=topic_language_model)

        # Perform the same steps, but from the description of the topic.
        description_generator = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        description_query_list = description_generator.extract_queries_from_text(
            topic_description)
        description_query_list = self._rank_terms(
            description_query_list, topic_language_model=topic_language_model)

        return self.__generate_permutations(topic_language_model,
                                            title_query_list,
                                            description_query_list)
Example 18
    def _update_topic_language_model(self, text_list):
        topic_text = self._make_topic_text(document_text=text_list)

        n = len(text_list)
        snippet_text = ' '.join(text_list)

        term_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        term_extractor.extract_queries_from_text(topic_text)
        topic_term_counts = term_extractor.query_count

        term_extractor.extract_queries_from_text(snippet_text)
        new_text_term_counts = term_extractor.query_count

        for term in topic_term_counts:
            if term in new_text_term_counts:
                new_text_term_counts[term] += topic_term_counts[term]
            else:
                new_text_term_counts[term] = topic_term_counts[term]

        new_language_model = LanguageModel(term_dict=new_text_term_counts)

        self.topic_language_model = new_language_model

        log.debug("Updating topic {0}".format(self._topic.id))
Example 19
 def _generate_topic_language_model(self, topic):
     """
     
     """
     topic_text = topic.title
     topic_background = topic.content
     
     document_extractor = SingleQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     document_extractor.extract_queries_from_text(topic_text)
     document_term_counts = document_extractor.query_count
     
     document_extractor.extract_queries_from_text(topic_background)
     
     background_term_counts = document_extractor.query_count
     
     title_language_model = LanguageModel(term_dict=document_term_counts)
     background_language_model = LanguageModel(term_dict=background_term_counts)
     topic_language_model = BayesLanguageModel(title_language_model, background_language_model, beta=10)
     return topic_language_model
Example 20
    def make_topic_lm(self, topic):

        topic_text = topic.title
        topic_bg = topic.content
        doc_extractor = SingleQueryGeneration(minlen=3,
                                              stopwordfile=self.stopword_file)
        doc_extractor.extract_queries_from_text(topic_text)
        doc_term_counts = doc_extractor.query_count

        doc_extractor.extract_queries_from_text(topic_bg)

        bg_term_counts = doc_extractor.query_count

        titleLM = LanguageModel(term_dict=doc_term_counts)

        bgLM = LanguageModel(term_dict=bg_term_counts)

        topicLM = BayesLanguageModel(titleLM, bgLM, beta=10)

        return topicLM
Example 21
    def _generate_topic_language_model(self, topic):
        """
        Given a Topic object, returns a language model representation for the given topic.
        Override this method in inheriting classes to generate and return different language models.
        """
        topic_text = topic.title
        topic_background = topic.content

        document_extractor = SingleQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        document_extractor.extract_queries_from_text(topic_text)
        document_term_counts = document_extractor.query_count

        document_extractor.extract_queries_from_text(topic_background)

        background_term_counts = document_extractor.query_count

        title_language_model = LanguageModel(term_dict=document_term_counts)
        background_language_model = LanguageModel(
            term_dict=background_term_counts)
        topic_language_model = BayesLanguageModel(title_language_model,
                                                  background_language_model,
                                                  beta=10)
        return topic_language_model
Example 22
def main():
    """

    :return:
    """
    parser = argparse.ArgumentParser(
                                description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="url address")
    parser.add_argument("-e","--engine",type=str,
                        help="Name of search engine: " + ENGINE_LIST.__str__())
    parser.add_argument("-k","--key",type=str,
                        help="API Key for search engine (if applicable)")
    parser.add_argument("-c","--cutoff", type=int,
                        help ="The cutoff value for queries")
    parser.add_argument("-m","--maxqueries", type=int,
                        help ="The maximum number of queries per page")
    parser.add_argument("-s","--stopwordfile", type=str,
                        help ="The filename name containing stopwords")
    parser.add_argument("-b","--backgroundfile", type=str,
                        help ="The filename name containing background term counts")
    parser.add_argument("-ca", "--cache",
                  action="store_true", default=False,
                  help="use cache")


    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2

    cache = None
    if args.cache:
        cache = 'engine'

    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print "cache is ", cache
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile

    mq = 50
    if args.maxqueries:
        mq = args.maxqueries

    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile

    doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()

    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print "Page loaded"
    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(page_html)

    print "Queries generated: ", len(query_list)
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    scored_queries = qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = []
    for query in queries:
        query_list.append(query[0])


    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)

    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()
    print "\nRetrievability Scores for gravity beta=1.0"

    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print "Done!"
    return 0
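For reference, a hypothetical invocation of this script; the module name page_calculator.py, the engine name and the file paths are illustrative, while the flags come from the argparse definitions above:

python page_calculator.py -u http://www.example.com/index.html -e bing -k YOUR_API_KEY \
    -s stopwords.txt -b background.txt -m 25 -ca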