Esempio n. 1
0
    def get_ranked_queries(self, text=''):
        """
        loads the background document model and generates the ranked queries
        :return: the queries in a list
        """
        if not text:
            text = self.page_html
        backgroundfile = 'background.txt'
        filename = raw_input("enter the filename of the background file, background.txt is default")
        if filename:
            backgroundfile = filename
        print "background file is ", backgroundfile

        doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=self.stopwordfile)
        query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        print "Loading background distribution"
        colLM = LanguageModel(file=backgroundfile)
        print "Background loaded, number of terms: ", colLM.get_num_terms()
        #doc_extractor.extract_queries_from_html(self.page_html)
        doc_extractor.extract_queries_from_html(text)
        doc_term_counts = doc_extractor.query_count
        print "Number of terms in document: %d" % (len(doc_term_counts))
        docLM = LanguageModel(term_dict=doc_term_counts)
        slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
        #query_list = query_generator.extract_queries_from_html(self.page_html)
        query_list = query_generator.extract_queries_from_html(text)

        print "Queries generated: ", len(query_list)
        qr = OddsRatioQueryRanker(smoothed_language_model=slm)
        scored_queries = qr.calculate_query_list_probabilities(query_list)
        queries = qr.get_top_queries(self.mq)
        query_list = []
        for query in queries:
            query_list.append(query[0])
        return query_list
Esempio n. 2
0
    def generate_query_list(self, search_context):
        """
        Produce ranked candidate queries for the simulated agent, pooling
        tri-term then bi-term queries drawn from the topic text plus the
        snippet text, and returning the top 100.
        """
        # Lazily build and cache the topic language model on first use.
        if self.topic_lang_model is None:
            self.topic_lang_model = self._generate_topic_language_model(
                search_context)

        combined = search_context.topic.get_topic_text() + ' ' + \
            self._get_snip_text(search_context)
        combined = self._check_terms(combined)

        tri_generator = TriTermQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        bi_generator = BiTermQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)

        # Tri-term candidates precede bi-term candidates in the pool.
        candidates = tri_generator.extract_queries_from_text(combined)
        candidates = candidates + bi_generator.extract_queries_from_text(
            combined)

        ranker = QueryRanker(smoothed_language_model=self.topic_lang_model)
        ranker.calculate_query_list_probabilities(candidates)
        return ranker.get_top_queries(100)
Esempio n. 3
0
    def generate_query_list(self, search_context):
        """
        Build the pool of tri-term and bi-term queries from the topic and
        snippet text, then return the 100 highest-ranked according to the
        topic language model.
        """
        if self.topic_lang_model is None:
            # Build the topic language model once and reuse it afterwards.
            self.topic_lang_model = self._generate_topic_language_model(search_context)

        source_text = search_context.topic.get_topic_text() + ' ' + self._get_snip_text(search_context)
        source_text = self._check_terms(source_text)

        tri_gen = TriTermQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        bi_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

        # Pool the candidates, tri-term queries first.
        pooled = (tri_gen.extract_queries_from_text(source_text)
                  + bi_gen.extract_queries_from_text(source_text))

        ranker = QueryRanker(smoothed_language_model=self.topic_lang_model)
        ranker.calculate_query_list_probabilities(pooled)
        return ranker.get_top_queries(100)
Esempio n. 4
0
    def get_position_queries(self, text):

        #print "text is ", text
        query_gen = None
        query_list = []
        if self.stopwordfile:
            query_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        else:
            query_gen = BiTermQueryGeneration(minlen=3)
        query_list = query_gen.extract_queries_from_text(text)
        print query_list
        return query_list
Esempio n. 5
0
    def get_position_queries(self, text):

        # print "text is ", text
        query_gen = None
        query_list = []
        if self.stopwordfile:
            query_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        else:
            query_gen = BiTermQueryGeneration(minlen=3)
        query_list = query_gen.extract_queries_from_text(text)
        print query_list
        return query_list
Esempio n. 6
0
    def produce_query_list(self, topic):
        """
        Return the top 100 tri-term and bi-term queries generated from the
        topic content, ranked against the topic language model.
        """
        text = topic.content
        topic_model = self.make_topic_lm(topic)

        bi_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopword_file)
        tri_gen = TriTermQueryGeneration(minlen=3, stopwordfile=self.stopword_file)

        # Tri-term candidates come first in the combined pool.
        candidates = tri_gen.extract_queries_from_text(text)
        candidates = candidates + bi_gen.extract_queries_from_text(text)

        ranker = QueryRanker(smoothed_language_model=topic_model)
        ranker.calculate_query_list_probabilities(candidates)
        return ranker.get_top_queries(100)
Esempio n. 7
0
 def generate_query_list(self, topic):
     """
     Given a Topic object, return the top 100 candidate queries (tri-term
     followed by bi-term) ranked by the topic language model.
     """
     content = topic.content
     lang_model = self._generate_topic_language_model(topic)

     bi_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
     tri_gen = TriTermQueryGeneration(minlen=3, stopwordfile=self._stopword_file)

     # Pool candidates: tri-term queries first, then bi-term queries.
     pooled = (tri_gen.extract_queries_from_text(content) +
               bi_gen.extract_queries_from_text(content))

     ranker = QueryRanker(smoothed_language_model=lang_model)
     ranker.calculate_query_list_probabilities(pooled)
     return ranker.get_top_queries(100)
Esempio n. 8
0
    def produce_query_list(self, topic):
        """
        Generate tri-term and bi-term queries from the topic content and
        return the 100 highest-ranked according to the topic language model.
        """
        topicLM = self.make_topic_lm(topic)
        text = topic.content

        tri_generator = TriTermQueryGeneration(
            minlen=3, stopwordfile=self.stopword_file)
        bi_generator = BiTermQueryGeneration(
            minlen=3, stopwordfile=self.stopword_file)

        # Tri-term candidates are pooled ahead of bi-term candidates.
        pooled = tri_generator.extract_queries_from_text(text)
        pooled = pooled + bi_generator.extract_queries_from_text(text)

        ranker = QueryRanker(smoothed_language_model=topicLM)
        ranker.calculate_query_list_probabilities(pooled)
        return ranker.get_top_queries(100)
Esempio n. 9
0
    def generate_query_list(self, search_context):
        """
        Produce the top 100 bi-term queries for the simulated agent, built
        from the topic title and content and ranked by the topic model.
        """
        topic = search_context.topic
        seed_text = "{0} {1}".format(topic.title, topic.content)

        lang_model = self._generate_topic_language_model(search_context)

        generator = BiTermQueryGeneration(minlen=3, stopwordfile=self._stopword_file)
        candidates = generator.extract_queries_from_text(seed_text)

        ranker = QueryRanker(smoothed_language_model=lang_model)
        ranker.calculate_query_list_probabilities(candidates)
        return ranker.get_top_queries(100)
Esempio n. 10
0
    def generate_query_list(self, search_context):
        """
        Construct bi-term queries from the topic title and content, rank
        them with the topic language model, and return the top 100.
        """
        topic = search_context.topic
        combined_text = "{0} {1}".format(topic.title, topic.content)

        model = self._generate_topic_language_model(search_context)

        bi_gen = BiTermQueryGeneration(
            minlen=3, stopwordfile=self._stopword_file)
        queries = bi_gen.extract_queries_from_text(combined_text)

        scorer = QueryRanker(smoothed_language_model=model)
        scorer.calculate_query_list_probabilities(queries)
        return scorer.get_top_queries(100)
Esempio n. 11
0
def main():
    """

    :return:
    """
    parser = argparse.ArgumentParser(
                                description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="url address")
    parser.add_argument("-e","--engine",type=str,
                        help="Name of search engine: " + ENGINE_LIST.__str__())
    parser.add_argument("-k","--key",type=str,
                        help="API Key for search engine (if applicable)")
    parser.add_argument("-c","--cutoff", type=int,
                        help ="The cutoff value for queries")
    parser.add_argument("-m","--maxqueries", type=int,
                        help ="The maximum number of queries per page")
    parser.add_argument("-s","--stopwordfile", type=str,
                        help ="The filename name containing stopwords")
    parser.add_argument("-b","--backgroundfile", type=str,
                        help ="The filename name containing background term counts")
    parser.add_argument("-ca", "--cache",
                  action="store_true", default=False,
                  help="use cache")


    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2

    cache = None
    if args.cache:
        cache = 'engine'

    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print "cache is ", cache
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile

    mq = 50
    if args.maxqueries:
        mq = args.maxqueries

    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile

    doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()

    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print "Page loaded"
    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(page_html)

    print "Queries generated: ", len(query_list)
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    scored_queries = qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = []
    for query in queries:
        query_list.append(query[0])


    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)

    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()
    print "\nRetrievability Scores for gravity beta=1.0"

    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print "Done!"
    return 0
Esempio n. 12
0
    def setup(self):
        """
        Parse command-line arguments, fetch the target page, interactively
        choose a query extraction strategy (position-based, rank-based, or
        combined), then score and report the page's retrievability.

        :return: 0 on success, 2 if the URL argument is missing
        """
        parser = argparse.ArgumentParser(
                                    description="Page Calculator for pages")
        parser.add_argument("-u", "--url", type=str,
                            help="url address")
        parser.add_argument("-e","--engine",type=str,
                            help="Name of search engine: " + ENGINE_LIST.__str__())
        parser.add_argument("-k","--key",type=str,
                            help="API Key for search engine (if applicable)")
        parser.add_argument("-d","--domain",type=str,
                            help="domain for search engine (if applicable, i.e. engine is sitebing, default is gla.ac.uk)")
        parser.add_argument("-c","--cutoff", type=int,
                            help ="The cutoff value for queries")
        parser.add_argument("-m","--maxqueries", type=int,
                            help ="The maximum number of queries per page")
        parser.add_argument("-s","--stopwordfile", type=str,
                            help ="The filename name containing stopwords")
        parser.add_argument("-ca", "--cache",
                      action="store_true", default=False,
                      help="use cache")
        #parser.add_argument("-ex","--experiment", type=int, help=" experiment number 1 - x")
        args = parser.parse_args()

        if not args.url:
            print "Check your URL argument"
            parser.print_help()
            return 2
        else:
            self.url = args.url
#    cache = None
#    if args.cache:
#        cache = 'engine'

#    if args.key:
#        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
#    else:
#        print "cache is ", cache
#        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


        # Use the 'engine' cache name when caching was requested; None otherwise.
        cache = None
        if args.cache:
            self.cache = 'engine'
        else:
            self.cache = cache

        if args.key:
            self.engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=self.cache)
        else:
            self.engine = EngineFactory(engine=args.engine, cache=self.cache, throttle=0.1)

        if args.domain:
            self.engine.site = args.domain

        stopwordfile = None
        if args.stopwordfile:
            self.stopwordfile = args.stopwordfile
        else:
            self.stopwordfile = None

        # Default maximum number of queries per page; overridable via -m.
        self.mq = 250
        if args.maxqueries:
            self.mq = args.maxqueries

        print "Fetching page: %s" % (args.url)
        pc = PageCapture(args.url)
        self.page_html = pc.get_page_sourcecode()
        print "Page loaded"
        self.page_text = ''

        # answer = raw_input("Do you want to use a percentage of this page? Enter y or n \n")
        # if answer == 'y':
        #     percent = raw_input("What percentage do you want to use? \n")
        #     if self.is_integer(percent):
        #         self.page_text = self.reduce_page(percentage=percent)
        #     else:
        #         print "input error, will exit"
        #         sys.exit(2)
        #         #todo update so asks again, not exit
        # else:
        self.page_text = self.page_html

        # Interactively select an extraction strategy. Any answer other than
        # an explicit 'n' (including just pressing enter) is treated as 'y'.
        query_list = []
        answer = raw_input("Do you want to use only a position based extractor? Enter y or n \n")
        if answer == 'y' or answer != 'n': #if enter is hit then assume y
            text = self.get_position_text()
            #todo at this stage this could be single, bi or tri terms
            query_gen = None
            if self.stopwordfile:
                query_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
            else:
                query_gen = BiTermQueryGeneration(minlen=3)
            query_list = query_gen.extract_queries_from_text(text)
        elif answer == 'n':
            answer = raw_input("Do you want to use only a rank based extractor? Enter y or n \n")
            if answer == 'y' or answer != 'n': #if enter is hit then assume y:
                query_list = self.get_ranked_queries()
            elif answer == 'n':
                answer = raw_input("Do you want to use a rank based extractor combined with a position extractor? Enter y or n \n")
                if answer == 'y' or answer != 'n': #if enter is hit then assume y:
                    text = self.get_position_text()
                    query_list = self.get_ranked_queries(text)
                elif answer == 'n':
                    print "sorry, that's all the options, system will exit"
                    sys.exit(0)

        print "Queries generated: ", len(query_list)
        # NOTE(review): both branches below construct the same calculator —
        # args.cutoff appears to be ignored here; confirm whether it should be
        # passed through to PageRetrievabilityCalculator.
        prc = None
        if args.cutoff:
            prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)
        else:
            prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)
        prc.score_page(self.url, query_list)

        print "\nRetrievability Scores for cumulative pce=20"
        prc.calculate_page_retrievability(c=20)
        prc.report()
        print "\nRetrievability Scores for gravity beta=1.0"

        prc.calculate_page_retrievability(c=20, beta=1.0)
        prc.report()

        print "Done!"
        return 0