Beispiel #1
0
def main():
    """Score every URL from the trending-query file and log one row each.

    Builds a 'Sitebing' engine, a tri-term query generator and a
    ``PageRetrievabilityCalculator``, then for each entry returned by
    ``get_trending_queries(URL_FILE)`` scores the URL and appends a line of
    the form ``category,url,findability,retrievability,retrieved,query_count``
    to ``RESULT_FILE``.
    """
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration or the environment instead.
    engine = EngineFactory(
        engine='Sitebing',
        api_key="msRh5UoZzyV3qvroEpzXMzbZEVjW3ENfTGMAQO1yuRc",
        throttle=0.1,
        cache='engine')

    query_generator = TriTermQueryGeneration(minlen=TERM_LEN,
                                             stopwordfile=STOPWORD_FILE,
                                             maxsize=100)
    tuple_list = get_trending_queries(URL_FILE)
    page_calculator = PageRetrievabilityCalculator(engine=engine,
                                                   cutoff=CUTOFF,
                                                   generator=query_generator)

    # Append mode: rows from earlier runs are kept.
    with open(RESULT_FILE, 'a') as f:

        # The loop variable is named ``entry`` so the builtin ``tuple`` is
        # not shadowed. Each entry is indexed as (findability, url, category).
        for entry in tuple_list:
            findability = entry[0]
            url = entry[1]
            category_name = entry[2]
            retrievability = page_calculator.score_page(url)
            stats = page_calculator.stats()

            # NOTE(review): fields are written unescaped, so a value that
            # contains a comma would shift the columns of that row.
            f.write('{0},{1},{2},{3},{4},{5}\n'.format(category_name, url,
                                                       findability,
                                                       retrievability,
                                                       stats['retrieved'],
                                                       stats['query_count']))
Beispiel #2
0
def main():
    """Compute retrievability for each trending-query URL and append it to a file.

    For every entry returned by ``get_trending_queries(URL_FILE)`` the URL is
    scored with a ``PageRetrievabilityCalculator`` (tri-term queries against
    the 'Sitebing' engine) and one comma-separated line is appended to
    ``RESULT_FILE``: category, url, findability, retrievability, then the
    ``retrieved`` and ``query_count`` values from the calculator's stats.
    """
    # NOTE(review): the API key is hard-coded in source; consider loading it
    # from configuration or the environment instead.
    engine = EngineFactory(engine='Sitebing',
                           api_key="msRh5UoZzyV3qvroEpzXMzbZEVjW3ENfTGMAQO1yuRc",
                           throttle=0.1,
                           cache='engine')

    query_generator = TriTermQueryGeneration(minlen=TERM_LEN, stopwordfile=STOPWORD_FILE, maxsize=100)
    tuple_list = get_trending_queries(URL_FILE)
    page_calculator = PageRetrievabilityCalculator(engine=engine, cutoff=CUTOFF, generator=query_generator)

    row_format = '{0},{1},{2},{3},{4},{5}\n'

    # Opened for appending so existing results are preserved.
    with open(RESULT_FILE, 'a') as f:

        # ``record`` instead of ``tuple``: the original loop variable hid the
        # builtin type. Records are indexed as (findability, url, category).
        for record in tuple_list:
            findability = record[0]
            url = record[1]
            category_name = record[2]
            retrievability = page_calculator.score_page(url)
            stats = page_calculator.stats()

            # NOTE(review): values are not escaped; a comma inside any field
            # would break the column layout of that line.
            f.write(row_format.format(category_name,
                                      url,
                                      findability,
                                      retrievability,
                                      stats['retrieved'],
                                      stats['query_count']))
Beispiel #3
0
def main():
    """

    :return:
    """
    parser = argparse.ArgumentParser(
                                description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="url address")
    parser.add_argument("-e","--engine",type=str,
                        help="Name of search engine: " + ENGINE_LIST.__str__())
    parser.add_argument("-k","--key",type=str,
                        help="API Key for search engine (if applicable)")
    parser.add_argument("-c","--cutoff", type=int,
                        help ="The cutoff value for queries")
    parser.add_argument("-m","--maxqueries", type=int,
                        help ="The maximum number of queries per page")
    parser.add_argument("-s","--stopwordfile", type=str,
                        help ="The filename name containing stopwords")
    parser.add_argument("-b","--backgroundfile", type=str,
                        help ="The filename name containing background term counts")
    parser.add_argument("-ca", "--cache",
                  action="store_true", default=False,
                  help="use cache")


    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2

    cache = None
    if args.cache:
        cache = 'engine'

    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print "cache is ", cache
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile

    mq = 50
    if args.maxqueries:
        mq = args.maxqueries

    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile

    doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()

    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print "Page loaded"
    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(page_html)

    print "Queries generated: ", len(query_list)
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    scored_queries = qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = []
    for query in queries:
        query_list.append(query[0])


    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)

    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()
    print "\nRetrievability Scores for gravity beta=1.0"

    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print "Done!"
    return 0
Beispiel #4
0
    def setup(self):
        """

        :return:
        """
        parser = argparse.ArgumentParser(
                                    description="Page Calculator for pages")
        parser.add_argument("-u", "--url", type=str,
                            help="url address")
        parser.add_argument("-e","--engine",type=str,
                            help="Name of search engine: " + ENGINE_LIST.__str__())
        parser.add_argument("-k","--key",type=str,
                            help="API Key for search engine (if applicable)")
        parser.add_argument("-d","--domain",type=str,
                            help="domain for search engine (if applicable, i.e. engine is sitebing, default is gla.ac.uk)")
        parser.add_argument("-c","--cutoff", type=int,
                            help ="The cutoff value for queries")
        parser.add_argument("-m","--maxqueries", type=int,
                            help ="The maximum number of queries per page")
        parser.add_argument("-s","--stopwordfile", type=str,
                            help ="The filename name containing stopwords")
        parser.add_argument("-ca", "--cache",
                      action="store_true", default=False,
                      help="use cache")
        #parser.add_argument("-ex","--experiment", type=int, help=" experiment number 1 - x")
        args = parser.parse_args()

        if not args.url:
            print "Check your URL argument"
            parser.print_help()
            return 2
        else:
            self.url = args.url
#    cache = None
#    if args.cache:
#        cache = 'engine'

#    if args.key:
#        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
#    else:
#        print "cache is ", cache
#        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


        cache = None
        if args.cache:
            self.cache = 'engine'
        else:
            self.cache = cache

        if args.key:
            self.engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=self.cache)
        else:
            self.engine = EngineFactory(engine=args.engine, cache=self.cache, throttle=0.1)

        if args.domain:
            self.engine.site = args.domain

        stopwordfile = None
        if args.stopwordfile:
            self.stopwordfile = args.stopwordfile
        else:
            self.stopwordfile = None

        self.mq = 250
        if args.maxqueries:
            self.mq = args.maxqueries

        print "Fetching page: %s" % (args.url)
        pc = PageCapture(args.url)
        self.page_html = pc.get_page_sourcecode()
        print "Page loaded"
        self.page_text = ''

        # answer = raw_input("Do you want to use a percentage of this page? Enter y or n \n")
        # if answer == 'y':
        #     percent = raw_input("What percentage do you want to use? \n")
        #     if self.is_integer(percent):
        #         self.page_text = self.reduce_page(percentage=percent)
        #     else:
        #         print "input error, will exit"
        #         sys.exit(2)
        #         #todo update so asks again, not exit
        # else:
        self.page_text = self.page_html

        query_list = []
        answer = raw_input("Do you want to use only a position based extractor? Enter y or n \n")
        if answer == 'y' or answer != 'n': #if enter is hit then assume y
            text = self.get_position_text()
            #todo at this stage this could be single, bi or tri terms
            query_gen = None
            if self.stopwordfile:
                query_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
            else:
                query_gen = BiTermQueryGeneration(minlen=3)
            query_list = query_gen.extract_queries_from_text(text)
        elif answer == 'n':
            answer = raw_input("Do you want to use only a rank based extractor? Enter y or n \n")
            if answer == 'y' or answer != 'n': #if enter is hit then assume y:
                query_list = self.get_ranked_queries()
            elif answer == 'n':
                answer = raw_input("Do you want to use a rank based extractor combined with a position extractor? Enter y or n \n")
                if answer == 'y' or answer != 'n': #if enter is hit then assume y:
                    text = self.get_position_text()
                    query_list = self.get_ranked_queries(text)
                elif answer == 'n':
                    print "sorry, that's all the options, system will exit"
                    sys.exit(0)

        print "Queries generated: ", len(query_list)
        prc = None
        if args.cutoff:
            prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)
        else:
            prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)
        prc.score_page(self.url, query_list)

        print "\nRetrievability Scores for cumulative pce=20"
        prc.calculate_page_retrievability(c=20)
        prc.report()
        print "\nRetrievability Scores for gravity beta=1.0"

        prc.calculate_page_retrievability(c=20, beta=1.0)
        prc.report()

        print "Done!"
        return 0
    def process_queries(self):
        """Score the page under five settings and write both output reports.

        Scores ``self.url`` with ``self.query_list``, then computes page
        retrievability for cumulative c=20, c=10, c=5 and gravity beta=1.0,
        beta=0.5 (both with c=20).

        The summary report is one line: every value of the c=20 summary
        followed by the c=10, c=5, beta=1.0 and beta=0.5 scores. The breakdown
        report has one line per query of the c=20 breakdown: url, query terms,
        rank and the five per-query scores in the same order. Both are passed
        to ``self.write_output_files``.
        """
        # NOTE(review): setup() stores the query limit as self.mq, not
        # self.maxqueries -- confirm self.maxqueries is set elsewhere.
        if self.maxqueries:
            prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.maxqueries)
        else:
            prc = PageRetrievabilityCalculator(engine=self.engine)
        prc.score_page(self.url, self.query_list)

        # Keyword arguments for each calculation, in report column order.
        settings = (
            {'c': 20},
            {'c': 10},
            {'c': 5},
            {'c': 20, 'beta': 1.0},
            {'c': 20, 'beta': 0.5},
        )

        first_summary = None
        scores = []
        breakdowns = []
        for kwargs in settings:
            prc.calculate_page_retrievability(**kwargs)
            summary = prc.output_summary_report()
            if first_summary is None:
                first_summary = summary
            scores.append(str(summary['score']))
            breakdowns.append(prc.output_query_report())

        # All c=20 summary values (each followed by a space), then the
        # remaining four page scores and a newline.
        summary_report = "".join(str(value) + " " for value in first_summary.values())
        summary_report += " ".join(scores[1:]) + "\n"

        # The stored reports are deliberately read only here, after all five
        # calculations, exactly as the previous implementation did.
        # NOTE(review): if output_query_report() hands back shared objects
        # that later calculations mutate, the columns would all show the last
        # setting -- confirm it returns independent data.
        breakdown_lines = []
        for q_terms, curr_query in breakdowns[0].items():
            fields = [self.url, q_terms, str(curr_query.rank), str(curr_query.ret_score)]
            fields.extend(str(data[q_terms].ret_score) for data in breakdowns[1:])
            breakdown_lines.append(" ".join(fields) + "\n")

        self.write_output_files(summary_report, "".join(breakdown_lines))
Beispiel #6
0
    def process_queries(self):
        """Score the page under five settings and write both output reports.

        Scores ``self.url`` with ``self.query_list``, then computes page
        retrievability for cumulative c=20, c=10, c=5 and gravity beta=1.0,
        beta=0.5 (both with c=20).

        The summary report is one line: every value of the c=20 summary
        followed by the c=10, c=5, beta=1.0 and beta=0.5 scores. The breakdown
        report has one line per query of the c=20 breakdown: url, query terms,
        rank and the five per-query scores in the same order. Both are passed
        to ``self.write_output_files``.
        """
        if self.maxqueries:
            prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.maxqueries)
        else:
            prc = PageRetrievabilityCalculator(engine=self.engine)
        prc.score_page(self.url, self.query_list)

        # Keyword arguments for each calculation, in report column order.
        settings = (
            {"c": 20},
            {"c": 10},
            {"c": 5},
            {"c": 20, "beta": 1.0},
            {"c": 20, "beta": 0.5},
        )

        first_summary = None
        scores = []
        breakdowns = []
        for kwargs in settings:
            prc.calculate_page_retrievability(**kwargs)
            summary = prc.output_summary_report()
            if first_summary is None:
                first_summary = summary
            scores.append(str(summary["score"]))
            breakdowns.append(prc.output_query_report())

        # All c=20 summary values (each followed by a space), then the
        # remaining four page scores and a newline.
        summary_report = "".join(str(value) + " " for value in first_summary.values())
        summary_report += " ".join(scores[1:]) + "\n"

        # The stored reports are deliberately read only here, after all five
        # calculations, exactly as the previous implementation did.
        # NOTE(review): if output_query_report() hands back shared objects
        # that later calculations mutate, the columns would all show the last
        # setting -- confirm it returns independent data.
        breakdown_lines = []
        for q_terms, curr_query in breakdowns[0].items():
            fields = [self.url, q_terms, str(curr_query.rank), str(curr_query.ret_score)]
            fields.extend(str(data[q_terms].ret_score) for data in breakdowns[1:])
            breakdown_lines.append(" ".join(fields) + "\n")

        self.write_output_files(summary_report, "".join(breakdown_lines))