def main():
    """
    Parses the command-line arguments, generates queries from the given page,
    and reports the page's retrievability scores.
    :return: 0 on success, 2 if no URL is supplied.
    """
    parser = argparse.ArgumentParser(
        description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="URL address")
    parser.add_argument("-e", "--engine", type=str,
                        help="Name of search engine: " + str(ENGINE_LIST))
    parser.add_argument("-k", "--key", type=str,
                        help="API key for search engine (if applicable)")
    parser.add_argument("-c", "--cutoff", type=int,
                        help="The cutoff value for queries")
    parser.add_argument("-m", "--maxqueries", type=int,
                        help="The maximum number of queries per page")
    parser.add_argument("-s", "--stopwordfile", type=str,
                        help="The name of the file containing stopwords")
    parser.add_argument("-b", "--backgroundfile", type=str,
                        help="The name of the file containing background term counts")
    parser.add_argument("-ca", "--cache", action="store_true", default=False,
                        help="use cache")

    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2

    cache = None
    if args.cache:
        cache = 'engine'

    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print "cache is ", cache
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)

    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile

    mq = 50
    if args.maxqueries:
        mq = args.maxqueries

    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile

    doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)

    # Build the background (collection) language model.
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()

    # Fetch the page and extract its term counts.
    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print "Page loaded"

    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))

    # Smooth the document language model with the background model.
    docLM = LanguageModel(term_dict=doc_term_counts)
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)

    query_list = query_generator.extract_queries_from_html(page_html)
    print "Queries generated: ", len(query_list)

    # Rank the candidate queries and keep only the top mq query strings.
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    scored_queries = qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = [query[0] for query in queries]

    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)

    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()

    print "\nRetrievability Scores for gravity beta=1.0"
    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print "Done!"
    return 0
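
# A hedged usage sketch for main() above. The flags come from the argparse
# setup in main(); the script name "page_calculator.py", the example URL, the
# engine name "bing", and the key placeholder are illustrative assumptions only.
#
#   python page_calculator.py -u http://www.gla.ac.uk -e bing -k <your-api-key> -m 50 -ca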
def setup(self):
    """
    Parses the command-line arguments, fetches the page, and interactively
    selects how queries are extracted before scoring the page's retrievability.
    :return: 0 on success, 2 if no URL is supplied.
    """
    parser = argparse.ArgumentParser(
        description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="URL address")
    parser.add_argument("-e", "--engine", type=str,
                        help="Name of search engine: " + str(ENGINE_LIST))
    parser.add_argument("-k", "--key", type=str,
                        help="API key for search engine (if applicable)")
    parser.add_argument("-d", "--domain", type=str,
                        help="Domain for the search engine (if applicable, e.g. when the engine is sitebing; default is gla.ac.uk)")
    parser.add_argument("-c", "--cutoff", type=int,
                        help="The cutoff value for queries")
    parser.add_argument("-m", "--maxqueries", type=int,
                        help="The maximum number of queries per page")
    parser.add_argument("-s", "--stopwordfile", type=str,
                        help="The name of the file containing stopwords")
    parser.add_argument("-ca", "--cache", action="store_true", default=False,
                        help="use cache")
    #parser.add_argument("-ex", "--experiment", type=int, help="experiment number 1 - x")

    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2
    else:
        self.url = args.url

    cache = None
    if args.cache:
        self.cache = 'engine'
    else:
        self.cache = cache

    if args.key:
        self.engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=self.cache)
    else:
        self.engine = EngineFactory(engine=args.engine, cache=self.cache, throttle=0.1)

    if args.domain:
        self.engine.site = args.domain

    if args.stopwordfile:
        self.stopwordfile = args.stopwordfile
    else:
        self.stopwordfile = None

    self.mq = 250
    if args.maxqueries:
        self.mq = args.maxqueries

    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    self.page_html = pc.get_page_sourcecode()
    print "Page loaded"

    # answer = raw_input("Do you want to use a percentage of this page? Enter y or n \n")
    # if answer == 'y':
    #     percent = raw_input("What percentage do you want to use? \n")
    #     if self.is_integer(percent):
    #         self.page_text = self.reduce_page(percentage=percent)
    #     else:
    #         print "input error, will exit"
    #         sys.exit(2)
    #     # todo: update so it asks again rather than exiting
    # else:
    self.page_text = self.page_html

    query_list = []
    answer = raw_input("Do you want to use only a position based extractor? Enter y or n \n")
    if answer == 'y' or answer != 'n':
        # If enter is hit then assume y.
        text = self.get_position_text()
        # todo: at this stage this could be single, bi or tri terms
        if self.stopwordfile:
            query_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
        else:
            query_gen = BiTermQueryGeneration(minlen=3)
        query_list = query_gen.extract_queries_from_text(text)
    elif answer == 'n':
        answer = raw_input("Do you want to use only a rank based extractor? Enter y or n \n")
        if answer == 'y' or answer != 'n':
            # If enter is hit then assume y.
            query_list = self.get_ranked_queries()
        elif answer == 'n':
            answer = raw_input("Do you want to use a rank based extractor combined with a position extractor? Enter y or n \n")
            if answer == 'y' or answer != 'n':
                # If enter is hit then assume y.
                text = self.get_position_text()
                query_list = self.get_ranked_queries(text)
            elif answer == 'n':
                print "sorry, that's all the options, system will exit"
                sys.exit(0)

    print "Queries generated: ", len(query_list)

    # Note: args.cutoff is parsed above but is not currently used; both branches
    # of the original if/else built an identical calculator, so it is collapsed here.
    prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)
    prc.score_page(self.url, query_list)

    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()

    print "\nRetrievability Scores for gravity beta=1.0"
    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print "Done!"
    return 0
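
# A minimal entry-point sketch, not present in the original source. It assumes
# this module is run directly as a script, that sys is imported at module level
# (it is already used above via sys.exit), and that main() defined earlier is
# the intended console entry point.
if __name__ == '__main__':
    sys.exit(main())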