Example #1
0
    def set_engine(self):
        """Create the search engine for this object and keep it in ``self.engine``.

        ``self.cache`` is always set to ``'engine'``. The API key is handed to
        the factory only when ``self.key`` is truthy, and the engine's ``site``
        attribute is set only when ``self.domain`` is truthy.
        """
        # TODO: caching is currently always switched on; may want to change this.
        self.cache = 'engine'

        factory_options = {
            'engine': self.engine_name,
            'cache': self.cache,
            'throttle': 0.25,
        }
        if self.key:
            factory_options['api_key'] = self.key
        self.engine = EngineFactory(**factory_options)

        if self.domain:
            self.engine.site = self.domain
Example #2
0
def main():
    """Score every trending URL and append one CSV row per URL to RESULT_FILE.

    Each row holds, in order: category name, url, findability, the value
    returned by ``score_page`` and the ``retrieved`` and ``query_count``
    entries of the calculator's stats.
    """
    # NOTE(review): the API key is hard-coded below; consider moving it to
    # configuration instead of keeping it in the source.
    search_engine = EngineFactory(
        engine='Sitebing',
        api_key="msRh5UoZzyV3qvroEpzXMzbZEVjW3ENfTGMAQO1yuRc",
        throttle=0.1,
        cache='engine')

    generator = TriTermQueryGeneration(
        minlen=TERM_LEN, stopwordfile=STOPWORD_FILE, maxsize=100)
    trending = get_trending_queries(URL_FILE)
    calculator = PageRetrievabilityCalculator(
        engine=search_engine, cutoff=CUTOFF, generator=generator)

    row_template = '{0},{1},{2},{3},{4},{5}\n'
    with open(RESULT_FILE, 'a') as out:
        for entry in trending:
            # Each entry is indexed as (findability, url, category name).
            findability, url, category_name = entry[0], entry[1], entry[2]
            retrievability = calculator.score_page(url)
            stats = calculator.stats()
            out.write(row_template.format(
                category_name, url, findability, retrievability,
                stats['retrieved'], stats['query_count']))
Example #3
0
 def setUp(self):
     """Prepare the logger, search engine, target url and calculator for a test."""
     self.logger = logging.getLogger("TestPageRetCalc")
     # The engine is currently govuk; this may need to change.
     govuk_engine = EngineFactory(engine="govuk")
     self.engine = govuk_engine
     # The url may need to be changed as well.
     self.url = "https://www.gov.uk/renew-adult-passport"
     self.pg_calc = PageRetrievabilityCalculator(engine=govuk_engine)
Example #4
0
def fetch_results(queries_list):
    """Search bing for each query and collect (category, url, rank) tuples.

    Every item of ``queries_list`` is indexed as ``item[0]`` (trend category)
    and ``item[1]`` (search term). A rank that is missing from a response is
    reported with a printed message and skipped. The collected list is
    printed and then returned.
    """
    bing = EngineFactory('bing', api_key=API_KEY)
    collected = []
    for term in queries_list:
        response = bing.search(Query(term[1], top=30))
        # TODO: implement select_ranks properly, maybe (num_to_select, step)
        # TODO: make the select_ranks values arguments
        for rank in select_ranks(6, 10):
            try:
                row = (term[0], response.results[rank].url, rank)
            except IndexError:
                print "index error.."
            else:
                collected.append(row)

    print list(collected)
    return collected
Example #5
0
def fetch_results(queries_list):
    """Return a list of (category, url, rank) tuples for the given queries.

    Each entry of ``queries_list`` supplies the trend category at index 0 and
    the search term at index 1. The term is searched on bing and the url at
    every selected rank is recorded; ranks absent from the response print a
    message and are skipped. The final list is also printed.
    """
    search_engine = EngineFactory('bing', api_key=API_KEY)
    rows = []
    for term in queries_list:
        bing_query = Query(term[1], top=30)
        response = search_engine.search(bing_query)
        # TODO: implement select_ranks properly, maybe (num_to_select, step)
        # TODO: make the select_ranks values arguments
        wanted_ranks = select_ranks(6, 10)
        for rank in wanted_ranks:
            try:
                found = (term[0], response.results[rank].url, rank)
            except IndexError:
                print "index error.."
                continue
            rows.append(found)

    print list(rows)
    return rows
Example #6
0
    def test_game_scoring(self):
        """Check page count and query scoring of GameMechanic with the Dummy engine.

        Creates a game for the 'Numbers' category and expects it to hold four
        pages, then issues the query 'one', takes the points, moves to the
        next page and expects the current score to be 1000.
        """
        self.logger.info("Testing Game Scoring with a Dummy SearchEngine")
        se = EngineFactory("Dummy")
        u = User.objects.get(username='******')
        c = Category.objects.get(name='Numbers')

        gm = GameMechanic(se)
        gm.create_game(u, c)
        self.logger.info("Checking if the category Numbers has four pages.")
        # assertEqual replaces the deprecated assertEquals alias.
        self.assertEqual(len(gm.pages), 4)

        gm.handle_query('one')
        gm.take_points()
        gm.set_next_page()
        # Adjacent string literals are joined at compile time, so the logged
        # message no longer carries the source indentation that a backslash
        # continuation inside the literal embedded in it.
        self.logger.info("Checking whether the query, one, scores 1000 points - "
                         "which it should given the data and dummy search engine")
        self.assertEqual(gm.get_current_score(), 1000)
Example #7
0
def main():
    """

    :return:
    """
    parser = argparse.ArgumentParser(
                                description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="url address")
    parser.add_argument("-e","--engine",type=str,
                        help="Name of search engine: " + ENGINE_LIST.__str__())
    parser.add_argument("-k","--key",type=str,
                        help="API Key for search engine (if applicable)")
    parser.add_argument("-c","--cutoff", type=int,
                        help ="The cutoff value for queries")
    parser.add_argument("-m","--maxqueries", type=int,
                        help ="The maximum number of queries per page")
    parser.add_argument("-s","--stopwordfile", type=str,
                        help ="The filename name containing stopwords")
    parser.add_argument("-b","--backgroundfile", type=str,
                        help ="The filename name containing background term counts")
    parser.add_argument("-ca", "--cache",
                  action="store_true", default=False,
                  help="use cache")


    args = parser.parse_args()

    if not args.url:
        print "Check your URL argument"
        parser.print_help()
        return 2

    cache = None
    if args.cache:
        cache = 'engine'

    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print "cache is ", cache
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile

    mq = 50
    if args.maxqueries:
        mq = args.maxqueries

    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile

    doc_extractor = SingleQueryGeneration(minlen=3,stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    print "Loading background distribution"
    colLM = LanguageModel(file=backgroundfile)
    print "Background loaded, number of terms: ", colLM.get_num_terms()

    print "Fetching page: %s" % (args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print "Page loaded"
    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print "Number of terms in document: %d" % (len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(page_html)

    print "Queries generated: ", len(query_list)
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    scored_queries = qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = []
    for query in queries:
        query_list.append(query[0])


    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)

    print "\nRetrievability Scores for cumulative c=20"
    prc.calculate_page_retrievability(c=20)
    prc.report()
    print "\nRetrievability Scores for gravity beta=1.0"

    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print "Done!"
    return 0
Example #8
0
    def setup(self):
        """

        :return:
        """
        parser = argparse.ArgumentParser(
                                    description="Page Calculator for pages")
        parser.add_argument("-u", "--url", type=str,
                            help="url address")
        parser.add_argument("-e","--engine",type=str,
                            help="Name of search engine: " + ENGINE_LIST.__str__())
        parser.add_argument("-k","--key",type=str,
                            help="API Key for search engine (if applicable)")
        parser.add_argument("-d","--domain",type=str,
                            help="domain for search engine (if applicable, i.e. engine is sitebing, default is gla.ac.uk)")
        parser.add_argument("-c","--cutoff", type=int,
                            help ="The cutoff value for queries")
        parser.add_argument("-m","--maxqueries", type=int,
                            help ="The maximum number of queries per page")
        parser.add_argument("-s","--stopwordfile", type=str,
                            help ="The filename name containing stopwords")
        parser.add_argument("-ca", "--cache",
                      action="store_true", default=False,
                      help="use cache")
        #parser.add_argument("-ex","--experiment", type=int, help=" experiment number 1 - x")
        args = parser.parse_args()

        if not args.url:
            print "Check your URL argument"
            parser.print_help()
            return 2
        else:
            self.url = args.url
#    cache = None
#    if args.cache:
#        cache = 'engine'

#    if args.key:
#        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
#    else:
#        print "cache is ", cache
#        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)


        cache = None
        if args.cache:
            self.cache = 'engine'
        else:
            self.cache = cache

        if args.key:
            self.engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=self.cache)
        else:
            self.engine = EngineFactory(engine=args.engine, cache=self.cache, throttle=0.1)

        if args.domain:
            self.engine.site = args.domain

        stopwordfile = None
        if args.stopwordfile:
            self.stopwordfile = args.stopwordfile
        else:
            self.stopwordfile = None

        self.mq = 250
        if args.maxqueries:
            self.mq = args.maxqueries

        print "Fetching page: %s" % (args.url)
        pc = PageCapture(args.url)
        self.page_html = pc.get_page_sourcecode()
        print "Page loaded"
        self.page_text = ''

        # answer = raw_input("Do you want to use a percentage of this page? Enter y or n \n")
        # if answer == 'y':
        #     percent = raw_input("What percentage do you want to use? \n")
        #     if self.is_integer(percent):
        #         self.page_text = self.reduce_page(percentage=percent)
        #     else:
        #         print "input error, will exit"
        #         sys.exit(2)
        #         #todo update so asks again, not exit
        # else:
        self.page_text = self.page_html

        query_list = []
        answer = raw_input("Do you want to use only a position based extractor? Enter y or n \n")
        if answer == 'y' or answer != 'n': #if enter is hit then assume y
            text = self.get_position_text()
            #todo at this stage this could be single, bi or tri terms
            query_gen = None
            if self.stopwordfile:
                query_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
            else:
                query_gen = BiTermQueryGeneration(minlen=3)
            query_list = query_gen.extract_queries_from_text(text)
        elif answer == 'n':
            answer = raw_input("Do you want to use only a rank based extractor? Enter y or n \n")
            if answer == 'y' or answer != 'n': #if enter is hit then assume y:
                query_list = self.get_ranked_queries()
            elif answer == 'n':
                answer = raw_input("Do you want to use a rank based extractor combined with a position extractor? Enter y or n \n")
                if answer == 'y' or answer != 'n': #if enter is hit then assume y:
                    text = self.get_position_text()
                    query_list = self.get_ranked_queries(text)
                elif answer == 'n':
                    print "sorry, that's all the options, system will exit"
                    sys.exit(0)

        print "Queries generated: ", len(query_list)
        prc = None
        if args.cutoff:
            prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)
        else:
            prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)
        prc.score_page(self.url, query_list)

        print "\nRetrievability Scores for cumulative pce=20"
        prc.calculate_page_retrievability(c=20)
        prc.report()
        print "\nRetrievability Scores for gravity beta=1.0"

        prc.calculate_page_retrievability(c=20, beta=1.0)
        prc.report()

        print "Done!"
        return 0
Example #9
0
def main():

    logger = create_ifind_logger('test_game_mech.log')
    logger.info("Program started")
    logger.info('Testing game mechanics')

    print "This script is to test the GameMechanics and interaction with the Models"

    ds = EngineFactory("Dummy")

    gm = GameMechanic(ds)
    print gm
    u = User.objects.filter(username='******')
    if u:
        u = u[0]
    else:
        print "Adding testy user"
        u = User(username='******', password='******')
        u.save()

    c = Category.objects.filter(name='Numbers')

    if c:
        c = c[0]
    else:
        print "Adding a Numbers Category"
        c = Category(name='Numbers',
                     desc='Looking for sites that around about numbers')
        c.save()

    pages = Page.objects.filter(category=c)

    if not pages:
        print "Adding pages"

        for pn in ['one', 'two', 'three', 'four']:
            p = Page(category=c,
                     title=pn,
                     url='www.' + pn + '.com',
                     snippet=pn,
                     desc=('desc: ' + pn))
            p.save()

        pages = Page.objects.filter(category=c)

    print u
    print c
    print pages

    gm.create_game(u, c)

    print gm

    print "Game is set up to play"
    raw_input('Press enter to continue')

    while not gm.is_game_over():
        clear_screen()
        print gm
        last_query = gm.get_last_query()
        if last_query:
            print "\nLast Query: %s and Query Score: %d" % (
                last_query, gm.get_last_query_score())
        state = handle_game_input()
        if state == 1:
            gm.take_points()
            gm.set_next_page()
            state = 0
        if state == 2:
            query = handle_query_input()
            gm.handle_query(query)

    print '\nGame Over!!\n'
    print gm
    logger.info("Done!")