Example #1
def read_html(self):
    print("Fetching page: %s" % self.url)
    pc = PageCapture(self.url)
    self.page_html = pc.get_page_sourcecode()
    print("Page loaded")
    # Keep the raw HTML as the working text for the page.
    self.page_text = self.page_html
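All of these snippets drive the same small `PageCapture` surface: construct it with a URL, then pull the page source. A minimal standalone sketch of just those calls (the import path is an assumption, and the URL is a placeholder):

# Standalone sketch of the PageCapture calls used above; the import path
# is an assumption, and the URL is a placeholder.
from ifind.common.pagecapture import PageCapture  # assumed import path

pc = PageCapture('http://www.example.com')
html = pc.get_page_sourcecode()  # page source as a string
print("Fetched %d characters" % len(html))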
Example #2
def main():

    parser = argparse.ArgumentParser(
        description="Take screenshots of web pages")
    parser.add_argument("-w",
                        "--width",
                        type=int,
                        default=800,
                        help="browser width (default=800)")
    parser.add_argument("-H",
                        "--height",
                        type=int,
                        default=600,
                        help="browser height (default=600)")
    parser.add_argument("-wp", "--webpage", type=str, help="webpage address")
    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        default="screen.png",
        help="filename of saved screenshot (default=screen.png)"
        "\nexpected format: .jpeg or .png")
    args = parser.parse_args()

    # args.filename always has a default, so only the webpage argument
    # needs checking.
    if not args.webpage:
        parser.print_help()
        return 2
    else:
        pc = PageCapture(args.webpage, args.width, args.height)
        pc.take_screen_shot(args.filename)
        title = pc.get_page_title()
        print("Screen shot of %s taken and saved to %s." % (title,
                                                            args.filename))
        return 0
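As written, main() assumes argparse and PageCapture are already in scope. A hedged sketch of the module scaffolding that would make the example runnable, with the imports at the top of the module (the PageCapture import path is an assumption):

# Hypothetical scaffolding for running main(); the PageCapture import path
# is an assumption.
import argparse
import sys

from ifind.common.pagecapture import PageCapture  # assumed import path

if __name__ == "__main__":
    sys.exit(main())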
Example #3
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture a screenshot and title for each URL and append the page
    metadata to a file.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category into which the pages fall
    :param halved_screen_shot: if True, save a cropped screenshot instead of the full page
    :return:
    """

    # For each url in the url_list
    f = open('page_meta_data.txt', 'a')
    for url in url_list:

        # create PageCapture object - specify the browser to be 800 x 600.
        try:
            pc = PageCapture(url, 800, 600)
            url_file_name = convert_url_to_filename(url) + '.png'
            # To change to accommodate the new changes
            image_file_name = os.path.join(DATA_DIR, url_file_name)
            pc.load_url(url)
            # fetch the screen-shot
            if halved_screen_shot:
                pc.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
                #pc.halve_screen_shot(image_file_name)
            else:
                pc.take_screen_shot(image_file_name)

            # get the title
            title = pc.get_page_title()
            # create page in models/db with category
            # Abdullah: using DATA_DIR did not work for me because it uses
            # the current working directory in the URL.

            # save to file instead of db here to decouple.
            f.write('%s,%s,%s,%s\n' % (category.name, url, title, image_file_name))
            print("written {0} to file.".format(title))

            #p = Page(category=category, title=title, is_shown=True, url=url, screenshot=os.path.join('/', MEDIA_ROOT, url_file_name))
            #p.save()
            #print('Page title= ' + p.title + ' has been saved!')

        except ValueError as e:
            print('Page has NOT been saved!')
            print('ERROR IS {0}'.format(e))
            continue
    f.close()
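`convert_url_to_filename` is not defined in any of these examples. A hypothetical stand-in that yields filesystem-safe names, enough to run the snippet:

import re

def convert_url_to_filename(url):
    # Hypothetical stand-in for the project's helper: drop the scheme and
    # collapse anything non-alphanumeric into underscores.
    name = re.sub(r'^https?://', '', url)
    return re.sub(r'[^0-9a-zA-Z]+', '_', name).strip('_')

# convert_url_to_filename('http://www.example.com/a/b') -> 'www_example_com_a_b'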
Example #4
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture a screenshot and title for each URL that is not already in the
    database, append the page metadata to a file, and save a Page record.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category into which the pages fall
    :param halved_screen_shot: if True, save a cropped screenshot instead of the full page
    :return:
    """

    # For each url in the url_list
    f = open('page_meta_data.txt', 'a')
    for url in url_list:

        p = Page.objects.filter(url=url)
        pf = None
        if p:
            pf = p[0]
        if not pf:
            # create PageCapture object - specify the browser to be 800 x 600.
            try:
                pc = PageCapture(url, 800, 600)
                url_file_name = convert_url_to_filename(url) + '.png'
                # To change to accommodate the new changes
                image_file_name = os.path.join(DATA_DIR, url_file_name)
                pc.load_url(url)
                # fetch the screen-shot
                if halved_screen_shot:
                    pc.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
                    #pc.halve_screen_shot(image_file_name)
                else:
                    pc.take_screen_shot(image_file_name)

                # get the title
                title = pc.get_page_title()
                # create page in models/db with category
                # Abdullah: using DATA_DIR did not work for me because it uses
                # the current working directory in the URL.

                # save to file instead of db here to decouple.
                f.write('%s,%s,%s,%s\n' % (category.name, url, title, image_file_name))
                print("written {0} to file.".format(title))

                p = Page(category=category, title=title, is_shown=True, url=url, screenshot=os.path.join('/', MEDIA_ROOT, url_file_name))
                p.save()
                print('Page title= ' + p.title + ' has been saved!')

            except ValueError as e:
                print('Page has NOT been saved!')
                print('ERROR IS {0}'.format(e))
                continue
        else:
            print("Already added: {0} ({1})".format(pf.title, pf.url))
    f.close()
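The existence check at the top of the loop (filter() plus manual indexing) can be written more directly with Django's QuerySet.first(), which returns the first match or None:

# Equivalent existence check using QuerySet.first().
pf = Page.objects.filter(url=url).first()
if pf is None:
    pass  # capture and save the page as above
else:
    print("Already added: {0} ({1})".format(pf.title, pf.url))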
Example #5
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture a screenshot and title for each URL, randomly varying the crop
    region, and save a Page record for it.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category into which the pages fall
    :param halved_screen_shot: if True, save a randomly chosen cropped screenshot
    :return:
    """

    # For each url in the url_list
    for url in url_list:

        # create PageCapture object - specify the browser to be 800 x 600.
        try:
            pc = PageCapture(url, 800, 600)
            url_file_name = convert_url_to_filename(url) + '.png'
            # To change to accommodate the new changes
            image_file_name = os.path.join(DATA_DIR, url_file_name)
            pc.load_url(url)
            # fetch the screen-shot
            if halved_screen_shot:
                if random.random() > 0.5:
                    pc.crop_screen_shot(image_file_name, 200, 400, 700, 900)
                else:
                    pc.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
                #pc.halve_screen_shot(image_file_name)
            else:
                pc.take_screen_shot(image_file_name)

            # get the title
            title = pc.get_page_title()
            # create page in models/db with category
            # Abdullah: using DATA_DIR did not work for me because it uses
            # the current working directory in the URL.
            p = Page(category=category, title=title, is_shown=True, url=url, screenshot=os.path.join('/', MEDIA_ROOT, url_file_name))
            p.save()
            print('Page title= ' + p.title + ' has been saved!')
        except ValueError as e:
            print('Page has NOT been saved!')
            print('ERROR IS {0}'.format(e))
            continue
Example #6
def main():

    parser = argparse.ArgumentParser(description="Take screenshots of web pages")
    parser.add_argument("-w", "--width", type=int, default=800, help="browser width (default=800)")
    parser.add_argument("-H", "--height", type=int, default=600, help="browser height (default=600)")
    parser.add_argument("-wp", "--webpage", type=str, help="webpage address")
    parser.add_argument(
        "-f",
        "--filename",
        type=str,
        default="screen.png",
        help="filename of saved screenshot (default=screen.png)" "\nexpected format: .jpeg or .png",
    )
    args = parser.parse_args()

    # args.filename always has a default, so only the webpage argument
    # needs checking.
    if not args.webpage:
        parser.print_help()
        return 2
    else:
        pc = PageCapture(args.webpage, args.width, args.height)
        pc.take_screen_shot(args.filename)
        title = pc.get_page_title()
        print("Screen shot of %s taken and saved to %s." % (title, args.filename))
        return 0
Example #7
def main():
    """
    Compute retrievability scores for a single web page.

    :return: 0 on success, 2 on argument error
    """
    parser = argparse.ArgumentParser(description="Page Calculator for pages")
    parser.add_argument("-u", "--url", type=str,
                        help="url address")
    parser.add_argument("-e", "--engine", type=str,
                        help="Name of search engine: " + str(ENGINE_LIST))
    parser.add_argument("-k", "--key", type=str,
                        help="API Key for search engine (if applicable)")
    parser.add_argument("-c", "--cutoff", type=int,
                        help="The cutoff value for queries")
    parser.add_argument("-m", "--maxqueries", type=int,
                        help="The maximum number of queries per page")
    parser.add_argument("-s", "--stopwordfile", type=str,
                        help="The file containing stopwords")
    parser.add_argument("-b", "--backgroundfile", type=str,
                        help="The file containing background term counts")
    parser.add_argument("-ca", "--cache",
                        action="store_true", default=False,
                        help="use cache")

    args = parser.parse_args()

    if not args.url:
        print("Check your URL argument")
        parser.print_help()
        return 2

    cache = None
    if args.cache:
        cache = 'engine'

    if args.key:
        engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=cache)
    else:
        print("cache is", cache)
        engine = EngineFactory(engine=args.engine, cache=cache, throttle=0.1)

    stopwordfile = None
    if args.stopwordfile:
        stopwordfile = args.stopwordfile

    mq = 50
    if args.maxqueries:
        mq = args.maxqueries

    backgroundfile = 'background.txt'
    if args.backgroundfile:
        backgroundfile = args.backgroundfile

    doc_extractor = SingleQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    query_generator = BiTermQueryGeneration(minlen=3, stopwordfile=stopwordfile)
    print("Loading background distribution")
    colLM = LanguageModel(file=backgroundfile)
    print("Background loaded, number of terms:", colLM.get_num_terms())

    print("Fetching page: %s" % args.url)
    pc = PageCapture(args.url)
    page_html = pc.get_page_sourcecode()
    print("Page loaded")
    doc_extractor.extract_queries_from_html(page_html)
    doc_term_counts = doc_extractor.query_count
    print("Number of terms in document: %d" % len(doc_term_counts))
    docLM = LanguageModel(term_dict=doc_term_counts)
    slm = BayesLanguageModel(docLM=docLM, colLM=colLM, beta=500)
    query_list = query_generator.extract_queries_from_html(page_html)

    print("Queries generated:", len(query_list))
    qr = OddsRatioQueryRanker(smoothed_language_model=slm)
    qr.calculate_query_list_probabilities(query_list)
    queries = qr.get_top_queries(mq)
    query_list = [query[0] for query in queries]

    prc = PageRetrievabilityCalculator(engine=engine)
    prc.score_page(args.url, query_list)

    print("\nRetrievability Scores for cumulative c=20")
    prc.calculate_page_retrievability(c=20)
    prc.report()

    print("\nRetrievability Scores for gravity beta=1.0")
    prc.calculate_page_retrievability(c=20, beta=1.0)
    prc.report()

    print("Done!")
    return 0
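The two reports correspond to the two standard retrievability utility functions: cumulative scoring credits any ranking at or above the cutoff c, while gravity scoring discounts by rank. A sketch of the idea; the actual PageRetrievabilityCalculator internals are not shown in these examples:

# Sketch of the two utility functions behind the reports above; the real
# PageRetrievabilityCalculator internals are not visible in this snippet.
def cumulative_utility(rank, c=20):
    # full credit whenever the page ranks at or above the cutoff c
    return 1.0 if 0 < rank <= c else 0.0

def gravity_utility(rank, c=20, beta=1.0):
    # credit decays with rank; beta controls how sharply
    return 1.0 / (rank ** beta) if 0 < rank <= c else 0.0

# A page's retrievability is the utility summed over all queries that
# retrieve it, e.g. sum(cumulative_utility(r) for r in ranks).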
Example #8
    def setup(self):
        """
        Parse the command-line arguments, fetch the page, and compute its
        retrievability scores.

        :return: 0 on success, 2 on argument error
        """
        parser = argparse.ArgumentParser(description="Page Calculator for pages")
        parser.add_argument("-u", "--url", type=str,
                            help="url address")
        parser.add_argument("-e", "--engine", type=str,
                            help="Name of search engine: " + str(ENGINE_LIST))
        parser.add_argument("-k", "--key", type=str,
                            help="API Key for search engine (if applicable)")
        parser.add_argument("-d", "--domain", type=str,
                            help="domain for search engine (if applicable, i.e. engine is sitebing, default is gla.ac.uk)")
        parser.add_argument("-c", "--cutoff", type=int,
                            help="The cutoff value for queries")
        parser.add_argument("-m", "--maxqueries", type=int,
                            help="The maximum number of queries per page")
        parser.add_argument("-s", "--stopwordfile", type=str,
                            help="The file containing stopwords")
        parser.add_argument("-ca", "--cache",
                            action="store_true", default=False,
                            help="use cache")
        #parser.add_argument("-ex","--experiment", type=int, help=" experiment number 1 - x")
        args = parser.parse_args()

        if not args.url:
            print("Check your URL argument")
            parser.print_help()
            return 2
        else:
            self.url = args.url

        self.cache = 'engine' if args.cache else None

        if args.key:
            self.engine = EngineFactory(engine=args.engine, api_key=args.key, throttle=0.1, cache=self.cache)
        else:
            self.engine = EngineFactory(engine=args.engine, cache=self.cache, throttle=0.1)

        if args.domain:
            self.engine.site = args.domain

        # argparse defaults to None when the flag is absent.
        self.stopwordfile = args.stopwordfile

        self.mq = 250
        if args.maxqueries:
            self.mq = args.maxqueries

        print("Fetching page: %s" % args.url)
        pc = PageCapture(args.url)
        self.page_html = pc.get_page_sourcecode()
        print("Page loaded")

        # answer = input("Do you want to use a percentage of this page? Enter y or n \n")
        # if answer == 'y':
        #     percent = input("What percentage do you want to use? \n")
        #     if self.is_integer(percent):
        #         self.page_text = self.reduce_page(percentage=percent)
        #     else:
        #         print("input error, will exit")
        #         sys.exit(2)
        #         #todo update so asks again, not exit
        # else:
        self.page_text = self.page_html

        query_list = []
        answer = input("Do you want to use only a position based extractor? Enter y or n \n")
        if answer != 'n':  # if enter is hit then assume y
            text = self.get_position_text()
            #todo at this stage this could be single, bi or tri terms
            if self.stopwordfile:
                query_gen = BiTermQueryGeneration(minlen=3, stopwordfile=self.stopwordfile)
            else:
                query_gen = BiTermQueryGeneration(minlen=3)
            query_list = query_gen.extract_queries_from_text(text)
        else:
            answer = input("Do you want to use only a rank based extractor? Enter y or n \n")
            if answer != 'n':  # if enter is hit then assume y
                query_list = self.get_ranked_queries()
            else:
                answer = input("Do you want to use a rank based extractor combined with a position extractor? Enter y or n \n")
                if answer != 'n':  # if enter is hit then assume y
                    text = self.get_position_text()
                    query_list = self.get_ranked_queries(text)
                else:
                    print("sorry, that's all the options, system will exit")
                    sys.exit(0)

        print("Queries generated:", len(query_list))
        # args.cutoff is parsed but unused: both branches of the original
        # if/else built the same calculator.
        prc = PageRetrievabilityCalculator(engine=self.engine, max_queries=self.mq)
        prc.score_page(self.url, query_list)

        print("\nRetrievability Scores for cumulative c=20")
        prc.calculate_page_retrievability(c=20)
        prc.report()

        print("\nRetrievability Scores for gravity beta=1.0")
        prc.calculate_page_retrievability(c=20, beta=1.0)
        prc.report()

        print("Done!")
        return 0
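The three nested prompts repeat the same "hitting enter means yes" convention. A hypothetical helper, not part of the original class, could centralise it:

# Hypothetical helper: anything except an explicit 'n' counts as consent,
# matching the prompt convention in setup() above.
def ask_yes_no(prompt):
    answer = input(prompt + " Enter y or n \n")
    return answer.strip().lower() != 'n'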
Example #9
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture a screenshot and title for each URL, randomly varying the crop
    region, and save a Page record for it.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category into which the pages fall
    :param halved_screen_shot: if True, save a randomly chosen cropped screenshot
    :return:
    """

    # For each url in the url_list
    for url in url_list:

        # create PageCapture object - specify the browser to be 800 x 600.
        try:
            pc = PageCapture(url, 800, 600)
            url_file_name = convert_url_to_filename(url) + '.png'
            # To change to accommodate the new changes
            image_file_name = os.path.join(DATA_DIR, url_file_name)
            pc.load_url(url)
            # fetch the screen-shot
            if halved_screen_shot:
                if random.random() > 0.5:
                    pc.crop_screen_shot(image_file_name, 200, 400, 700, 900)
                else:
                    pc.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
                #pc.halve_screen_shot(image_file_name)
            else:
                pc.take_screen_shot(image_file_name)

            # get the title
            title = pc.get_page_title()
            # create page in models/db with category
            # Abdullah: using DATA_DIR did not work for me because it uses
            # the current working directory in the URL.
            p = Page(category=category,
                     title=title,
                     is_shown=True,
                     url=url,
                     screenshot=os.path.join('/', MEDIA_ROOT, url_file_name))
            p.save()
            print('Page title= ' + p.title + ' has been saved!')
        except ValueError as e:
            print('Page has NOT been saved!')
            print('ERROR IS {0}'.format(e))
            continue