Example #1
def test_page_indexability():
    """ Test to make sure indexability carries through from item to webpage
    """

    webpage_index = Webpage(TEST_WEBSITE)
    webpage_index.item_from_md_filename('001-basic-article.md')

    webpage_dont_index = Webpage(TEST_WEBSITE)
    webpage_dont_index.item_from_md_filename('009-unindexed-article.md')

    # Don't include noindex tag for article page that SHOULD be indexed
    assert '<meta name="robots" content="noindex">' not in webpage_index.html

    # Include noindex tag for article page that should NOT be indexed
    assert '<meta name="robots" content="noindex">' in webpage_dont_index.html
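The two assertions above imply that Webpage only emits the robots noindex tag when the source item opts out of indexing. A minimal sketch of that rule, assuming a boolean indexable flag (the flag and helper below are hypothetical, not part of the API shown):

NOINDEX_TAG = '<meta name="robots" content="noindex">'

def robots_meta_tag(item_indexable):
    # Hypothetical helper: emit the noindex tag only for items that
    # should be kept out of search engine indexes.
    return '' if item_indexable else NOINDEX_TAG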
Example #2
def getUrlTexts(urlList):
    """ Lazily yield the text of each URL """

    for url in urlList:
        page = Webpage(url)
        #data = tokenizeDocText(page.text)
        yield page.text
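A short usage sketch for the generator above; because it yields lazily, each Webpage is only constructed (and fetched) when the consumer asks for the next text. The URLs below are placeholders:

urls = ['http://example.com/a', 'http://example.com/b']  # placeholder URLs
for text in getUrlTexts(urls):
    # the Webpage for each url is created only at this point
    print(len(text))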
Example #3
 def crawl(self):
     #start crawling
     #myopener = MyOpener()
     while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         print("%s, %s" % (-1 * work_url[0], work_url[1]))
         #page = urllib2.urlopen(work_url)
         '''page = myopener.open(work_url)
         self.pagesCount += 1
         soup = BeautifulSoup(page)
         links = soup.find_all('a')'''
         page = Webpage(work_url[1])
         self.pagesCount += 1
         for link in page.outgoingUrls:
             url = link.address
             if url != None and url != '':
                 if url.find('?') != -1:
                     url = url.split('?')[0]
                 if not self.exists(url, self.visited):
                     if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                         url_score = self.scorer.calculate_score(
                             link.getAllText())
                         self.totalPagesCount += 1
                         if url_score > 0.1:
                             self.priorityQueue.push(
                                 ((-1 * url_score), url))
                             self.relevantPagesCount += 1
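The crawler examples in this collection push tuples such as ((-1 * url_score), url) and pop the smallest tuple first, so negating the score makes the highest-scoring URL come out of the queue first. The PriorityQueue class itself is not shown; a minimal heap-backed sketch of the interface these crawlers assume (names and methods are assumptions):

import heapq

class SimplePriorityQueue:
    """Hypothetical stand-in for the crawlers' PriorityQueue."""

    def __init__(self):
        self.queue = []

    def push(self, item):
        # item starts with -score, so the highest-scoring URL is the
        # smallest tuple and is popped first by the min-heap
        heapq.heappush(self.queue, item)

    def pop(self):
        return heapq.heappop(self.queue)

    def isempty(self):
        return len(self.queue) == 0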
Example #4
def test_webpage_from_single_article():
    """ Test creating an article item page using item_from_md_filename()
    """

    webpage = Webpage(TEST_WEBSITE)
    webpage.item_from_md_filename('001-basic-article.md')

    # Page title should be "Article title - Website name"
    title = 'This is the heading - Test website name'
    assert webpage.title == title
    assert webpage.html.count('<title>' + title + '</title>') == 1

    # Page should use static page template
    assert '<p>Article page template</p>' in webpage.html

    # List page header should be present
    assert webpage.html.count('<div>list page header</div>') == 1

    # Webpage should contain the text from the article
    assert webpage.html.count('<p>And here is some text...</p>') == 1

    # Article item footer should be present
    assert webpage.html.count('<footer class="article-footer">') == 1

    # Filename for webpage should be based on the article
    article = Item(TEST_WEBSITE)
    article.from_md_filename('001-basic-article.md')
    assert webpage.filename == article.filename

    # Body should have class='magnetizer-article-page'
    assert webpage.html.count("<body class='magnetizer-article-page'>") == 1

    # Twitter card should be present
    assert '<meta name="twitter:card" content="summary_large_image" />' in webpage.html

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in webpage.html

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in webpage.html

    # Includes should be included, as per the .md file
    assert webpage.html.count("<div class='include'>Include 1</div>") == 2
    assert webpage.html.count("<div class='include'>Include 2</div>") == 1
    assert webpage.html.count("<div class='include'>Include 3</div>") == 1
    assert "[ ERROR: Include 'inexistent_file.html' does not exist! ]" in webpage.html

    # No html comments should be left in page
    assert '<!--' not in webpage.html

    # Meta description should be pulled in from article
    assert '<meta name="description" content="Meta description from article">' in webpage.html

    # Footnote link should have been added
    assert "<a href='#1'>[1]</a>" in webpage.html

    # Footnote anchor should have been added
    assert "<a id='1'></a>[1]:" in webpage.html
Example #5
    def crawl(self):
        #start crawling
        #myopener = MyOpener()
        self.harvestRatioData = []
        self.relevantPages = []
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url[1])
            #print ("%s, %s") % (-1 * work_url[0], work_url[1])
            #page = urllib2.urlopen(work_url)
            '''page = myopener.open(work_url)
            self.pagesCount += 1
            soup = BeautifulSoup(page)
            links = soup.find_all('a')'''
            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text)
            else:
                page_score = 0

            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append(
                    (self.relevantPagesCount, self.pagesCount))
                print(("%s," + str(page_score) + ", %s") % (-1 * work_url[0], work_url[1]))
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        if url.find('?') != -1:
                            url = url.split('?')[0]
                        if url.find('#') != -1:
                            url = url.split('#')[0]

#                         if url.startswith('http') == False:
#                             parts = page.pageUrl[1].split("://")
#                             baseUrl = parts[1].split("/")[0]
#                             baseUrl = parts[0] +"://" + baseUrl
#                             url = baseUrl + url

#if not self.existsInVisited(url,self.visited):
                        if url not in self.visited:
                            #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):
                            if url.startswith('http') and not self.exists(
                                    url, self.priorityQueue.queue):
                                url_score = self.scorer.calculate_score(
                                    link.getAllText())
                                self.totalPagesCount += 1
                                #tot_score = (page_score + url_score)/2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score),url))
                                    self.priorityQueue.push(
                                        ((-1 * tot_score), url, page.pageId))
Example #6
def main(models=None):
    models = [] if models is None else models.split(',')
    models.insert(0, 'original')
    print("Generate results -> " + str(models))
    url = input("Enter a URL\n")
    layout_type = input("Select a type (1-7)\n")
    webpage = Webpage(url, layout_type)

    # Capture a screenshot
    screenshot = webpage.get_screenshot('screen-pc.png', 1280, 800)
    # Save the HTML
    webpage.save_html()

    for model in models:
        print(model)
        # Prepare the CSV files
        csv_tags = Csv('./working/tag_list_' + model + '.csv')
        csv_tags_custom = Csv('./working/tag_list_custom_' + model + '.csv')
        default_row = [
            'class or id', 'tag_name', 'start_x', 'start_y', 'size_w',
            'size_h', 'average_color', 'salient_level', 'element_area'
        ]
        csv_tags.writerow(default_row)
        csv_tags_custom.writerow(default_row)

        # Half-size saliency map
        if model == 'original':
            # Generate the saliency map
            saliency_map = webpage.get_saliency_map(screenshot)
            resize_saliency_map = Image(saliency_map).get_halfsize()
        elif model == 'original-mlnet':
            saliency_map = Image(cv2.imread('./data/mlnet.png', 1))
            resize_saliency_map = saliency_map.get_trimming((1280, 726))
            resize_saliency_map = cv2.cvtColor(resize_saliency_map,
                                               cv2.COLOR_BGR2GRAY)
        else:
            saliency_map = Image(cv2.imread('./data/' + model + '.png', 1))
            resize_saliency_map = saliency_map.get_trimming((1280, 726))
            resize_saliency_map = cv2.cvtColor(resize_saliency_map,
                                               cv2.COLOR_BGR2GRAY)

        # Get the size and saliency of each element
        Element.canvas = Image(resize_saliency_map)
        Element.layout_type = webpage.layout_type
        Element.model = model
        print('Total saliency of the webpage: ' + str(Element.GetTotalSaliency()))

        GetElementInfo(webpage, csv_tags, csv_tags_custom)

        # Close the CSVs and the WebDriver
        csv_tags.close()
        csv_tags_custom.close()
        CreateRegionMap(model)

    webpage.driver.quit()
    getFinalLine()
    getFinalTile()
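The script above records each element's position, size and salient_level against a half-size saliency map. A rough sketch of how a per-element saliency value could be read from that map; this is an assumption about Element's internals, which are not shown (the 0.5 scale matches get_halfsize above):

import numpy as np

def element_saliency(saliency_map, start_x, start_y, size_w, size_h, scale=0.5):
    # Hypothetical: average the grayscale saliency map over the element's
    # bounding box, scaled down to the half-size map's coordinates.
    x, y = int(start_x * scale), int(start_y * scale)
    w, h = int(size_w * scale), int(size_h * scale)
    region = saliency_map[y:y + h, x:x + w]
    return float(np.mean(region)) if region.size else 0.0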
Example #7
 def crawl(self):
     self.harvestRatioData = []
     self.relevantPages = []
     
     while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
     
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         page = Webpage(work_url,self.pagesCount)
         if page.text == '':
             continue
         page_score = 0.0
         if self.combineScore:
             if len(page.text) > 0:
                 page_score = self.scorer.calculate_score(page.text,'W')
             else:
                 continue
             page.estimatedScore = page_score
             if self.restricted:
                 if page_score < self.pageScoreThreshold:
                     continue
             
         #print -1 * work_url[0],",", str(page_score),",",work_url[1],",", work_url[3]
         print -1 * work_url[0],",",work_url[1],",", work_url[3]
         self.pagesCount += 1
         
         page.getUrls()
         self.relevantPages.append(page)
         for link in page.outgoingUrls:
             url = link.address
             if url != None and url != '':
                 url = url.strip()
                 if url.find('?')!= -1:                            
                     url = url.split('?')[0]
                 if url.find('#') != -1:
                     url = url.split('#')[0]
                 
                 if url.endswith(("comment","comment/","feed","comments","feed/","comments/",".rss","video","video/","link","gif","jpeg","mp4","wav","jpg","mp3","png","share.php","sharer.php","login.php","print","print/","button/","share","email","submit","post",".pdf") ):    
                     continue
                 if not self.exists(url,1):
                     #tot_score = 0.0
                     if url.startswith('http') and not self.exists(url,2):                            
                         if self.mode == 1:
                             url_score = self.scorer.calculate_score(link.getAllText(),'U')
                             if self.combineScore:
                                 tot_score= 0.5 *page_score + 0.5 *url_score
                             else:
                                 tot_score = url_score
                             #if tot_score >= self.urlScoreThreshold:
                             self.priorityQueue.push(((-1 * tot_score),url,page.pageId,link.getAllText()))
                         #else:
                         #    self.priorityQueue.push(((-1 * page_score),url,page.pageId,link.getAllText()))
         #else:
         #    self.pages.append((page,0))
                                 
     print self.priorityQueue.isempty()
Example #8
 def __init__(self, seedUrls):
     self.exclude_words = ['ads', 'print', 'advertisement']
     self.seedUrls = seedUrls
     super(TFIDF_Scorer, self).__init__(None)
     self.seedPages = []
     self.avgdl = 0
     for url in self.seedUrls:
         page = Webpage(url)
         data = self.cleanDoc(page.text)
         self.seedPages.append(data)
         self.avgdl += len(data)
     self.buildModel(self.seedPages)
Example #9
def test_pagination_previous_only():
    """ Test that webpage.pagination_html() returns previous page correctly when no
    next page
    """
    webpage = Webpage(TEST_WEBSITE)
    webpage.url_previous = 'page-1.html'

    result = '<nav class="magnetizer-pagination"><ul>'
    result += '<li><a href="page-1.html" class="magnetizer-previous">Newer posts</a></li>'
    result += '</ul></nav>'

    assert webpage.pagination_html() == result
Example #10
    def crawl(self):
        #start crawling
        #myopener = MyOpener()
        self.harvestRatioData = []
        self.relevantPages = []
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url)

            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text)
            else:
                page_score = 0

            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append(
                    (self.relevantPagesCount, self.pagesCount))
                print("%s, %s" % (-1 * work_url[0], work_url[1]))
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        #if url.find('?')!= -1:
                        #    url = url.split('?')[0]
                        if url.startswith("/"):
                            base = page.pageUrl[1][7:].split("/")[0]
                            url = "http://" + base + url
                        if not self.exists(url, self.visited):
                            if url.startswith('http') and url.find('#') == -1 and not self.priorityQueue.exists(url):  #self.exists(url,self.priorityQueue.queue):
                                url_score = self.scorer.calculate_score(
                                    link.getAllText())
                                self.totalPagesCount += 1
                                #tot_score = (page_score + url_score)/2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score),url))
                                    self.priorityQueue.push(
                                        ((-1 * tot_score), url, page.pageId))
                                    #self.relevantPagesCount += 1

                self.priorityQueue.next()
Example #11
def test_pagination_previous_and_next():
    """ Test that webpage.pagination_html() returns next and previous pages correctly
    when both are available
    """

    webpage = Webpage(TEST_WEBSITE)
    webpage.url_previous = 'page-3.html'
    webpage.url_next = 'page-5.html'

    result = '<nav class="magnetizer-pagination"><ul>'
    result += '<li><a href="page-3.html" class="magnetizer-previous">Newer posts</a></li>'
    result += '<li><a href="page-5.html" class="magnetizer-next">Older posts</a></li>'
    result += '</ul></nav>'

    assert webpage.pagination_html() == result
Example #12
def test_static_item_page():
    """ Test creating a static item page using item_from_md_filename()
    """

    webpage = Webpage(TEST_WEBSITE)
    webpage.item_from_md_filename('dont-show-on-list-page.md')

    # Page title should be "Article title - Website name"
    title = 'This post should not be in the index - Test website name'
    assert webpage.title == title
    assert webpage.html.count('<title>' + title + '</title>') == 1

    # Page should use static page template
    assert '<p>Static page template</p>' in webpage.html

    # List page header should NOT be present
    assert webpage.html.count('<div>list page header</div>') == 0

    # Webpage should contain the text from the article
    assert webpage.html.count("<p>That's why it doesn't start with") == 1

    # Article footer should NOT be present
    assert webpage.html.count('<footer>footer</footer>') == 0

    # Filename for webpage should be based on the article
    article = Item(TEST_WEBSITE)
    article.from_md_filename('dont-show-on-list-page.md')
    assert webpage.filename == article.filename

    # Body should have class='magnetizer-static-page'
    assert webpage.html.count("<body class='magnetizer-static-page'>") == 1

    # Twitter card should be present
    assert '<meta name="twitter:card" content="summary_large_image" />' in webpage.html

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in webpage.html

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in webpage.html

    # No html comments should be left in page
    assert '<!--' not in webpage.html
Example #13
def test_includes():
    """ Test of webpage.includes()
    """

    webpage = Webpage(TEST_WEBSITE)
    webpage.html = '<h1>Some html</h1>'
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include1.html -->"
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include2.html -->"
    webpage.html += '<div>More html...</div>'
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include3.html -->"
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include1.html -->"

    correct_includes = ['_include1.html', '_include2.html', '_include3.html']
    includes = webpage.includes()

    # Set should contain each include from the html
    for correct_include in correct_includes:
        assert correct_include in includes

    assert len(includes) == len(correct_includes)
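The test above expects webpage.includes() to return the distinct filenames referenced by MAGNETIZER_INCLUDE comments (the repeated _include1.html counts once). A minimal sketch consistent with those assertions; the real implementation is not shown here:

import re

def includes_from_html(html):
    # Hypothetical: collect the filename after each MAGNETIZER_INCLUDE
    # marker, de-duplicated by returning a set.
    return set(re.findall(r'<!-- MAGNETIZER_INCLUDE (\S+) -->', html))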
Example #14
def test_webpage_write():
    """ Test of webpage.write()
    """

    result = "This is a test!"

    webpage = Webpage(TEST_WEBSITE)
    webpage.html = result
    webpage.filename = 'my-post.html'
    webpage.write()

    # File should have the correct contents
    with open(
            TEST_WEBSITE.config.value('output_path') + webpage.filename,
            'r') as myfile:
        assert myfile.read() == result

    # Page should be included in sitemap
    assert 'https://example.com/my-post.html' in TEST_WEBSITE.sitemap.pages

    TEST_WEBSITE.wipe()
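Based on the assertions above, webpage.write() saves the page's html under the configured output_path and registers the page URL in the site's sitemap. A sketch of that behaviour, assuming sitemap.pages is a plain list and that the base URL comes from configuration (neither is shown here):

def write_page(website, filename, html, base_url='https://example.com/'):
    # Hypothetical free-function version of webpage.write()
    with open(website.config.value('output_path') + filename, 'w') as out:
        out.write(html)
    website.sitemap.pages.append(base_url + filename)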
Example #15
 def crawl(self):
     #start crawling
     while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         page = Webpage(work_url[1])
         page_score = self.scorer.calculate_score(page.text)
         if (page_score > self.threshold):
             self.relevantPagesCount += 1
             print("%s, %s" % (-1 * work_url[0], work_url[1]))
         self.pagesCount += 1
         for link in page.outgoingUrls:
             url = link.address
             if url != None and url != '':
                 if url.find('?')!= -1:
                     url = url.split('?')[0]
                 if not self.exists(url,self.visited):
                     if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):                            
                         url_score = self.scorer.calculate_score(link.getAllText())
                         self.totalPagesCount +=1
                         tot_score = (page_score + url_score)/2.0
                         if tot_score > self.threshold:
                             self.priorityQueue.push(((-1 * tot_score),url))
Example #16
def test_pagination_none():
    """ Test that webpage.pagination_html() returns None when no pagination needed
    """

    webpage = Webpage(TEST_WEBSITE)
    assert webpage.pagination_html() is None
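Taken together, the three pagination tests (previous only, previous and next, none) pin down the markup pagination_html() must produce. A sketch consistent with those assertions, written as a free function and assuming url_previous/url_next default to None (the attribute defaults are not shown):

def pagination_html(url_previous=None, url_next=None):
    # Hypothetical reconstruction from the test expectations above
    if url_previous is None and url_next is None:
        return None
    html = '<nav class="magnetizer-pagination"><ul>'
    if url_previous is not None:
        html += ('<li><a href="%s" class="magnetizer-previous">'
                 'Newer posts</a></li>') % url_previous
    if url_next is not None:
        html += ('<li><a href="%s" class="magnetizer-next">'
                 'Older posts</a></li>') % url_next
    return html + '</ul></nav>'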
Example #17
    def enhanced_crawl(self):
            #start crawling
            #myopener = MyOpener()
            self.harvestRatioData = []
            self.relevantPages = []
            while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
                work_url = self.priorityQueue.pop()
                self.visited.append(work_url[1])
                #print ("%s, %s") % (-1 * work_url[0], work_url[1])
                #page = urllib2.urlopen(work_url)
                '''page = myopener.open(work_url)
                self.pagesCount += 1
                soup = BeautifulSoup(page)
                links = soup.find_all('a')'''
                #print work_url[1]
                try:
                    req = urllib2.Request(work_url[1])
                    # create a request object

                    handle = urllib2.urlopen(req)
                    # and open it to return a handle on the url
                except urllib2.URLError, e:
                    # ignore error, URL timed out
                    pass

                else:
                    html = handle.read()
                    soup = BeautifulSoup(html)
                    paras = soup.findAll('p')
                    #print paras
                    text = ""
                    for para in paras:
                            text = text + " " + para.text
                                    
                    page = Webpage(work_url,self.pagesCount)
                    if len(page.text) > 0:
                        page_score = self.scorer.calculate_smart_score(text, work_url[1])
                    else:
                        page_score = 0
                        
                    self.pagesCount += 1
                    if (page_score > self.pageScoreThreshold):
                        page.getUrls()
                        self.relevantPagesCount += 1
                        self.relevantPages.append(page)
                        self.harvestRatioData.append((self.relevantPagesCount,self.pagesCount))
                        print ("%s|"+ str(page_score)+"|%s") % (-1.0 * work_url[0], work_url[1])
                        for link in page.outgoingUrls:
                            url = link.address
                            if url != None and url != '':
                                if url.find('?')!= -1:
                                    url = url.split('?')[0]
                                if url.find('#') != -1:
                                    url = url.split('#')[0]
                                    
            #                         if url.startswith('http') == False:
            #                             parts = page.pageUrl[1].split("://")
            #                             baseUrl = parts[1].split("/")[0]
            #                             baseUrl = parts[0] +"://" + baseUrl
            #                             url = baseUrl + url
                                    
                                    #if not self.existsInVisited(url,self.visited): 
                                if url not in self.visited:
                                        #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):                            
                                    if url.startswith('http') and not self.exists(url,self.priorityQueue.queue):
                                        url_score = self.url_scorer.calculate_score(link.getAllText())
                                        self.totalPagesCount +=1
                                        #tot_score = (page_score + url_score)/2.0
                                        #tot_score = page_score + url_score
                                        tot_score = url_score
                                        if tot_score > self.urlScoreThreshold:
                                            #self.priorityQueue.push(((-1 * url_score),url))
                                            self.priorityQueue.push(((-1 * tot_score),url,page.pageId))
Example #18
    def crawl(self):
        self.harvestRatioData = []
        self.relevantPages = []
        webpages = []
        count = 0
        ftext = open(self.pagesDir + "webpagesTxt.txt", "w")
        webpageLabel = 0  # 0 for Non-relevant and 1 for Relevant
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():

            work_url = self.priorityQueue.pop()
            url = work_url[1]
            #if self.exists(url,1):
            #    continue
            if url in self.visited:
                continue
            #self.visited.append(url)#work_url[1])
            self.visited[url] = 1
            page = Webpage(work_url, self.pagesCount)
            if page.text == '':
                continue

            page.estimatedScore = 0
            if self.combineScore:
                page_score = 0
                if len(page.text) > 0:
                    #page_score = self.scorer.calculate_score(page.text,'W')[1]
                    page_score = self.scorer.calculate_score(page, 'W')[1]
                    if page_score == -1:
                        continue
                else:
                    print 'page text is empty'
                    continue

                page.estimatedScore = page_score

                if self.restricted:
                    if page_score < self.pageScoreThreshold:
                        #self.pagesCount += 1
                        continue

                pageDom = getDomain(url)
                if page_score >= self.pageScoreThreshold:
                    self.sourcesImp[pageDom][0] += 1
                    webpageLabel = 1
                else:
                    self.sourcesImp[pageDom][1] += 1
                    #self.sourcesImp[pageDom][0] = self.sourcesImp[pageDom][1]
                    webpageLabel = 0
            if self.combineScore:
                print page.pageId, ": ", str(page_score), ",", -1 * work_url[0], ",", work_url[1]  #,",", work_url[3]
            else:
                print -1 * work_url[0], ",", work_url[1]  #,",", work_url[3]
            self.pagesCount += 1
            #self.relevantPages.append((page.pageId,page.pageUrl,page.estimatedScore))
            self.relevantPages.append(
                (page.pageId, (page.pageUrl[1], page.pageUrl[2]),
                 page.estimatedScore))

            wbsStr = page.text.replace('\n', '. ').replace('\t', ' ')

            webpages.append(wbsStr)
            count += 1
            #save webpage's text to disk instead of adding to list
            # this will lead to change in evaluation
            if count % self.bufferLen == 0:
                strToWrite = '\n'.join(webpages).encode("utf-8")
                ftext.write(strToWrite)
                webpages = []
            #ftext = open(self.pagesDir+str(page.pageId) + ".txt", "w")
            #ftext.write(page.text.encode("utf-8"))
            #ftext.close()
            #-------

            if page_score < 0.1:
                continue
            page.getUrls()

            for link in page.outgoingUrls:
                url = link.address

                #if url != None and url != '':
                if url:
                    url = url.strip()
                    if url.find('report-a-typo') != -1:
                        continue
                    if url.find('m.tempo.co/') != -1:
                        continue
                    if url.find('?') != -1:
                        furl = url.split('?')[1]
                        if not furl.startswith(('id=', 'v=', 'tid=')):
                            url = url.split('?')[0]
                    if url.find('#') != -1:
                        url = url.split('#')[0]

                    if url.endswith('/'):
                        url = url[:-1]
                    #if url.endswith(("comment","comment/","feed","comments","feed/","comments/",".rss","video","video/","link","gif","jpeg","mp4","wav","jpg","mp3","png","share.php","sharer.php","login.php","print","print/","button/","share","email","submit","post",".pdf") ):
                    if url.endswith(
                        ("comment", "feed", "comments", ".rss", "video",
                         "link", "gif", "jpeg", "mp4", "wav", "jpg", "mp3",
                         "png", "share.php", "sharer.php", "login.php",
                         "print", "button", "share", "email", "submit", "post",
                         ".pdf")):
                        continue

                    #if not self.exists(url,1):
                    if url in self.visited:
                        continue
                    #tot_score = 0.0
                    if url.startswith('http'):  #and not self.exists(url,2):
                        linkText = link.getAllText()
                        #if self.mode == 1:
                        #url_score = self.scorer.calculate_score(linkText,'U')
                        url_score = self.scorer.calculate_score(link, 'U')
                        tot_score = url_score
                        if self.combineScore:
                            #tot_score= 0.4 *page_score + 0.6 *url_score

                            tot_score = page_score * url_score
                        if tot_score < self.urlScoreThreshold:
                            continue
                        urlDom = getDomain(url)

                        si_score = self.sourcesImp[urlDom][0] / self.sourcesImp[urlDom][1]
                        if self.siScoreCombineMethod == 1:
                            if webpageLabel:
                                tot_score = tot_score * si_score
                        elif self.siScoreCombineMethod == 2:
                            tot_score = self.topicWeight * tot_score + self.siWeight * si_score
                        #tot_score = tot_score * si_score
                        #else:
                        #    tot_score = url_score
                        #if tot_score >= self.urlScoreThreshold:
                        #print tot_score, '-', url, linkText
                        if self.restricted:
                            if tot_score < self.urlScoreThreshold:
                                continue
                        if tot_score >= self.urlScoreThreshold:
                            self.priorityQueue.push(
                                ((-1 * tot_score), url,
                                 page.pageId))  #,linkText))
                        #else:
                        #    self.priorityQueue.push(((-1 * page_score),url,page.pageId,link.getAllText()))
            #else:
            #    self.pages.append((page,0))

        print self.priorityQueue.isempty()

        if webpages:
            strToWrite = '\n'.join(webpages).encode("utf-8")
            ftext.write(strToWrite)
        ftext.close()

        return self.priorityQueue.queue
Example #19

import csv
from pathlib import Path
from selenium import webdriver
from webpage import Webpage

with open(Path("webpage-info-scraper/webpage_urls.txt"), "r") as webpage_urls:
    with open(Path("webpage-info-scraper/webpage_data.csv"),
              "w") as webpage_data:
        writer = csv.writer(webpage_data, delimiter=",")
        web_driver = webdriver.Firefox()
        webpages = []
        # Scrape information from webpages using URLs stored in a text file
        for url in webpage_urls:
            webpage = Webpage(url, web_driver)
            webpages.append(webpage)
            webpage.open()
            webpage.org_name = webpage.find_element_by_xpath("//h1").text
            print(webpage.org_name)

            try:
                contact_email_element = webpage.find_element_by_xpath(
                    "//span[text()='Contact Email']")
                div_text = contact_email_element.find_element_by_xpath(
                    '..').text
                webpage.email_address = remove_words_from_text(
                    div_text, ['Contact', 'Email', 'E:'])
                print(webpage.email_address)
            except:
                webpage.email_address = ""
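The scraper above calls remove_words_from_text() without showing its definition. A plausible helper consistent with how it is used here (stripping the 'Contact', 'Email' and 'E:' labels around the address), offered only as an assumption:

def remove_words_from_text(text, words):
    # Hypothetical helper: drop each label word and return what remains.
    for word in words:
        text = text.replace(word, '')
    return text.strip()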