Example no. 1
def test_webpage_from_single_article():
    """ Test creating an article item page using item_from_md_filename()
    """

    webpage = Webpage(TEST_WEBSITE)
    webpage.item_from_md_filename('001-basic-article.md')

    # Page title should be "Article title - Website name"
    title = 'This is the heading - Test website name'
    assert webpage.title == title
    assert webpage.html.count('<title>' + title + '</title>') == 1

    # Page should use static page template
    assert '<p>Article page template</p>' in webpage.html

    # List page header should be present
    assert webpage.html.count('<div>list page header</div>') == 1

    # Webpage should contain the text from the article
    assert webpage.html.count('<p>And here is some text...</p>') == 1

    # Article item footer should be present
    assert webpage.html.count('<footer class="article-footer">') == 1

    # Filename for webpage should be based on the article
    article = Item(TEST_WEBSITE)
    article.from_md_filename('001-basic-article.md')
    assert webpage.filename == article.filename

    # Body should have class='magnetizer-article-page'
    assert webpage.html.count("<body class='magnetizer-article-page'>") == 1

    # Twitter card should be present
    assert '<meta name="twitter:card" content="summary_large_image" />' in webpage.html

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in webpage.html

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in webpage.html

    # Includes should be included, as per the .md file
    assert webpage.html.count("<div class='include'>Include 1</div>") == 2
    assert webpage.html.count("<div class='include'>Include 2</div>") == 1
    assert webpage.html.count("<div class='include'>Include 3</div>") == 1
    assert "[ ERROR: Include 'inexistent_file.html' does not exist! ]" in webpage.html

    # No html comments should be left in page
    assert '<!--' not in webpage.html

    # Meta description should be pulled in from article
    assert '<meta name="description" content="Meta description from article">' in webpage.html

    # Footnote link should have been added
    assert "<a href='#1'>[1]</a>" in webpage.html

    # Footnote anchor should have been added
    assert "<a id='1'></a>[1]:" in webpage.html
Example no. 2
    def crawl(self):
        #start crawling
        #myopener = MyOpener()
        self.harvestRatioData = []
        self.relevantPages = []
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url[1])
            #print ("%s, %s") % (-1 * work_url[0], work_url[1])
            #page = urllib2.urlopen(work_url)
            '''page = myopener.open(work_url)
            self.pagesCount += 1
            soup = BeautifulSoup(page)
            links = soup.find_all('a')'''
            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text)
            else:
                page_score = 0

            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append(
                    (self.relevantPagesCount, self.pagesCount))
                print("%s," + str(page_score) + ", %s") % (-1 * work_url[0],
                                                           work_url[1])
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        if url.find('?') != -1:
                            url = url.split('?')[0]
                        if url.find('#') != -1:
                            url = url.split('#')[0]

#                         if url.startswith('http') == False:
#                             parts = page.pageUrl[1].split("://")
#                             baseUrl = parts[1].split("/")[0]
#                             baseUrl = parts[0] +"://" + baseUrl
#                             url = baseUrl + url

#if not self.existsInVisited(url,self.visited):
                        if url not in self.visited:
                            #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):
                            if url.startswith('http') and not self.exists(
                                    url, self.priorityQueue.queue):
                                url_score = self.scorer.calculate_score(
                                    link.getAllText())
                                self.totalPagesCount += 1
                                #tot_score = (page_score + url_score)/2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score),url))
                                    self.priorityQueue.push(
                                        ((-1 * tot_score), url, page.pageId))
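
The crawl loops in this collection all follow a best-first pattern: candidate URLs are pushed with a negated score so that the highest-scoring URL is popped first. A minimal, self-contained sketch of that frontier (hypothetical, not the project's actual PriorityQueue class):

import heapq

class Frontier:
    """Min-heap frontier; callers push (-score, url, ...) so the best URL pops first."""

    def __init__(self):
        self.queue = []

    def push(self, entry):
        heapq.heappush(self.queue, entry)

    def pop(self):
        return heapq.heappop(self.queue)

    def isempty(self):
        return len(self.queue) == 0

frontier = Frontier()
frontier.push((-0.9, 'http://example.com/relevant'))
frontier.push((-0.2, 'http://example.com/less-relevant'))
print(frontier.pop())   # (-0.9, 'http://example.com/relevant') -- highest score comes out first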
Example no. 3
def test_mongin(caplog):
    pagename = 'mongin.html'
    url = 'https://studies2.hec.fr/jahia/Jahia/cache/offonce/lang/en/mongin/pid/1072'
    page = Webpage(url, html=source(pagename))
    svars = page.session_variables()
    assert 'jsessionid' in svars
    testurl = 'https://studies2.hec.fr/jahia/webdav/site/hec/shared/sites/mongin/foo.pdf;jsessionid=123456'
    stripped = page.strip_session_variables(testurl)
    assert stripped == 'https://studies2.hec.fr/jahia/webdav/site/hec/shared/sites/mongin/foo.pdf;'
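
A minimal sketch of session-variable stripping consistent with this test's expected output; this is an assumption based on the asserted result, not necessarily the library's actual implementation:

import re

def strip_session_variables(url, names=('jsessionid',)):
    # Remove "name=value" pairs for known session variables, keeping the surrounding separators.
    for name in names:
        url = re.sub(name + r'=[^&;#]*', '', url)
    return url

print(strip_session_variables(
    'https://studies2.hec.fr/jahia/webdav/site/hec/shared/sites/mongin/foo.pdf;jsessionid=123456'))
# -> https://studies2.hec.fr/jahia/webdav/site/hec/shared/sites/mongin/foo.pdf;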
Example no. 4
 def crawl(self):
     self.harvestRatioData = []
     self.relevantPages = []
     
     while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
     
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         page = Webpage(work_url,self.pagesCount)
         if page.text =='' :
             continue
         page_score = 0.0
         if self.combineScore:
             if len(page.text) > 0:
                 page_score = self.scorer.calculate_score(page.text,'W')
             else:
                 continue
             page.estimatedScore = page_score
             if self.restricted:
                 if page_score < self.pageScoreThreshold:
                     continue
             
         #print -1 * work_url[0],",", str(page_score),",",work_url[1],",", work_url[3]
         print -1 * work_url[0],",",work_url[1],",", work_url[3]
         self.pagesCount += 1
         
         page.getUrls()
         self.relevantPages.append(page)
         for link in page.outgoingUrls:
             url = link.address
             if url != None and url != '':
                 url = url.strip()
                 if url.find('?')!= -1:                            
                     url = url.split('?')[0]
                 if url.find('#') != -1:
                     url = url.split('#')[0]
                 
                 if url.endswith(("comment","comment/","feed","comments","feed/","comments/",".rss","video","video/","link","gif","jpeg","mp4","wav","jpg","mp3","png","share.php","sharer.php","login.php","print","print/","button/","share","email","submit","post",".pdf") ):    
                     continue
                 if not self.exists(url,1):
                     #tot_score = 0.0
                     if url.startswith('http') and not self.exists(url,2):                            
                         if self.mode == 1:
                             url_score = self.scorer.calculate_score(link.getAllText(),'U')
                             if self.combineScore:
                                 tot_score= 0.5 *page_score + 0.5 *url_score
                             else:
                                 tot_score = url_score
                             #if tot_score >= self.urlScoreThreshold:
                             self.priorityQueue.push(((-1 * tot_score),url,page.pageId,link.getAllText()))
                         #else:
                         #    self.priorityQueue.push(((-1 * page_score),url,page.pageId,link.getAllText()))
         #else:
         #    self.pages.append((page,0))
                                 
     print self.priorityQueue.isempty()
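
The link handling above (strip query strings and fragments, skip comment/feed/media/share endpoints) could be factored into a small helper; a hypothetical sketch:

SKIP_SUFFIXES = ('comment', 'feed', 'comments', '.rss', 'video', 'link', 'gif', 'jpeg',
                 'mp4', 'wav', 'jpg', 'mp3', 'png', 'share.php', 'sharer.php', 'login.php',
                 'print', 'button', 'share', 'email', 'submit', 'post', '.pdf')

def clean_url(url):
    """Return a normalized URL, or None if the link should be skipped."""
    if not url:
        return None
    url = url.strip().split('?')[0].split('#')[0]
    if url.rstrip('/').endswith(SKIP_SUFFIXES):
        return None
    return url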
Example no. 5
    def crawl(self):
        #start crawling
        #myopener = MyOpener()
        self.harvestRatioData = []
        self.relevantPages = []
        while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url[1])
            #print ("%s, %s") % (-1 * work_url[0], work_url[1])
            #page = urllib2.urlopen(work_url)
            '''page = myopener.open(work_url)
            self.pagesCount += 1
            soup = BeautifulSoup(page)
            links = soup.find_all('a')'''
            page = Webpage(work_url,self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text)
            else:
                page_score = 0
            
            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append((self.relevantPagesCount,self.pagesCount))
                print ("%s,"+ str(page_score)+", %s") % (-1 * work_url[0], work_url[1])
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        if url.find('?')!= -1:
                            url = url.split('?')[0]
                        if url.find('#') != -1:
                            url = url.split('#')[0]
                        
#                         if url.startswith('http') == False:
#                             parts = page.pageUrl[1].split("://")
#                             baseUrl = parts[1].split("/")[0]
#                             baseUrl = parts[0] +"://" + baseUrl
#                             url = baseUrl + url
                        
                        #if not self.existsInVisited(url,self.visited): 
                        if url not in self.visited:
                            #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):                            
                            if url.startswith('http') and not self.exists(url,self.priorityQueue.queue):
                                url_score = self.scorer.calculate_score(link.getAllText())
                                self.totalPagesCount +=1
                                #tot_score = (page_score + url_score)/2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score),url))
                                    self.priorityQueue.push(((-1 * tot_score),url,page.pageId))
Example no. 6
def test_pagination_previous_only():
    """ Test that webpage.pagination_html() returns previous page correctly when no
    next page
    """
    webpage = Webpage(TEST_WEBSITE)
    webpage.url_previous = 'page-1.html'

    result = '<nav class="magnetizer-pagination"><ul>'
    result += '<li><a href="page-1.html" class="magnetizer-previous">Newer posts</a></li>'
    result += '</ul></nav>'

    assert webpage.pagination_html() == result
Example no. 7
    def crawl(self):
        #start crawling
        #myopener = MyOpener()
        self.harvestRatioData = []
        self.relevantPages = []
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
            work_url = self.priorityQueue.pop()
            self.visited.append(work_url)

            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text)
            else:
                page_score = 0

            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append(
                    (self.relevantPagesCount, self.pagesCount))
                print("%s, %s") % (-1 * work_url[0], work_url[1])
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        #if url.find('?')!= -1:
                        #    url = url.split('?')[0]
                        if url.startswith("/"):
                            base = page.pageUrl[1][7:].split("/")[0]
                            url = "http://" + base + url
                        if not self.exists(url, self.visited):
                            if (url.startswith('http') and url.find('#') == -1
                                    and not self.priorityQueue.exists(url)):  #self.exists(url,self.priorityQueue.queue)
                                url_score = self.scorer.calculate_score(
                                    link.getAllText())
                                self.totalPagesCount += 1
                                #tot_score = (page_score + url_score)/2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score),url))
                                    self.priorityQueue.push(
                                        ((-1 * tot_score), url, page.pageId))
                                    #self.relevantPagesCount += 1

                self.priorityQueue.next()
Example no. 8
def test_pagination_previous_and_next():
    """ Test that webpage.pagination_html() returns next and previous pages correctly
    when both are available
    """

    webpage = Webpage(TEST_WEBSITE)
    webpage.url_previous = 'page-3.html'
    webpage.url_next = 'page-5.html'

    result = '<nav class="magnetizer-pagination"><ul>'
    result += '<li><a href="page-3.html" class="magnetizer-previous">Newer posts</a></li>'
    result += '<li><a href="page-5.html" class="magnetizer-next">Older posts</a></li>'
    result += '</ul></nav>'

    assert webpage.pagination_html() == result
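
A sketch of a pagination_html() implementation that would satisfy the pagination tests in this collection (including the no-pagination case shown later); it is derived only from the expected markup, so the real implementation may differ:

def pagination_html(url_previous=None, url_next=None):
    # Mirror the markup the tests expect; return None when there is nothing to paginate.
    if not (url_previous or url_next):
        return None
    html = '<nav class="magnetizer-pagination"><ul>'
    if url_previous:
        html += '<li><a href="%s" class="magnetizer-previous">Newer posts</a></li>' % url_previous
    if url_next:
        html += '<li><a href="%s" class="magnetizer-next">Older posts</a></li>' % url_next
    return html + '</ul></nav>'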
Example no. 9
    def crawl(self):
        hitPageLimit = False
        while self.pageCount() <  self.pageLimit and not self.priorityQueue.isempty():
            url_priority_obj = self.priorityQueue.pop()
            priority = url_priority_obj[0]
            url = url_priority_obj[1]
            if url not in self.visited:
                self.visited.append(url)
                print "Crawling page #{} {}".format(self.pageCount(), url_priority_obj)

                page = Webpage(url_priority_obj[1])
                page_score = page.score(self.scorer)
                if (page_score > self.relevantThreshold):
                    self.relevantPages.append(url_priority_obj)
                    print "Relevant page found. Score ({}) URL ({})".format(page_score, url)
                else:
                    print "Irrelevant page found. Score ({}) URL ({})".format(page_score, url)
                linked_url_count = 0
                for linked_url in page.outgoingUrls:
                    if linked_url != None and linked_url != '':
                        if not self.is_blacklisted(linked_url):
                            # if linked_url.find('?')!= -1:
                            #     linked_url = linked_url.split('?')[0]
                            if linked_url not in self.visited and linked_url not in self.irrelevantUrls:
                                if linked_url.startswith('http') and not self.exists(linked_url,self.priorityQueue.queue):
                                    linked_url_count += 1
                                    print "Checking link #{} {}".format(linked_url_count, linked_url)
                                    linked_page = Webpage(linked_url)
                                    if hasattr(linked_page, "text"):  # webpage was parseable
                                        linked_url_score = linked_page.score(self.scorer)
                                        self.totalPagesCount +=1
                                        link_weight = 2
                                        page_weight = 1
                                        tot_score = ((page_weight * page_score) + (link_weight * linked_url_score))/2.0
                                        if tot_score >= self.relevantThreshold:
                                            print "Relevant link found. Score ({}) URL ({})".format(tot_score, linked_url)
                                            self.priorityQueue.push(((-1 * tot_score),linked_url))
                                        else:
                                            self.irrelevantUrls.append(linked_url)

                                        if self.linkLimit > 0 and linked_url_count >= self.linkLimit:
                                            print "Done crawling page. Reached linkLimit."
                                            break

        if self.pageCount() >=  self.pageLimit:
            print "Done crawling. Reached pageLimit."
        elif self.priorityQueue.isempty():
            print "Done crawling. No more pages to crawl."
Example no. 10
    def __init__(self, website):

        self.website = website

        filenames = Webpage.filenames_from_directory(
            self.website.config.value('source_path'))
        self.feed_data = self.feed(filenames)
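
A hypothetical sketch of the filenames_from_directory() helper used here, based on how the other examples in this collection treat non-.md files:

from os import listdir

def filenames_from_directory(directory):
    # Collect markdown sources only; other files (e.g. .txt) are ignored, as the tests expect.
    return sorted(f for f in listdir(directory) if f.endswith('.md'))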
Example no. 11
 def crawl(self):
     #start crawling
     #myopener = MyOpener()
     while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         print("%s, %s") % (-1 * work_url[0], work_url[1])
         #page = urllib2.urlopen(work_url)
         '''page = myopener.open(work_url)
         self.pagesCount += 1
         soup = BeautifulSoup(page)
         links = soup.find_all('a')'''
         page = Webpage(work_url[1])
         self.pagesCount += 1
         for link in page.outgoingUrls:
             url = link.address
             if url != None and url != '':
                 if url.find('?') != -1:
                     url = url.split('?')[0]
                 if not self.exists(url, self.visited):
                     if url.startswith('http:') and url.find(
                             '#') == -1 and not self.exists(
                                 url, self.priorityQueue.queue):
                         url_score = self.scorer.calculate_score(
                             link.getAllText())
                         self.totalPagesCount += 1
                         if url_score > 0.1:
                             self.priorityQueue.push(
                                 ((-1 * url_score), url))
                             self.relevantPagesCount += 1
Example no. 12
def getUrlTexts(urlList):
    """ Lazy returns url texts """

    for url in urlList:
        page = Webpage(url)
        #data = tokenizeDocText(page.text)
        yield page.text
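
Because this is a generator, each Webpage is only fetched when its text is consumed. A brief usage sketch with placeholder URLs:

for text in getUrlTexts(['http://example.com/a', 'http://example.com/b']):
    print(len(text))   # the next page is fetched lazily, just before its text is used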
Example no. 13
def ensure_game_is_active(sleep_time):
    """
        Pauses the run while the game window isn't active.
        :param sleep_time: time to sleep between polling activeness of window
    """
    while not Webpage.is_active():
        sleep(sleep_time)
Example no. 14
def test_static_item_page():
    """ Test creating a static item page using item_from_md_filename()
    """

    webpage = Webpage(TEST_WEBSITE)
    webpage.item_from_md_filename('dont-show-on-list-page.md')

    # Page title should be "Article title - Website name"
    title = 'This post should not be in the index - Test website name'
    assert webpage.title == title
    assert webpage.html.count('<title>' + title + '</title>') == 1

    # Page should use static page template
    assert '<p>Static page template</p>' in webpage.html

    # List page header should NOT be present
    assert webpage.html.count('<div>list page header</div>') == 0

    # Webpage should contain the text from the article
    assert webpage.html.count("<p>That's why it doesn't start with") == 1

    # Article footer should NOT be present
    assert webpage.html.count('<footer>footer</footer>') == 0

    # Filename for webpage should be based on the article
    article = Item(TEST_WEBSITE)
    article.from_md_filename('dont-show-on-list-page.md')
    assert webpage.filename == article.filename

    # Body should have class='magnetizer-static-page'
    assert webpage.html.count("<body class='magnetizer-static-page'>") == 1

    # Twitter card should be present
    assert '<meta name="twitter:card" content="summary_large_image" />' in webpage.html

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in webpage.html

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in webpage.html

    # No html comments should be left in page
    assert '<!--' not in webpage.html
Example no. 15
 def crawl(self):
     #start crawling
     #myopener = MyOpener()
     self.harvestRatioData = []
     self.relevantPages = []
     while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():            
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         
         page = Webpage(work_url,self.pagesCount)
         if len(page.text) > 0:
             page_score = self.scorer.calculate_score(page.text)
         else:
             page_score = 0
             
         self.pagesCount += 1
         if (page_score > self.pageScoreThreshold):
             page.getUrls()
             self.relevantPagesCount += 1
             self.relevantPages.append(page)
             self.harvestRatioData.append((self.relevantPagesCount,self.pagesCount))
             print ("%s, %s") % (-1 * work_url[0], work_url[1])
             for link in page.outgoingUrls:
                 url = link.address                    
                 if url != None and url != '':
                     #if url.find('?')!= -1:
                     #    url = url.split('?')[0]
                     if url.startswith("/"):                            
                         base = page.pageUrl[1][7:].split("/")[0]
                         url = "http://" + base + url
                     if not self.exists(url,self.visited):
                         if url.startswith('http') and url.find('#') == -1 and not self.priorityQueue.exists(url):#self.exists(url,self.priorityQueue.queue):                            
                             url_score = self.scorer.calculate_score(link.getAllText())
                             self.totalPagesCount +=1
                             #tot_score = (page_score + url_score)/2.0
                             #tot_score = page_score + url_score
                             tot_score = url_score
                             if tot_score > self.urlScoreThreshold:
                                 #self.priorityQueue.push(((-1 * url_score),url))
                                 self.priorityQueue.push(((-1 * tot_score),url,page.pageId))
                                 #self.relevantPagesCount += 1                            
                             
             self.priorityQueue.next()
Example no. 16
def test_includes():
    """ Test of webpage.includes()
    """

    webpage = Webpage(TEST_WEBSITE)
    webpage.html = '<h1>Some html</h1>'
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include1.html -->"
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include2.html -->"
    webpage.html += '<div>More html...</div>'
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include3.html -->"
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include1.html -->"

    correct_includes = ['_include1.html', '_include2.html', '_include3.html']
    includes = webpage.includes()

    # Set should contain each include from the html
    for correct_include in correct_includes:
        assert correct_include in includes

    assert len(includes) == len(correct_includes)
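
A minimal sketch of an includes() implementation consistent with this test, using a regex over the MAGNETIZER_INCLUDE comments; the real implementation may differ:

import re

def includes(html):
    # Return the set of include filenames referenced by MAGNETIZER_INCLUDE comments.
    return set(re.findall(r'<!-- MAGNETIZER_INCLUDE (\S+) -->', html))

print(includes("<!-- MAGNETIZER_INCLUDE _include1.html --><!-- MAGNETIZER_INCLUDE _include1.html -->"))
# -> {'_include1.html'}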
Example no. 17
def test_webpage_write_multiple_from_filenames():
    """ Test of write_item_pages_from_md_filenames()
    """

    TEST_WEBSITE.wipe()

    filenames = [
        '001-basic-article.md', '002-article-with-h1-break-and-date.md',
        '003-another-article.md', '100-ignore-this.txt',
        'dont-show-on-list-page.md', '009-unindexed-article.md'
    ]
    Webpage.write_item_pages_from_md_filenames(TEST_WEBSITE, filenames)

    written_filenames = listdir(TEST_WEBSITE.config.value('output_path'))

    # All the normal articles should have been written
    assert 'basic-article.html' in written_filenames
    assert 'article-with-h1-break-and-date.html' in written_filenames
    assert 'another-article.html' in written_filenames
    assert 'unindexed-article.html' in written_filenames

    # The static pages should have been written too
    assert 'dont-show-on-list-page.html' in written_filenames

    # The file not ending in .md should not have been written
    assert 'ignore-this.html' not in written_filenames
    assert '100-ignore-this.txt' not in written_filenames

    # The written files should be included in the sitemap...
    assert 'https://example.com/basic-article.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/article-with-h1-break-and-date.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/another-article.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/dont-show-on-list-page.html' in TEST_WEBSITE.sitemap.pages

    # ... except for the unindexed article
    assert 'https://example.com/unindexed-article.html' not in TEST_WEBSITE.sitemap.pages

    # Ignored files should not be included in the sitemap
    assert 'https://example.com/ignore-this.html' not in TEST_WEBSITE.sitemap.pages

    TEST_WEBSITE.wipe()
Example no. 18
 def __init__(self, seedUrls):
     self.exclude_words = ['ads', 'print', 'advertisement']
     self.seedUrls = seedUrls
     super(TFIDF_Scorer, self).__init__(None)
     self.seedPages = []
     self.avgdl = 0
     for url in self.seedUrls:
         page = Webpage(url)
         data = self.cleanDoc(page.text)
         self.seedPages.append(data)
         self.avgdl += len(data)
     self.buildModel(self.seedPages)
Example no. 19
def test_page_indexability():
    """ Test to make sure indexability carries through from item to webpage
    """

    webpage_index = Webpage(TEST_WEBSITE)
    webpage_index.item_from_md_filename('001-basic-article.md')

    webpage_dont_index = Webpage(TEST_WEBSITE)
    webpage_dont_index.item_from_md_filename('009-unindexed-article.md')

    # Don't include noindex tag for article page that SHOULD be indexed
    assert '<meta name="robots" content="noindex">' not in webpage_index.html

    # Include noindex tag for article page that should NOT be indexed
    assert '<meta name="robots" content="noindex">' in webpage_dont_index.html
Example no. 20
def main():
    sleep_time = 0.7
    screen_size = Size(316, 280, 600, 480)
    cap = CaptureImage(screen_size)
    Webpage.open_url()
    cap.capture()
    board = Board((17, 18, 500, 500))
    for i in xrange(1000):
        ensure_game_is_active(sleep_time)
        pyautogui.moveTo(316 + 17 + 34, 280 + 389)
        cap.capture()
        board.read_board_from_screen()
        board.accumulate_neighbors()
        x, y = board.get_x_y_for_shot()
        if x > 900 or y > 900:
            continue
        try:
            pyautogui.moveTo(x, y)
            pyautogui.click()
        except (WindowsError, ValueError) as e:
            pass  # Ignoring pyautogui exceptions
        sleep(sleep_time)
    Webpage.close()
Example no. 21
def main(models=None):
    models = [] if models is None else models.split(',')
    models.insert(0, 'original')
    print("Generate results -> " + str(models))
    url = input("URLを入力してください\n")
    layout_type = input("タイプを選択してください(1-7)\n")
    webpage = Webpage(url, layout_type)

    # Take a screenshot
    screenshot = webpage.get_screenshot('screen-pc.png', 1280, 800)
    # Save the HTML
    webpage.save_html()

    for model in models:
        print(model)
        # Prepare the CSV files
        csv_tags = Csv('./working/tag_list_' + model + '.csv')
        csv_tags_custom = Csv('./working/tag_list_custom_' + model + '.csv')
        default_row = [
            'class or id', 'tag_name', 'start_x', 'start_y', 'size_w',
            'size_h', 'average_color', 'salient_level', 'element_area'
        ]
        csv_tags.writerow(default_row)
        csv_tags_custom.writerow(default_row)

        # Half-size saliency map
        if model == 'original':
            # Get the saliency map
            saliency_map = webpage.get_saliency_map(screenshot)
            resize_saliency_map = Image(saliency_map).get_halfsize()
        elif model == 'original-mlnet':
            saliency_map = Image(cv2.imread('./data/mlnet.png', 1))
            resize_saliency_map = saliency_map.get_trimming((1280, 726))
            resize_saliency_map = cv2.cvtColor(resize_saliency_map,
                                               cv2.COLOR_BGR2GRAY)
        else:
            saliency_map = Image(cv2.imread('./data/' + model + '.png', 1))
            resize_saliency_map = saliency_map.get_trimming((1280, 726))
            resize_saliency_map = cv2.cvtColor(resize_saliency_map,
                                               cv2.COLOR_BGR2GRAY)

        # Get the size and saliency level of each element
        Element.canvas = Image(resize_saliency_map)
        Element.layout_type = webpage.layout_type
        Element.model = model
        print('Saliency of the whole webpage: ' + str(Element.GetTotalSaliency()))

        GetElementInfo(webpage, csv_tags, csv_tags_custom)

        # Close the CSV files and the WebDriver
        csv_tags.close()
        csv_tags_custom.close()
        CreateRegionMap(model)

    webpage.driver.quit()
    getFinalLine()
    getFinalTile()
Example no. 22
def main():
    """ Main method to trigger generation of all pages
    """

    if len(argv) == 3 and argv[1] == '-config':
        config_filename = argv[2]
    else:
        config_filename = '../config/magnetizer.cfg'

    print('Using config ' + config_filename + '...')

    website = Website(config_filename)
    website.wipe()

    Webpage.write_list_pages_from_directory(
        website, website.config.value('source_path'))
    Webpage.write_item_pages_from_directory(
        website, website.config.value('source_path'))

    website.copy_resources()
    website.sitemap.write(website.config.value('output_path'))

    atom = Atom(website)
    atom.write()
Example no. 23
 def crawl(self):
     #start crawling
     while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
         work_url = self.priorityQueue.pop()
         self.visited.append(work_url)
         page = Webpage(work_url[1])
         page_score = self.scorer.calculate_score(page.text)
         if (page_score > self.threshold):
             self.relevantPagesCount += 1
             print ("%s, %s") % (-1 * work_url[0], work_url[1])
         self.pagesCount += 1
         for link in page.outgoingUrls:
             url = link.address
             if url != None and url != '':
                 if url.find('?')!= -1:
                     url = url.split('?')[0]
                 if not self.exists(url,self.visited):
                     if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):                            
                         url_score = self.scorer.calculate_score(link.getAllText())
                         self.totalPagesCount +=1
                         tot_score = (page_score + url_score)/2.0
                         if tot_score > self.threshold:
                             self.priorityQueue.push(((-1 * tot_score),url))
Example no. 24
def test_webpage_write():
    """ Test of webpage.write()
    """

    result = "This is a test!"

    webpage = Webpage(TEST_WEBSITE)
    webpage.html = result
    webpage.filename = 'my-post.html'
    webpage.write()

    # File should have the correct contents
    with open(
            TEST_WEBSITE.config.value('output_path') + webpage.filename,
            'r') as myfile:
        assert myfile.read() == result

    # Page should be included in sitemap
    assert 'https://example.com/my-post.html' in TEST_WEBSITE.sitemap.pages

    TEST_WEBSITE.wipe()
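
Based on what this test checks, a hedged sketch of what webpage.write() presumably does; the sitemap registration call and the base-URL handling below are assumptions:

def write(self):
    # Write the rendered html to the configured output path...
    with open(self.website.config.value('output_path') + self.filename, 'w') as f:
        f.write(self.html)
    # ...and register the page in the sitemap (base URL handling assumed, not confirmed by the source).
    self.website.sitemap.pages.append('https://example.com/' + self.filename)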
Example no. 25
def test_single_list_page():
    """ Test when there is just one listing page, so no pagination etc
    """

    TEST_WEBSITE.wipe()
    _clean_up_test_items_md()
    _generate_test_items_md(4)
    _generate_non_indexable_test_items_md()

    Webpage.write_list_pages_from_directory(
        TEST_WEBSITE, TEST_WEBSITE.config.value('source_path'))

    # There should be an index.html but no blog-n.html files
    assert path.isfile(TEST_WEBSITE.config.value('output_path') + 'index.html')
    assert not path.isfile(
        TEST_WEBSITE.config.value('output_path') + 'blog-1.html')
    assert not path.isfile(
        TEST_WEBSITE.config.value('output_path') + 'blog-2.html')

    with open(TEST_WEBSITE.config.value('output_path') + 'index.html',
              'r') as myfile:
        blog_1_content = myfile.read()

    assert blog_1_content.count('<article>') == 4
    assert 'Article 4.' in blog_1_content
    assert 'Article 3.' in blog_1_content
    assert 'Article 2.' in blog_1_content
    assert 'Article 1.' in blog_1_content

    # Page should use listing page template
    assert '<p>Listing page template</p>' in blog_1_content

    # Index title = "Website Name - Page 1"
    assert 'Test website name - test tag & line' in blog_1_content

    # Don't show article footers on list page
    assert '<footer>footer</footer>' not in blog_1_content

    # Body should have class='magnetizer-listing-page'
    assert "<body class='magnetizer-listing-page'>" in blog_1_content

    # Twitter card should *not* be present
    assert '<meta name="twitter:card" content="summary" />' not in blog_1_content

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in blog_1_content

    # No links previous/next page should be present
    assert 'class="magnetizer-pagination"' not in blog_1_content
    assert 'class="magnetizer-previous"' not in blog_1_content
    assert 'class="magnetizer-next"' not in blog_1_content

    # The index page should be present in the sitemap
    assert 'https://example.com/' in TEST_WEBSITE.sitemap.pages

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in blog_1_content

    # Meta description from config file should be present
    assert '<meta name="description" content="Meta \\"description\\" from cfg">' in blog_1_content
Example no. 26
def test_three_paginated_list_pages():
    """ Test 3 listing pages, with pagination
    """

    TEST_WEBSITE.wipe()
    _clean_up_test_items_md()
    _generate_test_items_md(10)

    Webpage.write_list_pages_from_directory(
        TEST_WEBSITE, TEST_WEBSITE.config.value('source_path'))

    # There should be an index.html and exactly 2 blog-n.html files
    assert path.isfile(TEST_WEBSITE.config.value('output_path') + 'index.html')
    assert not path.isfile(
        TEST_WEBSITE.config.value('output_path') + 'blog-1.html')
    assert path.isfile(
        TEST_WEBSITE.config.value('output_path') + 'blog-2.html')
    assert path.isfile(
        TEST_WEBSITE.config.value('output_path') + 'blog-3.html')
    assert not path.isfile(
        TEST_WEBSITE.config.value('output_path') + 'blog-4.html')

    with open(TEST_WEBSITE.config.value('output_path') + 'index.html',
              'r') as myfile:
        blog_1_content = myfile.read()

    with open(TEST_WEBSITE.config.value('output_path') + 'blog-2.html',
              'r') as myfile:
        blog_2_content = myfile.read()

    with open(TEST_WEBSITE.config.value('output_path') + 'blog-3.html',
              'r') as myfile:
        blog_3_content = myfile.read()

    assert blog_1_content.count('<article>') == 4
    assert 'Article 10.' in blog_1_content
    assert 'Article 9.' in blog_1_content
    assert 'Article 8.' in blog_1_content
    assert 'Article 7.' in blog_1_content
    assert '<p>Listing page template</p>' in blog_1_content

    assert blog_2_content.count('<article>') == 4
    assert 'Article 6.' in blog_2_content
    assert 'Article 5.' in blog_2_content
    assert 'Article 4.' in blog_2_content
    assert 'Article 3.' in blog_2_content
    assert '<p>Listing page template</p>' in blog_2_content

    assert blog_3_content.count('<article>') == 2
    assert 'Article 2.' in blog_3_content
    assert 'Article 1.' in blog_3_content
    assert '<p>Listing page template</p>' in blog_3_content

    # Page title = "Website Name - Page n"
    assert 'Test website name - test tag & line' in blog_1_content
    assert '<title>Test website name - Page 2</title>' in blog_2_content
    assert '<title>Test website name - Page 3</title>' in blog_3_content

    # First page should have link to older posts but not newer
    assert '<a href="blog-2.html" class="magnetizer-next">Older posts</a>' in blog_1_content
    assert 'class="magnetizer-previous"' not in blog_1_content

    # Middle page should have links to older posts and newer posts (i.e. the homepage)
    assert '<a href="blog-3.html" class="magnetizer-next">Older posts</a>' in blog_2_content
    assert '<a href="/" class="magnetizer-previous">Newer posts</a>' in blog_2_content

    # Last page should have link to newer posts but not older
    assert 'class="magnetizer-next"' not in blog_3_content
    assert '<a href="blog-2.html" class="magnetizer-previous">Newer posts</a>' in blog_3_content

    # Pages should have meta description from config
    assert '<meta name="description" content="Meta \\"description\\" from cfg">' in blog_1_content
    assert '<meta name="description" content="Meta \\"description\\" from cfg">' in blog_2_content
    assert '<meta name="description" content="Meta \\"description\\" from cfg">' in blog_3_content

    # index.html and the blog-n pages should be present in the sitemap
    assert 'https://example.com/' in TEST_WEBSITE.sitemap.pages
    assert not 'https://example.com/blog-1.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/blog-2.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/blog-3.html' in TEST_WEBSITE.sitemap.pages
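
These two tests pin down the chunking rule: items are grouped four per listing page, the first chunk becomes index.html, and later chunks become blog-2.html, blog-3.html, and so on. A small illustration of that grouping (the page size of four is presumably configurable; it is hard-coded here):

def chunk_filenames(filenames, per_page=4):
    # Group item filenames into listing pages: index.html first, then blog-2.html, blog-3.html, ...
    chunks = [filenames[i:i + per_page] for i in range(0, len(filenames), per_page)]
    names = ['index.html'] + ['blog-%d.html' % (n + 2) for n in range(len(chunks) - 1)]
    return list(zip(names, chunks))

pages = chunk_filenames(['%03d-article.md' % i for i in range(1, 11)])
print([name for name, _ in pages])   # ['index.html', 'blog-2.html', 'blog-3.html']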
Example no. 27
    def enhanced_crawl(self):
            #start crawling
            #myopener = MyOpener()
            self.harvestRatioData = []
            self.relevantPages = []
            while self.pagesCount <  self.pagesLimit and not self.priorityQueue.isempty():
                work_url = self.priorityQueue.pop()
                self.visited.append(work_url[1])
                #print ("%s, %s") % (-1 * work_url[0], work_url[1])
                #page = urllib2.urlopen(work_url)
                '''page = myopener.open(work_url)
                self.pagesCount += 1
                soup = BeautifulSoup(page)
                links = soup.find_all('a')'''
                #print work_url[1]
                try:
                    req = urllib2.Request(work_url[1])
                    # create a request object

                    handle = urllib2.urlopen(req)
                    # and open it to return a handle on the url
                except urllib2.URLError, e:
                    # ignore error, URL timed out
                    pass

                else:
                    html = handle.read()
                    soup = BeautifulSoup(html)
                    paras = soup.findAll('p')
                    #print paras
                    text = ""
                    for para in paras:
                            text = text + " " + para.text
                                    
                    page = Webpage(work_url,self.pagesCount)
                    if len(page.text) > 0:
                        page_score = self.scorer.calculate_smart_score(text, work_url[1])
                    else:
                        page_score = 0
                        
                    self.pagesCount += 1
                    if (page_score > self.pageScoreThreshold):
                        page.getUrls()
                        self.relevantPagesCount += 1
                        self.relevantPages.append(page)
                        self.harvestRatioData.append((self.relevantPagesCount,self.pagesCount))
                        print ("%s|"+ str(page_score)+"|%s") % (-1.0 * work_url[0], work_url[1])
                        for link in page.outgoingUrls:
                            url = link.address
                            if url != None and url != '':
                                if url.find('?')!= -1:
                                    url = url.split('?')[0]
                                if url.find('#') != -1:
                                    url = url.split('#')[0]
                                    
            #                         if url.startswith('http') == False:
            #                             parts = page.pageUrl[1].split("://")
            #                             baseUrl = parts[1].split("/")[0]
            #                             baseUrl = parts[0] +"://" + baseUrl
            #                             url = baseUrl + url
                                    
                                    #if not self.existsInVisited(url,self.visited): 
                                if url not in self.visited:
                                        #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url,self.priorityQueue.queue):                            
                                    if url.startswith('http') and not self.exists(url,self.priorityQueue.queue):
                                        url_score = self.url_scorer.calculate_score(link.getAllText())
                                        self.totalPagesCount +=1
                                        #tot_score = (page_score + url_score)/2.0
                                        #tot_score = page_score + url_score
                                        tot_score = url_score
                                        if tot_score > self.urlScoreThreshold:
                                            #self.priorityQueue.push(((-1 * url_score),url))
                                            self.priorityQueue.push(((-1 * tot_score),url,page.pageId))
Example no. 28
    def crawl(self):
        self.harvestRatioData = []
        self.relevantPages = []
        webpages = []
        count = 0
        ftext = open(self.pagesDir + "webpagesTxt.txt", "w")
        webpageLabel = 0  # 0 for Non-relevant and 1 for Relevant
        while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():

            work_url = self.priorityQueue.pop()
            url = work_url[1]
            #if self.exists(url,1):
            #    continue
            if url in self.visited:
                continue
            #self.visited.append(url)#work_url[1])
            self.visited[url] = 1
            page = Webpage(work_url, self.pagesCount)
            if page.text == '':
                continue

            page.estimatedScore = 0
            page_score = 0
            if self.combineScore:
                if len(page.text) > 0:
                    #page_score = self.scorer.calculate_score(page.text,'W')[1]
                    page_score = self.scorer.calculate_score(page, 'W')[1]
                    if page_score == -1:
                        continue
                else:
                    print 'page text is empty'
                    continue

                page.estimatedScore = page_score

                if self.restricted:
                    if page_score < self.pageScoreThreshold:
                        #self.pagesCount += 1
                        continue

                pageDom = getDomain(url)
                if page_score >= self.pageScoreThreshold:
                    self.sourcesImp[pageDom][0] += 1
                    webpageLabel = 1
                else:
                    self.sourcesImp[pageDom][1] += 1
                    #self.sourcesImp[pageDom][0] = self.sourcesImp[pageDom][1]
                    webpageLabel = 0
            if self.combineScore:
                print page.pageId, ": ", str(page_score), ",", -1 * work_url[0], ",", work_url[1]  #,",", work_url[3]
            else:
                print -1 * work_url[0], ",", work_url[1]  #,",", work_url[3]
            self.pagesCount += 1
            #self.relevantPages.append((page.pageId,page.pageUrl,page.estimatedScore))
            self.relevantPages.append(
                (page.pageId, (page.pageUrl[1], page.pageUrl[2]),
                 page.estimatedScore))

            wbsStr = page.text.replace('\n', '. ').replace('\t', ' ')

            webpages.append(wbsStr)
            count += 1
            #save webpage's text to disk instead of adding to list
            # this will lead to change in evaluation
            if count % self.bufferLen == 0:
                strToWrite = '\n'.join(webpages).encode("utf-8")
                ftext.write(strToWrite)
                webpages = []
            #ftext = open(self.pagesDir+str(page.pageId) + ".txt", "w")
            #ftext.write(page.text.encode("utf-8"))
            #ftext.close()
            #-------

            if page_score < 0.1:
                continue
            page.getUrls()

            for link in page.outgoingUrls:
                url = link.address

                #if url != None and url != '':
                if url:
                    url = url.strip()
                    if url.find('report-a-typo') != -1:
                        continue
                    if url.find('m.tempo.co/') != -1:
                        continue
                    if url.find('?') != -1:
                        furl = url.split('?')[1]
                        if not furl.startswith(('id=', 'v=', 'tid=')):
                            url = url.split('?')[0]
                    if url.find('#') != -1:
                        url = url.split('#')[0]

                    if url.endswith('/'):
                        url = url[:-1]
                    #if url.endswith(("comment","comment/","feed","comments","feed/","comments/",".rss","video","video/","link","gif","jpeg","mp4","wav","jpg","mp3","png","share.php","sharer.php","login.php","print","print/","button/","share","email","submit","post",".pdf") ):
                    if url.endswith(
                        ("comment", "feed", "comments", ".rss", "video",
                         "link", "gif", "jpeg", "mp4", "wav", "jpg", "mp3",
                         "png", "share.php", "sharer.php", "login.php",
                         "print", "button", "share", "email", "submit", "post",
                         ".pdf")):
                        continue

                    #if not self.exists(url,1):
                    if url in self.visited:
                        continue
                    #tot_score = 0.0
                    if url.startswith('http'):  #and not self.exists(url,2):
                        linkText = link.getAllText()
                        #if self.mode == 1:
                        #url_score = self.scorer.calculate_score(linkText,'U')
                        url_score = self.scorer.calculate_score(link, 'U')
                        tot_score = url_score
                        if self.combineScore:
                            #tot_score= 0.4 *page_score + 0.6 *url_score

                            tot_score = page_score * url_score
                        if tot_score < self.urlScoreThreshold:
                            continue
                        urlDom = getDomain(url)

                        si_score = self.sourcesImp[urlDom][
                            0] / self.sourcesImp[urlDom][1]
                        if self.siScoreCombineMethod == 1:
                            if webpageLabel:
                                tot_score = tot_score * si_score
                        elif self.siScoreCombineMethod == 2:
                            tot_score = self.topicWeight * tot_score + self.siWeight * si_score
                        #tot_score = tot_score * si_score
                        #else:
                        #    tot_score = url_score
                        #if tot_score >= self.urlScoreThreshold:
                        #print tot_score, '-', url, linkText
                        if self.restricted:
                            if tot_score < self.urlScoreThreshold:
                                continue
                        if tot_score >= self.urlScoreThreshold:
                            self.priorityQueue.push(
                                ((-1 * tot_score), url,
                                 page.pageId))  #,linkText))
                        #else:
                        #    self.priorityQueue.push(((-1 * page_score),url,page.pageId,link.getAllText()))
            #else:
            #    self.pages.append((page,0))

        print self.priorityQueue.isempty()

        if webpages:
            strToWrite = '\n'.join(webpages).encode("utf-8")
            ftext.write(strToWrite)
        ftext.close()

        return self.priorityQueue.queue
Example no. 29
def process_link(li, force_reprocess=False, redir_url=None, recurse=0):
    """
    fetch url, check for http errors and steppingstones, filter spam,
    save local file, convert to xml, add source_url etc. to xml,
    run Extractor on xml file, compute spam score, check for
    duplicate, check if published before last year.

    Links often lead to intermediate pages (e.g. on repositories) with
    another link to the actual paper. In this case, we only store the
    original link in the 'links' table, so the 'doc' entry has a url
    that doesn't match any link. To process the new link, process_link
    is called again, with redir_url set to the new url and recurse +=
    1.
    """
    
    # fetch url and handle errors, redirects, etc.:
    time.sleep(2) # be gentle on servers
    
    url = redir_url or li.url

    if not force_reprocess and li.last_checked:
        ims = time.strftime('%a, %d %b %Y %H:%M:%S GMT', time.strptime(li.last_checked, '%Y-%m-%d %H:%M:%S'))
        status,r = self.request_url(url, if_modified_since=ims, etag=li.etag)
        if (status == 304 or
            status == 200 and r.headers.get('content-length') == li.filesize):
            li.update_db()
            debug(2, "not modified: not processing further")
            return 0
    else:
        status,r = self.request_url(url)
    
    if status != 200:
        li.update_db(status=status)
        debug(2, "error status {}", status)
        return 0

    li.etag = r.headers.get('etag')
    li.filesize = r.headers.get('content-length')
    
    if r.url != url: # redirected
        url = self.normalize_url(r.url)
        # now we treat li as if it directly led to the redirected document

    if r.filetype == 'html':
        r.encoding = 'utf-8'
        doc = Webpage(url, html=r.text)
        debug(5, "\n====== %s ======\n%s\n======\n", url, r.text)

        # check for steppingstone pages with link to a paper:
        target_url = check_steppingstone(doc)
        if target_url and recurse < 3:
            debug(2, "steppingstone to {}", target_url)
            return process_link(li, redir_url=target_url, 
                                force_reprocess=force_reprocess, recurse=recurse+1)
        
        # Genuine papers are almost never in HTML format, and
        # almost every HTML page is not a paper. Moreover, the few
        # exceptions (such as entries on SEP) tend to require
        # unusual parsing. Hence the following special
        # treatment. If people start posting articles on medium or
        # in plain HTML, we might return to the old procedure of
        # converting the page to pdf and treating it like any
        # candidate paper.
        import parser.html
        if not parser.html.parse(doc, debug_level=debug_level):
            debug(2, "no metadata extracted: page ignored")
            li.update_db(status=1)
            return 0

    elif r.filetype not in ('pdf', 'doc', 'rtf'):
        li.update_db(status=error.code['unsupported filetype'])
        return debug(2, "unsupported filetype: {}", r.filetype)

    else:
        doc = r
        doc.anchortext = li.anchortext
        doc.source = li.source

        # save document and convert to pdf:
        doc.tempfile = self.save_local(r)
        if not doc.tempfile:
            return li.update_db(status=error.code['cannot save local file'])
        if r.filetype != 'pdf':
            doc.tempfile = self.convert_to_pdf(doc.tempfile)
            if not doc.tempfile:
                return li.update_db(status=error.code['pdf conversion failed'])

        # extract metadata:
        import parser.pdf
        if not parser.pdf.parse(doc, debug_level=debug_level):
            logger.warning("metadata extraction failed for {}", url)
            li.update_db(status=error.code['parser error'])
            return 0

        # estimate spamminess:
        import spamfilter.pdf 
        doc.spamminess = spamfilter.pdf.evaluate(doc)
        if doc.spamminess > MAX_SPAMMINESS:
            li.update_db(status=1)
            debug(1, "spam: score {} > {}", doc.spamminess, self.MAX_SPAMMINESS)
            return 0

    if li.doc_id:
        # checking for revisions
        olddoc = Doc(li.doc_id)
        olddoc.load_from_db()
        if doc.content != olddoc.content:
            sm = SequenceMatcher(None, doc.content, olddoc.content)
            match_ratio = sm.ratio()
            if match_ratio < 0.8:
                debug(1, "substantive revisions, ratio {}", match_ratio)
                doc.earlier_id = olddoc.doc_id
        if not doc.earlier_id:
            li.update_db(status=1)
            debug(1, "no substantive revisions")
            return 0
    
    else:
        # check for duplicates:
        dupe = get_duplicate(doc)
        if dupe:
            debug(1, "duplicate of document {}", dupe.doc_id)
            li.update_db(status=1, doc_id=dupe.doc_id)
            return 0
    
        # don't show old papers in news feed:
        if document_is_old(doc):
            debug(2, "paper is old: setting found_date to 1970")
            doc.found_date = '1970-01-01 12:00:00'

        # don't show papers (incl HTML pages) from newly added source
        # pages in news feed:
        if source.status == 0:
            debug(2, "new source page: setting found_date to 1970")
            doc.found_date = '1970-01-01 12:00:00'
        
    doc_id = doc.add_to_db()
    li.update_db(status=1, doc_id=doc_id)

    def check_steppingstone(self, page):
        debug(2, "checking: intermediate page leading to article?")

        # steppingstone pages from known repositories:
        redir_patterns = {
            # arxiv.org, springer.com, researchgate, etc.:
            '<meta name="citation_pdf_url" content="(.+?)"': '*',
            # philpapers.org:
            'class=\'outLink\' href="http://philpapers.org/go.pl[^"]+u=(http.+?)"': '*', 
            # philsci-archive.pitt.edu:
            '<meta name="eprints.document_url" content="(.+?)"': '*',
            # sciencedirect.com:
            'pdfurl="(.+?)"': '*',
            # PLOSOne:
            '(http://www.plosone.org/article/.+?representation=PDF)" id="downloadPdf"': '*',
            # Google Drive:
            'content="https://drive.google.com/file/d/(.+?)/': 'https://googledrive.com/host/*'
        }
        for pat, target in redir_patterns.items():
            m = re.search(pat, page.source)
            if m:
                target = target.replace('*', m.group(1))
                target = self.normalize_url(page.make_absolute(target))
                if target == page.url:
                    return None
                debug(2, "repository page for {}", target)
                return target
    
        # other steppingstone pages must have link(s) to a single pdf file:
        targets = list(set(u for u in page.xpath('//a/@href') if re.search(r'\.pdf$', u, re.I)))
        if len(targets) != 1:
            debug(4, "no: {} links to pdf files", len(targets))
            return None
        debug(4, "looks good: single link to pdf file {}", targets[0])
        target = self.normalize_url(page.make_absolute(targets[0]))
        return target
Example no. 30

if __name__ == '__main__':
    args = parse_args()
    config = configparser.ConfigParser()
    config.read(args.config)

    # load URL list file
    urls = []
    with open(args.input_path) as f:
        for line in f:
            url = line.strip()
            if url == '':
                continue
            urls.append(url)

    # parse web pages
    webpages = []
    for url in urls:
        webpages.append(Webpage.factory(url))
        time.sleep(CRAWL_TIME_INTERVAL)

    trello = Trello(config['trello']['api_key'],
                    config['trello']['api_token'],
                    config['trello']['t_list_id']
                    )

    # put data into Trello
    for q in webpages:
        trello.create_card(q.title, q.url)
Example no. 31

import csv
from pathlib import Path
from selenium import webdriver
from webpage import Webpage

with open(Path("webpage-info-scraper/webpage_urls.txt"), "r") as webpage_urls:
    with open(Path("webpage-info-scraper/webpage_data.csv"),
              "w") as webpage_data:
        writer = csv.writer(webpage_data, delimiter=",")
        web_driver = webdriver.Firefox()
        webpages = []
        """Scrape information from webpages using URLs stored in CSV file"""
        for url in webpage_urls:
            webpage = Webpage(url, web_driver)
            webpages.append(webpage)
            webpage.open()
            webpage.org_name = webpage.find_element_by_xpath("//h1").text
            print(webpage.org_name)

            try:
                contact_email_element = webpage.find_element_by_xpath(
                    "//span[text()='Contact Email']")
                div_text = contact_email_element.find_element_by_xpath(
                    '..').text
                webpage.email_address = remove_words_from_text(
                    div_text, ['Contact', 'Email', 'E:'])
                print(webpage.email_address)
            except Exception:
                webpage.email_address = ""
Example no. 32
def test_pagination_none():
    """ Test that webpage.pagination_html() returns None when no pagination needed
    """

    webpage = Webpage(TEST_WEBSITE)
    assert webpage.pagination_html() is None
Example no. 33
def test_utf8(caplog):
    pagename = 'philpapers-rec.html'
    url = 'https://blah.org'
    page = Webpage(url, html=source(pagename))
    assert 'Analytic' in page.text()