def test_webpage_from_single_article():
    """ Test creating an article item page using item_from_md_filename() """

    webpage = Webpage(TEST_WEBSITE)
    webpage.item_from_md_filename('001-basic-article.md')

    # Page title should be "Article title - Website name"
    title = 'This is the heading - Test website name'
    assert webpage.title == title
    assert webpage.html.count('<title>' + title + '</title>') == 1

    # Page should use article page template
    assert '<p>Article page template</p>' in webpage.html

    # List page header should be present
    assert webpage.html.count('<div>list page header</div>') == 1

    # Webpage should contain the text from the article
    assert webpage.html.count('<p>And here is some text...</p>') == 1

    # Article item footer should be present
    assert webpage.html.count('<footer class="article-footer">') == 1

    # Filename for webpage should be based on the article
    article = Item(TEST_WEBSITE)
    article.from_md_filename('001-basic-article.md')
    assert webpage.filename == article.filename

    # Body should have class='magnetizer-article-page'
    assert webpage.html.count("<body class='magnetizer-article-page'>") == 1

    # Twitter card should be present
    assert '<meta name="twitter:card" content="summary_large_image" />' in webpage.html

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in webpage.html

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in webpage.html

    # Includes should be included, as per the .md file
    assert webpage.html.count("<div class='include'>Include 1</div>") == 2
    assert webpage.html.count("<div class='include'>Include 2</div>") == 1
    assert webpage.html.count("<div class='include'>Include 3</div>") == 1
    assert "[ ERROR: Include 'inexistent_file.html' does not exist! ]" in webpage.html

    # No html comments should be left in page
    assert '<!--' not in webpage.html

    # Meta description should be pulled in from article
    assert '<meta name="description" content="Meta description from article">' in webpage.html

    # Footnote link should have been added
    assert "<a href='#1'>[1]</a>" in webpage.html

    # Footnote anchor should have been added
    assert "<a id='1'></a>[1]:" in webpage.html
def crawl(self):
    # start crawling
    #myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url[1])
        #print ("%s, %s") % (-1 * work_url[0], work_url[1])
        #page = urllib2.urlopen(work_url)
        '''page = myopener.open(work_url)
        self.pagesCount += 1
        soup = BeautifulSoup(page)
        links = soup.find_all('a')'''
        page = Webpage(work_url, self.pagesCount)
        if len(page.text) > 0:
            page_score = self.scorer.calculate_score(page.text)
        else:
            page_score = 0
        self.pagesCount += 1
        if (page_score > self.pageScoreThreshold):
            page.getUrls()
            self.relevantPagesCount += 1
            self.relevantPages.append(page)
            self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
            print(("%s," + str(page_score) + ", %s") % (-1 * work_url[0], work_url[1]))
            for link in page.outgoingUrls:
                url = link.address
                if url != None and url != '':
                    if url.find('?') != -1:
                        url = url.split('?')[0]
                    if url.find('#') != -1:
                        url = url.split('#')[0]
                    # if url.startswith('http') == False:
                    #     parts = page.pageUrl[1].split("://")
                    #     baseUrl = parts[1].split("/")[0]
                    #     baseUrl = parts[0] + "://" + baseUrl
                    #     url = baseUrl + url
                    #if not self.existsInVisited(url, self.visited):
                    if url not in self.visited:
                        #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                        if url.startswith('http') and not self.exists(url, self.priorityQueue.queue):
                            url_score = self.scorer.calculate_score(link.getAllText())
                            self.totalPagesCount += 1
                            #tot_score = (page_score + url_score) / 2.0
                            #tot_score = page_score + url_score
                            tot_score = url_score
                            if tot_score > self.urlScoreThreshold:
                                #self.priorityQueue.push(((-1 * url_score), url))
                                self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
def test_mongin(caplog):
    pagename = 'mongin.html'
    url = 'https://studies2.hec.fr/jahia/Jahia/cache/offonce/lang/en/mongin/pid/1072'
    page = Webpage(url, html=source(pagename))
    svars = page.session_variables()
    assert 'jsessionid' in svars
    testurl = 'https://studies2.hec.fr/jahia/webdav/site/hec/shared/sites/mongin/foo.pdf;jsessionid=123456'
    stripped = page.strip_session_variables(testurl)
    assert stripped == 'https://studies2.hec.fr/jahia/webdav/site/hec/shared/sites/mongin/foo.pdf;'
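# The asserts above pin down what strip_session_variables() must do: only the
# 'name=value' part of a known session variable is removed, while the delimiter
# before it stays. A minimal sketch of such a helper, assuming a regex-based
# approach and a hypothetical KNOWN_SESSION_VARIABLES list (the real Webpage
# class may instead detect the variable names from the page itself):
import re

KNOWN_SESSION_VARIABLES = ['jsessionid', 'PHPSESSID', 'sid']  # hypothetical list

def strip_session_variables(url, svars=KNOWN_SESSION_VARIABLES):
    # Drop 'var=value' for each known session variable, keeping the separator,
    # so '.../foo.pdf;jsessionid=123456' becomes '.../foo.pdf;'.
    for var in svars:
        url = re.sub(var + r'=[^;&?]*', '', url)
    return url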
def crawl(self):
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        page = Webpage(work_url, self.pagesCount)
        if page.text == '':
            continue
        page_score = 0.0
        if self.combineScore:
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text, 'W')
            else:
                continue
        page.estimatedScore = page_score
        if self.restricted:
            if page_score < self.pageScoreThreshold:
                continue
        #print -1 * work_url[0], ",", str(page_score), ",", work_url[1], ",", work_url[3]
        print -1 * work_url[0], ",", work_url[1], ",", work_url[3]
        self.pagesCount += 1
        page.getUrls()
        self.relevantPages.append(page)
        for link in page.outgoingUrls:
            url = link.address
            if url != None and url != '':
                url = url.strip()
                if url.find('?') != -1:
                    url = url.split('?')[0]
                if url.find('#') != -1:
                    url = url.split('#')[0]
                if url.endswith(("comment", "comment/", "feed", "comments", "feed/", "comments/",
                                 ".rss", "video", "video/", "link", "gif", "jpeg", "mp4", "wav",
                                 "jpg", "mp3", "png", "share.php", "sharer.php", "login.php",
                                 "print", "print/", "button/", "share", "email", "submit",
                                 "post", ".pdf")):
                    continue
                if not self.exists(url, 1):
                    #tot_score = 0.0
                    if url.startswith('http') and not self.exists(url, 2):
                        if self.mode == 1:
                            url_score = self.scorer.calculate_score(link.getAllText(), 'U')
                        if self.combineScore:
                            tot_score = 0.5 * page_score + 0.5 * url_score
                        else:
                            tot_score = url_score
                        #if tot_score >= self.urlScoreThreshold:
                        self.priorityQueue.push(((-1 * tot_score), url, page.pageId, link.getAllText()))
                        #else:
                        #    self.priorityQueue.push(((-1 * page_score), url, page.pageId, link.getAllText()))
            #else:
            #    self.pages.append((page, 0))
    print self.priorityQueue.isempty()
def test_pagination_previous_only():
    """ Test that webpage.pagination_html() returns previous page correctly
        when no next page """

    webpage = Webpage(TEST_WEBSITE)
    webpage.url_previous = 'page-1.html'

    result = '<nav class="magnetizer-pagination"><ul>'
    result += '<li><a href="page-1.html" class="magnetizer-previous">Newer posts</a></li>'
    result += '</ul></nav>'

    assert webpage.pagination_html() == result
def crawl(self):
    # start crawling
    #myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        page = Webpage(work_url, self.pagesCount)
        if len(page.text) > 0:
            page_score = self.scorer.calculate_score(page.text)
        else:
            page_score = 0
        self.pagesCount += 1
        if (page_score > self.pageScoreThreshold):
            page.getUrls()
            self.relevantPagesCount += 1
            self.relevantPages.append(page)
            self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
            print(("%s, %s") % (-1 * work_url[0], work_url[1]))
            for link in page.outgoingUrls:
                url = link.address
                if url != None and url != '':
                    #if url.find('?') != -1:
                    #    url = url.split('?')[0]
                    if url.startswith("/"):
                        base = page.pageUrl[1][7:].split("/")[0]
                        url = "http://" + base + url
                    if not self.exists(url, self.visited):
                        if url.startswith('http') and url.find('#') == -1 and not self.priorityQueue.exists(url):  #self.exists(url, self.priorityQueue.queue):
                            url_score = self.scorer.calculate_score(link.getAllText())
                            self.totalPagesCount += 1
                            #tot_score = (page_score + url_score) / 2.0
                            #tot_score = page_score + url_score
                            tot_score = url_score
                            if tot_score > self.urlScoreThreshold:
                                #self.priorityQueue.push(((-1 * url_score), url))
                                self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
                                #self.relevantPagesCount += 1
        self.priorityQueue.next()
def test_pagination_previous_and_next():
    """ Test that webpage.pagination_html() returns next and previous pages
        correctly when both are available """

    webpage = Webpage(TEST_WEBSITE)
    webpage.url_previous = 'page-3.html'
    webpage.url_next = 'page-5.html'

    result = '<nav class="magnetizer-pagination"><ul>'
    result += '<li><a href="page-3.html" class="magnetizer-previous">Newer posts</a></li>'
    result += '<li><a href="page-5.html" class="magnetizer-next">Older posts</a></li>'
    result += '</ul></nav>'

    assert webpage.pagination_html() == result
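# The pagination tests in this file fully specify the expected markup. Below is
# a minimal sketch of a pagination_html() consistent with them; the
# url_previous/url_next attribute names come from the tests, everything else is
# an assumption about how the real Webpage method assembles the HTML.
def pagination_html(self):
    # No pagination block at all when neither neighbour page exists
    if not (self.url_previous or self.url_next):
        return None
    html = '<nav class="magnetizer-pagination"><ul>'
    if self.url_previous:
        html += ('<li><a href="%s" class="magnetizer-previous">Newer posts</a></li>'
                 % self.url_previous)
    if self.url_next:
        html += ('<li><a href="%s" class="magnetizer-next">Older posts</a></li>'
                 % self.url_next)
    html += '</ul></nav>'
    return html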
def crawl(self):
    hitPageLimit = False
    while self.pageCount() < self.pageLimit and not self.priorityQueue.isempty():
        url_priority_obj = self.priorityQueue.pop()
        priority = url_priority_obj[0]
        url = url_priority_obj[1]
        if url not in self.visited:
            self.visited.append(url)
            print "Crawling page #{} {}".format(self.pageCount(), url_priority_obj)
            page = Webpage(url_priority_obj[1])
            page_score = page.score(self.scorer)
            if (page_score > self.relevantThreshold):
                self.relevantPages.append(url_priority_obj)
                print "Relevant page found. Score ({}) URL ({})".format(page_score, url)
            else:
                print "Irrelevant page found. Score ({}) URL ({})".format(page_score, url)
            linked_url_count = 0
            for linked_url in page.outgoingUrls:
                if linked_url != None and linked_url != '':
                    if not self.is_blacklisted(linked_url):
                        # if linked_url.find('?') != -1:
                        #     linked_url = linked_url.split('?')[0]
                        if linked_url not in self.visited and linked_url not in self.irrelevantUrls:
                            if linked_url.startswith('http') and not self.exists(linked_url, self.priorityQueue.queue):
                                linked_url_count += 1
                                print "Checking link #{} {}".format(linked_url_count, linked_url)
                                linked_page = Webpage(linked_url)
                                if hasattr(linked_page, "text"):
                                    # webpage was parseable
                                    linked_url_score = linked_page.score(self.scorer)
                                    self.totalPagesCount += 1
                                    link_weight = 2
                                    page_weight = 1
                                    tot_score = ((page_weight * page_score) + (link_weight * linked_url_score)) / 2.0
                                    if tot_score >= self.relevantThreshold:
                                        print "Relevant link found. Score ({}) URL ({})".format(tot_score, linked_url)
                                        self.priorityQueue.push(((-1 * tot_score), linked_url))
                                    else:
                                        self.irrelevantUrls.append(linked_url)
                if self.linkLimit > 0 and linked_url_count >= self.linkLimit:
                    print "Done crawling page. Reached linkLimit."
                    break
    if self.pageCount() >= self.pageLimit:
        print "Done crawling. Reached pageLimit."
    elif self.priorityQueue.isempty():
        print "Done crawling. No more pages to crawl."
def __init__(self, website):
    self.website = website
    filenames = Webpage.filenames_from_directory(self.website.config.value('source_path'))
    self.feed_data = self.feed(filenames)
def crawl(self):
    # start crawling
    #myopener = MyOpener()
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        print(("%s, %s") % (-1 * work_url[0], work_url[1]))
        #page = urllib2.urlopen(work_url)
        '''page = myopener.open(work_url)
        self.pagesCount += 1
        soup = BeautifulSoup(page)
        links = soup.find_all('a')'''
        page = Webpage(work_url[1])
        self.pagesCount += 1
        for link in page.outgoingUrls:
            url = link.address
            if url != None and url != '':
                if url.find('?') != -1:
                    url = url.split('?')[0]
                if not self.exists(url, self.visited):
                    if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                        url_score = self.scorer.calculate_score(link.getAllText())
                        self.totalPagesCount += 1
                        if url_score > 0.1:
                            self.priorityQueue.push(((-1 * url_score), url))
                            self.relevantPagesCount += 1
def getUrlTexts(urlList):
    """ Lazily yields the text of each url """
    for url in urlList:
        page = Webpage(url)
        #data = tokenizeDocText(page.text)
        yield page.text
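# Because getUrlTexts() yields rather than building a list, each Webpage is
# only fetched when the consumer asks for the next item. Illustrative use
# (the URLs are placeholders):
for text in getUrlTexts(['http://example.com/a', 'http://example.com/b']):
    print(len(text))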
def ensure_game_is_active(sleep_time):
    """
    Pauses the run while the game window isn't active.
    :param sleep_time: time to sleep between polling activeness of window
    """
    while not Webpage.is_active():
        sleep(sleep_time)
def test_static_item_page():
    """ Test creating a static item page using item_from_md_filename() """

    webpage = Webpage(TEST_WEBSITE)
    webpage.item_from_md_filename('dont-show-on-list-page.md')

    # Page title should be "Article title - Website name"
    title = 'This post should not be in the index - Test website name'
    assert webpage.title == title
    assert webpage.html.count('<title>' + title + '</title>') == 1

    # Page should use static page template
    assert '<p>Static page template</p>' in webpage.html

    # List page header should NOT be present
    assert webpage.html.count('<div>list page header</div>') == 0

    # Webpage should contain the text from the article
    assert webpage.html.count("<p>That's why it doesn't start with") == 1

    # Article footer should NOT be present
    assert webpage.html.count('<footer>footer</footer>') == 0

    # Filename for webpage should be based on the article
    article = Item(TEST_WEBSITE)
    article.from_md_filename('dont-show-on-list-page.md')
    assert webpage.filename == article.filename

    # Body should have class='magnetizer-static-page'
    assert webpage.html.count("<body class='magnetizer-static-page'>") == 1

    # Twitter card should be present
    assert '<meta name="twitter:card" content="summary_large_image" />' in webpage.html

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in webpage.html

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in webpage.html

    # No html comments should be left in page
    assert '<!--' not in webpage.html
def test_includes():
    """ Test of webpage.includes() """

    webpage = Webpage(TEST_WEBSITE)

    webpage.html = '<h1>Some html</h1>'
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include1.html -->"
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include2.html -->"
    webpage.html += '<div>More html...</div>'
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include3.html -->"
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include1.html -->"

    correct_includes = ['_include1.html', '_include2.html', '_include3.html']
    includes = webpage.includes()

    # Set should contain each include from the html
    for correct_include in correct_includes:
        assert correct_include in includes

    assert len(includes) == len(correct_includes)
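# A minimal sketch of an includes() that would satisfy the test above: collect
# the unique filenames named in MAGNETIZER_INCLUDE comments. The comment format
# is taken from the test itself; the regex and the use of a set are assumptions
# about the real implementation.
import re

def includes(self):
    # Duplicate references (e.g. _include1.html twice) collapse to one entry
    return set(re.findall(r'<!-- MAGNETIZER_INCLUDE (\S+) -->', self.html))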
def test_webpage_write_multiple_from_filenames():
    """ Test of write_item_pages_from_md_filenames() """

    TEST_WEBSITE.wipe()

    filenames = ['001-basic-article.md', '002-article-with-h1-break-and-date.md',
                 '003-another-article.md', '100-ignore-this.txt',
                 'dont-show-on-list-page.md', '009-unindexed-article.md']

    Webpage.write_item_pages_from_md_filenames(TEST_WEBSITE, filenames)

    written_filenames = listdir(TEST_WEBSITE.config.value('output_path'))

    # All the normal articles should have been written
    assert 'basic-article.html' in written_filenames
    assert 'article-with-h1-break-and-date.html' in written_filenames
    assert 'another-article.html' in written_filenames
    assert 'unindexed-article.html' in written_filenames

    # The static pages should have been written too
    assert 'dont-show-on-list-page.html' in written_filenames

    # The file not ending in .md should not have been written
    assert 'ignore-this.html' not in written_filenames
    assert '100-ignore-this.txt' not in written_filenames

    # The written files should be included in the sitemap...
    assert 'https://example.com/basic-article.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/article-with-h1-break-and-date.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/another-article.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/dont-show-on-list-page.html' in TEST_WEBSITE.sitemap.pages

    # ... except for the unindexed article
    assert 'https://example.com/unindexed-article.html' not in TEST_WEBSITE.sitemap.pages

    # Ignored files should not be included in the sitemap
    assert 'https://example.com/ignore-this.html' not in TEST_WEBSITE.sitemap.pages

    TEST_WEBSITE.wipe()
def __init__(self, seedUrls):
    self.exclude_words = ['ads', 'print', 'advertisement']
    self.seedUrls = seedUrls
    super(TFIDF_Scorer, self).__init__(None)
    self.seedPages = []
    self.avgdl = 0
    for url in self.seedUrls:
        page = Webpage(url)
        data = self.cleanDoc(page.text)
        self.seedPages.append(data)
        self.avgdl += len(data)
    self.buildModel(self.seedPages)
def test_page_indexability():
    """ Test to make sure indexability carries through from item to webpage """

    webpage_index = Webpage(TEST_WEBSITE)
    webpage_index.item_from_md_filename('001-basic-article.md')

    webpage_dont_index = Webpage(TEST_WEBSITE)
    webpage_dont_index.item_from_md_filename('009-unindexed-article.md')

    # Don't include noindex tag for article page that SHOULD be indexed
    assert '<meta name="robots" content="noindex">' not in webpage_index.html

    # Include noindex tag for article page that should NOT be indexed
    assert '<meta name="robots" content="noindex">' in webpage_dont_index.html
def main():
    sleep_time = 0.7
    screen_size = Size(316, 280, 600, 480)
    cap = CaptureImage(screen_size)
    Webpage.open_url()
    cap.capture()
    board = Board((17, 18, 500, 500))
    for i in xrange(1000):
        ensure_game_is_active(sleep_time)
        pyautogui.moveTo(316 + 17 + 34, 280 + 389)
        cap.capture()
        board.read_board_from_screen()
        board.accumulate_neighbors()
        x, y = board.get_x_y_for_shot()
        if x > 900 or y > 900:
            continue
        try:
            pyautogui.moveTo(x, y)
            pyautogui.click()
        except (WindowsError, ValueError) as e:
            pass  # Ignoring pyautogui exceptions
        sleep(sleep_time)
    Webpage.close()
def main(models=None):
    models = [] if models is None else models.split(',')
    models.insert(0, 'original')
    print("Generate results -> " + str(models))

    url = input("Enter a URL\n")
    layout_type = input("Select a layout type (1-7)\n")
    webpage = Webpage(url, layout_type)

    # Take a screenshot
    screenshot = webpage.get_screenshot('screen-pc.png', 1280, 800)
    # Save the HTML
    webpage.save_html()

    for model in models:
        print(model)

        # Prepare the CSV files
        csv_tags = Csv('./working/tag_list_' + model + '.csv')
        csv_tags_custom = Csv('./working/tag_list_custom_' + model + '.csv')
        default_row = ['class or id', 'tag_name', 'start_x', 'start_y', 'size_w', 'size_h',
                       'average_color', 'salient_level', 'element_area']
        csv_tags.writerow(default_row)
        csv_tags_custom.writerow(default_row)

        # Half-size saliency map
        if model == 'original':
            # Get the saliency map
            saliency_map = webpage.get_saliency_map(screenshot)
            resize_saliency_map = Image(saliency_map).get_halfsize()
        elif model == 'original-mlnet':
            saliency_map = Image(cv2.imread('./data/mlnet.png', 1))
            resize_saliency_map = saliency_map.get_trimming((1280, 726))
            resize_saliency_map = cv2.cvtColor(resize_saliency_map, cv2.COLOR_BGR2GRAY)
        else:
            saliency_map = Image(cv2.imread('./data/' + model + '.png', 1))
            resize_saliency_map = saliency_map.get_trimming((1280, 726))
            resize_saliency_map = cv2.cvtColor(resize_saliency_map, cv2.COLOR_BGR2GRAY)

        # Get the size and saliency of each element
        Element.canvas = Image(resize_saliency_map)
        Element.layout_type = webpage.layout_type
        Element.model = model
        print('Saliency of the whole webpage: ' + str(Element.GetTotalSaliency()))
        GetElementInfo(webpage, csv_tags, csv_tags_custom)

        # Close the CSVs and the WebDriver
        csv_tags.close()
        csv_tags_custom.close()
        CreateRegionMap(model)

    webpage.driver.quit()
    getFinalLine()
    getFinalTile()
def main(): """ Main method to trigger generation of all pages """ if len(argv) == 3 and argv[1] == '-config': config_filename = argv[2] else: config_filename = '../config/magnetizer.cfg' print('Using config ' + config_filename + '...') website = Website(config_filename) website.wipe() Webpage.write_list_pages_from_directory( website, website.config.value('source_path')) Webpage.write_item_pages_from_directory( website, website.config.value('source_path')) website.copy_resources() website.sitemap.write(website.config.value('output_path')) atom = Atom(website) atom.write()
def crawl(self):
    # start crawling
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        page = Webpage(work_url[1])
        page_score = self.scorer.calculate_score(page.text)
        if (page_score > self.threshold):
            self.relevantPagesCount += 1
        print ("%s, %s") % (-1 * work_url[0], work_url[1])
        self.pagesCount += 1
        for link in page.outgoingUrls:
            url = link.address
            if url != None and url != '':
                if url.find('?') != -1:
                    url = url.split('?')[0]
                if not self.exists(url, self.visited):
                    if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                        url_score = self.scorer.calculate_score(link.getAllText())
                        self.totalPagesCount += 1
                        tot_score = (page_score + url_score) / 2.0
                        if tot_score > self.threshold:
                            self.priorityQueue.push(((-1 * tot_score), url))
def test_webpage_write():
    """ Test of webpage.write() """

    result = "This is a test!"

    webpage = Webpage(TEST_WEBSITE)
    webpage.html = result
    webpage.filename = 'my-post.html'
    webpage.write()

    # File should have the correct contents
    with open(TEST_WEBSITE.config.value('output_path') + webpage.filename, 'r') as myfile:
        assert myfile.read() == result

    # Page should be included in sitemap
    assert 'https://example.com/my-post.html' in TEST_WEBSITE.sitemap.pages

    TEST_WEBSITE.wipe()
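# A minimal sketch of a write() consistent with the test above: dump the
# rendered html to output_path/filename and register the page URL in the
# sitemap. The config keys, the hypothetical 'website_base_url' setting and
# the sitemap.append() call are assumptions based on how the tests use these
# objects, not the project's actual API.
def write(self):
    with open(self.website.config.value('output_path') + self.filename, 'w') as outfile:
        outfile.write(self.html)
    self.website.sitemap.append(self.website.config.value('website_base_url') + self.filename)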
def test_single_list_page():
    """ Test when there is just one listing page, so no pagination etc """

    TEST_WEBSITE.wipe()
    _clean_up_test_items_md()
    _generate_test_items_md(4)
    _generate_non_indexable_test_items_md()

    Webpage.write_list_pages_from_directory(TEST_WEBSITE, TEST_WEBSITE.config.value('source_path'))

    # There should be an index.html but no blog-n.html files
    assert path.isfile(TEST_WEBSITE.config.value('output_path') + 'index.html')
    assert not path.isfile(TEST_WEBSITE.config.value('output_path') + 'blog-1.html')
    assert not path.isfile(TEST_WEBSITE.config.value('output_path') + 'blog-2.html')

    with open(TEST_WEBSITE.config.value('output_path') + 'index.html', 'r') as myfile:
        blog_1_content = myfile.read()

    assert blog_1_content.count('<article>') == 4
    assert 'Article 4.' in blog_1_content
    assert 'Article 3.' in blog_1_content
    assert 'Article 2.' in blog_1_content
    assert 'Article 1.' in blog_1_content

    # Page should use listing page template
    assert '<p>Listing page template</p>' in blog_1_content

    # Index title = "Website Name - Page 1"
    assert 'Test website name - test tag & line' in blog_1_content

    # Don't show article footers on list page
    assert '<footer>footer</footer>' not in blog_1_content

    # Body should have class='magnetizer-listing-page'
    assert "<body class='magnetizer-listing-page'>" in blog_1_content

    # Twitter card should *not* be present
    assert '<meta name="twitter:card" content="summary" />' not in blog_1_content

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in blog_1_content

    # No links previous/next page should be present
    assert 'class="magnetizer-pagination"' not in blog_1_content
    assert 'class="magnetizer-previous"' not in blog_1_content
    assert 'class="magnetizer-next"' not in blog_1_content

    # The index page should be present in the sitemap
    assert 'https://example.com/' in TEST_WEBSITE.sitemap.pages

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in blog_1_content

    # Meta description from config file should be present
    assert '<meta name="description" content="Meta \\"description\\" from cfg">' in blog_1_content
def test_three_paginated_list_pages():
    """ Test 3 listing pages, with pagination """

    TEST_WEBSITE.wipe()
    _clean_up_test_items_md()
    _generate_test_items_md(10)

    Webpage.write_list_pages_from_directory(TEST_WEBSITE, TEST_WEBSITE.config.value('source_path'))

    # There should be an index.html and exactly 2 blog-n.html files
    assert path.isfile(TEST_WEBSITE.config.value('output_path') + 'index.html')
    assert not path.isfile(TEST_WEBSITE.config.value('output_path') + 'blog-1.html')
    assert path.isfile(TEST_WEBSITE.config.value('output_path') + 'blog-2.html')
    assert path.isfile(TEST_WEBSITE.config.value('output_path') + 'blog-3.html')
    assert not path.isfile(TEST_WEBSITE.config.value('output_path') + 'blog-4.html')

    with open(TEST_WEBSITE.config.value('output_path') + 'index.html', 'r') as myfile:
        blog_1_content = myfile.read()

    with open(TEST_WEBSITE.config.value('output_path') + 'blog-2.html', 'r') as myfile:
        blog_2_content = myfile.read()

    with open(TEST_WEBSITE.config.value('output_path') + 'blog-3.html', 'r') as myfile:
        blog_3_content = myfile.read()

    assert blog_1_content.count('<article>') == 4
    assert 'Article 10.' in blog_1_content
    assert 'Article 9.' in blog_1_content
    assert 'Article 8.' in blog_1_content
    assert 'Article 7.' in blog_1_content
    assert '<p>Listing page template</p>' in blog_1_content

    assert blog_2_content.count('<article>') == 4
    assert 'Article 6.' in blog_2_content
    assert 'Article 5.' in blog_2_content
    assert 'Article 4.' in blog_2_content
    assert 'Article 3.' in blog_2_content
    assert '<p>Listing page template</p>' in blog_2_content

    assert blog_3_content.count('<article>') == 2
    assert 'Article 2.' in blog_3_content
    assert 'Article 1.' in blog_3_content
    assert '<p>Listing page template</p>' in blog_3_content

    # Page title = "Website Name - Page n"
    assert 'Test website name - test tag & line' in blog_1_content
    assert '<title>Test website name - Page 2</title>' in blog_2_content
    assert '<title>Test website name - Page 3</title>' in blog_3_content

    # First page should have link to older posts but not newer
    assert '<a href="blog-2.html" class="magnetizer-next">Older posts</a>' in blog_1_content
    assert 'class="magnetizer-previous"' not in blog_1_content

    # Middle page should have links to both older and newer posts (newer = homepage)
    assert '<a href="blog-3.html" class="magnetizer-next">Older posts</a>' in blog_2_content
    assert '<a href="/" class="magnetizer-previous">Newer posts</a>' in blog_2_content

    # Last page should have link to newer posts but not older
    assert 'class="magnetizer-next"' not in blog_3_content
    assert '<a href="blog-2.html" class="magnetizer-previous">Newer posts</a>' in blog_3_content

    # Pages should have meta description from config
    assert '<meta name="description" content="Meta \\"description\\" from cfg">' in blog_1_content
    assert '<meta name="description" content="Meta \\"description\\" from cfg">' in blog_2_content
    assert '<meta name="description" content="Meta \\"description\\" from cfg">' in blog_3_content

    # index.html and the blog-n pages should be present in the sitemap
    assert 'https://example.com/' in TEST_WEBSITE.sitemap.pages
    assert not 'https://example.com/blog-1.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/blog-2.html' in TEST_WEBSITE.sitemap.pages
    assert 'https://example.com/blog-3.html' in TEST_WEBSITE.sitemap.pages
def enhanced_crawl(self):
    # start crawling
    #myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url[1])
        #print ("%s, %s") % (-1 * work_url[0], work_url[1])
        #page = urllib2.urlopen(work_url)
        '''page = myopener.open(work_url)
        self.pagesCount += 1
        soup = BeautifulSoup(page)
        links = soup.find_all('a')'''
        #print work_url[1]
        try:
            req = urllib2.Request(work_url[1])  # create a request object
            handle = urllib2.urlopen(req)       # and open it to return a handle on the url
        except urllib2.URLError, e:
            # ignore error, URL timed out
            pass
        else:
            html = handle.read()
            soup = BeautifulSoup(html)
            paras = soup.findAll('p')
            #print paras
            text = ""
            for para in paras:
                text = text + " " + para.text
            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_smart_score(text, work_url[1])
            else:
                page_score = 0
            self.pagesCount += 1
            if (page_score > self.pageScoreThreshold):
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
                print ("%s|" + str(page_score) + "|%s") % (-1.0 * work_url[0], work_url[1])
                for link in page.outgoingUrls:
                    url = link.address
                    if url != None and url != '':
                        if url.find('?') != -1:
                            url = url.split('?')[0]
                        if url.find('#') != -1:
                            url = url.split('#')[0]
                        # if url.startswith('http') == False:
                        #     parts = page.pageUrl[1].split("://")
                        #     baseUrl = parts[1].split("/")[0]
                        #     baseUrl = parts[0] + "://" + baseUrl
                        #     url = baseUrl + url
                        #if not self.existsInVisited(url, self.visited):
                        if url not in self.visited:
                            #if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                            if url.startswith('http') and not self.exists(url, self.priorityQueue.queue):
                                url_score = self.url_scorer.calculate_score(link.getAllText())
                                self.totalPagesCount += 1
                                #tot_score = (page_score + url_score) / 2.0
                                #tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    #self.priorityQueue.push(((-1 * url_score), url))
                                    self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
def crawl(self):
    self.harvestRatioData = []
    self.relevantPages = []
    webpages = []
    count = 0
    ftext = open(self.pagesDir + "webpagesTxt.txt", "w")
    webpageLabel = 0  # 0 for Non-relevant and 1 for Relevant
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        url = work_url[1]
        #if self.exists(url, 1):
        #    continue
        if url in self.visited:
            continue
        #self.visited.append(url)  #work_url[1])
        self.visited[url] = 1
        page = Webpage(work_url, self.pagesCount)
        if page.text == '':
            continue
        page.estimatedScore = 0
        if self.combineScore:
            page_score = 0
            if len(page.text) > 0:
                #page_score = self.scorer.calculate_score(page.text, 'W')[1]
                page_score = self.scorer.calculate_score(page, 'W')[1]
                if page_score == -1:
                    continue
            else:
                print 'page text is empty'
                continue
            page.estimatedScore = page_score
        if self.restricted:
            if page_score < self.pageScoreThreshold:
                #self.pagesCount += 1
                continue
        pageDom = getDomain(url)
        if page_score >= self.pageScoreThreshold:
            self.sourcesImp[pageDom][0] += 1
            webpageLabel = 1
        else:
            self.sourcesImp[pageDom][1] += 1
            #self.sourcesImp[pageDom][0] = self.sourcesImp[pageDom][1]
            webpageLabel = 0
        if self.combineScore:
            print page.pageId, ": ", str(page_score), ",", -1 * work_url[0], ",", work_url[1]  #,",", work_url[3]
        else:
            print -1 * work_url[0], ",", work_url[1]  #,",", work_url[3]
        self.pagesCount += 1
        #self.relevantPages.append((page.pageId, page.pageUrl, page.estimatedScore))
        self.relevantPages.append((page.pageId, (page.pageUrl[1], page.pageUrl[2]), page.estimatedScore))
        wbsStr = page.text.replace('\n', '. ').replace('\t', ' ')
        webpages.append(wbsStr)
        count += 1
        # save webpage's text to disk instead of adding to list
        # this will lead to change in evaluation
        if count % self.bufferLen == 0:
            strToWrite = '\n'.join(webpages).encode("utf-8")
            ftext.write(strToWrite)
            webpages = []
        #ftext = open(self.pagesDir + str(page.pageId) + ".txt", "w")
        #ftext.write(page.text.encode("utf-8"))
        #ftext.close()
        #-------
        if page_score < 0.1:
            continue
        page.getUrls()
        for link in page.outgoingUrls:
            url = link.address
            #if url != None and url != '':
            if url:
                url = url.strip()
                if url.find('report-a-typo') != -1:
                    continue
                if url.find('m.tempo.co/') != -1:
                    continue
                if url.find('?') != -1:
                    furl = url.split('?')[1]
                    if furl.startswith('id=') == False or furl.startswith('v=') == False or furl.startswith('tid=') == False:
                        url = url.split('?')[0]
                if url.find('#') != -1:
                    url = url.split('#')[0]
                if url.endswith('/'):
                    url = url[:-1]
                #if url.endswith(("comment","comment/","feed","comments","feed/","comments/",".rss","video","video/","link","gif","jpeg","mp4","wav","jpg","mp3","png","share.php","sharer.php","login.php","print","print/","button/","share","email","submit","post",".pdf")):
                if url.endswith(("comment", "feed", "comments", ".rss", "video", "link", "gif",
                                 "jpeg", "mp4", "wav", "jpg", "mp3", "png", "share.php",
                                 "sharer.php", "login.php", "print", "button", "share", "email",
                                 "submit", "post", ".pdf")):
                    continue
                #if not self.exists(url, 1):
                if url in self.visited:
                    continue
                #tot_score = 0.0
                if url.startswith('http'):  #and not self.exists(url, 2):
                    linkText = link.getAllText()
                    #if self.mode == 1:
                    #url_score = self.scorer.calculate_score(linkText, 'U')
                    url_score = self.scorer.calculate_score(link, 'U')
                    tot_score = url_score
                    if self.combineScore:
                        #tot_score = 0.4 * page_score + 0.6 * url_score
                        tot_score = page_score * url_score
                        if tot_score < self.urlScoreThreshold:
                            continue
                        urlDom = getDomain(url)
                        si_score = self.sourcesImp[urlDom][0] / self.sourcesImp[urlDom][1]
                        if self.siScoreCombineMethod == 1:
                            if webpageLabel:
                                tot_score = tot_score * si_score
                        elif self.siScoreCombineMethod == 2:
                            tot_score = self.topicWeight * tot_score + self.siWeight * si_score
                        #tot_score = tot_score * si_score
                    #else:
                    #    tot_score = url_score
                    #if tot_score >= self.urlScoreThreshold:
                    #print tot_score, '-', url, linkText
                    if self.restricted:
                        if tot_score < self.urlScoreThreshold:
                            continue
                    if tot_score >= self.urlScoreThreshold:
                        self.priorityQueue.push(((-1 * tot_score), url, page.pageId))  #, linkText))
                    #else:
                    #    self.priorityQueue.push(((-1 * page_score), url, page.pageId, link.getAllText()))
        #else:
        #    self.pages.append((page, 0))
    print self.priorityQueue.isempty()
    if webpages:
        strToWrite = '\n'.join(webpages).encode("utf-8")
        ftext.write(strToWrite)
    ftext.close()
    return self.priorityQueue.queue
def process_link(self, li, force_reprocess=False, redir_url=None, recurse=0):
    """
    fetch url, check for http errors and steppingstones, filter spam,
    save local file, convert to xml, add source_url etc. to xml, run
    Extractor on xml file, compute spam score, check for duplicate,
    check if published before last year.

    Links often lead to intermediate pages (e.g. on repositories) with
    another link to the actual paper. In this case, we only store the
    original link in the 'links' table, so the 'doc' entry has a url
    that doesn't match any link. To process the new link, process_link
    is called again, with redir_url set to the new url and recurse += 1.
    """
    # fetch url and handle errors, redirects, etc.:
    time.sleep(2)  # be gentle on servers
    url = redir_url or li.url
    if not force_reprocess and li.last_checked:
        ims = time.strftime('%a, %d %b %Y %H:%M:%S GMT',
                            time.strptime(li.last_checked, '%Y-%m-%d %H:%M:%S'))
        status, r = self.request_url(url, if_modified_since=ims, etag=li.etag)
        if (status == 304 or
                status == 200 and r.headers.get('content-length') == li.filesize):
            li.update_db()
            debug(2, "not modified: not processing further")
            return 0
    else:
        status, r = self.request_url(url)
    if status != 200:
        li.update_db(status=status)
        debug(2, "error status {}", status)
        return 0
    li.etag = r.headers.get('etag')
    li.filesize = r.headers.get('content-length')
    if r.url != url:
        # redirected
        url = self.normalize_url(r.url)
        # now we treat li as if it directly led to the redirected document
    if r.filetype == 'html':
        r.encoding = 'utf-8'
        doc = Webpage(url, html=r.text)
        debug(5, "\n====== %s ======\n%s\n======\n", url, r.text)
        # check for steppingstone pages with link to a paper:
        target_url = self.check_steppingstone(doc)
        if target_url and recurse < 3:
            debug(2, "steppingstone to {}", target_url)
            return self.process_link(li, redir_url=target_url,
                                     force_reprocess=force_reprocess, recurse=recurse + 1)
        # Genuine papers are almost never in HTML format, and
        # almost every HTML page is not a paper. Moreover, the few
        # exceptions (such as entries on SEP) tend to require
        # unusual parsing. Hence the following special
        # treatment. If people start posting articles on medium or
        # in plain HTML, we might return to the old procedure of
        # converting the page to pdf and treating it like any
        # candidate paper.
        import parser.html
        if not parser.html.parse(doc, debug_level=debug_level):
            debug(2, "no metadata extracted: page ignored")
            li.update_db(status=1)
            return 0
    elif r.filetype not in ('pdf', 'doc', 'rtf'):
        li.update_db(status=error.code['unsupported filetype'])
        return debug(2, "unsupported filetype: {}", r.filetype)
    else:
        doc = r
        doc.anchortext = li.anchortext
        doc.source = li.source
        # save document and convert to pdf:
        doc.tempfile = self.save_local(r)
        if not doc.tempfile:
            return li.update_db(status=error.code['cannot save local file'])
        if r.filetype != 'pdf':
            doc.tempfile = self.convert_to_pdf(doc.tempfile)
            if not doc.tempfile:
                return li.update_db(status=error.code['pdf conversion failed'])
        # extract metadata:
        import parser.pdf
        if not parser.pdf.parse(doc, debug_level=debug_level):
            logger.warning("metadata extraction failed for {}", url)
            li.update_db(status=error.code['parser error'])
            return 0
        # estimate spamminess:
        import spamfilter.pdf
        doc.spamminess = spamfilter.pdf.evaluate(doc)
        if doc.spamminess > self.MAX_SPAMMINESS:
            li.update_db(status=1)
            debug(1, "spam: score {} > {}", doc.spamminess, self.MAX_SPAMMINESS)
            return 0
    if li.doc_id:
        # checking for revisions
        olddoc = Doc(li.doc_id)
        olddoc.load_from_db()
        if doc.content != olddoc.content:
            sm = SequenceMatcher(None, doc.content, olddoc.content)
            match_ratio = sm.ratio()
            if match_ratio < 0.8:
                debug(1, "substantive revisions, ratio {}", match_ratio)
                doc.earlier_id = olddoc.doc_id
        if not doc.earlier_id:
            li.update_db(status=1)
            debug(1, "no substantive revisions")
            return 0
    else:
        # check for duplicates:
        dupe = get_duplicate(doc)
        if dupe:
            debug(1, "duplicate of document {}", dupe.doc_id)
            li.update_db(status=1, doc_id=dupe.doc_id)
            return 0
        # don't show old papers in news feed:
        if document_is_old(doc):
            debug(2, "paper is old: setting found_date to 1970")
            doc.found_date = '1970-01-01 12:00:00'
        # don't show papers (incl HTML pages) from newly added source
        # pages in news feed:
        if source.status == 0:
            debug(2, "new source page: setting found_date to 1970")
            doc.found_date = '1970-01-01 12:00:00'
    doc_id = doc.add_to_db()
    li.update_db(status=1, doc_id=doc_id)


def check_steppingstone(self, page):
    debug(2, "checking: intermediate page leading to article?")
    # steppingstone pages from known repositories:
    redir_patterns = {
        # arxiv.org, springer.com, researchgate, etc.:
        '<meta name="citation_pdf_url" content="(.+?)"': '*',
        # philpapers.org:
        'class=\'outLink\' href="http://philpapers.org/go.pl[^"]+u=(http.+?)"': '*',
        # philsci-archive.pitt.edu:
        '<meta name="eprints.document_url" content="(.+?)"': '*',
        # sciencedirect.com:
        'pdfurl="(.+?)"': '*',
        # PLOSOne:
        '(http://www.plosone.org/article/.+?representation=PDF)" id="downloadPdf"': '*',
        # Google Drive:
        'content="https://drive.google.com/file/d/(.+?)/': 'https://googledrive.com/host/*'
    }
    for pat, target in redir_patterns.items():
        m = re.search(pat, page.source)
        if m:
            target = target.replace('*', m.group(1))
            target = self.normalize_url(page.make_absolute(target))
            if target == page.url:
                return None
            debug(2, "repository page for {}", target)
            return target
    # other steppingstone pages must have link(s) to a single pdf file:
    targets = set(u for u in page.xpath('//a/@href') if re.search('.pdf$', u, re.I))
    if len(targets) != 1:
        debug(4, "no: {} links to pdf files", len(targets))
        return None
    debug(4, "looks good: single link to pdf file {}", targets[0])
    target = self.normalize_url(page.make_absolute(targets[0]))
    return target
if __name__ == '__main__':
    args = parse_args()

    config = configparser.ConfigParser()
    config.read(args.config)

    # load URL list file
    urls = []
    with open(args.input_path) as f:
        for line in f:
            url = line.strip()
            if url == '':
                continue
            urls.append(url)

    # parse web pages
    webpages = []
    for url in urls:
        webpages.append(Webpage.factory(url))
        time.sleep(CRAWL_TIME_INTERVAL)

    trello = Trello(config['trello']['api_key'],
                    config['trello']['api_token'],
                    config['trello']['t_list_id'])

    # put data into Trello
    for q in webpages:
        trello.create_card(q.title, q.url)
import csv
from pathlib import Path

from selenium import webdriver

from webpage import Webpage

with open(Path("webpage-info-scraper/webpage_urls.txt"), "r") as webpage_urls:
    with open(Path("webpage-info-scraper/webpage_data.csv"), "w") as webpage_data:
        writer = csv.writer(webpage_data, delimiter=",")
        web_driver = webdriver.Firefox()
        webpages = []

        # Scrape information from webpages using URLs stored in the text file
        for url in webpage_urls:
            webpage = Webpage(url, web_driver)
            webpages.append(webpage)
            webpage.open()

            webpage.org_name = webpage.find_element_by_xpath("//h1").text
            print(webpage.org_name)

            try:
                contact_email_element = webpage.find_element_by_xpath(
                    "//span[text()='Contact Email']")
                div_text = contact_email_element.find_element_by_xpath('..').text
                webpage.email_address = remove_words_from_text(
                    div_text, ['Contact', 'Email', 'E:'])
                print(webpage.email_address)
            except:
                webpage.email_address = ""
def test_pagination_none():
    """ Test that webpage.pagination_html() returns None when no pagination needed """

    webpage = Webpage(TEST_WEBSITE)

    assert webpage.pagination_html() is None
def test_utf8(caplog):
    pagename = 'philpapers-rec.html'
    url = 'https://blah.org'
    page = Webpage(url, html=source(pagename))
    assert 'Analytic' in page.text()