def test_page_indexability():
    """ Test to make sure indexability carries through from item to webpage """

    webpage_index = Webpage(TEST_WEBSITE)
    webpage_index.item_from_md_filename('001-basic-article.md')

    webpage_dont_index = Webpage(TEST_WEBSITE)
    webpage_dont_index.item_from_md_filename('009-unindexed-article.md')

    # Don't include noindex tag for article page that SHOULD be indexed
    assert '<meta name="robots" content="noindex">' not in webpage_index.html

    # Include noindex tag for article page that should NOT be indexed
    assert '<meta name="robots" content="noindex">' in webpage_dont_index.html
def getUrlTexts(urlList):
    """ Lazily yield the text of each url """
    for url in urlList:
        page = Webpage(url)
        # data = tokenizeDocText(page.text)
        yield page.text
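# A minimal usage sketch for getUrlTexts (the URL list below is a made-up
# example, not part of the original code). Because getUrlTexts is a
# generator, each Webpage is only fetched when the loop asks for its text.
if __name__ == '__main__':
    for text in getUrlTexts(['http://example.com/a', 'http://example.com/b']):
        print(len(text))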
def crawl(self):
    # start crawling
    # myopener = MyOpener()
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        print("%s, %s" % (-1 * work_url[0], work_url[1]))
        # page = urllib2.urlopen(work_url)
        '''page = myopener.open(work_url)
        self.pagesCount += 1
        soup = BeautifulSoup(page)
        links = soup.find_all('a')'''
        page = Webpage(work_url[1])
        self.pagesCount += 1
        for link in page.outgoingUrls:
            url = link.address
            if url is not None and url != '':
                # strip the query string
                if url.find('?') != -1:
                    url = url.split('?')[0]
                if not self.exists(url, self.visited):
                    if url.startswith('http:') and url.find('#') == -1 \
                            and not self.exists(url, self.priorityQueue.queue):
                        url_score = self.scorer.calculate_score(link.getAllText())
                        self.totalPagesCount += 1
                        if url_score > 0.1:
                            self.priorityQueue.push(((-1 * url_score), url))
                            self.relevantPagesCount += 1
def test_webpage_from_single_article():
    """ Test creating an article item page using item_from_md_filename() """

    webpage = Webpage(TEST_WEBSITE)
    webpage.item_from_md_filename('001-basic-article.md')

    # Page title should be "Article title - Website name"
    title = 'This is the heading - Test website name'
    assert webpage.title == title
    assert webpage.html.count('<title>' + title + '</title>') == 1

    # Page should use article page template
    assert '<p>Article page template</p>' in webpage.html

    # List page header should be present
    assert webpage.html.count('<div>list page header</div>') == 1

    # Webpage should contain the text from the article
    assert webpage.html.count('<p>And here is some text...</p>') == 1

    # Article item footer should be present
    assert webpage.html.count('<footer class="article-footer">') == 1

    # Filename for webpage should be based on the article
    article = Item(TEST_WEBSITE)
    article.from_md_filename('001-basic-article.md')
    assert webpage.filename == article.filename

    # Body should have class='magnetizer-article-page'
    assert webpage.html.count("<body class='magnetizer-article-page'>") == 1

    # Twitter card should be present
    assert '<meta name="twitter:card" content="summary_large_image" />' in webpage.html

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in webpage.html

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in webpage.html

    # Includes should be included, as per the .md file
    assert webpage.html.count("<div class='include'>Include 1</div>") == 2
    assert webpage.html.count("<div class='include'>Include 2</div>") == 1
    assert webpage.html.count("<div class='include'>Include 3</div>") == 1
    assert "[ ERROR: Include 'inexistent_file.html' does not exist! ]" in webpage.html

    # No html comments should be left in page
    assert '<!--' not in webpage.html

    # Meta description should be pulled in from article
    assert '<meta name="description" content="Meta description from article">' in webpage.html

    # Footnote link should have been added
    assert "<a href='#1'>[1]</a>" in webpage.html

    # Footnote anchor should have been added
    assert "<a id='1'></a>[1]:" in webpage.html
def crawl(self):
    # start crawling
    # myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url[1])
        # print ("%s, %s") % (-1 * work_url[0], work_url[1])
        # page = urllib2.urlopen(work_url)
        '''page = myopener.open(work_url)
        self.pagesCount += 1
        soup = BeautifulSoup(page)
        links = soup.find_all('a')'''
        page = Webpage(work_url, self.pagesCount)
        if len(page.text) > 0:
            page_score = self.scorer.calculate_score(page.text)
        else:
            page_score = 0
        self.pagesCount += 1
        if page_score > self.pageScoreThreshold:
            page.getUrls()
            self.relevantPagesCount += 1
            self.relevantPages.append(page)
            self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
            print(("%s, " + str(page_score) + ", %s") % (-1 * work_url[0], work_url[1]))
            for link in page.outgoingUrls:
                url = link.address
                if url is not None and url != '':
                    # strip query strings and fragments
                    if url.find('?') != -1:
                        url = url.split('?')[0]
                    if url.find('#') != -1:
                        url = url.split('#')[0]
                    # if url.startswith('http') == False:
                    #     parts = page.pageUrl[1].split("://")
                    #     baseUrl = parts[1].split("/")[0]
                    #     baseUrl = parts[0] + "://" + baseUrl
                    #     url = baseUrl + url
                    # if not self.existsInVisited(url, self.visited):
                    if url not in self.visited:
                        # if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                        if url.startswith('http') and not self.exists(url, self.priorityQueue.queue):
                            url_score = self.scorer.calculate_score(link.getAllText())
                            self.totalPagesCount += 1
                            # tot_score = (page_score + url_score) / 2.0
                            # tot_score = page_score + url_score
                            tot_score = url_score
                            if tot_score > self.urlScoreThreshold:
                                # self.priorityQueue.push(((-1 * url_score), url))
                                self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
def main(models=None):
    models = [] if models is None else models.split(',')
    models.insert(0, 'original')
    print("Generate results -> " + str(models))

    url = input("Please enter a URL\n")
    layout_type = input("Please select a layout type (1-7)\n")
    webpage = Webpage(url, layout_type)

    # Take a screenshot
    screenshot = webpage.get_screenshot('screen-pc.png', 1280, 800)

    # Save the HTML
    webpage.save_html()

    for model in models:
        print(model)

        # Prepare the CSV files
        csv_tags = Csv('./working/tag_list_' + model + '.csv')
        csv_tags_custom = Csv('./working/tag_list_custom_' + model + '.csv')
        default_row = [
            'class or id', 'tag_name', 'start_x', 'start_y', 'size_w',
            'size_h', 'average_color', 'salient_level', 'element_area'
        ]
        csv_tags.writerow(default_row)
        csv_tags_custom.writerow(default_row)

        # Half-size saliency map
        if model == 'original':
            # Compute the saliency map
            saliency_map = webpage.get_saliency_map(screenshot)
            resize_saliency_map = Image(saliency_map).get_halfsize()
        elif model == 'original-mlnet':
            saliency_map = Image(cv2.imread('./data/mlnet.png', 1))
            resize_saliency_map = saliency_map.get_trimming((1280, 726))
            resize_saliency_map = cv2.cvtColor(resize_saliency_map, cv2.COLOR_BGR2GRAY)
        else:
            saliency_map = Image(cv2.imread('./data/' + model + '.png', 1))
            resize_saliency_map = saliency_map.get_trimming((1280, 726))
            resize_saliency_map = cv2.cvtColor(resize_saliency_map, cv2.COLOR_BGR2GRAY)

        # Collect the size and saliency of each element
        Element.canvas = Image(resize_saliency_map)
        Element.layout_type = webpage.layout_type
        Element.model = model
        print('Saliency of the whole webpage: ' + str(Element.GetTotalSaliency()))
        GetElementInfo(webpage, csv_tags, csv_tags_custom)

        # Close the CSVs and build the region map
        csv_tags.close()
        csv_tags_custom.close()
        CreateRegionMap(model)

    # Close the WebDriver and produce the final outputs
    webpage.driver.quit()
    getFinalLine()
    getFinalTile()
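# A minimal invocation sketch (the model names below are hypothetical; the
# original entry point is not shown here). main() takes a comma-separated
# string of model names and always prepends 'original', so each extra model
# is expected to have a saliency map at ./data/<model>.png.
if __name__ == '__main__':
    main('mlnet,deepgaze')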
def crawl(self):
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        page = Webpage(work_url, self.pagesCount)
        if page.text == '':
            continue
        page_score = 0.0
        if self.combineScore:
            if len(page.text) > 0:
                page_score = self.scorer.calculate_score(page.text, 'W')
            else:
                continue
        page.estimatedScore = page_score
        if self.restricted:
            if page_score < self.pageScoreThreshold:
                continue
        # print -1 * work_url[0], ",", str(page_score), ",", work_url[1], ",", work_url[3]
        print(-1 * work_url[0], ",", work_url[1], ",", work_url[3])
        self.pagesCount += 1
        page.getUrls()
        self.relevantPages.append(page)
        for link in page.outgoingUrls:
            url = link.address
            if url is not None and url != '':
                url = url.strip()
                # strip query strings and fragments
                if url.find('?') != -1:
                    url = url.split('?')[0]
                if url.find('#') != -1:
                    url = url.split('#')[0]
                # skip links that are clearly not article pages
                if url.endswith(("comment", "comment/", "feed", "comments", "feed/",
                                 "comments/", ".rss", "video", "video/", "link", "gif",
                                 "jpeg", "mp4", "wav", "jpg", "mp3", "png", "share.php",
                                 "sharer.php", "login.php", "print", "print/", "button/",
                                 "share", "email", "submit", "post", ".pdf")):
                    continue
                if not self.exists(url, 1):
                    # tot_score = 0.0
                    if url.startswith('http') and not self.exists(url, 2):
                        if self.mode == 1:
                            url_score = self.scorer.calculate_score(link.getAllText(), 'U')
                            if self.combineScore:
                                tot_score = 0.5 * page_score + 0.5 * url_score
                            else:
                                tot_score = url_score
                            # if tot_score >= self.urlScoreThreshold:
                            self.priorityQueue.push(
                                ((-1 * tot_score), url, page.pageId, link.getAllText()))
                        # else:
                        #     self.priorityQueue.push(((-1 * page_score), url, page.pageId, link.getAllText()))
                # else:
                #     self.pages.append((page, 0))
    print(self.priorityQueue.isempty())
def __init__(self, seedUrls):
    self.exclude_words = ['ads', 'print', 'advertisement']
    self.seedUrls = seedUrls
    super(TFIDF_Scorer, self).__init__(None)
    self.seedPages = []
    self.avgdl = 0
    for url in self.seedUrls:
        page = Webpage(url)
        data = self.cleanDoc(page.text)
        self.seedPages.append(data)
        self.avgdl += len(data)
    self.buildModel(self.seedPages)
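# A minimal usage sketch, assuming the TFIDF_Scorer above and a Webpage class
# that fetches page text. The seed URLs below are hypothetical examples;
# calculate_score(text) is the call used by the crawl() snippets elsewhere
# in this file.
if __name__ == '__main__':
    seed_urls = ['http://example.com/seed1', 'http://example.com/seed2']
    scorer = TFIDF_Scorer(seed_urls)
    candidate = Webpage('http://example.com/candidate')
    print(scorer.calculate_score(candidate.text))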
def test_pagination_previous_only():
    """ Test that webpage.pagination_html() returns previous page correctly
        when no next page """

    webpage = Webpage(TEST_WEBSITE)
    webpage.url_previous = 'page-1.html'

    result = '<nav class="magnetizer-pagination"><ul>'
    result += '<li><a href="page-1.html" class="magnetizer-previous">Newer posts</a></li>'
    result += '</ul></nav>'

    assert webpage.pagination_html() == result
def crawl(self):
    # start crawling
    # myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        page = Webpage(work_url, self.pagesCount)
        if len(page.text) > 0:
            page_score = self.scorer.calculate_score(page.text)
        else:
            page_score = 0
        self.pagesCount += 1
        if page_score > self.pageScoreThreshold:
            page.getUrls()
            self.relevantPagesCount += 1
            self.relevantPages.append(page)
            self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
            print("%s, %s" % (-1 * work_url[0], work_url[1]))
            for link in page.outgoingUrls:
                url = link.address
                if url is not None and url != '':
                    # if url.find('?') != -1:
                    #     url = url.split('?')[0]
                    # resolve root-relative links against the page's domain
                    if url.startswith("/"):
                        base = page.pageUrl[1][7:].split("/")[0]
                        url = "http://" + base + url
                    if not self.exists(url, self.visited):
                        if url.startswith('http') and url.find('#') == -1 \
                                and not self.priorityQueue.exists(url):  # self.exists(url, self.priorityQueue.queue)
                            url_score = self.scorer.calculate_score(link.getAllText())
                            self.totalPagesCount += 1
                            # tot_score = (page_score + url_score) / 2.0
                            # tot_score = page_score + url_score
                            tot_score = url_score
                            if tot_score > self.urlScoreThreshold:
                                # self.priorityQueue.push(((-1 * url_score), url))
                                self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
                                # self.relevantPagesCount += 1
        self.priorityQueue.next()
def test_pagination_previous_and_next():
    """ Test that webpage.pagination_html() returns next and previous pages
        correctly when both are available """

    webpage = Webpage(TEST_WEBSITE)
    webpage.url_previous = 'page-3.html'
    webpage.url_next = 'page-5.html'

    result = '<nav class="magnetizer-pagination"><ul>'
    result += '<li><a href="page-3.html" class="magnetizer-previous">Newer posts</a></li>'
    result += '<li><a href="page-5.html" class="magnetizer-next">Older posts</a></li>'
    result += '</ul></nav>'

    assert webpage.pagination_html() == result
def test_static_item_page():
    """ Test creating a static item page using item_from_md_filename() """

    webpage = Webpage(TEST_WEBSITE)
    webpage.item_from_md_filename('dont-show-on-list-page.md')

    # Page title should be "Article title - Website name"
    title = 'This post should not be in the index - Test website name'
    assert webpage.title == title
    assert webpage.html.count('<title>' + title + '</title>') == 1

    # Page should use static page template
    assert '<p>Static page template</p>' in webpage.html

    # List page header should NOT be present
    assert webpage.html.count('<div>list page header</div>') == 0

    # Webpage should contain the text from the article
    assert webpage.html.count("<p>That's why it doesn't start with") == 1

    # Article footer should NOT be present
    assert webpage.html.count('<footer>footer</footer>') == 0

    # Filename for webpage should be based on the article
    article = Item(TEST_WEBSITE)
    article.from_md_filename('dont-show-on-list-page.md')
    assert webpage.filename == article.filename

    # Body should have class='magnetizer-static-page'
    assert webpage.html.count("<body class='magnetizer-static-page'>") == 1

    # Twitter card should be present
    assert '<meta name="twitter:card" content="summary_large_image" />' in webpage.html

    # Link to Atom feed should be present
    assert ('<link rel="alternate" type="application/rss+xml" ' +
            'href="https://example.com/atom.xml" />') in webpage.html

    # Link to CSS should be present
    assert '<link rel="stylesheet" type="text/css" href="test-stylesheet.css' in webpage.html

    # No html comments should be left in page
    assert '<!--' not in webpage.html
def test_includes():
    """ Test of webpage.includes() """

    webpage = Webpage(TEST_WEBSITE)
    webpage.html = '<h1>Some html</h1>'
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include1.html -->"
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include2.html -->"
    webpage.html += '<div>More html...</div>'
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include3.html -->"
    webpage.html += "<!-- MAGNETIZER_INCLUDE _include1.html -->"

    correct_includes = ['_include1.html', '_include2.html', '_include3.html']
    includes = webpage.includes()

    # Set should contain each include from the html
    for correct_include in correct_includes:
        assert correct_include in includes

    assert len(includes) == len(correct_includes)
def test_webpage_write():
    """ Test of webpage.write() """

    result = "This is a test!"

    webpage = Webpage(TEST_WEBSITE)
    webpage.html = result
    webpage.filename = 'my-post.html'
    webpage.write()

    # File should have the correct contents
    with open(TEST_WEBSITE.config.value('output_path') + webpage.filename, 'r') as myfile:
        assert myfile.read() == result

    # Page should be included in sitemap
    assert 'https://example.com/my-post.html' in TEST_WEBSITE.sitemap.pages

    TEST_WEBSITE.wipe()
def crawl(self):
    # start crawling
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url)
        page = Webpage(work_url[1])
        page_score = self.scorer.calculate_score(page.text)
        if page_score > self.threshold:
            self.relevantPagesCount += 1
        print("%s, %s" % (-1 * work_url[0], work_url[1]))
        self.pagesCount += 1
        for link in page.outgoingUrls:
            url = link.address
            if url is not None and url != '':
                # strip the query string
                if url.find('?') != -1:
                    url = url.split('?')[0]
                if not self.exists(url, self.visited):
                    if url.startswith('http:') and url.find('#') == -1 \
                            and not self.exists(url, self.priorityQueue.queue):
                        url_score = self.scorer.calculate_score(link.getAllText())
                        self.totalPagesCount += 1
                        tot_score = (page_score + url_score) / 2.0
                        if tot_score > self.threshold:
                            self.priorityQueue.push(((-1 * tot_score), url))
def test_pagination_none():
    """ Test that webpage.pagination_html() returns None when no pagination needed """

    webpage = Webpage(TEST_WEBSITE)

    assert webpage.pagination_html() is None
def enhanced_crawl(self):
    # start crawling
    # myopener = MyOpener()
    self.harvestRatioData = []
    self.relevantPages = []
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        self.visited.append(work_url[1])
        # print ("%s, %s") % (-1 * work_url[0], work_url[1])
        # page = urllib2.urlopen(work_url)
        '''page = myopener.open(work_url)
        self.pagesCount += 1
        soup = BeautifulSoup(page)
        links = soup.find_all('a')'''
        # print work_url[1]
        try:
            req = urllib2.Request(work_url[1])  # create a request object
            handle = urllib2.urlopen(req)       # and open it to return a handle on the url
        except urllib2.URLError:
            # ignore error, URL timed out
            pass
        else:
            html = handle.read()
            soup = BeautifulSoup(html)
            paras = soup.findAll('p')
            # print paras
            text = ""
            for para in paras:
                text = text + " " + para.text
            page = Webpage(work_url, self.pagesCount)
            if len(page.text) > 0:
                page_score = self.scorer.calculate_smart_score(text, work_url[1])
            else:
                page_score = 0
            self.pagesCount += 1
            if page_score > self.pageScoreThreshold:
                page.getUrls()
                self.relevantPagesCount += 1
                self.relevantPages.append(page)
                self.harvestRatioData.append((self.relevantPagesCount, self.pagesCount))
                print(("%s|" + str(page_score) + "|%s") % (-1.0 * work_url[0], work_url[1]))
                for link in page.outgoingUrls:
                    url = link.address
                    if url is not None and url != '':
                        # strip query strings and fragments
                        if url.find('?') != -1:
                            url = url.split('?')[0]
                        if url.find('#') != -1:
                            url = url.split('#')[0]
                        # if url.startswith('http') == False:
                        #     parts = page.pageUrl[1].split("://")
                        #     baseUrl = parts[1].split("/")[0]
                        #     baseUrl = parts[0] + "://" + baseUrl
                        #     url = baseUrl + url
                        # if not self.existsInVisited(url, self.visited):
                        if url not in self.visited:
                            # if url.startswith('http:') and url.find('#') == -1 and not self.exists(url, self.priorityQueue.queue):
                            if url.startswith('http') and not self.exists(url, self.priorityQueue.queue):
                                url_score = self.url_scorer.calculate_score(link.getAllText())
                                self.totalPagesCount += 1
                                # tot_score = (page_score + url_score) / 2.0
                                # tot_score = page_score + url_score
                                tot_score = url_score
                                if tot_score > self.urlScoreThreshold:
                                    # self.priorityQueue.push(((-1 * url_score), url))
                                    self.priorityQueue.push(((-1 * tot_score), url, page.pageId))
def crawl(self):
    self.harvestRatioData = []
    self.relevantPages = []
    webpages = []
    count = 0
    ftext = open(self.pagesDir + "webpagesTxt.txt", "w")
    webpageLabel = 0  # 0 for Non-relevant and 1 for Relevant
    while self.pagesCount < self.pagesLimit and not self.priorityQueue.isempty():
        work_url = self.priorityQueue.pop()
        url = work_url[1]
        # if self.exists(url, 1):
        #     continue
        if url in self.visited:
            continue
        # self.visited.append(url)  # work_url[1]
        self.visited[url] = 1
        page = Webpage(work_url, self.pagesCount)
        if page.text == '':
            continue
        page.estimatedScore = 0
        if self.combineScore:
            page_score = 0
            if len(page.text) > 0:
                # page_score = self.scorer.calculate_score(page.text, 'W')[1]
                page_score = self.scorer.calculate_score(page, 'W')[1]
                if page_score == -1:
                    continue
            else:
                print('page text is empty')
                continue
            page.estimatedScore = page_score
        if self.restricted:
            if page_score < self.pageScoreThreshold:
                # self.pagesCount += 1
                continue
        # update the source-importance counts for the page's domain
        pageDom = getDomain(url)
        if page_score >= self.pageScoreThreshold:
            self.sourcesImp[pageDom][0] += 1
            webpageLabel = 1
        else:
            self.sourcesImp[pageDom][1] += 1
            # self.sourcesImp[pageDom][0] = self.sourcesImp[pageDom][1]
            webpageLabel = 0
        if self.combineScore:
            print(page.pageId, ": ", str(page_score), ",", -1 * work_url[0], ",", work_url[1])  # ,",", work_url[3]
        else:
            print(-1 * work_url[0], ",", work_url[1])  # ,",", work_url[3]
        self.pagesCount += 1
        # self.relevantPages.append((page.pageId, page.pageUrl, page.estimatedScore))
        self.relevantPages.append(
            (page.pageId, (page.pageUrl[1], page.pageUrl[2]), page.estimatedScore))
        wbsStr = page.text.replace('\n', '. ').replace('\t', ' ')
        webpages.append(wbsStr)
        count += 1
        # save webpage's text to disk instead of adding to list
        # this will lead to change in evaluation
        if count % self.bufferLen == 0:
            strToWrite = '\n'.join(webpages).encode("utf-8")
            ftext.write(strToWrite)
            webpages = []
        # ftext = open(self.pagesDir + str(page.pageId) + ".txt", "w")
        # ftext.write(page.text.encode("utf-8"))
        # ftext.close()
        # -------
        if page_score < 0.1:
            continue
        page.getUrls()
        for link in page.outgoingUrls:
            url = link.address
            # if url != None and url != '':
            if url:
                url = url.strip()
                if url.find('report-a-typo') != -1:
                    continue
                if url.find('m.tempo.co/') != -1:
                    continue
                if url.find('?') != -1:
                    furl = url.split('?')[1]
                    # keep query strings that carry a content id (id=, v=, tid=);
                    # otherwise drop the query string
                    if not (furl.startswith('id=') or furl.startswith('v=') or furl.startswith('tid=')):
                        url = url.split('?')[0]
                if url.find('#') != -1:
                    url = url.split('#')[0]
                if url.endswith('/'):
                    url = url[:-1]
                # if url.endswith(("comment", "comment/", "feed", "comments", "feed/", "comments/", ".rss", "video", "video/", "link", "gif", "jpeg", "mp4", "wav", "jpg", "mp3", "png", "share.php", "sharer.php", "login.php", "print", "print/", "button/", "share", "email", "submit", "post", ".pdf")):
                if url.endswith(("comment", "feed", "comments", ".rss", "video", "link",
                                 "gif", "jpeg", "mp4", "wav", "jpg", "mp3", "png",
                                 "share.php", "sharer.php", "login.php", "print",
                                 "button", "share", "email", "submit", "post", ".pdf")):
                    continue
                # if not self.exists(url, 1):
                if url in self.visited:
                    continue
                # tot_score = 0.0
                if url.startswith('http'):  # and not self.exists(url, 2):
                    linkText = link.getAllText()
                    # if self.mode == 1:
                    # url_score = self.scorer.calculate_score(linkText, 'U')
                    url_score = self.scorer.calculate_score(link, 'U')
                    tot_score = url_score
                    if self.combineScore:
                        # tot_score = 0.4 * page_score + 0.6 * url_score
                        tot_score = page_score * url_score
                        if tot_score < self.urlScoreThreshold:
                            continue
                        urlDom = getDomain(url)
                        si_score = self.sourcesImp[urlDom][0] / self.sourcesImp[urlDom][1]
                        if self.siScoreCombineMethod == 1:
                            if webpageLabel:
                                tot_score = tot_score * si_score
                        elif self.siScoreCombineMethod == 2:
                            tot_score = self.topicWeight * tot_score + self.siWeight * si_score
                        # tot_score = tot_score * si_score
                    # else:
                    #     tot_score = url_score
                    # if tot_score >= self.urlScoreThreshold:
                    # print tot_score, '-', url, linkText
                    if self.restricted:
                        if tot_score < self.urlScoreThreshold:
                            continue
                    if tot_score >= self.urlScoreThreshold:
                        self.priorityQueue.push(((-1 * tot_score), url, page.pageId))  # ,linkText))
                    # else:
                    #     self.priorityQueue.push(((-1 * page_score), url, page.pageId, link.getAllText()))
                # else:
                #     self.pages.append((page, 0))
    print(self.priorityQueue.isempty())
    # flush any remaining buffered page text
    if webpages:
        strToWrite = '\n'.join(webpages).encode("utf-8")
        ftext.write(strToWrite)
    ftext.close()
    return self.priorityQueue.queue
import csv
from pathlib import Path

from selenium import webdriver

from webpage import Webpage

with open(Path("webpage-info-scraper/webpage_urls.txt"), "r") as webpage_urls:
    with open(Path("webpage-info-scraper/webpage_data.csv"), "w") as webpage_data:
        writer = csv.writer(webpage_data, delimiter=",")
        web_driver = webdriver.Firefox()
        webpages = []

        # Scrape information from webpages using the URLs stored in the text file
        for url in webpage_urls:
            webpage = Webpage(url, web_driver)
            webpages.append(webpage)
            webpage.open()
            webpage.org_name = webpage.find_element_by_xpath("//h1").text
            print(webpage.org_name)
            try:
                contact_email_element = webpage.find_element_by_xpath(
                    "//span[text()='Contact Email']")
                div_text = contact_email_element.find_element_by_xpath('..').text
                webpage.email_address = remove_words_from_text(
                    div_text, ['Contact', 'Email', 'E:'])
                print(webpage.email_address)
            except Exception:
                webpage.email_address = ""
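# The script above calls remove_words_from_text() without defining it. Below
# is a minimal sketch of what such a helper could look like, assuming it
# simply strips the given words and tidies the whitespace; this is an
# illustrative guess, not the original implementation, and in the real script
# it would need to be defined before the scraping loop runs.
def remove_words_from_text(text, words):
    """Return text with each of the given words removed and whitespace collapsed."""
    for word in words:
        text = text.replace(word, '')
    return ' '.join(text.split())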