def process_pages(self):
    """Fetch every numbered URL, parse it and build the Page objects."""
    skipped = []
    pbar = ProgressBar(widgets=['Processing pages: ', SimpleProgress()],
                       maxval=len(self.urls)).start()
    i = 0
    for (num, url) in self.urls:
        pbar.update(int(num))
        if num and url:
            html = helpers.get_html(num, url)
            if html is not None:
                self.urls_with_nums[url] = num
                soup = BeautifulSoup(html.encode('utf-8', 'ignore'), 'lxml')
                page = Page(title=soup.title.string, num=num,
                            html=soup.prettify(), url=url,
                            text=soup.body.get_text())
                page.index = i
                self.indices_with_pages[i] = page
                if page.ID not in self.pages_with_ids:
                    self.pages_with_ids[page.ID] = page
                else:
                    raise RuntimeError('COLLISION: %s collides with %s with hash %s.'
                                       % (page.num, self.pages_with_ids[page.ID].num, page.ID))
                # Collect outgoing links, skipping mailto: anchors.
                for link in soup.find_all('a'):
                    href = link.get('href')
                    if href and not href.strip().startswith('mailto:'):
                        page.a.append(link)
                self.pages.append(page)
                i += 1
            else:
                skipped.append(str(num))
        else:
            skipped.append(str(num))
    pbar.finish()
    print("Skipped page(s) %s because of an error." % ', '.join(skipped))
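# process_pages relies on a Page class that hashes its own HTML into an ID and
# collects its outgoing anchors in `a`. That class is not shown in this
# excerpt; the following is a minimal hypothetical sketch of the interface the
# method uses (only helpers.page_hash comes from the source), not the
# project's actual implementation.
class PageSketch(object):
    def __init__(self, title, num, html, url, text):
        self.title = title
        self.num = num
        self.html = html
        self.url = url
        self.text = text
        self.a = []        # outgoing <a> tags, filled by process_pages
        self.index = None  # row/column index in the adjacency matrix
        self.rank = None   # PageRank score, filled by calc_page_ranks
        self.ID = helpers.page_hash(html)  # content hash used to detect duplicates

    def normalize_url(self, href):
        # Resolve a (possibly relative) href against the page's own URL.
        from urllib.parse import urljoin
        return urljoin(self.url, href)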
def calc_page_ranks(self, d=0.85):
    """Build the link adjacency matrix and compute PageRank with damping factor d."""
    n = len(self.pages_with_ids)
    self.adj = numpy.zeros((n, n))
    pbar = ProgressBar(widgets=['Processing links: ', SimpleProgress()],
                       maxval=n).start()
    progress = 1
    for (ID, page) in self.pages_with_ids.items():
        pbar.update(progress)
        for a in page.a:
            href = a.get('href')
            # Normalize URLs before looking them up in the crawled set.
            url = page.normalize_url(href)
            if url in self.S:
                html = helpers.get_html(self.urls_with_nums[url], url)
                soup = BeautifulSoup(html.encode('utf-8', 'ignore'), 'lxml')
                target_id = helpers.page_hash(soup.prettify())
                if target_id in self.pages_with_ids:
                    # page (row) cites the target page (column).
                    self.adj[page.index][self.pages_with_ids[target_id].index] = 1.0
        progress += 1
    pbar.finish()

    # Normalize each row by the page's outdegree so the matrix is row-stochastic.
    pbar = ProgressBar(widgets=['Normalizing adjacencies: ', SimpleProgress()],
                       maxval=n).start()
    progress = 1
    row_sums = numpy.sum(self.adj, axis=1)
    for (ID, page) in self.pages_with_ids.items():
        pbar.update(progress)
        for k in range(n):
            if row_sums[page.index] != 0:
                self.adj[page.index][k] /= row_sums[page.index]
            else:
                self.adj[page.index][k] = 0.0
        progress += 1
    pbar.finish()
    numpy.savetxt("adj.txt", self.adj)

    # Run PageRank: power iteration converging towards the principal
    # eigenvector of the damped link matrix. The transpose makes rank flow
    # from citing page to cited page.
    self.ranks = numpy.ones(n)
    z = numpy.ones(n)
    b = 1.0 - d
    pbar = ProgressBar(widgets=['Running PageRank: ', SimpleProgress()],
                       maxval=1000).start()
    for m in range(1000):
        pbar.update(m)
        self.ranks = d * numpy.dot(self.adj.T, self.ranks) + b * z
    pbar.finish()

    # Write the computed rank back onto each page.
    pbar = ProgressBar(widgets=['Updating pages with new ranks: ', SimpleProgress()],
                       maxval=n).start()
    progress = 1
    for (ID, page) in self.pages_with_ids.items():
        pbar.update(progress)
        page.rank = self.ranks[page.index]
        progress += 1
    pbar.finish()
    numpy.savetxt("page_ranks.txt", self.ranks)
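# The PageRank step above is a plain power iteration: starting from uniform
# ranks, repeatedly apply r = d * A^T r + (1 - d) so r converges towards the
# principal eigenvector of the damped link matrix. A self-contained demo on a
# hypothetical three-page graph (page 0 cites 1, page 1 cites 2, page 2 cites
# 0 and 1); the graph is invented for illustration only.
def pagerank_demo(d=0.85, iterations=1000):
    import numpy
    adj = numpy.array([[0.0, 1.0, 0.0],
                       [0.0, 0.0, 1.0],
                       [1.0, 1.0, 0.0]])
    # Row-normalize by outdegree, as calc_page_ranks does.
    row_sums = adj.sum(axis=1)
    adj = adj / row_sums[:, numpy.newaxis]
    ranks = numpy.ones(3)
    for _ in range(iterations):
        ranks = d * numpy.dot(adj.T, ranks) + (1.0 - d)
    return ranks

# Page 1 ends up with the highest rank (about 1.19 against roughly 1.16 and
# 0.64): it is the only page cited by two others.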
import datetime
from urllib.parse import urlparse

import helpers


def extract_data(url):
    """Given a news article URL, return a dict with its data organised."""
    # TODO: extract the author link as well?
    parsed_url = urlparse(url)
    html = helpers.get_html(url)
    # Defaults, so the returned dict is complete even when a field is missing
    # or a scraper branch raises.
    text = headline = subheadline = date = ""
    author = 'Anónimo'
    n_comments = 0
    categories = ['', '']
    labels = []
    if parsed_url.netloc == 'www.canarias7.es':
        try:
            text = html.find(attrs={'itemprop': 'articleBody'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'articleBody'}) else ""
            headline = html.find(attrs={'itemprop': 'headline'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'headline'}) else ""
            subheadline = html.find(attrs={'class': 'subheadline'}).get_text().strip() \
                if html.find(attrs={'class': 'subheadline'}) else ""
            date = html.find(attrs={'class': 'datefrom'}).get_text().strip() \
                if html.find(attrs={'class': 'datefrom'}) else ""
            author = html.find(attrs={'itemprop': 'author'}).get_text().strip().title() \
                if html.find(attrs={'itemprop': 'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class': 'numComments'}).get_text().strip() \
                if html.find(attrs={'class': 'numComments'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = [topic.find('a').get_text() if topic.find('a') else ""
                      for topic in html.find_all(attrs={'class': 'topic'})]
        except Exception as e:
            print(e, parsed_url.path[1:])
    elif parsed_url.netloc == 'www.laprovincia.es':
        try:
            text = html.find(attrs={'itemprop': 'articleBody'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'articleBody'}) else ""
            headline = html.find(attrs={'itemprop': 'headline'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'headline'}) else ""
            subheadline = html.find(attrs={'itemprop': 'description'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'description'}) else ""
            date = html.find(attrs={'itemprop': 'dateCreated'}).get_text().split('|')[0].strip() \
                if html.find(attrs={'itemprop': 'dateCreated'}) else ""
            author = html.find(attrs={'itemprop': 'author'}).get_text().strip().title() \
                if html.find(attrs={'itemprop': 'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class': 'textveces'}).get_text().strip() \
                if html.find(attrs={'class': 'textveces'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = [x.get_text() for x in html.find(attrs={'id': 'listaTags'}).findChildren('a')[1:]] \
                if html.find(attrs={'id': 'listaTags'}) else []
        except Exception as e:
            print(e, parsed_url.path[1:])
    elif parsed_url.netloc == 'www.eldia.es':
        # Same markup as www.laprovincia.es.
        try:
            text = html.find(attrs={'itemprop': 'articleBody'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'articleBody'}) else ""
            headline = html.find(attrs={'itemprop': 'headline'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'headline'}) else ""
            subheadline = html.find(attrs={'itemprop': 'description'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'description'}) else ""
            date = html.find(attrs={'itemprop': 'dateCreated'}).get_text().split('|')[0].strip() \
                if html.find(attrs={'itemprop': 'dateCreated'}) else ""
            author = html.find(attrs={'itemprop': 'author'}).get_text().strip().title() \
                if html.find(attrs={'itemprop': 'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class': 'textveces'}).get_text().strip() \
                if html.find(attrs={'class': 'textveces'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = [x.get_text() for x in html.find(attrs={'id': 'listaTags'}).findChildren('a')[1:]] \
                if html.find(attrs={'id': 'listaTags'}) else []
        except Exception as e:
            print(e, parsed_url.path[1:])
    elif parsed_url.netloc == 'www.noticanarias.com':
        try:
            text = ' '.join([x.get_text().strip()
                             for x in html.find(attrs={'itemprop': 'articleBody'}).findChildren('p')]) \
                if html.find(attrs={'itemprop': 'articleBody'}) else ""
            headline = html.find(attrs={'itemprop': 'headline'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'headline'}) else ""
            subheadline = ""
            date = html.find(attrs={'class': 'vw-post-date updated'}).findChildren('time')[0]['datetime'].split('T')[0].strip() \
                if html.find(attrs={'class': 'vw-post-date updated'}) else ""
            author = html.find(attrs={'itemprop': 'name'}).get_text().strip().title() \
                if html.find(attrs={'itemprop': 'name'}) else 'Anónimo'
            n_comments = ""
            categories = ['', '']
            labels = [x.get_text() for x in html.find_all('a', attrs={'rel': 'tag'})] \
                if html.find_all('a', attrs={'rel': 'tag'}) else []
        except Exception as e:
            print(e, parsed_url.path[1:])
    elif parsed_url.netloc == 'www.canarias24horas.com':
        try:
            text = html.find(attrs={'class': 'itemFullText'}).get_text().strip() \
                if html.find(attrs={'class': 'itemFullText'}) else ""
            headline = html.find(attrs={'class': 'itemTitle'}).get_text().strip() \
                if html.find(attrs={'class': 'itemTitle'}) else ""
            subheadline = html.find(attrs={'class': 'itemIntroText'}).get_text().strip() \
                if html.find(attrs={'class': 'itemIntroText'}) else ""
            date = html.find(attrs={'class': 'gkDate'}).get_text().strip() \
                if html.find(attrs={'class': 'gkDate'}) else ""
            author = html.find(attrs={'class': 'itemAuthor'}).get_text().strip().title().split()[-1] \
                if html.find(attrs={'class': 'itemAuthor'}) else 'Anónimo'
            n_comments = ""
            categories = parsed_url.path.split('/')[1:3]
            labels = [topic.get_text() for topic in html.find(attrs={'class': 'itemTags'}).findChildren('a')] \
                if html.find(attrs={'class': 'itemTags'}) else []
        except Exception as e:
            print(e, parsed_url.path[1:])
    elif parsed_url.netloc == 'canariasnoticias.es':
        try:
            text = ' '.join([x.get_text().strip()
                             for x in html.find(attrs={'class': 'noticia-body'}).findChildren('p')]) \
                if html.find(attrs={'class': 'noticia-body'}) else ""
            headline = html.find('h1', attrs={'class': 'title'}).get_text().strip() \
                if html.find('h1', attrs={'class': 'title'}) else ""
            subheadline = html.find('h3', attrs={'class': 'subtitle'}).get_text().strip() \
                if html.find('h3', attrs={'class': 'subtitle'}) else ""
            date = html.find(attrs={'class': 'date'}).get_text().strip() \
                if html.find(attrs={'class': 'date'}) else ""
            author = html.find(attrs={'class': 'author'}).get_text().strip().title() \
                if html.find(attrs={'class': 'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class': 'comment-count'}).get_text().strip() \
                if html.find(attrs={'class': 'comment-count'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = []
        except Exception as e:
            print(e, parsed_url.path[1:])
    elif parsed_url.netloc == 'tribunadecanarias.es':
        try:
            text = ' '.join([x.get_text().strip()
                             for x in html.find(attrs={'itemprop': 'articleBody'}).findChildren('p')]) \
                if html.find(attrs={'itemprop': 'articleBody'}) else ""
            headline = html.find(attrs={'itemprop': 'headline'}).get_text().strip() \
                if html.find(attrs={'itemprop': 'headline'}) else ""
            subheadline = html.find(attrs={'class': 'subheadline'}).get_text().strip() \
                if html.find(attrs={'class': 'subheadline'}) else ""
            date = html.find(attrs={'id': 't1'}).get_text().strip() \
                if html.find(attrs={'id': 't1'}) else datetime.datetime.now().strftime('%Y-%m-%d')
            author = html.find(attrs={'itemprop': 'author'}).get_text().strip().title() \
                if html.find(attrs={'itemprop': 'author'}) else 'Anónimo'
            n_comments = html.find(attrs={'class': 'numComments'}).get_text().strip() \
                if html.find(attrs={'class': 'numComments'}) else 0
            categories = parsed_url.path.split('/')[1:3]
            labels = []
        except Exception as e:
            print(e, parsed_url.path[1:])
    # TODO: save categories in one field?
    # Collect the extracted data in dict form.
    data_dict = {
        'newspaper': parsed_url.netloc,
        'news_link': parsed_url.path[1:],
        'headline': headline,
        'subhead': subheadline,
        'author': author,
        'date': date,
        'raw_text': text,
        'n_comments': n_comments,
        'main_cat': categories[1] if len(categories) > 1 else '',
        'sub_cat': categories[0] if categories else '',
        'labels': labels,
    }
    return data_dict
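# A hypothetical usage sketch for extract_data: the article URL below is
# invented purely to show the shape of the returned dict, it is not a real
# link.
def demo_extract():
    article = extract_data('https://www.canarias7.es/politica/canarias/ejemplo-noticia.html')
    for key in ('newspaper', 'headline', 'author', 'date', 'main_cat', 'sub_cat', 'labels'):
        print(key, '->', article[key])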
def test_get_html(self):
    """helpers.get_html should return a parsed page for a reachable URL."""
    self.assertIsNotNone(helpers.get_html(self.url))
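# The assertion above assumes a unittest.TestCase with a reachable self.url.
# A minimal hypothetical harness around it (the class name and URL are
# assumptions, not from the source; only helpers.get_html and self.url are):
import unittest

class GetHtmlTestSketch(unittest.TestCase):
    def setUp(self):
        # Any outlet front page handled by the scrapers; chosen for illustration.
        self.url = 'https://www.canarias7.es/'

    def test_get_html(self):
        self.assertIsNotNone(helpers.get_html(self.url))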
def get_links(url, n_links=5):
    """Given a news outlet's main website, return up to n_links of its news links.

    Identifies where the links live depending on the outlet. Content loaded
    with JS is not fetched; a solution would be to query those URLs directly
    (https://gohighbrow.com/scraping-javascript-heavy-websites/), but this is
    not implemented.
    """
    parsed_url = urlparse(url)
    html = helpers.get_html(url)  # get bs4 object
    links = []  # store news links
    # Scrape parameters depend on the outlet.
    if parsed_url.netloc == 'www.canarias7.es':
        # Include div (more news but more noise).
        for link in html.find_all(['h2', 'h3', 'div'], attrs={'class': 'headline'}):
            # Skip None links without the structure (normally voting polls etc.).
            if link.parent.has_attr('href'):
                links.append(link.parent['href'])
    elif parsed_url.netloc == 'www.laprovincia.es':
        for link in html.find_all('a', attrs={'data-tipo': 'noticia'}):
            links.append(link['href'])
    elif parsed_url.netloc == 'www.eldia.es':
        for link in html.find_all('a', attrs={'data-tipo': 'noticia'}):
            links.append(link['href'])
    elif parsed_url.netloc == 'www.noticanarias.com':
        for link in html.find_all('a', attrs={'itemprop': 'url'}):
            links.append(link['href'])
    elif parsed_url.netloc == 'www.canarias24horas.com':
        for link in html.find_all('h4', attrs={'class': 'nspHeader'}):
            links.append(link.findChildren('a')[0]['href'])
    # TODO: add data scrapers for further outlets.
    elif parsed_url.netloc == 'canariasnoticias.es':
        for link in html.find_all(attrs={'class': 'title'}):
            links.append(link.find('a')['href'])
    elif parsed_url.netloc == 'www.sanborondon.info':
        for link in html.find_all(attrs={'class': 'nspHeader'}):
            links.append(link.find('a')['href'])
    elif parsed_url.netloc == 'tribunadecanarias.es':
        for link in html.find_all(attrs={'class': 'ns2-title'}):
            links.append(link.find('a')['href'])
    elif parsed_url.netloc == 'www.canariasdiario.com':
        for link in html.find_all(attrs={'itemprop': 'mainEntityOfPage'}):
            links.append(link['href'])
    elif parsed_url.netloc == 'www.europapress.es':
        for link in html.find_all(attrs={'itemprop': 'headline'}):
            links.append(link.find('a')['href'])
    elif parsed_url.netloc == 'www.efe.com':
        for link in html.find_all('a', attrs={'itemprop': 'url'}):
            links.append(link['href'])
    return links[:n_links]  # cap the number of returned links at n_links
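# End-to-end sketch feeding get_links output into extract_data. The outlet
# URL is one the scrapers above handle, but the function itself is a
# hypothetical illustration and performs live HTTP requests via
# helpers.get_html when run.
def demo_scrape(outlet='https://www.canarias7.es/', n_links=3):
    from urllib.parse import urljoin
    rows = []
    for link in get_links(outlet, n_links=n_links):
        # Some outlets emit relative hrefs; resolve them against the front page.
        rows.append(extract_data(urljoin(outlet, link)))
    return rows  # one dict per article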