def wrap_newspaper(self, web_page):
    """Build a newspaper parser from an already-downloaded page.

    The article is marked as downloaded so ``parse()`` runs without a
    network fetch, using ``web_page.html`` as the body.
    """
    article = NewspaperArticle(url=web_page.final_url)
    article.html = web_page.html
    article.is_downloaded = True
    article.parse()
    return article
def parse_article(url, min_words_count=jg.MIN_WORDS_TO_SCRAPE):
    """
    We download an article by ourselves so that we do it behind the Tor
    network and with a random user agent (Don't let Newspaper do it!).
    Then we fool Newspaper to think that it was the one who downloaded it
    so we can parse it and return the article.

    Returns None if the article is smaller than min_words_count, if the
    page cannot be fetched, or if parsing fails.
    """
    try:
        response = get_page(url)
    except Exception as err:
        update_log.error('Error in get_page()')
        update_log.error(err)
        return None

    if response is None:
        # Fetch produced nothing: blacklist the URL so we skip it next time.
        # (Guard clause also prevents reading `article` before assignment,
        # which the original control flow allowed on this path.)
        add_url_to_blacklist(url)
        return None

    article = ArticleParser(url="http://something")
    article.html = response.content
    article.download_state = 2  # 2 == SUCCESS; makes Newspaper think it downloaded the page
    try:
        article.parse()
    except Exception as err:
        update_log.error('Error in article.parse()')
        update_log.error(err)
        return None

    # Crude word count: split on single spaces, as in the original.
    if len(article.text.split(' ')) >= min_words_count:
        return article
    return None
def today_helper(article_num, body, article_total):
    """Extract the <p> content of one TODAY article body and write it to disk.

    Parameters: *article_num* is the running index used for the filename,
    *body* is the raw HTML string, *article_total* is only used for the
    progress display.  On any extraction failure the article is skipped.
    Relies on module-level ``data_path`` and ``config`` for the output dir.
    """
    try:
        article_name = "article_{:04d}".format(article_num)
        print(">>> Processing: {}/{}".format(article_num, article_total), end='\r')
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(body), parser).getroot()
        # Keep only paragraph markup, re-serialized as HTML.
        result = ''
        for p in tree.iter('p'):
            result += etree.tostring(p, method="html").decode("utf-8")
        article = Article(body)
        article.download()
        article.html = result  # replace downloaded HTML with the filtered <p> content
        article.parse()
        body = article.text
        words = body.split()
        text = ' '.join(words)  # normalize all whitespace to single spaces
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        print(">>> Broken link, skipping URL {}".format(article_num))
        return
    out_dir = data_path + config['scraping']['folder']
    filename = "{}{}.txt".format(out_dir, article_name)
    with open(filename, 'w') as f:
        f.write(text)
def _get_content_from_url(self, url):
    """Takes in a single url and return article content and title.

    Returns ``(' ', ' ')`` on any request failure (timeout or other
    request exception).
    """
    #r = requests.get(url)
    try:
        r = requests.get(url, timeout=6)
        # print('successful!')
    except requests.exceptions.Timeout as e:
        # Maybe set up for a retry
        print(e)
        return ' ', ' '
    except requests.exceptions.RequestException as e:
        print(e)
        return ' ', ' '

    a = Article(url)
    # Feed the fetched HTML directly instead of round-tripping through a
    # fixed 'file.html' on disk (that raced between concurrent calls and
    # left a stray file behind).
    a.html = r.content
    # need to set download_state to 2 for this to work
    a.download_state = 2
    a.parse()
    title = a.title
    content = re.sub("\n\n", " ", a.text)
    # Now the article should be populated
    return content, title
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    ## does not work!
    article = Article('https://www.example.org/test/')
    article.html = htmlstring
    article.download_state = ArticleDownloadState.SUCCESS
    article.parse()
    published = article.publish_date
    if published is None:
        return None
    return convert_date(published, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
def newspaper_test():
    """Run newspaper's Chinese parser over page/0.txt .. page/999.txt,
    pausing for input after each file.

    NOTE(review): ``raw_input`` is a Python 2 builtin — this snippet
    presumably targets Python 2; confirm before porting.
    """
    from newspaper import fulltext, Article
    for i in range(0, 1000):
        input_filename = 'page/' + str(i) + '.txt'
        # Computed but never written to — results are not persisted.
        output_filename = 'newspaper/' + str(i) + '.txt'
        # Context manager replaces the manual open/read/close triple.
        with open(input_filename, 'r') as input_file:
            s = input_file.read()
        a = Article(language='zh')
        a.html = s
        a.parse()
        # print a.text
        raw_input('wait')
def run_newspaper(htmlstring):
    '''try with the newspaper module'''
    # throws error on the eval_default dataset
    try:
        article = Article(htmlstring)
    except (TypeError, UnicodeDecodeError):
        return None
    article.html = htmlstring
    article.download_state = ArticleDownloadState.SUCCESS
    article.parse()
    published = article.publish_date
    if published is None or published == '':
        return None
    return convert_date(published, '%Y-%m-%d %H:%M:%S', '%Y-%m-%d')
def extract_data(url, bert_summary):
    """Download *url* and return ``(summary, top_image, title)``.

    When newspaper's own download fails, the page is fetched with urllib
    and injected into the article object. *bert_summary* selects between
    the BERT-based and the short summarizer.
    """
    article = Article(url)
    print("article object created")
    article.download()
    if article.download_state != ArticleDownloadState.SUCCESS:
        article.html = urllib.request.urlopen(url).read()  # Hacking the library
        article.download_state = ArticleDownloadState.SUCCESS
    print("download completed")
    article.parse()
    print("parsing completed")
    top_image = article.top_image
    title = article.title
    if bert_summary:
        print("extracting bert summary")
        summary = extract_bert_summary(article.text)
    else:
        print("extracting short summary")
        summary = extract_short_summary(article)
    return summary, top_image, title
def cna_helper(article_num, url, article_total):
    """Scrape one CNA article: keep only <p> tags inside the first
    'c-rte--article' div, parse the result, and write the normalized
    text to the configured output folder.

    Skips the article (with a message) on any extraction failure.
    Relies on module-level ``data_path`` and ``config``.
    """
    try:
        article_name = "article_{:04d}".format(article_num)
        print(">>> Processing: {}/{}".format(article_num, article_total), end='\r')
        article = Article(url, "english")
        article.download()
        parser = etree.HTMLParser()
        tree = etree.parse(StringIO(article.html), parser).getroot()
        result = ''
        for div in tree.iter('div'):
            if 'class' in div.attrib and 'c-rte--article' in div.attrib['class']:
                for p in div.iter('p'):
                    result += etree.tostring(p, method="html").decode("utf-8")
                break  # only the first matching div is used
        article.html = result
        article.parse()
        body = article.text
        words = body.split()
        text = ' '.join(words)  # normalize whitespace
    except Exception:
        # Was a bare `except:`, which also swallowed KeyboardInterrupt/SystemExit.
        print(">>> Broken link, skipping URL {}".format(article_num))
        return
    out_dir = data_path + config['scraping']['folder']
    filename = "{}{}.txt".format(out_dir, article_name)
    with open(filename, 'w') as f:
        f.write(text)
def get_text_date(url):
    """Download *url*, strip social-media embeds and captions from the
    HTML, and return ``(text, publish_date)``.

    Returns ``(None, None)`` when newspaper raises an ArticleException.
    """
    try:
        article = Article(url)
        article.download()
        # Collapse newlines first so the embed patterns can match.
        article.html = re.sub(r"\n+", " ", article.html)
        # Patterns stripped in order; each is replaced with the empty string.
        embed_patterns = (
            r"<blockquote class=\"twitter-tweet\".+?</blockquote>",
            r"<blockquote class=\"instagram-media\".+?</blockquote>",
            r"<blockquote class=\"tiktok-embed\".+?</blockquote>",
            r"<blockquote cite=\".+?</blockquote>",
            #r"<h2 class=\"mce\">·.+?</p>",  # vertele subheadings
            r"<figcaption.+?</figcaption>",
        )
        for pattern in embed_patterns:
            article.html = re.sub(pattern, "", article.html)
        article.parse()
        return article.text, article.publish_date
    except newspaper.article.ArticleException:
        return None, None
def get_text_date(url):
    """Download *url*, strip site boilerplate from the HTML, and return
    ``(article_text, publish_date)``.

    Returns ``(None, None)`` for pages auto-generated by the EFE agency
    or when newspaper raises an ``ArticleException``.  The substitution
    chain removes social-media embeds, captions, newsletter plugs and
    per-outlet footers (El País, Materia, 20 minutos, EFE, ...), then
    rewrites <em>/<i> markup into plain apostrophes before parsing.
    The substitutions are order-sensitive; do not reorder.
    """
    try:
        article = Article(url)
        article.download()
        # Drop articles served automatically by the EFE news agency.
        if "Noticia servida automáticamente por la Agencia EFE" in article.html:
            return None, None
        # Collapse newlines so the patterns below can match across lines.
        article.html = re.sub(r"\n+", " ", article.html)
        # --- social-media embeds and captions --------------------------
        article.html = re.sub(
            r"<blockquote class=\"twitter-tweet\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(
            r"<blockquote class=\"instagram-media\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(
            r"<blockquote class=\"tiktok-embed\".+?</blockquote>", "",
            article.html)
        article.html = re.sub(r"<blockquote cite=\".+?</blockquote>", "",
                              article.html)
        #article.html = re.sub(r"<h2 class=\"mce\">·.+?</p>", "", article.html)  # vertele subheadings
        article.html = re.sub(r"<figcaption.+?</figcaption>", "",
                              article.html)
        # --- newsletter plugs and per-outlet footers -------------------
        article.html = re.sub(
            r"<p><em>Si alguien te ha reenviado esta carta.+?</em></p>", "",
            article.html)  # Verne "Matrioska" newsletter footer
        article.html = re.sub(
            r"<p class=\"\">(<b>)?Información sobre el coronavirus(</b>)?.+?ante la enfermedad</a></p>",
            "", article.html)  # El País new coronavirus footer
        article.html = re.sub(
            r"<p class=\"\">(<b>)?Información sobre el coronavirus(</b>)?.+?sobre la pandemia.*?</p>",
            "", article.html)  # El País old coronavirus footer
        article.html = re.sub(r"<p class=\"\">.*?Suscríbase aquí.*?</p>", "",
                              article.html)  # El País newsletter
        article.html = re.sub(r"<a[^>]+>Apúntate a .*?</a>", "",
                              article.html)  # 20 minutos newsletter
        article.html = re.sub(r"<p[^>]+>Apúntate a .*?</p>", "",
                              article.html)  # 20 minutos newsletter
        article.html = re.sub(
            r"<span class=\"datos-articulo\".+?</div><p class=\"enviaremailerr captcha\">",
            "", article.html)
        article.html = re.sub(r"<aside class=\"modulo temas\".+?</aside>", "",
                              article.html)
        article.html = re.sub(r"Si quieres seguir recibiendo.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<p class=\"siguenos_opinion\">.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<p><a.+?<em>playlists</em> de EL PAÍS</a></p>",
                              "", article.html)
        article.html = re.sub(r"<section class=\"more_info .+?</section>", "",
                              article.html)
        article.html = re.sub(r"<span class=\"EPS-000.+?eps</span>", "",
                              article.html)
        article.html = re.sub(
            r"<span class=\"f_a | color_black uppercase light.+?</span>", "",
            article.html)
        article.html = re.sub(r"<i>Puedes seguir a .+?[nN]ewsletter.?</i>", "",
                              article.html)  # Materia footer
        article.html = re.sub(r"Puedes seguir a .+?(<i>)? *[nN]ewsletter</a>",
                              "", article.html)  # Materia footer
        article.html = re.sub(
            r"<i>Puedes seguir a .+?(<i>)? *[nN]ewsletter</i></a>", "",
            article.html)  # Materia footer
        article.html = re.sub(
            r"<i>Puedes escribirnos a .+?[Nn]ewsletter</i></a>", "",
            article.html)  # new Materia footer
        article.html = re.sub(r"<p><em><strong>¿Nos ayudas?.+?</p>", "",
                              article.html)  # Kiko Llaneras
        article.html = re.sub(
            r"<p class=\"nota_pie\".+?a nuestra <em>newsletter</em>\.?(</span>)*</p>",
            "", article.html)  # Planeta Futuro footer
        article.html = re.sub(
            r"<i>Puedes escribirnos a.+?<i>[nN]ewsletter</i></a>", "",
            article.html)  # Materia footer
        # NOTE(review): the two adjacent literals below concatenate to
        # '<p class=><i>...' — looks like a typo for '<p class=\"\">...';
        # kept byte-identical, confirm intent before changing.
        article.html = re.sub(r"<p class=" "><i>Puedes escribirnos a.+?</p>",
                              "", article.html)
        article.html = re.sub(
            r"<i>Lee este y otros reportajes.+?con EL PAÍS.</i>", "",
            article.html)  # Buenavida EL PAÍS footer
        article.html = re.sub(r"<h3 class=\"title-related\">.+?</div>", "",
                              article.html)  # related news in El Confidencial
        article.html = re.sub(r"<button.+?</button>", "",
                              article.html)  # elpais icon share buttons
        article.html = re.sub(r"<p class=\"g-pstyle.+?</p>", "", article.html)
        article.html = re.sub(r"<p class=\"nota_pie\">.+?</p>", "",
                              article.html)
        article.html = re.sub(r"<strong>Apúntate a la .+?</strong>", "",
                              article.html)
        article.html = re.sub(r"<p><strong>O súmate a .+?</strong></p>", "",
                              article.html)
        #article.html = re.sub(r"<h2.*?>¿En qué se basa todo esto\?</h2>.*</div>", "", article.html)
        article.html = re.sub(
            r"<strong>Más en tu mejor yo</strong>: <a.*?</a>", "",
            article.html)
        article.html = re.sub(r"<p class=\"article-text\"> +<a.*?</a>", "",
                              article.html)
        article.html = re.sub(
            r"<span>Este sitio web utiliza cookies propias.+?</span>", "",
            article.html)  # cookie notice
        article.html = re.sub(r"\[LEER MÁS:.+?\]", "", article.html)
        article.html = re.sub(r"<div id=\"post-ratings-.+?Cargando…</div>", "",
                              article.html)  # EFE rating widget
        article.html = re.sub(
            r"<div id=\"div_guia\" class=\"guia\" itemprop=\"alternativeHeadline\">.+?</div>",
            "", article.html)  # EFE subtitle
        article.html = re.sub(
            r"<div class=\"f f__v video_player.+?</div></div></div>", "",
            article.html)
        # --- normalize spacing around <em>/<i>, then turn them into quotes
        article.html = article.html.replace("<em class=\"mce\">", "<em>")
        article.html = re.sub("([^ ])<em>", "\g<1> <em>", article.html)
        article.html = article.html.replace("<em> ", "<em>")
        article.html = re.sub("([^ ])<i>", "\g<1> <i>", article.html)
        article.html = article.html.replace("<i> ", "<i>")
        article.html = article.html.replace(" </em>", "</em>")
        #article.html = re.sub("</em>([^ \W])", "</em> \g<1>", article.html)
        article.html = re.sub("</em>([^\s\.,;:])", "</em> \g<1>", article.html)
        article.html = article.html.replace(" </i>", "</i>")
        article.html = re.sub("</i>([^\s\.,;:])", "</i> \g<1>", article.html)
        article.html = article.html.replace("<em>", "'")
        article.html = article.html.replace("</em>", "'")
        article.html = article.html.replace("<i>", "'")
        article.html = article.html.replace("</i>", "'")
        article.parse()
        # Disabled experiment kept from the original (no-op string):
        """
        if article.meta_description:
            article.text = article.meta_description + "\n\n" + article.text
        """
        return article.text, article.publish_date
    except newspaper.article.ArticleException:
        return None, None
def scrape(url):
    """Scrape one article from *url* with Newspaper3k.

    Parameters
    ----------
    url : str
        Address of the article to scrape.

    Returns
    -------
    dict
        ``{'url', 'datetime', 'title', 'text', 'keywords', 'summary'}``
        when extraction succeeds and the article exceeds 200 words.
    bool
        ``False`` when the page cannot be fetched, parsing fails, or the
        text is too short.
    """
    from newspaper import Article, Config
    import re

    logger.info(f"SCRAPE: trying {url}")

    config = Config()
    config.memoize_articles = False
    config.fetch_images = False
    config.language = 'en'
    config.browser_user_agent = get_ua()
    config.request_timeout = 5
    config.number_threads = 8

    response = get_html_from_url(url)
    if not (response['status_code'] and response['html']):
        logger.info(f'SCRAPE: Could not extract TEXT from {url}')
        return False

    try:
        article = Article(url=url, config=config)
        article.download_state = 2  # pretend the download already happened
        article.html = response['html']
        article.parse()
        article.nlp()
        words_count = len(article.text.split())
        if words_count > 200:
            logger.info(
                f'SCRAPE: Extracted TEXT from URL: {url}\n Title: "{article.title}"'
            )
            return {
                'url': url,
                'datetime': article.publish_date,
                'title': article.title,
                'text': " ".join(re.split(r'[\n\t]+', article.text)),
                'keywords': article.keywords,
                'summary': article.summary
            }
        logger.info(f'''SCRAPE: Could not extract TEXT from {url}\n Article too short: {words_count} words''')
    except Exception as e:
        logger.info(f'SCRAPE: Could not extract TEXT from {url}\n Error: {e}')
    return False
#cast string to list source_list = ast.literal_eval(e['source_list']) #finds which is the position of the o_url in the list (we will need that to retrieve the correct .html) o_idx = source_list.index(o_url) a = Article(o_url) #finds the html file article_alias = a_url.rstrip("/").split("/")[-1] article_folder = html_folder+"/"+article_alias o_html_filename = article_folder+"/"+str(o_idx)+".html" # set html manually with open(o_html_filename, 'rb') as fh: a.html = fh.read() # need to set download_state to 2 for this to work a.download_state = 2 a.parse() # Now the article should be populated print(a.text) gold_df.to_csv(cwd+"/datasetVeritas3.csv", index=False) print("average number of annotations per doc:", sum(lenlen)/len(lenlen)) lenlen.sort(reverse = True) print(lenlen[:200]) print("max num of annotations on the same source") print(max(lenlen)) print("NEW")
def parse_article(self, response):
    """Parse one crawled news page with three extractors — newspaper,
    dragnet, goose — plus articleDateExtractor, inserting each result
    into its own ParsedNews-* table via self.conn.

    NOTE(review): every INSERT below is built by string concatenation
    with ad-hoc quote doubling — injection-prone and fragile; switch to
    parameterized queries if dbconnector.execute supports them (verify
    its API).
    """
    # Hard-coded for this run; the original id came from the request meta.
    news_id = 19684  #response.meta.get('news_id')
    # save to file
    with open(str(news_id) + '.html', 'wb') as fh:
        fh.write(response.body)
    article = Article(response.url)
    # set html manually
    with open(str(news_id) + '.html', 'rb') as fh:
        article.html = fh.read()
    os.remove(str(news_id) + '.html')
    # need to set download_state to 2 for this to work
    article.download_state = 2
    article.parse()
    article.nlp()
    date = article.publish_date
    # SQL-escape single quotes and normalize list reprs to single-quoted
    # form for the ARRAY literals below.
    keywords = str([x.replace("'", "''")
                    for x in article.keywords]).replace('"', '\'')
    content = article.text.replace("'", "''")
    summary = article.summary.replace("'", "''")
    title = article.title.replace("'", "''")
    if date is None:
        date = 'null'
    else:
        date = "'" + str(date) + "'"
    authors = str([x.replace("'", "''")
                   for x in article.authors]).replace('"', '\'')
    tags = str([x.replace("'", "''")
                for x in article.meta_keywords]).replace('"', '\'')
    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-newspaper"("IDNews", "Date", "Content", "Keywords", '
        + '"Summary", "Authors", "Tags", "Title") '
        + 'VALUES (' + str(news_id) + ', ' + str(date) + ', \'' + content
        + '\', ARRAY ' + str(keywords) + '::text[], \'' + summary
        + '\', ARRAY ' + str(authors) + '::text[], ARRAY ' + str(tags)
        + '::text[], \'' + title + '\')')
    # get main article without comments
    content = extract_content(response.text).replace("'", "''")
    # get article and comments
    content_comments = '[\'' + extract_content_and_comments(
        response.text).replace("'", "''") + '\']'
    dbconnector.execute(
        self.conn,
        'INSERT INTO "ParsedNews-dragnet"("IDNews", "Content", "Comments") '
        + 'VALUES (' + str(news_id) + ', \'' + content + '\', ARRAY '
        + str(content_comments) + '::text[])')
    date = articleDateExtractor.extractArticlePublishedDate(
        articleLink=response.url, html=response.text)
    if date is not None:
        dbconnector.execute(
            self.conn,
            'INSERT INTO "ParsedNews-ade"("IDNews", "Date") '
            + 'VALUES (' + str(news_id) + ', \'' + str(date) + '\')')
    # Third pass: Goose extractor over the same raw HTML.
    g = Goose()
    article = g.extract(raw_html=response.text)
    date = article.publish_datetime_utc
    keywords = str([x.replace("'", "''")
                    for x in article.tags]).replace('"', '\'')
    content = article.cleaned_text.replace("'", "''")
    summary = article.meta_description.replace("'", "''")
    title = article.title.replace("'", "''")
    if date is None:
        date = 'null'
    else:
        date = "'" + str(date) + "'"
    authors = str([x.replace("'", "''")
                   for x in article.authors]).replace('"', '\'')
    tags = str([
        x.replace("'", "''") for x in article.meta_keywords.split(",")
    ]).replace('"', '\'')
    tweets = str([x.replace("'", "''")
                  for x in article.tweets]).replace('"', '\'')
    dbconnector.execute(
        self.conn, 'INSERT INTO "ParsedNews-goose"('
        + '"IDNews", "Date", "Content", "Keywords", "Summary", '
        + '"Authors", "Tags", "Tweets",'
        + '"Title") VALUES (' + str(news_id) + ', ' + date + ', \''
        + content + '\', ARRAY ' + str(keywords) + '::text[], \''
        + str(summary) + '\', ARRAY ' + str(authors) + '::text[], ARRAY '
        + str(tags) + '::text[], ARRAY ' + str(tweets) + '::text[], \''
        + str(title) + '\')')
    pass
# Re-parse cached article HTML: for every stored article whose page HTML
# is bz2-compressed in the web cache, decompress it, re-run newspaper's
# parser, and save the extracted text back onto the article document.
import bz2  # was used below but never imported — fixed
import re

from pymongo import MongoClient
from newspaper import Article

client = MongoClient()
db_articles = client.news.articles
db_web_cache = client.news.web_cache

docs = db_articles.find()
for doc in docs:
    # Parenthesized prints work under both Python 2 and 3.
    print(doc['_id'])
    if not doc['page']:
        continue
    url = doc['page']['urls'][0]
    web_cache_doc = db_web_cache.find_one({'url': url})
    if 'html_compressed' in web_cache_doc:
        article = Article(url=url)
        article.html = bz2.decompress(web_cache_doc['html_compressed'])
        article.is_downloaded = True
        article.parse()
        doc['page']['text'] = article.text
        print(len(doc['page']['text']))
        # NOTE: collection.save() is deprecated in modern pymongo;
        # replace_one is the current equivalent.
        db_articles.save(doc)
# Same re-parse pass as above snippet: decompress cached HTML, re-run
# newspaper, store the text. Missing imports (bz2, MongoClient) added —
# both names were used without being brought into scope here.
import bz2

from pymongo import MongoClient
from newspaper import Article

client = MongoClient()
db_articles = client.news.articles
db_web_cache = client.news.web_cache

docs = db_articles.find()
for doc in docs:
    # Parenthesized prints work under both Python 2 and 3.
    print(doc['_id'])
    if not doc['page']:
        continue
    url = doc['page']['urls'][0]
    web_cache_doc = db_web_cache.find_one({'url': url})
    if 'html_compressed' in web_cache_doc:
        article = Article(url=url)
        article.html = bz2.decompress(web_cache_doc['html_compressed'])
        article.is_downloaded = True
        article.parse()
        doc['page']['text'] = article.text
        print(len(doc['page']['text']))
        # NOTE: collection.save() is deprecated in modern pymongo.
        db_articles.save(doc)
def fetch_main_content(html: str) -> Article:
    """Parse raw *html* with newspaper and return the populated Article.

    No download is performed: the HTML is injected directly and the
    download state is forced to success so ``parse()`` will run.
    """
    article = Article(url='')
    article.html = html
    article.download_state = 2  # 2 == ArticleDownloadState.SUCCESS
    article.parse()
    return article