def getArticle(self, html):
    # Requires `from bs4 import BeautifulSoup` at module level and the
    # text4csv helper defined elsewhere in this codebase.
    article = "NA"
    author = "NA"
    soup = BeautifulSoup(html, "html.parser")
    article_tag = soup.find("div", {"class": "ArticleBody-articleBody"})
    if article_tag is not None:
        article = ""
        div_groups = article_tag.find_all("div", {"class": "group"}, recursive=False)
        for d in div_groups:
            for p in d.find_all("p", recursive=False):
                article += text4csv(p.get_text())
    else:
        # Page is a video clip: fall back to the clip's summary text.
        video_desc_div = soup.find(
            "div", {"class": "ClipPlayer-clipPlayerIntroSummary"})
        if video_desc_div is not None:
            article = text4csv(video_desc_div.get_text())
    author_a = soup.find("div", {"class": "Author-author"})
    if author_a is not None:
        author = text4csv(author_a.get_text())
    return article, author
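# text4csv is a helper defined elsewhere in this codebase and not shown in this
# excerpt. A minimal sketch of what it plausibly does, purely as an assumption:
# collapse newlines and runs of whitespace so a paragraph fits in one CSV field.
import re

def text4csv(text):
    # Collapse all whitespace (including newlines) into single spaces and trim,
    # so the extracted text can be stored cleanly in a single CSV cell.
    return re.sub(r"\s+", " ", text).strip()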
def getArticle(self, html):
    article = "NA"
    author = "NA"
    soup = BeautifulSoup(html, "html.parser")
    article_tag = soup.find("div", {"class": "caas-content-wrapper"})
    if article_tag is None:
        article_tag = soup.find("article")
    if article_tag is not None:
        article = ""
        for p in article_tag.find_all("p", recursive=False):
            article += text4csv(p.get_text())
        # Try the byline containers in order of specificity.
        author_span = article_tag.find("div", {"class": "caas-attr-meta"})
        if author_span is None:
            author_span = article_tag.find("a", {"class": "authors"})
        if author_span is None:
            author_span = article_tag.find("span", {"class": "authors"})
        if author_span is not None:
            author = text4csv(author_span.get_text())
    return article, author
def getArticle(self, html):
    # Boilerplate strings injected into article bodies that should be skipped.
    avoid_text = ["Do Not Sell",
                  "We're no longer maintaining this page.",
                  "For the latest business news and markets data, please visit CNN"]
    article = "NA"
    author = "NA"
    soup = BeautifulSoup(html, "html.parser")
    article_tag = soup.find_all("div", {"class": "el__leafmedia el__leafmedia--sourced-paragraph"})
    article_tag += soup.find_all("div", {"class": "zn-body__paragraph"})
    if len(article_tag) > 0:
        # Version 1 of the page layout: body text in paragraph divs.
        article = ""
        for d in article_tag:
            if d.get_text().strip() in avoid_text:
                continue
            article += " " + text4csv(d.get_text())
    else:
        # Version 2 of the page layout: a single #storytext container.
        article_tag = soup.find("div", {"id": "storytext"})
        if article_tag is not None:
            paragraphs = article_tag.find_all("p")
            if len(paragraphs) > 0:
                article = ""
                for p in paragraphs:
                    if p.get_text().strip() in avoid_text:
                        continue
                    article += " " + text4csv(p.get_text())
    author_span = soup.find("span", {"class": "metadata__byline__author"})
    if author_span is None:
        author_span = soup.find("span", {"class": "byline"})
    if author_span is not None:
        author = text4csv(author_span.get_text())
    return article, author
def getArticle(self, html):
    article = "NA"
    author = "NA"
    soup = BeautifulSoup(html, "html.parser")
    article_tag = soup.find("div", {"class": "article-content"})
    if article_tag is not None:
        article = ""
        paragraphs = article_tag.find_all("p")
        for p in paragraphs:
            article += text4csv(p.get_text())
    author_span = soup.find("span", {"class": "article-source"})
    if author_span is not None:
        author = text4csv(author_span.get_text())
    return article, author
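# Each getArticle variant above targets a different news site's HTML layout and
# is a method on some parser class not shown in this excerpt. A minimal usage
# sketch, assuming pages are fetched with requests and `parser` is an instance
# of whichever class holds one of the methods above (both assumptions, not part
# of the original code).
import requests

def fetch_and_parse(parser, url):
    # Download the page and extract (article, author) with the site-specific parser.
    html = requests.get(url, timeout=10).text
    return parser.getArticle(html)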