Ejemplo n.º 1
0
    def getArticle(self, html):
        """Extract the article text and the author from an HTML page.

        The text comes from the ``ArticleBody-articleBody`` div: every
        ``<p>`` that is a direct child of a direct-child ``div.group`` is
        passed through ``text4csv`` and the results are concatenated. When
        that div is missing, the page is treated as a video page and the
        ``ClipPlayer-clipPlayerIntroSummary`` div supplies the text instead.

        Args:
            html: Markup of the page, handed to ``BeautifulSoup``.

        Returns:
            An ``(article, author)`` tuple. Each element is ``"NA"`` when
            the corresponding markup is not found.
        """
        soup = BeautifulSoup(html, "html.parser")

        body = soup.find("div", {"class": "ArticleBody-articleBody"})
        if body is not None:
            groups = body.find_all("div", {"class": "group"}, recursive=False)
            # Only direct children are visited at both levels, so nested
            # markup inside a group is not picked up twice.
            article = "".join(
                text4csv(paragraph.get_text())
                for group in groups
                for paragraph in group.find_all("p", recursive=False)
            )
        else:
            # No article body: fall back to the video clip summary.
            summary = soup.find(
                "div", {"class": "ClipPlayer-clipPlayerIntroSummary"})
            article = "NA" if summary is None else text4csv(summary.get_text())

        byline = soup.find("div", {"class": "Author-author"})
        author = "NA" if byline is None else text4csv(byline.get_text())

        return article, author
Ejemplo n.º 2
0
    def getArticle(self, html):
        """Extract the article text and the author from an HTML page.

        The content container is the ``caas-content-wrapper`` div, or the
        first ``<article>`` element when that div is missing. The article
        is the concatenation of ``text4csv`` applied to every ``<p>`` that
        is a direct child of the container. The author is searched for
        inside the same container, trying ``div.caas-attr-meta``,
        ``a.authors`` and ``span.authors`` in that order.

        Args:
            html: Markup of the page, handed to ``BeautifulSoup``.

        Returns:
            An ``(article, author)`` tuple. Both elements are ``"NA"`` when
            no content container exists; ``author`` is ``"NA"`` when none of
            the author elements is present in the container.
        """
        article = "NA"
        author = "NA"

        soup = BeautifulSoup(html, "html.parser")
        article_tag = soup.find("div", {"class": "caas-content-wrapper"})
        if article_tag is None:
            article_tag = soup.find("article")

        if article_tag is None:
            # The author lookup is scoped to the container, so without one
            # there is nothing to search. Returning here avoids calling
            # .find() on None, which raised AttributeError.
            return article, author

        article = "".join(
            text4csv(p.get_text())
            for p in article_tag.find_all("p", recursive=False)
        )

        # Candidate author elements, in priority order; first match wins.
        author_candidates = (
            ("div", "caas-attr-meta"),
            ("a", "authors"),
            ("span", "authors"),
        )
        for tag_name, css_class in author_candidates:
            author_span = article_tag.find(tag_name, {"class": css_class})
            if author_span is not None:
                author = text4csv(author_span.get_text())
                break

        return article, author
Ejemplo n.º 3
0
    def getArticle(self, html):
        """Extract the article text and the author from an HTML page.

        Two page layouts are handled:

        * Version 1: the text lives in divs with class
          ``el__leafmedia el__leafmedia--sourced-paragraph`` and/or
          ``zn-body__paragraph``.
        * Version 2: used only when version 1 finds no divs; the text lives
          in the ``<p>`` elements inside the div with id ``storytext``.

        In both layouts an element is skipped when its stripped text equals
        one of the boilerplate strings; every other element contributes a
        space followed by its ``text4csv``-processed text.

        Args:
            html: Markup of the page, handed to ``BeautifulSoup``.

        Returns:
            An ``(article, author)`` tuple. Each element is ``"NA"`` when
            the corresponding markup is not found.
        """
        boilerplate = ["Do Not Sell","We're no longer maintaining this page.","For the latest business news and markets data, please visit CNN"]

        def collect_text(nodes):
            # Concatenate the nodes' text, dropping exact boilerplate matches.
            return "".join(
                " " + text4csv(node.get_text())
                for node in nodes
                if node.get_text().strip() not in boilerplate
            )

        soup = BeautifulSoup(html, "html.parser")
        article = "NA"

        body_divs = soup.findAll("div", {"class": "el__leafmedia el__leafmedia--sourced-paragraph"})
        body_divs += soup.findAll("div", {"class": "zn-body__paragraph"})
        if body_divs:
            # Version 1 layout.
            article = collect_text(body_divs)
        else:
            # Version 2 layout.
            story = soup.find("div", {"id": "storytext"})
            if story is not None:
                paragraphs = story.findAll("p")
                if paragraphs:
                    article = collect_text(paragraphs)

        byline = soup.find("span", {"class": "metadata__byline__author"})
        if byline is None:
            byline = soup.find("span", {"class": "byline"})
        author = "NA" if byline is None else text4csv(byline.get_text())

        return article, author
Ejemplo n.º 4
0
    def getArticle(self, html):
        """Extract the article text and the author from an HTML page.

        The article is the concatenation of ``text4csv`` applied to every
        ``<p>`` found anywhere inside the ``article-content`` div. The
        author is the ``text4csv``-processed text of the ``article-source``
        span.

        Args:
            html: Markup of the page, handed to ``BeautifulSoup``.

        Returns:
            An ``(article, author)`` tuple. Each element is ``"NA"`` when
            the corresponding markup is not found.
        """
        soup = BeautifulSoup(html, "html.parser")

        content = soup.find("div", {"class": "article-content"})
        if content is None:
            article = "NA"
        else:
            article = "".join(
                text4csv(paragraph.get_text())
                for paragraph in content.findAll("p")
            )

        source = soup.find("span", {"class": "article-source"})
        if source is None:
            author = "NA"
        else:
            author = text4csv(source.get_text())

        return article, author