# Example 1
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        """Extract a claim review from a Newtral article page.

        Splits the og:title headline into claim author and claim text,
        then collects tags, publication date, review author, body text,
        referred links and a coarse verdict matched against a fixed list
        of Spanish rating words.

        :param parsed_claim_review_page: parsed HTML of one review page.
        :param url: URL the page was fetched from.
        :return: single-element list containing the populated Claim.
        """
        claim = Claim()
        claim.set_url(url)
        claim.set_source("newtral")

        # Title, claim text and claim author all come from the og:title meta
        # tag; the trailing "| Newtral" site name is dropped.
        title = parsed_claim_review_page.find(
            "meta", attrs={'property': 'og:title'})['content']
        title = title.strip().split("|")[0]
        claim.set_title(title)
        entry_content = parsed_claim_review_page.find(
            "div", attrs={'class': 'entry-content'})

        # Headlines quote either with guillemets («…») or curly quotes (“…”).
        dospunto = re.search(r'(: «)', title)
        dospunt = re.search(r'(: “)', title)

        if dospunto:
            claim_a = title.split(":")
            auteur = claim_a[0].strip()
            claim.author = auteur
            claim_text = claim_a[1].strip("« »")
            claim.claim = claim_text
        elif dospunt:
            claim_b = title.split(":")
            auteur = claim_b[0].strip()
            claim.author = auteur
            claim_text = claim_b[1].strip(": “ ”")
            claim.claim = claim_text

        # Multiple titles/claims: the article body may carry them as <h2>s.
        claim_mult = entry_content.findAll('h2') if entry_content else []

        if claim_mult:
            claim_al = [i.text.strip() for i in claim_mult]
            # BUGFIX: re.search() requires a string; the original passed the
            # list itself and raised TypeError whenever <h2>s were present.
            headings = " ".join(claim_al)
            dospunt = re.search(r'(: “)', headings)
            dospunto = re.search(r'(: «)', headings)
            if dospunt:
                claim_b = title.split(":")
                auteur = claim_b[0].strip()
                claim.author = auteur
                claim_text = claim_b[1].strip(": “ ”")
                claim.claim = claim_text
            elif dospunto:
                claim_a = title.split(":")
                auteur = claim_a[0].strip()
                claim.author = auteur
                claim_text = claim_a[1].strip("« »")
                claim.claim = claim_text
            else:
                # BUGFIX: set_title expects text, not the list of headings.
                claim.set_title(" ".join(claim_al))

        # Tags
        tags = parsed_claim_review_page.find_all(
            "meta", attrs={'property': 'article:tag'})
        tag_list = [tag['content'] for tag in tags]
        claim.set_tags(",".join(tag_list))

        # Date published
        published = parsed_claim_review_page.find(
            "meta", attrs={'property': 'article:published_time'})['content']
        claim.date_published = published.strip()

        # Review author ("Por <name>")
        author_span = parsed_claim_review_page.find(
            "span", attrs={'class': 'c-article__author'})
        author_a = author_span.find("a")
        claim.author_url = author_a['href']
        claim.review_author = re.sub('Por', '', author_a.text).strip()

        # Article body text
        body_t = entry_content.find_all('p') if entry_content else []
        body = [text.text.strip() for text in body_t]
        entry_text = " ".join(body) + "\n"
        claim.body = entry_text

        # Links referred to in the article body
        links = [
            link['href'] for link in entry_content.find_all('a', href=True)
        ] if entry_content else []
        claim.referred_links = links

        # Verdict: substring-match any known rating word in the intro (or,
        # failing that, in the body paragraphs), case-insensitively.
        intro = parsed_claim_review_page.find(
            "div", attrs={'class': 'c-article__intro'})

        veracities = [
            "ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS", "VERDADERO",
            "VERDAD A MEDIAS"
        ]

        def common(a, b):
            # Values of a that occur as substrings of b.
            return [value for value in a if value in b]

        if intro:
            rating_source = " ".join(str(v) for v in intro).upper()
        else:
            rating_source = " ".join(str(v) for v in body).upper()
        claim.alternate_name = [
            i.strip() for i in common(veracities, rating_source)
        ]

        return [claim]
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        """Extract a claim review from a FactScan article page.

        Reads the rating fields out of the page's (malformed) ld+json blob
        by string slicing, and pulls title, dates, body, author, referred
        links, claim text and tags out of the HTML.

        :param parsed_claim_review_page: parsed HTML of one review page.
        :param url: URL the page was fetched from.
        :return: single-element list containing the populated Claim.
        """
        claim = Claim()
        claim.set_url(url)
        claim.set_source("factscan")

        # Raw ld+json payload. It is malformed on this site, so it is never
        # fed to the json module — values are sliced out textually below.
        json_ = None
        script = parsed_claim_review_page.find(
            "script", {"type": "application/ld+json"})
        if script:
            json_ = script.get_text()

        def parse_wrong_json(json_, left, right):
            # Slice the value found between the `left` key token and the
            # `right` delimiter out of the malformed JSON text.
            # BUGFIX: str.split() always returns at least one element, so
            # the original "> 0" test was always true and indexing [1]
            # raised IndexError when the key was absent; require a real
            # match (> 1) instead.
            if json_ and len(json_.split(left)) > 1:
                return json_.split(left)[1].split(right)[0]
            return None

        # Summary box
        summary_box = parsed_claim_review_page.find("div",
                                                    {"class": "summary-box"})

        # Title
        title = parsed_claim_review_page.find(
            "meta", {"property": "og:title"})['content']
        claim.set_title(title)

        # Claim review date
        date = parsed_claim_review_page.find(
            'meta', {"property": "article:published_time"})
        if date:
            date_str = search_dates(
                date['content'].split("T")[0])[0][1].strftime("%Y-%m-%d")
            claim.set_date(date_str)

        # Creative-work date: the summary sentence usually ends with a date
        # introduced by one of these prepositions/verbs.
        summary_text = summary_box.find("p").text
        date_published = ""
        if " on " in summary_text:
            date_published = summary_text.split(" on ")[-1].strip()
        elif " published " in summary_text:
            date_published = summary_text.split(" published ")[-1].strip()
        elif " dated " in summary_text:
            date_published = summary_text.split(" dated ")[-1].strip()
        elif " from " in summary_text:
            date_published = summary_text.split(" from ")[-1].strip()
        elif " sent " in summary_text:
            # NOTE(review): splits on " in ", not " sent " — presumably for
            # phrasing like "sent ... in <month year>"; confirm intent.
            date_published = summary_text.split(" in ")[-1].strip()
        elif " in " in summary_text:
            date_published = summary_text.split(" in ")[-1].strip()

        if len(date_published) > 0:
            date_published = search_dates(date_published)[0][1].strftime(
                "%Y-%m-%d")
            claim.setDatePublished(date_published)

        # Rating: from the ld+json blob when present, otherwise from the alt
        # text of the fact-check icon image.
        if json_:
            claim.set_rating_value(
                parse_wrong_json(json_, '"ratingValue":', ","))
            claim.setWorstRating(parse_wrong_json(json_, '"worstRating":',
                                                  ","))
            claim.set_best_rating(parse_wrong_json(json_, '"bestRating":',
                                                   ","))
            claim.set_alternate_name(
                parse_wrong_json(json_, '"alternateName":', ","))
        else:
            icon_div = parsed_claim_review_page.find(
                "div", {"class": "fact-check-icon"})
            if icon_div and icon_div.find('img'):
                # The alt text has the form "<prefix>: <verdict>".
                claim_str = icon_div.find('img')['alt'].split(":")[1]
                claim.alternate_name = claim_str.strip()

        # Body
        body = parsed_claim_review_page.find("div", {"class": "entry-content"})
        claim.set_body(body.get_text())

        # Claim author: dedicated speaker element, falling back to the bold
        # name in the summary box.
        author = parsed_claim_review_page.find(
            "div", {"class": "sharethefacts-speaker-name"})
        if not author:
            author = summary_box.find("p").find("strong")
        if author:
            claim.set_author(author.text)

        # same_as (returns None when the blob or key is missing)
        claim.setSameAs(parse_wrong_json(json_, '"sameAs": [', "]"))

        # Related links from the article body
        divTag = parsed_claim_review_page.find("div",
                                               {"class": "entry-content"})
        related_links = []
        for link in divTag.findAll('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        # Claim text: dedicated statement element, else reuse the title.
        statement = parsed_claim_review_page.find(
            "div", {"class": "sharethefacts-statement"})
        if statement:
            claim.set_claim(statement.get_text())
        else:
            claim.set_claim(claim.title)

        # Tags: article:tag meta entries, falling back to category links.
        tags = []
        for tag in parsed_claim_review_page.findAll(
                'meta', {"property": "article:tag"}):
            tags.append(tag["content"])
        if len(tags) == 0:
            for tag in parsed_claim_review_page.findAll(
                    "a", {"rel": "category tag"}):
                tags.append(tag.text)
        claim.set_tags(", ".join(tags))

        return [claim]
# Example 3
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
        """Extract a claim review from a PolitiFact article page.

        Reads the statement quote, title, review date, verdict (alt text of
        the statement image), body, claim author, publication date, referred
        links and topic tags.

        :param parsed_claim_review_page: parsed HTML of one review page.
        :param url: URL the page was fetched from.
        :return: single-element list containing the populated Claim.
        """
        claim = Claim()
        claim.set_url(url)
        claim.set_source("politifact")

        # Claim text
        title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"})
        claim.set_claim(title.text)

        # Title
        title = parsed_claim_review_page.find("h2", {"class": "c-title"})
        claim.set_title(title.text)

        # Review date
        date = parsed_claim_review_page.find('span', {"class": "m-author__date"})
        if date:
            date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d")
            claim.set_date(date_str)

        # Rating: alt text of the verdict image.
        # BUGFIX: the original passed *set* literals like {"class", "..."} to
        # find(), which BeautifulSoup treats as a class filter matching either
        # word (including the literal class "class"); use proper attr dicts.
        # Each step is also guarded so a missing element no longer raises
        # AttributeError.
        statement_body = parsed_claim_review_page.find("div", {"class": "m-statement__body"})
        statement_detail_image_alt = None
        if statement_body:
            statement_detail = statement_body.find("div", {"class": "c-image"})
            if statement_detail:
                statement_detail_image = statement_detail.find("picture")
                if statement_detail_image:
                    statement_detail_image_alt = statement_detail_image.find(
                        "img", {"class": "c-image__original"})
        if statement_detail_image_alt:
            claim.alternate_name = statement_detail_image_alt['alt']

        # Body
        body = parsed_claim_review_page.find("article", {"class": "m-textblock"})
        claim.set_body(body.get_text())

        # Claim author
        statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
        if statement_meta:
            author = statement_meta.find("a").text
            claim.set_author(author)

        # Date the claim was made, e.g. "stated on January 1, 2020 in ...".
        # BUGFIX: test for " on "/" in " with surrounding spaces; the bare
        # substrings match inside words (e.g. "Wisconsin") after which
        # split(" on ")[1] raised IndexError.
        if statement_meta:
            meta_text = statement_meta.text
            if " on " in meta_text:
                meta_text = meta_text.split(" on ")[1]
            if " in " in meta_text:
                meta_text = meta_text.split(" in ")[0]
            if meta_text:
                date = search_dates(meta_text)
                if date:
                    claim.setDatePublished(date[0][1].strftime("%Y-%m-%d"))

        # Related links from the article body
        related_links = []
        for link in body.find_all('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip())

        # Topic tags: link titles from the topic list.
        # BUGFIX: the original collected the same <ul class="m-list"> twice
        # (once from the page, once from the statement body), duplicating
        # every tag; collect once.
        tags = []
        ul_tag = parsed_claim_review_page.find("ul", {"class": "m-list"})
        if ul_tag:
            for item in ul_tag.findAll("li", {"class": "m-list__item"}):
                a_tag = item.find("a", title=True)
                if a_tag:
                    tags.append(a_tag['title'])
        claim.set_tags(",".join(tags))

        return [claim]
# Example 4
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
        """Extract a claim review from a Newtral article page.

        Splits the og:title headline into claim author and claim text, then
        extracts tags, publication date, a rating phrase from the article
        intro (with several fallbacks), the review author, the body text and
        the referred links.

        :param parsed_claim_review_page: parsed HTML of one review page.
        :param url: URL the page was fetched from.
        :return: single-element list containing the populated Claim.
        """
        claim = Claim()
        claim.set_url(url)
        claim.set_source("newtral")

        # Title from og:title; drop the trailing "| Newtral" site name.
        title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content']
        title = title.strip().split("|")[0]
        claim.set_title(title)

        # Headlines quote either with guillemets («…») or curly quotes (“…”).
        dospunto = re.search(r'(: «)', title)
        dospunt = re.search(r'(: “)', title)

        if dospunto:
            # "<author>: «<claim>»"
            claim_a = title.split(":")
            auteur = claim_a[0].strip()
            claim.author = auteur
            claim_text = claim_a[1].strip("« »")
            claim.claim = claim_text

        elif dospunt:
            # "<author>: “<claim>”"
            claim_b = title.split(":")
            auteur = claim_b[0].strip()
            claim.author = auteur
            claim_text = claim_b[1].strip(": “ ”")
            claim.claim = claim_text
        else:
            pass

        # Tags from the article:tag meta entries.
        tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'})
        tag_list = []
        for tag in tags:
            tag_text = tag['content']
            tag_list.append(tag_text)
        claim.set_tags(",".join(tag_list))

        # Publication date.
        published = parsed_claim_review_page.find("meta", attrs={'property': 'article:published_time'})[
            'content']
        claim.date_published = published.strip()

        entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'})

        # Rating paragraph: prefer the article intro, then fall back to the
        # first <em>, <p> or <div> of the body. Some articles carry the
        # rating only inside an image, leaving no text at all.
        intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'})
        if intro is None:
            intro_rating_p = entry_content.find("em")
            if intro_rating_p is None:
                intro_rating_p = entry_content.find("p")
            if intro_rating_p is None:
                intro_rating_p = entry_content.find("div")
        else:
            intro_rating_p = intro.p
        rating_in_image = False
        if intro_rating_p is None:  # Rating in image...
            rating_in_image = True
            rating_text = ""
        else:
            rating_text = intro_rating_p.get_text()

        # Captures an upper-case verdict ("FALSO", "VERDAD A MEDIAS", ...)
        # optionally preceded by a known lead-in phrase. \p{Lu} (Unicode
        # upper-case letter) needs the third-party `regex` module, not `re`.
        rating_re_es_falso = regex.compile(
            r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es" + \
            "|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")

        es_falso_match = rating_re_es_falso.match(rating_text)
        if es_falso_match is not None and es_falso_match.group(2) is not None:
            rating_text = es_falso_match.group(2)
        else:
            # No phrase match: fall back to <b>/<strong> text inside the
            # rating paragraph (order matters: <b> first, then <strong>).
            if not rating_in_image:
                is_there_b = intro_rating_p.find('b')
                if is_there_b is not None:
                    rating_text = is_there_b.text
                else:
                    is_there_strong = intro_rating_p.find("strong")
                    if is_there_strong is not None:
                        rating_text = is_there_strong.text
                    else:
                        pass

        claim.alternate_name = rating_text

        # Review author ("Por <name>").
        author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'})
        author_a = author_span.find("a")
        author_url = author_a['href']
        author_text = author_a.text
        author_text = re.sub('Por', '', author_text).strip()
        claim.author_url = author_url
        claim.review_author = author_text

        # Article body text.
        entry_text = ""
        body_t = entry_content.find_all('p')
        body = [text.text.strip() for text in body_t]
        entry_text += " ".join(body) + "\n"
        claim.body = entry_text

        # Links referred to in the article body.
        links = [link['href'] for link in entry_content.find_all('a', href=True)]
        claim.referred_links = links

        return [claim]
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        """Extract a claim review from a (legacy-layout) PolitiFact page.

        Reads title, review date, rating (schema.org microdata with an
        image-alt fallback), body, claim author, sameAs, publication date,
        referred links, claim text and subject tags.

        :param parsed_claim_review_page: parsed HTML of one review page.
        :param url: URL the page was fetched from.
        :return: single-element list containing the populated Claim.
        """
        claim = Claim()
        claim.set_url(url)
        claim.set_source("politifact")

        # Title
        title = parsed_claim_review_page.find("h1",
                                              {"class": "article__title"})
        claim.set_title(title.text)

        # Review date: first paragraph of the sidebar widget.
        date = parsed_claim_review_page.find('div', {
            "class": "widget__content"
        }).find("p")
        if date:
            date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d")
            claim.set_date(date_str)

        # Rating from the reviewRating microdata block.
        rating_div = parsed_claim_review_page.find(
            "div", {"itemprop": "reviewRating"})
        if rating_div:
            rating_value = rating_div.find("div", {"itemprop": "ratingValue"})
            if rating_value:
                claim.rating_value = rating_value.text
            worst_rating = rating_div.find("div", {"itemprop": "worstRating"})
            if worst_rating:
                claim.worst_rating = worst_rating.text

            best_rating = rating_div.find("div", {"itemprop": "bestRating"})
            if best_rating:
                claim.best_rating = best_rating.text

            alternate_name = rating_div.find("div",
                                             {"itemprop": "alternateName"})
            if alternate_name:
                claim.alternate_name = alternate_name.text
        else:
            # Fallback: verdict is the alt text of the statement image.
            # BUGFIX: the original passed the *set* {"class", "statement-detail"},
            # which BeautifulSoup treats as a class filter matching either
            # word (including the literal class "class"); use an attrs dict.
            statement_detail = parsed_claim_review_page.find(
                "img", {"class": "statement-detail"})
            if statement_detail:
                claim.alternate_name = statement_detail['alt']

        # Body
        body = parsed_claim_review_page.find("div", {"class": "article__text"})
        claim.set_body(body.get_text())

        # Claim author: statement meta line, else itemReviewed microdata.
        statement_meta = parsed_claim_review_page.find(
            "p", {"class": "statement__meta"})
        if statement_meta:
            author = statement_meta.find("a").text
            claim.set_author(author)
        else:
            author = parsed_claim_review_page.find(
                "div", {"itemprop": "itemReviewed"})
            if author:
                author = author.find("div", {"itemprop": "author"})
                claim.set_author(author.text)

        # sameAs from the itemReviewed microdata.
        item_reviewed = parsed_claim_review_page.find(
            "div", {"itemprop": "itemReviewed"})
        if item_reviewed and item_reviewed.find("div", {"itemprop": "sameAs"}):
            claim.setSameAs(
                item_reviewed.find("div", {"itemprop": "sameAs"}).get_text())

        # Date the claim was made, e.g. "stated on January 1, 2020 in ...".
        # BUGFIX: test for " on "/" in " with surrounding spaces; the bare
        # substrings match inside words (e.g. "Wisconsin") after which
        # split(" on ")[1] raised IndexError.
        if statement_meta:
            meta_text = statement_meta.text
            if " on " in meta_text:
                meta_text = meta_text.split(" on ")[1]
            if " in " in meta_text:
                meta_text = meta_text.split(" in ")[0]
            if meta_text:
                date = search_dates(meta_text)
                if date:
                    claim.setDatePublished(date[0][1].strftime("%Y-%m-%d"))
        else:
            if item_reviewed and item_reviewed.find(
                    "div", {"itemprop": "datePublished"}):
                claim.setDatePublished(
                    item_reviewed.find(
                        "div", {"itemprop": "datePublished"}).get_text())

        # Related links from the article body.
        div_tag = parsed_claim_review_page.find("div",
                                                {"class": "article__text"})
        related_links = []
        for link in div_tag.findAll('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        claim.set_claim(
            parsed_claim_review_page.find(
                "div", {"class": "statement__text"}).text.strip())

        # Subject tags from the "About this article" widget.
        # BUGFIX: proper attrs dicts instead of set literals here as well.
        tags = []
        about_widget = parsed_claim_review_page.find(
            "div", {"class": "widget_about-article"})
        if about_widget:
            about_widget_contents = about_widget.find(
                "div", {"class": "widget__content"})
            for p in about_widget_contents.findAll("p"):
                if "Subjects:" in p.text:
                    for subject in p.findAll("a"):
                        tags.append(subject.text)

            claim.set_tags(",".join(tags))

        return [claim]
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        """Extract a claim review from a Truth or Fiction article page.

        Reads title, review date, body (last child of <article>, skipping a
        trailing source footer), referred links, claim/rating (dedicated divs
        with a headline-split fallback) and tags.

        :param parsed_claim_review_page: parsed HTML of one review page.
        :param url: URL the page was fetched from.
        :return: single-element list with the Claim, or [] when no
            claim/verdict pair could be extracted from the headline.
        """
        claim = Claim()
        claim.set_url(url)
        claim.set_source("truthorfiction")

        # Title
        title = parsed_claim_review_page.find(
            "meta", {"property": "og:title"})['content']
        claim.set_title(title)

        article = parsed_claim_review_page.find("article")

        # Review date from the published_time meta tag (ISO 8601).
        date_ = parsed_claim_review_page.find(
            'meta', {"property": "article:published_time"})['content']
        if date_:
            date_str = date_.split("T")[0]
            claim.set_date(date_str)

        # Body: last tag child of <article>, skipping the trailing
        # "content-source" footer when present.
        content = [
            tag for tag in article.contents
            if not isinstance(tag, NavigableString)
        ]
        body = content[-1]  # type: Tag
        if body.has_attr("class") and "content-source" in body['class']:
            body = content[-2]
        claim.set_body(body.text.strip())

        # Related links from the body.
        related_links = []
        for link in body.findAll('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        # Claim and rating: dedicated description divs, falling back to
        # splitting a "<claim> - <verdict>" headline.
        # BUGFIX: use attrs dicts — the original set literals made
        # BeautifulSoup match on the class attribute in a sloppier way.
        description = article.find("div", {"class": "claim-description"})
        rating = article.find("div", {"class": "rating-description"})

        if description and rating:
            claim.set_claim(description.text)
            claim.alternate_name = rating.text
        else:
            h1 = article.find("h1")
            text = h1.text.replace("–", "-")
            split_text = text.split("-")
            rating_text = split_text[-1]
            claim_text = "".join(split_text[0:-1])
            if len(claim_text) == 0 or "-" not in text:
                return []
            claim.set_alternate_name(rating_text)
            claim.set_claim(claim_text)

        # Tags.
        # BUGFIX: {"property", "article:tags"} was a *set*, which
        # BeautifulSoup treats as a CLASS filter, so the property attribute
        # was never matched and no tags were ever found; filter on the
        # property attribute with a dict instead.
        tags = []
        for tag in parsed_claim_review_page.findAll(
                "meta", {"property": "article:tags"}):
            tags.append(tag['content'])
        claim.set_tags(", ".join(tags))

        return [claim]