Example #1
def get_claim_from_cache(url: str) -> Optional[Claim]:
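    """Return the Claim cached in Redis under this URL, or None on a cache miss."""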
    result = redis.hgetall("___cached___claim___" + url)
    if result:
        claim = Claim.from_dictionary(result)
        return claim
    else:
        return None
Example #2
    def _annotate_claim(self, claim: Claim):
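        """Run the entity annotator over the claim text and, when present, its tags, body and author (English and French claims only)."""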
        if self.language == "eng" or self.language == "fra":
            claim_text = claim.claim
            claim.claim_entities = self.annotator.annotate(
                claim_text, language=self.language)

            if claim.tags is not None:
                keywords = claim.tags
                claim.keyword_entities = self.annotator.annotate(
                    keywords, language=self.language)

            if claim.body is not None:
                claim_body = claim.body
                claim.body_entities = self.annotator.annotate(
                    claim_body, language=self.language)

            if claim.author is not None:
                author = claim.author
                claim.author_entities = self.annotator.annotate(
                    author, language=self.language)
Example #3
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
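        """Build a single Claim (title, claim text and author, tags, publication date, rating, review author, body and referred links) from a parsed Newtral fact-check page."""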
        claim = Claim()
        claim.set_url(url)
        claim.set_source("newtral")

        title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content']
        title = title.strip().split("|")[0]
        claim.set_title(title)

        dospunto = re.search(r'(: «)', title)
        dospunt = re.search(r'(: “)', title)

        if dospunto:
            claim_a = title.split(":")
            auteur = claim_a[0].strip()
            claim.author = auteur
            # print ("auteur:" , auteur)
            claim_text = claim_a[1].strip("« »")
            claim.claim = claim_text

        elif dospunt:
            claim_b = title.split(":")
            auteur = claim_b[0].strip()
            # print ("auteur:" , auteur)
            claim.author = auteur
            claim_text = claim_b[1].strip(": “ ”")
            # print ("claim :", claim)
            claim.claim = claim_text
        else:
            pass

        tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'})
        tag_list = []
        for tag in tags:
            tag_text = tag['content']
            tag_list.append(tag_text)
        claim.set_tags(",".join(tag_list))

        published = parsed_claim_review_page.find("meta", attrs={'property': 'article:published_time'})[
            'content']
        claim.date_published = published.strip()

        entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'})

        intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'})
        if intro is None:
            intro_rating_p = entry_content.find("em")
            if intro_rating_p is None:
                intro_rating_p = entry_content.find("p")
            if intro_rating_p is None:
                intro_rating_p = entry_content.find("div")
        else:
            intro_rating_p = intro.p
        rating_in_image = False
        if intro_rating_p is None:  # Rating in image...
            rating_in_image = True
            rating_text = ""
        else:
            rating_text = intro_rating_p.get_text()

        rating_re_es_falso = regex.compile(
            r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
            r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")

        es_falso_match = rating_re_es_falso.match(rating_text)
        if es_falso_match is not None and es_falso_match.group(2) is not None:
            rating_text = es_falso_match.group(2)
        else:
            if not rating_in_image:
                is_there_b = intro_rating_p.find('b')
                if is_there_b is not None:
                    rating_text = is_there_b.text
                else:
                    is_there_strong = intro_rating_p.find("strong")
                    if is_there_strong is not None:
                        rating_text = is_there_strong.text
                    else:
                        pass

        claim.rating = rating_text

        author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'})
        author_a = author_span.find("a")
        author_url = author_a['href']
        author_text = author_a.text
        author_text = re.sub('Por', '', author_text).strip()
        claim.author_url = author_url
        claim.review_author = author_text

        # Retrieve the article text

        entry_text = ""
        body_t = entry_content.find_all('p')
        body = [text.text.strip() for text in body_t]
        entry_text += " ".join(body) + "\n"
        claim.body = entry_text

        # Retrieve the links found in the article text
        links = [link['href'] for link in entry_content.find_all('a', href=True)]
        claim.referred_links = links

        return [claim]
Example #4
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
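        """Build a single Claim (title, date, author, body, related links, claim text and rating) from a truthorfiction.com review page."""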
        claim = Claim()
        claim.set_url(url)
        claim.set_source("truthorfiction")

        title = parsed_claim_review_page.find("meta", {"property": "og:title"})['content']
        claim.set_title(title)

        article = parsed_claim_review_page.find("article")

        # date
        date_ = parsed_claim_review_page.find('meta', {"property": "article:published_time"})['content']
        if date_:
            date_str = date_.split("T")[0]
            claim.set_date(date_str)

        # author
        author_ = parsed_claim_review_page.find('meta', {"name": "author"})['content']
        if author_:
            # the author meta tag holds a plain name; splitting it on "T" (copied from
            # the date handling above) would mangle names, so use the value directly
            claim.set_author(author_.strip())

        # author link
        author_url = parsed_claim_review_page.find('a', {"class": "url fn n"})['href']
        if author_url:
            claim.author_url = author_url

        # body
        content = [tag for tag in article.contents if not isinstance(tag, NavigableString)]
        body = content[-1]  # type: Tag
        if body.has_attr("class") and "content-source" in body['class']:
            body = content[-2]
        claim.set_body(body.text.strip())

        # related links
        related_links = []
        for link in body.findAll('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        description = article.find("div", {"class": "claim-description"})
        rating = article.find("div", {"class": "rating-description"})

        if description and rating:
            claim.set_claim(description.text)
            claim.rating = rating.text
        else:
            h1 = article.find("h1")
            text = h1.text.replace("–", "-")
            split_text = text.split("-")
            rating_text = split_text[-1]
            claim_text = "".join(split_text[0:-1])
            if len(claim_text) == 0 or "-" not in text:
                return []
            else:
                claim.set_rating(rating_text)
                claim.set_claim(claim_text)
        
        # tags
        tags = []
        if parsed_claim_review_page.select('footer > span.tags-links > a'):
            for link in parsed_claim_review_page.select('footer > span.tags-links > a'):
                if hasattr(link, 'href'):
                    #tag_link = link['href']
                    tags.append(link.text)

        claim.set_tags(", ".join(tags))

        return [claim]
Example #5
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
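        """Build a single Claim from a polygraph.info article: title, date, body, related links, author and verdict."""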
        claim = Claim()
        claim.set_url(url)
        claim.set_source("polygraph")

        # title
        title = parsed_claim_review_page.find("h1", {"class": "title pg-title"})
        claim.set_title(title.text.replace(";", ","))

        # date
        full_date = parsed_claim_review_page.find("time")['datetime'].split("T")
        claim.set_date(full_date[0])

        # body
        # body = parsed_claim_review_page.find('div', {"id":"article-content"}).find_all('p')
        # for b in body:
        #    claim.set_body(b.get_text())
        body = parsed_claim_review_page.find("div", {"id": "article-content"})
        claim.set_body(body.get_text())

        # related related_links
        related_links = []
        for link in body.findAll('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        claim.set_claim(claim.title)

        # author
        author = parsed_claim_review_page.find('h4', {"class": "author"})
        claim.set_author(author.text)

        # rating
        rating = parsed_claim_review_page.find('div', {"class": "verdict"}).find_all('span')[1]
        claim.set_rating(rating.text)

        return [claim]
Example #6
def get_all_claims(criteria):
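    """Crawl the fact-check category pages of theferret.scot, scrape each article into a Claim, and return the claims as a pandas DataFrame."""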
    headers = {
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}

    # print criteria.maxClaims
    # paging through the fact-check category and adding each article URL to urls_
    now = datetime.datetime.now()
    urls_ = {}
    last_page = []
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        url = "https://theferret.scot/category/fact-check/page/" + str(page_number) + "/"
        # try:
        page = requests.get(url, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify()

        links = soup.findAll("h1", {"class": "entry-title"})
        if (len(links) != 0) or (links != last_page):
            for anchor in links:
                anchor = anchor.find('a', {"rel": "bookmark"}, href=True)
                ind_ = str(anchor['href'])
                if (ind_ not in list(urls_.keys())):
                    if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
                        break
                    urls_[ind_] = page
                    print("adding " + str(ind_))
            last_page = links
        else:
            print("break!")
            break
    # except:
    #	print "error=>"+str(url)

    claims = []
    index = 0
    # visit each collected article URL and extract its content.
    for url, conclusion in urls_.items():
        print(str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " + str(url))
        index += 1

        url_complete = str(url)

        # print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = Claim()
            claim_.set_url(url_complete)
            claim_.set_source("theferret")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            # if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("h1", {"class": "cover-title"})
            claim_.set_title(title.text)

            # date

            date_ = soup.find('div', {"class": "widget__content"}).find("p")
            # print date_["content"]
            if date_:
                date_str = search_dates(date_.text)[0][1].strftime("%Y-%m-%d")
                # print date_str
                claim_.set_date(date_str)
            # print claim_.date

            # body
            body = soup.find("div", {"class": "article__text"})
            claim_.set_body(body.get_text())

            # related links
            divTag = soup.find("div", {"class": "article__text"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.set_refered_links(related_links)

            claim_.set_claim(soup.find("h1", {"class": "article__title"}).text)
            claim_.setConclusion(conclusion)

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                # print "achou"
                tags.append(tag["content"])
            claim_.set_tags(", ".join(tags))

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #7
def get_all_claims(criteria):
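    """Search mimikama.at letter by letter, scrape each article found into a Claim, and return the claims as a pandas DataFrame."""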
    # performing a search by each letter, and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    letters = [
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n",
        "o", "p", "q", "x", "y", "z"
    ]
    letters = ["a"]  # NOTE: the list above is immediately overridden; only "a" is searched
    for l in letters:
        for page in range(1, 500):
            if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
                break
            try:
                print(
                    ("http://www.mimikama.at/page/" + str(page) + "/?s=" + l))
                page = urllib.request.urlopen("http://www.mimikama.at/page/" +
                                              str(page) + "/?s=" + l).read()
            except:
                break
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()
            links = soup.find('div', {
                "class": "td-ss-main-content"
            }).findAll('a', {"rel": "bookmark"}, href=True)
            if len(links) != 0:
                for anchor in links:
                    if (anchor['href'] not in list(urls_.keys())):
                        urls_[anchor['href']] = l
                        print("adding " + str(anchor['href']))
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
            else:
                print("break!")
                break

    claims = []
    index = 0
    # visit each collected article URL and extract its content.
    for url in list(urls_.keys()):
        try:
            print(
                str(index) + "/" + str(len(list(urls_.keys()))) +
                " extracting " + str(url))
            index += 1
            claim_ = Claim()
            claim_.set_source("mimikama")
            url_complete = url
            claim_.set_url(url_complete)
            page = urllib.request.urlopen(url_complete, timeout=5).read()
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()

            # conclusion
            # conclusion=soup.find('div', {"class": "td-post-content"}).find('h2')
            # if conclusion :
            # 	claim_.setConclusion(conclusion.get_text())

            # title
            title = soup.find("h1", {"class": "entry-title"})
            claim_.set_title(title.text)

            # claim
            # claim = soup.find('div', {"class": "td-post-content"}).find('h2')
            # if claim and claim.find_previous('strong'):
            #	claim_.setClaim(claim.find_previous('strong').get_text())
            # else:
            claim_.set_claim(claim_.title)

            # date
            date = soup.find("time",
                             {"class": "entry-date updated td-module-date"})
            # print date

            # print (search_dates(date.get_text())[0][1].strftime("%Y-%m-%d"))
            claim_.set_date(
                search_dates(date.get_text())[0][1].strftime("%Y-%m-%d"))

            # related links
            divTag = soup.find("div", {"class": "td-post-content"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.set_refered_links(related_links)

            body = soup.find("div", {"class": "td-post-content"})
            claim_.set_body(body.get_text())

            claims.append(claim_.generate_dictionary())
        except Exception:
            print("Error => " + url)

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #8
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
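        """Build a single Claim from a PolitiFact article: statement, title, dates, rating (read from the meter image alt text), body, review author, related links and tags."""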
        claim = Claim()
        claim.set_url(url)
        #print("\r" + url)

        claim.set_source("politifact")

        # Claim
        title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"})
        claim.set_claim(title.text.strip())

        # title
        title = parsed_claim_review_page.find("h2", {"class": "c-title"})
        claim.set_title(title.text.strip())
        
        # date
        date = parsed_claim_review_page.find('span', {"class": "m-author__date"})
        if date:
            date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d")
            claim.set_date(date_str)

        # rating
        # https://static.politifact.com/politifact/rulings/meter-mostly-false.jpg
        statement_body = parsed_claim_review_page.find("div", {"class": "m-statement__body"})
        statement_detail = statement_body.find("div", {"class": "c-image"})
        statement_detail_image = statement_detail.find("picture")
        statement_detail_image_alt = statement_detail_image.find("img", {"class": "c-image__original"})
        if statement_detail_image_alt:
            #claim.alternate_name = statement_detail_image_alt['src'].split("rulings/")[1].split(".jpg")[0]            
            if self.translate_rating_value(statement_detail_image_alt['alt']) != "":
                claim.rating = self.translate_rating_value(statement_detail_image_alt['alt'])
            else:
                claim.rating = statement_detail_image_alt['alt']

        # body
        body = parsed_claim_review_page.find("article", {"class": "m-textblock"})
        #body.find("div", {"class": "artembed"}).decompose()
        #claim.set_body(body.get_text())

        text = ""
        if parsed_claim_review_page.select( 'main > section > div.t-row__center > article.m-textblock' ):
            for child in parsed_claim_review_page.select( 'main > section > div.t-row__center > article.m-textblock' ):
                for element in child.contents:
                    if (element.name == "div"):
                        valid = True
                        # check for illegal JS element in artembed (tag):
                        if (hasattr( element, 'class' )):
                            try:
                                if ('class' in element.attrs):
                                    if (element.attrs['class'][0] == "artembed"):
                                        if (element.text.startswith("\r\nwindow.gciAnalyticsUAID")):
                                            valid = False
                            except KeyError:
                                print("KeyError: Skip")
                    else:
                        valid = True
                        if hasattr( element, 'text' ):
                            #if (element.text == "We rate this claim False." and url == "https://www.politifact.com/staff/kelsey-tamakloe/"):
                            if (url == "https://www.politifact.com/staff/kelsey-tamakloe/"):
                                print("\r" + str(element.text))
                    if (valid == True):
                        if (element):
                            if (hasattr( element, 'text' )):
                                text += " " + str(element.text)
                            else:
                                text += " " + str(element)

            body_description = text.strip()
            claim.body = str(body_description).strip()

        # author
        author_meta = parsed_claim_review_page.find("div", {"class": "m-author__content"})
        if author_meta:
            author = author_meta.find("a").text
            claim.set_author(author)
            author_url = author_meta.find("a")
            if author_url.attrs["href"] != "":
                claim.author_url = "https://www.politifact.com" + author_url.attrs["href"]

        # date published
        statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
        if statement_meta:
            meta_text = statement_meta.text
            if "on" in meta_text:
                meta_text = meta_text.split(" on ")[1]
            if "in" in meta_text:
                meta_text = meta_text.split(" in ")[0]
            if meta_text:
                date = search_dates(meta_text)
                if date:
                    date = date[0][1].strftime("%Y-%m-%d")
                    claim.date = date
        
        # related links
        related_links = []
        for link in body.find_all('a', href=True):
            if link['href'].startswith("/"):
                related_links.append("https://www.politifact.com" + link['href'])
            else:
                related_links.append(link['href'])
        claim.set_refered_links(related_links)

        claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip())
        
        tags = []
        ul_tag = parsed_claim_review_page.find("ul", {"class": "m-list"})
        if ul_tag:
            ul_tag_contents = ul_tag.findAll("li", {"class": "m-list__item"})
            for a in ul_tag_contents:
                a_tag = a.find("a", title=True)
                a_tag_text = a_tag['title']
                tags.append(a_tag_text)

        if statement_body:
            topics = statement_body.find("ul", {"class", "m-list"}).find_all("a")
            for link in topics:
                text = link['title']
                tags.append(text)
            claim.set_tags(",".join(tags))

        return [claim]
Example #9
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
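        """Build a single Claim from an AAP FactCheck article: title, verdict, body, publication date and referred links."""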
        claim = Claim()
        claim.set_url(url)
        claim.set_source("factcheck_aap")

        # The title
        elements = parsed_claim_review_page.findAll('h1')
        if len(elements) == 1:
            title = elements[0].text
        else:
            title = elements[1].text

        claim.set_title(title.strip())

        body = parsed_claim_review_page.select(".c-article__content")

        verdict_div = body[0].select(".c-article__verdict")
        if len(verdict_div) > 0:
            verdict_strongs = verdict_div[0].find_all("strong")
        else:
            verdict_strongs = body[0].find_all("strong")
        verdict = ""
        for verdict_strong in verdict_strongs:
            if "AAP FactCheck" not in verdict_strong.text and "AAP FactCheck Investigation:" not in verdict_strong.text:
                verdict = verdict_strong.text
                break
        claim.set_rating(verdict)
        if len(verdict_div) > 0:
            verdict_div[0].decompose()

        # The body
        body_text = body[0].text
        claim.set_body(body_text)

        # Date where the article was published

        date_tag = parsed_claim_review_page.find("date",
                                                 attrs={'class': 'd-none'})
        date_text = date_tag.text
        find_date = dateparser.parse(date_text)
        claim.set_date_published(find_date.strftime("%Y-%m-%d"))

        elements = body[0].find_all('a')
        refs = []
        for elem in elements:
            refs.append(elem['href'])
        claim.set_refered_links(refs)

        return [claim]
Example #10
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
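        """Build a single Claim from a Newtral fact-check page, with a verdict matched against a list of Spanish rating keywords found in the intro or body."""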
        claim = Claim()
        claim.set_url(url)
        claim.set_source("newtral")
        # title, claim text and claim author
        title = parsed_claim_review_page.find("meta",
                                              attrs={'property':
                                                     'og:title'})['content']
        title = title.strip().split("|")[0]
        claim.set_title(title)
        entry_content = parsed_claim_review_page.find(
            "div", attrs={'class': 'entry-content'})
        #print (title)
        dospunto = re.search(r'(: «)', title)
        dospunt = re.search(r'(: “)', title)

        if dospunto:
            claim_a = title.split(":")
            auteur = claim_a[0].strip()
            claim.author = auteur
            claim_text = claim_a[1].strip("« »")
            claim.claim = claim_text
            #print (claim_a)

        elif dospunt:
            claim_b = title.split(":")
            auteur = claim_b[0].strip()
            # print ("auteur:" , auteur)
            claim.author = auteur
            claim_text = claim_b[1].strip(": “ ”")
            # print ("claim :", claim)
            claim.claim = claim_text
        else:
            pass
        # multiple titles or claims
        claim_mult = entry_content.findAll('h2')

        if claim_mult:
            # join the <h2> texts into one string: re.search expects a string,
            # not a list, and set_title below also needs a string
            claim_al = " ".join(i.text.strip() for i in claim_mult)
            dospunt = re.search(r'(: “)', claim_al)
            dospunto = re.search(r'(: «)', claim_al)
            if dospunt:
                claim_b = title.split(":")
                auteur = claim_b[0].strip()
                # print ("auteur:" , auteur)
                claim.author = auteur
                claim_text = claim_b[1].strip(": “ ”")
                # print ("claim :", claim)
                claim.claim = claim_text
            elif dospunto:
                claim_a = title.split(":")
                auteur = claim_a[0].strip()
                claim.author = auteur
                claim_text = claim_a[1].strip("« »")
                claim.claim = claim_text
                #print (claim_a)
            else:
                claim.set_title(claim_al)

        # tags
        tags = parsed_claim_review_page.find_all(
            "meta", attrs={'property': 'article:tag'})
        tag_list = []
        for tag in tags:
            tag_text = tag['content']
            tag_list.append(tag_text)
        claim.set_tags(",".join(tag_list))

        # date published
        published = parsed_claim_review_page.find(
            "meta", attrs={'property': 'article:published_time'})['content']
        claim.date_published = published.strip()

        # article author
        author_span = parsed_claim_review_page.find(
            "span", attrs={'class': 'c-article__author'})
        author_a = author_span.find("a")
        author_url = author_a['href']
        author_text = author_a.text
        author_text = re.sub('Por', '', author_text).strip()
        claim.author_url = author_url
        claim.review_author = author_text

        # Retrieve the article text

        entry_text = ""
        body_t = entry_content.find_all('p')
        body = [text.text.strip() for text in body_t]
        entry_text += " ".join(body) + "\n"
        claim.body = entry_text

        # Retrieve the links found in the article text
        links = [
            link['href'] for link in entry_content.find_all('a', href=True)
        ]
        claim.referred_links = links

        # Verdict (veracity)
        intro = parsed_claim_review_page.find(
            "div", attrs={'class': 'c-article__intro'})

        veracities = [
            "ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS", "VERDADERO",
            "VERDAD A MEDIAS"
        ]

        def common(a, b):
            c = [value for value in a if value in b]
            return c

        if intro:
            intro_p = " ".join(str(v) for v in intro)
            #print(type(body_t))
            rating_text_list = intro_p.upper()
            rating_text = [
                i.strip() for i in common(veracities, rating_text_list)
            ]
            claim.alternate_name = rating_text

        else:
            body_a = " ".join(str(v) for v in body)
            #print(type(body_t))
            rating_text_list = body_a.upper()
            rating_text = [
                i.strip() for i in common(veracities, rating_text_list)
            ]
            claim.alternate_name = rating_text

        return [claim]
Example #11
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
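        """Build a single Claim from a FactsCan review, reading the rating fields from the embedded ld+json block when one is present."""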
        claim = Claim()
        claim.set_url(url)
        claim.set_source("factscan")

        json_ = None
        if parsed_claim_review_page.find("script",
                                         {"type": "application/ld+json"}):
            json_ = parsed_claim_review_page.find("script", {
                "type": "application/ld+json"
            }).get_text()

        def parse_wrong_json(json_, left, right):
            # guard against the delimiter being absent: split() would then return a
            # single-element list and indexing [1] would raise IndexError
            if json_ and len(json_.split(left)) > 1:
                return json_.split(left)[1].split(right)[0]
            return None

        # Summary box
        summary_box = parsed_claim_review_page.find("div",
                                                    {"class": "summary-box"})

        # title
        title = parsed_claim_review_page.find(
            "meta", {"property": "og:title"})['content']
        claim.set_title(title)

        # claim review date
        date = parsed_claim_review_page.find(
            'meta', {"property": "article:published_time"})
        if date:
            date_str = search_dates(
                date['content'].split("T")[0])[0][1].strftime("%Y-%m-%d")
            claim.set_date(date_str)

        # Creative work date

        summary_text = summary_box.find("p").text
        date_published = ""
        if " on " in summary_text:
            date_published = summary_text.split(" on ")[-1].strip()
        else:
            if " published " in summary_text:
                date_published = summary_text.split(" published ")[-1].strip()
            elif " dated " in summary_text:
                date_published = summary_text.split(" dated ")[-1].strip()
            elif " from " in summary_text:
                date_published = summary_text.split(" from ")[-1].strip()
            elif " sent " in summary_text:
                date_published = summary_text.split(" in ")[-1].strip()
            elif " in " in summary_text:
                date_published = summary_text.split(" in ")[-1].strip()

        if len(date_published) > 0:
            date_published = search_dates(date_published)[0][1].strftime(
                "%Y-%m-%d")
            claim.setDatePublished(date_published)

        # rating
        if json_:
            claim.set_rating_value(
                parse_wrong_json(json_, '"ratingValue":', ","))
            claim.setWorstRating(parse_wrong_json(json_, '"worstRating":',
                                                  ","))
            claim.set_best_rating(parse_wrong_json(json_, '"bestRating":',
                                                   ","))
            claim.set_alternate_name(
                parse_wrong_json(json_, '"alternateName":', ","))
        # when there is no json
        else:
            if parsed_claim_review_page.find("div",
                                             {"class": "fact-check-icon"}):
                if parsed_claim_review_page.find("div", {
                        "class": "fact-check-icon"
                }).find('img'):
                    claim_str = \
                        parsed_claim_review_page.find("div", {"class": "fact-check-icon"}).find('img')['alt'].split(
                            ":")[1]
                    claim.alternate_name = claim_str.strip()

        # body
        body = parsed_claim_review_page.find("div", {"class": "entry-content"})
        claim.set_body(body.get_text())

        # author
        author = parsed_claim_review_page.find(
            "div", {"class": "sharethefacts-speaker-name"})
        if not author:
            author = summary_box.find("p").find("strong")

        if author:
            claim.set_author(author.text)

        # same_as
        claim.setSameAs(parse_wrong_json(json_, '"sameAs": [', "]"))

        # related links
        divTag = parsed_claim_review_page.find("div",
                                               {"class": "entry-content"})
        related_links = []
        for link in divTag.findAll('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        if parsed_claim_review_page.find("div",
                                         {"class": "sharethefacts-statement"}):
            claim.set_claim(
                parsed_claim_review_page.find(
                    "div", {
                        "class": "sharethefacts-statement"
                    }).get_text())
        else:
            claim.set_claim(claim.title)

        tags = []

        for tag in parsed_claim_review_page.findAll(
                'meta', {"property": "article:tag"}):
            tags.append(tag["content"])
        if len(tags) == 0:
            for tag in parsed_claim_review_page.findAll(
                    "a", {"rel": "category tag"}):
                tags.append(tag.text)
        claim.set_tags(", ".join(tags))

        return [claim]
Example #12
def get_all_claims(criteria):
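    """Crawl the G1 "e-ou-nao-e" feed pages, scrape each article into a Claim, and return the claims as a pandas DataFrame."""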
    print(criteria.maxClaims)
    # paging through the feed and adding each article URL to urls_
    now = datetime.datetime.now()
    urls_ = {}
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        try:
            page = urllib.request.urlopen(
                "https://g1.globo.com/e-ou-nao-e/index/feed/pagina-" +
                str(page_number) + ".ghtml").read()
        except:
            break
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        links = soup.findAll('a', {"class": "feed-post-link"}, href=True)
        if len(links) != 0:
            for anchor in links:
                if (anchor['href'] not in list(urls_.keys())):
                    if (criteria.maxClaims > 0
                            and len(urls_) >= criteria.maxClaims):
                        break
                    urls_[anchor['href']] = page_number
                    print("adding " + str(anchor['href']))
        else:
            print("break!")
            break

    claims = []
    index = 0
    # visit each collected article URL and extract its content.
    for url, conclusion in urls_.items():
        print(
            str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " +
            str(url))
        index += 1

        url_complete = str(url)

        # print url_complete
        page = urllib.request.urlopen(url_complete).read().decode(
            'utf-8', 'ignore')
        soup = BeautifulSoup(page, "lxml")
        soup.prettify("utf-8")

        claim_ = Claim()
        claim_.set_url(url_complete)
        claim_.set_source("g1")

        if (criteria.html):
            claim_.setHtml(soup.prettify("utf-8"))

        try:
            # title
            # if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("h1", {"class": "content-head__title"})
            claim_.set_title(title.text)

            # date

            date_ = soup.find('time', {"itemprop": "datePublished"})
            if date_:
                date_str = date_.get_text().split(" ")[1]
                claim_.set_date(
                    dateparser.parse(date_str, settings={
                        'DATE_ORDER': 'DMY'
                    }).strftime("%Y-%m-%d"))
            # print claim_.date

            # body
            body = soup.find("article")
            claim_.set_body(body.get_text().replace("\n", "").replace(
                "TwitterFacebookE-mailWhatsApp", ""))

            # related links
            divTag = soup.find("article", {"itemprop": "articleBody"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.set_refered_links(related_links)

            # claim
            claim_conclusion = soup.find("h1", {
                "class": "content-head__title"
            }).get_text()
            # claim_.setClaim(claim_conclusion)
            # if (len(claim_conclusion.split("?"))>1):
            claim_.set_claim(claim_conclusion.split("?")[0])
            claim_.setConclusion(claim_conclusion.split("?")[1])
            # if (claim_element.find_previous_sibling("figure") and claim_element.find_previous_sibling("figure").findAll("figcaption")):
            # 	claim_.setConclusion(claim_element.find_previous_sibling("figure").findAll("figcaption")[-1:][0].get_text())
            # print claim_.claim.decode("utf-8") + " ====> "
            # print claim_.conclusion.decode("utf-8")
            # print "-->"+ str(claim_.conclusion)

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #13
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
        claim = Claim()
        claim.set_url(url)
        claim.set_source("politifact")


        # Claim
        title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"})
        claim.set_claim(title.text)

        # title
        title = parsed_claim_review_page.find("h2", {"class": "c-title"})
        claim.set_title(title.text)
        
        # date
        date = parsed_claim_review_page.find('span', {"class": "m-author__date"})
        if date:
            date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d")
            claim.set_date(date_str)

        # rating
        statement_body = parsed_claim_review_page.find("div", {"class": "m-statement__body"})
        statement_detail = statement_body.find("div", {"class": "c-image"})
        statement_detail_image = statement_detail.find("picture")
        statement_detail_image_alt = statement_detail_image.find("img", {"class": "c-image__original"})
        if statement_detail_image_alt:
            claim.alternate_name = statement_detail_image_alt['alt']

        # body
        body = parsed_claim_review_page.find("article", {"class": "m-textblock"})
        claim.set_body(body.get_text())

        # author
        statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
        if statement_meta:
            author = statement_meta.find("a").text
            claim.set_author(author)

        # date published
        if statement_meta:
            meta_text = statement_meta.text
            if "on" in meta_text:
                meta_text = meta_text.split(" on ")[1]
            if "in" in meta_text:
                meta_text = meta_text.split(" in ")[0]
            if meta_text:
                date = search_dates(meta_text)
                if date:
                    date = date[0][1].strftime("%Y-%m-%d")
                    claim.setDatePublished(date)
        
        # related links
        div_tag = parsed_claim_review_page.find("article", {"class": "m-textblock"})
        related_links = []
        for link in body.find_all('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip())
        
        tags = []
        ul_tag = parsed_claim_review_page.find("ul", {"class", "m-list"})
        if ul_tag:
            ul_tag_contents = ul_tag.findAll("li", {"class", "m-list__item"})
            for a in ul_tag_contents:
                a_tag=a.find("a", title=True)
                a_tag_text=a_tag['title']
                tags.append(a_tag_text)

        if statement_body:
            topics = statement_body.find("ul", {"class", "m-list"}).find_all("a")
            for link in topics:
                text = link['title']
                tags.append(text)
            claim.set_tags(",".join(tags))

        return [claim]
Example #14
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
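        """Assemble a Claim for fatabyyano from the dedicated helper extractors (rating, date, claim, review body, links and tags)."""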
        self.claim = self.extract_claim(parsed_claim_review_page)
        self.review = self.extract_review(parsed_claim_review_page)

        claim = Claim()
        claim.set_rating_value(
            self.extract_rating_value(parsed_claim_review_page))
        claim.set_rating(
            FatabyyanoFactCheckingSiteExtractor.translate_rating_value(
                self.extract_rating_value(parsed_claim_review_page)))
        claim.set_source("fatabyyano")
        claim.set_author("fatabyyano")
        claim.set_date_published(self.extract_date(parsed_claim_review_page))
        claim.set_claim(self.claim)
        claim.set_body(self.review)
        claim.set_refered_links(self.extract_links(parsed_claim_review_page))
        claim.set_title(self.extract_claim(parsed_claim_review_page))
        claim.set_date(self.extract_date(parsed_claim_review_page))
        claim.set_url(url)
        claim.set_tags(self.extract_tags(parsed_claim_review_page))

        return [claim]
Example #15
def get_all_claims(criteria):
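    """Crawl the Washington Post Fact Checker listing pages, scrape each article into a Claim, and return the claims as a pandas DataFrame."""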
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # paging through the Fact Checker listing and adding each article URL to urls_
    urls_ = {}
    for page_number in range(1, 500):
        if 0 < criteria.maxClaims <= len(urls_):
            break
        url = "https://www.washingtonpost.com/news/fact-checker/page/" + str(
            page_number) + "/"
        if page_number == 1:
            url = "https://www.washingtonpost.com/news/fact-checker/?utm_term=.c0f1538d1850"

        # try:
        print(url)
        page = requests.get(url, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify()
        print(page.text)
        links = soup.findAll("div", {"class": "story-headline"})
        print(links)
        if len(links) == 0:
            break

        for anchor in links:
            anchor = anchor.find("a")
            ind_ = str(anchor['href'])
            if ind_ not in list(urls_.keys()):
                if 0 < criteria.maxClaims <= len(urls_):
                    break
                urls_[ind_] = ind_

    claims = []
    index = 0
    # visit each collected article URL and extract its content.
    for url, conclusion in urls_.items():
        print(
            str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " +
            str(url))
        index += 1

        url_complete = str(url)

        # print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = Claim()
            claim_.set_url(url_complete)
            claim_.set_source("washingtonpost")

            if criteria.html:
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            title = soup.find("h1", {"class": "article__title"})
            claim_.set_title(title.text)

            # date

            date_ = soup.find('div', {"class": "widget__content"}).find("p")
            if date_:
                date_str = search_dates(date_.text)[0][1].strftime("%Y-%m-%d")
                claim_.set_date(date_str)

            # body
            body = soup.find("div", {"class": "article__text"})
            claim_.set_body(body.get_text())

            # related links
            divTag = soup.find("div", {"class": "article__text"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.set_refered_links(related_links)

            claim_.set_claim(soup.find("h1", {"class": "article__title"}).text)
            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                tags.append(tag["content"])
            claim_.set_tags(", ".join(tags))

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #16
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
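        """Build the main Claim for an Africa Check article and, when the page rates several statements inline, one extra Claim per inline verdict."""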
        #print(parsed_claim_review_page)
        local_claims = []
        claim = Claim()
        claim.set_url(url)
        claim.set_source("africacheck")

        # title

        title = parsed_claim_review_page.find("meta", {"property": "og:title"})

        global_title_text = title['content']
        print(global_title_text)
        claim.set_title(global_title_text)

        # date
        date = parsed_claim_review_page.find('span', {"class": "published"})
        #print(date.text)
        global_date_str = ""
        if date:
            #global_date_str = search_dates(date.text.split(" ")[0])[0][1].strftime("%Y-%m-%d")
            datee = date.text

            global_date_str0 = re.search("[0-9]+ [a-zA-Z]+ [0-9]+", datee)
            global_date_str = global_date_str0.group(0)

            print(global_date_str)
            claim.set_date(global_date_str)

        # rating
        global_truth_rating = ""
        if parsed_claim_review_page.find(
                "div", {"class": "article-details__verdict"}):  # TODO: change
            global_truth = parsed_claim_review_page.find(
                "div", {"class": "article-details__verdict"})
            div_rating = global_truth.find('div')  # TODO: change
            div_rating_class = div_rating["class"][1]  # TODO: change
            div_rating_class_verdict0 = re.search(
                "[-][a-zA-Z]+", div_rating_class).group(0)  # TODO: change
            global_truth_rating = str(
                re.search("[a-zA-Z]+", div_rating_class_verdict0).group(0))
            print(re.search("[a-zA-Z]+",
                            div_rating_class_verdict0).group(0))  # TODO: change

        if parsed_claim_review_page.find("div", {"class": "verdict-stamp"}):
            global_truth_rating = parsed_claim_review_page.find(
                "div", {
                    "class": "verdict-stamp"
                }).get_text()
        if parsed_claim_review_page.find("div", {"class": "verdict"}):
            global_truth_rating = parsed_claim_review_page.find(
                "div", {
                    "class": "verdict"
                }).get_text()
        if parsed_claim_review_page.find(
                "div", {"class": "report-verdict indicator"}):
            global_truth_rating = parsed_claim_review_page.find(
                "div", {
                    "class": "report-verdict indicator"
                }).get_text()
            if parsed_claim_review_page.find(
                    "div", {
                        "class": "report-verdict indicator"
                    }).find('span'):
                global_truth_rating = parsed_claim_review_page.find(
                    "div", {
                        "class": "report-verdict indicator"
                    }).find('span').get_text()

        claim.set_rating(global_truth_rating)  # TODO: change

        # author
        #author = parsed_claim_review_page.find("div", {"class": "sharethefacts-speaker-name"})
        author = None  # initialised here so the later "if author:" checks cannot raise NameError
        if parsed_claim_review_page.findAll("div",
                                            {"class": "author-details"}):
            author0 = parsed_claim_review_page.findAll(
                "div", {"class": "author-details"})  #changer
            #print(author0)
            if author0:
                for author in author0:
                    claim.set_author(author.find('h4').get_text())
                    print(author.find('h4').get_text())
        elif parsed_claim_review_page.find("div", {"class": "author-details"}):
            author = parsed_claim_review_page.find(
                "div", {"class": "author-details"})  #changer
            print(author)
            if author:
                claim.set_author(author.find('h4').get_text())
        #if author:
        # claim.set_author(author.get_text())

        # when there is no json

        date = parsed_claim_review_page.find('span', {"class": "published"})
        #print(date.text)
        global_date_str = ""
        if date:
            #global_date_str = search_dates(date.text.split(" ")[0])[0][1].strftime("%Y-%m-%d")
            datee = date.text

            global_date_str0 = re.search("[0-9]+ [a-zA-Z]+ [0-9]+", datee)
            global_date_str = global_date_str0.group(0)

            print(global_date_str)
            claim.set_date(global_date_str)

        #date = parsed_claim_review_page.find("time", {"class": "datetime"})
        #print(date)
        #if date:
        #claim.set_date(date.get_text())

        tags = []

        for tag in parsed_claim_review_page.findAll(
                'meta', {"property": "article:tag"}):
            tags.append(tag["content"])
        claim.set_tags(", ".join(tags))
        print(tags)

        global_claim_text = ""
        #report_claim_div = parsed_claim_review_page.find("div", {"class": "report-claim"})
        report_claim_div0 = parsed_claim_review_page.findAll(
            "div", {"class": "grid-x grid-padding-x"})
        report_claim_div = report_claim_div0[1]
        #report_claim_div = parsed_claim_review_page.find("div", {"class": "clearfix text-formatted field field--name-body field--type-text-with-summary field--label-hidden field__item"})
        #for pr in report_claim_div:
        #    print(pr)
        #print(report_claim_div[1])
        if report_claim_div:
            if report_claim_div.find("p") is not None:
                claim.set_claim(report_claim_div.find("p").get_text())
            #print(report_claim_div.find("p").get_text())
        else:
            claim.set_claim(claim.title)

        inline_ratings = parsed_claim_review_page.findAll(
            "div", {"class", "inline-rating"})
        #print(inline_ratings)
        #entry_section = parsed_claim_review_page.find("section", {"class", "entry-content"})  # type: Tag
        entry_section = parsed_claim_review_page.find(
            "section", {"class", "cell"})  # type: Tag
        entry_section_full_text = entry_section.text
        # There are several claims checked within the page. Common date, author, tags ,etc.
        if inline_ratings and len(inline_ratings) > 0:
            entry_contents = entry_section.contents  # type : List[Tag]
            current_index = 0

            # First we extract the bit of text common to everything until we meet a sub-section
            body_text, links, current_index = get_text_and_links_until_next_header(
                entry_contents, current_index)
            claim.set_body(body_text)
            claim.set_refered_links(links)

            while current_index < len(entry_contents):
                current_index = forward_until_inline_rating(
                    entry_contents, current_index)
                inline_rating_div = entry_contents[current_index]
                if isinstance(inline_rating_div, NavigableString):
                    break
                claim_text = inline_rating_div.find("p", {
                    "class": "claim-content"
                }).text
                inline_rating = inline_rating_div.find(
                    "div", {"class", "indicator"}).find("span").text
                previous_current_index = current_index
                inline_body_text, inline_links, current_index = get_text_and_links_until_next_header(
                    entry_contents, current_index)
                if previous_current_index == current_index:
                    current_index += 1
                inline_claim = Claim()
                inline_claim.set_source("africacheck")
                inline_claim.set_claim(claim_text)
                inline_claim.set_rating(inline_rating)
                inline_claim.set_refered_links(",".join(inline_links))
                inline_claim.set_body(inline_body_text)
                inline_claim.set_tags(", ".join(tags))
                inline_claim.set_date(global_date_str)
                inline_claim.set_url(url)
                if author:
                    inline_claim.set_author(author.get_text())
                inline_claim.set_title(global_title_text)

                local_claims.append(inline_claim)
        elif "PROMISE:" in entry_section_full_text and "VERDICT:" in entry_section_full_text:
            entry_contents = entry_section.contents  # type : List[Tag]
            current_index = 0

            # First we extract the bit of text common to everything until we meet a sub-section
            body_text, links, current_index = get_text_and_links_until_next_header(
                entry_contents, current_index)
            claim.set_body(body_text)
            claim.set_refered_links(links)

            while current_index < len(entry_contents):
                inline_rating_div = entry_contents[current_index]
                if isinstance(inline_rating_div, NavigableString):
                    break
                claim_text = entry_contents[current_index + 2].span.text
                inline_rating = entry_contents[current_index + 4].span.text
                current_index += 5
                previous_current_index = current_index
                inline_body_text, inline_links, current_index = get_text_and_links_until_next_header(
                    entry_contents, current_index)
                if previous_current_index == current_index:
                    current_index += 1
                inline_claim = Claim()
                inline_claim.set_source("africacheck")
                inline_claim.set_claim(claim_text)
                inline_claim.set_rating(inline_rating)
                inline_claim.set_refered_links(",".join(inline_links))
                inline_claim.set_body(inline_body_text)
                inline_claim.set_tags(", ".join(tags))
                inline_claim.set_date(global_date_str)
                inline_claim.set_url(url)
                if author:
                    inline_claim.set_author(author.get_text())
                inline_claim.set_title(global_title_text)

                local_claims.append(inline_claim)

        else:
            # body
            #body = parsed_claim_review_page.find("div", {"id": "main"})
            body = parsed_claim_review_page.find(
                "div", {"id": "block-mainpagecontent"})
            claim.set_body(body.get_text())
            # related links
            divTag = parsed_claim_review_page.find(
                "div", {"id": "block-mainpagecontent"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim.set_refered_links(",".join(related_links))

        local_claims.append(claim)

        return local_claims
Example #17
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
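        """Assemble a Claim from the site-specific helper extractors, including named entities extracted from the claim and the review body."""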
        claim = Claim()
        self.claim = self.extract_claim(parsed_claim_review_page)
        self.review = self.extract_review(parsed_claim_review_page)
        rating_value = self.extract_rating_value(parsed_claim_review_page)
        claim.set_rating_value(rating_value)
        claim.set_alternate_name(self.translate_rating_value(rating_value))
        claim.set_source(self.extract_author(
            parsed_claim_review_page))  # review author
        claim.set_author(self.extract_claimed_by(
            parsed_claim_review_page))  # ? auteur de la claim?
        # claim.setDatePublished(self.extract_date(parsed_claim_review_page)) #? publication de la claim
        claim.set_claim(self.claim)
        claim.set_body(self.review)
        claim.set_refered_links(self.extract_links(parsed_claim_review_page))
        claim.set_title(self.extract_title(parsed_claim_review_page))
        # publication date of the review
        claim.set_date(self.extract_date(parsed_claim_review_page))
        claim.set_url(url)
        claim.set_tags(self.extract_tags(parsed_claim_review_page))

        # extract_entities returns two variables
        json_claim, json_body = self.extract_entities(self.claim, self.review)
        claim.set_claim_entities(json_claim)
        claim.set_body_entities(json_body)
        return [claim]
Ejemplo n.º 18
0
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        claim = Claim()
        claim.set_url(url)
        claim.set_source("checkyourfact")

        # title
        title = parsed_claim_review_page.find('article').find("h1")
        claim.set_title(title.text.replace("FACT CHECK: ", ""))

        url_date = url.replace("https://checkyourfact.com/",
                               "").replace("/", " ").split(" ")
        claim.set_date(url_date[0] + "-" + url_date[1] + "-" + url_date[2])

        # body
        body = parsed_claim_review_page.find("article")
        claim.set_body(body.get_text())

        # related links
        div_tag = parsed_claim_review_page.find("article")
        related_links = []
        for link in div_tag.findAll('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        claim.set_claim(claim.title)

        # rating
        rating = find_by_text(parsed_claim_review_page, "Verdict", "span")
        if rating:
            rating_text = rating[0].text.split(":")[-1].strip()
            claim.set_rating(rating_text)
        else:
            pass

        tags = []

        for tag in parsed_claim_review_page.findAll(
                'meta', {"property": "article:tag"}):
            tags.append(tag["content"])
        claim.set_tags(", ".join(tags))
        if len(claim.rating) == 0:
            return []
        else:
            return [claim]
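
find_by_text, used for the Verdict lookup above, is another shared utility of the crawler. A possible sketch, assuming it simply returns every tag of the given name whose text contains the search string (the page fragment in the usage example is invented):

from typing import List

from bs4 import BeautifulSoup, Tag


def find_by_text(soup: BeautifulSoup, text: str, tag_name: str) -> List[Tag]:
    """Return all <tag_name> elements whose text contains `text` (sketch of the assumed helper)."""
    return [tag for tag in soup.find_all(tag_name) if text in tag.get_text()]


# Example: extracting the verdict from a minimal, made-up page fragment
snippet = BeautifulSoup("<article><span>Verdict: FALSE</span></article>", "lxml")
spans = find_by_text(snippet, "Verdict", "span")
print(spans[0].text.split(":")[-1].strip())  # -> FALSE
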
Ejemplo n.º 19
0
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        if url in url_blacklist:
            return []
        claim = Claim()

        # url
        claim.url = str(url)

        # source
        claim.source = "snopes"

        # title
        title = None
        if parsed_claim_review_page.select('article > header > h1'):
            for tmp in parsed_claim_review_page.select(
                    'article > header > h1'):
                title = tmp.text.strip()
            #sub_title = parsed_claim_review_page.select( 'article > header > h2' )
            claim.title = str(title.strip())

        # author
        author_list = []
        author_links = []
        if parsed_claim_review_page.select(
                'article > header > ul.list-unstyled.authors.list-unstyled.d-flex.flex-wrap.comma-separated > li > a'
        ):
            for author_a in parsed_claim_review_page.select(
                    'article > header > ul.list-unstyled.authors.list-unstyled.d-flex.flex-wrap.comma-separated > li > a'
            ):
                if author_a.has_attr('href'):
                    author_list.append(author_a.text.strip())
                    author_links.append(author_a.attrs['href'])
                else:
                    print("no author?")

        claim.author = ", ".join(author_list)
        claim.author_url = (", ".join(author_links))

        # review_author ?
        # -

        # date
        datePub = None
        dateUpd = None
        date_str = ""
        date_ = parsed_claim_review_page.find('ul', {"class": "dates"})

        if date_:
            dates = date_.find('li', {"class": "font-weight-bold text-muted"})
            dateSpans = dates.span
            for dateItems in dateSpans:
                if dateItems == 'Published':
                    datePub = dateItems.next.strip()
                    date_str = dateparser.parse(datePub).strftime("%Y-%m-%d")
                    claim.date_published = date_str
                    claim.date = date_str
                if dateItems == 'Updated':
                    dateUpd = dateItems.next.strip()
                    date_str = dateparser.parse(dateUpd).strftime("%Y-%m-%d")
                    claim.date = date_str

        # claim image?
        # -

        # claim
        claim_text = None
        if parsed_claim_review_page.select(
                'article > div > div.claim-text.card-body'):
            for p in parsed_claim_review_page.select(
                    'article > div > div.claim-text.card-body'):
                if hasattr(p, 'text'):
                    claim_text = p.text.strip()
            claim.claim = str(claim_text).strip()

        # rating -> https://www.snopes.com/fact-check-ratings/
        rating = None
        if parsed_claim_review_page.select(
                'article > div > div > div > div.media-body > span'):
            for rating_span in parsed_claim_review_page.select(
                    'article > div > div > div > div.media-body > span'):
                rating = rating_span.text.strip()
            claim.rating = str(rating).replace('"', "").strip()
        # claim.set_rating_value( rating )

        # rating best
        whats_true = None
        if parsed_claim_review_page.select(
                'article > div > div > div.whats-true > div > p'):
            for rating_span_true in parsed_claim_review_page.select(
                    'article > div > div > div.whats-true > div > p'):
                whats_true = rating_span_true.text.strip()
            if whats_true:
                whats_true = str(whats_true).replace('"', "")
                # Text: (not Numerical value)
                # claim.best_rating = whats_true

        # rating worst
        whats_false = None
        if parsed_claim_review_page.select(
                'article > div > div > div.whats-false > div > p'):
            for rating_span_false in parsed_claim_review_page.select(
                    'article > div > div > div.whats-false > div > p'):
                whats_false = rating_span_false.text.strip()
            if whats_false:
                whats_false = str(whats_false).replace('"', "")
                # Text: (not Numerical value)
                # claim.worst_rating = whats_false

        # rating Undetermined?
        whats_undetermined = False
        if parsed_claim_review_page.select(
                'article > div > div > div.whats-undetermined > div > p'):
            for rating_span_undetermined in parsed_claim_review_page.select(
                    'article > div > div > div.whats-undetermined > div > p'):
                whats_undetermined = rating_span_undetermined.text.strip()
            if whats_undetermined:
                whats_undetermined = str(whats_undetermined).replace('"', "")
                # Text: (not Numerical value)
                # claim.whats_undetermined = whats_undetermined

        # rating value ?
        # -

        # Body description
        text = ""
        if parsed_claim_review_page.select(
                'article > div.single-body.card.card-body.rich-text > p'):
            for child in parsed_claim_review_page.select(
                    'article > div.single-body.card.card-body.rich-text > p'):
                text += " " + child.text
            body_description = text.strip()
            claim.body = str(body_description).strip()

        # related links
        related_links = []
        if parsed_claim_review_page.select(
                'article > div.single-body.card.card-body > p > a'):
            for link in parsed_claim_review_page.select(
                    'article > div.single-body.card.card-body > p > a'):
                if link.has_attr('href'):
                    related_links.append(link['href'])
            claim.referred_links = related_links

        # tags
        tags = []
        if parsed_claim_review_page.select(
                'article > footer > div > a > div > div'):
            for tag in parsed_claim_review_page.select(
                    'article > footer > div > a > div > div'):
                if hasattr(tag, 'text'):
                    tags.append(tag.text.strip())
            claim.tags = ", ".join(tags)

        # same as ?
        # -

        #  No Rating? No Claim?
        if not claim_text or not rating:
            print(url)
            if not rating:
                print("-> Rating cannot be found!")
            if not claim_text:
                print("-> Claim cannot be found!")
            return []

        return [claim]
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
        claim = Claim()
        claim.set_url(url)
        claim.set_source("eufactcheck")

        #title
        # Since the title always starts with the verdict followed by the article title, we split the string on ":"
        full_title = parsed_claim_review_page.find("div", {"class":"page-title-head hgroup"}).find("h1").get_text().split(":")
        claim.set_title(full_title[1])

        #date
        full_date = parsed_claim_review_page.find("time", {"class":"entry-date updated"})['datetime'].split("T")
        claim.set_date(full_date[0])

        #body
        body = parsed_claim_review_page.find('div', {"class":"entry-content"})
        claim.set_body(body.get_text())

        # related links
        related_links = []
        for link in body.findAll('a', href=True):
            related_links.append(link['href'])
        claim.set_refered_links(related_links)

        claim.set_claim(claim.title)

        #rating
        rating = full_title[0].strip()
        claim.set_alternate_name(rating)
        return [claim]
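
Note that the eufactcheck title is split on every ':', so a title that itself contains a colon would be cut short. A hedged alternative, splitting only on the first colon, keeps the rest of the title intact (the sample title is invented):

# Sketch: split only on the first ':' so colons inside the title survive
full_title = "MOSTLY FALSE: Some title: with a colon".split(":", 1)
rating = full_title[0].strip()   # "MOSTLY FALSE"
title = full_title[1].strip()    # "Some title: with a colon"
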
Ejemplo n.º 21
0
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        claim = Claim()
        claim_txt = self.extract_claim(parsed_claim_review_page)
        review = self.extract_review(parsed_claim_review_page)
        rating_value = self.extract_rating_value(parsed_claim_review_page, url)
        claim.set_rating(rating_value)
        claim.set_source("Vishvanews")  # auteur de la review
        claim.review_author = self.extract_author(parsed_claim_review_page)
        claim.set_author(self.extract_claimed_by(
            parsed_claim_review_page))  # ? auteur de la claim?
        # claim.setDatePublished(self.extract_date(parsed_claim_review_page)) #? publication de la claim
        claim.set_claim(claim_txt)
        claim.set_body(review)
        claim.set_refered_links(self.extract_links(parsed_claim_review_page))
        claim.set_title(self.extract_title(parsed_claim_review_page))
        # publication date of the review
        claim.set_date(self.extract_date(parsed_claim_review_page))
        claim.set_url(url)
        claim.set_tags(self.extract_tags(parsed_claim_review_page))

        # extract_entities returns two variables
        json_claim, json_body = self.extract_entities(claim_txt, review)
        claim.claim_entities = json_claim
        claim.body_entities = json_body

        if claim.rating != "":
            return [claim]
        else:
            return []
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        if url in url_blacklist:
            return []

        claim = Claim()
        claim.set_url(url)
        claim.set_source("snopes")

        # title
        article = parsed_claim_review_page.find("article",
                                                {'class', 'main-post'})
        header = article.find("header")
        title = header.find("h1")
        claim.set_title(title.text)

        card = article.find("div", {"class": "content-wrapper card"})
        card_body = card.find("div", {'class': 'content'})

        # date
        date_str = ""
        rating = None
        claim_text = None
        date_ = parsed_claim_review_page.find('span',
                                              {"class": "date date-published"})
        # print date_["content"]
        if not date_:
            date_ = parsed_claim_review_page.find(
                'span', {"class": "date date-last-update"})
        if date_:
            date_str = dateparser.parse(date_.text).strftime("%Y-%m-%d")

        # body

        ads = card_body.findAll("div")
        for ad in ads:
            ad.decompose()

        ads = card_body.findAll("div", {"class": "snopes-bt"})
        for ad in ads:
            ad.decompose()

        text = ""
        contents = card_body.findChildren()
        for child in contents:
            text += child.text

        body_description = text

        # author
        author = parsed_claim_review_page.find("a", {"class": "author"})

        rating_div = None
        if not rating:
            rating = parsed_claim_review_page.find("span",
                                                   {"class": "rating-name"})
        if not rating:
            rating_div = parsed_claim_review_page.find(
                "div", {"class": "media rating"})
        if not rating and not rating_div:
            rating_div = parsed_claim_review_page.find("div",
                                                       {"class": "claim-old"})
        if not rating and not rating_div:
            rating_div = parsed_claim_review_page.find(
                "div", {"class": "rating-wrapper card"})
        if rating_div:
            rating = rating_div.find("h5")
            if not rating:
                rating = rating_div.find("span")
        if not rating:
            # Oldest page format
            rating = parsed_claim_review_page.find("font",
                                                   {"class", "status_color"})
            if rating:
                rating = rating.find("b")

        # related links
        related_links = []
        for link in card_body.findAll('a', href=True):
            related_links.append(link['href'])

        if not claim_text:
            claim_p = parsed_claim_review_page.find('p', {"class": "claim"})
            if not claim_p:
                claim_div = parsed_claim_review_page.find(
                    'div', {"class": "claim"})
                if not claim_div:
                    claim_div = parsed_claim_review_page.find(
                        'div', {"class": "claim-old"})
                if not claim_div:
                    claim_text = ""
                else:
                    claim_text = claim_div.find("p").text

            else:
                claim_text = claim_p.text
        else:
            claim_text = claim_text.strip()

        tags = []
        for tag in parsed_claim_review_page.findAll(
                'meta', {"property": "article:tag"}):
            tags.append(tag["content"])

        if not date_str or not claim_text or not body_description or not rating:
            claim_text, body_description, date_str, rating = handle_legacy_page_structures(
                card_body, claim_text, body_description, date_str, rating)
        claim.set_date(date_str)
        claim.set_body(body_description)
        claim.set_tags(", ".join(tags))
        claim.set_refered_links(related_links)

        if author:
            claim.review_author = author.text.strip()

        if len(claim_text) > 3 and len(claim_text.split("\n")) < 5:
            claim.set_claim(claim_text)
        else:
            if header:
                h1 = header.find("h1")
                claim_text = h1.text
                if claim_text:
                    claim.set_claim(claim_text)
                else:
                    print("Claim text cannot be found!")
                    return []

            else:
                return []

        if rating:
            claim.set_alternate_name(rating.text)
        else:
            return []

        return [claim]
Ejemplo n.º 23
0
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        self.claim = self.extract_claim(parsed_claim_review_page)
        self.review = self.extract_review(parsed_claim_review_page)

        claim = Claim()
        claim.set_rating_value(
            self.extract_rating_value(parsed_claim_review_page))
        claim.set_alternate_name(
            FatabyyanoFactCheckingSiteExtractor.translate_rating_value(
                self.extract_rating_value(parsed_claim_review_page)))
        claim.set_source("fatabyyano")
        claim.set_author("fatabyyano")
        claim.set_date_published(self.extract_date(parsed_claim_review_page))
        claim.set_claim(self.claim)
        claim.set_body(self.review)
        claim.set_refered_links(self.extract_links(parsed_claim_review_page))
        claim.set_title(self.extract_claim(parsed_claim_review_page))
        claim.set_date(self.extract_date(parsed_claim_review_page))
        claim.set_url(url)
        claim.set_tags(self.extract_tags(parsed_claim_review_page))
        # extract_entities returns two variables
        json_claim, json_body = self.extract_entities(self.claim, self.review)
        claim.set_claim_entities(json_claim)
        claim.set_body_entities(json_body)

        return [claim]
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # print criteria.maxClaims
    # crawling the listing pages one by one and adding each article URL to urls_.
    now = datetime.datetime.now()
    urls_ = {}
    types = [
        "true", "mostly-true", "half-true", "barely-true", "false",
        "pants-fire", "no-flip", "half-flip", "full-flop"
    ]
    last_page = []
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        url = "https://www.channel4.com/news/factcheck/page/" + str(
            page_number)
        # url="http://www.politifact.com/truth-o-meter/rulings/"+str(type_)+"/?page="+str(page_number)
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()

            links = soup.findAll("li", {"class": "feature factcheck"})
            if (len(links) != 0) or (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', {"class": "permalink"},
                                         href=True)
                    ind_ = str(anchor['href'])
                    if (ind_ not in list(urls_.keys())):
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
                        if (ind_ not in criteria.avoid_url):
                            urls_[ind_] = ind_
                            print("adding " + str(ind_))
                last_page = links
            else:
                print("break!")
                break
        except:
            print("error=>" + str(url))

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.items():
        print(
            str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " +
            str(url))
        index += 1

        url_complete = str(url)

        # print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = Claim()
            claim_.set_url(url_complete)
            claim_.set_source("channel4")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            # if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("div", {
                "class": "factcheck-article-header"
            }).find("h1").get_text()
            claim_.set_title(title)

            # date

            date_ = soup.find('li', {"class": "pubDateTime"})
            # print date_["content"]
            if date_:
                date_str = search_dates(
                    date_['data-time'])[0][1].strftime("%Y-%m-%d")
                # print date_str
                claim_.set_date(date_str)
            # print claim_.date

            # body
            body = soup.find("div", {"class": "article-body article-main"})
            claim_.set_body(body.get_text())

            # related links
            divTag = soup.find("div", {"class": "article-body article-main"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.set_refered_links(related_links)

            claim_.set_claim(title)

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                # print "achou"
                tags.append(tag["content"])
            claim_.set_tags(", ".join(tags))

            # if (claim_.conclusion.replace(" ","")=="" or claim_.claim.replace(" ","")==""):
            # 	print claim_.conclusion
            # 	print claim_.claim
            # 	raise ValueError('No conclusion or claim')

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
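
The channel4 crawler relies on dateparser's search_dates to pull a date out of the data-time attribute. As a standalone illustration of that call, with an invented input string:

from dateparser.search import search_dates

# search_dates scans free text and returns a list of (matched text, datetime) pairs, or None
matches = search_dates("Published on 12 March 2019")
if matches:
    print(matches[0][1].strftime("%Y-%m-%d"))  # -> 2019-03-12
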
Ejemplo n.º 25
0
def new_claim(f_link, date, title, tags):
    claim_ = Claim()
    claim_.set_url(f_link)
    claim_.set_title(title)
    claim_.set_tags(tags)
    date_ = date.strip().split()
    date_ = "-".join([date_[4], date_[2], date_[0]])
    claim_.set_date(dateparser.parse(date_).strftime("%Y-%m-%d"))
    claim_.set_source("publica")
    claim_.set_body("")
    return claim_
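
new_claim reshuffles a whitespace-separated listing date, apparently of the form "DD de MONTH de YYYY" (inferred from the indices used; the exact wording is an assumption), into "YYYY-MONTH-DD" before handing it to dateparser. A small sketch of that reshuffle with an invented sample date:

import dateparser

date_tokens = "21 de agosto de 2019".strip().split()
reordered = "-".join([date_tokens[4], date_tokens[2], date_tokens[0]])  # "2019-agosto-21"
parsed = dateparser.parse(reordered)
if parsed:
    print(parsed.strftime("%Y-%m-%d"))  # expected: 2019-08-21
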
Ejemplo n.º 26
0
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        claims = []
        claim = Claim()

        # url
        claim.url = str(url)

        # source
        claim.source = "fullfact"

        # title
        title = None
        if parsed_claim_review_page.select(
                'body > main > div > div > section > article > h1'):
            for tmp in parsed_claim_review_page.select(
                    'body > main > div > div > section > article > h1'):
                title = tmp.text.strip()
            claim.title = str(title.strip())

        # author
        author_list = []
        author_links = []
        # single author?
        if parsed_claim_review_page.select(
                'article > section.social-media > div > div > ul > li > span > cite'
        ):
            for author_a in parsed_claim_review_page.select(
                    'article > section.social-media > div > div > ul > li > span > cite'
            ):
                if hasattr(author_a, 'text'):
                    author_list.append(author_a.text.strip())
                # if hasattr( author_a, 'href' ):
                #    author_list.append( author_a.text.strip() )
                #    author_links.append( author_a.attrs['href'] )
                else:
                    print("no author? https://fullfact.org/about/our-team/")

        claim.author = ", ".join(author_list)
        #claim.author_url = ( ", ".join( author_links ) )

        # date
        datePub = None
        dateUpd = None
        date_str = ""
        # updated?
        if parsed_claim_review_page.select('article > div.published-at'):
            for date_ in parsed_claim_review_page.select(
                    'article > div.published-at'):
                if hasattr(date_, 'text'):
                    datePub = date_.text.strip()
                    if "|" in datePub:
                        split_datePub = datePub.split("|")
                        if len(split_datePub) > 0:
                            datePub = split_datePub[0].strip()
                    date_str = dateparser.parse(datePub).strftime("%Y-%m-%d")
                    claim.date_published = date_str
                    claim.date = date_str
                else:
                    print("no date?")

        # Body description
        text = ""
        if parsed_claim_review_page.select('article > p'):
            for child in parsed_claim_review_page.select('article > p'):
                text += " " + child.text
            body_description = text.strip()
            claim.body = str(body_description).strip()

        # related links (in page body text <p>)
        related_links = []
        if parsed_claim_review_page.select('article > p > a'):
            for link in parsed_claim_review_page.select('article > p > a'):
                try:
                    if hasattr(link, 'href'):
                        if 'http' in link['href']:
                            related_links.append(link['href'])
                        else:
                            related_links.append("https://fullfact.org" +
                                                 link['href'])
                except KeyError as e:
                    print("->KeyError: " + str(e))
                    continue
                except IndexError as e:
                    print("->IndexError : " + str(e))
                    continue

        # related links (in Related fact checks)
        if parsed_claim_review_page.select(
                'section.related-factchecks > div > ul > li > a'):
            for link in parsed_claim_review_page.select(
                    'section.related-factchecks > div > ul > li > a'):
                try:
                    if hasattr(link, 'href'):
                        if 'http' in link['href']:
                            related_links.append(link['href'])
                        else:
                            related_links.append("https://fullfact.org" +
                                                 link['href'])
                except KeyError as e:
                    print("->KeyError: " + str(e))
                    continue
                except IndexError as e:
                    print("->IndexError: " + str(e))
                    continue

        if related_links:
            claim.referred_links = related_links

        # cannot be found on fullfact:
        # self.tags = ""
        # self.author_url = ""
        # self.date_published = ""
        # self.same_as = ""
        # self.rating_value = ""
        # self.worst_rating = ""
        # self.best_rating = ""
        # self.review_author = ""

        # claim # multiple (local) claims: 'article > div > div > div.row.no-gutters.card-body-text > div > div > p' ?
        claim_text_list = []
        claim_text = None
        # rating -> VERDICT: extract_conclusion -> true, false, ...
        claim_verdict_list = []
        claim_verdict = None

        column = "claim"  # or verdict:
        if parsed_claim_review_page.select(
                'body > main > div > div > section > article > div > div > div.row.no-gutters.card-body-text > div > div > p'
        ):
            for p in parsed_claim_review_page.select(
                    'body > main > div > div > section > article > div > div > div.row.no-gutters.card-body-text > div > div > p'
            ):
                if hasattr(p, 'text'):
                    if column == "claim":
                        claim_text_list.append(p.text.strip())
                        if claim_text is None:
                            claim_text = p.text.strip()
                        column = "verdict"
                    else:
                        rating_word_list = p.text
                        conclusion_text = self._conclusion_processor.extract_conclusion(
                            rating_word_list)
                        #print ("conclusion_text: " + conclusion_text)
                        rating = str(conclusion_text).replace('"', "").strip()
                        if "." in rating:
                            split_name = rating.split(".")
                            if len(split_name) > 0:
                                rating = split_name[0]
                        claim_verdict_list.append(rating)
                        if claim_verdict is None:
                            claim_verdict = rating

                        column = "claim"

            # First local claim and rating:
            claim.claim = claim_text
            claim.rating = claim_verdict

            # All claims and ratings "comma" separated: get all claims?
            # claim.claim = ", ".join( claim_text_list )
            # claim.rating = ", ".join( verdict_text_list )

            # Create one claim per extracted (claim, verdict) pair, copying the
            # shared metadata so the list entries do not all point to the same object
            import copy
            c = 0
            while c <= len(claim_text_list) - 1:
                local_claim = copy.copy(claim)
                local_claim.claim = claim_text_list[c]
                local_claim.rating = claim_verdict_list[c]
                claims.append(local_claim)
                c += 1

            # for local_claim in claim_text_list:
            #    claims[claim[len(claim)]] = claims[claim[len(claim)-1]]

        # No Rating? No Claim?
        if not claim.claim or not claim.rating:
            print(url)
            if not claim.rating:
                print("-> Rating cannot be found!")
            if not claim.claim:
                print("-> Claim cannot be found!")
            return []

        # return [claim]
        return claims
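
The fullfact loop above toggles a column flag to pair alternating claim and verdict paragraphs. An equivalent, slightly more explicit pairing, sketched with plain strings, zips the even and odd positions of the flat list:

# Sketch: pair alternating claim / verdict texts from a flat list
paragraphs = ["Claim one", "Verdict one", "Claim two", "Verdict two"]
for claim_text, verdict_text in zip(paragraphs[0::2], paragraphs[1::2]):
    print(claim_text, "->", verdict_text)
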
Ejemplo n.º 27
0
def get_all_claims(criteria):
    print(criteria.maxClaims)
    # crawling the listing pages one by one and adding each article URL to urls_.
    now = datetime.datetime.now()
    urls_ = {}
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        try:
            url = "https://correctiv.org/echtjetzt/artikel/seite/" + str(
                page_number) + "/"
            page = urllib.request.urlopen(url).read()
        except:
            break
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        links = soup.findAll('a', {"class": "entry-list-item__link"},
                             href=True)
        if len(links) != 0:
            for anchor in links:
                url_to_add = "https://correctiv.org" + str(anchor['href'])
                if (url_to_add not in list(urls_.keys())):
                    if (criteria.maxClaims > 0
                            and len(urls_) >= criteria.maxClaims):
                        break
                    urls_[url_to_add] = page_number
                    print("adding " + str(url_to_add))
        else:
            print("break!")
            break

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.items():
        print(
            str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " +
            str(url))
        index += 1

        url_complete = str(url)

        # print url_complete
        try:
            page = urllib.request.urlopen(url_complete).read().decode(
                'utf-8', 'ignore')
            soup = BeautifulSoup(page, "lxml")
            soup.prettify("utf-8")

            claim_ = Claim()
            claim_.set_url(url_complete)
            claim_.set_source("correctiv")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            # if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("h1", {"class": "article-header__headline"})
            claim_.set_title(
                title.text.replace("Faktencheck:", "").replace("\n", ""))

            date_ = soup.find('time',
                              {"class": "article-body__publishing-date"})
            # print date_["content"]
            if date_:
                date_str = search_dates(
                    date_['title'].split("T")[0])[0][1].strftime("%Y-%m-%d")
                # print date_str
                claim_.set_date(date_str)
            # print claim_.date

            # body
            body = soup.find("div", {"class": "article-body__main"})
            claim_.set_body(body.get_text())

            # related links
            divTag = soup.find("div", {"class": "article-body__main"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.set_refered_links(related_links)

            claim_.set_claim(claim_.title)
            conclusion = soup.find(
                'div', {"class": "article-body__claimreview claimreview"})
            if conclusion:
                claim_.setConclusion(
                    conclusion.text.replace("Unsere Bewertung: ",
                                            "").replace("\n", ""))

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                # print "achou"
                tags.append(tag["content"])
            claim_.set_tags(", ".join(tags))

            claims.append(claim_.generate_dictionary())
        except:
            print("Error ->" + str(url_complete))

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Ejemplo n.º 28
0
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                                 url: str) -> List[Claim]:
        claim = Claim()

        data = parsed_claim_review_page.find(string=re.compile("schema.org"))
        data = json.loads(str(data))

        node_zero = data['@graph'][0]

        if node_zero and 'claimReviewed' in node_zero.keys():
            claim_str = node_zero['claimReviewed']
            if claim_str and len(claim_str) > 0:
                claim.set_claim(claim_str)
            else:
                return []

        rating = data['@graph'][0]['reviewRating']
        if rating and 'alternateName' in rating.keys():
            claim.set_rating(rating['alternateName'])
            try:
                claim.set_best_rating(rating['bestRating'])
                claim.set_worst_rating(rating['worstRating'])
                claim.set_rating_value(rating['ratingValue'])
            except Exception:
                pass
        else:
            return []

        if 'author' in data['@graph'][0]['itemReviewed'].keys():
            author = data['@graph'][0]['itemReviewed']['author']
            if author and 'name' in author.keys():
                if len(str(author['name'])) > 0:
                    claim.set_author(author['name'])

        claim.set_url(url)
        claim.set_source("factual_afp")

        try:
            title = data['@graph'][0]['name']
            claim.set_title(title)
        except Exception:
            pass

        try:
            claim.set_date(data['@graph'][0]['itemReviewed']['datePublished'])
        except Exception:
            pass

        try:
            date = data['@graph'][0]['datePublished']
            claim.set_date_published(date.split(' ')[0])
        except Exception:
            pass

        body = parsed_claim_review_page.find(
            'div', {'class': 'article-entry clearfix'})
        claim.set_body(body.text)

        links = []
        children = parsed_claim_review_page.find(
            'div', {
                'class': 'article-entry clearfix'
            }).children
        for child in children:
            try:
                if child.name == 'aside':
                    continue
                elems = child.findAll('a')
                for elem in elems:
                    links.append(elem['href'])
            except Exception as e:
                continue
        claim.set_refered_links(links)

        return [claim]
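
The factual_afp extractor reads the page's embedded schema.org ClaimReview data. A self-contained illustration of the fields it touches, using an invented JSON snippet:

import json

data = json.loads("""
{"@graph": [{
    "name": "Example fact-check title",
    "claimReviewed": "An example claim",
    "datePublished": "2021-03-01 10:00:00",
    "reviewRating": {"alternateName": "False", "bestRating": "5",
                     "worstRating": "1", "ratingValue": "1"},
    "itemReviewed": {"author": {"name": "Somebody"},
                     "datePublished": "2021-02-28"}
}]}
""")
node_zero = data["@graph"][0]
print(node_zero["claimReviewed"], "->", node_zero["reviewRating"]["alternateName"])
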
    def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
        claims = []

        ol = parsed_claim_review_page.find('ol', {'class': 'breadcrumb col-xs-12'})
        elems = ol.findAll('a')
        keywords = []
        for elem in elems:
            keywords.append(elem.text)


        #extraction of brief claims
        d = parsed_claim_review_page.findAll('div', {"id":"briefClaimConclusion"})
        if len(d) != 0 :
            d = d[0].find('div', {"class":"box-panel"})
            divs = d.children
            for div in divs:
                if(type(div) == type(d) and div.name == 'div'):
                    try:
                        claim = Claim()
                        claim.set_url(url)
                        claim.set_source("fullfact")
                        claim_str = div.find('div', {"class": "col-xs-12 col-sm-6 col-left"}).find('p').text
                        conclusion = div.find('div', {"class": "col-xs-12 col-sm-6 col-right"}).find('p').text

                        claim.set_claim(claim_str)
                        claim.set_alternate_name(conclusion)
                        claim.set_tags(','.join(keywords))
                        claims.append(claim)
                    except Exception as e:
                        continue

        #Extraction of quotes
        quotes = parsed_claim_review_page.findAll('blockquote')

        if(len(claims) == 0 or len(quotes) == 0):
            return claims

        for quote in quotes:            
            claim = Claim()
            claim.set_url(url)
            claim.set_source("fullfact")
            try:
                p = quote.findAll('p')

                if(len(p) == 1):  # if there is only one paragraph there is no author or date
                    claim.set_claim(p[0].text)
                    claim.set_tags(','.join(keywords))
                    claims.append(claim)
                    continue

                claim_str = ''
                for x in p[:-1]:  # sometimes the quote is made of 2 paragraphs or more
                    claim_str += ' ' + x.text

                if(len(claim_str) < 4): # if it's too small it is not a claim
                    continue

                p = p[-1]  # the last paragraph always mentions the author and the date
                author = p.text.split(',')[:-1]  # and there is always a comma separating the two
                date = p.text.split(',')[-1]

                while not claim_str[0].isalnum():
                    claim_str = claim_str[1:]
                
                while not claim_str[-1].isalnum():
                    claim_str = claim_str[:-1]

                claim.set_claim(claim_str)
                claim.set_author(''.join(author))
            except Exception as e:
                continue

            try: 
                a = p.find('a')  # usually the date is mentioned alongside the link where the claim was made
                d = datetime.strptime(a.text, '%d %B %Y').strftime("%Y-%m-%d")
                claim.set_refered_links(a['href'])
                claim.set_date(d)
            except Exception as e:
                try:
                    d = datetime.strptime(date[1:-1], ' %d %B %Y').strftime("%Y-%m-%d")
                    claim.set_date(d)
                except Exception as e:
                    pass
            claim.set_tags(','.join(keywords))
            claims.append(claim)

        return claims
Ejemplo n.º 30
0
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    # crawling the listing pages one by one and adding each article URL to urls_.
    urls_ = {}
    last_page = []
    #print("fafafafafafa")
    for page_number in range(1, 500):
        if 0 < criteria.maxClaims <= len(urls_):
            break

        url = "https://africacheck.org/latest-reports/page/" + str(
            page_number) + "/"
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll("div", {"class": "article-content"})

            if (len(links) != 0) or (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', href=True)
                    ind_ = str(anchor['href'])
                    if ind_ not in list(urls_.keys()):
                        if 0 < criteria.maxClaims <= len(urls_):
                            break
                        if ind_ not in criteria.avoid_url:
                            urls_[ind_] = ind_
                            print("adding " + str(ind_))
                last_page = links
            else:
                print("break!")
                break
        except:
            print("error=>" + str(url))

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.items():
        print(
            str(index) + "/" + str(len(list(urls_.keys()))) + " extracting " +
            str(url))
        index += 1

        url_complete = str(url)

        # print url_complete
        # try:
        page = requests.get(url_complete, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify("utf-8")

        claim_ = Claim()
        claim_.set_url(url_complete)
        claim_.set_source("africacheck")

        # title
        title = soup.find("meta", {"property": "og:title"})
        title_content = title['content']
        if "|" in title_content:
            title_content = title_content.split("|")[-1]
        claim_.set_title(title_content)

        # date

        date_ = soup.find('time')
        # print date_["content"]
        if date_:
            date_str = search_dates(
                date_['datetime'].split(" ")[0])[0][1].strftime("%Y-%m-%d")
            # print date_str
            claim_.set_date(date_str)
        # print claim_.date

        # rating

        truth_rating = ""
        if soup.find("div", {"class": "verdict-stamp"}):
            truth_rating = soup.find("div", {
                "class": "verdict-stamp"
            }).get_text()
        if soup.find("div", {"class": "verdict"}):
            truth_rating = soup.find("div", {"class": "verdict"}).get_text()
        if soup.find("div", {"class": "indicator"}):
            truth_rating = soup.find("div", {"class": "indicator"}).get_text()
            if soup.find("div", {"class": "indicator"}).find('span'):
                truth_rating = soup.find("div", {
                    "class": "indicator"
                }).find('span').get_text()

        claim_.set_rating(
            str(re.sub('[^A-Za-z0-9 -]+', '', truth_rating)).lower().strip())

        # when there is no JSON metadata, fall back to the visible <time> element for the date

        date_ = soup.find("time", {"class": "datetime"})
        if date_:
            claim_.set_date(date_.get_text())

        # body
        body = soup.find("div", {"id": "main"})
        claim_.set_body(body.get_text())

        # author
        author = soup.find("div", {"class": "sharethefacts-speaker-name"})
        if author:
            claim_.set_author(author.get_text())

        # related links
        divTag = soup.find("div", {"id": "main"})
        related_links = []
        for link in divTag.findAll('a', href=True):
            related_links.append(link['href'])
        claim_.set_refered_links(related_links)

        if soup.find("div", {"class": "report-claim"}):
            claim_.set_claim(
                soup.find("div", {
                    "class": "report-claim"
                }).find("strong").get_text())
        else:
            claim_.set_claim(claim_.title)

        tags = []

        for tag in soup.findAll('meta', {"property": "article:tag"}):
            tags.append(tag["content"])
        claim_.set_tags(", ".join(tags))

        claims.append(claim_.generate_dictionary())

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
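
Each get_all_claims variant finishes by turning its list of claim dictionaries into a pandas DataFrame. A tiny standalone illustration of that last step, with invented dictionaries:

import pandas as pd

claims = [
    {"url": "https://example.org/a", "source": "africacheck", "rating": "false"},
    {"url": "https://example.org/b", "source": "africacheck", "rating": "true"},
]
pdf = pd.DataFrame(claims)
print(pdf.shape)  # -> (2, 3)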