def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                             url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("checkyourfact")

    # title
    title = parsed_claim_review_page.find('article').find("h1")
    claim.set_title(title.text.replace("FACT CHECK: ", ""))

    # date, taken from the URL (https://checkyourfact.com/<year>/<month>/<day>/...)
    url_date = url.replace("https://checkyourfact.com/",
                           "").replace("/", " ").split(" ")
    claim.set_date(url_date[0] + "-" + url_date[1] + "-" + url_date[2])

    # author & author_url
    for author in parsed_claim_review_page.select('detail > article > author'):
        if author.has_attr("data-slug"):  # has_attr, not hasattr: we test an HTML attribute
            author_str = author.text.split("|")[0].strip().split("\n")[0]
            claim.author = author_str
            claim.author_url = "https://checkyourfact.com/author/" + author['data-slug']
            break

    # body
    body = parsed_claim_review_page.find("article")
    claim.set_body(body.get_text())

    # related links
    div_tag = parsed_claim_review_page.find("article")
    related_links = []
    for link in div_tag.findAll('a', href=True):
        related_links.append(link['href'])
    claim.set_refered_links(related_links)

    # the claim text is the title itself
    claim.set_claim(claim.title)

    # rating
    rating = find_by_text(parsed_claim_review_page, "Verdict", "span")
    if rating:
        rating_text = rating[0].text.split(":")[-1].strip()
        claim.set_rating(rating_text)

    # tags
    tags = []
    for tag in parsed_claim_review_page.findAll('meta',
                                                {"property": "article:tag"}):
        tags.append(tag["content"])
    claim.set_tags(", ".join(tags))

    if len(claim.rating) == 0:
        return []
    return [claim]
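
# `find_by_text` is not defined in this module. A minimal sketch of the
# behaviour the checkyourfact scraper appears to rely on (assumption: it
# returns every <span> whose visible text contains "Verdict"):
from typing import List
from bs4 import BeautifulSoup, Tag

def find_by_text(soup: BeautifulSoup, text: str, tag: str) -> List[Tag]:
    # Collect all <tag> elements whose text contains `text`.
    return [element for element in soup.find_all(tag)
            if text in element.get_text()]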
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                             url: str) -> List[Claim]:
    if url in url_blacklist:
        return []
    claim = Claim()

    # url
    claim.url = str(url)

    # source
    claim.source = "snopes"

    # title
    title = None
    for tmp in parsed_claim_review_page.select('article > header > h1'):
        title = tmp.text.strip()
    # sub_title = parsed_claim_review_page.select('article > header > h2')
    claim.title = str(title.strip())

    # author
    author_list = []
    author_links = []
    author_selector = ('article > header > ul.list-unstyled.authors'
                       '.list-unstyled.d-flex.flex-wrap.comma-separated > li > a')
    if parsed_claim_review_page.select(author_selector):
        for author_a in parsed_claim_review_page.select(author_selector):
            if author_a.has_attr('href'):  # has_attr, not hasattr: we test an HTML attribute
                author_list.append(author_a.text.strip())
                author_links.append(author_a.attrs['href'])
    else:
        print("no author?")
    claim.author = ", ".join(author_list)
    claim.author_url = ", ".join(author_links)

    # review_author ?
    # -

    # date
    datePub = None
    dateUpd = None
    date_str = ""
    date_ = parsed_claim_review_page.find('ul', {"class": "dates"})
    if date_:
        dates = date_.find('li', {"class": "font-weight-bold text-muted"})
        dateSpans = dates.span
        for dateItems in dateSpans:
            if dateItems == 'Published':
                datePub = dateItems.next.strip()
                date_str = dateparser.parse(datePub).strftime("%Y-%m-%d")
                claim.date_published = date_str
                claim.date = date_str
            if dateItems == 'Updated':
                dateUpd = dateItems.next.strip()
                date_str = dateparser.parse(dateUpd).strftime("%Y-%m-%d")
                claim.date = date_str

    # claim image?
    # -

    # claim
    claim_text = None
    for p in parsed_claim_review_page.select(
            'article > div > div.claim-text.card-body'):
        claim_text = p.text.strip()
    claim.claim = str(claim_text).strip()

    # rating -> https://www.snopes.com/fact-check-ratings/
    rating = None
    for rating_span in parsed_claim_review_page.select(
            'article > div > div > div > div.media-body > span'):
        rating = rating_span.text.strip()
    claim.rating = str(rating).replace('"', "").strip()
    # claim.set_rating_value(rating)

    # rating best ("What's True"); text, not a numerical value
    whats_true = None
    for rating_span_true in parsed_claim_review_page.select(
            'article > div > div > div.whats-true > div > p'):
        whats_true = rating_span_true.text.strip()
    if whats_true:
        whats_true = str(whats_true).replace('"', "")
        # claim.best_rating = whats_true

    # rating worst ("What's False"); text, not a numerical value
    whats_false = None
    for rating_span_false in parsed_claim_review_page.select(
            'article > div > div > div.whats-false > div > p'):
        whats_false = rating_span_false.text.strip()
    if whats_false:
        whats_false = str(whats_false).replace('"', "")
        # claim.worst_rating = whats_false

    # rating undetermined ("What's Undetermined"); text, not a numerical value
    whats_undetermined = None
    for rating_span_undetermined in parsed_claim_review_page.select(
            'article > div > div > div.whats-undetermined > div > p'):
        whats_undetermined = rating_span_undetermined.text.strip()
    if whats_undetermined:
        whats_undetermined = str(whats_undetermined).replace('"', "")
        # claim.whats_undetermined = whats_undetermined

    # rating value ?
    # -

    # body description
    text = ""
    for child in parsed_claim_review_page.select(
            'article > div.single-body.card.card-body.rich-text > p'):
        text += " " + child.text
    body_description = text.strip()
    claim.body = str(body_description).strip()

    # related links
    related_links = []
    for link in parsed_claim_review_page.select(
            'article > div.single-body.card.card-body > p > a'):
        if link.has_attr('href'):
            related_links.append(link['href'])
    claim.referred_links = related_links

    # tags
    tags = []
    for tag in parsed_claim_review_page.select(
            'article > footer > div > a > div > div'):
        tags.append(tag.text.strip())
    claim.tags = ", ".join(tags)

    # same as ?
    # -

    # no rating or no claim -> discard the page
    if not claim_text or not rating:
        print(url)
        if not rating:
            print("-> Rating cannot be found!")
        if not claim_text:
            print("-> Claim cannot be found!")
        return []
    return [claim]
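
# `url_blacklist` is referenced above but not defined in this module. The
# assumption is a module-level collection of Snopes URLs whose markup the
# selectors cannot handle, e.g.:
url_blacklist = [
    # hypothetical entry; the real list lives elsewhere in the project
    "https://www.snopes.com/fact-check/some-page-with-broken-markup/",
]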
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                             url: str) -> List[Claim]:
    claims = []
    claim = Claim()

    # url
    claim.url = str(url)

    # source
    claim.source = "fullfact"

    # title
    title = None
    for tmp in parsed_claim_review_page.select(
            'body > main > div > div > section > article > h1'):
        title = tmp.text.strip()
    claim.title = str(title.strip())

    # author (single author?)
    author_list = []
    author_selector = ('article > section.social-media > div > div > ul '
                       '> li > span > cite')
    if parsed_claim_review_page.select(author_selector):
        for author_a in parsed_claim_review_page.select(author_selector):
            author_list.append(author_a.text.strip())
    else:
        print("no author? https://fullfact.org/about/our-team/")
    claim.author = ", ".join(author_list)

    # date published (an "Updated" part may follow a "|" separator)
    datePub = None
    date_str = ""
    if parsed_claim_review_page.select('article > div.published-at'):
        for date_ in parsed_claim_review_page.select(
                'article > div.published-at'):
            datePub = date_.text.strip()
            if "|" in datePub:
                datePub = datePub.split("|")[0].strip()
            date_str = dateparser.parse(datePub).strftime("%Y-%m-%d")
            claim.date_published = date_str
            claim.date = date_str
    else:
        print("no date?")

    # body description
    text = ""
    for child in parsed_claim_review_page.select('article > p'):
        text += " " + child.text
    body_description = text.strip()
    claim.body = str(body_description).strip()

    # related links (in page body text <p>)
    related_links = []
    for link in parsed_claim_review_page.select('article > p > a'):
        try:
            if link.has_attr('href'):
                if 'http' in link['href']:
                    related_links.append(link['href'])
                else:
                    related_links.append("https://fullfact.org" + link['href'])
        except KeyError as e:
            print("-> KeyError: " + str(e))
        except IndexError as e:
            print("-> IndexError: " + str(e))

    # related links (in "Related fact checks")
    for link in parsed_claim_review_page.select(
            'section.related-factchecks > div > ul > li > a'):
        try:
            if link.has_attr('href'):
                if 'http' in link['href']:
                    related_links.append(link['href'])
                else:
                    related_links.append("https://fullfact.org" + link['href'])
        except KeyError as e:
            print("-> KeyError: " + str(e))
        except IndexError as e:
            print("-> IndexError: " + str(e))
    if related_links:
        claim.referred_links = related_links

    # cannot be found on fullfact: tags, author_url, date_published, same_as,
    # rating_value, worst_rating, best_rating, review_author

    # claim and rating: the card body alternates claim paragraphs and verdict
    # ("conclusion") paragraphs, so `column` flips between the two on each <p>
    claim_text_list = []
    claim_text = None
    claim_verdict_list = []
    claim_verdict = None
    column = "claim"  # or "verdict"
    claim_selector = ('body > main > div > div > section > article > div > div '
                      '> div.row.no-gutters.card-body-text > div > div > p')
    for p in parsed_claim_review_page.select(claim_selector):
        if column == "claim":
            claim_text_list.append(p.text.strip())
            if claim_text is None:
                claim_text = p.text.strip()
            column = "verdict"
        else:
            rating_word_list = p.text
            conclusion_text = self._conclusion_processor.extract_conclusion(
                rating_word_list)
            rating = str(conclusion_text).replace('"', "").strip()
            if "." in rating:
                rating = rating.split(".")[0]
            claim_verdict_list.append(rating)
            if claim_verdict is None:
                claim_verdict = rating
            column = "claim"

    # first local claim and rating
    claim.claim = claim_text
    claim.rating = claim_verdict
    # alternative: all claims and ratings comma-separated
    # claim.claim = ", ".join(claim_text_list)
    # claim.rating = ", ".join(claim_verdict_list)

    # one Claim per local claim/verdict pair; each needs its own copy,
    # otherwise every list entry would alias the same object and end up
    # with the last claim text and verdict (requires "import copy")
    for c, local_claim_text in enumerate(claim_text_list):
        local_claim = copy.deepcopy(claim)
        local_claim.claim = local_claim_text
        if c < len(claim_verdict_list):
            local_claim.rating = claim_verdict_list[c]
        claims.append(local_claim)

    # no rating or no claim -> discard the page
    if not claim.claim or not claim.rating:
        print(url)
        if not claim.rating:
            print("-> Rating cannot be found!")
        if not claim.claim:
            print("-> Claim cannot be found!")
        return []
    return claims
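
# `self._conclusion_processor.extract_conclusion` is defined elsewhere in the
# project. A minimal sketch of the assumed behaviour (reduce Full Fact's
# free-text conclusion paragraph to a short verdict); the verdict vocabulary
# below is illustrative, not Full Fact's actual list:
class ConclusionProcessor:
    KNOWN_VERDICTS = ("Correct", "Incorrect", "True", "False",
                      "Mostly true", "Unsubstantiated")

    def extract_conclusion(self, text: str) -> str:
        text = text.strip()
        for verdict in self.KNOWN_VERDICTS:
            if text.lower().startswith(verdict.lower()):
                return verdict
        # fall back to the first sentence when no known label matches
        return text.split(".")[0]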
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                             url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("newtral")

    # title, claim and claim author
    title = parsed_claim_review_page.find(
        "meta", attrs={'property': 'og:title'})['content']
    title = title.strip().split("|")[0]
    claim.set_title(title)

    entry_content = parsed_claim_review_page.find(
        "div", attrs={'class': 'entry-content'})

    # a title like 'Author: «quoted claim»' (or with curly quotes) carries
    # both the claim author and the claim text
    dospunto = re.search(r'(: «)', title)
    dospunt = re.search(r'(: “)', title)
    if dospunto:
        claim_a = title.split(":")
        auteur = claim_a[0].strip()
        claim.author = auteur
        claim_text = claim_a[1].strip("« »")
        claim.claim = claim_text
    elif dospunt:
        claim_b = title.split(":")
        auteur = claim_b[0].strip()
        claim.author = auteur
        claim_text = claim_b[1].strip(": “ ”")
        claim.claim = claim_text

    # multiple titles or claims (one <h2> each); the original searched the
    # list itself, which raises a TypeError, so the texts are joined first
    claim_mult = entry_content.findAll('h2')
    if claim_mult:
        claim_al = " ".join(i.text.strip() for i in claim_mult)
        dospunt = re.search(r'(: “)', claim_al)
        dospunto = re.search(r'(: «)', claim_al)
        if dospunt:
            claim_b = claim_al.split(":")
            auteur = claim_b[0].strip()
            claim.author = auteur
            claim_text = claim_b[1].strip(": “ ”")
            claim.claim = claim_text
        elif dospunto:
            claim_a = claim_al.split(":")
            auteur = claim_a[0].strip()
            claim.author = auteur
            claim_text = claim_a[1].strip("« »")
            claim.claim = claim_text
        else:
            claim.set_title(claim_al)

    # tags
    tags = parsed_claim_review_page.find_all(
        "meta", attrs={'property': 'article:tag'})
    tag_list = []
    for tag in tags:
        tag_list.append(tag['content'])
    claim.set_tags(",".join(tag_list))

    # date published
    published = parsed_claim_review_page.find(
        "meta", attrs={'property': 'article:published_time'})['content']
    claim.date_published = published.strip()

    # article author
    author_span = parsed_claim_review_page.find(
        "span", attrs={'class': 'c-article__author'})
    author_a = author_span.find("a")
    claim.author_url = author_a['href']
    claim.review_author = re.sub('Por', '', author_a.text).strip()

    # article body text
    body_t = entry_content.find_all('p')
    body = [text.text.strip() for text in body_t]
    claim.body = " ".join(body) + "\n"

    # links found in the article text
    links = [link['href'] for link in entry_content.find_all('a', href=True)]
    claim.referred_links = links

    # verdict: look for one of the known Spanish verdict words, either in the
    # intro box or, failing that, in the whole body
    intro = parsed_claim_review_page.find(
        "div", attrs={'class': 'c-article__intro'})
    veracities = ["ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS",
                  "VERDADERO", "VERDAD A MEDIAS"]

    def common(a, b):
        return [value for value in a if value in b]

    if intro:
        intro_p = " ".join(str(v) for v in intro)
        rating_text_list = intro_p.upper()
    else:
        body_a = " ".join(str(v) for v in body)
        rating_text_list = body_a.upper()
    rating_text = [i.strip() for i in common(veracities, rating_text_list)]
    claim.alternate_name = rating_text

    return [claim]
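
# Worked example of the title split above, on a hypothetical Newtral title;
# «…» goes through the `dospunto` branch, curly “…” through `dospunt`:
title = 'Sánchez: «El paro ha bajado»'
author, quoted = title.split(":", 1)
print(author.strip())               # Sánchez
print(quoted.strip().strip("« »"))  # El paro ha bajado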
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                             url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("newtral")

    # title
    title = parsed_claim_review_page.find(
        "meta", attrs={'property': 'og:title'})['content']
    title = title.strip().split("|")[0]
    claim.set_title(title)

    # a title like 'Author: «quoted claim»' (or with curly quotes) carries
    # both the claim author and the claim text
    dospunto = re.search(r'(: «)', title)
    dospunt = re.search(r'(: “)', title)
    if dospunto:
        claim_a = title.split(":")
        auteur = claim_a[0].strip()
        claim.author = auteur
        claim_text = claim_a[1].strip("« »")
        claim.claim = claim_text
    elif dospunt:
        claim_b = title.split(":")
        auteur = claim_b[0].strip()
        claim.author = auteur
        claim_text = claim_b[1].strip(": “ ”")
        claim.claim = claim_text

    # tags
    tags = parsed_claim_review_page.find_all(
        "meta", attrs={'property': 'article:tag'})
    tag_list = []
    for tag in tags:
        tag_list.append(tag['content'])
    claim.set_tags(",".join(tag_list))

    # date published
    published = parsed_claim_review_page.find(
        "meta", attrs={'property': 'article:published_time'})['content']
    claim.date_published = published.strip()

    # rating: usually in the intro box; otherwise fall back to the first
    # <em>, <p> or <div> of the article body
    entry_content = parsed_claim_review_page.find(
        "div", attrs={'class': 'entry-content'})
    intro = parsed_claim_review_page.find(
        "div", attrs={'class': 'c-article__intro'})
    if intro is None:
        intro_rating_p = entry_content.find("em")
        if intro_rating_p is None:
            intro_rating_p = entry_content.find("p")
        if intro_rating_p is None:
            intro_rating_p = entry_content.find("div")
    else:
        intro_rating_p = intro.p

    rating_in_image = False
    if intro_rating_p is None:
        rating_in_image = True  # rating only available as an image
        rating_text = ""
    else:
        rating_text = intro_rating_p.get_text()

    rating_re_es_falso = regex.compile(
        r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
        r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")
    es_falso_match = rating_re_es_falso.match(rating_text)
    if es_falso_match is not None and es_falso_match.group(2) is not None:
        rating_text = es_falso_match.group(2)
    elif not rating_in_image:
        # no uppercase verdict in the running text: try bold, then strong
        is_there_b = intro_rating_p.find('b')
        if is_there_b is not None:
            rating_text = is_there_b.text
        else:
            is_there_strong = intro_rating_p.find("strong")
            if is_there_strong is not None:
                rating_text = is_there_strong.text
    claim.rating = rating_text

    # article author
    author_span = parsed_claim_review_page.find(
        "span", attrs={'class': 'c-article__author'})
    author_a = author_span.find("a")
    claim.author_url = author_a['href']
    claim.review_author = re.sub('Por', '', author_a.text).strip()

    # article body text
    body_t = entry_content.find_all('p')
    body = [text.text.strip() for text in body_t]
    claim.body = " ".join(body) + "\n"

    # links found in the article text
    links = [link['href'] for link in entry_content.find_all('a', href=True)]
    claim.referred_links = links

    return [claim]
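
# Standalone demonstration of the verdict regex above, on hypothetical intro
# sentences; \p{Lu} (any uppercase letter, accents included) captures the
# verdict word that Newtral writes in capitals:
import regex

rating_re_es_falso = regex.compile(
    r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
    r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")

for intro in ("La afirmación es FALSA. El contexto...",
              "ENGAÑOSO. Los datos..."):
    m = rating_re_es_falso.match(intro)
    if m and m.group(2):
        print(m.group(2).strip())  # -> FALSA / ENGAÑOSO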
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup,
                             url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("africacheck")

    # title
    title = parsed_claim_review_page.find("meta", {"property": "og:title"})
    global_title_text = title['content']
    claim.set_title(global_title_text)

    # date
    date = parsed_claim_review_page.find("span", {"class": "published"}).next
    global_date_str = ""
    if date:
        # global_date_str = search_dates(date['datetime'].split(" ")[0])[0][1].strftime("%Y-%m-%d")
        global_date_str = search_dates(date)[0][1].strftime("%Y-%m-%d")
        claim.set_date(global_date_str)

    # author
    author = parsed_claim_review_page.find("div", {"class": "author-details"})
    if author:
        claim.set_author(author.get_text())
    for child in parsed_claim_review_page.select('div.author-details > a > h4'):
        claim.author = child.get_text()
    for child in parsed_claim_review_page.select('div.author-details > a'):
        try:
            claim.author_url = child['href']
        except KeyError:
            print("KeyError: Skip")

    # tags
    tags = []
    for tag in parsed_claim_review_page.findAll('meta',
                                                {"property": "article:tag"}):
        tags.append(tag["content"])
    claim.set_tags(", ".join(tags))

    # claim
    verdict_box = parsed_claim_review_page.find(
        "div", {"class": "article-details__verdict"})
    if verdict_box and len(verdict_box) > 0 and "Verdict" in verdict_box.text:
        report_claim_div = parsed_claim_review_page.find(
            "div", {"class": "field--name-field-claims"})
        if report_claim_div:
            claim.set_claim(report_claim_div.get_text())
        else:
            claim.set_claim(claim.title)

        # rating: the second CSS class of the rating box encodes the verdict,
        # e.g. "rating--mostly-correct"
        inline_ratings = parsed_claim_review_page.findAll("div",
                                                          {"class": "rating"})
        if inline_ratings and inline_ratings[0].has_attr('class'):
            if len(inline_ratings[0].attrs['class']) > 1:
                rating_tmp = inline_ratings[0].attrs['class'][1]
                claim.rating = rating_tmp.replace('rating--', '').replace(
                    "-", "").capitalize()
    else:
        # alternative rating (if there is no verdict box)
        global_truth_rating = ""
        if parsed_claim_review_page.find("div", {"class": "verdict-stamp"}):
            global_truth_rating = parsed_claim_review_page.find(
                "div", {"class": "verdict-stamp"}).get_text()
        if parsed_claim_review_page.find("div", {"class": "verdict"}):
            global_truth_rating = parsed_claim_review_page.find(
                "div", {"class": "verdict"}).get_text()
        if parsed_claim_review_page.find("div", {"class": "indicator"}):
            global_truth_rating = parsed_claim_review_page.find(
                "div", {"class": "indicator"}).get_text()
            if parsed_claim_review_page.find(
                    "div", {"class": "indicator"}).find('span'):
                global_truth_rating = parsed_claim_review_page.find(
                    "div", {"class": "indicator"}).find('span').get_text()

        # if there is still no rating value, try to extract one from the hero
        # picture file name (e.g. ".../verdict_false.png")
        if global_truth_rating == "":
            filename = ""
            for child in parsed_claim_review_page.select(
                    'div.hero__image > picture'):
                try:
                    filename = child.contents[1].attrs['srcset']
                except (KeyError, IndexError):
                    print("KeyError/IndexError: Skip")
            if filename != "":
                filename_split = filename.split("/")
                filename_split = filename_split[-1].split(".png")
                filename_split = filename_split[0].split("_")
                if len(filename_split) == 1:
                    global_truth_rating = filename_split[0]
                else:
                    global_truth_rating = filename_split[-1]

        claim.set_rating(
            str(re.sub('[^A-Za-z0-9 -]+', '', global_truth_rating)).lower()
            .strip().replace("pfalse", "false").replace("-", "").capitalize())

    if not self.rating_value_is_valid(claim.rating):
        print("\nURL: " + url)
        print("\n Rating: " + claim.rating)
        claim.rating = ""

    # body
    body = parsed_claim_review_page.find("div", {"class": "article--main"})
    claim.set_body(body.get_text())

    # related links
    related_links = []
    for link in body.findAll('a', href=True):
        related_links.append(link['href'])
    claim.set_refered_links(related_links)

    if claim.rating:
        return [claim]
    return []