def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    if url in url_blacklist:
        return []
    claim = Claim()

    # url
    claim.url = str(url)

    # source
    claim.source = "snopes"

    # title
    title = None
    if parsed_claim_review_page.select('article > header > h1'):
        for tmp in parsed_claim_review_page.select('article > header > h1'):
            title = tmp.text.strip()
    # sub_title = parsed_claim_review_page.select('article > header > h2')
    claim.title = str(title.strip())

    # author
    author_list = []
    author_links = []
    if parsed_claim_review_page.select(
            'article > header > ul.list-unstyled.authors.list-unstyled.d-flex.flex-wrap.comma-separated > li > a'):
        for author_a in parsed_claim_review_page.select(
                'article > header > ul.list-unstyled.authors.list-unstyled.d-flex.flex-wrap.comma-separated > li > a'):
            if hasattr(author_a, 'href'):
                author_list.append(author_a.text.strip())
                author_links.append(author_a.attrs['href'])
    else:
        print("no author?")
    claim.author = ", ".join(author_list)
    claim.author_url = ", ".join(author_links)

    # review_author ?
    # -

    # date
    datePub = None
    dateUpd = None
    date_str = ""
    date_ = parsed_claim_review_page.find('ul', {"class": "dates"})
    if date_:
        dates = date_.find('li', {"class": "font-weight-bold text-muted"})
        dateSpans = dates.span
        for dateItems in dateSpans:
            if dateItems == 'Published':
                datePub = dateItems.next.strip()
                date_str = dateparser.parse(datePub).strftime("%Y-%m-%d")
                claim.date_published = date_str
                claim.date = date_str
            if dateItems == 'Updated':
                dateUpd = dateItems.next.strip()
                date_str = dateparser.parse(dateUpd).strftime("%Y-%m-%d")
                claim.date = date_str

    # claim image?
    # -

    # claim
    claim_text = None
    if parsed_claim_review_page.select('article > div > div.claim-text.card-body'):
        for p in parsed_claim_review_page.select('article > div > div.claim-text.card-body'):
            if hasattr(p, 'text'):
                claim_text = p.text.strip()
    claim.claim = str(claim_text).strip()

    # rating -> https://www.snopes.com/fact-check-ratings/
    rating = None
    if parsed_claim_review_page.select('article > div > div > div > div.media-body > span'):
        for rating_span in parsed_claim_review_page.select('article > div > div > div > div.media-body > span'):
            rating = rating_span.text.strip()
    claim.rating = str(rating).replace('"', "").strip()
    # claim.set_rating_value(rating)

    # rating best ("What's True")
    whats_true = None
    if parsed_claim_review_page.select('article > div > div > div.whats-true > div > p'):
        for rating_span_true in parsed_claim_review_page.select('article > div > div > div.whats-true > div > p'):
            whats_true = rating_span_true.text.strip()
    if whats_true:
        whats_true = str(whats_true).replace('"', "")  # text, not a numerical value
        # claim.best_rating = whats_true

    # rating worst ("What's False")
    whats_false = None
    if parsed_claim_review_page.select('article > div > div > div.whats-false > div > p'):
        for rating_span_false in parsed_claim_review_page.select('article > div > div > div.whats-false > div > p'):
            whats_false = rating_span_false.text.strip()
    if whats_false:
        whats_false = str(whats_false).replace('"', "")  # text, not a numerical value
        # claim.worst_rating = whats_false

    # rating undetermined ("What's Undetermined")
    whats_undetermined = None
    if parsed_claim_review_page.select('article > div > div > div.whats-undetermined > div > p'):
        for rating_span_undetermined in parsed_claim_review_page.select(
                'article > div > div > div.whats-undetermined > div > p'):
            whats_undetermined = rating_span_undetermined.text.strip()
    if whats_undetermined:
        whats_undetermined = str(whats_undetermined).replace('"', "")  # text, not a numerical value
        # claim.whats_undetermined = whats_undetermined

    # rating value ?
    # -

    # body description
    text = ""
    if parsed_claim_review_page.select('article > div.single-body.card.card-body.rich-text > p'):
        for child in parsed_claim_review_page.select('article > div.single-body.card.card-body.rich-text > p'):
            text += " " + child.text
    body_description = text.strip()
    claim.body = str(body_description).strip()

    # related links
    related_links = []
    if parsed_claim_review_page.select('article > div.single-body.card.card-body > p > a'):
        for link in parsed_claim_review_page.select('article > div.single-body.card.card-body > p > a'):
            if hasattr(link, 'href'):
                related_links.append(link['href'])
    claim.referred_links = related_links

    # tags
    tags = []
    if parsed_claim_review_page.select('article > footer > div > a > div > div'):
        for tag in parsed_claim_review_page.select('article > footer > div > a > div > div'):
            if hasattr(tag, 'text'):
                tags.append(tag.text.strip())
    claim.tags = ", ".join(tags)

    # same as ?
    # -

    # no rating or no claim found -> skip this page
    if not claim_text or not rating:
        print(url)
        if not rating:
            print("-> Rating cannot be found!")
        if not claim_text:
            print("-> Claim cannot be found!")
        return []

    return [claim]
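# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): a minimal
# harness for driving an extractor such as the Snopes one above. The wrapper
# name `fetch_and_extract` is hypothetical; only `requests` and BeautifulSoup
# are assumed, and `extractor` is an instance of whatever class hosts
# extract_claim_and_review in this project.

import requests
from bs4 import BeautifulSoup

def fetch_and_extract(extractor, url: str):
    """Fetch a fact-check page and run an extractor on the parsed HTML."""
    html = requests.get(url, timeout=30).text
    page = BeautifulSoup(html, "html.parser")
    return extractor.extract_claim_and_review(page, url)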
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("politifact")

    # claim
    title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"})
    claim.set_claim(title.text.strip())

    # title
    title = parsed_claim_review_page.find("h2", {"class": "c-title"})
    claim.set_title(title.text.strip())

    # date
    date = parsed_claim_review_page.find('span', {"class": "m-author__date"})
    if date:
        date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d")
        claim.set_date(date_str)

    # rating, taken from the meter image, e.g.
    # https://static.politifact.com/politifact/rulings/meter-mostly-false.jpg
    statement_body = parsed_claim_review_page.find("div", {"class": "m-statement__body"})
    statement_detail = statement_body.find("div", {"class": "c-image"})
    statement_detail_image = statement_detail.find("picture")
    statement_detail_image_alt = statement_detail_image.find("img", {"class": "c-image__original"})
    if statement_detail_image_alt:
        # claim.alternate_name = statement_detail_image_alt['src'].split("rulings/")[1].split(".jpg")[0]
        if self.translate_rating_value(statement_detail_image_alt['alt']) != "":
            claim.rating = self.translate_rating_value(statement_detail_image_alt['alt'])
        else:
            claim.rating = statement_detail_image_alt['alt']

    # body
    body = parsed_claim_review_page.find("article", {"class": "m-textblock"})
    # body.find("div", {"class": "artembed"}).decompose()
    # claim.set_body(body.get_text())
    text = ""
    if parsed_claim_review_page.select('main > section > div.t-row__center > article.m-textblock'):
        for child in parsed_claim_review_page.select('main > section > div.t-row__center > article.m-textblock'):
            for element in child.contents:
                if element.name == "div":
                    valid = True
                    # check for an illegal JS element in an "artembed" tag:
                    if hasattr(element, 'class'):
                        try:
                            if 'class' in element.attrs:
                                if element.attrs['class'][0] == "artembed":
                                    if element.text.startswith("\r\nwindow.gciAnalyticsUAID"):
                                        valid = False
                        except KeyError:
                            print("KeyError: Skip")
                else:
                    valid = True
                if hasattr(element, 'text'):
                    # if (element.text == "We rate this claim False."
                    #         and url == "https://www.politifact.com/staff/kelsey-tamakloe/"):
                    if url == "https://www.politifact.com/staff/kelsey-tamakloe/":
                        print("\r" + str(element.text))
                if valid:
                    if element:
                        if hasattr(element, 'text'):
                            text += " " + str(element.text)
                        else:
                            text += " " + str(element)
    body_description = text.strip()
    claim.body = str(body_description).strip()

    # author
    author_meta = parsed_claim_review_page.find("div", {"class": "m-author__content"})
    if author_meta:
        author = author_meta.find("a").text
        claim.set_author(author)
        author_url = author_meta.find("a")
        if author_url.attrs["href"] != "":
            claim.author_url = "https://www.politifact.com" + author_url.attrs["href"]

    # date published
    statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"})
    if statement_meta:
        meta_text = statement_meta.text
        if "on" in meta_text:
            meta_text = meta_text.split(" on ")[1]
        if "in" in meta_text:
            meta_text = meta_text.split(" in ")[0]
        if meta_text:
            date = search_dates(meta_text)
            if date:
                date = date[0][1].strftime("%Y-%m-%d")
                claim.date = date

    # related links
    related_links = []
    for link in body.find_all('a', href=True):
        if link['href'][0] == "/":
            related_links.append("https://www.politifact.com" + link['href'])
        else:
            related_links.append(link['href'])
    claim.set_refered_links(related_links)

    claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip())

    # tags
    tags = []
    ul_tag = parsed_claim_review_page.find("ul", {"class": "m-list"})
    if ul_tag:
        ul_tag_contents = ul_tag.findAll("li", {"class": "m-list__item"})
        for a in ul_tag_contents:
            a_tag = a.find("a", title=True)
            a_tag_text = a_tag['title']
            tags.append(a_tag_text)
    if statement_body:
        topics = statement_body.find("ul", {"class": "m-list"}).find_all("a")
        for link in topics:
            text = link['title']
            tags.append(text)
    claim.set_tags(",".join(tags))

    return [claim]
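# ---------------------------------------------------------------------------
# Sketch of a possible `translate_rating_value` helper. The method above calls
# self.translate_rating_value(alt) to normalise the alt text of the Politifact
# meter image, but its implementation is not shown here; the mapping below is
# an assumption for illustration only, following the assumed contract of
# returning "" for unknown values.

def translate_rating_value(self, alt_text: str) -> str:
    # hypothetical mapping from meter alt text to normalised labels
    ratings = {
        "true": "TRUE",
        "mostly-true": "MOSTLY TRUE",
        "half-true": "HALF TRUE",
        "barely-true": "MOSTLY FALSE",
        "false": "FALSE",
        "pants-fire": "PANTS ON FIRE",
    }
    return ratings.get(alt_text.strip().lower(), "")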
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("newtral")

    # title, claim text, and claim author
    title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content']
    title = title.strip().split("|")[0]
    claim.set_title(title)

    entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'})

    # a title of the form "<author>: «claim»" or "<author>: “claim”" carries
    # both the claim author and the claim text
    dospunto = re.search(r'(: «)', title)
    dospunt = re.search(r'(: “)', title)
    if dospunto:
        claim_a = title.split(":")
        auteur = claim_a[0].strip()
        claim.author = auteur
        claim_text = claim_a[1].strip("« »")
        claim.claim = claim_text
    elif dospunt:
        claim_b = title.split(":")
        auteur = claim_b[0].strip()
        claim.author = auteur
        claim_text = claim_b[1].strip(": “ ”")
        claim.claim = claim_text
    else:
        pass

    # multiple titles or claims (h2 headings inside the article body)
    claim_mult = entry_content.findAll('h2')
    if claim_mult:
        claim_al = " ".join(i.text.strip() for i in claim_mult)
        dospunt = re.search(r'(: “)', claim_al)
        dospunto = re.search(r'(: «)', claim_al)
        if dospunt:
            claim_b = title.split(":")
            auteur = claim_b[0].strip()
            claim.author = auteur
            claim_text = claim_b[1].strip(": “ ”")
            claim.claim = claim_text
        elif dospunto:
            claim_a = title.split(":")
            auteur = claim_a[0].strip()
            claim.author = auteur
            claim_text = claim_a[1].strip("« »")
            claim.claim = claim_text
        else:
            claim.set_title(claim_al)

    # tags
    tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'})
    tag_list = []
    for tag in tags:
        tag_list.append(tag['content'])
    claim.set_tags(",".join(tag_list))

    # date published
    published = parsed_claim_review_page.find("meta", attrs={'property': 'article:published_time'})['content']
    claim.date_published = published.strip()

    # article (review) author
    author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'})
    author_a = author_span.find("a")
    author_url = author_a['href']
    author_text = author_a.text
    author_text = re.sub('Por', '', author_text).strip()
    claim.author_url = author_url
    claim.review_author = author_text

    # retrieve the article body text
    entry_text = ""
    body_t = entry_content.find_all('p')
    body = [text.text.strip() for text in body_t]
    entry_text += " ".join(body) + "\n"
    claim.body = entry_text

    # retrieve the links contained in the article body
    links = [link['href'] for link in entry_content.find_all('a', href=True)]
    claim.referred_links = links

    # verdict: look for one of the known veracity labels in the intro
    # (or, failing that, in the article body)
    intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'})
    veracities = ["ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS",
                  "VERDADERO", "VERDAD A MEDIAS"]

    def common(a, b):
        # keep the labels from a that occur (as substrings) in b
        return [value for value in a if value in b]

    if intro:
        intro_p = " ".join(str(v) for v in intro)
        rating_text_list = intro_p.upper()
        rating_text = [i.strip() for i in common(veracities, rating_text_list)]
        claim.alternate_name = rating_text
    else:
        body_a = " ".join(str(v) for v in body)
        rating_text_list = body_a.upper()
        rating_text = [i.strip() for i in common(veracities, rating_text_list)]
        claim.alternate_name = rating_text

    return [claim]
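# ---------------------------------------------------------------------------
# Self-contained illustration of the veracity matching above: `common` relies
# on Python's substring containment (`value in b` with `b` a string), so it
# keeps every label that occurs anywhere in the upper-cased intro or body
# text. The sample sentence is made up.

VERACITIES = ["ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS",
              "VERDADERO", "VERDAD A MEDIAS"]

def common(a, b):
    return [value for value in a if value in b]

sample = "La afirmación es falsa según los datos oficiales.".upper()
print(common(VERACITIES, sample))  # -> ['FALSA']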
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claims = []
    claim = Claim()

    # url
    claim.url = str(url)

    # source
    claim.source = "fullfact"

    # title
    title = None
    if parsed_claim_review_page.select('body > main > div > div > section > article > h1'):
        for tmp in parsed_claim_review_page.select('body > main > div > div > section > article > h1'):
            title = tmp.text.strip()
    claim.title = str(title.strip())

    # author
    author_list = []
    author_links = []
    # single author?
    if parsed_claim_review_page.select('article > section.social-media > div > div > ul > li > span > cite'):
        for author_a in parsed_claim_review_page.select(
                'article > section.social-media > div > div > ul > li > span > cite'):
            if hasattr(author_a, 'text'):
                author_list.append(author_a.text.strip())
            # if hasattr(author_a, 'href'):
            #     author_list.append(author_a.text.strip())
            #     author_links.append(author_a.attrs['href'])
    else:
        print("no author? https://fullfact.org/about/our-team/")
    claim.author = ", ".join(author_list)
    # claim.author_url = ", ".join(author_links)

    # date
    datePub = None
    date_str = ""
    if parsed_claim_review_page.select('article > div.published-at'):
        for date_ in parsed_claim_review_page.select('article > div.published-at'):
            if hasattr(date_, 'text'):
                datePub = date_.text.strip()
                # an updated page shows "published | updated"; keep the published part
                if "|" in datePub:
                    split_datePub = datePub.split("|")
                    if len(split_datePub) > 0:
                        datePub = split_datePub[0].strip()
                date_str = dateparser.parse(datePub).strftime("%Y-%m-%d")
                claim.date_published = date_str
                claim.date = date_str
    else:
        print("no date?")

    # body description
    text = ""
    if parsed_claim_review_page.select('article > p'):
        for child in parsed_claim_review_page.select('article > p'):
            text += " " + child.text
    body_description = text.strip()
    claim.body = str(body_description).strip()

    # related links (in the page body text <p>)
    related_links = []
    if parsed_claim_review_page.select('article > p > a'):
        for link in parsed_claim_review_page.select('article > p > a'):
            try:
                if hasattr(link, 'href'):
                    if 'http' in link['href']:
                        related_links.append(link['href'])
                    else:
                        related_links.append("https://fullfact.org" + link['href'])
            except KeyError as e:
                print("->KeyError: " + str(e))
                continue
            except IndexError as e:
                print("->IndexError: " + str(e))
                continue

    # related links (in "Related fact checks")
    if parsed_claim_review_page.select('section.related-factchecks > div > ul > li > a'):
        for link in parsed_claim_review_page.select('section.related-factchecks > div > ul > li > a'):
            try:
                if hasattr(link, 'href'):
                    if 'http' in link['href']:
                        related_links.append(link['href'])
                    else:
                        related_links.append("https://fullfact.org" + link['href'])
            except KeyError as e:
                print("->KeyError: " + str(e))
                continue
            except IndexError as e:
                print("->IndexError: " + str(e))
                continue
    if related_links:
        claim.referred_links = related_links

    # cannot be found on fullfact:
    # self.tags = ""
    # self.author_url = ""
    # self.date_published = ""
    # self.same_as = ""
    # self.rating_value = ""
    # self.worst_rating = ""
    # self.best_rating = ""
    # self.review_author = ""

    # claim(s): the card lists multiple (local) claim/verdict paragraph pairs:
    # 'article > div > div > div.row.no-gutters.card-body-text > div > div > p'
    claim_text_list = []
    claim_text = None
    # rating -> VERDICT: extract_conclusion -> true, false, ...
    claim_verdict_list = []
    claim_verdict = None
    column = "claim"  # alternates between "claim" and "verdict"
    if parsed_claim_review_page.select(
            'body > main > div > div > section > article > div > div > div.row.no-gutters.card-body-text > div > div > p'):
        for p in parsed_claim_review_page.select(
                'body > main > div > div > section > article > div > div > div.row.no-gutters.card-body-text > div > div > p'):
            if hasattr(p, 'text'):
                if column == "claim":
                    claim_text_list.append(p.text.strip())
                    if claim_text is None:
                        claim_text = p.text.strip()
                    column = "verdict"
                else:
                    rating_word_list = p.text
                    conclusion_text = self._conclusion_processor.extract_conclusion(rating_word_list)
                    rating = str(conclusion_text).replace('"', "").strip()
                    if "." in rating:
                        split_name = rating.split(".")
                        if len(split_name) > 0:
                            rating = split_name[0]
                    claim_verdict_list.append(rating)
                    if claim_verdict is None:
                        claim_verdict = rating
                    column = "claim"

    # first local claim and rating:
    claim.claim = claim_text
    claim.rating = claim_verdict

    # all claims and ratings comma-separated instead? get all claims?
    # claim.claim = ", ".join(claim_text_list)
    # claim.rating = ", ".join(claim_verdict_list)

    # create one Claim per local claim/verdict pair; each entry must be a
    # copy, otherwise all entries alias the same object (requires `import copy`)
    c = 0
    while c <= len(claim_text_list) - 1:
        claims.append(copy.deepcopy(claim))
        claims[c].claim = claim_text_list[c]
        claims[c].rating = claim_verdict_list[c]
        c += 1

    # no rating or no claim found -> skip this page
    if not claim.claim or not claim.rating:
        print(url)
        if not claim.rating:
            print("-> Rating cannot be found!")
        if not claim.claim:
            print("-> Claim cannot be found!")
        return []

    return claims
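# ---------------------------------------------------------------------------
# Why the deep copy in the multi-claim loop above matters: appending the same
# Claim object N times leaves N references to one object, so every list entry
# would end up with the last claim text and rating. A tiny self-contained
# demonstration of the aliasing pitfall with a stand-in class:

import copy

class Box:
    def __init__(self):
        self.value = None

aliased, copied = [], []
box = Box()
for v in ("first", "second"):
    box.value = v
    aliased.append(box)                # same object appended each time
    copied.append(copy.deepcopy(box))  # independent snapshot each time

print([b.value for b in aliased])  # ['second', 'second']
print([b.value for b in copied])   # ['first', 'second']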
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]:
    claim = Claim()
    claim.set_url(url)
    claim.set_source("newtral")

    # title; a title of the form "<author>: «claim»" or "<author>: “claim”"
    # also carries the claim author and the claim text
    title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content']
    title = title.strip().split("|")[0]
    claim.set_title(title)

    dospunto = re.search(r'(: «)', title)
    dospunt = re.search(r'(: “)', title)
    if dospunto:
        claim_a = title.split(":")
        auteur = claim_a[0].strip()
        claim.author = auteur
        claim_text = claim_a[1].strip("« »")
        claim.claim = claim_text
    elif dospunt:
        claim_b = title.split(":")
        auteur = claim_b[0].strip()
        claim.author = auteur
        claim_text = claim_b[1].strip(": “ ”")
        claim.claim = claim_text
    else:
        pass

    # tags
    tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'})
    tag_list = []
    for tag in tags:
        tag_list.append(tag['content'])
    claim.set_tags(",".join(tag_list))

    # date published
    published = parsed_claim_review_page.find("meta", attrs={'property': 'article:published_time'})['content']
    claim.date_published = published.strip()

    # rating: prefer the intro paragraph; fall back to the first <em>, <p>,
    # or <div> of the article body
    entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'})
    intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'})
    if intro is None:
        intro_rating_p = entry_content.find("em")
        if intro_rating_p is None:
            intro_rating_p = entry_content.find("p")
        if intro_rating_p is None:
            intro_rating_p = entry_content.find("div")
    else:
        intro_rating_p = intro.p

    rating_in_image = False
    if intro_rating_p is None:
        # rating is in an image...
        rating_in_image = True
        rating_text = ""
    else:
        rating_text = intro_rating_p.get_text()

    rating_re_es_falso = regex.compile(
        r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
        r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")
    es_falso_match = rating_re_es_falso.match(rating_text)
    if es_falso_match is not None and es_falso_match.group(2) is not None:
        rating_text = es_falso_match.group(2)
    else:
        if not rating_in_image:
            # fall back to bold/strong markup inside the intro paragraph
            is_there_b = intro_rating_p.find('b')
            if is_there_b is not None:
                rating_text = is_there_b.text
            else:
                is_there_strong = intro_rating_p.find("strong")
                if is_there_strong is not None:
                    rating_text = is_there_strong.text
    claim.rating = rating_text

    # review author
    author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'})
    author_a = author_span.find("a")
    author_url = author_a['href']
    author_text = author_a.text
    author_text = re.sub('Por', '', author_text).strip()
    claim.author_url = author_url
    claim.review_author = author_text

    # retrieve the article body text
    entry_text = ""
    body_t = entry_content.find_all('p')
    body = [text.text.strip() for text in body_t]
    entry_text += " ".join(body) + "\n"
    claim.body = entry_text

    # retrieve the links contained in the article body
    links = [link['href'] for link in entry_content.find_all('a', href=True)]
    claim.referred_links = links

    return [claim]
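# ---------------------------------------------------------------------------
# Self-contained check of what the rating regex above extracts: group(2) is a
# run of upper-case letters (the verdict) following an optional Spanish
# lead-in such as "La afirmación es". The third-party `regex` module is needed
# for the Unicode property \p{Lu}; the sample sentence is made up.

import regex

rating_re_es_falso = regex.compile(
    r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
    r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")

m = rating_re_es_falso.match("La afirmación es FALSA. El dato real es otro.")
if m is not None and m.group(2) is not None:
    print(m.group(2))  # -> FALSA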