def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]: claim = Claim() claim.set_url(url) claim.set_source("newtral") #title, claim and autor claim title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content'] title = title.strip().split("|")[0] claim.set_title(title) entry_content = parsed_claim_review_page.find( "div", attrs={'class': 'entry-content'}) #print (title) dospunto = re.search(r'(: «)', title) dospunt = re.search(r'(: “)', title) if dospunto: claim_a = title.split(":") auteur = claim_a[0].strip() claim.author = auteur claim_text = claim_a[1].strip("« »") claim.claim = claim_text #print (claim_a) elif dospunt: claim_b = title.split(":") auteur = claim_b[0].strip() # print ("auteur:" , auteur) claim.author = auteur claim_text = claim_b[1].strip(": “ ”") # print ("claim :", claim) claim.claim = claim_text else: pass #multiple title or claim claim_mult = entry_content.findAll('h2') if claim_mult: claim_al = [i.text.strip() for i in claim_mult] dospunt = re.search(r'(: “)', claim_al) dospunto = re.search(r'(: «)', claim_al) if dospunt: claim_b = title.split(":") auteur = claim_b[0].strip() # print ("auteur:" , auteur) claim.author = auteur claim_text = claim_b[1].strip(": “ ”") # print ("claim :", claim) claim.claim = claim_text elif dospunto: claim_a = title.split(":") auteur = claim_a[0].strip() claim.author = auteur claim_text = claim_a[1].strip("« »") claim.claim = claim_text #print (claim_a) else: claim.set_title(claim_al) #tags tags = parsed_claim_review_page.find_all( "meta", attrs={'property': 'article:tag'}) tag_list = [] for tag in tags: tag_text = tag['content'] tag_list.append(tag_text) claim.set_tags(",".join(tag_list)) #date pubished published = parsed_claim_review_page.find( "meta", attrs={'property': 'article:published_time'})['content'] claim.date_published = published.strip() #autor article author_span = parsed_claim_review_page.find( "span", attrs={'class': 'c-article__author'}) author_a = author_span.find("a") author_url = author_a['href'] author_text = author_a.text author_text = re.sub('Por', '', author_text).strip() claim.author_url = author_url claim.review_author = author_text # Recuperation du texte de l'article entry_text = "" body_t = entry_content.find_all('p') body = [text.text.strip() for text in body_t] entry_text += " ".join(body) + "\n" claim.body = entry_text # Recuperation des liens dans le texte de l'article links = [ link['href'] for link in entry_content.find_all('a', href=True) ] claim.referred_links = links #Veracite intro = parsed_claim_review_page.find( "div", attrs={'class': 'c-article__intro'}) veracities = [ "ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS", "VERDADERO", "VERDAD A MEDIAS" ] def common(a, b): c = [value for value in a if value in b] return c if intro: intro_p = " ".join(str(v) for v in intro) #print(type(body_t)) rating_text_list = intro_p.upper() rating_text = [ i.strip() for i in common(veracities, rating_text_list) ] claim.alternate_name = rating_text else: body_a = " ".join(str(v) for v in body) #print(type(body_t)) rating_text_list = body_a.upper() rating_text = [ i.strip() for i in common(veracities, rating_text_list) ] claim.alternate_name = rating_text return [claim]
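# A self-contained sketch of the veracity matching above: common() relies on
# substring containment (`value in b`, where b is a string), so each uppercase
# verdict keyword is matched anywhere in the uppercased intro or body text.
# The sample sentence is invented for illustration.
def _demo_veracity_match():
    veracities = ["ENGAÑOSA", "ENGAÑOSO", "FALSO", "FALSA", "FALSOS",
                  "VERDADERO", "VERDAD A MEDIAS"]
    sample = "La afirmación de la diputada es falsa y engañosa."
    found = [v for v in veracities if v in sample.upper()]
    print(found)  # ['ENGAÑOSA', 'FALSA']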
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]: claim = Claim() claim.set_url(url) claim.set_source("factscan") json_ = None if parsed_claim_review_page.find("script", {"type": "application/ld+json"}): json_ = parsed_claim_review_page.find("script", { "type": "application/ld+json" }).get_text() def parse_wrong_json(json_, left, right): if json_: if len(json_.split(left)) > 0: return json_.split(left)[1].split(right)[0] else: return None # Summary box summary_box = parsed_claim_review_page.find("div", {"class": "summary-box"}) # title title = parsed_claim_review_page.find( "meta", {"property": "og:title"})['content'] claim.set_title(title) # claim review date date = parsed_claim_review_page.find( 'meta', {"property": "article:published_time"}) if date: date_str = search_dates( date['content'].split("T")[0])[0][1].strftime("%Y-%m-%d") claim.set_date(date_str) # Creative work date summary_text = summary_box.find("p").text date_published = "" if " on " in summary_text: date_published = summary_text.split(" on ")[-1].strip() else: if " published " in summary_text: date_published = summary_text.split(" published ")[-1].strip() elif " dated " in summary_text: date_published = summary_text.split(" dated ")[-1].strip() elif " from " in summary_text: date_published = summary_text.split(" from ")[-1].strip() elif " sent " in summary_text: date_published = summary_text.split(" in ")[-1].strip() elif " in " in summary_text: date_published = summary_text.split(" in ")[-1].strip() if len(date_published) > 0: date_published = search_dates(date_published)[0][1].strftime( "%Y-%m-%d") claim.setDatePublished(date_published) # rating if json_: claim.set_rating_value( parse_wrong_json(json_, '"ratingValue":', ",")) claim.setWorstRating(parse_wrong_json(json_, '"worstRating":', ",")) claim.set_best_rating(parse_wrong_json(json_, '"bestRating":', ",")) claim.set_alternate_name( parse_wrong_json(json_, '"alternateName":', ",")) # when there is no json else: if parsed_claim_review_page.find("div", {"class": "fact-check-icon"}): if parsed_claim_review_page.find("div", { "class": "fact-check-icon" }).find('img'): claim_str = \ parsed_claim_review_page.find("div", {"class": "fact-check-icon"}).find('img')['alt'].split( ":")[1] claim.alternate_name = claim_str.strip() # body body = parsed_claim_review_page.find("div", {"class": "entry-content"}) claim.set_body(body.get_text()) # author author = parsed_claim_review_page.find( "div", {"class": "sharethefacts-speaker-name"}) if not author: author = summary_box.find("p").find("strong") if author: claim.set_author(author.text) # same_as claim.setSameAs(parse_wrong_json(json_, '"sameAs": [', "]")) # related links divTag = parsed_claim_review_page.find("div", {"class": "entry-content"}) related_links = [] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim.set_refered_links(related_links) if parsed_claim_review_page.find("div", {"class": "sharethefacts-statement"}): claim.set_claim( parsed_claim_review_page.find( "div", { "class": "sharethefacts-statement" }).get_text()) else: claim.set_claim(claim.title) tags = [] for tag in parsed_claim_review_page.findAll( 'meta', {"property": "article:tag"}): tags.append(tag["content"]) if len(tags) == 0: for tag in parsed_claim_review_page.findAll( "a", {"rel": "category tag"}): tags.append(tag.text) claim.set_tags(", ".join(tags)) return [claim]
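# A quick, self-contained illustration of the string-level salvaging that
# parse_wrong_json performs: the value is sliced out of the raw ld+json text
# between two anchors rather than passed to json.loads. The fragment below is
# a made-up example of the shape being parsed.
def _demo_parse_wrong_json():
    raw = '{"@type": "ClaimReview", "ratingValue": 1, "bestRating": 5}'

    def parse_wrong_json(json_, left, right):
        if json_ and len(json_.split(left)) > 1:
            return json_.split(left)[1].split(right)[0]
        return None

    print(parse_wrong_json(raw, '"ratingValue":', ","))    # ' 1' (leading space kept)
    print(parse_wrong_json(raw, '"alternateName":', ","))  # None (anchor absent)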
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]: claim = Claim() claim.set_url(url) claim.set_source("politifact") # Claim title = parsed_claim_review_page.find("div", {"class": "m-statement__quote"}) claim.set_claim(title.text) # title title = parsed_claim_review_page.find("h2", {"class": "c-title"}) claim.set_title(title.text) # date date = parsed_claim_review_page.find('span', {"class": "m-author__date"}) if date: date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d") claim.set_date(date_str) # rating statement_body=parsed_claim_review_page.find("div", {"class", "m-statement__body"}) statement_detail = statement_body.find("div", {"class", "c-image"}) statement_detail_image=statement_detail.find("picture") statement_detail_image_alt=statement_detail_image.find("img",{"class", "c-image__original"}) if statement_detail_image_alt: claim.alternate_name = statement_detail_image_alt['alt'] # body body = parsed_claim_review_page.find("article", {"class": "m-textblock"}) claim.set_body(body.get_text()) # author statement_meta = parsed_claim_review_page.find("div", {"class": "m-statement__meta"}) if statement_meta: author = statement_meta.find("a").text claim.set_author(author) # date published if statement_meta: meta_text = statement_meta.text if "on" in meta_text: meta_text = meta_text.split(" on ")[1] if "in" in meta_text: meta_text = meta_text.split(" in ")[0] if meta_text: date = search_dates(meta_text) if date: date = date[0][1].strftime("%Y-%m-%d") claim.setDatePublished(date) # related links div_tag = parsed_claim_review_page.find("article", {"class": "m-textblock"}) related_links = [] for link in body.find_all('a', href=True): related_links.append(link['href']) claim.set_refered_links(related_links) claim.set_claim(parsed_claim_review_page.find("div", {"class": "m-statement__quote"}).text.strip()) tags = [] ul_tag = parsed_claim_review_page.find("ul", {"class", "m-list"}) if ul_tag: ul_tag_contents = ul_tag.findAll("li", {"class", "m-list__item"}) for a in ul_tag_contents: a_tag=a.find("a", title=True) a_tag_text=a_tag['title'] tags.append(a_tag_text) if statement_body: topics = statement_body.find("ul", {"class", "m-list"}).find_all("a") for link in topics: text = link['title'] tags.append(text) claim.set_tags(",".join(tags)) return [claim]
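# The PolitiFact extractors lean on dateparser's search_dates to pull dates
# out of free-form bylines. A minimal sketch of that call (dateparser is a
# real dependency of this code; the byline string is invented):
from dateparser.search import search_dates

def _demo_search_dates():
    byline = "stated on January 5, 2021 in a tweet"
    matches = search_dates(byline)  # list of (matched_text, datetime) pairs
    if matches:
        print(matches[0][1].strftime("%Y-%m-%d"))  # 2021-01-05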
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]: claim = Claim() claim.set_url(url) claim.set_source("newtral") title = parsed_claim_review_page.find("meta", attrs={'property': 'og:title'})['content'] title = title.strip().split("|")[0] claim.set_title(title) dospunto = re.search(r'(: «)', title) dospunt = re.search(r'(: “)', title) if dospunto: claim_a = title.split(":") auteur = claim_a[0].strip() claim.author = auteur # print ("auteur:" , auteur) claim_text = claim_a[1].strip("« »") claim.claim = claim_text elif dospunt: claim_b = title.split(":") auteur = claim_b[0].strip() # print ("auteur:" , auteur) claim.author = auteur claim_text = claim_b[1].strip(": “ ”") # print ("claim :", claim) claim.claim = claim_text else: pass tags = parsed_claim_review_page.find_all("meta", attrs={'property': 'article:tag'}) tag_list = [] for tag in tags: tag_text = tag['content'] tag_list.append(tag_text) claim.set_tags(",".join(tag_list)) published = parsed_claim_review_page.find("meta", attrs={'property': 'article:published_time'})[ 'content'] claim.date_published = published.strip() entry_content = parsed_claim_review_page.find("div", attrs={'class': 'entry-content'}) intro = parsed_claim_review_page.find("div", attrs={'class': 'c-article__intro'}) if intro is None: intro_rating_p = entry_content.find("em") if intro_rating_p is None: intro_rating_p = entry_content.find("p") if intro_rating_p is None: intro_rating_p = entry_content.find("div") else: intro_rating_p = intro.p rating_in_image = False if intro_rating_p is None: # Rating in image... rating_in_image = True rating_text = "" else: rating_text = intro_rating_p.get_text() rating_re_es_falso = regex.compile( r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es" + \ "|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )") es_falso_match = rating_re_es_falso.match(rating_text) if es_falso_match is not None and es_falso_match.group(2) is not None: rating_text = es_falso_match.group(2) else: if not rating_in_image: is_there_b = intro_rating_p.find('b') if is_there_b is not None: rating_text = is_there_b.text else: is_there_strong = intro_rating_p.find("strong") if is_there_strong is not None: rating_text = is_there_strong.text else: pass claim.alternate_name = rating_text author_span = parsed_claim_review_page.find("span", attrs={'class': 'c-article__author'}) author_a = author_span.find("a") author_url = author_a['href'] author_text = author_a.text author_text = re.sub('Por', '', author_text).strip() claim.author_url = author_url claim.review_author = author_text # Recuperation du texte de l'article entry_text = "" body_t = entry_content.find_all('p') body = [text.text.strip() for text in body_t] entry_text += " ".join(body) + "\n" claim.body = entry_text # Recuperation des liens dans le texte de l'article links = [link['href'] for link in entry_content.find_all('a', href=True)] claim.referred_links = links # else: # title = container.h3.text # titles.append(title) # # print("title", title) # claim_c = hd.h1.text.split(":") # claim_d = hd.h1.text.strip() # # if claim_c: # auteur = claim_c[0].strip() # auteurs.append(auteur) # print("auteur:", auteur) # claim = claim_c[1].strip("« »") # claims.append(claim) # # print ("claim :", claim) # # else : # # print (claim_d) # return [claim]
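# A short, self-contained check of the rating regex above. The third-party
# `regex` module (unlike the stdlib `re`) supports Unicode properties such as
# \p{Lu} (uppercase letter), which is what lets group 2 capture an all-caps
# verdict like FALSO or ENGAÑOSA. The sample sentence is invented.
import regex

def _demo_rating_regex():
    rating_re = regex.compile(
        r"(La afirmación es|La afirmación es una|La declaración es|Es|El dato es"
        r"|La comparación de Colau es)? ?([\p{Lu}| ]+)(\.| –|,| )")
    match = rating_re.match("La afirmación es FALSA. El dato real es otro.")
    if match and match.group(2):
        print(match.group(2))  # FALSA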
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]: claim = Claim() claim.set_url(url) claim.set_source("politifact") # title title = parsed_claim_review_page.find("h1", {"class": "article__title"}) claim.set_title(title.text) # date date = parsed_claim_review_page.find('div', { "class": "widget__content" }).find("p") if date: date_str = search_dates(date.text)[0][1].strftime("%Y-%m-%d") claim.set_date(date_str) # rating rating_div = parsed_claim_review_page.find( "div", {"itemprop": "reviewRating"}) if rating_div: rating_value = rating_div.find("div", {"itemprop": "ratingValue"}) if rating_value: claim.rating_value = rating_value.text worst_rating = rating_div.find("div", {"itemprop": "worstRating"}) if worst_rating: claim.worst_rating = worst_rating.text best_rating = rating_div.find("div", {"itemprop": "bestRating"}) if best_rating: claim.best_rating = best_rating.text alternate_name = rating_div.find("div", {"itemprop": "alternateName"}) if alternate_name: claim.alternate_name = alternate_name.text else: statement_detail = parsed_claim_review_page.find( "img", {"class", "statement-detail"}) if statement_detail: claim.alternate_name = statement_detail['alt'] # body body = parsed_claim_review_page.find("div", {"class": "article__text"}) claim.set_body(body.get_text()) # author statement_meta = parsed_claim_review_page.find( "p", {"class": "statement__meta"}) if statement_meta: author = statement_meta.find("a").text claim.set_author(author) else: author = parsed_claim_review_page.find( "div", {"itemprop": "itemReviewed"}) if author: author = author.find("div", {"itemprop": "author"}) author_text = author.text claim.set_author(author_text) # same as rating_div = parsed_claim_review_page.find( "div", {"itemprop": "itemReviewed"}) if rating_div and rating_div.find("div", {"itemprop": "sameAs"}): claim.setSameAs( rating_div.find("div", { "itemprop": "sameAs" }).get_text()) # date published if statement_meta: meta_text = statement_meta.text if "on" in meta_text: meta_text = meta_text.split(" on ")[1] if "in" in meta_text: meta_text = meta_text.split(" in ")[0] if meta_text: date = search_dates(meta_text) if date: date = date[0][1].strftime("%Y-%m-%d") claim.setDatePublished(date) else: rating_div = parsed_claim_review_page.find( "div", {"itemprop": "itemReviewed"}) if rating_div and rating_div.find("div", {"itemprop": "datePublished"}): claim.setDatePublished( rating_div.find("div", { "itemprop": "datePublished" }).get_text()) # related links div_tag = parsed_claim_review_page.find("div", {"class": "article__text"}) related_links = [] for link in div_tag.findAll('a', href=True): related_links.append(link['href']) claim.set_refered_links(related_links) claim.set_claim( parsed_claim_review_page.find("div", { "class": "statement__text" }).text.strip()) tags = [] about_widget = parsed_claim_review_page.find( "div", {"class", "widget_about-article"}) if about_widget: about_widget_contents = about_widget.find( "div", {"class", "widget__content"}) for p in about_widget_contents.findAll("p"): text = p.text if "Subjects:" in text: for subject in p.findAll("a"): tags.append(subject.text) claim.set_tags(",".join(tags)) return [claim]
def extract_claim_and_review(self, parsed_claim_review_page: BeautifulSoup, url: str) -> List[Claim]: claim = Claim() claim.set_url(url) claim.set_source("truthorfiction") title = parsed_claim_review_page.find( "meta", {"property": "og:title"})['content'] claim.set_title(title) article = parsed_claim_review_page.find("article") # date date_ = parsed_claim_review_page.find( 'meta', {"property": "article:published_time"})['content'] if date_: date_str = date_.split("T")[0] claim.set_date(date_str) # body content = [ tag for tag in article.contents if not isinstance(tag, NavigableString) ] body = content[-1] # type: Tag if body.has_attr("class") and "content-source" in body['class']: body = content[-2] claim.set_body(body.text.strip()) # related links related_links = [] for link in body.findAll('a', href=True): related_links.append(link['href']) claim.set_refered_links(related_links) description = article.find("div", {"class", "claim-description"}) rating = article.find("div", {"class", "rating-description"}) if description and rating: claim.set_claim(description.text) claim.alternate_name = rating.text else: h1 = article.find("h1") text = h1.text.replace("–", "-") split_text = text.split("-") rating_text = split_text[-1] claim_text = "".join(split_text[0:-1]) if len(claim_text) == 0 or "-" not in text: return [] else: claim.set_alternate_name(rating_text) claim.set_claim(claim_text) tags = [] for tag in parsed_claim_review_page.findAll( "meta", {"property", "article:tags"}): tag_str = tag['content'] tags.append(tag_str) claim.set_tags(", ".join(tags)) return [claim]
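# A minimal driver sketch showing how these extract_claim_and_review methods
# might be invoked, assuming requests and BeautifulSoup (both already used by
# this code). The `extractor` argument stands for any of the site-specific
# extractor objects defined above; the function name `scrape` is hypothetical.
import requests
from bs4 import BeautifulSoup

def scrape(extractor, url: str):
    html = requests.get(url, timeout=30).text
    # "html.parser" avoids an extra dependency; swap in "lxml" if preferred
    page = BeautifulSoup(html, "html.parser")
    return extractor.extract_claim_and_review(page, url)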