def getClaims(self, query):
    """Get claims.

    Arguments:
        query {str} -- the initial query
    """
    # folderPath = os.path.join(folderpath, 'final')
    # fullPath = os.path.join(self.rootpath, folderPath)
    self.preprocessData.generateTweetsLines(self.folderpath)
    claimExtractor = Claim.ClaimExtractor(self.rootpath, self.folderpath)
    tweets_list = list(self.helper.getTweet(self.folderpath))
    cleanedTweets = []
    for tweet in tweets_list:
        c1 = self.preprocessData.cleanTweet(tweet.text)
        cleanedTweets.append(c1)
    print("Parsing...")
    mergedNoun, sortedSubject2Number, \
        subject2tweetInfo, parsedTweets = claimExtractor.collectSubject(
            tweets_list, cleanedTweets)
    # sortedSubject2Number = self.helper.loadJson(
    #     os.path.join(self.folderpath, "final", "sorted_subject2number.json"))
    # subject2tweetInfo = self.helper.loadJson(
    #     os.path.join(self.folderpath, "final", "subject2tweetInfo.json"))
    # parsedTweets = self.helper.loadJson(
    #     os.path.join(self.folderpath, "final", "tweets_id2Info.json"))
    candidateClaimsMergedClause, \
        candidateFullClaimsMergedClause = claimExtractor.getCandidateClaims(
            tweets_list, mergedNoun, sortedSubject2Number,
            subject2tweetInfo, parsedTweets, query[1:])
    return candidateClaimsMergedClause, candidateFullClaimsMergedClause
def getClusterClaims(self, query, eps):
    claims, fullClaims = self.getClaims(query)
    # filePath = os.path.join(self.folderpath, "final",
    #                         "candidateClaimsMergedClause.json")
    # print("filePath ", filePath)
    # claims = self.helper.loadJson(filePath)
    # filePath = os.path.join(self.folderpath, "final",
    #                         "candidateFullClaimsMergedClause.json")
    # print("filePath ", filePath)
    # fullClaims = self.helper.loadJson(filePath)
    skipthoughts_model = {
        "name": "skipthoughts",
        "modelPath": self.rootpath + "/.." +
        "/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/exp_vocab",
        "checkpointPath": "model.ckpt-501424"
    }
    sent2vec_model = {
        "name": "sent2vec",
        "modelPath": "/lustre/scratch/haoxu/twitter_bigrams.bin"
    }
    self.getSimilarity = Claim.GetSimilarity(self.rootpath, self.folderpath,
                                             skipthoughts_model)
    tweets_list = list(self.helper.getTweet(self.folderpath))
    cluster2claimsIndexes, cluster2coreSampleIndices = \
        self.getSimilarity.getClusteredClaims(claims, tweets_list, eps)
    return cluster2claimsIndexes, cluster2coreSampleIndices, claims, fullClaims
def briefAdditionalRows(soup, result, url, idClaim, listeURL, rubri, motsCles,
                        t, liensRevue, d):
    claim = soup.find('div', {"class": "col-xs-12 col-sm-6 col-left"})
    if claim:
        claim_ = claim_obj.Claim()
        claim_.setSource("fullfact")
        claim_.setUrl(url)
        claim_.setClaim(claim.get_text().replace("\nClaim\n", ""))
        claim_.setIdClaim(idClaim)
        claim_.setRubrique(rubri)
        claim_.setKeyWordsRP("RelatedPosts", motsCles)
        claim_.setTitle(t)
        claim_.setDate(d)
        conclusion = soup.find('div', {"class": "col-xs-12 col-sm-6 col-right"})
        if conclusion:
            claim_.setConclusion(conclusion.get_text().replace(
                "\nConclusion\n", ""))
            c = conclusion.get_text().replace("\nConclusion\n", "")
            claim_.setVerdictTompo(TraitementConclusion.fonctionPrincipale(c))
        claim_.setBody(result)
        claim_.setRelated_posts("RelatedPosts", listeURL)
        claim_.setLiensRevue(liensRevue)
        return claim_
    else:
        return "empty"
def readClaim(self, xml):
    claim = Claim.Claim()
    for event in xml.getchildren():
        if event.tag == 'claimtype':
            claim.claimtype = self.readTerm(event)
        elif event.tag == 'label':
            # We store the full protocol,label construct for consistency with
            # the technical parts, so it is left to the __str__ of claim to
            # select the right element.
            claim.label = self.readTerm(event)
        elif event.tag == 'protocol':
            claim.protocol = self.readTerm(event)
        elif event.tag == 'role':
            claim.role = self.readTerm(event)
        elif event.tag == 'parameter':
            claim.parameter = self.readTerm(event)
        elif event.tag == 'failed':
            claim.failed = int(event.text)
        elif event.tag == 'count':
            claim.count = int(event.text)
        elif event.tag == 'states':
            claim.states = int(event.text)
        elif event.tag == 'complete':
            claim.complete = True
        elif event.tag == 'timebound':
            claim.timebound = True
        else:
            print >> sys.stderr, "Warning: unknown tag in claim: %s" % event.tag
    claim.analyze()
    return claim
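# Illustrative only: a minimal sketch of the claim element shape that readClaim
# above iterates over. The enclosing tag name and all values here are
# placeholders (assumptions), not real tool output; in practice the term-valued
# children (claimtype, label, protocol, role) may be nested elements rather
# than plain text.
import xml.etree.ElementTree as ET

sample = ET.fromstring(
    "<claim>"
    "<claimtype>Secret</claimtype>"
    "<label>l1</label>"
    "<protocol>example</protocol>"
    "<role>I</role>"
    "<failed>0</failed>"
    "<count>12</count>"
    "<states>40</states>"
    "<complete />"
    "</claim>")
for event in sample:  # equivalent to xml.getchildren() in older ElementTree
    print(event.tag, (event.text or "").strip())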
def getSimilarTweets4Claim(self):
    # get claims
    claims = list(self.helper.getClaim(self.folderpath))
    # lowercase
    # claims = [claim.lower() for claim in claims]
    # get tweets
    tweets = self.helper.getTweet(self.folderpath)
    cleanedTweets = []
    for tweet in tweets:
        # tweets_list.append(tweet)
        c1 = self.preprocessData.cleanTweet(tweet.text)
        cleanedTweets.append(c1)
    getSimilarity = Claim.GetSimilarity(
        "/home/hao/Workplace/HaoXu/Data/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/exp_vocab",
        "model.ckpt-501424")
    sentences, tweetIndex = getSimilarity.splitSentences(cleanedTweets)
    encodedSentences = getSimilarity.encodeSen(sentences)
    # for index, claim in enumerate(claims):
    encodedClaims = getSimilarity.encodeSen(claims)
    claims2tweets = getSimilarity.getTweets4Claims(sentences, encodedSentences,
                                                   claims, encodedClaims,
                                                   tweetIndex)
    for claimID, sentInfos in claims2tweets.items():
        claims2tweets[claimID] = self.preprocessData.sortListofLists(
            sentInfos, False)
    self.helper.dumpJson(self.folderpath, "final/sorted_claims2tweets.json",
                         claims2tweets)
    print("sorted_claims2tweets.json has been saved.")
def new_claim(f_link, date, title, tags):
    claim_ = claim_obj.Claim()
    claim_.setUrl(f_link)
    claim_.setTitle(title)
    claim_.setTags(tags)
    date_ = date.strip().split()
    date_ = "-".join([date_[4], date_[2], date_[0]])
    claim_.setDate(dateparser.parse(date_).strftime("%Y-%m-%d"))
    claim_.setSource("publica")
    claim_.setBody("")
    return claim_
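# Illustrative only: assuming the scraped date string looks like
# "12 de março de 2018" (five whitespace-separated tokens), the reordering in
# new_claim above yields "2018-março-12", which dateparser then normalises.
# The sample string is an assumption, not taken from the actual site.
import dateparser

sample_date = "12 de março de 2018"
tokens = sample_date.strip().split()
reordered = "-".join([tokens[4], tokens[2], tokens[0]])  # "2018-março-12"
print(dateparser.parse(reordered).strftime("%Y-%m-%d"))  # expected: 2018-03-12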
def getClusterRankClaims(self, query, eps):
    cluster2claimsIndexes, cluster2coreSampleIndices, \
        claims, fullClaims = self.getClusterClaims(query, eps)
    getSimilarity = Claim.GetSimilarity(self.rootpath, self.folderpath)
    tweets_list = list(self.helper.getTweet(self.folderpath))
    rankedClusterClaims = getSimilarity.rankClusteredClaims(
        cluster2claimsIndexes, cluster2coreSampleIndices, claims, fullClaims,
        tweets_list)
    return rankedClusterClaims
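# A minimal sketch (an assumption, not the project's actual GetSimilarity
# implementation) of the clustering style suggested above: the eps argument and
# the returned core-sample indices match scikit-learn's DBSCAN interface.
# Random vectors stand in for the skip-thoughts claim encodings.
import numpy as np
from sklearn.cluster import DBSCAN

claim_vectors = np.random.rand(10, 2400)  # stand-in for encoded claims
db = DBSCAN(eps=0.35, min_samples=2, metric="cosine").fit(claim_vectors)
cluster2claimsIndexes = {}
for idx, label in enumerate(db.labels_):
    cluster2claimsIndexes.setdefault(int(label), []).append(idx)  # -1 = noise
cluster2coreSampleIndices = db.core_sample_indices_.tolist()
print(cluster2claimsIndexes, cluster2coreSampleIndices)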
def addClaim(customer_id):
    '''
    Creates a new Claim object and adds it to both the InsuranceCompany
    list of all claims and the corresponding Customer object's list of claims.
    :param customer_id:
    :return:
    '''
    c = company.getCustomerById(customer_id)
    if c is not None:
        claim = Claim(request.args.get('date'),
                      request.args.get('incident_description'),
                      request.args.get('claim_amount'))
        c.addClaim(claim)
        company.addClaim(claim)
        return jsonify(success=True)
    return jsonify(success=False, message="Customer not found")
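# Hypothetical client call for the Flask route above; the route path, host,
# and port are assumptions, since they are not shown in this file.
import requests

resp = requests.get(
    "http://localhost:5000/addClaim/1",  # assumed URL pattern
    params={
        "date": "2020-01-15",
        "incident_description": "rear-end collision",
        "claim_amount": "1200"
    })
print(resp.json())  # {"success": true} when the customer exists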
def do_newclaim(self, inp):
    'Create a new claim and switch the context to that claim'
    self.prompt = "New Claim> "
    # Ask for the name of the claim, as well as verify it.
    # When the name is verified, create a folder for it.
    goodname = False
    name = ""
    while not goodname:
        name: str = input("Enter Claim Name: ")
        goodname = self.verifyname(name)
    claimdate = datetime.datetime.now().date()
    claimname = str(claimdate) + "_" + name
    self.prompt = claimname + "> "
    claimdir = "Claims\\" + claimname
    imagesdir = "Claims\\" + claimname + "\\Images"
    os.mkdir(claimdir)
    os.mkdir(imagesdir)
    # TODO properly validate date
    daterange = input("Provide the start-end dates in the format "
                      "MM/DD/YYYY-MM/DD/YYYY (currently does not check, be careful): ")
    startdate, enddate = self.verifydate(daterange)
    newclaim = Claim.Claim(claimdir, claimname, startdate, enddate)
    self.claim = newclaim
    self.prompt = claimname + "> "
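# The TODO above notes that the date range is not validated. A minimal sketch
# of what a verifydate-style helper could do (assumed behaviour, not the
# project's actual implementation):
import datetime

def parse_date_range(daterange):
    """Parse 'MM/DD/YYYY-MM/DD/YYYY' into (start, end) dates, raising on bad input."""
    start_str, end_str = daterange.split("-")
    start = datetime.datetime.strptime(start_str.strip(), "%m/%d/%Y").date()
    end = datetime.datetime.strptime(end_str.strip(), "%m/%d/%Y").date()
    if end < start:
        raise ValueError("end date precedes start date")
    return start, end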
def get_all_claims(criteria): print criteria.maxClaims #performing a search by each letter, and adding each article to a urls_ var. now = datetime.datetime.now() urls_ = {} for page_number in range(1, 500): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break try: page = urllib2.urlopen( "https://g1.globo.com/e-ou-nao-e/index/feed/pagina-" + str(page_number) + ".ghtml").read() except: break soup = BeautifulSoup(page, "lxml") soup.prettify() links = soup.findAll('a', {"class": "feed-post-link"}, href=True) if len(links) != 0: for anchor in links: if (anchor['href'] not in urls_.keys()): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break urls_[anchor['href']] = page_number print "adding " + str(anchor['href']) else: print("break!") break claims = [] index = 0 # visiting each article's dictionary and extract the content. for url, conclusion in urls_.iteritems(): print str(index) + "/" + str(len( urls_.keys())) + " extracting " + str(url) index += 1 url_complete = str(url) #print url_complete page = urllib2.urlopen(url_complete).read().decode('utf-8', 'ignore') soup = BeautifulSoup(page, "lxml") soup.prettify("utf-8") claim_ = claim_obj.Claim() claim_.setUrl(url_complete) claim_.setSource("g1") if (criteria.html): claim_.setHtml(soup.prettify("utf-8")) try: #title #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1): title = soup.find("h1", {"class": "content-head__title"}) claim_.setTitle(title.text) #date date_ = soup.find('time', {"itemprop": "datePublished"}) if date_: date_str = date_.get_text().split(" ")[1] claim_.setDate( dateparser.parse(date_str, settings={ 'DATE_ORDER': 'DMY' }).strftime("%Y-%m-%d")) #print claim_.date #body body = soup.find("article") claim_.setBody(body.get_text().replace("\n", "").replace( "TwitterFacebookE-mailWhatsApp", "")) #related links divTag = soup.find("article", {"itemprop": "articleBody"}) related_links = [] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) #claim claim_conclusion = soup.find("h1", { "class": "content-head__title" }).get_text() #claim_.setClaim(claim_conclusion) #if (len(claim_conclusion.split("?"))>1): claim_.setClaim(claim_conclusion.split("?")[0]) claim_.setConclusion(claim_conclusion.split("?")[1]) # if (claim_element.find_previous_sibling("figure") and claim_element.find_previous_sibling("figure").findAll("figcaption")): # claim_.setConclusion(claim_element.find_previous_sibling("figure").findAll("figcaption")[-1:][0].get_text()) #print claim_.claim.decode("utf-8") + " ====> " #print claim_.conclusion.decode("utf-8") #print "-->"+ str(claim_.conclusion) claims.append(claim_.getDict()) except: print "Error ->" + str(url_complete) #creating a pandas dataframe pdf = pd.DataFrame(claims) return pdf
def get_all_claims(criteria): #performing a search by each letter, and adding each article to a urls_ var. alfab = "bcdefghijklmnopqrstuvxyz" urls_ = {} for l in alfab: for page_number in range(1, 500): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break try: page = urllib2.urlopen("http://fullfact.org/search/?q=" + l + "&page=" + str(page_number)).read() except: break soup = BeautifulSoup(page, "lxml") soup.prettify() links = soup.findAll('a', {"rel": "bookmark"}, href=True) if len(links) != 0: for anchor in links: urls_[anchor['href']] = [l, page_number] print "adding " + str(anchor['href']) if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break else: print("break!") break claims = [] index = 0 # visiting each article's dictionary and extract the content. for url in urls_.keys(): print str(index) + "/" + str(len(urls_)) + " extracting " + str(url) index += 1 claim_ = claim_obj.Claim() claim_.setSource("fullfact") try: url_complete = "http://fullfact.org" + url claim_.setUrl(url_complete) page = urllib2.urlopen(url_complete).read() soup = BeautifulSoup(page, "lxml") soup.prettify() #claim claim = soup.find('div', {"class": "col-xs-12 col-sm-6 col-left"}) if claim: claim_.setClaim(claim.get_text().replace("\nClaim\n", "")) #conclusin conclusion = soup.find('div', {"class": "col-xs-12 col-sm-6 col-right"}) if conclusion: claim_.setConclusion(conclusion.get_text().replace( "\nConclusion\n", "")) #title title = soup.find("div", { "class": "container main-container" }).find('h1') claim_.setTitle(title.text) #date date = soup.find("p", {"class": "hidden-xs hidden-sm date updated"}) claim_.setDate( dateparser.parse(date.get_text().replace( "Published:", "")).strftime("%Y-%m-%d")) #body body = soup.find("div", {"class": "article-post-content"}) claim_.setBody(body.get_text()) #related links divTag = soup.find("div", {"class": "row"}) related_links = [] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) claims.append(claim_.getDict()) except: print "error=>" + url_complete #creating a pandas dataframe pdf = pd.DataFrame(claims) return pdf
def get_all_claims(criteria): headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' } #print criteria.maxClaims #performing a search by each letter, and adding each article to a urls_ var. now = datetime.datetime.now() urls_ = {} types = [ "true", "mostly-true", "half-true", "barely-true", "false", "pants-fire", "no-flip", "half-flip", "full-flop" ] last_page = [] for page_number in range(1, 500): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break url = "https://www.channel4.com/news/factcheck/page/" + str( page_number) #url="http://www.politifact.com/truth-o-meter/rulings/"+str(type_)+"/?page="+str(page_number) try: page = requests.get(url, headers=headers, timeout=5) soup = BeautifulSoup(page.text, "lxml") soup.prettify() links = soup.findAll("li", {"class": "feature factcheck"}) if (len(links) != 0) or (links != last_page): for anchor in links: anchor = anchor.find('a', {"class": "permalink"}, href=True) ind_ = str(anchor['href']) if (ind_ not in urls_.keys()): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break if (ind_ not in criteria.avoid_url): urls_[ind_] = ind_ print "adding " + str(ind_) last_page = links else: print("break!") break except: print "error=>" + str(url) claims = [] index = 0 # visiting each article's dictionary and extract the content. for url, conclusion in urls_.iteritems(): print str(index) + "/" + str(len( urls_.keys())) + " extracting " + str(url) index += 1 url_complete = str(url) #print url_complete try: page = requests.get(url_complete, headers=headers, timeout=5) soup = BeautifulSoup(page.text, "lxml") soup.prettify("utf-8") claim_ = claim_obj.Claim() claim_.setUrl(url_complete) claim_.setSource("channel4") if (criteria.html): claim_.setHtml(soup.prettify("utf-8")) #title #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1): title = soup.find("div", { "class": "factcheck-article-header" }).find("h1").get_text() claim_.setTitle(title) #date date_ = soup.find('li', {"class": "pubDateTime"}) #print date_["content"] if date_: date_str = search_dates( date_['data-time'])[0][1].strftime("%Y-%m-%d") #print date_str claim_.setDate(date_str) #print claim_.date #body body = soup.find("div", {"class": "article-body article-main"}) claim_.setBody(body.get_text()) #related links divTag = soup.find("div", {"class": "article-body article-main"}) related_links = [] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) claim_.setClaim(title) conclusion_ = body.find_all("h2", text=lambda t: t and "verdict" in t) if conclusion_: conclusion_str = conclusion_[0].find_next_sibling("p").text claim_.setConclusion(conclusion_str) tags = [] for tag in soup.findAll('meta', {"property": "article:tag"}): #print "achou" tags.append(tag["content"]) claim_.setTags(", ".join(tags)) # if (claim_.conclusion.replace(" ","")=="" or claim_.claim.replace(" ","")==""): # print claim_.conclusion # print claim_.claim # raise ValueError('No conclusion or claim') claims.append(claim_.getDict()) except: print "Error ->" + str(url_complete) #creating a pandas dataframe pdf = pd.DataFrame(claims) return pdf
def get_all_claims(criteria): print criteria.maxClaims #performing a search by each letter, and adding each article to a urls_ var. now = datetime.datetime.now() urls_ = {} for page_number in range(1, 500): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break try: url = "https://correctiv.org/echtjetzt/artikel/seite/" + str( page_number) + "/" page = urllib2.urlopen(url).read() except: break soup = BeautifulSoup(page, "lxml") soup.prettify() links = soup.findAll('a', {"class": "entry-list-item__link"}, href=True) if len(links) != 0: for anchor in links: url_to_add = "https://correctiv.org" + str(anchor['href']) if (url_to_add not in urls_.keys()): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break urls_[url_to_add] = page_number print "adding " + str(url_to_add) else: print("break!") break claims = [] index = 0 # visiting each article's dictionary and extract the content. for url, conclusion in urls_.iteritems(): print str(index) + "/" + str(len( urls_.keys())) + " extracting " + str(url) index += 1 url_complete = str(url) #print url_complete try: page = urllib2.urlopen(url_complete).read().decode( 'utf-8', 'ignore') soup = BeautifulSoup(page, "lxml") soup.prettify("utf-8") claim_ = claim_obj.Claim() claim_.setUrl(url_complete) claim_.setSource("correctiv") if (criteria.html): claim_.setHtml(soup.prettify("utf-8")) #title #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1): title = soup.find("h1", {"class": "article-header__headline"}) claim_.setTitle( title.text.replace("Faktencheck:", "").replace("\n", "")) date_ = soup.find('time', {"class": "article-body__publishing-date"}) #print date_["content"] if date_: date_str = search_dates( date_['title'].split("T")[0])[0][1].strftime("%Y-%m-%d") #print date_str claim_.setDate(date_str) #print claim_.date #body body = soup.find("div", {"class": "article-body__main"}) claim_.setBody(body.get_text()) #related links divTag = soup.find("div", {"class": "article-body__main"}) related_links = [] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) claim_.setClaim(claim_.title) conclsion = soup.find( 'div', {"class": "article-body__claimreview claimreview"}) if conclsion: claim_.setConclusion( conclsion.text.replace("Unsere Bewertung: ", "").replace("\n", "")) tags = [] for tag in soup.findAll('meta', {"property": "article:tag"}): #print "achou" tags.append(tag["content"]) claim_.setTags(", ".join(tags)) claims.append(claim_.getDict()) except: print "Error ->" + str(url_complete) #creating a pandas dataframe pdf = pd.DataFrame(claims) return pdf
def get_all_claims(criteria): headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'} #print criteria.maxClaims #performing a search by each letter, and adding each article to a urls_ var. now = datetime.datetime.now() urls_={} types=["a"] last_page=[] for type_ in types: for page_number in range (1,500): if (criteria.maxClaims > 0 and len(urls_)>= criteria.maxClaims): break try: #url="http://www.politifact.com/truth-o-meter/rulings/"+str(type_)+"/?page="+str(page_number) url="https://www.truthorfiction.com/page/"+str(page_number)+"/?s="+str(type_) page = requests.get(url, headers=headers, timeout=5) soup = BeautifulSoup(page.text,"lxml") soup.prettify() links = soup.findAll("h2",{"class":"grid-title"}) if (len(links) > 0) and (links != last_page): for anchor in links: anchor = anchor.find('a', href=True) ind_=str(anchor['href']) if (ind_ not in urls_.keys()): if (criteria.maxClaims > 0 and len(urls_)>= criteria.maxClaims): break if (ind_ not in criteria.avoid_url): urls_[ind_]=anchor.get_text() print "adding "+str(ind_) last_page = links else: print ("break!") break except: print "error=>"+str(url) claims=[] index=0 # visiting each article's dictionary and extract the content. for url,title_claim in urls_.iteritems(): print str(index) + "/"+ str(len(urls_.keys()))+ " extracting "+str(url) index+=1 url_complete=str(url) #print #print url_complete try: page = requests.get(url_complete, headers=headers, timeout=5) soup = BeautifulSoup(page.text,"lxml") soup.prettify("utf-8") claim_ = claim_obj.Claim() claim_.setUrl(url_complete) claim_.setSource("truthorfiction") if (criteria.html): claim_.setHtml(soup.prettify("utf-8")) #title #title=title_claim.split("-")[0] title=title_claim[:title_claim.rfind("-")] conclusion=title_claim.split("-")[-1:][0].replace("!","") claim_.setTitle(title) #date date_ = soup.find('div', {"class": "post-box-meta-single"}).find("span") #print date_["content"] if date_ : date_str=search_dates(date_.text.replace(",",""), settings={'DATE_ORDER': 'MDY'})[0][1].strftime("%Y-%m-%d") #print date_str claim_.setDate(date_str) #print claim_.date #body body=soup.find("div",{"class":"inner-post-entry"}) claim_.setBody(body.get_text()) #related links divTag = soup.find("div",{"class":"inner-post-entry"}) related_links=[] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) claim_.setClaim(title) claim_.setConclusion(conclusion) tags=[] for tag in soup.findAll('a', {"rel":"tag"}, href=True): tag_str=tag.text tags.append(tag_str) claim_.setTags(", ".join(tags)) if (claim_.conclusion.replace(" ","")=="" or claim_.claim.replace(" ","")==""): raise ValueError('No conclusion or claim') claims.append(claim_.getDict()) except: print "Error ->" + str(url_complete) #creating a pandas dataframe pdf=pd.DataFrame(claims) return pdf
def get_all_claims(criteria): headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' } urls_ = {} for page_number in range(1, 500): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break try: url = "https://checkyourfact.com/page/" + str(page_number) + "/" page = requests.get(url, headers=headers, timeout=10) soup = BeautifulSoup(page.text, "lxml") soup.prettify() except: break links = soup.find('articles').findAll('a', href=True) if len(links) != 0: for anchor in links: ind_ = "http://checkyourfact.com" + str(anchor['href']) if (ind_ not in urls_.keys()): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break if (ind_ not in criteria.avoid_url): urls_[ind_] = page_number print "adding " + str(ind_) else: print("break!") break claims = [] index = 0 # visiting each article's dictionary and extract the content. for url, conclusion in urls_.iteritems(): print str(index) + "/" + str(len( urls_.keys())) + " extracting " + str(url) index += 1 url_complete = str(url) #print url_complete try: page = requests.get(url_complete, headers=headers, timeout=5) soup = BeautifulSoup(page.text, "lxml") soup.prettify("utf-8") claim_ = claim_obj.Claim() claim_.setUrl(url_complete) claim_.setSource("checkyourfact") if (criteria.html): claim_.setHtml(soup.prettify("utf-8")) #title title = soup.find('article').find("h1") claim_.setTitle(title.text.replace("FACT CHECK: ", "")) date_str = search_dates( url_complete.replace("http://dailycaller.com/", "").replace("/", " "), settings={'DATE_ORDER': 'YMD'})[0][1].strftime("%Y-%m-%d") #print date_str claim_.setDate(date_str) #print claim_.date #body body = soup.find("article") claim_.setBody(body.get_text()) #related links divTag = soup.find("article") related_links = [] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) claim_.setClaim(claim_.title) for strong in soup.find('article').findAll('strong'): if "Verdict:" in strong.text: claim_.setConclusion(strong.text.replace("Verdict: ", "")) tags = [] for tag in soup.findAll('meta', {"property": "article:tag"}): #print "achou" tags.append(tag["content"]) claim_.setTags(", ".join(tags)) claims.append(claim_.getDict()) except: print "Error ->" + str(url_complete) #creating a pandas dataframe pdf = pd.DataFrame(claims) return pdf
def get_all_claims(criteria): #performing a search by each letter, and adding each article to a urls_ var. now = datetime.datetime.now() urls_ = {} letters = [ "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "m", "o", "p", "q", "x", "y", "z" ] letters = ["a"] for l in letters: for page in range(1, 500): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break try: print("http://www.mimikama.at/page/" + str(page) + "/?s=" + l) page = urllib2.urlopen("http://www.mimikama.at/page/" + str(page) + "/?s=" + l).read() except: break soup = BeautifulSoup(page, "lxml") soup.prettify() links = soup.find('div', { "class": "td-ss-main-content" }).findAll('a', {"rel": "bookmark"}, href=True) if len(links) != 0: for anchor in links: if (anchor['href'] not in urls_.keys()): urls_[anchor['href']] = l print "adding " + str(anchor['href']) if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break else: print("break!") break claims = [] index = 0 # visiting each article's dictionary and extract the content. for url in urls_.keys(): try: print str(index) + "/" + str(len( urls_.keys())) + " extracting " + str(url) index += 1 claim_ = claim_obj.Claim() claim_.setSource("mimikama") url_complete = url claim_.setUrl(url_complete) page = urllib2.urlopen(url_complete, timeout=5).read() soup = BeautifulSoup(page, "lxml") soup.prettify() #conclusin # conclusion=soup.find('div', {"class": "td-post-content"}).find('h2') # if conclusion : # claim_.setConclusion(conclusion.get_text()) #title title = soup.find("h1", {"class": "entry-title"}) claim_.setTitle(title.text) #claim #claim = soup.find('div', {"class": "td-post-content"}).find('h2') #if claim and claim.find_previous('strong'): # claim_.setClaim(claim.find_previous('strong').get_text()) #else: claim_.setClaim(claim_.title) #date date = soup.find("time", {"class": "entry-date updated td-module-date"}) #print date #print (search_dates(date.get_text())[0][1].strftime("%Y-%m-%d")) claim_.setDate( search_dates(date.get_text())[0][1].strftime("%Y-%m-%d")) #related links divTag = soup.find("div", {"class": "td-post-content"}) related_links = [] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) body = soup.find("div", {"class": "td-post-content"}) claim_.setBody(body.get_text()) claims.append(claim_.getDict()) except: print "Erro =>" + url #creating a pandas dataframe pdf = pd.DataFrame(claims) return pdf
def get_all_claims(criteria): #performing a search by each letter, and adding each article to a urls_ var. now = datetime.datetime.now() urls_={} for year in range (2015,now.year+1): for month in range (1,13): if (criteria.maxClaims > 0 and len(urls_)>= criteria.maxClaims): break try: page = urllib2.urlopen("http://piaui.folha.uol.com.br/lupa/"+str(year)+"/"+str(month)+"/").read() except: break soup = BeautifulSoup(page,"lxml") soup.prettify() links = soup.find('div', {"class": "lista-noticias"}).findAll('a', href=True) if len(links) != 0: for anchor in links: if (anchor['href'] not in urls_.keys()): urls_[anchor['href']]=[year,month] print "adding "+str(anchor['href']) if (criteria.maxClaims > 0 and len(urls_)>= criteria.maxClaims): break else: print ("break!") break claims=[] index=0 # visiting each article's dictionary and extract the content. for url in urls_.keys(): print str(index) + "/"+ str(len(urls_.keys()))+ " extracting "+str(url) index+=1 try: claim_ = claim_obj.Claim() claim_.setSource("lupa") url_complete=url claim_.setUrl(url_complete) page = urllib2.urlopen(url_complete).read() soup = BeautifulSoup(page,"lxml") soup.prettify() if (criteria.html): claim_.setHtml(soup.prettify()) #conclusin conclusion=soup.find('div', {"class": "etiqueta"}) if conclusion : claim_.setConclusion(conclusion.get_text()) #title title=soup.find("h2", {"class": "bloco-title"}) claim_.setTitle(title.text) #claim claim = soup.find('div', {"class": "post-inner"}).find('div', {"class": "etiqueta"}) if claim and claim.find_previous('strong'): claim_.setClaim(claim.find_previous('strong').get_text()) else: claim_.setClaim(claim_.title) #date date=soup.find("div", {"class": "bloco-meta"}) claim_.setDate(dateparser.parse(date.text.split("|")[0], settings={'DATE_ORDER': 'DMY'}).strftime("%Y-%m-%d")) #related links divTag = soup.find("div", {"class": "post-inner"}) related_links=[] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) #related links body = soup.find("div", {"class": "post-inner"}) claim_.setBody(body.get_text()) # tags tags_ = [t.text for t in soup.findAll('a', {'rel':'tag'})] claim_.setTags(tags_) claims.append(claim_.getDict()) except: print "error=>"+str(url_complete) #creating a pandas dataframe pdf=pd.DataFrame(claims) return pdf
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    # print criteria.maxClaims
    # walking the listing pages and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    types = [
        "true", "mostly-true", "half-true", "barely-true", "false",
        "pants-fire", "no-flip", "half-flip", "full-flop"
    ]
    last_page = []
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        url = "http://factscan.ca/page/" + str(page_number) + "/"
        # url = "http://www.politifact.com/truth-o-meter/rulings/" + str(type_) + "/?page=" + str(page_number)
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll(
                "h1", {"class": "post-title entry-title home-feed-title"})
            if (len(links) != 0) or (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', href=True)
                    ind_ = str(anchor['href'])
                    if (ind_ not in urls_.keys()):
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
                        if (ind_ not in criteria.avoid_url):
                            urls_[ind_] = ind_
                            print "adding " + str(ind_)
                last_page = links
            else:
                print("break!")
                break
        except:
            print "error=>" + str(url)

    claims = []
    index = 0
    # visiting each article and extracting the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1
        url_complete = str(url)
        # print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("factscan")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            # print url_complete
            # import ast
            json_ = None
            if (soup.find("script", {"type": "application/ld+json"})):
                json_ = soup.find("script", {
                    "type": "application/ld+json"
                }).get_text()

            def parse_wrong_json(json_, left, right):
                # crude extraction of a value from the page's ld+json blob
                if json_ and len(json_.split(left)) > 1:
                    return json_.split(left)[1].split(right)[0]
                return None

            # title
            title = soup.find("meta", {"property": "og:title"})['content']
            claim_.setTitle(title)

            # date
            date_ = soup.find('meta', {"property": "article:published_time"})
            # print date_["content"]
            if date_:
                date_str = search_dates(
                    date_['content'].split("T")[0])[0][1].strftime("%Y-%m-%d")
                # print date_str
                claim_.setDate(date_str)
            # print claim_.date

            # rating
            claim_.setRatingValue(
                parse_wrong_json(json_, '"ratingValue":', ","))
            claim_.setWorstRating(
                parse_wrong_json(json_, '"worstRating":', ","))
            claim_.setBestRating(parse_wrong_json(json_, '"bestRating":', ","))
            claim_.setAlternateName(
                parse_wrong_json(json_, '"alternateName":', ","))

            # when there is no json
            if (claim_.alternateName == None):
                # print "no conclusion found"
                if (soup.find("div", {"class": "fact-check-icon"})):
                    if (soup.find("div", {
                            "class": "fact-check-icon"
                    }).find('img')):
                        claim_str = soup.find("div", {
                            "class": "fact-check-icon"
                        }).find('img')['alt'].split(":")[1]
                        # print claim_str
                        claim_.alternateName = claim_str
            # print claim_.alternateName

            # body
            body = soup.find("div", {"class": "entry-content"})
            claim_.setBody(body.get_text())

            # author
            author = soup.find("div", {"class": "sharethefacts-speaker-name"})
            if (author):
                claim_.setAuthor(author.get_text())

            # sameAs
            claim_.setSameAs(parse_wrong_json(json_, '"sameAs": [', "]"))
            # obj = soup.find("div", {"itemprop": "itemReviewed"})
            # if (obj and obj.find("div", {"itemprop": "datePublished"})):
            #     print parse_wrong_json(json_, '"}, "datePublished":', ",")
            #     claim_.setDatePublished()

            # related links
            divTag = soup.find("div", {"class": "entry-content"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            # claim text
            if (soup.find("div", {"class": "sharethefacts-statement"})):
                claim_.setClaim(
                    soup.find("div", {
                        "class": "sharethefacts-statement"
                    }).get_text())
            else:
                claim_.setClaim(claim_.title)

            # conclusion
            claim_.setConclusion(
                soup.find("div", {
                    "class": "fact-check-icon-loop"
                }).find('img')['alt'].replace("FactsCan Score: ", ""))

            # tags
            tags = []
            for tag in soup.findAll('meta', {"property": "article:tag"}):
                # print "found"
                tags.append(tag["content"])
            claim_.setTags(", ".join(tags))

            if (claim_.conclusion.replace(" ", "") == ""
                    or claim_.claim.replace(" ", "") == ""):
                # print claim_.claim
                # print claim_.conclusion
                raise ValueError('No conclusion or claim')

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
def getNews(self, folderpath, top=5):
    event2timeScope = {
        "Gabapentin_0628_0121":
            ["2017-06-28T00:00:00Z", "2018-01-21T00:00:00Z"],
        "SanctuaryCities_0516_0523":
            ["2018-05-16T00:00:00Z", "2018-05-23T00:00:00Z"],
        "WhereAreTheChildren_0418_0527":
            ["2018-04-18T00:00:00Z", "2018-05-27T00:00:00Z"],
        "Ingraham_0618_0624":
            ["2018-06-18T00:00:00Z", "2018-06-24T00:00:00Z"],
        "ItsJustAJacket_0621_0624":
            ["2018-06-21T00:00:00Z", "2018-06-24T00:00:00Z"],
        "immigrants_0622_0624":
            ["2018-06-22T00:00:00Z", "2018-06-24T00:00:00Z"],
        "JetLi_0519_0523":
            ["2018-05-19T00:00:00Z", "2018-05-23T00:00:00Z"],
        "BandyLee_0110_0115":
            ["2018-01-10T00:00:00Z", "2018-01-15T00:00:00Z"],
        "JackBreuer_1228_0115":
            ["2017-12-28T00:00:00Z", "2018-01-15T00:00:00Z"],
        "SouthwestKey_0620_0624":
            ["2018-06-20T00:00:00Z", "2018-06-24T00:00:00Z"],
        "Capriccio_0516_0523_new":
            ["2018-05-16T00:00:00Z", "2018-05-23T00:00:00Z"]
    }
    if not self.getSimilarity:
        skipthoughts_model = {
            "name": "skipthoughts",
            "modelPath": self.rootpath + "/.." +
            "/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/exp_vocab",
            "checkpointPath": "model.ckpt-501424"
        }
        sent2vec_model = {
            "name": "sent2vec",
            "modelPath": "/lustre/scratch/haoxu/twitter_bigrams.bin"
        }
        self.getSimilarity = Claim.GetSimilarity(self.rootpath,
                                                 self.folderpath,
                                                 skipthoughts_model)
    folderPath = os.path.join(self.folderpath, "final")
    rankedClusterClaims = self.helper.loadJson(
        folderPath + "/ranked_cluster_claims.json")
    count = 0
    for index, info in enumerate(rankedClusterClaims):
        if count >= top:
            continue
        count += 1
        query = info[0]
        start = event2timeScope[folderpath][0]
        end = event2timeScope[folderpath][1]
        # find similar news
        alylienNewsAPI = Information.AylienNewsAPI()
        news = alylienNewsAPI.getNews(query, start, end, 10)
        if len(news) == 0:
            print("no news.")
            print("folder ", folderpath)
            print("info ", info)
            continue
        titles = alylienNewsAPI.getTitles(news)
        # find final news based on the most similar news
        query = self.getSimilarity.getSimilarNews(query, titles)
        finalNews = alylienNewsAPI.getNews(query, start, end, 10)
        self.helper.dumpPickle(folderPath + "/news_bk",
                               str(index) + "_news.pickle", finalNews)
        finalNewsDict = [i.to_dict() for i in finalNews]
        self.helper.dumpJson(folderPath + "/news",
                             str(index) + "_news.json", finalNewsDict)
        print("{}th claim: news has been saved.".format(index))
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    # print criteria.maxClaims
    # walking the listing pages and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    types = [
        "true", "mostly-true", "half-true", "barely-true", "false",
        "pants-fire", "no-flip", "half-flip", "full-flop"
    ]
    last_page = []
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        url = "https://africacheck.org/latest-reports/page/" + str(
            page_number) + "/"
        # url = "http://www.politifact.com/truth-o-meter/rulings/" + str(type_) + "/?page=" + str(page_number)
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll("div", {"class": "article-content"})
            if (len(links) != 0) or (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', href=True)
                    ind_ = str(anchor['href'])
                    if (ind_ not in urls_.keys()):
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
                        if (ind_ not in criteria.avoid_url):
                            urls_[ind_] = ind_
                            print "adding " + str(ind_)
                last_page = links
            else:
                print("break!")
                break
        except:
            print "error=>" + str(url)

    claims = []
    index = 0
    # visiting each article and extracting the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1
        url_complete = str(url)
        # print url_complete
        # try:
        page = requests.get(url_complete, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify("utf-8")

        claim_ = claim_obj.Claim()
        claim_.setUrl(url_complete)
        claim_.setSource("africacheck")

        if (criteria.html):
            claim_.setHtml(soup.prettify("utf-8"))

        # title
        title = soup.find("meta", {"property": "og:title"})
        claim_.setTitle(title['content'])

        # date
        date_ = soup.find('time')
        # print date_["content"]
        if date_:
            date_str = search_dates(
                date_['datetime'].split(" ")[0])[0][1].strftime("%Y-%m-%d")
            # print date_str
            claim_.setDate(date_str)
        # print claim_.date

        # rating
        conclusion_ = ""
        if (soup.find("div", {"class": "verdict-stamp"})):
            conclusion_ = soup.find("div", {
                "class": "verdict-stamp"
            }).get_text()
        if (soup.find("div", {"class": "verdict"})):
            conclusion_ = soup.find("div", {"class": "verdict"}).get_text()
        if (soup.find("div", {"class": "indicator"})):
            conclusion_ = soup.find("div", {"class": "indicator"}).get_text()
            if (soup.find("div", {"class": "indicator"}).find('span')):
                conclusion_ = soup.find("div", {
                    "class": "indicator"
                }).find('span').get_text()
        claim_.setAlternateName(
            str(re.sub('[^A-Za-z0-9\ -]+', '', conclusion_)).lower().strip())

        # when there is no json
        date_ = soup.find("time", {"class": "datetime"})
        if (date_):
            claim_.setDate(date_.get_text())
        # print claim_.alternateName

        # body
        body = soup.find("div", {"id": "main"})
        claim_.setBody(body.get_text())

        # author
        author = soup.find("div", {"class": "sharethefacts-speaker-name"})
        if (author):
            claim_.setAuthor(author.get_text())

        # related links
        divTag = soup.find("div", {"id": "main"})
        related_links = []
        for link in divTag.findAll('a', href=True):
            related_links.append(link['href'])
        claim_.setRefered_links(related_links)

        # claim text
        if (soup.find("div", {"class": "report-claim"})):
            claim_.setClaim(
                soup.find("div", {
                    "class": "report-claim"
                }).find("strong").get_text())
        else:
            claim_.setClaim(claim_.title)

        # tags
        tags = []
        for tag in soup.findAll('meta', {"property": "article:tag"}):
            # print "found"
            tags.append(tag["content"])
        claim_.setTags(", ".join(tags))

        claims.append(claim_.getDict())
        # except:
        #     print "Error ->" + str(url_complete)

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
def exactractionClaim(page, url, maxClaims):
    global urlTraite, urls_, claims, idClaim, uriSansClaim, nbClaims
    if nbClaims < maxClaims:
        print(str(nbClaims) + "/" + str(maxClaims) + " extracting " + str(url))
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        claim_ = claim_obj.Claim()
        claim = soup.find('div', {"class": "col-xs-12 col-sm-6 col-left"})
        # if the page contains a claim and a conclusion
        if claim:
            nbClaims += 1
            claim_.setSource("fullfact")
            claim_.setUrl(url)
            claim_.setClaim(claim.get_text().replace("\nClaim\n", ""))
            claim_.setIdClaim(idClaim)
            # text of the conclusion
            conclusion = soup.find('div',
                                   {"class": "col-xs-12 col-sm-6 col-right"})
            if conclusion:
                claim_.setConclusion(conclusion.get_text().replace(
                    "\nConclusion\n", ""))
                c = conclusion.get_text().replace("\nConclusion\n", "")
                fonct = TraitementConclusion.fonctionPrincipale(c)
                claim_.setVerdictTompo(fonct)
            title = soup.find("div", {"class": "header"})
            t = ""
            if title:
                t = title.find("h1").get_text()
            claim_.setTitle(t)
            date = soup.find("p", {"class": "date"})
            d = ""
            if date:
                d = date.find("span").get_text()
            claim_.setDate(d)
            # text of the review article
            body = soup.find("div", {"class": "article-post-content"})
            if body:
                liensRevue = []
                text = []
                bod = body.find("div", class_=False, id=False)
                if bod:
                    for b in bod.findAll("p"):
                        for link in b.findAll('a', href=True):
                            liensRevue.append(link['href'])
                        text.append(b.get_text())
                result = " ".join(text)
                claim_.setLiensRevue(liensRevue)
                claim_.setBody(result)
            # extract the name of the claim's category (rubrique)
            categories = soup.find('ol', {"class": "breadcrumb"})
            if categories:
                rub = []
                for c in categories.findAll('a', href=True):
                    rub.append(c.get_text())
                rubri = rub[1].lower()
                claim_.setRubrique(rubri)
            # extract the claims listed in the current claim's "related posts" section
            relp = getPosts.getRelatedPosts(soup)
            # call the routine that extracts the keywords/topics for which the
            # claims were grouped together in "related posts"
            l = relationsEntreLesClaims.relationClaims(1, "RelatedPosts", relp,
                                                       RP=True)
            motsCles = l[-1]
            if not (rub[-1].lower() == "online"):
                motsCles.append(rub[-1])
            print("\n common subjects -section related posts-: " +
                  str(motsCles))
            del l[-1]
            # store the URLs of the "related posts" claims and the associated
            # keywords in the current claim's relatedPosts attribute
            claim_.setRelated_posts("RelatedPosts", l)
            claim_.setKeyWordsRP("RelatedPosts", motsCles)
            # case where several claims/conclusions are covered by the same review
            autresClaims = soup.find_all('div', {"class": "briefAdditionalRows"})
            if autresClaims:
                nbClaims += len(autresClaims)
                if nbClaims > maxClaims:
                    return "break"
                for row in autresClaims:
                    c = additionalRows.briefAdditionalRows(
                        row, result, url, idClaim, l, rubri, motsCles, t,
                        liensRevue, d)
                    if c != "empty":
                        claims.append(c.getDict())
                        idClaim += 1
            # store the current claim under its category (needed in the
            # clustering step to infer whether claims from the same category
            # are related)
            triClaimsParRubrique(rubri, claim_)
            # recursive call on the related-posts claims
            if len(relp) != 0:
                for r in relp:
                    if not (r[0] in urlTraite):
                        try:
                            page = urlopen(r[0]).read()
                            urlTraite.append(r[0])
                            exactractionClaim(page, r[0], maxClaims)
                        except:
                            continue
        # if the scraped page contains no claim/conclusion, just a review
        else:
            uriSansClaim += 1
            print("page : " + url + " without a claim!")
            ls = soup.findAll('a', href=True)
            if len(ls) != 0:
                for anchor in ls:
                    u = "http://fullfact.org" + anchor['href'].replace(
                        "?utm_source=content_page&utm_medium=related_content",
                        "")
                    if (not (u in urls_) and not (u in urlTraite)):
                        urls_.append(u)
    else:
        return "break"
def addClaim(self, customer, date, incident_description, claim_amount):
    a = Claim(customer, date, incident_description, claim_amount)
    self.claims.append(a)
    print(a.ID)
    return a.ID
def get_all_claims(criteria): headers = { 'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36' } #print criteria.maxClaims #performing a search by each letter, and adding each article to a urls_ var. now = datetime.datetime.now() urls_ = {} last_page = [] for page_number in range(1, 500): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break url = "https://theferret.scot/category/fact-check/page/" + str( page_number) + "/" #try: page = requests.get(url, headers=headers, timeout=5) soup = BeautifulSoup(page.text, "lxml") soup.prettify() links = soup.findAll("h1", {"class": "entry-title"}) if (len(links) != 0) or (links != last_page): for anchor in links: anchor = anchor.find('a', {"rel": "bookmark"}, href=True) ind_ = str(anchor['href']) if (ind_ not in urls_.keys()): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break urls_[ind_] = page print "adding " + str(ind_) last_page = links else: print("break!") break #except: # print "error=>"+str(url) claims = [] index = 0 # visiting each article's dictionary and extract the content. for url, conclusion in urls_.iteritems(): print str(index) + "/" + str(len( urls_.keys())) + " extracting " + str(url) index += 1 url_complete = str(url) #print url_complete try: page = requests.get(url_complete, headers=headers, timeout=5) soup = BeautifulSoup(page.text, "lxml") soup.prettify("utf-8") claim_ = claim_obj.Claim() claim_.setUrl(url_complete) claim_.setSource("theferret") if (criteria.html): claim_.setHtml(soup.prettify("utf-8")) #title #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1): title = soup.find("h1", {"class": "cover-title"}) claim_.setTitle(title.text) #date date_ = soup.find('div', {"class": "widget__content"}).find("p") #print date_["content"] if date_: date_str = search_dates(date_.text)[0][1].strftime("%Y-%m-%d") #print date_str claim_.setDate(date_str) #print claim_.date #body body = soup.find("div", {"class": "article__text"}) claim_.setBody(body.get_text()) #related links divTag = soup.find("div", {"class": "article__text"}) related_links = [] for link in divTag.findAll('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) claim_.setClaim(soup.find("h1", {"class": "article__title"}).text) claim_.setConclusion(conclusion) tags = [] for tag in soup.findAll('meta', {"property": "article:tag"}): #print "achou" tags.append(tag["content"]) claim_.setTags(", ".join(tags)) claims.append(claim_.getDict()) except: print "Error ->" + str(url_complete) #creating a pandas dataframe pdf = pd.DataFrame(claims) return pdf
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.5 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }
    # print criteria.maxClaims
    # performing a search by each rating type, and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    types = [
        "true", "mostly-true", "half-true", "barely-true", "false",
        "pants-fire", "no-flip", "half-flip", "full-flop"
    ]
    last_page = []
    for type_ in types:
        for page_number in range(1, 500):
            if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
                break
            url = "http://www.politifact.com/truth-o-meter/rulings/" + str(
                type_) + "/?page=" + str(page_number)
            try:
                page = requests.get(url, headers=headers, timeout=5)
                soup = BeautifulSoup(page.text, "lxml")
                soup.prettify()
                links = soup.findAll("p", {"class": "statement__text"})
                if (len(links) != 0) or (links != last_page):
                    for anchor in links:
                        anchor = anchor.find('a', {"class": "link"}, href=True)
                        ind_ = "http://www.politifact.com" + str(
                            anchor['href'])
                        if (ind_ not in urls_.keys()):
                            if (criteria.maxClaims > 0
                                    and len(urls_) >= criteria.maxClaims):
                                break
                            if (ind_ not in criteria.avoid_url):
                                urls_[ind_] = type_
                                print "adding " + str(ind_)
                    last_page = links
                else:
                    print("break!")
                    break
            except:
                print "error=>" + str(url)

    claims = []
    index = 0
    # visiting each article and extracting the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1
        url_complete = str(url)
        # print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("politifact")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            # title
            title = soup.find("h1", {"class": "article__title"})
            claim_.setTitle(title.text)

            # date
            date_ = soup.find('div', {"class": "widget__content"}).find("p")
            # print date_["content"]
            if date_:
                date_str = search_dates(date_.text)[0][1].strftime("%Y-%m-%d")
                # print date_str
                claim_.setDate(date_str)
            # print claim_.date

            # rating
            obj = soup.find("div", {"itemprop": "reviewRating"})
            if (obj):
                claim_.ratingValue = obj.find("div", {
                    "itemprop": "ratingValue"
                }).text
                claim_.worstRating = obj.find("div", {
                    "itemprop": "worstRating"
                }).text
                claim_.bestRating = obj.find("div", {
                    "itemprop": "bestRating"
                }).text
                claim_.alternateName = obj.find("div", {
                    "itemprop": "alternateName"
                }).text
            else:
                claim_.setConclusion(conclusion)

            # body
            body = soup.find("div", {"class": "article__text"})
            claim_.setBody(body.get_text())

            # author
            author = soup.find("div", {"itemprop": "itemReviewed"})
            if (author and author.find("div", {"itemprop": "author"})):
                claim_.setAuthor(
                    author.find("div", {
                        "itemprop": "author"
                    }).find("div", {
                        "itemprop": "name"
                    }).get_text())

            # sameAs
            obj = soup.find("div", {"itemprop": "itemReviewed"})
            if (obj and obj.find("div", {"itemprop": "sameAs"})):
                claim_.setSameAs(
                    obj.find("div", {
                        "itemprop": "sameAs"
                    }).get_text())

            # date published
            obj = soup.find("div", {"itemprop": "itemReviewed"})
            if (obj and obj.find("div", {"itemprop": "datePublished"})):
                claim_.setDatePublished(
                    obj.find("div", {
                        "itemprop": "datePublished"
                    }).get_text())

            # related links
            divTag = soup.find("div", {"class": "article__text"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            # claim text
            claim_.setClaim(soup.find("h1", {"class": "article__title"}).text)

            # tags
            tags = []
            for tag in soup.findAll('meta', {"property": "article:tag"}):
                # print "found"
                tags.append(tag["content"])
            claim_.setTags(", ".join(tags))

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

    # creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
def get_all_claims(criteria): print criteria.maxClaims #performing a search by each letter, and adding each article to a urls_ var. now = datetime.datetime.now() urls_ = {} for type_ in [ "verdadeiro", "impreciso", "exagerado", "contraditorio", "insustentavel", "falso" ]: for page_number in range(1, 500): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break try: page = urllib2.urlopen( "http://aosfatos.org/noticias/checamos/" + str(type_) + "/?page=" + str(page_number)).read() except: break soup = BeautifulSoup(page, "lxml") soup.prettify() links = soup.findAll('a', {"class": "card third"}, href=True) if len(links) != 0: for anchor in links: if (anchor['href'] not in urls_.keys()): if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims): break urls_[anchor['href']] = type_ print "adding " + str(anchor['href']) else: print("break!") break claims = [] index = 0 # visiting each article's dictionary and extract the content. for url, conclusion in urls_.iteritems(): print str(index) + "/" + str(len( urls_.keys())) + " extracting " + str(url) index += 1 url_complete = "https://aosfatos.org/" + str(url) #print url_complete page = urllib2.urlopen(url_complete).read().decode('utf-8', 'ignore') soup = BeautifulSoup(page, "lxml") soup.prettify("utf-8") for claim_element in soup.findAll("blockquote"): claim_ = claim_obj.Claim() claim_.setUrl(url_complete) claim_.setSource("aosfatos") #date date_ = soup.find('p', {"class": "publish_date"}) if date_: date_str = date_.get_text().replace("\n", "").replace( " ", "").split(",")[0] claim_.setDate(dateparser.parse(date_str).strftime("%Y-%m-%d")) #title title = soup.findAll("h1") claim_.setTitle(title[1].text) #body body = soup.find("article") claim_.setBody(body.get_text().replace("\n", "").replace( "TwitterFacebookE-mailWhatsApp", "")) #related links divTag = soup.find("article").find("hr") related_links = [] for link in divTag.find_all_next('a', href=True): related_links.append(link['href']) claim_.setRefered_links(related_links) #claim claim_.setClaim(claim_element.get_text()) if (claim_element.find_previous_sibling("figure") and claim_element.find_previous_sibling("figure").findAll( "figcaption")): claim_.setConclusion( claim_element.find_previous_sibling("figure").findAll( "figcaption")[-1:][0].get_text()) #print claim_.claim.decode("utf-8") + " ====> " #print claim_.conclusion.decode("utf-8") #print "-->"+ str(claim_.conclusion) claims.append(claim_.getDict()) #creating a pandas dataframe pdf = pd.DataFrame(claims) return pdf
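# The get_all_claims functions above all follow the same skeleton: page through
# a listing, collect article URLs up to criteria.maxClaims, then visit each
# article and map the extracted fields into a Claim dict for a DataFrame. A
# condensed, illustrative sketch of that pattern in Python 3; the listing URL
# template and CSS selectors are placeholders, not any particular site's markup.
import requests
import pandas as pd
from bs4 import BeautifulSoup

def get_all_claims_sketch(listing_tpl, link_selector, max_claims):
    urls = []
    page_number = 1
    while len(urls) < max_claims:
        page = requests.get(listing_tpl.format(page=page_number), timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        links = [a["href"] for a in soup.select(link_selector) if a.get("href")]
        if not links:
            break  # no more listing pages
        urls.extend(links)
        page_number += 1
    claims = []
    for url in urls[:max_claims]:
        soup = BeautifulSoup(requests.get(url, timeout=5).text, "lxml")
        title = soup.find("h1")
        claims.append({
            "url": url,
            "title": title.get_text(strip=True) if title else "",
            "body": soup.get_text(" ", strip=True),
        })
    return pd.DataFrame(claims)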