Example #1
    def getClaims(self, query):
        """Get claims.

        Arguments:
            query {str} -- the initial query
        """
        # folderPath = os.path.join(folderpath, 'final')
        # fullPath = os.path.join(self.rootpath, folderPath)
        self.preprocessData.generateTweetsLines(self.folderpath)

        claimExtractor = Claim.ClaimExtractor(self.rootpath, self.folderpath)

        tweets_list = list(self.helper.getTweet(self.folderpath))
        cleanedTweets = []
        for tweet in tweets_list:
            c1 = self.preprocessData.cleanTweet(tweet.text)
            cleanedTweets.append(c1)
        print("Parsing...")
        mergedNoun, sortedSubject2Number, \
            subject2tweetInfo, parsedTweets = claimExtractor.collectSubject(
                tweets_list, cleanedTweets)
        # sortedSubject2Number = self.helper.loadJson(
        #     os.path.join(self.folderpath, "final", "sorted_subject2number.json"))
        # subject2tweetInfo = self.helper.loadJson(
        #     os.path.join(self.folderpath, "final", "subject2tweetInfo.json"))
        # parsedTweets = self.helper.loadJson(
        #     os.path.join(self.folderpath, "final", "tweets_id2Info.json"))
        candidateClaimsMergedClause, \
            candidateFullClaimsMergedClause = claimExtractor.getCandidateClaims(
                tweets_list, mergedNoun, sortedSubject2Number, subject2tweetInfo,
                parsedTweets, query[1:])

        return candidateClaimsMergedClause, candidateFullClaimsMergedClause
Example #2
    def getClusterClaims(self, query, eps):
        claims, fullClaims = self.getClaims(query)
        # filePath = os.path.join(self.folderpath, "final",
        #                         "candidateClaimsMergedClause.json")
        # print("filePath ", filePath)
        # claims = self.helper.loadJson(filePath)

        # filePath = os.path.join(self.folderpath, "final",
        #                         "candidateFullClaimsMergedClause.json")
        # print("filePath ", filePath)
        # fullClaims = self.helper.loadJson(filePath)

        skipthoughts_model = {
            "name": "skipthoughts",
            "modelPath": self.rootpath + "/.." +
            "/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/exp_vocab",
            "checkpointPath": "model.ckpt-501424"
        }
        sent2vec_model = {
            "name": "sent2vec",
            "modelPath": "/lustre/scratch/haoxu/twitter_bigrams.bin"
        }

        self.getSimilarity = Claim.GetSimilarity(self.rootpath,
                                                 self.folderpath,
                                                 skipthoughts_model)
        tweets_list = list(self.helper.getTweet(self.folderpath))
        cluster2claimsIndexes, cluster2coreSampleIndices = self.getSimilarity.getClusteredClaims(
            claims, tweets_list, eps)
        return cluster2claimsIndexes, cluster2coreSampleIndices, claims, fullClaims
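The eps argument and the core-sample indexes returned by getClusteredClaims suggest a DBSCAN-style clustering of encoded claims. For orientation only, a minimal sketch of that idea with scikit-learn; GetSimilarity's internals are not shown above, so the cosine metric, min_samples value, and helper name below are all assumptions:

import numpy as np
from sklearn.cluster import DBSCAN


def cluster_claim_embeddings(embeddings, eps, min_samples=2):
    """Group claim embeddings with DBSCAN; return {label: [indexes]} plus core sample indexes."""
    clustering = DBSCAN(eps=eps, min_samples=min_samples, metric="cosine").fit(embeddings)
    cluster2indexes = {}
    for idx, label in enumerate(clustering.labels_):
        # label -1 collects noise points that belong to no cluster
        cluster2indexes.setdefault(int(label), []).append(idx)
    return cluster2indexes, clustering.core_sample_indices_.tolist()


# Hypothetical usage with random vectors standing in for encoded claim sentences.
labels, cores = cluster_claim_embeddings(np.random.rand(20, 64), eps=0.3)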
Example #3
def briefAdditionalRows(soup, result, url, idClaim, listeURL, rubri, motsCles,
                        t, liensRevue, d):

    claim = soup.find('div', {"class": "col-xs-12 col-sm-6 col-left"})
    if claim:
        claim_ = claim_obj.Claim()
        claim_.setSource("fullfact")
        claim_.setUrl(url)
        claim_.setClaim(claim.get_text().replace("\nClaim\n", ""))
        claim_.setIdClaim(idClaim)
        claim_.setRubrique(rubri)
        claim_.setKeyWordsRP("RelatedPosts", motsCles)
        claim_.setTitle(t)
        claim_.setDate(d)

        conclusion = soup.find('div',
                               {"class": "col-xs-12 col-sm-6 col-right"})
        if conclusion:
            claim_.setConclusion(conclusion.get_text().replace(
                "\nConclusion\n", ""))
            c = conclusion.get_text().replace("\nConclusion\n", "")
            claim_.setVerdictTompo(TraitementConclusion.fonctionPrincipale(c))

        claim_.setBody(result)

        claim_.setRelated_posts("RelatedPosts", listeURL)

        claim_.setLiensRevue(liensRevue)

        return claim_
    else:
        return "empty"
Example #4
    def readClaim(self, xml):
        claim = Claim.Claim()
        for event in xml.getchildren():
            if event.tag == 'claimtype':
                claim.claimtype = self.readTerm(event)
            elif event.tag == 'label':
                # We store the full protocol,label construct for
                # consistency with the technical parts, so it is left to
                # the __str__ of claim to select the right element
                claim.label = self.readTerm(event)
            elif event.tag == 'protocol':
                claim.protocol = self.readTerm(event)
            elif event.tag == 'role':
                claim.role = self.readTerm(event)
            elif event.tag == 'parameter':
                claim.parameter = self.readTerm(event)

            elif event.tag == 'failed':
                claim.failed = int(event.text)
            elif event.tag == 'count':
                claim.count = int(event.text)
            elif event.tag == 'states':
                claim.states = int(event.text)

            elif event.tag == 'complete':
                claim.complete = True
            elif event.tag == 'timebound':
                claim.timebound = True
            else:
                print >> sys.stderr, "Warning: unknown tag in claim: %s" % event.tag

        claim.analyze()
        return claim
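For context, a minimal sketch of the kind of claim element readClaim iterates over, built with the standard ElementTree module; the child tags mirror the branches handled above, but the exact wire format and what readTerm does with each child are assumptions:

import xml.etree.ElementTree as ET

# Hypothetical input; the real XML may wrap terms in richer markup, and readTerm
# (not shown above) presumably decodes those child elements.
sample = ET.fromstring(
    "<claim>"
    "<claimtype>Secret</claimtype>"
    "<label>l1</label>"
    "<protocol>ns3</protocol>"
    "<role>I</role>"
    "<failed>0</failed>"
    "<count>12</count>"
    "<states>40</states>"
    "<complete />"
    "</claim>")
for event in sample:  # the same children readClaim walks via xml.getchildren()
    print("%s: %s" % (event.tag, event.text))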
Example #5
    def getSimilarTweets4Claim(self):
        # get claims
        claims = list(self.helper.getClaim(self.folderpath))
        # lowercase
        # claims = [claim.lower() for claim in claims]
        # get tweets
        tweets = self.helper.getTweet(self.folderpath)
        cleanedTweets = []
        for tweet in tweets:
            # tweets_list.append(tweet)
            c1 = self.preprocessData.cleanTweet(tweet.text)
            cleanedTweets.append(c1)
        getSimilarity = Claim.GetSimilarity(
            "/home/hao/Workplace/HaoXu/Data/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/exp_vocab",
            "model.ckpt-501424")

        sentences, tweetIndex = getSimilarity.splitSentences(cleanedTweets)
        encodedSentences = getSimilarity.encodeSen(sentences)

        # for index, claim in enumerate(claims):
        encodedClaims = getSimilarity.encodeSen(claims)
        claims2tweets = getSimilarity.getTweets4Claims(sentences,
                                                       encodedSentences,
                                                       claims, encodedClaims,
                                                       tweetIndex)

        for claimID, sentInfos in claims2tweets.items():
            claims2tweets[claimID] = self.preprocessData.sortListofLists(
                sentInfos, False)
        self.helper.dumpJson(self.folderpath,
                             "final/sorted_claims2tweets.json", claims2tweets)
        print("sorted_claims2tweets.json have been saved.")
Example #6
def new_claim(f_link, date, title, tags):
    claim_ = claim_obj.Claim()
    claim_.setUrl(f_link)
    claim_.setTitle(title)
    claim_.setTags(tags)
    date_ = date.strip().split()
    date_ = "-".join([date_[4], date_[2], date_[0]])
    claim_.setDate(dateparser.parse(date_).strftime("%Y-%m-%d"))
    claim_.setSource("publica")
    claim_.setBody("")
    return claim_
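A quick, hedged illustration of the date handling above, using a made-up input string rather than anything taken from the scraper:

import dateparser

# Illustrative value only: a Portuguese long date with the five whitespace-separated
# tokens that the index arithmetic in new_claim (date_[4], date_[2], date_[0]) relies on.
date = "12 de maio de 2018"
parts = date.strip().split()
reordered = "-".join([parts[4], parts[2], parts[0]])  # "2018-maio-12"
# Expected output "2018-05-12", assuming dateparser recognizes the Portuguese month name.
print(dateparser.parse(reordered).strftime("%Y-%m-%d"))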
Example #7
    def getClusterRankClaims(self, query, eps):
        cluster2claimsIndexes, cluster2coreSampleIndices, \
            claims, fullClaims = self.getClusterClaims(query, eps)
        getSimilarity = Claim.GetSimilarity(self.rootpath, self.folderpath)
        tweets_list = list(self.helper.getTweet(self.folderpath))
        rankedClusterClaims = getSimilarity.rankClusteredClaims(
            cluster2claimsIndexes, cluster2coreSampleIndices, claims,
            fullClaims, tweets_list)
        return rankedClusterClaims
Example #8
def addClaim(customer_id):
    '''
    Creates a new Claim object and adds it to both the InsuranceCompany's list of all claims
    and the corresponding Customer object's list of claims.
    :param customer_id:
    :return:
    '''
    c = company.getCustomerById(customer_id)
    if c is not None:
        claim = Claim(request.args.get('date'), request.args.get('incident_description'), request.args.get('claim_amount'))
        c.addClaim(claim)
        company.addClaim(claim)
        return jsonify(success=True, message="Claim added")
    return jsonify(success=False, message="Customer not found")
Example #9
    def do_newclaim(self, inp):
        'Create a new claim and switch the context to that claim'
        self.prompt = "New Claim> "

        "Ask for the name of the claim, as well as verify it."
        "When the name is verified create a folder for it."
        goodname = False
        name = ""
        while not goodname:
            name: str = input("Enter Claim Name: ")
            goodname = self.verifyname(name)
        claimdate = datetime.datetime.now().date()
        claimname = str(claimdate) + "_" + name
        self.prompt = claimname + "> "
        claimdir = "Claims\\"+claimname
        imagesdir = "Claims\\"+claimname+"\\Images"
        os.mkdir(claimdir)
        os.mkdir(imagesdir)
        #TODO properly validate date
        daterange = input("Provide the start-end dates in the format MM/DD/YYYY-MM/DD/YYYY (currently does not check, be careful): ")
        startdate,enddate = self.verifydate(daterange)
        newclaim = Claim.Claim(claimdir, claimname,startdate,enddate)
        self.claim = newclaim
        self.prompt = claimname + "> "
Example #10
def get_all_claims(criteria):
    print criteria.maxClaims
    #performing a search by each letter, and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        try:
            page = urllib2.urlopen(
                "https://g1.globo.com/e-ou-nao-e/index/feed/pagina-" +
                str(page_number) + ".ghtml").read()
        except:
            break
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        links = soup.findAll('a', {"class": "feed-post-link"}, href=True)
        if len(links) != 0:
            for anchor in links:
                if (anchor['href'] not in urls_.keys()):
                    if (criteria.maxClaims > 0
                            and len(urls_) >= criteria.maxClaims):
                        break
                    urls_[anchor['href']] = page_number
                    print "adding " + str(anchor['href'])
        else:
            print("break!")
            break

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = str(url)

        #print url_complete
        page = urllib2.urlopen(url_complete).read().decode('utf-8', 'ignore')
        soup = BeautifulSoup(page, "lxml")
        soup.prettify("utf-8")

        claim_ = claim_obj.Claim()
        claim_.setUrl(url_complete)
        claim_.setSource("g1")

        if (criteria.html):
            claim_.setHtml(soup.prettify("utf-8"))

        try:
            #title
            #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("h1", {"class": "content-head__title"})
            claim_.setTitle(title.text)

            #date

            date_ = soup.find('time', {"itemprop": "datePublished"})
            if date_:
                date_str = date_.get_text().split(" ")[1]
                claim_.setDate(
                    dateparser.parse(date_str, settings={
                        'DATE_ORDER': 'DMY'
                    }).strftime("%Y-%m-%d"))
                #print claim_.date

            #body
            body = soup.find("article")
            claim_.setBody(body.get_text().replace("\n", "").replace(
                "TwitterFacebookE-mailWhatsApp", ""))

            #related links
            divTag = soup.find("article", {"itemprop": "articleBody"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            #claim
            claim_conclusion = soup.find("h1", {
                "class": "content-head__title"
            }).get_text()
            #claim_.setClaim(claim_conclusion)
            #if (len(claim_conclusion.split("?"))>1):
            claim_.setClaim(claim_conclusion.split("?")[0])
            claim_.setConclusion(claim_conclusion.split("?")[1])
            # if (claim_element.find_previous_sibling("figure") and claim_element.find_previous_sibling("figure").findAll("figcaption")):
            # 	claim_.setConclusion(claim_element.find_previous_sibling("figure").findAll("figcaption")[-1:][0].get_text())
            #print claim_.claim.decode("utf-8") + " ====> "
            #print claim_.conclusion.decode("utf-8")
            #print "-->"+ str(claim_.conclusion)

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

#creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
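The get_all_claims scrapers from this example onward share a two-phase shape: paginate an index to collect article URLs, then visit each URL and populate a Claim. A compressed sketch of phase one with requests and BeautifulSoup; the URL template and link selector below are placeholders, not taken from any specific site above:

import requests
from bs4 import BeautifulSoup


def collect_urls(index_url_template, max_claims, headers=None):
    """Phase one: walk index pages until a page is empty or max_claims is reached."""
    urls = {}
    for page_number in range(1, 500):
        if 0 < max_claims <= len(urls):
            break
        try:
            page = requests.get(index_url_template.format(page_number),
                                headers=headers, timeout=5)
        except requests.RequestException:
            break
        soup = BeautifulSoup(page.text, "lxml")
        links = soup.find_all("a", href=True)  # placeholder selector; each site uses its own
        if not links:
            break
        for anchor in links:
            urls.setdefault(anchor["href"], page_number)
    return urls


# Hypothetical index template; phase two would then visit each URL and fill a claim_obj.Claim.
# urls = collect_urls("https://example.org/fact-checks/page/{}/", max_claims=50)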
Example #11
def get_all_claims(criteria):

    #performing a search by each letter, and adding each article to a urls_ var.

    alfab = "bcdefghijklmnopqrstuvxyz"
    urls_ = {}
    for l in alfab:
        for page_number in range(1, 500):
            if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
                break
            try:
                page = urllib2.urlopen("http://fullfact.org/search/?q=" + l +
                                       "&page=" + str(page_number)).read()
            except:
                break
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()

            links = soup.findAll('a', {"rel": "bookmark"}, href=True)
            if len(links) != 0:
                for anchor in links:
                    urls_[anchor['href']] = [l, page_number]
                    print "adding " + str(anchor['href'])
                    if (criteria.maxClaims > 0
                            and len(urls_) >= criteria.maxClaims):
                        break
            else:
                print("break!")
                break

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url in urls_.keys():
        print str(index) + "/" + str(len(urls_)) + " extracting " + str(url)
        index += 1
        claim_ = claim_obj.Claim()
        claim_.setSource("fullfact")

        try:
            url_complete = "http://fullfact.org" + url
            claim_.setUrl(url_complete)
            page = urllib2.urlopen(url_complete).read()
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()

            #claim
            claim = soup.find('div', {"class": "col-xs-12 col-sm-6 col-left"})
            if claim:
                claim_.setClaim(claim.get_text().replace("\nClaim\n", ""))

            #conclusion
            conclusion = soup.find('div',
                                   {"class": "col-xs-12 col-sm-6 col-right"})
            if conclusion:
                claim_.setConclusion(conclusion.get_text().replace(
                    "\nConclusion\n", ""))

            #title
            title = soup.find("div", {
                "class": "container main-container"
            }).find('h1')
            claim_.setTitle(title.text)

            #date
            date = soup.find("p",
                             {"class": "hidden-xs hidden-sm date updated"})
            claim_.setDate(
                dateparser.parse(date.get_text().replace(
                    "Published:", "")).strftime("%Y-%m-%d"))

            #body
            body = soup.find("div", {"class": "article-post-content"})
            claim_.setBody(body.get_text())

            #related links
            divTag = soup.find("div", {"class": "row"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            claims.append(claim_.getDict())
        except:
            print "error=>" + url_complete

#creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #12
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    #print criteria.maxClaims
    #performing a search by each letter, and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    types = [
        "true", "mostly-true", "half-true", "barely-true", "false",
        "pants-fire", "no-flip", "half-flip", "full-flop"
    ]
    last_page = []
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        url = "https://www.channel4.com/news/factcheck/page/" + str(
            page_number)
        #url="http://www.politifact.com/truth-o-meter/rulings/"+str(type_)+"/?page="+str(page_number)
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()

            links = soup.findAll("li", {"class": "feature factcheck"})
            if (len(links) != 0) and (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', {"class": "permalink"},
                                         href=True)
                    ind_ = str(anchor['href'])
                    if (ind_ not in urls_.keys()):
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
                        if (ind_ not in criteria.avoid_url):
                            urls_[ind_] = ind_
                            print "adding " + str(ind_)
                last_page = links
            else:
                print("break!")
                break
        except:
            print "error=>" + str(url)

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = str(url)

        #print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("channel4")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            #title
            #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("div", {
                "class": "factcheck-article-header"
            }).find("h1").get_text()
            claim_.setTitle(title)

            #date

            date_ = soup.find('li', {"class": "pubDateTime"})
            #print date_["content"]
            if date_:
                date_str = search_dates(
                    date_['data-time'])[0][1].strftime("%Y-%m-%d")
                #print date_str
                claim_.setDate(date_str)
                #print claim_.date

            #body
            body = soup.find("div", {"class": "article-body article-main"})
            claim_.setBody(body.get_text())

            #related links
            divTag = soup.find("div", {"class": "article-body article-main"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            claim_.setClaim(title)

            conclusion_ = body.find_all("h2",
                                        text=lambda t: t and "verdict" in t)
            if conclusion_:
                conclusion_str = conclusion_[0].find_next_sibling("p").text
                claim_.setConclusion(conclusion_str)

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                #print "achou"
                tags.append(tag["content"])
            claim_.setTags(", ".join(tags))

            # if (claim_.conclusion.replace(" ","")=="" or claim_.claim.replace(" ","")==""):
            # 	print claim_.conclusion
            # 	print claim_.claim
            # 	raise ValueError('No conclusion or claim')

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

#creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #13
def get_all_claims(criteria):
    print criteria.maxClaims
    #performing a search by each letter, and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        try:
            url = "https://correctiv.org/echtjetzt/artikel/seite/" + str(
                page_number) + "/"
            page = urllib2.urlopen(url).read()
        except:
            break
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        links = soup.findAll('a', {"class": "entry-list-item__link"},
                             href=True)
        if len(links) != 0:
            for anchor in links:
                url_to_add = "https://correctiv.org" + str(anchor['href'])
                if (url_to_add not in urls_.keys()):
                    if (criteria.maxClaims > 0
                            and len(urls_) >= criteria.maxClaims):
                        break
                    urls_[url_to_add] = page_number
                    print "adding " + str(url_to_add)
        else:
            print("break!")
            break

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = str(url)

        #print url_complete
        try:
            page = urllib2.urlopen(url_complete).read().decode(
                'utf-8', 'ignore')
            soup = BeautifulSoup(page, "lxml")
            soup.prettify("utf-8")

            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("correctiv")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            #title
            #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("h1", {"class": "article-header__headline"})
            claim_.setTitle(
                title.text.replace("Faktencheck:", "").replace("\n", ""))

            date_ = soup.find('time',
                              {"class": "article-body__publishing-date"})
            #print date_["content"]
            if date_:
                date_str = search_dates(
                    date_['title'].split("T")[0])[0][1].strftime("%Y-%m-%d")
                #print date_str
                claim_.setDate(date_str)
                #print claim_.date

            #body
            body = soup.find("div", {"class": "article-body__main"})
            claim_.setBody(body.get_text())

            #related links
            divTag = soup.find("div", {"class": "article-body__main"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            claim_.setClaim(claim_.title)
            conclusion = soup.find(
                'div', {"class": "article-body__claimreview claimreview"})
            if conclusion:
                claim_.setConclusion(
                    conclusion.text.replace("Unsere Bewertung: ",
                                            "").replace("\n", ""))

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                #print "achou"
                tags.append(tag["content"])
            claim_.setTags(", ".join(tags))

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

#creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #14
def get_all_claims(criteria):
	headers = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'}


	#print criteria.maxClaims
	#performing a search by each letter, and adding each article to a urls_ var.
	now = datetime.datetime.now()
	urls_={}
	types=["a"]
	last_page=[]
	for type_ in types:
		for page_number in range (1,500):
			if (criteria.maxClaims > 0 and len(urls_)>= criteria.maxClaims):
				break
			try:
				#url="http://www.politifact.com/truth-o-meter/rulings/"+str(type_)+"/?page="+str(page_number)
				url="https://www.truthorfiction.com/page/"+str(page_number)+"/?s="+str(type_)
				page = requests.get(url, headers=headers, timeout=5)
				soup = BeautifulSoup(page.text,"lxml")
				soup.prettify()

				links = soup.findAll("h2",{"class":"grid-title"})
				if (len(links) > 0) and (links != last_page):
					for anchor in links:
						anchor = anchor.find('a', href=True)
						ind_=str(anchor['href'])
						if (ind_ not in urls_.keys()):
							if (criteria.maxClaims > 0 and len(urls_)>= criteria.maxClaims):
								break
							if (ind_ not in criteria.avoid_url):
								urls_[ind_]=anchor.get_text()
								print "adding "+str(ind_)
					last_page  = links
				else:
					print ("break!")
					break
			except:
				print "error=>"+str(url)

	claims=[]
	index=0
	# visiting each article's dictionary and extract the content.
	for url,title_claim in urls_.iteritems():  
		print str(index) + "/"+ str(len(urls_.keys()))+ " extracting "+str(url)
		index+=1

		url_complete=str(url)
		#print 

		#print url_complete
		try: 
			page = requests.get(url_complete, headers=headers, timeout=5)
			soup = BeautifulSoup(page.text,"lxml")
			soup.prettify("utf-8")

			claim_ =  claim_obj.Claim()
			claim_.setUrl(url_complete)
			claim_.setSource("truthorfiction")

			if (criteria.html):
				claim_.setHtml(soup.prettify("utf-8"))

			#title
			#title=title_claim.split("-")[0]
			title=title_claim[:title_claim.rfind("-")]
			conclusion=title_claim.split("-")[-1:][0].replace("!","")
			claim_.setTitle(title)

			#date

			date_ = soup.find('div', {"class": "post-box-meta-single"}).find("span") 
			#print date_["content"]
			if date_ : 
				date_str=search_dates(date_.text.replace(",",""), settings={'DATE_ORDER': 'MDY'})[0][1].strftime("%Y-%m-%d")
				#print date_str
				claim_.setDate(date_str)
				#print claim_.date


			#body
			body=soup.find("div",{"class":"inner-post-entry"})
			claim_.setBody(body.get_text())

			#related links
			divTag = soup.find("div",{"class":"inner-post-entry"})
			related_links=[]
			for link in divTag.findAll('a', href=True):
			    related_links.append(link['href'])
			claim_.setRefered_links(related_links)
			


			claim_.setClaim(title)
			claim_.setConclusion(conclusion)

			tags=[]

			for tag in soup.findAll('a', {"rel":"tag"}, href=True):
				tag_str=tag.text
				tags.append(tag_str)
			claim_.setTags(", ".join(tags))

			if (claim_.conclusion.replace(" ","")=="" or claim_.claim.replace(" ","")==""):
				raise ValueError('No conclusion or claim')

			claims.append(claim_.getDict())
		except:
			print "Error ->" + str(url_complete)

    #creating a pandas dataframe
	pdf=pd.DataFrame(claims)
	return pdf
Example #15
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    urls_ = {}
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        try:
            url = "https://checkyourfact.com/page/" + str(page_number) + "/"
            page = requests.get(url, headers=headers, timeout=10)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
        except:
            break
        links = soup.find('articles').findAll('a', href=True)
        if len(links) != 0:
            for anchor in links:
                ind_ = "http://checkyourfact.com" + str(anchor['href'])
                if (ind_ not in urls_.keys()):
                    if (criteria.maxClaims > 0
                            and len(urls_) >= criteria.maxClaims):
                        break
                    if (ind_ not in criteria.avoid_url):
                        urls_[ind_] = page_number
                        print "adding " + str(ind_)
        else:
            print("break!")
            break

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = str(url)

        #print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("checkyourfact")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            #title
            title = soup.find('article').find("h1")
            claim_.setTitle(title.text.replace("FACT CHECK: ", ""))

            date_str = search_dates(
                url_complete.replace("http://checkyourfact.com/",
                                     "").replace("/", " "),
                settings={'DATE_ORDER': 'YMD'})[0][1].strftime("%Y-%m-%d")
            #print date_str
            claim_.setDate(date_str)
            #print claim_.date

            #body
            body = soup.find("article")
            claim_.setBody(body.get_text())

            #related links
            divTag = soup.find("article")
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            claim_.setClaim(claim_.title)

            for strong in soup.find('article').findAll('strong'):
                if "Verdict:" in strong.text:
                    claim_.setConclusion(strong.text.replace("Verdict: ", ""))

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                #print "achou"
                tags.append(tag["content"])
            claim_.setTags(", ".join(tags))

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

#creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #16
def get_all_claims(criteria):
    #performing a search by each letter, and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    letters = [
        "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "m",
        "o", "p", "q", "x", "y", "z"
    ]
    letters = ["a"]
    for l in letters:
        for page_number in range(1, 500):
            if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
                break
            try:
                print("http://www.mimikama.at/page/" + str(page_number) + "/?s=" + l)
                page = urllib2.urlopen("http://www.mimikama.at/page/" +
                                       str(page_number) + "/?s=" + l).read()
            except:
                break
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()
            links = soup.find('div', {
                "class": "td-ss-main-content"
            }).findAll('a', {"rel": "bookmark"}, href=True)
            if len(links) != 0:
                for anchor in links:
                    if (anchor['href'] not in urls_.keys()):
                        urls_[anchor['href']] = l
                        print "adding " + str(anchor['href'])
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
            else:
                print("break!")
                break

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url in urls_.keys():
        try:
            print str(index) + "/" + str(len(
                urls_.keys())) + " extracting " + str(url)
            index += 1
            claim_ = claim_obj.Claim()
            claim_.setSource("mimikama")
            url_complete = url
            claim_.setUrl(url_complete)
            page = urllib2.urlopen(url_complete, timeout=5).read()
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()

            #conclusin
            # conclusion=soup.find('div', {"class": "td-post-content"}).find('h2')
            # if conclusion :
            # 	claim_.setConclusion(conclusion.get_text())

            #title
            title = soup.find("h1", {"class": "entry-title"})
            claim_.setTitle(title.text)

            #claim
            #claim = soup.find('div', {"class": "td-post-content"}).find('h2')
            #if claim and claim.find_previous('strong'):
            #	claim_.setClaim(claim.find_previous('strong').get_text())
            #else:
            claim_.setClaim(claim_.title)

            #date
            date = soup.find("time",
                             {"class": "entry-date updated td-module-date"})
            #print date

            #print (search_dates(date.get_text())[0][1].strftime("%Y-%m-%d"))
            claim_.setDate(
                search_dates(date.get_text())[0][1].strftime("%Y-%m-%d"))

            #related links
            divTag = soup.find("div", {"class": "td-post-content"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            body = soup.find("div", {"class": "td-post-content"})
            claim_.setBody(body.get_text())

            claims.append(claim_.getDict())
        except:
            print "Erro =>" + url

#creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #17
def get_all_claims(criteria):
	#performing a search by each letter, and adding each article to a urls_ var.
	now = datetime.datetime.now()
	urls_={}
	for year in range (2015,now.year+1):
		for month in range (1,13):
			if (criteria.maxClaims > 0 and len(urls_)>= criteria.maxClaims):
				break
			try:
				page = urllib2.urlopen("http://piaui.folha.uol.com.br/lupa/"+str(year)+"/"+str(month)+"/").read()
			except:
				break
			soup = BeautifulSoup(page,"lxml")
			soup.prettify()
			links = soup.find('div', {"class": "lista-noticias"}).findAll('a', href=True)
			if len(links) != 0:
				for anchor in links:
					if (anchor['href'] not in urls_.keys()):
						urls_[anchor['href']]=[year,month]
						print "adding "+str(anchor['href'])
						if (criteria.maxClaims > 0 and len(urls_)>= criteria.maxClaims):
							break
			else:
			    print ("break!")
			    break

	claims=[]
	index=0
	# visiting each article's dictionary and extract the content.
	for url in urls_.keys():
		print str(index) + "/"+ str(len(urls_.keys()))+ " extracting "+str(url)
		index+=1
		try:
			claim_ =  claim_obj.Claim()
			claim_.setSource("lupa")
			url_complete=url
			claim_.setUrl(url_complete)
			page = urllib2.urlopen(url_complete).read()
			soup = BeautifulSoup(page,"lxml")
			soup.prettify()

			if (criteria.html):
				claim_.setHtml(soup.prettify())


			#conclusion
			conclusion=soup.find('div', {"class": "etiqueta"})
			if conclusion :
				claim_.setConclusion(conclusion.get_text())

			#title
			title=soup.find("h2", {"class": "bloco-title"})
			claim_.setTitle(title.text)


			#claim
			claim = soup.find('div', {"class": "post-inner"}).find('div', {"class": "etiqueta"})
			if claim and claim.find_previous('strong'):
				claim_.setClaim(claim.find_previous('strong').get_text())
			else:
				claim_.setClaim(claim_.title)

			#date
			date=soup.find("div", {"class": "bloco-meta"})
			claim_.setDate(dateparser.parse(date.text.split("|")[0], settings={'DATE_ORDER': 'DMY'}).strftime("%Y-%m-%d"))

			#related links
			divTag = soup.find("div", {"class": "post-inner"})
			related_links=[]
			for link in divTag.findAll('a', href=True):
			    related_links.append(link['href'])
			claim_.setRefered_links(related_links)

			#body
			body = soup.find("div", {"class": "post-inner"})
			claim_.setBody(body.get_text())

			# tags
			tags_ = [t.text for t in soup.findAll('a', {'rel':'tag'})]
			claim_.setTags(tags_)

			claims.append(claim_.getDict())
		except:
			print "error=>"+str(url_complete)

    #creating a pandas dataframe
	pdf=pd.DataFrame(claims)
	return pdf
Example #18
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    #print criteria.maxClaims
    #performing a search by each letter, and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    types = [
        "true", "mostly-true", "half-true", "barely-true", "false",
        "pants-fire", "no-flip", "half-flip", "full-flop"
    ]
    last_page = []
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        url = "http://factscan.ca/page/" + str(page_number) + "/"
        #url="http://www.politifact.com/truth-o-meter/rulings/"+str(type_)+"/?page="+str(page_number)
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll(
                "h1", {"class": "post-title entry-title home-feed-title"})
            if (len(links) != 0) and (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', href=True)
                    ind_ = str(anchor['href'])
                    if (ind_ not in urls_.keys()):
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
                        if (ind_ not in criteria.avoid_url):
                            urls_[ind_] = ind_
                            print "adding " + str(ind_)
                last_page = links
            else:
                print("break!")
                break
        except:
            print "error=>" + str(url)

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = str(url)

        #print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("factscan")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            #print url_complete
            #import ast
            json_ = None
            if (soup.find("script", {"type": "application/ld+json"})):
                json_ = soup.find("script", {
                    "type": "application/ld+json"
                }).get_text()

            def parse_wrong_json(json_, left, right):
                if json_:
                    if (len(json_.split(left)) > 1):
                        return json_.split(left)[1].split(right)[0]
                else:
                    return None

            #title
            #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("meta", {"property": "og:title"})['content']
            claim_.setTitle(title)

            #date

            date_ = soup.find('meta', {"property": "article:published_time"})
            #print date_["content"]
            if date_:
                date_str = search_dates(
                    date_['content'].split("T")[0])[0][1].strftime("%Y-%m-%d")
                #print date_str
                claim_.setDate(date_str)
                #print claim_.date

            #rating

            claim_.setRatingValue(
                parse_wrong_json(json_, '"ratingValue":', ","))
            claim_.setWorstRating(
                parse_wrong_json(json_, '"worstRating":', ","))
            claim_.setBestRating(parse_wrong_json(json_, '"bestRating":', ","))
            claim_.setAlternateName(
                parse_wrong_json(json_, '"alternateName":', ","))

            #when there is no json
            if (claim_.alternateName == None):
                #print "não achou conclusao"
                if (soup.find("div", {"class": "fact-check-icon"})):
                    #print "passou"
                    if (soup.find("div", {
                            "class": "fact-check-icon"
                    }).find('img')):
                        #print "passou2"
                        claim_str = soup.find("div", {
                            "class": "fact-check-icon"
                        }).find('img')['alt'].split(":")[1]
                        #print claim_str
                        claim_.alternateName = claim_str
            #print claim_.alternateName
            #body
            body = soup.find("div", {"class": "entry-content"})
            claim_.setBody(body.get_text())

            #author
            author = soup.find("div", {"class": "sharethefacts-speaker-name"})
            if (author):
                claim_.setAuthor(author.get_text())

            #sameas
            claim_.setSameAs(parse_wrong_json(json_, '"sameAs": [', "]"))

            #obj=soup.find("div",{"itemprop":"itemReviewed"})
            #if (obj and obj.find("div",{"itemprop":"datePublished"})):
            #print parse_wrong_json(json_,'"}, "datePublished":',",")

            #claim_.setDatePublished()

            #related links
            divTag = soup.find("div", {"class": "entry-content"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            if (soup.find("div", {"class": "sharethefacts-statement"})):
                claim_.setClaim(
                    soup.find("div", {
                        "class": "sharethefacts-statement"
                    }).get_text())
            else:
                claim_.setClaim(claim_.title)

            claim_.setConclusion(
                soup.find("div", {
                    "class": "fact-check-icon-loop"
                }).find('img')['alt'].replace("FactsCan Score: ", ""))

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                #print "achou"
                tags.append(tag["content"])
            claim_.setTags(", ".join(tags))

            if (claim_.conclusion.replace(" ", "") == ""
                    or claim_.claim.replace(" ", "") == ""):
                #print " eroor conclusion or claim"
                #print claim_.claim
                #print claim_.conclusion
                raise ValueError('No conclusion or claim')

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

#creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
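The parse_wrong_json helper above slices the ld+json blob with raw string splits. Where the embedded JSON is well formed, a minimal alternative sketch with the standard json module could look like this; treating the blob as a schema.org ClaimReview with a reviewRating object is an assumption about the site's markup:

import json


def parse_ldjson_rating(json_text):
    """Return the review-rating fields from a schema.org ClaimReview blob, or {} on failure."""
    try:
        data = json.loads(json_text)
    except (TypeError, ValueError):
        return {}
    rating = data.get("reviewRating", {})
    return {
        "ratingValue": rating.get("ratingValue"),
        "worstRating": rating.get("worstRating"),
        "bestRating": rating.get("bestRating"),
        "alternateName": rating.get("alternateName"),
    }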
Example #19
    def getNews(self, folderpath, top=5):
        event2timeScope = {
            "Gabapentin_0628_0121":
            ["2017-06-28T00:00:00Z", "2018-01-21T00:00:00Z"],
            "SanctuaryCities_0516_0523":
            ["2018-05-16T00:00:00Z", "2018-05-23T00:00:00Z"],
            "WhereAreTheChildren_0418_0527":
            ["2018-04-18T00:00:00Z", "2018-05-27T00:00:00Z"],
            "Ingraham_0618_0624":
            ["2018-06-18T00:00:00Z", "2018-06-24T00:00:00Z"],
            "ItsJustAJacket_0621_0624":
            ["2018-06-21T00:00:00Z", "2018-06-24T00:00:00Z"],
            "immigrants_0622_0624":
            ["2018-06-22T00:00:00Z", "2018-06-24T00:00:00Z"],
            "JetLi_0519_0523":
            ["2018-05-19T00:00:00Z", "2018-05-23T00:00:00Z"],
            "BandyLee_0110_0115":
            ["2018-01-10T00:00:00Z", "2018-01-15T00:00:00Z"],
            "JackBreuer_1228_0115":
            ["2017-12-28T00:00:00Z", "2018-01-15T00:00:00Z"],
            "SouthwestKey_0620_0624":
            ["2018-06-20T00:00:00Z", "2018-06-24T00:00:00Z"],
            "Capriccio_0516_0523_new":
            ["2018-05-16T00:00:00Z", "2018-05-23T00:00:00Z"]
        }
        if not self.getSimilarity:
            skipthoughts_model = {
                "name": "skipthoughts",
                "modelPath": self.rootpath + "/.." +
                "/skip_thoughts/pretrained/skip_thoughts_uni_2017_02_02/exp_vocab",
                "checkpointPath": "model.ckpt-501424"
            }
            sent2vec_model = {
                "name": "sent2vec",
                "modelPath": "/lustre/scratch/haoxu/twitter_bigrams.bin"
            }
            self.getSimilarity = Claim.GetSimilarity(self.rootpath,
                                                     self.folderpath,
                                                     skipthoughts_model)

        folderPath = os.path.join(self.folderpath, "final")
        rankedClusterClaims = self.helper.loadJson(
            folderPath + "/ranked_cluster_claims.json")
        count = 0
        for index, info in enumerate(rankedClusterClaims):
            if count >= top:
                continue
            count += 1
            query = info[0]
            start = event2timeScope[folderpath][0]
            end = event2timeScope[folderpath][1]

            # find similar news
            alylienNewsAPI = Information.AylienNewsAPI()
            news = alylienNewsAPI.getNews(query, start, end, 10)
            if len(news) == 0:
                print("no news.")
                print("folder ", folderpath)
                print("info ", info)
                continue
            titles = alylienNewsAPI.getTitles(news)
            # find final news based on the most similar news
            query = self.getSimilarity.getSimilarNews(query, titles)
            finalNews = alylienNewsAPI.getNews(query, start, end, 10)
            self.helper.dumpPickle(folderPath + "/news_bk",
                                   str(index) + "_news.pickle", finalNews)
            finalNewsDict = [i.to_dict() for i in finalNews]
            self.helper.dumpJson(folderPath + "/news",
                                 str(index) + "_news.json", finalNewsDict)
            print("{}th claim: news has been saved.".format(index))
Example #20
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    #print criteria.maxClaims
    #performing a search by each letter, and adding each article to a urls_ var.
    now = datetime.datetime.now()
    urls_ = {}
    types = [
        "true", "mostly-true", "half-true", "barely-true", "false",
        "pants-fire", "no-flip", "half-flip", "full-flop"
    ]
    last_page = []
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break

        url = "https://africacheck.org/latest-reports/page/" + str(
            page_number) + "/"
        #url="http://www.politifact.com/truth-o-meter/rulings/"+str(type_)+"/?page="+str(page_number)
        try:
            page = requests.get(url, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify()
            links = soup.findAll("div", {"class": "article-content"})
            if (len(links) != 0) and (links != last_page):
                for anchor in links:
                    anchor = anchor.find('a', href=True)
                    ind_ = str(anchor['href'])
                    if (ind_ not in urls_.keys()):
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
                        if (ind_ not in criteria.avoid_url):
                            urls_[ind_] = ind_
                            print "adding " + str(ind_)
                last_page = links
            else:
                print("break!")
                break
        except:
            print "error=>" + str(url)

    claims = []
    index = 0
    # visiting each article's dictionary and extract the content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = str(url)

        #print url_complete
        #try:
        page = requests.get(url_complete, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify("utf-8")

        claim_ = claim_obj.Claim()
        claim_.setUrl(url_complete)
        claim_.setSource("africacheck")

        if (criteria.html):
            claim_.setHtml(soup.prettify("utf-8"))

        #title
        #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
        title = soup.find("meta", {"property": "og:title"})
        claim_.setTitle(title['content'])

        #date

        date_ = soup.find('time')
        #print date_["content"]
        if date_:
            date_str = search_dates(
                date_['datetime'].split(" ")[0])[0][1].strftime("%Y-%m-%d")
            #print date_str
            claim_.setDate(date_str)
            #print claim_.date

        #rating

        conclusion_ = ""
        if (soup.find("div", {"class": "verdict-stamp"})):
            conclusion_ = soup.find("div", {
                "class": "verdict-stamp"
            }).get_text()
        if (soup.find("div", {"class": "verdict"})):
            conclusion_ = soup.find("div", {"class": "verdict"}).get_text()
        if (soup.find("div", {"class": "indicator"})):
            conclusion_ = soup.find("div", {"class": "indicator"}).get_text()
            if (soup.find("div", {"class": "indicator"}).find('span')):
                conclusion_ = soup.find("div", {
                    "class": "indicator"
                }).find('span').get_text()

        claim_.setAlternateName(
            str(re.sub('[^A-Za-z0-9\ -]+', '', conclusion_)).lower().strip())

        #when there is no json

        date_ = soup.find("time", {"class": "datetime"})
        if (date_):
            claim_.setDate(date_.get_text())

        #print claim_.alternateName
        #body
        body = soup.find("div", {"id": "main"})
        claim_.setBody(body.get_text())

        #author
        author = soup.find("div", {"class": "sharethefacts-speaker-name"})
        if (author):
            claim_.setAuthor(author.get_text())

        #related links
        divTag = soup.find("div", {"id": "main"})
        related_links = []
        for link in divTag.findAll('a', href=True):
            related_links.append(link['href'])
        claim_.setRefered_links(related_links)

        if (soup.find("div", {"class": "report-claim"})):
            claim_.setClaim(
                soup.find("div", {
                    "class": "report-claim"
                }).find("strong").get_text())
        else:
            claim_.setClaim(claim_.title)

        tags = []

        for tag in soup.findAll('meta', {"property": "article:tag"}):
            #print "achou"
            tags.append(tag["content"])
        claim_.setTags(", ".join(tags))

        claims.append(claim_.getDict())
        #except:
        #	print "Error ->" + str(url_complete)

        #creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
Example #21
def exactractionClaim(page, url, maxClaims):
    global urlTraite, urls_, claims, idClaim, uriSansClaim, nbClaims
    if nbClaims < maxClaims:
        print(str(nbClaims) + "/" + str(maxClaims) + " extracting " + str(url))
        soup = BeautifulSoup(page, "lxml")
        soup.prettify()
        claim_ = claim_obj.Claim()

        claim = soup.find('div', {"class": "col-xs-12 col-sm-6 col-left"})

        # If the page contains a claim and a conclusion.
        if claim:
            nbClaims += 1
            claim_.setSource("fullfact")
            claim_.setUrl(url)
            claim_.setClaim(claim.get_text().replace("\nClaim\n", ""))
            claim_.setIdClaim(idClaim)

            # Text of the conclusion.
            conclusion = soup.find('div',
                                   {"class": "col-xs-12 col-sm-6 col-right"})
            if conclusion:
                claim_.setConclusion(conclusion.get_text().replace(
                    "\nConclusion\n", ""))
                c = conclusion.get_text().replace("\nConclusion\n", "")
                fonct = TraitementConclusion.fonctionPrincipale(c)
                claim_.setVerdictTompo(
                    TraitementConclusion.fonctionPrincipale(c))

            title = soup.find("div", {"class": "header"})
            t = ""
            if title:
                t = title.find("h1").get_text()
                claim_.setTitle(t)

            date = soup.find("p", {"class": "date"})
            d = ""
            if date:
                d = date.find("span").get_text()
                claim_.setDate(d)

            # Text of the review.
            body = soup.find("div", {"class": "article-post-content"})
            if body:
                liensRevue = []
                text = []

                bod = body.find("div", class_=False, id=False)
                if bod:

                    for b in bod.findAll("p"):
                        for link in b.findAll('a', href=True):
                            liensRevue.append(link['href'])
                        text.append(b.get_text())
                    result = " ".join(text)
                    claim_.setLiensRevue(liensRevue)
                    claim_.setBody(result)

            # Extract the name of the claim's section (rubrique).
            categories = soup.find('ol', {"class": "breadcrumb"})
            if categories:
                rub = []
                for c in categories.findAll('a', href=True):
                    rub.append(c.get_text())

                rubri = rub[1].lower()
                claim_.setRubrique(rubri)

            # Extract the claims contained in the "related posts" section of the current claim.
            relp = getPosts.getRelatedPosts(soup)
            # Call the routine that extracts the keywords/topics for which the claims were grouped together in "related posts".
            l = relationsEntreLesClaims.relationClaims(1,
                                                       "RelatedPosts",
                                                       relp,
                                                       RP=True)
            motsCles = l[-1]

            if not (rub[-1].lower() == "online"):
                motsCles.append(rub[-1])
            print("\n commun subjects -section related posts-: " +
                  str(motsCles))
            del l[-1]

            #store the URLs of the "related posts" claims and their keywords in the current claim's related-posts attribute.
            claim_.setRelated_posts("RelatedPosts", l)
            claim_.setKeyWordsRP("RelatedPosts", motsCles)

            #case where several claims/conclusions are covered by the same review.
            autresClaims = soup.find_all('div',
                                         {"class": "briefAdditionalRows"})
            if autresClaims:
                nbClaims += len(autresClaims)
                if nbClaims > maxClaims:
                    return "break"
                for row in autresClaims:
                    c = additionalRows.briefAdditionalRows(
                        row, result, url, idClaim, l, rubri, motsCles, t,
                        liensRevue, d)
                    if c != "empty":
                        claims.append(c.getDict())
            idClaim += 1

            #store the current claim under its category (needed in the clustering step to infer relations between claims of the same category).
            triClaimsParRubrique(rubri, claim_)

            #recurse over the "related posts" claims.
            if len(relp) != 0:
                for r in relp:

                    if not (r[0] in urlTraite):
                        try:
                            page = urlopen(r[0]).read()
                            urlTraite.append(r[0])
                            exactractionClaim(page, r[0], maxClaims)
                        except:
                            continue

        #if the scraped page contains no claim/conclusion, only a review article.
        else:
            uriSansClaim += 1
            print("page " + url + " contains no claim!")
            ls = soup.findAll('a', href=True)
            if len(ls) != 0:
                for anchor in ls:
                    u = "http://fullfact.org" + anchor['href'].replace(
                        "?utm_source=content_page&utm_medium=related_content",
                        "")
                    if (not (u in urls_) and not (u in urlTraite)):
                        urls_.append(u)

    else:
        return "break"
Example #22
0
    def addClaim(self, customer, date, incident_description, claim_amount):
        a = Claim(customer, date, incident_description, claim_amount)
        self.claims.append(a)
        print(a.ID)
        return a.ID
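Example #22 is only a fragment: the Claim class and the self.claims list it relies on are not shown. A minimal sketch of plausible surrounding pieces, assuming a Claim constructor that assigns an auto-incrementing ID (the ClaimRegistry name and the ID scheme are assumptions, not part of the original):
import itertools


class Claim:
    # hypothetical stand-in for the Claim used above: stores the claim fields
    # and assigns an auto-incrementing ID.
    _ids = itertools.count(1)

    def __init__(self, customer, date, incident_description, claim_amount):
        self.ID = next(self._ids)
        self.customer = customer
        self.date = date
        self.incident_description = incident_description
        self.claim_amount = claim_amount


class ClaimRegistry:
    # hypothetical container exposing the addClaim method from Example #22.
    def __init__(self):
        self.claims = []

    def addClaim(self, customer, date, incident_description, claim_amount):
        a = Claim(customer, date, incident_description, claim_amount)
        self.claims.append(a)
        print(a.ID)
        return a.ID


registry = ClaimRegistry()
new_id = registry.addClaim("Jane Doe", "2018-03-01", "rear-end collision", 1200.0)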
Example #23
0
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    #print criteria.maxClaims
    #walking the fact-check category pages and adding each article URL to urls_.
    now = datetime.datetime.now()
    urls_ = {}
    last_page = []
    for page_number in range(1, 500):
        if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
            break
        url = "https://theferret.scot/category/fact-check/page/" + str(
            page_number) + "/"
        #try:
        page = requests.get(url, headers=headers, timeout=5)
        soup = BeautifulSoup(page.text, "lxml")
        soup.prettify()

        links = soup.findAll("h1", {"class": "entry-title"})
        if (len(links) != 0) or (links != last_page):
            for anchor in links:
                anchor = anchor.find('a', {"rel": "bookmark"}, href=True)
                ind_ = str(anchor['href'])
                if (ind_ not in urls_.keys()):
                    if (criteria.maxClaims > 0
                            and len(urls_) >= criteria.maxClaims):
                        break
                    urls_[ind_] = page
                    print "adding " + str(ind_)
            last_page = links
        else:
            print("break!")
            break
        #except:
        #	print "error=>"+str(url)

    claims = []
    index = 0
    # visiting each article URL and extracting its content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = str(url)

        #print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("theferret")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            #title
            #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("h1", {"class": "cover-title"})
            claim_.setTitle(title.text)

            #date

            date_ = soup.find('div', {"class": "widget__content"}).find("p")
            #print date_["content"]
            if date_:
                date_str = search_dates(date_.text)[0][1].strftime("%Y-%m-%d")
                #print date_str
                claim_.setDate(date_str)
                #print claim_.date

            #body
            body = soup.find("div", {"class": "article__text"})
            claim_.setBody(body.get_text())

            #related links
            divTag = soup.find("div", {"class": "article__text"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            claim_.setClaim(soup.find("h1", {"class": "article__title"}).text)
            claim_.setConclusion(conclusion)

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                #print "achou"
                tags.append(tag["content"])
            claim_.setTags(", ".join(tags))

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

    #creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
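Example #23 only reads two attributes from its criteria argument, maxClaims and html (the Politifact variant that follows also reads avoid_url). A minimal sketch of such an object, assuming those attribute names and treating 0 as "no limit", as the loop conditions suggest:
class Criteria:
    # hypothetical container matching the attributes the scrapers above read;
    # the real project's criteria class is not shown in these examples.
    def __init__(self, maxClaims=0, html=False, avoid_url=None):
        self.maxClaims = maxClaims        # 0 means no limit on collected URLs
        self.html = html                  # keep the prettified HTML of each page
        self.avoid_url = avoid_url or []  # URLs to skip (used by the Politifact variant)


criteria = Criteria(maxClaims=25, html=False)
# df = get_all_claims(criteria)              # returns a pandas DataFrame of claim dicts
# df.to_csv("theferret_claims.csv", index=False)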
Example #24
0
def get_all_claims(criteria):
    headers = {
        'user-agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'
    }

    #print criteria.maxClaims
    #walking each rating type's listing pages and adding each article URL to urls_.
    now = datetime.datetime.now()
    urls_ = {}
    types = [
        "true", "mostly-true", "half-true", "barely-true", "false",
        "pants-fire", "no-flip", "half-flip", "full-flop"
    ]
    last_page = []
    for type_ in types:
        for page_number in range(1, 500):
            if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
                break
            url = "http://www.politifact.com/truth-o-meter/rulings/" + str(
                type_) + "/?page=" + str(page_number)
            try:
                page = requests.get(url, headers=headers, timeout=5)
                soup = BeautifulSoup(page.text, "lxml")
                soup.prettify()

                links = soup.findAll("p", {"class": "statement__text"})
                if (len(links) != 0) or (links != last_page):
                    for anchor in links:
                        anchor = anchor.find('a', {"class": "link"}, href=True)
                        ind_ = "http://www.politifact.com" + str(
                            anchor['href'])
                        if (ind_ not in urls_.keys()):
                            if (criteria.maxClaims > 0
                                    and len(urls_) >= criteria.maxClaims):
                                break
                            if (ind_ not in criteria.avoid_url):
                                urls_[ind_] = type_
                                print "adding " + str(ind_)
                    last_page = links
                else:
                    print("break!")
                    break
            except:
                print "error=>" + str(url)

    claims = []
    index = 0
    # visiting each article URL and extracting its content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = str(url)

        #print url_complete
        try:
            page = requests.get(url_complete, headers=headers, timeout=5)
            soup = BeautifulSoup(page.text, "lxml")
            soup.prettify("utf-8")

            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("politifact")

            if (criteria.html):
                claim_.setHtml(soup.prettify("utf-8"))

            #title
            #if (soup.find("h1",{"class":"content-head__title"}) and len(soup.find("h1",{"class":"content-head__title"}).get_text().split("?"))>1):
            title = soup.find("h1", {"class": "article__title"})
            claim_.setTitle(title.text)

            #date

            date_ = soup.find('div', {"class": "widget__content"}).find("p")
            #print date_["content"]
            if date_:
                date_str = search_dates(date_.text)[0][1].strftime("%Y-%m-%d")
                #print date_str
                claim_.setDate(date_str)
                #print claim_.date

            #rating
            obj = soup.find("div", {"itemprop": "reviewRating"})
            if (obj):
                claim_.ratingValue = obj.find("div", {
                    "itemprop": "ratingValue"
                }).text
                claim_.worstRating = obj.find("div", {
                    "itemprop": "worstRating"
                }).text
                claim_.bestRating = obj.find("div", {
                    "itemprop": "bestRating"
                }).text
                claim_.alternateName = obj.find("div", {
                    "itemprop": "alternateName"
                }).text
            else:
                claim_.setConclusion(conclusion)

            #body
            body = soup.find("div", {"class": "article__text"})
            claim_.setBody(body.get_text())

            #author
            author = soup.find("div", {"itemprop": "itemReviewed"})
            if (author and author.find("div", {"itemprop": "author"})):
                claim_.setAuthor(
                    author.find("div", {
                        "itemprop": "author"
                    }).find("div", {
                        "itemprop": "name"
                    }).get_text())

            #sameas
            obj = soup.find("div", {"itemprop": "itemReviewed"})
            if (obj and obj.find("div", {"itemprop": "sameAs"})):
                claim_.setSameAs(
                    obj.find("div", {
                        "itemprop": "sameAs"
                    }).get_text())

            #datePublished
            obj = soup.find("div", {"itemprop": "itemReviewed"})
            if (obj and obj.find("div", {"itemprop": "datePublished"})):
                claim_.setDatePublished(
                    obj.find("div", {
                        "itemprop": "datePublished"
                    }).get_text())

            #related links
            divTag = soup.find("div", {"class": "article__text"})
            related_links = []
            for link in divTag.findAll('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            claim_.setClaim(soup.find("h1", {"class": "article__title"}).text)

            tags = []

            for tag in soup.findAll('meta', {"property": "article:tag"}):
                #print "achou"
                tags.append(tag["content"])
            claim_.setTags(", ".join(tags))

            claims.append(claim_.getDict())
        except:
            print "Error ->" + str(url_complete)

    #creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
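Example #24 pulls the review rating out of itemprop microdata divs. Many fact-checking pages also expose the same fields as a schema.org ClaimReview object in JSON-LD; the following is an alternative sketch under that assumption (the example itself does not rely on JSON-LD):
import json

from bs4 import BeautifulSoup


def extract_claimreview_rating(html):
    # return the reviewRating dict of the first JSON-LD ClaimReview block,
    # or None when the page does not embed one.
    soup = BeautifulSoup(html, "lxml")
    for script in soup.find_all("script", {"type": "application/ld+json"}):
        try:
            data = json.loads(script.string or "")
        except ValueError:
            continue
        items = data if isinstance(data, list) else [data]
        for item in items:
            if isinstance(item, dict) and item.get("@type") == "ClaimReview":
                return item.get("reviewRating")
    return None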
Example #25
0
def get_all_claims(criteria):
    print criteria.maxClaims
    #walking each verdict type's listing pages and adding each article URL to urls_.
    now = datetime.datetime.now()
    urls_ = {}
    for type_ in [
            "verdadeiro", "impreciso", "exagerado", "contraditorio",
            "insustentavel", "falso"
    ]:
        for page_number in range(1, 500):
            if (criteria.maxClaims > 0 and len(urls_) >= criteria.maxClaims):
                break
            try:
                page = urllib2.urlopen(
                    "http://aosfatos.org/noticias/checamos/" + str(type_) +
                    "/?page=" + str(page_number)).read()
            except:
                break
            soup = BeautifulSoup(page, "lxml")
            soup.prettify()
            links = soup.findAll('a', {"class": "card third"}, href=True)
            if len(links) != 0:
                for anchor in links:
                    if (anchor['href'] not in urls_.keys()):
                        if (criteria.maxClaims > 0
                                and len(urls_) >= criteria.maxClaims):
                            break
                        urls_[anchor['href']] = type_
                        print "adding " + str(anchor['href'])
            else:
                print("break!")
                break

    claims = []
    index = 0
    # visiting each article URL and extracting its content.
    for url, conclusion in urls_.iteritems():
        print str(index) + "/" + str(len(
            urls_.keys())) + " extracting " + str(url)
        index += 1

        url_complete = "https://aosfatos.org/" + str(url)

        #print url_complete
        page = urllib2.urlopen(url_complete).read().decode('utf-8', 'ignore')
        soup = BeautifulSoup(page, "lxml")
        soup.prettify("utf-8")

        for claim_element in soup.findAll("blockquote"):
            claim_ = claim_obj.Claim()
            claim_.setUrl(url_complete)
            claim_.setSource("aosfatos")

            #date
            date_ = soup.find('p', {"class": "publish_date"})
            if date_:
                date_str = date_.get_text().replace("\n", "").replace(
                    "  ", "").split(",")[0]
                claim_.setDate(dateparser.parse(date_str).strftime("%Y-%m-%d"))

            #title
            title = soup.findAll("h1")
            claim_.setTitle(title[1].text)

            #body
            body = soup.find("article")
            claim_.setBody(body.get_text().replace("\n", "").replace(
                "TwitterFacebookE-mailWhatsApp", ""))

            #related links
            divTag = soup.find("article").find("hr")
            related_links = []
            for link in divTag.find_all_next('a', href=True):
                related_links.append(link['href'])
            claim_.setRefered_links(related_links)

            #claim
            claim_.setClaim(claim_element.get_text())
            if (claim_element.find_previous_sibling("figure")
                    and claim_element.find_previous_sibling("figure").findAll(
                        "figcaption")):
                claim_.setConclusion(
                    claim_element.find_previous_sibling("figure").findAll(
                        "figcaption")[-1:][0].get_text())
            #print claim_.claim.decode("utf-8") + " ====> "
            #print claim_.conclusion.decode("utf-8")
            #print "-->"+ str(claim_.conclusion)

            claims.append(claim_.getDict())

    #creating a pandas dataframe
    pdf = pd.DataFrame(claims)
    return pdf
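Each of these scrapers ends with pd.DataFrame(claims). A short sketch of merging the per-source outputs into one table; the module aliases in the commented call are hypothetical, and the de-duplication assumes getDict exposes a "url" key, as the setUrl calls above suggest:
import pandas as pd


def combine_claims(dataframes):
    # concatenate per-source DataFrames and drop rows that share a URL,
    # keeping the first occurrence so every source stays represented.
    combined = pd.concat(dataframes, ignore_index=True, sort=False)
    if "url" in combined.columns:
        combined = combined.drop_duplicates(subset="url")
    return combined

# combined = combine_claims([
#     theferret.get_all_claims(criteria),
#     politifact.get_all_claims(criteria),
#     aosfatos.get_all_claims(criteria),
# ])
# combined.to_csv("all_claims.csv", index=False)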