    def retrieve_urls(self, parsed_claim_review_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) -> \
            List[str]:
        """
            :parsed_claim_review_page: --> a parsed page that lists claims
            :listing_page_url:         --> the URL associated with the page above
            :number_of_pages:          --> the number of listing pages to crawl
            :return:                   --> the list of URLs of all the claims
        """
        urls = []
        # First listing page:
        page_content = caching.get(listing_page_url,
                                   headers=self.headers,
                                   timeout=5)
        page = BeautifulSoup(page_content, "lxml")
        if page is not None:
            for ex_url in self.extract_urls(page):
                urls.append(ex_url)

        # Remaining listing pages (2 .. number_of_pages - 1):
        for page_number in tqdm(range(2, number_of_pages)):
            if 0 < self.configuration.maxClaims < len(urls):
                break
            url = listing_page_url + "page/" + str(page_number) + "/"
            page_content = caching.get(url, headers=self.headers, timeout=5)
            page = BeautifulSoup(page_content, "lxml")
            if page is not None:
                for ex_url in self.extract_urls(page):
                    urls.append(ex_url)
        return urls

    def get_all_claims(self):
        claims = []  # type: List[Claim]

        listing_pages = self.retrieve_listing_page_urls()
        for listing_page_url in listing_pages:
            print("Fetching listing pages from " + listing_page_url)
            page = caching.get(listing_page_url,
                               headers=self.headers,
                               timeout=5)
            if not page:
                continue
            parsed_listing_page = BeautifulSoup(
                page, self.configuration.parser_engine)
            number_of_pages = self.find_page_count(parsed_listing_page)
            if number_of_pages and number_of_pages < 0:
                number_of_pages = None

            urls = self.retrieve_urls(parsed_listing_page, listing_page_url,
                                      number_of_pages)

            print("Extracting claims listed in " + listing_page_url)
            for url in tqdm(urls):
                try:
                    if "http" in url:
                        review_page = caching.get(url,
                                                  headers=self.headers,
                                                  timeout=6)
                        if review_page:
                            parsed_claim_review_page = BeautifulSoup(
                                review_page, self.configuration.parser_engine)
                            claim = get_claim_from_cache(url)
                            if not claim:
                                local_claims = self.extract_claim_and_review(
                                    parsed_claim_review_page, url)
                                for claim in local_claims:
                                    self._annotate_claim(claim)
                                if len(local_claims) > 1:
                                    for claim in local_claims:
                                        claims.append(
                                            claim.generate_dictionary())
                                elif len(
                                        local_claims) == 1 and local_claims[0]:
                                    claims.append(
                                        local_claims[0].generate_dictionary())
                                    cache_claim(local_claims[0])
                                else:
                                    self.failed_log.write(url + "\n")
                                    self.failed_log.flush()
                            else:
                                claims.append(claim.generate_dictionary())
                except ConnectionError:
                    pass
        self.failed_log.close()
        return pandas.DataFrame(claims)
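The pattern shared by the scrapers in these examples is: fetch a listing page, walk the numbered "/page/N/" URLs until a configured claim limit is reached, and collect the claim URLs from each page. Below is a minimal standalone sketch of that loop using requests and BeautifulSoup instead of the project's caching module; the base URL, the <article> selector and the max_claims limit are illustrative assumptions, not part of any scraper above.

import requests
from bs4 import BeautifulSoup

def collect_claim_urls(listing_url, max_pages=5, max_claims=50):
    """Walk '<listing_url>page/N/' pages and collect article links (illustrative sketch)."""
    urls = []
    for page_number in range(1, max_pages + 1):
        page_url = listing_url if page_number == 1 else listing_url + "page/" + str(page_number) + "/"
        response = requests.get(page_url, timeout=5)
        if response.status_code != 200:
            break  # assume a non-200 answer means there is no such page
        soup = BeautifulSoup(response.text, "lxml")
        # hypothetical markup: each claim is an <article> whose first link points to the claim page
        for article in soup.find_all("article"):
            link = article.find("a", href=True)
            if link:
                urls.append(link["href"])
        if len(urls) >= max_claims:
            break
    return urls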
Example #3
    def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
        count = 26
        base_url = "https://www.washingtonpost.com/news/fact-checker/page/"
        result = caching.get(base_url + str(count + 1), headers=self.headers, timeout=10)
        if result:
            while result:
                count += 1
                result = caching.get(base_url + str(count), headers=self.headers, timeout=10)
        else:
            count -= 1

        return count

    def retrieve_listing_page_urls(self) -> List[str]:
        links = []
        different_categories_value = ["disinformation-cases"]
        url_begins = [
            "https://euvsdisinfo.eu/", "https://euvsdisinfo.eu/ru/",
            "https://euvsdisinfo.eu/it/", "https://euvsdisinfo.eu/es/",
            "https://euvsdisinfo.eu/fr/", "https://euvsdisinfo.eu/de/"
        ]

        for url in url_begins:
            for value in different_categories_value:
                #different_urls.append(url + value + "/")
                # data = caching.get(f""+ url + value + "/")
                data = caching.get(
                    "https://euvsdisinfo.eu/disinformation-cases/",
                    headers=self.headers,
                    timeout=15)
                soup = BeautifulSoup(data, 'html.parser')
                nb = self.find_page_count(soup)
                for x in range(0, int(nb / 10)):
                    links.append(url + value + '/?offset=' + str(x * 10))

        # data = caching.get('https://euvsdisinfo.eu/disinformation-cases')
        # soup = BeautifulSoup(data, 'html.parser')
        # nb = self.find_page_count(soup)
        # links = []
        # for x in range(0, int(nb/10)):
        #     links.append('https://euvsdisinfo.eu/disinformation-cases/?offset='+str(x*10))
        return links
 def retrieve_listing_page_urls(self) -> List[str]:
     data = caching.get('https://euvsdisinfo.eu/disinformation-cases')
     soup = BeautifulSoup(data, 'html.parser')
     nb = self.find_page_count(soup)
     links = []
     for x in range(0, int(nb/10)):
         links.append('https://euvsdisinfo.eu/disinformation-cases/?offset='+str(x*10))
     return links
Example #6
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     urls = []
     offset = 1
     links = caching.get(
         f"https://loadmore.aap.com.au/category?category=6&postOffset={offset}&perPage=100"
     )
     offset = 100
     while links != "[]":
         parsed_json = json.loads(links)
         for link in parsed_json:
             urls.append(link['link'])
         links = caching.get(
             f"https://loadmore.aap.com.au/category?category=6&postOffset={offset}&perPage=100"
         )
         offset += 100
     return urls
 def retrieve_listing_page_urls(self) -> List[str]:
     listings_url = "https://www.politifact.com/truth-o-meter/rulings/"
     page = caching.get(listings_url, headers=self.headers, timeout=5)
     parsed = BeautifulSoup(page, "lxml")
     main_tag = parsed.find("main",
                            {"class": "main"})  # type: BeautifulSoup
     links = main_tag.find_all("a", href=True)
     return ["http://www.politifact.com" + link['href'] for link in links]
Example #8
    def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
            -> List[str]:
        query_url = "https://www.newtral.es/wp-json/wp/v2/posts?per_page=100&offset={offset}&categories=1" + \
                    "&exclude=80729%2C79970%2C78262%2C78455%2C77275%2C77315%2C77161%2C76907%2C76298" + \
                    "%2C75434%2C74706%2C74103%2C74062&_locale=user"

        urls = []

        json_output = caching.get(query_url.format(offset=0), headers=self.headers, timeout=5)
        offset = 0

        while json_output.strip() != "[]":
            pages = json.loads(json_output)
            for page in pages:
                urls.append(page['link'])
            offset += 100
            json_output = caching.get(query_url.format(offset=offset), headers=self.headers, timeout=5)
        return urls
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     urls = self.extract_urls(parsed_listing_page)
     for page_number in tqdm(range(2, number_of_pages)):
         url = "http://factscan.ca/page/" + str(page_number) + "/"
         page = caching.get(url, headers=self.headers, timeout=5)
         current_parsed_listing_page = BeautifulSoup(page, "lxml")
         urls += self.extract_urls(current_parsed_listing_page)
     return urls
Example #10
    def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
        count = 5
        url = "https://www.polygraph.info/z/20382?p=" + str(count + 1)
        result = caching.get(url, headers=self.headers, timeout=10)
        if result:
            while result:
                count += 1
                url = "https://www.polygraph.info/z/20382?p=" + str(count)
                result = caching.get(url, headers=self.headers, timeout=10)
                if result:
                    parsed = BeautifulSoup(result, self.configuration.parser_engine)
                    articles = parsed.findAll("li", {"class": "fc__item"})
                    if not articles:
                        break
        else:
            count -= 1

        return count - 1
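Several of the find_page_count implementations probe for the last page by requesting successive page numbers until a request fails or the page no longer lists any articles. A compact sketch of that linear probing, assuming a generic base_url and a hypothetical item_selector:

import requests
from bs4 import BeautifulSoup

def probe_page_count(base_url, start=1, item_selector="li.fc__item", max_probe=1000):
    """Increase the page number until a page stops responding or lists no items (sketch)."""
    count = start
    while count < max_probe:
        response = requests.get(base_url + str(count + 1), timeout=10)
        if response.status_code != 200:
            break
        parsed = BeautifulSoup(response.text, "lxml")
        if not parsed.select(item_selector):   # the page exists but lists nothing
            break
        count += 1
    return count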
Example #11
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     urls = [] #self.extract_urls(parsed_listing_page)
     for page_number in tqdm(range(1, number_of_pages)):
         url = "https://www.truthorfiction.com/category/fact-checks/page/" + str(page_number) + "/"
         page = caching.get(url, headers=self.headers, timeout=20)
         current_parsed_listing_page = BeautifulSoup(page, "lxml")
         urls += self.extract_urls(current_parsed_listing_page)
     return urls
    def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
        count = 26
        url = "https://checkyourfact.com/page/" + str(count + 1)
        result = caching.get(url, headers=self.headers, timeout=10)
        if result:
            while result:
                count += 1
                url = "https://checkyourfact.com/page/" + str(count)
                result = caching.get(url, headers=self.headers, timeout=10)
                if result:
                    parsed = BeautifulSoup(result,
                                           self.configuration.parser_engine)
                    articles = parsed.find("articles").findAll("article")
                    if not articles or len(articles) == 0:
                        break
        else:
            count -= 1

        return count
Example #13
 def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
     page_nav = parsed_listing_page.find("div", {"class": "nav-previous"})
     last_page_link = page_nav.findAll("a")[0]['href']
     page_re = re.compile("https://www.truthorfiction.com/category/fact-checks/page/([0-9]+)/")
     max_page = int(page_re.match(last_page_link).group(1))
     if (max_page >= 2) and ((max_page*10) <= self.configuration.maxClaims):
         page = caching.get(last_page_link, headers=self.headers, timeout=5)
         if page:
             parsed_listing_page = BeautifulSoup(page, self.configuration.parser_engine)
             max_page = self.find_page_count(parsed_listing_page)
     return max_page
Example #14
    def retrieve_urls(self, parsed_listing_page: BeautifulSoup,
                      listing_page_url: str,
                      number_of_pages: int) -> List[str]:
        urls = self.extract_urls(parsed_listing_page)
        for page_number in trange(1, number_of_pages):
            url = listing_page_url + "?page=" + str(int(page_number))
            page = caching.get(url, headers=self.headers, timeout=20)
            current_parsed_listing_page = BeautifulSoup(page, "lxml")
            urls += self.extract_urls(current_parsed_listing_page)

        return urls
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     urls = self.extract_urls(parsed_listing_page)
     for page_number in tqdm(range(2, number_of_pages)):
         if 0 < self.configuration.maxClaims < len(urls):
             break
         url = listing_page_url + "/page/" + str(page_number)
         page = caching.get(url, headers=self.headers, timeout=5)
         current_parsed_listing_page = BeautifulSoup(page, "lxml")
         urls = urls + self.extract_urls(current_parsed_listing_page)
     return urls
Example #16
    def find_page_count(self, parsed_listing_page: BeautifulSoup) -> int:
        next_link = parsed_listing_page.find("a", {"class": "btn-next btn"})['href']
        next_page_contents = caching.get(next_link, headers=self.headers, timeout=5)
        next_page = BeautifulSoup(next_page_contents, "lxml")

        # Title format: 'Fact Checks Archive | Page 2 of 1069 | Snopes.com'
        title_text = next_page.find("title").text
        max_page_pattern = re.compile("Page [0-9]+ of ([0-9]+)")
        result = max_page_pattern.match(title_text.split("|")[1].strip())
        max_page = int(result.group(1))
        return max_page
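The Snopes variant above reads the total page count directly from the <title> of the next listing page, which follows the pattern 'Fact Checks Archive | Page 2 of 1069 | Snopes.com'. A short worked example of that regex step, using the title quoted in the comment:

import re

title_text = "Fact Checks Archive | Page 2 of 1069 | Snopes.com"
max_page_pattern = re.compile(r"Page [0-9]+ of ([0-9]+)")
result = max_page_pattern.match(title_text.split("|")[1].strip())
print(int(result.group(1)))   # prints 1069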
Example #17
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     urls = self.extract_urls(parsed_listing_page)
     page_number = 2
     while True:
         url = listing_page_url + "?page=" + str(page_number)
         page = caching.get(url, headers=self.headers, timeout=5)
         if not page:
             break
         current_parsed_listing_page = BeautifulSoup(page, "lxml")
         urls += self.extract_urls(current_parsed_listing_page)
         page_number += 1
     return urls
Example #18
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     urls = []
     offset = 1
     links = caching.get(
         f"https://loadmore.aap.com.au/category?category=6&postOffset={offset}&perPage=100"
     )
     offset = 100
     tmp_counter = 0
     while links != "[]" and tmp_counter < self.configuration.maxClaims:
         parsed_json = json.loads(links)
         for link in parsed_json:
             tmp_counter += 1
             urls.append(link['link'])
             if (self.configuration.maxClaims <= tmp_counter):
                 break
         links = caching.get(
             f"https://loadmore.aap.com.au/category?category=6&postOffset={offset}&perPage=100"
         )
         offset += 100
     return urls
Example #19
 def get(self, url):
     """ @return the webpage """
     headers = {
         'User-Agent':
         'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
     }
     html = caching.get(url, headers=headers)
     soup = BeautifulSoup(html, 'lxml')
     # removing some useless tags
     for s in soup.select("script, iframe, head, header, footer, style"):
         s.extract()
     return soup
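The helper above strips script, iframe, head, header, footer and style tags before returning the soup, so that later text extraction only sees article content. A standalone sketch of the same cleanup with requests; the example.org URL is a placeholder.

import requests
from bs4 import BeautifulSoup

def fetch_clean_soup(url):
    """Fetch a page and remove non-content tags before text extraction (sketch)."""
    headers = {"User-Agent": "Mozilla/5.0"}
    html = requests.get(url, headers=headers, timeout=10).text
    soup = BeautifulSoup(html, "lxml")
    for tag in soup.select("script, iframe, head, header, footer, style"):
        tag.extract()   # same cleanup as the helper above; tag.decompose() would also work
    return soup

# text = fetch_clean_soup("https://example.org/some-fact-check").get_text(" ", strip=True)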
Example #20
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     urls = self.extract_urls(parsed_listing_page)
     print(number_of_pages)
     for page_number in tqdm(range(0, number_of_pages)):
         url = "https://africacheck.org/fact-checks?field_article_type_value=reports&field_rated_value=All&field_country_value=All&sort_bef_combine=created_DESC&sort_by=created&sort_order=DESC&page=" + str(
             page_number) + "/"
         page = caching.get(url, headers=self.headers, timeout=5)
         #print(url)
         current_parsed_listing_page = BeautifulSoup(page, "lxml")
         #print(current_parsed_listing_page)
         urls += self.extract_urls(current_parsed_listing_page)
     return urls
Example #21
    def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
            -> List[str]:
        urls = self.extract_urls(parsed_listing_page)

        for page_number in tqdm(range(2, number_of_pages)):
            url = "https://checkyourfact.com/page/" + str(page_number) + "/"
            page = caching.get(url, headers=self.headers, timeout=5)
            if page:
                current_parsed_listing_page = BeautifulSoup(page, "lxml")
                urls += self.extract_urls(current_parsed_listing_page)
            else:
                break
        print(urls)
        return urls
Example #22
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     urls = self.extract_urls(parsed_listing_page)
     for page_number in tqdm(range(0, number_of_pages)):
         # each page lists 9 articles:
         if (page_number * 9) + 18 >= self.configuration.maxClaims:
             break
         #url = "https://africacheck.org/latest-reports/page/" + str(page_number) + "/"
         url = "https://africacheck.org/search?rt_bef_combine=created_DESC&sort_by=created&sort_order=DESC&page=" + str(
             page_number)
         page = caching.get(url, headers=self.headers, timeout=5)
         current_parsed_listing_page = BeautifulSoup(page, "lxml")
         urls += self.extract_urls(current_parsed_listing_page)
     return urls
Example #23
 def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
         -> List[str]:
     # claim links from the first listing page
     urls = self.extract_urls(parsed_listing_page)
     # iterate from page 2 to the last page
     for page_number in tqdm(range(2, number_of_pages + 1)):
         url = "https://www.polygraph.info/z/20382?p=" + str(page_number) + "/"
         # load from cache (download if not already cached, otherwise load)
         page = caching.get(url, headers=self.headers, timeout=5)
         if page:
             # parse the page with BeautifulSoup
             current_parsed_listing_page = BeautifulSoup(page, "lxml")
             # extract the links from this page and append them to urls
             urls += self.extract_urls(current_parsed_listing_page)
         else:
             break
     return urls
    def retrieve_urls(self, parsed_listing_page: BeautifulSoup, listing_page_url: str, number_of_pages: int) \
            -> List[str]:
        urls = self.extract_urls(parsed_listing_page)
        page_number = 2
        while (page_number * 30) <= self.configuration.maxClaims:
            url = listing_page_url + "?page=" + str(page_number)
            page = caching.get(url, headers=self.headers, timeout=5)
            if page is not None:
                current_parsed_listing_page = BeautifulSoup(page, "lxml")
            else:
                break

            nav_buttons = current_parsed_listing_page.find_all("section", attrs={'class': 't-row'})
            nav_buttons = nav_buttons[-1].find_all("li", attrs={'class': 'm-list__item'})

            if len(nav_buttons) == 1:
                break
            else:
                urls += self.extract_urls(current_parsed_listing_page)
            page_number += 1
            #print("\rr: " + url)
        return urls 
 def find_last_page(self):  # returns the last page that lists articles
     page = 80  #86
     count = 32
     lim = -1
     # Binary search (dichotomy) on the page number
     while count >= 1:
         url = "https://factuel.afp.com/?page=" + str(int(page))
         result = caching.get(url, headers=self.headers, timeout=10)
         parsed = BeautifulSoup(result, self.configuration.parser_engine)
         article = parsed.findAll("article")
         if lim > 0:
             count = count / 2
         if (len(article) != 0):
             if count < 1:
                 return int(page)
             page = page + count
         else:
             if lim == -1:
                 lim = page
                 count = count / 2
             elif count < 1:
                 return int(page - 1)
             page = page - count
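The find_last_page method above narrows in on the last listing page with a coarse dichotomy around an initial guess. A cleaner sketch of the same idea: grow the upper bound exponentially until a page comes back without articles, then bisect between the last page known to have articles and the first one without. The page_has_articles check and the factuel.afp.com URL pattern are taken from the example; the rest is an assumption.

import requests
from bs4 import BeautifulSoup

def page_has_articles(page_number):
    """True if the listing page at this number still contains <article> elements (sketch)."""
    url = "https://factuel.afp.com/?page=" + str(page_number)
    response = requests.get(url, timeout=10)
    if response.status_code != 200:
        return False
    return BeautifulSoup(response.text, "lxml").find("article") is not None

def find_last_page():
    """Bracket the last page by exponential growth, then binary search inside the bracket."""
    low, high = 1, 2
    while page_has_articles(high):   # grow the bracket: 2, 4, 8, ...
        low, high = high, high * 2
    while low + 1 < high:            # invariant: low has articles, high does not
        mid = (low + high) // 2
        if page_has_articles(mid):
            low = mid
        else:
            high = mid
    return low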