Example no. 1
    def _keyword_search(
        self,
        keyword: str,
        starting_date: str,
        ending_date: str,
    ) -> list[PublicoNews]:

        # Normalize keyword
        keyword = keyword.lower()
        # Start page number
        page_number = 1
        # Create news URL list
        collected_news_urls = []

        # Parse `starting_date`
        starting_date = datetime_from_string(starting_date, order="YMD")
        # Parse `ending_date`
        ending_date = datetime_from_string(ending_date, order="YMD")

        while (
            response := requests.get(
                "https://www.publico.pt/api/list/search/"
                f"?query={keyword}"
                f"&start={starting_date.strftime('%d-%m-%Y')}"
                f"&end={ending_date.strftime('%d-%m-%Y')}"
                f"&page={page_number}"
            ).text
        ) != "[]":
            # Read the json data
            data = json.loads(response)
            # Get the URLs (this search type needs fullUrl)
            urls = [d.get("fullUrl") for d in data]
            # Append URLs to list
            collected_news_urls += urls
            # Increment page
            page_number += 1
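Both search helpers rely on a `datetime_from_string` utility that turns a user-supplied date string into a `datetime` according to a field-order hint ("YMD", "DMY", ...). Its implementation is not shown in these examples; a minimal sketch of such a helper, assuming it simply wraps `dateutil`'s parser, could look like this:

    from dateutil import parser

    def datetime_from_string(date_string: str, order: str = "YMD"):
        # Hypothetical helper: map the order hint onto dateutil's
        # yearfirst/dayfirst flags and return a datetime object.
        return parser.parse(
            date_string,
            yearfirst=order.startswith("Y"),
            dayfirst=order.startswith("D"),
        )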
Example no. 2
    def _tag_search(
        self,
        tag: str,
        starting_date: str,
        ending_date: str,
    ) -> list[PublicoNews]:

        # Normalize tag
        tag = tag.replace(" ", "-").lower()
        # Start page number
        page_number = 1
        # Flag to stop the search
        stop_entire_search = False
        # Create news URL list
        collected_news_urls = []

        # Parse `starting_date`
        starting_date = datetime_from_string(starting_date, order="YMD").date()
        # Parse `ending_date`
        ending_date = datetime_from_string(ending_date, order="YMD").date()

        while (
            response := requests.get(
                f"https://www.publico.pt/api/list/{tag}?page={page_number}"
            ).text
        ) != "[]":
            # Read the json data
            data = json.loads(response)
            # iterate over each news dict
            for item in data:
                # If news is older than the lower bound date, stop the search
                if (
                    datetime_from_string(
                        item.get("data"),
                        order="YMD",
                    ).date()
                    < starting_date
                ):
                    stop_entire_search = True  # Will break main loop
                    break  # Will break current loop

                # If news is more recent than the end date, skip ahead
                elif (
                    datetime_from_string(
                        item.get("data"),
                        order="YMD",
                    ).date()
                    > ending_date
                ):
                    continue

                # If news is inside the date range, collect the URL
                else:
                    collected_news_urls.append(item.get("shareUrl"))
            if stop_entire_search:
                break
            # Increment page
            page_number += 1
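The `_tag_search` loop depends on Publico's tag listing being ordered from newest to oldest: items newer than `ending_date` are skipped, the first item older than `starting_date` ends the whole search, and everything in between is collected. The same windowing logic, isolated into a self-contained sketch with illustrative names and data:

    from datetime import date

    def collect_window(items, starting_date, ending_date):
        # `items` is a newest-first list of (publication_date, url) pairs
        collected = []
        for item_date, url in items:
            if item_date < starting_date:
                break      # older than the window: nothing left to collect
            if item_date > ending_date:
                continue   # newer than the window: skip ahead
            collected.append(url)
        return collected

    # Illustrative data: only the middle item falls inside the window
    items = [
        (date(2021, 2, 1), "https://example.org/newest"),
        (date(2021, 1, 15), "https://example.org/in-range"),
        (date(2020, 12, 1), "https://example.org/too-old"),
    ]
    assert collect_window(items, date(2021, 1, 1), date(2021, 1, 31)) == [
        "https://example.org/in-range"
    ]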
Example no. 3
    def test_correct_job_response(self):
        """
        Tests that the correct elements are returned when
        fetching job results.
        """
        response = self.api.post(
            reverse("publico_url_search"),
            {
                "urls": [
                    "https://www.publico.pt/2021/01/31/economia/noticia/irs-contribuintes-podem-validar-agregado-familiar-ate-15-fevereiro-1948701"
                ],
            },
        )

        # Assert that a `job_id` is returned
        self.assertIn("job_id", response.data)

        # Assert that a `results_url` is returned
        self.assertIn("results_url", response.data)

        # Dispatch the worker so the queued job runs synchronously (burst mode)
        get_worker().work(burst=True)
        # Now make the request to get the results
        response = self.api.get(response.data["results_url"])

        self.assertIn("number_of_news", response.data)
        self.assertIn("date", response.data)
        # Check that the returned date is (almost) equal to now,
        # allowing at most 1 second of difference
        self.assertTrue(
            abs(now() - datetime_from_string(response.data["date"], order="YMD"))
            < datetime.timedelta(seconds=1)
        )
        self.assertIn("news", response.data)
Example no. 4
    def test_keyword_search_job(self):
        """
        Tests enqueuing a Publico keyword search job and retrieving the resulting news.
        """
        start_date = "2020-3-1"
        end_date = "2020-3-15"
        response = self.api.post(
            reverse("publico_keyword_search"),
            {
                "keywords": ["luanda leaks"],
                "starting_date": start_date,
                "ending_date": end_date,
            },
            format="json",
        )

        self.assertEqual(
            response.status_code,
            status.HTTP_200_OK,
        )
        # Assert that a `job_id` is returned
        self.assertIn("job_id", response.data)

        # Assert that a `results_url` is returned
        self.assertIn("results_url", response.data)

        # Make the worker dispatch all jobs in sync mode
        get_worker().work(burst=True)

        # Now make the request to get the results
        response = self.api.get(response.data["results_url"])

        # Assert that response is status code 200
        self.assertEqual(
            response.status_code,
            status.HTTP_200_OK,
        )

        # Number of news should be in response
        self.assertIn("number_of_news", response.data)

        # Number of news should be 5
        self.assertEqual(response.data["number_of_news"], 5)

        # Get found_news
        found_news = response.data["news"]

        # Number of news in the list should be 5 (re-check)
        self.assertEqual(len(found_news), 5)

        for news in found_news:
            # Check if news is well constructed
            self.assertIn("title", news)
            self.assertTrue(isinstance(news["title"], str))

            self.assertIn("description", news)
            self.assertTrue(isinstance(news["description"], str))

            self.assertIn("url", news)
            self.assertTrue(isinstance(news["url"], str))

            self.assertIn("rubric", news)
            self.assertTrue(isinstance(news["rubric"], str))

            self.assertIn("date", news)
            self.assertTrue(isinstance(news["date"], str))

            self.assertIn("authors", news)
            self.assertTrue(isinstance(news["authors"], list))

            self.assertIn("is_opinion", news)
            self.assertTrue(isinstance(news["is_opinion"], bool))

            self.assertIn("text", news)
            self.assertTrue(isinstance(news["text"], str))

            # Check that date is inside bound
            self.assertTrue(
                datetime_from_string(start_date, order="YMD").date() <=
                datetime_from_string(news["date"], order="YMD").date() <=
                datetime_from_string(end_date, order="YMD").date(),
                msg="News out of expected date range",
            )
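Taken together, the assertions above imply the shape of the two responses: the POST returns a job handle, and the results endpoint returns the collected news. An illustrative (not literal) pair of payloads, reconstructed only from what the test checks:

    # Illustrative payloads; field values are placeholders
    job_response = {
        "job_id": "…",
        "results_url": "…",
    }
    results_response = {
        "number_of_news": 5,
        "date": "…",
        "news": [
            {
                "title": "…", "description": "…", "url": "…", "rubric": "…",
                "date": "…", "authors": [], "is_opinion": False, "text": "…",
            },
        ],
    }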
Example no. 5
    def from_html_string(self, html_string: str) -> News:
        """
        Builds a News object from a given HTML string.

        Parameters
        ----------
        html_string : str
            The HTML string of a news page

        Returns
        -------
        News
            The built News object

        Raises
        ------
        UnsupportedNews
            If news is one of the following types: "interativo", "multimedia", "perguntas"
        """

        # Build HTML tree
        tree = html.fromstring(html_string)

        # Extract URL
        try:
            url = tree.xpath("//meta[@property='og:url']")[0].get("content")
        except IndexError:
            raise UnsupportedNews

        # If news is of type 'interativo', 'multimedia' or 'perguntas' raise exception
        if any(
            x in url
            for x in [
                "interativo",
                "multimedia",
                "perguntas",
            ]
        ):
            raise UnsupportedNews(url)

        try:
            # Get news section from url path and capitalize it
            rubric = urlparse(url).path.split("/")[1].capitalize()
        except IndexError:
            raise UnsupportedNews(url)

        # Determine from the rubric whether the news is an opinion article
        is_opinion = rubric == "Opiniao"

        # CM has sub-journals with different HTML layouts (e.g. Vidas - www.vidas.pt),
        # so each sub-journal needs its own web-scraping routine
        parsed_url_netloc = urlparse(url).netloc
        if parsed_url_netloc == "www.cmjornal.pt":
            parse_func = self._parse_cm_news_info
        elif parsed_url_netloc == "www.vidas.pt":
            parse_func = self._parse_vidas_news_info
        else:
            raise UnsupportedNews(
                f"Unknow news URL netloc: {parsed_url_netloc}"
            )

        # Call the correct method for finding
        # `text`, `description` and `date` elements
        (
            text,
            description,
            published_at,
            authors,
        ) = parse_func(tree, is_opinion)

        # Date must be parsed
        published_at = datetime_from_string(published_at)

        # Remove ads in case they exist
        text = text.split("Para aceder a todos os Exclusivos CM")[0].split(
            "Ler o artigo completo"
        )[0]
        # CM text contains extra whitespace as well as carriage returns
        text = " ".join(text.split())
        # Find title
        title = tree.xpath("//div[@class='centro']//h1//text()")[0]

        return News(
            title,
            description,
            url,
            rubric,
            published_at,
            authors,
            is_opinion,
            text,
        )
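The XPath lookups used above (the `og:url` meta tag and the headline inside `div.centro`) can be exercised in isolation. A self-contained toy example with made-up HTML, just to illustrate the lxml calls:

    from lxml import html

    sample = (
        "<html><head>"
        "<meta property='og:url' content='https://www.cmjornal.pt/economia/detalhe/exemplo'/>"
        "</head><body>"
        "<div class='centro'><h1>Example headline</h1></div>"
        "</body></html>"
    )

    tree = html.fromstring(sample)
    url = tree.xpath("//meta[@property='og:url']")[0].get("content")
    title = tree.xpath("//div[@class='centro']//h1//text()")[0]
    print(url)    # https://www.cmjornal.pt/economia/detalhe/exemplo
    print(title)  # Example headline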