def _keyword_search(
    self,
    keyword: str,
    starting_date: str,
    ending_date: str,
) -> list[PublicoNews]:
    # Normalize keyword
    keyword = keyword.lower()

    # Start page number
    page_number = 1

    # Create news URL list
    collected_news_urls = []

    # Parse `starting_date`
    starting_date = datetime_from_string(starting_date, order="YMD")

    # Parse `ending_date`
    ending_date = datetime_from_string(ending_date, order="YMD")

    while (
        response := requests.get(
            f"https://www.publico.pt/api/list/search/?query={keyword}&start={starting_date.strftime('%d-%m-%Y')}&end={ending_date.strftime('%d-%m-%Y')}&page={page_number}"
        ).text
    ) != "[]":
        # Read the JSON data
        data = json.loads(response)

        # Get the URLs (this search type needs `fullUrl`)
        urls = [d.get("fullUrl") for d in data]

        # Append URLs to list
        collected_news_urls += urls

        # Increment page
        page_number += 1
def _tag_search(
    self,
    tag: str,
    starting_date: str,
    ending_date: str,
) -> list[PublicoNews]:
    # Normalize tag
    tag = tag.replace(" ", "-").lower()

    # Start page number
    page_number = 1

    # Flag to stop the search
    stop_entire_search = False

    # Create news URL list
    collected_news_urls = []

    # Parse `starting_date`
    starting_date = datetime_from_string(starting_date, order="YMD").date()

    # Parse `ending_date`
    ending_date = datetime_from_string(ending_date, order="YMD").date()

    while (
        response := requests.get(
            f"https://www.publico.pt/api/list/{tag}?page={page_number}"
        ).text
    ) != "[]":
        # Read the JSON data
        data = json.loads(response)

        # Iterate over each news dict
        for item in data:
            # If the news is older than the lower-bound date, stop the search
            if (
                datetime_from_string(
                    item.get("data"),
                    order="YMD",
                ).date()
                < starting_date
            ):
                stop_entire_search = True  # Will break the main loop
                break  # Breaks the current loop
            # If the news is more recent than the end date, skip ahead
            elif (
                datetime_from_string(
                    item.get("data"),
                    order="YMD",
                ).date()
                > ending_date
            ):
                continue
            # If the news is inside the date range, collect the URL
            else:
                collected_news_urls.append(item.get("shareUrl"))

        if stop_entire_search:
            break

        # Increment page
        page_number += 1
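# Both search helpers above rely on a `datetime_from_string` utility that is
# not shown in this section. Below is a minimal sketch of what such a helper
# could look like, assuming it wraps `dateutil.parser.parse` and maps
# order="YMD" to year-first parsing; this is an illustration of the assumed
# behavior, not the project's actual implementation.
from dateutil import parser


def datetime_from_string(date_string: str, order: str = "YMD"):
    # order="YMD" prefers year-first parsing (e.g. "2020-3-1" -> 2020-03-01),
    # while order="DMY" prefers day-first parsing (e.g. "15-03-2020")
    return parser.parse(
        date_string,
        yearfirst=(order == "YMD"),
        dayfirst=(order == "DMY"),
    )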
def test_correct_job_response(self):
    """
    Tests that the correct elements are returned when fetching job results.
    """
    response = self.api.post(
        reverse("publico_url_search"),
        {
            "urls": [
                "https://www.publico.pt/2021/01/31/economia/noticia/irs-contribuintes-podem-validar-agregado-familiar-ate-15-fevereiro-1948701"
            ],
        },
    )

    # Assert that a `job_id` is returned
    self.assertIn("job_id", response.data)

    # Assert that a `results_url` is returned
    self.assertIn("results_url", response.data)

    # Dispatch the worker so that the job gets done in sync (burst) mode
    get_worker().work(burst=True)

    # Now make the request to get the results
    response = self.api.get(response.data["results_url"])

    self.assertIn("number_of_news", response.data)
    self.assertIn("date", response.data)

    # Check that the date is (almost) equal to now (at most 1 second apart)
    self.assertTrue(
        abs(
            now()
            - datetime_from_string(
                response.data["date"],
                order="YMD",
            )
        )
        < datetime.timedelta(seconds=1)
    )
    self.assertIn("news", response.data)
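# The tests in this section assume an RQ worker run in burst mode so queued
# jobs execute synchronously before the results are fetched. Here is a sketch
# of plausible imports for such a test module, inferred from the calls in
# these tests; the exact module paths are assumptions, not confirmed project
# imports.
import datetime

from django.urls import reverse
from django.utils.timezone import now
from django_rq import get_worker
from rest_framework import status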
def test_keyword_search_job(self):
    """
    Tests enqueuing a Publico keyword search job and retrieving news from the job.
    """
    start_date = "2020-3-1"
    end_date = "2020-3-15"
    response = self.api.post(
        reverse("publico_keyword_search"),
        {
            "keywords": ["luanda leaks"],
            "starting_date": start_date,
            "ending_date": end_date,
        },
        format="json",
    )
    self.assertEqual(
        response.status_code,
        status.HTTP_200_OK,
    )

    # Assert that a `job_id` is returned
    self.assertIn("job_id", response.data)

    # Assert that a `results_url` is returned
    self.assertIn("results_url", response.data)

    # Make the worker dispatch all jobs in sync (burst) mode
    get_worker().work(burst=True)

    # Now make the request to get the results
    response = self.api.get(response.data["results_url"])

    # Assert that the response has status code 200
    self.assertEqual(
        response.status_code,
        status.HTTP_200_OK,
    )

    # Number of news should be in the response
    self.assertIn("number_of_news", response.data)

    # Number of news should be 5
    self.assertEqual(response.data["number_of_news"], 5)

    # Get the found news
    found_news = response.data["news"]

    # Number of news in the list should also be 5 (re-check)
    self.assertEqual(len(found_news), 5)

    for news in found_news:
        # Check that each news item is well constructed
        self.assertIn("title", news)
        self.assertTrue(isinstance(news["title"], str))
        self.assertIn("description", news)
        self.assertTrue(isinstance(news["description"], str))
        self.assertIn("url", news)
        self.assertTrue(isinstance(news["url"], str))
        self.assertIn("rubric", news)
        self.assertTrue(isinstance(news["rubric"], str))
        self.assertIn("date", news)
        self.assertTrue(isinstance(news["date"], str))
        self.assertIn("authors", news)
        self.assertTrue(isinstance(news["authors"], list))
        self.assertIn("is_opinion", news)
        self.assertTrue(isinstance(news["is_opinion"], bool))
        self.assertIn("text", news)
        self.assertTrue(isinstance(news["text"], str))
        # Check that the date is inside the expected bounds
        self.assertTrue(
            datetime_from_string(start_date, order="YMD").date()
            <= datetime_from_string(news["date"], order="YMD").date()
            <= datetime_from_string(end_date, order="YMD").date(),
            msg="News out of expected date range",
        )
def from_html_string(self, html_string: str) -> News:
    """
    Builds a News object from a given HTML string.

    Parameters
    ----------
    html_string : str
        A news page's HTML as a string

    Returns
    -------
    News
        The built News object

    Raises
    ------
    UnsupportedNews
        If the news is one of the following types: "interativo", "multimedia", "perguntas"
    """
    # Build the HTML tree
    tree = html.fromstring(html_string)

    # Extract the URL
    try:
        url = tree.xpath("//meta[@property='og:url']")[0].get("content")
    except IndexError:
        raise UnsupportedNews

    # If the news is of type 'interativo', 'multimedia' or 'perguntas', raise an exception
    if any(
        x in url
        for x in [
            "interativo",
            "multimedia",
            "perguntas",
        ]
    ):
        raise UnsupportedNews(url)

    try:
        # Get the news section from the URL path and capitalize it
        rubric = urlparse(url).path.split("/")[1].capitalize()
    except IndexError:
        raise UnsupportedNews(url)

    # Determine from the rubric whether the news is an opinion article
    is_opinion = rubric == "Opiniao"

    # CM has sub-journals with different HTML layouts (e.g. Vidas - www.vidas.pt);
    # each sub-journal needs custom web scraping
    parsed_url_netloc = urlparse(url).netloc
    if parsed_url_netloc == "www.cmjornal.pt":
        parse_func = self._parse_cm_news_info
    elif parsed_url_netloc == "www.vidas.pt":
        parse_func = self._parse_vidas_news_info
    else:
        raise UnsupportedNews(
            f"Unknown news URL netloc: {parsed_url_netloc}"
        )

    # Call the correct method for finding the
    # `text`, `description` and `date` elements
    (
        text,
        description,
        published_at,
        authors,
    ) = parse_func(tree, is_opinion)

    # The date must be parsed
    published_at = datetime_from_string(published_at)

    # Remove ads in case they exist
    text = text.split("Para aceder a todos os Exclusivos CM")[0].split(
        "Ler o artigo completo"
    )[0]

    # CM text contains extra whitespace, as well as carriage returns
    text = " ".join(text.split())

    # Find the title
    title = tree.xpath("//div[@class='centro']//h1//text()")[0]

    return News(
        title,
        description,
        url,
        rubric,
        published_at,
        authors,
        is_opinion,
        text,
    )
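# A hypothetical usage sketch for the factory method above: fetch a CM news
# page and build a News object from its HTML. `CMNewsFactory` and the URL are
# illustrative placeholders, not confirmed project names, and the attribute
# access on `News` assumes attributes mirroring the constructor arguments.
import requests

factory = CMNewsFactory()
response = requests.get("https://www.cmjornal.pt/...")  # placeholder URL
try:
    news = factory.from_html_string(response.text)
    print(news.title, news.rubric, news.published_at)
except UnsupportedNews:
    print("News type not supported by this scraper")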