def initiatives(self) -> Generator[InitiativeImport, None, None]:
    """Yield an InitiativeImport stub for every initiative link found on the
    paginated listing pages.

    Walks pages 1..99 of the configured list endpoint and stops early when a
    page fails to parse (``TreeParser.tree is None``).

    Raises:
        ScrapeException: wrapping any error raised while loading the listing.
    """
    try:
        for page in range(1, 100):
            page_url = self.config.list_endpoint + f"&page={page}"
            # schemas: defines fields to be scraped
            # schema: fieldname:{xpath,all,cast,transform}
            schemas = {
                'initiatives': {
                    'xpath': '//a[@href and contains(@class, "postpreview-content")]',
                    'all': True,
                    # bound method used directly as the transform callable
                    'transform': self.find_initiative_links,
                }
            }
            # TreeParser fetches the url and exposes the parsed html tree
            parser = TreeParser(page_url, None, schemas)
            if parser.tree is None:
                # presumably an empty / missing page marks the end of the listing
                break
            scraped = parser.apply_schemas()
            for link in scraped['initiatives']:
                yield InitiativeImport(source_uri=link[0])
    except Exception as ex:
        raise ScrapeException("Error loading list of initiatives") from ex
def test_should_handle_scrape_exception(self):
    """The batch must be marked FAILED when listing the initiatives raises."""
    failing_listing = MagicMock(
        side_effect=ScrapeException("Failed loading the list"))
    self.pf_source_mock.initiatives = failing_listing
    self.scraper.scrape()
    batch = self.scraper.get_current_batch()
    assert batch.state == BatchImportState.FAILED
def test_should_log_listing_exception(self):
    """A failure while listing initiatives must be reported via logger.exception."""
    self.pf_source_mock.initiatives = MagicMock(
        side_effect=ScrapeException("Failed loading the list"))
    self.scraper.scrape()
    expected_message = "Error while reading list of initiatives"
    self.logger_mock.exception.assert_called_once_with(expected_message)
def test_scrape_collection_exception(self):
    """scrape() must not propagate a ScrapeException raised while completing
    a single item.

    NOTE(review): the original method was named ``scrape_collection_exception``
    (no ``test_`` prefix), so neither pytest nor unittest ever collected it —
    the test was dead code. Renamed so discovery picks it up; test methods are
    only invoked by the runner, so nothing else references the old name.
    """
    self.pf_source_mock.initiatives = MagicMock(
        return_value=iter([InitiativeImport(source_uri="test/123")]))
    self.pf_source_mock.complete = Mock(
        side_effect=ScrapeException("Test"))
    # Must return normally: per-item failures are handled inside scrape().
    self.scraper.scrape()
def test_should_log_item_exception(self):
    """A per-item scrape failure must be logged with the item's source_uri."""
    single_item = iter([InitiativeImport(source_uri="test/123")])
    self.pf_source_mock.initiatives = MagicMock(return_value=single_item)
    self.pf_source_mock.complete = MagicMock(
        side_effect=ScrapeException("Failed loading item"))
    self.scraper.scrape()
    self.logger_mock.exception.assert_called_once_with(
        "Error while collecting initiative test/123")
def complete(self, initiative: InitiativeImport):
    """Populate *initiative* in place by scraping its detail page.

    Copies every field returned by the item parser onto the initiative and
    falls back to the configured default location when the page provided none.

    Raises:
        ScrapeException: wrapping any error raised while scraping the page.
    """
    try:
        # Robots.txt mentions 10 secs crawl delay.
        time.sleep(10)
        metadata = self.item_parser.get_session_metadata(initiative.source_uri)
        scraped_fields = self.item_parser.apply_schemas(
            metadata=metadata, url=initiative.source_uri)
        for field_name, field_value in scraped_fields.items():
            setattr(initiative, field_name, field_value)
        # Default to the configured location when the page had no location.
        if not initiative.location:
            initiative.location = self.config.location
    except Exception as ex:
        raise ScrapeException(f"Error scraping {initiative.source_uri}") from ex