def test_should_flag_batch_imported_on_success(self):
    first_import = InitiativeImport()
    self.pf_source_mock.initiatives = MagicMock(return_value=iter([first_import]))
    self.scraper.scrape()
    assert self.scraper.get_current_batch().state == BatchImportState.IMPORTED
def assert_running_batch_on_iter(self):
    started_batch = self.scraper.get_current_batch()
    assert started_batch is not None
    assert started_batch.platform == self.db_platform
    assert started_batch.state == "running"
    return iter([InitiativeImport()])
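# A hypothetical wiring sketch for the helper above: hooked in as the mock's
# side_effect so the batch-state assertions run at the moment scrape() first
# asks the platform source for its initiative iterator. The test name and the
# pf_source_mock fixture are assumed from the surrounding tests.
def test_batch_is_running_during_iteration(self):
    self.pf_source_mock.initiatives = MagicMock(
        side_effect=self.assert_running_batch_on_iter)
    self.scraper.scrape()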
def test_should_set_batch_stopped_time(self):
    self.pf_source_mock.initiatives = MagicMock(return_value=iter([InitiativeImport()]))
    self.scraper.scrape()
    batch = self.scraper.get_current_batch()
    assert batch.started_at < batch.stopped_at
def initiatives(self) -> Generator[InitiativeImport, None, None]:
    page_counter = 1
    try:
        while page_counter < 100:
            list_page_url = self.config.list_endpoint + f"&page={page_counter}"
            # schemas defines the fields to be scraped:
            # fieldname: {xpath, all, cast, transform}
            schemas = {
                'initiatives': {
                    'xpath': '//a[@href and contains(@class, "postpreview-content")]',
                    'all': True,
                    'transform': lambda elements: self.find_initiative_links(elements)
                }
            }
            # Initialize a TreeParser for the URL and schemas; it parses the HTML tree.
            initiative_parser = TreeParser(list_page_url, None, schemas)
            if initiative_parser.tree is None:
                break

            output = initiative_parser.apply_schemas()
            for uri in output['initiatives']:
                yield InitiativeImport(source_uri=uri[0])

            page_counter += 1
    except Exception as ex:
        raise ScrapeException("Error loading list of initiatives") from ex
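# A minimal consumption sketch for the generator above (hypothetical caller;
# `source` stands in for a configured instance exposing initiatives()):
# items are yielded lazily per list page, and a ScrapeException aborts the run.
def collect_uris(source):
    return [initiative.source_uri for initiative in source.initiatives()]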
def test_should_add_completed_initiative_to_batch(self):
    first_import = InitiativeImport()
    self.pf_source_mock.initiatives = MagicMock(return_value=iter([first_import]))
    self.scraper.scrape()
    assert self.scraper.get_current_batch().initiatives[0] == first_import
def scrape():
    db = Db()

    # Read the task list from the API.
    response = requests.get(
        'https://api-server-271218.appspot.com/v1/tasks?zipcode=')
    result = json.loads(response.content)
    questions = result['data']['tasks']

    for card in questions:
        db.session.add(
            InitiativeImport(
                name=card['firstName'],
                category=card['TaskType']['name'],
                description=card['description'],
                group="demand",
                source='https://www.gewoonmensendiemensenwillenhelpen.nl/ik-wil-helpen',
                source_id=card['id'],
                location=card['zipcode'] + ' ' + card['city'],
                frequency=card['when'],
            ))
    db.session.commit()
def test_should_iterate_platform_source(self):
    first_import = InitiativeImport()
    self.pf_source_mock.initiatives = MagicMock(return_value=iter([first_import]))
    self.scraper.scrape()
    self.pf_source_mock.complete.assert_called_once_with(first_import)
def scrape(self):
    db = Db()
    page = requests.get(self.URL)
    soup = BeautifulSoup(page.content, 'html.parser')

    results = soup.find(class_='container')
    questions = results.find_all(class_='card')

    for card in questions:
        title = card.find('h5').text.strip(' \t\n\r')
        rawlocation = card.find('h6').text.strip(' \t\n\r')
        # Remove the "(Maps)" suffix from rawlocation. Note: str.strip strips a
        # set of characters, not a substring, so use replace instead.
        rawlocation = rawlocation.replace('(Maps)', '').strip()
        description = card.find('p', class_='card-text').text.strip(' \t\n\r')

        db.session.add(
            InitiativeImport(
                category=title,
                description=description,
                group="demand",
                source=self.URL,
                location=rawlocation,
            ))
    db.session.commit()
def test_should_set_location(self, request_mock):
    self.setup_item_request(request_mock, self.item_nolocation_response)
    test_source = MijnBuurtjeSource(self.config)
    actual = InitiativeImport(source_uri=self.config.details_endpoint + "1234/loc-test")
    test_source.complete(actual)
    assert actual.location == self.config.location
def scrape_collection_exception(self):
    self.pf_source_mock.initiatives = MagicMock(
        return_value=iter([InitiativeImport(source_uri="test/123")]))
    self.pf_source_mock.complete = Mock(side_effect=ScrapeException("Test"))
    self.scraper.scrape()
def test_should_set_platform_url_as_source(self):
    self.pf_source_mock.initiatives = MagicMock(
        return_value=iter([InitiativeImport(source_uri="test/123")]))
    self.scraper.scrape()
    actual = self.scraper.get_current_batch().initiatives[0]
    assert self.scraper.platform_url == actual.source
def test_invalid_stop_throws_error(self):
    self.pf_source_mock.initiatives = MagicMock(
        return_value=iter([InitiativeImport()]))
    self.scraper.scrape()
    batch = self.scraper.get_current_batch()
    with self.assertRaises(ValueError):
        batch.stop(BatchImportState.PROCESSED)
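# A minimal sketch of the validation that test_invalid_stop_throws_error
# exercises. This is an assumption about BatchImport.stop(), not the actual
# implementation: IMPORTED is taken as the terminal success state (per
# test_should_flag_batch_imported_on_success); other states are rejected.
def stop(self, state: BatchImportState):
    if state != BatchImportState.IMPORTED:  # assumed set of valid stop states
        raise ValueError(f"{state} is not a valid stop state")
    self.state = state
    self.stopped_at = datetime.utcnow()  # checked by test_should_set_batch_stopped_time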
def test_should_log_item_exception(self):
    self.pf_source_mock.initiatives = MagicMock(
        return_value=iter([InitiativeImport(source_uri="test/123")]))
    self.pf_source_mock.complete = \
        MagicMock(side_effect=ScrapeException("Failed loading item"))
    self.scraper.scrape()
    self.logger_mock.exception.assert_called_once_with(
        "Error while collecting initiative test/123")
def test_should_have_set_scraped_at(self):
    self.pf_source_mock.initiatives = MagicMock(
        return_value=iter([InitiativeImport(source_uri="test/123")]))
    self.scraper.scrape()
    now = datetime.utcnow()
    actual = self.scraper.get_current_batch().initiatives[0].scraped_at
    # datetime.utcnow can't be mocked directly, so as a workaround compare
    # against a freshly taken timestamp instead.
    datediff = now - actual
    assert datediff.seconds < 1
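# A consolidated sketch of the scrape() contract that the tests above pin
# down. This is assembled from the assertions, not taken from the production
# scraper: start a batch, complete each initiative from the platform source,
# log per-item scrape failures, stamp and collect every item, then stop.
# start_new_batch is a hypothetical helper name.
def scrape(self):
    batch = self.start_new_batch()  # batch.state == "running", started_at set
    for initiative in self.platform_source.initiatives():
        try:
            self.platform_source.complete(initiative)
        except ScrapeException:
            self.logger.exception(
                f"Error while collecting initiative {initiative.source_uri}")
        initiative.source = self.platform_url
        initiative.scraped_at = datetime.utcnow()
        batch.initiatives.append(initiative)
    batch.stop(BatchImportState.IMPORTED)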
def scrape(self):
    db = Db()
    for company in self.zorgheldenautos:
        db.session.add(
            InitiativeImport(
                name=company,
                group="zorgheldenauto",
                source='https://www.auto.nl/zorgheldenauto',
            ))
    db.session.commit()
def test_should_wrap_complete_exceptions(self, request_mock):
    test_source = MijnBuurtjeSource(self.config)
    self.setup_item_request(request_mock, self.item_response)
    with patch.object(TreeParser, 'apply_schemas', side_effect=HtmlParseError()):
        with self.assertRaises(ScrapeException):
            _ = test_source.complete(
                InitiativeImport(source_uri=self.config.details_endpoint + "1234"))
def setUp(self, request_mock):
    self.response = responses.read("nlvoorelkaar_demand.html")
    scraper = NLvoorElkaar()
    self.source = scraper._sources[1]
    self.url = "https://www.nlvoorelkaar.nl/hulpvragen/183242"
    request_mock.get(self.url, text=self.response, status_code=200)
    self.request_mock = request_mock

    self.actual = InitiativeImport(source_id=183242, source_uri=self.url)
    self.source.complete(self.actual)
def setUp(self, request_mock):
    self.response = responses.read("nlvoorelkaar_supply.html")
    scraper = NLvoorElkaar()
    self.source = scraper._sources[0]
    self.url = "https://www.nlvoorelkaar.nl/hulpaanbod/179582"
    request_mock.get(self.url, text=self.response, status_code=200)
    self.request_mock = request_mock

    self.actual = InitiativeImport(source_id=179582, source_uri=self.url)
    self.source.complete(self.actual)
def complete(self, initiative: InitiativeImport):
    try:
        # robots.txt specifies a 10-second crawl delay.
        time.sleep(10)
        session_metadata = self.item_parser.get_session_metadata(initiative.source_uri)
        full_initiative = self.item_parser.apply_schemas(
            metadata=session_metadata, url=initiative.source_uri)
        for key, value in full_initiative.items():
            setattr(initiative, key, value)
        # Fall back to the configured location when the page does not supply one.
        if not initiative.location:
            initiative.location = self.config.location
    except Exception as ex:
        raise ScrapeException(f"Error scraping {initiative.source_uri}") from ex
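# A short usage sketch for complete(), assuming a configured MijnBuurtjeSource
# (`source`) and a detail URI of the shape used in the tests above; the call
# mutates the passed InitiativeImport in place and raises ScrapeException on
# any parse or network failure.
item = InitiativeImport(source_uri=source.config.details_endpoint + "1234")
source.complete(item)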
def scrape(self):
    db = Db()
    counter = 1
    while counter > 0:
        page = requests.get(self.URL + str(counter))
        soup = BeautifulSoup(page.content, 'html.parser')
        results = soup.find_all(class_='postpreview')

        if len(results) > 0:
            counter += 1
            for card in results:
                try:
                    title = card.find(
                        class_='heading3 heading3--semibold').text.strip(' \t\n\r')
                    name = card.find(class_='entity-content-title').text
                    description = card.find(
                        class_='paragraph').text.strip(' \t\n\r')
                    rawtheme = card.find(class_='postpreview-subtitle').text
                    link = card.find(class_='postpreview-content')
                    final_link = link['href']
                    source_id = final_link.split('/')[-2]

                    db.session.add(
                        InitiativeImport(
                            name=name + " - " + title,
                            description=description,
                            group=rawtheme,
                            source=final_link,
                            source_id=source_id,
                        ))
                except Exception:
                    # Skip cards that are missing one of the expected elements.
                    print(card)
        else:
            # No more result pages: exit the loop.
            counter = -1

    db.session.commit()
def test_missing_plaats(self):
    scraper = NLvoorElkaar()
    item = scraper._sources[0].complete(
        InitiativeImport(
            source_id=179582,
            source_uri="https://www.nlvoorelkaar.nl/hulpvragen/183242"))
    'notes': {
        'xpath': ''
    }
}

# The description xpath returns multiple elements; define clean_description
# and set it as schemas['description']['transform'].
clean_description = lambda x: ''.join(
    [re.sub(r'\n|\n |\s{2,}', '', e) for e in x])
schemas['description']['transform'] = clean_description

# Store the scraper output in records.
records = []
c = 1
MAX_URLS = 10
for url in initiatief_urls:
    metadata = {
        'source_url': re.findall(r'https://([A-Za-z0-9.]+)/', url)[0],
        'source_uri': url,
        'scraped_at': str(dt.datetime.now()),
        'created_at': dt.datetime.strftime(dt.datetime.now(), '%Y-%m-%d')
    }
    tree_parser = TreeParser(url, None, schemas)
    output = tree_parser.apply_schemas(metadata)
    records.append(output)
    if c > MAX_URLS:
        # Scraping all 300+ initiatives took a bit long; cap the run.
        break
    c += 1

# Step 3: insert into db
for r in records:
    # TODO: determine which fields are inserted into DB
    db.session.add(InitiativeImport())