def initialize(self):
    '''Generate a list of fake identifiers to harvest'''
    # A real backend would iterate over a remote endpoint to list the
    # identifiers to harvest; add_item accepts extra kwargs so the backend
    # can optionally store additional data alongside each identifier.
    how_many = faker.pyint()
    for _ in range(how_many):
        self.add_item(faker.uuid4())
def test_html_description(self):
    '''HTML markup in dct:description should be stripped to plain text'''
    bnode = BNode()
    graph = Graph()
    # Build the minimal dataset graph: type, identifier, title, and an
    # HTML-wrapped description whose tags should be removed on import.
    for predicate, obj in (
        (RDF.type, DCAT.Dataset),
        (DCT.identifier, Literal(faker.uuid4())),
        (DCT.title, Literal(faker.sentence())),
        (DCT.description, Literal('<div>a description</div>')),
    ):
        graph.add((bnode, predicate, obj))

    dataset = dataset_from_rdf(graph)
    dataset.validate()

    assert isinstance(dataset, Dataset)
    assert dataset.description == 'a description'
def test_minimal_ckan_response(rmock):
    '''CKAN Harvester should accept the minimum dataset payload'''
    CKAN_URL = 'https://harvest.me/'
    API_URL = '{}api/3/action/'.format(CKAN_URL)
    PACKAGE_LIST_URL = '{}package_list'.format(API_URL)
    PACKAGE_SHOW_URL = '{}package_show'.format(API_URL)
    name = faker.unique_string()
    # Renamed from `json` to avoid shadowing the stdlib `json` module name.
    payload = {
        'success': True,
        'result': {
            'id': faker.uuid4(),
            'name': name,
            'title': faker.sentence(),
            'maintainer': faker.name(),
            'tags': [],
            'private': False,
            'maintainer_email': faker.email(),
            'license_id': None,
            'metadata_created': faker.iso8601(),
            'organization': None,
            'metadata_modified': faker.iso8601(),
            'author': None,
            'author_email': None,
            'notes': faker.paragraph(),
            'license_title': None,
            'state': None,
            'revision_id': faker.unique_string(),
            'type': 'dataset',
            'resources': [],
            # extras is not always present so we exclude it from the minimal payload
        }
    }
    source = HarvestSourceFactory(backend='ckan', url=CKAN_URL)
    # Mock the two CKAN API calls the harvester performs: listing package
    # names, then fetching each package's detail payload.
    rmock.get(PACKAGE_LIST_URL, json={
        'success': True,
        'result': [name]
    }, status_code=200, headers={'Content-Type': 'application/json'})
    rmock.get(PACKAGE_SHOW_URL, json=payload, status_code=200,
              headers={'Content-Type': 'application/json'})
    actions.run(source.slug)
    source.reload()
    assert source.get_last_job().status == 'done'
def test_all_fields(self):
    '''A DCAT dataset with every supported field should map fully to Dataset'''
    uri = 'https://test.org/dataset'
    node = URIRef(uri)
    g = Graph()
    # Renamed from `id` to avoid shadowing the builtin `id()`.
    dataset_id = faker.uuid4()
    title = faker.sentence()
    acronym = faker.word()
    description = faker.paragraph()
    tags = faker.words(nb=3)
    start = faker.past_date(start_date='-30d')
    end = faker.future_date(end_date='+30d')
    g.set((node, RDF.type, DCAT.Dataset))
    g.set((node, DCT.identifier, Literal(dataset_id)))
    g.set((node, DCT.title, Literal(title)))
    g.set((node, SKOS.altLabel, Literal(acronym)))
    g.set((node, DCT.description, Literal(description)))
    g.set((node, DCT.accrualPeriodicity, FREQ.daily))
    # Temporal coverage is expressed as a blank PeriodOfTime node.
    pot = BNode()
    g.add((node, DCT.temporal, pot))
    g.set((pot, RDF.type, DCT.PeriodOfTime))
    g.set((pot, SCHEMA.startDate, Literal(start)))
    g.set((pot, SCHEMA.endDate, Literal(end)))
    for tag in tags:
        g.add((node, DCAT.keyword, Literal(tag)))
    dataset = dataset_from_rdf(g)
    dataset.validate()
    assert isinstance(dataset, Dataset)
    assert dataset.title == title
    assert dataset.acronym == acronym
    assert dataset.description == description
    assert dataset.frequency == 'daily'
    assert set(dataset.tags) == set(tags)
    assert isinstance(dataset.temporal_coverage, db.DateRange)
    assert dataset.temporal_coverage.start == start
    assert dataset.temporal_coverage.end == end
    extras = dataset.extras
    assert 'dct:identifier' in extras
    assert extras['dct:identifier'] == dataset_id
    assert 'uri' in extras
    assert extras['uri'] == uri
def test_all_fields(self):
    '''A DCAT dataset with every supported field should map fully to Dataset'''
    uri = 'https://test.org/dataset'
    node = URIRef(uri)
    g = Graph()
    # Renamed from `id` to avoid shadowing the builtin `id()`.
    dataset_id = faker.uuid4()
    title = faker.sentence()
    acronym = faker.word()
    description = faker.paragraph()
    tags = faker.words(nb=3)
    start = faker.past_date(start_date='-30d')
    end = faker.future_date(end_date='+30d')
    g.set((node, RDF.type, DCAT.Dataset))
    g.set((node, DCT.identifier, Literal(dataset_id)))
    g.set((node, DCT.title, Literal(title)))
    g.set((node, SKOS.altLabel, Literal(acronym)))
    g.set((node, DCT.description, Literal(description)))
    g.set((node, DCT.accrualPeriodicity, FREQ.daily))
    # Temporal coverage is expressed as a blank PeriodOfTime node.
    pot = BNode()
    g.add((node, DCT.temporal, pot))
    g.set((pot, RDF.type, DCT.PeriodOfTime))
    g.set((pot, SCHEMA.startDate, Literal(start)))
    g.set((pot, SCHEMA.endDate, Literal(end)))
    for tag in tags:
        g.add((node, DCAT.keyword, Literal(tag)))
    dataset = dataset_from_rdf(g)
    dataset.validate()
    self.assertIsInstance(dataset, Dataset)
    self.assertEqual(dataset.title, title)
    self.assertEqual(dataset.acronym, acronym)
    self.assertEqual(dataset.description, description)
    self.assertEqual(dataset.frequency, 'daily')
    self.assertEqual(set(dataset.tags), set(tags))
    self.assertIsInstance(dataset.temporal_coverage, db.DateRange)
    self.assertEqual(dataset.temporal_coverage.start, start)
    self.assertEqual(dataset.temporal_coverage.end, end)
    extras = dataset.extras
    self.assertIn('dct:identifier', extras)
    self.assertEqual(extras['dct:identifier'], dataset_id)
    self.assertIn('uri', extras)
    self.assertEqual(extras['uri'], uri)