def process(self, item):
    '''Generate a random dataset from a fake identifier'''
    # Get or create a harvested dataset with this identifier.
    # Harvest metadata are already filled on creation.
    dataset = self.get_dataset(item.remote_id)
    # Here comes your implementation. You should:
    # - fetch the remote dataset (if necessary)
    # - validate the fetched payload
    # - map its content to the dataset fields
    # - store extra significant data in the `extra` attribute
    # - map resources data
    dataset.title = faker.sentence()
    dataset.description = faker.text()
    dataset.tags = list(set(faker.words(nb=faker.pyint())))

    # Resources
    for _ in range(faker.pyint()):
        dataset.resources.append(Resource(
            title=faker.sentence(),
            description=faker.text(),
            url=faker.url(),
            filetype='remote',
            mime=faker.mime_type(category='text'),
            format=faker.file_extension(category='text'),
            filesize=faker.pyint()))
    return dataset
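
# Context sketch: `process` above is a method of a harvest backend. Assuming
# udata's harvester API (`BaseBackend` with `add_item`), a minimal enclosing
# class could look like the following; `RandomBackend` and its `initialize`
# body are illustrative, not part of the original snippet.
from udata.harvest.backends.base import BaseBackend


class RandomBackend(BaseBackend):
    display_name = 'Random'

    def initialize(self):
        '''Queue a random number of fake identifiers to harvest'''
        for _ in range(faker.pyint()):
            self.add_item(faker.uuid4())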
def metadata_factory(url, data=None):
    response = {
        'etag': '',
        'url': url,
        'content-length': faker.pyint(),
        'content-disposition': '',
        'content-md5': faker.md5(),
        'content-location': '',
        'expires': faker.iso8601(),
        'status': 200,
        'updated': faker.iso8601(),
        'last-modified': faker.iso8601(),
        'content-encoding': 'gzip',
        'content-type': faker.mime_type(),
    }
    if data:
        response.update(data)
    return json.dumps(response)
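
# Usage sketch for `metadata_factory` above: build a payload, override a
# couple of fields, and decode it as a consumer would. The URL and override
# values are illustrative.
body = metadata_factory('http://example.com/data.csv',
                        {'status': 404, 'content-type': 'text/csv'})
metadata = json.loads(body)
assert metadata['url'] == 'http://example.com/data.csv'
assert metadata['status'] == 404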
def all_metadata():
    resource_data = {
        'name': faker.sentence(),
        'description': faker.paragraph(),
        'url': faker.unique_url(),
        'mimetype': faker.mime_type(),
        'format': faker.file_extension(),
    }
    data = {
        'name': faker.unique_string(),
        'title': faker.sentence(),
        'notes': faker.paragraph(),
        'tags': [{'name': faker.unique_string()} for _ in range(3)],
        'resources': [resource_data],
    }
    return data, {'resource_data': resource_data}
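
# Usage sketch for `all_metadata` above: the second return value exposes the
# raw resource dict so tests can assert against it after mapping.
data, kwargs = all_metadata()
assert data['resources'] == [kwargs['resource_data']]
assert len(data['tags']) == 3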
def test_all_resource_fields(self):
    node = BNode()
    g = Graph()

    title = faker.sentence()
    url = faker.uri()
    description = faker.paragraph()
    filesize = faker.pyint()
    issued = faker.date_time_between(start_date='-60d', end_date='-30d')
    modified = faker.past_datetime(start_date='-30d')
    mime = faker.mime_type()
    sha1 = faker.sha1()

    g.add((node, RDF.type, DCAT.Distribution))
    g.add((node, DCT.title, Literal(title)))
    g.add((node, DCT.description, Literal(description)))
    g.add((node, DCAT.downloadURL, Literal(url)))
    g.add((node, DCT.issued, Literal(issued)))
    g.add((node, DCT.modified, Literal(modified)))
    # The DCAT vocabulary term is `byteSize` (not `bytesSize`).
    g.add((node, DCAT.byteSize, Literal(filesize)))
    g.add((node, DCAT.mediaType, Literal(mime)))
    g.add((node, DCT.term('format'), Literal('CSV')))

    checksum = BNode()
    g.add((node, SPDX.checksum, checksum))
    g.add((checksum, RDF.type, SPDX.Checksum))
    g.add((checksum, SPDX.algorithm, SPDX.checksumAlgorithm_sha1))
    g.add((checksum, SPDX.checksumValue, Literal(sha1)))

    resource = resource_from_rdf(g)
    resource.validate()

    assert isinstance(resource, Resource)
    assert resource.title == title
    assert resource.url == url
    assert resource.description == description
    assert resource.filesize == filesize
    assert resource.mime == mime
    assert isinstance(resource.checksum, Checksum)
    assert resource.checksum.type == 'sha1'
    assert resource.checksum.value == sha1
    assert resource.published == issued
    assert resource.modified == modified
    assert resource.format == 'csv'
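
# An equivalent way to build such a fixture is to parse literal Turtle, which
# can read better than a series of g.add() calls. A minimal sketch with
# hard-coded illustrative values, using rdflib's Graph.parse:
ttl = '''
@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct: <http://purl.org/dc/terms/> .

[] a dcat:Distribution ;
    dct:title "A title" ;
    dcat:downloadURL "http://example.com/file.csv" ;
    dct:format "CSV" .
'''
g = Graph().parse(data=ttl, format='turtle')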
def metadata_factory(url, data=None):
    """Base for a mocked Croquemort HTTP response"""
    response = {
        'etag': '',
        'checked-url': url,
        'content-length': faker.pyint(),
        'content-disposition': '',
        'content-md5': faker.md5(),
        'content-location': '',
        'expires': faker.iso8601(),
        'final-status-code': 200,
        'updated': faker.iso8601(),
        'last-modified': faker.iso8601(),
        'content-encoding': 'gzip',
        'content-type': faker.mime_type(),
    }
    if data:
        response.update(data)
    return json.dumps(response)
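
# Usage sketch: serving this payload from a mocked Croquemort endpoint with
# requests-mock. The service URL and hash segment are illustrative, not the
# real Croquemort layout.
import requests
import requests_mock

checked = 'http://example.com/data.csv'
with requests_mock.Mocker() as m:
    m.get('http://croquemort.example/url/abc123',
          text=metadata_factory(checked, {'final-status-code': 404}))
    response = requests.get('http://croquemort.example/url/abc123')
    assert response.json()['checked-url'] == checked
    assert response.json()['final-status-code'] == 404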