def test_distribution_format_IMT_field(self):
    """A dct:IMT node on dct:format yields both `format` and `mimetype`."""
    graph = Graph()

    dataset_ref = URIRef("http://example.org/datasets/1")
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    graph.add((dist_ref, RDF.type, DCAT.Distribution))
    graph.add((dataset_ref, DCAT.distribution, dist_ref))

    # The format is a dct:IMT node carrying both the media type
    # (rdf:value) and a human readable label (rdfs:label)
    imt_node = BNode()
    graph.add((imt_node, RDF.type, DCT.IMT))
    graph.add((imt_node, RDF.value, Literal('text/turtle')))
    graph.add((imt_node, RDFS.label, Literal('Turtle')))
    graph.add((dist_ref, DCT['format'], imt_node))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    resource = list(parser.datasets())[0]['resources'][0]

    eq_(resource['format'], u'Turtle')
    eq_(resource['mimetype'], u'text/turtle')
def test_dataset_turtle_1(self):
    """Parse a Turtle serialization and check dataset and resource fields."""
    contents = self._get_file_contents('dataset_deri.ttl')

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.parse(contents, _format='n3')

    datasets = list(parser.datasets())
    eq_(len(datasets), 1)

    dataset = datasets[0]
    eq_(dataset['title'], 'Abandoned Vehicles')
    eq_(len(dataset['resources']), 1)

    resource = dataset['resources'][0]
    eq_(resource['name'], u'CSV distribution of: Abandoned Vehicles')
    eq_(resource['url'],
        u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv')
    eq_(resource['uri'],
        u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv')
def test_distribution_format_format_normalized(self):
    """Free-text dct:format is normalized to a short format on CKAN >= 2.3."""
    graph = Graph()

    dataset_ref = URIRef("http://example.org/datasets/1")
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    graph.add((dist_ref, RDF.type, DCAT.Distribution))
    graph.add((dist_ref, DCAT.mediaType, Literal('text/csv')))
    graph.add((dist_ref, DCT['format'], Literal('Comma Separated Values')))
    graph.add((dataset_ref, DCAT.distribution, dist_ref))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    resource = list(parser.datasets())[0]['resources'][0]

    if not toolkit.check_ckan_version(min_version='2.3'):
        # Older CKAN versions keep the free-text format untouched
        eq_(resource['format'], u'Comma Separated Values')
    else:
        eq_(resource['format'], u'CSV')
        eq_(resource['mimetype'], u'text/csv')
def test_dataset_compatibility_mode(self):
    """In compatibility mode extras use the legacy ``dcat_*`` keys."""
    contents = self._get_file_contents('dataset.rdf')

    parser = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True)
    parser.parse(contents)

    datasets = list(parser.datasets())
    eq_(len(datasets), 1)
    dataset = datasets[0]

    def _get_extra_value(key):
        # First extra whose key matches, or None when absent
        matches = [extra['value'] for extra in dataset['extras']
                   if extra['key'] == key]
        return matches[0] if matches else None

    eq_(_get_extra_value('dcat_issued'), u'2012-05-10')
    eq_(_get_extra_value('dcat_modified'), u'2012-05-10T21:04:00')
    eq_(_get_extra_value('dcat_publisher_name'),
        'Publishing Organization for dataset 1')
    eq_(_get_extra_value('dcat_publisher_email'), '*****@*****.**')
    eq_(_get_extra_value('language'), 'ca,en,es')
def test_dataset_json_ld_1(self):
    """Parse a POD JSON-LD catalog and check dataset, extras and resources."""
    contents = self._get_file_contents('catalog_pod.jsonld')

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.parse(contents, _format='json-ld')

    datasets = list(parser.datasets())
    eq_(len(datasets), 1)
    dataset = datasets[0]
    extras = {extra['key']: extra['value'] for extra in dataset['extras']}

    eq_(dataset['title'], 'U.S. Widget Manufacturing Statistics')

    eq_(extras['contact_name'], 'Jane Doe')
    eq_(extras['contact_email'], 'mailto:[email protected]')
    eq_(extras['publisher_name'], 'Widget Services')
    eq_(extras['publisher_email'], '*****@*****.**')

    eq_(len(dataset['resources']), 4)

    # Pick the distribution under test by name
    matching = [r for r in dataset['resources'] if r['name'] == 'widgets.csv']
    resource = matching[0]
    eq_(resource['name'], u'widgets.csv')
    eq_(resource['url'],
        u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
    eq_(resource['download_url'],
        u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
def test_spatial_both_geojson_and_wkt(self):
    """When both GeoJSON and WKT geometries are present, GeoJSON is kept."""
    graph = Graph()

    dataset_ref = URIRef('http://example.org/datasets/1')
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    spatial_uri = URIRef('http://geonames/Newark')
    graph.add((dataset_ref, DCT.spatial, spatial_uri))
    graph.add((spatial_uri, RDF.type, DCT.Location))
    graph.add((spatial_uri, LOCN.geometry,
               Literal('{"type": "Point", "coordinates": [23, 45]}',
                       datatype=GEOJSON_IMT)))
    graph.add((spatial_uri, LOCN.geometry,
               Literal('POINT (67 89)', datatype=GSP.wktLiteral)))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    extras = self._extras(list(parser.datasets())[0])

    eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}')
def test_spatial_one_dct_spatial_instance_no_uri(self):
    """A blank-node dct:Location yields spatial text and geometry but no URI."""
    graph = Graph()

    dataset_ref = URIRef('http://example.org/datasets/1')
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    location_node = BNode()
    graph.add((dataset_ref, DCT.spatial, location_node))
    graph.add((location_node, RDF.type, DCT.Location))
    graph.add((location_node, SKOS.prefLabel, Literal('Newark')))
    graph.add((location_node, LOCN.geometry,
               Literal('{"type": "Point", "coordinates": [23, 45]}',
                       datatype=GEOJSON_IMT)))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    extras = self._extras(list(parser.datasets())[0])

    assert_true('spatial_uri' not in extras)
    eq_(extras['spatial_text'], 'Newark')
    eq_(extras['spatial'], '{"type": "Point", "coordinates": [23, 45]}')
def test_datasets_none_found(self):
    """An empty graph yields no datasets."""
    parser = RDFParser()
    parser.g = Graph()
    eq_(len(list(parser.datasets())), 0)
def test_profiles_are_called_on_datasets(self):
    """Every configured profile contributes fields to each parsed dataset."""
    parser = RDFParser()
    parser._profiles = [MockRDFProfile1, MockRDFProfile2]
    parser.g = _default_graph()

    for dataset in parser.datasets():
        # Each mock profile marks the datasets it processed
        assert dataset['profile_1']
        assert dataset['profile_2']
def test_tags_with_commas(self):
    """A comma-separated dcat:keyword is split into individual tags."""
    graph = Graph()
    dataset_ref = URIRef('http://example.org/datasets/1')
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))
    graph.add((dataset_ref, DCAT.keyword, Literal('Tree, forest, shrub')))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    datasets = list(parser.datasets())
    eq_(len(datasets[0]['tags']), 3)
def test_datasets(self):
    """datasets() yields one dict per dcat:Dataset in the default graph."""
    parser = RDFParser()
    parser.g = _default_graph()

    datasets = list(parser.datasets())
    for dataset in datasets:
        assert 'title' in dataset

    eq_(len(datasets), 3)
def test_dataset_version_adms(self):
    """adms:version is mapped to the dataset ``version`` field."""
    graph = Graph()
    dataset_ref = URIRef("http://example.org/datasets/1")
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))
    graph.add((dataset_ref, ADMS.version, Literal('2.3a')))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    dataset = list(parser.datasets())[0]
    eq_(dataset['version'], u'2.3a')
def test_catalog_xml_rdf(self):
    """Parse an RDF/XML catalog containing two datasets."""
    contents = self._get_file_contents('catalog.rdf')

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.parse(contents)

    datasets = list(parser.datasets())
    eq_(len(datasets), 2)

    # Dataset order is not guaranteed, pick the expected one by title
    if datasets[0]['title'] == 'Example dataset 1':
        dataset = datasets[0]
    else:
        dataset = datasets[1]

    eq_(dataset['title'], 'Example dataset 1')
    eq_(len(dataset['resources']), 3)
    eq_(len(dataset['tags']), 2)
def test_dataset_license_from_distribution_by_uri(self):
    """license_id is derived from the URI of the distribution's dct:license."""
    graph = Graph()

    dataset_ref = URIRef("http://example.org/datasets/1")
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    graph.add((dataset_ref, DCAT.distribution, dist_ref))
    graph.add((dist_ref, RDF.type, DCAT.Distribution))
    graph.add((dist_ref, DCT.license,
               URIRef("http://www.opendefinition.org/licenses/cc-by")))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    dataset = list(parser.datasets())[0]
    eq_(dataset['license_id'], 'cc-by')
def test_spatial_uri_only(self):
    """A bare spatial URI maps to spatial_uri, with no text or geometry."""
    graph = Graph()
    dataset_ref = URIRef('http://example.org/datasets/1')
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))
    graph.add((dataset_ref, DCT.spatial, URIRef('http://geonames/Newark')))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    extras = self._extras(list(parser.datasets())[0])

    eq_(extras['spatial_uri'], 'http://geonames/Newark')
    assert_true('spatial_text' not in extras)
    assert_true('spatial' not in extras)
def test_dataset_license_from_distribution_by_title(self):
    """license_id is looked up from the dct:title of the license node."""
    graph = Graph()

    dataset_ref = URIRef("http://example.org/datasets/1")
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    graph.add((dist_ref, RDF.type, DCAT.Distribution))
    graph.add((dataset_ref, DCAT.distribution, dist_ref))

    license_node = BNode()
    graph.add((dist_ref, DCT.license, license_node))
    graph.add((license_node, DCT.title,
               Literal("Creative Commons Attribution")))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    dataset = list(parser.datasets())[0]
    eq_(dataset['license_id'], 'cc-by')
def test_distribution_format_format_only(self):
    """A plain dct:format literal is used directly as the resource format."""
    graph = Graph()

    dataset_ref = URIRef("http://example.org/datasets/1")
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    graph.add((dist_ref, RDF.type, DCAT.Distribution))
    graph.add((dist_ref, DCT['format'], Literal('CSV')))
    graph.add((dataset_ref, DCAT.distribution, dist_ref))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    resource = list(parser.datasets())[0]['resources'][0]

    eq_(resource['format'], u'CSV')
def test_spatial_rdfs_label(self):
    """An rdfs:label on the Location node becomes spatial_text."""
    graph = Graph()

    dataset_ref = URIRef('http://example.org/datasets/1')
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    spatial_uri = URIRef('http://geonames/Newark')
    graph.add((dataset_ref, DCT.spatial, spatial_uri))
    graph.add((spatial_uri, RDF.type, DCT.Location))
    graph.add((spatial_uri, RDFS.label, Literal('Newark')))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    extras = self._extras(list(parser.datasets())[0])

    eq_(extras['spatial_text'], 'Newark')
def test_distribution_format_imt_normalized(self):
    """An unknown media type is kept as-is for both format and mimetype."""
    graph = Graph()

    dataset_ref = URIRef("http://example.org/datasets/1")
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    graph.add((dist_ref, RDF.type, DCAT.Distribution))
    graph.add((dist_ref, DCAT.mediaType, Literal('text/unknown-imt')))
    graph.add((dataset_ref, DCAT.distribution, dist_ref))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    resource = list(parser.datasets())[0]['resources'][0]

    eq_(resource['format'], u'text/unknown-imt')
    eq_(resource['mimetype'], u'text/unknown-imt')
def test_distribution_download_url(self):
    """dcat:downloadURL populates both `url` and `download_url`."""
    graph = Graph()

    dataset_ref = URIRef("http://example.org/datasets/1")
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    dist_ref = URIRef("http://example.org/datasets/1/ds/1")
    graph.add((dist_ref, RDF.type, DCAT.Distribution))
    graph.add((dist_ref, DCAT.downloadURL, Literal('http://download.url.org')))
    graph.add((dataset_ref, DCAT.distribution, dist_ref))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    resource = list(parser.datasets())[0]['resources'][0]

    eq_(resource['url'], u'http://download.url.org')
    eq_(resource['download_url'], u'http://download.url.org')
def test_spatial_wkt_only(self):
    """A WKT-only geometry is converted to GeoJSON for the spatial extra."""
    graph = Graph()

    dataset_ref = URIRef('http://example.org/datasets/1')
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    spatial_uri = URIRef('http://geonames/Newark')
    graph.add((dataset_ref, DCT.spatial, spatial_uri))
    graph.add((spatial_uri, RDF.type, DCT.Location))
    graph.add((spatial_uri, LOCN.geometry,
               Literal('POINT (67 89)', datatype=GSP.wktLiteral)))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    extras = self._extras(list(parser.datasets())[0])

    # NOTE: geomet returns floats for coordinates on WKT -> GeoJSON
    eq_(extras['spatial'], '{"type": "Point", "coordinates": [67.0, 89.0]}')
def test_spatial_wrong_geometries(self):
    """Unparseable geometries are dropped rather than stored."""
    graph = Graph()

    dataset_ref = URIRef('http://example.org/datasets/1')
    graph.add((dataset_ref, RDF.type, DCAT.Dataset))

    spatial_uri = URIRef('http://geonames/Newark')
    graph.add((dataset_ref, DCT.spatial, spatial_uri))
    graph.add((spatial_uri, RDF.type, DCT.Location))
    graph.add((spatial_uri, LOCN.geometry,
               Literal('Not GeoJSON', datatype=GEOJSON_IMT)))
    graph.add((spatial_uri, LOCN.geometry,
               Literal('Not WKT', datatype=GSP.wktLiteral)))

    parser = RDFParser(profiles=['euro_dcat_ap'])
    parser.g = graph

    extras = self._extras(list(parser.datasets())[0])

    assert_true('spatial' not in extras)
def gather_stage(self, harvest_job):
    """Fetch the remote RDF document(s), parse them and create one
    HarvestObject per dataset found.

    Returns a list of HarvestObject ids, or an empty list on fatal errors.

    NOTE(review): this view of the function appears truncated — no code is
    visible that advances ``next_page_url`` to the next page inside the
    ``while`` loop, nor a final ``return object_ids`` / handling of deleted
    datasets. Confirm against the full source before relying on this listing.
    """
    log.debug('In IAESTRDFHarvester gather_stage')

    # Optional serialization format ("rdf_format") from the source config JSON
    rdf_format = None
    if harvest_job.source.config:
        rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

    # Get file contents of first page
    next_page_url = harvest_job.source.url

    guids_in_source = []   # guids seen in the remote source this run
    object_ids = []        # ids of the HarvestObjects created
    last_content_hash = None  # md5 of the previous page, to detect repeats

    while next_page_url:

        # Let plugins rewrite/veto the URL before downloading
        for harvester in p.PluginImplementations(IIAESTRDFHarvester):
            next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not next_page_url:
            # A plugin cancelled the download
            return []

        content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format)

        # Compare the page's md5 with the previous one: identical content on
        # a "different" paginated URL means the pagination is broken, so stop
        content_hash = hashlib.md5()
        if content:
            content_hash.update(content)

        if last_content_hash:
            if content_hash.digest() == last_content_hash.digest():
                log.warning('Remote content was the same even when using a paginated URL, skipping')
                break
        else:
            last_content_hash = content_hash

        # TODO: store content?

        # Let plugins transform/veto the downloaded content
        for harvester in p.PluginImplementations(IIAESTRDFHarvester):
            content, after_download_errors = harvester.after_download(content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return []

        # TODO: profiles conf
        parser = RDFParser()

        try:
            # NOTE(review): Python 2 except syntax ("except X, e") — this
            # module only runs on Python 2 as written
            parser.parse(content, _format=rdf_format)
        except RDFParserException, e:
            self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
            return []

        log.warning('Parser complete.')

        try:
            for dataset in parser.datasets():
                # Derive a CKAN name from the title when the parser did not set one
                if not dataset.get('name'):
                    dataset['name'] = self._gen_new_name(dataset['title'])

                log.warning('Generando dataset: {0}'.format(dataset['name']))

                # Unless already set by the parser, get the owner organization (if any)
                # from the harvest source dataset
                if not dataset.get('owner_org'):
                    source_dataset = model.Package.get(harvest_job.source.id)
                    if source_dataset.owner_org:
                        dataset['owner_org'] = source_dataset.owner_org

                # Try to get a unique identifier for the harvested dataset
                guid = self._get_guid(dataset)

                if not guid:
                    # No usable identifier: record the error and skip this dataset
                    self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset), harvest_job)
                    continue

                dataset['extras'].append({'key': 'guid', 'value': guid})
                guids_in_source.append(guid)

                obj = HarvestObject(guid=guid, job=harvest_job,
                                    content=json.dumps(dataset))

                obj.save()
                object_ids.append(obj.id)
        except Exception, e:
            # NOTE(review): "processsing" typo is in the stored error message
            self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()), harvest_job)
            return []
def test_dataset_all_fields(self):
    """End-to-end check of every field the euro_dcat_ap profile maps from
    the ``dataset.rdf`` fixture: core fields, tags, simple and list extras,
    the dataset URI, and all resource fields of its single distribution."""
    contents = self._get_file_contents('dataset.rdf')

    p = RDFParser(profiles=['euro_dcat_ap'])

    p.parse(contents)

    datasets = [d for d in p.datasets()]

    eq_(len(datasets), 1)

    dataset = datasets[0]

    # Basic fields
    eq_(dataset['title'], u'Zimbabwe Regional Geochemical Survey.')
    eq_(dataset['notes'], u'During the period 1982-86 a team of geologists from the British Geological Survey ...')
    eq_(dataset['url'], 'http://dataset.info.org')
    eq_(dataset['version'], '2.3')
    eq_(dataset['license_id'], 'cc-nc')

    # Tags
    eq_(sorted(dataset['tags'], key=lambda k: k['name']), [{'name': u'exploration'},
                                                           {'name': u'geochemistry'},
                                                           {'name': u'geology'}])

    # Extras

    def _get_extra_value(key):
        # First extra matching `key`, or None when not present
        v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key]
        return v[0] if v else None

    def _get_extra_value_as_list(key):
        # List-typed extras are stored as JSON strings
        value = _get_extra_value(key)
        return json.loads(value) if value else []

    # Simple values
    eq_(_get_extra_value('issued'), u'2012-05-10')
    eq_(_get_extra_value('modified'), u'2012-05-10T21:04:00')
    eq_(_get_extra_value('identifier'), u'9df8df51-63db-37a8-e044-0003ba9b0d98')
    eq_(_get_extra_value('version_notes'), u'New schema added')
    eq_(_get_extra_value('temporal_start'), '1905-03-01')
    eq_(_get_extra_value('temporal_end'), '2013-01-05')
    eq_(_get_extra_value('frequency'), 'http://purl.org/cld/freq/daily')
    eq_(_get_extra_value('spatial_uri'), 'http://publications.europa.eu/mdr/authority/country/ZWE')
    eq_(_get_extra_value('publisher_uri'), 'http://orgs.vocab.org/some-org')
    eq_(_get_extra_value('publisher_name'), 'Publishing Organization for dataset 1')
    eq_(_get_extra_value('publisher_email'), '*****@*****.**')
    eq_(_get_extra_value('publisher_url'), 'http://some.org')
    eq_(_get_extra_value('publisher_type'), 'http://purl.org/adms/publishertype/NonProfitOrganisation')
    eq_(_get_extra_value('contact_name'), 'Point of Contact')
    eq_(_get_extra_value('contact_email'), 'mailto:[email protected]')
    eq_(_get_extra_value('access_rights'), 'public')
    eq_(_get_extra_value('provenance'), 'Some statement about provenance')
    eq_(_get_extra_value('dcat_type'), 'test-type')

    # Lists (sorted because RDF triple order is not guaranteed)
    eq_(sorted(_get_extra_value_as_list('language')), [u'ca', u'en', u'es'])
    eq_(sorted(_get_extra_value_as_list('theme')), [u'Earth Sciences',
                                                    u'http://eurovoc.europa.eu/100142',
                                                    u'http://eurovoc.europa.eu/209065'])
    eq_(sorted(_get_extra_value_as_list('conforms_to')), [u'Standard 1', u'Standard 2'])
    eq_(sorted(_get_extra_value_as_list('alternate_identifier')), [u'alternate-identifier-1', u'alternate-identifier-2'])
    eq_(sorted(_get_extra_value_as_list('documentation')), [u'http://dataset.info.org/doc1', u'http://dataset.info.org/doc2'])
    eq_(sorted(_get_extra_value_as_list('related_resource')), [u'http://dataset.info.org/related1', u'http://dataset.info.org/related2'])
    eq_(sorted(_get_extra_value_as_list('has_version')), [u'https://data.some.org/catalog/datasets/derived-dataset-1',
                                                          u'https://data.some.org/catalog/datasets/derived-dataset-2'])
    eq_(sorted(_get_extra_value_as_list('is_version_of')), [u'https://data.some.org/catalog/datasets/original-dataset'])
    eq_(sorted(_get_extra_value_as_list('source')), [u'https://data.some.org/catalog/datasets/source-dataset-1',
                                                     u'https://data.some.org/catalog/datasets/source-dataset-2'])
    eq_(sorted(_get_extra_value_as_list('sample')), [u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample'])

    # Dataset URI
    eq_(_get_extra_value('uri'), u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98')

    # Resources
    eq_(len(dataset['resources']), 1)

    resource = dataset['resources'][0]

    # Simple values
    eq_(resource['name'], u'Some website')
    eq_(resource['description'], u'A longer description')
    eq_(resource['format'], u'HTML')
    eq_(resource['mimetype'], u'text/html')
    eq_(resource['issued'], u'2012-05-11')
    eq_(resource['modified'], u'2012-05-01T00:04:06')
    eq_(resource['status'], u'http://purl.org/adms/status/Completed')

    eq_(resource['hash'], u'4304cf2e751e6053c90b1804c89c0ebb758f395a')
    eq_(resource['hash_algorithm'], u'http://spdx.org/rdf/terms#checksumAlgorithm_sha1')

    # Lists (stored on the resource as JSON strings)
    for item in [
        ('documentation', [u'http://dataset.info.org/distribution1/doc1', u'http://dataset.info.org/distribution1/doc2']),
        ('language', [u'ca', u'en', u'es']),
        ('conforms_to', [u'Standard 1', u'Standard 2']),
    ]:
        eq_(sorted(json.loads(resource[item[0]])), item[1])

    # These two are likely to need clarification
    eq_(resource['license'], u'http://creativecommons.org/licenses/by-nc/2.0/')
    eq_(resource['rights'], u'Some statement about rights')

    eq_(resource['url'], u'http://www.bgs.ac.uk/gbase/geochemcd/home.html')
    assert 'download_url' not in resource

    eq_(resource['size'], 12323)

    # Distribution URI
    eq_(resource['uri'], u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1')