def test_dataset_json_ld_with_at_graph(self):
    contents = self._get_file_contents('catalog_with_at_graph.jsonld')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents, _format='json-ld')
    datasets = [d for d in p.datasets()]
    assert len(datasets) == 1
    dataset = datasets[0]
    extras = dict((e['key'], e['value']) for e in dataset['extras'])
    assert dataset['title'] == 'Title dataset'
    assert extras['contact_name'] == 'Jane Doe'
    # mailto gets removed for storage and is added again on output
    assert extras['contact_email'] == '*****@*****.**'
    assert len(dataset['resources']) == 1
    resource = dataset['resources'][0]
    assert resource['name'] == u'download.zip'
    assert resource['url'] == u'http://example2.org/files/download.zip'
    assert resource['access_url'] == u'https://ckan.example.org/dataset/d4ce4e6e-ab89-44cb-bf5c-33a162c234de/resource/a289c289-55c9-410f-b4c7-f88e5f6f4e47'
    assert resource['download_url'] == u'http://example2.org/files/download.zip'
def test_dataset_ttl(self):
    dataset = factories.Dataset(notes='Test dataset')
    url = url_for('dcat_dataset', _id=dataset['id'], _format='ttl')
    app = self._get_test_app()
    response = app.get(url)
    eq_(response.headers['Content-Type'], 'text/turtle')
    content = response.body
    # Parse the contents to check it's an actual serialization
    p = RDFParser()
    p.parse(content, _format='turtle')
    dcat_datasets = [d for d in p.datasets()]
    eq_(len(dcat_datasets), 1)
    dcat_dataset = dcat_datasets[0]
    eq_(dcat_dataset['title'], dataset['title'])
    eq_(dcat_dataset['notes'], dataset['notes'])
def test_dataset_json_ld_1(self):
    contents = self._get_file_contents('catalog_pod.jsonld')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents, _format='json-ld')
    datasets = [d for d in p.datasets()]
    assert len(datasets) == 1
    dataset = datasets[0]
    extras = dict((e['key'], e['value']) for e in dataset['extras'])
    assert dataset['title'] == 'U.S. Widget Manufacturing Statistics'
    assert extras['contact_name'] == 'Jane Doe'
    # mailto gets removed for storage and is added again on output
    assert extras['contact_email'] == '*****@*****.**'
    assert extras['publisher_name'] == 'Widget Services'
    assert extras['publisher_email'] == '*****@*****.**'
    assert len(dataset['resources']) == 4
    resource = [r for r in dataset['resources'] if r['name'] == 'widgets.csv'][0]
    assert resource['name'] == u'widgets.csv'
    assert resource['url'] == u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv'
    assert resource['download_url'] == u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv'
def test_parse_dataset_default_lang_de(self):
    maxrdf = self._get_max_rdf()
    p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])
    p.parse(maxrdf)
    self._add_basic_fields_with_languages(p)
    datasets = [d for d in p.datasets()]
    self.assertEqual(len(datasets), 1)
    dataset = datasets[0]
    # Title and description to be in default language "de"
    self.assertEqual(dataset.get('title'), u'Naturräume Geest und Marsch (DE)')
    self.assertEqual(
        dataset.get('notes'),
        u'Die Zuordnung des Hamburger Stadtgebietes zu den Naturräumen Geest und Marsch wird dargestellt. (DE)'
    )
    # Publisher and ContactPoint
    extras = dataset.get('extras')
    self.assertTrue(len(extras) > 0)
    self._assert_extras_string(
        extras, 'publisher_name',
        u'Behörde für Umwelt und Energie (BUE), Amt für Umweltschutz (DE)')
    self._assert_extras_string(extras, 'contact_name',
                               u'Herr Dr. Michael Schröder (DE)')
    # Resources
    self._assert_resource_lang(dataset, 'DE')
def test_dataset_compatibility_mode(self):
    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True)
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    assert len(datasets) == 1
    dataset = datasets[0]

    def _get_extra_value(key):
        v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key]
        return v[0] if v else None

    assert _get_extra_value('dcat_issued') == u'2012-05-10'
    assert _get_extra_value('dcat_modified') == u'2012-05-10T21:04:00'
    assert _get_extra_value('dcat_publisher_name') == 'Publishing Organization for dataset 1'
    assert _get_extra_value('dcat_publisher_email') == '*****@*****.**'
    assert _get_extra_value('language') == 'ca,en,es'
def test_dataset_json_ld_1(self):
    contents = self._get_file_contents('catalog_pod.jsonld')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents, _format='json-ld')
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    dataset = datasets[0]
    extras = dict((e['key'], e['value']) for e in dataset['extras'])
    eq_(dataset['title'], 'U.S. Widget Manufacturing Statistics')
    eq_(extras['contact_name'], 'Jane Doe')
    eq_(extras['contact_email'], 'mailto:[email protected]')
    eq_(extras['publisher_name'], 'Widget Services')
    eq_(extras['publisher_email'], '*****@*****.**')
    eq_(len(dataset['resources']), 4)
    resource = [r for r in dataset['resources'] if r['name'] == 'widgets.csv'][0]
    eq_(resource['name'], u'widgets.csv')
    eq_(resource['url'], u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
    eq_(resource['download_url'], u'https://data.agency.gov/datasets/widgets-statistics/widgets.csv')
def test_catalog_modified_date(self):
    dataset1 = factories.Dataset(title='First dataset')
    time.sleep(1)
    dataset2 = factories.Dataset(title='Second dataset')
    url = url_for('dcat_catalog', _format='ttl',
                  modified_since=dataset2['metadata_modified'])
    app = self._get_test_app()
    response = app.get(url)
    content = response.body
    p = RDFParser()
    p.parse(content, _format='turtle')
    dcat_datasets = [d for d in p.datasets()]
    eq_(len(dcat_datasets), 1)
    eq_(dcat_datasets[0]['title'], dataset2['title'])
def test_dataset_json_ld_1(self): contents = self._get_file_contents("catalog_pod.jsonld") p = RDFParser(profiles=["euro_dcat_ap"]) p.parse(contents, _format="json-ld") datasets = [d for d in p.datasets()] eq_(len(datasets), 1) dataset = datasets[0] extras = dict((e["key"], e["value"]) for e in dataset["extras"]) eq_(dataset["title"], "U.S. Widget Manufacturing Statistics") eq_(extras["contact_name"], "Jane Doe") eq_(extras["contact_email"], "mailto:[email protected]") eq_(extras["publisher_name"], "Widget Services") eq_(extras["publisher_email"], "*****@*****.**") eq_(len(dataset["resources"]), 4) resource = [r for r in dataset["resources"] if r["name"] == "widgets.csv"][0] eq_(resource["name"], u"widgets.csv") eq_(resource["url"], u"https://data.agency.gov/datasets/widgets-statistics/widgets.csv") eq_(resource["download_url"], u"https://data.agency.gov/datasets/widgets-statistics/widgets.csv")
def test_dataset_ttl(self): dataset = factories.Dataset(notes="Test dataset") url = url_for("dcat_dataset", _id=dataset["id"], _format="ttl") app = self._get_test_app() response = app.get(url) eq_(response.headers["Content-Type"], "text/turtle") content = response.body # Parse the contents to check it's an actual serialization p = RDFParser() p.parse(content, _format="turtle") dcat_datasets = [d for d in p.datasets()] eq_(len(dcat_datasets), 1) dcat_dataset = dcat_datasets[0] eq_(dcat_dataset["title"], dataset["title"]) eq_(dcat_dataset["notes"], dataset["notes"])
def test_dataset_turtle_1(self):
    contents = self._get_file_contents('dataset_deri.ttl')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents, _format='n3')
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    dataset = datasets[0]
    eq_(dataset['title'], 'Abandoned Vehicles')
    eq_(len(dataset['resources']), 1)
    resource = dataset['resources'][0]
    eq_(resource['name'], u'CSV distribution of: Abandoned Vehicles')
    eq_(resource['url'], u'http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv')
    eq_(resource['uri'], u'http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv')
def test_subthemes(self):
    load_themes()
    subthemes = [{
        'theme': 'AGRI',
        'subthemes': [
            'http://eurovoc.europa.eu/100253',
            'http://eurovoc.europa.eu/100258'
        ]
    }, {
        'theme': 'ENVI',
        'subthemes': []
    }]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
        'issued': '2016-11-29',
        'modified': '2016-11-29',
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '412946129',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        'alternate_identifier': 'ISBN,TEST',
        'theme': json.dumps(subthemes),
    }
    s = RDFSerializer()
    p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])
    serialized = s.serialize_dataset(dataset)
    p.parse(serialized)
    datasets = list(p.datasets())
    assert len(datasets) == 1
    d = datasets[0]
    themes = json.loads(dataset['theme'])
    assert (len(themes) == len(subthemes) == 2)
    for t in themes:
        if t['theme'] == 'ENVI':
            assert t['subthemes'] == []
        elif t['theme'] == 'AGRI':
            assert set(t['subthemes']) == set(subthemes[0]['subthemes'])
        else:
            assert False, "Unknown theme: {}".format(t)
def test_publisher(self):
    contents = self._get_file_contents('catalog_dati_unibo.rdf')
    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    g = p.g
    datasets = [d for d in p.datasets()]
    assert(len(datasets) > 1)
    for d in datasets:
        did = d['identifier']
        pname = d.get('publisher_name')
        pid = d.get('publisher_identifier')
        dat_ref = list(g.subjects(DCT.identifier, Literal(did)))[0]
        pub_ref = g.value(dat_ref, DCT.publisher)
        pubnames = list(g.objects(pub_ref, FOAF.name))
        if not pubnames:
            assert pname is None and pid is None,\
                "Got {}/{} for publisher, when no ref in graph".format(pname, pid)
        else:
            assert pname and pid, "no pname {} and pid {} for {}".format(pname, pid, pubnames)
            lang_hit = False
            for lname in pubnames:
                if hasattr(lname, 'lang'):
                    if lname.lang and lname.lang == DEFAULT_LANG:
                        lang_hit = pname == lname.value
                else:
                    if not lang_hit:
                        lang_hit = pname == lname.value
            assert lang_hit, "There should be lang hit"
def test_parse_dataset_default_lang_not_in_graph(self):
    maxrdf = self._get_max_rdf()
    p = RDFParser(profiles=['euro_dcat_ap', 'dcatap_de'])
    p.parse(maxrdf)
    self._add_basic_fields_with_languages(p)
    datasets = [d for d in p.datasets()]
    self.assertEqual(len(datasets), 1)
    dataset = datasets[0]
    # Title and description random
    self.assertIn(u'Naturräume Geest und Marsch', dataset.get('title'))
    self.assertIn(
        u'Die Zuordnung des Hamburger Stadtgebietes zu den Naturräumen Geest und Marsch wird dargestellt',
        dataset.get('notes'))
    # Publisher and ContactPoint
    extras = dataset.get('extras')
    self.assertTrue(len(extras) > 0)
    self.assertIn(u'Behörde für Umwelt und Energie (BUE), Amt für Umweltschutz',
                  self._get_value_from_extras(extras, 'publisher_name'))
    self.assertIn(u'Herr Dr. Michael Schröder',
                  self._get_value_from_extras(extras, 'contact_name'))
    # Resources
    resources = dataset.get('resources')
    self.assertEqual(len(resources), 2)
    for res in resources:
        # Title and description random
        self.assertIn(u'Naturräume Geest und Marsch', res.get('name'))
        self.assertIn(
            u'Das ist eine deutsche Beschreibung der Distribution',
            res.get('description'))
def test_creators(self):
    creators = [
        {'creator_name': {DEFAULT_LANG: 'abc', 'it': 'abc it'}, 'creator_identifier': "ABC"},
        {'creator_name': {DEFAULT_LANG: 'cde', 'it': 'cde it'}, 'creator_identifier': "CDE"},
    ]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
        'issued': '2016-11-29',
        'modified': '2016-11-29',
        'identifier': 'ISBN',
        'temporal_start': '2016-11-01',
        'temporal_end': '2016-11-30',
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '412946129',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        'alternate_identifier': 'ISBN,TEST',
        'theme': '{ECON,ENVI}',
        'geographical_geonames_url': 'http://www.geonames.org/3181913',
        'language': '{DEU,ENG,ITA}',
        'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
        'creator': json.dumps(creators)
    }
    s = RDFSerializer()
    p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])
    serialized = s.serialize_dataset(dataset)
    p.parse(serialized)
    datasets = list(p.datasets())
    assert len(datasets) == 1
    d = datasets[0]
    creators.append({'creator_identifier': dataset['creator_identifier'],
                     'creator_name': {DEFAULT_LANG: dataset['creator_name']}})
    creators_dict = dict((v['creator_identifier'], v) for v in creators)
    creators_in = json.loads(d['creator'])
    for c in creators_in:
        assert c['creator_identifier'] in creators_dict.keys(), "no {} key in {}".format(c['creator_identifier'], creators_dict.keys())
        assert c['creator_name'] == creators_dict[c['creator_identifier']]['creator_name'],\
            "{} vs {}".format(c['creator_name'], creators_dict[c['creator_identifier']]['creator_name'])
    for c in creators_dict.keys():
        assert c in [_c['creator_identifier'] for _c in creators_in]
        cdata = creators_dict[c]
        assert cdata in creators_in
def test_temporal_coverage(self):
    load_themes()
    temporal_coverage = [
        {'temporal_start': '2001-01-01T00:00:00', 'temporal_end': '2001-02-01T10:11:12'},
        {'temporal_start': '2001-01-01T00:00:00', 'temporal_end': '2001-02-01T10:11:12'},
    ]
    dataset = {
        'id': '4b6fe9ca-dc77-4cec-92a4-55c6624a5bd6',
        'name': 'test-dataset',
        'title': 'Dataset di test DCAT_AP-IT',
        'notes': 'dcatapit dataset di test',
        'metadata_created': '2015-06-26T15:21:09.034694',
        'metadata_modified': '2015-06-26T15:21:09.075774',
        'tags': [{'name': 'Tag 1'}, {'name': 'Tag 2'}],
        'issued': '2016-11-29',
        'modified': '2016-11-29',
        'identifier': 'ISBN',
        'temporal_start': '2016-11-01T00:00:00',
        'temporal_end': '2016-11-30T00:00:00',
        'temporal_coverage': json.dumps(temporal_coverage),
        'frequency': 'UPDATE_CONT',
        'publisher_name': 'bolzano',
        'publisher_identifier': '234234234',
        'creator_name': 'test',
        'creator_identifier': '412946129',
        'holder_name': 'bolzano',
        'holder_identifier': '234234234',
        'alternate_identifier': 'ISBN,TEST',
        'theme': '{ECON,ENVI}',
        'geographical_geonames_url': 'http://www.geonames.org/3181913',
        'language': '{DEU,ENG,ITA}',
        'is_version_of': 'http://dcat.geo-solutions.it/dataset/energia-da-fonti-rinnovabili2',
    }
    s = RDFSerializer()
    p = RDFParser(profiles=['euro_dcat_ap', 'it_dcat_ap'])
    serialized = s.serialize_dataset(dataset)
    p.parse(serialized)
    datasets = list(p.datasets())
    assert len(datasets) == 1
    d = datasets[0]
    temporal_coverage.append({'temporal_start': dataset['temporal_start'],
                              'temporal_end': dataset['temporal_end']})
    try:
        validators.dcatapit_temporal_coverage(d['temporal_coverage'], {})
        # this should not raise exception
        assert True
    except validators.Invalid, err:
        assert False, "Temporal coverage should be valid: {}".format(err)
def test_alternate_identifiers(self):
    contents = self._get_file_contents('dataset_identifier.rdf')
    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    g = p.g
    datasets = [d for d in p.datasets()]
    assert len(datasets) == 1
    assert datasets[0]['alternate_identifier'] == '[{"identifier": "ISBN:alt id 123", "agent": {}}]',\
        datasets[0]['alternate_identifier']
def test_catalog(self):
    contents = self._get_file_contents('catalog.xml')
    p = RDFParser(profiles=['swiss_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 2)
def test_alternate_identifiers(self):
    with open(get_example_file('dataset_identifier.rdf'), 'r') as f:
        contents = f.read()
    p = RDFParser(profiles=['it_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    assert len(datasets) == 1
    assert datasets[0]['alternate_identifier'] == '[{"identifier": "ISBN:alt id 123", "agent": {}}]',\
        datasets[0]['alternate_identifier']
def test_parse_without_pagination(self): data = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"> <rdfs:SomeClass rdf:about="http://example.org"> <rdfs:label>Some label</rdfs:label> </rdfs:SomeClass> </rdf:RDF> ''' p = RDFParser() p.parse(data) eq_(p.next_page(), None)
def test_parse_without_pagination(self): data = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"> <rdfs:SomeClass rdf:about="http://example.org"> <rdfs:label>Some label</rdfs:label> </rdfs:SomeClass> </rdf:RDF> ''' p = RDFParser() p.parse(data) assert p.next_page() is None
def test_parse_data_different_format(self):
    data = '''
    @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    <http://example.org> a rdfs:SomeClass ;
        rdfs:label "Some label" .
    '''
    p = RDFParser()
    eq_(len(p.g), 0)
    p.parse(data, _format='n3')
    eq_(len(p.g), 2)
def test_parse_data_different_format(self):
    data = '''
    @prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
    @prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
    <http://example.org> a rdfs:SomeClass ;
        rdfs:label "Some label" .
    '''
    p = RDFParser()
    assert len(p.g) == 0
    p.parse(data, _format='n3')
    assert len(p.g) == 2
def test_catalog_xml_rdf(self): contents = self._get_file_contents("catalog.rdf") p = RDFParser(profiles=["euro_dcat_ap"]) p.parse(contents) datasets = [d for d in p.datasets()] eq_(len(datasets), 2) dataset = datasets[0] if datasets[0]["title"] == "Example dataset 1" else datasets[1] eq_(dataset["title"], "Example dataset 1") eq_(len(dataset["resources"]), 3) eq_(len(dataset["tags"]), 2)
def test_catalog_xml_rdf(self):
    contents = self._get_file_contents('catalog.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    assert len(datasets) == 2
    dataset = (datasets[0] if datasets[0]['title'] == 'Example dataset 1'
               else datasets[1])
    assert dataset['title'] == 'Example dataset 1'
    assert len(dataset['resources']) == 3
    assert len(dataset['tags']) == 2
def test_dataset_show_without_format(self):
    dataset = factories.Dataset(notes='Test dataset')
    content = helpers.call_action('dcat_dataset_show', id=dataset['id'])
    # Parse the contents to check it's an actual serialization
    p = RDFParser()
    p.parse(content)
    dcat_datasets = [d for d in p.datasets()]
    eq_(len(dcat_datasets), 1)
    dcat_dataset = dcat_datasets[0]
    eq_(dcat_dataset['title'], dataset['title'])
    eq_(dcat_dataset['notes'], dataset['notes'])
def test_catalog_xml_rdf(self):
    contents = self._get_file_contents('catalog.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 2)
    dataset = (datasets[0] if datasets[0]['title'] == 'Example dataset 1'
               else datasets[1])
    eq_(dataset['title'], 'Example dataset 1')
    eq_(len(dataset['resources']), 3)
    eq_(len(dataset['tags']), 2)
def gather_stage(self, harvest_job):
    log.debug('In DCATRDFHarvester gather_stage')

    # Get file contents
    url = harvest_job.source.url

    for harvester in p.PluginImplementations(IDCATRDFHarvester):
        url, before_download_errors = harvester.before_download(url, harvest_job)

        for error_msg in before_download_errors:
            self._save_gather_error(error_msg, harvest_job)

        if not url:
            return False

    rdf_format = None
    if harvest_job.source.config:
        rdf_format = json.loads(harvest_job.source.config).get("rdf_format")
    content, rdf_format = self._get_content_and_type(url, harvest_job, 1, content_type=rdf_format)

    # TODO: store content?
    for harvester in p.PluginImplementations(IDCATRDFHarvester):
        content, after_download_errors = harvester.after_download(content, harvest_job)

        for error_msg in after_download_errors:
            self._save_gather_error(error_msg, harvest_job)

    if not content:
        return False

    # TODO: profiles conf
    parser = RDFParser()

    try:
        parser.parse(content, _format=rdf_format)
    except RDFParserException, e:
        self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
        return False
def test_parse_data(self): data = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#"> <rdfs:SomeClass rdf:about="http://example.org"> <rdfs:label>Some label</rdfs:label> </rdfs:SomeClass> </rdf:RDF> ''' p = RDFParser() eq_(len(p.g), 0) p.parse(data) eq_(len(p.g), 2)
def parse_chunk(self, harvest_job, content, rdf_format, guids_in_source, object_ids):
    # TODO: store content?
    for harvester in p.PluginImplementations(IDCATRDFHarvester):
        content, after_download_errors = harvester.after_download(content, harvest_job)

        for error_msg in after_download_errors:
            self._save_gather_error(error_msg, harvest_job)

    if not content:
        return False

    # TODO: profiles conf
    parser = RDFParser()

    try:
        parser.parse(content, _format=rdf_format)
    except RDFParserException, e:
        self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
        return False
def test_dataset_show_with_format():
    dataset = factories.Dataset(notes='Test dataset')
    content = helpers.call_action('dcat_dataset_show', id=dataset['id'], _format='xml')
    # Parse the contents to check it's an actual serialization
    p = RDFParser()
    p.parse(content, _format='xml')
    dcat_datasets = [d for d in p.datasets()]
    assert len(dcat_datasets) == 1
    dcat_dataset = dcat_datasets[0]
    assert dcat_dataset['title'] == dataset['title']
    assert dcat_dataset['notes'] == dataset['notes']
def test_dataset_turtle_1(self): contents = self._get_file_contents("dataset_deri.ttl") p = RDFParser(profiles=["euro_dcat_ap"]) p.parse(contents, _format="n3") datasets = [d for d in p.datasets()] eq_(len(datasets), 1) dataset = datasets[0] eq_(dataset["title"], "Abandoned Vehicles") eq_(len(dataset["resources"]), 1) resource = dataset["resources"][0] eq_(resource["name"], u"CSV distribution of: Abandoned Vehicles") eq_(resource["url"], u"http://data.london.gov.uk/datafiles/environment/abandoned-vehicles-borough.csv") eq_(resource["uri"], u"http://data.london.gov.uk/dataset/Abandoned_Vehicles/csv")
def test_dataset_issued_with_year_before_1900(self):
    contents = self._get_file_contents('1894.xml')
    p = RDFParser(profiles=['swiss_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    dataset = datasets[0]
    # Check date values
    eq_(dataset['issued'], -2398377600)
    issued = datetime.fromtimestamp(dataset['issued'])
    eq_(issued.date().isoformat(), u'1893-12-31')
    eq_(dataset['modified'], 1524528000)
    modified = datetime.fromtimestamp(dataset['modified'])
    eq_(modified.date().isoformat(), u'2018-04-24')
def test_parse_pagination_last_page(self): data = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=3"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems> <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage> <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage> <hydra:previousPage>http://example.com/catalog.xml?page=2</hydra:previousPage> </hydra:PagedCollection> </rdf:RDF> ''' p = RDFParser() p.parse(data) assert p.next_page() is None
def test_parse_pagination_last_page(self): data = '''<?xml version="1.0" encoding="utf-8" ?> <rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns:rdfs="http://www.w3.org/2000/01/rdf-schema#" xmlns:hydra="http://www.w3.org/ns/hydra/core#"> <hydra:PagedCollection rdf:about="http://example.com/catalog.xml?page=3"> <hydra:totalItems rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">245</hydra:totalItems> <hydra:lastPage>http://example.com/catalog.xml?page=3</hydra:lastPage> <hydra:itemsPerPage rdf:datatype="http://www.w3.org/2001/XMLSchema#integer">100</hydra:itemsPerPage> <hydra:firstPage>http://example.com/catalog.xml?page=1</hydra:firstPage> <hydra:previousPage>http://example.com/catalog.xml?page=2</hydra:previousPage> </hydra:PagedCollection> </rdf:RDF> ''' p = RDFParser() p.parse(data) eq_(p.next_page(), None)
def test_catalog_ttl(self):
    for i in xrange(4):
        factories.Dataset()
    url = url_for('dcat_catalog', _format='ttl')
    app = self._get_test_app()
    response = app.get(url)
    eq_(response.headers['Content-Type'], 'text/turtle')
    content = response.body
    # Parse the contents to check it's an actual serialization
    p = RDFParser()
    p.parse(content, _format='turtle')
    dcat_datasets = [d for d in p.datasets()]
    eq_(len(dcat_datasets), 4)
def test_dataset_compatibility_mode(self): contents = self._get_file_contents("dataset.rdf") p = RDFParser(profiles=["euro_dcat_ap"], compatibility_mode=True) p.parse(contents) datasets = [d for d in p.datasets()] eq_(len(datasets), 1) dataset = datasets[0] def _get_extra_value(key): v = [extra["value"] for extra in dataset["extras"] if extra["key"] == key] return v[0] if v else None eq_(_get_extra_value("dcat_issued"), u"2012-05-10") eq_(_get_extra_value("dcat_modified"), u"2012-05-10T21:04:00") eq_(_get_extra_value("dcat_publisher_name"), "Publishing Organization for dataset 1") eq_(_get_extra_value("dcat_publisher_email"), "*****@*****.**") eq_(_get_extra_value("language"), "ca,en,es")
def test_catalog_ttl(self): for i in xrange(4): factories.Dataset() url = url_for("dcat_catalog", _format="ttl") app = self._get_test_app() response = app.get(url) eq_(response.headers["Content-Type"], "text/turtle") content = response.body # Parse the contents to check it's an actual serialization p = RDFParser() p.parse(content, _format="turtle") dcat_datasets = [d for d in p.datasets()] eq_(len(dcat_datasets), 4)
def test_catalog_modified_date(self): dataset1 = factories.Dataset(title="First dataset") time.sleep(1) dataset2 = factories.Dataset(title="Second dataset") url = url_for("dcat_catalog", _format="ttl", modified_since=dataset2["metadata_modified"]) app = self._get_test_app() response = app.get(url) content = response.body p = RDFParser() p.parse(content, _format="turtle") dcat_datasets = [d for d in p.datasets()] eq_(len(dcat_datasets), 1) eq_(dcat_datasets[0]["title"], dataset2["title"])
def test_catalog_default(self):
    for i in xrange(4):
        factories.Dataset()
    url = url_for('dcat_catalog', _format='rdf')
    app = self._get_test_app()
    response = app.get(url)
    eq_(response.headers['Content-Type'], 'application/rdf+xml')
    content = response.body
    # Parse the contents to check it's an actual serialization
    p = RDFParser()
    p.parse(content, _format='xml')
    dcat_datasets = [d for d in p.datasets()]
    eq_(len(dcat_datasets), 4)
def _update_package_in_triplestore(self, package_id, package_org):
    '''Updates the package with the given package ID in the triple store.'''
    uri = 'n/a'
    # Get uri of dataset
    rdf = self._get_rdf(package_id)
    rdf_parser = RDFParser()
    rdf_parser.parse(rdf)
    # Should be only one dataset
    for uri in rdf_parser._datasets():
        self.triplestore_client.delete_dataset_in_triplestore(uri)
        self.triplestore_client.create_dataset_in_triplestore(rdf, uri)

        # shacl-validate the graph
        validation_rdf = self.shacl_validation_client.validate(rdf, uri, package_org)
        if validation_rdf:
            # update in mqa-triplestore
            self.triplestore_client.delete_dataset_in_triplestore_mqa(uri, package_org)
            self.triplestore_client.create_dataset_in_triplestore_mqa(validation_rdf, uri)

    return uri
def test_dataset_compatibility_mode(self):
    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'], compatibility_mode=True)
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    dataset = datasets[0]

    def _get_extra_value(key):
        v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key]
        return v[0] if v else None

    eq_(_get_extra_value('dcat_issued'), u'2012-05-10')
    eq_(_get_extra_value('dcat_modified'), u'2012-05-10T21:04:00')
    eq_(_get_extra_value('dcat_publisher_name'), 'Publishing Organization for dataset 1')
    eq_(_get_extra_value('dcat_publisher_email'), '*****@*****.**')
    eq_(_get_extra_value('language'), 'ca,en,es')
def gather_stage(self, harvest_job):
    log.debug('In DCATRDFHarvester gather_stage')

    # Get file contents
    url = harvest_job.source.url

    for harvester in p.PluginImplementations(IDCATRDFHarvester):
        url, before_download_errors = harvester.before_download(url, harvest_job)

        for error_msg in before_download_errors:
            self._save_gather_error(error_msg, harvest_job)

        if not url:
            return False

    content = self._get_content(url, harvest_job, 1)

    # TODO: store content?
    for harvester in p.PluginImplementations(IDCATRDFHarvester):
        content, after_download_errors = harvester.after_download(content, harvest_job)

        for error_msg in after_download_errors:
            self._save_gather_error(error_msg, harvest_job)

    if not content:
        return False

    # TODO: profiles conf
    parser = RDFParser()

    # TODO: format conf
    try:
        parser.parse(content)
    except RDFParserException, e:
        self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
        return False
def _delete_dataset_in_triplestore(self, harvest_object):
    '''
    Deletes the package with the given package ID in the triple store.
    '''
    try:
        if self.triplestore_client.is_available():
            package_id = harvest_object.package_id
            LOGGER.debug(u'Start deleting dataset with ID %s from triplestore.', package_id)
            context = {'user': self._get_user_name()}
            rdf = toolkit.get_action('dcat_dataset_show')(context, {'id': package_id})
            rdf_parser = RDFParser()
            rdf_parser.parse(rdf)
            # Should be only one dataset
            uri = next(rdf_parser._datasets(), None)
            source_dataset = model.Package.get(harvest_object.source.id)
            self._delete_dataset_in_triplestore_by_uri(uri, source_dataset)
    except RDFParserException as ex:
        LOGGER.warn(u'Error while parsing the RDF file for dataset with ID %s: %s', package_id, ex)
def test_parse_data_raises_on_parse_error(self):
    p = RDFParser()
    data = 'Wrong data'
    with pytest.raises(RDFParserException):
        p.parse('')
    with pytest.raises(RDFParserException):
        p.parse(data)
    with pytest.raises(RDFParserException):
        p.parse(data, _format='n3')
def gather_stage(self, harvest_job): log.debug('In DCATRDFHarvester gather_stage') rdf_format = None if harvest_job.source.config: rdf_format = json.loads(harvest_job.source.config).get("rdf_format") # Get file contents of first page next_page_url = harvest_job.source.url guids_in_source = [] object_ids = [] while next_page_url: for harvester in p.PluginImplementations(IDCATRDFHarvester): next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job) for error_msg in before_download_errors: self._save_gather_error(error_msg, harvest_job) if not next_page_url: return [] content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format) # TODO: store content? for harvester in p.PluginImplementations(IDCATRDFHarvester): content, after_download_errors = harvester.after_download(content, harvest_job) for error_msg in after_download_errors: self._save_gather_error(error_msg, harvest_job) if not content: return [] # TODO: profiles conf parser = RDFParser() try: parser.parse(content, _format=rdf_format) except RDFParserException, e: self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job) return [] for dataset in parser.datasets(): if not dataset.get('name'): dataset['name'] = self._gen_new_name(dataset['title']) # Unless already set by the parser, get the owner organization (if any) # from the harvest source dataset if not dataset.get('owner_org'): source_dataset = model.Package.get(harvest_job.source.id) if source_dataset.owner_org: dataset['owner_org'] = source_dataset.owner_org # Try to get a unique identifier for the harvested dataset guid = self._get_guid(dataset) if not guid: self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset), harvest_job) continue dataset['extras'].append({'key': 'guid', 'value': guid}) guids_in_source.append(guid) obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset)) obj.save() object_ids.append(obj.id) # get the next page next_page_url = parser.next_page()
def gather_stage(self, harvest_job): log.debug('In DCATRDFHarvester gather_stage') rdf_format = None if harvest_job.source.config: rdf_format = json.loads(harvest_job.source.config).get("rdf_format") # Get file contents of first page next_page_url = harvest_job.source.url guids_in_source = [] object_ids = [] last_content_hash = None self._names_taken = [] while next_page_url: for harvester in p.PluginImplementations(IDCATRDFHarvester): next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job) for error_msg in before_download_errors: self._save_gather_error(error_msg, harvest_job) if not next_page_url: return [] content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format) content_hash = hashlib.md5() if content: content_hash.update(content) if last_content_hash: if content_hash.digest() == last_content_hash.digest(): log.warning('Remote content was the same even when using a paginated URL, skipping') break else: last_content_hash = content_hash # TODO: store content? for harvester in p.PluginImplementations(IDCATRDFHarvester): content, after_download_errors = harvester.after_download(content, harvest_job) for error_msg in after_download_errors: self._save_gather_error(error_msg, harvest_job) if not content: return [] # TODO: profiles conf parser = RDFParser() try: parser.parse(content, _format=rdf_format) except RDFParserException, e: self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job) return [] try: source_dataset = model.Package.get(harvest_job.source.id) for dataset in parser.datasets(): if not dataset.get('name'): dataset['name'] = self._gen_new_name(dataset['title']) if dataset['name'] in self._names_taken: suffix = len([i for i in self._names_taken if i.startswith(dataset['name'] + '-')]) + 1 dataset['name'] = '{}-{}'.format(dataset['name'], suffix) self._names_taken.append(dataset['name']) # Unless already set by the parser, get the owner organization (if any) # from the harvest source dataset if not dataset.get('owner_org'): if source_dataset.owner_org: dataset['owner_org'] = source_dataset.owner_org # Try to get a unique identifier for the harvested dataset guid = self._get_guid(dataset, source_url=source_dataset.url) if not guid: self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset), harvest_job) continue dataset['extras'].append({'key': 'guid', 'value': guid}) guids_in_source.append(guid) obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset)) obj.save() object_ids.append(obj.id) except Exception, e: self._save_gather_error('Error when processsing dataset: %r / %s' % (e, traceback.format_exc()), harvest_job) return []
def test_dataset_all_fields(self): contents = self._get_file_contents("dataset.rdf") p = RDFParser(profiles=["euro_dcat_ap"]) p.parse(contents) datasets = [d for d in p.datasets()] eq_(len(datasets), 1) dataset = datasets[0] # Basic fields eq_(dataset["title"], u"Zimbabwe Regional Geochemical Survey.") eq_(dataset["notes"], u"During the period 1982-86 a team of geologists from the British Geological Survey ...") eq_(dataset["url"], "http://dataset.info.org") eq_(dataset["version"], "2.3") # Tags eq_( sorted(dataset["tags"], key=lambda k: k["name"]), [{"name": u"exploration"}, {"name": u"geochemistry"}, {"name": u"geology"}], ) # Extras def _get_extra_value(key): v = [extra["value"] for extra in dataset["extras"] if extra["key"] == key] return v[0] if v else None def _get_extra_value_as_list(key): value = _get_extra_value(key) return json.loads(value) if value else [] # Simple values eq_(_get_extra_value("issued"), u"2012-05-10") eq_(_get_extra_value("modified"), u"2012-05-10T21:04:00") eq_(_get_extra_value("identifier"), u"9df8df51-63db-37a8-e044-0003ba9b0d98") eq_(_get_extra_value("version_notes"), u"New schema added") eq_(_get_extra_value("temporal_start"), "1905-03-01") eq_(_get_extra_value("temporal_end"), "2013-01-05") eq_(_get_extra_value("frequency"), "http://purl.org/cld/freq/daily") eq_(_get_extra_value("spatial_uri"), "http://publications.europa.eu/mdr/authority/country/ZWE") eq_(_get_extra_value("publisher_uri"), "http://orgs.vocab.org/some-org") eq_(_get_extra_value("publisher_name"), "Publishing Organization for dataset 1") eq_(_get_extra_value("publisher_email"), "*****@*****.**") eq_(_get_extra_value("publisher_url"), "http://some.org") eq_(_get_extra_value("publisher_type"), "http://purl.org/adms/publishertype/NonProfitOrganisation") eq_(_get_extra_value("contact_name"), "Point of Contact") eq_(_get_extra_value("contact_email"), "mailto:[email protected]") eq_(_get_extra_value("access_rights"), "public") eq_(_get_extra_value("provenance"), "Some statement about provenance") eq_(_get_extra_value("dcat_type"), "test-type") # Lists eq_(sorted(_get_extra_value_as_list("language")), [u"ca", u"en", u"es"]) eq_( sorted(_get_extra_value_as_list("theme")), [u"Earth Sciences", u"http://eurovoc.europa.eu/100142", u"http://eurovoc.europa.eu/209065"], ) eq_(sorted(_get_extra_value_as_list("conforms_to")), [u"Standard 1", u"Standard 2"]) eq_( sorted(_get_extra_value_as_list("alternate_identifier")), [u"alternate-identifier-1", u"alternate-identifier-2"], ) eq_( sorted(_get_extra_value_as_list("documentation")), [u"http://dataset.info.org/doc1", u"http://dataset.info.org/doc2"], ) eq_( sorted(_get_extra_value_as_list("related_resource")), [u"http://dataset.info.org/related1", u"http://dataset.info.org/related2"], ) eq_( sorted(_get_extra_value_as_list("has_version")), [ u"https://data.some.org/catalog/datasets/derived-dataset-1", u"https://data.some.org/catalog/datasets/derived-dataset-2", ], ) eq_( sorted(_get_extra_value_as_list("is_version_of")), [u"https://data.some.org/catalog/datasets/original-dataset"], ) eq_( sorted(_get_extra_value_as_list("source")), [ u"https://data.some.org/catalog/datasets/source-dataset-1", u"https://data.some.org/catalog/datasets/source-dataset-2", ], ) eq_( sorted(_get_extra_value_as_list("sample")), [u"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/sample"], ) # Dataset URI eq_(_get_extra_value("uri"), u"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98") # Resources eq_(len(dataset["resources"]), 1) 
resource = dataset["resources"][0] # Simple values eq_(resource["name"], u"Some website") eq_(resource["description"], u"A longer description") eq_(resource["format"], u"HTML") eq_(resource["mimetype"], u"text/html") eq_(resource["issued"], u"2012-05-11") eq_(resource["modified"], u"2012-05-01T00:04:06") eq_(resource["status"], u"http://purl.org/adms/status/Completed") eq_(resource["hash"], u"4304cf2e751e6053c90b1804c89c0ebb758f395a") eq_(resource["hash_algorithm"], u"http://spdx.org/rdf/terms#checksumAlgorithm_sha1") # Lists for item in [ ( "documentation", [u"http://dataset.info.org/distribution1/doc1", u"http://dataset.info.org/distribution1/doc2"], ), ("language", [u"ca", u"en", u"es"]), ("conforms_to", [u"Standard 1", u"Standard 2"]), ]: eq_(sorted(json.loads(resource[item[0]])), item[1]) # These two are likely to need clarification eq_(resource["license"], u"http://creativecommons.org/licenses/by/3.0/") eq_(resource["rights"], u"Some statement about rights") eq_(resource["url"], u"http://www.bgs.ac.uk/gbase/geochemcd/home.html") assert "download_url" not in resource eq_(resource["size"], 12323) # Distribution URI eq_(resource["uri"], u"https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1")
def gather_stage(self, harvest_job):
    log.debug('In DCATRDFHarvester gather_stage')

    rdf_format = None
    if harvest_job.source.config:
        rdf_format = json.loads(harvest_job.source.config).get("rdf_format")

    # Get file contents of first page
    next_page_url = harvest_job.source.url

    guids_in_source = []
    object_ids = []
    last_content_hash = None

    while next_page_url:
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            next_page_url, before_download_errors = harvester.before_download(next_page_url, harvest_job)

            for error_msg in before_download_errors:
                self._save_gather_error(error_msg, harvest_job)

            if not next_page_url:
                return []

        content, rdf_format = self._get_content_and_type(next_page_url, harvest_job, 1, content_type=rdf_format)

        content_hash = hashlib.md5()
        content_hash.update(content)

        if last_content_hash:
            if content_hash.digest() == last_content_hash.digest():
                log.warning('Remote content was the same even when using a paginated URL, skipping')
                break
        else:
            last_content_hash = content_hash

        # TODO: store content?
        for harvester in p.PluginImplementations(IDCATRDFHarvester):
            content, after_download_errors = harvester.after_download(content, harvest_job)

            for error_msg in after_download_errors:
                self._save_gather_error(error_msg, harvest_job)

        if not content:
            return []

        # TODO: profiles conf
        parser = RDFParser()

        try:
            parser.parse(content, _format=rdf_format)
        except RDFParserException, e:
            self._save_gather_error('Error parsing the RDF file: {0}'.format(e), harvest_job)
            return []

        for dataset in parser.datasets():
            if not dataset.get('name'):
                dataset['name'] = self._gen_new_name(dataset['title'])

            # Unless already set by the parser, get the owner organization (if any)
            # from the harvest source dataset
            if not dataset.get('owner_org'):
                source_dataset = model.Package.get(harvest_job.source.id)
                if source_dataset.owner_org:
                    dataset['owner_org'] = source_dataset.owner_org

            # Try to get a unique identifier for the harvested dataset
            guid = self._get_guid(dataset)
            if not guid:
                self._save_gather_error('Could not get a unique identifier for dataset: {0}'.format(dataset), harvest_job)
                continue

            dataset['extras'].append({'key': 'guid', 'value': guid})
            guids_in_source.append(guid)

            obj = HarvestObject(guid=guid, job=harvest_job, content=json.dumps(dataset))
            obj.save()
            object_ids.append(obj.id)

        # get the next page
        next_page_url = parser.next_page()
def test_dataset_all_fields(self):
    contents = self._get_file_contents('dataset.rdf')
    p = RDFParser(profiles=['euro_dcat_ap'])
    p.parse(contents)
    datasets = [d for d in p.datasets()]
    eq_(len(datasets), 1)
    dataset = datasets[0]

    # Basic fields
    eq_(dataset['title'], u'Zimbabwe Regional Geochemical Survey.')
    eq_(dataset['notes'], u'During the period 1982-86 a team of geologists from the British Geological Survey ...')
    eq_(dataset['url'], 'http://dataset.info.org')
    eq_(dataset['version'], '2.3')

    # Tags
    eq_(sorted(dataset['tags'], key=lambda k: k['name']),
        [{'name': u'exploration'}, {'name': u'geochemistry'}, {'name': u'geology'}])

    # Extras
    def _get_extra_value(key):
        v = [extra['value'] for extra in dataset['extras'] if extra['key'] == key]
        return v[0] if v else None

    def _get_extra_value_as_list(key):
        value = _get_extra_value(key)
        return json.loads(value) if value else []

    # Simple values
    eq_(_get_extra_value('issued'), u'2012-05-10')
    eq_(_get_extra_value('modified'), u'2012-05-10T21:04:00')
    eq_(_get_extra_value('identifier'), u'9df8df51-63db-37a8-e044-0003ba9b0d98')
    eq_(_get_extra_value('alternate_identifier'), u'alternate-identifier-x343')
    eq_(_get_extra_value('version_notes'), u'New schema added')
    eq_(_get_extra_value('temporal_start'), '1905-03-01')
    eq_(_get_extra_value('temporal_end'), '2013-01-05')
    eq_(_get_extra_value('frequency'), 'http://purl.org/cld/freq/daily')
    eq_(_get_extra_value('spatial_uri'), 'http://publications.europa.eu/mdr/authority/country/ZWE')
    eq_(_get_extra_value('publisher_uri'), 'http://orgs.vocab.org/some-org')
    eq_(_get_extra_value('publisher_name'), 'Publishing Organization for dataset 1')
    eq_(_get_extra_value('publisher_email'), '*****@*****.**')
    eq_(_get_extra_value('publisher_url'), 'http://some.org')
    eq_(_get_extra_value('publisher_type'), 'http://purl.org/adms/publishertype/NonProfitOrganisation')
    eq_(_get_extra_value('contact_name'), 'Point of Contact')
    eq_(_get_extra_value('contact_email'), 'mailto:[email protected]')

    # Lists
    eq_(sorted(_get_extra_value_as_list('language')), [u'ca', u'en', u'es'])
    eq_(sorted(_get_extra_value_as_list('theme')),
        [u'Earth Sciences', u'http://eurovoc.europa.eu/100142', u'http://eurovoc.europa.eu/209065'])
    eq_(sorted(_get_extra_value_as_list('conforms_to')), [u'Standard 1', u'Standard 2'])

    # Dataset URI
    eq_(_get_extra_value('uri'), u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98')

    # Resources
    eq_(len(dataset['resources']), 1)

    resource = dataset['resources'][0]

    # Simple values
    eq_(resource['name'], u'Some website')
    eq_(resource['description'], u'A longer description')
    eq_(resource['format'], u'HTML')
    eq_(resource['mimetype'], u'text/html')
    eq_(resource['issued'], u'2012-05-11')
    eq_(resource['modified'], u'2012-05-01T00:04:06')
    eq_(resource['status'], u'http://purl.org/adms/status/Completed')

    # These two are likely to need clarification
    eq_(resource['license'], u'http://creativecommons.org/licenses/by/3.0/')
    eq_(resource['rights'], u'Some statement about rights')

    eq_(resource['url'], u'http://www.bgs.ac.uk/gbase/geochemcd/home.html')
    assert 'download_url' not in resource

    eq_(resource['size'], 12323)

    # Distribution URI
    eq_(resource['uri'], u'https://data.some.org/catalog/datasets/9df8df51-63db-37a8-e044-0003ba9b0d98/1')