def _prepare_resource_format(self, resource):
    """Derive a valid format for ``resource`` and store it in ``format``.

    Candidates are tried in order: ``media_type``, then ``format``, and
    — only if neither maps to a valid format — the file extension of
    ``download_url``.  Resources with no mappable format and no
    ``download_url`` are marked ``SERVICE``; all other unmappable
    resources get an empty string (indexed as N/A).

    Returns the (mutated) resource dict.
    """
    resource_format = ''

    # get format from media_type field if available
    # (the original also tested `not resource_format` here, but
    # resource_format is always '' at this point — dead condition)
    if resource.get('media_type'):
        resource_format = resource['media_type'].split('/')[-1].lower()

    # fall back to the format field if media_type gave nothing
    if not resource_format and resource.get('format'):
        resource_format = resource['format'].split('/')[-1].lower()

    # check if 'media_type' or 'format' can be mapped
    has_format = (map_to_valid_format(resource_format) is not None)

    # if the fields can't be mapped,
    # try to parse the download_url as a last resort
    if not has_format and resource.get('download_url'):
        path = urlparse.urlparse(resource['download_url']).path
        ext = os.path.splitext(path)[1]
        if ext:
            resource_format = ext.replace('.', '').lower()

    mapped_format = map_to_valid_format(resource_format)
    if mapped_format:
        # if format could be successfully mapped write it to format field
        resource['format'] = mapped_format
    elif not resource.get('download_url'):
        # nothing to download: treat the resource as a service endpoint
        resource['format'] = 'SERVICE'
    else:
        # else return empty string (this will be indexed as N/A)
        resource['format'] = ''
    return resource
def _prepare_resource_format(self, resource):
    """Try to map the resource to a valid format and set ``format``.

    Format candidates are tried in order: the file extension of
    ``download_url``, then ``media_type``, then the existing ``format``
    field.  The ``format`` field is only overwritten when the chosen
    candidate maps to a valid format.

    Returns the (possibly mutated) resource dict.
    """
    candidate = ''

    # preferred source: the download URL's file extension
    download_url = resource.get('download_url')
    if download_url is not None:
        url_path = urlparse.urlparse(download_url).path
        extension = os.path.splitext(url_path)[1]
        if extension:
            candidate = extension.replace('.', '').lower()

    # next best: the media type, e.g. "text/csv" -> "csv"
    if not candidate:
        media_type = resource.get('media_type')
        if media_type is not None:
            candidate = media_type.split('/')[-1].lower()

    # last resort: the format field itself
    if not candidate:
        declared_format = resource.get('format')
        if declared_format is not None:
            candidate = declared_format.split('/')[-1].lower()

    valid_format = map_to_valid_format(candidate)
    if valid_format:
        resource['format'] = valid_format
    return resource
def graph_from_dataset(self, dataset_dict, dataset_ref):  # noqa
    """Serialize ``dataset_dict`` as DCAT triples rooted at ``dataset_ref``.

    Adds dataset-level metadata (identifiers, dates, contact points,
    publishers, temporals, themes, relations) and one dcat:Distribution
    node per resource to ``self.g``.
    """
    g = self.g

    for prefix, namespace in namespaces.iteritems():
        g.bind(prefix, namespace)

    g.add((dataset_ref, RDF.type, DCAT.Dataset))

    # Basic fields
    items = [
        ('identifier', DCT.identifier, ['guid', 'id'], Literal),
        ('version', OWL.versionInfo, ['dcat_version'], Literal),
        ('version_notes', ADMS.versionNotes, None, Literal),
        ('frequency', DCT.accrualPeriodicity, None, Literal),
        ('access_rights', DCT.accessRights, None, Literal),
        ('dcat_type', DCT.type, None, Literal),
        ('provenance', DCT.provenance, None, Literal),
        ('spatial', DCT.spatial, None, Literal),
    ]
    self._add_triples_from_dict(dataset_dict, dataset_ref, items)

    self._add_multilang_value(dataset_ref, DCT.description, 'description', dataset_dict)  # noqa
    self._add_multilang_value(dataset_ref, DCT.title, 'title', dataset_dict)  # noqa

    # LandingPage
    g.add((dataset_ref, DCAT.landingPage, Literal(dataset_dict['url'])))

    self._add_multilang_value(dataset_ref, DCAT.keyword, 'keywords', dataset_dict)  # noqa

    # Dates
    items = [
        ('issued', DCT.issued, ['metadata_created'], Literal),
        ('modified', DCT.modified, ['metadata_modified'], Literal),
    ]
    self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

    # Update Interval
    accrual_periodicity = dataset_dict.get('accrual_periodicity')
    if accrual_periodicity:
        g.add((
            dataset_ref,
            DCT.accrualPeriodicity,
            URIRef(accrual_periodicity)
        ))

    # Lists
    items = [
        ('language', DCT.language, None, Literal),
        ('theme', DCAT.theme, None, URIRef),
        ('conforms_to', DCT.conformsTo, None, Literal),
        ('alternate_identifier', ADMS.identifier, None, Literal),
        ('documentation', FOAF.page, None, Literal),
        ('has_version', DCT.hasVersion, None, Literal),
        ('is_version_of', DCT.isVersionOf, None, Literal),
        ('source', DCT.source, None, Literal),
        ('sample', ADMS.sample, None, Literal),
    ]
    self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

    # Relations: each relation becomes a labelled URI node
    if dataset_dict.get('relations'):
        relations = dataset_dict.get('relations')
        for relation in relations:
            relation_name = relation['label']
            relation_url = relation['url']
            relation = URIRef(relation_url)
            g.add((relation, RDFS.label, Literal(relation_name)))
            g.add((dataset_ref, DCT.relation, relation))

    # References
    if dataset_dict.get('see_alsos'):
        references = dataset_dict.get('see_alsos')
        for reference in references:
            reference_identifier = reference['dataset_identifier']
            g.add((dataset_ref, RDFS.seeAlso, Literal(reference_identifier)))  # noqa

    # Contact details
    if dataset_dict.get('contact_points'):
        contact_points = self._get_dataset_value(dataset_dict, 'contact_points')  # noqa
        for contact_point in contact_points:
            contact_details = BNode()
            contact_point_email = contact_point['email']
            contact_point_name = contact_point['name']
            g.add((contact_details, RDF.type, VCARD.Organization))
            g.add((contact_details, VCARD.hasEmail, URIRef(contact_point_email)))  # noqa
            g.add((contact_details, VCARD.fn, Literal(contact_point_name)))
            g.add((dataset_ref, DCAT.contactPoint, contact_details))

    # Publisher
    if dataset_dict.get('publishers'):
        publishers = dataset_dict.get('publishers')
        for publisher in publishers:
            publisher_name = publisher['label']
            publisher_details = BNode()
            g.add((publisher_details, RDF.type, RDF.Description))
            g.add((publisher_details, RDFS.label, Literal(publisher_name)))
            g.add((dataset_ref, DCT.publisher, publisher_details))

    # Temporals: only emit a PeriodOfTime when at least one bound exists
    temporals = dataset_dict.get('temporals')
    if temporals:
        for temporal in temporals:
            start = temporal['start_date']
            end = temporal['end_date']
            if start or end:
                temporal_extent = BNode()
                g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
                if start:
                    self._add_date_triple(temporal_extent, SCHEMA.startDate, start)  # noqa
                if end:
                    self._add_date_triple(temporal_extent, SCHEMA.endDate, end)  # noqa
                g.add((dataset_ref, DCT.temporal, temporal_extent))

    # Themes: derived from CKAN groups
    groups = self._get_dataset_value(dataset_dict, 'groups')
    for group_name in groups:
        g.add((
            dataset_ref,
            DCAT.theme,
            URIRef(ogd_theme_base_url + group_name.get('name'))
        ))

    # Resources
    for resource_dict in dataset_dict.get('resources', []):
        distribution = URIRef(resource_uri(resource_dict))

        g.add((dataset_ref, DCAT.distribution, distribution))
        g.add((distribution, RDF.type, DCAT.Distribution))

        # Simple values
        items = [
            ('status', ADMS.status, None, Literal),
            ('rights', DCT.rights, None, Literal),
            ('license', DCT.license, None, Literal),
            ('identifier', DCT.identifier, None, Literal),
            ('media_type', DCAT.mediaType, None, Literal),
            ('spatial', DCT.spatial, None, Literal),
        ]
        self._add_triples_from_dict(resource_dict, distribution, items)

        # NOTE(review): the original passed dataset_dict here, so every
        # distribution got the dataset's values instead of its own;
        # display_name/description belong to the resource.
        self._add_multilang_value(distribution, DCT.title, 'display_name', resource_dict)  # noqa
        self._add_multilang_value(distribution, DCT.description, 'description', resource_dict)  # noqa

        # Lists
        items = [
            ('documentation', FOAF.page, None, Literal),
            ('language', DCT.language, None, Literal),
            ('conforms_to', DCT.conformsTo, None, Literal),
        ]
        self._add_list_triples_from_dict(resource_dict, distribution, items)

        # URL: download_url doubles as accessURL; a distinct url gets
        # its own accessURL triple
        url = resource_dict.get('url')
        download_url = resource_dict.get('download_url')
        if download_url:
            g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
            g.add((distribution, DCAT.accessURL, URIRef(download_url)))
        if (url and not download_url) or (url and url != download_url):
            g.add((distribution, DCAT.accessURL, URIRef(url)))

        # Format from Download-Url file extension.
        # Guard against URLs without a '.' — the original rsplit(...)[1]
        # raised IndexError for them.
        if download_url and '.' in str(download_url):
            format_value = str(download_url).rsplit('.', 1)[1]
            mapped_format = map_to_valid_format(format_value)
            g.add((distribution, DCT['format'], Literal(mapped_format)))

        # Mime-Type
        if resource_dict.get('mimetype'):
            g.add((
                distribution,
                DCAT.mediaType,
                Literal(resource_dict['mimetype'])
            ))

        # Dates
        items = [
            ('issued', DCT.issued, None, Literal),
            ('modified', DCT.modified, None, Literal),
        ]
        self._add_date_triples_from_dict(resource_dict, distribution, items)

        # Numbers.
        # NOTE(review): the original checked 'byte_size' but then read
        # 'size', raising KeyError whenever 'size' was absent; read
        # 'byte_size' consistently.
        if resource_dict.get('byte_size'):
            try:
                g.add((distribution, DCAT.byteSize,
                       Literal(float(resource_dict['byte_size']),
                               datatype=XSD.decimal)))
            except (ValueError, TypeError):
                g.add((distribution, DCAT.byteSize,
                       Literal(resource_dict['byte_size'])))