Beispiel #1
0
    def _prepare_resource_format(self, resource):
        """Normalize ``resource['format']`` in place and return the resource.

        The format candidate is taken from the last ``/``-separated segment
        of ``media_type`` or, failing that, of ``format``. When that
        candidate cannot be mapped by ``map_to_valid_format``, the file
        extension of ``download_url`` is used instead (if there is one).

        The ``format`` field is then set to the mapped value, to
        ``'SERVICE'`` when nothing could be mapped and the resource has no
        download URL, or to an empty string otherwise.
        """
        candidate = ''

        # first non-empty candidate wins: media_type, then format
        for field in ('media_type', 'format'):
            raw_value = resource.get(field)
            if raw_value:
                candidate = raw_value.split('/')[-1].lower()
            if candidate:
                break

        download_url = resource.get('download_url')

        # last resort: the extension of the download URL's path
        if map_to_valid_format(candidate) is None and download_url:
            url_path = urlparse.urlparse(download_url).path
            _, extension = os.path.splitext(url_path)
            if extension:
                candidate = extension.replace('.', '').lower()

        mapped = map_to_valid_format(candidate)
        if mapped:
            resource['format'] = mapped
        elif download_url:
            # unmappable downloadable file: empty string (indexed as N/A)
            resource['format'] = ''
        else:
            resource['format'] = 'SERVICE'

        return resource
Beispiel #2
0
    def _prepare_resource_format(self, resource):
        """Set ``resource['format']`` to a mapped format when one is found.

        The candidate is the file extension of ``download_url``; when that
        yields nothing, the last ``/``-separated segment of ``media_type``
        and then of ``format`` is used. The ``format`` field is only
        overwritten when ``map_to_valid_format`` returns a truthy value.
        The (possibly modified) resource is returned.
        """
        detected = ''

        # preferred source: extension of the download URL's path
        download_url = resource.get('download_url')
        if download_url is not None:
            url_path = urlparse.urlparse(download_url).path
            _, extension = os.path.splitext(url_path)
            if extension:
                detected = extension.replace('.', '').lower()

        # fallbacks, consulted only while nothing has been detected yet
        for field in ('media_type', 'format'):
            if detected:
                break
            value = resource.get(field)
            if value is not None:
                detected = value.split('/')[-1].lower()

        valid_format = map_to_valid_format(detected)
        if valid_format:
            resource['format'] = valid_format

        return resource
Beispiel #3
0
    def graph_from_dataset(self, dataset_dict, dataset_ref):  # noqa
        """Add the triples for a dataset and its resources to ``self.g``.

        Binds the prefixes from ``namespaces``, types ``dataset_ref`` as
        ``dcat:Dataset`` and adds triples for the dataset fields read
        below. Every entry of ``dataset_dict['resources']`` becomes a
        ``dcat:Distribution`` linked to the dataset.

        :param dataset_dict: dataset dictionary. ``url`` must be present;
            the other top-level keys are optional. Entries of
            ``relations``, ``see_alsos``, ``contact_points``,
            ``publishers`` and ``temporals`` must carry the keys that are
            indexed directly below.
        :param dataset_ref: node used as subject of the dataset triples
        :raises KeyError: if ``url`` or a directly indexed entry key is
            missing
        """
        g = self.g

        # items() is available on Python 2 and Python 3 dictionaries
        for prefix, namespace in namespaces.items():
            g.bind(prefix, namespace)

        g.add((dataset_ref, RDF.type, DCAT.Dataset))

        # Basic fields
        items = [
            ('identifier', DCT.identifier, ['guid', 'id'], Literal),
            ('version', OWL.versionInfo, ['dcat_version'], Literal),
            ('version_notes', ADMS.versionNotes, None, Literal),
            ('frequency', DCT.accrualPeriodicity, None, Literal),
            ('access_rights', DCT.accessRights, None, Literal),
            ('dcat_type', DCT.type, None, Literal),
            ('provenance', DCT.provenance, None, Literal),
            ('spatial', DCT.spatial, None, Literal),
        ]
        self._add_triples_from_dict(dataset_dict, dataset_ref, items)

        self._add_multilang_value(
            dataset_ref, DCT.description, 'description', dataset_dict)
        self._add_multilang_value(
            dataset_ref, DCT.title, 'title', dataset_dict)

        # LandingPage
        g.add((dataset_ref, DCAT.landingPage,
               Literal(dataset_dict['url'])))

        self._add_multilang_value(
            dataset_ref, DCAT.keyword, 'keywords', dataset_dict)

        # Dates
        items = [
            ('issued', DCT.issued, ['metadata_created'], Literal),
            ('modified', DCT.modified, ['metadata_modified'], Literal),
        ]
        self._add_date_triples_from_dict(dataset_dict, dataset_ref, items)

        # Update Interval
        accrual_periodicity = dataset_dict.get('accrual_periodicity')
        if accrual_periodicity:
            g.add((
                dataset_ref,
                DCT.accrualPeriodicity,
                URIRef(accrual_periodicity)
            ))

        # Lists
        items = [
            ('language', DCT.language, None, Literal),
            ('theme', DCAT.theme, None, URIRef),
            ('conforms_to', DCT.conformsTo, None, Literal),
            ('alternate_identifier', ADMS.identifier, None, Literal),
            ('documentation', FOAF.page, None, Literal),
            ('has_version', DCT.hasVersion, None, Literal),
            ('is_version_of', DCT.isVersionOf, None, Literal),
            ('source', DCT.source, None, Literal),
            ('sample', ADMS.sample, None, Literal),
        ]
        self._add_list_triples_from_dict(dataset_dict, dataset_ref, items)

        # Relations: each one is a labelled URI node linked to the dataset
        for relation in dataset_dict.get('relations') or []:
            relation_ref = URIRef(relation['url'])
            g.add((relation_ref, RDFS.label, Literal(relation['label'])))
            g.add((dataset_ref, DCT.relation, relation_ref))

        # References
        for reference in dataset_dict.get('see_alsos') or []:
            g.add((dataset_ref, RDFS.seeAlso,
                   Literal(reference['dataset_identifier'])))

        # Contact details
        if dataset_dict.get('contact_points'):
            contact_points = self._get_dataset_value(
                dataset_dict, 'contact_points')
            for contact_point in contact_points:
                contact_details = BNode()
                contact_point_email = contact_point['email']
                contact_point_name = contact_point['name']

                g.add((contact_details, RDF.type, VCARD.Organization))
                g.add((contact_details, VCARD.hasEmail,
                       URIRef(contact_point_email)))
                g.add((contact_details, VCARD.fn,
                       Literal(contact_point_name)))

                g.add((dataset_ref, DCAT.contactPoint, contact_details))

        # Publisher
        for publisher in dataset_dict.get('publishers') or []:
            publisher_details = BNode()
            g.add((publisher_details, RDF.type, RDF.Description))
            g.add((publisher_details, RDFS.label,
                   Literal(publisher['label'])))
            g.add((dataset_ref, DCT.publisher, publisher_details))

        # Temporals: a period node is only created when it has a bound
        for temporal in dataset_dict.get('temporals') or []:
            start = temporal['start_date']
            end = temporal['end_date']
            if not (start or end):
                continue
            temporal_extent = BNode()
            g.add((temporal_extent, RDF.type, DCT.PeriodOfTime))
            if start:
                self._add_date_triple(
                    temporal_extent, SCHEMA.startDate, start)
            if end:
                self._add_date_triple(
                    temporal_extent, SCHEMA.endDate, end)
            g.add((dataset_ref, DCT.temporal, temporal_extent))

        # Themes
        # `or []` keeps a dataset without groups from failing the loop
        # should the helper hand back nothing for a missing key.
        groups = self._get_dataset_value(dataset_dict, 'groups') or []
        for group in groups:
            g.add((
                dataset_ref,
                DCAT.theme,
                URIRef(ogd_theme_base_url + group.get('name'))
            ))

        # Resources
        for resource_dict in dataset_dict.get('resources', []):

            distribution = URIRef(resource_uri(resource_dict))

            g.add((dataset_ref, DCAT.distribution, distribution))
            g.add((distribution, RDF.type, DCAT.Distribution))

            #  Simple values
            items = [
                ('status', ADMS.status, None, Literal),
                ('rights', DCT.rights, None, Literal),
                ('license', DCT.license, None, Literal),
                ('identifier', DCT.identifier, None, Literal),
                ('media_type', DCAT.mediaType, None, Literal),
                ('spatial', DCT.spatial, None, Literal),
            ]

            self._add_triples_from_dict(resource_dict, distribution, items)

            # Title and description belong to the distribution, so they
            # are read from the resource rather than from the dataset.
            self._add_multilang_value(
                distribution, DCT.title, 'display_name', resource_dict)
            self._add_multilang_value(
                distribution, DCT.description, 'description', resource_dict)

            #  Lists
            items = [
                ('documentation', FOAF.page, None, Literal),
                ('language', DCT.language, None, Literal),
                ('conforms_to', DCT.conformsTo, None, Literal),
            ]
            self._add_list_triples_from_dict(resource_dict, distribution,
                                             items)

            # URL: the download URL doubles as access URL; a differing
            # `url` is added as an additional access URL.
            url = resource_dict.get('url')
            download_url = resource_dict.get('download_url')
            if download_url:
                g.add((distribution, DCAT.downloadURL, URIRef(download_url)))
                g.add((distribution, DCAT.accessURL, URIRef(download_url)))
            if url and url != download_url:
                g.add((distribution, DCAT.accessURL, URIRef(url)))

            # Format from Download-Url
            # rpartition never raises when the URL contains no dot and,
            # unlike str(), does not choke on non-ASCII unicode URLs.
            if download_url:
                _, dot, extension = download_url.rpartition('.')
                mapped_format = map_to_valid_format(extension) if dot else None
                # skip unmappable extensions instead of emitting an
                # empty/None literal
                if mapped_format:
                    g.add((distribution, DCT['format'],
                           Literal(mapped_format)))

            # Mime-Type
            if resource_dict.get('mimetype'):
                g.add((
                    distribution,
                    DCAT.mediaType,
                    Literal(resource_dict['mimetype'])
                ))

            # Dates
            items = [
                ('issued', DCT.issued, None, Literal),
                ('modified', DCT.modified, None, Literal),
            ]

            self._add_date_triples_from_dict(resource_dict, distribution,
                                             items)

            # Numbers
            # Use the very value that was tested; reading a different key
            # here could raise KeyError or emit an unrelated number.
            byte_size = resource_dict.get('byte_size')
            if byte_size:
                try:
                    size_literal = Literal(float(byte_size),
                                           datatype=XSD.decimal)
                except (ValueError, TypeError):
                    # not numeric: keep the raw value as a plain literal
                    size_literal = Literal(byte_size)
                g.add((distribution, DCAT.byteSize, size_literal))