def get_dcat_metadata(self, graph):
        """Collect DCAT dataset metadata from an RDF graph.

        The first resource typed ``dcat:Dataset`` is handed to
        ``self.get_metadata``; the result is then completed with publisher
        and creator names and with one entry per ``dcat:distribution``.

        Args:
            graph: RDF graph to query. It is accessed through triple slicing,
                ``value()`` and ``objects()`` (presumably an rdflib ``Graph``
                -- confirm against the caller).

        Returns:
            dict: The metadata found. When the graph holds no
            ``dcat:Dataset`` the dict only contains
            ``{'object_type': 'Dataset'}`` so that DCAT still shows up in the
            list of found metadata formats.
        """
        dcat_metadata = dict()
        DCAT = Namespace("http://www.w3.org/ns/dcat#")

        datasets = list(graph[:RDF.type:DCAT.Dataset])
        if len(datasets) > 0:
            # Only the first dataset of the graph is evaluated.
            dataset = datasets[0]
            dcat_metadata = self.get_metadata(graph, dataset, type='Dataset')

            # publisher: resolve a name when only a URL (or nothing) is known.
            # The None test comes first so is_url() never receives None.
            current_publisher = dcat_metadata.get('publisher')
            if current_publisher is None or idutils.is_url(current_publisher):
                publisher = graph.value(dataset, DCTERMS.publisher)
                # FOAF preferred DCAT compliant
                publisher_name = graph.value(publisher, FOAF.name)
                # in some cases a dc title is used (not exactly DCAT compliant)
                if publisher_name is None:
                    publisher_name = graph.value(publisher, DCTERMS.title)
                dcat_metadata['publisher'] = publisher_name

            # creator: same idea, but several creators may be present
            current_creator = dcat_metadata.get('creator')
            if current_creator is None or idutils.is_url(current_creator):
                creator_names = []
                for creator in graph.objects(dataset, DCTERMS.creator):
                    name = graph.value(creator, FOAF.name)
                    # creators without a foaf:name would only add None entries
                    if name is not None:
                        creator_names.append(name)
                if creator_names:
                    dcat_metadata['creator'] = creator_names

            # distribution
            dcat_metadata['object_content_identifier'] = []
            for dist in graph.objects(dataset, DCAT.distribution):
                durl = graph.value(dist, DCAT.accessURL)

                # taking only one just to check if licence is available;
                # a distribution lacking the property must not wipe out a
                # value that was already found.
                dist_license = graph.value(dist, DCTERMS.license)
                if dist_license is not None:
                    dcat_metadata['license'] = dist_license
                # TODO: check if this really works..
                dist_rights = (graph.value(dist, DCTERMS.accessRights)
                               or graph.value(dist, DCTERMS.rights))
                if dist_rights is not None:
                    dcat_metadata['access_rights'] = dist_rights

                dtype = graph.value(dist, DCAT.mediaType)
                # The DCAT property is spelled "byteSize"; the previous
                # "bytesSize" is not part of the vocabulary and never matched.
                dsize = graph.value(dist, DCAT.byteSize)
                dcat_metadata['object_content_identifier'].append({
                    'url': str(durl),
                    'type': str(dtype),
                    'size': dsize
                })
                #TODO: add provenance metadata retrieval
        else:
            self.logger.info(
                'FsF-F2-01M : Found DCAT content but could not correctly parse metadata'
            )
            #in order to keep DCAT in the found metadata list, we need to pass at least one metadata value..
            dcat_metadata['object_type'] = 'Dataset'
        return dcat_metadata
    def evaluate(self):
        """Evaluate the license metric and store the outcome on ``self``.

        Reads the ``license`` entry of ``self.fuji.metadata_merged`` (a single
        string or a list). Every entry is looked up as an SPDX license: by
        URL when it is a URL string, otherwise by name.

        Scoring:
            * any license information -> criterion ``FsF-R1.1-01M-1`` passes,
              score 1, maturity 1;
            * at least one SPDX representation found -> criterion
              ``FsF-R1.1-01M-2`` passes as well, score 2, maturity 3.

        The method returns nothing; it fills ``self.result`` (output, metric
        tests, score, maturity).
        """
        self.result = License(id=self.metric_number,
                              metric_identifier=self.metric_identifier,
                              metric_name=self.metric_name)
        licenses_list = []
        specified_licenses = self.fuji.metadata_merged.get('license')
        self.score.earned = 0
        spdx_found = False
        if specified_licenses is not None and specified_licenses != []:
            self.logger.log(
                self.fuji.LOG_SUCCESS,
                '{0} : Found licence information in metadata'.format(
                    self.metric_identifier))
            # licenses maybe string or list depending on metadata schemas
            if isinstance(specified_licenses, str):
                specified_licenses = [specified_licenses]
            for license_entry in specified_licenses:
                license_output = LicenseOutputInner()
                license_output.license = license_entry
                # Computed afresh for every entry: a non-string entry (e.g. a
                # dict) is never a URL. Previously the flag was left unbound
                # for a leading non-string entry and stale for later ones.
                isurl = (isinstance(license_entry, str)
                         and idutils.is_url(license_entry))
                if isurl:
                    spdx_html, spdx_osi = self.lookup_license_by_url(
                        license_entry, self.metric_identifier)
                else:  # maybe licence name
                    spdx_html, spdx_osi = self.lookup_license_by_name(
                        license_entry, self.metric_identifier)
                if not spdx_html:
                    self.logger.warning(
                        '{0} : NO SPDX license representation (spdx url, osi_approved) found'
                        .format(self.metric_identifier))
                else:
                    self.logger.log(
                        self.fuji.LOG_SUCCESS,
                        '{0} : Found SPDX license representation (spdx url, osi_approved)'
                        .format(self.metric_identifier))
                    spdx_found = True
                license_output.details_url = spdx_html
                license_output.osi_approved = spdx_osi
                licenses_list.append(license_output)
            # Having any license information is enough for the first criterion.
            self.result.test_status = "pass"
            self.setEvaluationCriteriumScore('FsF-R1.1-01M-1', 1, 'pass')
            self.score.earned = 1
            self.maturity = 1
            if spdx_found:
                self.setEvaluationCriteriumScore('FsF-R1.1-01M-2', 1, 'pass')
                self.score.earned = 2
                self.maturity = 3
        else:
            self.logger.warning(
                '{0} : License information unavailable in metadata'.format(
                    self.metric_identifier))

        self.result.output = licenses_list
        self.result.metric_tests = self.metric_tests
        self.result.score = self.score
        self.result.maturity = self.maturity
 def isLicense(self, value, metric_id):
     """Report whether *value* can be resolved to a license.

     *value* is looked up by URL when ``idutils.is_url`` accepts it and by
     name otherwise; *metric_id* is passed through to the lookup.

     Returns:
         bool: True when at least one of the two values returned by the
         lookup is truthy, False otherwise.
     """
     # Pick the lookup matching the kind of value, then run it once.
     if idutils.is_url(value):
         lookup = self.lookup_license_by_url
     else:
         lookup = self.lookup_license_by_name
     details_url, osi_flag = lookup(value, metric_id)
     return bool(details_url or osi_flag)
Exemple #4
0
def get_related_identifiers_url(record: Record, doi_prefix: str) -> List[Dict]:
    """Create related identifiers URL.

    Args:
        record (Record): Record API Object from where the related
        identifiers will be extracted.

        doi_prefix (str): GEO Knowledge Hub DOI Prefix.

    Returns:
        List[Dict]: List of record related identifiers (with URL resolved).
        Each item is a deep copy of the original entry with an additional
        ``url`` key, which stays an empty string when nothing could be
        resolved (for instance when the entry has no identifier).

    Note:
        The `doi_prefix` is used to check if the items are managed by the GEO Knowledge Hub.
    """
    # extracting related identifiers (``or []`` covers an explicit null value)
    related_identifiers = py_.get(record, "metadata.related_identifiers",
                                  []) or []

    new_related_identifiers = []
    for related_identifier in related_identifiers:
        related_identifier_obj = py_.set_(py_.clone_deep(related_identifier),
                                          "url", "")

        # ``.get`` keeps incomplete entries from raising KeyError; they are
        # still returned, just without a resolved URL.
        identifier = related_identifier.get("identifier")
        scheme = related_identifier.get("scheme")

        if identifier:
            try:
                if idutils.is_url(identifier):
                    related_identifier_obj["url"] = identifier
                else:
                    # checking if the doi is internal
                    if idutils.is_doi(identifier):
                        # Split on the first "/" only: a DOI suffix may
                        # itself contain slashes and must stay intact.
                        prefix, _, suffix = identifier.partition("/")

                        if doi_prefix and prefix == doi_prefix:
                            related_identifier_obj["url"] = posixpath.join(
                                "/records", suffix)

                    if not related_identifier_obj["url"]:
                        related_identifier_obj["url"] = idutils.to_url(
                            identifier, scheme, "https")
            except Exception:
                # Best effort: fall back to the raw identifier. Only regular
                # errors are handled so KeyboardInterrupt/SystemExit still
                # propagate.
                related_identifier_obj["url"] = identifier

        new_related_identifiers.append(related_identifier_obj)
    return new_related_identifiers
Exemple #5
0
    def evaluate(self):
        """Evaluate the license metric and store the outcome on ``self.result``.

        Reads the ``license`` entry of ``self.fuji.metadata_merged`` (a single
        string or a list). Every entry is looked up as an SPDX license: by
        URL when it is a URL string, otherwise by name. The test passes with
        the full ``self.total_score`` as soon as any license information is
        present; otherwise the earned score is 0.

        The method returns nothing.
        """
        self.result = License(id=self.fuji.count,
                              metric_identifier=self.metric_identifier,
                              metric_name=self.metric_name)
        licenses_list = []
        specified_licenses = self.fuji.metadata_merged.get('license')

        if specified_licenses is not None and specified_licenses != []:
            # licenses maybe string or list depending on metadata schemas
            if isinstance(specified_licenses, str):
                specified_licenses = [specified_licenses]
            for license_entry in specified_licenses:
                license_output = LicenseOutputInner()
                license_output.license = license_entry
                # Computed afresh for every entry: a non-string entry (e.g. a
                # dict) is never a URL. Previously the flag was left unbound
                # for a leading non-string entry and stale for later ones.
                isurl = (isinstance(license_entry, str)
                         and idutils.is_url(license_entry))
                if isurl:
                    spdx_html, spdx_osi = self.lookup_license_by_url(
                        license_entry, self.metric_identifier)
                else:  # maybe licence name
                    spdx_html, spdx_osi = self.lookup_license_by_name(
                        license_entry, self.metric_identifier)
                if not spdx_html:
                    self.logger.warning(
                        'FsF-R1.1-01M : NO SPDX license representation (spdx url, osi_approved) found'
                    )
                license_output.details_url = spdx_html
                license_output.osi_approved = spdx_osi
                licenses_list.append(license_output)
            self.result.test_status = "pass"
            self.score.earned = self.total_score
        else:
            self.score.earned = 0
            self.logger.warning('FsF-R1.1-01M : License unavailable')

        self.result.output = licenses_list
        self.result.score = self.score
    def get_dcat_metadata(self, graph):
        """Collect DCAT dataset metadata from an RDF graph.

        The first resource typed ``dcat:Dataset`` is handed to
        ``self.get_metadata``; the result is then completed with publisher
        and creator names and with the dataset's distributions. A
        distribution that carries neither an access nor a download URL in
        ``graph`` is dereferenced over HTTP (RDF/XML) to read its details
        from the remote description.

        Args:
            graph: RDF graph to query. It is accessed through triple slicing,
                ``value()`` and ``objects()`` (presumably an rdflib ``Graph``
                -- confirm against the caller).

        Returns:
            dict: The metadata found; empty when the graph holds no
            ``dcat:Dataset``. Distribution details are listed under
            ``'object_content_identifier'`` as dicts with the keys ``url``,
            ``type`` and ``size``.
        """
        dcat_metadata = dict()
        DCAT = Namespace("http://www.w3.org/ns/dcat#")
        # Upper bound (seconds) for dereferencing a remote distribution, so a
        # silent server cannot block the whole evaluation.
        remote_timeout = 10

        datasets = list(graph[:RDF.type:DCAT.Dataset])
        if len(datasets) > 0:
            # Only the first dataset of the graph is evaluated.
            dataset = datasets[0]
            dcat_metadata = self.get_metadata(graph, dataset, type='Dataset')

            # publisher: resolve a name when only a URL (or nothing) is known.
            # The None test comes first so is_url() never receives None.
            current_publisher = dcat_metadata.get('publisher')
            if current_publisher is None or idutils.is_url(current_publisher):
                publisher = graph.value(dataset, DCTERMS.publisher)
                # FOAF preferred DCAT compliant
                publisher_name = graph.value(publisher, FOAF.name)
                # in some cases a dc title is used (not exactly DCAT compliant)
                if publisher_name is None:
                    publisher_name = graph.value(publisher, DCTERMS.title)
                dcat_metadata['publisher'] = publisher_name

            # creator: same idea, but several creators may be present
            current_creator = dcat_metadata.get('creator')
            if current_creator is None or idutils.is_url(current_creator):
                creator_names = []
                for creator in graph.objects(dataset, DCTERMS.creator):
                    name = graph.value(creator, FOAF.name)
                    # creators without a foaf:name would only add None entries
                    if name is not None:
                        creator_names.append(name)
                if creator_names:
                    dcat_metadata['creator'] = creator_names

            # distribution
            dcat_metadata['object_content_identifier'] = []
            for dist in graph.objects(dataset, DCAT.distribution):
                dtype, durl, dsize = None, None, None
                local_url = (graph.value(dist, DCAT.accessURL)
                             or graph.value(dist, DCAT.downloadURL))
                if not local_url:
                    # No URL in this graph: try the distribution's own URI.
                    self.logger.info(
                        'FsF-F2-01M : Trying to retrieve DCAT distributions from remote location -:'
                        + str(dist))
                    try:
                        distgraph = rdflib.Graph()
                        disturl = str(dist)
                        distresponse = requests.get(
                            disturl,
                            headers={'Accept': 'application/rdf+xml'},
                            timeout=remote_timeout)
                        if distresponse.text:
                            distgraph.parse(data=distresponse.text,
                                            format="application/rdf+xml")
                            # An empty list raises IndexError below, which is
                            # treated like any other retrieval failure.
                            extdist = list(
                                distgraph[:RDF.type:DCAT.Distribution])
                            durl = (distgraph.value(extdist[0], DCAT.accessURL)
                                    or distgraph.value(extdist[0],
                                                       DCAT.downloadURL))
                            dsize = distgraph.value(extdist[0], DCAT.byteSize)
                            dtype = distgraph.value(extdist[0], DCAT.mediaType)
                            self.logger.info(
                                'FsF-F2-01M : Found DCAT distribution URL info from remote location -:'
                                + str(durl))
                    except Exception:
                        # Best effort: keep the distribution URI itself as URL.
                        self.logger.info(
                            'FsF-F2-01M : Failed to retrieve DCAT distributions from remote location -:'
                            + str(dist))
                        durl = str(dist)
                else:
                    durl = local_url
                    # taking only one just to check if licence is available;
                    # a distribution lacking the property must not wipe out
                    # a value that was already found.
                    dist_license = graph.value(dist, DCTERMS.license)
                    if dist_license is not None:
                        dcat_metadata['license'] = dist_license
                    # TODO: check if this really works..
                    dist_rights = (graph.value(dist, DCTERMS.accessRights)
                                   or graph.value(dist, DCTERMS.rights))
                    if dist_rights is not None:
                        dcat_metadata['access_rights'] = dist_rights
                    dtype = graph.value(dist, DCAT.mediaType)
                    # "byteSize" is the DCAT property name (the remote branch
                    # above already used it); "bytesSize" never matched.
                    dsize = graph.value(dist, DCAT.byteSize)
                if durl or dtype or dsize:
                    # Keep the last two path segments of the media type, e.g.
                    # ".../media-types/text/csv" -> "text/csv". A missing
                    # media type stays None instead of the string 'None'.
                    if dtype is not None and idutils.is_url(str(durl)):
                        dtype = '/'.join(str(dtype).split('/')[-2:])
                    dcat_metadata['object_content_identifier'].append({
                        'url': str(durl),
                        'type': dtype,
                        'size': str(dsize)
                    })

            if dcat_metadata['object_content_identifier']:
                self.logger.info(
                    'FsF-F3-01M : Found data links in DCAT.org metadata -: ' +
                    str(dcat_metadata['object_content_identifier']))
                #TODO: add provenance metadata retrieval
        else:
            self.logger.info(
                'FsF-F2-01M : Found DCAT content but could not correctly parse metadata'
            )
            # NOTE: an 'object_type' = 'Dataset' placeholder used to be set
            # here to keep DCAT in the found metadata list; it is disabled,
            # so an empty dict is returned in this case.
        return dcat_metadata