def evaluate(self):
    """Assess whether the object's metadata references related resources.

    Creates the ``RelatedResource`` result and output objects, removes
    relations whose ``related_resource`` equals the object's own PID URL,
    and passes test ``FsF-I3-01M-1`` if any relation remains. Test
    ``FsF-I3-01M-2`` additionally passes when at least one related
    resource is identified by a persistent identifier or a URL.
    """
    self.result = RelatedResource(id=self.metric_number,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
    self.output = RelatedResourceOutput()

    fuji = self.fuji
    self.logger.info('{0} : Total number of related resources extracted -: {1}'.format(
        self.metric_identifier, len(fuji.related_resources)))

    def _references_pid(relation):
        # True when the related resource is given as a PID or as a URL.
        helper = IdentifierHelper(relation.get('related_resource'))
        return helper.is_persistent or 'url' in helper.identifier_schemes

    if fuji.related_resources:
        # QC check: exclude potential incorrect relation (self reference).
        fuji.related_resources = [
            relation for relation in fuji.related_resources
            if relation.get('related_resource') != fuji.pid_url
        ]
        self.logger.log(
            fuji.LOG_SUCCESS,
            '{0} : Number of related resources after QC step -: {1}'.format(
                self.metric_identifier, len(fuji.related_resources)))

    if fuji.related_resources:  # TODO include source of relation
        # A list (not a generator) so that every relation is inspected,
        # exactly as a loop without ``break`` would do.
        pid_used = any([_references_pid(relation) for relation in fuji.related_resources])
        self.output = fuji.related_resources
        self.result.test_status = 'pass'
        self.setEvaluationCriteriumScore('FsF-I3-01M-1', 1, 'pass')
        self.score.earned = self.total_score
        self.maturity = 2
        if pid_used:
            self.setEvaluationCriteriumScore('FsF-I3-01M-2', 1, 'pass')
            self.maturity = 3

    self.result.metric_tests = self.metric_tests
    self.result.maturity = self.maturity
    self.result.score = self.score
    self.result.output = self.output
Esempio n. 2
0
    def check_registry_support(self):
        """Return the sources of the metadata registries that list this object.

        DataCite is queried only when the PID scheme contains ``doi``. Google
        Dataset Search and then Mendeley Data are queried as fallbacks, each
        only while no registry has reported the object as listed.

        :return: list of ``source`` values of the registries listing the object
        """
        # check if record is listed in major catalogs -> searchable
        listed_sources = []
        id_helper = IdentifierHelper(self.fuji.pid_url)

        # DataCite only for DOIs
        if self.fuji.pid_scheme and 'doi' in self.fuji.pid_scheme:
            datacite = MetaDataCatalogueDataCite(self.fuji.logger)
            datacite.query(id_helper.normalized_id)
            if datacite.islisted:
                listed_sources.append(datacite.source)

        # Fallback catalogues accept both the normalized PID and the landing URL.
        for catalogue_class in (MetaDataCatalogueGoogleDataSearch,
                                MetaDataCatalogueMendeleyData):
            if listed_sources:
                break
            catalogue = catalogue_class(self.fuji.logger)
            catalogue.query([id_helper.normalized_id, self.fuji.landing_url])
            if catalogue.islisted:
                listed_sources.append(catalogue.source)

        return listed_sources
Esempio n. 3
0
    def evaluate(self):
        """Assess whether the object's identifier is globally unique (FsF-F1-01D).

        The identifier passes with the full score and maturity 3 when
        ``IdentifierHelper`` detects at least one identifier scheme; in that
        case the detected scheme (and, for persistent identifiers, the PID
        scheme and PID URL) is stored on ``self.fuji``. A UUID or a hash
        earns half a point and maturity 1. Anything else fails.
        """
        # ======= CHECK IDENTIFIER UNIQUENESS =======
        self.result = Uniqueness(id=self.metric_number,
                                 metric_identifier=self.metric_identifier,
                                 metric_name=self.metric_name)
        self.output = UniquenessOutput()
        self.logger.info('FsF-F1-01D : Using idutils schemes')
        idhelper = IdentifierHelper(self.fuji.id)
        found_ids = idhelper.identifier_schemes  # some schemes like PMID are generic

        if found_ids:
            self.logger.log(self.fuji.LOG_SUCCESS,
                            'FsF-F1-01D : Unique identifier schemes found {}'.format(found_ids))
            self.setEvaluationCriteriumScore('FsF-F1-01D-1', self.total_score, 'pass')
            self.maturity = 3
            self.output.guid = self.fuji.id
            self.score.earned = self.total_score

            # identify main scheme
            found_id = idhelper.preferred_schema
            self.fuji.id_scheme = found_ids[0]
            if idhelper.is_persistent:
                self.fuji.pid_scheme = found_id
                self.fuji.pid_url = idhelper.identifier_url
            self.logger.info('FsF-F1-01D : Finalized unique identifier scheme - {}'.format(found_id))
            self.output.guid_scheme = found_id
            self.result.test_status = 'pass'
        else:
            # Weaker forms of uniqueness: UUID is checked first, a hash only
            # if the identifier is not a UUID.
            if self.verify_uuid(self.fuji.id):
                weak_scheme = 'uuid'
                success_message = 'FsF-F1-01D : Unique identifier (UUID) scheme found'
            elif self.verify_hash(self.fuji.id):
                weak_scheme = 'hash'
                success_message = 'FsF-F1-01D : Unique identifier (SHA,MD5) scheme found'
            else:
                weak_scheme = None
                success_message = None

            if weak_scheme is not None:
                self.logger.log(self.fuji.LOG_SUCCESS, success_message)
                self.setEvaluationCriteriumScore('FsF-F1-01D-2', 0.5, 'pass')
                self.result.test_status = 'pass'
                self.output.guid_scheme = weak_scheme
                self.output.guid = self.fuji.id
                self.maturity = 1
                self.score.earned = 0.5
            else:
                self.result.test_status = 'fail'
                self.score.earned = 0
                self.logger.warning('FsF-F1-01D : Failed to check the identifier scheme!.')

        self.result.score = self.score
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output
        self.result.maturity = self.maturity_levels.get(self.maturity)
    def evaluate(self):
        """Assess identifier persistence and resolvability (FsF-F1-02D).

        Resolves the identifier (its PID URL, or the identifier itself when
        the identifier scheme is ``url``) and stores the landing page URL,
        origin, content and content type on ``self.fuji``. Signposting links
        found in the ``Link`` response header are collected, and when no PID
        scheme is known yet a persistent ``cite-as`` link is used to derive
        one.

        Half of the score is awarded when a PID scheme is known; the full
        score when ``self.fuji.isMetadataAccessible`` is also set (this
        method sets it on an HTTP 200 response).
        """
        self.result = Persistence(id=self.metric_number,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
        self.output = PersistenceOutput()
        # ======= CHECK IDENTIFIER PERSISTENCE =======
        self.logger.info(
            'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
            .format(Mapper.VALID_PIDS.value))
        check_url = None
        signposting_pid = None
        if self.fuji.id_scheme is not None:
            check_url = self.fuji.pid_url
        if self.fuji.id_scheme == 'url':
            self.fuji.origin_url = self.fuji.id
            check_url = self.fuji.id

        if check_url:
            # ======= RETRIEVE METADATA FROM LANDING PAGE =======
            requestHelper = RequestHelper(check_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.html_xml)  # request
            _neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
                'FsF-F1-02D', ignore_html=False)
            if 'html' not in str(requestHelper.content_type):
                self.logger.info(
                    'FsF-F2-01M :Content type is ' +
                    str(requestHelper.content_type) +
                    ', therefore skipping embedded metadata (microdata, RDFa) tests'
                )
                self.fuji.extruct_result = {}
            if not isinstance(self.fuji.extruct_result, dict):
                self.fuji.extruct_result = {}
            r = requestHelper.getHTTPResponse()
            response_status = requestHelper.response_status

            if r:
                self.fuji.landing_url = requestHelper.redirect_url
                # in case the test has been repeated because a PID has been found in metadata
                if self.fuji.repeat_pid_check:
                    if self.fuji.landing_url != self.fuji.input_id:
                        self.logger.warning(
                            'FsF-F1-02D : Landing page URL resolved from PID found in metadata does not match with input URL'
                        )
                        self.logger.warning(
                            'FsF-F2-01M : Seems to be a catalogue entry or alternative representation of the data set, landing page URL resolved from PID found in metadata does not match with input URL'
                        )

                if self.fuji.landing_url not in [
                        'https://datacite.org/invalid.html'
                ]:
                    if response_status == 200:
                        # identify signposting links in header
                        header_link_string = requestHelper.getHTTPResponse(
                        ).getheader('Link')
                        if header_link_string is not None:
                            self.logger.info(
                                'FsF-F1-02D : Found signposting links in response header of landingpage'
                            )
                            self.fuji.signposting_header_links.extend(
                                self._parse_signposting_header(
                                    header_link_string))

                        # check if there is a cite-as signposting link
                        if self.fuji.pid_scheme is None:
                            signposting_pid_link = self.fuji.get_signposting_links(
                                'cite-as')
                            if signposting_pid_link:
                                signposting_pid = signposting_pid_link[0].get(
                                    'url')
                            if signposting_pid:
                                # The helper must be instantiated with the
                                # cite-as URL; the bare class carries no
                                # information about this identifier.
                                signidhelper = IdentifierHelper(
                                    signposting_pid)
                                found_id = signidhelper.preferred_schema
                                if signidhelper.is_persistent:
                                    self.logger.info(
                                        'FsF-F1-02D : Found object identifier in signposting header links'
                                    )
                                    self.fuji.pid_scheme = found_id

                        up = urlparse(self.fuji.landing_url)
                        self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                            uri=up)
                        self.fuji.landing_html = requestHelper.getResponseContent(
                        )
                        self.fuji.landing_content_type = requestHelper.content_type

                        # url is active, although the identifier is not based on a pid scheme
                        self.output.resolved_url = self.fuji.landing_url
                        self.output.resolvable_status = True
                        self.logger.info(
                            'FsF-F1-02D : Object identifier active (status code = 200)'
                        )
                        self.fuji.isMetadataAccessible = True
                    else:
                        # Every non-200 status (401/402/403 included) is
                        # reported the same way.
                        self.fuji.isMetadataAccessible = False
                        self.logger.warning(
                            "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                            .format(code=response_status))
                else:
                    self.logger.warning(
                        "FsF-F1-02D : Invalid DOI, identifier resolved to -: {code}"
                        .format(code=self.fuji.landing_url))
            else:
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "FsF-F1-02D :Resource inaccessible, no response received from -: {}"
                    .format(check_url))
                if response_status in [401, 402, 403]:
                    self.logger.warning(
                        "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                        .format(code=response_status))
        else:
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, could not identify an actionable representation for the given identfier -: {}"
                .format(self.fuji.id))

        if self.fuji.pid_scheme is not None:
            if signposting_pid is None:
                idhelper = IdentifierHelper(self.fuji.id)
                self.fuji.pid_url = idhelper.identifier_url
            else:
                # signposting_pid is the cite-as URL string itself; indexing
                # it would only keep its first character.
                self.fuji.pid_url = signposting_pid
            self.output.pid_scheme = self.fuji.pid_scheme

            self.output.pid = self.fuji.pid_url
            self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0.5, 'pass')
            self.score.earned = 0.5
            self.maturity = 1
            if self.fuji.isMetadataAccessible:
                # identifier should be based on a persistence scheme and resolvable
                self.setEvaluationCriteriumScore('FsF-F1-02D-2', 0.5, 'pass')
                self.maturity = 3
                self.result.test_status = 'pass'
                self.score.earned = self.total_score

            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistence identifier scheme -: {}'.format(
                    self.fuji.pid_scheme))
        else:
            self.score.earned = 0
            self.logger.warning(
                'FsF-F1-02D : Not a persistent identifier scheme -: {}'.format(
                    self.fuji.id_scheme))

        self.result.score = self.score
        self.result.maturity = self.maturity
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output

    @staticmethod
    def _parse_signposting_header(header_link_string):
        """Parse an HTTP ``Link`` header value into signposting link dicts.

        The header is split on ``,`` into links and each link on ``;`` into
        the target and its parameters. Parameters are stripped of
        surrounding whitespace before matching, so the common
        ``<url>; rel="cite-as"`` form is recognised.

        :param header_link_string: raw value of the ``Link`` response header
        :return: list of dicts with the keys ``url``, ``type``, ``rel`` and
            ``profile`` (taken from the ``formats`` parameter); a value is
            ``None`` when the parameter is absent
        """
        links = []
        for preparsed_link in header_link_string.split(','):
            parsed_link = preparsed_link.strip().split(';')
            found_link = parsed_link[0].strip()
            props = {'rel': None, 'type': None, 'formats': None}
            for link_prop in parsed_link[1:]:
                link_prop = str(link_prop).strip()
                for prop_name in props:
                    if link_prop.startswith(prop_name + '="'):
                        prop_match = re.search(prop_name + r'="(.*?)"',
                                               link_prop)
                        if prop_match:
                            props[prop_name] = prop_match[1]
                        break
            if found_link:
                links.append({
                    # drop the enclosing angle brackets of the link target
                    'url': found_link[1:-1],
                    'type': props['type'],
                    'rel': props['rel'],
                    'profile': props['formats'],
                })
        return links
Esempio n. 5
0
def assess_by_id(body):  # noqa: E501
    """assess_by_id

    Evaluate FAIRness of a data object based on its identifier # noqa: E501

    Runs the identifier checks (repeating them when metadata retrieval asks
    for a repeated PID check), retrieves embedded and external metadata,
    runs the remaining metric checks, attaches per-metric debug messages and
    wraps everything in a ``FAIRResults`` response.

    :param body: 
    :type body: dict | bytes

    :rtype: FAIRResults; for a non-JSON request a ``(message, 415)`` tuple
        is returned instead
    """
    if not connexion.request.is_json:
        # Without this guard the function ended in an UnboundLocalError.
        return 'Request body must be JSON', 415

    body = Body.from_dict(connexion.request.get_json())
    identifier = body.object_identifier
    logger = Preprocessor.logger
    logger.info('Assessment target: {}'.format(identifier))
    print('Assessment target: ', identifier, flush=True)
    ft = FAIRCheck(uid=identifier,
                   test_debug=body.test_debug,
                   metadata_service_url=body.metadata_service_endpoint,
                   metadata_service_type=body.metadata_service_type,
                   use_datacite=body.use_datacite,
                   oaipmh_endpoint=body.oaipmh_endpoint)

    # set target for remote logging
    remote_log_host = Preprocessor.remote_log_host
    remote_log_path = Preprocessor.remote_log_path
    if remote_log_host and remote_log_path:
        ft.set_remote_logging_target(remote_log_host, remote_log_path)

    # The identifier checks are repeated whenever a metadata retrieval step
    # leaves ``repeat_pid_check`` set.
    uid_result, pid_result = ft.check_unique_persistent()
    ft.retrieve_metadata_embedded(ft.extruct_result)
    if ft.repeat_pid_check:
        uid_result, pid_result = ft.check_unique_persistent()
    ft.retrieve_metadata_external()
    if ft.repeat_pid_check:
        uid_result, pid_result = ft.check_unique_persistent()

    # The checks run in this order; the response lists them in a different
    # order (see ``results`` below).
    core_metadata_result = ft.check_minimal_metatadata()
    content_identifier_included_result = ft.check_content_identifier_included()
    access_level_result = ft.check_data_access_level()
    license_result = ft.check_license()
    related_resources_result = ft.check_relatedresources()
    check_searchable_result = ft.check_searchable()
    data_content_result = ft.check_data_content_metadata()
    data_file_format_result = ft.check_data_file_format()
    community_standards_result = ft.check_community_metadatastandards()
    data_provenance_result = ft.check_data_provenance()
    formal_metadata_result = ft.check_formal_metadata()
    semantic_vocab_result = ft.check_semantic_vocabulary()
    # NOTE(review): this check is executed but its result is not part of the
    # response -- confirm that this is intended.
    ft.check_metadata_preservation()
    standard_protocol_data_result = ft.check_standardised_protocol_data()
    standard_protocol_metadata_result = ft.check_standardised_protocol_metadata()

    results = [
        uid_result,
        pid_result,
        core_metadata_result,
        content_identifier_included_result,
        check_searchable_result,
        access_level_result,
        formal_metadata_result,
        semantic_vocab_result,
        related_resources_result,
        data_content_result,
        license_result,
        data_provenance_result,
        community_standards_result,
        data_file_format_result,
        standard_protocol_data_result,
        standard_protocol_metadata_result,
    ]

    debug_messages = ft.get_log_messages_dict()
    ft.logger_message_stream.flush()
    summary = ft.get_assessment_summary(results)
    for result in results:
        if ft.isDebug:
            debug_list = debug_messages.get(result['metric_identifier'])
            if debug_list is None:
                debug_list = ['INFO: No debug messages received']
            result['test_debug'] = debug_list
        else:
            result['test_debug'] = ['INFO: Debugging disabled']

    # keep only the last registered log handler
    ft.logger.handlers = [ft.logger.handlers[-1]]

    # RFC 3339 timestamp as specified in openapi3; the trailing "Z" denotes
    # UTC, so the time itself must be taken in UTC rather than local time.
    timestmp = datetime.datetime.now(datetime.timezone.utc).strftime(
        '%Y-%m-%dT%H:%M:%SZ')
    metric_spec = Preprocessor.metric_specification
    metric_version = os.path.basename(Preprocessor.METRIC_YML_PATH)
    totalmetrics = len(results)
    request = body.to_dict()
    if ft.pid_url:
        idhelper = IdentifierHelper(ft.pid_url)
        request['normalized_object_identifier'] = idhelper.get_normalized_id()

    final_response = FAIRResults(request=request,
                                 timestamp=timestmp,
                                 software_version=ft.FUJI_VERSION,
                                 test_id=ft.test_id,
                                 metric_version=metric_version,
                                 metric_specification=metric_spec,
                                 total_metrics=totalmetrics,
                                 results=results,
                                 summary=summary)
    return final_response