    def getMetadataStandards(self):
        filter = ['datacite.org', 'openarchives.org',
                  'purl.org/dc/']  # TODO expand filters
        # http://ws.pangaea.de/oai/provider?verb=ListMetadataFormats
        oai_endpoint = self.endpoint.split('?')[0]
        #oai_endpoint = oai_endpoint.rstrip('/')
        oai_listmetadata_url = oai_endpoint + '?verb=ListMetadataFormats'
        requestHelper = RequestHelper(url=oai_listmetadata_url,
                                      logInst=self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        response_type, xml = requestHelper.content_negotiate(self.metric_id)
        root = etree.fromstring(xml.content)
        metadata_nodes = root.xpath(
            '//oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat',
            namespaces=OAIMetadataProvider.oai_namespaces)
        schemas = {}
        for node in metadata_nodes:
            ele = etree.XPathEvaluator(
                node, namespaces=OAIMetadataProvider.oai_namespaces).evaluate
            metadata_prefix = ele('string(oai:metadataPrefix/text())'
                                  )  # <metadataPrefix>oai_dc</metadataPrefix>
            metadata_schema = ele(
                'string(oai:schema/text())'
            )  # <schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
            metadata_schema = metadata_schema.strip()
            self.namespaces.append(metadata_schema)
            # TODO there can be more than one OAI-PMH endpoint, https://www.re3data.org/repository/r3d100011221
            if not any(s in metadata_schema for s in filter):
                schemas[metadata_prefix] = [metadata_schema]
            else:
                self.logger.info(
                    '{0} : Skipping domain-agnostic standard listed in OAI-PMH - {1}'
                    .format(self.metric_id, metadata_prefix))
        return schemas
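For reference, a minimal standalone sketch of the same ListMetadataFormats parsing with lxml, assuming the standard OAI-PMH 2.0 namespace; the sample_xml document is illustrative, not a live response:

from lxml import etree

OAI_NS = {'oai': 'http://www.openarchives.org/OAI/2.0/'}
sample_xml = b"""<OAI-PMH xmlns="http://www.openarchives.org/OAI/2.0/">
  <ListMetadataFormats>
    <metadataFormat>
      <metadataPrefix>oai_dc</metadataPrefix>
      <schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
    </metadataFormat>
  </ListMetadataFormats>
</OAI-PMH>"""

root = etree.fromstring(sample_xml)
schemas = {}
for node in root.xpath('//oai:ListMetadataFormats/oai:metadataFormat', namespaces=OAI_NS):
    # read the prefix/schema pair advertised for each metadata format
    prefix = node.findtext('oai:metadataPrefix', namespaces=OAI_NS)
    schema = node.findtext('oai:schema', namespaces=OAI_NS)
    schemas[prefix] = [schema.strip()]
print(schemas)  # {'oai_dc': ['http://www.openarchives.org/OAI/2.0/oai_dc.xsd']}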
    def lookup_re3data(self):
        if self.client_id and self.pid_scheme:
            re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(
                self.client_id)  # {client_id,re3doi}
            # pid -> clientId -> repo doi -> re3id, and query repository metadata from re3api
            if re3doi:
                short_re3doi = idutils.normalize_pid(
                    re3doi, scheme='doi')  # https://doi.org/10.17616/R3XS37
                self.logger.info('Found match re3data (DOI-based) record')
                query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi  # https://re3data.org/api/beta/repositories?query=
                q = RequestHelper(url=query_url)
                q.setAcceptType(AcceptTypes.xml)
                re_source, xml = q.content_negotiate(metric_id='RE3DATA')
                root = etree.fromstring(xml.content)
                # <link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
                re3link = root.xpath('//link')[0].attrib['href']
                if re3link is not None:
                    self.logger.info('Found match re3data metadata record')
                    # query repository metadata
                    q2 = RequestHelper(url=re3link)
                    q2.setAcceptType(AcceptTypes.xml)
                    re3_source, re3_response = q2.content_negotiate(
                        metric_id='RE3DATA')
                    self.re3metadata_raw = re3_response.content
                    self.parseRepositoryMetadata()
            else:
                self.logger.warning(
                    'No DOI of client id is available from datacite api')
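A minimal sketch of the DOI normalisation step above, assuming the idutils helpers behave as they are used in lookup_re3data; the DOI value is illustrative and the hard-coded base URL stands in for Preprocessor.RE3DATA_API:

import idutils

re3doi = 'https://doi.org/10.17616/R3XS37'  # illustrative re3data DOI
if re3doi and idutils.is_doi(re3doi):
    # strip the resolver prefix so the bare DOI can be used as a query value
    short_re3doi = idutils.normalize_pid(re3doi, scheme='doi')
    query_url = 'https://www.re3data.org/api/beta/repositories?query=' + short_re3doi
    print(query_url)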
Example #3
    def getMetadataStandards(self):
        csw_endpoint = self.endpoint.split('?')[0]
        csw_listmetadata_url = csw_endpoint + '?service=CSW&request=GetCapabilities'
        requestHelper = RequestHelper(url=csw_listmetadata_url,
                                      logInst=self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        response_type, xml = requestHelper.content_negotiate(self.metric_id)
        schemas = {}
        if xml:
            try:
                root = etree.fromstring(requestHelper.response_content)
                metadata_nodes = root.xpath(
                    '//ows:Parameter[@name="outputSchema"]/ows:Value',
                    namespaces=OGCCSWMetadataProvider.csw_namespaces)
                for node in metadata_nodes:
                    if node.text:
                        if node.text not in self.namespaces:
                            self.namespaces.append(str(node.text))
                            schemas[str(node.text)] = str(node.text)
            except Exception:
                self.logger.info(
                    '{0} : Could not parse XML response retrieved from OGC CSW endpoint'
                    .format(self.metric_id))

        return schemas
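A minimal sketch of the outputSchema harvesting above, assuming a CSW 2.0.2 capabilities document using the OWS 1.0 namespace; sample_xml is illustrative:

from lxml import etree

CSW_NS = {'ows': 'http://www.opengis.net/ows'}
sample_xml = b"""<Capabilities xmlns:ows="http://www.opengis.net/ows">
  <ows:OperationsMetadata>
    <ows:Parameter name="outputSchema">
      <ows:Value>http://www.isotc211.org/2005/gmd</ows:Value>
      <ows:Value>http://www.opengis.net/cat/csw/2.0.2</ows:Value>
    </ows:Parameter>
  </ows:OperationsMetadata>
</Capabilities>"""

root = etree.fromstring(sample_xml)
schemas = {}
for node in root.xpath('//ows:Parameter[@name="outputSchema"]/ows:Value', namespaces=CSW_NS):
    if node.text:
        schemas[str(node.text)] = str(node.text)
print(schemas)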
Example #4
    def parse_metadata(self):
        source_name = None
        dcite_metadata = {}
        self.logger.info('FsF-F2-01M : Trying to retrieve datacite metadata')
        requestHelper = RequestHelper(self.pid_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.datacite_json)
        neg_source, ext_meta = requestHelper.content_negotiate('FsF-F2-01M')
        if ext_meta:
            try:
                dcite_metadata = jmespath.search(self.metadata_mapping.value,
                                                 ext_meta)
                if dcite_metadata:
                    self.namespaces.append('http://datacite.org/schema/')
                    source_name = self.getEnumSourceNames().DATACITE_JSON.value
                    if dcite_metadata['creator'] is None:
                        first = dcite_metadata['creator_first']
                        last = dcite_metadata['creator_last']
                        # default type of creator is []
                        if isinstance(first, list) and isinstance(last, list):
                            if len(first) == len(last):
                                names = [
                                    i + " " + j for i, j in zip(first, last)
                                ]
                                dcite_metadata['creator'] = names

                    if dcite_metadata.get('related_resources'):
                        self.logger.info(
                            'FsF-I3-01M : {0} related resource(s) extracted from -: {1}'
                            .format(len(dcite_metadata['related_resources']),
                                    source_name))
                        temp_rels = []

                        for r in dcite_metadata['related_resources']:
                            if r.get('scheme_uri'):
                                self.namespaces.append(r.get('scheme_uri'))
                            filtered = {
                                k: v
                                for k, v in r.items() if v is not None
                            }
                            temp_rels.append(filtered)
                        dcite_metadata['related_resources'] = temp_rels
                    else:
                        self.logger.info(
                            'FsF-I3-01M : No related resource(s) found in Datacite metadata'
                        )

                    # convert all values (list type) into string except 'creator','license','related_resources'
                    for key, value in dcite_metadata.items():
                        if key not in self.exclude_conversion and isinstance(
                                value, list):
                            flat = ', '.join(map(str, value))
                            dcite_metadata[key] = flat
            except Exception as e:
                self.logger.exception(
                    'Failed to extract Datacite Json -: {}'.format(e))
        return source_name, dcite_metadata
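A minimal sketch of the creator fallback above: when the jmespath mapping yields no combined creator names, the givenName/familyName lists are zipped into full names. The expression and the sample record are illustrative, not the class's actual metadata_mapping:

import jmespath

sample = {'data': {'attributes': {'creators': [
    {'givenName': 'Ada', 'familyName': 'Lovelace'},
    {'givenName': 'Alan', 'familyName': 'Turing'}]}}}
# illustrative multiselect-hash mapping, similar in spirit to the DataCite mapping
mapping = ('{creator: data.attributes.creators[*].name, '
           'creator_first: data.attributes.creators[*].givenName, '
           'creator_last: data.attributes.creators[*].familyName}')
meta = jmespath.search(mapping, sample)
if not meta.get('creator'):
    first, last = meta.get('creator_first'), meta.get('creator_last')
    if isinstance(first, list) and isinstance(last, list) and len(first) == len(last):
        meta['creator'] = [i + ' ' + j for i, j in zip(first, last)]
print(meta['creator'])  # ['Ada Lovelace', 'Alan Turing']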
    def parse_metadata(self):
        #self.source_name = self.getEnumSourceNames().LINKED_DATA.value
        self.logger.info('FsF-F2-01M : Extract metadata from {}'.format(self.source_name))
        rdf_metadata=dict()
        requestHelper: RequestHelper = RequestHelper(self.target_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.rdf)
        neg_source,rdf_response = requestHelper.content_negotiate('FsF-F2-01M')
        #required for metric knowledge representation
        self.content_type = requestHelper.getHTTPResponse().headers.get('content-type')
        if self.content_type:
            self.content_type = self.content_type.split(";", 1)[0]

        ontology_indicator=[rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#'),rdflib.term.URIRef('http://www.w3.org/2002/07/owl#')]
        if isinstance(rdf_response,rdflib.graph.Graph):
            self.logger.info('FsF-F2-01M : Found RDF Graph')
            # TODO: set credit score for being valid RDF
            # TODO: since its valid RDF aka semantic representation, make sure FsF-I1-01M is passed and scored

            if rdflib.term.URIRef('http://www.w3.org/ns/dcat#') in dict(list(rdf_response.namespaces())).values():
                self.logger.info('FsF-F2-01M : RDF Graph seems to contain DCAT metadata elements')
                rdf_metadata = self.get_dcat_metadata(rdf_response)
            elif bool(set(ontology_indicator) & set(dict(list(rdf_response.namespaces())).values())):
                rdf_metadata = self.get_ontology_metadata(rdf_response)
            #add found namespaces URIs to namespace
            for ns in rdf_response.namespaces():
                self.namespaces.append(str(ns[1]))
        else:
            self.logger.info('FsF-F2-01M : Expected RDF Graph but received - {0}'.format(self.content_type))
        return self.source_name, rdf_metadata
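A minimal sketch of the DCAT namespace check above: parse a small Turtle graph with rdflib and test whether the DCAT namespace appears among the graph's bound prefixes (the data is illustrative):

import rdflib

ttl = '''@prefix dcat: <http://www.w3.org/ns/dcat#> .
@prefix dct:  <http://purl.org/dc/terms/> .
<http://example.org/dataset/1> a dcat:Dataset ;
    dct:title "Example dataset" .
'''
g = rdflib.Graph()
g.parse(data=ttl, format='turtle')
# prefixes declared in the data are bound on the graph after parsing
bound_namespaces = dict(g.namespaces()).values()
print(rdflib.term.URIRef('http://www.w3.org/ns/dcat#') in bound_namespaces)  # True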
Example #6
    def parse_metadata(self):
        XSI = "http://www.w3.org/2001/XMLSchema-instance"
        if self.link_type == 'embedded':
            source_name = self.getEnumSourceNames().LINKED_DATA.value
        elif self.link_type == 'guessed':
            source_name = self.getEnumSourceNames().GUESSED_XML.value
        elif self.link_type == 'negotiated':
            source_name = self.getEnumSourceNames().XML_NEGOTIATED.value
        dc_core_metadata = None
        requestHelper: RequestHelper = RequestHelper(self.target_url,
                                                     self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        neg_source, xml_response = requestHelper.content_negotiate(
            'FsF-F2-01M')
        if requestHelper.response_status == 200:
            self.logger.info(
                'FsF-F2-01M : Extract metadata from {}'.format(source_name))
            #dom = lxml.html.fromstring(self.landing_html.encode('utf8'))
            if neg_source != 'xml':
                self.logger.info(
                    'FsF-F2-01M : Expected XML but content negotiation responded: '
                    + str(neg_source))
            else:
                tree = lxml.etree.XML(xml_response)
                schema_locations = set(
                    tree.xpath("//*/@xsi:schemaLocation",
                               namespaces={'xsi': XSI}))
                #print(schema_locations)
                for schema_location in schema_locations:
                    self.namespaces.extend(re.split(r'\s+', schema_location))
                #TODO: implement some XSLT to handle the XML..

        return source_name, dc_core_metadata
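A minimal sketch of the xsi:schemaLocation harvesting above; sample_xml is illustrative. Each schemaLocation value is a whitespace-separated list of namespace/XSD pairs:

import re
import lxml.etree

XSI = 'http://www.w3.org/2001/XMLSchema-instance'
sample_xml = b'''<resource xmlns="http://datacite.org/schema/kernel-4"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://datacite.org/schema/kernel-4 http://schema.datacite.org/meta/kernel-4/metadata.xsd"/>'''

tree = lxml.etree.XML(sample_xml)
namespaces = []
for schema_location in set(tree.xpath('//*/@xsi:schemaLocation', namespaces={'xsi': XSI})):
    namespaces.extend(re.split(r'\s+', schema_location))
print(namespaces)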
    def lookup_re3data(self):
        if self.client_id and self.pid_scheme:

            re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(
                self.client_id)  # {client_id,re3doi}
            #print(self.client_id,'Re3DOI',re3doi, idutils.is_doi(re3doi))
            if re3doi:
                if idutils.is_doi(re3doi):
                    short_re3doi = idutils.normalize_pid(
                        re3doi, scheme='doi')  #https://doi.org/10.17616/R3XS37
                else:
                    re3doi = None

            # pid -> clientId -> repo doi-> re3id, and query repository metadata from re3api
            if re3doi:
                self.logger.info(
                    'FsF-R1.3-01M : Found match re3data (DOI-based) record')
                query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi  # https://re3data.org/api/beta/repositories?query=
                q = RequestHelper(url=query_url)
                q.setAcceptType(AcceptTypes.xml)
                re_source, xml = q.content_negotiate(metric_id='RE3DATA')
                try:
                    if isinstance(xml, bytes):
                        xml = xml.decode().encode()
                    root = etree.fromstring(xml)

                    #<link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
                    re3link = root.xpath('//link')[0].attrib['href']
                    if re3link is not None:
                        self.logger.info(
                            'FsF-R1.3-01M : Found match re3data metadata record -: '
                            + str(re3link))
                        # query repository metadata
                        q2 = RequestHelper(url=re3link)
                        q2.setAcceptType(AcceptTypes.xml)
                        re3_source, re3_response = q2.content_negotiate(
                            metric_id='RE3DATA')
                        self.re3metadata_raw = re3_response
                        self.parseRepositoryMetadata()
                except Exception as e:
                    self.logger.warning(
                        'FsF-R1.3-01M : Malformed re3data (DOI-based) record received: '
                        + str(e))
            else:
                self.logger.warning(
                    'FsF-R1.3-01M : No DOI of client id is available from datacite api'
                )
    def parse_metadata(self):
        #self.source_name = self.getEnumSourceNames().LINKED_DATA.value
        #self.logger.info('FsF-F2-01M : Trying to request RDF metadata from -: {}'.format(self.source_name))
        rdf_metadata=dict()
        if self.rdf_graph is None:
            #print(self.target_url)
            requestHelper: RequestHelper = RequestHelper(self.target_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.rdf)
            neg_source,rdf_response = requestHelper.content_negotiate('FsF-F2-01M')
            #required for metric knowledge representation

            if requestHelper.getHTTPResponse() is not None:
                self.content_type = requestHelper.getHTTPResponse().headers.get('content-type')
                if self.content_type is not None:
                    self.content_type = self.content_type.split(";", 1)[0]
                    #handle JSON-LD
                    DCAT = Namespace("http://www.w3.org/ns/dcat#")
                    if self.content_type == 'application/ld+json':
                        try:
                            jsonldgraph= rdflib.ConjunctiveGraph()
                            rdf_response = jsonldgraph.parse(data=json.dumps(rdf_response), format='json-ld')
                            rdf_response = jsonldgraph
                        except Exception as e:
                            self.logger.info('FsF-F2-01M : Parsing error, failed to extract JSON-LD -: {}'.format(e))
        else:
            neg_source, rdf_response = 'html', self.rdf_graph

        ontology_indicator=[rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#'),rdflib.term.URIRef('http://www.w3.org/2002/07/owl#')]
        if isinstance(rdf_response,rdflib.graph.Graph):
            self.logger.info('FsF-F2-01M : Found RDF Graph')
            graph_text = rdf_response.serialize(format="ttl")
            self.getNamespacesfromIRIs(graph_text)
            # TODO: set credit score for being valid RDF
            # TODO: since its valid RDF aka semantic representation, make sure FsF-I1-01M is passed and scored
            if rdflib.term.URIRef('http://www.w3.org/ns/dcat#') in dict(list(rdf_response.namespaces())).values():
                self.logger.info('FsF-F2-01M : RDF Graph seems to contain DCAT metadata elements')
                rdf_metadata = self.get_dcat_metadata(rdf_response)
            elif bool(set(ontology_indicator) & set(dict(list(rdf_response.namespaces())).values())):
                rdf_metadata = self.get_ontology_metadata(rdf_response)
            else:
                rdf_metadata = self.get_default_metadata(rdf_response)
            #add found namespaces URIs to namespace
            for ns in rdf_response.namespaces():
                self.namespaces.append(str(ns[1]))
        else:
            self.logger.info('FsF-F2-01M : Expected RDF Graph but received -: {0}'.format(self.content_type))
        return self.source_name, rdf_metadata
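A minimal sketch of the JSON-LD branch above, assuming an rdflib version with built-in JSON-LD support (rdflib >= 6; older versions need the rdflib-jsonld plugin). The record is illustrative and uses an inline context so no network access is needed:

import json
import rdflib

record = {'@context': {'dcat': 'http://www.w3.org/ns/dcat#',
                       'dct': 'http://purl.org/dc/terms/'},
          '@id': 'http://example.org/dataset/1',
          '@type': 'dcat:Dataset',
          'dct:title': 'Example dataset'}
jsonldgraph = rdflib.ConjunctiveGraph()
jsonldgraph.parse(data=json.dumps(record), format='json-ld')
print(len(jsonldgraph))  # number of triples parsed from the JSON-LD record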
Example #9
    def parse_metadata(self, ls=None):
        jsnld_metadata = {}
        ext_meta=None
        if self.source_metadata:
            self.source_name = self.getEnumSourceNames().SCHEMAORG_EMBED.value
            ext_meta = self.source_metadata[0]
        elif self.pid_url:
            self.source_name = self.getEnumSourceNames().SCHEMAORG_NEGOTIATE.value
            # TODO (IMPORTANT) PID agency may support Schema.org in JSON-LD
            # TODO (IMPORTANT) validate schema.org
            # fallback, request (doi) metadata specified in schema.org JSON-LD
            requestHelper: RequestHelper = RequestHelper(self.pid_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.schemaorg)
            neg_source,ext_meta = requestHelper.content_negotiate('FsF-F2-01M')
        if ext_meta is not None:
            self.getNamespacesfromIRIs(ext_meta)
            self.logger.info('FsF-F2-01M : Trying to extract schema.org JSON-LD metadata from -: {}'.format(self.source_name))
            # TODO check syntax - not ending with /, type and @type
            # TODO (important) extend mapping to detect other pids (link to related entities)?
            check_context_type =  ["Dataset", "Collection"]
            try:
                #if ext_meta['@context'] in check_context_type['@context'] and ext_meta['@type'] in check_context_type["@type"]:

                if str(ext_meta['@context']).find('://schema.org') > -1:
                    if str(ext_meta['@type']).lower() not in self.SCHEMA_ORG_CONTEXT:
                        self.logger.info('FsF-F2-01M : Found JSON-LD but seems not to be a schema.org object based on the given context type')
                    elif ext_meta['@type'] not in check_context_type:
                        self.logger.info('FsF-F2-01M : Found schema.org JSON-LD but seems not to be a research data object')
                    else:
                        self.logger.info('FsF-F2-01M : Found schema.org JSON-LD which seems to be valid, based on the given context type')

                        self.namespaces.append('http://schema.org/')
                    jsnld_metadata = jmespath.search(self.metadata_mapping.value, ext_meta)
                    # TODO all properties with null values extracted through jmespath should be excluded
                    if jsnld_metadata.get('creator') is None:
                        #TODO: handle None values for first and last name
                        first = jsnld_metadata.get('creator_first')
                        last = jsnld_metadata.get('creator_last')
                        if isinstance(first, list) and isinstance(last, list):
                            if len(first) == len(last):
                                names = [str(i) + " " + str(j) for i, j in zip(first, last)]
                                jsnld_metadata['creator'] = names
                        else:
                            jsnld_metadata['creator'] = [str(first) + " " + str(last)]

                    # TODO instead of a custom check there should be a validator to evaluate the whole schema.org metadata
                    invalid_license = False
                    if jsnld_metadata.get('license'):
                        self.logger.info('FsF-R1.1-01M : License metadata found (schema.org) -: {}'.format(
                            jsnld_metadata.get('license')))

                        if isinstance(jsnld_metadata.get('license'), list):
                            jsnld_metadata['license'] = jsnld_metadata['license'][0]
                        if isinstance(jsnld_metadata.get('license'), dict):
                            ls_type = jsnld_metadata.get('license').get('@type')
                            if ls_type =='CreativeWork':
                                ls = jsnld_metadata.get('license').get('url')
                                if not ls:
                                    ls = jsnld_metadata.get('license').get('name')
                                if ls:
                                    jsnld_metadata['license'] = ls
                                else:
                                    invalid_license = True
                            else:
                                invalid_license = True
                    if invalid_license:
                        self.logger.warning('FsF-R1.1-01M : Looks like schema.org representation of license is incorrect, skipping the test.')
                        jsnld_metadata['license'] = None

                    # filter out None values of related_resources
                    if jsnld_metadata.get('related_resources'):
                        relateds = [d for d in jsnld_metadata['related_resources'] if d['related_resource'] is not None]
                        if relateds:
                            jsnld_metadata['related_resources'] = relateds
                            self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from -: {1}'.format(len(jsnld_metadata['related_resources']), self.source_name))
                        else:
                            del jsnld_metadata['related_resources']
                            self.logger.info('FsF-I3-01M : No related resource(s) found in Schema.org metadata')



                    # TODO quick-fix, expand mapping expression instead
                    if jsnld_metadata.get('object_size'):
                        jsnld_metadata['object_size'] = str(jsnld_metadata['object_size'].get('value')) + ' '+ jsnld_metadata['object_size'].get('unitText')

                else:
                    self.logger.info('FsF-F2-01M : Found JSON-LD but the @context does not refer to schema.org')

            except Exception as err:
                #print(err.with_traceback())
                self.logger.info('FsF-F2-01M : Failed to parse JSON-LD schema.org -: {}'.format(err))
        else:
            self.logger.info('FsF-F2-01M : Could not identify JSON-LD schema.org metadata')

        return self.source_name, jsnld_metadata
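A minimal sketch of the license normalisation above: a schema.org license given as a CreativeWork object is reduced to its url (or name), otherwise marked invalid; the record is illustrative:

license_value = {'@type': 'CreativeWork',
                 'name': 'Creative Commons Attribution 4.0',
                 'url': 'https://creativecommons.org/licenses/by/4.0/'}
invalid_license = False
if isinstance(license_value, list):
    license_value = license_value[0]
if isinstance(license_value, dict):
    if license_value.get('@type') == 'CreativeWork':
        # prefer the license URL, fall back to its name
        license_value = license_value.get('url') or license_value.get('name')
        if not license_value:
            invalid_license = True
    else:
        invalid_license = True
print(None if invalid_license else license_value)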
    def evaluate(self):
        self.result = Persistence(id=self.metric_number,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
        self.output = PersistenceOutput()
        # ======= CHECK IDENTIFIER PERSISTENCE =======
        self.logger.info(
            'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
            .format(Mapper.VALID_PIDS.value))
        check_url = None
        signposting_pid = None
        if self.fuji.id_scheme is not None:
            check_url = self.fuji.pid_url
            #check_url = idutils.to_url(self.fuji.id, scheme=self.fuji.id_scheme)
        if self.fuji.id_scheme == 'url':
            self.fuji.origin_url = self.fuji.id
            check_url = self.fuji.id
        if check_url:
            # ======= RETRIEVE METADATA FROM LANDING PAGE =======
            requestHelper = RequestHelper(check_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.html_xml)  # request
            neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
                'FsF-F1-02D', ignore_html=False)
            if 'html' not in str(requestHelper.content_type):
                self.logger.info(
                    'FsF-F2-01M : Content type is ' +
                    str(requestHelper.content_type) +
                    ', therefore skipping embedded metadata (microdata, RDFa) tests'
                )
                self.fuji.extruct_result = {}
            if type(self.fuji.extruct_result) != dict:
                self.fuji.extruct_result = {}
            r = requestHelper.getHTTPResponse()
            response_status = requestHelper.response_status

            if r:
                self.fuji.landing_url = requestHelper.redirect_url
                #in case the test has been repeated because a PID has been found in metadata
                #print(self.fuji.landing_url, self.fuji.input_id)
                if self.fuji.repeat_pid_check == True:
                    if self.fuji.landing_url != self.fuji.input_id:
                        self.logger.warning(
                            'FsF-F1-02D : Landing page URL resolved from PID found in metadata does not match with input URL'
                        )
                        self.logger.warning(
                            'FsF-F2-01M : Seems to be a catalogue entry or alternative representation of the data set, landing page URL resolved from PID found in metadata does not match with input URL'
                        )

                        #self.fuji.repeat_pid_check = False
                if self.fuji.landing_url not in [
                        'https://datacite.org/invalid.html'
                ]:

                    if response_status == 200:
                        # identify signposting links in header
                        header_link_string = requestHelper.getHTTPResponse(
                        ).getheader('Link')
                        if header_link_string is not None:
                            self.logger.info(
                                'FsF-F1-02D : Found signposting links in response header of landingpage'
                            )

                            for preparsed_link in header_link_string.split(
                                    ','):
                                found_link = None
                                found_type, type_match = None, None
                                found_rel, rel_match = None, None
                                found_formats, formats_match = None, None
                                parsed_link = preparsed_link.strip().split(';')
                                found_link = parsed_link[0].strip()
                                for link_prop in parsed_link[1:]:
                                    if str(link_prop).startswith('rel="'):
                                        rel_match = re.search(
                                            'rel=\"(.*?)\"', link_prop)
                                    elif str(link_prop).startswith('type="'):
                                        type_match = re.search(
                                            'type=\"(.*?)\"', link_prop)
                                    elif str(link_prop).startswith(
                                            'formats="'):
                                        formats_match = re.search(
                                            'formats=\"(.*?)\"', link_prop)
                                if type_match:
                                    found_type = type_match[1]
                                if rel_match:
                                    found_rel = rel_match[1]
                                if formats_match:
                                    found_formats = formats_match[1]
                                signposting_link_dict = {
                                    'url': found_link[1:-1],
                                    'type': found_type,
                                    'rel': found_rel,
                                    'profile': found_formats
                                }
                                if found_link:
                                    self.fuji.signposting_header_links.append(
                                        signposting_link_dict)

                        #check if there is a cite-as signposting link
                        if self.fuji.pid_scheme is None:
                            signposting_pid_link = self.fuji.get_signposting_links(
                                'cite-as')
                            if signposting_pid_link:
                                signposting_pid = signposting_pid_link[0].get(
                                    'url')
                            if signposting_pid:
                                signidhelper = IdentifierHelper(signposting_pid)
                                #found_ids = idutils.detect_identifier_schemes(signposting_pid[0])
                                found_id = signidhelper.preferred_schema
                                #if len(found_ids) > 1:
                                #    found_ids.remove('url')
                                #    found_id = found_ids[0]
                                if signidhelper.is_persistent:
                                    self.logger.info(
                                        'FsF-F1-02D : Found object identifier in signposting header links'
                                    )
                                    self.fuji.pid_scheme = found_id

                        up = urlparse(self.fuji.landing_url)
                        self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                            uri=up)
                        self.fuji.landing_html = requestHelper.getResponseContent(
                        )
                        self.fuji.landing_content_type = requestHelper.content_type

                        self.output.resolved_url = self.fuji.landing_url  # url is active, although the identifier is not based on a pid scheme
                        self.output.resolvable_status = True
                        self.logger.info(
                            'FsF-F1-02D : Object identifier active (status code = 200)'
                        )
                        self.fuji.isMetadataAccessible = True
                    elif response_status in [401, 402, 403]:
                        self.fuji.isMetadataAccessible = False
                        self.logger.warning(
                            "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                            .format(code=response_status))
                    else:
                        self.fuji.isMetadataAccessible = False
                        self.logger.warning(
                            "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                            .format(code=response_status))
                else:
                    self.logger.warning(
                        "FsF-F1-02D : Invalid DOI, identifier resolved to -: {code}"
                        .format(code=self.fuji.landing_url))

            else:
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "FsF-F1-02D :Resource inaccessible, no response received from -: {}"
                    .format(check_url))
                if response_status in [401, 402, 403]:
                    self.logger.warning(
                        "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                        .format(code=response_status))
        else:
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, could not identify an actionable representation for the given identfier -: {}"
                .format(self.fuji.id))

        if self.fuji.pid_scheme is not None:
            # short_pid = id.normalize_pid(self.id, scheme=pid_scheme)
            if signposting_pid is None:
                idhelper = IdentifierHelper(self.fuji.id)
                self.fuji.pid_url = idhelper.identifier_url
                #self.fuji.pid_url = idutils.to_url(self.fuji.id, scheme=self.fuji.pid_scheme)
            else:
                self.fuji.pid_url = signposting_pid
            self.output.pid_scheme = self.fuji.pid_scheme

            self.output.pid = self.fuji.pid_url
            self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0.5, 'pass')
            self.score.earned = 0.5
            self.maturity = 1
            if self.fuji.isMetadataAccessible:
                self.setEvaluationCriteriumScore('FsF-F1-02D-2', 0.5, 'pass')
                self.maturity = 3
                self.result.test_status = 'pass'
                self.score.earned = self.total_score  # identifier should be based on a persistent scheme and resolvable

            #print(self.metric_tests)

            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistent identifier scheme -: {}'.format(
                    self.fuji.pid_scheme))
            #self.logger.info('FsF-F1-02D : Persistence identifier scheme - {}'.format(self.fuji.pid_scheme))
        else:
            self.score.earned = 0
            self.logger.warning(
                'FsF-F1-02D : Not a persistent identifier scheme -: {}'.format(
                    self.fuji.id_scheme))

        self.result.score = self.score
        self.result.maturity = self.maturity
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output
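A minimal sketch of the signposting header parsing above: split an HTTP Link header into url/rel/type dicts. The header value is illustrative:

import re

header_link_string = ('<https://example.org/pid/123> ; rel="cite-as", '
                      '<https://example.org/metadata.xml> ; rel="describedby" ; type="application/xml"')
signposting_header_links = []
for preparsed_link in header_link_string.split(','):
    parsed_link = preparsed_link.strip().split(';')
    found_link = parsed_link[0].strip()
    found_rel, found_type = None, None
    for link_prop in parsed_link[1:]:
        link_prop = link_prop.strip()
        if link_prop.startswith('rel="'):
            found_rel = re.search(r'rel="(.*?)"', link_prop)[1]
        elif link_prop.startswith('type="'):
            found_type = re.search(r'type="(.*?)"', link_prop)[1]
    # strip the angle brackets around the target URL
    signposting_header_links.append({'url': found_link[1:-1], 'rel': found_rel, 'type': found_type})
print(signposting_header_links)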
Example #11
    def evaluate(self):

        self.result = Persistence(id=self.fuji.count,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
        self.output = PersistenceOutput()
        # ======= CHECK IDENTIFIER PERSISTENCE =======
        self.logger.info(
            'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
            .format(Mapper.VALID_PIDS.value))

        check_url = None
        if self.fuji.pid_scheme is not None:
            check_url = idutils.to_url(self.fuji.id,
                                       scheme=self.fuji.pid_scheme)
        elif self.fuji.id_scheme == 'url':
            check_url = self.fuji.id

        # ======= RETRIEVE METADATA FROM LANDING PAGE =======
        requestHelper = RequestHelper(check_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.html)  # request
        neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
            'FsF-F1-02D')
        r = requestHelper.getHTTPResponse()
        signposting_pid = None
        if r:
            self.fuji.landing_url = requestHelper.redirect_url
            if r.status == 200:
                # identify signposting links in header
                header_link_string = requestHelper.getHTTPResponse().getheader(
                    'Link')
                if header_link_string is not None:
                    self.logger.info(
                        'FsF-F1-02D : Found signposting links in response header of landingpage'
                    )

                    for preparsed_link in header_link_string.split(','):
                        found_link = None
                        found_type, type_match = None, None
                        found_rel, rel_match = None, None
                        parsed_link = preparsed_link.strip().split(';')
                        found_link = parsed_link[0].strip()
                        for link_prop in parsed_link[1:]:
                            if str(link_prop).startswith('rel="'):
                                rel_match = re.search('rel=\"(.*?)\"',
                                                      link_prop)
                            elif str(link_prop).startswith('type="'):
                                type_match = re.search('type=\"(.*?)\"',
                                                       link_prop)
                        if type_match:
                            found_type = type_match[1]
                        if rel_match:
                            found_rel = rel_match[1]
                        signposting_link_dict = {
                            'url': found_link[1:-1],
                            'type': found_type,
                            'rel': found_rel
                        }
                        if found_link:
                            self.fuji.signposting_header_links.append(
                                signposting_link_dict)
                        '''
                        if found_rel:
                            if self.fuji.signposting_header_links.get(found_rel[1]):
                                self.fuji.signposting_header_links[found_rel[1]].append(found_link[1:-1])
                            else:
                                self.fuji.signposting_header_links[found_rel[1]]=[found_link[1:-1]]
                        '''

                #check if there is a cite-as signposting link
                if self.fuji.pid_scheme is None:
                    signposting_pid_link = self.fuji.get_signposting_links(
                        'cite-as')
                    if signposting_pid_link:
                        signposting_pid = signposting_pid_link[0].get('url')
                    if signposting_pid:
                        found_ids = idutils.detect_identifier_schemes(
                            signposting_pid)
                        if len(found_ids) > 1:
                            found_ids.remove('url')
                            found_id = found_ids[0]
                            if found_id in Mapper.VALID_PIDS.value:
                                self.logger.info(
                                    'FsF-F1-02D : Found object identifier in signposting header links'
                                )
                                self.fuji.pid_scheme = found_id

                up = urlparse(self.fuji.landing_url)
                self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                    uri=up)
                self.fuji.landing_html = requestHelper.getResponseContent()

                self.output.resolved_url = self.fuji.landing_url  # url is active, although the identifier is not based on a pid scheme
                self.output.resolvable_status = True
                self.logger.info(
                    'FsF-F1-02D : Object identifier active (status code = 200)'
                )
                self.fuji.isMetadataAccessible = True
            elif r.status in [401, 402, 403]:
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "Resource inaccessible, identifier returned http status code: {code}"
                    .format(code=r.status))
            else:
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "Resource inaccessible, identifier returned http status code: {code}"
                    .format(code=r.status))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, no response received from: {}"
                .format(check_url))

        if self.fuji.pid_scheme is not None:
            # short_pid = id.normalize_pid(self.id, scheme=pid_scheme)
            if signposting_pid is None:
                self.fuji.pid_url = idutils.to_url(self.fuji.id,
                                                   scheme=self.fuji.pid_scheme)
            else:
                self.fuji.pid_url = signposting_pid
            self.output.pid_scheme = self.fuji.pid_scheme
            self.result.test_status = 'pass'
            self.output.pid = self.fuji.pid_url
            self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0, 'pass')
            if self.fuji.isMetadataAccessible:
                self.setEvaluationCriteriumScore('FsF-F1-02D-2', 1, 'pass')
                self.score.earned = self.total_score  # identifier should be based on a persistent scheme and resolvable

            #print(self.metric_tests)

            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistent identifier scheme - {}'.format(
                    self.fuji.pid_scheme))
            #self.logger.info('FsF-F1-02D : Persistence identifier scheme - {}'.format(self.fuji.pid_scheme))
        else:
            self.score.earned = 0
            self.logger.warning(
                'FsF-F1-02D : Not a persistent identifier scheme - {}'.format(
                    self.fuji.id_scheme))

        self.result.score = self.score
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output
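A minimal sketch of the cite-as PID detection above, assuming the idutils helpers behave as they are used in evaluate(); the identifier is illustrative and the scheme list is an illustrative stand-in for Mapper.VALID_PIDS:

import idutils

VALID_PIDS = ['doi', 'handle', 'ark', 'purl', 'urn']  # illustrative subset
signposting_pid = 'https://doi.org/10.17616/R3XS37'
found_ids = idutils.detect_identifier_schemes(signposting_pid)
# drop the generic 'url' scheme if a more specific one was detected
if len(found_ids) > 1 and 'url' in found_ids:
    found_ids.remove('url')
if found_ids and found_ids[0] in VALID_PIDS:
    print('Found persistent identifier scheme:', found_ids[0])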
Example #12
    def parse_metadata(self):
        xml_metadata = None
        xml_mapping = None
        metatree = None
        envelope_metadata = {}
        XSI = "http://www.w3.org/2001/XMLSchema-instance"
        if self.link_type == 'linked':
            source_name = self.getEnumSourceNames().TYPED_LINK.value
        elif self.link_type == 'embedded':
            source_name = self.getEnumSourceNames().LINKED_DATA.value
        elif self.link_type == 'guessed':
            source_name = self.getEnumSourceNames().GUESSED_XML.value
        elif self.link_type == 'negotiated':
            source_name = self.getEnumSourceNames().XML_NEGOTIATED.value
        else:
            source_name = self.getEnumSourceNames().TYPED_LINK.value
        dc_core_metadata = None
        requestHelper = RequestHelper(self.target_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.xml)
        #self.logger.info('FsF-F2-01M : Sending request to access metadata from -: {}'.format(self.target_url))
        neg_source, xml_response = requestHelper.content_negotiate(
            'FsF-F2-01M')
        if requestHelper.getHTTPResponse() is not None:
            self.logger.info(
                'FsF-F2-01M : Trying to extract/parse metadata from -: {}'.
                format(source_name))
            #dom = lxml.html.fromstring(self.landing_html.encode('utf8'))
            if neg_source != 'xml':
                self.logger.info(
                    'FsF-F2-01M : Expected XML but content negotiation responded -: '
                    + str(neg_source))
            else:
                parser = lxml.etree.XMLParser(strip_cdata=False)
                tree = lxml.etree.XML(xml_response, parser)
                root_element = tree.tag
                if root_element.endswith('}OAI-PMH'):
                    self.logger.info(
                        'FsF-F2-01M : Found OAI-PMH type XML envelope, unpacking \'metadata\' element for further processing'
                    )
                    metatree = tree.find('.//{*}metadata/*')
                elif root_element.endswith('}mets'):
                    self.logger.info(
                        'FsF-F2-01M : Found METS type XML envelope, unpacking all \'mods\' elements for further processing'
                    )
                    envelope_metadata = self.get_mapped_xml_metadata(
                        tree, Mapper.XML_MAPPING_METS.value)
                    metatree = tree.find('.//{*}dmdSec/{*}mdWrap/{*}xmlData/*')
                elif root_element.endswith('}GetRecordsResponse'):
                    self.logger.info(
                        'FsF-F2-01M : Found OGC CSW GetRecords type XML envelope, unpacking \'SearchResults\' element for further processing'
                    )
                    metatree = tree.find('.//{*}SearchResults/*')
                elif root_element.endswith('}GetRecordByIdResponse'):
                    self.logger.info(
                        'FsF-F2-01M : Found OGC CSW GetRecordByIdResponse type XML envelope, unpacking metadata element for further processing'
                    )
                    metatree = tree.find('.//*')
                else:
                    metatree = tree
                if metatree is not None:
                    root_namespace = None
                    nsmatch = re.match(r'^\{(.+)\}(.+)$', metatree.tag)
                    schema_locations = set(
                        metatree.xpath("//*/@xsi:schemaLocation",
                                       namespaces={'xsi': XSI}))
                    for schema_location in schema_locations:
                        self.namespaces.extend(re.split(r'\s+', schema_location))
                    if nsmatch:
                        root_namespace = nsmatch[1]
                        root_element = nsmatch[2]
                        #print('#' + root_element + '#', root_namespace)
                        self.namespaces.append(root_namespace)
                    if root_element == 'codeBook':
                        xml_mapping = Mapper.XML_MAPPING_DDI_CODEBOOK.value
                        self.logger.info(
                            'FsF-F2-01M : Identified DDI codeBook XML based on root tag'
                        )
                    elif root_element == 'dc':
                        xml_mapping = Mapper.XML_MAPPING_DUBLIN_CORE.value
                        self.logger.info(
                            'FsF-F2-01M : Identified Dublin Core XML based on root tag'
                        )
                    elif root_element == 'mods':
                        xml_mapping = Mapper.XML_MAPPING_MODS.value
                        self.logger.info(
                            'FsF-F2-01M : Identified MODS XML based on root tag'
                        )

                    elif root_element == 'eml':
                        xml_mapping = Mapper.XML_MAPPING_EML.value
                        self.logger.info(
                            'FsF-F2-01M : Identified EML XML based on root tag'
                        )
                    elif root_element == 'MD_Metadata':
                        xml_mapping = Mapper.XML_MAPPING_GCMD_ISO.value
                        self.logger.info(
                            'FsF-F2-01M : Identified ISO 19115 XML based on root tag'
                        )
                    elif root_namespace:
                        if 'datacite.org/schema' in root_namespace:
                            xml_mapping = Mapper.XML_MAPPING_DATACITE.value
                            self.logger.info(
                                'FsF-F2-01M : Identified DataCite XML based on namespace'
                            )

        if xml_mapping and metatree is not None:
            xml_metadata = self.get_mapped_xml_metadata(metatree, xml_mapping)

        if envelope_metadata:
            if xml_metadata is None:
                xml_metadata = {}
            for envelope_key, envelope_values in envelope_metadata.items():
                if envelope_key not in xml_metadata:
                    xml_metadata[envelope_key] = envelope_values
        return source_name, xml_metadata
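A minimal sketch of the root-tag dispatch above: lxml exposes element tags in Clark notation ({namespace}localname), which the regular expression splits into the namespace and local name used to pick an XML mapping; the element is illustrative:

import re
import lxml.etree

elem = lxml.etree.XML(b'<codeBook xmlns="ddi:codebook:2_5"/>')
nsmatch = re.match(r'^\{(.+)\}(.+)$', elem.tag)
if nsmatch:
    root_namespace, root_element = nsmatch[1], nsmatch[2]
    print(root_namespace, root_element)  # ddi:codebook:2_5 codeBook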