def getMetadataStandards(self):
    """Query the OAI-PMH endpoint (verb=ListMetadataFormats) and return the
    offered metadata formats as {metadataPrefix: [schema URL]}.

    Domain-agnostic standards (DataCite, OAI DC, Dublin Core) are excluded
    from the returned dict but still appended to self.namespaces.
    """
    # Schemas considered domain-agnostic and therefore skipped.
    # Renamed from `filter` to avoid shadowing the builtin. TODO expand filters
    excluded_standards = ['datacite.org', 'openarchives.org', 'purl.org/dc/']
    # e.g. http://ws.pangaea.de/oai/provider?verb=ListMetadataFormats
    oai_endpoint = self.endpoint.split('?')[0]
    oai_listmetadata_url = oai_endpoint + '?verb=ListMetadataFormats'
    requestHelper = RequestHelper(url=oai_listmetadata_url, logInst=self.logger)
    requestHelper.setAcceptType(AcceptTypes.xml)
    response_type, xml = requestHelper.content_negotiate(self.metric_id)
    root = etree.fromstring(xml.content)
    metadata_nodes = root.xpath(
        '//oai:OAI-PMH/oai:ListMetadataFormats/oai:metadataFormat',
        namespaces=OAIMetadataProvider.oai_namespaces)
    schemas = {}
    for node in metadata_nodes:
        ele = etree.XPathEvaluator(
            node, namespaces=OAIMetadataProvider.oai_namespaces).evaluate
        metadata_prefix = ele('string(oai:metadataPrefix/text())')  # <metadataPrefix>oai_dc</metadataPrefix>
        metadata_schema = ele('string(oai:schema/text())')  # <schema>http://www.openarchives.org/OAI/2.0/oai_dc.xsd</schema>
        metadata_schema = metadata_schema.strip()
        self.namespaces.append(metadata_schema)
        # TODO there can be more than one OAI-PMH endpoint, https://www.re3data.org/repository/r3d100011221
        if not any(s in metadata_schema for s in excluded_standards):
            schemas[metadata_prefix] = [metadata_schema]
        else:
            self.logger.info(
                '{0} : Skipping domain-agnostic standard listed in OAI-PMH - {1}'
                .format(self.metric_id, metadata_prefix))
    return schemas
def lookup_re3data(self):
    """Resolve the DataCite client id to a re3data DOI and harvest the
    matching re3data repository record.

    On success self.re3metadata_raw is filled and parseRepositoryMetadata()
    is invoked; a missing DOI is only logged as a warning.
    """
    if self.client_id and self.pid_scheme:
        # lookup table {client_id: re3doi}
        re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(self.client_id)
        # pid -> clientId -> repo doi -> re3id, then query repository metadata from re3 api
        if re3doi:
            # FIX: normalize only when a DOI was actually found;
            # normalize_pid(None, ...) would fail before the re3doi check
            short_re3doi = idutils.normalize_pid(re3doi, scheme='doi')  # e.g. https://doi.org/10.17616/R3XS37
            self.logger.info('Found match re3data (DOI-based) record')
            # https://re3data.org/api/beta/repositories?query=
            query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi
            q = RequestHelper(url=query_url)
            q.setAcceptType(AcceptTypes.xml)
            re_source, xml = q.content_negotiate(metric_id='RE3DATA')
            root = etree.fromstring(xml.content)
            # <link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
            re3link = root.xpath('//link')[0].attrib['href']
            if re3link is not None:
                self.logger.info('Found match re3data metadata record')
                # query repository metadata from the record's self link
                q2 = RequestHelper(url=re3link)
                q2.setAcceptType(AcceptTypes.xml)
                re3_source, re3_response = q2.content_negotiate(metric_id='RE3DATA')
                self.re3metadata_raw = re3_response.content
                self.parseRepositoryMetadata()
        else:
            self.logger.warning('No DOI of client id is available from datacite api')
def getMetadataStandards(self):
    """Query the OGC CSW endpoint via GetCapabilities and return the offered
    outputSchema values as {schema: schema}.

    Side effect: appends newly seen schema URIs to self.namespaces.
    Parse failures are logged, never raised.
    """
    csw_endpoint = self.endpoint.split('?')[0]
    csw_listmetadata_url = csw_endpoint + '?service=CSW&request=GetCapabilities'
    requestHelper = RequestHelper(url=csw_listmetadata_url, logInst=self.logger)
    requestHelper.setAcceptType(AcceptTypes.xml)
    response_type, xml = requestHelper.content_negotiate(self.metric_id)
    schemas = {}
    if xml:
        # FIX: was a bare `except:`; narrowed so SystemExit/KeyboardInterrupt propagate
        try:
            root = etree.fromstring(requestHelper.response_content)
            metadata_nodes = root.xpath(
                '//ows:Parameter[@name="outputSchema"]/ows:Value',
                namespaces=OGCCSWMetadataProvider.csw_namespaces)
            for node in metadata_nodes:
                if node.text:
                    if node.text not in self.namespaces:
                        self.namespaces.append(str(node.text))
                    schemas[str(node.text)] = str(node.text)
        except Exception:
            self.logger.info(
                '{0} : Could not parse XML response retrieved from OGC CSW endpoint'
                .format(self.metric_id))
    return schemas
def parse_metadata(self):
    """Request DataCite JSON for self.pid_url and map it via jmespath.

    Returns:
        (source_name, dcite_metadata): source_name is None and the dict is
        empty when nothing could be retrieved or mapped.
    Side effects: appends discovered schema/namespace URIs to self.namespaces.
    """
    source_name = None
    dcite_metadata = {}
    self.logger.info('FsF-F2-01M : Trying to retrieve datacite metadata')
    requestHelper = RequestHelper(self.pid_url, self.logger)
    requestHelper.setAcceptType(AcceptTypes.datacite_json)
    neg_source, ext_meta = requestHelper.content_negotiate('FsF-F2-01M')
    if ext_meta:
        try:
            dcite_metadata = jmespath.search(self.metadata_mapping.value, ext_meta)
            if dcite_metadata:
                self.namespaces.append('http://datacite.org/schema/')
                source_name = self.getEnumSourceNames().DATACITE_JSON.value
                # FIX: use .get() so a mapping result without these keys
                # cannot raise KeyError
                if dcite_metadata.get('creator') is None:
                    first = dcite_metadata.get('creator_first')
                    last = dcite_metadata.get('creator_last')
                    # default type of creator is []
                    if isinstance(first, list) and isinstance(last, list):
                        if len(first) == len(last):
                            names = [i + " " + j for i, j in zip(first, last)]
                            dcite_metadata['creator'] = names
                if dcite_metadata.get('related_resources'):
                    self.logger.info(
                        'FsF-I3-01M : {0} related resource(s) extracted from -: {1}'
                        .format(len(dcite_metadata['related_resources']), source_name))
                    temp_rels = []
                    for r in dcite_metadata['related_resources']:
                        if r.get('scheme_uri'):
                            self.namespaces.append(r.get('scheme_uri'))
                        # drop None-valued entries of each related resource
                        filtered = {k: v for k, v in r.items() if v is not None}
                        temp_rels.append(filtered)
                    dcite_metadata['related_resources'] = temp_rels
                else:
                    self.logger.info(
                        'FsF-I3-01M : No related resource(s) found in Datacite metadata')
                # convert all values (list type) into string except 'creator','license','related_resources'
                for key, value in dcite_metadata.items():
                    if key not in self.exclude_conversion and isinstance(value, list):
                        flat = ', '.join(map(str, value))
                        dcite_metadata[key] = flat
        except Exception as e:
            self.logger.exception('Failed to extract Datacite Json -: {}'.format(e))
    return source_name, dcite_metadata
def parse_metadata(self):
    """Negotiate RDF for self.target_url and extract DCAT or ontology metadata.

    Returns (self.source_name, rdf_metadata dict). Side effects: records the
    negotiated content type in self.content_type and appends all graph
    namespaces to self.namespaces.
    """
    self.logger.info('FsF-F2-01M : Extract metadata from {}'.format(self.source_name))
    rdf_metadata = dict()
    requestHelper: RequestHelper = RequestHelper(self.target_url, self.logger)
    requestHelper.setAcceptType(AcceptTypes.rdf)
    neg_source, rdf_response = requestHelper.content_negotiate('FsF-F2-01M')
    # required for metric knowledge representation
    # FIX: guard against a missing response object and an absent
    # content-type header instead of raising KeyError/AttributeError
    http_response = requestHelper.getHTTPResponse()
    self.content_type = http_response.headers.get('content-type') if http_response is not None else None
    if self.content_type is not None:
        self.content_type = self.content_type.split(";", 1)[0]
    # namespaces indicating an ontology/vocabulary (SKOS, OWL)
    ontology_indicator = [
        rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#'),
        rdflib.term.URIRef('http://www.w3.org/2002/07/owl#')
    ]
    if isinstance(rdf_response, rdflib.graph.Graph):
        self.logger.info('FsF-F2-01M : Found RDF Graph')
        # TODO: set credit score for being valid RDF
        # TODO: since its valid RDF aka semantic representation, make sure FsF-I1-01M is passed and scored
        graph_namespaces = dict(list(rdf_response.namespaces())).values()
        if rdflib.term.URIRef('http://www.w3.org/ns/dcat#') in graph_namespaces:
            self.logger.info('FsF-F2-01M : RDF Graph seems to contain DCAT metadata elements')
            rdf_metadata = self.get_dcat_metadata(rdf_response)
        elif bool(set(ontology_indicator) & set(graph_namespaces)):
            rdf_metadata = self.get_ontology_metadata(rdf_response)
        # add found namespaces URIs to namespace
        for ns in rdf_response.namespaces():
            self.namespaces.append(str(ns[1]))
    else:
        self.logger.info('FsF-F2-01M : Expected RDF Graph but received - {0}'.format(self.content_type))
    return self.source_name, rdf_metadata
def parse_metadata(self):
    """Negotiate an XML representation of self.target_url and collect the
    namespaces declared in xsi:schemaLocation attributes.

    Returns (source_name, dc_core_metadata); the latter is currently always
    None because no XML-to-DC mapping is implemented yet (see TODO).
    """
    XSI = "http://www.w3.org/2001/XMLSchema-instance"
    # FIX: initialize so an unexpected link_type cannot cause
    # UnboundLocalError when source_name is used below
    source_name = None
    if self.link_type == 'embedded':
        source_name = self.getEnumSourceNames().LINKED_DATA.value
    elif self.link_type == 'guessed':
        source_name = self.getEnumSourceNames().GUESSED_XML.value
    elif self.link_type == 'negotiated':
        source_name = self.getEnumSourceNames().XML_NEGOTIATED.value
    dc_core_metadata = None
    requestHelper: RequestHelper = RequestHelper(self.target_url, self.logger)
    requestHelper.setAcceptType(AcceptTypes.xml)
    neg_source, xml_response = requestHelper.content_negotiate('FsF-F2-01M')
    if requestHelper.response_status == 200:
        self.logger.info('FsF-F2-01M : Extract metadata from {}'.format(source_name))
        if neg_source != 'xml':
            self.logger.info(
                'FsF-F2-01M : Expected XML but content negotiation responded: ' + str(neg_source))
        else:
            tree = lxml.etree.XML(xml_response)
            schema_locations = set(
                tree.xpath("//*/@xsi:schemaLocation", namespaces={'xsi': XSI}))
            for schema_location in schema_locations:
                # NOTE(review): this overwrites self.namespaces on every pass,
                # keeping only the last schemaLocation — confirm whether
                # extend() was intended
                self.namespaces = re.split(r'\s', schema_location)
            # TODO: implement some XSLT to handle the XML..
    return source_name, dc_core_metadata
def lookup_re3data(self):
    """Resolve the DataCite client id to a re3data DOI and harvest the
    matching re3data repository record (FsF-R1.3-01M).

    Flow: pid -> clientId -> repository DOI -> re3data id, then two requests
    against the re3data beta API. On success self.re3metadata_raw is filled
    and parseRepositoryMetadata() is invoked; all failures are only logged.
    """
    if self.client_id and self.pid_scheme:
        # preloaded lookup table {client_id: re3doi}
        re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(self.client_id)
        if re3doi:
            if idutils.is_doi(re3doi):
                # e.g. https://doi.org/10.17616/R3XS37
                short_re3doi = idutils.normalize_pid(re3doi, scheme='doi')
            else:
                # stored value is not a DOI -> treat as no match
                re3doi = None
        # pid -> clientId -> repo doi-> re3id, and query repository metadata from re3api
        if re3doi:
            self.logger.info('FsF-R1.3-01M : Found match re3data (DOI-based) record')
            # https://re3data.org/api/beta/repositories?query=
            query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi
            q = RequestHelper(url=query_url)
            q.setAcceptType(AcceptTypes.xml)
            re_source, xml = q.content_negotiate(metric_id='RE3DATA')
            try:
                if isinstance(xml, bytes):
                    # NOTE(review): decode().encode() round-trips the bytes and
                    # only raises on non-UTF-8 input — confirm whether a plain
                    # decode was intended here
                    xml = xml.decode().encode()
                root = etree.fromstring(xml)
                # <link href="https://www.re3data.org/api/beta/repository/r3d100010134" rel="self" />
                re3link = root.xpath('//link')[0].attrib['href']
                if re3link is not None:
                    self.logger.info(
                        'FsF-R1.3-01M : Found match re3data metadata record -: ' + str(re3link))
                    # query repository metadata from the record's self link
                    q2 = RequestHelper(url=re3link)
                    q2.setAcceptType(AcceptTypes.xml)
                    re3_source, re3_response = q2.content_negotiate(metric_id='RE3DATA')
                    self.re3metadata_raw = re3_response
                    self.parseRepositoryMetadata()
            except Exception as e:
                self.logger.warning(
                    'FsF-R1.3-01M : Malformed re3data (DOI-based) record received: ' + str(e))
        else:
            self.logger.warning(
                'FsF-R1.3-01M : No DOI of client id is available from datacite api')
def parse_metadata(self):
    """Extract RDF metadata, either from an already harvested graph
    (self.rdf_graph) or by content-negotiating self.target_url.

    Dispatches to DCAT, ontology (SKOS/OWL) or default extraction depending
    on the namespaces present in the graph. Returns (self.source_name,
    rdf_metadata dict). Side effects: sets self.content_type and appends
    all graph namespaces to self.namespaces.
    """
    rdf_metadata=dict()
    if self.rdf_graph is None:
        requestHelper: RequestHelper = RequestHelper(self.target_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.rdf)
        neg_source,rdf_response = requestHelper.content_negotiate('FsF-F2-01M')
        #required for metric knowledge representation
        if requestHelper.getHTTPResponse() is not None:
            self.content_type = requestHelper.getHTTPResponse().headers.get('content-type')
            if self.content_type is not None:
                # strip charset etc., keep the bare media type
                self.content_type = self.content_type.split(";", 1)[0]
                #handle JSON-LD
                # NOTE(review): DCAT binding appears unused below — confirm
                DCAT = Namespace("http://www.w3.org/ns/dcat#")
                if self.content_type == 'application/ld+json':
                    try:
                        # re-serialize the negotiated JSON and parse it into a graph
                        jsonldgraph= rdflib.ConjunctiveGraph()
                        rdf_response = jsonldgraph.parse(data=json.dumps(rdf_response), format='json-ld')
                        rdf_response = jsonldgraph
                    except Exception as e:
                        self.logger.info('FsF-F2-01M : Parsing error, failed to extract JSON-LD -: {}'.format(e))
    else:
        # graph was already harvested elsewhere (e.g. embedded in the page)
        neg_source, rdf_response = 'html' , self.rdf_graph
    # namespaces whose presence indicates an ontology/vocabulary rather than
    # a dataset description
    ontology_indicator=[rdflib.term.URIRef('http://www.w3.org/2004/02/skos/core#'),rdflib.term.URIRef('http://www.w3.org/2002/07/owl#')]
    if isinstance(rdf_response,rdflib.graph.Graph):
        self.logger.info('FsF-F2-01M : Found RDF Graph')
        graph_text = rdf_response.serialize(format="ttl")
        self.getNamespacesfromIRIs(graph_text)
        # TODO: set credit score for being valid RDF
        # TODO: since its valid RDF aka semantic representation, make sure FsF-I1-01M is passed and scored
        if rdflib.term.URIRef('http://www.w3.org/ns/dcat#') in dict(list(rdf_response.namespaces())).values():
            self.logger.info('FsF-F2-01M : RDF Graph seems to contain DCAT metadata elements')
            rdf_metadata = self.get_dcat_metadata(rdf_response)
        elif bool(set(ontology_indicator) & set(dict(list(rdf_response.namespaces())).values())):
            rdf_metadata = self.get_ontology_metadata(rdf_response)
    else:
        rdf_metadata = self.get_default_metadata(rdf_response)
        #add found namespaces URIs to namespace
        for ns in rdf_response.namespaces():
            self.namespaces.append(str(ns[1]))
    else:
        self.logger.info('FsF-F2-01M : Expected RDF Graph but received -: {0}'.format(self.content_type))
    return self.source_name, rdf_metadata
def parse_metadata(self, ls=None):
    """Extract schema.org JSON-LD metadata, either embedded in the page
    (self.source_metadata) or via content negotiation against self.pid_url.

    Returns (self.source_name, jsnld_metadata dict). Side effects: sets
    self.source_name, appends namespaces found in the JSON-LD.
    # NOTE(review): parameter `ls` is shadowed by a local assignment in the
    # license handling below — confirm whether it is still needed
    """
    jsnld_metadata = {}
    ext_meta=None
    if self.source_metadata:
        self.source_name = self.getEnumSourceNames().SCHEMAORG_EMBED.value
        ext_meta = self.source_metadata[0]
    elif self.pid_url:
        self.source_name = self.getEnumSourceNames().SCHEMAORG_NEGOTIATE.value
        # TODO (IMPORTANT) PID agency may support Schema.org in JSON-LD
        # TODO (IMPORTANT) validate schema.org
        # fallback, request (doi) metadata specified in schema.org JSON-LD
        requestHelper: RequestHelper = RequestHelper(self.pid_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.schemaorg)
        neg_source,ext_meta = requestHelper.content_negotiate('FsF-F2-01M')
    if ext_meta is not None:
        self.getNamespacesfromIRIs(ext_meta)
        self.logger.info('FsF-F2-01M : Trying to extract schema.org JSON-LD metadata from -: {}'.format(self.source_name))
        # TODO check syntax - not ending with /, type and @type
        # TODO (important) extend mapping to detect other pids (link to related entities)?
        check_context_type = ["Dataset", "Collection"]
        try:
            if str(ext_meta['@context']).find('://schema.org') > -1:
                if str(ext_meta['@type']).lower() not in self.SCHEMA_ORG_CONTEXT:
                    self.logger.info('FsF-F2-01M : Found JSON-LD but seems not to be a schema.org object based on the given context type')
                elif ext_meta['@type'] not in check_context_type:
                    self.logger.info('FsF-F2-01M : Found schema.org JSON-LD but seems not to be a research data object')
                else:
                    self.logger.info('FsF-F2-01M : Found schema.org JSON-LD which seems to be valid, based on the given context type')
                    self.namespaces.append('http://schema.org/')
                    jsnld_metadata = jmespath.search(self.metadata_mapping.value, ext_meta)
                    # TODO all properties with null values extracted through jmespath should be excluded
                    if jsnld_metadata.get('creator') is None:
                        #TODO: handle None values for first and last name
                        first = jsnld_metadata.get('creator_first')
                        last = jsnld_metadata.get('creator_last')
                        # combine given/family names into full creator names
                        if isinstance(first, list) and isinstance(last, list):
                            if len(first) == len(last):
                                names = [str(i) + " " + str(j) for i, j in zip(first, last)]
                                jsnld_metadata['creator'] = names
                        else:
                            jsnld_metadata['creator'] = [str(first) + " " + str(last)]
                    #TODO instead of custom check there should a valdiator to evaluate the whole schema.org metadata
                    invalid_license = False
                    if jsnld_metadata.get('license'):
                        self.logger.info('FsF-R1.1-01M : License metadata found (schema.org) -: {}'.format(
                            jsnld_metadata.get('license')))
                        # only the first license entry is evaluated
                        if isinstance(jsnld_metadata.get('license'), list):
                            jsnld_metadata['license'] = jsnld_metadata['license'][0]
                        # a CreativeWork license object is reduced to its url or name
                        if isinstance(jsnld_metadata.get('license'), dict):
                            ls_type = jsnld_metadata.get('license').get('@type')
                            if ls_type =='CreativeWork':
                                ls = jsnld_metadata.get('license').get('url')
                                if not ls:
                                    ls = jsnld_metadata.get('license').get('name')
                                if ls:
                                    jsnld_metadata['license'] = ls
                                else:
                                    invalid_license = True
                            else:
                                invalid_license = True
                    if invalid_license:
                        self.logger.warning('FsF-R1.1-01M : Looks like schema.org representation of license is incorrect, skipping the test.')
                        jsnld_metadata['license'] = None
                    # filter out None values of related_resources
                    if jsnld_metadata.get('related_resources'):
                        relateds = [d for d in jsnld_metadata['related_resources'] if d['related_resource'] is not None]
                        if relateds:
                            jsnld_metadata['related_resources'] = relateds
                            self.logger.info('FsF-I3-01M : {0} related resource(s) extracted from -: {1}'.format(len(jsnld_metadata['related_resources']), self.source_name))
                        else:
                            del jsnld_metadata['related_resources']
                            self.logger.info('FsF-I3-01M : No related resource(s) found in Schema.org metadata')
                    # TODO quick-fix, expand mapping expression instead
                    if jsnld_metadata.get('object_size'):
                        jsnld_metadata['object_size'] = str(jsnld_metadata['object_size'].get('value')) + ' '+ jsnld_metadata['object_size'].get('unitText')
            else:
                self.logger.info('FsF-F2-01M : Found JSON-LD schema.org but record is not of type "Dataset"')
        except Exception as err:
            self.logger.info('FsF-F2-01M : Failed to parse JSON-LD schema.org -: {}'.format(err))
    else:
        self.logger.info('FsF-F2-01M : Could not identify JSON-LD schema.org metadata')
    return self.source_name, jsnld_metadata
def evaluate(self):
    """Evaluate FsF-F1-02D: identifier persistence and resolvability.

    Resolves the (P)ID, harvests signposting links from the landing page
    response header, and scores persistence (test 1, 0.5) plus
    resolvability (test 2, 0.5). Writes to self.result/self.output/
    self.score as side effects.
    """
    self.result = Persistence(id=self.metric_number,
                              metric_identifier=self.metric_identifier,
                              metric_name=self.metric_name)
    self.output = PersistenceOutput()
    # ======= CHECK IDENTIFIER PERSISTENCE =======
    self.logger.info(
        'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
        .format(Mapper.VALID_PIDS.value))
    check_url = None
    signposting_pid = None
    if self.fuji.id_scheme is not None:
        check_url = self.fuji.pid_url
        if self.fuji.id_scheme == 'url':
            self.fuji.origin_url = self.fuji.id
            check_url = self.fuji.id
    if check_url:
        # ======= RETRIEVE METADATA FROM LANDING PAGE =======
        requestHelper = RequestHelper(check_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.html_xml)  # request
        neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
            'FsF-F1-02D', ignore_html=False)
        if not 'html' in str(requestHelper.content_type):
            self.logger.info(
                'FsF-F2-01M :Content type is ' + str(requestHelper.content_type) +
                ', therefore skipping embedded metadata (microdata, RDFa) tests')
            self.fuji.extruct_result = {}
        if type(self.fuji.extruct_result) != dict:
            self.fuji.extruct_result = {}
        r = requestHelper.getHTTPResponse()
        response_status = requestHelper.response_status
        if r:
            self.fuji.landing_url = requestHelper.redirect_url
            # in case the test has been repeated because a PID has been found in metadata
            if self.fuji.repeat_pid_check == True:
                if self.fuji.landing_url != self.fuji.input_id:
                    self.logger.warning(
                        'FsF-F1-02D : Landing page URL resolved from PID found in metadata does not match with input URL')
                    self.logger.warning(
                        'FsF-F2-01M : Seems to be a catalogue entry or alternative representation of the data set, landing page URL resolved from PID found in metadata does not match with input URL')
            if self.fuji.landing_url not in ['https://datacite.org/invalid.html']:
                if response_status == 200:
                    # identify signposting links in header
                    header_link_string = requestHelper.getHTTPResponse().getheader('Link')
                    if header_link_string is not None:
                        self.logger.info(
                            'FsF-F1-02D : Found signposting links in response header of landingpage')
                        for preparsed_link in header_link_string.split(','):
                            found_link = None
                            found_type, type_match = None, None
                            found_rel, rel_match = None, None
                            found_formats, formats_match = None, None
                            parsed_link = preparsed_link.strip().split(';')
                            found_link = parsed_link[0].strip()
                            for link_prop in parsed_link[1:]:
                                if str(link_prop).startswith('rel="'):
                                    rel_match = re.search('rel=\"(.*?)\"', link_prop)
                                elif str(link_prop).startswith('type="'):
                                    type_match = re.search('type=\"(.*?)\"', link_prop)
                                elif str(link_prop).startswith('formats="'):
                                    formats_match = re.search('formats=\"(.*?)\"', link_prop)
                            if type_match:
                                found_type = type_match[1]
                            if rel_match:
                                found_rel = rel_match[1]
                            if formats_match:
                                found_formats = formats_match[1]
                            signposting_link_dict = {
                                'url': found_link[1:-1],  # strip the enclosing <...>
                                'type': found_type,
                                'rel': found_rel,
                                'profile': found_formats
                            }
                            if found_link:
                                self.fuji.signposting_header_links.append(signposting_link_dict)
                    # check if there is a cite-as signposting link
                    if self.fuji.pid_scheme is None:
                        signposting_pid_link = self.fuji.get_signposting_links('cite-as')
                        if signposting_pid_link:
                            signposting_pid = signposting_pid_link[0].get('url')
                        if signposting_pid:
                            # FIX: IdentifierHelper was referenced as a class, never
                            # instantiated, so preferred_schema/is_persistent did not
                            # reflect the signposting PID at all
                            signidhelper = IdentifierHelper(signposting_pid)
                            found_id = signidhelper.preferred_schema
                            if signidhelper.is_persistent:
                                self.logger.info(
                                    'FsF-F1-02D : Found object identifier in signposting header links')
                                self.fuji.pid_scheme = found_id
                    up = urlparse(self.fuji.landing_url)
                    self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(uri=up)
                    self.fuji.landing_html = requestHelper.getResponseContent()
                    self.fuji.landing_content_type = requestHelper.content_type
                    # url is active, although the identifier is not based on a pid scheme
                    self.output.resolved_url = self.fuji.landing_url
                    self.output.resolvable_status = True
                    self.logger.info('FsF-F1-02D : Object identifier active (status code = 200)')
                    self.fuji.isMetadataAccessible = True
                elif response_status in [401, 402, 403]:
                    self.fuji.isMetadataAccessible = False
                    self.logger.warning(
                        "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                        .format(code=response_status))
                else:
                    self.fuji.isMetadataAccessible = False
                    self.logger.warning(
                        "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                        .format(code=response_status))
            else:
                self.logger.warning(
                    "FsF-F1-02D : Invalid DOI, identifier resolved to -: {code}"
                    .format(code=self.fuji.landing_url))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, no response received from -: {}"
                .format(check_url))
            if response_status in [401, 402, 403]:
                self.logger.warning(
                    "FsF-F1-02D : Resource inaccessible, identifier returned http status code -: {code}"
                    .format(code=response_status))
    else:
        self.logger.warning(
            "FsF-F1-02D :Resource inaccessible, could not identify an actionable representation for the given identfier -: {}"
            .format(self.fuji.id))
    if self.fuji.pid_scheme is not None:
        if signposting_pid is None:
            idhelper = IdentifierHelper(self.fuji.id)
            self.fuji.pid_url = idhelper.identifier_url
        else:
            # FIX: signposting_pid is a URL string; indexing [0] kept only its
            # first character as the PID URL
            self.fuji.pid_url = signposting_pid
        self.output.pid_scheme = self.fuji.pid_scheme
        self.output.pid = self.fuji.pid_url
        self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0.5, 'pass')
        self.score.earned = 0.5
        self.maturity = 1
        if self.fuji.isMetadataAccessible:
            self.setEvaluationCriteriumScore('FsF-F1-02D-2', 0.5, 'pass')
            self.maturity = 3
            self.result.test_status = 'pass'
            # idenfier should be based on a persistence scheme and resolvable
            self.score.earned = self.total_score
            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistence identifier scheme -: {}'.format(self.fuji.pid_scheme))
    else:
        self.score.earned = 0
        self.logger.warning(
            'FsF-F1-02D : Not a persistent identifier scheme -: {}'.format(self.fuji.id_scheme))
    self.result.score = self.score
    self.result.maturity = self.maturity
    self.result.metric_tests = self.metric_tests
    self.result.output = self.output
def evaluate(self):
    """Evaluate FsF-F1-02D (older variant): identifier persistence and
    resolvability.

    Resolves the (P)ID, harvests signposting links from the landing page
    response header and scores the metric. Writes to self.result/
    self.output/self.score as side effects.
    """
    self.result = Persistence(id=self.fuji.count,
                              metric_identifier=self.metric_identifier,
                              metric_name=self.metric_name)
    self.output = PersistenceOutput()
    # ======= CHECK IDENTIFIER PERSISTENCE =======
    self.logger.info(
        'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
        .format(Mapper.VALID_PIDS.value))
    # NOTE(review): check_url stays unbound if pid_scheme is None and
    # id_scheme is not 'url' — RequestHelper below would then raise
    if self.fuji.pid_scheme is not None:
        check_url = idutils.to_url(self.fuji.id, scheme=self.fuji.pid_scheme)
    elif self.fuji.id_scheme == 'url':
        check_url = self.fuji.id
    # ======= RETRIEVE METADATA FROM LANDING PAGE =======
    requestHelper = RequestHelper(check_url, self.logger)
    requestHelper.setAcceptType(AcceptTypes.html)  # request
    neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
        'FsF-F1-02D')
    r = requestHelper.getHTTPResponse()
    signposting_pid = None
    if r:
        self.fuji.landing_url = requestHelper.redirect_url
        # NOTE(review): r.status here vs r.status_code below — the response
        # object seemingly cannot offer both; confirm which API is in use
        if r.status == 200:
            # identify signposting links in header
            header_link_string = requestHelper.getHTTPResponse().getheader(
                'Link')
            if header_link_string is not None:
                self.logger.info(
                    'FsF-F1-02D : Found signposting links in response header of landingpage'
                )
                for preparsed_link in header_link_string.split(','):
                    found_link = None
                    found_type, type_match = None, None
                    found_rel, rel_match = None, None
                    parsed_link = preparsed_link.strip().split(';')
                    found_link = parsed_link[0].strip()
                    for link_prop in parsed_link[1:]:
                        if str(link_prop).startswith('rel="'):
                            rel_match = re.search('rel=\"(.*?)\"', link_prop)
                        elif str(link_prop).startswith('type="'):
                            type_match = re.search('type=\"(.*?)\"',
                                                   link_prop)
                    if type_match:
                        found_type = type_match[1]
                    if rel_match:
                        found_rel = rel_match[1]
                    # found_link[1:-1] strips the enclosing <...> of the URI
                    signposting_link_dict = {
                        'url': found_link[1:-1],
                        'type': found_type,
                        'rel': found_rel
                    }
                    if found_link:
                        self.fuji.signposting_header_links.append(
                            signposting_link_dict)
                '''
                if found_rel:
                    if self.fuji.signposting_header_links.get(found_rel[1]):
                        self.fuji.signposting_header_links[found_rel[1]].append(found_link[1:-1])
                    else:
                        self.fuji.signposting_header_links[found_rel[1]]=[found_link[1:-1]]
                '''
            #check if there is a cite-as signposting link
            if self.fuji.pid_scheme is None:
                signposting_pid_link = self.fuji.get_signposting_links(
                    'cite-as')
                if signposting_pid_link:
                    signposting_pid = signposting_pid_link[0].get('url')
                    if signposting_pid:
                        # NOTE(review): signposting_pid is a URL string, so
                        # signposting_pid[0] is its first character — confirm
                        # whether the whole string was intended
                        found_ids = idutils.detect_identifier_schemes(
                            signposting_pid[0])
                        if len(found_ids) > 1:
                            found_ids.remove('url')
                            found_id = found_ids[0]
                            if found_id in Mapper.VALID_PIDS.value:
                                self.logger.info(
                                    'FsF-F1-02D : Found object identifier in signposting header links'
                                )
                                self.fuji.pid_scheme = found_id
            up = urlparse(self.fuji.landing_url)
            self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                uri=up)
            self.fuji.landing_html = requestHelper.getResponseContent()
            # url is active, although the identifier is not based on a pid scheme
            self.output.resolved_url = self.fuji.landing_url
            self.output.resolvable_status = True
            self.logger.info(
                'FsF-F1-02D : Object identifier active (status code = 200)')
            self.fuji.isMetadataAccessible = True
        elif r.status_code in [401, 402, 403]:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "Resource inaccessible, identifier returned http status code: {code}"
                .format(code=r.status_code))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "Resource inaccessible, identifier returned http status code: {code}"
                .format(code=r.status_code))
    else:
        self.fuji.isMetadataAccessible = False
        self.logger.warning(
            "FsF-F1-02D :Resource inaccessible, no response received from: {}"
            .format(check_url))
    if self.fuji.pid_scheme is not None:
        # short_pid = id.normalize_pid(self.id, scheme=pid_scheme)
        if signposting_pid is None:
            self.fuji.pid_url = idutils.to_url(self.fuji.id,
                                               scheme=self.fuji.pid_scheme)
        else:
            # NOTE(review): again first character of the URL string — confirm
            self.fuji.pid_url = signposting_pid[0]
        self.output.pid_scheme = self.fuji.pid_scheme
        self.result.test_status = 'pass'
        self.output.pid = self.fuji.pid_url
        self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0, 'pass')
        if self.fuji.isMetadataAccessible:
            self.setEvaluationCriteriumScore('FsF-F1-02D-2', 1, 'pass')
            # idenfier should be based on a persistence scheme and resolvable
            self.score.earned = self.total_score
            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistence identifier scheme - {}'.format(
                    self.fuji.pid_scheme))
    else:
        self.score.earned = 0
        self.logger.warning(
            'FsF-F1-02D : Not a persistent identifier scheme - {}'.format(
                self.fuji.id_scheme))
    self.result.score = self.score
    self.result.metric_tests = self.metric_tests
    self.result.output = self.output
def parse_metadata(self):
    """Retrieve and parse an XML metadata record from self.target_url.

    Unwraps known XML envelopes (OAI-PMH, METS, OGC CSW responses), detects
    the metadata standard from the record's root tag or namespace and maps
    the record via get_mapped_xml_metadata().

    Returns:
        (source_name, xml_metadata) where xml_metadata is a dict of mapped
        elements, or None when no known standard could be identified and no
        envelope metadata was found.
    """
    xml_metadata = None
    xml_mapping = None
    metatree = None
    envelope_metadata = {}
    XSI = "http://www.w3.org/2001/XMLSchema-instance"
    # single elif chain (original used two separate if-chains with the same
    # net effect); unknown link types fall back to TYPED_LINK as before
    if self.link_type == 'linked':
        source_name = self.getEnumSourceNames().TYPED_LINK.value
    elif self.link_type == 'embedded':
        source_name = self.getEnumSourceNames().LINKED_DATA.value
    elif self.link_type == 'guessed':
        source_name = self.getEnumSourceNames().GUESSED_XML.value
    elif self.link_type == 'negotiated':
        source_name = self.getEnumSourceNames().XML_NEGOTIATED.value
    else:
        source_name = self.getEnumSourceNames().TYPED_LINK.value
    dc_core_metadata = None
    requestHelper = RequestHelper(self.target_url, self.logger)
    requestHelper.setAcceptType(AcceptTypes.xml)
    neg_source, xml_response = requestHelper.content_negotiate('FsF-F2-01M')
    if requestHelper.getHTTPResponse() is not None:
        self.logger.info(
            'FsF-F2-01M : Trying to extract/parse metadata from -: {}'.format(source_name))
        if neg_source != 'xml':
            self.logger.info(
                'FsF-F2-01M : Expected XML but content negotiation responded -: ' + str(neg_source))
        else:
            # keep CDATA sections intact when parsing
            parser = lxml.etree.XMLParser(strip_cdata=False)
            tree = lxml.etree.XML(xml_response, parser)
            root_element = tree.tag
            # unwrap known envelope formats to reach the embedded record
            if root_element.endswith('}OAI-PMH'):
                self.logger.info(
                    "FsF-F2-01M : Found OAI-PMH type XML envelope, unpacking 'metadata' element for further processing")
                metatree = tree.find('.//{*}metadata/*')
            elif root_element.endswith('}mets'):
                self.logger.info(
                    "FsF-F2-01M : Found METS type XML envelope, unpacking all 'mods' elements for further processing")
                envelope_metadata = self.get_mapped_xml_metadata(
                    tree, Mapper.XML_MAPPING_METS.value)
                metatree = tree.find('.//{*}dmdSec/{*}mdWrap/{*}xmlData/*')
            elif root_element.endswith('}GetRecordsResponse'):
                self.logger.info(
                    "FsF-F2-01M : Found OGC CSW GetRecords type XML envelope, unpacking 'SearchResults' element for further processing")
                metatree = tree.find('.//{*}SearchResults/*')
            elif root_element.endswith('}GetRecordByIdResponse'):
                self.logger.info(
                    'FsF-F2-01M : Found OGC CSW GetRecordByIdResponse type XML envelope, unpacking metadata element for further processing')
                metatree = tree.find('.//*')
            else:
                metatree = tree
            if metatree is not None:
                root_namespace = None
                nsmatch = re.match(r'^\{(.+)\}(.+)$', metatree.tag)
                schema_locations = set(
                    metatree.xpath("//*/@xsi:schemaLocation", namespaces={'xsi': XSI}))
                for schema_location in schema_locations:
                    # NOTE(review): overwrites self.namespaces each pass, keeping
                    # only the last schemaLocation — preserved as-is, confirm intent
                    self.namespaces = re.split(r'\s', schema_location)
                if nsmatch:
                    root_namespace = nsmatch[1]
                    root_element = nsmatch[2]
                    # FIX: removed leftover debug print of root element/namespace
                    self.namespaces.append(root_namespace)
                    # identify the metadata standard from the record root tag
                    if root_element == 'codeBook':
                        xml_mapping = Mapper.XML_MAPPING_DDI_CODEBOOK.value
                        self.logger.info(
                            'FsF-F2-01M : Identified DDI codeBook XML based on root tag')
                    elif root_element == 'dc':
                        xml_mapping = Mapper.XML_MAPPING_DUBLIN_CORE.value
                        self.logger.info(
                            'FsF-F2-01M : Identified Dublin Core XML based on root tag')
                    elif root_element == 'mods':
                        xml_mapping = Mapper.XML_MAPPING_MODS.value
                        self.logger.info(
                            'FsF-F2-01M : Identified MODS XML based on root tag')
                    elif root_element == 'eml':
                        xml_mapping = Mapper.XML_MAPPING_EML.value
                        self.logger.info(
                            'FsF-F2-01M : Identified EML XML based on root tag')
                    elif root_element == 'MD_Metadata':
                        xml_mapping = Mapper.XML_MAPPING_GCMD_ISO.value
                        self.logger.info(
                            'FsF-F2-01M : Identified ISO 19115 XML based on root tag')
                    elif root_namespace:
                        # fall back to namespace-based identification
                        if 'datacite.org/schema' in root_namespace:
                            xml_mapping = Mapper.XML_MAPPING_DATACITE.value
                            self.logger.info(
                                'FsF-F2-01M : Identified DataCite XML based on namespace')
    if xml_mapping and metatree is not None:
        xml_metadata = self.get_mapped_xml_metadata(metatree, xml_mapping)
    if envelope_metadata:
        # FIX: xml_metadata may still be None here (e.g. METS envelope with an
        # unrecognized inner record); start from an empty dict instead of
        # crashing on None[...] assignment
        if xml_metadata is None:
            xml_metadata = {}
        # envelope-level values only fill gaps, never override record values
        for envelope_key, envelope_values in envelope_metadata.items():
            if envelope_key not in xml_metadata:
                xml_metadata[envelope_key] = envelope_values
    return source_name, xml_metadata