def __call__(self, form, field, submit=False, fields=None):
    """Write the first detected identifier scheme into a sibling form field.

    The sibling field is looked up on ``form`` by the name stored in
    ``self.set_field``. It receives the first scheme that idutils detects
    for ``field.data``, or ``''`` when nothing is detected. When
    ``field.data`` is empty the sibling field is left untouched.
    """
    if not field.data:
        return
    detected = idutils.detect_identifier_schemes(field.data)
    target = getattr(form, self.set_field)
    target.data = detected[0] if detected else ''
def evaluate(self):
    """Assess metric FsF-F1-01D: is the object identifier globally unique?

    The identifier in ``self.fuji.id`` is matched against the schemes known
    to idutils. When at least one scheme matches, the criterion is scored
    as passed, the primary scheme is recorded on ``self.fuji.id_scheme``
    (and on ``self.fuji.pid_scheme`` when it is listed in
    ``Mapper.VALID_PIDS``) and the result object is populated. Otherwise
    only a warning is logged.
    """
    # ======= CHECK IDENTIFIER UNIQUENESS =======
    self.result = Uniqueness(id=self.fuji.count,
                             metric_identifier=self.metric_identifier,
                             metric_name=self.metric_name)
    self.output = UniquenessOutput()
    self.logger.info('FsF-F1-01D : Using idutils schemes')
    # Some schemes like PMID are generic, so more than one may match.
    found_ids = idutils.detect_identifier_schemes(self.fuji.id)
    if not found_ids:
        self.logger.warning(
            'FsF-F1-01D : Failed to check the identifier scheme!.')
        return

    self.logger.log(
        self.fuji.LOG_SUCCESS,
        'FsF-F1-01D : Unique identifier schemes found {}'.format(found_ids))
    self.setEvaluationCriteriumScore('FsF-F1-01D-1', 1, 'pass')
    self.output.guid = self.fuji.id
    self.score.earned = self.total_score

    # Identify the main scheme: 'url' is the least specific match, so it is
    # dropped whenever something else was detected, e.g. ['doi', 'url'].
    if len(found_ids) > 1 and 'url' in found_ids:
        found_ids.remove('url')
    # TODO: only the first element is used, e.g. for ['doi', 'handle'].
    found_id = found_ids[0]
    # Always record the primary scheme so that no stale value survives when
    # several non-url schemes were detected.
    self.fuji.id_scheme = found_id
    if found_id in Mapper.VALID_PIDS.value:
        self.fuji.pid_scheme = found_id
    self.logger.info(
        'FsF-F1-01D : Finalized unique identifier scheme - {}'.format(found_id))
    self.output.guid_scheme = found_id
    self.result.test_status = 'pass'
    self.result.score = self.score
    self.result.metric_tests = self.metric_tests
    self.result.output = self.output
def validate_gnd(form, field):
    """Reject a non-empty field value that idutils does not detect as GND.

    Empty values are accepted without any check.

    Raises:
        ValidationError: if ``'gnd'`` is not among the detected schemes.
    """
    value = field.data
    if not value:
        return
    detected = idutils.detect_identifier_schemes(value)
    if 'gnd' in detected:
        return
    raise ValidationError("Not a valid GND-identifier.")
def validate_identifier(self, data, **kwargs):
    """Check an identifier/scheme pair against the schema's scheme rules.

    Raises:
        ValidationError: when the identifier is required but missing, when
            an identifier has no scheme, when the scheme is listed in
            ``self.forbidden_schemes``, when ``self.allowed_schemes`` is
            non-empty and does not contain the scheme, or when idutils does
            not detect the scheme and ``self.fail_on_unknown`` is set.
    """
    identifier = data.get("identifier")
    scheme = data.get("scheme")

    if not identifier:
        if self.identifier_required:
            raise ValidationError("Missing required identifier.")
        return

    if not scheme:
        raise ValidationError(
            f"Missing scheme value for identifier {identifier}."
        )

    # From here on both the identifier and its scheme are present.
    detected = idutils.detect_identifier_schemes(identifier)

    if scheme in self.forbidden_schemes:
        raise ValidationError(f"Invalid scheme {scheme}.")

    if self.allowed_schemes and scheme not in self.allowed_schemes:
        raise ValidationError(f"Invalid scheme {scheme}.")

    if scheme not in detected and self.fail_on_unknown:
        raise ValidationError(f"Invalid scheme {scheme}.")
def check_scheme(self, data):
    """Ensure the declared scheme is one that idutils detects for the value.

    The comparison uses the lower-cased scheme. A value for which nothing
    is detected is accepted as-is.

    Raises:
        ValidationError: if schemes were detected and the declared one is
            not among them.
    """
    value = data['value']
    declared = data['scheme']
    normalized = declared.lower()
    detected = idutils.detect_identifier_schemes(value)
    if not detected or normalized in detected:
        return
    raise ValidationError("Invalid scheme '{}'".format(declared),
                          'IDScheme')
def test_valueerror():
    """Validators must reject nonsense of every length without raising.

    Many validators first check the identifier's length and only then look
    at its content. Feeding strings of ``"a"`` for every length from 0 to
    19 covers the case where the length matches but the content is not a
    valid persistent identifier.
    """
    for length in range(20):
        candidate = "a" * length
        detected = idutils.detect_identifier_schemes(candidate)
        assert detected == []
def detect_scheme(self, data):
    """Fill in a missing ``scheme`` from the identifier, when detectable.

    ``data`` is modified in place and returned. An existing scheme is never
    overwritten, and nothing is added when no scheme is detected.
    """
    identifier = data.get('identifier')
    if data.get('scheme') or not identifier:
        return data
    detected = idutils.detect_identifier_schemes(identifier)
    if detected:
        data['scheme'] = detected[0]
    return data
def validate_scheme(form, field):
    """Overwrite ``field.data`` with the scheme detected for the identifier.

    The identifier is read from ``form.data['identifier']``. The field gets
    the first detected scheme, or ``''`` when none is detected.
    """
    identifier = form.data.get('identifier') or ''
    detected = idutils.detect_identifier_schemes(identifier)
    field.data = detected[0] if detected else ''
def pid_url(identifier, scheme=None):
    """Convert a persistent identifier into a link.

    Args:
        identifier: The identifier value. ``None`` and ``""`` yield ``""``.
        scheme: Identifier scheme. When ``None`` the first scheme detected
            by idutils is used.

    Returns:
        The URL produced by ``idutils.to_url``, or ``""`` when the
        identifier is empty or no scheme is available.
    """
    # Nothing to link to; also avoids running detection on None.
    if not identifier:
        return ""
    if scheme is None:
        detected = idutils.detect_identifier_schemes(identifier)
        scheme = detected[0] if detected else None
    if not scheme:
        return ""
    return idutils.to_url(identifier, scheme)
def metadata_command(identifiers: List[str], eager: bool = False):
    """Harvest metadata."""
    # Pair every identifier with the first scheme idutils detects for it.
    # Identifiers without any detectable scheme are collected and reported
    # together instead of failing with a bare IndexError on the first one.
    with_scheme = []
    undetected = []
    for value in identifiers:
        schemes = idutils.detect_identifier_schemes(value)
        if schemes:
            with_scheme.append((value, schemes[0]))
        else:
            undetected.append(value)
    if undetected:
        raise ValueError(
            "Could not detect an identifier scheme for: "
            + ", ".join(repr(value) for value in undetected)
        )

    task = harvest_metadata.s(with_scheme, eager=eager)
    if eager:
        # Run in-process and re-raise task errors.
        task.apply(throw=True)
    else:
        task.apply_async()
def _detect_scheme(self, identifier):
    """Pick the scheme to use for ``identifier``.

    With ``self.allow_all`` set, the first detected scheme wins. Otherwise
    the first detected scheme that is also in ``self.allowed_schemes`` is
    returned. ``None`` is returned when nothing qualifies.
    """
    candidates = idutils.detect_identifier_schemes(identifier)
    if self.allow_all:
        return candidates[0] if candidates else None
    return next(
        (name for name in candidates if name in self.allowed_schemes),
        None,
    )
def _deserialize(self, value, attr, data):
    """Deserialize and validate a persistent identifier value.

    The parent field deserializes the value first, then surrounding
    whitespace is stripped. ``self.fail`` is called with
    ``'invalid_scheme'`` when a scheme is configured and not detected, and
    with ``'invalid_pid'`` when no scheme is detected at all. The value is
    normalized with the first detected scheme when ``self.normalize`` is
    set; otherwise the stripped value is returned.
    """
    parent_value = super(PersistentId, self)._deserialize(value, attr, data)
    pid = parent_value.strip()
    detected = idutils.detect_identifier_schemes(pid)

    if self.scheme and self.scheme.lower() not in detected:
        self.fail('invalid_scheme', scheme=self.scheme)
    if not detected:
        self.fail('invalid_pid')

    if self.normalize:
        return idutils.normalize_pid(pid, detected[0])
    return pid
def _extract_identifiers(self, data):
    """Collect the distinct (identifier, scheme) pairs found in ``data``.

    Sources, all optional:
      * ``data['bibcode']`` is paired with the scheme ``'ads'``;
      * every non-empty entry of ``data['doi']`` is paired with ``'doi'``;
      * every entry of ``data['identifier']`` is paired with the first
        scheme idutils detects for it. Entries without a detectable scheme
        are skipped.

    Returns:
        A list of ``{'ID': ..., 'IDScheme': ...}`` dicts without
        duplicates. The order is not defined because a set is used for
        de-duplication.
    """
    pairs = set()

    bibcode = data.get('bibcode')
    if bibcode:
        pairs.add((bibcode, 'ads'))

    # ``or []`` also covers keys that are present but set to None.
    pairs.update((doi, 'doi') for doi in (data.get('doi') or []) if doi)

    for value in data.get('identifier') or []:
        try:
            schemes = idutils.detect_identifier_schemes(value)
            if schemes:
                pairs.add((value, schemes[0]))
        except Exception:
            # Deliberately best effort: a malformed entry must not prevent
            # the remaining identifiers from being extracted.
            continue

    return [{'ID': i, 'IDScheme': s} for i, s in pairs if i and s]
def __init__(self, idstring):
    """Analyse ``idstring`` and record its scheme, URL and normalized form.

    Only non-empty strings longer than four characters that are not purely
    numeric are analysed; anything else just has ``identifier`` and
    ``normalized_id`` set to the input.

    Detection is attempted with idutils first. If idutils finds nothing,
    the identifier is matched against ``self.IDENTIFIERS_ORG_DATA`` as a
    ``prefix:suffix`` pair. ``is_persistent`` is set to ``True`` when the
    preferred scheme is listed in ``Mapper.VALID_PIDS`` or is an
    identifiers.org prefix.
    """
    self.identifier = idstring
    self.normalized_id = self.identifier
    if not (self.identifier and isinstance(self.identifier, str)):
        return
    if len(self.identifier) <= 4 or self.identifier.isnumeric():
        return

    # Workaround to recognize https purls: they are rewritten to http
    # before detection.
    if 'https://purl.' in self.identifier:
        self.identifier = self.identifier.replace('https:', 'http:')

    # idutils check
    self.identifier_schemes = idutils.detect_identifier_schemes(
        self.identifier)

    if self.identifier_schemes:
        # Preferred schema: 'url' is dropped when a more specific scheme
        # was detected as well, e.g. ['doi', 'url'].
        if len(self.identifier_schemes) > 1 and 'url' in self.identifier_schemes:
            self.identifier_schemes.remove('url')
        self.preferred_schema = self.identifier_schemes[0]
        self.normalized_id = idutils.normalize_pid(self.identifier,
                                                   self.preferred_schema)
        self.identifier_url = idutils.to_url(self.identifier,
                                             self.preferred_schema)
    else:
        # identifiers.org check
        self.method = 'identifiers.org'
        # Raw string: '\.' is an invalid escape in a normal string literal.
        idmatch = re.search(r'^([a-z0-9\._]+):(.+)', self.identifier)
        if idmatch:
            found_prefix = idmatch[1]
            found_suffix = idmatch[2]
            if found_prefix in self.IDENTIFIERS_ORG_DATA:
                registry_entry = self.IDENTIFIERS_ORG_DATA[found_prefix]
                if re.search(registry_entry['pattern'], found_suffix):
                    self.identifier_schemes = [found_prefix, 'identifiers_org']
                    self.preferred_schema = found_prefix
                    self.identifier_url = str(
                        registry_entry['url_pattern']).replace(
                            '{$id}', found_suffix)
                    self.normalized_id = (found_prefix.lower() + ':'
                                          + found_suffix)

    if (self.preferred_schema in Mapper.VALID_PIDS.value
            or self.preferred_schema in self.IDENTIFIERS_ORG_DATA):
        self.is_persistent = True
def __call__(self, form, field, submit=False, fields=None):
    """Normalize ``field.data`` in place according to an identifier scheme.

    The scheme is taken, in order of precedence, from the form field named
    by ``self.scheme_field``, from ``self.scheme``, or from idutils
    detection on the field value. Without a scheme or without a value the
    field is left unchanged.
    """
    if self.scheme_field:
        scheme = getattr(form, self.scheme_field).data
    elif self.scheme:
        scheme = self.scheme
    else:
        detected = idutils.detect_identifier_schemes(field.data)
        scheme = detected[0] if detected else None

    if scheme and field.data:
        field.data = idutils.normalize_pid(field.data, scheme=scheme)
def pid_url(identifier, scheme=None, url_scheme="https"):
    """Convert a persistent identifier into a link.

    Args:
        identifier: The identifier value. ``None`` and ``""`` yield ``""``.
        scheme: Identifier scheme. When ``None`` the first scheme detected
            by idutils is used.
        url_scheme: Passed through to ``idutils.to_url``.

    Returns:
        The generated URL, or ``""`` when the identifier is empty, no
        scheme is available, or URL generation raises (the failure is
        logged as a warning on the application logger).
    """
    # Nothing to link to; also avoids running detection on None.
    if not identifier:
        return ""
    if scheme is None:
        detected = idutils.detect_identifier_schemes(identifier)
        scheme = detected[0] if detected else None
    if not scheme:
        return ""
    try:
        return idutils.to_url(identifier, scheme, url_scheme=url_scheme)
    except Exception:
        # Best effort: a broken identifier must not break the caller.
        current_app.logger.warning("URL generation for identifier {0} failed.".format(identifier), exc_info=True)
    return ""
def _detect_scheme(self, identifier):
    """Detect and return the scheme to use for ``identifier``.

    When ``self.allowed_schemes`` is non-empty, the first detected scheme
    that is also allowed takes precedence. Otherwise, or when no detected
    scheme is allowed, the first detected scheme is returned. ``None`` is
    returned when nothing is detected.
    """
    candidates = idutils.detect_identifier_schemes(identifier)
    allowed = self.allowed_schemes
    if allowed:
        permitted = [name for name in candidates if name in allowed]
        if permitted:
            return permitted[0]
    if candidates:
        return candidates[0]
    return None
def pid_url(identifier, scheme=None, url_scheme='https'):
    """Convert a persistent identifier into a link.

    Args:
        identifier: The identifier value. ``None`` and ``''`` yield ``''``.
        scheme: Identifier scheme. When ``None`` the first scheme detected
            by idutils is used.
        url_scheme: Passed through to ``idutils.to_url``.

    Returns:
        The generated URL, or ``''`` when the identifier is empty, no
        scheme is available, or URL generation raises (the failure is
        logged as a warning on the application logger).
    """
    # Nothing to link to; also avoids running detection on None.
    if not identifier:
        return ''
    if scheme is None:
        detected = idutils.detect_identifier_schemes(identifier)
        scheme = detected[0] if detected else None
    if not scheme:
        return ''
    try:
        return idutils.to_url(identifier, scheme, url_scheme=url_scheme)
    except Exception:
        # Best effort: a broken identifier must not break the caller.
        current_app.logger.warning('URL generation for identifier {0} failed.'
                                   .format(identifier), exc_info=True)
    return ''
def validate_data(self, data):
    """Validate that the identifier exists and matches its declared scheme.

    Raises:
        ValidationError: on the ``identifier`` field when the identifier is
            missing, when idutils detects no scheme for it, or when the
            declared scheme is not among the detected ones.
    """
    identifier = data.get('identifier')
    declared = data.get('scheme')

    if not identifier:
        raise ValidationError('Identifier is required.',
                              field_names=['identifier'])

    detected = idutils.detect_identifier_schemes(identifier)
    if not detected:
        message = 'Not a valid persistent identifier.'
    elif declared not in detected:
        message = 'Not a valid {0} identifier.'.format(declared)
    else:
        return
    raise ValidationError(message, field_names=['identifier'])
def check_scheme(self, data, **kwargs):
    """Validate the provided identifier scheme.

    Only one check is currently active: a value declared with the ``url``
    scheme that contains ``github`` must be parseable by
    ``GithubUtility.parse_url_info``.

    Raises:
        ValidationError: if the GitHub URL cannot be parsed. The original
            parsing error is chained as the cause.
    """
    value = data['value']
    scheme = data['scheme'].lower()

    # TODO: re-enable the generic "scheme must be among the schemes idutils
    # detects" check once collisions are resolved, e.g. the "pmid" scheme
    # with value '11781516' collides with ean8.

    # Check for valid github url
    if scheme == 'url' and 'github' in value:
        try:
            GithubUtility.parse_url_info(value)
        except Exception as err:
            # Not a bare ``except`` so that KeyboardInterrupt and
            # SystemExit still propagate.
            raise ValidationError(
                "Invalid github repo or release '{}'".format(value)
            ) from err
def check_identifiers(self):
    """Run the FsF-F1-01D (unique id) and FsF-F1-02D (persistent id) checks.

    ``self.uid`` is matched against the known identifier schemes. If any
    scheme matches, the uniqueness check passes. If one of the matching
    schemes is listed in ``self.validpids``, the persistence check passes
    too. The identifier is then resolved with an HTTP GET; on a 200
    response the landing URL and HTML are stored on the instance.

    Both result dicts are always appended to ``self.results``. Failures are
    appended to ``self.error`` instead of being raised.
    """
    # NOTE(review): ``id`` is a name bound elsewhere in this file and it
    # shadows the builtin; presumably ``import idutils as id`` - confirm.
    uuidresult = {'id': 1, 'metric_id': 'FsF-F1-01D', 'passed': False}
    pidresult = {'id': 2, 'metric_id': 'FsF-F1-02D', 'passed': False}
    try:
        # Try to find an identifier schema for the given string.
        foundpids = id.detect_identifier_schemes(self.uid)
        if foundpids:
            # A schema was found, so this is an id known to idutils.
            uuidresult['passed'] = True
            uuidresult['output'] = {
                'uuid': self.uid,
                'uuid_schema': foundpids
            }
            # If the schema is in our valid pid list it is also a pid.
            realpids = [
                value for value in foundpids if value in self.validpids
            ]
            if realpids:
                pidresult['passed'] = True

            if foundpids[0] == 'url':
                self.pid_url = self.uid
            elif realpids:
                # Find an actionable representation of the pid (URL). The
                # identifier is ``self.uid``; the previous code referenced
                # an undefined name here.
                self.pid_url = id.to_url(self.uid, scheme=realpids[0])
            else:
                # Reported through the handler below instead of failing
                # with an opaque IndexError.
                raise ValueError(
                    ' - no supported PID scheme among {}'.format(foundpids))

            # Now we try to perform a HTTP GET request.
            r = requests.get(self.pid_url)
            if r.status_code == 200:
                if realpids:
                    self.pid = id.normalize_pid(self.uid,
                                                scheme=realpids[0])
                self.landing_url = r.url
                self.landing_html = r.text
                pidresult['output'] = {
                    'pid': self.pid,
                    'resolved_url': self.landing_url,
                    'pid_schema': realpids
                }
            else:
                self.error.append('FsF-F1: HTTP Error: ' +
                                  str(r.status_code))
    except Exception as err:
        # ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        self.error.append('FsF-F1: Failed to check the given identifier' +
                          str(err))

    self.results.append(uuidresult)
    self.results.append(pidresult)
def __init__(self, idstring):
    """Analyse ``idstring`` and record its scheme, URL and normalized form.

    Only non-empty strings longer than four characters that are not purely
    numeric are analysed; anything else (including ``None``) just has
    ``identifier`` and ``normalized_id`` set to the input.

    Detection is attempted with idutils first. If idutils finds nothing,
    the identifier is matched against ``self.IDENTIFIERS_ORG_DATA`` as a
    ``prefix:suffix`` pair. ``is_persistent`` is set to ``True`` when the
    preferred scheme is listed in ``Mapper.VALID_PIDS`` or is an
    identifiers.org prefix.
    """
    self.identifier = idstring
    self.normalized_id = self.identifier
    # Guard against None and non-string input, which would otherwise crash
    # in len() / isnumeric().
    if not (self.identifier and isinstance(self.identifier, str)):
        return
    if len(self.identifier) <= 4 or self.identifier.isnumeric():
        return

    # idutils check
    self.identifier_schemes = idutils.detect_identifier_schemes(
        self.identifier)

    if self.identifier_schemes:
        # Preferred schema: 'url' is dropped when a more specific scheme
        # was detected as well, e.g. ['doi', 'url'].
        if len(self.identifier_schemes) > 1 and 'url' in self.identifier_schemes:
            self.identifier_schemes.remove('url')
        self.preferred_schema = self.identifier_schemes[0]
        self.normalized_id = idutils.normalize_pid(self.identifier,
                                                   self.preferred_schema)
        self.identifier_url = idutils.to_url(self.identifier,
                                             self.preferred_schema)
    else:
        # identifiers.org check
        self.method = 'identifiers.org'
        # Raw string: '\.' is an invalid escape in a normal string literal.
        idmatch = re.search(r'^([a-z0-9\._]+):(.+)', self.identifier)
        if idmatch:
            found_prefix = idmatch[1]
            found_suffix = idmatch[2]
            if found_prefix in self.IDENTIFIERS_ORG_DATA:
                registry_entry = self.IDENTIFIERS_ORG_DATA[found_prefix]
                if re.search(registry_entry['pattern'], found_suffix):
                    self.identifier_schemes = [found_prefix, 'identifiers_org']
                    self.preferred_schema = found_prefix
                    self.identifier_url = str(
                        registry_entry['url_pattern']).replace(
                            '{$id}', found_suffix)
                    self.normalized_id = (found_prefix.lower() + ':'
                                          + found_suffix)

    if (self.preferred_schema in Mapper.VALID_PIDS.value
            or self.preferred_schema in self.IDENTIFIERS_ORG_DATA):
        self.is_persistent = True
def validate_data(self, data):
    """Check that an identifier is present and fits the declared scheme.

    Raises:
        ValidationError: attached to the ``identifier`` field when the
            identifier is missing, when no scheme can be detected for it,
            or when the declared scheme is not one of the detected schemes.
    """
    def reject(message):
        # Every failure is reported against the same field.
        raise ValidationError(message, field_names=['identifier'])

    identifier = data.get('identifier')
    declared_scheme = data.get('scheme')

    if not identifier:
        reject('Identifier is required.')

    detected = idutils.detect_identifier_schemes(identifier)
    if not detected:
        reject('Not a valid persistent identifier.')
    if declared_scheme not in detected:
        reject('Not a valid {0} identifier.'.format(declared_scheme))
def load_scheme(self, data, **kwargs):
    """Resolve the ``scheme`` entry of ``data`` for its identifier.

    Without an identifier ``data`` is returned untouched. A given scheme is
    lower-cased and used as the only candidate; otherwise the candidates
    come from idutils detection. ``self._intersect_with_order`` then picks
    the scheme to store. When it picks nothing, the ``scheme`` key is
    removed so that validation can report the problem later.
    """
    identifier = data.get("identifier")
    if not identifier:
        return data

    given = data.get("scheme")
    if given:
        # An explicitly supplied scheme always wins over detection.
        candidates = [given.lower()]
    else:
        candidates = idutils.detect_identifier_schemes(identifier)

    # Keep the given or detected scheme only if it is allowed.
    data["scheme"] = self._intersect_with_order(candidates)
    if not data["scheme"]:
        # No match between candidates and allowed schemes.
        data.pop("scheme", None)
    return data
def validate_identifier(self, data, **kwargs):
    """Validate the identifier format and scheme.

    Raises:
        ValidationError: when a required identifier is missing, when an
            identifier comes without a scheme, when the scheme is not among
            those idutils detects for the identifier, or when the scheme is
            not allowed by ``self.allow_all`` / ``self.allowed_schemes``.
    """
    identifier = data.get("identifier")
    scheme = data.get("scheme")

    if not identifier:
        # Only an error if the identifier is required.
        if self.required:
            raise ValidationError("Missing required identifier.")
        return

    detected = idutils.detect_identifier_schemes(identifier)

    # By now a scheme must have been provided or detected.
    if not scheme:
        raise ValidationError("Missing required scheme.")

    # The identifier must be valid according to its scheme.
    if scheme not in detected:
        raise ValidationError("Invalid identifier format or scheme.")

    # The scheme itself must be permitted.
    if not (self.allow_all or scheme in self.allowed_schemes):
        raise ValidationError("Scheme not allowed. Must be "
                              f"one of {self.allowed_schemes}.")
def test_detect_schemes():
    """Detected schemes must equal the expected list for every fixture."""
    for value, expected_schemes, _normalized, _url in identifiers:
        assert idutils.detect_identifier_schemes(value) == expected_schemes
def check_scheme(self, data):
    """Validate the provided identifier scheme.

    Reads ``data['value']`` and the lower-cased ``data['scheme']`` and asks
    idutils which schemes match the value.

    NOTE(review): as written, neither ``scheme`` nor ``schemes`` is used
    after being computed, so nothing is validated and ``None`` is always
    returned. The body looks truncated - confirm against the full file.
    """
    value = data['value']
    scheme = data['scheme'].lower()
    schemes = idutils.detect_identifier_schemes(value)
def __call__(self, form, field):
    """Require that idutils detects at least one scheme for the field value.

    Raises:
        ValidationError: with ``self.message`` when no scheme is detected.
    """
    if idutils.detect_identifier_schemes(field.data):
        return
    raise ValidationError(self.message)
# Collect the existing README text that precedes the table: reading stops at
# the first line that starts with "|".
with open("README.md", "r") as infile:
    for line in infile:
        if line.startswith("|"):  # start of table
            break
        intro.append(line)

# generate the new README
records = list()
with open("maverefs.tsv", mode="r", newline="") as infile, \
        open("README.md", mode="w", newline="") as outfile:
    # rewrite the introductory text
    print(*intro, sep="", end="", file=outfile)

    # regenerate the table line-by-line; every row is written as
    # "|cell|cell|...|" by prefixing "|" and terminating with "|\n"
    reader = csv.DictReader(infile, delimiter="\t")
    columns = reader.fieldnames
    writer = csv.DictWriter(outfile,
                            fieldnames=columns,
                            delimiter="|",
                            lineterminator="|\n")
    outfile.write("|")
    writer.writeheader()
    outfile.write("|---" * len(columns) + "|\n")

    # columns that become links when their value passes the idutils check
    linked_columns = (
        ("PMID", idutils.is_pmid, "pmid"),
        ("DOI", idutils.is_doi, "doi"),
    )
    for row in reader:
        for column, is_valid, scheme in linked_columns:
            if is_valid(row[column]):
                row[column] = format_md_link(row[column], scheme)

        # the raw data is linked only when exactly one scheme matches
        raw_data_schemes = idutils.detect_identifier_schemes(row["Raw Data"])
        if len(raw_data_schemes) == 1:
            row["Raw Data"] = format_md_link(row["Raw Data"],
                                             raw_data_schemes[0])

        outfile.write("|")
        writer.writerow(row)
def test_idempotence():
    """A normalized identifier must still be detected as its own scheme.

    Each fixture value is normalized with its first expected scheme; that
    scheme must be among the schemes detected for the normalized value.
    """
    for value, expected_schemes, _normalized, _url in identifiers:
        primary = expected_schemes[0]
        normalized_once = idutils.normalize_pid(value, primary)
        redetected = idutils.detect_identifier_schemes(normalized_once)
        assert primary in redetected
def _parse_link_header(self, header_link_string):
    """Split an HTTP ``Link`` header value into signposting link dicts.

    Each returned dict has the keys ``url`` (the target with its first and
    last character, i.e. the angle brackets, removed), ``type`` and
    ``rel``. The latter two are ``None`` when the parameter is absent.
    Links are separated on ``,`` and parameters on ``;``.
    """
    links = []
    for preparsed_link in header_link_string.split(','):
        parsed_link = preparsed_link.strip().split(';')
        found_link = parsed_link[0].strip()
        if not found_link:
            continue
        found_type, found_rel = None, None
        for link_prop in parsed_link[1:]:
            # Parameters usually follow "; " - strip before matching,
            # otherwise the leading blank defeats startswith().
            link_prop = str(link_prop).strip()
            if link_prop.startswith('rel="'):
                rel_match = re.search('rel="(.*?)"', link_prop)
                if rel_match:
                    found_rel = rel_match[1]
            elif link_prop.startswith('type="'):
                type_match = re.search('type="(.*?)"', link_prop)
                if type_match:
                    found_type = type_match[1]
        links.append({
            'url': found_link[1:-1],
            'type': found_type,
            'rel': found_rel
        })
    return links


def evaluate(self):
    """Assess metric FsF-F1-02D: is the identifier persistent and resolvable?

    The identifier is turned into a URL (via its PID scheme, or used as-is
    when its scheme is ``url``) and requested. On a 200 response the
    signposting links of the ``Link`` header are collected, a ``cite-as``
    link may supply a PID scheme if none is known yet, and the landing page
    details are stored on ``self.fuji``. The metric passes when a PID
    scheme is known; the full score additionally requires that the
    metadata was accessible.
    """
    self.result = Persistence(id=self.fuji.count,
                              metric_identifier=self.metric_identifier,
                              metric_name=self.metric_name)
    self.output = PersistenceOutput()
    # ======= CHECK IDENTIFIER PERSISTENCE =======
    self.logger.info(
        'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
        .format(Mapper.VALID_PIDS.value))
    # Stays None when the identifier is neither a known PID nor a URL, so
    # that case is handled as "no response" instead of a NameError.
    check_url = None
    if self.fuji.pid_scheme is not None:
        check_url = idutils.to_url(self.fuji.id, scheme=self.fuji.pid_scheme)
    elif self.fuji.id_scheme == 'url':
        check_url = self.fuji.id

    # ======= RETRIEVE METADATA FROM LANDING PAGE =======
    r = None
    signposting_pid = None
    if check_url is not None:
        requestHelper = RequestHelper(check_url, self.logger)
        requestHelper.setAcceptType(AcceptTypes.html)  # request
        neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
            'FsF-F1-02D')
        r = requestHelper.getHTTPResponse()

    if r:
        self.fuji.landing_url = requestHelper.redirect_url
        # The response exposes ``status`` and ``getheader``; ``status`` is
        # used consistently below.
        if r.status == 200:
            # identify signposting links in header
            header_link_string = requestHelper.getHTTPResponse().getheader(
                'Link')
            if header_link_string is not None:
                self.logger.info(
                    'FsF-F1-02D : Found signposting links in response header of landingpage'
                )
                for signposting_link in self._parse_link_header(
                        header_link_string):
                    self.fuji.signposting_header_links.append(
                        signposting_link)

            # check if there is a cite-as signposting link
            if self.fuji.pid_scheme is None:
                signposting_pid_link = self.fuji.get_signposting_links(
                    'cite-as')
                if signposting_pid_link:
                    signposting_pid = signposting_pid_link[0].get('url')
                    # The links collected above store 'url' as a string;
                    # a list is accepted too since the helper is opaque.
                    if isinstance(signposting_pid, (list, tuple)):
                        signposting_pid = (signposting_pid[0]
                                           if signposting_pid else None)
                if signposting_pid:
                    found_ids = idutils.detect_identifier_schemes(
                        signposting_pid)
                    if len(found_ids) > 1 and 'url' in found_ids:
                        found_ids.remove('url')
                    if found_ids and found_ids[0] in Mapper.VALID_PIDS.value:
                        self.logger.info(
                            'FsF-F1-02D : Found object identifier in signposting header links'
                        )
                        self.fuji.pid_scheme = found_ids[0]

            up = urlparse(self.fuji.landing_url)
            self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                uri=up)
            self.fuji.landing_html = requestHelper.getResponseContent()
            # url is active, although the identifier may not be based on a
            # pid scheme
            self.output.resolved_url = self.fuji.landing_url
            self.output.resolvable_status = True
            self.logger.info(
                'FsF-F1-02D : Object identifier active (status code = 200)'
            )
            self.fuji.isMetadataAccessible = True
        else:
            # Covers 401/402/403 as well as every other non-200 status.
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "Resource inaccessible, identifier returned http status code: {code}"
                .format(code=r.status))
    else:
        self.fuji.isMetadataAccessible = False
        self.logger.warning(
            "FsF-F1-02D :Resource inaccessible, no response received from: {}"
            .format(check_url))

    if self.fuji.pid_scheme is not None:
        if signposting_pid is None:
            self.fuji.pid_url = idutils.to_url(self.fuji.id,
                                               scheme=self.fuji.pid_scheme)
        else:
            self.fuji.pid_url = signposting_pid
        self.output.pid_scheme = self.fuji.pid_scheme
        self.result.test_status = 'pass'
        self.output.pid = self.fuji.pid_url
        self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0, 'pass')
        if self.fuji.isMetadataAccessible:
            self.setEvaluationCriteriumScore('FsF-F1-02D-2', 1, 'pass')
            # The identifier should be based on a persistence scheme and
            # be resolvable.
            self.score.earned = self.total_score
        self.logger.log(
            self.fuji.LOG_SUCCESS,
            'FsF-F1-02D : Persistence identifier scheme - {}'.format(
                self.fuji.pid_scheme))
    else:
        self.score.earned = 0
        self.logger.warning(
            'FsF-F1-02D : Not a persistent identifier scheme - {}'.format(
                self.fuji.id_scheme))

    self.result.score = self.score
    self.result.metric_tests = self.metric_tests
    self.result.output = self.output
def _merge_reference_elements(self, collected):
    """Move still-missing reference elements from ``collected`` into the merge.

    Every key of ``collected`` that is still listed in
    ``self.reference_elements`` is copied to ``self.metadata_merged`` and
    removed from ``self.reference_elements``, so earlier sources take
    precedence over later ones.
    """
    for key in collected:
        if key in self.reference_elements:
            self.metadata_merged[key] = collected[key]
            self.reference_elements.remove(key)


def retrieve_metadata_embedded(self, extruct_metadata):
    """Collect metadata embedded in the landing page and merge it.

    Sources are processed in this order: microdata, RDFa, Schema.org
    (JSON-LD), Dublin Core (only while reference elements are still
    missing) and OpenGraph. Afterwards data links from signposting headers
    and typed HTML links are used as ``object_content_identifier`` if none
    was found. Finally, if the merged metadata contains an object
    identifier with a valid PID scheme and no PID scheme is known yet, the
    instance is flagged so that the PID check is repeated.

    Args:
        extruct_metadata: mapping with the optional keys ``microdata``,
            ``json-ld`` and ``opengraph``.
    """
    # ========= retrieve embedded rdfa and microdata metadata ========
    micro_meta = extruct_metadata.get('microdata')
    microdata_collector = MetaDataCollectorMicroData(
        loggerinst=self.logger,
        sourcemetadata=micro_meta,
        mapping=Mapper.MICRODATA_MAPPING)
    source_micro, micro_dict = microdata_collector.parse_metadata()
    if micro_dict:
        self.metadata_sources.append((source_micro, 'embedded'))
        self.namespace_uri.extend(microdata_collector.getNamespaces())
        micro_dict = self.exclude_null(micro_dict)
        self._merge_reference_elements(micro_dict)
        self.logger.log(
            self.LOG_SUCCESS, 'FsF-F2-01M : Found microdata metadata: ' +
            str(micro_dict.keys()))

    # ================== RDFa
    rdfasource = MetaDataCollector.Sources.RDFA.value
    try:
        rdfagraph = rdflib.Graph()
        rdfagraph.parse(data=self.landing_html, format='rdfa')
        rdfa_collector = MetaDataCollectorRdf(loggerinst=self.logger,
                                              target_url=self.landing_url,
                                              source=rdfasource,
                                              rdf_graph=rdfagraph)
        source_rdfa, rdfa_dict = rdfa_collector.parse_metadata()
        self.metadata_sources.append((rdfasource, 'embedded'))
        self.namespace_uri.extend(rdfa_collector.getNamespaces())
        rdfa_dict = self.exclude_null(rdfa_dict)
        self._merge_reference_elements(rdfa_dict)
        self.logger.log(
            self.LOG_SUCCESS,
            'FsF-F2-01M : Found RDFa metadata: ' + str(rdfa_dict.keys()))
    except Exception:
        # Best effort: RDFa is optional and parsing failures are expected.
        # ``Exception`` instead of a bare ``except`` keeps
        # KeyboardInterrupt / SystemExit propagating.
        self.logger.info(
            'FsF-F2-01M : RDFa metadata parsing exception, probably no RDFa embedded in HTML'
        )

    # ========= retrieve schema.org (embedded, or from via content-negotiation if pid provided) =========
    ext_meta = extruct_metadata.get('json-ld')
    target_url = self.pid_url if self.use_datacite is True else self.landing_url
    schemaorg_collector = MetaDataCollectorSchemaOrg(
        loggerinst=self.logger,
        sourcemetadata=ext_meta,
        mapping=Mapper.SCHEMAORG_MAPPING,
        pidurl=target_url)
    source_schemaorg, schemaorg_dict = schemaorg_collector.parse_metadata()
    schemaorg_dict = self.exclude_null(schemaorg_dict)
    if schemaorg_dict:
        self.namespace_uri.extend(schemaorg_collector.namespaces)
        if source_schemaorg == MetaDataCollector.Sources.SCHEMAORG_EMBED.value:
            self.metadata_sources.append((source_schemaorg, 'embedded'))
        else:
            self.metadata_sources.append((source_schemaorg, 'negotiated'))
        if schemaorg_dict.get('related_resources'):
            self.related_resources.extend(
                schemaorg_dict.get('related_resources'))
        if schemaorg_dict.get('object_content_identifier'):
            self.logger.info(
                'FsF-F3-01M : Found data links in Schema.org metadata : ' +
                str(schemaorg_dict.get('object_content_identifier')))
        # add object type for future reference
        self._merge_reference_elements(schemaorg_dict)
        self.logger.log(
            self.LOG_SUCCESS, 'FsF-F2-01M : Found Schema.org metadata: ' +
            str(schemaorg_dict.keys()))
    else:
        self.logger.info('FsF-F2-01M : Schema.org metadata UNAVAILABLE')

    # ========= retrieve dublin core embedded in html page =========
    if self.reference_elements:
        self.logger.info('FsF-F2-01M : Checking for DublinCore metadata')
        dc_collector = MetaDataCollectorDublinCore(
            loggerinst=self.logger,
            sourcemetadata=self.landing_html,
            mapping=Mapper.DC_MAPPING)
        source_dc, dc_dict = dc_collector.parse_metadata()
        dc_dict = self.exclude_null(dc_dict)
        if dc_dict:
            self.namespace_uri.extend(dc_collector.namespaces)
            self.metadata_sources.append((source_dc, 'embedded'))
            if dc_dict.get('related_resources'):
                self.related_resources.extend(
                    dc_dict.get('related_resources'))
            self._merge_reference_elements(dc_dict)
            self.logger.log(
                self.LOG_SUCCESS,
                'FsF-F2-01M : Found DublinCore metadata: ' +
                str(dc_dict.keys()))
        else:
            self.logger.info(
                'FsF-F2-01M : DublinCore metadata UNAVAILABLE')

    # ======== retrieve OpenGraph metadata
    ext_meta = extruct_metadata.get('opengraph')
    opengraph_collector = MetaDataCollectorOpenGraph(
        loggerinst=self.logger,
        sourcemetadata=ext_meta,
        mapping=Mapper.OG_MAPPING)
    source_opengraph, opengraph_dict = opengraph_collector.parse_metadata()
    opengraph_dict = self.exclude_null(opengraph_dict)
    if opengraph_dict:
        self.namespace_uri.extend(opengraph_collector.namespaces)
        self.metadata_sources.append((source_opengraph, 'embedded'))
        self._merge_reference_elements(opengraph_dict)
        self.logger.log(
            self.LOG_SUCCESS, 'FsF-F2-01M : Found OpenGraph metadata: ' +
            str(opengraph_dict.keys()))
    else:
        # This branch used to log the Schema.org message (copy-paste slip).
        self.logger.info('FsF-F2-01M : OpenGraph metadata UNAVAILABLE')

    # ========= retrieve signposting data links
    data_sign_links = self.get_signposting_links('item')
    if data_sign_links:
        self.logger.info(
            'FsF-F3-01M : Found data links in response header (signposting) : '
            + str(len(data_sign_links)))
        if self.metadata_merged.get('object_content_identifier') is None:
            self.metadata_merged[
                'object_content_identifier'] = data_sign_links

    # ========= retrieve typed data object links =========
    data_meta_links = self.get_html_typed_links(rel='item')
    if data_meta_links:
        self.logger.info(
            'FsF-F3-01M : Found data links in HTML head (link rel=item) : '
            + str(len(data_meta_links)))
        if self.metadata_merged.get('object_content_identifier') is None:
            self.metadata_merged[
                'object_content_identifier'] = data_meta_links

    # If an identifier has been detected in the metadata, the check for a
    # persistent identifier potentially has to be repeated.
    object_identifier = self.metadata_merged.get('object_identifier')
    if object_identifier:
        if isinstance(object_identifier, list):
            identifiertotest = object_identifier[0]
        else:
            identifiertotest = object_identifier
        if self.pid_scheme is None:
            found_pids_in_metadata = idutils.detect_identifier_schemes(
                identifiertotest)
            if len(found_pids_in_metadata) > 1 and 'url' in found_pids_in_metadata:
                found_pids_in_metadata.remove('url')
            # Guard against identifiers for which nothing was detected.
            if (found_pids_in_metadata
                    and found_pids_in_metadata[0] in Mapper.VALID_PIDS.value):
                self.logger.info(
                    'FsF-F2-01M : Found object identifier in metadata, repeating PID check for FsF-F1-02D'
                )
                self.logger.log(
                    self.LOG_SUCCESS,
                    'FsF-F1-02D : Found object identifier in metadata during FsF-F2-01M, PID check was repeated'
                )
                self.repeat_pid_check = True
                self.pid_scheme = found_pids_in_metadata[0]
                self.id = identifiertotest
def get_scheme(self, obj):
    """Return the identifier scheme for ``obj``.

    An explicit ``obj['scheme']`` is returned as-is. Otherwise the first
    scheme idutils detects for ``obj['identifier']`` is used.

    Returns:
        The scheme name, or ``""`` when there is neither a scheme nor an
        identifier, or when no scheme can be detected.
    """
    scheme = obj.get('scheme')
    identifier = obj.get('identifier')
    if not scheme and identifier:
        detected = idutils.detect_identifier_schemes(identifier)
        # An undetectable identifier yields "" instead of an IndexError.
        scheme = detected[0] if detected else None
    return scheme or ""