def test_normalize_pid():
    """Test persistent id normalization.

    Every fixture in ``identifiers`` is normalized with its first expected
    scheme and compared with the fixture's normalized value. When that
    value is empty, the raw input itself is the expected result.
    """
    for i, expected_schemes, normalized_value, url_value in identifiers:
        # The fallback has to be computed before the comparison:
        # ``a == b or i`` parses as ``(a == b) or i``, which is truthy for
        # every non-empty ``i`` and therefore could never fail.
        expected = normalized_value or i
        assert idutils.normalize_pid(i, expected_schemes[0]) == expected

    assert idutils.normalize_pid(None, 'handle') is None
def test_to_url():
    """Check the URL generated for every identifier fixture.

    Each fixture is normalized with its first expected scheme and turned
    into a URL twice: once with the default arguments and once with
    ``url_scheme='https'``.
    """
    for raw, schemes, _normalized, expected_url in identifiers:
        scheme = schemes[0]

        default_url = idutils.to_url(
            idutils.normalize_pid(raw, scheme), scheme)
        assert default_url == expected_url

        https_url = idutils.to_url(
            idutils.normalize_pid(raw, scheme), scheme, url_scheme='https')
        if scheme in ['purl', 'url']:
            # If the value is already a URL its scheme is preserved
            expected_https = expected_url
        else:
            expected_https = expected_url.replace('http://', 'https://')
        assert https_url == expected_https
def lookup_re3data(self):
    """Fetch the re3data metadata record for ``self.client_id``.

    The DOI registered for the client id in
    ``RepositoryHelper.DATACITE_REPOSITORIES`` is normalized and used to
    query the re3data API. The ``href`` of the first ``link`` element in
    the answer is then requested; the content of that response is stored
    in ``self.re3metadata_raw`` and ``self.parseRepositoryMetadata()`` is
    called.

    Nothing happens unless both ``self.client_id`` and ``self.pid_scheme``
    are set. A missing DOI or a missing record link is logged as a warning.
    """
    if self.client_id and self.pid_scheme:
        re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(
            self.client_id)  # {client_id,re3doi}
        # pid -> clientId -> repo doi -> re3id, and query repository
        # metadata from re3api
        if re3doi:
            # Normalize only once a DOI is known to exist,
            # e.g. https://doi.org/10.17616/R3XS37
            short_re3doi = idutils.normalize_pid(re3doi, scheme='doi')
            self.logger.info('Found match re3data (DOI-based) record')
            # https://re3data.org/api/beta/repositories?query=
            query_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi
            q = RequestHelper(url=query_url)
            q.setAcceptType(AcceptTypes.xml)
            _, xml = q.content_negotiate(metric_id='RE3DATA')
            root = etree.fromstring(xml.content)
            # The answer is expected to carry a self link such as
            # href="https://www.re3data.org/api/beta/repository/r3d100010134"
            links = root.xpath('//link')
            # An answer without any link element (or without an href) must
            # not raise; it simply means there is no record to fetch.
            re3link = links[0].attrib.get('href') if links else None
            if re3link is not None:
                self.logger.info('Found match re3data metadata record')
                # query repository metadata
                q2 = RequestHelper(url=re3link)
                q2.setAcceptType(AcceptTypes.xml)
                _, re3_response = q2.content_negotiate(metric_id='RE3DATA')
                self.re3metadata_raw = re3_response.content
                self.parseRepositoryMetadata()
            else:
                self.logger.warning(
                    'No re3data metadata record link found in query result')
        else:
            self.logger.warning(
                'No DOI of client id is available from datacite api')
def normalize_value(self, data, **kwargs):
    """Normalize identifier value.

    ``data['ID']`` is replaced by the result of ``idutils.normalize_pid``
    for the scheme in ``data['IDScheme']``. Normalization is best-effort:
    any failure is logged as a warning and ``data`` is left as it was.

    :param data: mapping holding the ``ID`` and ``IDScheme`` keys.
    :returns: the same ``data`` object, whether or not it was normalized.
    """
    try:
        data['ID'] = idutils.normalize_pid(data['ID'], data['IDScheme'])
    except Exception:
        current_app.logger.warning(
            'Failed to normalize PID value.', extra={'data': data})
    # Hand the payload back on the failure path too; falling off the end
    # of the handler would return ``None`` instead of the data.
    return data
def normalize_identifier(self, data, **kwargs):
    """Normalize ``data["identifier"]`` according to ``data["scheme"]``.

    A missing or empty identifier is left untouched. ``data`` is returned
    in both cases.
    """
    raw_identifier = data.get("identifier")
    if not raw_identifier:
        # It can be empty if not required
        return data

    # at this point, `scheme` is set or validation failed earlier
    scheme = data["scheme"]
    data["identifier"] = idutils.normalize_pid(raw_identifier, scheme)
    return data
def normalize_identifier(self, data, **kwargs):
    """Replace the identifier in ``data`` with its normalized form.

    The value under ``"identifier"`` is normalized for the scheme stored
    under ``"scheme"``. An absent or empty identifier is skipped. The
    ``data`` mapping is always returned.
    """
    pid_value = data.get("identifier")
    has_identifier = bool(pid_value)  # It can be empty if not required

    if has_identifier:
        # At this point scheme should exist or had failed
        pid_scheme = data["scheme"]
        normalized = idutils.normalize_pid(pid_value, pid_scheme)
        data["identifier"] = normalized

    return data
def _deserialize(self, value, attr, data):
    """Deserialize persistent identifier value.

    The value produced by the parent class is stripped and its identifier
    schemes are detected. ``self.fail`` is called with ``'invalid_scheme'``
    when ``self.scheme`` is set but not among the detected schemes, and
    with ``'invalid_pid'`` when no scheme is detected at all.

    :returns: the value normalized for the first detected scheme when
        ``self.normalize`` is truthy, otherwise the stripped value.
    """
    parent_value = super(PersistentId, self)._deserialize(value, attr, data)
    pid = parent_value.strip()
    detected = idutils.detect_identifier_schemes(pid)

    if self.scheme and self.scheme.lower() not in detected:
        self.fail('invalid_scheme', scheme=self.scheme)
    if not detected:
        self.fail('invalid_pid')

    if not self.normalize:
        return pid
    return idutils.normalize_pid(pid, detected[0])
def __init__(self, idstring):
    """Analyse ``idstring`` and record which identifier schemes it matches.

    The raw value is stored in ``self.identifier`` and
    ``self.normalized_id``. Strings longer than four characters that are
    not purely numeric are then checked, first with
    ``idutils.detect_identifier_schemes`` and, when that finds nothing,
    against the prefixes in ``self.IDENTIFIERS_ORG_DATA``. Depending on
    the outcome ``identifier_schemes``, ``preferred_schema``,
    ``normalized_id``, ``identifier_url``, ``method`` and
    ``is_persistent`` are set.

    :param idstring: the identifier to analyse; anything that is empty or
        not a ``str`` is stored but not analysed.
    """
    self.identifier = idstring
    self.normalized_id = self.identifier
    if not (self.identifier and isinstance(self.identifier, str)):
        return
    if len(self.identifier) <= 4 or self.identifier.isnumeric():
        return

    # workaround to resolve lsids:
    # idutils.LANDING_URLS['lsid'] ='http://www.lsid.info/resolver/?lsid={pid}'
    # workaround to recognize https purls
    if 'https://purl.' in self.identifier:
        self.identifier = self.identifier.replace('https:', 'http:')

    # Raw string: the pattern text is unchanged, but ``\.`` is no longer
    # an invalid escape sequence in a regular string literal.
    generic_identifiers_org_pattern = r'^([a-z0-9\._]+):(.+)'

    # idutils check
    self.identifier_schemes = idutils.detect_identifier_schemes(
        self.identifier)
    if self.identifier_schemes:
        # preferred schema: a more specific scheme wins over the generic
        # 'url' one, e.g. ['doi', 'url']
        if (len(self.identifier_schemes) > 1
                and 'url' in self.identifier_schemes):
            self.identifier_schemes.remove('url')
        self.preferred_schema = self.identifier_schemes[0]
        self.normalized_id = idutils.normalize_pid(
            self.identifier, self.preferred_schema)
        self.identifier_url = idutils.to_url(
            self.identifier, self.preferred_schema)
    else:
        # identifiers.org check
        self.method = 'identifiers.org'
        idmatch = re.search(generic_identifiers_org_pattern,
                            self.identifier)
        if idmatch:
            found_prefix = idmatch[1]
            found_suffix = idmatch[2]
            if found_prefix in self.IDENTIFIERS_ORG_DATA:
                registry_entry = self.IDENTIFIERS_ORG_DATA[found_prefix]
                if re.search(registry_entry['pattern'], found_suffix):
                    self.identifier_schemes = [
                        found_prefix, 'identifiers_org'
                    ]
                    self.preferred_schema = found_prefix
                    self.identifier_url = str(
                        registry_entry['url_pattern']).replace(
                            '{$id}', found_suffix)
                    self.normalized_id = (
                        found_prefix.lower() + ':' + found_suffix)

    # NOTE(review): when nothing matched, ``preferred_schema`` is not
    # assigned in this method - presumably a default exists outside it;
    # confirm.
    if (self.preferred_schema in Mapper.VALID_PIDS.value
            or self.preferred_schema in self.IDENTIFIERS_ORG_DATA):
        self.is_persistent = True
def __call__(self, form, field, submit=False, fields=None):
    """Normalize ``field.data`` in place for the applicable scheme.

    The scheme is taken, in this order, from the form field named by
    ``self.scheme_field``, from ``self.scheme``, or from the first scheme
    detected for ``field.data``. Nothing is changed when no scheme is
    found or ``field.data`` is empty.
    """
    if self.scheme_field:
        scheme = getattr(form, self.scheme_field).data
    elif self.scheme:
        scheme = self.scheme
    else:
        detected = idutils.detect_identifier_schemes(field.data)
        scheme = detected[0] if detected else None

    if scheme and field.data:
        field.data = idutils.normalize_pid(field.data, scheme=scheme)
def lookup_re3data(self):
    """Fetch the re3data metadata record for ``self.client_id``.

    The value registered for the client id in
    ``RepositoryHelper.DATACITE_REPOSITORIES`` is used only when
    ``idutils.is_doi`` accepts it. The normalized DOI is sent to the
    re3data API, the ``href`` of the first ``link`` element in the answer
    is requested, and that response is stored in ``self.re3metadata_raw``
    before ``self.parseRepositoryMetadata()`` is called.

    Nothing happens unless both ``self.client_id`` and ``self.pid_scheme``
    are set. Any exception raised while handling the API answer is logged
    as a warning.
    """
    if not (self.client_id and self.pid_scheme):
        return

    re3doi = RepositoryHelper.DATACITE_REPOSITORIES.get(
        self.client_id)  # {client_id,re3doi}
    # A registered value that is not a DOI is treated like a missing one.
    if re3doi and not idutils.is_doi(re3doi):
        re3doi = None

    if not re3doi:
        self.logger.warning(
            'FsF-R1.3-01M : No DOI of client id is available from datacite api'
        )
        return

    # e.g. https://doi.org/10.17616/R3XS37
    short_re3doi = idutils.normalize_pid(re3doi, scheme='doi')

    # pid -> clientId -> repo doi -> re3id, and query repository metadata
    # from re3api
    self.logger.info('FsF-R1.3-01M : Found match re3data (DOI-based) record')
    # https://re3data.org/api/beta/repositories?query=
    search_url = Preprocessor.RE3DATA_API + '?query=' + short_re3doi
    search_request = RequestHelper(url=search_url)
    search_request.setAcceptType(AcceptTypes.xml)
    _, xml = search_request.content_negotiate(metric_id='RE3DATA')
    try:
        if isinstance(xml, bytes):
            xml = xml.decode().encode()
        root = etree.fromstring(xml)
        # The answer is expected to carry a self link such as
        # href="https://www.re3data.org/api/beta/repository/r3d100010134"
        first_link = root.xpath('//link')[0]
        re3link = first_link.attrib['href']
        if re3link is not None:
            self.logger.info(
                'FsF-R1.3-01M : Found match re3data metadata record -: ' +
                str(re3link))
            # query repository metadata
            record_request = RequestHelper(url=re3link)
            record_request.setAcceptType(AcceptTypes.xml)
            _, re3_response = record_request.content_negotiate(
                metric_id='RE3DATA')
            self.re3metadata_raw = re3_response
            self.parseRepositoryMetadata()
    except Exception as e:
        self.logger.warning(
            'FsF-R1.3-01M : Malformed re3data (DOI-based) record received: '
            + str(e))
def check_identifiers(self):
    """Check ``self.uid`` for the metrics FsF-F1-01D and FsF-F1-02D.

    FsF-F1-01D passes when an identifier scheme is detected for
    ``self.uid``. FsF-F1-02D passes when at least one detected scheme is
    listed in ``self.validpids``. The identifier is then resolved with an
    HTTP GET; on status 200 the landing URL and HTML are stored on the
    instance and added to the FsF-F1-02D output.

    Exceptions raised while checking are recorded in ``self.error`` rather
    than propagated. Both result dicts are appended to ``self.results``.
    """
    uuidresult = {'id': 1, 'metric_id': 'FsF-F1-01D', 'passed': False}
    pidresult = {'id': 2, 'metric_id': 'FsF-F1-02D', 'passed': False}
    try:
        # try to find an identifier schema for the given string
        foundpids = id.detect_identifier_schemes(self.uid)
        if foundpids:
            # if schema found we have an id which can be found by idutils
            uuidresult['passed'] = True
            uuidresult['output'] = {
                'uuid': self.uid,
                'uuid_schema': foundpids
            }
            # now we check if the schema is listed in our valid pid list;
            # in this case it is also a pid
            realpids = [
                value for value in foundpids if value in self.validpids
            ]
            if realpids:
                pidresult['passed'] = True
            if foundpids[0] == 'url':
                self.pid_url = self.uid
            else:
                # we try to find an actionable representation of the pid
                # (URL). ``self.uid`` is the identifier under test; the
                # bare name ``pid`` is not defined in this method.
                self.pid_url = id.to_url(self.uid, scheme=realpids[0])
                # we should log here if this fails..
            # Now we try to perform a HTTP GET request
            # NOTE(review): no timeout is passed, so this call can block
            # indefinitely - consider adding one.
            r = requests.get(self.pid_url)
            if r.status_code == 200:
                if realpids:
                    self.pid = id.normalize_pid(
                        self.uid, scheme=realpids[0])
                self.landing_url = r.url
                self.landing_html = r.text
                pidresult['output'] = {
                    'pid': self.pid,
                    'resolved_url': self.landing_url,
                    'pid_schema': realpids
                }
            else:
                self.error.append('FsF-F1: HTTP Error: ' +
                                  str(r.status_code))
    except Exception as err:
        # ``Exception`` rather than ``BaseException`` so that
        # KeyboardInterrupt and SystemExit are not swallowed.
        self.error.append(
            'FsF-F1: Failed to check the given identifier: ' + str(err))
    self.results.append(uuidresult)
    self.results.append(pidresult)
def __init__(self, idstring):
    """Analyse ``idstring`` and record which identifier schemes it matches.

    The raw value is stored in ``self.identifier`` and
    ``self.normalized_id``. Strings longer than four characters that are
    not purely numeric are then checked, first with
    ``idutils.detect_identifier_schemes`` and, when that finds nothing,
    against the prefixes in ``self.IDENTIFIERS_ORG_DATA``. Depending on
    the outcome ``identifier_schemes``, ``preferred_schema``,
    ``normalized_id``, ``identifier_url``, ``method`` and
    ``is_persistent`` are set.

    :param idstring: the identifier to analyse; ``None`` and other
        non-string values are stored but not analysed.
    """
    self.identifier = idstring
    self.normalized_id = self.identifier
    # ``len()`` and ``isnumeric()`` below need a string; without this
    # guard ``None`` or any other non-string input raises.
    if not isinstance(self.identifier, str):
        return
    if len(self.identifier) <= 4 or self.identifier.isnumeric():
        return

    # Raw string: the pattern text is unchanged, but ``\.`` is no longer
    # an invalid escape sequence in a regular string literal.
    generic_identifiers_org_pattern = r'^([a-z0-9\._]+):(.+)'

    # idutils check
    self.identifier_schemes = idutils.detect_identifier_schemes(
        self.identifier)
    if self.identifier_schemes:
        # preferred schema: a more specific scheme wins over the generic
        # 'url' one, e.g. ['doi', 'url']
        if (len(self.identifier_schemes) > 1
                and 'url' in self.identifier_schemes):
            self.identifier_schemes.remove('url')
        self.preferred_schema = self.identifier_schemes[0]
        self.normalized_id = idutils.normalize_pid(
            self.identifier, self.preferred_schema)
        self.identifier_url = idutils.to_url(
            self.identifier, self.preferred_schema)
    else:
        # identifiers.org check
        self.method = 'identifiers.org'
        idmatch = re.search(generic_identifiers_org_pattern,
                            self.identifier)
        if idmatch:
            found_prefix = idmatch[1]
            found_suffix = idmatch[2]
            if found_prefix in self.IDENTIFIERS_ORG_DATA:
                registry_entry = self.IDENTIFIERS_ORG_DATA[found_prefix]
                if re.search(registry_entry['pattern'], found_suffix):
                    self.identifier_schemes = [
                        found_prefix, 'identifiers_org'
                    ]
                    self.preferred_schema = found_prefix
                    self.identifier_url = str(
                        registry_entry['url_pattern']).replace(
                            '{$id}', found_suffix)
                    self.normalized_id = (
                        found_prefix.lower() + ':' + found_suffix)

    # NOTE(review): when nothing matched, ``preferred_schema`` is not
    # assigned in this method - presumably a default exists outside it;
    # confirm.
    if (self.preferred_schema in Mapper.VALID_PIDS.value
            or self.preferred_schema in self.IDENTIFIERS_ORG_DATA):
        self.is_persistent = True
def test_tourl():
    """Check that each normalized fixture maps to its expected URL."""
    for raw, schemes, _normalized, expected_url in identifiers:
        scheme = schemes[0]
        pid = idutils.normalize_pid(raw, scheme)
        assert idutils.to_url(pid, scheme) == expected_url
def test_idempotence():
    """Check that a normalized identifier is still detected as its scheme.

    Each fixture is normalized with its first expected scheme; that scheme
    must be among the schemes detected for the normalized value.
    """
    for raw, schemes, _normalized, _url in identifiers:
        scheme = schemes[0]
        normalized = idutils.normalize_pid(raw, scheme)
        detected = idutils.detect_identifier_schemes(normalized)
        assert scheme in detected
def normalize_identifier(self, data):
    """Normalize ``data['identifier']`` in place for ``data['scheme']``.

    The mapping is modified directly; nothing is returned.
    """
    raw_identifier = data['identifier']
    scheme = data['scheme']
    data['identifier'] = idutils.normalize_pid(raw_identifier, scheme)
def normalize_identifier(self, data):
    """Overwrite the identifier in ``data`` with its normalized form.

    The value under ``'identifier'`` is normalized for the scheme stored
    under ``'scheme'``. ``data`` is changed in place and nothing is
    returned.
    """
    pid_value, pid_scheme = data['identifier'], data['scheme']
    normalized = idutils.normalize_pid(pid_value, pid_scheme)
    data['identifier'] = normalized
def test_tourl():
    """Check URL generation for every identifier fixture.

    Each fixture is normalized with its first expected scheme and the URL
    built from the result is compared with the fixture's URL.
    """
    for value, expected_schemes, _unused, wanted_url in identifiers:
        first_scheme = expected_schemes[0]
        normalized = idutils.normalize_pid(value, first_scheme)
        generated_url = idutils.to_url(normalized, first_scheme)
        assert generated_url == wanted_url
def update_metadata(id_value: str, scheme: str, data: dict,
                    create_identity_events: bool = True,
                    create_missing_groups: bool = True,
                    providers: List[str] = None,
                    link_publication_date: str = None):
    """Update the metadata of the group that an identifier belongs to.

    The identifier is normalized, identity events linking it to the
    identifiers listed under ``data['Identifier']`` are optionally emitted,
    and ``data`` is merged into the ``data`` of the identifier's group.

    :param id_value: value of the identifier whose group is updated.
    :param scheme: scheme of ``id_value``; lower-cased before use.
    :param data: metadata to merge. Entries under the ``'Identifier'`` key
        must provide ``'ID'`` and ``'IDScheme'``.
    :param create_identity_events: when true, emit an ``IsIdenticalTo``
        event for every listed identifier that differs from the source.
    :param create_missing_groups: when true, create the identifier and its
        groups if no group exists yet. When false and no group exists,
        nothing is updated.
    :param providers: provider names for the events; defaults to
        ``['unknown']``.
    :param link_publication_date: publication date for the events;
        defaults to the current time in ISO format.
    """
    # Imported here rather than at module level, as in the original code.
    # NOTE(review): presumably to avoid a circular import - confirm.
    from ..events.api import EventAPI

    scheme = scheme.lower()
    id_value = idutils.normalize_pid(id_value, scheme)

    target_identifiers = set()
    for i in data.get('Identifier', []):
        value, target_scheme = i['ID'], i['IDScheme'].lower()
        value = idutils.normalize_pid(value, target_scheme)
        target_identifiers.add((value, target_scheme))

    # Check if there are identity links that can be created:
    if create_identity_events and target_identifiers:
        events = []
        providers = providers or ['unknown']
        providers = [{'Name': provider} for provider in providers]
        link_publication_date = link_publication_date or \
            datetime.now().isoformat()
        source_id_obj = {'ID': id_value, 'IDScheme': scheme}
        for target_value, target_scheme in target_identifiers:
            # An identifier is never linked to itself.
            if (id_value, scheme) == (target_value, target_scheme):
                continue
            target_id_obj = {'ID': target_value, 'IDScheme': target_scheme}
            payload = {
                'RelationshipType': {
                    'Name': 'IsRelatedTo',
                    'SubTypeSchema': 'DataCite',
                    'SubType': 'IsIdenticalTo'
                },
                'Target': {
                    'Identifier': target_id_obj,
                    'Type': {'Name': 'unknown'}
                },
                'LinkProvider': providers,
                'Source': {
                    'Identifier': source_id_obj,
                    'Type': {'Name': 'unknown'}
                },
                'LinkPublicationDate': link_publication_date,
            }
            events.append(payload)

        for event_chunk in chunks(events, 100):
            # Materialize the chunk once so the error record below still
            # holds the events even if ``chunks`` yields one-shot
            # iterators. The handler gets its own copy, as before.
            batch = list(event_chunk)
            try:
                EventAPI.handle_event(
                    list(batch), no_index=True, eager=True)
            except ValueError as exc:
                error_obj = ErrorMonitoring(
                    origin="update_metadata", error=repr(exc),
                    n_retries=99, payload=batch)
                db.session.add(error_obj)
                db.session.commit()
                current_app.logger.exception(
                    'Error while processing identity event')

    id_group = get_group_from_id(id_value, scheme)
    if not id_group and create_missing_groups:
        identifier = Identifier(
            value=id_value, scheme=scheme).fetch_or_create_id()
        db.session.commit()
        id_group, _ = get_or_create_groups(identifier)
        db.session.commit()

    if id_group is None:
        # No group exists and none was created, so there is nothing to
        # update; dereferencing ``id_group.data`` would raise.
        return

    id_group.data.update(data)
    db.session.commit()