Example #1
0
 def __call__(self, form, field, submit=False, fields=None):
     """Copy the first scheme detected in ``field.data`` to another field.

     The target is the form attribute named by ``self.set_field``. Nothing
     happens when ``field.data`` is empty; when no scheme is detected the
     target's ``data`` is reset to an empty string.
     """
     if not field.data:
         return
     detected = idutils.detect_identifier_schemes(field.data)
     target = getattr(form, self.set_field)
     target.data = detected[0] if detected else ''
 def evaluate(self):
     """Assess metric FsF-F1-01D: the identifier matches a known scheme.

     Runs ``idutils.detect_identifier_schemes`` on ``self.fuji.id``. When at
     least one scheme is detected the criterium is marked as passed, the
     total score is earned, and the preferred scheme is stored on
     ``self.fuji.id_scheme`` (and on ``self.fuji.pid_scheme`` if it is listed
     in ``Mapper.VALID_PIDS``). When nothing is detected only a warning is
     logged and the result object is left without score or output.
     """
     # ======= CHECK IDENTIFIER UNIQUENESS =======
     self.result = Uniqueness(id=self.fuji.count, metric_identifier=self.metric_identifier, metric_name=self.metric_name)
     self.output = UniquenessOutput()
     self.logger.info('FsF-F1-01D : Using idutils schemes')
     found_ids = idutils.detect_identifier_schemes(self.fuji.id)  # some schemes like PMID are generic
     if not found_ids:
         self.logger.warning('FsF-F1-01D : Failed to check the identifier scheme!.')
         return

     self.logger.log(self.fuji.LOG_SUCCESS,'FsF-F1-01D : Unique identifier schemes found {}'.format(found_ids))
     self.setEvaluationCriteriumScore('FsF-F1-01D-1',1, 'pass')
     self.output.guid = self.fuji.id
     self.score.earned = self.total_score

     # identify main scheme: 'url' is dropped whenever a more specific
     # scheme was detected alongside it, e.g. ['doi', 'url'] -> ['doi']
     if len(found_ids) > 1 and 'url' in found_ids:
         found_ids.remove('url')
     found_id = found_ids[0]  # TODO: take the first element of list, e.g., [doi, handle]
     self.fuji.id_scheme = found_id
     if found_id in Mapper.VALID_PIDS.value:
         self.fuji.pid_scheme = found_id
     self.logger.info('FsF-F1-01D : Finalized unique identifier scheme - {}'.format(found_id))

     self.output.guid_scheme = found_id
     self.result.test_status = 'pass'
     self.result.score = self.score
     self.result.metric_tests = self.metric_tests
     self.result.output = self.output
Example #3
0
 def validate_gnd(form, field):
     """Reject a non-empty field value that is not detected as a GND id.

     :raises ValidationError: if ``field.data`` is set and ``'gnd'`` is not
         among the schemes idutils detects for it.
     """
     if not field.data:
         return
     detected = idutils.detect_identifier_schemes(field.data or '')
     if 'gnd' in detected:
         return
     raise ValidationError("Not a valid GND-identifier.")
 def __call__(self, form, field, submit=False, fields=None):
     """Store the first detected identifier scheme on ``form.<set_field>``.

     Skipped entirely for an empty ``field.data``. If idutils detects no
     scheme, the target field's ``data`` becomes ``''``.
     """
     value = field.data
     if value:
         found = idutils.detect_identifier_schemes(value)
         first_scheme = found[0] if found else ''
         getattr(form, self.set_field).data = first_scheme
Example #5
0
    def validate_identifier(self, data, **kwargs):
        """Validate the identifier format and scheme.

        Checks run in this order and stop at the first failure: a required
        identifier is present; an identifier comes with a scheme; the scheme
        is not forbidden; the scheme is in the allowed list (when one is
        configured); and, if ``fail_on_unknown`` is set, the scheme is among
        those idutils detects for the identifier.

        :raises ValidationError: on the first failing check.
        """
        identifier = data.get("identifier")
        scheme = data.get("scheme")

        if not identifier:
            if self.identifier_required:
                raise ValidationError("Missing required identifier.")
            return

        if not scheme:
            raise ValidationError(
                f"Missing scheme value for identifier {identifier}."
            )

        # from here on both `identifier` and `scheme` are set
        detected_schemes = idutils.detect_identifier_schemes(identifier)
        invalid_scheme = f"Invalid scheme {scheme}."

        if scheme in self.forbidden_schemes:
            raise ValidationError(invalid_scheme)

        if self.allowed_schemes and scheme not in self.allowed_schemes:
            raise ValidationError(invalid_scheme)

        if scheme not in detected_schemes and self.fail_on_unknown:
            raise ValidationError(invalid_scheme)
Example #6
0
 def check_scheme(self, data):
     """Validate the provided identifier scheme.

     The lower-cased ``data['scheme']`` has to be one of the schemes idutils
     detects for ``data['value']``. A value for which nothing is detected is
     accepted without further checks.

     :raises ValidationError: if schemes were detected and the declared one
         is not among them.
     """
     identifier = data['value']
     declared = data['scheme']
     normalized = declared.lower()
     detected = idutils.detect_identifier_schemes(identifier)
     if not detected or normalized in detected:
         return
     raise ValidationError("Invalid scheme '{}'".format(declared),
                           'IDScheme')
Example #7
0
def test_valueerror():
    """Test for bad validators."""
    # Several validators first look at the identifier's length and only then
    # inspect it further. Feeding junk strings of every length from 0 to 19
    # makes sure that a matching length alone never yields a detected scheme
    # (and never raises).
    for length in range(20):
        junk = "a" * length
        detected = idutils.detect_identifier_schemes(junk)
        assert detected == []
Example #8
0
 def detect_scheme(self, data):
     """Fill in ``data['scheme']`` from the identifier when it is missing.

     Detection only runs when no scheme is given and an identifier is
     present; the first detected scheme wins. ``data`` is modified in place
     and returned.
     """
     identifier = data.get('identifier')
     if data.get('scheme') or not identifier:
         return data
     detected = idutils.detect_identifier_schemes(identifier)
     if detected:
         data['scheme'] = detected[0]
     return data
Example #9
0
 def detect_scheme(self, data):
     """Load scheme.

     When ``data`` has an identifier but no scheme, the first scheme that
     idutils detects for the identifier (if any) is written to
     ``data['scheme']``. The same ``data`` object is returned.
     """
     given_scheme = data.get('scheme')
     pid = data.get('identifier')
     needs_detection = bool(pid) and not given_scheme
     if needs_detection:
         candidates = idutils.detect_identifier_schemes(pid)
         if candidates:
             data['scheme'] = candidates[0]
     return data
Example #10
0
 def validate_scheme(form, field):
     """Set scheme based on value in identifier.

     ``field.data`` becomes the first scheme detected for the form's
     ``identifier`` value, or ``''`` when nothing is detected.
     """
     identifier = form.data.get('identifier') or ''
     detected = idutils.detect_identifier_schemes(identifier)
     field.data = detected[0] if detected else ''
Example #11
0
def test_valueerror():
    """Test for bad validators."""
    # Validators often gate on the identifier length before doing any real
    # checking. Strings of "a" with lengths 0..19 hit those length gates
    # while still being invalid identifiers, so detection must return an
    # empty list for each of them.
    nonsense_pids = ("a" * size for size in range(20))
    for nonsense_pid in nonsense_pids:
        assert idutils.detect_identifier_schemes(nonsense_pid) == []
Example #12
0
def pid_url(identifier, scheme=None):
    """Convert persistent identifier into a link.

    :param identifier: Persistent identifier value.
    :param scheme: Identifier scheme. When ``None`` the first scheme
        detected by idutils for ``identifier`` is used.
    :returns: The result of ``idutils.to_url`` or ``""`` when the identifier
        is empty or no scheme is given/detected.
    """
    # An empty identifier can never produce a link; return before handing a
    # possibly-``None`` value to scheme detection.
    if not identifier:
        return ""
    if scheme is None:
        detected = idutils.detect_identifier_schemes(identifier)
        scheme = detected[0] if detected else None
    if not scheme:
        return ""
    return idutils.to_url(identifier, scheme)
Example #13
0
def pid_url(identifier, scheme=None):
    """Convert persistent identifier into a link.

    :param identifier: Persistent identifier value; a falsy value yields
        ``""`` without any scheme detection.
    :param scheme: Scheme to use. If ``None``, idutils detection is run and
        its first result is taken.
    :returns: URL string from ``idutils.to_url``, or ``""`` if there is no
        identifier or no usable scheme.
    """
    if not identifier:
        # Guard: detection is not meant to receive None/empty input.
        return ""
    if scheme is None:
        schemes = idutils.detect_identifier_schemes(identifier)
        if not schemes:
            return ""
        scheme = schemes[0]
    if scheme:
        return idutils.to_url(identifier, scheme)
    return ""
Example #14
0
def metadata_command(identifiers: List[str], eager: bool = False):
    """Harvest metadata.

    Pairs every identifier with the first scheme idutils detects for it and
    hands the pairs to the ``harvest_metadata`` task.

    :param identifiers: Identifier strings to harvest.
    :param eager: Run the task synchronously (errors are re-raised) instead
        of dispatching it asynchronously.
    :raises ValueError: if no scheme can be detected for an identifier.
    """
    # Detect identifier schemes
    with_schemes = []
    for identifier in identifiers:
        schemes = idutils.detect_identifier_schemes(identifier)
        if not schemes:
            # Fail with a readable message instead of a bare IndexError.
            raise ValueError(
                'Could not detect an identifier scheme for {!r}.'.format(
                    identifier))
        with_schemes.append((identifier, schemes[0]))
    task = harvest_metadata.s(with_schemes, eager=eager)
    if eager:
        task.apply(throw=True)
    else:
        task.apply_async()
Example #15
0
    def _detect_scheme(self, identifier):
        """Detect the scheme of a given identifier.

        With ``allow_all`` set, the first detected scheme is returned.
        Otherwise the first detected scheme that is also listed in
        ``allowed_schemes`` is returned. ``None`` means no suitable scheme.
        """
        candidates = idutils.detect_identifier_schemes(identifier)

        if not self.allow_all:
            return next(
                (name for name in candidates if name in self.allowed_schemes),
                None,
            )

        if candidates:
            return candidates[0]
        return None
Example #16
0
    def _deserialize(self, value, attr, data):
        """Deserialize persistent identifier value.

        The parent result is stripped of surrounding whitespace and checked
        with idutils. Fails with ``invalid_scheme`` when a configured scheme
        is not among the detected ones, and with ``invalid_pid`` when nothing
        is detected. Returns the value normalised for the first detected
        scheme if ``self.normalize`` is set, otherwise the stripped value.
        """
        pid = super(PersistentId, self)._deserialize(value, attr, data).strip()
        detected = idutils.detect_identifier_schemes(pid)

        if self.scheme and self.scheme.lower() not in detected:
            self.fail('invalid_scheme', scheme=self.scheme)
        if not detected:
            self.fail('invalid_pid')

        if self.normalize:
            return idutils.normalize_pid(pid, detected[0])
        return pid
Example #17
0
 def _extract_identifiers(self, data):
     """."""
     ids = set()
     if data.get('bibcode'):
         ids.add((data.get('bibcode'), 'ads'))
     ids |= {(d, 'doi') for d in data.get('doi', []) if d}
     for id_ in data.get('identifier', []):
         try:
             ids.add((id_, idutils.detect_identifier_schemes(id_)[0]))
         except Exception:
             pass
     return [{'ID': i, 'IDScheme': s} for i, s in ids if i and s]
Example #18
0
    def _deserialize(self, value, attr, data):
        """Deserialize persistent identifier value.

        Steps: delegate to the parent field, strip whitespace, detect the
        identifier schemes. ``invalid_scheme`` is reported first when a
        scheme is configured but not detected; ``invalid_pid`` is reported
        when no scheme is detected at all. With ``self.normalize`` set the
        value is normalised using the first detected scheme.
        """
        deserialized = super(PersistentId, self)._deserialize(
            value, attr, data)
        stripped = deserialized.strip()
        schemes = idutils.detect_identifier_schemes(stripped)

        wanted = self.scheme
        if wanted and wanted.lower() not in schemes:
            self.fail('invalid_scheme', scheme=self.scheme)
        if not schemes:
            self.fail('invalid_pid')

        if not self.normalize:
            return stripped
        return idutils.normalize_pid(stripped, schemes[0])
 def __init__(self, idstring):
     """Analyse ``idstring`` and record the identifier schemes it matches.

     Only non-empty strings longer than four characters that are not purely
     numeric are analysed. idutils is tried first; when it detects nothing,
     the identifier is matched against ``self.IDENTIFIERS_ORG_DATA`` using a
     ``prefix:suffix`` pattern. Depending on the outcome this sets
     ``identifier_schemes``, ``preferred_schema``, ``normalized_id``,
     ``identifier_url``, ``method`` and ``is_persistent``.

     NOTE(review): ``preferred_schema`` is read at the end even when neither
     branch assigned it — presumably a class-level default exists; confirm.

     :param idstring: The raw identifier to analyse.
     """
     self.identifier = idstring
     self.normalized_id = self.identifier
     if not (self.identifier and isinstance(self.identifier, str)):
         return
     if len(self.identifier) <= 4 or self.identifier.isnumeric():
         return

     #workaround to resolve lsids:
     #idutils.LANDING_URLS['lsid'] ='http://www.lsid.info/resolver/?lsid={pid}'
     #workaround to recognize https purls
     if 'https://purl.' in self.identifier:
         self.identifier = self.identifier.replace('https:', 'http:')
     # Raw string: '\.' is an invalid escape sequence in a plain literal.
     generic_identifiers_org_pattern = r'^([a-z0-9\._]+):(.+)'

     # idutils check
     self.identifier_schemes = idutils.detect_identifier_schemes(
         self.identifier)
     if self.identifier_schemes:
         # preferred schema: drop the generic 'url' when a more specific
         # scheme was detected as well, e.g. ['doi', 'url']
         if len(self.identifier_schemes) > 1 \
                 and 'url' in self.identifier_schemes:
             self.identifier_schemes.remove('url')
         self.preferred_schema = self.identifier_schemes[0]
         self.normalized_id = idutils.normalize_pid(
             self.identifier, self.preferred_schema)
         self.identifier_url = idutils.to_url(
             self.identifier, self.preferred_schema)
     else:
         # identifiers.org check
         self.method = 'identifiers.org'
         idmatch = re.search(generic_identifiers_org_pattern,
                             self.identifier)
         if idmatch:
             found_prefix = idmatch[1]
             found_suffix = idmatch[2]
             if found_prefix in self.IDENTIFIERS_ORG_DATA:
                 registry_entry = self.IDENTIFIERS_ORG_DATA[found_prefix]
                 # the scheme is only accepted when the suffix matches the
                 # registered pattern; URL and normalized id are set for
                 # every known prefix
                 if re.search(registry_entry['pattern'], found_suffix):
                     self.identifier_schemes = [
                         found_prefix, 'identifiers_org'
                     ]
                     self.preferred_schema = found_prefix
                 self.identifier_url = str(
                     registry_entry['url_pattern']).replace(
                         '{$id}', found_suffix)
                 self.normalized_id = found_prefix.lower(
                 ) + ':' + found_suffix

     if self.preferred_schema in Mapper.VALID_PIDS.value \
             or self.preferred_schema in self.IDENTIFIERS_ORG_DATA:
         self.is_persistent = True
 def __call__(self, form, field, submit=False, fields=None):
     """Normalise ``field.data`` according to an identifier scheme.

     The scheme is taken, in order of preference, from the form field named
     by ``self.scheme_field``, from ``self.scheme``, or from idutils
     detection on ``field.data`` (first result). Without a scheme or
     without field data the field is left untouched.
     """
     if self.scheme_field:
         scheme = getattr(form, self.scheme_field).data
     elif self.scheme:
         scheme = self.scheme
     else:
         detected = idutils.detect_identifier_schemes(field.data)
         scheme = detected[0] if detected else None
     if scheme and field.data:
         field.data = idutils.normalize_pid(field.data, scheme=scheme)
Example #21
0
def pid_url(identifier, scheme=None, url_scheme="https"):
    """Convert persistent identifier into a link.

    :param identifier: Persistent identifier value.
    :param scheme: Identifier scheme; detected with idutils (first result)
        when ``None``.
    :param url_scheme: Passed through to ``idutils.to_url``.
    :returns: The generated URL, or ``""`` when the identifier is empty, no
        scheme is available, or URL generation raises (the failure is logged
        as a warning on ``current_app.logger``).
    """
    # An empty identifier can never be linked; return before handing a
    # possibly-``None`` value to scheme detection.
    if not identifier:
        return ""
    if scheme is None:
        detected = idutils.detect_identifier_schemes(identifier)
        scheme = detected[0] if detected else None
    if not scheme:
        return ""
    try:
        return idutils.to_url(identifier, scheme, url_scheme=url_scheme)
    except Exception:
        current_app.logger.warning("URL generation for identifier {0} failed.".format(identifier), exc_info=True)
    return ""
Example #22
0
 def __call__(self, form, field, submit=False, fields=None):
     """Replace ``field.data`` with its normalised persistent identifier.

     Scheme resolution order: value of the form field ``self.scheme_field``
     (if configured), then ``self.scheme``, then the first scheme idutils
     detects for ``field.data``. Nothing is changed when no scheme results
     or the field is empty.
     """
     def resolve_scheme():
         if self.scheme_field:
             return getattr(form, self.scheme_field).data
         if self.scheme:
             return self.scheme
         found = idutils.detect_identifier_schemes(field.data)
         return found[0] if found else None

     scheme = resolve_scheme()
     if not scheme or not field.data:
         return
     field.data = idutils.normalize_pid(field.data, scheme=scheme)
Example #23
0
    def _detect_scheme(self, identifier):
        """Detect and return the scheme of a given identifier.

        If ``allowed_schemes`` is configured, the first detected scheme that
        is allowed takes precedence. In every other case (no allow-list, or
        no detected scheme is allowed) the first detected scheme is
        returned, or ``None`` when nothing was detected.
        """
        candidates = idutils.detect_identifier_schemes(identifier)

        if self.allowed_schemes:
            # prefer a detected scheme that is explicitly allowed
            allowed_hit = next(
                (name for name in candidates if name in self.allowed_schemes),
                None,
            )
            if allowed_hit is not None:
                return allowed_hit

        if not candidates:
            return None
        return candidates[0]
Example #24
0
def pid_url(identifier, scheme=None, url_scheme='https'):
    """Convert persistent identifier into a link.

    :param identifier: Persistent identifier value; a falsy value yields
        ``''`` without running scheme detection.
    :param scheme: Identifier scheme. If ``None``, the first scheme detected
        by idutils is used.
    :param url_scheme: Forwarded to ``idutils.to_url``.
    :returns: URL string, or ``''`` if there is no identifier, no scheme, or
        ``idutils.to_url`` raised (logged as a warning).
    """
    if not identifier:
        # Guard: detection is not meant to receive None/empty input.
        return ''
    if scheme is None:
        schemes = idutils.detect_identifier_schemes(identifier)
        if not schemes:
            return ''
        scheme = schemes[0]
    try:
        if scheme:
            return idutils.to_url(identifier, scheme, url_scheme=url_scheme)
    except Exception:
        current_app.logger.warning('URL generation for identifier {0} failed.'
                                   .format(identifier), exc_info=True)
    return ''
Example #25
0
    def validate_data(self, data):
        """Validate identifier and scheme.

        The identifier must be present, must be detected by idutils as some
        persistent identifier, and the given scheme must be one of the
        detected schemes.

        :raises ValidationError: for the ``identifier`` field, with a message
            describing the first failing condition.
        """
        identifier = data.get('identifier')
        declared = data.get('scheme')

        if not identifier:
            message = 'Identifier is required.'
        else:
            detected = idutils.detect_identifier_schemes(identifier)
            if not detected:
                message = 'Not a valid persistent identifier.'
            elif declared not in detected:
                message = 'Not a valid {0} identifier.'.format(declared)
            else:
                return
        raise ValidationError(message, field_names=['identifier'])
Example #26
0
 def check_scheme(self, data, **kwargs):
     """Validate the provided identifier scheme.

     Currently only URLs containing ``github`` are checked, by parsing them
     with ``GithubUtility.parse_url_info``.

     :param data: Mapping with the keys ``'value'`` and ``'scheme'``.
     :raises ValidationError: if a GitHub URL cannot be parsed.
     """
     value = data['value']
     scheme = data['scheme'].lower()
     # TODO: re-enable the generic check of `scheme` against the schemes
     # detected by idutils once collisions are handled, e.g. the "pmid"
     # value '11781516' collides with ean8.

     # Check for valid github url
     if scheme == 'url' and 'github' in value:
         try:
             GithubUtility.parse_url_info(value)
         except Exception as err:
             # `Exception` rather than a bare except, so that
             # KeyboardInterrupt/SystemExit are not turned into
             # validation errors; the cause is chained for debugging.
             raise ValidationError(
                 "Invalid github repo or release '{}'".format(value)
             ) from err
Example #27
0
    def check_identifiers(self):
        """Check that ``self.uid`` is a known identifier and a resolvable PID.

        Appends two result dicts to ``self.results``: ``FsF-F1-01D`` (a
        scheme was detected for ``self.uid``) and ``FsF-F1-02D`` (one of the
        detected schemes is listed in ``self.validpids``). The identifier is
        turned into a URL and fetched with an HTTP GET; on status 200 the
        landing page URL and HTML are stored on the instance. Failures are
        recorded in ``self.error`` instead of being raised.
        """
        uuidresult = {'id': 1, 'metric_id': 'FsF-F1-01D', 'passed': False}
        pidresult = {'id': 2, 'metric_id': 'FsF-F1-02D', 'passed': False}
        try:
            #try to find an identifier schema for the given string
            foundpids = id.detect_identifier_schemes(self.uid)
            if foundpids:
                #if schema found we have an id which can be found by idutils
                uuidresult['passed'] = True
                uuidresult['output'] = {
                    'uuid': self.uid,
                    'uuid_schema': foundpids
                }
                #now we check if the schema is listed in our valid pid list in this case it is also a pid
                realpids = [
                    scheme for scheme in foundpids if scheme in self.validpids
                ]
                if realpids:
                    pidresult['passed'] = True
                if foundpids[0] == 'url':
                    self.pid_url = self.uid
                else:
                    # we try to find an actionable representation of the pid (URL);
                    # fall back to the first detected scheme when none of
                    # them is a valid PID scheme (realpids is empty then)
                    url_scheme = realpids[0] if realpids else foundpids[0]
                    self.pid_url = id.to_url(self.uid, scheme=url_scheme)
                    #we should log here if this fails..
                #Now we try to perform a HTTP GET request
                r = requests.get(self.pid_url)
                if r.status_code == 200:
                    if realpids:
                        self.pid = id.normalize_pid(self.uid,
                                                    scheme=realpids[0])
                    self.landing_url = r.url
                    self.landing_html = r.text
                    pidresult['output'] = {
                        'pid': self.pid,
                        'resolved_url': self.landing_url,
                        'pid_schema': realpids
                    }
                else:
                    self.error.append('FsF-F1: HTTP Error: ' +
                                      str(r.status_code))
        except Exception as err:
            # Exception (not BaseException) so that KeyboardInterrupt and
            # SystemExit still propagate.
            self.error.append('FsF-F1: Failed to check the given identifier' +
                              str(err))

        self.results.append(uuidresult)
        self.results.append(pidresult)
Example #28
0
 def __init__(self, idstring):
     """Analyse ``idstring`` and record the identifier schemes it matches.

     Only non-empty strings longer than four characters that are not purely
     numeric are analysed; anything else just sets ``identifier`` and
     ``normalized_id``. idutils is tried first; when it detects nothing, the
     identifier is matched against ``self.IDENTIFIERS_ORG_DATA`` using a
     ``prefix:suffix`` pattern. Depending on the outcome this sets
     ``identifier_schemes``, ``preferred_schema``, ``normalized_id``,
     ``identifier_url``, ``method`` and ``is_persistent``.

     NOTE(review): ``preferred_schema`` is read at the end even when neither
     branch assigned it — presumably a class-level default exists; confirm.

     :param idstring: The raw identifier to analyse.
     """
     self.identifier = idstring
     self.normalized_id = self.identifier
     # Guard against None / non-string input, for which len() or
     # .isnumeric() would raise.
     if not (self.identifier and isinstance(self.identifier, str)):
         return
     if len(self.identifier) <= 4 or self.identifier.isnumeric():
         return

     # Raw string: '\.' is an invalid escape sequence in a plain literal.
     generic_identifiers_org_pattern = r'^([a-z0-9\._]+):(.+)'
     # idutils check
     self.identifier_schemes = idutils.detect_identifier_schemes(
         self.identifier)
     if self.identifier_schemes:
         # preferred schema: drop the generic 'url' when a more specific
         # scheme was detected as well, e.g. ['doi', 'url']
         if len(self.identifier_schemes) > 1 \
                 and 'url' in self.identifier_schemes:
             self.identifier_schemes.remove('url')
         self.preferred_schema = self.identifier_schemes[0]
         self.normalized_id = idutils.normalize_pid(
             self.identifier, self.preferred_schema)
         self.identifier_url = idutils.to_url(self.identifier,
                                              self.preferred_schema)
     else:
         # identifiers.org check
         self.method = 'identifiers.org'
         idmatch = re.search(generic_identifiers_org_pattern,
                             self.identifier)
         if idmatch:
             found_prefix = idmatch[1]
             found_suffix = idmatch[2]
             if found_prefix in self.IDENTIFIERS_ORG_DATA:
                 registry_entry = self.IDENTIFIERS_ORG_DATA[found_prefix]
                 # the scheme is only accepted when the suffix matches the
                 # registered pattern; URL and normalized id are set for
                 # every known prefix
                 if re.search(registry_entry['pattern'], found_suffix):
                     self.identifier_schemes = [
                         found_prefix, 'identifiers_org'
                     ]
                     self.preferred_schema = found_prefix
                 self.identifier_url = str(
                     registry_entry['url_pattern']).replace(
                         '{$id}', found_suffix)
                 self.normalized_id = found_prefix.lower(
                 ) + ':' + found_suffix

     if self.preferred_schema in Mapper.VALID_PIDS.value \
             or self.preferred_schema in self.IDENTIFIERS_ORG_DATA:
         self.is_persistent = True
Example #29
0
    def validate_data(self, data):
        """Validate identifier and scheme.

        Fails when the identifier is missing, when idutils detects no scheme
        for it, or when the provided scheme is not among the detected ones.

        :raises ValidationError: attached to the ``identifier`` field.
        """
        def reject(message):
            # every failure is reported against the `identifier` field
            raise ValidationError(message, field_names=['identifier'])

        pid = data.get('identifier')
        given_scheme = data.get('scheme')
        if not pid:
            reject('Identifier is required.')

        detected = idutils.detect_identifier_schemes(pid)
        if not detected:
            reject('Not a valid persistent identifier.')
        if given_scheme not in detected:
            reject('Not a valid {0} identifier.'.format(given_scheme))
Example #30
0
    def load_scheme(self, data, **kwargs):
        """Set ``data["scheme"]`` for the identifier in ``data``.

        Without an identifier ``data`` is returned untouched. Otherwise the
        candidate schemes are the lower-cased given scheme, or, if none was
        given, the schemes detected by idutils. The candidates are passed to
        ``self._intersect_with_order``; a truthy result becomes the scheme,
        a falsy one removes the ``scheme`` key so that the later validation
        step fails.
        """
        identifier = data.get("identifier")
        if identifier:
            given = data.get("scheme")
            if given:
                # an explicitly given scheme is the only candidate
                candidates = [given.lower()]
            else:
                candidates = idutils.detect_identifier_schemes(identifier)

            # check if given or any detected is allowed
            matched = self._intersect_with_order(candidates)
            if matched:
                data["scheme"] = matched
            else:
                # no match between detected and allowed
                data.pop("scheme", None)

        return data
Example #31
0
    def validate_identifier(self, data, **kwargs):
        """Validate the identifier format and scheme.

        A missing identifier is only an error when ``self.required`` is set.
        For a present identifier the scheme must be given, must be among the
        schemes idutils detects for the identifier, and must be allowed
        (``allow_all`` or listed in ``allowed_schemes``).

        :raises ValidationError: on the first failing check.
        """
        identifier = data.get("identifier")
        scheme = data.get("scheme")

        if not identifier:
            # If required
            if self.required:
                raise ValidationError("Missing required identifier.")
            return

        detected_schemes = idutils.detect_identifier_schemes(identifier)

        # A scheme should be present at this stage detected or provided
        if not scheme:
            raise ValidationError("Missing required scheme.")

        # Check if identifier is valid according to scheme.
        if scheme not in detected_schemes:
            raise ValidationError("Invalid identifier format or scheme.")

        # Check if scheme is allowed
        if self.allow_all or scheme in self.allowed_schemes:
            return
        raise ValidationError("Scheme not allowed. Must be "
                              f"one of {self.allowed_schemes}.")
Example #32
0
def test_detect_schemes():
    """Test scheme detection."""
    # each fixture row: (value, expected schemes, normalized value, URL);
    # only the first two columns matter here
    for value, expected, _normalized, _url in identifiers:
        assert idutils.detect_identifier_schemes(value) == expected
Example #33
0
 def check_scheme(self, data):
     """Validate the provided identifier scheme.

     NOTE(review): as visible here, the lower-cased scheme and the detected
     schemes are computed but never compared, so nothing is rejected —
     the comparison appears to be missing from this excerpt; confirm.
     """
     identifier = data['value']
     declared = data['scheme'].lower()
     detected = idutils.detect_identifier_schemes(identifier)
 def __call__(self, form, field):
     """Validate.

     :raises ValidationError: with ``self.message`` when idutils detects no
         identifier scheme for ``field.data``.
     """
     if idutils.detect_identifier_schemes(field.data):
         return
     raise ValidationError(self.message)
Example #35
0
    # Fragment of a README regeneration routine; the enclosing definition and
    # the creation of `intro` lie outside this excerpt.
    # Collect every README line that precedes the first table row.
    with open("README.md", "r") as infile:
        for line in infile:
            if line.startswith("|"):  # start of table
                break
            else:
                intro.append(line)

    # generate the new README
    # NOTE(review): `records` is not used anywhere in the visible code.
    records = list()
    with open("maverefs.tsv", mode="r", newline="") as infile, open("README.md", mode="w", newline="") as outfile:
        # rewrite the introductory text
        for line in intro:
            print(line, end="", file=outfile)

        # regenerate the table line-by-line
        reader = csv.DictReader(infile, delimiter="\t")
        # "|" as delimiter and "|\n" as line terminator; the leading "|" of
        # each row is printed separately before the writer emits the row
        writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames, delimiter="|", lineterminator="|\n")
        print("|", end="", file=outfile)
        writer.writeheader()
        # separator row between the table header and the body
        print("|---" * len(reader.fieldnames) + "|\n", end="", file=outfile)
        for row in reader:
            # turn recognised identifiers into markdown links
            if idutils.is_pmid(row["PMID"]):
                row["PMID"] = format_md_link(row["PMID"], "pmid")
            if idutils.is_doi(row["DOI"]):
                row["DOI"] = format_md_link(row["DOI"], "doi")
            raw_data_schemes = idutils.detect_identifier_schemes(row["Raw Data"])
            if len(raw_data_schemes) == 1:  # uniquely identified the raw data
                row["Raw Data"] = format_md_link(row["Raw Data"], raw_data_schemes[0])
            print("|", end="", file=outfile)
            writer.writerow(row)
 def __call__(self, form, field):
     """Validate.

     Fails with ``self.message`` unless at least one identifier scheme is
     detected for ``field.data``.
     """
     detected = idutils.detect_identifier_schemes(field.data)
     has_scheme = bool(detected)
     if not has_scheme:
         raise ValidationError(self.message)
Example #37
0
def test_idempotence():
    """Test persistent id normalization."""
    # a value normalized for its primary scheme must still be detected as
    # belonging to that scheme
    for value, expected_schemes, _normalized, _url in identifiers:
        primary = expected_schemes[0]
        normalized = idutils.normalize_pid(value, primary)
        assert primary in idutils.detect_identifier_schemes(normalized)
Example #38
0
    def evaluate(self):
        """Assess metric FsF-F1-02D: the identifier is persistent and resolves.

        The identifier is turned into a URL (via its PID scheme, or used
        directly when its scheme is ``'url'``) and requested through
        ``RequestHelper``. On HTTP 200 the ``Link`` response header is parsed
        into ``self.fuji.signposting_header_links``; if no PID scheme is
        known yet, a ``cite-as`` signposting link may provide one. The
        landing page URL, origin and HTML are stored on ``self.fuji``.
        The test passes when a PID scheme is known; the full score is earned
        only if the landing page was accessible as well.
        """
        self.result = Persistence(id=self.fuji.count,
                                  metric_identifier=self.metric_identifier,
                                  metric_name=self.metric_name)
        self.output = PersistenceOutput()
        # ======= CHECK IDENTIFIER PERSISTENCE =======
        self.logger.info(
            'FsF-F1-02D : PID schemes-based assessment supported by the assessment service - {}'
            .format(Mapper.VALID_PIDS.value))

        # Stays None when the identifier is neither a PID nor a plain URL;
        # in that case no request is made below.
        check_url = None
        if self.fuji.pid_scheme is not None:
            check_url = idutils.to_url(self.fuji.id,
                                       scheme=self.fuji.pid_scheme)
        elif self.fuji.id_scheme == 'url':
            check_url = self.fuji.id

        # ======= RETRIEVE METADATA FROM LANDING PAGE =======
        r = None
        signposting_pid = None
        if check_url is not None:
            requestHelper = RequestHelper(check_url, self.logger)
            requestHelper.setAcceptType(AcceptTypes.html)  # request
            neg_source, self.fuji.extruct_result = requestHelper.content_negotiate(
                'FsF-F1-02D')
            r = requestHelper.getHTTPResponse()
        if r:
            self.fuji.landing_url = requestHelper.redirect_url
            if r.status == 200:
                # identify signposting links in header
                header_link_string = requestHelper.getHTTPResponse().getheader(
                    'Link')
                if header_link_string is not None:
                    self.logger.info(
                        'FsF-F1-02D : Found signposting links in response header of landingpage'
                    )

                    for preparsed_link in header_link_string.split(','):
                        found_type, type_match = None, None
                        found_rel, rel_match = None, None
                        # '<url>; rel="..."; type="..."' -> url + properties
                        parsed_link = preparsed_link.strip().split(';')
                        found_link = parsed_link[0].strip()
                        for link_prop in parsed_link[1:]:
                            if str(link_prop).startswith('rel="'):
                                rel_match = re.search('rel=\"(.*?)\"',
                                                      link_prop)
                            elif str(link_prop).startswith('type="'):
                                type_match = re.search('type=\"(.*?)\"',
                                                       link_prop)
                        if type_match:
                            found_type = type_match[1]
                        if rel_match:
                            found_rel = rel_match[1]
                        signposting_link_dict = {
                            # strip the enclosing '<' and '>'
                            'url': found_link[1:-1],
                            'type': found_type,
                            'rel': found_rel
                        }
                        if found_link:
                            self.fuji.signposting_header_links.append(
                                signposting_link_dict)

                #check if there is a cite-as signposting link
                if self.fuji.pid_scheme is None:
                    signposting_pid_link = self.fuji.get_signposting_links(
                        'cite-as')
                    if signposting_pid_link:
                        signposting_pid = signposting_pid_link[0].get('url')
                    if signposting_pid:
                        # the whole URL string is inspected, not only its
                        # first character
                        found_ids = idutils.detect_identifier_schemes(
                            signposting_pid)
                        if len(found_ids) > 1:
                            if 'url' in found_ids:
                                found_ids.remove('url')
                            found_id = found_ids[0]
                            if found_id in Mapper.VALID_PIDS.value:
                                self.logger.info(
                                    'FsF-F1-02D : Found object identifier in signposting header links'
                                )
                                self.fuji.pid_scheme = found_id

                up = urlparse(self.fuji.landing_url)
                self.fuji.landing_origin = '{uri.scheme}://{uri.netloc}'.format(
                    uri=up)
                self.fuji.landing_html = requestHelper.getResponseContent()

                self.output.resolved_url = self.fuji.landing_url  # url is active, although the identifier is not based on a pid scheme
                self.output.resolvable_status = True
                self.logger.info(
                    'FsF-F1-02D : Object identifier active (status code = 200)'
                )
                self.fuji.isMetadataAccessible = True
            else:
                # any non-200 status (including 401/402/403) means the
                # metadata is not accessible
                self.fuji.isMetadataAccessible = False
                self.logger.warning(
                    "Resource inaccessible, identifier returned http status code: {code}"
                    .format(code=r.status))
        else:
            self.fuji.isMetadataAccessible = False
            self.logger.warning(
                "FsF-F1-02D :Resource inaccessible, no response received from: {}"
                .format(check_url))

        if self.fuji.pid_scheme is not None:
            # short_pid = id.normalize_pid(self.id, scheme=pid_scheme)
            if signposting_pid is None:
                self.fuji.pid_url = idutils.to_url(self.fuji.id,
                                                   scheme=self.fuji.pid_scheme)
            else:
                self.fuji.pid_url = signposting_pid
            self.output.pid_scheme = self.fuji.pid_scheme
            self.result.test_status = 'pass'
            self.output.pid = self.fuji.pid_url
            self.setEvaluationCriteriumScore('FsF-F1-02D-1', 0, 'pass')
            if self.fuji.isMetadataAccessible:
                self.setEvaluationCriteriumScore('FsF-F1-02D-2', 1, 'pass')
                self.score.earned = self.total_score  # idenfier should be based on a persistence scheme and resolvable

            self.logger.log(
                self.fuji.LOG_SUCCESS,
                'FsF-F1-02D : Persistence identifier scheme - {}'.format(
                    self.fuji.pid_scheme))
        else:
            self.score.earned = 0
            self.logger.warning(
                'FsF-F1-02D : Not a persistent identifier scheme - {}'.format(
                    self.fuji.id_scheme))

        self.result.score = self.score
        self.result.metric_tests = self.metric_tests
        self.result.output = self.output
Example #39
0
def test_idempotence():
    """Check that a normalized identifier is still detected as its primary scheme.

    Each entry of ``identifiers`` is unpacked as (identifier, expected
    schemes, normalized value, url value); only the first two are used here.
    """
    for raw_id, schemes, _normalized, _url in identifiers:
        # Normalize against the first expected scheme, then make sure
        # detection on the normalized form still reports that scheme.
        normalized = idutils.normalize_pid(raw_id, schemes[0])
        detected = idutils.detect_identifier_schemes(normalized)
        assert schemes[0] in detected
Example #40
0
    def retrieve_metadata_embedded(self, extruct_metadata):
        """Collect metadata embedded in the landing page and merge it.

        The sources are processed in this order: microdata, RDFa (parsed
        from ``self.landing_html``), Schema.org JSON-LD, Dublin Core (only
        while ``self.reference_elements`` is non-empty) and OpenGraph. For
        every source, values whose key is still listed in
        ``self.reference_elements`` are copied into ``self.metadata_merged``
        and the key is removed from ``self.reference_elements``, so the
        first source providing an element wins.

        Afterwards signposting links and typed HTML links (``rel=item``) are
        used to fill ``object_content_identifier`` if it is still missing.
        Finally, when an ``object_identifier`` was found and no PID scheme is
        set yet, the identifier is checked again and, if it uses a valid PID
        scheme, ``self.repeat_pid_check``, ``self.pid_scheme`` and
        ``self.id`` are updated.

        :param extruct_metadata: mapping queried for the keys ``'microdata'``,
            ``'json-ld'`` and ``'opengraph'`` (presumably the result of an
            extruct extraction -- confirm against the caller).
        :return: None; results are stored on the instance.
        """

        def merge_reference_elements(found):
            # Copy only elements that no earlier source has provided yet.
            for key in found.keys():
                if key in self.reference_elements:
                    self.metadata_merged[key] = found[key]
                    self.reference_elements.remove(key)

        # ========= retrieve embedded microdata metadata ========
        micro_meta = extruct_metadata.get('microdata')
        microdata_collector = MetaDataCollectorMicroData(
            loggerinst=self.logger,
            sourcemetadata=micro_meta,
            mapping=Mapper.MICRODATA_MAPPING)
        source_micro, micro_dict = microdata_collector.parse_metadata()
        if micro_dict:
            self.metadata_sources.append((source_micro, 'embedded'))
            self.namespace_uri.extend(microdata_collector.getNamespaces())
            micro_dict = self.exclude_null(micro_dict)
            merge_reference_elements(micro_dict)
            self.logger.log(
                self.LOG_SUCCESS, 'FsF-F2-01M : Found microdata metadata: ' +
                str(micro_dict.keys()))

        # ================== RDFa
        rdfasource = MetaDataCollector.Sources.RDFA.value
        try:
            rdfagraph = rdflib.Graph()
            rdfagraph.parse(data=self.landing_html, format='rdfa')
            rdfa_collector = MetaDataCollectorRdf(loggerinst=self.logger,
                                                  target_url=self.landing_url,
                                                  source=rdfasource,
                                                  rdf_graph=rdfagraph)
            _, rdfa_dict = rdfa_collector.parse_metadata()
            # NOTE(review): unlike the other collectors, the RDFa source is
            # registered even when rdfa_dict turns out to be empty -- confirm
            # whether this is intended.
            self.metadata_sources.append((rdfasource, 'embedded'))
            self.namespace_uri.extend(rdfa_collector.getNamespaces())
            rdfa_dict = self.exclude_null(rdfa_dict)
            merge_reference_elements(rdfa_dict)
            self.logger.log(
                self.LOG_SUCCESS,
                'FsF-F2-01M : Found RDFa metadata: ' + str(rdfa_dict.keys()))
        except Exception:
            # Best effort: RDFa parsing problems must not abort the
            # assessment. ``Exception`` (rather than a bare ``except``) keeps
            # KeyboardInterrupt and SystemExit propagating.
            self.logger.info(
                'FsF-F2-01M : RDFa metadata parsing exception, probably no RDFa embedded in HTML'
            )

        # ========= retrieve schema.org (embedded, or from via content-negotiation if pid provided) =========
        ext_meta = extruct_metadata.get('json-ld')
        target_url = self.pid_url if self.use_datacite is True else self.landing_url

        schemaorg_collector = MetaDataCollectorSchemaOrg(
            loggerinst=self.logger,
            sourcemetadata=ext_meta,
            mapping=Mapper.SCHEMAORG_MAPPING,
            pidurl=target_url)
        source_schemaorg, schemaorg_dict = schemaorg_collector.parse_metadata()
        schemaorg_dict = self.exclude_null(schemaorg_dict)
        if schemaorg_dict:
            self.namespace_uri.extend(schemaorg_collector.namespaces)
            if source_schemaorg == MetaDataCollector.Sources.SCHEMAORG_EMBED.value:
                self.metadata_sources.append((source_schemaorg, 'embedded'))
            else:
                self.metadata_sources.append((source_schemaorg, 'negotiated'))
            if schemaorg_dict.get('related_resources'):
                self.related_resources.extend(
                    schemaorg_dict.get('related_resources'))
            if schemaorg_dict.get('object_content_identifier'):
                self.logger.info(
                    'FsF-F3-01M : Found data links in Schema.org metadata : ' +
                    str(schemaorg_dict.get('object_content_identifier')))
            # add object type for future reference
            merge_reference_elements(schemaorg_dict)
            self.logger.log(
                self.LOG_SUCCESS, 'FsF-F2-01M : Found Schema.org metadata: ' +
                str(schemaorg_dict.keys()))
        else:
            self.logger.info('FsF-F2-01M : Schema.org metadata UNAVAILABLE')

        # ========= retrieve dublin core embedded in html page =========
        # Only consulted while some reference elements are still missing.
        if self.reference_elements:
            self.logger.info('FsF-F2-01M : Checking for DublinCore metadata')
            dc_collector = MetaDataCollectorDublinCore(
                loggerinst=self.logger,
                sourcemetadata=self.landing_html,
                mapping=Mapper.DC_MAPPING)
            source_dc, dc_dict = dc_collector.parse_metadata()
            dc_dict = self.exclude_null(dc_dict)
            if dc_dict:
                self.namespace_uri.extend(dc_collector.namespaces)
                self.metadata_sources.append((source_dc, 'embedded'))
                if dc_dict.get('related_resources'):
                    self.related_resources.extend(
                        dc_dict.get('related_resources'))
                merge_reference_elements(dc_dict)
                self.logger.log(
                    self.LOG_SUCCESS,
                    'FsF-F2-01M : Found DublinCore metadata: ' +
                    str(dc_dict.keys()))
            else:
                self.logger.info(
                    'FsF-F2-01M : DublinCore metadata UNAVAILABLE')

        # ======== retrieve OpenGraph metadata
        ext_meta = extruct_metadata.get('opengraph')
        opengraph_collector = MetaDataCollectorOpenGraph(
            loggerinst=self.logger,
            sourcemetadata=ext_meta,
            mapping=Mapper.OG_MAPPING)
        source_opengraph, opengraph_dict = opengraph_collector.parse_metadata()
        opengraph_dict = self.exclude_null(opengraph_dict)
        if opengraph_dict:
            self.namespace_uri.extend(opengraph_collector.namespaces)
            self.metadata_sources.append((source_opengraph, 'embedded'))
            merge_reference_elements(opengraph_dict)
            self.logger.log(
                self.LOG_SUCCESS, 'FsF-F2-01M : Found OpenGraph metadata: ' +
                str(opengraph_dict.keys()))
        else:
            # This branch reports on OpenGraph, so the message names it.
            self.logger.info('FsF-F2-01M : OpenGraph metadata UNAVAILABLE')

        # ========= retrieve signposting data links
        data_sign_links = self.get_signposting_links('item')
        if data_sign_links:
            self.logger.info(
                'FsF-F3-01M : Found data links in response header (signposting) : '
                + str(len(data_sign_links)))
            if self.metadata_merged.get('object_content_identifier') is None:
                self.metadata_merged[
                    'object_content_identifier'] = data_sign_links

        # ========= retrieve typed data object links =========
        data_meta_links = self.get_html_typed_links(rel='item')
        if data_meta_links:
            self.logger.info(
                'FsF-F3-01M : Found data links in HTML head (link rel=item) : '
                + str(len(data_meta_links)))
            if self.metadata_merged.get('object_content_identifier') is None:
                self.metadata_merged[
                    'object_content_identifier'] = data_meta_links

        # Now if an identifier has been detected in the metadata, the check
        # for a persistent identifier potentially has to be repeated.
        object_identifier = self.metadata_merged.get('object_identifier')
        if object_identifier:
            # When several identifiers are given only the first one is tested.
            if isinstance(object_identifier, list):
                identifiertotest = object_identifier[0]
            else:
                identifiertotest = object_identifier
            if self.pid_scheme is None:
                found_pids_in_metadata = idutils.detect_identifier_schemes(
                    identifiertotest)
                # NOTE(review): the "> 1" threshold skips identifiers for
                # which exactly one scheme is detected -- confirm this is
                # intended before relaxing it (with a single 'url' entry the
                # list would be empty after the removal below).
                if len(found_pids_in_metadata) > 1:
                    if 'url' in found_pids_in_metadata:
                        found_pids_in_metadata.remove('url')
                    found_id = found_pids_in_metadata[0]
                    if found_id in Mapper.VALID_PIDS.value:
                        self.logger.info(
                            'FsF-F2-01M : Found object identifier in metadata, repeating PID check for FsF-F1-02D'
                        )
                        self.logger.log(
                            self.LOG_SUCCESS,
                            'FsF-F1-02D : Found object identifier in metadata during FsF-F2-01M, PID check was repeated'
                        )
                        self.repeat_pid_check = True
                        self.pid_scheme = found_id
                        self.id = identifiertotest
Example #41
0
 def get_scheme(self, obj):
     """Return the identifier scheme for *obj*, or ``""`` if none is known.

     An explicit, non-empty ``obj['scheme']`` is returned as is. Otherwise,
     if ``obj['identifier']`` is set, the scheme is detected from it and the
     first detected scheme is used.

     :param obj: mapping that may contain ``'scheme'`` and ``'identifier'``.
     :return: the scheme name, or an empty string when no scheme is given
         and none can be detected.
     """
     scheme = obj.get('scheme')
     if not scheme and obj.get('identifier'):
         # Detection can come back empty for unrecognised identifiers;
         # fall back to "" instead of raising IndexError on [0].
         detected = idutils.detect_identifier_schemes(obj['identifier'])
         scheme = detected[0] if detected else None
     return scheme or ""
Example #42
0
def test_detect_schemes():
    """Verify that each sample identifier is detected as exactly its expected schemes.

    Each entry of ``identifiers`` is unpacked as (identifier, expected
    schemes, normalized value, url value); only the first two are used here.
    """
    for raw_id, wanted, _normalized, _url in identifiers:
        detected = idutils.detect_identifier_schemes(raw_id)
        assert detected == wanted