Esempio n. 1
0
    def test_u2asc(self):
        """
        Check adsputils.u2asc on accented input: both a plain and a
        unicode literal must come back in their ASCII form, and UTF-16
        encoded bytes must raise UnicodeHandlerError.
        """
        cases = (
            ('benìtez, n', 'benitez, n'),
            (u'izzet, sakallı', u'izzet, sakalli'),
        )

        # transliterate everything first, then compare
        converted = [adsputils.u2asc(raw) for raw, _ in cases]
        for actual, (_, expected) in zip(converted, cases):
            self.assertEqual(actual, expected)

        utf16_bytes = cases[1][0].encode('utf16')
        self.assertRaises(UnicodeHandlerError, adsputils.u2asc, utf16_bytes)
 def get_author_init(self, namestring):
     """
     Return the author's initial derived from ``namestring``.

     The name is HTML-unescaped and passed through ``u2asc``; the first
     character of the result is returned upper-cased when it is
     alphabetic, otherwise '.' is returned.

     :param: namestring - String, the author name
     :return: single upper-case letter, or '.'
     :raises BadAuthorInitialException: wraps any exception raised while
             processing the name (including an empty result)
     """
     try:
         first_char = u2asc(html.unescape(namestring))[0]
         initial = first_char.upper() if first_char.isalpha() else '.'
     except Exception as err:
         raise BadAuthorInitialException(err)
     return initial
Esempio n. 3
0
    def harvest_author_info(self, orcidid, name=None, facts=None):
        """
        Does the hard job of querying public and private
        API's for whatever information we want to collect
        about the ORCID ID;

        At this stage, we want to mainly retrieve author
        names (ie. variations of the author name). Three sources are
        consulted in order: the public ORCID profile, our own search API
        (publisher-populated ``orcid_pub`` field only) and the ADS ORCID
        profile.

        :param: orcidid - String
        :param: name - String, name of the author (optional; not used by
                the current implementation)
        :param: facts - dict, info about the author (not used by the
                current implementation)

        :return: dict with various keys: name, author, author_norm,
                orcid_name, short_name, ascii_name, authorized,
                current_affiliation (each only if available)
        :raises Exception: when our own search API does not answer with
                HTTP 200 (the response text is the exception message)
        """

        author_data = {}

        # first verify the public ORCID profile
        j = self.get_public_orcid_profile(orcidid)
        if j is None:
            self.logger.error(
                'We cant verify public profile of: http://orcid.org/%s' %
                orcidid)
        else:
            # no need to check ORCID API version here; this is always fresh and must use current API
            # j['person']['name']['family-name']
            if 'person' in j and 'name' in j['person'] and \
                'family-name' in j['person']['name'] and \
                'given-names' in j['person']['name']:

                # the key may be present but hold None, hence the `or {}`
                fname = (j['person']['name'].get('family-name', {})
                         or {}).get('value', None)
                gname = (j['person']['name'].get('given-names', {})
                         or {}).get('value', None)

                if fname and gname:
                    author_data['orcid_name'] = ['%s, %s' % (fname, gname)]
                    author_data['name'] = author_data['orcid_name'][0]

        # search for the orcidid in our database (but only the publisher populated fields)
        # we can't trust other fields to bootstrap our database
        r = requests.get(
                    '%(endpoint)s?q=%(query)s&fl=author,author_norm,orcid_pub&rows=100&sort=pubdate+desc' % \
                    {
                     'endpoint': self._config.get('API_SOLR_QUERY_ENDPOINT'),
                     'query' : 'orcid_pub:%s' % names.cleanup_orcidid(orcidid),
                    },
                    headers={'Authorization': 'Bearer %s' % self._config.get('API_TOKEN')})

        if r.status_code != 200:
            self.logger.error(
                'Failed getting data from our own API! (err: %s)' %
                r.status_code)
            raise Exception(r.text)

        # go through the documents and collect all the names that correspond to the ORCID
        # master_set maps field name -> {cleaned name: number of occurrences}
        master_set = {}
        for doc in r.json()['response']['docs']:
            for k, v in names.extract_names(orcidid, doc).items():
                if v:
                    counts = master_set.setdefault(k, {})
                    n = names.cleanup_name(v)
                    # `in` instead of dict.has_key(), which no longer exists on Python 3
                    if n not in counts:
                        counts[n] = 0
                    counts[n] += 1

        # get ADS data about the user
        # 0000-0003-3052-0819 | {"authorizedUser": true, "currentAffiliation": "Australian Astronomical Observatory", "nameVariations": ["Green, Andrew W.", "Green, Andy", "Green, Andy W."]}

        r = self.get_ads_orcid_profile(orcidid)
        if r:
            _author = r
            _info = _author.get('info', {}) or {}
            if _info.get('authorizedUser', False):
                author_data['authorized'] = True
            if _info.get('currentAffiliation', False):
                author_data['current_affiliation'] = _info[
                    'currentAffiliation']
            _vars = _info.get('nameVariations', None)
            if _vars:
                author_counts = master_set.setdefault('author', {})
                for x in _vars:
                    # a user supplied variation counts once unless it was
                    # already seen in the search results
                    author_counts.setdefault(names.cleanup_name(x), 1)

        # elect the most frequent name to become the 'author name'
        # TODO: this will choose the normalized names (as that is shorter)
        # maybe we should choose the longest (but it is not too important
        # because the matcher will be checking all name variants during
        # record update)
        mx = 0
        for k, v in master_set.items():
            author_data[k] = sorted(v)
            for candidate, freq in v.items():
                if freq > mx:
                    # remember the best count so far; without this the last
                    # name seen would win regardless of its frequency
                    mx = freq
                    author_data['name'] = candidate

        # automatically add the short names, because they make us find
        # more matches
        short_names = set()
        for x in ('author', 'orcid_name', 'author_norm'):
            if x in author_data and author_data[x]:
                for full_name in author_data[x]:
                    for variant in names.build_short_forms(full_name):
                        short_names.add(variant)
        if short_names:
            author_data['short_name'] = sorted(short_names)

        # Create the transliterated/ascii form of the name, in case there are accented Unicode characters
        asc_names = set()
        for x in ('author', 'orcid_name', 'author_norm', 'short_name'):
            if x in author_data and author_data[x]:
                for full_name in author_data[x]:
                    asc_names.add(u2asc(full_name))
        if asc_names:
            author_data['ascii_name'] = sorted(asc_names)

        return author_data
Esempio n. 4
0
 def get_author_init(self,namestring):
     """
     Return the first alphabetic character of ``u2asc(namestring)``,
     or u'.' when the converted name contains no alphabetic character.
     """
     ascii_name = u2asc(namestring)
     return next((ch for ch in ascii_name if ch.isalpha()), u'.')
Esempio n. 5
0
def update_record(rec, claim, min_levenshtein):
    """
    update the ADS Record; we'll add ORCID information into it
    (at the correct position)

    The claim's orcidid is first removed from every claims list, then
    re-inserted at the position of the matching author (or '-' is written
    there when the claim status is 'removed'). Matching is attempted by
    exact (cleaned, lower-cased) name first, then by Levenshtein ratio
    through find_orcid_position.

    :param: rec - JSON structure, it contains metadata; we expect
            it to have 'authors' field, and 'claims' field

    :param: claim - JSON structure, it contains claim data,
            especially:
                orcidid
                author
                author_norm
            We use those field to find out which author made the
            claim.

    :param: min_levenshtein - minimum similarity ratio, passed through
            to find_orcid_position

    :return: tuple(claim_category, position) when an author position was
        updated; ('removed', -1) when the orcidid was only removed;
        None if the record was not modified
    """
    assert (isinstance(rec, dict))
    assert (isinstance(claim, dict))
    assert ('authors' in rec)
    assert ('claims' in rec)
    assert (isinstance(rec['authors'], list))

    claims = rec.get('claims', {})
    rec['claims'] = claims
    authors = rec.get('authors', [])

    # make sure the claims have the necessary structure
    fld_name = u'unverified'
    if 'account_id' in claim and claim[
            'account_id']:  # the claim was made by ADS verified user
        fld_name = u'verified'

    num_authors = len(authors)

    if fld_name not in claims or claims[fld_name] is None:
        claims[fld_name] = ['-'] * num_authors
    elif len(claims[fld_name]) < num_authors:  # check the length is correct
        claims[fld_name] += ['-'] * (num_authors - len(claims[fld_name]))
    elif len(claims[fld_name]) > num_authors:
        claims[fld_name] = claims[fld_name][0:num_authors]

    # always remove the orcidid
    modified = False
    orcidid = claim['orcidid']
    for v in claims.values():
        if v is None:
            # other categories may be unset, just like claims[fld_name] above
            continue
        while orcidid in v:
            v[v.index(orcidid)] = '-'
            modified = True

    # value written at the matched position: '-' retracts the claim
    if claim.get('status', 'created') == 'removed':
        new_value = '-'
    else:
        new_value = orcidid

    variant_keys = ('author', 'orcid_name', 'author_norm', 'short_name',
                    'ascii_name')

    # first check to see if there's an exact name match on the appropriate keys
    claims_clean = set()
    for key in variant_keys:
        for variant in claim.get(key, []):
            if variant.strip():
                try:
                    claims_clean.add(
                        names.cleanup_name(variant).lower().encode('utf-8'))
                except RuntimeError:
                    # don't add a blank variant to the set
                    continue

    # enumerate keeps the index aligned with rec['authors'] even when a
    # blank author is skipped
    for aidx, author in enumerate(rec['authors']):
        try:
            author_clean = names.cleanup_name(author).lower().encode('utf8')
        except RuntimeError:
            # a blank name cannot match anything
            continue
        matched = author_clean in claims_clean
        if not matched:
            # also try the transliterated/ascii form of the author name;
            # claims_clean holds UTF-8 bytes, so normalise the result to
            # bytes as well (find_orcid_position has to do the same on
            # Python 3)
            author_ascii = u2asc(author_clean)
            if not isinstance(author_ascii, bytes):
                author_ascii = author_ascii.encode('utf-8')
            matched = author_ascii in claims_clean
        if matched:
            claims[fld_name][aidx] = new_value
            return (fld_name, aidx)

    # if there is no exact match, try on Levenshtein distance, searching using descending priority
    for fx in variant_keys:
        if fx in claim and claim[fx]:
            assert (isinstance(claim[fx], list))
            idx = find_orcid_position(rec['authors'],
                                      claim[fx],
                                      min_levenshtein=min_levenshtein)
            if idx > -1:
                if idx >= num_authors:
                    logger.error(
                        u'Index is beyond list boundary: \n'
                        u'Field {fx}, author {author}, len(authors)={la}, len({fx})={lfx}'
                        .format(fx=fx,
                                author=claim[fx],
                                la=num_authors,
                                lfx=len(claim[fx])))
                    continue

                claims[fld_name][idx] = new_value
                return (fld_name, idx)

    if modified:
        return ('removed', -1)
    return None
Esempio n. 6
0
def find_orcid_position(authors_list, name_variants, min_levenshtein=0.9):
    """
    Find which entry of ``authors_list`` best matches one of the
    name variants of a single author.

    Every (author, variant) pair is scored with Levenshtein.ratio, both
    on the cleaned name and on its u2asc form; the best scoring pair
    wins. A best score below ``min_levenshtein`` is still accepted when
    one of the two names is contained in the other.

    :param authors_list - array of names that will be searched
    :param name_variants - array of names of a single author
    :param min_levenshtein - minimum ratio required for a match

    :return index (int) of the matching author in authors_list, or -1
        when nothing matches or the author list contains a blank name
    """

    def _printable(raw):
        # on Python 2 the cleaned names are byte strings and are decoded
        # before being interpolated into the unicode log message
        if sys.version_info < (3, ):
            return unicode(raw, 'utf-8')
        return raw

    try:
        cleaned_authors = [
            names.cleanup_name(entry).lower().encode('utf8')
            for entry in authors_list
        ]
    except RuntimeError:
        logger.error('Blank author present in author list: %s' % authors_list)
        return -1

    # compute similarity between all authors (and the supplied variants)
    # this is not very efficient, however the lists should be small
    # and short, so 3000 operations take less than 1s)
    plain_scores = []
    ascii_scores = []
    usable_variants = []
    for raw_variant in name_variants:
        try:
            variant = names.cleanup_name(raw_variant).lower().encode('utf8')
        except RuntimeError:
            # don't accept a blank name
            continue
        usable_variants.append(variant)
        variant_pos = len(usable_variants) - 1
        if not variant.strip():
            continue
        for author_pos, author in enumerate(cleaned_authors):
            plain = (Levenshtein.ratio(author, variant), author_pos,
                     variant_pos)
            plain_scores.append(plain)
            # score the transliterated/ascii form as well when it differs
            author_ascii = u2asc(author)
            if author_ascii == author:
                ascii_scores.append(plain)
                continue
            if sys.version_info > (3, ):
                author_ascii = author_ascii.encode()
            ascii_scores.append((Levenshtein.ratio(author_ascii, variant),
                                 author_pos, variant_pos))

    if not plain_scores:
        return -1

    # highest ratio first
    plain_scores.sort(key=lambda item: item[0], reverse=True)
    ascii_scores.sort(key=lambda item: item[0], reverse=True)

    # if transliterated forms have a higher Lev ratio, accept the transliterated form
    best = plain_scores[0]
    if ascii_scores[0][0] > best[0]:
        best = ascii_scores[0]
    best_pos = best[1]

    if best[0] >= min_levenshtein:
        logger.debug('Found match: %s (min_levenstein=%s), authors=%s',
                     authors_list[best_pos], min_levenshtein, authors_list)
        return best_pos

    # test submatch (0.6470588235294118, 19, 0) (required:0.69) closest: vernetto, s, variant: vernetto, silvia teresa
    author_name = cleaned_authors[best_pos]
    variant_name = usable_variants[best[2]]
    details = (best, min_levenshtein, _printable(author_name),
               _printable(variant_name))
    if author_name in variant_name or variant_name in author_name:
        logger.debug(
            u'Using submatch for: %s (required:%s) closest: %s, variant: %s'
            % details)
        return best_pos

    logger.debug(
        u'No match found: the closest is: %s (required:%s) closest: %s, variant: %s'
        % details)
    return -1