def test_u2asc(self):
    input1 = 'benìtez, n'
    input2 = u'izzet, sakallı'

    output1 = adsputils.u2asc(input1)
    output2 = adsputils.u2asc(input2)

    self.assertEqual(output1, 'benitez, n')
    self.assertEqual(output2, u'izzet, sakalli')

    input3 = input2.encode('utf16')
    self.assertRaises(UnicodeHandlerError, adsputils.u2asc, input3)
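# A minimal sketch of the transliteration exercised above. Whether
# adsputils.u2asc actually wraps the `unidecode` package is an
# assumption; the test only pins the expected outputs.
#
#   from unidecode import unidecode
#   unidecode(u'benìtez, n')      # -> 'benitez, n'
#   unidecode(u'izzet, sakallı')  # -> 'izzet, sakalli'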
def get_author_init(self, namestring):
    try:
        instring = html.unescape(namestring)
        outstring = u2asc(instring)[0]
        if outstring.isalpha():
            return outstring.upper()
    except Exception as err:
        raise BadAuthorInitialException(err)
    else:
        # reached only when no exception occurred and the first
        # character is not alphabetic
        return '.'
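# Usage sketch for get_author_init above: HTML entities are unescaped
# first, then the name is transliterated and the first character checked.
# The inputs are hypothetical and the outputs assume u2asc behaves as in
# test_u2asc:
#
#   get_author_init('D&eacute;sert, J')  # -> 'D' (unescape -> 'Désert, J')
#   get_author_init(u'ìzzet, sakallı')   # -> 'I' (u2asc -> 'izzet, ...')
#   get_author_init('4 site collab.')    # -> '.' ('4' is not alphabetic)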
def harvest_author_info(self, orcidid, name=None, facts=None):
    """
    Does the hard job of querying public and private APIs for whatever
    information we want to collect about the ORCID ID; at this stage, we
    mainly want to retrieve author names (i.e. variations of the author
    name).

    :param: orcidid - String
    :param: name - String, name of the author (optional)
    :param: facts - dict, info about the author

    :return: dict with various keys: name, author, author_norm,
        orcid_name (if available)
    """
    author_data = {}

    # first verify the public ORCID profile
    j = self.get_public_orcid_profile(orcidid)
    if j is None:
        self.logger.error(
            'We can\'t verify the public profile of: http://orcid.org/%s' % orcidid)
    else:
        # no need to check the ORCID API version here; this is always
        # fresh and must use the current API
        # j['person']['name']['family-name']
        if 'person' in j and 'name' in j['person'] and \
                'family-name' in j['person']['name'] and \
                'given-names' in j['person']['name']:
            fname = (j['person']['name'].get('family-name', {}) or {}).get('value', None)
            gname = (j['person']['name'].get('given-names', {}) or {}).get('value', None)
            if fname and gname:
                author_data['orcid_name'] = ['%s, %s' % (fname, gname)]
                author_data['name'] = author_data['orcid_name'][0]

    # search for the orcidid in our database (but only the
    # publisher-populated fields; we can't trust other fields to
    # bootstrap our database)
    r = requests.get(
        '%(endpoint)s?q=%(query)s&fl=author,author_norm,orcid_pub&rows=100&sort=pubdate+desc' % \
        {
            'endpoint': self._config.get('API_SOLR_QUERY_ENDPOINT'),
            'query': 'orcid_pub:%s' % names.cleanup_orcidid(orcidid),
        },
        headers={'Authorization': 'Bearer %s' % self._config.get('API_TOKEN')})

    if r.status_code != 200:
        self.logger.error(
            'Failed getting data from our own API! (err: %s)' % r.status_code)
        raise Exception(r.text)

    # go through the documents and collect all the names that
    # correspond to the ORCID
    master_set = {}
    for doc in r.json()['response']['docs']:
        for k, v in names.extract_names(orcidid, doc).items():
            if v:
                master_set.setdefault(k, {})
                n = names.cleanup_name(v)
                if n not in master_set[k]:
                    master_set[k][n] = 0
                master_set[k][n] += 1

    # get ADS data about the user, e.g.
    # 0000-0003-3052-0819 | {"authorizedUser": true,
    #   "currentAffiliation": "Australian Astronomical Observatory",
    #   "nameVariations": ["Green, Andrew W.", "Green, Andy", "Green, Andy W."]}
    r = self.get_ads_orcid_profile(orcidid)
    if r:
        _author = r
        _info = _author.get('info', {}) or {}
        if _info.get('authorizedUser', False):
            author_data['authorized'] = True
        if _info.get('currentAffiliation', False):
            author_data['current_affiliation'] = _info['currentAffiliation']
        _vars = _info.get('nameVariations', None)
        if _vars:
            master_set.setdefault('author', {})
            for x in _vars:
                x = names.cleanup_name(x)
                # keep an existing frequency count; otherwise seed the
                # variation with a count of 1
                v = master_set['author'].get(x, 1)
                master_set['author'][x] = v

    # elect the most frequent name to become the 'author name'
    # TODO: this will choose the normalized names (as that is shorter)
    # maybe we should choose the longest (but it is not too important
    # because the matcher will be checking all name variants during
    # record update)
    mx = 0
    for k, v in master_set.items():
        author_data[k] = sorted(list(v.keys()))
        for name, freq in v.items():
            if freq > mx:
                mx = freq
                author_data['name'] = name

    # automatically add the short names, because they make us find
    # more matches
    short_names = set()
    for x in ('author', 'orcid_name', 'author_norm'):
        if x in author_data and author_data[x]:
            for name in author_data[x]:
                for variant in names.build_short_forms(name):
                    short_names.add(variant)
    if len(short_names):
        author_data['short_name'] = sorted(list(short_names))

    # create the transliterated/ascii form of the name, in case there
    # are accented Unicode characters
    asc_names = set()
    for x in ('author', 'orcid_name', 'author_norm', 'short_name'):
        if x in author_data and author_data[x]:
            for name in author_data[x]:
                asc_names.add(u2asc(name))
    if len(asc_names):
        author_data['ascii_name'] = sorted(list(asc_names))

    return author_data
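# Illustrative shape of the returned author_data, based on the ADS
# profile example quoted in the comment above (the author_norm,
# short_name and ascii_name values are hypothetical):
#
#   {'name': 'Green, Andrew W.',
#    'orcid_name': ['Green, Andrew W.'],
#    'author': ['Green, Andrew W.', 'Green, Andy', 'Green, Andy W.'],
#    'author_norm': ['Green, A'],
#    'authorized': True,
#    'current_affiliation': 'Australian Astronomical Observatory',
#    'short_name': ['Green, A.', 'Green, A. W.'],
#    'ascii_name': ['Green, A.', 'Green, Andrew W.']}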
def get_author_init(self, namestring):
    output = u2asc(namestring)
    for c in output:
        if c.isalpha():
            return c
    return u'.'
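# Usage sketch for this variant: unlike the get_author_init earlier in
# this section, it scans for the first alphabetic character anywhere in
# the string and preserves its case (hypothetical inputs, assuming u2asc
# as exercised in test_u2asc):
#
#   get_author_init(u"'t Hooft, Gerard")  # -> u't' (skips the apostrophe)
#   get_author_init(u'1234')              # -> u'.'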
def update_record(rec, claim, min_levenshtein):
    """
    Update the ADS record; we'll add ORCID information into it
    (at the correct position).

    :param: rec - JSON structure, it contains metadata; we expect
        it to have 'authors' and 'claims' fields
    :param: claim - JSON structure, it contains claim data,
        especially:
            orcidid
            author
            author_norm
        We use these fields to find out which author made the claim.

    :return: tuple(claim_category, position) or None if no record
        was updated
    """
    assert (isinstance(rec, dict))
    assert (isinstance(claim, dict))
    assert ('authors' in rec)
    assert ('claims' in rec)
    assert (isinstance(rec['authors'], list))

    claims = rec.get('claims', {})
    rec['claims'] = claims
    authors = rec.get('authors', [])

    # make sure the claims have the necessary structure
    fld_name = u'unverified'
    if 'account_id' in claim and claim['account_id']:
        # the claim was made by an ADS verified user
        fld_name = u'verified'

    num_authors = len(authors)

    if fld_name not in claims or claims[fld_name] is None:
        claims[fld_name] = ['-'] * num_authors
    elif len(claims[fld_name]) < num_authors:  # check the length is correct
        claims[fld_name] += ['-'] * (num_authors - len(claims[fld_name]))
    elif len(claims[fld_name]) > num_authors:
        claims[fld_name] = claims[fld_name][0:num_authors]

    # always remove the orcidid
    modified = False
    orcidid = claim['orcidid']
    for v in list(claims.values()):
        while orcidid in v:
            v[v.index(orcidid)] = '-'
            modified = True

    variant_keys = ('author', 'orcid_name', 'author_norm', 'short_name', 'ascii_name')

    # first check to see if there's an exact name match on the appropriate keys
    claims_clean = set()
    for key in variant_keys:
        for variant in claim.get(key, []):
            if bool(variant.strip()):
                try:
                    claims_clean.add(names.cleanup_name(variant).lower().encode('utf-8'))
                except RuntimeError:
                    # don't add a blank variant to the set
                    continue

    for aidx, author in enumerate(rec['authors']):
        try:
            author_clean = names.cleanup_name(author).lower().encode('utf-8')
        except RuntimeError:
            # skip a blank name (but keep the position index aligned)
            continue
        if author_clean in claims_clean:
            claims[fld_name][aidx] = '-' if claim.get('status', 'created') == 'removed' else orcidid
            return (fld_name, aidx)
        # also try the transliterated/ascii form of the author name
        elif u2asc(author_clean) in claims_clean:
            claims[fld_name][aidx] = '-' if claim.get('status', 'created') == 'removed' else orcidid
            return (fld_name, aidx)

    # if there is no exact match, try Levenshtein distance, searching
    # the variant keys in descending priority
    for fx in variant_keys:
        if fx in claim and claim[fx]:
            assert (isinstance(claim[fx], list))
            idx = find_orcid_position(rec['authors'], claim[fx], min_levenshtein=min_levenshtein)
            if idx > -1:
                if idx >= num_authors:
                    logger.error(
                        u'Index is beyond list boundary: \n'
                        u'Field {fx}, author {author}, len(authors)={la}, len({fx})={lfx}'
                        .format(fx=fx, author=claim[fx], la=num_authors, lfx=len(claim[fx])))
                    continue
                claims[fld_name][idx] = '-' if claim.get('status', 'created') == 'removed' else orcidid
                return (fld_name, idx)

    if modified:
        return ('removed', -1)
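# Toy invocation sketch for update_record. The rec/claim values are
# hypothetical, and the exact behaviour of names.cleanup_name is assumed
# (lowercasing/whitespace handling as used throughout this module):
#
#   rec = {'authors': ['Smith, J.', 'Novák, Pavel', 'Doe, J.'],
#          'claims': {}}
#   claim = {'orcidid': '0000-0002-1825-0097',   # hypothetical ORCID iD
#            'account_id': 42,                   # set -> verified claim
#            'status': 'claimed',
#            'author': ['Novak, Pavel', 'Novák, Pavel']}
#
#   update_record(rec, claim, min_levenshtein=0.9)
#   # -> ('verified', 1); rec['claims']['verified'] is now
#   #    ['-', '0000-0002-1825-0097', '-']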
def find_orcid_position(authors_list, name_variants, min_levenshtein=0.9):
    """
    Find the position of the ORCID author in the list of other strings.

    :param authors_list - array of names that will be searched
    :param name_variants - array of names of a single author

    :return: position of the best match, or -1 if no match was found
    """
    try:
        al = [names.cleanup_name(x).lower().encode('utf8') for x in authors_list]
    except RuntimeError:
        logger.error('Blank author present in author list: %s' % authors_list)
        return -1

    # compute the similarity between all authors (and the supplied
    # variants); this is not very efficient, however the lists should be
    # small and short, so 3000 operations take less than 1s
    res = []
    res_asc = []
    aidx = vidx = 0
    nv = []
    for name in name_variants:
        try:
            variant = names.cleanup_name(name).lower().encode('utf8')
            nv.append(variant)
        except RuntimeError:
            # don't accept a blank name
            continue
        if bool(variant.strip()):
            aidx = 0
            for author in al:
                res.append((Levenshtein.ratio(author, variant), aidx, vidx))
                # also score the transliterated/ascii form of the names
                # in the author list if a name differs from its ascii
                # version
                if u2asc(author) != author:
                    if sys.version_info > (3,):
                        res_asc.append((Levenshtein.ratio(u2asc(author).encode(), variant), aidx, vidx))
                    else:
                        res_asc.append((Levenshtein.ratio(u2asc(author), variant), aidx, vidx))
                else:
                    res_asc.append(res[-1])
                aidx += 1
        vidx += 1

    # sort results from the highest match
    res = sorted(res, key=lambda x: x[0], reverse=True)
    res_asc = sorted(res_asc, key=lambda x: x[0], reverse=True)

    if len(res) == 0:
        return -1

    # if the transliterated forms have a higher Levenshtein ratio,
    # accept the transliterated form
    if res_asc[0][0] > res[0][0]:
        res = res_asc

    if res[0][0] < min_levenshtein:
        # test submatch, e.g.
        # (0.6470588235294118, 19, 0) (required:0.69) closest: vernetto, s, variant: vernetto, silvia teresa
        author_name = al[res[0][1]]
        variant_name = nv[res[0][2]]
        if author_name in variant_name or variant_name in author_name:
            if sys.version_info < (3,):
                logger.debug(u'Using submatch for: %s (required:%s) closest: %s, variant: %s' \
                    % (res[0], min_levenshtein, unicode(author_name, 'utf-8'), unicode(variant_name, 'utf-8')))
            else:
                logger.debug('Using submatch for: %s (required:%s) closest: %s, variant: %s' \
                    % (res[0], min_levenshtein, author_name, variant_name))
            return res[0][1]
        if sys.version_info < (3,):
            logger.debug(u'No match found: the closest is: %s (required:%s) closest: %s, variant: %s' \
                % (res[0], min_levenshtein, unicode(author_name, 'utf-8'), unicode(variant_name, 'utf-8')))
        else:
            logger.debug('No match found: the closest is: %s (required:%s) closest: %s, variant: %s' \
                % (res[0], min_levenshtein, author_name, variant_name))
        return -1

    logger.debug('Found match: %s (min_levenshtein=%s), authors=%s',
                 authors_list[res[0][1]], min_levenshtein, authors_list)
    return res[0][1]
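# Self-contained sketch of the core scoring loop in find_orcid_position,
# runnable on its own (toy names; the cleanup/encoding steps are
# omitted). The 0.647 score matches the submatch example quoted in the
# code above.
if __name__ == '__main__':
    import Levenshtein

    authors = ['vernetto, s', 'smith, j']
    variants = ['vernetto, silvia teresa']

    scores = [(Levenshtein.ratio(a, v), aidx, vidx)
              for vidx, v in enumerate(variants)
              for aidx, a in enumerate(authors)]
    best = max(scores)
    print(best)  # (0.6470588..., 0, 0): below min_levenshtein=0.9, but
                 # 'vernetto, s' is a substring of the variant, so the
                 # submatch rule above would still accept position 0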