def _compile_authors_query(query, record):
    """Build the author name query for ``record`` and wrap it for the matcher.

    Args:
        query (dict): the matcher query configuration. When it has an
            ``inner_hits`` key, that value is copied into the ``nested``
            clause of the generated query.
        record (dict): author data; its ``full_name`` is parsed to
            generate the query.

    Returns:
        dict: ``{"query": <query generated by ParsedName>}``.
    """
    es_query = ParsedName(record["full_name"]).generate_es_query()

    if "inner_hits" in query:
        # The generated query is indexed by 'nested' here, so it is expected
        # to expose a top-level 'nested' clause.
        es_query['nested']['inner_hits'] = query['inner_hits']

    return {"query": es_query}
def match_literature_author(author, record):
    """Try to match a literature author against existing author data.

    Two matcher configurations are tried in order: the full-name one and
    then the name-with-initials one. For the first configuration, when the
    raw matches do not yield a result, the matches are filtered by the
    collaboration validator and then by the affiliations validator, and the
    first filter that yields a result wins.

    Args:
        author (dict): the literature author; ``full_name`` and
            ``affiliations.value`` are read from it.
        record (dict): the literature record; ``collaborations.value`` is
            read from it.

    Returns:
        The first truthy value returned by
        ``get_reference_and_bai_if_unambiguous_literature_author_match``,
        or ``None`` when no configuration produces one.
    """
    matcher_configs = [
        current_app.config["AUTHOR_MATCHER_NAME_CONFIG"],
        current_app.config["AUTHOR_MATCHER_NAME_INITIALS_CONFIG"],
    ]
    # Validators are paired positionally with the configs above; the
    # initials config has no validators.
    validators_per_config = [
        (collaboration_validator, affiliations_validator),
        None,
    ]

    full_name = author.get("full_name")
    name = ParsedName.loads(full_name)
    matcher_input = {
        "first_name": name.first,
        "last_name": name.last,
        "full_name": author.get("full_name"),
        "collaborations": get_value(record, "collaborations.value", []),
        "affiliations": get_value(author, "affiliations.value", []),
    }

    for matcher_config, checks in zip(matcher_configs, validators_per_config):
        candidates = match_literature_author_with_config(
            matcher_input, matcher_config)
        result = get_reference_and_bai_if_unambiguous_literature_author_match(
            candidates)

        if not result:
            for check in checks or ():
                # Lazily keep only the candidates accepted by this validator.
                accepted = (
                    candidate for candidate in candidates
                    if check(matcher_input, candidate))
                result = (
                    get_reference_and_bai_if_unambiguous_literature_author_match(
                        accepted))
                if result:
                    break

        if result:
            return result

    return None
def check_author_compability_with_lit_authors(literature_control_number):
    """Look for the current user's author profile among a paper's authors.

    Three strategies are tried in order, each one only if the previous one
    found nothing: last names only, full names, and finally a lookup by
    name with initials.

    Args:
        literature_control_number: control number of the literature record.

    Returns:
        ``False`` when there is no current author profile or the literature
        record cannot be loaded; otherwise the first truthy result of the
        strategies above, or ``None`` if none of them matched.
    """
    profile = _get_current_user_author_profile()
    if not profile:
        return False

    literature = _get_lit_record_from_db(literature_control_number)
    if not literature:
        return False

    parsed_author_name = ParsedName.loads(profile.get_value("name.value"))

    # Each strategy is wrapped in a callable so that it only runs when the
    # previous ones came back empty.
    strategies = (
        lambda: _check_names_compability(
            literature, parsed_author_name, last_names_only=True),
        lambda: _check_names_compability(literature, parsed_author_name),
        lambda: _find_matching_author_in_lit_record(
            parsed_author_name, literature_control_number),
    )
    for strategy in strategies:
        matched = strategy()
        if matched:
            return matched

    return None
def test_parsed_name_from_parts():
    """A name assembled with ``from_parts`` dumps to the expected string."""
    name = ParsedName.from_parts("John", "Smith", "Peter", "Jr", "Sir")

    assert name.dumps() == "Smith, John Peter, Jr."
def test_parsed_wrong_names_and_not_fail():
    """Malformed names are parsed without raising and dump to known values."""
    malformed_names = [
        (u'Proffesor.M.', u'Proffesor.M.'),
        (u'ˇ Sirˇ', u'Sirˇ, ˇ.'),
    ]

    for raw_name, dumped_name in malformed_names:
        result = ParsedName(raw_name).dumps()
        assert result == dumped_name
def _name_variation_has_only_initials(name):
    """Tell whether a name variation is made up exclusively of initials.

    A name part counts as an initial when it is a single character or when
    it contains a dot.
    """
    for part in ParsedName.loads(name):
        looks_like_initial = len(part) == 1 or u'.' in part
        if not looks_like_initial:
            return False
    return True
def test_parsed_name_initials():
    """First-name initials are exposed both as a string and as a list."""
    name = ParsedName("Holland, Tom Stanley")

    assert "T. S." == name.first_initials
    assert ["T.", "S."] == name.first_initials_list
def get_authors(record):
    """Return the authors of a record.

    Each author is reduced to a first name, a last name and the list of
    HAL identifiers of its affiliations. Affiliations whose linked record
    is not in the HAL id map, or maps to an empty value, are left out.

    Args:
        record(InspireRecord): a record.

    Returns:
        list(dict): one dict per author with the keys ``affiliations``
        (a list of ``{'hal_id': ...}`` dicts), ``first_name`` and
        ``last_name``.

    Examples:
        Assuming the institution 902725 has the HAL id ``300037``:

        >>> record = {
        ...     'authors': [
        ...         {
        ...             'full_name': 'Doe, John',
        ...             'affiliations': [
        ...                 {
        ...                     'record': {
        ...                         '$ref': 'http://localhost:5000/api/institutions/902725',
        ...                     },
        ...                 },
        ...             ],
        ...         },
        ...     ],
        ... }
        >>> authors = get_authors(record)
        >>> authors[0]['affiliations'][0]['hal_id']
        '300037'

    """
    hal_id_map = _get_hal_id_map(record)

    authors = []
    for author in record.get('authors', []):
        name = ParsedName.loads(author['full_name'])
        first_name = name.first
        last_name = name.last

        affiliation_recids = [
            get_recid_from_ref(affiliation.get('record'))
            for affiliation in author.get('affiliations', [])
        ]
        hal_affiliations = [
            {'hal_id': hal_id_map[recid]}
            for recid in affiliation_recids
            if recid in hal_id_map and hal_id_map[recid]
        ]

        authors.append({
            'affiliations': hal_affiliations,
            'first_name': first_name,
            'last_name': last_name,
        })

    return authors
def author_name_contains_fullnames(author_name):
    """Tell whether a name is made of full name parts only.

    Returns:
        bool: False when the parsed name has a single part (e.g. 'Ellis')
        or when any of its parts is an initial (e.g. 'Ellis, J.');
        True otherwise (e.g. 'Ellis John').
    """
    name_parts = ParsedName(author_name)

    if len(name_parts) == 1:
        return False

    has_initial = any(is_initial_of_a_name(part) for part in name_parts)
    return not has_initial
def build_texkey_first_part(cls, data):
    """Choose and sanitize the leading part of a texkey.

    Candidates are considered in this order:

    1. the first author's last name, when the record has fewer than ten
       authors;
    2. the value of the first collaboration;
    3. the first corporate author;
    4. the literal ``Proceedings``, when ``proceedings`` is among the
       document types;
    5. the first author's last name, regardless of the number of authors.

    Args:
        data (dict): the literature record data.

    Returns:
        The result of ``cls.sanitize`` on the chosen candidate, or ``None``
        when no candidate is available.
    """
    full_name = get_value(data, "authors[0].full_name")
    if full_name:
        parsed_name = ParsedName.loads(full_name)
        # When the name parses to a single part, use whatever precedes the
        # first comma of the raw name instead of the parsed last name.
        if len(parsed_name) > 1:
            last_name = parsed_name.last
        else:
            last_name = full_name.split(",")[0]
    else:
        last_name = None

    if last_name and len(data["authors"]) < 10:
        return cls.sanitize(last_name)
    if "collaborations" in data:
        return cls.sanitize(data["collaborations"][0]["value"])
    if "corporate_author" in data:
        return cls.sanitize(data["corporate_author"][0])
    # Probe ``document_type`` defensively, like the other optional keys, so
    # that data lacking it falls through instead of raising KeyError.
    if "proceedings" in data.get("document_type", []):
        return cls.sanitize("Proceedings")
    if last_name:
        return cls.sanitize(last_name)
    return None
def generate_minimal_name_variations(author_name):
    """Generate a small number of name variations.

    Notes:
        Unidecodes the name, so that we use its transliterated version, since
        this is how the field is being indexed.

        For names with more than one part, the variations are
        {lastname} x {non lastnames, non lastnames initial}. Additionally,
        the swapped versions of those are generated, to support queries like
        ``Mele Salvatore``, which ``ParsedName`` parses as lastname:
        Salvatore and firstname: Mele. So in those cases, we need to
        generate both ``Mele, Salvatore`` and ``Mele, S``.

        Wherever '-' is replaced by ' ', it is done because this is the way
        the name variations are being indexed, thus we want our minimal name
        variations to be generated identically. This has to be done after
        the creation of ParsedName, otherwise the name is parsed
        differently. E.g. 'Caro-Estevez' as is, is a lastname; if we
        replace the '-' with ' ', then it is a firstname and a lastname.

    Args:
        author_name: the author name to generate variations for.

    Returns:
        list: the lowercased name variations, without duplicates and
        without variations consisting only of initials. When the name has
        a single part, or when either the last name or the remaining parts
        turn out to be empty, the list holds just the dumped name,
        lowercased and with '-' replaced by ' '.
    """
    parsed_name = ParsedName.loads(unidecode(author_name))

    if len(parsed_name) > 1:
        lastnames = parsed_name.last.replace('-', ' ')

        non_lastnames = ' '.join(
            parsed_name.first_list
            + parsed_name.middle_list
            + parsed_name.suffix_list
        )
        # Strip extra whitespace added if any of middle_list and suffix_list
        # are empty.
        non_lastnames = non_lastnames.strip().replace('-', ' ')

        # The initial-based variations index the first character of both
        # strings, so they can only be built when neither one is empty.
        # Otherwise fall through to the single-variation result below.
        if lastnames and non_lastnames:
            candidates = [
                lastnames + ' ' + non_lastnames,
                lastnames + ' ' + non_lastnames[0],
                non_lastnames + ' ' + lastnames,
                non_lastnames + ' ' + lastnames[0],
            ]
            # Adding into a set first, so as to drop identical name
            # variations.
            return list({
                name_variation.lower()
                for name_variation in candidates
                if not _name_variation_has_only_initials(name_variation)
            })

    return [parsed_name.dumps().replace('-', ' ').lower()]
def get_display_name_for_author_name(author_name):
    """Return the first names followed by the last names, space separated."""
    name = ParsedName.loads(author_name)
    display_parts = name.first_list + name.last_list
    return " ".join(display_parts)
def _generate_author_query(self, author_name):
    """Generate a nested query that handles author names specifically.

    Notes:
        There are two shapes of query:

        1) The parsed name has a single part without a dot, e.g.
           ``a Smith``: a single ``match`` query on the last name field.

        2) Anything else, e.g. ``a John Smith`` or ``a J Smith``: a
           ``match`` query on the last name, combined with one clause per
           retokenized first name:

           * an initial produces a ``match`` query on the first name
             initials field, using the ``names_initials_analyzer``;
           * a full first name produces two alternatives, a
             ``match_phrase_prefix`` query on the first name field and a
             ``match`` query on the same field with the
             ``names_initials_analyzer``. The intent is that ``John Smith``
             returns ``Smith, John`` and ``Smith, J`` but not
             ``Smith, Jane``.

        All ``match`` queries use the ``AND`` operator.
    """
    parsed_name = ParsedName(author_name)

    def _and_match(keyword, value, analyzer=None):
        # ``match`` query with the AND operator, optionally with an analyzer.
        clause = {'query': value, 'operator': 'AND'}
        if analyzer is not None:
            clause['analyzer'] = analyzer
        return {'match': {self.KEYWORD_TO_ES_FIELDNAME[keyword]: clause}}

    def _phrase_prefix(keyword, value):
        return {
            'match_phrase_prefix': {
                self.KEYWORD_TO_ES_FIELDNAME[keyword]: {
                    'query': value,
                    'analyzer': 'names_analyzer',
                }
            }
        }

    if len(parsed_name) == 1 and '.' not in parsed_name.first:
        # ParsedName returns a lone name, i.e. `Smith`, as the first name;
        # in our case we consider it as a lastname.
        query = _and_match("author_last_name", parsed_name.first)
        return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)

    last_name_clause = _and_match("author_last_name", parsed_name.last)

    first_name_clauses = []
    for first_name in retokenize_first_names(parsed_name.first_list):
        if is_initial_of_a_name(first_name):
            alternatives = [
                _and_match(
                    "author_first_name_initials", first_name,
                    analyzer="names_initials_analyzer"),
            ]
        else:
            alternatives = [
                _phrase_prefix("author_first_name", first_name),
                _and_match(
                    "author_first_name", first_name,
                    analyzer="names_initials_analyzer"),
            ]
        first_name_clauses.append(
            wrap_queries_in_bool_clauses_if_more_than_one(
                alternatives, use_must_clause=False))

    all_first_names_clause = wrap_queries_in_bool_clauses_if_more_than_one(
        first_name_clauses, use_must_clause=True)

    query = wrap_queries_in_bool_clauses_if_more_than_one(
        [last_name_clause, all_first_names_clause], use_must_clause=True)
    return generate_nested_query(self.AUTHORS_NESTED_QUERY_PATH, query)
def get_author_display_name(name):
    """Return the display name in the format ``Firstnames Lastnames``."""
    parsed = ParsedName.loads(name)

    ordered_parts = []
    ordered_parts.extend(parsed.first_list)
    ordered_parts.extend(parsed.last_list)

    return " ".join(ordered_parts)
def _generate_author_query(self, author_name):
    """Generate a nested query that handles author names specifically.

    Notes:
        A name that parses to a single part is treated as a last name and
        matched against the raw last name field.

        Otherwise the name is split into a last name and first names: on
        the first comma when there is one (dots in the first names are then
        treated as separators), on spaces otherwise, the last token being
        the last name. The last name must match with the ``AND`` operator
        and every first name token must match too: tokens that are a
        single character or contain a dot use a ``match`` query, the other
        ones a lowercased ``prefix`` query. The intent is to imitate
        legacy's more "exact" results, e.g. searching for `Smith, John`
        shouldn't return papers of 'Smith, Bob'.
    """
    parsed_name = ParsedName(author_name)

    # A single-part name is treated just like a last name.
    if len(parsed_name) == 1:
        last_name_field = ElasticSearchVisitor.KEYWORD_TO_ES_FIELDNAME[
            'author_last_name_raw']
        query = {'bool': {'must': {'match': {last_name_field: author_name}}}}
        return generate_nested_query(
            ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)

    if ',' in author_name:
        pieces = author_name.split(',')
        lastname = pieces[0]
        firstnames = pieces[1].replace('.', ' ').split(' ')
    else:
        pieces = author_name.split(' ')
        lastname = pieces[-1]
        firstnames = pieces[0:-1]

    def _first_name_clause(name_part):
        # Initials get a ``match`` query, full names a ``prefix`` one.
        if len(name_part) == 1 or u'.' in name_part:
            return {"match": {"authors.first_name": name_part.strip()}}
        return {
            "prefix": {"authors.first_name.raw": name_part.strip().lower()}
        }

    last_name_clause = {
        "match": {
            "authors.last_name.raw": {
                "query": lastname.strip(),
                "operator": "AND",
            }
        }
    }
    first_names_clause = {
        "bool": {
            "must": [_first_name_clause(part) for part in firstnames]
        }
    }

    query = {'bool': {'must': [last_name_clause, first_names_clause]}}
    return generate_nested_query(
        ElasticSearchVisitor.AUTHORS_NESTED_QUERY_PATH, query)