Example #1
    def test_original_and_full_name(self):
        """Test that original and full name access works"""
        for n in namesA:
            name = ADSName.parse(n)
            self.assertEqual(name.original_name, n)
            self.assertEqual(name.bare_original_name, n)

            name = ADSName.parse(n.upper())
            self.assertEqual(name.original_name, n.upper())
            self.assertEqual(name.bare_original_name, n.upper())
            self.assertNotEqual(name.original_name, n)
            self.assertNotEqual(name.bare_original_name, n)
            self.assertEqual(name.full_name, n)
            self.assertEqual(name.qualified_full_name, n)

            for modifier in ['=', '<', '>', '<=', '>=']:
                name = ADSName.parse(modifier + n)
                self.assertEqual(name.original_name, modifier + n)
                self.assertEqual(name.bare_original_name, n)
                self.assertEqual(name.full_name, n)
                self.assertEqual(name.qualified_full_name, modifier + n)

            for modifier, cor_modifier in zip(['=<', '=>'], ['<=', '>=']):
                name = ADSName.parse(modifier + n)
                self.assertEqual(name.original_name, modifier + n)
                self.assertEqual(name.bare_original_name, n)
                self.assertEqual(name.full_name, n)
                self.assertEqual(name.qualified_full_name, cor_modifier + n)
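
A note on the second loop: it checks that the reversed modifier spellings '=<' and '=>' are normalized to '<=' and '>='. A minimal sketch of that normalization, inferred from the expected values (not the actual parser):

def normalize_modifier(mod):
    # Hypothetical normalization inferred from the test expectations:
    # the reversed spellings map to their canonical forms.
    return {'=<': '<=', '=>': '>='}.get(mod, mod)

assert normalize_modifier('=<') == '<='
assert normalize_modifier('=>') == '>='
assert normalize_modifier('>=') == '>='  # canonical forms pass through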
Example #2
def _insert_document_data(pairings, doc_data, repo, excluded_names):
    """Stores all required document data, and back fills indices"""
    for k1 in pairings.keys():
        author1 = ADSName.parse(k1)
        for k2 in pairings[k1].keys():
            author2 = ADSName.parse(k2)
            replacement = []

            for bibcode in pairings[k1][k2]:
                if bibcode in doc_data:
                    doc_record = doc_data[bibcode]
                else:
                    doc_record = repo.get_document(bibcode).asdict()
                    lb.on_doc_loaded()
                    del doc_record['bibcode']
                    del doc_record['timestamp']
                    del doc_record['doctype']
                    doc_data[bibcode] = doc_record

                auth_1_idx, auth_2_idx = _find_indices(doc_record['authors'],
                                                       bibcode, author1,
                                                       author2, excluded_names)

                replacement.append((bibcode, auth_1_idx, auth_2_idx))
            pairings[k1][k2] = replacement
Example #3
    def get_papers_for_orcid_id(self, orcid_id):
        orcid_id = normalize_orcid_id(orcid_id)
        lb.i(f"Querying ADS for orcid id " + orcid_id)
        query = f"orcid:({orcid_id})"

        documents = self._inner_query_for_author(query, 1)

        author_record = AuthorRecord(name=ADSName.parse(orcid_id,
                                                        preserve=True),
                                     documents=[])
        names = set()
        for document in documents:
            try:
                i = document.orcid_ids.index(orcid_id)
            except ValueError:
                lb.w(f"ORCID ID not found in {document.bibcode}")
                continue
            author_record.documents.append(document.bibcode)
            names.add(document.authors[i])

        # Find the most-detailed form of the name
        if len(names):
            names = [ADSName.parse(n) for n in names]
            intermed = [(n.level_of_detail, len(n.full_name), n)
                        for n in names]
            intermed.sort(reverse=True)
            author_record.name = intermed[0][-1]
        return author_record, documents
Example #4
def get_name_as_in_ADS(target_name, names_in_result: list):
    """For presentation in the UI, figures out how to capitalize a name
    
    The user may have typed in the query names in all lowercase. For the large
    banner at the top of the page, it would be nice to format the names more
    properly. Rather than just defaulting to first-letter-uppercase, we can
    use our ADS data to present the name in a form (or one of the forms) ADS
    has for the name. This means we may also pick up diacritics.
    
    Looks through all the publications belonging to the name and how the
    author's name appears in those publications. Grabs (one of) the
    most-detailed forms. If it contains more given names than the target
    name, truncates the list. Shortens given names to initials if the target
    name has an initial at that position."""
    # Unique-ify names_in_result
    names_in_result = list(set(names_in_result))

    repo = Repository(can_skip_refresh=True)
    names_in_result = [ADSName.parse(name) for name in names_in_result]
    orcid = is_orcid_id(target_name)
    if orcid:
        record = repo.get_author_record_by_orcid_id(target_name)
    else:
        target_name = ADSName.parse(target_name)
        record = repo.get_author_record(target_name)

    aliases = record.appears_as.keys()
    aliases = [ADSName.parse(alias) for alias in aliases]
    # Remove all aliases that aren't consistent with any of the name forms
    # used in the set of possible chains. E.g. if the user searched for
    # "Last" and all chains terminate at "Last, B.", then we shouldn't view
    # "Last, I." as a viable alias.
    aliases = [alias for alias in aliases if alias in names_in_result]

    # Grab the most-detailed alias. As tie-breaker, choose the form with the
    # most publications.
    alias = sorted([(a.level_of_detail,
                     len(record.appears_as[a.original_name]), a.original_name)
                    for a in aliases])[-1][-1]
    alias = ADSName.parse(alias, preserve=True)

    if orcid:
        gns = alias.given_names
    else:
        # Trim it down to size
        gns = alias.given_names
        if len(gns) > len(target_name.given_names):
            gns = gns[:len(target_name.given_names)]

        # Ensure we have initials where we need them
        gns = [
            gn if len(tgn) > 1 else gn[0]
            for gn, tgn in zip(gns, target_name.given_names)
        ]

    final_name = ADSName.parse(alias.last_name, *gns, preserve=True)
    return final_name.full_name
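
The given-name trimming at the end is self-contained enough to check in isolation. A minimal sketch of the same two steps, on plain lists of given names rather than ADSName objects (trim_given_names is a hypothetical helper for illustration):

def trim_given_names(alias_gns, target_gns):
    # Truncate to the target's count, then shorten to an initial wherever
    # the target has an initial at that position.
    gns = alias_gns[:len(target_gns)]
    return [gn if len(tgn) > 1 else gn[0] for gn, tgn in zip(gns, target_gns)]

# Searching "murray, s" and finding the alias "Murray, Stephen Steve":
assert trim_given_names(["Stephen", "Steve"], ["s"]) == ["S"]
# Searching "murray, stephen s":
assert trim_given_names(["Stephen", "Steve"], ["stephen", "s"]) == ["Stephen", "S"]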
Example #5
    def test_repr(self):
        """Test than string representations of ADSNames are as expected"""
        for name in namesA:
            for modifier in ['', '=', '>', '<', '<=', '>=']:
                name2 = modifier + name
                self.assertEqual(name2, repr(ADSName.parse(name2)))

            name2 = ">=" + name
            self.assertEqual(name2, repr(ADSName.parse("=>" + name)))

            name2 = "<=" + name
            self.assertEqual(name2, repr(ADSName.parse("=<" + name)))
Example #6
    def test_preserve_case(self):
        # Ensure the parsing cache is populated
        for name in namesA:
            parsed = ADSName.parse(name.upper())
            self.assertEqual(parsed.full_name, name)

        for name in namesA:
            parsed = ADSName.parse(name.upper(), preserve=True)
            self.assertEqual(parsed.full_name, name.upper())

        for name in namesA:
            parsed = ADSName.parse(name.upper())
            parsed = ADSName.parse(parsed, preserve=True)
            self.assertEqual(parsed.full_name, name.upper())
Example #7
    def get_author_record(self, author: Name) -> AuthorRecord:
        author = ADSName.parse(author)
        try:
            author_record = cache_buddy.load_author(author)
        except CacheMiss:
            author_record = self._try_generating_author_record(author)
            if author_record is None:
                author_record, documents = \
                    self.ads_buddy.get_papers_for_author(author)
                cache_buddy.cache_documents(documents)
                if type(author_record) == AuthorRecord:
                    self._fill_in_coauthors(author_record)
                    if len(author_record.documents):
                        cache_buddy.cache_author(author_record)
                else:
                    for rec in author_record.values():
                        self._fill_in_coauthors(rec)
                    cache_buddy.cache_authors([
                        ar for ar in author_record.values()
                        if len(ar.documents)
                    ])
                    author_record = author_record[author]
        lb.on_author_queried()
        lb.on_doc_queried(len(author_record.documents))
        return author_record
Example #8
def _find_indices(authors, bibcode, author1, author2, excluded_names):
    key1 = (bibcode, author1.original_name)
    key2 = (bibcode, author2.original_name)
    try:
        auth_1_idx = indices_cache[key1]
    except KeyError:
        auth_1_idx = None
    try:
        auth_2_idx = indices_cache[key2]
    except KeyError:
        auth_2_idx = None

    for i, author in enumerate(authors):
        if auth_1_idx is not None and auth_2_idx is not None:
            break
        author = ADSName.parse(author)
        if author in excluded_names:
            continue
        if auth_1_idx is None and author1 == author:
            auth_1_idx = i
        if auth_2_idx is None and author2 == author:
            auth_2_idx = i

    indices_cache[key1] = auth_1_idx
    indices_cache[key2] = auth_2_idx
    return auth_1_idx, auth_2_idx
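
The caching pattern here is worth noting: results are memoized per (bibcode, name), and a failed lookup (None) is cached too, so the author list is never rescanned for the same key. Reduced to a sketch with plain strings (the exclusion check is omitted; find_index is a hypothetical stand-in):

indices_cache = {}

def find_index(bibcode, authors, target):
    key = (bibcode, target)
    if key in indices_cache:       # hit: skip the scan, even if the answer is None
        return indices_cache[key]
    idx = next((i for i, a in enumerate(authors) if a == target), None)
    indices_cache[key] = idx       # misses (None) are cached as well
    return idx

assert find_index("2020Bibcode..1X", ["doe, j", "roe, r"], "roe, r") == 1
assert find_index("2020Bibcode..1X", ["doe, j", "roe, r"], "poe, p") is None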
Example #9
def load_author(key):
    if key[0] in '<>=':
        raise CacheMiss(key)

    orcid = "ORCID" in key
    if orcid:
        name = None
    else:
        name = ADSName.parse(key)
    docs = []
    coauthors = defaultdict(list)
    appears_as = defaultdict(list)
    for bibcode, document in documents.items():
        matched = None
        # Go through the document's authors until/if we find our search author
        for orcid_id, author in zip_longest(document['orcid_ids'],
                                            document['authors']):
            if orcid and orcid_id == key:
                matched = author
                aname = ADSName.parse(author)
                if name is None or aname.is_more_specific_than(name):
                    name = aname
            elif not orcid and name == author:
                matched = author
        if matched is not None:
            docs.append(bibcode)
            idx = len(docs) - 1
            appears_as[matched].append(idx)
            for coauthor in document['authors']:
                if coauthor != matched:
                    coauthors[coauthor].append(idx)
    if len(docs) or key.endswith("nodocs"):
        for coauthor, coauthor_dat in coauthors.items():
            coauthors[coauthor] = ','.join(str(i) for i in coauthor_dat)
        for alias, alias_dat in appears_as.items():
            appears_as[alias] = ','.join(str(i) for i in alias_dat)
        return {
            # defaultdict doesn't play nicely with AuthorRecord's asdict()
            'name': name.qualified_full_name,
            'documents': docs,
            'coauthors': dict(**coauthors),
            'appears_as': dict(**appears_as),
            'timestamp': TIME,
            'version': AUTHOR_VERSION_NUMBER,
        }
    else:
        raise CacheMiss(key)
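
A small aside on the closing comment: converting the defaultdicts to plain dicts drops the default_factory, so later reads of missing keys raise KeyError instead of silently creating entries. For example:

from collections import defaultdict

coauthors = defaultdict(list)
coauthors['doe, j'].append(0)
plain = dict(**coauthors)        # as above; dict(coauthors) would also work
assert type(plain) is dict and plain == {'doe, j': [0]}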
Example #10
    def test_exact_equality(self):
        for i in range(len(namesA)):
            aname = ADSName.parse("=" + namesA[i])
            self.assertEqual(aname, namesA[i])
            self.assertEqual(aname, aname)
            for j in range(len(namesA)):
                if i != j:
                    self.assertNotEqual(namesA[j], aname)
Example #11
    def test_full_name_formatting(self):
        """Test than name parsing is insensitive to spacing and periods"""
        for n in namesA:
            name1 = ADSName.parse(n)
            name2 = ADSName.parse(n.replace(", ", ",").replace(".", ""))
            self.assertEqual(str(name1), str(name2))
            self.assertEqual(name1, name2)

            for modifier in ['=', '<', '>', '<=', '>=']:
                name1 = ADSName.parse(modifier + n)
                name2 = ADSName.parse(
                    (modifier + n).replace(", ", ",").replace(".", ""))
                self.assertEqual(str(name1), str(name2))
                if '=' in modifier:
                    self.assertEqual(name1, name2)
                else:
                    self.assertNotEqual(name1, name2)
Example #12
def normalize_author_names(paper_choices, repo):
    """Re-builds a chain with names representative of the linking papers.
    
    Builds a new chain where each name is as seen in the top paper choice
    for that chain link. Names that aren't the first or last in the chain
    appear on two chosen papers, and of those two versions of the name, the
    least specific is chosen."""
    new_chain = []
    for i, pc in enumerate(zip(*paper_choices)):
        bibcode, a1idx, a2idx = pc[0]
        doc = repo.get_document(bibcode)
        a1name = doc.authors[a1idx]
        a2name = doc.authors[a2idx]
        if (i != 0 and ADSName.parse(a1name).level_of_detail < ADSName.parse(
                new_chain[-1]).level_of_detail):
            new_chain[-1] = a1name
        elif i == 0:
            new_chain.append(a1name)
        new_chain.append(a2name)
    return tuple(new_chain)
Example #13
    def notify_of_upcoming_author_request(self, *authors):
        authors = [ADSName.parse(author) for author in authors]
        # If appropriate, the backing cache will pre-fetch the data while
        # checking if it exists
        is_in_cache = cache_buddy.authors_are_in_cache(authors)
        authors = [a for a, iic in zip(authors, is_in_cache) if not iic]

        can_generate = self._can_generate_author_requests(authors)
        authors = [a for a, cg in zip(authors, can_generate) if not cg]

        self.ads_buddy.add_authors_to_prefetch_queue(*authors)
Example #14
    def test_level_of_detail(self):
        self.assertEqual(0, ADSName.parse("last").level_of_detail)
        self.assertEqual(3, ADSName.parse("last, f").level_of_detail)
        self.assertEqual(10, ADSName.parse("last, first").level_of_detail)
        self.assertEqual(6, ADSName.parse("last, f m").level_of_detail)
        self.assertEqual(13, ADSName.parse("last, f middle").level_of_detail)
        self.assertEqual(20,
                         ADSName.parse("last, first middle").level_of_detail)
        self.assertEqual(23,
                         ADSName.parse("last, first middle m").level_of_detail)
Example #15
    def test_equality(self):
        for data in namesB:
            nameB = ADSName.parse(data[0])
            results = data[1:]
            for i, result in enumerate(results):
                if result:
                    self.assertEqual(nameB, namesA[i])
                else:
                    self.assertNotEqual(nameB, namesA[i])

        self.assertNotEqual(nameB, 1)
        self.assertNotEqual(nameB, "a string")
Example #16
    def _select_authors_to_prefetch(self):
        lb.d(f"{len(self.prefetch_queue)} authors in prefetch queue")
        n_prefetches = (MAXIMUM_RESPONSE_SIZE
                        // ESTIMATED_DOCUMENTS_PER_AUTHOR - 1)
        if n_prefetches > len(self.prefetch_queue):
            n_prefetches = len(self.prefetch_queue)
        if n_prefetches <= 0:
            return []
        prefetches = []
        for _ in range(n_prefetches):
            name = self.prefetch_queue.popleft()
            self.prefetch_set.remove(name)
            prefetches.append(ADSName.parse(name))
        return prefetches
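
The sizing arithmetic reserves one slot in the response for the author actually being queried. With hypothetical constants (the real values are not shown in this snippet):

MAXIMUM_RESPONSE_SIZE = 2000             # hypothetical values for illustration
ESTIMATED_DOCUMENTS_PER_AUTHOR = 120
n_prefetches = MAXIMUM_RESPONSE_SIZE // ESTIMATED_DOCUMENTS_PER_AUTHOR - 1
assert n_prefetches == 15  # 16 authors' documents fit; one slot is the live query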
Example #17
def graph_translation(chains, source, dest):
    # We have a list of chains---the table in the web view. These chains
    # may contain many different forms of a name, and it's important to
    # preserve that for the table display. But for the graph display, it's
    # better to collapse down to one node per person, not one node per
    # name form. So within each column, we want to canonicalize each name
    # to the least-detailed form of that name appearing in the column.
    source = get_name_as_in_ADS(source, [c[0] for c in chains])
    dest = get_name_as_in_ADS(dest, [c[-1] for c in chains])
    source = ADSName.parse(source)
    dest = ADSName.parse(dest)

    nads = []
    for i in range(len(chains[0])):
        # This dict will map names to canonical forms
        nad = NameAwareDict()
        nads.append(nad)
        if i == 0:
            nad[source] = source
        if i == len(chains[0]) - 1:
            nad[dest] = dest
        for chain in chains:
            name = ADSName.parse(chain[i])
            if name in nad:
                if name.level_of_detail < nad[name].level_of_detail:
                    nad[name] = name
            else:
                nad[name] = name

    mappings = []
    for i, nad in enumerate(nads):
        mapping = {}
        mappings.append(mapping)
        for chain in chains:
            name = chain[i].lower()
            mapping[name] = nad[name].original_name
    return mappings
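
The inner rule (collapse each column to its least-detailed form) can be sketched without the NameAwareDict machinery, using the surname as a crude stand-in for name-aware matching (detail and canonicalize below are hypothetical helpers, not the real implementation):

def detail(name):
    # Crude stand-in for ADSName.level_of_detail: 10 per spelled-out
    # given name, 3 per initial.
    if ',' not in name:
        return 0
    parts = name.split(',', 1)[1].split()
    return sum(10 if len(p.strip('.')) > 1 else 3 for p in parts)

def canonicalize(column):
    canon = {}
    for name in column:
        key = name.split(',')[0].lower()  # surname as the equivalence key
        if key not in canon or detail(name) < detail(canon[key]):
            canon[key] = name
    return {n.lower(): canon[n.split(',')[0].lower()] for n in column}

column = ["Murray, Stephen", "Murray, S.", "Murray, Stephen S."]
assert canonicalize(column)["murray, stephen"] == "Murray, S."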
Example #18
    def test_modifier_functions(self):
        for mod, req_exact, req_less, req_more, allow_same in (
            ['', False, False, False, True],
            ['>', False, False, True, False],
            ['<', False, True, False, False],
            ['=', True, False, False, False],
            ['>=', False, False, True, True],
            ['<=', False, True, False, True],
        ):
            name = ADSName.parse(mod + namesA[1])
            self.assertEqual(name.require_exact_match, req_exact)
            self.assertEqual(name.require_less_specific, req_less)
            self.assertEqual(name.require_more_specific, req_more)
            self.assertEqual(name.allow_same_specific, allow_same)

            self.assertEqual(name.excludes_self, (req_less or req_more)
                             and not allow_same)
            self.assertEqual(name.has_modifiers(), mod != '')
            self.assertEqual(name.modifiers, mod)
            self.assertEqual(name.without_modifiers().full_name, namesA[1])
Example #19
def _score_author_chain_link(con1, con2, repo):
    """Scores the reliability of name matching between two papers

    Accepts two "connections", tuples containing a bibcode followed by two
    indices locating an author in the author list of the associated bibcode.
    The author in question will be indicated by the latter index in the first
    connection and the earlier index in the second connection.

    When ORCID ids are available, they solely determine the score, which will
    fall between 0.7 and 1 depending on the source of the ORCID ids. Otherwise
    the score will be derived from the fraction of overlap between the author's
    affiliations in the two papers, and the level of detail of the
    author's name as printed in the two papers. These scores will fall in the
    range (0, 0.4), with contributions of up to 0.3 from affiliation matching
    and up to 0.1 from name detail"""
    doc1 = repo.get_document(con1[0])
    doc2 = repo.get_document(con2[0])
    idx1 = con1[2]
    idx2 = con2[1]
    orcid_id_1 = doc1.orcid_ids[idx1]
    orcid_id_2 = doc2.orcid_ids[idx2]
    if orcid_id_1 != '' and orcid_id_2 != '':
        if orcid_id_1 == orcid_id_2:
            orcid_src_1 = doc1.orcid_id_src[idx1]
            orcid_src_2 = doc2.orcid_id_src[idx2]
            # Looking at the source of ADS's ORCID id data, each score element
            # is 1 for orcid_pub, .92 for orcid_user, and .84 for orcid_other.
            # The values for the two ORCID ids are multiplied together
            score1 = 1 - .08 * (orcid_src_1 - 1)
            score2 = 1 - .08 * (orcid_src_2 - 1)
            return score1 * score2
        else:
            # The ORCID ids _don't_ match!
            return None

    # Attempt some affiliation fuzzy-matching
    # _process_affil will do some processing and return a list of the
    # comma-delimited chunks in the affiliation.
    affil1 = _process_affil(doc1.affils[idx1])
    affil2 = _process_affil(doc2.affils[idx2])

    # Compute the fraction of the chunks of each affil that are present
    # in the other
    try:
        one_in_two = sum(chunk in affil2 for chunk in affil1) / len(affil1)
        two_in_one = sum(chunk in affil1 for chunk in affil2) / len(affil2)
    except ZeroDivisionError:
        one_in_two = 0
        two_in_one = 0
    # Average these two fractions
    affil_frac_in_common = (one_in_two + two_in_one) / 2

    # Put the score in the range (0, 0.3)
    affil_score = affil_frac_in_common * .3

    name1 = ADSName.parse(doc1.authors[idx1])
    name2 = ADSName.parse(doc2.authors[idx2])
    if name1 != name2:
        # This can occur, e.g. if J. Doe was encountered first, creating a
        # J. Doe node in PathFinder, then Jane and John Doe were encountered
        # and added to that node, and now a proposed chain runs from Jane
        # to John.
        return None
    detail1 = name1.level_of_detail
    detail2 = name2.level_of_detail
    # level_of_detail examples:
    # Last, First Middle: 20
    # Last, First M: 13
    # Last, First: 10
    # Last, F: 3
    # Last: 0
    #
    # We'll score based on the less-detailed name, take 20 as the ideal value,
    # and put the name score in the range (0, 0.1)
    detail_score = min(detail1, detail2) / 20 * .1

    return detail_score + affil_score
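
The ORCID branch reduces to a closed-form check of the 1/.92/.84 scheme described in the comment; a quick verification:

def orcid_score(src1, src2):
    # Per the comment above: source 1 (orcid_pub) scores 1.0, source 2
    # (orcid_user) 0.92, source 3 (orcid_other) 0.84; multiply the two.
    return (1 - .08 * (src1 - 1)) * (1 - .08 * (src2 - 1))

assert orcid_score(1, 1) == 1.0
assert abs(orcid_score(2, 2) - 0.8464) < 1e-9  # two orcid_user sources
assert abs(orcid_score(3, 3) - 0.7056) < 1e-9  # the ~0.7 floor in the docstring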
Example #20
    def test_with_synonyms(self):
        synonyms = [
            "test_synAA; test_synAB", "test_synB, a; test_synB, b",
            "test_synCA, q; test_synCB, q", "test_synD, a; test_synD, b c",
            "test_synEB, b; test_synEA, a",
            "test_synFA, a b c d; test_synFB, a",
            "test_synGA, a b c d; test_synGB, a; test_synGC, b"
        ]
        # Hack: inject test synonyms
        ads_name._name_cache.clear()
        ads_name._parse_name_synonyms(synonyms)

        for synonym in synonyms:
            names = synonym.split(';')

            # The second copy is for the deletion tests later
            nad = NameAwareDict()
            nad2 = NameAwareDict()
            for i, name in enumerate(names):
                nad[name] = i
                nad2[name] = i

            # Do the insertion in both orders, to ensure we try both
            # "canonical first" and "canonical last"
            nad_rev = NameAwareDict()
            nad_rev2 = NameAwareDict()
            for i, name in enumerate(reversed(names)):
                nad_rev[name] = i
                nad_rev2[name] = i

            # Ensure that, after inserting under one form and updating under
            # the other form, we can get the latest value from either form.
            for name in names:
                self.assertEqual(nad[name], i)
                self.assertEqual(nad_rev[name], i)

            # Check other misc methods
            for name in names:
                self.assertIn(name, nad)
                self.assertIn(name, nad_rev)

            self.assertEqual(len(nad), 1)
            self.assertEqual(len(nad_rev), 1)

            self.assertEqual(nad.keys(), (ADSName.parse(names[-1]), ))
            self.assertEqual(nad_rev.keys(), (ADSName.parse(names[0]), ))

            self.assertEqual(nad.values(), (i, ))
            self.assertEqual(nad_rev.values(), (i, ))

            # Ensure that deleting one form deletes them all.
            del nad[names[0]]
            self.assertEqual(len(nad), 0)
            for name in names:
                self.assertNotIn(name, nad)

            del nad2[names[1]]
            self.assertEqual(len(nad2), 0)
            for name in names:
                self.assertNotIn(name, nad2)

            del nad_rev[names[0]]
            self.assertEqual(len(nad_rev), 0)
            for name in names:
                self.assertNotIn(name, nad_rev)

            del nad_rev2[names[1]]
            self.assertEqual(len(nad_rev2), 0)
            for name in names:
                self.assertNotIn(name, nad_rev2)

        # Verify functionality with '@' modifier
        for synonym in synonyms:
            names_orig = synonym.split(';')

            for names in [names_orig, list(reversed(names_orig))]:
                # We'll insert under one name, then verify we can't access
                # or delete under the other
                nad1 = NameAwareDict()
                nad2 = NameAwareDict()
                nad3 = NameAwareDict()
                nad4 = NameAwareDict()

                nad1[names[0]] = 1
                nad2[names[-1]] = 1
                nad3['@' + names[0]] = 1
                nad4['@' + names[-1]] = 1

                with self.assertRaises(KeyError):
                    nad1['@' + names[-1]]
                with self.assertRaises(KeyError):
                    nad2['@' + names[0]]
                with self.assertRaises(KeyError):
                    nad3[names[-1]]
                with self.assertRaises(KeyError):
                    nad4[names[0]]

                # I don't think it's worth it to test modification because
                # it's hard to define how it should work. If we store under
                # 'name' which has 'name2' as a synonym, we get the same
                # value for 'name' and 'name2'. If we then store under
                # '@name2', what should we get when retrieving as 'name2'?
                # If we then store again under 'name', what should we get
                # for 'name2'? Or for '@name2'?

                # nad1['@' + names[-1]] = 2
                # self.assertEqual(nad1[names[0]], 1)
                # nad1['@' + names[0]] = 2
                # self.assertEqual(nad1[names[-1]], 1)
                # nad1[names[-1]] = 2
                # self.assertEqual('@' + nad1[names[0]], 1)
                # nad1[names[0]] = 2
                # self.assertEqual('@' + nad1[names[-1]], 1)

                with self.assertRaises(KeyError):
                    del nad1['@' + names[-1]]
                with self.assertRaises(KeyError):
                    del nad2['@' + names[0]]
                with self.assertRaises(KeyError):
                    del nad3[names[-1]]
                with self.assertRaises(KeyError):
                    del nad4[names[0]]

        # Remove our test synonyms
        ads_name._name_cache.clear()
        ads_name._name_synonyms.clear()
        ads_name._load_synonyms()
Example #21
    def test_with_specificity(self):
        nad = NameAwareDict()

        for name in diff_names:
            nad[name] = PathNode(name)

        for i, name in enumerate(equal_names):
            lt = ADSName.parse("<" + str(name))
            lte = ADSName.parse("<=" + str(name))
            gt = ADSName.parse(">" + str(name))
            gte = ADSName.parse(">=" + str(name))
            ex = ADSName.parse("=" + str(name))

            if i == 0:
                self.assertNotIn(lt, nad)
                self.assertNotIn(lte, nad)
            else:
                self.assertIn(lt, nad)
                self.assertIn(lte, nad)
            self.assertNotIn(gt, nad)
            self.assertNotIn(gte, nad)
            self.assertNotIn(ex, nad)

            # Node "Last, First" will match and overwrite an existing entry
            # for "Last, F"
            nad[name] = PathNode(name)

            self.assertNotIn(lt, nad)
            self.assertIn(gte, nad)
            self.assertIn(lte, nad)
            self.assertNotIn(gt, nad)
            self.assertIn(ex, nad)

        nad = NameAwareDict()

        for name in diff_names:
            nad[name] = PathNode(name)

        for i, name in enumerate(equal_names[::-1]):
            lt = ADSName.parse("<" + str(name))
            lte = ADSName.parse("<=" + str(name))
            gt = ADSName.parse(">" + str(name))
            gte = ADSName.parse(">=" + str(name))
            ex = ADSName.parse("=" + str(name))

            if i == 0:
                self.assertNotIn(gt, nad)
                self.assertNotIn(gte, nad)
            else:
                self.assertIn(gt, nad)
                self.assertIn(gte, nad)
            self.assertNotIn(lt, nad)
            self.assertNotIn(lte, nad)
            self.assertNotIn(ex, nad)

            # Node "Last, First" will match and overwrite an existing entry
            # for "Last, F"
            nad[name] = PathNode(name)

            self.assertNotIn(lt, nad)
            self.assertIn(gte, nad)
            self.assertIn(lte, nad)
            self.assertNotIn(gt, nad)
            self.assertIn(ex, nad)
Example #22
    def test_synonyms(self):
        synonyms = [
            "test_synAA;test_synAB", "test_synBB, ;test_synBA,",
            "test_synCA, q; test_synCB, q", "test_synD, a; test_synD, b c",
            "test_synEB, b; test_synEA, a",
            "test_synFA, a b c d; test_synFB, a"
        ]
        # Hack: inject test synonyms
        ads_name._name_cache.clear()
        ads_name._parse_name_synonyms(synonyms)

        for syn in synonyms:
            names = syn.split(';')
            self.assertEqual(ADSName.parse(names[0]), ADSName.parse(names[1]))
            self.assertNotEqual('@' + ADSName.parse(names[0]),
                                '@' + ADSName.parse(names[1]))
            self.assertNotEqual('@' + ADSName.parse(names[0]),
                                ADSName.parse(names[1]))
            self.assertNotEqual(ADSName.parse(names[0]),
                                '@' + ADSName.parse(names[1]))
            for other_synonyms in synonyms:
                if other_synonyms != syn:
                    other_names = other_synonyms.split(';')
                    for other_name in other_names:
                        self.assertNotEqual(ADSName.parse(names[0]),
                                            ADSName.parse(other_name))
                        self.assertNotEqual(ADSName.parse(names[1]),
                                            ADSName.parse(other_name))

        # A synonym without given names should work with given names provided
        self.assertEqual(ADSName.parse("test_synAA, a"),
                         ADSName.parse("test_synAB, abc"))

        # A synonym with given names should work without given names provided
        self.assertEqual(ADSName.parse("test_synEA"),
                         ADSName.parse("test_synEB"))

        # "test_synD, b c" should be selected as canonical.
        self.assertEqual(ADSName.parse("test_synD, a b c d"),
                         ADSName.parse("test_synD, b"))
        self.assertEqual(
            ADSName.parse("test_synD, a b c d").synonym,
            ADSName.parse("test_synD, b c"))
        self.assertIsNone(ADSName.parse("test_synD, b c d").synonym)

        # Names not matching a synonym should be unaffected
        self.assertIsNone(ADSName.parse("test_synD, e").synonym)
        self.assertIsNone(ADSName.parse("test_synEA, f").synonym)
        self.assertIsNone(ADSName.parse("test_synEA, f").synonym)

        # Synonyms should be possibilities, not mandatory. So 'test_synFB, q',
        # which is not synonym-ized due to the differing initial, should still
        # be equal to 'test_synFB', which gets synonym-ized to 'test_synFA'
        self.assertEqual(ADSName.parse("test_synFB"),
                         ADSName.parse("test_synFB, q"))

        # Nothing should be changed when using the `preserve` flag
        self.assertIsNone(
            ADSName.parse("test_synEA, abc d.", preserve=True).synonym)
        self.assertIsNone(
            ADSName.parse("test_synEA, abc d.", preserve=True).synonym)
        self.assertNotEqual(ADSName.parse("test_synEA, abc d.", preserve=True),
                            ADSName.parse("test_synEB, b", preserve=True))

        # Remove our test synonyms
        ads_name._name_cache.clear()
        ads_name._name_synonyms.clear()
        ads_name._load_synonyms()
Example #23
    def get_papers_for_author(self, query_author):
        query_author = ADSName.parse(query_author)

        query_authors = self._select_authors_to_prefetch()
        if query_author not in query_authors:
            query_authors.append(query_author)

        lb.i(f"Querying ADS for author " + query_author.qualified_full_name)
        if len(query_authors) > 1:
            lb.i(" Also prefetching. Query: " +
                 "; ".join([a.qualified_full_name for a in query_authors]))

        query_strings = []
        for author in query_authors:
            query_string = '"' + author.full_name + '"'
            if author.require_exact_match:
                query_string = "=" + query_string
            query_strings.append(query_string)
        query = " OR ".join(query_strings)
        query = f"author:({query})"

        documents = self._inner_query_for_author(query, len(query_authors))

        author_records = NameAwareDict()
        for author in query_authors:
            author_records[author] = AuthorRecord(name=author, documents=[])
        # We need to go through all the documents and match them to our
        # author list. This is critically important if we're pre-fetching
        # authors, but it's also important to support the "<" and ">"
        # specificity selectors for author names
        for document in documents:
            matched = False
            names = [ADSName.parse(n) for n in document.authors]
            for name in names:
                try:
                    author_records[name].documents.append(document.bibcode)
                    matched = True
                except KeyError:
                    pass
            if (not matched and all(
                    not a.require_more_specific and not a.require_less_specific
                    for a in query_authors)):
                # See if we can guess which names should have been matched
                guesses = []
                doc_authors = [n.full_name for n in names]
                doc_authors_initialized = \
                    [n.convert_to_initials().full_name for n in names]
                for query_author in query_authors:
                    guess = difflib.get_close_matches(query_author.full_name,
                                                      doc_authors,
                                                      n=1,
                                                      cutoff=0.8)
                    if len(guess):
                        guesses.append(
                            f"{query_author.full_name} -> {guess[0]}")
                    else:
                        # Try again, changing names to use initials throughout
                        guess = difflib.get_close_matches(
                            query_author.convert_to_initials().full_name,
                            doc_authors_initialized,
                            n=1,
                            cutoff=0.7)
                        if len(guess):
                            # Having found a match with initialized names,
                            # report using the full form of each name
                            chosen_doc_author = doc_authors[
                                doc_authors_initialized.index(guess[0])]
                            guesses.append(f"{query_author.full_name}"
                                           f" -> {chosen_doc_author}")
                msg = "ADS Buddy: No matches for " + document.bibcode
                if len(guesses):
                    msg += " . Guesses: " + "; ".join(guesses)
                lb.w(msg)

        for author_record in author_records.values():
            # Remove any duplicate document listings
            # Becomes important for papers with _many_ authors, e.g. LIGO
            # papers, which use only initials and so can have duplicate names
            author_record.documents = sorted(set(author_record.documents))

        if len(query_authors) == 1:
            return author_records[query_author], documents
        else:
            return author_records, documents
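
For reference, the query assembly above yields strings of this shape (the names here are hypothetical):

authors = [("doe, jane", False), ("roe, r", True)]  # (full_name, require_exact_match)
query_strings = []
for full_name, exact in authors:
    query_string = '"' + full_name + '"'
    if exact:
        query_string = "=" + query_string
    query_strings.append(query_string)
query = "author:(" + " OR ".join(query_strings) + ")"
assert query == 'author:("doe, jane" OR ="roe, r")'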
Example #24
    def test_specificity_equality(self):
        for i, name1 in enumerate(namesA):
            name1_lt = ADSName.parse("<" + name1)
            name1_gt = ADSName.parse(">" + name1)
            name1_lte = ADSName.parse("<=" + name1)
            name1_gte = ADSName.parse(">=" + name1)

            self.assertNotEqual(name1_lt, name1_lt)
            self.assertNotEqual(name1_gt, name1_gt)
            self.assertNotEqual(name1_lt, name1_gt)
            self.assertNotEqual(name1_gt, name1_lt)

            self.assertEqual(name1_lte, name1_lte)
            self.assertEqual(name1_gte, name1_gte)
            self.assertEqual(name1_lte, name1_gte)
            self.assertEqual(name1_gte, name1_lte)

            self.assertNotEqual(name1_lte, name1_lt)
            self.assertNotEqual(name1_gte, name1_gt)
            self.assertNotEqual(name1_lte, name1_gt)
            self.assertNotEqual(name1_gte, name1_lt)

            self.assertNotEqual(name1_lt, name1_lte)
            self.assertNotEqual(name1_gt, name1_gte)
            self.assertNotEqual(name1_lt, name1_gte)
            self.assertNotEqual(name1_gt, name1_lte)

            for j, name2 in enumerate(namesA):
                name2 = ADSName.parse(name2)
                # A larger index corresponds to more specificity, with a
                # few exceptions
                if i == j:
                    self.assertNotEqual(name1_lt, name2)
                    self.assertNotEqual(name2, name1_lt)
                    self.assertNotEqual(name1_gt, name2)
                    self.assertNotEqual(name2, name1_gt)

                    self.assertEqual(name1_lte, name2)
                    self.assertEqual(name2, name1_lte)
                    self.assertEqual(name1_gte, name2)
                    self.assertEqual(name2, name1_gte)
                elif ((i == 2 and j == 4) or (i == 3 and j in (4, 5))
                      or (i == 4 and j in (2, 3)) or (i == 5 and j == 3)):
                    self.assertNotEqual(name1_lt, name2)
                    self.assertNotEqual(name2, name1_lt)
                    self.assertNotEqual(name1_gt, name2)
                    self.assertNotEqual(name2, name1_gt)

                    self.assertNotEqual(name1_lte, name2)
                    self.assertNotEqual(name2, name1_lte)
                    self.assertNotEqual(name1_gte, name2)
                    self.assertNotEqual(name2, name1_gte)
                elif i > j:
                    self.assertEqual(name1_lt, name2)
                    self.assertEqual(name2, name1_lt)
                    self.assertNotEqual(name1_gt, name2)
                    self.assertNotEqual(name2, name1_gt)

                    self.assertEqual(name1_lte, name2)
                    self.assertEqual(name2, name1_lte)
                    self.assertNotEqual(name1_gte, name2)
                    self.assertNotEqual(name2, name1_gte)
                elif i < j:
                    self.assertNotEqual(name1_lt, name2)
                    self.assertNotEqual(name2, name1_lt)
                    self.assertEqual(name1_gt, name2)
                    self.assertEqual(name2, name1_gt)

                    self.assertNotEqual(name1_lte, name2)
                    self.assertNotEqual(name2, name1_lte)
                    self.assertEqual(name1_gte, name2)
                    self.assertEqual(name2, name1_gte)
                else:
                    self.fail("Shouldn't get here")
Example #25
    def _article_to_record(self, article):
        # Not every ORCID ID field is returned for every document, and not
        # every returned list has an entry for each author
        for key in ('orcid_pub', 'orcid_user', 'orcid_other'):
            if key not in article:
                article[key] = []
            article[key] = ['' if x == '-' else x for x in article[key]]
            article[key] += \
                [''] * (len(article['author']) - len(article[key]))

        # Choose one ORCID ID for each author
        orcid_id = []
        orcid_src = []
        for op, ou, oo in zip(article['orcid_pub'], article['orcid_user'],
                              article['orcid_other']):
            if op != '' and is_orcid_id(op):
                orcid_id.append(normalize_orcid_id(op))
                orcid_src.append(1)
            elif ou != '' and is_orcid_id(ou):
                orcid_id.append(normalize_orcid_id(ou))
                orcid_src.append(2)
            elif oo != '' and is_orcid_id(oo):
                orcid_id.append(normalize_orcid_id(oo))
                orcid_src.append(3)
            else:
                orcid_id.append('')
                orcid_src.append(0)

        article['aff'] = ['' if x == '-' else x for x in article['aff']]

        document = DocumentRecord(
            bibcode=article["bibcode"],
            title=(unescape(article["title"][0])
                   if "title" in article else "[No title given]"),
            authors=[unescape(a) for a in article["author"]],
            affils=[unescape(a) for a in article["aff"]],
            doctype=article["doctype"],
            keywords=([unescape(k) for k in article["keyword"]]
                      if "keyword" in article else []),
            publication=(unescape(article["pub"])
                         if "pub" in article else "[Publication not given]"),
            pubdate=article["date"],
            citation_count=(article["citation_count"]
                            if "citation_count" in article else 0),
            read_count=(article["read_count"]
                        if "read_count" in article else 0),
            orcid_ids=orcid_id,
            orcid_id_src=orcid_src)

        # Alter the DocumentRecord in-place to remove invalid author names
        bad_indices = []
        names = []
        for i, author in enumerate(document.authors):
            try:
                name = ADSName.parse(author)
            except InvalidName:
                lb.w(f"Invalid name for {document.bibcode}: {author}")
                bad_indices.append(i)
                continue

            if name.full_name in ("et al", "anonymous"):
                bad_indices.append(i)
                continue

            names.append(name)

        for i in reversed(bad_indices):
            document.delete_author(i)

        return document
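
The per-author ORCID selection is a straight priority cascade: orcid_pub, then orcid_user, then orcid_other. An equivalent sketch for a single author (normalization omitted; the trivial is_orcid_id stand-in is hypothetical):

def pick_orcid(op, ou, oo, is_orcid_id=lambda x: x != ''):
    # src codes: 1 = orcid_pub, 2 = orcid_user, 3 = orcid_other, 0 = none
    for src, candidate in enumerate((op, ou, oo), start=1):
        if candidate != '' and is_orcid_id(candidate):
            return candidate, src
    return '', 0

assert pick_orcid('', '0000-0002-1825-0097', '') == ('0000-0002-1825-0097', 2)
assert pick_orcid('', '', '') == ('', 0)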
Example #26
from unittest import TestCase

import names.ads_name as ads_name
from names.ads_name import ADSName
from names.name_aware import NameAwareDict, NameAwareSet
from path_node import PathNode

equal_names_str = [
    "Murray, S.", "Murray, Stephen", "Murray, Stephen S",
    "Murray, Stephen Steve"
]
equal_names = [ADSName.parse(n) for n in equal_names_str]

diff_names_str = ["Murray, Eva", "Burray, Eva", "Murray, Eric"]
diff_names = [ADSName.parse(n) for n in diff_names_str]


class TestNameAwareDict(TestCase):
    def test_get_set_item(self):
        nad = NameAwareDict()

        with self.assertRaises(KeyError):
            nad[diff_names[0]]
        with self.assertRaises(KeyError):
            nad[diff_names[1]]

        node = PathNode(equal_names[0])
        nad[equal_names[0]] = node

        diff_nodes = []
        for name in diff_names:
Example #27
    def find_path(self):
        lb.on_start_path_finding()
        self.n_iterations = 0

        if is_orcid_id(self.orig_src):
            src_rec = self.repository.get_author_record_by_orcid_id(
                self.orig_src)
            self.src = PathNode(name=src_rec.name,
                                dist_from_src=0,
                                legal_bibcodes=set(src_rec.documents))
        else:
            src_rec = self.repository.get_author_record(self.orig_src)
            self.src = PathNode(name=self.orig_src, dist_from_src=0)

        if is_orcid_id(self.orig_dest):
            dest_rec = self.repository.get_author_record_by_orcid_id(
                self.orig_dest)
            self.dest = PathNode(name=dest_rec.name,
                                 dist_from_dest=0,
                                 legal_bibcodes=set(dest_rec.documents))
        else:
            dest_rec = self.repository.get_author_record(self.orig_dest)
            self.dest = PathNode(name=self.orig_dest, dist_from_dest=0)

        # If we were given a name and an ORCID ID and they turn out to refer
        # to the same person, error out.
        mixed_name_formats = (
            (type(self.orig_src) == ADSName and type(self.orig_dest) == str) or
            (type(self.orig_src) == str and type(self.orig_dest) == ADSName))
        if mixed_name_formats and src_rec.name == dest_rec.name:
            raise PathFinderError(
                "src_is_dest_after_orcid",
                'After looking up the ORCID ID, the "source" and "destination"'
                ' identities are equal (or at least overlap).')

        self.nodes[src_rec.name] = self.src
        self.nodes[dest_rec.name] = self.dest
        self.authors_to_expand_src_next.append(self.src.name)
        self.authors_to_expand_dest_next.append(self.dest.name)

        if (len(src_rec.documents) == 0 or all(
            [d in self.excluded_bibcodes for d in src_rec.documents])):
            raise PathFinderError(
                "src_empty",
                "No documents found for " + self.src.name.original_name)
        if (len(dest_rec.documents) == 0 or all(
            [d in self.excluded_bibcodes for d in dest_rec.documents])):
            raise PathFinderError(
                "dest_empty",
                "No documents found for " + self.dest.name.original_name)

        while True:
            lb.d("Beginning new iteration")
            lb.d(f"{len(self.authors_to_expand_src_next)} "
                 "authors on src side")
            lb.d(f"{len(self.authors_to_expand_dest_next)} "
                 "authors on dest side")
            if (len(self.authors_to_expand_src_next) == 0
                    or len(self.authors_to_expand_dest_next) == 0):
                raise PathFinderError(
                    "no_authors_to_expand", "No connections possible after "
                    f"{self.n_iterations} iterations")
            # Of the two lists of authors we could expand, let's always
            # choose the shortest. This tends to get us to a solution
            # faster.
            expanding_from_src = (len(self.authors_to_expand_src_next) < len(
                self.authors_to_expand_dest_next))
            lb.d("Expanding from "
                 f"{'src' if expanding_from_src else 'dest'} side")

            authors = (self.authors_to_expand_src
                       if expanding_from_src else self.authors_to_expand_dest)
            authors_next = (self.authors_to_expand_src_next
                            if expanding_from_src else
                            self.authors_to_expand_dest_next)
            authors.clear()
            authors.extend(authors_next)
            authors_next.clear()

            # There's no point pre-fetching for only one author, and this
            # ensures we don't re-fetch the src and dest authors if they
            # were provided by ORCID ID
            if len(authors) > 1:
                self.repository.notify_of_upcoming_author_request(*authors)
            for expand_author in authors:
                lb.d(f"Expanding author {expand_author}")
                expand_node = self.nodes[expand_author]
                expand_node_dist = expand_node.dist(expanding_from_src)

                # We already have src and dest records handy, and this special
                # handling is required if either was provided by ORCID ID
                if expand_node is self.src:
                    record = src_rec
                elif expand_node is self.dest:
                    record = dest_rec
                else:
                    record = self.repository.get_author_record(expand_author)

                # Here's a tricky one. If "<=Last, F" is in the exclude
                # list, and if we previously came across "Last, First" and
                # we're now expanding that node, we're ok using papers
                # written under "Last, First" but we're _not_ ok using
                # papers written under "Last, F.". So we need to ensure
                # we're allowed to use each paper by ensuring Last, First's
                # name appears on it in a way that's not excluded.
                ok_aliases = [
                    name for name in record.appears_as
                    if name not in self.excluded_names
                ]
                if (len(self.excluded_bibcodes)
                        or len(ok_aliases) != len(record.appears_as)):
                    ok_bibcodes = {
                        bibcode
                        for alias in ok_aliases
                        for bibcode in record.appears_as[alias]
                        if bibcode not in self.excluded_bibcodes
                    }
                else:
                    ok_bibcodes = None

                for coauthor, bibcodes in record.coauthors.items():
                    # lb.d(f"  Checking coauthor {coauthor}")
                    if ok_bibcodes is not None:
                        bibcodes = [
                            bibcode for bibcode in bibcodes
                            if bibcode in ok_bibcodes
                        ]
                    if len(bibcodes) == 0:
                        continue

                    coauthor = ADSName.parse(coauthor)
                    if coauthor in self.excluded_names:
                        # lb.d("   Author is excluded")
                        continue

                    try:
                        node = self.nodes[coauthor]
                        # lb.d(f"   Author exists in graph")
                    except KeyError:
                        # lb.d(f"   New author added to graph")
                        lb.on_coauthor_seen()
                        node = PathNode(name=coauthor)
                        self.nodes[coauthor] = node
                        node.set_dist(expand_node_dist + 1, expanding_from_src)
                        node.neighbors(expanding_from_src).add(expand_node)
                        links = node.links(expanding_from_src)[expand_node]
                        links.update(bibcodes)
                        authors_next.append(coauthor)
                        continue

                    # if (node.dist(expanding_from_src)
                    #         <= expand_node_dist):
                    # This node is closer to the src/dest than we are
                    # and must have been encountered in a
                    # previous expansion cycle. Ignore it.
                    # pass
                    if (node.dist(expanding_from_src) > expand_node_dist):
                        # We provide an equal-or-better route from the
                        # src/dest than the route (if any) that this node
                        # is aware of, meaning this node is a viable next
                        # step along the chain from the src/dest through
                        # us. That it already exists suggests it has
                        # multiple chains of equal length connecting it to
                        # the src or dest.
                        # If the src or dest was given via ORCID ID, we need
                        # to make sure we have a valid connection. (E.g. if
                        # the given ID is for one J Doe and our expand_author
                        # is connected to a different J Doe, we need to
                        # exclude that.)
                        if len(node.legal_bibcodes):
                            legal_bibcodes = set(
                                bibcodes) & node.legal_bibcodes
                        else:
                            legal_bibcodes = bibcodes
                        if len(legal_bibcodes):
                            links = node.links(expanding_from_src)[expand_node]
                            links.update(legal_bibcodes)
                            node.set_dist(expand_node_dist + 1,
                                          expanding_from_src)
                            node.neighbors(expanding_from_src).add(expand_node)
                            # lb.d(f"   Added viable step")
                            if self.node_connects(node, expanding_from_src):
                                self.connecting_nodes.add(node)
                                lb.d(f"   Connecting author found!")
            lb.d("All expansions complete")
            self.n_iterations += 1
            if len(self.connecting_nodes) > 0:
                break
            elif self.n_iterations > 8:
                raise PathFinderError(
                    "too_far",
                    "The distance is >8, which is quite far. Giving up.")
            else:
                continue
        self.produce_final_graph()
        lb.set_n_connections(len(self.connecting_nodes))
        lb.set_distance(self.src.dist_from_dest)
        lb.on_stop_path_finding()
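
The search strategy above is a bidirectional expansion that always grows the smaller frontier. That core idea, stripped of the PathNode/exclusion machinery, looks roughly like this toy sketch over a plain adjacency dict:

def meet_in_middle(graph, src, dest):
    # Two frontiers, keyed by "expanding from src?"; expand the smaller one.
    frontiers = {True: {src}, False: {dest}}
    seen = {True: {src}, False: {dest}}
    while frontiers[True] and frontiers[False]:
        from_src = len(frontiers[True]) <= len(frontiers[False])
        next_frontier = set()
        for node in frontiers[from_src]:
            for neighbor in graph.get(node, ()):
                if neighbor in seen[not from_src]:
                    return True           # the two searches have met
                if neighbor not in seen[from_src]:
                    seen[from_src].add(neighbor)
                    next_frontier.add(neighbor)
        frontiers[from_src] = next_frontier
    return False

graph = {'a': ['b'], 'b': ['a', 'c'], 'c': ['b', 'd'], 'd': ['c']}
assert meet_in_middle(graph, 'a', 'd')
assert not meet_in_middle(graph, 'a', 'z')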
Example #28
    def test_errors(self):
        with self.assertRaises(InvalidName):
            ADSName.parse(",last, first")
        with self.assertRaises(InvalidName):
            ADSName.parse(",last")
Example #29
    def __post_init__(self):
        if self.name is not None:
            self.name = ADSName.parse(self.name)
        if self.timestamp == -1:
            self.timestamp = int(time.time())
Example #30
    def __init__(self, src, dest, excluded_names=None):
        self.repository = Repository()
        if not key_is_valid(src) and not is_orcid_id(src):
            raise PathFinderError("invalid_char_in_name",
                                  'The "source" name is invalid.')
        if not key_is_valid(dest) and not is_orcid_id(dest):
            raise PathFinderError("invalid_char_in_name",
                                  'The "destination" name is invalid.')

        names_to_be_queried = []
        if is_orcid_id(src):
            src = normalize_orcid_id(src)
        else:
            try:
                src = ADSName.parse(src)
            except InvalidName:
                raise PathFinderError("invalid_char_in_name",
                                      'The "source" name is invalid.')
            if src.excludes_self:
                raise PathFinderError(
                    "src_invalid_lt_gt",
                    "'<' and '>' are invalid modifiers for the source and "
                    "destination authors and can only be used in the "
                    "exclusions "
                    "list. Try '<=' or '>=' instead.")
            names_to_be_queried.append(src)

        if is_orcid_id(dest):
            dest = normalize_orcid_id(dest)
        else:
            try:
                dest = ADSName.parse(dest)
            except InvalidName:
                raise PathFinderError("invalid_char_in_name",
                                      'The "destination" name is invalid.')
            if dest.excludes_self:
                raise PathFinderError(
                    "dest_invalid_lt_gt",
                    "'<' and '>' are invalid modifiers for the source and "
                    "destination authors and can only be used in the "
                    "exclusions "
                    "list. Try '<=' or '>=' instead.")
            names_to_be_queried.append(dest)

        if type(src) == type(dest) and src == dest:
            raise PathFinderError(
                "src_is_dest",
                'The "source" and "destination" names are equal (or at least'
                ' consistent). The distance is zero. APPA would like something'
                ' more challenging, please.')

        self.excluded_names = NameAwareSet()
        self.excluded_bibcodes = set()
        if excluded_names is not None:
            if type(excluded_names) is str:
                excluded_names = [excluded_names]
            for name in excluded_names:
                name = name.strip()
                if name == '':
                    continue
                elif is_bibcode(name):
                    self.excluded_bibcodes.add(name)
                else:
                    try:
                        self.excluded_names.add(ADSName.parse(name))
                    except InvalidName:
                        raise PathFinderError(
                            "invalid_excl",
                            f"'{name}' is an invalid name to exclude.")

        self.repository.notify_of_upcoming_author_request(*names_to_be_queried)
        self.authors_to_expand_src = []
        self.authors_to_expand_src_next = []
        self.authors_to_expand_dest = []
        self.authors_to_expand_dest_next = []

        self.nodes = NameAwareDict()
        self.connecting_nodes = set()

        self.orig_src = src
        self.orig_dest = dest