Example #1
    def get_papers_for_orcid_id(self, orcid_id):
        orcid_id = normalize_orcid_id(orcid_id)
        lb.i(f"Querying ADS for orcid id " + orcid_id)
        query = f"orcid:({orcid_id})"

        documents = self._inner_query_for_author(query, 1)

        author_record = AuthorRecord(name=ADSName.parse(orcid_id,
                                                        preserve=True),
                                     documents=[])
        names = set()
        for document in documents:
            try:
                i = document.orcid_ids.index(orcid_id)
            except ValueError:
                lb.w(f"ORCID ID not found in {document.bibcode}")
                continue
            author_record.documents.append(document.bibcode)
            names.add(document.authors[i])

        # Find the most-detailed form of the name
        if len(names):
            names = [ADSName.parse(n) for n in names]
            intermed = [(n.level_of_detail, len(n.full_name), n)
                        for n in names]
            intermed.sort(reverse=True)
            author_record.name = intermed[0][-1]
        return author_record, documents
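
The name-selection step at the end is a decorate-sort pattern: each candidate name is wrapped in a (level_of_detail, length, name) tuple so a single descending sort puts the most-detailed form first. A self-contained sketch of that pattern, with plain tuples standing in for ADSName objects:

    # (level_of_detail, len(full_name), name) triples; the name rides along
    candidates = [(1, 8, "Smith, J"), (2, 14, "Smith, John Q."),
                  (2, 11, "Smith, John")]
    candidates.sort(reverse=True)
    best = candidates[0][-1]
    # best == 'Smith, John Q.'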
Example #2
def _rank_author_chains(chains: list, repo, pairings):
    items = []
    for chain in chains:
        scores, paper_choices = _score_author_chain(chain, repo, pairings)
        if scores is None:
            continue
        # We'd like papers to be sorted by score descending, and then
        # alphabetically by title as the tie-breaker. So here we look up those
        # titles. `paper_choices` looks like:
        # ( [ (bibcode, 0, 1), (bibcode, 0, 1), (bibcode, 0, 1) ],
        #   [ (bibcode, 0, 1), (bibcode, 0, 1), (bibcode, 0, 1) ] )
        # Each column represents a chain link (A -> B).
        # Each row gives you one paper for each chain link
        # We want to replace each inner tuple with a paper title
        titles = [[
            repo.get_document(bibcode).title for bibcode, _, _ in paper_choice
        ] for paper_choice in paper_choices]

        # Normalize the author names now, since they're used below as a
        # secondary sort key.
        new_chain = normalize_author_names(paper_choices, repo)

        # Negate scores so we have a sort that's descending by actual score
        # and then ascending by title
        intermed = zip([-s for s in scores], titles, paper_choices)
        intermed = sorted(intermed)
        scores, _, paper_choices = zip(*intermed)
        items.append((scores[0], new_chain, paper_choices))

    if len(items) == 0:
        return None

    # The scores are still negative, so now we get a sort that's descending by
    # actual score and then ascending by author names.
    intermed = sorted(items)

    # Since we normalized the chains earlier, it's possible that we have
    # duplicate chains (two different forms of a name that have been normalized
    # to the same form). Let's de-duplicate. Since it's possible that the two
    # forms have different paper choices, doing it here means we can choose
    # the highest-ranked form.
    chains_we_have_seen = set()
    result = []
    for score, chain, pc in intermed:
        if chain not in chains_we_have_seen:
            result.append((-score, chain, pc))
        chains_we_have_seen.add(chain)

    if len(result) != len(chains):
        lb.w(f"{len(chains) - len(result)} / {len(chains)} chains invalidated")

    return result
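
The two sorts above rely on one trick: negating the numeric score lets a single ascending sort run descending by actual score while the tie-breakers (title, then author names) still sort ascending. A self-contained illustration:

    # Sort descending by score, ascending by title, in one ascending pass
    records = [(0.9, "B title"), (0.7, "A title"), (0.9, "A title")]
    intermed = sorted((-score, title) for score, title in records)
    ordered = [(-neg_score, title) for neg_score, title in intermed]
    # ordered == [(0.9, 'A title'), (0.9, 'B title'), (0.7, 'A title')]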
Example #3
    def _do_query_for_author(self, params, n_authors):
        t_start = time.time()
        r = requests.get("https://api.adsabs.harvard.edu/v1/search/query",
                         params=params,
                         headers={"Authorization": f"Bearer {ADS_TOKEN}"},
                         timeout=(6.05, 6 * n_authors))
        t_elapsed = time.time() - t_start
        lb.on_network_complete(t_elapsed)
        if t_elapsed > 2 * n_authors:
            lb.w(f"Long ADS query: {t_elapsed:.2f} s for {params['q']}")

        if 'X-RateLimit-Remaining' in r.headers:
            if int(r.headers['X-RateLimit-Remaining']) <= 1:
                reset = time.strftime(
                    "%Y-%m-%d %H:%M:%S UTC",
                    time.gmtime(int(r.headers.get('X-RateLimit-Reset', 0))))
                raise ADSRateLimitError(r.headers.get('X-RateLimit-Limit'),
                                        reset)
        else:
            lb.w("ADS query did not return X-RateLimit-Remaining")

        r_data = r.json()
        if "error" in r_data:
            raise ADSError('ads_error', r_data['error']['msg'])

        documents = self._articles_to_records(r_data['response']['docs'])

        if r_data['response']['numFound'] > len(documents) + params['start']:
            lb.i(f"Got too many documents in request."
                 f" numFound: {r_data['response']['numFound']}"
                 f" start: {params['start']}"
                 f" docs rec'd: {len(documents)}")
            params['start'] += len(documents)
            documents.extend(self._do_query_for_author(params, n_authors))

        return documents
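
The recursion at the end is standard offset pagination: when numFound exceeds start plus the documents received, the query is re-issued with an advanced start. A minimal iterative sketch of the same contract; fetch_page is hypothetical and stands in for one HTTP round-trip returning (page_docs, num_found):

    def fetch_all(params, fetch_page):
        # Advance the 'start' offset until every matching document arrives
        documents = []
        while True:
            page_docs, num_found = fetch_page(params)
            documents.extend(page_docs)
            if not page_docs or num_found <= params['start'] + len(page_docs):
                return documents
            params['start'] += len(page_docs)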
Example #4
    def get_papers_for_author(self, query_author):
        query_author = ADSName.parse(query_author)

        query_authors = self._select_authors_to_prefetch()
        if query_author not in query_authors:
            query_authors.append(query_author)

        lb.i(f"Querying ADS for author " + query_author.qualified_full_name)
        if len(query_authors) > 1:
            lb.i(" Also prefetching. Query: " +
                 "; ".join([a.qualified_full_name for a in query_authors]))

        query_strings = []
        for author in query_authors:
            query_string = '"' + author.full_name + '"'
            if author.require_exact_match:
                query_string = "=" + query_string
            query_strings.append(query_string)
        query = " OR ".join(query_strings)
        query = f"author:({query})"

        documents = self._inner_query_for_author(query, len(query_authors))

        author_records = NameAwareDict()
        for author in query_authors:
            author_records[author] = AuthorRecord(name=author, documents=[])
        # We need to go through all the documents and match them to our
        # author list. This is critically important if we're pre-fetching
        # authors, but it's also important to support the "<" and ">"
        # specificity selectors for author names
        for document in documents:
            matched = False
            names = [ADSName.parse(n) for n in document.authors]
            for name in names:
                try:
                    author_records[name].documents.append(document.bibcode)
                    matched = True
                except KeyError:
                    pass
            if (not matched and all(
                    not a.require_more_specific and not a.require_less_specific
                    for a in query_authors)):
                # See if we can guess which names should have been matched
                guesses = []
                doc_authors = [n.full_name for n in names]
                doc_authors_initialized = \
                    [n.convert_to_initials().full_name for n in names]
                # Use a separate loop variable so the `query_author`
                # parameter (needed again below) isn't clobbered
                for q_author in query_authors:
                    guess = difflib.get_close_matches(
                        q_author.full_name, doc_authors, n=1, cutoff=0.8)
                    if len(guess):
                        guesses.append(
                            f"{q_author.full_name} -> {guess[0]}")
                    else:
                        # Try again, changing names to use initials throughout
                        guess = difflib.get_close_matches(
                            q_author.convert_to_initials().full_name,
                            doc_authors_initialized,
                            n=1,
                            cutoff=0.7)
                        if len(guess):
                            # Having found a match with initialized names,
                            # report using the full form of each name
                            chosen_doc_author = doc_authors[
                                doc_authors_initialized.index(guess[0])]
                            guesses.append(f"{q_author.full_name}"
                                           f" -> {chosen_doc_author}")
                msg = "ADS Buddy: No matches for " + document.bibcode
                if len(guesses):
                    msg += " . Guesses: " + "; ".join(guesses)
                lb.w(msg)

        for author_record in author_records.values():
            # Remove any duplicate document listings
            # Becomes important for papers with _many_ authors, e.g. LIGO
            # papers, which use only initials and so can have duplicate names
            author_record.documents = sorted(set(author_record.documents))

        if len(query_authors) == 1:
            return author_records[query_author], documents
        else:
            return author_records, documents
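
The unmatched-document branch leans on difflib.get_close_matches for its guesses. A self-contained example of how that call behaves with the 0.8 cutoff used above; the names are made up for illustration:

    import difflib

    doc_authors = ["Smith, J.", "Jones, A. B.", "Lee, C."]
    guess = difflib.get_close_matches("Jones, A.", doc_authors,
                                      n=1, cutoff=0.8)
    # guess == ['Jones, A. B.']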
Example #5
    def _article_to_record(self, article):
        # Not every ORCID ID field is returned for every document, and not
        # every returned list has an entry for each author
        for key in ('orcid_pub', 'orcid_user', 'orcid_other'):
            if key not in article:
                article[key] = []
            article[key] = ['' if x == '-' else x for x in article[key]]
            article[key] += \
                [''] * (len(article['author']) - len(article[key]))

        # Choose one ORCID ID for each author
        orcid_id = []
        orcid_src = []
        for op, ou, oo in zip(article['orcid_pub'], article['orcid_user'],
                              article['orcid_other']):
            if op != '' and is_orcid_id(op):
                orcid_id.append(normalize_orcid_id(op))
                orcid_src.append(1)
            elif ou != '' and is_orcid_id(ou):
                orcid_id.append(normalize_orcid_id(ou))
                orcid_src.append(2)
            elif oo != '' and is_orcid_id(oo):
                orcid_id.append(normalize_orcid_id(oo))
                orcid_src.append(3)
            else:
                orcid_id.append('')
                orcid_src.append(0)

        article['aff'] = ['' if x == '-' else x for x in article['aff']]

        document = DocumentRecord(
            bibcode=article["bibcode"],
            title=(unescape(article["title"][0])
                   if "title" in article else "[No title given]"),
            authors=[unescape(a) for a in article["author"]],
            affils=[unescape(a) for a in article["aff"]],
            doctype=article["doctype"],
            keywords=([unescape(k) for k in article["keyword"]]
                      if "keyword" in article else []),
            publication=(unescape(article["pub"])
                         if "pub" in article else "[Publication not given]"),
            pubdate=article["date"],
            citation_count=(article["citation_count"]
                            if "citation_count" in article else 0),
            read_count=(article["read_count"]
                        if "read_count" in article else 0),
            orcid_ids=orcid_id,
            orcid_id_src=orcid_src)

        # Alter the DocumentRecord in-place to remove invalid author names
        bad_indices = []
        for i, author in enumerate(document.authors):
            try:
                name = ADSName.parse(author)
            except InvalidName:
                lb.w(f"Invalid name for {document.bibcode}: {author}")
                bad_indices.append(i)
                continue

            if name.full_name in ("et al", "anonymous"):
                bad_indices.append(i)

        for i in reversed(bad_indices):
            document.delete_author(i)

        return document
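
The ORCID-selection loop encodes a fixed priority, orcid_pub over orcid_user over orcid_other, recording the winning source as 1/2/3 (0 when no valid ID exists). A compact sketch of the same selection; the is_valid parameter stands in for the project's is_orcid_id helper:

    def pick_orcid(op, ou, oo, is_valid):
        # Take the first valid candidate in priority order:
        # publisher (1), user (2), other (3); else ('', 0)
        for src, candidate in enumerate((op, ou, oo), start=1):
            if candidate and is_valid(candidate):
                return candidate, src
        return '', 0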