Example #1
0
def main(conf):
    if conf.cache:
        paperstore = PaperStore()
    else:
        paperstore = None

    if conf.engine == "scholar":
        searcher = GScholarSearcher(paperstore)
    # elif conf.engine == "pubmed":
    #     searcher = PubMedSearcher(paperstore)
    else:
        raise ValueError('Unknown search engine: %s' % conf.engine)

    if conf.query_file:
        with open(conf.query_file, 'r') as f:
            query = f.read()
    else:
        query = conf.query

    print("Query:", query)

    results = searcher.search(query, min_year=conf.year_start, max_results=conf.max)

    if conf.cache:
        found, missing = paperstore.matchResultsWithPapers(results)

        papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]
        paperstore.updatePapers(papers_to_add)

    writeBibtex([Paper(res.bib, res.extra_data) for res in results], conf.file)
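The conf object above is just a namespace carrying the attributes the script reads; a minimal command-line wrapper could look like the sketch below, where every flag name and default is an assumption chosen only to mirror those attributes, not the project's actual CLI.

# Hypothetical CLI wrapper for main(conf); flag names and defaults are assumptions.
import argparse

def parse_args():
    parser = argparse.ArgumentParser(description='Search for papers and export the results as BibTeX')
    parser.add_argument('--engine', default='scholar', help='search engine to query')
    parser.add_argument('--query', help='query string')
    parser.add_argument('--query-file', dest='query_file', help='read the query from this file instead')
    parser.add_argument('--year-start', dest='year_start', type=int, default=None, help='minimum publication year')
    parser.add_argument('--max', type=int, default=100, help='maximum number of results')
    parser.add_argument('--cache', action='store_true', help='store results in the local PaperStore')
    parser.add_argument('--file', default='search_results.bib', help='output BibTeX file')
    return parser.parse_args()

if __name__ == '__main__':
    main(parse_args())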
Example #2
0
def main(conf):
    if conf.cache:
        paperstore = PaperStore()
    else:
        paperstore = None

    bib_entries = loadRefsFromHTML(conf.input)

    results = getSearchResultsFromBib(bib_entries)

    if paperstore:
        found, missing = paperstore.matchResultsWithPapers(results)
    else:
        found = []
        missing = results

    papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]

    counter = 0

    for res in found:
        if res.bib.get('url'):
            if addUrlIfNewWithType(res.paper, res.bib['url'], 'endnote'):
                counter += 1
        if res.bib.get('eprint'):
            if addUrlIfNewWithType(res.paper, res.bib['eprint'], 'endnote'):
                counter += 1

    papers_existing = [res.paper for res in found]
    if paperstore:
        paperstore.updatePapers(papers_existing)

    print('Papers found', len(papers_existing))
    print('Papers not found', len(papers_to_add))
    print('Added', counter, 'urls')
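addUrlIfNewWithType() is not shown in this snippet; the sketch below only captures the contract the loop above relies on, namely that it returns True when a new URL was actually recorded. Where the URLs live on the Paper (extra_data['urls']) and the dict layout are assumptions.

# Hedged sketch of addUrlIfNewWithType(); storage location and layout are assumptions.
def addUrlIfNewWithType(paper, url, url_type):
    urls = paper.extra_data.setdefault('urls', [])
    if any(entry.get('url') == url for entry in urls):
        return False  # already known, nothing added
    urls.append({'url': url, 'type': url_type})
    return True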
Example #3
0
def loadEntriesAndSetUp(input, use_cache=True, max_results=10000000):
    if use_cache:
        paperstore = PaperStore()
    else:
        paperstore = None

    bib_entries = readInputBib(input)
    results = getSearchResultsFromBib(bib_entries, max_results)

    results = simpleResultDeDupe(results)

    if paperstore:
        found, missing = paperstore.matchResultsWithPapers(results)
    else:
        found = []
        missing = results

    papers_to_add = [Paper(res.bib, res.extra_data) for res in missing]
    papers_existing = [mergeResultData(res, res.paper) for res in found]

    all_papers = papers_to_add + papers_existing

    # FIXME: a second dedupe is needed because it seems I'm matching the wrong paper
    # a total of 5 records suffer from this so it's no big deal
    all_papers = simpleResultDeDupe(all_papers)

    return paperstore, papers_to_add, papers_existing, all_papers
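simpleResultDeDupe() is called both on SearchResult lists and, per the FIXME above, on Paper lists, so it can only rely on the .bib dict both types expose. A possible implementation, in which the title normalisation is an assumption:

# Possible simpleResultDeDupe(): keep the first item per normalised title.
# The normalisation (lowercase, alphanumerics only) is an assumption.
import re

def simpleResultDeDupe(items):
    seen = set()
    unique = []
    for item in items:
        key = re.sub(r'[^a-z0-9]', '', (item.bib.get('title') or '').lower())
        if key and key in seen:
            continue
        seen.add(key)
        unique.append(item)
    return unique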
Example #4
0
def set_union(a, b):
    # a and b are dicts of bib entries keyed by a unique paper id
    res = set(a.keys()) | set(b.keys())
    full_dict = merge_two_dicts(a, b)
    # full_dict covers every key in res, so this keeps one merged entry per key
    res_list = [value for key, value in full_dict.items() if key in res]
    return [Paper(x, {}) for x in res_list]
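A small usage sketch for set_union(); the DOI-keyed dicts below are made-up data, and merge_two_dicts() is assumed to be the usual copy-and-update helper.

# Made-up input data; merge_two_dicts() semantics are an assumption.
def merge_two_dicts(a, b):
    merged = dict(a)
    merged.update(b)
    return merged

corpus_a = {'10.1000/aaa': {'title': 'Paper A'}, '10.1000/bbb': {'title': 'Paper B'}}
corpus_b = {'10.1000/bbb': {'title': 'Paper B'}, '10.1000/ccc': {'title': 'Paper C'}}

papers = set_union(corpus_a, corpus_b)
print(len(papers))  # 3 -> one Paper per unique key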
Example #5
0
def set_intersect(a, b):
    # a and b are dicts of bib entries keyed by a unique paper id
    res = set(a.keys()) & set(b.keys())
    # for keys present in both dicts, the bib entry from a is kept
    res_list = [value for key, value in a.items() if key in res]
    return [Paper(x, {}) for x in res_list]
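And a matching usage sketch for set_intersect(); note that for keys present in both dicts the bib data is taken from the first argument. The data below is made up.

# Made-up input data to show which side's entry survives the intersection.
corpus_a = {'10.1000/bbb': {'title': 'Paper B (copy in a)'}, '10.1000/ddd': {'title': 'Paper D'}}
corpus_b = {'10.1000/bbb': {'title': 'Paper B (copy in b)'}, '10.1000/ccc': {'title': 'Paper C'}}

common = set_intersect(corpus_a, corpus_b)
print(len(common))  # 1 -> a single Paper, built from corpus_a's entry for 10.1000/bbb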
def enrichMetadata(paper: Paper, identity):
    """
    Tries to retrieve metadata from Crossref and abstract from SemanticScholar for a given paper,
    Google Scholar bib if all else fails

    :param paper: Paper instance
    """
    paper.title = basicTitleCleaning(paper.title)
    original_title = paper.title

    if paper.pmid and not paper.extra_data.get("done_pubmed"):
        pubmed_scraper.enrichWithMetadata(paper)
        paper.extra_data['done_pubmed'] = True

    # if we don't have a DOI, we need to find it on Crossref
    if not paper.doi and not paper.extra_data.get('done_crossref', False):
        crossref_scraper.matchPaperFromResults(paper, identity)

        if paper.doi:
            new_bib = getBibtextFromDOI(paper.doi)
            paper = mergeResultData(
                paper, SearchResult(1, new_bib[0], 'crossref',
                                    paper.extra_data))
        paper.extra_data['done_crossref'] = True

    # if we have a DOI and we haven't got the abstract yet
    if paper.doi and not paper.extra_data.get('done_semanticscholar'):
        semanticscholarmetadata.getMetadata(paper)
        paper.extra_data['done_semanticscholar'] = True

    # try PubMed if we still don't have a PMID
    if not paper.pmid and not paper.extra_data.get('done_pubmed'):
        # if (not paper.doi or not paper.has_full_abstract) and not paper.pmid and not paper.extra_data.get('done_pubmed'):
        if pubmed_scraper.matchPaperFromResults(paper,
                                                identity,
                                                ok_title_distance=0.4):
            pubmed_scraper.enrichWithMetadata(paper)
        paper.extra_data['done_pubmed'] = True

    # no SemanticScholar ID yet? try matching the paper directly on SemanticScholar
    if not paper.extra_data.get('ss_id') and not paper.extra_data.get(
            'done_semanticscholar'):
        semanticscholarmetadata.matchPaperFromResults(paper, identity)
        paper.extra_data['done_semanticscholar'] = True

    # # time to try Scopus, see if it's behind a paywall
    # if not paper.doi and not paper.extra_data.get('done_scopus'):
    #     semanticscholarmetadata.getMetadata(paper)
    #     paper.extra_data['done_semanticscholar'] = True

    # if we don't have an abstract maybe it's on arXiv
    if not paper.has_full_abstract and not paper.extra_data.get('done_arxiv'):
        # if not paper.extra_data.get('done_arxiv'):
        arxiv_scraper.matchPaperFromResults(paper,
                                            identity,
                                            ok_title_distance=0.35)
        paper.extra_data['done_arxiv'] = True

    # try to get open access links if DOI present and missing PDF link
    if not paper.has_pdf_link and paper.doi and not paper.extra_data.get(
            'done_unpaywall'):
        unpaywall_scraper.getMetadata(paper, identity)
        paper.extra_data['done_unpaywall'] = True

    # if all else has failed but we have a link to Google Scholar bib data, get that
    if not paper.year and paper.extra_data.get('url_scholarbib'):
        scholar_scraper.getBibtex(paper)

    if paper.title != original_title:
        print('Original: %s\nNew: %s' % (original_title, paper.title))
    paper.bib = fixBibData(paper.bib, 1)
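A possible driver that ties enrichMetadata() to loadEntriesAndSetUp() from above; the periodic checkpointing via paperstore.updatePapers() and the opaque identity argument are assumptions about the intended workflow, not code from the project.

# Hypothetical driver loop; the checkpoint interval and identity object are assumptions.
def enrichAll(input_file, identity, save_every=10):
    paperstore, papers_to_add, papers_existing, all_papers = loadEntriesAndSetUp(input_file)

    for index, paper in enumerate(all_papers):
        enrichMetadata(paper, identity)
        if paperstore and (index + 1) % save_every == 0:
            paperstore.updatePapers(all_papers)  # checkpoint progress so far

    if paperstore:
        paperstore.updatePapers(all_papers)
    return all_papers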
    def getMetadata(self, paper, get_citing_papers=False):
        if not paper.doi and not paper.extra_data.get('ss_id'):
            raise ValueError('paper has no DOI or SSID')

        if paper.extra_data.get('ss_id'):
            unique_id = paper.extra_data.get('ss_id')
        else:
            unique_id = paper.doi

        url = 'https://api.semanticscholar.org/v1/paper/' + unique_id

        r = self.request(url)
        d = r.json()

        if 'error' in d:
            print("SemanticScholar error:", d['error'])
            return

        for field in ['abstract', 'year', 'venue']:
            if d.get(field):
                paper.bib[field] = str(d[field])

        if d.get('arxivId'):
            paper.arxivid = d['arxivId']

        for topic in d['topics']:
            # we really don't need to store the url, it's just
            # https://www.semanticscholar.org/topic/{topicId}
            del topic['url']

        authors = self.loadSSAuthors(d['authors'])
        paper.bib['author'] = authorListFromDict(authors)

        paper.extra_data['ss_topics'] = d['topics']
        paper.extra_data['ss_authors'] = d['authors']
        paper.extra_data['ss_id'] = d['paperId']

        if get_citing_papers:
            citing_papers = []
            for index, citation in enumerate(d['citations']):
                ss_authors = self.loadSSAuthors(citation['authors'])
                authors = authorListFromDict(ss_authors)

                bib = {
                    'title': citation['title'],
                    'author': authors,
                    'year': citation['year'],
                    'doi': citation.get('doi'),
                }
                bib = fixBibData(bib, index)

                extra_data = {
                    'ss_id': citation['paperId'],
                    'ss_influential': citation['isInfluential'],
                    'ss_authors': ss_authors
                }
                if citation.get('arxivId'):
                    extra_data['arxivid'] = citation.get('arxivId')

                new_paper = Paper(bib, extra_data)
                citing_papers.append(new_paper)
            return paper, citing_papers
        return paper
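A usage sketch for getMetadata(), assuming semanticscholarmetadata is an instance of this class (as the other snippets suggest) and that the Paper exposes the DOI it was constructed with; the DOI below is a placeholder, not real data.

# Placeholder DOI; a real one is needed for the API call to succeed.
paper = Paper({'title': 'Some Paper', 'doi': '10.1000/placeholder.doi'}, {})
result = semanticscholarmetadata.getMetadata(paper, get_citing_papers=True)
if result:
    paper, citing_papers = result
    print(paper.bib.get('year'), '-', len(citing_papers), 'citing papers retrieved')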