def mergeResultData(result1, result2):
    """
    Merges bibtex and extra_data dictionaries for a SearchResult and/or a Paper

    :param result1: result to merge data into (modified in place)
    :param result2: result to merge data from
    :return: result1, with the merged data
    """
    # if there's no year we should update the ID after getting the year
    to_update_id = not result1.bib.get('year') or 'ID' not in result1.bib

    # keep the longer value for each transferable bibtex field
    for field in BIB_FIELDS_TRANSFER:
        if len(str(result2.bib.get(field, ''))) > len(str(result1.bib.get(field, ''))):
            result1.bib[field] = str(result2.bib[field])

    for field in ['ID', 'ENTRYTYPE']:
        if field in result2.bib:
            result1.bib[field] = str(result2.bib[field])

    # regenerate the ID if result2 didn't provide one and ours was incomplete
    if 'ID' not in result2.bib and to_update_id:
        if 'ID' in result1.bib:
            del result1.bib['ID']
        fixBibData(result1.bib, 1)

    # copy over any extra_data fields we don't already have
    for field in result2.extra_data:
        if field not in result1.extra_data:
            result1.extra_data[field] = result2.extra_data[field]

    if 'urls' in result2.extra_data:
        for url in result2.extra_data['urls']:
            addUrlIfNew(result1, url['url'], url['type'], url['source'])

    refreshDOIfromURLs(result1)
    return result1
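A minimal sketch of how this might be called when folding a Crossref record into an existing Google Scholar hit. The SearchResult signature (index, bib, source, extra_data) is taken from the scraper below; all field values are illustrative placeholders.

# Hypothetical usage; every value below is made up for illustration.
gs_result = SearchResult(0, {'title': 'A study of things', 'year': '2019'},
                         'scholar', {})
cr_result = SearchResult(0,
                         {'title': 'A Study of Things',
                          'doi': '10.1234/example.doi',  # placeholder DOI
                          'abstract': 'Longer abstract text from Crossref...'},
                         'crossref',
                         {'urls': [{'url': 'https://doi.org/10.1234/example.doi',
                                    'type': 'main', 'source': 'crossref'}]})
merged = mergeResultData(gs_result, cr_result)  # gs_result is mutated and returned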
def search(self, query, min_year=None, max_year=None, max_results=MAX_RESULTS):
    # TODO: implement max_year filtering
    if min_year:
        # monkey-patch scholarly's private search URL template to inject
        # the minimum-year filter
        scholarly.scholarly._PUBSEARCH = '/scholar?as_ylo=' + str(min_year) + '&q={0}'

    query = scholarly.search_pubs_query(query)

    results = []
    index = 0
    for result in tqdm(query, desc="Getting results", total=max_results):
        bib = fixBibData(result.bib, index)

        extra_data = {}
        for field in ['citedby', 'url_scholarbib']:
            if hasattr(result, field):
                extra_data[field] = getattr(result, field)

        if hasattr(result, 'id_scholarcitedby'):
            extra_data['scholarid'] = result.id_scholarcitedby

        for field in ['url', 'eprint']:
            if hasattr(result, field):
                bib[field] = getattr(result, field)

        # guard the url access: not every Scholar result carries a url
        if hasattr(result, 'url'):
            addUrlIfNewWithType(result, result.url, 'scholar')

        doi = getDOIfromURL(bib.get('url'))
        if not doi:
            doi = getDOIfromURL(bib.get('eprint', ''))
        if doi:
            bib['doi'] = doi

        result = SearchResult(index, bib, result.source, extra_data)
        results.append(result)
        index += 1

        if len(results) == max_results:
            break

        # sleep at random intervals every 10 results to avoid getting blocked
        if len(results) % 10 == 0:
            self.randomSleep()

    return results
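For context, a hedged usage sketch. The class this search() is defined on isn't shown here, so GScholarScraper below is a stand-in name, and the query string is illustrative.

scraper = GScholarScraper()  # stand-in name for the class defining search()
results = scraper.search('semantic parsing', min_year=2015, max_results=50)
for res in results:
    print(res.bib.get('title'), res.bib.get('doi'))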
def readRIS(filename):
    res = []
    with open(filename, 'r') as f:
        entries = readris(f)
        for entry in entries:
            entry['author'] = authorListFromListOfAuthors(entry.get('authors', []))
            if 'authors' in entry:
                del entry['authors']

            new_type = 'article'
            if entry.get('type_of_reference') in reverse_type_mapping:
                new_type = reverse_type_mapping[entry['type_of_reference']]
            entry['ENTRYTYPE'] = new_type

            entry = fixBibData(entry, 0)
            res.append(entry)
    return res
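A quick usage sketch, assuming readris is the RIS parser from the RISparser package (its successor, rispy, offers the same parsing via rispy.load). The file path is illustrative.

entries = readRIS('exported_library.ris')  # path is a placeholder
for entry in entries:
    print(entry['ENTRYTYPE'], entry.get('title'), entry.get('author'))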
def enrichMetadata(paper: Paper, identity):
    """
    Tries to retrieve metadata from Crossref and an abstract from SemanticScholar
    for a given paper, falling back to Google Scholar bib data if all else fails

    :param paper: Paper instance
    :param identity: identity passed through to the individual scrapers
    """
    paper.title = basicTitleCleaning(paper.title)
    original_title = paper.title

    if paper.pmid and not paper.extra_data.get('done_pubmed'):
        pubmed_scraper.enrichWithMetadata(paper)
        paper.extra_data['done_pubmed'] = True

    # if we don't have a DOI, we need to find it on Crossref
    if not paper.doi and not paper.extra_data.get('done_crossref', False):
        crossref_scraper.matchPaperFromResults(paper, identity)
        if paper.doi:
            new_bib = getBibtextFromDOI(paper.doi)
            paper = mergeResultData(
                paper, SearchResult(1, new_bib[0], 'crossref', paper.extra_data))
        paper.extra_data['done_crossref'] = True

    # if we have a DOI and we haven't got the abstract yet
    if paper.doi and not paper.extra_data.get('done_semanticscholar'):
        semanticscholarmetadata.getMetadata(paper)
        paper.extra_data['done_semanticscholar'] = True

    # try PubMed if we still don't have a PMID
    if not paper.pmid and not paper.extra_data.get('done_pubmed'):
        if pubmed_scraper.matchPaperFromResults(paper, identity,
                                                ok_title_distance=0.4):
            pubmed_scraper.enrichWithMetadata(paper)
        paper.extra_data['done_pubmed'] = True

    # still no SemanticScholar id? maybe we can match the paper there
    if not paper.extra_data.get('ss_id') and not paper.extra_data.get(
            'done_semanticscholar'):
        semanticscholarmetadata.matchPaperFromResults(paper, identity)
        paper.extra_data['done_semanticscholar'] = True

    # TODO: try Scopus as well, see if the paper is behind a paywall

    # if we don't have an abstract, maybe it's on arXiv
    if not paper.has_full_abstract and not paper.extra_data.get('done_arxiv'):
        arxiv_scraper.matchPaperFromResults(paper, identity, ok_title_distance=0.35)
        paper.extra_data['done_arxiv'] = True

    # try to get open access links from Unpaywall if we have a DOI but no PDF link
    if not paper.has_pdf_link and paper.doi and not paper.extra_data.get(
            'done_unpaywall'):
        unpaywall_scraper.getMetadata(paper, identity)
        paper.extra_data['done_unpaywall'] = True

    # if all else has failed but we have a link to Google Scholar bib data, get that
    if not paper.year and paper.extra_data.get('url_scholarbib'):
        scholar_scraper.getBibtex(paper)

    if paper.title != original_title:
        print('Original: %s\nNew: %s' % (original_title, paper.title))

    paper.bib = fixBibData(paper.bib, 1)
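A sketch of driving the whole enrichment pipeline. Here papers is assumed to be a list of Paper instances, and identity a contact string (e.g. an email address) of the kind polite APIs like Crossref and Unpaywall expect; both names are assumptions.

identity = 'you@example.com'   # hypothetical contact identity
for paper in papers:           # papers: an assumed list of Paper instances
    enrichMetadata(paper, identity)
    print(paper.title, paper.doi or '(no DOI)', paper.extra_data.get('ss_id'))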
def getMetadata(self, paper, get_citing_papers=False):
    if not paper.doi and not paper.extra_data.get('ss_id'):
        raise ValueError('paper has no DOI or SSID')

    # prefer the SemanticScholar id over the DOI when we have one
    if paper.extra_data.get('ss_id'):
        unique_id = paper.extra_data.get('ss_id')
    else:
        unique_id = paper.doi

    url = 'https://api.semanticscholar.org/v1/paper/' + unique_id
    r = self.request(url)
    d = r.json()

    if 'error' in d:
        print("SemanticScholar error:", d['error'])
        return

    for field in ['abstract', 'year', 'venue']:
        if d.get(field):
            paper.bib[field] = str(d[field])

    if d.get('arxivId'):
        paper.arxivid = d['arxivId']

    for topic in d.get('topics', []):
        # we really don't need to store the url, it's just
        # https://www.semanticscholar.org/topic/{topicId}
        del topic['url']

    authors = self.loadSSAuthors(d['authors'])
    paper.bib['author'] = authorListFromDict(authors)
    paper.extra_data['ss_topics'] = d.get('topics', [])
    paper.extra_data['ss_authors'] = d['authors']
    paper.extra_data['ss_id'] = d['paperId']

    if get_citing_papers:
        citing_papers = []
        for index, citation in enumerate(d.get('citations', [])):
            ss_authors = semanticscholarmetadata.loadSSAuthors(citation['authors'])
            authors = authorListFromDict(ss_authors)
            bib = {
                'title': citation['title'],
                'author': authors,
                'year': citation['year'],
                'doi': citation.get('doi'),
            }
            bib = fixBibData(bib, index)
            extra_data = {
                'ss_id': citation['paperId'],
                'ss_influential': citation['isInfluential'],
                'ss_authors': ss_authors,
            }
            if citation.get('arxivId'):
                extra_data['arxivid'] = citation.get('arxivId')
            new_paper = Paper(bib, extra_data)
            citing_papers.append(new_paper)
        return paper, citing_papers

    return paper
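Finally, a sketch of fetching citing papers through this wrapper; scraper is a hypothetical instance of the class defining getMetadata(). Note that the method returns a (paper, citing_papers) tuple only when get_citing_papers=True, and None on a SemanticScholar error.

result = scraper.getMetadata(paper, get_citing_papers=True)
if result:  # getMetadata() returns None on a SemanticScholar error
    paper, citing_papers = result
    for cp in citing_papers:
        print(cp.bib['title'], cp.extra_data.get('ss_influential'))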