def get_data_from_doi(self, doi):
    """Resolve a DOI to PubMed data via ScienceDirect and Entrez.

    Reads the ScienceDirect full-text record for ``doi`` with
    ``self.client`` and calls its ``write()`` method. The PubMed id stored
    in the record is used when present; otherwise PubMed is searched with
    the document title.

    :param doi: DOI of the article to look up.
    :return: the result of ``self.fetch_data_from_pubmed`` for the PubMed
        id(s) found, or ``doi`` unchanged when the document cannot be read
        or no PubMed id can be found.
    """
    doi_doc = FullDoc(doi=doi)
    if not doi_doc.read(self.client):
        print("Read document failed.")
        return doi
    doi_doc.write()

    if 'pubmed-id' in doi_doc._data:
        pubmed_id = doi_doc._data['pubmed-id']
    else:
        print("no pubmed-id, trying with title")
        Entrez.email = '*****@*****.**'
        handle = Entrez.esearch(db='pubmed', retmode='xml', term=doi_doc.title)
        try:
            results = Entrez.read(handle)
        finally:
            # Release the response even when parsing fails.
            handle.close()
        # A title search can match several records; the whole id list is
        # passed on as-is.
        pubmed_id = results['IdList'] if int(results['Count']) > 0 else None

    if pubmed_id is None:
        print("no pubmed id")
        return doi
    return self.fetch_data_from_pubmed(pubmed_id)
def get_paper(doi: str):
    """Fetch the ScienceDirect full-text record for a DOI.

    :param doi: DOI of the article to retrieve.
    :return: the document's ``data`` when the read through the
        module-level ``client`` succeeds, otherwise ``False``.
    """
    document = FullDoc(doi=doi)
    fetched = document.read(client)
    return document.data if fetched else False
def ElsevierScraper(client, target_DOI):
    """
    Uses the Elsevier API with a valid key and a DOI to download the plain text article,
    including the title, abstract, pub date, and the references as an unstructured string.

    The body text is taken as everything after the abstract inside ``originalText`` and is
    then split at the first occurrence of "References" into text and reference string.

    :param client: Elsevier client containing the API key
    :param target_DOI: DOI of the article being scraped
    :return: list containing the DOI, title, abstract, publication date, full text and
        unstructured string of references. Every field but the DOI is "NA" when the article
        cannot be read; the references are "NA" when no "References" heading is found.
    """
    print(target_DOI)
    doi_doc = FullDoc(doi=target_DOI)
    if not doi_doc.read(client):
        # Could save this to a separate file but it's easier to search for NA and make a
        # new sublist after the fact
        print("Error: couldn't read {}.".format(target_DOI))
        return [target_DOI, "NA", "NA", "NA", "NA", "NA"]

    data = doi_doc.data
    core_data = data['coredata']
    abstract = core_data['dc:description']
    text = str(data['originalText'])
    if abstract:
        # Drop the front matter up to and including the abstract. Splitting on a missing
        # (None) abstract would split on whitespace and keep only the last word, and an
        # empty abstract would raise ValueError, so the text is left whole in those cases.
        text = text.split(abstract)[-1]

    # partition keeps everything after the first heading, so a later mention of
    # "References" does not truncate the reference string.
    text, heading, references = text.partition("References")
    if not heading:
        references = "NA"

    date = core_data['prism:coverDisplayDate']
    return [target_DOI, doi_doc.title, abstract, date, text, references]
def readFullDocWithDOI(self, doiID='10.1016/S1525-1578(10)60571-5'):
    """Read a ScienceDirect full-text document by DOI and write it out.

    On a successful read through ``self.client`` the title is printed and
    the document's ``write()`` method is called; otherwise a failure
    message is printed.

    :param doiID: DOI of the document to read.
    """
    document = FullDoc(doi=doiID)
    if not document.read(self.client):
        print("Read document failed.")
        return
    print("doi_doc.title: ", document.title)
    document.write()
def readFullDocWithPII(self, sd_piiID='S1270963817323015'):
    """Read a ScienceDirect full-text document by PII and write it out.

    On a successful read through ``self.client`` the document object and
    its title are printed and the document's ``write()`` method is called;
    otherwise a failure message is printed.

    :param sd_piiID: ScienceDirect PII of the document to read.
    """
    document = FullDoc(sd_pii=sd_piiID)
    if not document.read(self.client):
        print("Read document failed.")
        return
    print(document)
    print("pii_doc.title: ", document.title)
    document.write()
def __init__(self, *args):
    """Load the ScienceDirect article identified by a PII.

    :param args: positional arguments; ``args[0]`` is the ScienceDirect
        PII and ``args[1]`` is the client handed to ``FullDoc.read`` as
        ``els_client``.
    :raises HTTPError: when the article read returns a falsy result.
    """
    pii = args[0]
    print("PII : ", pii)
    self._sd_article = FullDoc(sd_pii=pii)
    print("init SD 1")
    loaded = self._sd_article.read(els_client=args[1])
    if loaded:
        return
    print("raise HTTPError")
    raise HTTPError
def find_abstract(doi):
    """Print the title and abstract of the article with the given DOI.

    A client is obtained from ``elsevier_auth()``. On a successful read the
    title and the ``dc:description`` field are printed and the document's
    ``write()`` method is called; otherwise a failure message is printed.

    :param doi: DOI of the article to look up.
    """
    client = elsevier_auth()
    document = FullDoc(doi=doi)
    if not document.read(client):
        print("Read document failed.")
        return
    print("doi_doc.title: ", document.title)
    abstract = document.data['coredata']['dc:description']
    print("doi_doc.abstract: ", abstract)
    document.write()
class TestFullDoc:
    """Test ScienceDirect article functionality"""

    ## Test data
    full_pii_uri = "https://api.elsevier.com/content/article/pii/S1674927814000082"
    sd_pii = 'S1674927814000082'
    full_doi_uri = "https://api.elsevier.com/content/article/doi/10.1016/S1525-1578(10)60571-5"
    doi = '10.1016/S1525-1578(10)60571-5'

    ## Test initialization
    def test_init_uri(self):
        """ Test case: uri is set correctly during initialization with uri"""
        myFullDoc = FullDoc(uri=self.full_pii_uri)
        assert myFullDoc.uri == self.full_pii_uri

    def test_init_sd_pii(self):
        """ Test case: uri is set correctly during initialization with ScienceDirect PII"""
        myFullDoc = FullDoc(sd_pii=self.sd_pii)
        assert myFullDoc.uri == self.full_pii_uri

    def test_init_doi(self):
        """ Test case: uri is set correctly during initialization with DOI"""
        myFullDoc = FullDoc(doi=self.doi)
        assert myFullDoc.uri == self.full_doi_uri

    ## Test reading/writing full article data
    # The tests below share one FullDoc instance; the data/title/write tests
    # presumably rely on test_read_good_bad_client having loaded it first, so
    # the definition order matters.
    bad_client = ElsClient("dummy")
    good_client = ElsClient(config['apikey'], inst_token=config['insttoken'])
    good_client.local_dir = str(test_path)

    myFullDoc = FullDoc(uri=full_pii_uri)

    def test_read_good_bad_client(self):
        """Test case: using a well-configured client leads to successful read
            and using a badly-configured client does not."""
        assert self.myFullDoc.read(self.bad_client) is False
        assert self.myFullDoc.read(self.good_client) is True

    def test_json_to_dict(self):
        """Test case: the JSON read by the full article object from the
            API is parsed into a Python dictionary"""
        assert isinstance(self.myFullDoc.data, dict)

    def test_title_getter(self):
        """Test case: the title attribute is returned as a non-empty string"""
        title = self.myFullDoc.title
        assert isinstance(title, str) and title != ''

    def test_write(self):
        """Test case: the full article object's data is written to a file with the ID in the filename"""
        self.myFullDoc.write()
        # Strip '-', '(' and ')' from the PII in a single pass before looking
        # for the written file.
        pii = self.myFullDoc.data['coredata']['pii']
        assert util.file_exist_with_id(pii.translate(str.maketrans('', '', '-()')))
def get_data_from_doi(self, doi, title):
    """Resolve a DOI to PubMed data plus basic publication metadata.

    Lookup order for the PubMed id:

    1. the first Scopus search result for ``doi``;
    2. the ScienceDirect full-text record for ``doi``;
    3. a PubMed title search, using the ScienceDirect title when that
       record could be read and ``title`` otherwise.

    :param doi: DOI of the article to look up.
    :param title: fallback title for the PubMed search.
    :return: tuple ``(data, affil, pub_name, pub_type)`` where ``data`` is
        the result of ``self.fetch_data_from_pubmed`` when a PubMed id was
        found and ``doi`` otherwise; the remaining items stay ``None`` when
        neither source provided them.
    """
    pubmed_id = None
    affil = None
    pub_name = None
    pub_type = None

    try:
        first_hit = ScopusSearch(doi, subscriber=False)._json[0]
        if 'pubmed-id' in first_hit:
            pubmed_id = first_hit["pubmed-id"]
        if 'affiliation' in first_hit:
            affil = first_hit['affiliation']
        pub_name = first_hit['prism:publicationName']
        pub_type = first_hit['subtypeDescription']
    except Exception:
        # Best effort: any Scopus failure falls through to ScienceDirect,
        # but KeyboardInterrupt/SystemExit are no longer swallowed.
        print("failed with scopus")

    if pubmed_id is None:
        doi_doc = FullDoc(doi=doi)
        if doi_doc.read(self.client):
            doi_doc.write()
            coredata = doi_doc.data['coredata']
            pub_name = coredata['prism:publicationName']
            if 'pubType' in coredata:
                pub_type = str(coredata['pubType']).strip()
        else:
            print(
                "Read document failed. no id for doi {}. trying with title"
                .format(doi))
            doi_doc = None

        if doi_doc is not None and 'pubmed-id' in doi_doc._data:
            pubmed_id = doi_doc._data['pubmed-id']
        else:
            print("trying with title")
            Entrez.email = '*****@*****.**'
            query = title if doi_doc is None else doi_doc.title
            handle = Entrez.esearch(db='pubmed', retmode='xml', term=query)
            try:
                results = Entrez.read(handle)
            finally:
                # Release the response even when parsing fails.
                handle.close()
            if int(results['Count']) > 0:
                pubmed_id = results['IdList']

    if pubmed_id is None:
        print("no pubmed id found for doi {}".format(doi))
        return doi, affil, pub_name, pub_type
    return self.fetch_data_from_pubmed(pubmed_id), affil, pub_name, pub_type
def get_doc(self, dtype, identity):
    """
    This method retrieves a 'Doc' object from the Elsevier API. The doc object contains
    metadata and full-text information about a publication associated with a given
    PII or DOI.

    Parameters:
    -----------
    dtype(str,required): The type of identification string being used to access the
        document: 'pii' or 'doi'. (Almost always PII in our case.)

    identity: The actual identification string (PII or DOI) that will be used to query.

    Returns:
    --------
    The FullDoc object, returned whether or not the read succeeded.

    Raises:
    -------
    ValueError: If dtype is neither 'pii' nor 'doi'.
    """
    if dtype == 'pii':
        doc = FullDoc(sd_pii=identity)
    elif dtype == 'doi':
        doc = FullDoc(doi=identity)
    else:
        # Without this branch an unknown dtype would leave `doc` unbound and
        # surface as an UnboundLocalError on the read below.
        raise ValueError(
            "dtype must be 'pii' or 'doi', got {!r}".format(dtype))

    if doc.read(ElsClient(self.API_list[0])):
        doc.write()
    else:
        print("Read document failed.")

    return doc
def get_authors_data_by_doi(self, doi):
    """Return the PubMed author list for the article with the given DOI.

    The ScienceDirect record for ``doi`` is read with ``self.client`` and
    its ``write()`` method is called. The PubMed id stored in the record is
    used when present, otherwise PubMed is searched by title. The PubMed
    record is then fetched and its author list extracted.

    :param doi: DOI of the article to look up.
    :return: the ``AuthorList`` of the first fetched PubMed article, or
        ``doi`` unchanged when the document cannot be read, no PubMed id is
        found, or the fetched record lacks the expected keys.
    """
    doi_doc = FullDoc(doi=doi)
    if not doi_doc.read(self.client):
        print("Read document failed.")
        return doi
    print("doi_doc.title: ", doi_doc.title)
    doi_doc.write()

    if 'pubmed-id' in doi_doc._data:
        pubmed_id = doi_doc._data['pubmed-id']
    else:
        print("no pubmed-id, trying with title")
        Entrez.email = '*****@*****.**'
        handle = Entrez.esearch(db='pubmed', retmode='xml', term=doi_doc.title)
        try:
            results = Entrez.read(handle)
        finally:
            handle.close()
        pubmed_id = results['IdList'] if int(results['Count']) > 0 else None

    if pubmed_id is None:
        print("no pubmed id")
        return doi

    Entrez.email = '*****@*****.**'
    handle = Entrez.efetch(db='pubmed', retmode='xml', id=pubmed_id)
    try:
        results = Entrez.read(handle)
    finally:
        # Release the response even when parsing fails.
        handle.close()
    print(results)

    articles = results['PubmedArticle']
    if (not articles
            or 'MedlineCitation' not in articles[0]
            or 'Article' not in articles[0]['MedlineCitation']):
        print("missing keys")
        return doi

    article = articles[0]['MedlineCitation']['Article']
    if 'AuthorList' not in article:
        print("no authors list {}".format(article))
        return doi

    # Only the author list is returned; the publication history is not
    # looked up, so a record without it no longer raises KeyError.
    return article['AuthorList']
def search(self,query="A Lightweight Autoencoder"):
    """Search ScienceDirect and collect title, year, link and abstract per hit.

    Runs ``query`` against the 'sciencedirect' index with ``get_all=False``,
    reads the full-text record of every result by PII and stores one entry
    per usable document in ``self.data``, keyed by a time-derived hex id.
    Results that cannot be read or lack an expected field are skipped with
    a message.

    :param query: search expression passed to ``ElsSearch``.
    """
    doc_srch = ElsSearch(query, 'sciencedirect')
    doc_srch.execute(self.client, get_all=False)
    for _, doc in doc_srch.results_df.iterrows():
        pii_doc = FullDoc(sd_pii=doc['pii'])
        if not pii_doc.read(self.client):
            print("Doc Skipped!!")
            continue
        try:
            # The first whitespace-separated token of the description is
            # dropped (presumably an "Abstract" heading - TODO confirm).
            description = pii_doc.data['coredata']['dc:description']
            abstract = " ".join(description.split()[1:])
            doc_id = hex(time.time().as_integer_ratio()[0])
            self.data[doc_id] = {
                "title": doc['dc:title'],
                "year": doc['load-date'].split('-')[0],
                "link": doc['link']['scidir'],
                "Abstract": abstract,
            }
        except Exception as exc:
            # Stay best-effort per document, but report why it was dropped
            # instead of hiding the failure.
            print("Doc Skipped!! ({!r})".format(exc))
fieldnames = ['doi', 'title', 'text']
writer = csv.DictWriter(file, fieldnames=fieldnames)
writer.writeheader()

# generate potential doi list: "<year>.<00-12>.<000-099>"
middle = [f"{i:02}" for i in range(13)]
last = [f"{i:03}" for i in range(100)]
doi_list = [year + "." + m + "." + l for m in middle for l in last]

# loop through doi list
for doi_str in doi_list:
    full_doi = '10.1016/j.jesp.' + doi_str
    doi_doc = FullDoc(doi=full_doi)
    # check if has content
    if doi_doc.read(client):
        try:
            doi = full_doi
            title = doi_doc.title
            # other keys seen in the data: 'scopus-eid', 'scopus-id',
            # 'pubmed-id', 'coredata', 'objects', 'link'
            text = doi_doc.data["originalText"]
            writer.writerow({'doi': doi, 'title': title, 'text': text})
        except Exception as exc:
            # Skip documents that cannot be exported, but say so; a bare
            # except would also swallow KeyboardInterrupt.
            print("Skipping {}: {!r}".format(full_doi, exc))
            continue
def test_init_doi(self):
    """Test case: initializing with a DOI produces the matching article URI"""
    document = FullDoc(doi=self.doi)
    actual_uri = document.uri
    assert actual_uri == self.full_doi_uri
def test_init_sd_pii(self):
    """Test case: initializing with a ScienceDirect PII produces the matching article URI"""
    document = FullDoc(sd_pii=self.sd_pii)
    actual_uri = document.uri
    assert actual_uri == self.full_pii_uri
def test_init_uri(self):
    """Test case: initializing with a URI keeps that URI on the document"""
    document = FullDoc(uri=self.full_pii_uri)
    actual_uri = document.uri
    assert actual_uri == self.full_pii_uri
link = papers[i]['link'][0]['@href'] idx = link.find('pii/') pii.append(link[idx + 4:]) i += 1 # remove duplicates for pii_this in pii: if pii_this in pii_total: pii.remove(pii_this) pii_total.extend(pii) # get url, title, and abstract of all 100 articles n = 0 j = 1 for p in pii: pii_doc = FullDoc(sd_pii=p) try: if pii_doc.read(client): # get title title = pii_doc.title title.strip() title.replace('\n', ' ') # get abstract text = pii_doc.data["coredata"]["dc:description"] if text is not None: text = text.strip() if text.startswith( ('ABSTRACT', 'Abstract', 'Summary')): text = text[8:] text = text.strip() # remove extra whitespace
## Load configuration
with open("config.json") as con_file:
    config = json.load(con_file)

## Initialize client
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']

## Scopus search restricted to one source title and publication year
# The query is kept in `query` rather than `str` so the builtin is not shadowed.
query = 'SRCTITLE(IEEE Transactions on Pattern Analysis and Machine Intelligence) AND PUBYEAR > 2018 '
myDocSearch = ElsSearch(query, 'scopus')
myDocSearch.execute(client, get_all=False)
# Bare expression: only displays anything in an interactive session.
myDocSearch.results

## ScienceDirect (full-text) document example using PII
pii_doc = FullDoc(sd_pii='0181551220301406')
if pii_doc.read(client):
    print("pii_doc.title: ", pii_doc.title)
    pii_doc.write()
else:
    print("Read document failed.")

## Affiliation search
query = 'affil(Public Health and Infection Research Group, Faculty of Health Sciences)'
myDocSearch = ElsSearch(query, 'affiliation')
myDocSearch.execute(client, get_all=False)
myDocSearch.results

## Source search
# NOTE(review): "Artifical" looks like a typo for "Artificial" - confirm
# before changing the query text.
query = 'SRCTITLE(Artifical)'
myDocSearch = ElsSearch(query, 'source')
myDocSearch.execute(client, get_all=False)
myDocSearch.results
config = json.load(config_file)
GET_ALL = config['get_all']  # False gets one chunk (25) True gets all or max (5000)
FULL_TEXT = config['full_text']  # Save fulltext
OPEN_ACCESS = config['open_access']  # Search only openaccess documents (so we can get the full text)

# Example query:
# "public policy AND (impact OR result OR evaluation OR evidence) AND (climate OR environment)"
query = config['query']
if OPEN_ACCESS:
    query = "openaccess(1) AND " + query

client = ElsClient(config['api_key'])

doc_srch = ElsSearch(query, 'sciencedirect')
doc_srch.execute(client, get_all=GET_ALL)

for doc in doc_srch.results:
    if 'dc:identifier' not in doc:
        # NOTE(review): presumably a placeholder/error entry (e.g. for an
        # empty result set) - confirm. Skipping it avoids a KeyError.
        print("Skipping result without dc:identifier:", doc)
        continue
    doi = doc['dc:identifier']
    print(doi)
    if FULL_TEXT:
        ## ScienceDirect (full-text) document example using DOI
        # NOTE(review): search results appear to report the identifier as
        # "DOI:<doi>" - confirm. The prefix is dropped when present so the
        # bare DOI is used for the lookup; other values pass through as-is.
        bare_doi = doi[4:] if doi.startswith("DOI:") else doi
        doi_doc = FullDoc(doi=bare_doi)
        if doi_doc.read(client):
            doi_doc.write()
        else:
            print("Read full-text failed for DOI", doi)

print("# Found", len(doc_srch.results), "results.")
print ("my_aff.name: ", my_aff.name) my_aff.write() else: print ("Read affiliation failed.") ## Scopus (Abtract) document example # Initialize document with ID as integer scp_doc = AbsDoc(scp_id = 84872135457) if scp_doc.read(client): print ("scp_doc.title: ", scp_doc.title) scp_doc.write() else: print ("Read document failed.") ## ScienceDirect (full-text) document example using PII pii_doc = FullDoc(sd_pii = 'S1674927814000082') if pii_doc.read(client): print ("pii_doc.title: ", pii_doc.title) pii_doc.write() else: print ("Read document failed.") ## ScienceDirect (full-text) document example using DOI doi_doc = FullDoc(doi = '10.1016/S1525-1578(10)60571-5') if doi_doc.read(client): print ("doi_doc.title: ", doi_doc.title) doi_doc.write() else: print ("Read document failed.")
def pii_fulltext(pii=None):
    """Return the abstract of a ScienceDirect article identified by PII.

    Despite the name, only the ``dc:description`` field of the article's
    core data is returned, not the full text.

    :param pii: ScienceDirect PII of the article.
    :return: the ``dc:description`` value, or ``None`` when the read through
        the module-level ``client`` fails.
    """
    pii_doc = FullDoc(sd_pii=pii)
    if not pii_doc.read(client):
        # Explicit failure result, so a failed read can never reach an
        # unassigned local.
        return None
    return pii_doc.data['coredata']['dc:description']
def doi_fulltext(doi=None):
    """Return the abstract of a ScienceDirect article identified by DOI.

    Despite the name, only the ``dc:description`` field of the article's
    core data is returned, not the full text.

    :param doi: DOI of the article.
    :return: the ``dc:description`` value, or ``None`` when the read through
        the module-level ``client`` fails.
    """
    doi_doc = FullDoc(doi=doi)
    if not doi_doc.read(client):
        # Explicit failure result, so a failed read can never reach an
        # unassigned local.
        return None
    return doi_doc.data['coredata']['dc:description']
@author: josephwy """ import os import re import pandas as pd import pickle import json import requests as r import sys sys.path.append('R:\\JoePriceResearch\\Python\\Anaconda3\\Lib\\site-packages') from elsapy.elsclient import ElsClient from elsapy.elsprofile import ElsAuthor, ElsAffil from elsapy.elsdoc import FullDoc, AbsDoc from elsapy.elssearch import ElsSearch with open("config.json") as con_file: config = json.load(con_file) ## Initialize client client = ElsClient(config['apikey']) client.inst_token = config['insttoken'] doi_doc = FullDoc(doi="10.1016/S1525-1578(10)60571-5") if doi_doc.read(client): print("doi_doc.title: ", doi_doc.title) doi_doc.write() print(doi_doc.data['originalText']) else: print("Read document failed.")