def get_data_from_doi(self, doi):
    """Look up PubMed data for the article identified by *doi*.

    Reads the Elsevier full-text record for the DOI with ``self.client``
    and uses its ``pubmed-id`` field. When the record carries no such
    field, PubMed is searched with the document title instead.

    :param doi: DOI of the article.
    :return: the result of ``self.fetch_data_from_pubmed`` when a PubMed id
        was found; otherwise the unchanged *doi* (read failure or no id).
    """
    doi_doc = FullDoc(doi=doi)
    if not doi_doc.read(self.client):
        print("Read document failed.")
        return doi
    doi_doc.write()

    pubmed_id = None
    if 'pubmed-id' in doi_doc._data:
        pubmed_id = doi_doc._data['pubmed-id']
    else:
        print("no pubmed-id, trying with title")
        # Fall back to a PubMed search on the title.
        Entrez.email = '*****@*****.**'
        handle = Entrez.esearch(db='pubmed', retmode='xml',
                                term=doi_doc.title)
        try:
            results = Entrez.read(handle)
        finally:
            # Release the response handle even when parsing fails.
            handle.close()
        if int(results['Count']) > 0:
            # NOTE: this is the complete IdList of the search, not one id.
            pubmed_id = results['IdList']

    if pubmed_id is None:
        print("no pubmed id")
        return doi
    return self.fetch_data_from_pubmed(pubmed_id)
def get_paper(doi: str):
    """Fetch the ScienceDirect full-text record for *doi*.

    The read goes through the module-level ``client``.

    :param doi: DOI of the article to retrieve.
    :return: the document's ``data`` when the read succeeds, else ``False``.
    """
    document = FullDoc(doi=doi)
    succeeded = document.read(client)
    return document.data if succeeded else False
def ElsevierScraper(client, target_DOI):
    """
    Uses the Elsevier API with a valid key and a DOI to download the plain text article, including the title, abstract,
    pub date, and the references as an unstructured string.

    The body text is whatever follows the last occurrence of the abstract in the original text, cut at the first
    "References" marker; the references are the segment right after that marker. Fields that are unavailable are
    reported as the string "NA".

    :param client: Elsevier client containing the API key
    :param target_DOI: DOI of the article being scraped
    :return: list containing the DOI, title, abstract, publication date, full text and unstructured string of references
    """

    print(target_DOI)
    doi_doc = FullDoc(doi=target_DOI)
    if not doi_doc.read(client):
        # Could save this to a separate file but it's easier to search for NA and make a new sublist after the fact
        print("Error: couldn't read {}.".format(target_DOI))
        return [target_DOI, "NA", "NA", "NA", "NA", "NA"]

    data = doi_doc.data
    core_data = data['coredata']
    abstract = core_data.get('dc:description')
    text = str(data['originalText'])
    if abstract:
        # Only strip the front matter when there is an abstract to anchor on:
        # splitting on None would split on whitespace and keep just the last
        # word, and splitting on "" raises ValueError.
        text = text.rpartition(abstract)[2]

    # One split serves both the body and the references.
    sections = text.split("References")
    references = sections[1] if len(sections) > 1 else "NA"
    text = sections[0]
    date = core_data.get('prism:coverDisplayDate', "NA")
    return [target_DOI, doi_doc.title, abstract or "NA", date, text, references]
 def readFullDocWithDOI(self, doiID='10.1016/S1525-1578(10)60571-5'):
     """Read a ScienceDirect full-text document by DOI using ``self.client``.

     On success the title is printed and ``write()`` is called on the
     document; on failure only a failure message is printed.

     :param doiID: DOI of the document to read.
     """
     document = FullDoc(doi=doiID)
     if not document.read(self.client):
         print("Read document failed.")
         return
     print("doi_doc.title: ", document.title)
     document.write()
 def readFullDocWithPII(self, sd_piiID='S1270963817323015'):
     """Read a ScienceDirect full-text document by PII using ``self.client``.

     On success the document and its title are printed and ``write()`` is
     called on the document; on failure only a failure message is printed.

     :param sd_piiID: ScienceDirect PII of the document to read.
     """
     document = FullDoc(sd_pii=sd_piiID)
     if not document.read(self.client):
         print("Read document failed.")
         return
     print(document)
     print("pii_doc.title: ", document.title)
     document.write()
Example #6
0
 def __init__(self, *args):
     """Load the ScienceDirect article for a PII into ``self._sd_article``.

     Positional arguments: ``args[0]`` is the PII and ``args[1]`` is the
     client handed to ``FullDoc.read`` as ``els_client``.

     Raises ``HTTPError`` when the read reports failure.
     """
     pii = args[0]
     print("PII : ", pii)
     self._sd_article = FullDoc(sd_pii=pii)
     print("init SD 1")
     loaded = self._sd_article.read(els_client=args[1])
     if loaded:
         return
     print("raise HTTPError")
     raise HTTPError
Example #7
0
def find_abstract(doi):
    """Print the title and abstract of the ScienceDirect article for *doi*.

    A client is obtained from ``elsevier_auth()``. After a successful read
    the title and the ``dc:description`` core-data field are printed and
    ``write()`` is called on the document; otherwise a failure message is
    printed.

    :param doi: DOI of the article.
    """
    client = elsevier_auth()
    document = FullDoc(doi=doi)
    if not document.read(client):
        print("Read document failed.")
        return
    print("doi_doc.title: ", document.title)
    abstract = document.data['coredata']['dc:description']
    print("doi_doc.abstract: ", abstract)
    document.write()
Example #8
0
class TestFullDoc:
    """Checks for the ScienceDirect full-text document class (FullDoc)."""

    # Fixtures: one article addressed by PII and one addressed by DOI, each
    # paired with the API URI expected for it.
    full_pii_uri = "https://api.elsevier.com/content/article/pii/S1674927814000082"
    sd_pii = 'S1674927814000082'
    full_doi_uri = "https://api.elsevier.com/content/article/doi/10.1016/S1525-1578(10)60571-5"
    doi = '10.1016/S1525-1578(10)60571-5'

    # --- Initialization -------------------------------------------------
    def test_init_uri(self):
        """A document created from a URI keeps that URI."""
        document = FullDoc(uri=self.full_pii_uri)
        assert document.uri == self.full_pii_uri

    def test_init_sd_pii(self):
        """A document created from a ScienceDirect PII gets the PII URI."""
        document = FullDoc(sd_pii=self.sd_pii)
        assert document.uri == self.full_pii_uri

    def test_init_doi(self):
        """A document created from a DOI gets the DOI URI."""
        document = FullDoc(doi=self.doi)
        assert document.uri == self.full_doi_uri

    # --- Reading / writing ----------------------------------------------
    # Clients and the document below are class-level and shared by the
    # remaining tests; the later tests use the data loaded by the read test.
    bad_client = ElsClient("dummy")
    good_client = ElsClient(config['apikey'], inst_token=config['insttoken'])
    good_client.local_dir = str(test_path)

    myFullDoc = FullDoc(uri=full_pii_uri)

    def test_read_good_bad_client(self):
        """Reading fails with the dummy client and succeeds with the
        configured one."""
        assert self.myFullDoc.read(self.bad_client) == False
        assert self.myFullDoc.read(self.good_client) == True

    def test_json_to_dict(self):
        """The data held by the document is a Python dictionary."""
        payload = self.myFullDoc.data
        assert type(payload) == dict

    def test_title_getter(self):
        """The title attribute is a non-empty string."""
        assert (type(self.myFullDoc.title) == str and self.myFullDoc.title != '')

    def test_write(self):
        """After write(), a file exists whose name contains the document's
        PII with '-', '(' and ')' removed."""
        self.myFullDoc.write()
        file_id = self.myFullDoc.data['coredata']['pii']
        for unwanted in ('-', '(', ')'):
            file_id = file_id.replace(unwanted, '')
        assert util.file_exist_with_id(file_id)
Example #9
0
    def get_data_from_doi(self, doi, title):
        """Resolve a DOI to PubMed data plus affiliation and publication info.

        Lookup order for the PubMed id:

        1. the first Scopus search record for the DOI;
        2. the ``pubmed-id`` field of the Elsevier full-text record;
        3. a PubMed search on the document title, or on *title* when the
           full-text record could not be read.

        :param doi: DOI of the article.
        :param title: title used for the PubMed search when the full-text
            record cannot be read.
        :return: tuple ``(data, affil, pub_name, pub_type)`` where ``data``
            is the result of ``self.fetch_data_from_pubmed`` when an id was
            found and the unchanged *doi* otherwise; the other three items
            are ``None`` when they could not be determined.
        """
        pubmed_id = None
        affil = None
        pub_name = None
        pub_type = None
        try:
            record = ScopusSearch(doi, subscriber=False)._json[0]
            if 'pubmed-id' in record:
                pubmed_id = record["pubmed-id"]
            if 'affiliation' in record:
                affil = record['affiliation']
            pub_name = record['prism:publicationName']
            pub_type = record['subtypeDescription']
        except Exception:
            # Best effort: any Scopus failure falls through to the other
            # sources. KeyboardInterrupt/SystemExit are no longer trapped.
            print("failed with scopus")

        if pubmed_id is None:
            doi_doc = FullDoc(doi=doi)
            if doi_doc.read(self.client):
                doi_doc.write()
                core_data = doi_doc.data['coredata']
                pub_name = core_data['prism:publicationName']
                if 'pubType' in core_data:
                    pub_type = str(core_data['pubType']).strip()
            else:
                print(
                    "Read document failed. no id for doi {}. trying with title"
                    .format(doi))
                doi_doc = None

            if doi_doc is None or 'pubmed-id' not in doi_doc._data:
                print("trying with title")
                Entrez.email = '*****@*****.**'
                # Prefer the title reported by Elsevier; fall back to the
                # caller-supplied one when the record could not be read.
                query = title if doi_doc is None else doi_doc.title
                handle = Entrez.esearch(db='pubmed', retmode='xml', term=query)
                try:
                    results = Entrez.read(handle)
                finally:
                    # Release the response handle even when parsing fails.
                    handle.close()
                if int(results['Count']) > 0:
                    # NOTE: the complete IdList of the search, not one id.
                    pubmed_id = results['IdList']
            else:
                pubmed_id = doi_doc._data['pubmed-id']

        if pubmed_id is None:
            print("no pubmed id found for doi {}".format(doi))
            return doi, affil, pub_name, pub_type
        return self.fetch_data_from_pubmed(pubmed_id), affil, pub_name, pub_type
    def get_doc(self, dtype, identity):
        """
        This method retrieves a 'Doc' object from the Elsevier API. The doc object contains metadata and full-text information
        about a publication associated with a given PII or DOI.

        Parameters:
        -----------
        dtype(str,required): The type of identification string being used to access the document: 'pii' or 'doi'.
        (Almost always PII in our case.)

        identity: The actual identification string (PII or DOI) that will be used to query.

        Returns:
        -----------
        The FullDoc object, whether or not the read succeeded (a failed read only prints a message).

        Raises:
        -----------
        ValueError: if dtype is neither 'pii' nor 'doi'.
        """
        if dtype == 'pii':
            doc = FullDoc(sd_pii=identity)
        elif dtype == 'doi':
            doc = FullDoc(doi=identity)
        else:
            # Previously this case fell through and failed later with an
            # UnboundLocalError on `doc`.
            raise ValueError(
                f"unsupported dtype {dtype!r}; expected 'pii' or 'doi'")

        # The first entry of self.API_list is used as the API key.
        if doc.read(ElsClient(self.API_list[0])):
            doc.write()
        else:
            print ("Read document failed.")

        return doc
    def get_authors_data_by_doi(self, doi):
        """Return the PubMed author list for the article identified by *doi*.

        The PubMed id comes from the ``pubmed-id`` field of the Elsevier
        full-text record, or from a PubMed title search when that field is
        absent. The matching PubMed record is then fetched and its
        ``AuthorList`` returned.

        :param doi: DOI of the article.
        :return: the ``AuthorList`` of the first PubMed article on success;
            otherwise the unchanged *doi* (read failure, no PubMed id, or a
            PubMed record lacking the expected keys).
        """
        doi_doc = FullDoc(doi=doi)
        if not doi_doc.read(self.client):
            print("Read document failed.")
            return doi
        print("doi_doc.title: ", doi_doc.title)
        doi_doc.write()

        pubmed_id = None
        if 'pubmed-id' in doi_doc._data:
            pubmed_id = doi_doc._data['pubmed-id']
        else:
            print("no pubmed-id, trying with title")
            Entrez.email = '*****@*****.**'
            handle = Entrez.esearch(db='pubmed', retmode='xml',
                                    term=doi_doc.title)
            try:
                results = Entrez.read(handle)
            finally:
                # Release the response handle even when parsing fails.
                handle.close()
            if int(results['Count']) > 0:
                # NOTE: the complete IdList of the search, not one id.
                pubmed_id = results['IdList']

        if pubmed_id is None:
            print("no pubmed id")
            return doi

        Entrez.email = '*****@*****.**'
        handle = Entrez.efetch(db='pubmed', retmode='xml', id=pubmed_id)
        try:
            results = Entrez.read(handle)
        finally:
            handle.close()
        print(results)

        articles = results['PubmedArticle']
        if (len(articles) == 0
                or 'MedlineCitation' not in articles[0]
                or 'Article' not in articles[0]['MedlineCitation']):
            print("missing keys")
            return doi

        article = articles[0]['MedlineCitation']['Article']
        if 'AuthorList' not in article:
            print("no authors list {}".format(article))
            return doi
        # The former lookup of PubmedData/History was dropped: its value was
        # never used and a record without it raised KeyError.
        return article['AuthorList']
 def search(self,query="A Lightweight Autoencoder"):
     """Search ScienceDirect for *query* and collect the hits in ``self.data``.

     One chunk of results is requested (``get_all=False``). For every hit
     whose full-text record can be read, an entry is stored in
     ``self.data`` under a generated id with the keys ``title``, ``year``,
     ``link`` and ``Abstract``. Hits that cannot be read, or that lack one
     of the expected fields, are skipped with a message.

     :param query: search expression passed to ``ElsSearch``.
     """
     doc_srch = ElsSearch(query, 'sciencedirect')
     doc_srch.execute(self.client, get_all=False)
     for _, doc in doc_srch.results_df.iterrows():
         pii_doc = FullDoc(sd_pii=doc['pii'])
         if not pii_doc.read(self.client):
             print("Doc Skipped!!")
             continue
         try:
             description = pii_doc.data['coredata']['dc:description']
             # Drop the first whitespace-separated token; presumably a
             # leading "Abstract" heading (TODO confirm).
             abstract = " ".join(description.split()[1:])
             # Id derived from the current timestamp.
             doc_id = hex(time.time().as_integer_ratio()[0])
             self.data[doc_id] = {
                 "title": doc['dc:title'],
                 "year": doc['load-date'].split('-')[0],
                 "link": doc['link']['scidir'],
                 "Abstract": abstract,
             }
         except Exception as err:
             # Still best effort, but report why the hit was dropped and let
             # KeyboardInterrupt/SystemExit propagate.
             print("Doc Skipped!! ({!r})".format(err))
Example #13
0
        fieldnames = ['doi', 'title', 'text']
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        writer.writeheader()

        doi_list = []
        middle = [f"{i:02}" for i in range(13)]
        last = [f"{i:03}" for i in range(100)]
        # generate potential doi list
        for m in middle:
            for l in last:
                doi_list.append(year + "." + m + "." + l)

        # loop through doi list
        for doi_str in doi_list:
            full_doi = '10.1016/j.jesp.' + doi_str
            doi_doc = FullDoc(doi=full_doi)
            # check if has content
            if doi_doc.read(client):
                try:
                    doi = full_doi
                    title = doi_doc.title
                    text = doi_doc.data[
                        "originalText"]  # ['scopus-eid', 'originalText', 'scopus-id', 'pubmed-id', 'coredata', 'objects', 'link']
                    writer.writerow({
                        'doi': doi,
                        'title': title,
                        'text': text
                    })  # , 'text': text
                except:
                    continue
Example #14
0
 def test_init_doi(self):
     """A document created from a DOI exposes the expected DOI URI."""
     document = FullDoc(doi=self.doi)
     actual_uri = document.uri
     assert actual_uri == self.full_doi_uri
Example #15
0
 def test_init_sd_pii(self):
     """A document created from a ScienceDirect PII exposes the expected PII URI."""
     document = FullDoc(sd_pii=self.sd_pii)
     actual_uri = document.uri
     assert actual_uri == self.full_pii_uri
Example #16
0
 def test_init_uri(self):
     """A document created from a URI keeps that URI unchanged."""
     document = FullDoc(uri=self.full_pii_uri)
     actual_uri = document.uri
     assert actual_uri == self.full_pii_uri
            link = papers[i]['link'][0]['@href']
            idx = link.find('pii/')
            pii.append(link[idx + 4:])
            i += 1

        #   remove duplicates
        for pii_this in pii:
            if pii_this in pii_total:
                pii.remove(pii_this)
        pii_total.extend(pii)

        #   get url, title, and abstract of all 100 articles
        n = 0
        j = 1
        for p in pii:
            pii_doc = FullDoc(sd_pii=p)
            try:
                if pii_doc.read(client):
                    #   get title
                    title = pii_doc.title
                    title.strip()
                    title.replace('\n', ' ')
                    #   get abstract
                    text = pii_doc.data["coredata"]["dc:description"]
                    if text is not None:
                        text = text.strip()
                        if text.startswith(
                            ('ABSTRACT', 'Abstract', 'Summary')):
                            text = text[8:]
                            text = text.strip()
                        # remove extra whitespace
Example #18
0
## Load configuration
# The context manager closes the file even if the JSON is malformed.
with open("config.json") as con_file:
    config = json.load(con_file)

## Initialize client
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']

## Scopus search by source title and year
# The query text is bound to `query` rather than `str`, so the builtin
# `str` stays usable for the rest of the module.
query = 'SRCTITLE(IEEE Transactions on Pattern Analysis and Machine Intelligence) AND PUBYEAR > 2018 '
myDocSearch = ElsSearch(query, 'scopus')
myDocSearch.execute(client, get_all=False)

# Bare expression: has no effect when run as a script (it only displays the
# results in an interactive session).
myDocSearch.results

## ScienceDirect (full-text) document by PII
pii_doc = FullDoc(sd_pii='0181551220301406')
if pii_doc.read(client):
    print("pii_doc.title: ", pii_doc.title)
    pii_doc.write()
else:
    print("Read document failed.")

## Affiliation search
query = 'affil(Public Health and Infection Research Group, Faculty of Health Sciences)'
myDocSearch = ElsSearch(query, 'affiliation')
myDocSearch.execute(client, get_all=False)
myDocSearch.results

## Source search
query = 'SRCTITLE(Artifical)'
myDocSearch = ElsSearch(query, 'source')
myDocSearch.execute(client, get_all=False)
myDocSearch.results
    config = json.load(config_file)

# Settings taken from the loaded configuration.
# get_all: False gets one chunk (25), True gets all or max (5000)
GET_ALL = config['get_all']
# full_text: save the full text of every hit
FULL_TEXT = config['full_text']
# open_access: search only open-access documents (so we can get the full text)
OPEN_ACCESS = config['open_access']

# Example query:
# "public policy AND (impact OR result OR evaluation OR evidence) AND (climate OR environment)"
query = config['query']

if OPEN_ACCESS:
    query = "openaccess(1) AND " + query

client = ElsClient(config['api_key'])

doc_srch = ElsSearch(query, 'sciencedirect')
doc_srch.execute(client, get_all=GET_ALL)

for doc in doc_srch.results:
    doi = doc['dc:identifier']
    print(doi)
    if not FULL_TEXT:
        continue
    # ScienceDirect (full-text) document lookup using the identifier above
    doi_doc = FullDoc(doi=doi)
    if not doi_doc.read(client):
        print("Read full-text failed for DOI", doi)
        continue
    doi_doc.write()

print("# Found", len(doc_srch.results), "results.")
Example #20
0
    print ("my_aff.name: ", my_aff.name)
    my_aff.write()
else:
    print ("Read affiliation failed.")

## Scopus (Abstract) document example
# The Scopus ID is passed as an integer.
scp_doc = AbsDoc(scp_id=84872135457)
if not scp_doc.read(client):
    print ("Read document failed.")
else:
    print ("scp_doc.title: ", scp_doc.title)
    scp_doc.write()

## ScienceDirect (full-text) document example using PII
pii_doc = FullDoc(sd_pii='S1674927814000082')
if not pii_doc.read(client):
    print ("Read document failed.")
else:
    print ("pii_doc.title: ", pii_doc.title)
    pii_doc.write()

## ScienceDirect (full-text) document example using DOI
doi_doc = FullDoc(doi='10.1016/S1525-1578(10)60571-5')
if not doi_doc.read(client):
    print ("Read document failed.")
else:
    print ("doi_doc.title: ", doi_doc.title)
    doi_doc.write()

Example #21
0
def pii_fulltext(pii=None):
    """Return the abstract of the ScienceDirect article with the given PII.

    The read goes through the module-level ``client``.

    :param pii: ScienceDirect PII of the article.
    :return: the ``dc:description`` core-data field, or ``None`` when the
        document could not be read.
    """
    document = FullDoc(sd_pii=pii)
    if not document.read(client):
        return None
    return document.data['coredata']['dc:description']
Example #22
0
def doi_fulltext(doi=None):
    """Return the abstract of the ScienceDirect article with the given DOI.

    The read goes through the module-level ``client``.

    :param doi: DOI of the article.
    :return: the ``dc:description`` core-data field, or ``None`` when the
        document could not be read.
    """
    document = FullDoc(doi=doi)
    if not document.read(client):
        return None
    return document.data['coredata']['dc:description']
Example #23
0
@author: josephwy
"""
import os
import re
import pandas as pd
import pickle
import json
import requests as r
import sys
sys.path.append('R:\\JoePriceResearch\\Python\\Anaconda3\\Lib\\site-packages')

from elsapy.elsclient import ElsClient
from elsapy.elsprofile import ElsAuthor, ElsAffil
from elsapy.elsdoc import FullDoc, AbsDoc
from elsapy.elssearch import ElsSearch

# Read the API credentials from the local configuration file.
with open("config.json") as con_file:
    config = json.load(con_file)

# Build the client from the API key, then attach the institution token.
client = ElsClient(config['apikey'])
client.inst_token = config['insttoken']

# Fetch one article by DOI; print its title and original text on success.
doi_doc = FullDoc(doi="10.1016/S1525-1578(10)60571-5")
if not doi_doc.read(client):
    print("Read document failed.")
else:
    print("doi_doc.title: ", doi_doc.title)
    doi_doc.write()
    print(doi_doc.data['originalText'])