def get_data_from_doi(self, doi):
        """Resolve a DOI to PubMed data via the full-text document and Entrez.

        The full-text document for ``doi`` is read with ``self.client`` and
        written out with ``doi_doc.write()``.  Its ``pubmed-id`` entry is used
        when present; otherwise PubMed is searched with the document title
        and the returned ``IdList`` is used instead.

        :param doi: DOI of the document to look up
        :return: the result of ``self.fetch_data_from_pubmed`` for the PubMed
            id(s) found, or ``doi`` unchanged when the document cannot be
            read or no PubMed id can be determined
        """
        doi_doc = FullDoc(doi=doi)
        if not doi_doc.read(self.client):
            print("Read document failed.")
            return doi
        doi_doc.write()

        # Use the public ``data`` accessor rather than the private ``_data``.
        document_data = doi_doc.data
        pubmed_id = None
        if 'pubmed-id' in document_data:
            pubmed_id = document_data['pubmed-id']
        else:
            print("no pubmed-id, trying with title")
            Entrez.email = '*****@*****.**'
            handle = Entrez.esearch(db='pubmed', retmode='xml',
                                    term=doi_doc.title)
            try:
                results = Entrez.read(handle)
            finally:
                # Release the HTTP response even if parsing fails.
                handle.close()
            if int(results['Count']) > 0:
                pubmed_id = results['IdList']

        if pubmed_id is None:
            print("no pubmed id")
            return doi
        return self.fetch_data_from_pubmed(pubmed_id)
def get_paper(doi: str):
    """Return the ScienceDirect full-text payload for a DOI.

    The document is read with the module-level ``client``.

    :param doi: DOI of the document to fetch
    :return: the document's ``data`` on success, ``False`` when the read fails
    """
    document = FullDoc(doi=doi)
    if not document.read(client):
        return False
    return document.data
def ElsevierScraper(client, target_DOI):
    """
    Uses the Elsevier API with a valid key and a DOI to download the plain text article, including the title, abstract,
    pub date, and the references as an unstructured string.

    The body text is whatever follows the last occurrence of the abstract in ``originalText``; when the abstract is
    missing or empty the whole ``originalText`` is used.  The text is then cut at the first "References": what
    precedes it is the full text, and the segment up to the next "References" (if any) is the reference string.

    :param client: Elsevier client containing the API key
    :param target_DOI: DOI of the article being scraped
    :return: list containing the DOI, title, abstract, publication date, full text and unstructured string of
        references; every field after the DOI is "NA" when the document cannot be read, and the references are "NA"
        when the text contains no "References"
    """

    print(target_DOI)
    doi_doc = FullDoc(doi=target_DOI)
    if not doi_doc.read(client):
        # Could save this to a separate file but it's easier to search for NA and make a new sublist after the fact
        print("Error: couldn't read {}.".format(target_DOI))
        return [target_DOI, "NA", "NA", "NA", "NA", "NA"]

    data = doi_doc.data
    coreData = data['coredata']
    abstract = coreData.get('dc:description')
    text = str(data['originalText'])
    # str.split(None) splits on whitespace and str.split("") raises, so only
    # strip the leading matter when there is a real abstract to anchor on.
    if abstract:
        text = text.split(abstract)[-1]
    sections = text.split("References")
    references = sections[1] if len(sections) > 1 else "NA"
    date = coreData['prism:coverDisplayDate']
    return [target_DOI, doi_doc.title, abstract, date, sections[0], references]
 def readFullDocWithDOI(self, doiID='10.1016/S1525-1578(10)60571-5'):
     """Read a ScienceDirect full-text document by DOI, print its title and write it.

     :param doiID: DOI of the document to read
     :return: None; a failure message is printed when the read fails
     """
     document = FullDoc(doi=doiID)
     if not document.read(self.client):
         print("Read document failed.")
         return
     print("doi_doc.title: ", document.title)
     document.write()
 def readFullDocWithPII(self, sd_piiID='S1270963817323015'):
     """Read a ScienceDirect full-text document by PII, print it and its title, and write it.

     :param sd_piiID: PII of the document to read
     :return: None; a failure message is printed when the read fails
     """
     document = FullDoc(sd_pii=sd_piiID)
     if not document.read(self.client):
         print("Read document failed.")
         return
     print(document)
     print("pii_doc.title: ", document.title)
     document.write()
Example #6
0
def find_abstract(doi):
    """Print the title and abstract of the document with the given DOI, then write it.

    A client is obtained from ``elsevier_auth()``; the abstract is the
    ``dc:description`` entry of the document's coredata.

    :param doi: DOI of the document to read
    :return: None; a failure message is printed when the read fails
    """
    session = elsevier_auth()
    document = FullDoc(doi=doi)
    if not document.read(session):
        print("Read document failed.")
        return
    print("doi_doc.title: ", document.title)
    print("doi_doc.abstract: ", document.data['coredata']['dc:description'])
    document.write()
    def get_authors_data_by_doi(self, doi):
        """Return the PubMed author list for the document with the given DOI.

        The full-text document is read with ``self.client`` and written out.
        Its ``pubmed-id`` entry is used when present; otherwise PubMed is
        searched with the document title and the returned ``IdList`` is used.
        The matching record(s) are then fetched from PubMed.

        :param doi: DOI of the document to look up
        :return: the ``AuthorList`` of the first fetched ``PubmedArticle``,
            or ``doi`` unchanged when the document cannot be read, no PubMed
            id is found, or the fetched record lacks the expected keys
        """
        doi_doc = FullDoc(doi=doi)
        if not doi_doc.read(self.client):
            print("Read document failed.")
            return doi
        print("doi_doc.title: ", doi_doc.title)
        doi_doc.write()

        Entrez.email = '*****@*****.**'
        # Use the public ``data`` accessor rather than the private ``_data``.
        document_data = doi_doc.data
        pubmed_id = None
        if 'pubmed-id' in document_data:
            pubmed_id = document_data['pubmed-id']
        else:
            print("no pubmed-id, trying with title")
            handle = Entrez.esearch(db='pubmed', retmode='xml',
                                    term=doi_doc.title)
            try:
                results = Entrez.read(handle)
            finally:
                # Release the HTTP response even if parsing fails.
                handle.close()
            if int(results['Count']) > 0:
                pubmed_id = results['IdList']

        if pubmed_id is None:
            print("no pubmed id")
            return doi

        handle = Entrez.efetch(db='pubmed', retmode='xml', id=pubmed_id)
        try:
            results = Entrez.read(handle)
        finally:
            handle.close()
        print(results)

        articles = results['PubmedArticle']
        if (not articles
                or 'MedlineCitation' not in articles[0]
                or 'Article' not in articles[0]['MedlineCitation']):
            print("missing keys")
            return doi

        article = articles[0]['MedlineCitation']['Article']
        if 'AuthorList' not in article:
            print("no authors list {}".format(article))
            return doi
        return article['AuthorList']
 def search(self, query="A Lightweight Autoencoder"):
     """Search ScienceDirect and store title, year, link and abstract per hit.

     Runs ``query`` against the 'sciencedirect' index (first chunk only,
     ``get_all=False``), reads the full-text document of every hit by PII
     and adds one entry to ``self.data`` for each document whose metadata
     could be extracted.  The stored abstract is ``dc:description`` with
     its first whitespace-separated token dropped.

     :param query: search string passed to ``ElsSearch``
     :return: None; results are accumulated in ``self.data``
     """
     doc_srch = ElsSearch(query, 'sciencedirect')
     doc_srch.execute(self.client, get_all=False)
     for _, doc in doc_srch.results_df.iterrows():
         pii_doc = FullDoc(sd_pii=doc['pii'])
         if not pii_doc.read(self.client):
             print("Doc Skipped!!")
             continue
         try:
             description = pii_doc.data['coredata']['dc:description']
             abstract = " ".join(description.split()[1:])
             # NOTE(review): ids come from the clock, so two documents
             # handled within one timer tick would share an id - confirm
             # whether that matters for callers.
             doc_id = hex(time.time().as_integer_ratio()[0])
             self.data[doc_id] = {
                 "title": doc['dc:title'],
                 "year": doc['load-date'].split('-')[0],
                 "link": doc['link']['scidir'],
                 "Abstract": abstract,
             }
         except Exception as err:
             # Stay best-effort per document, but report the reason and let
             # KeyboardInterrupt / SystemExit propagate (a bare ``except``
             # used to swallow those too).
             print("Doc Skipped!! ({!r})".format(err))
    def get_doc(self, dtype, identity):
        """
        This method retrieves a 'Doc' object from the Elsevier API. The doc object contains metadata and full-text information
        about a publication associated with a given PII or DOI.

        The document is read with a client built from ``self.API_list[0]`` and written out on success. The document
        object is returned even when the read fails (a failure message is printed in that case).

        Parameters:
        -----------
        dtype(str,required): The type of identification string being used to access the document; either 'pii' or 'doi'.
        (Almost always PII in our case.)

        identity: The actual identification string/ PII that will be used to query.

        Raises:
        -------
        ValueError: if dtype is neither 'pii' nor 'doi'.
        """
        if dtype == 'pii':
            doc = FullDoc(sd_pii=identity)
        elif dtype == 'doi':
            doc = FullDoc(doi=identity)
        else:
            # Previously fell through and failed later with an opaque
            # UnboundLocalError on ``doc``.
            raise ValueError(
                "Unsupported dtype {!r}; expected 'pii' or 'doi'.".format(dtype))

        if doc.read(ElsClient(self.API_list[0])):
            doc.write()
        else:
            print("Read document failed.")

        return doc
Example #10
0
class ScienceDirectArticle(ASSArticle):
    """Article accessors backed by a ScienceDirect ``FullDoc``.

    The document is loaded once in ``__init__`` and kept in
    ``self._sd_article``; every accessor reads from its ``data`` / ``title``.
    """

    def __init__(self, *args):
        """Load the full-text document for a PII.

        :param args: ``args[0]`` is the ScienceDirect PII, ``args[1]`` the
            client handed to ``FullDoc.read`` as ``els_client``
        :raises HTTPError: when the document cannot be read
        """
        print("PII : ", args[0])
        self._sd_article = FullDoc(sd_pii=args[0])
        print("init SD 1")
        if not self._sd_article.read(els_client=args[1]):
            print("raise HTTPError")
            raise HTTPError

    def doi(self):
        """Gets the document's DOI, passed through ``ass_scrap_util.doi_converter``.

        When a ``KeyError`` occurs (e.g. no ``dc:identifier`` in coredata),
        ``["No DOI"]`` is converted instead.
        """
        try:
            doi = self._sd_article.data["coredata"]["dc:identifier"]
            return ass_scrap_util.doi_converter(doi)
        except KeyError:
            log.warning("No DOI")
            return ass_scrap_util.doi_converter(["No DOI"])

    def issn(self):
        """Not implemented; returns ``None``."""
        pass

    def title(self):
        """Gets the document's title with every "/" replaced by a space."""
        sd_title = re.sub("/", " ", self._sd_article.title)
        return sd_title

    def abstract(self):
        """Gets the document's abstract (coredata ``dc:description``)."""
        return self._sd_article.data["coredata"]["dc:description"]

    def is_undesired(self):
        """Tells if this article is undesired or not.

        An article is undesired when its title is exactly "Index", contains
        one of the front/back-matter markers below, or when its ``pubType``
        contains "Letter to the Editor" or "Book review".

        :return: ``True`` when a rule matches, ``False`` otherwise (including
            when coredata has no ``pubType``)
        """
        title_revue = self.title()
        if title_revue == "Index":
            log.info("Index")
            return True

        # (substring looked for in the title, label logged on a match)
        title_markers = (
            ("Editorial", "Editorial"),
            ("Title Page", "Title page"),
            ("Subject Index", "Subject Index"),
            ("Preface", "Preface"),
            ("Author index", "Author index"),
        )
        for marker, label in title_markers:
            if marker in title_revue:
                log.info(label)
                return True

        # Title rules run first so that a missing pubType cannot mask them.
        try:
            pub_type = self._sd_article.data["coredata"]["pubType"]
        except KeyError:
            return False
        if "Letter to the Editor" in pub_type or "Book review" in pub_type:
            log.info(str(pub_type))
            return True
        return False

    def author_checking(self):
        """Tell whether coredata carries a creator name as a string.

        ``dc:creator`` is accepted either as a non-empty list whose first
        item is a mapping with a string under ``"$"``, or as a single such
        mapping.

        :return: ``True`` when a name string is found, ``False`` otherwise
        """
        try:
            creator = self._sd_article.data["coredata"]["dc:creator"]
        except KeyError:
            log.warning("No Author")
            return False

        if isinstance(creator, list):
            first = creator[0] if creator else None
            if isinstance(first, dict) and isinstance(first.get("$"), str):
                log.debug("find Author 1")
                return True
        elif isinstance(creator, dict) and isinstance(creator.get("$"), str):
            log.debug("find Author 2")
            return True

        log.warning("No Author")
        return False

    def author_1(self):
        """Return the normalised first author used to locate the body text.

        The first creator's name loses its commas and periods, is reduced to
        its leading ``word + space + one character`` prefix, upper-cased and
        stripped of non-ASCII characters.

        :return: the normalised author string, or ``False`` when no usable
            first author is available
        """
        if not self.author_checking():
            log.warning("Author_checking false")
            return False

        try:
            author_brut = self._sd_article.data["coredata"]["dc:creator"][
                0]["$"]
        except KeyError:
            # dc:creator is a single mapping rather than a list.
            log.warning("Author Error => KeyError")
            return False

        log.debug("author_1: 2 %s", author_brut)
        author = re.sub(r'(,|\.)', '', author_brut)
        log.debug("author_1: 3 %s", author)
        author_sub = re.sub(r'(^\w+\b \w)', "", author)
        log.debug("author_1: 4 %s", author_sub)
        # Remove the remainder literally; it used to be fed to re.sub as a
        # pattern, which misbehaves on regex metacharacters in names.
        author_final = author.replace(author_sub, "")
        log.debug("author_1: 5 %s", author_final)
        AUTHOR = author_final.upper()
        log.debug("author_1: 6 %s", AUTHOR)
        AUTHOR = unicodedata.normalize('NFD', AUTHOR).encode(
            'ASCII', 'ignore')
        log.debug("author_1: 7 %s", AUTHOR)
        # str() of the bytes yields "b'...'"; the pattern strips that
        # wrapper (every "b", "|." and "'").
        AUTHOR = re.sub(r'(b|\|\.|\')', '', str(AUTHOR))
        log.debug("author_1: 8")
        return AUTHOR

    def concat_title(self):
        """Return the title upper-cased with non-word characters removed.

        Every occurrence of "AND", "OF", "THE" and "TO" is then deleted from
        the concatenated string.
        """
        concat_title = self.title()
        concat_title = re.sub(r'\W', '', concat_title)
        CONCAT_TITLE = concat_title.upper()
        log.debug("concat_title %s", CONCAT_TITLE)
        TITLE = re.sub(r'(AND|OF|THE|TO)', "", CONCAT_TITLE)
        log.debug(TITLE)
        return TITLE

    def text(self):
        """Gets the document's text.

        Works on ``originalText`` with " Nomenclature" removed. Everything up
        to the normalised first author is dropped, then the part before the
        first "1.1" / "2" marker is used as a pattern to cut the front matter.
        When that part contains "serial JL" the raw text is returned through
        ``ass_scrap_util.text_cleaner`` instead.
        """
        log.debug("text : 1")
        txt = self._sd_article.data["originalText"]
        txt = re.sub(r' Nomenclature', "", txt)
        log.debug("text : 2")
        # author_1() may return False, which becomes the literal "False" here.
        auteur = str(self.author_1())

        log.debug("text : 3")
        txt_1 = ".*" + auteur
        log.debug("text : 4" + str(txt_1))

        # NOTE(review): the author string is interpolated into the pattern
        # unescaped.
        text_1 = re.sub(r'%s' % txt_1, "", txt)
        log.debug("text : 5")

        # Second step: keep what precedes the first section marker.
        text_sub = re.sub(r'(1\.1|2)\W.*', '', text_1)

        if "serial JL" in text_sub:
            log.warning("Syntax author => text_cleaner")
            return ass_scrap_util.text_cleaner(txt)

        else:
            # NOTE(review): text_sub is article text used as a regex without
            # escaping - confirm this is intended.
            text_alone = re.sub(r'.*%s' % text_sub, "", text_1)
            log.debug("text : 6")
            text_alone = re.sub(r'[^a-zA-Z0-9_ ]', "", text_alone)
            log.debug("text : 6,5")
            text_alone = ass_scrap_util.text_cleaner(text_alone)
            text_alone = re.sub(r'( References).*', "", text_alone)
            log.debug("text : 7")
            return text_alone

    def keywords(self):
        """Gets the document's Keywords.

        :return: the ``"$"`` value of every ``dcterms:subject`` item, or
            ``["No Keyword"]`` when a ``KeyError`` occurs
        """
        try:
            kw = self._sd_article.data["coredata"]["dcterms:subject"]
            return [item['$'] for item in kw]
        except KeyError:
            return ["No Keyword"]
    config = json.load(config_file)

# Search options taken from the loaded configuration.
GET_ALL = config['get_all']  # False gets one chunk (25) True gets all or max (5000)
FULL_TEXT = config['full_text']  # Save fulltext
OPEN_ACCESS = config['open_access']  # Search only openaccess documents (so we can get the full text)

# Example query:
# "public policy AND (impact OR result OR evaluation OR evidence) AND (climate OR environment)"
query = config['query']
if OPEN_ACCESS:
    query = "openaccess(1) AND " + query

client = ElsClient(config['api_key'])

doc_srch = ElsSearch(query, 'sciencedirect')
doc_srch.execute(client, get_all=GET_ALL)

for doc in doc_srch.results:
    doi = doc['dc:identifier']
    print(doi)
    if not FULL_TEXT:
        continue
    # Fetch and save the ScienceDirect full text for this hit.
    doi_doc = FullDoc(doi=doi)
    if not doi_doc.read(client):
        print("Read full-text failed for DOI", doi)
        continue
    doi_doc.write()

print("# Found", len(doc_srch.results), "results.")
Example #12
0
    my_aff.write()
else:
    print ("Read affiliation failed.")

## Scopus (Abstract) document example
# Initialize document with ID as integer
scp_doc = AbsDoc(scp_id=84872135457)
if not scp_doc.read(client):
    print("Read document failed.")
else:
    print("scp_doc.title: ", scp_doc.title)
    scp_doc.write()

## ScienceDirect (full-text) document example using PII
pii_doc = FullDoc(sd_pii='S1674927814000082')
if not pii_doc.read(client):
    print("Read document failed.")
else:
    print("pii_doc.title: ", pii_doc.title)
    pii_doc.write()

## ScienceDirect (full-text) document example using DOI
doi_doc = FullDoc(doi='10.1016/S1525-1578(10)60571-5')
if not doi_doc.read(client):
    print("Read document failed.")
else:
    print("doi_doc.title: ", doi_doc.title)
    doi_doc.write()


## Load list of documents from the API into affiliation and author objects.
Example #13
0
def pii_fulltext(pii=None):
    """Return the abstract of the ScienceDirect document with the given PII.

    The document is read with the module-level ``client``; the abstract is
    the ``dc:description`` entry of its coredata.

    :param pii: PII of the document to read
    :return: the abstract, or ``None`` when the read fails
    """
    document = FullDoc(sd_pii=pii)
    if not document.read(client):
        return None
    return document.data['coredata']['dc:description']
Example #14
0
def doi_fulltext(doi=None):
    """Return the abstract of the ScienceDirect document with the given DOI.

    The document is read with the module-level ``client``; the abstract is
    the ``dc:description`` entry of its coredata.

    :param doi: DOI of the document to read
    :return: the abstract, or ``None`` when the read fails
    """
    document = FullDoc(doi=doi)
    if not document.read(client):
        return None
    return document.data['coredata']['dc:description']
            pii.append(link[idx + 4:])
            i += 1

        #   remove duplicates
        for pii_this in pii:
            if pii_this in pii_total:
                pii.remove(pii_this)
        pii_total.extend(pii)

        #   get url, title, and abstract of all 100 articles
        n = 0
        j = 1
        for p in pii:
            pii_doc = FullDoc(sd_pii=p)
            try:
                if pii_doc.read(client):
                    #   get title
                    title = pii_doc.title
                    title.strip()
                    title.replace('\n', ' ')
                    #   get abstract
                    text = pii_doc.data["coredata"]["dc:description"]
                    if text is not None:
                        text = text.strip()
                        if text.startswith(
                            ('ABSTRACT', 'Abstract', 'Summary')):
                            text = text[8:]
                            text = text.strip()
                        # remove extra whitespace
                        text.replace("\n", "")
                        text = " ".join(text.split())