def get_data_from_doi(self, doi):
    doi_doc = FullDoc(doi=doi)
    if doi_doc.read(self.client):
        # print("doi_doc.title: ", doi_doc.title)
        doi_doc.write()
    else:
        print("Read document failed.")
        return doi
    id = None
    if 'pubmed-id' not in doi_doc._data.keys():
        print("no pubmed-id, trying with title")
        # try with title
        Entrez.email = '*****@*****.**'
        query = doi_doc.title
        handle = Entrez.esearch(db='pubmed', retmode='xml', term=query)
        results = Entrez.read(handle)
        if int(results['Count']) > 0:
            id = results['IdList']
    else:
        id = doi_doc._data['pubmed-id']
    if id is not None:
        return self.fetch_data_from_pubmed(id)
    else:
        print("no pubmed id")
        return doi

def get_paper(doi: str):
    ## ScienceDirect (full-text) document example using DOI
    doi_doc = FullDoc(doi=doi)
    if doi_doc.read(client):
        return doi_doc.data
    else:
        return False

def ElsevierScraper(client, target_DOI):
    """
    Uses the Elsevier API with a valid key and a DOI to download the plain text
    article, including the title, abstract, pub date, and the references as an
    unstructured string.

    :param client: Elsevier client containing the API key
    :param target_DOI: DOI of the article being scraped
    :return: list containing the DOI, title, abstract, publication date,
             full text and unstructured string of references
    """
    print(target_DOI)
    doi_doc = FullDoc(doi=target_DOI)
    if doi_doc.read(client):
        data = doi_doc.data
        coreData = data['coredata']
        abstract = coreData['dc:description']
        text = str(data['originalText']).split(abstract)[-1]
        try:
            references = text.split("References")[1]
        except IndexError:
            references = "NA"
        text = text.split("References")[0]
        date = coreData['prism:coverDisplayDate']
        return [target_DOI, doi_doc.title, abstract, date, text, references]
    else:
        # Could save this to a separate file, but it's easier to search for NA
        # and make a new sublist after the fact
        print("Error: couldn't read {}.".format(target_DOI))
        return [target_DOI, "NA", "NA", "NA", "NA", "NA"]

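# A minimal usage sketch for ElsevierScraper (not part of the original snippet):
# it assumes elsapy is installed, and "YOUR_API_KEY" is a placeholder to be
# replaced with a valid Elsevier API key; the DOI is the example DOI that
# appears elsewhere in these snippets.
from elsapy.elsclient import ElsClient

client = ElsClient("YOUR_API_KEY")
record = ElsevierScraper(client, '10.1016/S1525-1578(10)60571-5')
doi, title, abstract, date, full_text, references = record
print(title)
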
def readFullDocWithDOI(self, doiID='10.1016/S1525-1578(10)60571-5'):
    ## ScienceDirect (full-text) document example using DOI
    doi_doc = FullDoc(doi=doiID)
    if doi_doc.read(self.client):
        print("doi_doc.title: ", doi_doc.title)
        doi_doc.write()
    else:
        print("Read document failed.")

def readFullDocWithPII(self, sd_piiID='S1270963817323015'):
    ## ScienceDirect (full-text) document example using PII
    pii_doc = FullDoc(sd_pii=sd_piiID)
    if pii_doc.read(self.client):
        print(pii_doc)
        print("pii_doc.title: ", pii_doc.title)
        pii_doc.write()
    else:
        print("Read document failed.")

def find_abstract(doi):
    client = elsevier_auth()
    ## ScienceDirect (full-text) document example using DOI
    doi_doc = FullDoc(doi=doi)
    if doi_doc.read(client):
        print("doi_doc.title: ", doi_doc.title)
        print("doi_doc.abstract: ", doi_doc.data['coredata']['dc:description'])
        doi_doc.write()
    else:
        print("Read document failed.")

def get_authors_data_by_doi(self, doi):
    doi_doc = FullDoc(doi=doi)
    if doi_doc.read(self.client):
        print("doi_doc.title: ", doi_doc.title)
        doi_doc.write()
    else:
        print("Read document failed.")
        return doi
    id = None
    if 'pubmed-id' not in doi_doc._data.keys():
        print("no pubmed-id, trying with title")
        # try with title
        Entrez.email = '*****@*****.**'
        query = doi_doc.title
        handle = Entrez.esearch(db='pubmed', retmode='xml', term=query)
        results = Entrez.read(handle)
        if int(results['Count']) > 0:
            id = results['IdList']
    else:
        id = doi_doc._data['pubmed-id']
    if id is not None:
        Entrez.email = '*****@*****.**'
        handle = Entrez.efetch(db='pubmed', retmode='xml', id=id)
        results = Entrez.read(handle)
        print(results)
        if (len(results['PubmedArticle']) > 0
                and 'MedlineCitation' in results['PubmedArticle'][0].keys()
                and 'Article' in results['PubmedArticle'][0]['MedlineCitation'].keys()):
            if 'AuthorList' in results['PubmedArticle'][0]['MedlineCitation']['Article'].keys():
                authors_list = results['PubmedArticle'][0]['MedlineCitation']['Article']['AuthorList']
                dates = results['PubmedArticle'][0]['PubmedData']['History']
            else:
                print("no authors list {}".format(
                    results['PubmedArticle'][0]['MedlineCitation']['Article']))
                return doi
        else:
            print("missing keys")
            return doi
    else:
        print("no pubmed id")
        return doi
    return authors_list

def search(self, query="A Lightweight Autoencoder"):
    doc_srch = ElsSearch(query, 'sciencedirect')
    doc_srch.execute(self.client, get_all=False)
    for _, doc in doc_srch.results_df.iterrows():
        pii_doc = FullDoc(sd_pii=doc['pii'])
        if pii_doc.read(self.client):
            try:
                # drop the first token of the description (typically the word "Abstract")
                abstract = " ".join(pii_doc.data['coredata']['dc:description'].split()[1:])
                doc_id = str(hex(time.time().as_integer_ratio()[0]))
                title = doc['dc:title']
                pdf_link = doc['link']['scidir']
                dates = doc['load-date'].split('-')[0]
                self.data[doc_id] = {"title": title, "year": dates,
                                     "link": pdf_link, "Abstract": abstract}
            except Exception:
                # skip results with missing or malformed metadata fields
                pass
        else:
            print("Doc Skipped!!")

def get_doc(self, dtype, identity):
    """
    This method retrieves a 'Doc' object from the Elsevier API. The doc object
    contains metadata and full-text information about a publication associated
    with a given PII.

    Parameters:
    -----------
    dtype (str, required): The type of identification string being used to
        access the document. (Almost always PII in our case.)
    identity: The actual identification string / PII that will be used to query.
    """
    if dtype == 'pii':
        doc = FullDoc(sd_pii=identity)
    elif dtype == 'doi':
        doc = FullDoc(doi=identity)
    else:
        raise ValueError("dtype must be 'pii' or 'doi'")
    if doc.read(ElsClient(self.API_list[0])):
        # print("doc.title: ", doc.title)
        doc.write()
    else:
        print("Read document failed.")
    return doc

class ScienceDirectArticle(ASSArticle):

    def __init__(self, *args):
        """ """
        print("PII : ", args[0])
        self._sd_article = FullDoc(sd_pii=args[0])
        print("init SD 1")
        if not self._sd_article.read(els_client=args[1]):
            print("raise HTTPError")
            raise HTTPError

    def doi(self):
        """Gets the document's DOI"""
        try:
            doi = self._sd_article.data["coredata"]["dc:identifier"]
            # log.info("Check DOI", doi_converter(doi))
            return ass_scrap_util.doi_converter(doi)
        except KeyError:
            doi = ["No DOI"]
            log.warning("No DOI")
            return ass_scrap_util.doi_converter(doi)

    def issn(self):
        pass

    def title(self):
        """Gets the document's title"""
        sd_title = re.sub("/", " ", self._sd_article.title)
        # log.info("Check title", sd_title)
        return sd_title

    def abstract(self):
        """Gets the document's abstract"""
        return self._sd_article.data["coredata"]["dc:description"]

    def is_undesired(self):
        """Tells if this article is undesired or not"""
        title_revue = self.title()
        try:
            if "Editorial" in title_revue:
                log.info("Editorial")
                return True
            if title_revue == "Index":
                log.info("Index")
                return True
            if "Title Page" in title_revue:
                log.info("Title page")
                return True
            if "Subject Index" in title_revue:
                log.info("Subject Index")
                return True
            if "Preface" in title_revue:
                log.info("Preface")
                return True
            if "Letter to the Editor" in self._sd_article.data["coredata"]["pubType"]:
                log.info(str(self._sd_article.data["coredata"]["pubType"]))
                return True
            if "Book review" in self._sd_article.data["coredata"]["pubType"]:
                log.info(str(self._sd_article.data["coredata"]["pubType"]))
                return True
            if "Author index" in title_revue:
                log.info("Author index")
                return True
        except KeyError:
            return False

    def author_checking(self):
        """Checks that a first-author string is present in the coredata."""
        try:
            if isinstance(self._sd_article.data["coredata"]["dc:creator"][0]["$"], str):
                log.debug("find Author 1")
                return True
            if isinstance(self._sd_article.data["coredata"]["dc:creator"]["$"], str):
                log.debug("find Author 2")
                return True
        except KeyError:
            log.warning("No Author")
            return False

    def author_1(self):
        if self.author_checking():
            try:
                author_brut = self._sd_article.data["coredata"]["dc:creator"][0]["$"]
                if author_brut:
                    log.debug("author_1: 2", author_brut)
                    author = re.sub(r'(,|\.)', '', author_brut)
                    log.debug("author_1: 3", author)
                    author_sub = re.sub(r'(^\w+\b \w)', "", author)
                    log.debug("author_1: 4", author_sub)
                    author_final = re.sub(author_sub, "", author)
                    log.debug("author_1: 5", author_final)
                    AUTHOR = author_final.upper()
                    log.debug("author_1: 6", AUTHOR)
                    AUTHOR = unicodedata.normalize('NFD', AUTHOR).encode('ASCII', 'ignore')
                    log.debug("author_1: 7", AUTHOR)
                    AUTHOR = re.sub(r'(b|\|\.|\')', '', str(AUTHOR))
                    log.debug("author_1: 8")
                    return AUTHOR
                else:
                    log.debug("author_1: Author -", author_brut)
                    author = re.sub(r'(,|\.)', '', author_brut)
                    author_sub = re.sub(r'(^\w+\b \w)', "", author)
                    author_final = re.sub(author_sub, "", author)
                    AUTHOR = author_final.upper()
                    AUTHOR = unicodedata.normalize('NFD', AUTHOR).encode('ASCII', 'ignore')
                    AUTHOR = re.sub(r'(b|\|\.|\')', '', str(AUTHOR))
                    return AUTHOR
            except KeyError:
                log.warning("Author Error => KeyError")
                return False
        else:
            log.warning("Author_checking false")
            pass

    def concat_title(self):
        concat_title = self.title()
        concat_title = re.sub(r'\W', '', concat_title)
        CONCAT_TITLE = concat_title.upper()
        log.debug("concat_title", CONCAT_TITLE)
        # CONCAT_TITLE = CONCAT_TITLE.encode('ASCII','ignore')
        TITLE = re.sub(r'(AND|OF|THE|TO)', "", CONCAT_TITLE)
        log.debug(TITLE)
        return TITLE

    def text(self):
        """Gets the document's text"""
        log.debug("text : 1")
        txt = self._sd_article.data["originalText"]
        txt = re.sub(r' Nomenclature', "", txt)
        log.debug("text : 2")
        auteur = str(self.author_1())
        # auteur = re.sub(r'\W','',auteur)
        log.debug("text : 3")
        txt_1 = ".*" + auteur
        log.debug("text : 4" + str(txt_1))
        text_1 = re.sub(r'%s' % txt_1, "", txt)
        log.debug("text : 5")
        text_sub = re.sub(r'(1\.1|2)\W.*', '', text_1)
        # print("\n2nd step :", text_sub)
        if "serial JL" in text_sub:
            # print("Syntax author")
            # title = self.concat_title()
            # print(type(title))
            # print(title)
            # title_sub = ".*{}".format(title)
            # print("title_sub", title_sub)
            # text_brut = re.sub(r'%s' % title_sub, '', txt)
            # # print(text_brut)
            # text_brut = re.sub(r'^\D+', '', text_brut)
            # print(text_brut)
            # intro = re.sub(r'(1\.1|2)(.|\n)*', '', text_brut)
            # # print("\n2 :", text_brut)
            # print("\n Intro :", intro)
            # text_alone = re.sub(r'.*%s' % intro, "", txt)
            log.warning("Syntax author => text_cleaner")
            return ass_scrap_util.text_cleaner(txt)
        else:
            text_alone = re.sub(r'.*%s' % text_sub, "", text_1)
            log.debug("text : 6")
            text_alone = re.sub(r'[^a-zA-Z0-9_ ]', "", text_alone)
            log.debug("text : 6,5")
            text_alone = ass_scrap_util.text_cleaner(text_alone)
            text_alone = re.sub(r'( References).*', "", text_alone)
            log.debug("text : 7")
            # cln_txt = text_cleaner(txt)
            return text_alone

    def keywords(self):
        """Gets the document's Keywords"""
        try:
            kw = self._sd_article.data["coredata"]["dcterms:subject"]
            KW_list = [item['$'] for item in kw]
            return KW_list
        except KeyError:
            KW_list = ["No Keyword"]
            return KW_list

config = json.load(config_file)
GET_ALL = config['get_all']          # False gets one chunk (25); True gets all or max (5000)
FULL_TEXT = config['full_text']      # Save full text
OPEN_ACCESS = config['open_access']  # Search only open-access documents (so we can get the full text)

# "public policy AND (impact OR result OR evaluation OR evidence) AND (climate OR environment)"
query = config['query']
if OPEN_ACCESS:
    query = "openaccess(1) AND " + query

client = ElsClient(config['api_key'])

doc_srch = ElsSearch(query, 'sciencedirect')
doc_srch.execute(client, get_all=GET_ALL)

for doc in doc_srch.results:
    doi = doc['dc:identifier']
    print(doi)
    if FULL_TEXT:
        ## ScienceDirect (full-text) document example using DOI
        doi_doc = FullDoc(doi=doi)
        if doi_doc.read(client):
            doi_doc.write()
        else:
            print("Read full-text failed for DOI", doi)

print("# Found", len(doc_srch.results), "results.")

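# Sketch of the config file the script above expects. The keys are inferred from
# the code; the values are illustrative placeholders (the query string is the
# example quoted in the comment above), not part of the original source.
import json

sample_config = {
    "api_key": "YOUR_API_KEY",
    "query": "public policy AND (impact OR result OR evaluation OR evidence) AND (climate OR environment)",
    "get_all": False,      # False gets one chunk (25); True gets all or max (5000)
    "full_text": True,     # save the full text of each result
    "open_access": True,   # restrict the search to open-access documents
}

with open("config.json", "w") as config_file:
    json.dump(sample_config, config_file, indent=2)
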
    my_aff.write()
else:
    print("Read affiliation failed.")

## Scopus (Abstract) document example
# Initialize document with ID as integer
scp_doc = AbsDoc(scp_id=84872135457)
if scp_doc.read(client):
    print("scp_doc.title: ", scp_doc.title)
    scp_doc.write()
else:
    print("Read document failed.")

## ScienceDirect (full-text) document example using PII
pii_doc = FullDoc(sd_pii='S1674927814000082')
if pii_doc.read(client):
    print("pii_doc.title: ", pii_doc.title)
    pii_doc.write()
else:
    print("Read document failed.")

## ScienceDirect (full-text) document example using DOI
doi_doc = FullDoc(doi='10.1016/S1525-1578(10)60571-5')
if doi_doc.read(client):
    print("doi_doc.title: ", doi_doc.title)
    doi_doc.write()
else:
    print("Read document failed.")

## Load list of documents from the API into affiliation and author objects.

def pii_fulltext(pii=None):
    ## ScienceDirect (full-text) document example using PII
    pii_doc = FullDoc(sd_pii=pii)
    if pii_doc.read(client):
        abstract = pii_doc.data['coredata']['dc:description']
        return abstract

def doi_fulltext(doi=None):
    """ScienceDirect (full-text) document example using DOI"""
    doi_doc = FullDoc(doi=doi)
    if doi_doc.read(client):
        abstract = doi_doc.data['coredata']['dc:description']
        return abstract

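# pii_fulltext and doi_fulltext above rely on a module-level `client`. A minimal
# sketch of how it could be set up (the API key is a placeholder, not from the
# original source); the DOI is the example DOI used elsewhere in these snippets.
from elsapy.elsclient import ElsClient

client = ElsClient("YOUR_API_KEY")
print(doi_fulltext(doi='10.1016/S1525-1578(10)60571-5'))
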
        pii.append(link[idx + 4:])
        i += 1

# remove duplicates: keep only PIIs that have not been collected yet
pii = [pii_this for pii_this in pii if pii_this not in pii_total]
pii_total.extend(pii)

# get url, title, and abstract of all 100 articles
n = 0
j = 1
for p in pii:
    pii_doc = FullDoc(sd_pii=p)
    try:
        if pii_doc.read(client):
            # get title
            title = pii_doc.title
            title = title.strip()
            title = title.replace('\n', ' ')
            # get abstract
            text = pii_doc.data["coredata"]["dc:description"]
            if text is not None:
                text = text.strip()
                if text.startswith(('ABSTRACT', 'Abstract', 'Summary')):
                    text = text[8:]
                    text = text.strip()
                # remove extra whitespace
                text = text.replace("\n", "")
                text = " ".join(text.split())