def get_publication_languages(self, refresh=False):
    """Parse languages of published documents."""
    from json import JSONDecodeError
    from pybliometrics.scopus.exception import Scopus404Error
    langs = set()
    for eid in self._eids:
        try:
            ab = AbstractRetrieval(eid, view="FULL", refresh=refresh)
        except JSONDecodeError:
            ab = AbstractRetrieval(eid, view="FULL", refresh=True)
        except Scopus404Error:
            continue
        langs.add(ab.language)
    self._language = "; ".join(sorted(filter(None, langs)))
    return self
def parse_docs(eids, refresh):
    """Find the set of references of provided articles.

    Parameters
    ----------
    eids : list of str
        Scopus Document EIDs representing documents to be considered.

    refresh : bool
        Whether to refresh the cached files if they exist, or not.

    Returns
    -------
    refs : set
        The set of Scopus Document EIDs of cited references.

    n_valid_refs : int
        The number of documents with valid reference information.
    """
    docs = []
    for eid in eids:
        try:
            docs.append(AbstractRetrieval(eid, view="FULL", refresh=refresh))
        except Scopus404Error:
            continue
    ref_lst = [ab.references for ab in docs if ab.references]
    valid_refs = len(ref_lst)
    ref_ids = [ref.id for sl in ref_lst for ref in sl]
    refs = set(filter(None, ref_ids))
    return refs, valid_refs
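# --- Hedged usage sketch (added for illustration, not part of the original
# snippet): calls parse_docs() as defined above on two document EIDs that also
# appear in the test fixtures later in this collection. Assumes a configured
# Scopus API key and the same imports as the surrounding snippets.
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus.exception import Scopus404Error

sample_eids = ["2-s2.0-84930616647", "2-s2.0-0029486824"]
refs, n_valid_refs = parse_docs(sample_eids, refresh=False)
print("{} documents with references, {} unique cited EIDs".format(n_valid_refs, len(refs)))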
def parse_docs(eids, refresh):
    """Find abstracts and references of articles published up until the
    given year, both as continuous strings.

    Parameters
    ----------
    eids : list of str
        Scopus Document EIDs representing documents to be considered.

    refresh : bool
        Whether to refresh the cached files if they exist, or not.

    Returns
    -------
    t : tuple
        A tuple with four elements: The first element is a continuous string
        of Scopus Abstract EIDs representing cited references, joined on a
        blank.  The second element is the number of documents with valid
        reference information.  The third element is a continuous string of
        cleaned abstracts, joined on a blank.  The fourth element is the
        number of documents with valid abstract information.
    """
    docs = []
    for eid in eids:
        try:
            docs.append(AbstractRetrieval(eid, view="FULL", refresh=refresh))
        except Scopus404Error:
            continue
    refs = [ab.references for ab in docs if ab.references]
    valid_refs = len(refs)
    refs = " ".join([ref.id for sl in refs for ref in sl])
    absts = [clean_abstract(ab.abstract) for ab in docs if ab.abstract]
    valid_absts = len(absts)
    absts = " ".join(absts)
    return (refs, valid_refs, absts, valid_absts)
def get_publication_languages(self, refresh=False):
    """Parse languages of published documents."""
    langs = []
    for eid in self._eids:
        l = AbstractRetrieval(eid, view="FULL", refresh=refresh).language
        langs.append(l)
    self._language = "; ".join(sorted(list(set(filter(None, langs)))))
    return self
def get_publication_languages(self, refresh=False):
    """Parse languages of published documents."""
    langs = set()
    for eid in self._eids:
        try:
            ab = AbstractRetrieval(eid, view="FULL", refresh=refresh)
        except Scopus404Error:
            continue
        langs.add(ab.language)
    self._language = "; ".join(sorted(filter(None, langs)))
    return self
def find_location(auth_ids, pubs, year, refresh):
    """Find the most common country, affiliation ID, and affiliation name of
    a scientist using her most recent publications with valid information.

    Parameters
    ----------
    auth_ids : list of str
        A list of Scopus Author Profile IDs for which the affiliation should
        be searched for.

    pubs : list of namedtuple
        The publications associated with the Author IDs as returned from a
        scopus query.

    year : int
        The year for which we would like to have the country.

    refresh : bool
        Whether to refresh all cached files or not.

    Returns
    -------
    country, affiliation_id, organization : str or None
        The country, affiliation ID, and affiliation name of the scientist
        in the year closest to the treatment year, given that the
        publications list valid information for each output.  Equals None
        when no valid publications are found.
    """
    from operator import attrgetter
    # Available papers of most recent year with publications
    papers = [p for p in pubs if int(p.coverDate[:4]) <= year]
    papers = sorted(papers, key=attrgetter("coverDate"), reverse=True)
    params = {"view": "FULL", "refresh": refresh}
    # Fallback values in case no publication yields valid information
    countries = aff_ids = orgs = None
    # Return most recent complete information
    for p in papers:
        try:
            authgroup = AbstractRetrieval(p.eid, **params).authorgroup or []
        except Scopus404Error:
            continue
        authgroup = [a for a in authgroup if a.auid in auth_ids and a.country
                     and a.affiliation_id and a.organization]
        countries = "; ".join(sorted(set([a.country for a in authgroup])))
        aff_ids = "; ".join(sorted(set([a.affiliation_id for a in authgroup])))
        orgs = "; ".join(sorted(set([a.organization for a in authgroup])))
        if not countries and not aff_ids and not orgs:
            continue
        return countries, aff_ids, orgs
    # Return None-triple if all else fails
    return countries, aff_ids, orgs
def from_identifier(id, id_type, view='FULL'):
    from pybliometrics.scopus import AbstractRetrieval
    from pybliometrics.scopus.exception import Scopus404Error

    with shelve.open(SCOPUS_CACHE) as cache:
        key = id + '_found'
        if cache.get(key) is False:
            raise Scopus404Error()
        try:
            result = AbstractRetrieval(id, id_type=id_type, view=view)
            return ScopusDocument(result)
        except Scopus404Error:
            cache[key] = False
            raise
def eid_authorid(SCOPUS_EID):
    '''Given a document's Scopus EID (SCOPUS_EID), return all author names
    and their Scopus author IDs.

    Parameters:
        str  SCOPUS_EID

    Returns:
        dict  Author name mapped to author Scopus ID
    '''
    from pybliometrics.scopus import AbstractRetrieval
    ab = AbstractRetrieval(SCOPUS_EID)
    researchers = {author.given_name + ' ' + author.surname: author.auid
                   for author in ab.authors}
    return researchers
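# --- Hedged usage sketch (added, not from the original source): calls
# eid_authorid() as defined above with an EID taken from the test fixtures
# elsewhere in this collection; requires a configured Scopus API key.
authors_by_name = eid_authorid("2-s2.0-84930616647")
for name, scopus_id in authors_by_name.items():
    print(name, scopus_id)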
def retrieve_abstract_try(eid, view='REF', param='references'):
    import sys
    from pybliometrics.scopus import AbstractRetrieval
    try:
        refs = AbstractRetrieval(eid, view=view)._json[param]
    except KeyError:
        print('An error occurred (1) ...')
        return 1
    except UnboundLocalError:
        print('An error occurred (2). Probably an empty eID provided?')
        return 2
    except KeyboardInterrupt:
        sys.exit("Interrupting due to user command.")
    except:
        print('An error occurred (?)...')
        return 0
    else:
        return refs
def get_citations(dois):
    '''Function that translates a list of DOIs into a citation count.

    Parameters:
        dois: List of strings
            Contains all relevant DOIs, as obtained from LibXC
    '''
    # Citations from Scopus using Rose, Michael E. and John R. Kitchin:
    # "pybliometrics: Scriptable bibliometrics using a Python interface to
    # Scopus", SoftwareX 10 (2019) 100263.
    from pybliometrics.scopus import AbstractRetrieval
    citations = 0
    for doi in dois:
        try:
            ab = AbstractRetrieval(doi)
            #print(ab.citedby_count)
            citations += ab.citedby_count
        except:
            continue
    return citations
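# --- Hedged usage sketch (added, not from the original source): applies
# get_citations() as defined above to two DOIs that appear in other snippets
# in this collection; the total reflects whatever Scopus returns at call time.
sample_dois = ["10.1086/341871", "10.1257/jel.54.2.442"]
print("Total citations:", get_citations(sample_dois))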
def find_authors(abst_path, dois_path, entries):
    """List all the authors who have at least one publication that contains
    at least one of the given entries.

    **Parameters:**

    * `abst_path`: (str) path to abstracts file
    * `dois_path`: (str) path to the list of DOIs
    * `entries`: (list) list of strings, each string is an entry

    **Returns:**

    * `u_auids`: (list) set of author IDs for those who have published at
      least one paper that contained one of the entries
    * `au_dois`: (list) list of DOIs of the authors that were identified
      (same length as `u_auids`)
    """
    p = utils.MatTextProcessor()

    # domain of the search (DOIs)
    doi_list = pd.read_csv(dois_path, header=None)

    auids = []
    dois = []
    with open(abst_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if np.any([e in line for e in entries]):
                abst = line.split(' ')
                if np.any([p.normalized_formula(e) in abst for e in entries]):
                    dois += [doi_list.iloc[i][0]]
                    doc = AbstractRetrieval(dois[-1])
                    auids += [[a.auid for a in doc.authors]]

    # unique authors and their documents
    u_auids = list(np.unique(np.array(sum(auids, []))))
    au_dois = [[dois[j] for j in range(len(dois)) if au in auids[j]]
               for au in u_auids]

    return u_auids, au_dois
def make_training_file(self, dois, save_dir):
    """Download, pre-process and store abstracts of a set of DOIs in a text
    file which can later be used as the training data for tuning models like
    word2vec.

    Each line of the saved file corresponds to one article and shows its
    title followed by the abstract.

    **Parameters:**

    * dois : *(list)* list of DOIs
    * save_dir : *(str)* directory to save the files
    """
    # list of lists (each list = one line = title + abstract)
    save_path_abst = os.path.join(save_dir, 'abstracts')
    save_path_dois = os.path.join(save_dir, 'saved_DOIs')
    save_path_misses = os.path.join(save_dir, 'missed_DIOs')

    missed_dois = []
    for doi in dois:
        try:
            r = AbstractRetrieval(doi)
            tokens = self.mat_preprocess(r.title) + self.mat_preprocess(r.description)
        except:
            #pdb.set_trace()
            with open(save_path_misses, 'a+', encoding='utf-8') as f:
                f.write(doi + '\n')
            continue

        line = ' '.join(sum(tokens, []))
        doi_line = doi
        if doi != dois[-1]:
            line += '\n'
            doi_line += '\n'

        # saving the texts
        with open(save_path_abst, 'a+', encoding='utf-8') as f:
            f.write(line)
        with open(save_path_dois, 'a+') as f:
            f.write(doi_line)
def parse_abstract(pub, refresh=350):
    """Extract bibliometric information and add yearly citations."""
    # Basic bibliometric information
    s = pd.Series()
    s['title'] = pub.title
    s['eid'] = pub.eid
    pubyear = int(pub.coverDate.split("-")[0])
    s['year'] = str(pubyear)
    try:
        pages = pub.pageRange.split("-")
    except AttributeError:
        ab = AbstractRetrieval(pub.eid, view="FULL")
        pages = ab.pageRange.split("-")
    s['num_pages'] = int(pages[1]) - int(pages[0])
    s['num_auth'] = pub.author_count
    s['authors'] = pub.author_ids
    # Yearly cumulated citations
    co = CitationOverview(pub.eid, start=pubyear, end=2020, refresh=refresh)
    s['total_citations'] = sum([int(t[1]) for t in co.cc])
    lags = [f"citcount_{y-pubyear}" for y, _ in co.cc]
    citations = cumsum([int(t[1]) for t in co.cc])
    s = s.append(pd.Series(citations, index=lags))
    return s
def complete_affiliations(paper_ids, sql_db, sql_cursor, logfile_path=None):
    logger = helpers.set_up_logger(__name__, logfile_path, False, file_mode='a')

    # initialize the affiliation primary key
    sql_cursor.execute('SELECT aff_id FROM affiliation;')
    all_aff_PKs = sql_cursor.fetchall()
    if len(all_aff_PKs) == 0:
        aff_PK = 0
    else:
        aff_PK = max([a[0] for a in all_aff_PKs]) + 1

    sql_cursor.execute('SELECT aff_scopus_ID FROM affiliation;')
    curr_aff_scopus_id_list = [a[0] for a in sql_cursor.fetchall()]
    sql_cursor.execute('SELECT * FROM author_affiliation_mapping;')
    curr_author_aff_pairs = list(sql_cursor.fetchall())

    pids_array = ','.join([str(p) for p in paper_ids])
    sql_cursor.execute('SELECT doi, paper_id FROM paper WHERE paper_id IN ({});'.format(pids_array))
    RES = sql_cursor.fetchall()
    dois = [a[0] for a in RES]
    paper_ids = [a[1] for a in RES]

    dois_with_nonexisting_authors = []
    for j, doi in enumerate(dois):
        try:
            r = AbstractRetrieval(doi)
        except Scopus429Error:
            print('Scopus resource exhausted. Check your quota.')
            return
        except:
            raise ValueError('Could not download doi {}'.format(doi))

        if r.authors is None:
            continue

        paper_scopus_id_list = [a.auid for a in r.authors]
        for i, scps_id in enumerate(paper_scopus_id_list):
            # if repetitive author, ignore:
            if scps_id in paper_scopus_id_list[:i]:
                continue

            sql_cursor.execute('SELECT author_id \
                                FROM author \
                                WHERE author_scopus_ID = {}'.format(scps_id))
            this_author_PK = sql_cursor.fetchall()
            if len(this_author_PK) == 0:
                if doi not in dois_with_nonexisting_authors:
                    dois_with_nonexisting_authors += [doi]
                logger.info('(CASE NUMBER {}) PAPER_ID {}, DOI {}: author with scopus ID {} does not exist.'.format(
                    306 + len(dois_with_nonexisting_authors), paper_ids[j], doi, scps_id))
                continue
            else:
                this_author_PK = this_author_PK[0][0]

            # directly go to their affiliations
            if r.authors[i].affiliation is not None:
                author_aff_scopus_id_list = np.unique(r.authors[i].affiliation)
            else:
                author_aff_scopus_id_list = []

            for aff_scps_id in author_aff_scopus_id_list:
                if aff_scps_id in curr_aff_scopus_id_list:
                    sql_cursor.execute('SELECT aff_id \
                                        FROM affiliation \
                                        WHERE aff_scopus_ID = {}'.format(aff_scps_id))
                    this_aff_PK = sql_cursor.fetchall()[0][0]
                    # add the pair only if the author/aff. have not already
                    # been added to the mapping table
                    if (this_author_PK, this_aff_PK) not in curr_author_aff_pairs:
                        sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                            VALUES({}, {})'.format(this_author_PK, this_aff_PK))
                        curr_author_aff_pairs += [(this_author_PK, this_aff_PK)]
                        logger.info('{} have been added to A2A.'.format(
                            (r.authors[i].given_name, r.authors[i].surname, this_aff_PK)))
                else:
                    lcn = np.where([x.id == aff_scps_id for x in r.affiliation])[0]
                    if len(lcn) > 0:
                        lcn = lcn[0]
                        aff_name = r.affiliation[lcn].name.replace('"', '\\"')
                        aff_city = r.affiliation[lcn].city
                        aff_country = r.affiliation[lcn].country
                    else:
                        aff_name = 'NA'
                        aff_city = 'NA'
                        aff_country = 'NA'

                    sql_cursor.execute('INSERT INTO affiliation \
                                        VALUES({},"{}","{}","{}","{}");'.format(
                                            aff_PK, aff_scps_id, aff_name, aff_city, aff_country))
                    sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                        VALUES({}, {})'.format(this_author_PK, aff_PK))
                    curr_author_aff_pairs += [(this_author_PK, aff_PK)]
                    logger.info('{} have been added to A2A.'.format(
                        (r.authors[i].given_name, r.authors[i].surname, this_aff_PK)))

                    # update the affiliations list
                    curr_aff_scopus_id_list += [aff_scps_id]
                    aff_PK += 1

        if not (j % 1000):
            np.savetxt('/home/jamshid/codes/data/iter_inds.txt', [j])
            sql_db.commit()
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `scopus.AbstractRetrieval` module."""

from collections import namedtuple

from nose.tools import assert_equal, assert_true

from pybliometrics.scopus import AbstractRetrieval

# Base information
ab1 = AbstractRetrieval("2-s2.0-84930616647", view="FULL", refresh=30)
# Conference proceeding and no references
ab2 = AbstractRetrieval("2-s2.0-0029486824", view="FULL", refresh=30)
# Issuetitle and no affiliation
ab3 = AbstractRetrieval("2-s2.0-0001270077", view="FULL", refresh=30)
# Author group broken and author keywords
ab4 = AbstractRetrieval("2-s2.0-0000016206", view="FULL", refresh=30)
# ISBN
ab5 = AbstractRetrieval("2-s2.0-84919546381", view="FULL", refresh=30)
# Funding, sequencebanks, chemicals
ab6 = AbstractRetrieval("2-s2.0-85040230676", view="FULL", refresh=30)
# Contributor group
ab7 = AbstractRetrieval("2-s2.0-85050253030", view="FULL", refresh=30)
# REF view
ab8 = AbstractRetrieval("2-s2.0-84951753303", view="REF", refresh=30)


def test_abstract():
    expected = 'In this paper we propose a Bayesian analysis of seasonal '\
        'unit roots in quarterly observed time series. Seasonal unit root '\
        'processes are useful to describe economic series with changing '\
def search_scopus(query, docs=None, retrieve_orcid=True):
    """Search Scopus."""
    documents = []
    authors_cache = {}
    affiliations_cache = {}
    try:
        retrieved_paper_ids = ScopusSearch(query, view="STANDARD").get_eids()
    except ScopusQueryError:
        print("Impossible to process query \"{}\".".format(query))
        return None
    if len(retrieved_paper_ids) == 0:
        print("No matching documents for the provided query.")
        return None
    for paper_id in tqdm(retrieved_paper_ids):
        try:
            paper = AbstractRetrieval(paper_id, view="FULL")
        except ValueError:
            print("Impossible to retrieve data for paper \"{}\".".format(paper_id))
            return None
        doc_id = DocumentID()
        doc_id.parse_scopus(paper)
        authors = []
        if paper.authors:
            for author in paper.authors:
                author_affiliations = []
                if retrieve_orcid:
                    if author.auid in authors_cache:
                        authors.append(Author(name=author.indexed_name,
                                              orcid=authors_cache[author.auid],
                                              affiliations=author_affiliations))
                    else:
                        authors_cache[author.auid] = AuthorRetrieval(author.auid).orcid
                        authors.append(Author(name=author.indexed_name,
                                              orcid=authors_cache[author.auid],
                                              affiliations=author_affiliations))
                else:
                    authors.append(Author(name=author.indexed_name,
                                          orcid=None,
                                          affiliations=author_affiliations))
                if author.affiliation:
                    for affiliation_id in author.affiliation:
                        if affiliation_id in affiliations_cache:
                            affiliation = affiliations_cache[affiliation_id]
                        else:
                            try:
                                affiliation = ContentAffiliationRetrieval(affiliation_id)
                                affiliations_cache[affiliation_id] = affiliation
                            except:
                                affiliation = None
                        if affiliation:
                            author_affiliations.append(Affiliation(name=affiliation.affiliation_name,
                                                                   city=affiliation.city,
                                                                   country=affiliation.country))
        references = []
        if paper.refcount and int(paper.refcount) > 0 and paper.references:
            for reference in paper.references:
                if reference.title:
                    references.append(reference.title)
        if paper.language:
            try:
                language = iso639.languages.get(part2b=paper.language).name
            except KeyError:
                language = None
        else:
            language = None
        document = Document(id=doc_id,
                            title=paper.title,
                            keywords=paper.authkeywords,
                            abstract=paper.description,
                            source=paper.publicationName,
                            source_type=paper.aggregationType,
                            language=language,
                            year=int(paper.coverDate.split("-")[0]),
                            authors=authors,
                            references=references,
                            publisher=paper.publisher,
                            internal=paper)
        if paper.citedby_count:
            document.citation_count = int(paper.citedby_count)
        documents.append(document)
    if docs:
        return DocumentSet(docs=documents).union(docs)
    else:
        return DocumentSet(docs=documents)
def Scopus_to_SQLtable(dois, sql_db, sql_cursor, bad_dois_save_path=None):
    # get the last primary paper/author IDs
    sql_cursor.execute('SELECT paper_id FROM paper;')
    all_paper_PKs = sql_cursor.fetchall()
    if len(all_paper_PKs) == 0:
        paper_PK = 0
    else:
        paper_PK = max([a[0] for a in all_paper_PKs]) + 1

    sql_cursor.execute('SELECT author_id FROM author;')
    all_author_PKs = sql_cursor.fetchall()
    if len(all_author_PKs) == 0:
        author_PK = 0
    else:
        author_PK = max([a[0] for a in all_author_PKs]) + 1

    sql_cursor.execute('SELECT aff_id FROM affiliation;')
    all_aff_PKs = sql_cursor.fetchall()
    if len(all_aff_PKs) == 0:
        aff_PK = 0
    else:
        aff_PK = max([a[0] for a in all_aff_PKs]) + 1

    # all previously entered paper DOIs to avoid repetition
    sql_cursor.execute('SELECT doi FROM paper;')
    all_dois = sql_cursor.fetchall()
    all_dois = [a[0] for a in all_dois]
    # ... same for authors
    sql_cursor.execute('SELECT author_scopus_ID FROM author;')
    curr_scopus_id_list = [a[0] for a in sql_cursor.fetchall()]
    # ... same for affiliations
    sql_cursor.execute('SELECT aff_scopus_ID FROM affiliation;')
    curr_aff_scopus_id_list = [a[0] for a in sql_cursor.fetchall()]
    # ... even same for (author, affiliation)'s, since they can be repetitive
    sql_cursor.execute('SELECT * FROM author_affiliation_mapping;')
    curr_author_aff_pairs = list(sql_cursor.fetchall())

    bad_dois = []
    for i, doi in enumerate(dois):
        if doi in all_dois:
            print('{} has been already entered to the database'.format(doi))
            continue

        try:
            r = AbstractRetrieval(doi)
        except Scopus429Error:
            print('Scopus resource exhausted. Check your quota.')
            return
        except:
            bad_dois += [doi]
            if bad_dois_save_path is not None:
                with open(bad_dois_save_path, 'a+') as bad_f:
                    bad_f.write(doi + '\n')
            continue

        # ROW IN PAPER TABLE
        if r.title is not None:
            title = r.title.replace('\"', '')
            title = title.replace('\\Vub\\', '|Vub|')   # ad-hoc for a specific article
        else:
            title = 'NA'
        if r.description is not None:
            abst = r.description.replace('\"', '')
            abst = abst.replace('\\Vub\\', '|Vub|')     # ad-hoc for a specific article
            abst = abst.replace('out.\\', 'out.')       # ad-hoc for a specific article
            # yet another ad-hoc
            if doi == '10.1140/epjb/e2012-30482-6':
                abst = re.sub(r'-duration(.*?), among others',
                              '-duration α, among others', abst)
        else:
            abst = 'NA'

        scomm = """INSERT INTO paper VALUES({},"{}","{}","{}","{}");""".format(
            paper_PK, r.doi, r.coverDate, title, abst)
        # taking care of unicode characters
        #scomm = "{}".format(scomm.encode('utf-8'))
        #scomm = scomm[2:-1].replace('\\', '\\\\')
        sql_cursor.execute(scomm)

        # ROW IN AUTHOR TABLE
        # skip the rest if no authors were available
        if r.authors is None:
            paper_PK += 1
            continue

        paper_scopus_id_list = [a.auid for a in r.authors]
        for i, scps_id in enumerate(paper_scopus_id_list):
            # if repetitive author, ignore:
            if scps_id in paper_scopus_id_list[:i]:
                continue

            if scps_id in curr_scopus_id_list:
                # extract existing author PK from scopus ID
                sql_cursor.execute('SELECT author_id \
                                    FROM author \
                                    WHERE author_scopus_ID = {}'.format(scps_id))
                this_author_PK = sql_cursor.fetchall()[0][0]
                sql_cursor.execute('INSERT INTO paper_author_mapping VALUES({}, {})'.format(
                    paper_PK, this_author_PK))
            else:
                # create a row for this new author
                au_given_name = r.authors[i].given_name.replace('\"', '') if \
                    r.authors[i].given_name is not None else r.authors[i].given_name
                au_surname = r.authors[i].surname.replace('\"', '') if \
                    r.authors[i].surname is not None else r.authors[i].surname
                sql_cursor.execute('INSERT INTO author \
                                    VALUES({}, "{}", "{}", "{}")'.format(
                                        author_PK, scps_id, au_given_name, au_surname))
                sql_cursor.execute('INSERT INTO paper_author_mapping \
                                    VALUES({}, {})'.format(paper_PK, author_PK))
                # update the global authors scopus ID list
                curr_scopus_id_list += [scps_id]
                this_author_PK = author_PK   # this will be used in affiliation table
                author_PK += 1

            # adding affiliations
            # ---------------------
            # handling None affiliations
            if r.authors[i].affiliation is not None:
                author_aff_scopus_id_list = np.unique(r.authors[i].affiliation)
            else:
                author_aff_scopus_id_list = []

            for aff_scps_id in author_aff_scopus_id_list:
                if aff_scps_id in curr_aff_scopus_id_list:
                    sql_cursor.execute('SELECT aff_id \
                                        FROM affiliation \
                                        WHERE aff_scopus_ID = {}'.format(aff_scps_id))
                    this_aff_PK = sql_cursor.fetchall()[0][0]
                    # add the pair only if the author/aff. have not already
                    # been added to the mapping table
                    if (this_author_PK, this_aff_PK) not in curr_author_aff_pairs:
                        sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                            VALUES({}, {})'.format(this_author_PK, this_aff_PK))
                        curr_author_aff_pairs += [(this_author_PK, this_aff_PK)]
                else:
                    lcn = np.where([x.id == aff_scps_id for x in r.affiliation])[0]
                    if len(lcn) > 0:
                        lcn = lcn[0]
                        aff_name = r.affiliation[lcn].name.replace('"', '\\"')
                        aff_city = r.affiliation[lcn].city
                        aff_country = r.affiliation[lcn].country
                    else:
                        aff_name = 'NA'
                        aff_city = 'NA'
                        aff_country = 'NA'

                    sql_cursor.execute('INSERT INTO affiliation \
                                        VALUES({},"{}","{}","{}","{}");'.format(
                                            aff_PK, aff_scps_id, aff_name, aff_city, aff_country))
                    sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                        VALUES({}, {})'.format(this_author_PK, aff_PK))
                    curr_author_aff_pairs += [(this_author_PK, aff_PK)]
                    # update the affiliations list
                    curr_aff_scopus_id_list += [aff_scps_id]
                    aff_PK += 1

        paper_PK += 1

    sql_db.commit()
    return bad_dois
import pandas as pd
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval, AuthorRetrieval, ContentAffiliationRetrieval
import networkx as nx

nodes = pd.DataFrame()
edges = []
gen = {}

Ellison = "10.1086/341871"
ab = AbstractRetrieval(Ellison, view="FULL")
print("\n\n===========================starting============================\n\n")

nodes = pd.DataFrame()
nodes = nodes.append({"id": "",
                      "title": ab.title,
                      "sourcetitle": ab.sourcetitle_abbreviation,
                      "publicationyear": ab.coverDate[0:4],
                      "eid": ab.eid,
                      "gen": '0'},
                     ignore_index=True)

ref_df = pd.DataFrame(ab.references)
ref_df["eid"] = '2-s2.0-' + ref_df['id']
ref_df['gen'] = '-1'
ref_df2 = pd.concat([ref_df['eid'], ref_df['id'], ref_df['publicationyear'],
                     ref_df['sourcetitle'], ref_df['title'], ref_df['gen']],
                    axis=1,
                    keys=['eid', 'id', 'publicationyear', 'sourcetitle', 'title', 'gen'],
                    sort=True)
#ref_df2 = ref_df2.drop(18)
nodes = nodes.append(ref_df2, ignore_index=True, sort=True)

for row in ref_df2.itertuples():
    edges.append((row.eid, ab.eid))

len(nodes)

s = ScopusSearch(ab.eid)
for x in s.results:
    if(x.eid not in list(nodes['eid'])):
import pandas as pd
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval, AuthorRetrieval, ContentAffiliationRetrieval, CitationOverview
import networkx as nx
import sys
import json

nodes = pd.DataFrame()
edges = []
gen = {}
outp = []

# Papers
# identified using the DOI
Varian = "10.1007/b104899_7"
ab = AbstractRetrieval(Varian, view="FULL")   # Paper 1

nodes = pd.DataFrame()
nodes = nodes.append({"id": "",
                      "title": ab.title,
                      "sourcetitle": ab.sourcetitle_abbreviation,
                      "publicationyear": ab.coverDate[0:4],
                      "eid": ab.eid,
                      "gen": '0'},
                     ignore_index=True)
count += 1
readFile.close()
print('document count: ' + str(count))

count = 0
with open('pubs_metadata_by_scopus.csv', 'a', encoding='utf-8') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow([
        'eid', 'date', 'title', 'citedby_count', 'authors', 'venue', 'area',
        'abstract'
    ])
    for eid in doc_eids:
        print()
        print('count: ' + str(count))
        count += 1

        abstractRetrieval = AbstractRetrieval(eid)
        print('eid: ' + eid)

        # venue
        publicationName = abstractRetrieval.publicationName
        if not publicationName:
            continue
        print('venue: ' + publicationName)

        # sjr venue link
        venue_link = getSjrVenueLink(publicationName)
        if not venue_link:
            continue
        print('venue link: ' + venue_link)

        # subject area
        subject_area = getSjrSubjectArea(venue_link)
        if not subject_area:
            continue
def import_scopus(ctx, verbose, start):
    """
    Import scopus publication records for the authors of the pubtrack application.

    This command will first fetch all the information about the authors, which are defined within the pubtrack
    app. It uses the scopus author ID's of these authors to send requests to the scopus database. The publications
    of these replies are then evaluated and posted into the pubtrack app.
    """
    # SETTING UP PUBTRACK WRAPPER
    config = ctx.obj['config']
    pubtrack = Pubtrack(config)

    # SETTING UP SCOPUS WRAPPER
    try:
        pybliometrics.scopus.utils.create_config()
    except FileExistsError:
        pass
    finally:
        scopus_config['Authentication']['APIKey'] = config.get_scopus_key()

    # FETCHING META AUTHOR INFORMATION FROM PUBTRACK
    click.secho('Fetching author information from pubtrack.')
    author_id_name_map = {}
    meta_authors = pubtrack.meta_author.get()['results']
    for meta_author in meta_authors:
        for author in meta_author['authors']:
            # "author_name_kitopen" returns a string with the authors name. This function essentially formats the
            # name in a way so that it can be used in a query string for the KITOpen database.
            full_name = '{} {}'.format(author['first_name'], author['last_name'])
            scopus_id = author['scopus_id']
            author_id_name_map[scopus_id] = full_name
            out(verbose, ' > Adding author "{} ({})" to be processed'.format(full_name, scopus_id))
    click.secho('==> Processing total of {} authors'.format(len(author_id_name_map)))

    # QUERY SCOPUS DATABASE
    click.secho('Querying scopus database for the publications of those authors.')
    date_limit = datetime.datetime(year=start, month=1, day=1)
    for author_id, author_name in author_id_name_map.items():
        publication_count = 0
        search = ScopusSearch(f'AU-ID ( {author_id} )')
        out(verbose, ' | Query "AU-ID ( {} )"'.format(author_id))

        for result in search.results:
            # We'll only take publications, which have a DOI
            if result.doi is None:
                continue

            # requesting the detailed information from the scopus database for the current publication from the
            # search results
            try:
                abstract_retrieval = AbstractRetrieval(result.doi)
            except Exception as e:
                out(verbose, ' # Could not retrieve publication "{}"'.format(result.doi), fg='yellow')
                continue

            # If the publication is older than the date limit, it will be discarded
            publication_date = datetime.datetime.strptime(abstract_retrieval.coverDate, '%Y-%m-%d')
            if publication_date <= date_limit:
                out(verbose, ' # Publication too old "{}"({})'.format(result.doi, publication_date), fg='yellow')
                continue
            else:
                out(verbose, ' > Fetched publication "{}"'.format(result.doi))

            adapter = ScopusPublicationAdapter(abstract_retrieval)
            publication = adapter.get_publication()

            # Filtering the authors according to the AUTHOR_LIMIT, which has been set.
            # We cannot just use the first few authors however, we need to make sure that the author, from which
            # we have this publication in the first place is in there. The rest just gets filled up...
            authors = []
            for author in publication['authors']:
                if author['scopus_id'] in author_id_name_map.keys() or len(authors) < config.get_author_limit():
                    authors.append(author)
            publication['authors'] = authors

            # Now we try to actually POST the publication to the pubtrack REST API
            try:
                pubtrack.import_publication(publication)
                publication_count += 1
                out(verbose, ' * Added to pubtrack: "{}"'.format(publication['title']), fg='green')
            except Exception as e:
                if str(e) == 'uuid':
                    out(verbose, ' ! Error while posting to pubtrack: Already exists!', fg='red')
                else:
                    out(verbose, ' ! Error while posting to pubtrack: {}'.format(str(e)), fg='red')
                continue

        out(True, ' --> Total of {} publications imported from author {}'.format(publication_count, author_id),
            fg='green', bold=True)
def coletar_artigos(eids_documentos, api_view):
    # Initialize an empty data list; for each entry in the list of retrieved
    # articles, build a dictionary with the article-specific information and
    # append it to this list.
    data = []
    for key in eids_documentos:
        record = {}
        error = True
        while error:
            try:
                paper = AbstractRetrieval(key, id_type="eid", view=api_view, refresh=True)
                error = False

                # Basic information.
                record["id"] = paper.identifier
                record["doi"] = paper.doi
                record["eid"] = paper.eid
                record["pii"] = paper.pii
                record["pubmed_id"] = paper.pubmed_id
                record["titulo"] = paper.title
                record["resumo"] = paper.abstract
                record["descricao"] = paper.description
                record["data_publicacao"] = datetime.strptime(paper.coverDate, "%Y-%m-%d").date() \
                    if paper.coverDate else None
                record["numero_citacao"] = paper.citedby_count
                record["idioma"] = paper.language
                record["tipo_publicacao"] = paper.aggregationType
                record["tipo_fonte"] = paper.srctype
                record["palavras_chaves"] = tuple(paper.authkeywords) if paper.authkeywords else None
                record["termos_indice"] = tuple(paper.idxterms) if paper.idxterms else None
                record["issn"] = paper.issn
                try:
                    record["isbn"] = " ".join(paper.isbn) if type(paper.isbn) == tuple else paper.isbn
                except TypeError:
                    record["isbn"] = None

                # Conference and/or journal information.
                record["conf_loc"] = paper.conflocation
                record["conferencia_nome"] = paper.confname
                record["revista_nome"] = paper.publicationName
                record["revista_ender"] = paper.publisheraddress
                record["titulo_ed"] = paper.issuetitle
                record["publis"] = paper.publisher

                # Affiliation information.
                record["affiliacoes"] = tuple([
                    {"id": affil.id if affil and affil.id else None,
                     "affiliacao": affil.name if affil and affil.name else None,
                     "pais": affil.country if affil and affil.country else None}
                    for affil in paper.affiliation]) if paper.affiliation else None

                # Author information.
                record["autores"] = tuple(
                    [{"id": author.auid if author and author.auid else None,
                      "nome": "{} {}".format(author.given_name, author.surname)
                          if author and author.given_name and author.surname else None}
                     for author in paper.authors]) if paper.authors else None
                record["autores_affil"] = tuple(
                    [{"id": author.auid if author and author.auid else None,
                      "nome": "{} {}".format(author.given_name, author.surname)
                          if author and author.given_name and author.surname else None,
                      "affil_id": author.affiliation_id if author and author.affiliation_id else None,
                      "affiliacao": author.organization if author and author.organization else None,
                      "pais": author.country if author and author.country else None}
                     for author in paper.authorgroup]) if paper.authorgroup else None

                # Reference information.
                record["ref_count"] = paper.refcount if paper.refcount else None
                record["references"] = tuple([
                    {"id": ref.id if ref and ref.id else None,
                     "titulo": ref.title if ref and ref.title else None,
                     "doi": ref.doi if ref and ref.doi else None,
                     "autores": ref.authors if ref and ref.authors else None}
                    for ref in paper.references]) if paper.references else None
            except Scopus404Error:
                record["id"] = key
                print(key)
                error = False
            except Scopus429Error:
                config["Authentication"]["APIKey"] = _keys.pop()
        data.append(record)

    df = pd.DataFrame(data)
    return df
def lookup():
    search = input('Enter Search Terms\n')
    option = input('Enter 1 for Exact search, 0 for inexact search\n')
    if option == '1':
        query = '{' + search + '}'                  # exact search
    else:
        query = 'TITLE-ABS-KEY( ' + search + ')'    # inexact search
    s = ScopusSearch(query, download=False)
    print('Number of results: ')
    length = s.get_results_size()
    print(length)
    if length > 0:
        dl = input('Would you like to download the results y/n\n')
        if dl == 'y':
            s = ScopusSearch(query, download=True)
            # converts results into a dataframe
            dataframe = pd.DataFrame(pd.DataFrame(s.results))
            pd.options.display.max_colwidth = 150
            pd.options.display.max_rows = None
            print(dataframe[['eid', 'title']])
            # converts the eid dataframe objects to string
            dataframe.iloc[:, 0] = dataframe.iloc[:, 0].astype(str)
            option2 = input(
                '\n Enter the row of the abstract you want to download, or enter ALL to download all\n'
            )
            if option2 == 'ALL':
                for i in progressbar(range(length), "Download Progress ", 40):
                    # searches for abstracts using eid
                    ab = AbstractRetrieval(dataframe.iloc[i, 0], view='FULL')
                    # creates individual txt files titled by their eid
                    with open(os.path.join('/home/benjamin/Python_Codes/Abstracts',
                                           dataframe.iloc[i, 0] + '.txt'), 'w') as f:
                        f.write("%s\n" % ab.abstract)
            else:
                try:
                    val = int(option2)
                    print('Attempting to download abstract with eid ' + dataframe.iloc[val, 0])
                    # searches for abstracts using eid
                    ab = AbstractRetrieval(dataframe.iloc[val, 0], view='FULL')
                    with open(os.path.join('/home/benjamin/Python_Codes/Abstracts',
                                           dataframe.iloc[val, 0] + '.txt'), 'w') as f:
                        f.write("%s\n" % ab.abstract)
                    print('Success!\n')
                except ValueError:
                    print('Invalid row number\n')
    else:
        print('No results found, please try again\n')
print(fullres.scopus_abstract_retries.mean())
print(fullres.scopus_abstract_retries.max())

qq = 1
qq = qq + 1
print('warp')

if False:
    lst = []
    from pybliometrics.scopus import AbstractRetrieval
    t0 = time.time()
    for ii in np.arange(0, 10):
        cur_eid = df.loc[ii, 'eid']
        minires = AbstractRetrieval(identifier=cur_eid, view='FULL',
                                    refresh=True, id_type='eid')
        try:
            qq = minires.authorgroup
            lst.append(1)
        except:
            lst.append(0)
    print(lst)
    t1 = time.time()
    print('expected single-thread time cost per record is: ' + str((t1 - t0) / 10.0))
    # we expect 121 seconds cost for 100 entries

print('done')

# it only took 20 seconds to do 1000 records
# that is 50 per second
# or a speed increase of factor 50x !
        return None

    print(author)
    #orcid = getattr(author, 'orcid')
    try:
        orcid = getattr(author, 'orcid')
    except:
        print('exception trying to get authors orcid')
        return None
    print('ORCID: ', orcid)
    return None   # Remove this temporarily
    return orcid


for index, eid in enumerate(eids):
    item_from_scopus = AbstractRetrieval(eid, id_type='eid', view='FULL')
    #print(abstract)
    #print(abstract.abstract)
    #print(eid)
    print(item_from_scopus.__dict__.keys())
    doi = item_from_scopus.doi

    root = et.Element('dublin_core', schema='dc')
    # TODO generate this automatically
    xmls = {
        'dc': et.Element('dublin_core', schema='dc'),
        'local': et.Element('dublin_core', schema='local')
    }
import pandas as pd
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval, AuthorRetrieval, ContentAffiliationRetrieval
import networkx as nx

nodes = pd.DataFrame()
edges = []
gen = {}

Acquisti = "10.1257/jel.54.2.442"
ab = AbstractRetrieval(Acquisti, view="FULL")

nodes = pd.DataFrame()
nodes = nodes.append({"id": "",
                      "title": ab.title,
                      "sourcetitle": ab.sourcetitle_abbreviation,
                      "publicationyear": ab.coverDate[0:4],
                      "eid": ab.eid,
                      "gen": '0'},
                     ignore_index=True)

ref_df = pd.DataFrame(ab.references)
ref_df["eid"] = '2-s2.0-' + ref_df['id']
ref_df['gen'] = '-1'
ref_df2 = pd.concat([ref_df['eid'], ref_df['id'], ref_df['publicationyear'],
                     ref_df['sourcetitle'], ref_df['title'], ref_df['gen']],
for author_id, full_name in AUTHORS.items():
    publication_count = 0
    search = ScopusSearch(f'AU-ID ( {author_id} )')
    logger.info('STARTING SEARCH FOR AUTHOR {}({})'.format(full_name, author_id))

    for result in search.results:
        # We'll only take publications, which have a DOI
        if result.doi is None:
            continue

        # Requesting the detailed information from the scopus database for the current publication from the
        # search results
        try:
            abstract_retrieval = AbstractRetrieval(result.doi)
            logger.info(' * FETCHED publication {}'.format(result.doi))
        except Exception as e:
            logger.error(' ! Could not retrieve scopus abstract for DOI "{}". ERROR: {}'.format(result.doi, str(e)))
            continue

        # If the publication is older than the date limit, it will be discarded
        publication_date = datetime.datetime.strptime(abstract_retrieval.coverDate, '%Y-%m-%d')
        if publication_date <= DATE_LIMIT:
            logger.info(' # TOO OLD publication {} with publishing date {}'.format(
                result.doi, abstract_retrieval.coverDate))
            continue

        adapter = ScopusPublicationAdapter(abstract_retrieval)
        publication = adapter.get_publication()

        # Filtering the authors according to the AUTHOR_LIMIT, which has been set.