def journal_numberof_first_last_authorship(author_id):
    """Count journal publications where the author is first or last author.

    Parameters
    ----------
    author_id : int or str
        Scopus ID of the author.

    Returns
    -------
    (n_first, n_last) : tuple of int
        Number of journal articles with the author in first / last position.
    """
    if not isinstance(author_id, (str, int)):
        raise TypeError("author_id must be str or int")
    # BUG FIX: str.startswith/endswith require a string argument, but Scopus
    # IDs are often passed as int — coerce once up front.
    author = str(author_id)
    au = AuthorRetrieval(author_id)
    # BUG FIX: use get_documents() (full records) instead of
    # get_document_eids() — bare EIDs carry neither 'aggregationType' nor
    # 'author_ids', which the filters below rely on (matches the sibling
    # author_publication() helper).
    publications = pd.DataFrame(au.get_documents(refresh=False))
    articles = publications[publications['aggregationType'] == 'Journal']
    first = articles[articles['author_ids'].str.startswith(author)]
    last = articles[articles['author_ids'].str.endswith(author)]
    return len(first), len(last)
def author_publication(author_id):
    """Return journal publications where the author is first or last author.

    Parameters
    ----------
    author_id : int or str
        Scopus ID of the author.

    Returns
    -------
    first, last : pandas.DataFrame
        Journal publications with the author in first / last position,
        each with an added 'year' column derived from 'coverDate'.
    """
    if not isinstance(author_id, (str, int)):
        raise TypeError("author_id must be str or int")
    # startswith/endswith need a string; isinstance replaces the
    # non-idiomatic `type(author_id) != str` check.
    author = str(author_id)
    au = AuthorRetrieval(author_id)
    publications = pd.DataFrame(au.get_documents(refresh=False))
    journal_publications = publications[
        publications['aggregationType'] == 'Journal']
    # BUG FIX: .copy() so the 'year' assignments below modify independent
    # frames instead of views (avoids pandas SettingWithCopyWarning and
    # potentially silent no-op writes).
    first = journal_publications[
        journal_publications['author_ids'].str.startswith(author)].copy()
    last = journal_publications[
        journal_publications['author_ids'].str.endswith(author)].copy()
    first['year'] = first['coverDate'].str[:4]
    last['year'] = last['coverDate'].str[:4]
    return first, last
def author_subject_area(SCOPUS_IDs):
    """Retrieve Scopus subject categories for a batch of authors.

    Each author ID is looked up in Scopus and its subject categories are
    mapped (via ``main_theme``) to a primary theme for later analysis.

    Parameters
    ----------
    SCOPUS_IDs : list or tuple
        Scopus author IDs to look up.

    Returns
    -------
    df : obj
        pandas dataframe object, one row per author with name,
        affiliation, themes and subject categories.
    """
    assert isinstance(SCOPUS_IDs, (list, tuple))
    records = defaultdict(list)
    for author in SCOPUS_IDs:
        records['SCOPUS_ID'].append(author)
        # Fetch the author profile from Scopus
        retrieved = AuthorRetrieval(author)
        subjects, documents = zip(*retrieved.categories)
        primary_theme, result = main_theme(subjects)
        records['Name'].append(retrieved.name)
        records['organisation'].append(retrieved.current_affiliation)
        records['Main_theme'].append(primary_theme)
        records['Alternative_theme'].append(' ')
        records['Result'].append(result)
        records['Subject_category'].append(subjects)
        records['document_number'].append(documents)
    # Columns for each 'Theme' (subjects transformed to match 'Theme')
    # could be added here.
    df = pd.DataFrame.from_dict(records)
    return df
def main():
    """Collect publication lists and profile data for all researchers and
    write them to data.csv / pub_list.csv under TARGET_FOLDER."""
    scopus_nodes = read_nodes()
    print(f">>> Looking up {len(scopus_nodes):,} researchers")
    # Parse publication lists
    pubs = {}
    data = {}
    missing = []
    for node in tqdm(scopus_nodes):
        # Document information
        eids, sources, years, coauthors, affs = perform_query(node)
        if not eids or not sources or not years:
            missing.append(node)
            continue
        sources = [s or "-" for s in sources]  # Replace missing journal names
        # Author information
        au = AuthorRetrieval(node, refresh=200)
        try:
            fields = [f.abbreviation for f in au.subject_areas if f]
        except Exception:
            fields = []
        # BUG FIX: aff_type used to stay unbound (or keep a stale value from
        # the previous iteration) when both retrieval attempts failed,
        # causing a NameError / wrong data below — default to None.
        aff_type = None
        try:
            aff_type = get_aff_type(au.affiliation_current)
        except Exception:
            # Retry with a fresher cache before giving up.
            au = AuthorRetrieval(node, refresh=10)
            try:
                aff_type = get_aff_type(au.affiliation_current)
            except Exception:
                pass
        # Add to storage
        data[node] = {"current_aff_type": aff_type,
                      "fields": "|".join(fields)}
        pubs[node] = {
            "eids": "|".join(eids),
            "sources": "|".join(sources),
            "years": "|".join(years),
            "aff_ids": "|".join(affs),
            "coauthors": "|".join(coauthors)
        }
    if missing:
        print(f">>> {len(missing)} researchers w/o research publication "
              f"before {MAX_YEAR}:\n{','.join(missing)}")
    # Write out
    data = pd.DataFrame(data).T.sort_index()
    data.to_csv(TARGET_FOLDER + "data.csv", index_label="scopus_id")
    pubs = pd.DataFrame(pubs).T.sort_index()
    pubs.to_csv(TARGET_FOLDER + "pub_list.csv", index_label="scopus_id")
def test_warning_with_forwarding():
    """Accessing a forwarded profile's identifier raises a UserWarning
    that names the old ID, and the new ID is returned."""
    au = AuthorRetrieval("57191449583", refresh=False)
    with warnings.catch_warnings(record=True) as caught:
        auth_id = au.identifier
        assert_equal(len(caught), 1)
        warning = caught[-1]
        assert_true(issubclass(warning.category, UserWarning))
        assert_true("57191449583" in str(warning.message))
        assert_equal(auth_id, 36854449200)
def make_name(s, euclid=True):
    """Format name for display: Lastname, Initial (euclid)."""
    au = AuthorRetrieval(s.name, refresh=False)
    # One dotted initial per given-name part.
    initials = " ".join(part[0] + "." for part in au.given_name.split())
    surname = au.surname.replace("*", "")
    label = ", ".join([surname, initials])
    if euclid:
        label += f"({s.euclid:.1f})"
    return label
def get_by_eid(self, eid):
    """Return the h-index of an author by a EID if found, None otherwise.
    """
    search = AuthorSearch('AU-ID({})'.format(eid))
    hits = search.get_results_size()
    if hits == 0:
        return None
    # An AU-ID query should match exactly one profile.
    assert hits == 1
    return AuthorRetrieval(search.authors[0][0]).h_index
def scopus_author(scopus_id):
    """Fetch the author object from the SCOPUS database.

    Parameter
    ----------
    scopus_id : str or int

    Return
    ----------
    author : Scopus Author object
    """
    assert isinstance(scopus_id, (str, int))
    # Download the author contents from SCOPUS.
    return AuthorRetrieval(scopus_id)
def GetOrcidFromScopusID(scopus_id):
    """Look up the ORCID for a Scopus author ID.

    Parameters
    ----------
    scopus_id : str or int
        Scopus author identifier.

    Returns
    -------
    orcid : str or None
        The author's ORCID, or None if the author or the ORCID attribute
        could not be retrieved.
    """
    try:
        author = AuthorRetrieval(scopus_id)
    # Narrowed from a bare 'except:' so SystemExit/KeyboardInterrupt
    # are not swallowed.
    except Exception:
        print('exception trying to get author')
        return None
    print(author)
    try:
        orcid = author.orcid
    except Exception:
        print('exception trying to get authors orcid')
        return None
    print('ORCID: ', orcid)
    # BUG FIX: removed the leftover "return None  # Remove this temporarily"
    # that made the function unconditionally return None and left the line
    # below unreachable.
    return orcid
def get_by_name(self, first, last):
    """Return the h-index of an author if there is only one matching,
    None if none is found, or a table with EID, affiliation, town,
    country otherwise.
    """
    query = 'AUTHLAST({}) and AUTHFIRST({})'.format(last, first)
    search = AuthorSearch(query)
    size = search.get_results_size()
    if size == 0:
        return [None, False]
    if size == 1:
        return [AuthorRetrieval(search.authors[0][0]).h_index, False]
    # Multiple candidates: build a disambiguation table (EID tail,
    # affiliation, city, country).
    df = pd.DataFrame(search.authors)
    table = [
        [eid.split('-')[-1], aff, city, country]
        for eid, aff, city, country in zip(
            df['eid'], df['affiliation'], df['city'], df['country'])
    ]
    return [table, True]
def search_scopus(query, docs=None, retrieve_orcid=True):
    """Search Scopus and build a DocumentSet from the matching papers.

    Parameters
    ----------
    query : str
        Scopus advanced-search query.
    docs : DocumentSet, optional
        Existing document set to union the results with.
    retrieve_orcid : bool
        Whether to resolve each author's ORCID (one extra Scopus request
        per previously unseen author).

    Returns
    -------
    DocumentSet or None
        The retrieved documents (unioned with ``docs`` if given), or None
        if the query failed, matched nothing, or a paper could not be
        retrieved.
    """
    documents = []
    authors_cache = {}
    affiliations_cache = {}
    try:
        retrieved_paper_ids = ScopusSearch(query, view="STANDARD").get_eids()
    except ScopusQueryError:
        print("Impossible to process query \"{}\".".format(query))
        return None
    if len(retrieved_paper_ids) == 0:
        print("No matching documents for the provided query.")
        return None
    for paper_id in tqdm(retrieved_paper_ids):
        try:
            paper = AbstractRetrieval(paper_id, view="FULL")
        except ValueError:
            print("Impossible to retrieve data for paper \"{}\".".format(paper_id))
            return None
        doc_id = DocumentID()
        doc_id.parse_scopus(paper)
        authors = []
        if paper.authors:
            for author in paper.authors:
                author_affiliations = []
                if retrieve_orcid:
                    # Cache ORCID lookups: one Scopus request per unique
                    # author (deduplicates the two Author(...) branches of
                    # the original code).
                    if author.auid not in authors_cache:
                        authors_cache[author.auid] = \
                            AuthorRetrieval(author.auid).orcid
                    orcid = authors_cache[author.auid]
                else:
                    orcid = None
                authors.append(Author(name=author.indexed_name,
                                      orcid=orcid,
                                      affiliations=author_affiliations))
                if author.affiliation:
                    for affiliation_id in author.affiliation:
                        if affiliation_id in affiliations_cache:
                            affiliation = affiliations_cache[affiliation_id]
                        else:
                            # BUG FIX: narrowed from a bare 'except:'.
                            try:
                                affiliation = \
                                    ContentAffiliationRetrieval(affiliation_id)
                                affiliations_cache[affiliation_id] = affiliation
                            except Exception:
                                affiliation = None
                        if affiliation:
                            author_affiliations.append(
                                Affiliation(name=affiliation.affiliation_name,
                                            city=affiliation.city,
                                            country=affiliation.country))
        references = []
        if paper.refcount and int(paper.refcount) > 0 and paper.references:
            for reference in paper.references:
                if reference.title:
                    references.append(reference.title)
        if paper.language:
            try:
                language = iso639.languages.get(part2b=paper.language).name
            except KeyError:
                language = None
        else:
            language = None
        document = Document(id=doc_id,
                            title=paper.title,
                            keywords=paper.authkeywords,
                            abstract=paper.description,
                            source=paper.publicationName,
                            source_type=paper.aggregationType,
                            language=language,
                            year=int(paper.coverDate.split("-")[0]),
                            authors=authors,
                            references=references,
                            publisher=paper.publisher,
                            internal=paper)
        if paper.citedby_count:
            document.citation_count = int(paper.citedby_count)
        documents.append(document)
    if docs:
        return DocumentSet(docs=documents).union(docs)
    else:
        return DocumentSet(docs=documents)
import datetime import pandas as pd import requests import re from googlesearch import search from pybliometrics.scopus import AuthorRetrieval, ContentAffiliationRetrieval, config from pybliometrics.scopus.exception import Scopus429Error import time # replace the 'x' with your values API_KEY = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' Author_id = 'xxxxxxxxxxx' # get my publications try: Documents = AuthorRetrieval(Author_id).get_documents() except Scopus429Error: # first time pybliometrics is run, it sets the API_KEY config["Authentication"]["APIKey"] = API_KEY Documents = AuthorRetrieval(Author_id).get_documents() # How many googled pages to scrape looking for email addresses num_url_search_email = 5 # today's date today = datetime.datetime.now() # prepare a dataframe to hold the results data = pd.DataFrame(columns=[ 'Name', 'Organizational Affiliation', 'Optional (email, Department)', 'Last Active'
return auth_id for author, _ in df.groupby('CONTACT_SURNAME'): researcher = df.loc[(df['CONTACT_SURNAME']==author)] eids = researcher['SCOPUS_ID'].tolist() scopus_eids = [str(eid) for eid in eids if eid is not np.nan] row = {'Author': author, 'eids': scopus_eids} #finding the author scopus id from co-authors list of the publication #Evoke the scival abstract api get co-authors from eid_authorid function try: researchers = eid_authorid(row['eids'][0]) author_scopus_id = researchers[row['Author']] #Call Scopus Author API and get pbulications EIDs match to authors au = AuthorRetrieval(author_scopus_id) #Retrive all publications of the retive author pubs = au.get_document_eids(refresh=False, cursor=False) #Get the subset which match to SCOPUS database and Central publication repos papers = set(pubs) match_publications = papers.intersection(row['eids']) #Validation scores for the authors match_score = len(match_publications)/len(row['eids']) data.loc[data['CONTACT_SURNAME']==author,'Scopus_id'] = author_scopus_id data.loc[data['CONTACT_SURNAME']==author,'publication_score'] = match_score except: continue
def test_warning_without_forwarding():
    """Instantiating a non-forwarded profile must raise exactly one
    UserWarning that names the requested ID."""
    with warnings.catch_warnings(record=True) as caught:
        au = AuthorRetrieval("24079538400", refresh=False)
        assert_equal(len(caught), 1)
        warning = caught[-1]
        assert_true(issubclass(warning.category, UserWarning))
        assert_true("24079538400" in str(warning.message))
#!/usr/bin/env python # -*- coding: utf-8 -*- """Tests for `scopus.AuthorRetrieval` module.""" import warnings from collections import Counter, namedtuple from nose.tools import assert_equal, assert_true from pybliometrics.scopus import AuthorRetrieval warnings.simplefilter("always") metrics = AuthorRetrieval("7004212771", refresh=30, view="METRICS") light = AuthorRetrieval("7004212771", refresh=30, view="LIGHT") standard = AuthorRetrieval("7004212771", refresh=30, view="STANDARD") enhanced = AuthorRetrieval("7004212771", refresh=30, view="ENHANCED") def test_affiliation_current(): assert_equal(metrics.affiliation_current, None) assert_equal(light.affiliation_current, None) order = 'id parent type relationship afdispname preferred_name '\ 'parent_preferred_name country_code country address_part city '\ 'state postal_code org_domain org_URL' aff = namedtuple('Affiliation', order) expected = aff(id=110785688, parent=60027950, type='dept', relationship='author', afdispname=None, country='United States',
#!/usr/bin/env python # -*- coding: utf-8 -*- """Tests for `scopus.AuthorRetrieval` module.""" import warnings from collections import Counter from nose.tools import assert_equal, assert_true from pybliometrics.scopus import AuthorRetrieval warnings.simplefilter("always") au = AuthorRetrieval("7004212771", refresh=True) def test_affiliation_current(): assert_equal(au.affiliation_current, '110785688') def test_affiliation_history(): affs = au.affiliation_history assert_true(len(affs) >= 5) assert_true(isinstance(affs[0], str)) def test_citation_count(): assert_true(int(au.citation_count) >= 7584) def test_cited_by_count(): assert_true(int(au.cited_by_count) >= 6066)
def test_author(self):
    """The given name of author 7005789553 resolves to 'Sean'."""
    retrieved = AuthorRetrieval('7005789553', refresh=True)
    self.assertEqual(retrieved.given_name, 'Sean')
for line in infile: tokens = line.rstrip().split(";") if len(tokens) == 4 and tokens[3]: initials = [] for firstname in tokens[1].split(" "): initials.append(firstname[0] + ".") hindex = 0 if args.fake: hindex = int(random.expovariate(1 / 20.0)) else: au = AuthorSearch(f"AU-ID({tokens[3]})") if au.get_results_size() > 0: assert au.get_results_size() == 1 hindex = int( AuthorRetrieval(au.authors[0][0]).h_index) fullname = tokens[0] + " " + " ".join(initials) largest_name = max(largest_name, len(fullname)) largest_title = max(largest_title, len(tokens[2])) names.append([fullname, tokens[2], hindex]) for name in sorted(names, key=itemgetter(2), reverse=True): print( f'{name[0] + " " * (largest_name - len(name[0]))} ({name[1]}) {" " * (largest_title - len(name[1]))} {"▇" * name[2]} {name[2]}' ) except Exception as err: print(f"Error: {err}")
authors = authorSearch.authors if authors == None: print("no result with third") fisrtLetter = given_name[0] given_name = fisrtLetter authorSearch = AuthorSearch('AUTHLAST(' + family_name + ') and AUTHFIRST(' + given_name + ') and AFFIL(University)') authors = authorSearch.authors if authors == None: print("no result") continue author = authors[0] print(author[0]) authorRetrieval = AuthorRetrieval(author[0]) eid = authorRetrieval.eid first_name = authorRetrieval.given_name last_name = authorRetrieval.surname docs = ','.join(authorRetrieval.get_document_eids()) citation_count = authorRetrieval.citation_count document_count = authorRetrieval.document_count orcid = authorRetrieval.orcid name_variants = authorRetrieval.name_variants coauthors = authorRetrieval.get_coauthors coauthors_count = authorRetrieval.coauthor_count new_row = [ first_name, last_name, eid, orcid, citation_count, document_count, name_variants, docs, coauthors_count ] csvData.append(new_row)
def test_publication_range(self):
    """Publication range starts in 1985 and extends to at least 2018."""
    pub_range = AuthorRetrieval('7005789553').publication_range
    self.assertEqual(pub_range[0], '1985')
    self.assertTrue(int(pub_range[1]) >= 2018)
def test_orcid(self):
    """ORCID of author 7005789553 matches the known value."""
    retrieved = AuthorRetrieval('7005789553')
    self.assertEqual(retrieved.orcid, '0000-0001-6072-8309')
def test_date_created(self):
    """Profile creation date matches the known value."""
    retrieved = AuthorRetrieval('7005789553')
    self.assertEqual(retrieved.date_created, (2005, 12, 3))