def journal_numberof_first_last_authorship(author_id):
    """Count first- and last-authored journal publications of a researcher.

    Retrieves all publications of the given Scopus author, keeps only
    those whose aggregation type is 'Journal', and counts in how many of
    them the author appears first, respectively last, in the author list.

    Parameters
    ----------
    author_id : int or str
        Scopus ID of the author.

    Returns
    -------
    tuple of int
        ``(n_first, n_last)`` — the number of journal publications where
        the researcher is first author and last author, respectively.
    """
    assert isinstance(author_id, (str, int))
    # BUG FIX: the 'author_ids' column is a string, so the ID must be a
    # string too — .str.startswith/.str.endswith raise TypeError for ints.
    author = str(author_id)

    au = AuthorRetrieval(author_id)
    publications = pd.DataFrame(au.get_document_eids(refresh=False))
    articles = publications[publications['aggregationType'] == 'Journal']

    first = articles[articles['author_ids'].str.startswith(author)]
    last = articles[articles['author_ids'].str.endswith(author)]

    n_first, n_last = len(first), len(last)

    return (n_first, n_last)
def author_publication(author_id):
    """Return first- and last-authored journal publications of a researcher.

    Retrieves all publications of the given Scopus author, keeps only the
    journal publications, and splits them into those where the author is
    first in the author list and those where they are last.  A 'year'
    column (first four characters of 'coverDate') is added to both frames.

    Parameters
    ----------
    author_id : int or str
        Scopus ID of the author.

    Returns
    -------
    first, last : pandas.DataFrame
        Journal publications with the researcher as first author and as
        last author, respectively, each with an added 'year' column.
    """
    assert isinstance(author_id, (str, int))
    # The 'author_ids' column is a string, so compare against a str ID.
    author = str(author_id) if not isinstance(author_id, str) else author_id

    au = AuthorRetrieval(author_id)
    publications = pd.DataFrame(au.get_documents(refresh=False))
    journal_publications = publications[publications['aggregationType'] ==
                                        'Journal']
    # BUG FIX: .copy() so the 'year' assignments below operate on real
    # frames instead of slices (avoids SettingWithCopyWarning and
    # potentially lost writes).
    first = journal_publications[
        journal_publications['author_ids'].str.startswith(author)].copy()
    last = journal_publications[
        journal_publications['author_ids'].str.endswith(author)].copy()

    first['year'] = first['coverDate'].str[:4]
    last['year'] = last['coverDate'].str[:4]

    return first, last
def author_subject_area(SCOPUS_IDs):
    """Retrieve subject categories for a batch of Scopus authors.

    For every author ID, the author profile is fetched from Scopus, its
    subject categories are mapped to a primary theme via ``main_theme``,
    and the collected fields become one row of the resulting dataframe.

    Parameters
    ----------
    SCOPUS_IDs : list or tuple
        Scopus author IDs to look up.

    Returns
    -------
    df : pandas.DataFrame
        One row per author with name, organisation, theme and
        subject-category information.
    """
    assert isinstance(SCOPUS_IDs, (list, tuple))

    records = defaultdict(list)

    for scopus_author_id in SCOPUS_IDs:
        records['SCOPUS_ID'].append(scopus_author_id)
        # Fetch the author profile from Scopus.
        retrieved = AuthorRetrieval(scopus_author_id)
        subjects, documents = zip(*retrieved.categories)
        primary_theme, result = main_theme(subjects)
        records['Name'].append(retrieved.name)
        records['organisation'].append(retrieved.current_affiliation)
        records['Main_theme'].append(primary_theme)
        records['Alternative_theme'].append(' ')
        records['Result'].append(result)
        records['Subject_category'].append(subjects)
        records['document_number'].append(documents)

    # One column per collected key; rows follow the input order.
    return pd.DataFrame.from_dict(records)
Exemple #4
0
def main():
    """Look up researchers' publication and affiliation data from Scopus.

    Reads the node list, queries each researcher's documents and author
    profile, and writes two CSV files (profile data and publication lists)
    to TARGET_FOLDER.  Researchers without usable publication data are
    collected and reported at the end.
    """
    scopus_nodes = read_nodes()
    print(f">>> Looking up {len(scopus_nodes):,} researchers")

    # Parse publication lists
    pubs = {}
    data = {}
    missing = []
    for node in tqdm(scopus_nodes):
        # Document information
        eids, sources, years, coauthors, affs = perform_query(node)
        if not eids or not sources or not years:
            missing.append(node)
            continue
        sources = [s or "-" for s in sources]  # Replace missing journal names
        # Author information
        au = AuthorRetrieval(node, refresh=200)
        try:
            fields = [f.abbreviation for f in au.subject_areas if f]
        except Exception:
            fields = []
        # BUG FIX: aff_type was left unbound when both retrieval attempts
        # failed, raising NameError below (or silently reusing the value
        # from the previous node).  Default to None instead.
        aff_type = None
        try:
            aff_type = get_aff_type(au.affiliation_current)
        except Exception:
            # Retry with a more aggressive refresh before giving up.
            au = AuthorRetrieval(node, refresh=10)
            try:
                aff_type = get_aff_type(au.affiliation_current)
            except Exception:
                pass
        # Add to storage
        data[node] = {"current_aff_type": aff_type, "fields": "|".join(fields)}
        pubs[node] = {
            "eids": "|".join(eids),
            "sources": "|".join(sources),
            "years": "|".join(years),
            "aff_ids": "|".join(affs),
            "coauthors": "|".join(coauthors)
        }
    if missing:
        print(f">>> {len(missing)} researchers w/o research publication "
              f"before {MAX_YEAR}:\n{','.join(missing)}")

    # Write out
    data = pd.DataFrame(data).T.sort_index()
    data.to_csv(TARGET_FOLDER + "data.csv", index_label="scopus_id")
    pubs = pd.DataFrame(pubs).T.sort_index()
    pubs.to_csv(TARGET_FOLDER + "pub_list.csv", index_label="scopus_id")
Exemple #5
0
def test_warning_with_forwarding():
    """Accessing .identifier for ID 57191449583 emits exactly one
    UserWarning mentioning that ID, and the identifier resolves to
    36854449200 (presumably a merged/forwarded profile — the old ID
    redirects to the new one)."""
    au = AuthorRetrieval("57191449583", refresh=False)
    with warnings.catch_warnings(record=True) as w:
        auth_id = au.identifier
        assert_equal(len(w), 1)
        assert_true(issubclass(w[-1].category, UserWarning))
        assert_true("57191449583" in str(w[-1].message))
    assert_equal(auth_id, 36854449200)
def make_name(s, euclid=True):
    """Format name for display: Lastname, Initial (euclid)."""
    author = AuthorRetrieval(s.name, refresh=False)
    # One "X." initial per given-name token.
    initials = " ".join(part[0] + "." for part in author.given_name.split())
    surname = author.surname.replace("*", "")
    label = f"{surname}, {initials}"
    if euclid:
        label += f"({s.euclid:.1f})"
    return label
Exemple #7
0
    def get_by_eid(self, eid):
        """Return the h-index of the author with the given EID.

        Returns None when no author matches the AU-ID query; asserts
        that at most one author matches.
        """
        search = AuthorSearch('AU-ID({})'.format(eid))
        n_hits = search.get_results_size()
        if not n_hits:
            return None

        assert n_hits == 1
        return AuthorRetrieval(search.authors[0][0]).h_index
Exemple #8
0
def scopus_author(scopus_id):
    '''Helper function to invoke the Scopus Author API, download the
    author contents from the Scopus database and return the author object.

    Parameter
    ----------
    scopus_id : str or int
        Scopus ID of the author.

    Return
    ----------
    author : AuthorRetrieval
        The retrieved Scopus author object.'''
    assert isinstance(scopus_id, (str, int))

    # Retrieve the author object from the Scopus database
    author = AuthorRetrieval(scopus_id)

    return author
def GetOrcidFromScopusID(scopus_id):
    """Return the ORCID iD of a Scopus author, or None on failure.

    Parameters
    ----------
    scopus_id : str or int
        Scopus ID of the author.

    Returns
    -------
    str or None
        The author's ORCID iD, or None when the author cannot be
        retrieved or the ORCID attribute is unavailable.
    """
    try:
        author = AuthorRetrieval(scopus_id)
    except Exception:  # narrowed from bare except; keep best-effort contract
        print('exception trying to get author')
        return None
    print(author)

    try:
        orcid = getattr(author, 'orcid')
    except Exception:
        print('exception trying to get authors orcid')
        return None
    print('ORCID: ', orcid)
    # BUG FIX: a leftover debugging 'return None' (self-marked "Remove
    # this temporarily") made the function always return None even on
    # success; the ORCID is now actually returned.
    return orcid
Exemple #10
0
    def get_by_name(self, first, last):
        """Return the h-index of an author if there is only one matching,
        None if none is found, or a table with EID, affiliation, town,
        country otherwise.
        """
        search = AuthorSearch(
            'AUTHLAST({}) and AUTHFIRST({})'.format(last, first))

        if search.get_results_size() == 0:
            # No match at all.
            return [None, False]

        if search.get_results_size() == 1:
            # Unambiguous match: report the h-index directly.
            author = AuthorRetrieval(search.authors[0][0])
            return [author.h_index, False]

        # Ambiguous: build a disambiguation table instead.
        frame = pd.DataFrame(search.authors)
        table = []
        for eid, affiliation, city, country in zip(
                frame['eid'], frame['affiliation'], frame['city'],
                frame['country']):
            # Keep only the numeric tail of the EID.
            table.append([eid.split('-')[-1], affiliation, city, country])
        return [table, True]
def search_scopus(query, docs=None, retrieve_orcid=True):
    """Search Scopus and return the matching documents.

    Runs the given Scopus query, retrieves the full record of every hit
    and converts it into a Document (authors, affiliations, references,
    language, citation count).  Author ORCIDs and affiliation records are
    cached so each is fetched at most once per call.

    Parameters
    ----------
    query : str
        Scopus search query.
    docs : DocumentSet, optional
        Existing set to merge the results into.
    retrieve_orcid : bool
        Whether to look up each author's ORCID (one extra request per
        distinct author).

    Returns
    -------
    DocumentSet or None
        The retrieved documents (merged with ``docs`` if given), or None
        when the query fails, matches nothing, or a paper retrieval fails.
    """
    documents = []
    authors_cache = {}
    affiliations_cache = {}
    try:
        retrieved_paper_ids = ScopusSearch(query, view="STANDARD").get_eids()
    except ScopusQueryError:
        print("Impossible to process query \"{}\".".format(query))
        return None
    if len(retrieved_paper_ids) == 0:
        print("No matching documents for the provided query.")
        return None
    for paper_id in tqdm(retrieved_paper_ids):
        try:
            paper = AbstractRetrieval(paper_id, view="FULL")
        except ValueError:
            print("Impossible to retrieve data for paper \"{}\".".format(paper_id))
            return None
        doc_id = DocumentID()
        doc_id.parse_scopus(paper)
        authors = []
        if paper.authors:
            for author in paper.authors:
                # This list is shared with the Author object created below
                # and filled in afterwards.
                author_affiliations = []
                if retrieve_orcid:
                    # Look up (and cache) the ORCID once per distinct author.
                    if author.auid not in authors_cache:
                        authors_cache[author.auid] = \
                            AuthorRetrieval(author.auid).orcid
                    authors.append(Author(name=author.indexed_name,
                                          orcid=authors_cache[author.auid],
                                          affiliations=author_affiliations))
                else:
                    authors.append(Author(name=author.indexed_name,
                                          orcid=None,
                                          affiliations=author_affiliations))
                if author.affiliation:
                    for affiliation_id in author.affiliation:
                        if affiliation_id in affiliations_cache:
                            affiliation = affiliations_cache[affiliation_id]
                        else:
                            # BUG FIX: was a bare 'except:' which would
                            # also swallow KeyboardInterrupt/SystemExit.
                            try:
                                affiliation = ContentAffiliationRetrieval(
                                    affiliation_id)
                                affiliations_cache[affiliation_id] = affiliation
                            except Exception:
                                affiliation = None
                        if affiliation:
                            author_affiliations.append(
                                Affiliation(name=affiliation.affiliation_name,
                                            city=affiliation.city,
                                            country=affiliation.country))
        # Collect titled references only.
        references = []
        if paper.refcount and int(paper.refcount) > 0 and paper.references:
            for reference in paper.references:
                if reference.title:
                    references.append(reference.title)
        # Map the ISO 639-2/B language code to a language name, if any.
        language = None
        if paper.language:
            try:
                language = iso639.languages.get(part2b=paper.language).name
            except KeyError:
                language = None

        document = Document(id=doc_id,
                            title=paper.title,
                            keywords=paper.authkeywords,
                            abstract=paper.description,
                            source=paper.publicationName,
                            source_type=paper.aggregationType,
                            language=language,
                            year=int(paper.coverDate.split("-")[0]),
                            authors=authors,
                            references=references,
                            publisher=paper.publisher,
                            internal=paper)
        if paper.citedby_count:
            document.citation_count = int(paper.citedby_count)
        documents.append(document)
    if docs:
        return DocumentSet(docs=documents).union(docs)
    return DocumentSet(docs=documents)
Exemple #12
0
import datetime
import pandas as pd
import requests
import re
from googlesearch import search
from pybliometrics.scopus import AuthorRetrieval, ContentAffiliationRetrieval, config
from pybliometrics.scopus.exception import Scopus429Error
import time

# replace the 'x' with your values
API_KEY = 'xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx'
Author_id = 'xxxxxxxxxxx'

# get my publications
try:
    Documents = AuthorRetrieval(Author_id).get_documents()
except Scopus429Error:
    # first time pybliometrics is run, it sets the API_KEY
    config["Authentication"]["APIKey"] = API_KEY
    Documents = AuthorRetrieval(Author_id).get_documents()

# How many googled pages to scrape looking for email addresses
num_url_search_email = 5

# today's date
today = datetime.datetime.now()

# prepare a dataframe to hold the results
data = pd.DataFrame(columns=[
    'Name', 'Organizational Affiliation', 'Optional  (email, Department)',
    'Last Active'
    return auth_id


# Match each surname in the dataframe to a Scopus author ID and score the
# match by publication overlap.  Relies on module-level names df, np, data
# and eid_authorid (defined elsewhere in this file).
for author, _ in df.groupby('CONTACT_SURNAME'):
    researcher = df.loc[(df['CONTACT_SURNAME']==author)]
    eids = researcher['SCOPUS_ID'].tolist()
    # Drop missing publication IDs and normalise the rest to strings.
    scopus_eids = [str(eid) for eid in eids if eid is not np.nan]
    row = {'Author': author, 'eids': scopus_eids}
    
    # Find the author's Scopus ID from the co-author list of the first
    # publication: invoke the SciVal abstract API via eid_authorid.
    try:
        researchers = eid_authorid(row['eids'][0])
        author_scopus_id = researchers[row['Author']]
        # Call the Scopus Author API and get publication EIDs for the author
        au = AuthorRetrieval(author_scopus_id)
    
    
        # Retrieve all publications of the retrieved author
        pubs = au.get_document_eids(refresh=False, cursor=False)
    # Get the subset matching both Scopus and the central publication repo
        papers = set(pubs)
        match_publications = papers.intersection(row['eids'])
    
    # Validation score: fraction of this author's known EIDs found in Scopus
        match_score = len(match_publications)/len(row['eids'])
    
        data.loc[data['CONTACT_SURNAME']==author,'Scopus_id'] = author_scopus_id
        data.loc[data['CONTACT_SURNAME']==author,'publication_score'] = match_score 
    except:
        # NOTE(review): bare except silently skips an author on ANY failure
        # (empty eids list, lookup/network error) — consider narrowing.
        continue
Exemple #14
0
def test_warning_without_forwarding():
    """Instantiating AuthorRetrieval for ID 24079538400 with refresh=False
    emits exactly one UserWarning mentioning the requested ID."""
    with warnings.catch_warnings(record=True) as w:
        au = AuthorRetrieval("24079538400", refresh=False)
        assert_equal(len(w), 1)
        assert_true(issubclass(w[-1].category, UserWarning))
        assert_true("24079538400" in str(w[-1].message))
Exemple #15
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `scopus.AuthorRetrieval` module."""

import warnings
from collections import Counter, namedtuple
from nose.tools import assert_equal, assert_true

from pybliometrics.scopus import AuthorRetrieval

warnings.simplefilter("always")

metrics = AuthorRetrieval("7004212771", refresh=30, view="METRICS")
light = AuthorRetrieval("7004212771", refresh=30, view="LIGHT")
standard = AuthorRetrieval("7004212771", refresh=30, view="STANDARD")
enhanced = AuthorRetrieval("7004212771", refresh=30, view="ENHANCED")


def test_affiliation_current():
    assert_equal(metrics.affiliation_current, None)
    assert_equal(light.affiliation_current, None)
    order = 'id parent type relationship afdispname preferred_name '\
            'parent_preferred_name country_code country address_part city '\
            'state postal_code org_domain org_URL'
    aff = namedtuple('Affiliation', order)
    expected = aff(id=110785688,
                   parent=60027950,
                   type='dept',
                   relationship='author',
                   afdispname=None,
                   country='United States',
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `scopus.AuthorRetrieval` module."""

import warnings
from collections import Counter
from nose.tools import assert_equal, assert_true

from pybliometrics.scopus import AuthorRetrieval

warnings.simplefilter("always")

au = AuthorRetrieval("7004212771", refresh=True)


def test_affiliation_current():
    """The author's current affiliation is the expected Scopus ID string."""
    assert_equal(au.affiliation_current, '110785688')


def test_affiliation_history():
    """The affiliation history has at least 5 entries, given as strings."""
    affs = au.affiliation_history
    assert_true(len(affs) >= 5)
    assert_true(isinstance(affs[0], str))


def test_citation_count():
    """Citation count only grows over time; 7584 was the value when
    this test was written."""
    assert_true(int(au.citation_count) >= 7584)


def test_cited_by_count():
    """Cited-by count only grows over time; 6066 was the value when
    this test was written."""
    assert_true(int(au.cited_by_count) >= 6066)
 def test_author(self):
     """Retrieving author 7005789553 yields given name 'Sean'."""
     self.assertEqual(
         AuthorRetrieval('7005789553', refresh=True).given_name, 'Sean')
Exemple #18
0
            for line in infile:
                tokens = line.rstrip().split(";")
                if len(tokens) == 4 and tokens[3]:
                    initials = []
                    for firstname in tokens[1].split(" "):
                        initials.append(firstname[0] + ".")

                    hindex = 0
                    if args.fake:
                        hindex = int(random.expovariate(1 / 20.0))
                    else:
                        au = AuthorSearch(f"AU-ID({tokens[3]})")
                        if au.get_results_size() > 0:
                            assert au.get_results_size() == 1
                            hindex = int(
                                AuthorRetrieval(au.authors[0][0]).h_index)

                    fullname = tokens[0] + " " + " ".join(initials)
                    largest_name = max(largest_name, len(fullname))
                    largest_title = max(largest_title, len(tokens[2]))

                    names.append([fullname, tokens[2], hindex])

        for name in sorted(names, key=itemgetter(2), reverse=True):
            print(
                f'{name[0] + " " * (largest_name - len(name[0]))} ({name[1]}) {" " * (largest_title - len(name[1]))} {"▇" * name[2]} {name[2]}'
            )

    except Exception as err:
        print(f"Error: {err}")
            authors = authorSearch.authors
        if authors == None:
            print("no result with third")
            fisrtLetter = given_name[0]
            given_name = fisrtLetter
            authorSearch = AuthorSearch('AUTHLAST(' + family_name +
                                        ') and AUTHFIRST(' + given_name +
                                        ') and AFFIL(University)')
            authors = authorSearch.authors
        if authors == None:
            print("no result")
            continue
        author = authors[0]
        print(author[0])

        authorRetrieval = AuthorRetrieval(author[0])
        eid = authorRetrieval.eid
        first_name = authorRetrieval.given_name
        last_name = authorRetrieval.surname
        docs = ','.join(authorRetrieval.get_document_eids())
        citation_count = authorRetrieval.citation_count
        document_count = authorRetrieval.document_count
        orcid = authorRetrieval.orcid
        name_variants = authorRetrieval.name_variants
        coauthors = authorRetrieval.get_coauthors
        coauthors_count = authorRetrieval.coauthor_count
        new_row = [
            first_name, last_name, eid, orcid, citation_count, document_count,
            name_variants, docs, coauthors_count
        ]
        csvData.append(new_row)
 def test_publication_range(self):
     """The publication range starts in 1985 and its end year keeps
     moving forward, so only a lower bound is asserted."""
     self.assertEqual(
         AuthorRetrieval('7005789553').publication_range[0], '1985')
     self.assertTrue(
         int(AuthorRetrieval('7005789553').publication_range[1]) >= 2018)
 def test_orcid(self):
     """The author's ORCID iD matches the known value."""
     self.assertEqual(
         AuthorRetrieval('7005789553').orcid, '0000-0001-6072-8309')
 def test_date_created(self):
     """The profile creation date is the (year, month, day) tuple
     (2005, 12, 3)."""
     self.assertEqual(
         AuthorRetrieval('7005789553').date_created, (2005, 12, 3))