Example #1
 def get_publication_languages(self, refresh=False):
     """Parse languages of published documents."""
     from json import JSONDecodeError
     from pybliometrics.scopus.exception import Scopus404Error
     langs = set()
     for eid in self._eids:
         try:
             ab = AbstractRetrieval(eid, view="FULL", refresh=refresh)
         except JSONDecodeError:
             ab = AbstractRetrieval(eid, view="FULL", refresh=True)
         except Scopus404Error:
             continue
         langs.add(ab.language)
     self._language = "; ".join(sorted(filter(None, langs)))
     return self
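A note on the pattern above: JSONDecodeError signals a corrupt cached response, which a second call with refresh=True repairs by forcing a fresh download, while Scopus404Error marks EIDs that no longer resolve and are simply skipped. A minimal sketch of the same pattern factored into a standalone helper (the name fetch_abstract is hypothetical, not part of pybliometrics):

from json import JSONDecodeError

from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus.exception import Scopus404Error


def fetch_abstract(eid, refresh=False):
    """Return an AbstractRetrieval for eid, or None if the EID is gone.

    Retries once with refresh=True when the cached response is corrupt.
    """
    try:
        return AbstractRetrieval(eid, view="FULL", refresh=refresh)
    except JSONDecodeError:
        # Corrupt cache file: force a fresh download once.
        return AbstractRetrieval(eid, view="FULL", refresh=True)
    except Scopus404Error:
        # Document is no longer available on Scopus.
        return None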
Example #2
def parse_docs(eids, refresh):
    """Find the set of references of provided articles.

    Parameters
    ----------
    eids : list of str
        Scopus Document EIDs representing documents to be considered.

    refresh : bool
        Whether to refresh the cached files if they exist, or not.

    Returns
    -------
    refs : set
        The set of Scopus Document EIDs of cited references.

    n_valid_refs : int
        The number of documents with valid reference information.
    """
    docs = []
    for eid in eids:
        try:
            docs.append(AbstractRetrieval(eid, view="FULL", refresh=refresh))
        except Scopus404Error:
            continue
    ref_lst = [ab.references for ab in docs if ab.references]
    valid_refs = len(ref_lst)
    ref_ids = [ref.id for sl in ref_lst for ref in sl]
    refs = set(filter(None, ref_ids))
    return refs, valid_refs
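A hypothetical usage sketch of parse_docs, assuming AbstractRetrieval and Scopus404Error are imported as in the surrounding examples; the EIDs are the test identifiers that also appear in Example #15:

eids = ["2-s2.0-84930616647", "2-s2.0-85040230676"]
refs, n_valid_refs = parse_docs(eids, refresh=False)
print(f"{len(refs)} unique cited references in {n_valid_refs} documents")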
Example #3
def parse_docs(eids, refresh):
    """Find abstract and references of articles published up until
    the given year, both as continuous string.

    Parameters
    ----------
    eids : list of str
        Scopus Document EIDs representing documents to be considered.

    refresh : bool
        Whether to refresh the cached files if they exist, or not.

    Returns
    -------
    t : tuple
        A tuple with four elements: The first element is a continuous string
        of Scopus Abstract EIDs representing cited references, joined on a
        blank.  The second element is the number of documents with valid
        reference information.  The third element is a continuous string of
        cleaned abstracts, joined on a blank.  The fourth element is the
        number of documents with valid abstract information.
    """
    docs = []
    for eid in eids:
        try:
            docs.append(AbstractRetrieval(eid, view="FULL", refresh=refresh))
        except Scopus404Error:
            continue
    refs = [ab.references for ab in docs if ab.references]
    valid_refs = len(refs)
    refs = " ".join([ref.id for sl in refs for ref in sl])
    absts = [clean_abstract(ab.abstract) for ab in docs if ab.abstract]
    valid_absts = len(absts)
    absts = " ".join(absts)
    return (refs, valid_refs, absts, valid_absts)
Example #4
 def get_publication_languages(self, refresh=False):
     """Parse languages of published documents."""
     langs = []
     for eid in self._eids:
         l = AbstractRetrieval(eid, view="FULL", refresh=refresh).language
         langs.append(l)
     self._language = "; ".join(sorted(list(set(filter(None, langs)))))
     return self
Example #5
 def get_publication_languages(self, refresh=False):
     """Parse languages of published documents."""
     langs = set()
     for eid in self._eids:
         try:
             ab = AbstractRetrieval(eid, view="FULL", refresh=refresh)
         except Scopus404Error:
             continue
         langs.add(ab.language)
     self._language = "; ".join(sorted(filter(None, langs)))
     return self
Example #6
def find_location(auth_ids, pubs, year, refresh):
    """Find the most common country, affiliation ID, and affiliation name
    of a scientist using her most recent publications with valid information.

    Parameters
    ----------
    auth_ids : list of str
        A list of Scopus Author Profile IDs for which the affiliation should
        be searched for.

    pubs : list of namedtuple
        The publications associated with the Author IDs as returned from a
        scopus query.

    year : int
        The year for which we would like to have the country.

    refresh : bool
        Whether to refresh all cached files or not.

    Returns
    -------
    country, affiliation_id, organization : str or None
        The country, affiliation ID, and affiliation name of the
        scientist in the year closest to the treatment year, given that the
        publications list valid information for each output. Equals None when
        no valid publications are found.
    """
    from operator import attrgetter
    # Available papers of most recent year with publications
    papers = [p for p in pubs if int(p.coverDate[:4]) <= year]
    papers = sorted(papers, key=attrgetter("coverDate"), reverse=True)
    params = {"view": "FULL", "refresh": refresh}
    # Return most recent complete information
    for p in papers:
        try:
            authgroup = AbstractRetrieval(p.eid, **params).authorgroup or []
        except Scopus404Error:
            continue
        authgroup = [
            a for a in authgroup if a.auid in auth_ids and a.country
            and a.affiliation_id and a.organization
        ]
        countries = "; ".join(sorted(set([a.country for a in authgroup])))
        aff_ids = "; ".join(sorted(set([a.affiliation_id for a in authgroup])))
        orgs = "; ".join(sorted(set([a.organization for a in authgroup])))
        if not countries and not aff_ids and not orgs:
            continue
        return countries, aff_ids, orgs
    # Return None-triple if all else fails
    return None, None, None
Example #7
    def from_identifier(id, id_type, view='FULL'):
        import shelve
        from pybliometrics.scopus import AbstractRetrieval
        from pybliometrics.scopus.exception import Scopus404Error

        with shelve.open(SCOPUS_CACHE) as cache:
            key = id + '_found'
            if cache.get(key) is False:
                raise Scopus404Error()

            try:
                result = AbstractRetrieval(id, id_type=id_type, view=view)
                return ScopusDocument(result)
            except Scopus404Error:
                cache[key] = False
                raise
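The wrapper above memoizes "not found" results in a shelve file, so repeated lookups of dead identifiers fail fast without spending API quota. A hypothetical usage sketch, assuming SCOPUS_CACHE names a writable shelve path and from_identifier is callable at module level:

from pybliometrics.scopus.exception import Scopus404Error

try:
    doc = from_identifier("10.1086/341871", id_type="doi")
except Scopus404Error:
    # A later call with the same identifier raises immediately from the
    # local cache instead of contacting Scopus again.
    doc = None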
Example #8
def eid_authorid(SCOPUS_EID):

    '''Given a document's Scopus EID, the function returns all author
    names and author Scopus IDs.

    Parameters: str SCOPUS_EID

    Returns:    dict mapping author name to author Scopus ID
    '''
    from pybliometrics.scopus import AbstractRetrieval
    ab = AbstractRetrieval(SCOPUS_EID)

    researchers = {author.given_name + ' ' + author.surname: author.auid
                   for author in ab.authors}

    return researchers
Example #9
def retrieve_abstract_try(eid, view='REF', param='references'):
    import sys
    from pybliometrics.scopus import AbstractRetrieval
    try:
        refs = AbstractRetrieval(eid, view=view)._json[param]
    except KeyError:
        print('An error occurred (1) ...')
        return 1
    except UnboundLocalError:
        print('An error occurred (2). Probably an empty eID provided? ')
        return 2
    except KeyboardInterrupt:
        sys.exit("Interrupting due to user command.")
    except Exception:
        print('An error occurred (?)...')
        return 0
    else:
        return refs
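Reading _json[param] reaches into a private attribute of AbstractRetrieval that may change between pybliometrics releases. A sketch of the same lookup through the public references property, assuming the REF view is available for the document:

from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus.exception import Scopus404Error


def retrieve_references(eid):
    """Return the parsed reference list for eid, or None on failure."""
    try:
        # references is a public property; it yields namedtuples, or None
        # when the view carries no reference data.
        return AbstractRetrieval(eid, view='REF').references
    except Scopus404Error:
        return None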
Example #10
def get_citations(dois):
    '''
    Function that translates a list of DOIs into a total citation count

    Parameters:
    dois: list of strings
        Contains all relevant DOIs, as obtained from LibXC
    '''
    # Citations from scopus using Rose, Michael E. and John R. Kitchin: 
    # "pybliometrics: Scriptable bibliometrics using a Python interface to Scopus", SoftwareX 10 (2019) 100263.
    from pybliometrics.scopus import AbstractRetrieval
    citations = 0
    for doi in dois:
        try:
            ab = AbstractRetrieval(doi)
            #print(ab.citedby_count)
            citations += ab.citedby_count
        except Exception:
            continue
    return citations
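The exception handler above treats every failure alike, so an exhausted quota looks the same as a DOI that is simply not indexed. A sketch of the same loop that distinguishes the two cases, using the standard pybliometrics exception classes:

from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus.exception import Scopus404Error, Scopus429Error


def get_citations_checked(dois):
    '''Sum citation counts, skipping unknown DOIs but surfacing quota errors.'''
    citations = 0
    for doi in dois:
        try:
            citations += AbstractRetrieval(doi).citedby_count
        except Scopus404Error:
            continue  # DOI not indexed in Scopus; skip it.
        except Scopus429Error:
            raise RuntimeError('Scopus quota exhausted') from None
    return citations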
Example #11
def find_authors(abst_path, dois_path, entries):
    """Listing all the authors who have at least one publication which
    contains at least one of the given entries

    **Parameters:**

    * `abst_path`: (str) path to abstracts file
    * `dois_path`: (str) path to the list of DOIs
    * `entries`: (list) list of strings, each string is an entry

    **Returns:**

    * `u_auids`: (list) unique author IDs of those who have published at
                least one paper that contained one of the entries
    * `au_dois`: (list) lists of DOIs of the papers of each identified
                author (same length as `u_auids`)
    """

    p = utils.MatTextProcessor()

    # domain of the search (DOIs)
    doi_list = pd.read_csv(dois_path, header=None)

    auids = []
    dois = []
    with open(abst_path, 'r', encoding='utf-8') as f:
        for i, line in enumerate(f):
            if np.any([e in line for e in entries]):
                abst = line.split(' ')
                if np.any([p.normalized_formula(e) in abst for e in entries]):
                    dois += [doi_list.iloc[i][0]]
                    doc = AbstractRetrieval(dois[-1])
                    auids += [[a.auid for a in doc.authors]]

    # unique authors and their documents
    u_auids = list(np.unique(np.array(sum(auids, []))))
    au_dois = [[dois[j] for j in range(len(dois)) if au in auids[j]]
               for au in u_auids]

    return u_auids, au_dois
Example #12
    def make_training_file(self, dois, save_dir):
        """Downloading, pre-processing and storing abstracts of a set
        of DOIs in a text file which can be later used as the training data
        for tuning models like word2vec

        Each line of the saved file corresponds to one article and shows
        its title followed by the abstract

        ** Parameters:
            * dois : *(list)* list of DOIs
            * save_dir : *(str)* directory to save the files
        """

        # list of lists (each list = one line = title + abstract)
        save_path_abst = os.path.join(save_dir, 'abstracts')
        save_path_dois = os.path.join(save_dir, 'saved_DOIs')
        save_path_misses = os.path.join(save_dir, 'missed_DOIs')
        missed_dois = []
        for doi in dois:
            try:
                r = AbstractRetrieval(doi)
                tokens = self.mat_preprocess(r.title) + self.mat_preprocess(r.description)
            except Exception:
                missed_dois.append(doi)
                with open(save_path_misses, 'a+', encoding='utf-8') as f:
                    f.write(doi+'\n')
                continue
            
            line = ' '.join(sum(tokens,[]))
            doi_line = doi
            if doi!=dois[-1]:
                line += '\n'
                doi_line += '\n'
                
            # saving the texts
            with open(save_path_abst, 'a+', encoding='utf-8') as f:
                f.write(line)
            with open(save_path_dois, 'a+') as f:
                f.write(doi_line)
Example #13
from numpy import cumsum
import pandas as pd
from pybliometrics.scopus import AbstractRetrieval, CitationOverview

def parse_abstract(pub, refresh=350):
    """Extract bibliometric information and add yearly citations."""
    # Basic bibliometric information
    s = pd.Series()
    s['title'] = pub.title
    s['eid'] = pub.eid
    pubyear = int(pub.coverDate.split("-")[0])
    s['year'] = str(pubyear)
    try:
        pages = pub.pageRange.split("-")
    except AttributeError:
        ab = AbstractRetrieval(pub.eid, view="FULL")
        pages = ab.pageRange.split("-")
    s['num_pages'] = int(pages[1]) - int(pages[0])
    s['num_auth'] = pub.author_count
    s['authors'] = pub.author_ids
    # Yearly cumulated citations
    co = CitationOverview(pub.eid, start=pubyear, end=2020, refresh=refresh)
    s['total_citations'] = sum([int(t[1]) for t in co.cc])
    lags = [f"citcount_{y-pubyear}" for y, _ in co.cc]
    citations = cumsum([int(t[1]) for t in co.cc])
    s = s.append(pd.Series(citations, index=lags))
    return s
Example #14
def complete_affiliations(paper_ids, sql_db, sql_cursor, logfile_path=None):

    logger = helpers.set_up_logger(__name__, logfile_path, False, file_mode='a')
    
    # initialize the affiliation primary key
    sql_cursor.execute('SELECT aff_id FROM affiliation;')
    all_aff_PKs = sql_cursor.fetchall()
    if len(all_aff_PKs)==0:
        aff_PK = 0
    else:
        aff_PK = max([a[0] for a in all_aff_PKs]) + 1
        
    sql_cursor.execute('SELECT aff_scopus_ID FROM affiliation;')
    curr_aff_scopus_id_list = [a[0] for a in sql_cursor.fetchall()]
    sql_cursor.execute('SELECT * FROM author_affiliation_mapping;')
    curr_author_aff_pairs = list(sql_cursor.fetchall())

    pids_array = ','.join([str(p) for p in paper_ids])
    sql_cursor.execute('SELECT doi, paper_id FROM paper WHERE paper_id IN ({});'.format(pids_array))
    RES = sql_cursor.fetchall()
    dois = [a[0] for a in RES]
    paper_ids = [a[1] for a in RES]

    dois_with_nonexisting_authors = []
    for j,doi in enumerate(dois):
        
        try:
            r = AbstractRetrieval(doi)
        except Scopus429Error:
            print('Scopus resource exhausted. Check your quota.')
            return
        except Exception:
            raise ValueError('Could not download doi {}'.format(doi))
        
        if r.authors is None:
            continue
        
        paper_scopus_id_list = [a.auid for a in r.authors]
        for i,scps_id in enumerate(paper_scopus_id_list):
            # if repetitive author, ignore:
            if scps_id in paper_scopus_id_list[:i]:
                continue

            sql_cursor.execute('SELECT author_id \
                                FROM author \
                                WHERE author_scopus_ID = {}'.format(scps_id))
            
            this_author_PK = sql_cursor.fetchall()
            if len(this_author_PK)==0:
                if doi not in dois_with_nonexisting_authors:
                    dois_with_nonexisting_authors += [doi]
                logger.info('(CASE NUMBER {}) PAPER_ID {}, DOI {}: author with scopus ID {} does not exist.'.format(306+len(dois_with_nonexisting_authors), paper_ids[j], doi, scps_id))
                continue
            else:
                this_author_PK = this_author_PK[0][0]
            
            # directly go to their affiliations
            if r.authors[i].affiliation is not None:
                author_aff_scopus_id_list = np.unique(r.authors[i].affiliation)
            else:
                author_aff_scopus_id_list = []
                
            for aff_scps_id in author_aff_scopus_id_list:
                if aff_scps_id in curr_aff_scopus_id_list:
                    sql_cursor.execute('SELECT aff_id \
                    FROM affiliation \
                    WHERE aff_scopus_ID = {}'.format(aff_scps_id))
                    this_aff_PK = sql_cursor.fetchall()[0][0]

                    # add the pair only if the author/aff. have not already
                    # been added to the mapping table
                    if (this_author_PK, this_aff_PK) not in curr_author_aff_pairs:
                        sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                            VALUES({}, {})'.format(this_author_PK,
                                                                   this_aff_PK))
                        curr_author_aff_pairs += [(this_author_PK, this_aff_PK)]
                        logger.info('{} have been added to A2A.'.format((r.authors[i].given_name,
                                                                         r.authors[i].surname,
                                                                         this_aff_PK)))
                else:
                    lcn = np.where([x.id==aff_scps_id for x in r.affiliation])[0]
                    if len(lcn)>0:
                        lcn = lcn[0]
                        aff_name = r.affiliation[lcn].name.replace('"','\\"')
                        aff_city = r.affiliation[lcn].city
                        aff_country = r.affiliation[lcn].country
                    else:
                        aff_name = 'NA'
                        aff_city = 'NA'
                        aff_country = 'NA'

                    sql_cursor.execute('INSERT INTO affiliation \
                                        VALUES({},"{}","{}","{}","{}");'.format(
                                            aff_PK,
                                            aff_scps_id,
                                            aff_name,
                                            aff_city,
                                            aff_country)
                    )
                    sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                        VALUES({}, {})'.format(this_author_PK, aff_PK))
                    curr_author_aff_pairs += [(this_author_PK, aff_PK)]
                    logger.info('{} have been added to A2A.'.format((r.authors[i].given_name,
                                                                     r.authors[i].surname,
                                                                     aff_PK)))

                    # update the affliations list
                    curr_aff_scopus_id_list += [aff_scps_id]
                    aff_PK += 1

        if not(j%1000):
            np.savetxt('/home/jamshid/codes/data/iter_inds.txt', [j])
        sql_db.commit()
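The SQL in this example is assembled with str.format, which breaks on quotes in affiliation names (hence the manual escaping above) and is open to injection. A sketch of the affiliation inserts with DB-API parameter binding instead, assuming a MySQL-style cursor (sqlite3 uses ? in place of %s):

sql_cursor.execute(
    'INSERT INTO affiliation VALUES (%s, %s, %s, %s, %s);',
    (aff_PK, aff_scps_id, aff_name, aff_city, aff_country),
)
sql_cursor.execute(
    'INSERT INTO author_affiliation_mapping VALUES (%s, %s);',
    (this_author_PK, aff_PK),
)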
Example #15
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Tests for `scopus.AbstractRetrieval` module."""

from collections import namedtuple
from nose.tools import assert_equal, assert_true

from pybliometrics.scopus import AbstractRetrieval

# Base information
ab1 = AbstractRetrieval("2-s2.0-84930616647", view="FULL", refresh=30)
# Conference proceeding and no references
ab2 = AbstractRetrieval("2-s2.0-0029486824", view="FULL", refresh=30)
# Issuetitle and no affiliation
ab3 = AbstractRetrieval("2-s2.0-0001270077", view="FULL", refresh=30)
# Author group broken and author keywords
ab4 = AbstractRetrieval("2-s2.0-0000016206", view="FULL", refresh=30)
# ISBN
ab5 = AbstractRetrieval("2-s2.0-84919546381", view="FULL", refresh=30)
# Funding, sequencebanks, chemicals
ab6 = AbstractRetrieval("2-s2.0-85040230676", view="FULL", refresh=30)
# Contributor group
ab7 = AbstractRetrieval("2-s2.0-85050253030", view="FULL", refresh=30)
# REF view
ab8 = AbstractRetrieval("2-s2.0-84951753303", view="REF", refresh=30)


def test_abstract():
    expected = 'In this paper we propose a Bayesian analysis of seasonal '\
        'unit roots in quarterly observed time series. Seasonal unit root '\
        'processes are useful to describe economic series with changing '\
Example #16
def search_scopus(query, docs=None, retrieve_orcid=True):
    """Search Scopus."""

    documents = []
    authors_cache = {}
    affiliations_cache = {}
    try:
        retrieved_paper_ids = ScopusSearch(query, view="STANDARD").get_eids()
    except ScopusQueryError:
        print("Impossible to process query \"{}\".".format(query))
        return None
    if len(retrieved_paper_ids) == 0:
        print("No matching documents for the provided query.")
        return None
    for paper_id in tqdm(retrieved_paper_ids):
        try:
            paper = AbstractRetrieval(paper_id, view="FULL")
        except ValueError:
            print("Impossible to retrieve data for paper \"{}\".".format(paper_id))
            return None
        doc_id = DocumentID()
        doc_id.parse_scopus(paper)
        authors = []
        if paper.authors:
            for author in paper.authors:
                author_affiliations = []
                if retrieve_orcid:
                    if author.auid in authors_cache:
                        authors.append(Author(name=author.indexed_name,
                                              orcid=authors_cache[author.auid],
                                              affiliations=author_affiliations))
                    else:
                        authors_cache[author.auid] = AuthorRetrieval(author.auid).orcid
                        authors.append(Author(name=author.indexed_name,
                                              orcid=authors_cache[author.auid],
                                              affiliations=author_affiliations))
                else:
                    authors.append(Author(name=author.indexed_name,
                                          orcid=None,
                                          affiliations=author_affiliations))
                if author.affiliation:
                    for affiliation_id in author.affiliation:
                        if affiliation_id in affiliations_cache:
                            affiliation = affiliations_cache[affiliation_id]
                        else:
                            try:
                                affiliation = ContentAffiliationRetrieval(affiliation_id)
                                affiliations_cache[affiliation_id] = affiliation
                            except Exception:
                                affiliation = None
                        if affiliation:
                            author_affiliations.append(Affiliation(name=affiliation.affiliation_name,
                                                               city=affiliation.city,
                                                               country=affiliation.country))
        references = []
        if paper.refcount and int(paper.refcount) > 0 and paper.references:
            for reference in paper.references:
                if reference.title:
                    references.append(reference.title)
        if paper.language:
            try:
                language = iso639.languages.get(part2b=paper.language).name
            except KeyError:
                language = None
        else:
            language = None

        document = Document(id=doc_id,
                            title=paper.title,
                            keywords=paper.authkeywords,
                            abstract=paper.description,
                            source=paper.publicationName,
                            source_type=paper.aggregationType,
                            language=language,
                            year=int(paper.coverDate.split("-")[0]),
                            authors=authors,
                            references=references,
                            publisher=paper.publisher,
                            internal=paper)
        if paper.citedby_count:
            document.citation_count = int(paper.citedby_count)
        documents.append(document)
    if docs:
        return DocumentSet(docs=documents).union(docs)
    else:
        return DocumentSet(docs=documents)
Example #17
def Scopus_to_SQLtable(dois,
                       sql_db, 
                       sql_cursor, 
                       bad_dois_save_path=None):

    # get the last primary paper/author IDs
    sql_cursor.execute('SELECT paper_id FROM paper;')
    all_paper_PKs = sql_cursor.fetchall()
    if len(all_paper_PKs)==0:
        paper_PK = 0
    else:
        paper_PK = max([a[0] for a in all_paper_PKs]) + 1

    sql_cursor.execute('SELECT author_id FROM author;')
    all_author_PKs = sql_cursor.fetchall()
    if len(all_author_PKs)==0:
        author_PK = 0
    else:
        author_PK = max([a[0] for a in all_author_PKs]) + 1

    sql_cursor.execute('SELECT aff_id FROM affiliation;')
    all_aff_PKs = sql_cursor.fetchall()
    if len(all_aff_PKs)==0:
        aff_PK = 0
    else:
        aff_PK = max([a[0] for a in all_aff_PKs]) + 1



    # all previously entered paper DOIs to avoid repetition
    sql_cursor.execute('SELECT doi FROM paper;')
    all_dois = sql_cursor.fetchall()
    all_dois = [a[0] for a in all_dois]
    # ... same for authors
    sql_cursor.execute('SELECT author_scopus_ID FROM author;')
    curr_scopus_id_list = [a[0] for a in sql_cursor.fetchall()]
    sql_cursor.execute('SELECT aff_scopus_ID FROM affiliation;')
    # ... same for affiliations
    curr_aff_scopus_id_list = [a[0] for a in sql_cursor.fetchall()]
    # ... same even for (author, affiliation) pairs, since they can be repetitive
    sql_cursor.execute('SELECT * FROM author_affiliation_mapping;')
    curr_author_aff_pairs = list(sql_cursor.fetchall())
    
    bad_dois = []
    for i,doi in enumerate(dois):
        if doi in all_dois:
            print('{} has been already entered to the database'.format(doi))
            continue

        try:
            r = AbstractRetrieval(doi)
        except Scopus429Error:
            print('Scopus resource exhausted. Check your quota.')
            return
        except Exception:
            bad_dois += [doi]
            if bad_dois_save_path is not None:
                with open(bad_dois_save_path, 'a+') as bad_f:
                    bad_f.write(doi+'\n')
            continue
            

        # ROW IN PAPER TABLE
        if r.title is not None:
            title = r.title.replace('\"','')
            title = title.replace('\\Vub\\', '|Vub|') # ad-hoc for a specific article
        else:
            title = 'NA'
        if r.description is not None:
            abst = r.description.replace('\"','')
            abst = abst.replace('\\Vub\\','|Vub|') # ad-hoc for a specific article
            abst = abst.replace('out.\\', 'out.')  # ad-hoc for a specific article
            # yet another ad-hoc
            if doi=='10.1140/epjb/e2012-30482-6':
                abst = re.sub(r'-duration(.*?), among others',
                              '-duration α, among others',abst)
        else:
            abst = 'NA'
            
        scomm = """INSERT INTO paper VALUES({},"{}","{}","{}","{}");""".format(
            paper_PK,
            r.doi,
            r.coverDate,
            title,
            abst)
        # taking care of unicode characters
        #scomm = "{}".format(scomm.encode('utf-8'))
        #scomm = scomm[2:-1].replace('\\', '\\\\')

        sql_cursor.execute(scomm)


        # ROW IN AUTHOR TABLE
        # skip the rest if no authors were available
        if r.authors is None:
            paper_PK += 1
            continue
        paper_scopus_id_list = [a.auid for a in r.authors]
        for i,scps_id in enumerate(paper_scopus_id_list):
            # if repetitive author, ignore:
            if scps_id in paper_scopus_id_list[:i]:
                continue
            
            if scps_id in curr_scopus_id_list:
                # extract existing author PK from scopus ID
                sql_cursor.execute('SELECT author_id \
                                    FROM author \
                                    WHERE author_scopus_ID = {}'.format(scps_id))
                this_author_PK = sql_cursor.fetchall()[0][0]
                sql_cursor.execute('INSERT INTO paper_author_mapping VALUES({}, {})'.format(
                    paper_PK, this_author_PK))
            else:
                # create a row for this new author
                au_given_name = r.authors[i].given_name.replace('\"','') if \
                    r.authors[i].given_name is not None else r.authors[i].given_name
                au_surname = r.authors[i].surname.replace('\"','') if \
                    r.authors[i].surname is not None else r.authors[i].surname
                
                sql_cursor.execute('INSERT INTO author \
                                    VALUES({}, "{}", "{}", "{}")'.format(
                                        author_PK,
                                        scps_id,
                                        au_given_name,
                                        au_surname)
                )
                sql_cursor.execute('INSERT INTO paper_author_mapping \
                                    VALUES({}, {})'.format(
                                        paper_PK, author_PK))
                
                # update the global authors scopus ID list
                curr_scopus_id_list += [scps_id]
                this_author_PK = author_PK  #this will be used in affiliation table
                author_PK += 1
                
            # adding affiliations
            # ---------------------
            # handling None affiliations
            if r.authors[i].affiliation is not None:
                author_aff_scopus_id_list = np.unique(r.authors[i].affiliation)
            else:
                author_aff_scopus_id_list = []
            for aff_scps_id in author_aff_scopus_id_list:
                if aff_scps_id in curr_aff_scopus_id_list:
                    sql_cursor.execute('SELECT aff_id \
                    FROM affiliation \
                    WHERE aff_scopus_ID = {}'.format(aff_scps_id))
                    this_aff_PK = sql_cursor.fetchall()[0][0]

                    # add the pair only if the author/aff. have not already
                    # been added to the mapping table
                    if (this_author_PK, this_aff_PK) not in curr_author_aff_pairs:
                        sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                            VALUES({}, {})'.format(this_author_PK,
                                                                   this_aff_PK))
                        curr_author_aff_pairs += [(this_author_PK, this_aff_PK)]
                else:
                    lcn = np.where([x.id==aff_scps_id for x in r.affiliation])[0]
                    if len(lcn)>0:
                        lcn = lcn[0]
                        aff_name = r.affiliation[lcn].name.replace('"','\\"')
                        aff_city = r.affiliation[lcn].city
                        aff_country = r.affiliation[lcn].country
                    else:
                        aff_name = 'NA'
                        aff_city = 'NA'
                        aff_country = 'NA'

                    sql_cursor.execute('INSERT INTO affiliation \
                                        VALUES({},"{}","{}","{}","{}");'.format(
                                            aff_PK,
                                            aff_scps_id,
                                            aff_name,
                                            aff_city,
                                            aff_country)
                    )
                    sql_cursor.execute('INSERT INTO author_affiliation_mapping \
                                        VALUES({}, {})'.format(this_author_PK, aff_PK))
                    curr_author_aff_pairs += [(this_author_PK, aff_PK)]
                    # update the affliations list
                    curr_aff_scopus_id_list += [aff_scps_id]
                    aff_PK += 1

        paper_PK += 1

        sql_db.commit()

    return bad_dois
Example #18
import pandas as pd
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval, AuthorRetrieval, ContentAffiliationRetrieval
import networkx as nx

nodes=pd.DataFrame()
edges=[]
gen={}

Ellison = "10.1086/341871"  
ab = AbstractRetrieval(Ellison, view="FULL")

print("\n\n===========================starting============================\n\n")

nodes=pd.DataFrame()
nodes = nodes.append({"id":"", "title": ab.title, "sourcetitle": ab.sourcetitle_abbreviation, "publicationyear": ab.coverDate[0:4], "eid": ab.eid, "gen": '0' }, ignore_index=True)
ref_df = pd.DataFrame(ab.references)
ref_df["eid"] = '2-s2.0-' + ref_df['id']
ref_df['gen'] = '-1'

ref_df2 = pd.concat([ref_df['eid'], ref_df['id'], ref_df['publicationyear'], ref_df['sourcetitle'], ref_df['title'], ref_df['gen']], axis=1, keys=['eid', 'id', 'publicationyear', 'sourcetitle', 'title', 'gen'], sort=True)
#ref_df2 = ref_df2.drop(18)
nodes = nodes.append(ref_df2, ignore_index = True, sort=True)

for row in ref_df2.itertuples():
    edges.append((row.eid, ab.eid))

len(nodes)

s = ScopusSearch(ab.eid) 
for x in s.results:
    if(x.eid not in list(nodes['eid'])):
Example #19
import pandas as pd
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval, AuthorRetrieval, ContentAffiliationRetrieval, CitationOverview
import networkx as nx
import sys

import json

nodes = pd.DataFrame()
edges = []
gen = {}
outp = []

# Papers
# identiified using the DOI

Varian = "10.1007/b104899_7"
ab = AbstractRetrieval(Varian, view="FULL")

# Paper 1

nodes = pd.DataFrame()
nodes = nodes.append(
    {
        "id": "",
        "title": ab.title,
        "sourcetitle": ab.sourcetitle_abbreviation,
        "publicationyear": ab.coverDate[0:4],
        "eid": ab.eid,
        "gen": '0'
    },
    ignore_index=True)
Example #20
            count += 1
readFile.close()
print('document count: ' + str(count))

count = 0
with open('pubs_metadata_by_scopus.csv', 'a', encoding='utf-8') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow([
        'eid', 'date', 'title', 'citedby_count', 'authors', 'venue', 'area',
        'abstract'
    ])
    for eid in doc_eids:
        print()
        print('count: ' + str(count))
        count += 1
        abstractRetrieval = AbstractRetrieval(eid)
        print('eid: ' + eid)

        # venue
        publicationName = abstractRetrieval.publicationName
        if not publicationName: continue
        print('venue: ' + publicationName)

        # sjr venue link
        venue_link = getSjrVenueLink(publicationName)
        if not venue_link: continue
        print('venue link: ' + venue_link)

        # subject area
        subject_area = getSjrSubjectArea(venue_link)
        if not subject_area: continue
Example #21
def import_scopus(ctx, verbose, start):
    """
    Import scopus publication records for the authors of the pubtrack application.

    This command will first fetch all the information about the authors, which are defined within the pubtrack app.
    It uses the scopus author ID's of these authors to send requests to the scopus database. The publications of these
    replies are then evaluated and posted into the pubtrack app.
    """
    # SETTING UP PUBTRACK WRAPPER
    config = ctx.obj['config']
    pubtrack = Pubtrack(config)

    # SETTING UP SCOPUS WRAPPER
    try:
        pybliometrics.scopus.utils.create_config()
    except FileExistsError:
        pass
    finally:
        scopus_config['Authentication']['APIKey'] = config.get_scopus_key()

    # FETCHING META AUTHOR INFORMATION FROM PUBTRACK
    click.secho('Fetching author information from pubtrack.')
    author_id_name_map = {}
    meta_authors = pubtrack.meta_author.get()['results']
    for meta_author in meta_authors:
        for author in meta_author['authors']:
            # "author_name_kitopen" returns a string with the authors name. This function essentially formats the name
            # in a way so that it can be used in a query string for the KITOpen database.
            full_name = '{} {}'.format(author['first_name'],
                                       author['last_name'])
            scopus_id = author['scopus_id']
            author_id_name_map[scopus_id] = full_name
            out(
                verbose, ' > Adding author "{} ({})" to be processed'.format(
                    full_name, scopus_id))

    click.secho('==> Processing total of {} authors'.format(
        len(author_id_name_map)))

    # QUERY SCOPUS DATABASE
    click.secho(
        'Querying scopus database for the publications of those authors.')
    date_limit = datetime.datetime(year=start, month=1, day=1)
    for author_id, author_name in author_id_name_map.items():
        publication_count = 0
        search = ScopusSearch(f'AU-ID ( {author_id} )')
        out(verbose, ' | Query "AU-ID ( {} )"'.format(author_id))

        for result in search.results:

            # We'll only take publications, which have a DOI
            if result.doi is None:
                continue

            # requesting the detailed information from the scopus database for the current publication from the search
            # results
            try:
                abstract_retrieval = AbstractRetrieval(result.doi)
            except Exception as e:
                out(verbose,
                    '   # Could not retrieve publication "{}"'.format(
                        result.doi),
                    fg='yellow')
                continue

            # If the publication is older than the date limit, it will be discarded
            publication_date = datetime.datetime.strptime(
                abstract_retrieval.coverDate, '%Y-%m-%d')
            if publication_date <= date_limit:
                out(verbose,
                    '   # Publication too old "{}"({})'.format(
                        result.doi, publication_date),
                    fg='yellow')
                continue
            else:
                out(verbose,
                    '   > Fetched publication "{}"'.format(result.doi))

            adapter = ScopusPublicationAdapter(abstract_retrieval)
            publication = adapter.get_publication()

            # Filter the authors according to the AUTHOR_LIMIT which has been set.
            # We cannot just take the first few authors: the author through whom we
            # found this publication in the first place must be included. The rest is
            # filled up with the remaining authors until the limit is reached.
            authors = []
            for author in publication['authors']:
                if author['scopus_id'] in author_id_name_map.keys(
                ) or len(authors) < config.get_author_limit():
                    authors.append(author)

            publication['authors'] = authors

            # Now we try to actually POST the publication to the pubtrack REST API
            try:
                pubtrack.import_publication(publication)
                publication_count += 1
                out(verbose,
                    '   * Added to pubtrack: "{}"'.format(
                        publication['title']),
                    fg='green')
            except Exception as e:
                if str(e) == 'uuid':
                    out(verbose,
                        '   ! Error while posting to pubtrack: Already exists!',
                        fg='red')
                else:
                    out(verbose,
                        '   ! Error while posting to pubtrack: {}'.format(
                            str(e)),
                        fg='red')
                continue

        out(True,
            ' --> Total of {} publications imported from author {}'.format(
                publication_count, author_id),
            fg='green',
            bold=True)
Example #22
def coletar_artigos(eids_documentos, api_view):
    # Initialize an empty data list; for each entry in the list of retrieved
    # articles, create a dictionary holding that article's information and
    # append it to the list.
    data = []
    for key in eids_documentos:
        record = {}
        error = True
        while error:
            try:
                paper = AbstractRetrieval(key,
                                          id_type="eid",
                                          view=api_view,
                                          refresh=True)
                error = False
                # Basic information.
                record["id"] = paper.identifier
                record["doi"] = paper.doi
                record["eid"] = paper.eid
                record["pii"] = paper.pii
                record["pubmed_id"] = paper.pubmed_id
                record["titulo"] = paper.title
                record["resumo"] = paper.abstract
                record["descricao"] = paper.description
                record["data_publicacao"] = datetime.strptime(paper.coverDate, "%Y-%m-%d").date() \
                    if paper.coverDate else None
                record["numero_citacao"] = paper.citedby_count
                record["idioma"] = paper.language
                record["tipo_publicacao"] = paper.aggregationType
                record["tipo_fonte"] = paper.srctype
                record["palavras_chaves"] = tuple(
                    paper.authkeywords) if paper.authkeywords else None
                record["termos_indice"] = tuple(
                    paper.idxterms) if paper.idxterms else None
                record["issn"] = paper.issn

                try:
                    record["isbn"] = " ".join(paper.isbn) if type(
                        paper.isbn) == tuple else paper.isbn
                except TypeError:
                    record["isbn"] = None

                # Conference and/or journal information.
                record["conf_loc"] = paper.conflocation
                record["conferencia_nome"] = paper.confname
                record["revista_nome"] = paper.publicationName
                record["revista_ender"] = paper.publisheraddress
                record["titulo_ed"] = paper.issuetitle
                record["publis"] = paper.publisher

                # Affiliation information.
                record["affiliacoes"] = tuple([
                    {
                        "id": affil.id if affil and affil.id else None,
                        "affiliacao":
                        affil.name if affil and affil.name else None,
                        "pais":
                        affil.country if affil and affil.country else None
                    } for affil in paper.affiliation
                ]) if paper.affiliation else None

                # Author information.
                record["autores"] = tuple(
                    [{"id": author.auid if author and author.auid else None,
                      "nome": "{} {}".format(author.given_name, author.surname) \
                          if author and author.given_name and author.surname else None}
                     for author in paper.authors]) if paper.authors else None

                record["autores_affil"] = tuple(
                    [{"id": author.auid if author and author.auid else None,
                      "nome": "{} {}".format(author.given_name, author.surname) \
                          if author and author.given_name and author.surname else None,
                      "affil_id": author.affiliation_id if author and author.affiliation_id else None,
                      "affiliacao": author.organization if author and author.organization else None,
                      "pais": author.country if author and author.country else None}
                     for author in paper.authorgroup]) if paper.authorgroup else None

                # Reference information.
                record["ref_count"] = paper.refcount if paper.refcount else None
                record["references"] = tuple([
                    {
                        "id": ref.id if ref and ref.id else None,
                        "titulo": ref.title if ref and ref.title else None,
                        "doi": ref.doi if ref and ref.doi else None,
                        "autores": ref.authors if ref and ref.authors else None
                    } for ref in paper.references
                ]) if paper.references else None

            except Scopus404Error:
                record["id"] = key
                print(key)
                error = False
            except Scopus429Error:
                config["Authentication"]["APIKey"] = _keys.pop()
        data.append(record)

    df = pd.DataFrame(data)

    return df
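The Scopus429Error branch above swaps in a spare API key and lets the while loop retry the same EID. A minimal sketch of that rotation pattern in isolation, assuming a pybliometrics version where the config object is importable from pybliometrics.scopus.utils and _keys holds spare keys:

from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus.exception import Scopus429Error
from pybliometrics.scopus.utils import config

_keys = ["key2", "key3"]  # spare API keys (placeholder values)


def retrieve_with_rotation(eid):
    """Retry the retrieval with the next spare key whenever the quota is hit."""
    while True:
        try:
            return AbstractRetrieval(eid, id_type="eid", view="FULL")
        except Scopus429Error:
            if not _keys:
                raise  # All keys exhausted; give up.
            config["Authentication"]["APIKey"] = _keys.pop()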
Example #23
def lookup():
    search = input('Enter Search Terms\n')
    option = input('Enter 1 for Exact search, 0 for inexact search\n')

    if option == '1':
        query = '{' + search + '}'  # exact search
    else:
        query = 'TITLE-ABS-KEY( ' + search + ')'  # inexact search

    s = ScopusSearch(query, download=False)

    print('Number of results: ')
    length = s.get_results_size()
    print(length)

    if length > 0:
        dl = input('Would you like to download the results y/n\n')
        if dl == 'y':
            s = ScopusSearch(query, download=True)
            dataframe = pd.DataFrame(pd.DataFrame(
                s.results))  # converts results into a dataframe
            pd.options.display.max_colwidth = 150
            pd.options.display.max_rows = None
            print(dataframe[['eid', 'title']])
            dataframe.iloc[:, 0] = dataframe.iloc[:, 0].astype(
                str)  # converts the eid dataframe objects to string

            option2 = input(
                '\n Enter the row of the abstract you want to download, or enter ALL to download all\n'
            )

            if option2 == 'ALL':
                for i in progressbar(range(length), "Download Progress ", 40):
                    ab = AbstractRetrieval(
                        dataframe.iloc[i, 0],
                        view='FULL')  # searches for abstracts using eid
                    with open(
                            os.path.join(
                                '/home/benjamin/Python_Codes/Abstracts',
                                dataframe.iloc[i, 0] + '.txt'), 'w') as f:
                        f.write(
                            "%s\n" % ab.abstract
                        )  #creates individual txt files titled by their eid
            else:
                try:
                    val = int(option2)
                    print('Attempting to download abstract with eid ' +
                          dataframe.iloc[val, 0])
                    ab = AbstractRetrieval(
                        dataframe.iloc[val, 0],
                        view='FULL')  # searches for abstracts using eid
                    with open(
                            os.path.join(
                                '/home/benjamin/Python_Codes/Abstracts',
                                dataframe.iloc[val, 0] + '.txt'), 'w') as f:
                        f.write("%s\n" % ab.abstract)
                    print('Success!\n')
                except ValueError:
                    print('Invalid row number\n')
    else:
        print('No results found, please try again\n')
Example #24
print(fullres.scopus_abstract_retries.mean())
print(fullres.scopus_abstract_retries.max())


qq=1
qq=qq+1

print('warp')

if False:
    lst = []
    from pybliometrics.scopus import AbstractRetrieval
    t0 = time.time()
    for ii in np.arange(0,10):
        cur_eid = df.loc[ii,'eid']
        minires = AbstractRetrieval(identifier=cur_eid, view='FULL', refresh=True, id_type='eid')
        try:
            qq = minires.authorgroup
            lst.append(1)
        except:
            lst.append(0)
    print(lst)
    t1 = time.time()
    print('expected single-thread time cost per record is: ' + str((t1-t0)/10.0))
    # we expect 121 seconds cost for 100 entries

print('done')

# it only took 20 seconds to do 1000 records
# that is 50 per second
# or a speed increase of factor 50x !
Example #25
        return None
    print(author)

    #orcid = getattr(author, 'orcid')
    try:
        orcid = getattr(author, 'orcid')
    except:
        print('exception trying to get authors orcid')
        return None
    print('ORCID: ', orcid)
    return None  # Remove this temporarily
    return orcid


for index, eid in enumerate(eids):
    item_from_scopus = AbstractRetrieval(eid, id_type='eid', view='FULL')
    #print(abstract)
    #print(abstract.abstract)
    #print(eid)

    print(item_from_scopus.__dict__.keys())

    doi = item_from_scopus.doi
    root = et.Element('dublin_core', schema='dc')

    # TODO generate this automatically
    xmls = {
        'dc': et.Element('dublin_core', schema='dc'),
        'local': et.Element('dublin_core', schema='local')
    }
Example #26
import pandas as pd
from pybliometrics.scopus import ScopusSearch, AbstractRetrieval, AuthorRetrieval, ContentAffiliationRetrieval
import networkx as nx

nodes = pd.DataFrame()
edges = []
gen = {}

Acquisti = "10.1257/jel.54.2.442"
ab = AbstractRetrieval(Acquisti, view="FULL")

nodes = pd.DataFrame()
nodes = nodes.append(
    {
        "id": "",
        "title": ab.title,
        "sourcetitle": ab.sourcetitle_abbreviation,
        "publicationyear": ab.coverDate[0:4],
        "eid": ab.eid,
        "gen": '0'
    },
    ignore_index=True)
ref_df = pd.DataFrame(ab.references)
ref_df["eid"] = '2-s2.0-' + ref_df['id']
ref_df['gen'] = '-1'

ref_df2 = pd.concat(
    [
        ref_df['eid'], ref_df['id'], ref_df['publicationyear'],
        ref_df['sourcetitle'], ref_df['title'], ref_df['gen']
    ],
Example #27
for author_id, full_name in AUTHORS.items():
    publication_count = 0
    search = ScopusSearch(f'AU-ID ( {author_id} )')
    logger.info('STARTING SEARCH FOR AUTHOR {}({})'.format(full_name, author_id))

    for result in search.results:

        # We'll only take publications, which have a DOI
        if result.doi is None:
            continue

        # Requesting the detailed information from the scopus database for the current publication from the search
        # results
        try:
            abstract_retrieval = AbstractRetrieval(result.doi)
            logger.info(' * FETCHED publication {}'.format(result.doi))
        except Exception as e:
            logger.error(' ! Could not retrieve scopus abstract for DOI "{}". ERROR: {}'.format(result.doi, str(e)))
            continue

        # If the publication is older than the date limit, it will be discarded
        publication_date = datetime.datetime.strptime(abstract_retrieval.coverDate, '%Y-%m-%d')
        if publication_date <= DATE_LIMIT:
            logger.info(' # TOO OLD publication {} with publishing date {}'.format(result.doi, abstract_retrieval.coverDate))
            continue

        adapter = ScopusPublicationAdapter(abstract_retrieval)
        publication = adapter.get_publication()

        # Filtering the authors according to the AUTHOR_LIMIT, which has been set.