def run(self):
        """
        Generates data and writes it into the :py:meth:`~.Streams.output` target.
        """

        cur_year = self.yr
        cur_query = self.qr

        run_query = cur_query + ' AND ( PUBYEAR  =  ' + str(cur_year) + ') '

        size = ScopusSearch(run_query, refresh=True,
                            download=False).get_results_size()

        if size > 10000:
            # very large queries may be slow or hit API limits; one way to split
            # them is sketched after this method
            print('Warning: Scopus query returns over 10,000 records')

        df = pd.DataFrame(ScopusSearch(run_query, refresh=True).results)

        fav_fields = [
            'eid', 'creator', 'doi', 'title', 'afid', 'affilname',
            'author_count', 'author_names', 'author_afids', 'coverDate',
            'coverDisplayDate', 'publicationName', 'issn', 'source_id',
            'eIssn', 'citedby_count', 'fund_sponsor', 'aggregationType',
            'openaccess', 'description', 'authkeywords'
        ]
        df = df[fav_fields]  # cut fields
        # drop all rows with an empty eid to prevent issues later (to be safe)
        df = df.dropna(axis=0, subset=['eid'], inplace=False)

        #print(len(df))
        df.to_pickle(self.output().path)  #, encoding='utf-8')
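# A hedged sketch (not part of the original task above) of one way to handle
# very large queries: split the yearly query into monthly PUBDATETXT slices and
# concatenate the results. The helper name and the base_query argument are
# illustrative assumptions, not part of the original code.
import calendar

import pandas as pd
from pybliometrics.scopus import ScopusSearch


def fetch_year_in_monthly_slices(base_query, year):
    """Run one ScopusSearch per month and combine the results."""
    frames = []
    for month in range(1, 13):
        label = f"{calendar.month_name[month]} {year}"  # e.g. "February 2018"
        q = f"{base_query} AND PUBDATETXT({label})"
        results = ScopusSearch(q).results
        if results:
            frames.append(pd.DataFrame(results))
    return pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()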
def robust_query(q, refresh=False, fields=("eid", "coverDate")):
    """Wrapper function for individual ScopusSearch query."""
    try:
        s = ScopusSearch(q, refresh=refresh, integrity_fields=fields)
        res = s.results
    except (AttributeError, KeyError):
        res = ScopusSearch(q, refresh=True).results
    return res or []
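# Hedged usage sketch of robust_query(); the author ID below is a placeholder.
pubs = robust_query("AU-ID(55555555555)", refresh=True)
print(f"retrieved {len(pubs)} records")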
Example No. 3
def perform_query(auth_id, refresh=100, fields=("eid", "title")):
    """Access ScopusSearch API to retrieve EIDs, sources and
    publication years.
    """
    q = f"AU-ID({auth_id})"
    try:
        res = ScopusSearch(q, refresh=refresh, integrity_fields=fields).results
        info = parse_publications(res, auth_id)
    except (AttributeError, KeyError, TypeError):
        res = ScopusSearch(q, refresh=True).results
        info = parse_publications(res, auth_id)
    if not info:
        return None, None, None, None, None
    return zip(*info)
Example No. 4
def test_find_location():
    auth_id = 6701809842
    pubs = ScopusSearch("AU-ID({})".format(auth_id)).results
    ctry, aid, aff = find_location([str(auth_id)], pubs, 2000, refresh=False)
    assert_equal(ctry, "Germany")
    assert_equal(aid, "60028717")
    assert_equal(aff, "University of Munich")
Example No. 5
def test_retrieve_author_info_authoryear():
    make_database(test_cache, drop=True)
    conn = connect_database(test_cache)
    # Variables
    table = "author_year"
    expected_auth = [53164702100, 57197093438]
    search_auth = [55317901900]
    year = 2016
    df2 = pd.DataFrame(expected_auth + search_auth,
                       columns=["auth_id"],
                       dtype="int64")
    df2["year"] = year
    # Insert data
    fill = robust_join(expected_auth, sep=') OR AU-ID(')
    q = f"(AU-ID({fill})) AND PUBYEAR BEF {year+1}"
    d = build_dict(ScopusSearch(q, refresh=refresh).results, expected_auth)
    expected = pd.DataFrame.from_dict(d, orient="index", dtype="int64")
    expected = expected.sort_index().rename_axis('auth_id').reset_index()
    expected["year"] = year
    expected = expected[[
        'auth_id', 'year', 'first_year', 'n_pubs', 'n_coauth'
    ]]
    insert_data(expected, conn, table=table)
    # Retrieve data
    incache, missing = retrieve_author_info(df2, conn, table)
    assert_frame_equal(incache, expected)
    assert_equal(missing['auth_id'].tolist(), search_auth)
    assert_equal(missing['year'].tolist(), [year])
Example No. 6
def create_obj(params):
    if q_type == "author":
        return AuthorSearch(**params)
    elif q_type == "docs":
        params["integrity_fields"] = fields
        params["view"] = view
        return ScopusSearch(**params)
Example No. 7
def test_expand_affiliation():
    pubs = ScopusSearch(f"AU-ID(6701809842)", refresh=refresh).results
    res = pd.DataFrame(pubs)
    res = expand_affiliation(res)
    assert_true(len(res) >= 180)
    expect_columns = ['source_id', 'author_ids', 'afid']
    assert_equal(set(res.columns), set(expect_columns))
    assert_true(any(res['author_ids'].str.contains(";")))
    assert_true(all(isinstance(x, (int, float)) for x in res['afid'].unique()))
Example No. 8
def test_expand_affiliation():
    auth_id = 6701809842
    pubs = ScopusSearch("AU-ID({})".format(auth_id)).results
    res = pd.DataFrame(pubs)
    res = expand_affiliation(res)
    assert_equal(len(res), 185)
    expect_columns = ['source_id', 'author_ids', 'afid']
    assert_equal(set(res.columns.tolist()), set(expect_columns))
    assert_true(any(res.author_ids.str.contains(";")))
    assert_false(any(res.afid.str.contains(";")))
Example No. 9
def refine_scopus(docs: DocumentSet, *, search_title=True
                  ) -> Tuple[DocumentSet, DocumentSet]:
    """Attempt to fetch Scopus metadata for each document in the given
    set. Returns a tuple containing two sets: the documents available on
    Scopus and the remaining documents not found on Scopus.

    Documents are retrieved based on their identifier (DOI, Pubmed ID, or
    Scopus ID). Documents without a unique identifier are retrieved by
    performing a fuzzy search based on their title. This is not ideal
    and can lead to false positives (i.e., a different document with the
    same title is found), so it can be disabled if necessary.

    :param search_title: Flag to toggle searching by title.
    """
    from pybliometrics.scopus import ScopusSearch

    def callback(doc):
        id = doc.id
        if isinstance(doc, ScopusDocument):
            return doc

        if doi := id.doi:
            try:
                return ScopusDocument.from_doi(doi)
            except Exception as e:
                logging.warning(f'no document found for DOI {doi}: {e}')
                return None

        title = canonical(id.title)
        if len(title) > 10 and search_title:
            query = f'TITLE({title})'
            response = ScopusSearch(query, view='STANDARD', download=False)
            nresults = response.get_results_size()

            if nresults > 0 and nresults < 10:
                response = ScopusSearch(query, view='STANDARD')

                for record in response.results or []:
                    if canonical(record.title) == title:
                        return ScopusDocument.from_eid(record.eid)

        return None
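# Hedged usage sketch of refine_scopus(), assuming `docs` is an existing
# litstudy-style DocumentSet loaded elsewhere (e.g. from a bibliography file).
found, missing = refine_scopus(docs, search_title=False)
print(len(found), "documents found on Scopus;", len(missing), "not found")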
Example No. 10
def execute_search(dump_name, query):
    """Execute a search on Scopus using the Scopus Query Language and pickle the resulting DataFrame. Define the query in advance."""
    t.tic()
    res = ScopusSearch(query)

    query_res = pd.DataFrame(res.results)

    # Select name for pickle data
    query_res.to_pickle('./Scopus_dumps/' + dump_name + '.pkl')
    t.toc('Query and saving DataFrame took ')
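# Hedged usage of execute_search() above; the dump name and query are
# placeholders, and the timer object `t` plus the ./Scopus_dumps/ directory
# come from the surrounding script.
execute_search('issn_00223514_2020', 'ISSN(0022-3514) AND PUBYEAR IS 2020')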
Example No. 11
    def get_data_from_doi(self, doi, title):
        id = None
        affil = None
        pub_name = None
        pub_type = None
        try:
            doi_doc = ScopusSearch(doi, subscriber=False)
            if 'pubmed-id' in doi_doc._json[0].keys():
                id = doi_doc._json[0]["pubmed-id"]
            if 'affiliation' in doi_doc._json[0].keys():
                affil = doi_doc._json[0]['affiliation']
            pub_name = doi_doc._json[0]['prism:publicationName']
            pub_type = doi_doc._json[0]['subtypeDescription']
        except Exception:
            print("failed with scopus")
        if id is None:
            doi_doc = FullDoc(doi=doi)
            if doi_doc.read(self.client):
                # print("doi_doc.title: ", doi_doc.title)
                doi_doc.write()
                pub_name = doi_doc.data['coredata']['prism:publicationName']
                if 'pubType' in doi_doc.data['coredata'].keys():
                    pub_type = str(doi_doc.data['coredata']['pubType']).strip()
            else:
                print(
                    "Read document failed. no id for doi {}. trying with title"
                    .format(doi))
                doi_doc = None
                # return doi, affil
            id = None
            if doi_doc is None or 'pubmed-id' not in doi_doc._data:
                print("trying with title")
                # try with title
                Entrez.email = '*****@*****.**'
                if doi_doc is None:
                    query = title
                else:
                    query = doi_doc.title
                handle = Entrez.esearch(db='pubmed', retmode='xml', term=query)
                results = Entrez.read(handle)
                if int(results['Count']) > 0:
                    id = results['IdList']
            else:
                id = doi_doc._data['pubmed-id']
        if id is not None:
            return self.fetch_data_from_pubmed(id), affil, pub_name, pub_type

        else:
            print("no pubmed id found for doi {}".format(doi))
            return doi, affil, pub_name, pub_type
Example No. 12
    def performSearch(self, searchWords):
        # Create search string
        # Searching in TITLE-ABS-KEY (title, abstract, keywords) is the default search mode on Scopus
        searchString = 'TITLE-ABS-KEY(' + ' AND '.join(searchWords) + ')'

        self.searchResult = ScopusSearch(searchString)
        self.searchWords = searchWords

        self.storeResultsInDB()
Example No. 13
def search_scopus(query: str, *, limit: int = None) -> DocumentSet:
    """ Submit the given query to the Scopus API.

    :param limit: Restrict results to at most `limit` documents (randomly sampled with a fixed seed if more are found).
    """
    from pybliometrics.scopus import ScopusSearch

    search = ScopusSearch(query, view='STANDARD')
    eids = list(search.get_eids())
    docs = []

    if limit is not None and len(eids) > limit:
        random.seed(0)
        random.shuffle(eids)
        eids = eids[:limit]

    for eid in progress_bar(eids):
        doc = ScopusDocument.from_eid(eid)
        docs.append(doc)

    return DocumentSet(docs)
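# Hedged usage of search_scopus() above; the query string is a placeholder.
docs = search_scopus('TITLE-ABS-KEY("systematic review") AND PUBYEAR IS 2020',
                     limit=50)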
def main():
    # Read in
    journals = pd.read_csv(SOURCE_FILE, index_col=0, encoding="utf8")

    # Get article information
    print(">>> Querying publications for:")
    d = []
    for idx, row in journals.iterrows():
        print("...", idx)
        for year in range(YEARS[0], YEARS[1]+1):
            q = f'SOURCE-ID({row.source_id}) AND PUBYEAR IS {year}'
            s = ScopusSearch(q, refresh=30)
            for pub in s.results:
                if pub.subtype not in DOCTYPES:
                    continue
                rec = parse_abstract(pub)
                rec["journal"] = row.Abbreviation
                d.append(rec)
    print(f">>> Found {len(d):,} publications")

    # Turn to DataFrame
    df = pd.DataFrame.from_records(d)
    print(">>> Correcting some titles")
    repl = {"&amp;": "&", "<sup>": "", "</sup>": "", "<inf>": "", "</inf>": ""}
    for old, new in repl.items():
        df['title'] = df['title'].str.replace(old, new)
    df['simple_title'] = df['title'].apply(standardize).str.upper()
    df['top'] = df['journal'].isin(TOP_JOURNALS)*1

    # Add citation counts of reprints to original paper
    print(">>> Dropping reprints and duplicates")
    df = df.sort_values(['simple_title', 'year'])
    grouped = df.groupby('simple_title')
    left = grouped[[c for c in df.columns if "cit" not in c]].first()
    right = grouped[[c for c in df.columns if "cit" in c]].sum(min_count=1)
    df = pd.concat([left, right], axis=1)

    # Write out
    print(f">>> Saving {df.shape[0]:,} observations")
    df.set_index('simple_title').to_csv(TARGET_FILE, encoding="utf8")
Example No. 15
def test_author_year_in_cache():
    create_cache(drop=True, file=test_cache)
    # Variables
    expected_auth = ["53164702100", "57197093438"]
    search_auth = ["55317901900"]
    year = 2016
    # Test empty cache
    df1 = pd.DataFrame(expected_auth, columns=["auth_id"],
                       dtype="int64")
    df1["year"] = year
    auth_y_incache, auth_y_search = author_year_in_cache(df1, file=test_cache)
    assert_frame_equal(auth_y_search, df1)
    assert_equal(len(auth_y_incache), 0)
    # Test partial retrieval
    fill = ') OR AU-ID('.join([str(a) for a in expected_auth])
    q = "(AU-ID({})) AND PUBYEAR BEF {}".format(fill, year+1)
    res = build_dict(ScopusSearch(q).results, expected_auth)
    res = pd.DataFrame.from_dict(res, orient="index", dtype="int64")
    res["year"] = year
    cols = ["year", "first_year", "n_pubs", "n_coauth"]
    res = res[cols].reset_index().rename(columns={"index": "auth_id"})
    cache_insert(res, table="author_year", file=test_cache)
    df2 = pd.DataFrame(expected_auth + search_auth,
                       columns=["auth_id"], dtype="int64")
    df2["year"] = year
    auth_y_incache, auth_y_search = author_year_in_cache(df2, file=test_cache)
    expected_auth = [int(au) for au in expected_auth]
    search_auth = [int(au) for au in search_auth]
    assert_equal(sorted(auth_y_incache.auth_id.tolist()), expected_auth)
    assert_equal(auth_y_incache.year.tolist(), [year, year])
    assert_equal(auth_y_search.auth_id.tolist(), search_auth)
    assert_equal(auth_y_search.year.tolist(), [year])
    # Test full retrieval
    auth_year_incache, auth_year_search = author_year_in_cache(df1, file=test_cache)
    assert_equal(sorted(auth_year_incache.auth_id.tolist()), expected_auth)
    assert_equal(auth_year_incache.year.tolist(), [year, year])
    assert_true(auth_year_search.empty)
Example No. 16
def query_scopus_by_doi(doi, verbose=True):
    """
    get Scopus records by paper DOI

    :param doi: (str) DOI of a paper
    :param verbose: (bool) print diagnostic messages or not
    :return: (dict) result from the Scopus API
    """
    # goal
    scopus_results = None

    # query Scopus
    query_results = ScopusSearch('DOI({})'.format(doi),
                                 max_entries=None,
                                 cursor=True)

    # filter out empty query results
    if query_results.results is not None:
        scopus_results = query_results.results[0]._asdict()
    else:
        warnings.warn(
            'Empty result from scopus when searching doi: {}'.format(doi))

    return scopus_results
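# Hedged usage of query_scopus_by_doi() above; the DOI is a placeholder.
record = query_scopus_by_doi('10.1000/xyz123')
if record is not None:
    print(record.get('title'))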
Example No. 17
from pybliometrics.scopus import ScopusSearch
from pybliometrics.scopus import AbstractRetrieval
from pybliometrics.scopus import AuthorRetrieval
from lxml import etree as et
import os
import requests
from config import dcmappings

saf_root_directory = 'saf'
science_direct_base_url = 'https://api.elsevier.com/content/article/doi/'

apiKey = os.environ['SCOPUS_API_KEY']
scopus_search_string = os.environ['SCOPUS_SEARCH_STRING']

s = ScopusSearch(scopus_search_string, refresh=True, view='COMPLETE')

print(s.get_results_size())
eids = s.get_eids()
counter = 0

orcid_mapping = {
    'schema': 'local',
    'attributes': {
        'element': 'contributor',
        'qualifier': 'author_orcid_id'
    }
}


def GetOrcidFromScopusID(scopus_id):
    try:
Example No. 18
def main():
    s = ScopusSearch('ISSN ( 0022-3514 )')
    print(s.get_results_size())
#

# harvester
# warning: the code uses 'today' so you have to run it before midnight
#
# harvest from ScopusSearch everything from VU+VUMC for today
# because the API has issues with a direct 'today' command, we instead take the entire month and isolate today
#
# prepare query
VU_with_VUMC_affid = "(   AF-ID(60001997) OR    AF-ID(60008734) OR AF-ID(60029124) OR AF-ID(60012443) OR AF-ID(60109852) OR AF-ID(60026698) OR AF-ID(60013779) OR AF-ID(60032886) OR AF-ID(60000614) OR AF-ID(60030550) OR AF-ID(60013243) OR AF-ID(60026220))"
my_query = VU_with_VUMC_affid + ' AND ' + "PUBDATETXT( " + get_today_for_pubdatetxt(
) + " )"  # RECENT(1) is somehow very slow
print(my_query)
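# the helpers get_today_for_pubdatetxt() and get_today() come from elsewhere in
# this codebase and are not shown here; a minimal sketch of what they plausibly
# return, assuming PUBDATETXT expects "Month YYYY" and coverDate is "YYYY-MM-DD"
# (names suffixed with _sketch to mark them as illustrative, not the originals):
from datetime import date

def get_today_for_pubdatetxt_sketch():
    return date.today().strftime("%B %Y")     # e.g. "February 2018"

def get_today_sketch():
    return date.today().strftime("%Y-%m-%d")  # e.g. "2021-05-07"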
#
# call the scopussearch API
s = ScopusSearch(my_query, refresh=True, download=True)
df = pd.DataFrame(s.results)
#
# filter to records of today
today = get_today()
df = df[df.coverDate == today]
#
# here is the result (may be empty on some days)
###df

# pre-processing aspect
# we need to add extra sources, clean it, rename columns and make it ready for push
# this is a static copy, and you should migrate processing to pycharm and call that from here

# Add info on year and month
df = add_year_and_month(df, 'coverDate')  # add info columns
df_dois = df_dois.dropna()

bits = 10  # not dynamic yet
stepsize = int(np.ceil(len(df_dois) / bits) + 1)

df_total = pd.DataFrame()
for cur_bit in np.arange(0, bits):
    print('-------')
    print(cur_bit)

    df_dois_CUR = df_dois.iloc[stepsize * cur_bit:stepsize * (cur_bit + 1), :]

    doi_list_cur = df_dois_CUR['DOI'].to_list()
    cur_query = "DOI( " + " ) OR DOI( ".join(doi_list_cur) + " ) "

    if len(df_dois_CUR) > 0:
        t0 = time.time()
        fullres = pd.DataFrame(
            ScopusSearch(cur_query, download=True, refresh=True).results)
        t1 = time.time()
        print(t1 - t0)

        df_total = df_total.append(fullres)

# backmerge it first
df_total = df_total.drop_duplicates(subset='doi')
df_export = df_orig.merge(df_total, left_on='DOI', right_on='doi', how='left')

df_export.to_csv(PATH_START_PERSONAL + 'arjan.csv')
df_export.to_excel(PATH_START_PERSONAL + 'arjan.xlsx')
Example No. 21
def get_scopus_arm(
        MY_YEARSET,
        start_path_with_slash,
        xpure_pack,
        df_in=None,  # there is no df_in (!)
        do_save=False):
    """
    get_scopus_arm is a refactor of the legacy Scopus code from Jupyter.
    Careful: a Luigi variant of this also exists, and you should use the
    Luigi variant whenever possible. This copy was made because an exact
    copy was needed for a quick sprint.

    What does it do? It harvests Scopus and then enriches the results with
    Unpaywall, deal information, etc. It also renames and drops columns.

    It assumes MY_YEARSET always contains 3 years. Once this runs in Luigi it
    will work better than arbitrary-length sets, because this is an atomic
    split of work that runs well concurrently, and Luigi always skips parts
    that already exist. In Luigi this function becomes two pipe types:
    type-1 processes a single year and type-2 combines 3 years. That is all
    that is needed because the entire Pure arm covers one chosen year, but it
    can easily be extended to handle multiple chosen years efficiently.
    """

    xpure_pack  # is not used right now, but OK

    dict_output = {}

    for MY_YEAR in MY_YEARSET:

        print(MY_YEAR)
        # settings

        # testing
        override_query_for_testing = False
        running_on_server = False

        # paths
        if running_on_server:
            path_deals = 'C:/Users/yasing/Desktop/oa oktober/apcdeals.csv'  #check
            path_isn = 'C:/Users/yasing/Desktop/oa oktober/ISN_ISSN.csv'  #check
            path_org = 'C:/Users/yasing/Desktop/oa oktober/vu_organogram_2.xlsx'  #check
            path_out = start_path_with_slash  #'C:/Users/yasing/Desktop/oa oktober/'                              #check
            path_vsnu_afids = 'C:/Users/yasing/Desktop/oa oktober/afids_vsnu_nonfin.csv'  #check
        else:
            path_deals = r'G:\UBVU\Data_RI\raw data algemeen\apcdeals.csv'
            path_isn = r'G:\UBVU\Data_RI\raw data algemeen\ISN_ISSN.csv'
            path_org = r'G:\UBVU\Data_RI\raw data algemeen\vu_organogram_2.xlsx'
            path_out = start_path_with_slash  #'C:/Users/yasin/Desktop/oa new csv/'  # no r
            path_vsnu_afids = r'G:\UBVU\Data_RI\raw data algemeen\afids_vsnu_nonfin.csv'

        # scopus search and affiliation
        #
        # ! VUMC HAS BEEN ADDED !
        #
        chosen_affid = [
            "60008734", "60029124", "60012443", "60109852", "60026698",
            "60013779", "60032886", "60000614", "60030550", "60013243",
            "60026220", "60001997"
        ]  # I added 60001997 and thus I added VUMC
        #VU_noMC_affid = "(AF-ID(60008734) OR AF-ID(60029124) OR AF-ID(60012443) OR AF-ID(60109852) OR AF-ID(60026698) OR AF-ID(60013779) OR AF-ID(60032886) OR AF-ID(60000614) OR AF-ID(60030550) OR AF-ID(60013243) OR AF-ID(60026220))"
        VU_with_VUMC_affid = "(   AF-ID(60001997) OR    AF-ID(60008734) OR AF-ID(60029124) OR AF-ID(60012443) OR AF-ID(60109852) OR AF-ID(60026698) OR AF-ID(60013779) OR AF-ID(60032886) OR AF-ID(60000614) OR AF-ID(60030550) OR AF-ID(60013243) OR AF-ID(60026220))"
        my_query = VU_with_VUMC_affid + ' AND  ' + "( PUBYEAR  =  " + str(
            MY_YEAR) + " )"  ### "PUBDATETXT(February 2018)"

        # TITLE(TENSOR) AND

        # corresponding author
        vu_afids = chosen_affid
        # this is vsnu w/o phtu and such (borrowed from VSNU-SDG-data), but should approach the UKB list... good for now. update later.
        all_vsnu_sdg_afids = pd.read_csv(path_vsnu_afids).iloc[:, 1].astype(
            'str').to_list()

        # testing
        if override_query_for_testing:
            my_query = 'TITLE(TENSOR LPV)'
            print('overriding query for testing')

        # ETLMIG MIGRATION DONE

        # helper functions
        # ! CAREFUL! COPIED CODE
        def fn_cats(row):
            # map open-access color to an ordinal category (0 = nans etc.)
            mapping = {'closed': 1, 'hybrid': 2, 'bronze': 3, 'green': 4, 'gold': 5}
            return mapping.get(row, 0)

        # entire pipeline

        # Perform ScopusSearch
        s = ScopusSearch(
            my_query, refresh=True)  #(VU_aff + " AND " + recent, refresh=True)
        df = pd.DataFrame(s.results)

        # Remove unnecessary columns
        fav_fields = [
            'eid', 'creator', 'doi', 'title', 'afid', 'affilname',
            'author_count', 'author_names', 'author_afids', 'coverDate',
            'coverDisplayDate', 'publicationName', 'issn', 'source_id',
            'eIssn', 'citedby_count', 'fund_sponsor', 'aggregationType',
            'openaccess'
        ]
        df = df[fav_fields]  # cut fields

        # Add info on year and month
        df = add_year_and_month(df, 'coverDate')  # add info columns

        # prepare the faculty_finder NLP tool
        org_info = pd.read_excel(path_org, skiprows=0)
        ff = faculty_finder(organizational_chart=org_info)

        # Per EID, get scopus abstract info, get first vu author and use NLP to find faculty
        # initialize
        df_ab = pd.DataFrame()
        df_au = pd.DataFrame()
        df_ff = pd.DataFrame()
        for counter, cur_eid in enumerate(df.eid.tolist()):

            print('getting abstract info for ' + str(counter + 1) +
                  ' out of ' + str(len(df.eid.tolist())))

            # get abstract
            dict_ab_info = get_scopus_abstract_info(cur_eid)  # !
            dict_ab_info['eid'] = cur_eid

            # get first chosen affiliation author
            dict_auth_info = get_first_chosen_affiliation_author(
                dict_ab_info['abstract_object'], chosen_affid)
            dict_auth_info['eid'] = cur_eid

            # get faculty
            if dict_auth_info['first_affil_author_has_error'] == True:
                print('no chosen affid author found at EID:' + str(cur_eid))
                dict_ff = ff.match_nan()
            else:
                # get faculty
                dict_ff = ff.match(dict_auth_info['first_affil_author_org'])
            dict_ff['eid'] = cur_eid

            df_ab = df_ab.append(dict_ab_info, ignore_index=True)
            df_au = df_au.append(dict_auth_info, ignore_index=True)
            df_ff = df_ff.append(dict_ff, ignore_index=True)

        df = df.merge(df_ab, on='eid', how='left')
        df = df.merge(df_au, on='eid', how='left')
        df = df.merge(df_ff, on='eid', how='left')

        print('df_ab,au,ff done')
        #df.to_csv(r'C:\Users\yasing\Desktop\oa oktober\oa' + my_timestamp() + '.csv')
        # df.to_pickle(path_out + 'oa_base_' + my_timestamp() + str(MY_YEAR) + '.pkl')

        # add unpaywall info
        df = add_unpaywall_columns(df, silent=False)  # !

        # add deal info
        df = add_deal_info(path_deals=path_deals, path_isn=path_isn, df_b=df)

        # add corresponding author info
        df = (corresponding_author_functions().add_corresponding_author_info(
            df=df, vu_afids=vu_afids, ukb_afids=all_vsnu_sdg_afids))

        # post-process
        df['upw_oa_color_category'] = df.upw_oa_color.apply(fn_cats)
        df['upw_oa_color_verbose'] = df['upw_oa_color'].apply(
            lambda x: 'unknown' if x is np.nan else x)

        # save it
        # save to pickle with abstract_object, for now
        # df.to_pickle(path_out  + 'oa' + my_timestamp() + str(MY_YEAR) +  '.pkl')
        # save to csv without abstract_object0
        if do_save:
            df.drop(columns=['abstract_object']).to_csv(path_out + 'oa' +
                                                        my_timestamp() +
                                                        str(MY_YEAR) + '.csv')

        # diagnostics
        # attrition analysis
        print('attrition analysis')
        print('number of scopus publications: ' + str(len(df)))
        print('api error: abstract API: ' +
              str(len(df[df.abstract_error_message == 'abstract api error'])))
        print('api error: authgroup/department info: ' +
              str(df.no_author_group_warning.sum()))  # ab.authgroup error
        print('api error: authgroup.x/department info details: ' +
              str(len(df[df.first_affil_author_has_error == True]))
              )  # ab.authgroup ok, error deeper in it
        print('api missing data: department info missing no1: ' + str(
            len(df[df.first_affil_author.isna()
                   & (df.first_affil_author_has_error == False)])))
        print('api missing data: department info missing no2: ' +
              str(len(df[df.first_affil_author_org.isna()])))
        # only from this point on is there data to work with
        print(
            'no match: no faculty name match and bag of words only has trivial words (such as articles and Amsterdam): '
            + str(
                len(df[
                    df.ff_message ==
                    'no faculty name match and bag of words only has trivial words']
                    )))
        print(
            'no match: no faculty name match and no bag of words match despite non-trivial words (often VUMC, sometimes a typo): '
            + str(
                len(df[
                    df.ff_message ==
                    'no faculty name match and no bag of words match despite non-trivial words']
                    )))
        print('number of matches: ' + str(len(df[df.ff_score > 0])))
        # diagnostics can be improved further by capturing the last 6 fails too

        # print done
        print('done')

        # extra: post-process

        ##df = pd.read_csv(r'C:\Users\yasin\Desktop\oa new csv\OA_VU2018_met_corresponding_authors.csv')
        ##list(df)

        # this also drop abstract_object(!)
        df2 = df[[
            'eid',
            'doi',
            'title',
            'year',
            'publicationName',
            'issn',
            'eIssn',
            'fund_sponsor',
            'aggregationType',
            'first_affil_author',
            'first_affil_author_org',
            'ff_match',
            'ff_match_subgroup',
            'ff_message',
            'ff_provided_organization_string',
            'ff_score',
            'ff_terms',
            'upw_free_fulltext_url',
            'upw_is_boai_license',
            'upw_is_free_to_read',
            'upw_is_subscription_journal',
            'upw_license',
            'upw_oa_color_category',
            'upw_oa_color_verbose',
            'upw_oa_color',  # internal
            'deal_name',
            'deal_owner',
            'deal_discount',
            'deal_discount_verbose',
            'deal_owner_verbose',
            'corresponding_author_surname',
            'match_affiliation_id',
            'match_surname',
            'match_indexed_name',
            'match_auid',
            'match_aut_score',
            'is_corresponding_author_a_vu_author',
            'is_corresponding_author_a_ukb_author'
        ]]
        col_rename_dict = {
            'publicationName': 'journal_name',
            'first_affil_author': 'first_VU_author',
            'first_affil_author_org': 'first_VU_author_raw_organization_info',
            'ff_match': 'faculty_(matched)',
            'ff_match_subgroup': 'subgroup_(matched)',
            'ff_message': 'diagnostics: ff message',
            'ff_provided_organization_string': 'diagnostics: ff raw input ',
            'ff_score': 'diagnostics: ff score',
            'ff_terms': 'diagnostics: ff matching words',
            'upw_free_fulltext_url': 'fulltext_free_url',
            'upw_is_boai_license': 'is_a_boai_license',
            'upw_is_free_to_read': 'is_free_to_read',
            'upw_is_subscription_journal': 'is_a_subscription_journal',
            'upw_license': 'license',
            #'upw_oa_color_category': '', # internal
            'upw_oa_color_verbose': 'open_access_color',
            #'deal_name',
            'deal_owner': 'deal_owner_raw',
            # 'deal_discount_verbose', # internal
            'deal_owner_verbose': 'deal_scope',
            #'corresponding_author_surname',
            'match_affiliation_id':
            'corresponding_author_affiliation_id_(matched)',
            'match_surname': 'corresponding_author_surname_(matched)',
            'match_indexed_name':
            'corresponding_author_indexed_name_(matched)',
            'match_auid': 'corresponding_author_author_id_(matched)',
            'match_aut_score': 'diagnostics:corresponding_author_match_score'
        }
        # 'is_corresponding_author_a_vu_author',
        # 'is_corresponding_author_a_ukb_author'}
        df2 = df2.rename(columns=col_rename_dict)

        def get_contact_point(row):
            if row.is_corresponding_author_a_vu_author is True:
                res = row['corresponding_author_indexed_name_(matched)']
            else:
                res = row['first_VU_author']
            # in a workflow, PURE should be checked for the author's current faculty/group (manually or automatically)
            return res

        df2['vu_contact_person'] = df2.apply(get_contact_point, axis=1)
        if do_save:
            df2.to_csv(path_out + 'knip_OA_VU' + str(MY_YEAR) +
                       '_met_corresponding_authors.csv')
            df2.to_excel(path_out + 'knip_OA_VU' + str(MY_YEAR) +
                         '_met_corresponding_authors.xlsx')

        dict_output[MY_YEAR] = df2

    print('done with scopus arm')
    return dict_output
Example No. 22
def query_scopus(query_str):
    print(query_str)
    s = ScopusSearch(query_str, refresh=1200)
    print(f"Obtained {s.get_results_size()} results")
    return s
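# Hedged usage of query_scopus() above; the query string is a placeholder.
import pandas as pd

s = query_scopus('AF-ID(60008734) AND PUBYEAR IS 2020')
df = pd.DataFrame(s.results)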
# and use pip install XX, where XX is the package name and version
# the packages required by this toolbox are in requirements.txt
# and pip installing them one by one will finish this task
# Alternatively, once available, a PyPI package can be downloaded which
# will do all these installations automatically. This will be released in the future.
#
from pybliometrics.scopus import ScopusSearch
# knowledge of python packages is assumed
# if that command failed, install the package pybliometrics
# during your first run an API key will be requested; get one from Scopus
# hint: in production code always put all imports at the top
#
#
# now send out the query
# easy querying of Scopus
s = ScopusSearch(my_query)
# the variable s stores the results
#
# next we turn it into a pandas dataframe for easy handling
# we use s.results to make the wrapper return the results in suitable format
# and pandas for data handling
import pandas as pd
df = pd.DataFrame(s.results)
#
# now the data can be printed
print(df.head())
#
# this wraps up how to get scopus data through python automatically

# example 1: enriching your data with Altmetric
# scopus is not everything
Example No. 24
    def get_piis(self, term_list, year_list, pii_path, config_path='/Users/nisarg/.scopus/config.ini', keymaster=False):
        """
        This should be a standalone method that receives a list of journals (ISSNs), a keyword search,
        an output path and a path to clear the cache. It should be mappable to multiple parallel processes.
        """

        fresh_keys = self.API_list

        journal_frame = self.make_jlist(jlist_url = 'https://www.elsevier.com/__data/promis_misc/sd-content/journals/jnlactivesubject.xls', \
                        journal_strings = ['chemistry','energy','molecular','atomic','chemical','biochem', \
                                           'organic','polymer','chemical engineering','biotech','colloid'])


        if pii_path[-1] != '/':
            raise Exception('Output file path must end with /')

        if '.scopus/scopus_search' not in self.cache_path:
            raise Exception('Cache path is not a sub-directory of the scopus_search. Make sure cache path is correct.')

        # Two lists whose values correspond to each other
        issn_list = journal_frame['ISSN'].values
        journal_list = journal_frame['Journal_Title'].values
        # Replace colons, slashes and spaces in names for file storage purposes
        for j in range(len(journal_list)):
            journal_list[j] = journal_list[j].replace(':', '') \
                                             .replace('/', '_') \
                                             .replace(' ', '_')

        # Build the dictionary that can be used to sequentially query elsevier for different journals and years
        query_dict = self.build_query_dict(term_list,issn_list,year_list)

        # Must write to memory, clear cache, and clear a dictionary upon starting every new journal
        for i in range(len(issn_list)):
            # At the start of every year, clear the standard output screen
            os.system('cls' if os.name == 'nt' else 'clear')
            paper_counter = 0

            issn_dict = {}
            for j in range(len(year_list)):
                # for every year in every journal, query the keywords
                print(f'{journal_list[i]} in {year_list[j]}.')

                # Want the sole 'keymaster' process to handle 429 responses by swapping the key.
                if keymaster:
                    try:
                        query_results = ScopusSearch(verbose = True,query = query_dict[issn_list[i]][year_list[j]])
                    except Scopus429Error:
                        print('entered scopus 429 error loop... replacing key')
                        newkey = fresh_keys.pop(0)
                        config["Authentication"]["APIKey"] = newkey
                        time.sleep(5)
                        query_results = ScopusSearch(verbose = True,query = query_dict[issn_list[i]][year_list[j]])
                        print('key swap worked!!')
                # If this process isn't the keymaster, try a query.
                # If it excepts, wait a few seconds for keymaster to replace key and try again.
                else:
                    try:
                        query_results = ScopusSearch(verbose = True,query = query_dict[issn_list[i]][year_list[j]])
                    except Scopus429Error:
                        print('Non key master is sleeping for 15... ')
                        time.sleep(15)
                        query_results = ScopusSearch(verbose = True,query = query_dict[issn_list[i]][year_list[j]]) # at this point, the scopus 429 error should be fixed...
                        print('Non key master slept, query has now worked.')

                # store relevant information from the results into a dictionary pertaining to that query
                year_dict = {}
                if query_results.results is not None:
                    # some of the query results might be of type None


                    for k in range(len(query_results.results)):
                        paper_counter += 1

                        result_dict = {}
                        result = query_results.results[k]

                        result_dict['pii'] = result.pii
                        result_dict['doi'] = result.doi
                        result_dict['title'] = result.title
                        result_dict['num_authors'] = result.author_count
                        result_dict['authors'] = result.author_names
                        result_dict['description'] = result.description
                        result_dict['citation_count'] = result.citedby_count
                        result_dict['keywords'] = result.authkeywords

                        year_dict[k] = result_dict

                    # Store all of the results for this year in the dictionary corresponding to this journal
                    issn_dict[year_list[j]] = year_dict
                else:
                    # if it was a None type, we will just store the empty dictionary as json
                    issn_dict[year_list[j]] = year_dict


            # Store all of the results for this journal in a folder as json file
            os.mkdir(f'{pii_path}{journal_list[i]}')
            with open(f'{pii_path}{journal_list[i]}/{journal_list[i]}.json','w') as file:
                json.dump(issn_dict, file)

            with open(f'{pii_path}{journal_list[i]}/{journal_list[i]}.txt','w') as file2:
                file2.write(f'This file contains {paper_counter} publications.')
Example No. 25
nodes=pd.DataFrame()
nodes = nodes.append({"id":"", "title": ab.title, "sourcetitle": ab.sourcetitle_abbreviation, "publicationyear": ab.coverDate[0:4], "eid": ab.eid, "gen": '0' }, ignore_index=True)
ref_df = pd.DataFrame(ab.references)
ref_df["eid"] = '2-s2.0-' + ref_df['id']
ref_df['gen'] = '-1'

ref_df2 = pd.concat([ref_df['eid'], ref_df['id'], ref_df['publicationyear'], ref_df['sourcetitle'], ref_df['title'], ref_df['gen']], axis=1, keys=['eid', 'id', 'publicationyear', 'sourcetitle', 'title', 'gen'], sort=True)
#ref_df2 = ref_df2.drop(18)
nodes = nodes.append(ref_df2, ignore_index = True, sort=True)

for row in ref_df2.itertuples():
    edges.append((row.eid, ab.eid))

len(nodes)

s = ScopusSearch(ab.eid) 
for x in s.results:
    if(x.eid not in list(nodes['eid'])):
        nodes = nodes.append({"id":"", "title": x.title, "sourcetitle": "", "publicationyear": x.coverDate[0:4], "eid": x.eid, "gen": '1' }, ignore_index=True)
        print(x.title)
    edges.append((ab.eid, x.eid))

print(len(nodes))



for y in ab.references:
    try:
        refs = AbstractRetrieval(y.id, view="FULL")
        if refs.references is not None:
            ref_df = pd.DataFrame(refs.references)
Example No. 26
def import_scopus(ctx, verbose, start):
    """
    Import scopus publication records for the authors of the pubtrack application.

    This command will first fetch all the information about the authors, which are defined within the pubtrack app.
    It uses the scopus author ID's of these authors to send requests to the scopus database. The publications of these
    replies are then evaluated and posted into the pubtrack app.
    """
    # SETTING UP PUBTRACK WRAPPER
    config = ctx.obj['config']
    pubtrack = Pubtrack(config)

    # SETTING UP SCOPUS WRAPPER
    try:
        pybliometrics.scopus.utils.create_config()
    except FileExistsError:
        pass
    finally:
        scopus_config['Authentication']['APIKey'] = config.get_scopus_key()

    # FETCHING META AUTHOR INFORMATION FROM PUBTRACK
    click.secho('Fetching author information from pubtrack.')
    author_id_name_map = {}
    meta_authors = pubtrack.meta_author.get()['results']
    for meta_author in meta_authors:
        for author in meta_author['authors']:
            # "author_name_kitopen" returns a string with the authors name. This function essentially formats the name
            # in a way so that it can be used in a query string for the KITOpen database.
            full_name = '{} {}'.format(author['first_name'],
                                       author['last_name'])
            scopus_id = author['scopus_id']
            author_id_name_map[scopus_id] = full_name
            out(
                verbose, ' > Adding author "{} ({})" to be processed'.format(
                    full_name, scopus_id))

    click.secho('==> Processing total of {} authors'.format(
        len(author_id_name_map)))

    # QUERY SCOPUS DATABASE
    click.secho(
        'Querying scopus database for the publications of those authors.')
    date_limit = datetime.datetime(year=start, month=1, day=1)
    for author_id, author_name in author_id_name_map.items():
        publication_count = 0
        search = ScopusSearch(f'AU-ID ( {author_id} )')
        out(verbose, ' | Query "AU-ID ( {} )"'.format(author_id))

        for result in search.results:

            # We'll only take publications, which have a DOI
            if result.doi is None:
                continue

            # requesting the detailed information from the scopus database for the current publication from the search
            # results
            try:
                abstract_retrieval = AbstractRetrieval(result.doi)
            except Exception as e:
                out(verbose,
                    '   # Could not retrieve publication "{}"'.format(
                        result.doi),
                    fg='yellow')
                continue

            # If the publication is older than the date limit, it will be discarded
            publication_date = datetime.datetime.strptime(
                abstract_retrieval.coverDate, '%Y-%m-%d')
            if publication_date <= date_limit:
                out(verbose,
                    '   # Publication too old "{}"({})'.format(
                        result.doi, publication_date),
                    fg='yellow')
                continue
            else:
                out(verbose,
                    '   > Fetched publication "{}"'.format(result.doi))

            adapter = ScopusPublicationAdapter(abstract_retrieval)
            publication = adapter.get_publication()

            # Filtering the authors according to the AUTHOR_LIMIT, which has been set.
            # We cannot just take the first few authors, however; we need to make sure that the author
            # from whom we got this publication in the first place is included. The rest just gets filled up...
            authors = []
            for author in publication['authors']:
                if (author['scopus_id'] in author_id_name_map
                        or len(authors) < config.get_author_limit()):
                    authors.append(author)

            publication['authors'] = authors

            # Now we try to actually POST the publication to the pubtrack REST API
            try:
                pubtrack.import_publication(publication)
                publication_count += 1
                out(verbose,
                    '   * Added to pubtrack: "{}"'.format(
                        publication['title']),
                    fg='green')
            except Exception as e:
                if str(e) == 'uuid':
                    out(verbose,
                        '   ! Error while posting to pubtrack: Already exists!',
                        fg='red')
                else:
                    out(verbose,
                        '   ! Error while posting to pubtrack: {}'.format(
                            str(e)),
                        fg='red')
                continue

        out(True,
            ' --> Total of {} publications imported from author {}'.format(
                publication_count, author_id),
            fg='green',
            bold=True)
Example No. 27
from collections import namedtuple
from nose.tools import assert_equal, assert_true

from pybliometrics.scopus import ScopusSearch


order = 'eid doi pii pubmed_id title subtype creator afid affilname '\
        'affiliation_city affiliation_country author_count author_names '\
        'author_ids author_afids coverDate coverDisplayDate publicationName '\
        'issn source_id eIssn aggregationType volume issueIdentifier '\
        'article_number pageRange description authkeywords citedby_count '\
        'openaccess fund_acr fund_no fund_sponsor'
doc = namedtuple('Document', order)

# Set to refresh=False because of citation count
s_au = ScopusSearch('AU-ID(24320488600)', refresh=False)
s_j = ScopusSearch('SOURCE-ID(22900) AND PUBYEAR IS 2010', refresh=False)
q_empty = 'SOURCE-ID(19700188323) AND PUBYEAR IS 1900'
s_empty = ScopusSearch(q_empty, refresh=False)


def test_get_eids_author():
    assert_equal(s_au.get_eids(), ['2-s2.0-26444452434'])


def test_get_eids_journal():
    assert_equal(len(s_j.get_eids()), 118)


def test_get_results_size():
    assert_equal(s_au.get_results_size(), 1)
Example No. 28
         ]  # <-- modify here by adding your access key
config["Authentication"]["APIKey"] = _keys.pop()
api_view = "META"

# Uncomment the next line to configure the access key during the first run of the program.
# create_config()

# Configure the search criteria.
query = 'TITLE-ABS-KEY("protected area" OR "conservation" OR "ecology" OR "marine protected" OR "national forest")' \
        ' AND TITLE-ABS-KEY("remote sensing" OR "earth observation" OR "Landsat" OR "Lidar" OR "MODIS" OR "Radar")' \
        ' AND TITLE-ABS-KEY("Brazil" OR "Brasil")' \
        ' AND PUBYEAR BEF 2021 AND PUBYEAR AFT 1999' \
        ' AND LANGUAGE(english OR portuguese)'

# Create a ScopusSearch object containing the search information.
scopus = ScopusSearch(query, max_entries=None, subscriber=False, verbose=True)

# Print the number of records collected by the API.
print("Total number of publications: {}.".format(scopus.get_results_size()))

# Get a list of all digital identifiers (EIDs) retrieved from the API during the search.
eids_documentos = scopus.get_eids()

# Collect information about the articles from the EIDs, using the helper function.
df = coletar_artigos(eids_documentos, api_view)
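# Hedged sketch of what a helper like coletar_artigos() might look like: fetch
# each EID with AbstractRetrieval and collect a few fields into a DataFrame.
# The exact fields and behaviour of the original helper are not shown here.
from pybliometrics.scopus import AbstractRetrieval
import pandas as pd

def collect_articles_sketch(eids, view="META"):
    rows = []
    for eid in eids:
        ab = AbstractRetrieval(eid, view=view)
        rows.append({"eid": eid, "title": ab.title, "coverDate": ab.coverDate})
    return pd.DataFrame(rows)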

# Store all entries in a .csv file for later consultation
df.to_csv("data/resultado_pesquisa_scopus.csv",
          index=False,
          quoting=csv.QUOTE_ALL)
Example No. 29
def search_scopus(query, docs=None, retrieve_orcid=True):
    """Search Scopus."""

    documents = []
    authors_cache = {}
    affiliations_cache = {}
    try:
        retrieved_paper_ids = ScopusSearch(query, view="STANDARD").get_eids()
    except ScopusQueryError:
        print("Impossible to process query \"{}\".".format(query))
        return None
    if len(retrieved_paper_ids) == 0:
        print("No matching documents for the provided query.")
        return None
    for paper_id in tqdm(retrieved_paper_ids):
        try:
            paper = AbstractRetrieval(paper_id, view="FULL")
        except ValueError:
            print("Impossible to retrieve data for paper \"{}\".".format(paper_id))
            return None
        doc_id = DocumentID()
        doc_id.parse_scopus(paper)
        authors = []
        if paper.authors:
            for author in paper.authors:
                author_affiliations = []
                if retrieve_orcid:
                    if author.auid in authors_cache:
                        authors.append(Author(name=author.indexed_name,
                                              orcid=authors_cache[author.auid],
                                              affiliations=author_affiliations))
                    else:
                        authors_cache[author.auid] = AuthorRetrieval(author.auid).orcid
                        authors.append(Author(name=author.indexed_name,
                                              orcid=authors_cache[author.auid],
                                              affiliations=author_affiliations))
                else:
                    authors.append(Author(name=author.indexed_name,
                                          orcid=None,
                                          affiliations=author_affiliations))
                if author.affiliation:
                    for affiliation_id in author.affiliation:
                        if affiliation_id in affiliations_cache:
                            affiliation = affiliations_cache[affiliation_id]
                        else:
                            try:
                                affiliation = ContentAffiliationRetrieval(affiliation_id)
                                affiliations_cache[affiliation_id] = affiliation
                            except Exception:
                                affiliation = None
                        if affiliation:
                            author_affiliations.append(Affiliation(name=affiliation.affiliation_name,
                                                               city=affiliation.city,
                                                               country=affiliation.country))
        references = []
        if paper.refcount and int(paper.refcount) > 0 and paper.references:
            for reference in paper.references:
                if reference.title:
                    references.append(reference.title)
        if paper.language:
            try:
                language = iso639.languages.get(part2b=paper.language).name
            except KeyError:
                language = None
        else:
            language = None

        document = Document(id=doc_id,
                            title=paper.title,
                            keywords=paper.authkeywords,
                            abstract=paper.description,
                            source=paper.publicationName,
                            source_type=paper.aggregationType,
                            language=language,
                            year=int(paper.coverDate.split("-")[0]),
                            authors=authors,
                            references=references,
                            publisher=paper.publisher,
                            internal=paper)
        if paper.citedby_count:
            document.citation_count = int(paper.citedby_count)
        documents.append(document)
    if docs:
        return DocumentSet(docs=documents).union(docs)
    else:
        return DocumentSet(docs=documents)
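# Hedged usage of search_scopus() above; the query is a placeholder and ORCID
# lookups are disabled to keep the number of API calls small.
docset = search_scopus('TITLE-ABS-KEY("text mining") AND PUBYEAR IS 2019',
                       retrieve_orcid=False)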
Example No. 30
AUTHORS = {}
meta_authors = pubtrack.meta_author.get()['results']
for meta_author in meta_authors:
    for author in meta_author['authors']:
        if author['scopus_id']:
            full_name = '{} {}'.format(author['first_name'], author['last_name'])
            AUTHORS[author['scopus_id']] = full_name
            logger.info(' * Adding author {}({}) to be processed'.format(full_name, author['scopus_id']))
logger.info('==> Processing total of {} authors'.format(len(AUTHORS)))


DATE_LIMIT = datetime.datetime(year=SINCE, month=1, day=1)

for author_id, full_name in AUTHORS.items():
    publication_count = 0
    search = ScopusSearch(f'AU-ID ( {author_id} )')
    logger.info('STARTING SEARCH FOR AUTHOR {}({})'.format(full_name, author_id))

    for result in search.results:

        # We'll only take publications, which have a DOI
        if result.doi is None:
            continue

        # Requesting the detailed information from the scopus database for the current publication from the search
        # results
        try:
            abstract_retrieval = AbstractRetrieval(result.doi)
            logger.info(' * FETCHED publication {}'.format(result.doi))
        except Exception as e:
            logger.error(' ! Could not retrieve scopus abstract for DOI "{}". ERROR: {}'.format(result.doi, str(e)))