def author_yearly_productivity(self, df=None, colgroupby = 'AuthorId', datecol = 'Year', colcountby = 'PublicationId', show_progress=False):
    """
    Calculate the number of publications for each author in each year.

    Parameters
    ----------
    df : DataFrame, default None, Optional
        A DataFrame with the author2publication information.  If None then
        the database 'author2pub_df' is used.

    colgroupby : str, default 'AuthorId', Optional
        The DataFrame column with Author Ids.

    datecol : str, default 'Year', Optional
        The DataFrame column with Year information.

    colcountby : str, default 'PublicationId', Optional
        The DataFrame column with Publication Ids.

    show_progress : bool, default False, Optional
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        Productivity DataFrame with 3 columns: 'AuthorId', 'Year', 'YearlyProductivity'
    """
    if df is None:
        df = self.author2pub_df

    # a truthy show_progress is replaced by a label string for the progress bar
    if show_progress:
        show_progress = 'Yearly Productivity'

    # count the unique publications per (author, year) pair, then rename the
    # generated count column to 'YearlyProductivity'
    yearly_counts = groupby_count(df, [colgroupby, datecol], colcountby,
                                  count_unique=True, show_progress=show_progress)
    rename_map = zip2dict([str(colcountby) + 'Count', '0'], ['YearlyProductivity'] * 2)
    return yearly_counts.rename(columns=rename_map)
def author_productivity(pub2author_df, colgroupby = 'AuthorId', colcountby = 'PublicationId', show_progress=False):
    """
    Calculate the total number of publications for each author.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication information.

    colgroupby : str, default 'AuthorId', Optional
        The DataFrame column with Author Ids.

    colcountby : str, default 'PublicationId', Optional
        The DataFrame column with Publication Ids.

    show_progress : bool, default False, Optional
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        Productivity DataFrame with 2 columns: 'AuthorId', 'Productivity'
    """
    # a truthy show_progress is replaced by a label string for the progress bar
    if show_progress:
        show_progress = 'Author Productivity'

    # count unique publications per author and rename the generated count
    # column to 'Productivity'
    counts = groupby_count(pub2author_df, colgroupby, colcountby,
                           count_unique=True, show_progress=show_progress)
    rename_map = zip2dict([str(colcountby) + 'Count', '0'], ['Productivity'] * 2)
    return counts.rename(columns=rename_map)
def author_top_field(pub2author_df, colgroupby = 'AuthorId', colcountby = 'FieldId', fractional_field_counts = False, show_progress=False):
    """
    Calculate the most frequent field in the authors career.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication field information.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with Author Ids.

    colcountby : str, default 'FieldId'
        The DataFrame column with field information for each publication.

    fractional_field_counts : bool, default False
        How to count publications that are assigned to multiple fields:
            - If False, each publication-field assignment is counted once.
            - If True, each publication is counted once, contributing 1/#fields to each field.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'AuthorId', 'TopFieldId'
    """
    check4columns(pub2author_df, [colgroupby, 'PublicationId', colcountby])

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Author Top Field', disable= not show_progress)

    if not fractional_field_counts:
        # the most common field value within each author group
        author2field = pub2author_df.groupby(colgroupby)[colcountby].progress_apply(lambda x: x.mode()[0])

    else:
        # first calculate how many fields each publication maps to
        pub2nfields = groupby_count(pub2author_df, colgroupby='PublicationId', colcountby=colcountby)

        # each pub2field mapping is weighted by the number of fields for the publication.
        # BUG FIX: groupby_count names its count column str(colcountby)+'Count'
        # (here 'FieldIdCount'), as seen in the other uses of groupby_count in this
        # file; the original indexed the non-existent 'PublicationIdCount' and
        # raised a KeyError.
        countcol = str(colcountby) + 'Count'
        pub2nfields['PublicationWeight'] = 1.0 / pub2nfields[countcol]
        del pub2nfields[countcol]

        # merge counts
        author2field = pub2author_df.merge(pub2nfields, how='left', on='PublicationId')

        # custom weighted mode: the field with the largest summed publication weight
        def weighted_mode(adf):
            p = adf.groupby(colcountby)['PublicationWeight'].sum()
            return p.idxmax()

        # now take the weighted mode for each groupby column
        author2field = author2field.groupby(colgroupby).progress_apply(weighted_mode)

    newname_dict = zip2dict([str(colcountby), '0'], ['Top' + str(colcountby)]*2)
    return author2field.to_frame().reset_index().rename(columns=newname_dict)
def compute_yearly_citations(self, preprocess = True, show_progress=False):
    """
    Compute the number of citations received by each publication in each year.

    Parameters
    ----------
    preprocess : bool, default True, Optional
        If True, the yearly-citation DataFrame is saved in per-CitedYear HDF
        files under 'temporalimpact/' and nothing is returned.  If False, the
        DataFrame is returned instead.

    show_progress : bool, default False, Optional
        If True, print status messages as the computation proceeds.

    Returns
    -------
    DataFrame or None
        Yearly citation DataFrame with columns 'PublicationId', 'CitingYear',
        'YearlyCitations', 'CitedYear' (only when preprocess is False).
    """
    if show_progress:
        print("Starting Computation of Yearly Citations")

    # first load the publication year information
    pub2year = self.pub2year

    # now get the reference list and merge with year info
    pub2ref = self.pub2ref_df

    # publications missing from pub2year are assigned year 0
    pub2ref['CitingYear'] = [pub2year.get(citingpid, 0) for citingpid in pub2ref['CitingPublicationId'].values]

    # drop all citations that happened before the publication year
    pub2ref = pub2ref.loc[[citingyear >= pub2year.get(citedpid, 0) for citingyear, citedpid in pub2ref[['CitingYear', 'CitedPublicationId']].values]]

    if show_progress:
        print("Yearly Citation Data Prepared")

    # calculate the total citations per (cited publication, citing year)
    citation_df = groupby_count(pub2ref, colgroupby=['CitedPublicationId', 'CitingYear'], colcountby='CitingPublicationId', count_unique=True)

    citation_df.rename(columns={'CitingPublicationIdCount':'YearlyCitations', 'CitedPublicationId':'PublicationId'}, inplace=True)

    # get the Cited Year
    citation_df['CitedYear'] = [pub2year.get(pid, 0) for pid in citation_df['PublicationId'].values]

    # BUG FIX: 'CitedPublicationId' was renamed to 'PublicationId' above, so
    # sorting by the old column name raised a KeyError.
    citation_df.sort_values(by=['CitedYear', 'PublicationId', 'CitingYear'], inplace=True)

    if show_progress:
        print("Yearly Citations Found")

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'temporalimpact')):
            os.mkdir(os.path.join(self.path2database, 'temporalimpact'))

        # one HDF file per cited year
        for y, cdf in citation_df.groupby('CitedYear', sort=True):
            cdf.to_hdf(os.path.join(self.path2database, 'temporalimpact', 'temporalimpact{}.hdf'.format(y)), mode='w', key ='temporalimpact')

        if show_progress:
            print("Yearly Citations Saved")

    else:
        return citation_df
def credit_share(focus_pid, pub2ref_df, pub2author_df, temporal=False, normed=False, show_progress=False):
    """
    Calculate the credit share for each author of a publication based on :cite:`Shen2014credit`.

    Parameters
    ----------
    focus_pid : int, str
        The focus publication id.

    pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    pub2author_df : DataFrame
        A DataFrame with the author information for each Publication.

    temporal : bool, default False
        If True, compute the credit share broken down by each citing year.

    normed : bool, default False
        Normalize the sum of credit share to 1.0

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    credit_share, numpy array
        If temporal == False:
            The credit share vector, one entry per focus-publication author.
        If temporal == True:
            A matrix with one row per citing year (cumulative over years) and
            one column per focus-publication author.

    author2int, dict
        A mapping of the AuthorIds from the focus publication to the column
        of the credit share vector or matrix (see above).

    years, numpy array
        Only returned when temporal == True: the sorted citing years
        corresponding to the rows of the credit share matrix.

    Notes
    -----
    If the focus publication has no authors in pub2author_df, the function
    falls through both branches and implicitly returns None.
    """
    # the focus publication's authors
    focus_authors = np.sort(pub2author_df.loc[pub2author_df['PublicationId']==focus_pid]['AuthorId'].unique())
    author2int = {aid:i for i, aid in enumerate(focus_authors)}

    if focus_authors.shape[0] > 1:
        # start by getting the co-citation network around the focus publication
        adj_mat, cited2int = cocitation_network(pub2ref_df, focus_pub_ids=np.sort([focus_pid]), focus_constraint='egocited', temporal=temporal, show_progress=show_progress)

        # get the authorships for the publications in the cocitation network
        cocited_pubs = np.sort(list(cited2int.keys()))
        pa_df = pub2author_df.loc[isin_sorted(pub2author_df['PublicationId'].values, cocited_pubs)]

        if cocited_pubs.shape[0] > 0:
            # the credit allocation matrix has a row for each focus author, and a column for each cocited publication (including the focus pub)
            credit_allocation_mat = np.zeros((focus_authors.shape[0], cocited_pubs.shape[0]), dtype = float)

            # for each cocited publication, we count the number of authors
            # and assign to each focus author, their fractional share of the credit (1 divided by the number of authors)
            # NOTE: only authors that also appear on the focus publication receive credit
            for cocitedid, adf in pa_df.groupby('PublicationId'):
                author2row = [author2int[aid] for aid in adf['AuthorId'].unique() if not author2int.get(aid, None) is None]
                if len(author2row) > 0:
                    credit_allocation_mat[author2row, cited2int[cocitedid]] = 1.0/adf['AuthorId'].nunique()

            if temporal:
                # temporal credit allocation - broken down by year

                # we need the temporal citations to the focus article
                focus_citations = groupby_count(pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values, np.sort([focus_pid]))], colgroupby='CitingYear', colcountby='CitingPublicationId', count_unique=True, show_progress=False)
                focus_citations={y:c for y,c in focus_citations[['CitingYear', 'CitingPublicationIdCount']].values}

                # when temporal is True, a temporal adj mat is returned where each key is the year
                years = np.sort(list(adj_mat.keys()))

                cocite_counts = np.zeros((years.shape[0], cocited_pubs.shape[0]), dtype=float)

                for iy, y in enumerate(years):
                    # set the off-diagonal to be the total co-citations from that year
                    cocite_counts[iy] = adj_mat[y].tocsr()[cited2int[focus_pid]].todense()
                    # set the diagonal to be the total citations from that year
                    # NOTE(review): assumes every year in adj_mat also appears in
                    # focus_citations; a year with co-citations but no direct
                    # citation would raise KeyError — confirm against cocitation_network
                    cocite_counts[iy, cited2int[focus_pid]] = focus_citations[y]

                # accumulate the counts so each row reflects all citations up to that year
                cocite_counts = cocite_counts.cumsum(axis=0)

            else:
                # just do credit allocation with the full cocitation matrix
                cocite_counts = adj_mat.tocsr()[cited2int[focus_pid]].todense()

                # the co-citation matrix misses the number of citations to the focus publication
                # so explicitly calculate the number of citations to the focus publication
                cocite_counts[0,cited2int[focus_pid]] = pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values, np.sort([focus_pid]))]['CitingPublicationId'].nunique()

            # credit share is the matrix product of the credit_allocation_mat with cocite_counts
            credit_share = np.squeeze(np.asarray(credit_allocation_mat.dot(cocite_counts.T)))

            # normalize the credit share vector to sum to 1
            if normed:
                credit_share = credit_share/credit_share.sum(axis=0)

            if temporal:
                return credit_share, author2int, years
            else:
                return credit_share, author2int
        else:
            # no co-cited publications: credit is undefined, return None placeholders
            if temporal:
                years = np.sort(pub2ref_df.loc[pub2ref_df['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
                return np.array([[None for y in years] for a in author2int]), author2int, years
            else:
                return np.array([None for a in author2int]), author2int

    elif focus_authors.shape[0] == 1:
        # a single author always receives full credit
        if temporal:
            years = np.sort(pub2ref_df.loc[pub2ref_df['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
            return np.ones(shape=(1,years.shape[0])), author2int, years
        else:
            return np.array([1.0]), author2int
def compute_impact(self, preprocess=True, citation_horizons = [5,10], noselfcite = True):
    """
    Calculate several of the common citation indices.

    * 'Ctotal' : The total number of citations.
    * 'Ck' : The total number of citations within the first k years of publication, for each k value specified by `citation_horizons`.
    * 'Ctotal_noself' : The total number of citations with self-citations removed.
    * 'Ck_noself' : The total number of citations within the first k years of publication with self-citations removed, for each k value specified by `citation_horizons`.

    Parameters
    ----------
    preprocess : bool, default True, Optional
        If True then the impact measures are saved in preprocessed files
        (one HDF file per publication year under 'impact/') and nothing is
        returned; the DataFrame is returned only when preprocess is False.

    citation_horizons : list, default [5,10], Optional
        The citation time horizons (in years after publication) at which to
        count citations.

    noselfcite : bool, default True, Optional
        If True then the noselfcitation pub2ref files are also processed.

    Returns
    -------
    DataFrame or None
        The impact DataFrame with at least two columns:
        'PublicationId', 'Year', + citation columns
        (only returned when preprocess is False).
    """
    # first load the publication year information
    pub2year = self.load_pub2year()

    # now get the reference list and merge with year info
    pub2ref = self.pub2ref_df

    # drop all citations that happened before the publication year
    # (publications missing from pub2year are assigned year 0)
    pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) >= pub2year.get(citedpid, 0) for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

    # calculate the total citations
    citation_df = groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True)
    citation_df.rename(columns={'CitingPublicationIdCount':'Ctotal', 'CitedPublicationId':'PublicationId'}, inplace=True)

    # go from the largest k down: each pass keeps only citations within k years,
    # so the successively smaller horizons filter the already-filtered frame
    for k in np.sort(citation_horizons)[::-1]:
        # drop all citations that happened after the k-year horizon
        #pub2ref = pub2ref.loc[pub2ref['CitingPublicationYear'] <= pub2ref['CitedPublicationYear'] + k]
        pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) <= pub2year.get(citedpid, 0) + k for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

        # recalculate the impact
        k_citation_df = groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True)
        k_citation_df.rename(columns={'CitingPublicationIdCount':'C{}'.format(k), 'CitedPublicationId':'PublicationId'}, inplace=True)

        citation_df = citation_df.merge(k_citation_df, how='left', on='PublicationId')

    # get the Cited Year
    citation_df['Year'] = [pub2year.get(pid, 0) for pid in citation_df['PublicationId'].values]

    if noselfcite:
        # repeat the same computation on the self-citation-free reference list
        del pub2ref
        pub2ref = self.pub2refnoself_df

        # drop all citations that happened before the publication year
        pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) >= pub2year.get(citedpid, 0) for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

        # calculate the total citations
        citation_noself_df = groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True)
        citation_noself_df.rename(columns={'CitingPublicationIdCount':'Ctotal_noself', 'CitedPublicationId':'PublicationId'}, inplace=True)

        # go from the largest k down
        for k in np.sort(citation_horizons)[::-1]:
            # drop all citations that happened after the k-year horizon
            #pub2ref = pub2ref.loc[pub2ref['CitingPublicationYear'] <= pub2ref['CitedPublicationYear'] + k]
            pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) <= pub2year.get(citedpid, 0) + k for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

            # recalculate the impact
            k_citation_df = groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True)
            k_citation_df.rename(columns={'CitingPublicationIdCount':'C{}_noself'.format(k), 'CitedPublicationId':'PublicationId'}, inplace=True)

            citation_noself_df = citation_noself_df.merge(k_citation_df, how='left', on='PublicationId')

        citation_df = citation_df.merge(citation_noself_df, how='left', on='PublicationId')

    # set all nan to 0 (publications with no citations in a given horizon)
    citation_df.fillna(0, inplace=True)

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'impact')):
            os.mkdir(os.path.join(self.path2database, 'impact'))

        # one HDF file per publication year
        for y, cdf in citation_df.groupby('Year', sort=True):
            cdf.to_hdf(os.path.join(self.path2database, 'impact', 'impact{}.hdf'.format(y)), mode='w', key ='impact')

    else:
        return citation_df