def create_journalcitation_table(pubdf, pub2ref):
    """
    Build a journal-level citation table from publication-level citations.

    Parameters
    ----------
    pubdf : DataFrame
        Publication table; must contain 'PublicationId', 'JournalId' and 'Year'.

    pub2ref : DataFrame
        Citation table; must contain 'CitingPublicationId' and 'CitedPublicationId'.

    Returns
    -------
    jctable : DataFrame
        One row per row of `pub2ref` with the columns 'CitingYear', 'CitingJournalInt',
        'CitedYear' and 'CitedJournalInt'.  Both merges are left merges, so a
        publication missing from `pubdf` yields NaN in its year/journal columns.

    int2journal : dict
        Maps the integer journal index used in `jctable` back to the 'JournalId'.
    """
    required_pub_columns = ['PublicationId', 'JournalId', 'Year']
    check4columns(pubdf, required_pub_columns)
    # copy so that adding 'JournalInt' below never writes into the caller's DataFrame
    pubdf = pubdf[required_pub_columns].copy()

    required_pub2ref_columns = ['CitingPublicationId', 'CitedPublicationId']
    # validate the citation table against its own column list (not the publication one)
    check4columns(pub2ref, required_pub2ref_columns)
    pub2ref = pub2ref[required_pub2ref_columns]

    # journals are indexed by their position in the sorted list of unique ids
    journals = np.sort(pubdf['JournalId'].unique())
    journal2int = {j: i for i, j in enumerate(journals)}
    pubdf['JournalInt'] = pubdf['JournalId'].map(journal2int)

    pubinfo = pubdf[['PublicationId', 'Year', 'JournalInt']]

    # attach the year and journal of the citing publication
    jctable = pub2ref.merge(pubinfo, how='left', left_on='CitingPublicationId', right_on='PublicationId')
    # rename must target the columns and be assigned back, otherwise the second
    # merge produces ambiguous 'Year_x'/'Year_y' style names
    jctable = jctable.rename(columns={'Year': 'CitingYear', 'JournalInt': 'CitingJournalInt'})
    jctable = jctable.drop(columns=['PublicationId', 'CitingPublicationId'])

    # attach the year and journal of the cited publication
    jctable = jctable.merge(pubinfo, how='left', left_on='CitedPublicationId', right_on='PublicationId')
    jctable = jctable.rename(columns={'Year': 'CitedYear', 'JournalInt': 'CitedJournalInt'})
    jctable = jctable.drop(columns=['PublicationId', 'CitedPublicationId'])

    return jctable, {i: j for j, i in journal2int.items()}
def author_top_field(pub2author_df, colgroupby = 'AuthorId', colcountby = 'FieldId', fractional_field_counts = False, show_progress=False):
    """
    Calculate the most frequent field in each author's career.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author, publication and field information.  Must contain
        the `colgroupby` and `colcountby` columns and 'PublicationId'.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with the Author Ids.

    colcountby : str, default 'FieldId'
        The DataFrame column with the Field Ids whose frequency is counted.

    fractional_field_counts : bool, default False
        How to count publications that are assigned to multiple fields:
            - If False, each publication-field assignment is counted once.
            - If True, each publication is counted once, contributing 1/#fields to each field.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: `colgroupby` and 'Top' + `colcountby`
        ('AuthorId', 'TopFieldId' with the defaults).

    """
    check4columns(pub2author_df, [colgroupby, 'PublicationId', colcountby])

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Author Top Field', disable= not show_progress)

    if not fractional_field_counts:
        # plain mode; ties are resolved by taking the first value returned by mode()
        author2field = pub2author_df.groupby(colgroupby)[colcountby].progress_apply(lambda x: x.mode()[0])

    else:
        # first calculate how many fields each publication maps to;
        # the count lands in the column named str(colcountby) + 'Count'
        count_column = str(colcountby) + 'Count'
        pub2nfields = groupby_count(pub2author_df, colgroupby='PublicationId', colcountby=colcountby)

        # each pub2field mapping is weighted by the number of fields for the publication
        pub2nfields['PublicationWeight'] = 1.0 / pub2nfields[count_column]
        del pub2nfields[count_column]

        # merge the weights back onto the author-publication-field rows
        author2field = pub2author_df.merge(pub2nfields, how='left', on='PublicationId')

        # weighted mode: the field with the largest total publication weight
        def weighted_mode(adf):
            p = adf.groupby(colcountby)['PublicationWeight'].sum()
            return p.idxmax()

        # now take the weighted mode for each author
        author2field = author2field.groupby(colgroupby).progress_apply(weighted_mode)

    # Name the Series before framing it: the apply result of the fractional branch is
    # unnamed, so its column would be the integer 0 and a rename keyed on '0' would miss it.
    return author2field.rename('Top' + str(colcountby)).to_frame().reset_index()
def compute_cnorm(pub2ref, pub2year):
    """
    This function calculates the cnorm for publications.

    Not implemented yet: calling it always raises ``NotImplementedError``.

    Parameters
    ----------
    pub2ref : DataFrame
        Intended to hold the citation links ('CitingPublicationId', 'CitedPublicationId').

    pub2year : dict
        Intended to map each PublicationId to its publication year.

    Raises
    ------
    NotImplementedError
        Always.

    References
    ----------
    .. [h] Ke, Q., Gates, A. J., Barabasi, A.-L. (2020): "title", *in submission*. DOI: xxx

    """
    # The draft that used to follow the raise was unreachable and could not have run
    # (it referenced an undefined column list and called np.mean() with no arguments),
    # so only the explicit stub is kept.
    raise NotImplementedError("compute_cnorm is not implemented yet.")
def publication_beauty(pub2ref_df, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', show_progress=False):
    """
    Calculate the sleeping beauty and awakening time for each cited publication.

    The yearly citation counts of every cited publication are computed first, and
    `beauty_coefficient` is then applied to each publication's sequence of counts.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the temporal citing information.  Must contain the
        `colgroupby` and `colcountby` columns and 'CitingYear'.

    colgroupby : str, default 'CitedPublicationId', Optional
        The DataFrame column with the cited Publication Ids.

    colcountby : str, default 'CitingPublicationId', Optional
        The DataFrame column with the citing Publication Ids; unique values are
        counted per cited publication and citing year.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    The result of transforming the yearly counts with `beauty_coefficient`.
    NOTE(review): the rename below targets the columns `colgroupby` + 'Beauty' and
    'Awakening'; the exact shape depends on what `beauty_coefficient` returns - confirm.

    """
    # validate the columns that are actually used below, not hard-coded defaults
    check4columns(pub2ref_df, [colgroupby, colcountby, 'CitingYear'])

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Beauty', disable=not show_progress)

    # number of unique citing publications per (cited publication, citing year)
    yearly_counts = groupby_count(pub2ref_df, colgroupby=[colgroupby, 'CitingYear'],
        colcountby=colcountby, count_unique=True)

    newname_dict = zip2dict([str(colcountby), '0', '1'], [str(colgroupby) + 'Beauty'] * 2 + ['Awakening'])
    return yearly_counts.groupby(colgroupby)[str(colcountby) + 'Count'].progress_transform(
        beauty_coefficient).rename(columns=newname_dict)
def temporal_cocited_edgedict(pub2ref, pub2year):
    """
    Build the yearly citation counts and yearly co-citation neighbour sets.

    Every citing publication contributes to the year given by `pub2year`: each of
    its references receives one citation, and every pair of its references becomes
    a co-citation edge (stored in both directions).

    Parameters
    ----------
    pub2ref : DataFrame
        Citation links; must contain 'CitingPublicationId' and 'CitedPublicationId'.

    pub2year : dict
        Maps each PublicationId to its year.  Every citing publication in `pub2ref`
        must be a key, otherwise a KeyError is raised.

    Returns
    -------
    temporal_cocitation_dict : dict
        ``{year: defaultdict(set)}`` mapping a cited publication to the set of
        publications it was co-cited with by publications of that year.

    temporal_citation_dict : dict
        ``{year: defaultdict(int)}`` mapping a cited publication to the number of
        citations it received from publications of that year.

    """
    required_pub2ref_columns = ['CitingPublicationId', 'CitedPublicationId']
    check4columns(pub2ref, required_pub2ref_columns)
    pub2ref = pub2ref[required_pub2ref_columns]

    year_values = sorted(set(pub2year.values()))

    # we need the citation counts and cocitation network
    temporal_cocitation_dict = {y: defaultdict(set) for y in year_values}
    temporal_citation_dict = {y: defaultdict(int) for y in year_values}

    # Iterate the groups explicitly rather than through DataFrame.groupby.apply:
    # the work is pure side effect, and older pandas releases call the applied
    # function twice on the first group, which would double count it.
    references_by_citing = pub2ref.groupby('CitingPublicationId', sort=False)['CitedPublicationId']
    for citingpid, cited in references_by_citing:
        y = pub2year[citingpid]
        citedpids = cited.values
        year_citations = temporal_citation_dict[y]
        year_cocitations = temporal_cocitation_dict[y]

        for citedpid in citedpids:
            year_citations[citedpid] += 1

        for icitedpid, jcitedpid in combinations(citedpids, 2):
            year_cocitations[icitedpid].add(jcitedpid)
            year_cocitations[jcitedpid].add(icitedpid)

    # the dictionaries used to be built and then discarded
    return temporal_cocitation_dict, temporal_citation_dict
def coauthorship_network(paa_df, focus_author_ids=None, focus_constraint='authors', show_progress=False):
    """
    Build the weighted co-authorship adjacency matrix.

    Two authors are linked once for every publication on which both appear.

    Parameters
    ----------
    paa_df : DataFrame
        The publication-author links; must contain 'AuthorId' and 'PublicationId'.
        Rows with a missing value in either column are ignored.

    focus_author_ids : numpy array or list, default None
        The AuthorIds that seed the network.  If None the whole of `paa_df` is used.

    focus_constraint : str, default `authors`
        Only used when `focus_author_ids` is given:
            `authors` : keep only the links of the focus authors, i.e. the
                co-authorships among the focus authors themselves.
            `publications` : keep every link of the publications written by a focus
                author, i.e. all co-authorships involving at least one focus author.
            'ego' : keep every link of every author who shares a publication with a
                focus author, so co-authorships among those co-authors are included.
        Any other value leaves `paa_df` unfiltered.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    sparse matrix
        The symmetric adjacency matrix of the co-authorship network.

    author2int, dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.

    """
    link_columns = ['AuthorId', 'PublicationId']
    check4columns(paa_df, link_columns)
    paa_df = paa_df[link_columns].dropna()

    def _sorted_unique(frame, column):
        # sorted unique values of one column, ready to be used with isin_sorted
        return np.sort(frame[column].unique())

    def _restrict(frame, column, sorted_ids):
        # rows of `frame` whose `column` value appears in `sorted_ids`
        return frame.loc[isin_sorted(frame[column].values, sorted_ids)]

    if focus_author_ids is not None:
        focus_author_ids = np.sort(focus_author_ids)

        if focus_constraint == 'authors':
            # only the links that belong to a focus author
            paa_df = _restrict(paa_df, 'AuthorId', focus_author_ids)

        elif focus_constraint == 'publications':
            # every publication with a focus author ...
            seed_pubs = _sorted_unique(_restrict(paa_df, 'AuthorId', focus_author_ids), 'PublicationId')
            # ... and all of the links of those publications
            paa_df = _restrict(paa_df, 'PublicationId', seed_pubs)
            del seed_pubs

        elif focus_constraint == 'ego':
            # every publication with a focus author ...
            seed_pubs = _sorted_unique(_restrict(paa_df, 'AuthorId', focus_author_ids), 'PublicationId')
            # ... widens the author set to everyone on those publications ...
            focus_author_ids = _sorted_unique(_restrict(paa_df, 'PublicationId', seed_pubs), 'AuthorId')
            del seed_pubs
            # ... and finally all of the links of this widened author set
            paa_df = _restrict(paa_df, 'AuthorId', focus_author_ids)

    # authors are indexed by their position in the sorted list of unique ids
    ordered_authors = np.sort(paa_df['AuthorId'].unique())
    author2int = dict(zip(ordered_authors, range(len(ordered_authors))))
    Nauthors = len(author2int)

    adj_mat = sparse.dok_matrix((Nauthors, Nauthors), dtype=int)

    def coauthor_cluster(author_list):
        # a publication with a single author contributes no edge
        if author_list.shape[0] < 2:
            return
        for first_author, second_author in combinations(author_list, 2):
            adj_mat[author2int[first_author], author2int[second_author]] += 1

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='CoAuthorship Relations', leave=True, disable=not show_progress)

    # add the edges of every publication to the adjacency matrix
    authors_by_publication = paa_df.groupby('PublicationId')['AuthorId']
    authors_by_publication.progress_apply(coauthor_cluster)

    # each pair was stored in one direction only, so mirror it
    adj_mat = adj_mat + adj_mat.transpose()

    return adj_mat, author2int
def compute_raostriling_interdisciplinarity(pub2ref_df, pub2field_df, focus_pub_ids=None, pub2field_norm=True, temporal=False,
    citation_direction='references', field_distance_metric='cosine', distance_matrix=None, show_progress=False):
    """
    Calculate the RaoStirling index as a measure of a publication's interdisciplinarity.
    See :cite:`stirling20` for the definition and :cite:`gates2019naturereach` for an application.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.  Must contain
        'CitedPublicationId' and 'CitingPublicationId' (and 'CitingYear' when `temporal`).

    pub2field_df : DataFrame
        A DataFrame with the field information for each Publication.  Must contain
        'PublicationId' and 'FieldId'.  It is not modified.

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to calculate interdisciplinarity.  The field
        distance matrix is still computed from all of the citation links.

    pub2field_norm : bool, default True
        When a publication occurs in m > 1 fields, count the publication 1/m times in each field.
        Normalizes the membership vector so it sums to 1 for each publication.

    temporal : bool, default False
        If True, compute the distance matrix using only publications for each year.

    citation_direction : str, default `references`
        `references` : the fields are defined by a publication's references.
        `citations` : the fields are defined by a publication's citations.

    field_distance_metric : str, default `cosine`
        The interfield distance metric, passed to `pairwise_distances`
        (e.g. 'cosine', 'euclidean', 'l1', 'l2').

    distance_matrix : numpy array, default None
        The precomputed field distance matrix, of shape (Nfields, Nfields), or
        (Nyears, Nfields, Nfields) when `temporal`.  Computed when None.

    show_progress : bool, default False
        Currently unused by this function.

    Returns
    -------
    rsdf : DataFrame
        Columns 'PublicationId' and 'RaoStirling' (plus 'CitingYear' when `temporal`).

    distance_matrix : numpy array
        The field distance matrix that was used.

    field2int : dict
        Mapping from FieldId to the row/column of the distance matrix.

    years : numpy array
        Only when `temporal`: the sorted years indexing the first axis of the distance matrix.

    Raises
    ------
    pySciSciMetricError
        If `citation_direction` is not recognised or the precomputed distance matrix
        has the wrong shape.

    """
    # map citing and cited to the source and target depending on `citation_direction`
    if citation_direction == 'references':
        pub2ref_rename_dict = {'CitedPublicationId': 'TargetId', 'CitingPublicationId': 'SourceId'}
    elif citation_direction == 'citations':
        pub2ref_rename_dict = {'CitedPublicationId': 'SourceId', 'CitingPublicationId': 'TargetId'}
    else:
        raise pySciSciMetricError("citation_direction must be either 'references' or 'citations'.")

    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append('CitingYear')
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna()

    check4columns(pub2field_df, ['PublicationId', 'FieldId'])
    # work on a private copy: the columns rewritten below must not leak into the caller's frame
    pub2field_df = pub2field_df.copy()

    # to leverage matrix operations we need to map fields to the rows/cols of the matrix
    field2int = {fid: i for i, fid in enumerate(np.sort(pub2field_df['FieldId'].unique()))}
    pub2field_df['FieldId'] = [field2int[fid] for fid in pub2field_df['FieldId'].values]
    Nfields = len(field2int)

    if temporal:
        years = np.sort(pub2ref_df['CitingYear'].unique())
        year2int = {y: i for i, y in enumerate(years)}
        Nyears = years.shape[0]

    # check that the precomputed distance matrix is the correct size
    if distance_matrix is not None:
        if not temporal and np.shape(distance_matrix) != (Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed_distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications passed.'
            )
        elif temporal and np.shape(distance_matrix) != (Nyears, Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed_distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications and years passed.'
            )

    # the assignment of a publication to a field is 1/(number of fields) when normalized, and 1 otherwise
    if pub2field_norm:
        pub2nfields = pub2field_df.groupby('PublicationId')['FieldId'].nunique()
    else:
        pub2nfields = defaultdict(lambda: 1)
    pub2field_df['PubFieldContribution'] = [1.0 / pub2nfields[pid] for pid in pub2field_df['PublicationId'].values]

    pub2ref_df = pub2ref_df.rename(columns=pub2ref_rename_dict)

    # merge the references to the fields for the target fields
    pub2ref_df = pub2ref_df.merge(pub2field_df, how='left', left_on='TargetId', right_on='PublicationId').rename(
        columns={'FieldId': 'TargetFieldId', 'PubFieldContribution': 'TargetPubFieldContribution'})
    del pub2ref_df['PublicationId']

    # we need to calculate the field 2 field distance matrix
    if distance_matrix is None:

        # merge the references to the fields for the source fields
        pub2ref_df = pub2ref_df.merge(pub2field_df, how='left', left_on='SourceId', right_on='PublicationId').rename(
            columns={'FieldId': 'SourceFieldId', 'PubFieldContribution': 'SourcePubFieldContribution'})
        del pub2ref_df['PublicationId']

        # drop any citation relationships for which we dont have field information
        pub2ref_df.dropna(inplace=True)

        # we need to use integer ids to map to the matrix
        pub2ref_df[['SourceFieldId', 'TargetFieldId']] = pub2ref_df[['SourceFieldId', 'TargetFieldId']].astype(int)

        # in the field2field distance matrix, the weighted contribution from a source publication
        # in multiple fields is the product of the source and target contributions
        pub2ref_df['SourcePubFieldContribution'] = pub2ref_df['SourcePubFieldContribution'] * pub2ref_df['TargetPubFieldContribution']

        # differentiate between the temporal and the static RS
        if temporal:
            # make the temporal distance matrix
            distance_matrix = np.zeros((Nyears, Nfields, Nfields))

            for y, ydf in pub2ref_df.groupby('CitingYear'):
                # calculate the field representation vectors for this year only
                yfield2field_mat = dataframe2bipartite(df=ydf, rowname='SourceFieldId', colname='TargetFieldId',
                    shape=(Nfields, Nfields), weightname='SourcePubFieldContribution')

                # now compute the distance matrix for this year only
                distance_matrix[year2int[y]] = pairwise_distances(yfield2field_mat, metric=field_distance_metric)

        else:
            # calculate the field representation vectors
            field2field_mat = dataframe2bipartite(df=pub2ref_df, rowname='SourceFieldId', colname='TargetFieldId',
                shape=(Nfields, Nfields), weightname='SourcePubFieldContribution')

            # now compute the distance matrix
            distance_matrix = pairwise_distances(field2field_mat, metric=field_distance_metric)

        # we no longer need the 'SourceFieldId' or 'SourcePubFieldContribution' so cleanup
        del pub2ref_df['SourceFieldId']
        del pub2ref_df['SourcePubFieldContribution']
        pub2ref_df.drop_duplicates(subset=['SourceId', 'TargetId', 'TargetFieldId'], inplace=True)

    # restrict the RaoStirling calculation (but not the distance matrix) to the focus publications
    if focus_pub_ids is not None:
        pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['SourceId'].values, np.sort(focus_pub_ids))]

    # Now we start on the RaoStirling calculation

    # drop any citation relationships for which we dont have field information
    pub2ref_df = pub2ref_df.dropna()

    if temporal:

        rsdf = []
        for y, ydf in pub2ref_df.groupby('CitingYear'):

            # for each year, we need to map individual publications to the rows of our matrix;
            # keep the sorted original ids so they can be reported back
            ypubs = np.sort(ydf['SourceId'].unique())
            ypub2int = {pid: i for i, pid in enumerate(ypubs)}
            ydf = ydf.copy()
            ydf['SourceId'] = [ypub2int[pid] for pid in ydf['SourceId'].values]
            ydf[['SourceId', 'TargetFieldId']] = ydf[['SourceId', 'TargetFieldId']].astype(int)
            yNpubs = len(ypub2int)

            # calculate the publication representation vectors over fields
            ypub2field_mat = dataframe2bipartite(df=ydf, rowname='SourceId', colname='TargetFieldId',
                shape=(yNpubs, Nfields), weightname='TargetPubFieldContribution').tocsr()

            # make sure the publication 2 field vector is normalized
            ypub2field_mat = normalize(ypub2field_mat, norm='l1', axis=1)

            # finally, we calculate the matrix representation of the RS measure.
            # The sparse matrix drives the elementwise product because the dot with a
            # dense distance matrix yields a dense array, which has no `multiply` method.
            yrs = 0.5 * np.ravel(np.asarray(
                ypub2field_mat.multiply(ypub2field_mat.dot(distance_matrix[year2int[y]])).sum(axis=1)))

            rsdf.append(pd.DataFrame({'PublicationId': ypubs, 'RaoStirling': yrs, 'CitingYear': y},
                columns=['PublicationId', 'RaoStirling', 'CitingYear']))

        if rsdf:
            rsdf = pd.concat(rsdf)
        else:
            rsdf = pd.DataFrame(columns=['PublicationId', 'RaoStirling', 'CitingYear'])

        return rsdf, distance_matrix, field2int, years

    else:

        # first map individual publications to the rows of our matrix;
        # keep the sorted original ids so they can be reported back
        pubs = np.sort(pub2ref_df['SourceId'].unique())
        pub2int = {pid: i for i, pid in enumerate(pubs)}
        pub2ref_df = pub2ref_df.copy()
        pub2ref_df['SourceId'] = [pub2int[pid] for pid in pub2ref_df['SourceId'].values]
        pub2ref_df[['SourceId', 'TargetFieldId']] = pub2ref_df[['SourceId', 'TargetFieldId']].astype(int)
        Npubs = len(pub2int)

        # calculate the publication representation vectors over fields
        pub2field_mat = dataframe2bipartite(df=pub2ref_df, rowname='SourceId', colname='TargetFieldId',
            shape=(Npubs, Nfields), weightname='TargetPubFieldContribution').tocsr()

        # make sure the publication 2 field vector is normalized
        pub2field_mat = normalize(pub2field_mat, norm='l1', axis=1)

        # finally, we calculate the matrix representation of the RS measure
        rs = 0.5 * np.ravel(np.asarray(
            pub2field_mat.multiply(pub2field_mat.dot(distance_matrix)).sum(axis=1)))

        rsdf = pd.DataFrame({'PublicationId': pubs, 'RaoStirling': rs}, columns=['PublicationId', 'RaoStirling'])

        return rsdf, distance_matrix, field2int
def raostriling_interdisciplinarity(pub2ref_df, pub2field_df, focus_pub_ids=None, pub2field_norm=True, temporal=False,
    citation_direction='references', field_distance_metric='cosine', distance_matrix=None, show_progress=False):
    """
    Calculate the RaoStirling index as a measure of a publication's interdisciplinarity.
    See :cite:`stirling20` for the definition and :cite:`gates2019naturereach` for an application.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.  Must contain
        'CitedPublicationId' and 'CitingPublicationId'; when `temporal` also 'CitingYear'
        (`references`) or 'CitedYear' (`citations`).

    pub2field_df : DataFrame
        A DataFrame with the field information for each Publication.  Must contain
        'PublicationId' and 'FieldId'.  It is not modified.

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to calculate interdisciplinarity.

    pub2field_norm : bool, default True
        When a publication occurs in m > 1 fields, count the publication 1/m times in each field.
        Normalizes the membership vector so it sums to 1 for each publication.

    temporal : bool, default False
        If True, compute the distance matrix using only publications for each year.

    citation_direction : str, default `references`
        `references` : the fields are defined by a publication's references.
        `citations` : the fields are defined by a publication's citations.

    field_distance_metric : str, default `cosine`
        The interfield distance metric, forwarded to `field_citation_distance`.

    distance_matrix : DataFrame or numpy array, default None
        The precomputed field distances.  Either a DataFrame with the columns
        'iFieldId', 'jFieldId', 'FieldDistance' (plus the year column when `temporal`),
        or an array of shape (Nfields, Nfields) / (Nyears, Nfields, Nfields).
        When None it is computed with `field_citation_distance`.

    show_progress : bool, default False
        Forwarded to `field_citation_distance`.

    Returns
    -------
    DataFrame
        Columns 'RaoStirling' and 'PublicationId'; when `temporal` the columns are
        'PublicationId', 'CitingYear' and 'RaoStirling'.

    Raises
    ------
    pySciSciMetricError
        If `citation_direction` is not recognised or a precomputed distance array
        has the wrong shape.

    """
    # map citing and cited to the source and target depending on `citation_direction`
    if citation_direction == 'references':
        pub2ref_rename_dict = {'CitedPublicationId': 'TargetId', 'CitingPublicationId': 'SourceId'}
        year_col = 'CitingYear'
    elif citation_direction == 'citations':
        pub2ref_rename_dict = {'CitedPublicationId': 'SourceId', 'CitingPublicationId': 'TargetId'}
        year_col = 'CitedYear'
    else:
        raise pySciSciMetricError("citation_direction must be either 'references' or 'citations'.")

    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append(year_col)
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna().copy(deep=True)

    check4columns(pub2field_df, ['PublicationId', 'FieldId'])
    pub2field_df = pub2field_df.copy(deep=True)

    # compute the field distances from the full citation table when none were passed
    if distance_matrix is None:
        distance_matrix = field_citation_distance(pub2ref_df, pub2field_df, pub2field_norm, temporal,
            citation_direction, field_distance_metric, show_progress)

    # to leverage matrix operations we need to map fields to the rows/cols of the matrix
    field2int = {fid: i for i, fid in enumerate(np.sort(pub2field_df['FieldId'].unique()))}
    pub2field_df['FieldId'] = [field2int[fid] for fid in pub2field_df['FieldId'].values]
    Nfields = len(field2int)

    pub2ref_df.rename(columns=pub2ref_rename_dict, inplace=True)

    if focus_pub_ids is not None:
        # the ids are sorted first, as done by the other callers of isin_sorted in this module
        pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['SourceId'].values, np.sort(focus_pub_ids))]

    if temporal:
        years = np.sort(pub2ref_df[year_col].unique())
        year2int = {y: i for i, y in enumerate(years)}
        Nyears = years.shape[0]

    def _fieldids2int(distance_df):
        # translate the field ids of a distance table to matrix indices, dropping unknown fields
        distance_df['iFieldId'] = [field2int.get(fid, None) for fid in distance_df['iFieldId'].values]
        distance_df['jFieldId'] = [field2int.get(fid, None) for fid in distance_df['jFieldId'].values]
        distance_df.dropna(inplace=True)
        # the None placeholders turn the columns into floats; restore integer indices
        distance_df[['iFieldId', 'jFieldId']] = distance_df[['iFieldId', 'jFieldId']].astype(int)
        return distance_df

    if isinstance(distance_matrix, pd.DataFrame) and temporal:
        check4columns(distance_matrix, ['iFieldId', 'jFieldId', year_col, 'FieldDistance'])

        distance_matrix = distance_matrix.loc[isin_sorted(distance_matrix[year_col].values, years)].copy(deep=True)
        distance_matrix = _fieldids2int(distance_matrix)

        tdm = np.zeros((Nyears, Nfields, Nfields))
        for y in years:
            # the table only stores one triangle, so mirror it to get a symmetric matrix
            ymat = dataframe2bipartite(df=distance_matrix[distance_matrix[year_col] == y], rowname='iFieldId',
                colname='jFieldId', shape=(Nfields, Nfields), weightname='FieldDistance').todense()
            tdm[year2int[y]] = ymat + ymat.T

        distance_matrix = tdm

    elif isinstance(distance_matrix, pd.DataFrame) and not temporal:
        check4columns(distance_matrix, ['iFieldId', 'jFieldId', 'FieldDistance'])
        distance_matrix = _fieldids2int(distance_matrix.copy(deep=True))
        distance_matrix = dataframe2bipartite(df=distance_matrix, rowname='iFieldId', colname='jFieldId',
            shape=(Nfields, Nfields), weightname='FieldDistance').todense()
        # the table only stores one triangle, so mirror it to get a symmetric matrix
        distance_matrix = distance_matrix + distance_matrix.T

    elif isinstance(distance_matrix, np.ndarray):
        # np.matrix is a subclass of np.ndarray, so it is covered as well
        if not temporal and distance_matrix.shape != (Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed_distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications passed.'
            )
        elif temporal and distance_matrix.shape != (Nyears, Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed_distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications and years passed.'
            )

    # the assignment of a publication to a field is 1/(number of fields) when normalized, and 1 otherwise
    if pub2field_norm:
        pub2nfields = pub2field_df.groupby('PublicationId')['FieldId'].nunique()
    else:
        pub2nfields = defaultdict(lambda: 1)
    pub2field_df['PubFieldContribution'] = [1.0 / pub2nfields[pid] for pid in pub2field_df['PublicationId'].values]

    # merge the references to the fields for the target fields
    pub2ref_df = pub2ref_df.merge(pub2field_df, how='left', left_on='TargetId', right_on='PublicationId').rename(
        columns={'FieldId': 'TargetFieldId', 'PubFieldContribution': 'TargetPubFieldContribution'})
    del pub2ref_df['PublicationId']

    # drop any citation relationships for which we dont have field information
    pub2ref_df.dropna(inplace=True)

    # Now we start on the RaoStirling calculation
    if temporal:

        rsdf = []
        for y, ydf in pub2ref_df.groupby(year_col):

            # for each year, we need to map individual publications to the rows of our matrix;
            # keep the sorted original ids so they can be reported back
            ypubs = np.sort(ydf['SourceId'].unique())
            ypub2int = {pid: i for i, pid in enumerate(ypubs)}
            ydf = ydf.copy()
            ydf['SourceId'] = [ypub2int[pid] for pid in ydf['SourceId'].values]
            ydf[['SourceId', 'TargetFieldId']] = ydf[['SourceId', 'TargetFieldId']].astype(int)
            yNpubs = len(ypub2int)

            # calculate the publication representation vectors over fields
            ypub2field_mat = dataframe2bipartite(df=ydf, rowname='SourceId', colname='TargetFieldId',
                shape=(yNpubs, Nfields), weightname='TargetPubFieldContribution').tocsr()

            # make sure the publication 2 field vector is normalized
            ypub2field_mat = normalize(ypub2field_mat, norm='l1', axis=1)

            # finally, we calculate the matrix representation of the RS measure
            yrsdf = pd.DataFrame()
            yrsdf['PublicationId'] = ypubs
            # NOTE(review): the column is named 'CitingYear' even when the years come from
            # 'CitedYear' (citation_direction='citations'); kept for backward compatibility.
            yrsdf['CitingYear'] = y
            # ravel (not squeeze) so that a single publication still yields a length-1 vector
            yrsdf['RaoStirling'] = 0.5 * np.ravel(np.asarray(
                ypub2field_mat.dot(spsparse.csr_matrix(distance_matrix[year2int[y]])).multiply(
                    ypub2field_mat).sum(axis=1)))

            rsdf.append(yrsdf)

        if rsdf:
            rsdf = pd.concat(rsdf)
        else:
            rsdf = pd.DataFrame(columns=['PublicationId', 'CitingYear', 'RaoStirling'])

        return rsdf

    else:

        # first map individual publications to the rows of our matrix;
        # keep the sorted original ids so they can be reported back
        pubs = np.sort(pub2ref_df['SourceId'].unique())
        pub2int = {pid: i for i, pid in enumerate(pubs)}
        pub2ref_df['SourceId'] = [pub2int[pid] for pid in pub2ref_df['SourceId'].values]
        pub2ref_df[['SourceId', 'TargetFieldId']] = pub2ref_df[['SourceId', 'TargetFieldId']].astype(int)
        Npubs = len(pub2int)

        # calculate the publication representation vectors over fields
        pub2field_mat = dataframe2bipartite(df=pub2ref_df, rowname='SourceId', colname='TargetFieldId',
            shape=(Npubs, Nfields), weightname='TargetPubFieldContribution').tocsr()

        # make sure the publication 2 field vector is normalized
        pub2field_mat = normalize(pub2field_mat, norm='l1', axis=1)

        distance_matrix = spsparse.csr_matrix(distance_matrix)

        # finally, we calculate the matrix representation of the RS measure
        rsdf = pd.DataFrame()
        # ravel (not squeeze) so that a single publication still yields a length-1 vector
        rsdf['RaoStirling'] = 0.5 * np.ravel(np.asarray(
            pub2field_mat.dot(distance_matrix).multiply(pub2field_mat).sum(axis=1)))
        rsdf['PublicationId'] = pubs

        return rsdf
def _field2field_citation_matrix(linkdf, pub2field_df, Nfields):
    """Return the (Nfields, Nfields) weighted field-to-field citation matrix for the links in `linkdf`."""
    # merge the links to the fields of the source publications
    linkdf = linkdf.merge(pub2field_df, how='left', left_on='SourceId', right_on='PublicationId').rename(
        columns={'FieldId': 'SourceFieldId', 'PubFieldContribution': 'SourcePubFieldContribution'})
    del linkdf['PublicationId']

    # merge the links to the fields of the target publications
    linkdf = linkdf.merge(pub2field_df, how='left', left_on='TargetId', right_on='PublicationId').rename(
        columns={'FieldId': 'TargetFieldId', 'PubFieldContribution': 'TargetPubFieldContribution'})
    del linkdf['PublicationId']

    # drop any citation relationships for which we dont have field information
    linkdf.dropna(inplace=True)

    # we need to use integer ids to map to the matrix
    linkdf[['SourceFieldId', 'TargetFieldId']] = linkdf[['SourceFieldId', 'TargetFieldId']].astype(int)

    # the weighted contribution of a link between publications in multiple fields
    # is the product of the source and target contributions
    linkdf['SourcePubFieldContribution'] = linkdf['SourcePubFieldContribution'] * linkdf['TargetPubFieldContribution']

    return dataframe2bipartite(df=linkdf, rowname='SourceFieldId', colname='TargetFieldId',
        shape=(Nfields, Nfields), weightname='SourcePubFieldContribution')


def field_citation_distance(pub2ref_df, pub2field_df, pub2field_norm=True, temporal=True,
    citation_direction='references', field_distance_metric='cosine', show_progress=False):
    """
    Calculate the field distance matrix based on references or citations.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.  Must contain
        'CitedPublicationId' and 'CitingPublicationId'; when `temporal` also 'CitingYear'
        (`references`) or 'CitedYear' (`citations`).

    pub2field_df : DataFrame
        A DataFrame with the field information for each Publication.  Must contain
        'PublicationId' and 'FieldId'.  It is not modified.

    pub2field_norm : bool, default True
        When a publication occurs in m > 1 fields, count the publication 1/m times in each field.
        Normalizes the membership vector so it sums to 1 for each publication.

    temporal : bool, default True
        If True, compute the distance matrix using only publications for each year.

    citation_direction : str, default `references`
        `references` : the fields are defined by a publication's references.
        `citations` : the fields are defined by a publication's citations.

    field_distance_metric : str, default `cosine`
        The interfield distance metric, passed to `pairwise_distances`
        (e.g. 'cosine', 'euclidean', 'l1', 'l2').

    show_progress : bool, default False
        Currently unused by this function.

    Returns
    -------
    Distance DataFrame
        if temporal is True
            DataFrame with 4 columns: iFieldId, jFieldId, the year column, and FieldDistance
        if temporal is False
            DataFrame with 3 columns: iFieldId, jFieldId, FieldDistance
        Only the non-zero distances of the strict upper triangle are listed.

    Raises
    ------
    pySciSciMetricError
        If `citation_direction` is not recognised.

    """
    # map citing and cited to the source and target depending on `citation_direction`
    if citation_direction == 'references':
        pub2ref_rename_dict = {'CitedPublicationId': 'TargetId', 'CitingPublicationId': 'SourceId'}
        year_col = 'CitingYear'
    elif citation_direction == 'citations':
        pub2ref_rename_dict = {'CitedPublicationId': 'SourceId', 'CitingPublicationId': 'TargetId'}
        year_col = 'CitedYear'
    else:
        raise pySciSciMetricError("citation_direction must be either 'references' or 'citations'.")

    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append(year_col)
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna().copy(deep=True)

    check4columns(pub2field_df, ['PublicationId', 'FieldId'])
    pub2field_df = pub2field_df.copy(deep=True)

    # to leverage matrix operations we need to map fields to the rows/cols of the matrix
    field2int = {fid: i for i, fid in enumerate(np.sort(pub2field_df['FieldId'].unique()))}
    int2field = {i: fid for fid, i in field2int.items()}
    pub2field_df['FieldId'] = [field2int[fid] for fid in pub2field_df['FieldId'].values]
    Nfields = len(field2int)

    pub2ref_df.rename(columns=pub2ref_rename_dict, inplace=True)

    # the assignment of a publication to a field is 1/(number of fields) when normalized, and 1 otherwise
    if pub2field_norm:
        pub2nfields = pub2field_df.groupby('PublicationId')['FieldId'].nunique()
    else:
        pub2nfields = defaultdict(lambda: 1)
    pub2field_df['PubFieldContribution'] = [1.0 / pub2nfields[pid] for pid in pub2field_df['PublicationId'].values]

    distance_df = []

    # differentiate between the temporal and the static distances
    if temporal:

        for y, ydf in pub2ref_df.groupby(year_col):
            # the field representation vectors for this year only
            yfield2field_mat = _field2field_citation_matrix(ydf, pub2field_df, Nfields)

            # now compute the distance matrix for this year only
            distance_matrix = pairwise_distances(yfield2field_mat, metric=field_distance_metric)

            # the distances are symmetric, so only the strict upper triangle is stored
            nnzrow, nnzcol = np.nonzero(np.triu(distance_matrix, k=1))
            for isource, itarget in zip(nnzrow, nnzcol):
                distance_df.append([int2field[isource], int2field[itarget], y, distance_matrix[isource, itarget]])

        distance_df = pd.DataFrame(distance_df, columns=['iFieldId', 'jFieldId', year_col, 'FieldDistance'])

    else:

        # Accumulate the field representation vectors chunk by chunk to bound the size of the
        # merged tables.  Chunks are positional (iloc) and advance with the loop; slicing the
        # same label range on every pass would count the first chunk repeatedly and skip the rest.
        chunk_size = 10**6
        field2field_mat = spsparse.coo_matrix((Nfields, Nfields))
        for start in range(0, pub2ref_df.shape[0], chunk_size):
            tabdf = pub2ref_df.iloc[start:start + chunk_size]
            field2field_mat = field2field_mat + _field2field_citation_matrix(tabdf, pub2field_df, Nfields)

        # now compute the distance matrix
        distance_matrix = pairwise_distances(field2field_mat, metric=field_distance_metric)

        # the distances are symmetric, so only the strict upper triangle is stored
        sources, targets = np.nonzero(np.triu(distance_matrix, k=1))
        for isource, itarget in zip(sources, targets):
            distance_df.append([int2field[isource], int2field[itarget], distance_matrix[isource, itarget]])

        distance_df = pd.DataFrame(distance_df, columns=['iFieldId', 'jFieldId', 'FieldDistance'])

    return distance_df
def cocitation_network(pub2ref_df,
                       focus_pub_ids=None,
                       focus_constraint='citing',
                       temporal=False,
                       show_progress=False):
    """
    Create the co-citation network.

    A bipartite (cited publication) x (citing publication) matrix is built
    from the citation links and projected onto its rows, so the nodes of the
    returned network are the cited publications.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation links between publications.  It must
        contain the columns 'CitingPublicationId' and 'CitedPublicationId',
        and additionally 'CitingYear' when `temporal` is True.  Rows with a
        missing value in any of these columns are dropped, and duplicated
        (citing, cited) pairs are counted once.

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to seed the cocitation-network.
        If None, all links in `pub2ref_df` are used.

    focus_constraint : str, default 'citing'
        Only used when `focus_pub_ids` is not None:
            - 'citing' : `focus_pub_ids` defines the citation set, giving only
              the co-citations between the references of the publications
              from this set.
            - 'cited' : `focus_pub_ids` defines the cocitation node set.
            - 'egocited' : `focus_pub_ids` defines a seed set; every link of
              every publication that cites at least one seed publication is
              kept.
        Any other value leaves the links unfiltered.

    temporal : bool, default False
        If True, compute one adjacency matrix per citing year.

    show_progress : bool, default False
        Accepted for interface compatibility; this function does not
        currently display a progress bar.

    Returns
    -------
    adjacency : sparse matrix or dict of sparse matrices
        If temporal == False: the adjacency matrix of the co-citation network.
        If temporal == True: a dictionary with a key for each citing year,
        whose value is the adjacency matrix of the co-citation network induced
        by the citing publications of that year.
        When no citation links remain after filtering, an empty (0, 0)
        matrix (or an empty dict when `temporal` is True) is returned.

    cited2int : dict
        A mapping of cited PublicationIds to the row/column of the adjacency
        matrix.  Empty when no citation links remain.
    """
    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append('CitingYear')
    check4columns(pub2ref_df, required_columns)

    pub2ref_df = pub2ref_df[required_columns].dropna()

    if focus_pub_ids is not None:
        # isin_sorted is handed a sorted id array
        focus_pub_ids = np.sort(focus_pub_ids)

        # identify the subset of the links we need to form the network
        if focus_constraint == 'citing':
            # keep only the links whose citing publication is in the focus set
            keep = isin_sorted(pub2ref_df['CitingPublicationId'].values,
                               focus_pub_ids)
            pub2ref_df = pub2ref_df.loc[keep]

        elif focus_constraint == 'cited':
            # keep only the links whose cited publication is in the focus set
            keep = isin_sorted(pub2ref_df['CitedPublicationId'].values,
                               focus_pub_ids)
            pub2ref_df = pub2ref_df.loc[keep]

        elif focus_constraint == 'egocited':
            # first find every publication citing a member of the focus set
            cites_focus = isin_sorted(pub2ref_df['CitedPublicationId'].values,
                                      focus_pub_ids)
            focus_citing_pubs = np.sort(
                pub2ref_df.loc[cites_focus, 'CitingPublicationId'].unique())

            # then keep all of the links made by those citing publications
            keep = isin_sorted(pub2ref_df['CitingPublicationId'].values,
                               focus_citing_pubs)
            pub2ref_df = pub2ref_df.loc[keep]

    # De-duplicate into a fresh, independent frame: mutating the result of a
    # .loc selection in place triggers pandas' SettingWithCopyWarning.
    pub2ref_df = pub2ref_df.drop_duplicates(
        subset=['CitingPublicationId', 'CitedPublicationId']).copy()

    if pub2ref_df.shape[0] == 0:
        # coo_matrix needs an explicit shape; keep the return type consistent
        # with the `temporal` flag.
        empty_adj = {} if temporal else spsparse.coo_matrix((0, 0))
        return empty_adj, {}

    # map cited publications to the rows of the bipartite adj mat
    cited2int = {
        pid: i
        for i, pid in enumerate(
            np.sort(pub2ref_df['CitedPublicationId'].unique()))
    }
    n_cited = len(cited2int)
    pub2ref_df['CitedPublicationId'] = (
        pub2ref_df['CitedPublicationId'].map(cited2int))

    # map citing publications to the columns of the bipartite adj mat
    citing2int = {
        pid: i
        for i, pid in enumerate(
            np.sort(pub2ref_df['CitingPublicationId'].unique()))
    }
    n_citing = len(citing2int)
    pub2ref_df['CitingPublicationId'] = (
        pub2ref_df['CitingPublicationId'].map(citing2int))

    if temporal:
        temporal_adj = {}
        for y in np.sort(pub2ref_df['CitingYear'].unique()):
            # every yearly matrix shares the full (n_cited, n_citing) shape so
            # that cited2int indexes all of them
            bipartite_adj = dataframe2bipartite(
                pub2ref_df.loc[pub2ref_df['CitingYear'] == y],
                'CitedPublicationId', 'CitingPublicationId',
                (n_cited, n_citing))

            adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

            # remove diagonal entries
            adj_mat.setdiag(0)
            adj_mat.eliminate_zeros()

            temporal_adj[y] = adj_mat

        return temporal_adj, cited2int

    bipartite_adj = dataframe2bipartite(pub2ref_df, 'CitedPublicationId',
                                        'CitingPublicationId',
                                        (n_cited, n_citing))

    adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

    # remove diagonal entries
    adj_mat.setdiag(0)
    adj_mat.eliminate_zeros()

    return adj_mat, cited2int
def coauthorship_network(paa_df,
                         focus_author_ids=None,
                         focus_constraint='authors',
                         temporal=False,
                         show_progress=False):
    """
    Create the co-authorship network.

    A bipartite (author) x (publication) matrix is built from the authorship
    links and projected onto its rows, so the nodes of the returned network
    are the authors.

    Parameters
    ----------
    paa_df : DataFrame
        A DataFrame with the links between authors and publications.  It must
        contain the columns 'AuthorId' and 'PublicationId', and additionally
        'Year' when `temporal` is True.  Rows with a missing value in any of
        these columns are dropped, and duplicated (author, publication) pairs
        are counted once.

    focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the coauthorship-network.
        If None, all links in `paa_df` are used.

    focus_constraint : str, default 'authors'
        Only used when `focus_author_ids` is not None:
            - 'authors' : `focus_author_ids` defines the node set, giving only
              the co-authorships between authors in the set.
            - 'publications' : the publication history of `focus_author_ids`
              defines the edge set, giving the co-authorships where at least
              one author from `focus_author_ids` was involved.
            - 'ego' : `focus_author_ids` defines a seed set, such that all
              authors must have co-authored at least one publication with an
              author from `focus_author_ids`, but co-authorships are also
              found between the second-order author sets.
        Any other value leaves the links unfiltered.

    temporal : bool, default False
        If True, compute one adjacency matrix per publication year.

    show_progress : bool, default False
        Accepted for interface compatibility; this function does not
        currently display a progress bar.

    Returns
    -------
    adjacency : sparse matrix or dict of sparse matrices
        If temporal == False: the adjacency matrix of the co-authorship
        network.
        If temporal == True: a dictionary with a key for each year, whose
        value is the adjacency matrix of the co-authorship network induced by
        the publications of that year.
        When no authorship links remain after filtering, an empty (0, 0)
        matrix (or an empty dict when `temporal` is True) is returned.

    author2int : dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.
        Empty when no authorship links remain.
    """
    required_columns = ['AuthorId', 'PublicationId']
    if temporal:
        required_columns.append('Year')
    check4columns(paa_df, required_columns)

    paa_df = paa_df[required_columns].dropna()

    if focus_author_ids is not None:
        # isin_sorted is handed a sorted id array
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the links we need to form the network
        if focus_constraint == 'authors':
            # keep only the links whose author is in the focus set
            keep = isin_sorted(paa_df['AuthorId'].values, focus_author_ids)
            paa_df = paa_df.loc[keep]

        elif focus_constraint == 'publications':
            # all publications authored by a member of the focus set
            by_focus = isin_sorted(paa_df['AuthorId'].values,
                                   focus_author_ids)
            focus_pubs = np.sort(
                paa_df.loc[by_focus, 'PublicationId'].unique())

            # keep every authorship link of those publications
            keep = isin_sorted(paa_df['PublicationId'].values, focus_pubs)
            paa_df = paa_df.loc[keep]

        elif focus_constraint == 'ego':
            # all publications authored by a member of the focus set
            by_focus = isin_sorted(paa_df['AuthorId'].values,
                                   focus_author_ids)
            focus_pubs = np.sort(
                paa_df.loc[by_focus, 'PublicationId'].unique())

            # all authors contributing to that subset of publications
            on_focus_pubs = isin_sorted(paa_df['PublicationId'].values,
                                        focus_pubs)
            ego_author_ids = np.sort(
                paa_df.loc[on_focus_pubs, 'AuthorId'].unique())

            # keep every authorship link of the authors in the ego subset
            keep = isin_sorted(paa_df['AuthorId'].values, ego_author_ids)
            paa_df = paa_df.loc[keep]

    # De-duplicate into a fresh, independent frame: mutating the result of a
    # .loc selection in place triggers pandas' SettingWithCopyWarning.
    paa_df = paa_df.drop_duplicates(
        subset=['AuthorId', 'PublicationId']).copy()

    if paa_df.shape[0] == 0:
        # nothing to link: skip the helpers rather than hand them a 0x0 shape
        empty_adj = {} if temporal else spsparse.coo_matrix((0, 0))
        return empty_adj, {}

    # map authors to the rows of the bipartite adj mat
    author2int = {
        aid: i
        for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))
    }
    n_authors = len(author2int)
    paa_df['AuthorId'] = paa_df['AuthorId'].map(author2int)

    # map publications to the columns of the bipartite adj mat
    pub2int = {
        pid: i
        for i, pid in enumerate(np.sort(paa_df['PublicationId'].unique()))
    }
    n_pubs = len(pub2int)
    paa_df['PublicationId'] = paa_df['PublicationId'].map(pub2int)

    if temporal:
        temporal_adj = {}
        for y in np.sort(paa_df['Year'].unique()):
            # every yearly matrix shares the full (n_authors, n_pubs) shape so
            # that author2int indexes all of them
            bipartite_adj = dataframe2bipartite(
                paa_df.loc[paa_df['Year'] == y], 'AuthorId', 'PublicationId',
                (n_authors, n_pubs))

            adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

            # remove diagonal entries
            adj_mat.setdiag(0)
            adj_mat.eliminate_zeros()

            temporal_adj[y] = adj_mat

        return temporal_adj, author2int

    bipartite_adj = dataframe2bipartite(paa_df, 'AuthorId', 'PublicationId',
                                        (n_authors, n_pubs))

    adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

    # remove diagonal entries
    adj_mat.setdiag(0)
    adj_mat.eliminate_zeros()

    return adj_mat, author2int
def cociting_network(pub2ref_df,
                     focus_pub_ids=None,
                     focus_constraint='citing',
                     temporal=False,
                     show_progress=False):
    """
    Create the co-citing network.

    Each node is a publication, two publications are linked if they cite the
    same article.  A bipartite (cited publication) x (citing publication)
    matrix is built from the citation links and projected onto its columns,
    so the nodes of the returned network are the citing publications.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with the citation links between publications.  It must
        contain the columns 'CitingPublicationId' and 'CitedPublicationId'.
        Rows with a missing value in either column are dropped, and
        duplicated (citing, cited) pairs are counted once.

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to seed the cociting-network.
        If None, all links in `pub2ref_df` are used.

    focus_constraint : str, default 'citing'
        Only used when `focus_pub_ids` is not None:
            - 'citing' : keep only the links whose citing publication is in
              `focus_pub_ids`.
            - 'cited' : keep only the links whose cited publication is in
              `focus_pub_ids`.
        Any other value leaves the links unfiltered.

    temporal : bool, default False
        Accepted for signature parity with the other network builders; this
        function does not read it.

    show_progress : bool, default False
        Accepted for interface compatibility; this function does not
        currently display a progress bar.

    Returns
    -------
    adjacency : sparse matrix
        The adjacency matrix for the co-citing network.  When no citation
        links remain after filtering, an empty (0, 0) matrix is returned.

    citing2int : dict
        A mapping of citing PublicationIds to the row/column of the adjacency
        matrix.  Empty when no citation links remain.
    """
    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    check4columns(pub2ref_df, required_columns)

    pub2ref_df = pub2ref_df[required_columns].dropna()

    if focus_pub_ids is not None:
        # isin_sorted is handed a sorted id array
        focus_pub_ids = np.sort(focus_pub_ids)

        # identify the subset of the links we need to form the network
        if focus_constraint == 'citing':
            # keep only the links whose citing publication is in the focus set
            keep = isin_sorted(pub2ref_df['CitingPublicationId'].values,
                               focus_pub_ids)
            pub2ref_df = pub2ref_df.loc[keep]

        elif focus_constraint == 'cited':
            # keep only the links whose cited publication is in the focus set
            keep = isin_sorted(pub2ref_df['CitedPublicationId'].values,
                               focus_pub_ids)
            pub2ref_df = pub2ref_df.loc[keep]

    # De-duplicate into a fresh, independent frame: mutating the result of a
    # .loc selection in place triggers pandas' SettingWithCopyWarning.
    pub2ref_df = pub2ref_df.drop_duplicates(
        subset=['CitingPublicationId', 'CitedPublicationId']).copy()

    if pub2ref_df.shape[0] == 0:
        # coo_matrix needs an explicit shape argument
        return spsparse.coo_matrix((0, 0)), {}

    # map cited publications to the rows of the bipartite adj mat
    cited2int = {
        pid: i
        for i, pid in enumerate(
            np.sort(pub2ref_df['CitedPublicationId'].unique()))
    }
    n_cited = len(cited2int)
    pub2ref_df['CitedPublicationId'] = (
        pub2ref_df['CitedPublicationId'].map(cited2int))

    # map citing publications to the columns of the bipartite adj mat
    citing2int = {
        pid: i
        for i, pid in enumerate(
            np.sort(pub2ref_df['CitingPublicationId'].unique()))
    }
    n_citing = len(citing2int)
    pub2ref_df['CitingPublicationId'] = (
        pub2ref_df['CitingPublicationId'].map(citing2int))

    bipartite_adj = dataframe2bipartite(pub2ref_df, 'CitedPublicationId',
                                        'CitingPublicationId',
                                        (n_cited, n_citing))

    adj_mat = project_bipartite_mat(bipartite_adj, project_to='col')

    # remove diagonal entries
    adj_mat.setdiag(0)
    adj_mat.eliminate_zeros()

    # The projection is onto the columns, i.e. the citing publications, so the
    # matching index is citing2int (not cited2int).
    return adj_mat, citing2int