Code example #1
def coauthorship_network(paa_df,
                         focus_author_ids=None,
                         focus_constraint='authors',
                         show_progress=False):
    """
    Create the co-authorship network.

    Parameters
    ----------
    :param paa_df : DataFrame
        A DataFrame with the links between authors and publications.

    :param focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the coauthorship-network.

    :param focus_constraint : str, default `authors`
        If focus_author_ids is not None:
            `authors` : the `focus_author_ids' defines the node set, giving only the co-authorships between authors in the set.
            `publications` : the publication history of `focus_author_ids' defines the edge set, giving the co-authorships where at least 
                                one author from `focus_author_ids' was involved.
            'ego' : the `focus_author_ids' defines a seed set, such that all authors must have co-authored at least one publication with 
                                an author from `focus_author_ids', but co-authorships are also found between the second-order author sets. 

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    coo_matrix
        The adjacency matrix for the co-authorship network

    author2int, dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.

    """
    required_columns = ['AuthorId', 'PublicationId']
    check4columns(paa_df, required_columns)
    paa_df = paa_df[required_columns].dropna()

    if focus_author_ids is not None:
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'authors':
            # take only the publication-author links that have an author from the `focus_author_ids'
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

        elif focus_constraint == 'publications':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take only the subset of publication-author links induced by these publications
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values,
                                            focus_pubs)]
            del focus_pubs

        elif focus_constraint == 'ego':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take all authors who contribute to this subset of publications
            focus_author_ids = np.sort(paa_df.loc[isin_sorted(
                paa_df['PublicationId'].values,
                focus_pubs)]['AuthorId'].unique())
            del focus_pubs
            # finally take the publication-author links that have an author from the above ego subset
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

    #  map authors to the row/column of the adj mat
    author2int = {
        aid: i
        for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))
    }
    Nauthors = paa_df['AuthorId'].nunique()

    adj_mat = sparse.dok_matrix((Nauthors, Nauthors), dtype=int)

    def coauthor_cluster(author_list):
        if author_list.shape[0] >= 2:
            for ia, ja in combinations(author_list, 2):
                adj_mat[author2int[ia], author2int[ja]] += 1

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='CoAuthorship Relations',
                leave=True,
                disable=not show_progress)

    # go through all publications and apply the coauthorship edge generator
    paa_df.groupby('PublicationId')['AuthorId'].progress_apply(
        coauthor_cluster)

    adj_mat = adj_mat + adj_mat.transpose()

    return adj_mat, author2int
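
A minimal usage sketch (not part of the original listing): a hand-built toy DataFrame is passed to coauthorship_network, assuming the helpers it relies on (check4columns, isin_sorted, numpy, scipy.sparse, tqdm) are importable, e.g. from pyscisci.

import pandas as pd

# toy publication-author links: publication 10 has authors 1 and 2,
# publication 11 has authors 1, 2 and 3
paa_df = pd.DataFrame({'PublicationId': [10, 10, 11, 11, 11],
                       'AuthorId':      [1,  2,  1,  2,  3]})

adj_mat, author2int = coauthorship_network(paa_df)

# adj_mat is symmetric; entry (i, j) counts the publications co-authored by the
# authors mapped to indices i and j in author2int
print(author2int)
print(adj_mat.todense())
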
Code example #2
def raostriling_interdisciplinarity(pub2ref_df,
                                    pub2field_df,
                                    focus_pub_ids=None,
                                    pub2field_norm=True,
                                    temporal=False,
                                    citation_direction='references',
                                    field_distance_metric='cosine',
                                    distance_matrix=None,
                                    show_progress=False):
    """
    Calculate the RaoStirling index as a measure of a publication's interdisciplinarity.
    See :cite:`stirling20` for the definition and :cite:`gates2019naturereach` for an application.

    Parameters
    ----------
    :param pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    :param pub2field_df : DataFrame
        A DataFrame with the field information for each Publication.

    :param focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to calculate interdisciplinarity.

    :param pub2field_norm : bool, default True
        When a publication occurs in m > 1 fields, count the publication 1/m times in each field.  Normalizes the membership
        vector so it sums to 1 for each publication.

    :param temporal : bool, default False
        If True, compute the distance matrix using only publications for each year.

    :param citation_direction : str, default `references`
        `references` : the fields are defined by a publication's references.
        `citations` : the fields are defined by a publication's citations.

    :param field_distance_metric : str, default `cosine`
        The interfield distance metric.  Valid entries come from sklearn.metrics.pairwise_distances:
        'cosine', 'euclidean', 'l1', 'l2', etc.

    :param distance_matrix : numpy array, default None
        The precomputed field distance matrix.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'PublicationId', 'RaoStirling'

    """

    # map citing and cited publications to source and target depending on the direction specified by `citation_direction'
    if citation_direction == 'references':
        pub2ref_rename_dict = {
            'CitedPublicationId': 'TargetId',
            'CitingPublicationId': 'SourceId'
        }
        year_col = 'CitingYear'
    elif citation_direction == 'citations':
        pub2ref_rename_dict = {
            'CitedPublicationId': 'SourceId',
            'CitingPublicationId': 'TargetId'
        }
        year_col = 'CitedYear'

    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append(year_col)
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna().copy(deep=True)

    check4columns(pub2field_df, ['PublicationId', 'FieldId'])
    pub2field_df = pub2field_df.copy(deep=True)

    # if no precomputed distance matrix was passed, compute the field citation distance matrix
    if distance_matrix is None:
        distance_matrix = field_citation_distance(pub2ref_df, pub2field_df,
                                                  pub2field_norm, temporal,
                                                  citation_direction,
                                                  field_distance_metric,
                                                  show_progress)

    field2int = {
        fid: i
        for i, fid in enumerate(np.sort(pub2field_df['FieldId'].unique()))
    }
    pub2field_df['FieldId'] = [
        field2int[fid] for fid in pub2field_df['FieldId'].values
    ]
    Nfields = len(field2int)

    pub2ref_df.rename(columns=pub2ref_rename_dict, inplace=True)

    if focus_pub_ids is not None:
        pub2ref_df = pub2ref_df.loc[isin_sorted(pub2ref_df['SourceId'].values,
                                                focus_pub_ids)]

    if temporal:
        years = np.sort(pub2ref_df[year_col].unique())
        year2int = {y: i for i, y in enumerate(years)}
        Nyears = years.shape[0]

    if isinstance(distance_matrix, pd.DataFrame) and temporal:
        check4columns(distance_matrix,
                      ['iFieldId', 'jFieldId', year_col, 'FieldDistance'])

        distance_matrix = distance_matrix.loc[isin_sorted(
            distance_matrix[year_col].values, years)].copy(deep=True)

        distance_matrix['iFieldId'] = [
            field2int.get(fid, None)
            for fid in distance_matrix['iFieldId'].values
        ]
        distance_matrix['jFieldId'] = [
            field2int.get(fid, None)
            for fid in distance_matrix['jFieldId'].values
        ]
        distance_matrix.dropna(inplace=True)

        tdm = np.zeros((Nyears, Nfields, Nfields))
        for y in years:
            tdm[year2int[y]] = dataframe2bipartite(
                df=distance_matrix[distance_matrix[year_col] == y],
                rowname='iFieldId',
                colname='jFieldId',
                shape=(Nfields, Nfields),
                weightname='FieldDistance').todense()

            tdm[year2int[y]] = tdm[year2int[y]] + tdm[year2int[y]].T

        distance_matrix = tdm

    elif isinstance(distance_matrix, pd.DataFrame) and not temporal:
        check4columns(distance_matrix,
                      ['iFieldId', 'jFieldId', 'FieldDistance'])
        distance_matrix = distance_matrix.copy(deep=True)
        distance_matrix['iFieldId'] = [
            field2int.get(fid, None)
            for fid in distance_matrix['iFieldId'].values
        ]
        distance_matrix['jFieldId'] = [
            field2int.get(fid, None)
            for fid in distance_matrix['jFieldId'].values
        ]
        distance_matrix.dropna(inplace=True)
        distance_matrix = dataframe2bipartite(
            df=distance_matrix,
            rowname='iFieldId',
            colname='jFieldId',
            shape=(Nfields, Nfields),
            weightname='FieldDistance').todense()

        distance_matrix = distance_matrix + distance_matrix.T

    elif isinstance(distance_matrix, np.ndarray):  # covers np.matrix, an ndarray subclass
        if not temporal and distance_matrix.shape != (Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed_distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications passed.'
            )
        elif temporal and distance_matrix.shape != (Nyears, Nfields, Nfields):
            raise pySciSciMetricError(
                'The precomputed_distance_matrix is of the wrong size to compute the RaoStirling interdisciplinarity for the publications and years passed.'
            )

    # the assignment of a publication to a field is 1/(number of fields) when normalized, and 1 otherwise
    if pub2field_norm:
        pub2nfields = pub2field_df.groupby(
            'PublicationId')['FieldId'].nunique()
    else:
        pub2nfields = defaultdict(lambda: 1)
    pub2field_df['PubFieldContribution'] = [
        1.0 / pub2nfields[pid] for pid in pub2field_df['PublicationId'].values
    ]

    # merge the references to the fields for the target fields
    pub2ref_df = pub2ref_df.merge(
        pub2field_df, how='left', left_on='TargetId',
        right_on='PublicationId').rename(
            columns={
                'FieldId': 'TargetFieldId',
                'PubFieldContribution': 'TargetPubFieldContribution'
            })
    del pub2ref_df['PublicationId']

    pub2ref_df.dropna(inplace=True)

    # Now we start on the RaoStirling calculation
    if temporal:

        rsdf = []
        for y, ydf in pub2ref_df.groupby(year_col):

            # for each year, we need to map individual publications to the rows of our matrix
            ypub2int = {
                pid: i
                for i, pid in enumerate(np.sort(ydf['SourceId'].unique()))
            }
            yint2pub = {i: pid for pid, i in ypub2int.items()}
            ydf['SourceId'] = [ypub2int[pid] for pid in ydf['SourceId'].values]
            yNpubs = len(ypub2int)

            # calculate the publication representation vectors over fields
            ypub2field_mat = dataframe2bipartite(
                df=ydf,
                rowname='SourceId',
                colname='TargetFieldId',
                shape=(yNpubs, Nfields),
                weightname='TargetPubFieldContribution').tocsr()

            # make sure the publication-to-field vector is normalized
            ypub2field_mat = normalize(ypub2field_mat, norm='l1', axis=1)

            # finally, we calculate the matrix representation of the RS measure
            yrsdf = pd.DataFrame()
            yrsdf['PublicationId'] = [
                yint2pub[i] for i in np.sort(ydf['SourceId'].unique())
            ]
            yrsdf['CitingYear'] = y
            yrsdf['RaoStirling'] = 0.5 * np.squeeze(
                np.asarray(
                    ypub2field_mat.dot(
                        spsparse.csr_matrix(distance_matrix[year2int[y]])).
                    multiply(ypub2field_mat).sum(axis=1)))

            rsdf.append(yrsdf)

        rsdf = pd.concat(rsdf)

        return rsdf

    else:

        # first map individual publications to the rows of our matrix
        pub2int = {
            pid: i
            for i, pid in enumerate(np.sort(pub2ref_df['SourceId'].unique()))
        }
        int2pub = {i: pid for pid, i in pub2int.items()}
        pub2ref_df['SourceId'] = [
            pub2int[pid] for pid in pub2ref_df['SourceId'].values
        ]
        pub2ref_df[['SourceId', 'TargetFieldId'
                    ]] = pub2ref_df[['SourceId', 'TargetFieldId']].astype(int)
        Npubs = len(pub2int)

        # calculate the publication representation vectors over fields
        pub2field_mat = dataframe2bipartite(
            df=pub2ref_df,
            rowname='SourceId',
            colname='TargetFieldId',
            shape=(Npubs, Nfields),
            weightname='TargetPubFieldContribution').tocsr()

        # make sure the publication-to-field vector is normalized
        pub2field_mat = normalize(pub2field_mat, norm='l1', axis=1)

        distance_matrix = spsparse.csr_matrix(distance_matrix)

        # finally, we calculate the matrix representation of the RS measure
        rsdf = pd.DataFrame()
        rsdf['RaoStirling'] = 0.5 * np.squeeze(
            np.asarray(
                spsparse.csr_matrix.multiply(
                    pub2field_mat.dot(distance_matrix),
                    pub2field_mat).sum(axis=1)))
        rsdf['PublicationId'] = [
            int2pub[i] for i in np.sort(pub2ref_df['SourceId'].unique())
        ]

        return rsdf
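
A minimal usage sketch (not part of the original listing): toy ids and a hand-built 2x2 field distance matrix passed in via distance_matrix, so no field_citation_distance computation is needed; the pyscisci helpers used above are assumed importable.

import numpy as np
import pandas as pd

# toy citations: publication 1 references publications 2 and 3
pub2ref_df = pd.DataFrame({'CitingPublicationId': [1, 1],
                           'CitedPublicationId':  [2, 3]})

# toy field assignments for the two referenced publications
pub2field_df = pd.DataFrame({'PublicationId': [2, 3],
                             'FieldId':       [100, 200]})

# precomputed inter-field distances, ordered by sorted FieldId (100, 200)
distance_matrix = np.array([[0.0, 0.8],
                            [0.8, 0.0]])

rsdf = raostriling_interdisciplinarity(pub2ref_df, pub2field_df,
                                       distance_matrix=distance_matrix)
print(rsdf)  # one row per citing publication with its RaoStirling score
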
Code example #3
def credit_share(focus_pid, pub2ref_df, pub2author_df, temporal=False, normed=False, show_progress=False):
    """
    Calculate the credit share for each author of a publication based on :cite:`Shen2014credit`.

    Parameters
    ----------
    :param focus_pid : int, str
        The focus publication id.

    :param pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    :param pub2author_df : DataFrame
        A DataFrame with the author information for each Publication.

    :param temporal : bool, default False
        If True, compute the credit share cumulatively for each citing year.

    :param normed : bool, default False
        Normalize the sum of credit share to 1.0

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    credit_share, numpy array
        If temporal == False:
            The credit share for each author of the focus publication.

        If temporal == True:
            A matrix of credit shares with one row per author of the focus publication and one column per citing
            year (credit accumulates over the years); the sorted array of years is also returned.

    author2int, dict
        A mapping of the AuthorIds from the focus publication to the rows/entries of the credit share vector or matrix (see above).

    """

    # the focus publication's authors
    focus_authors = np.sort(pub2author_df.loc[pub2author_df['PublicationId']==focus_pid]['AuthorId'].unique())
    author2int = {aid:i for i, aid in enumerate(focus_authors)}

    if focus_authors.shape[0] > 1:
        # start by getting the co-citation network around the focus publication
        adj_mat, cited2int = cocitation_network(pub2ref_df, focus_pub_ids=np.sort([focus_pid]), focus_constraint='egocited',
                temporal=temporal, show_progress=show_progress)

        # get the authorships for the publications in the cocitation network
        cocited_pubs = np.sort(list(cited2int.keys()))
        pa_df = pub2author_df.loc[isin_sorted(pub2author_df['PublicationId'].values, cocited_pubs)]

        if cocited_pubs.shape[0] > 0:
            # the credit allocation matrix has a row for each focus author, and a column for each cocited publication (including the focus pub)
            credit_allocation_mat = np.zeros((focus_authors.shape[0], cocited_pubs.shape[0]), dtype = float)

            # for each cocited publication, we count the number of authors
            # and assign to each focus author, their fractional share of the credit (1 divided by the number of authors)
            for cocitedid, adf in pa_df.groupby('PublicationId'):
                author2row = [author2int[aid] for aid in adf['AuthorId'].unique() if author2int.get(aid, None) is not None]
                if len(author2row) > 0:
                    credit_allocation_mat[author2row, cited2int[cocitedid]] = 1.0/adf['AuthorId'].nunique()

            if temporal:
                # temporal credit allocation - broken down by year

                # we need the temporal citations to the focus article
                focus_citations = groupby_count(pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values, np.sort([focus_pid]))],
                    colgroupby='CitingYear', colcountby='CitingPublicationId', count_unique=True, show_progress=False)
                focus_citations={y:c for y,c in focus_citations[['CitingYear', 'CitingPublicationIdCount']].values}

                # when temporal is True, a temporal adj mat is returned where each key is the year
                years = np.sort(list(adj_mat.keys()))

                cocite_counts = np.zeros((years.shape[0], cocited_pubs.shape[0]), dtype=float)

                for iy, y in enumerate(years):
                    # off-diagonal entries: the total co-citations from that year
                    cocite_counts[iy] = adj_mat[y].tocsr()[cited2int[focus_pid]].todense()
                    # diagonal entry: the total citations to the focus publication from that year
                    cocite_counts[iy, cited2int[focus_pid]] = focus_citations[y]

                cocite_counts = cocite_counts.cumsum(axis=0)

            else:
                # just do credit allocation with the full cocitation matrix
                cocite_counts = adj_mat.tocsr()[cited2int[focus_pid]].todense()

                # the co-citation matrix misses the number of citations to the focus publication
                # so explicitly calculate the number of citations to the focus publication
                cocite_counts[0,cited2int[focus_pid]] = pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values, np.sort([focus_pid]))]['CitingPublicationId'].nunique()

            # credit share is the matrix product of the credit_allocation_mat with cocite_counts
            credit_share = np.squeeze(np.asarray(credit_allocation_mat.dot(cocite_counts.T)))

            # normalize the credit share vector to sum to 1
            if normed:
                credit_share = credit_share/credit_share.sum(axis=0)

            if temporal:
                return credit_share, author2int, years
            else:
                return credit_share, author2int
        else:
            if temporal:
                years = np.sort(pub2ref_df.loc[pub2ref_df['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
                return np.array([[None for y in years] for a in author2int]), author2int, years
            else:
                return np.array([None for a in author2int]), author2int

    elif focus_authors.shape[0] == 1:
        if temporal:
            years = np.sort(pub2ref_df.loc[pub2ref_df['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
            return np.ones(shape=(1,years.shape[0])), author2int, years
        else:
            return np.array([1.0]), author2int
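
A minimal usage sketch (not part of the original listing): toy ids, assuming cocitation_network, groupby_count and isin_sorted are importable, e.g. from pyscisci.

import pandas as pd

# focus publication 1 was written by authors 'A' and 'B'; publications 10 and 11
# each cite publication 1 together with publication 2
pub2ref_df = pd.DataFrame({'CitingPublicationId': [10, 10, 11, 11],
                           'CitedPublicationId':  [1,  2,  1,  2]})
pub2author_df = pd.DataFrame({'PublicationId': [1, 1, 2],
                              'AuthorId':      ['A', 'B', 'C']})

credit, author2int = credit_share(1, pub2ref_df, pub2author_df, normed=True)
# credit[author2int['A']] and credit[author2int['B']] hold the two credit shares
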
Code example #4
File: network.py Project: shouwangbuqi/pyscisci
def cocitation_network(pub2ref_df,
                       focus_pub_ids=None,
                       focus_constraint='citing',
                       temporal=False,
                       show_progress=False):
    """
    Create the co-citation network.

    Parameters
    ----------
    :param pub2ref_df : DataFrame
        A DataFrame with the citation links between publications (citing and cited).

    :param focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to seed the cocitation-network.

    :param focus_constraint : str, default `citing`
        If focus_pub_ids is not None:
            `citing` : the `focus_pub_ids' defines the citation set, giving only the co-citations between the references
                of the publications from this set.
            `cited` : the `focus_pub_ids' defines the cocitation node set.
            'egocited' : the `focus_pub_ids' defines a seed set, such that all other publications must have been co-cited with
                at least one publication from this set.

    :param temporal : bool, default False
        If True, compute the adjacency matrix using only publications for each year.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.


    Returns
    -------
    coo_matrix or dict of coo_matrix
        If temporal == False:
            The adjacency matrix for the co-citation network

        If temporal == True:
            A dictionary with key for each year, and value of the adjacency matrix for the cocitation network induced
            by citing publications in that year.

    pub2int, dict
        A mapping of PublicationIds to the row/column of the adjacency matrix.

    """
    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    if temporal:
        required_columns.append('CitingYear')
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna()

    if focus_pub_ids is not None:
        focus_pub_ids = np.sort(focus_pub_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'citing':
            # take only the links that have a citing publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitingPublicationId'].values, focus_pub_ids)]

        elif focus_constraint == 'cited':
            # take only the links that have a cited publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitedPublicationId'].values, focus_pub_ids)]

        elif focus_constraint == 'egocited':
            # take all publications that cite one of the publications in `focus_pub_ids'
            focus_citing_pubs = np.sort(pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitedPublicationId'].values,
                focus_pub_ids)]['CitingPublicationId'].unique())
            # then take all the links that have a citing publication from the `focus_citing_pubs'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitingPublicationId'].values, focus_citing_pubs)]
            del focus_citing_pubs

    pub2ref_df.drop_duplicates(
        subset=['CitingPublicationId', 'CitedPublicationId'], inplace=True)

    if pub2ref_df.shape[0] > 0:
        #  map cited publications to the rows of the bipartite adj mat
        cited2int = {
            pid: i
            for i, pid in enumerate(
                np.sort(pub2ref_df['CitedPublicationId'].unique()))
        }
        Ncited = pub2ref_df['CitedPublicationId'].nunique()

        pub2ref_df['CitedPublicationId'] = [
            cited2int[pid] for pid in pub2ref_df['CitedPublicationId'].values
        ]

        #  map citing publications to the columns of the bipartite adj mat
        citing2int = {
            pid: i
            for i, pid in enumerate(
                np.sort(pub2ref_df['CitingPublicationId'].unique()))
        }
        Nciting = pub2ref_df['CitingPublicationId'].nunique()

        pub2ref_df['CitingPublicationId'] = [
            citing2int[pid] for pid in pub2ref_df['CitingPublicationId'].values
        ]

        if temporal:
            years = np.sort(pub2ref_df['CitingYear'].unique())

            temporal_adj = {}
            for y in years:
                bipartite_adj = dataframe2bipartite(
                    pub2ref_df.loc[pub2ref_df['CitingYear'] == y],
                    'CitedPublicationId', 'CitingPublicationId',
                    (Ncited, Nciting))

                adj_mat = project_bipartite_mat(bipartite_adj,
                                                project_to='row')

                # remove diagonal entries
                adj_mat.setdiag(0)
                adj_mat.eliminate_zeros()

                temporal_adj[y] = adj_mat

            return temporal_adj, cited2int

        else:
            bipartite_adj = dataframe2bipartite(pub2ref_df,
                                                'CitedPublicationId',
                                                'CitingPublicationId',
                                                (Ncited, Nciting))

            adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

            # remove diagonal entries
            adj_mat.setdiag(0)
            adj_mat.eliminate_zeros()

            return adj_mat, cited2int

    else:
        # no citation links remain: return an empty sparse matrix and an empty mapping
        return spsparse.coo_matrix((0, 0)), {}
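
A minimal usage sketch (not part of the original listing): a toy citation table, assuming the pyscisci helpers used above (check4columns, isin_sorted, dataframe2bipartite, project_bipartite_mat) are importable.

import pandas as pd

# publications 10 and 11 both cite publications 1 and 2
pub2ref_df = pd.DataFrame({'CitingPublicationId': [10, 10, 11, 11],
                           'CitedPublicationId':  [1,  2,  1,  2]})

adj_mat, cited2int = cocitation_network(pub2ref_df)
# the entry linking publications 1 and 2 (row/column given by cited2int) is 2,
# since they are co-cited by two citing publications
print(adj_mat.todense())
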
Code example #5
File: network.py Project: shouwangbuqi/pyscisci
def coauthorship_network(paa_df,
                         focus_author_ids=None,
                         focus_constraint='authors',
                         temporal=False,
                         show_progress=False):
    """
    Create the co-authorship network.

    Parameters
    ----------
    :param paa_df : DataFrame
        A DataFrame with the links between authors and publications.

    :param focus_author_ids : numpy array or list, default None
        A list of the AuthorIds to seed the coauthorship-network.

    :param focus_constraint : str, default `authors`
        If focus_author_ids is not None:
            `authors` : the `focus_author_ids' defines the node set, giving only the co-authorships between authors in the set.
            `publications` : the publication history of `focus_author_ids' defines the edge set, giving the co-authorships where at least
                                one author from `focus_author_ids' was involved.
            'ego' : the `focus_author_ids' defines a seed set, such that all authors must have co-authored at least one publication with
                                an author from `focus_author_ids', but co-authorships are also found between the second-order author sets.

    :param temporal : bool, default False
        If True, compute the adjacency matrix using only publications for each year.

    :param show_progress : bool, default False
        If True, show a progress bar tracking the calculation.


    Returns
    -------
    coo_matrix or dict of coo_matrix
        If temporal == False:
            The adjacency matrix for the co-authorship network

        If temporal == True:
            A dictionary with key for each year, and value of the adjacency matrix for the co-authorship network induced by publications in that year.

    author2int, dict
        A mapping of AuthorIds to the row/column of the adjacency matrix.

    """
    required_columns = ['AuthorId', 'PublicationId']
    if temporal:
        required_columns.append('Year')
    check4columns(paa_df, required_columns)
    paa_df = paa_df[required_columns].dropna()

    if focus_author_ids is not None:
        focus_author_ids = np.sort(focus_author_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'authors':
            # take only the publication-author links that have an author from the `focus_author_ids'
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

        elif focus_constraint == 'publications':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take only the subset of publication-author links induced by these publications
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values,
                                            focus_pubs)]
            del focus_pubs

        elif focus_constraint == 'ego':
            # take all publications authored by an author from the `focus_author_ids'
            focus_pubs = np.sort(paa_df.loc[isin_sorted(
                paa_df['AuthorId'].values,
                focus_author_ids)]['PublicationId'].unique())
            # then take all authors who contribute to this subset of publications
            focus_author_ids = np.sort(paa_df.loc[isin_sorted(
                paa_df['PublicationId'].values,
                focus_pubs)]['AuthorId'].unique())
            del focus_pubs
            # finally take the publication-author links that have an author from the above ego subset
            paa_df = paa_df.loc[isin_sorted(paa_df['AuthorId'].values,
                                            focus_author_ids)]

    paa_df.drop_duplicates(subset=['AuthorId', 'PublicationId'], inplace=True)

    #  map authors to the rows of the bipartite adj mat
    author2int = {
        aid: i
        for i, aid in enumerate(np.sort(paa_df['AuthorId'].unique()))
    }
    Nauthors = paa_df['AuthorId'].nunique()

    paa_df['AuthorId'] = [author2int[aid] for aid in paa_df['AuthorId'].values]

    #  map publications to the columns of the bipartite adj mat
    pub2int = {
        pid: i
        for i, pid in enumerate(np.sort(paa_df['PublicationId'].unique()))
    }
    Npubs = paa_df['PublicationId'].nunique()

    paa_df['PublicationId'] = [
        pub2int[pid] for pid in paa_df['PublicationId'].values
    ]

    if temporal:
        years = np.sort(paa_df['Year'].unique())

        temporal_adj = {}
        for y in years:
            bipartite_adj = dataframe2bipartite(
                paa_df.loc[paa_df['Year'] == y], 'AuthorId', 'PublicationId',
                (Nauthors, Npubs))

            adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

            # remove diagonal entries
            adj_mat.setdiag(0)
            adj_mat.eliminate_zeros()

            temporal_adj[y] = adj_mat

        return temporal_adj, author2int

    else:
        bipartite_adj = dataframe2bipartite(paa_df, 'AuthorId',
                                            'PublicationId', (Nauthors, Npubs))

        adj_mat = project_bipartite_mat(bipartite_adj, project_to='row')

        # remove diagonal entries
        adj_mat.setdiag(0)
        adj_mat.eliminate_zeros()

        return adj_mat, author2int
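
A minimal usage sketch of the temporal variant (not part of the original listing): toy ids and years; the pyscisci helpers used above are assumed importable.

import pandas as pd

paa_df = pd.DataFrame({'PublicationId': [10, 10, 11, 11, 11],
                       'AuthorId':      [1,  2,  1,  2,  3],
                       'Year':          [2019, 2019, 2020, 2020, 2020]})

temporal_adj, author2int = coauthorship_network(paa_df, temporal=True)
# temporal_adj is a dict keyed by year; each value is the co-authorship
# adjacency matrix induced by that year's publications
print(temporal_adj[2020].todense())
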
Code example #6
File: database.py Project: shouwangbuqi/pyscisci
    def filter_doctypes(self, doctypes = ['j', 'b', 'bc', 'c'], show_progress=False):

        """
        Filter all of the publication files, keeping only the publications of the specified doctypes.

        :param list doctypes: optional
            the list of doctypes

        :return None:

        """
        doctypes = np.sort(doctypes)

        if show_progress: print("Starting DocType filter. \nFiltering Publications.")


        valid_pubids = []
        pub2year = {}
        pub2doctype = {}
        Nfiles = sum('publication' in fname for fname in os.listdir(os.path.join(self.path2database, 'publication')))
        for ifile in range(Nfiles):
            pubdf = pd.read_hdf(os.path.join(self.path2database, 'publication', 'publication{}.hdf'.format(ifile)))
            pubdf = pubdf.loc[isin_sorted(pubdf['DocType'].values, doctypes)]
            pubdf.dropna(subset=['Year'], inplace=True)
            pubdf['Year'] = pubdf['Year'].astype(int)
            pubdf.to_hdf(os.path.join(self.path2database, 'publication', 'publication{}.hdf'.format(ifile)), key='pub', mode='w')
            valid_pubids.extend(pubdf['PublicationId'].values)
            for pid, y, dt in pubdf[['PublicationId', 'Year', 'DocType']].values:
                pub2year[pid] = y
                pub2doctype[pid] = dt

        with gzip.open(os.path.join(self.path2database, 'pub2year.json.gz'), 'w') as outfile:
            outfile.write(json.dumps(pub2year).encode('utf8'))

        with gzip.open(os.path.join(self.path2database, 'pub2doctype.json.gz'), 'w') as outfile:
            outfile.write(json.dumps(pub2doctype).encode('utf8'))

        del pubdf

        valid_pubids = np.sort(valid_pubids)

        if show_progress: print("Filtering References.")

        Nfiles = sum('pub2ref' in fname for fname in os.listdir(os.path.join(self.path2database, 'pub2ref')))
        for ifile in range(Nfiles):
            pub2refdf = pd.read_hdf(os.path.join(self.path2database, 'pub2ref', 'pub2ref{}.hdf'.format(ifile)))
            pub2refdf = pub2refdf.loc[isin_sorted(pub2refdf['CitedPublicationId'].values, valid_pubids)]
            pub2refdf = pub2refdf.loc[isin_sorted(pub2refdf['CitingPublicationId'].values, valid_pubids)]
            pub2refdf.to_hdf(os.path.join(self.path2database, 'pub2ref', 'pub2ref{}.hdf'.format(ifile)),
                key='pub2ref', mode='w')

        if show_progress: print("Filtering Publication and Author.")

        Nfiles = sum('publicationauthoraffiliation' in fname for fname in os.listdir(os.path.join(self.path2database, 'publicationauthoraffiliation')))
        for ifile in range(Nfiles):
            paa_df = pd.read_hdf(os.path.join(self.path2database, 'publicationauthoraffiliation', 'publicationauthoraffiliation{}.hdf'.format(ifile)))
            paa_df = paa_df.loc[isin_sorted(paa_df['PublicationId'].values, valid_pubids)]
            paa_df.to_hdf(os.path.join(self.path2database, 'publicationauthoraffiliation', 'publicationauthoraffiliation{}.hdf'.format(ifile)),
                key='paa', mode='w')

        if show_progress: print("Finished filtering DocType.")
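
A minimal usage sketch (not part of the original listing): mymag is a placeholder name for a pyscisci database interface object whose path2database points at a preprocessed directory laid out as assumed by the method above.

# keep only journal ('j') and conference ('c') publications, rewriting the hdf files in place
mymag.filter_doctypes(doctypes=['j', 'c'], show_progress=True)

# the method also writes pub2year.json.gz and pub2doctype.json.gz under path2database
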
Code example #7
def load_preprocessed_data(dataname,
                           path2database,
                           columns=None,
                           isindict=None,
                           duplicate_subset=None,
                           duplicate_keep='last',
                           dropna=None,
                           keep_source_file=False,
                           prefunc2apply=None,
                           postfunc2apply=None,
                           show_progress=False):
    """
        Load the preprocessed DataFrame from a preprocessed directory.

        Parameters
        ----------
        :param dataname : str
            The type of preprocessed data to load.

        :param path2database : str
            The path to the database directory.

        :param columns : list, default None
            Load only this subset of columns

        :param isindict : dict, default None
            Dictionary of format {"ColumnName":"ListofValues"} where "ColumnName" is a data column
            and "ListofValues" is a sorted list of valid values.  A DataFrame only containing rows that appear in
            "ListofValues" will be returned.

        :param duplicate_subset : list, default None
            Drop any duplicate entries as specified by this subset of columns

        :param duplicate_keep : str, default 'last', Optional
            If duplicates are being dropped, keep the 'first' or 'last'
            (see `pandas.DataFrame.drop_duplicates <https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.drop_duplicates.html>`_)

        :param dropna : list, default None, Optional
            Drop any NaN entries as specified by this subset of columns

        :param keep_source_file : bool, default False
            Keep track of the source file the data was loaded from.

        :param prefunc2apply : callable, default None
            A function to apply to each of the sub-DataFrames as they are loaded before filtering.

        :param postfunc2apply : callable, default None
            A function to apply to each of the sub-DataFrames as they are loaded after filtering.

        Returns
        -------
        DataFrame
            dataname DataFrame.

    """

    path2files = os.path.join(path2database, dataname)
    if not os.path.exists(path2files):
        # TODO: make a real warning
        raise NotImplementedError("First preprocess the raw data.")

    if isinstance(columns, str):
        columns = [columns]

    if isinstance(dropna, str):
        dropna = [dropna]

    if isinstance(duplicate_subset, str):
        duplicate_subset = [duplicate_subset]

    if isinstance(isindict, dict):
        isindict = {
            isinkey: np.sort(isinlist)
            for isinkey, isinlist in isindict.items()
        }

    FileNumbers = sorted([
        int(fname.replace(dataname, '').split('.')[0])
        for fname in os.listdir(path2files) if dataname in fname
    ])

    desc = ''
    if isinstance(show_progress, str):
        desc = show_progress

    data_df = []
    for ifile in tqdm(FileNumbers,
                      desc=desc,
                      leave=True,
                      disable=not show_progress):
        fname = os.path.join(path2files, dataname + "{}.hdf".format(ifile))
        subdf = pd.read_hdf(fname, mode='r')

        if callable(prefunc2apply):
            subdf = prefunc2apply(subdf)

        if isinstance(columns, list):
            subdf = subdf[columns]

        if isinstance(dropna, list):
            subdf.dropna(subset=dropna, inplace=True, how='any')

        if isinstance(isindict, dict):
            for isinkey, isinlist in isindict.items():
                subdf = subdf[isin_sorted(subdf[isinkey], isinlist)]

        if isinstance(duplicate_subset, list):
            subdf.drop_duplicates(subset=duplicate_subset,
                                  keep=duplicate_keep,
                                  inplace=True)

        if keep_source_file:
            subdf['filetag'] = ifile

        if callable(postfunc2apply):
            postfunc2apply(subdf)

        data_df.append(subdf)

    data_df = pd.concat(data_df)

    if isinstance(duplicate_subset, list):
        data_df.drop_duplicates(subset=duplicate_subset,
                                keep=duplicate_keep,
                                inplace=True)

    return data_df
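
A minimal usage sketch (not part of the original listing): the path and ids are placeholders; the 'publicationauthoraffiliation' directory name follows the layout used in the filtering example above.

import numpy as np

valid_pubids = np.sort([10, 11, 12])   # illustrative PublicationIds

paa_df = load_preprocessed_data('publicationauthoraffiliation',
                                path2database='/path/to/database',
                                columns=['PublicationId', 'AuthorId'],
                                isindict={'PublicationId': valid_pubids},
                                show_progress=True)
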
Code example #8
def cociting_network(pub2ref_df,
                     focus_pub_ids=None,
                     focus_constraint='citing',
                     temporal=False,
                     show_progress=False):
    """
    Create the co-citing network.  Each node is a publication; two publications are linked if they cite the same article.


    Parameters
    ----------

    pub2ref_df : DataFrame
        A DataFrame with the citation links between publications (citing and cited).

    focus_pub_ids : numpy array or list, default None
        A list of the PublicationIds to seed the cocitation-network.

    focus_constraint : str, default 'citing'
        If focus_pub_ids is not None
            - 'citing' : the 'focus_pub_ids' defines the citation set, giving only the co-citations between the references
                of the publications from this set.
            - 'cited' : the 'focus_pub_ids' defines the cocitation node set.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.


    Returns
    -------

    coo_matrix or dict of coo_matrix
        The adjacency matrix for the co-citing network

    citing2int, dict
        A mapping of citing PublicationIds to the row/column of the adjacency matrix.

    """
    required_columns = ['CitedPublicationId', 'CitingPublicationId']
    check4columns(pub2ref_df, required_columns)
    pub2ref_df = pub2ref_df[required_columns].dropna()

    if focus_pub_ids is not None:
        focus_pub_ids = np.sort(focus_pub_ids)

        # identify the subset of the publications we need to form the network
        if focus_constraint == 'citing':
            # take only the links that have a citing publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitingPublicationId'].values, focus_pub_ids)]

        elif focus_constraint == 'cited':
            # take only the links that have a cited publication from the `focus_pub_ids'
            pub2ref_df = pub2ref_df.loc[isin_sorted(
                pub2ref_df['CitedPublicationId'].values, focus_pub_ids)]

    pub2ref_df.drop_duplicates(
        subset=['CitingPublicationId', 'CitedPublicationId'], inplace=True)

    if pub2ref_df.shape[0] > 0:
        #  map cited publications to the rows of the bipartite adj mat
        cited2int = {
            pid: i
            for i, pid in enumerate(
                np.sort(pub2ref_df['CitedPublicationId'].unique()))
        }
        Ncited = pub2ref_df['CitedPublicationId'].nunique()

        pub2ref_df['CitedPublicationId'] = [
            cited2int[pid] for pid in pub2ref_df['CitedPublicationId'].values
        ]

        #  map citing publications to the columns of the bipartite adj mat
        citing2int = {
            pid: i
            for i, pid in enumerate(
                np.sort(pub2ref_df['CitingPublicationId'].unique()))
        }
        Nciting = pub2ref_df['CitingPublicationId'].nunique()

        pub2ref_df['CitingPublicationId'] = [
            citing2int[pid] for pid in pub2ref_df['CitingPublicationId'].values
        ]

        bipartite_adj = dataframe2bipartite(pub2ref_df, 'CitedPublicationId',
                                            'CitingPublicationId',
                                            (Ncited, Nciting))

        adj_mat = project_bipartite_mat(bipartite_adj, project_to='col')

        # remove diagonal entries
        adj_mat.setdiag(0)
        adj_mat.eliminate_zeros()

        # the nodes of the co-citing network are the citing publications, so return their mapping
        return adj_mat, citing2int

    else:
        # no citation links remain: return an empty sparse matrix and an empty mapping
        return spsparse.coo_matrix((0, 0)), {}
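
A minimal usage sketch (not part of the original listing): a toy citation table, assuming the pyscisci helpers used above are importable.

import pandas as pd

# publications 10 and 11 both cite publication 1; publication 11 also cites publication 2
pub2ref_df = pd.DataFrame({'CitingPublicationId': [10, 11, 11],
                           'CitedPublicationId':  [1,  1,  2]})

adj_mat, citing2int = cociting_network(pub2ref_df)
# publications 10 and 11 share one reference (publication 1), so the entry
# linking them (row/column given by citing2int) equals 1
print(adj_mat.todense())
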