def author_yearly_productivity(self, df=None, colgroupby = 'AuthorId', datecol = 'Year', colcountby = 'PublicationId', show_progress=False):
    """
    Calculate the number of publications for each author in each year.

    Parameters
    ----------
    df : DataFrame, default None, Optional
        A DataFrame with the author2publication information.  If None then
        the database 'author2pub_df' is used.

    colgroupby : str, default 'AuthorId', Optional
        The DataFrame column with Author Ids.

    datecol : str, default 'Year', Optional
        The DataFrame column with Year information.

    colcountby : str, default 'PublicationId', Optional
        The DataFrame column with Publication Ids.

    show_progress : bool, default False, Optional
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        Productivity DataFrame with 3 columns: 'AuthorId', 'Year', 'YearlyProductivity'
    """
    if df is None:
        df = self.author2pub_df

    # a truthy show_progress is replaced by a label string for the progress bar
    if show_progress:
        show_progress = 'Yearly Productivity'

    # count the unique publications per (author, year) pair, then rename the
    # generated count column to 'YearlyProductivity'
    yearly_counts = groupby_count(df, [colgroupby, datecol], colcountby,
                                  count_unique=True, show_progress=show_progress)
    rename_map = zip2dict([str(colcountby) + 'Count', '0'], ['YearlyProductivity'] * 2)
    return yearly_counts.rename(columns=rename_map)
def author_productivity(pub2author_df, colgroupby = 'AuthorId', colcountby = 'PublicationId', show_progress=False):
    """
    Calculate the total number of publications for each author.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication information.

    colgroupby : str, default 'AuthorId', Optional
        The DataFrame column with Author Ids.

    colcountby : str, default 'PublicationId', Optional
        The DataFrame column with Publication Ids.

    show_progress : bool, default False, Optional
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        Productivity DataFrame with 2 columns: 'AuthorId', 'Productivity'
    """
    # a truthy show_progress is replaced by a label string for the progress bar
    if show_progress:
        show_progress = 'Author Productivity'

    # count unique publications per author and rename the generated count
    # column to 'Productivity'
    counts = groupby_count(pub2author_df, colgroupby, colcountby,
                           count_unique=True, show_progress=show_progress)
    rename_map = zip2dict([str(colcountby) + 'Count', '0'], ['Productivity'] * 2)
    return counts.rename(columns=rename_map)
def author_top_field(pub2author_df, colgroupby = 'AuthorId', colcountby = 'FieldId', fractional_field_counts = False, show_progress=False):
    """
    Calculate the most frequent field in the authors career.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication field information.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with Author Ids.

    colcountby : str, default 'FieldId'
        The DataFrame column with field information for each publication.

    fractional_field_counts : bool, default False
        How to count publications that are assigned to multiple fields:
            - If False, each publication-field assignment is counted once.
            - If True, each publication is counted once, contributing 1/#fields to each field.

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: 'AuthorId', 'TopFieldId'
    """
    check4columns(pub2author_df, [colgroupby, 'PublicationId', colcountby])

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Author Top Field', disable= not show_progress)

    if not fractional_field_counts:
        # the most common field value within each author group
        author2field = pub2author_df.groupby(colgroupby)[colcountby].progress_apply(lambda x: x.mode()[0])

    else:
        # first calculate how many fields each publication maps to
        pub2nfields = groupby_count(pub2author_df, colgroupby='PublicationId', colcountby=colcountby)

        # each pub2field mapping is weighted by the number of fields for the publication.
        # BUG FIX: groupby_count names its count column str(colcountby)+'Count'
        # (here 'FieldIdCount'), as seen in the other uses of groupby_count in this
        # file; the original indexed the non-existent 'PublicationIdCount' and
        # raised a KeyError.
        countcol = str(colcountby) + 'Count'
        pub2nfields['PublicationWeight'] = 1.0 / pub2nfields[countcol]
        del pub2nfields[countcol]

        # merge counts
        author2field = pub2author_df.merge(pub2nfields, how='left', on='PublicationId')

        # custom weighted mode: the field with the largest summed publication weight
        def weighted_mode(adf):
            p = adf.groupby(colcountby)['PublicationWeight'].sum()
            return p.idxmax()

        # now take the weighted mode for each groupby column
        author2field = author2field.groupby(colgroupby).progress_apply(weighted_mode)

    newname_dict = zip2dict([str(colcountby), '0'], ['Top' + str(colcountby)]*2)
    return author2field.to_frame().reset_index().rename(columns=newname_dict)
def compute_yearly_citations(self, preprocess = True, show_progress=False):
    """
    Compute the number of citations received by each publication in each year.

    Parameters
    ----------
    preprocess : bool, default True, Optional
        If True, the yearly-citation DataFrame is saved in per-CitedYear HDF
        files under 'temporalimpact/' and nothing is returned.  If False, the
        DataFrame is returned instead.

    show_progress : bool, default False, Optional
        If True, print status messages as the computation proceeds.

    Returns
    -------
    DataFrame or None
        Yearly citation DataFrame with columns 'PublicationId', 'CitingYear',
        'YearlyCitations', 'CitedYear' (only when preprocess is False).
    """
    if show_progress:
        print("Starting Computation of Yearly Citations")

    # first load the publication year information
    pub2year = self.pub2year

    # now get the reference list and merge with year info
    pub2ref = self.pub2ref_df

    # publications missing from pub2year are assigned year 0
    pub2ref['CitingYear'] = [pub2year.get(citingpid, 0) for citingpid in pub2ref['CitingPublicationId'].values]

    # drop all citations that happened before the publication year
    pub2ref = pub2ref.loc[[citingyear >= pub2year.get(citedpid, 0) for citingyear, citedpid in pub2ref[['CitingYear', 'CitedPublicationId']].values]]

    if show_progress:
        print("Yearly Citation Data Prepared")

    # calculate the total citations per (cited publication, citing year)
    citation_df = groupby_count(pub2ref, colgroupby=['CitedPublicationId', 'CitingYear'], colcountby='CitingPublicationId', count_unique=True)

    citation_df.rename(columns={'CitingPublicationIdCount':'YearlyCitations', 'CitedPublicationId':'PublicationId'}, inplace=True)

    # get the Cited Year
    citation_df['CitedYear'] = [pub2year.get(pid, 0) for pid in citation_df['PublicationId'].values]

    # BUG FIX: 'CitedPublicationId' was renamed to 'PublicationId' above, so
    # sorting by the old column name raised a KeyError.
    citation_df.sort_values(by=['CitedYear', 'PublicationId', 'CitingYear'], inplace=True)

    if show_progress:
        print("Yearly Citations Found")

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'temporalimpact')):
            os.mkdir(os.path.join(self.path2database, 'temporalimpact'))

        # one HDF file per cited year
        for y, cdf in citation_df.groupby('CitedYear', sort=True):
            cdf.to_hdf(os.path.join(self.path2database, 'temporalimpact', 'temporalimpact{}.hdf'.format(y)), mode='w', key ='temporalimpact')

        if show_progress:
            print("Yearly Citations Saved")

    else:
        return citation_df
def credit_share(focus_pid, pub2ref_df, pub2author_df, temporal=False, normed=False, show_progress=False):
    """
    Calculate the credit share for each author of a publication based on :cite:`Shen2014credit`.

    Parameters
    ----------
    focus_pid : int, str
        The focus publication id.

    pub2ref_df : DataFrame
        A DataFrame with the citation information for each Publication.

    pub2author_df : DataFrame
        A DataFrame with the author information for each Publication.

    temporal : bool, default False
        If True, compute the credit share broken down by each citing year.

    normed : bool, default False
        Normalize the sum of credit share to 1.0

    show_progress : bool, default False
        If True, show a progress bar tracking the calculation.

    Returns
    -------
    credit_share, numpy array
        If temporal == False:
            The credit share vector, one entry per focus-publication author.
        If temporal == True:
            A matrix with one row per citing year (cumulative over years) and
            one column per focus-publication author.

    author2int, dict
        A mapping of the AuthorIds from the focus publication to the column
        of the credit share vector or matrix (see above).

    years, numpy array
        Only returned when temporal == True: the sorted citing years
        corresponding to the rows of the credit share matrix.

    Notes
    -----
    If the focus publication has no authors in pub2author_df, the function
    falls through both branches and implicitly returns None.
    """
    # the focus publication's authors
    focus_authors = np.sort(pub2author_df.loc[pub2author_df['PublicationId']==focus_pid]['AuthorId'].unique())
    author2int = {aid:i for i, aid in enumerate(focus_authors)}

    if focus_authors.shape[0] > 1:
        # start by getting the co-citation network around the focus publication
        adj_mat, cited2int = cocitation_network(pub2ref_df, focus_pub_ids=np.sort([focus_pid]), focus_constraint='egocited', temporal=temporal, show_progress=show_progress)

        # get the authorships for the publications in the cocitation network
        cocited_pubs = np.sort(list(cited2int.keys()))
        pa_df = pub2author_df.loc[isin_sorted(pub2author_df['PublicationId'].values, cocited_pubs)]

        if cocited_pubs.shape[0] > 0:
            # the credit allocation matrix has a row for each focus author, and a column for each cocited publication (including the focus pub)
            credit_allocation_mat = np.zeros((focus_authors.shape[0], cocited_pubs.shape[0]), dtype = float)

            # for each cocited publication, we count the number of authors
            # and assign to each focus author, their fractional share of the credit (1 divided by the number of authors)
            # NOTE: only authors that also appear on the focus publication receive credit
            for cocitedid, adf in pa_df.groupby('PublicationId'):
                author2row = [author2int[aid] for aid in adf['AuthorId'].unique() if not author2int.get(aid, None) is None]
                if len(author2row) > 0:
                    credit_allocation_mat[author2row, cited2int[cocitedid]] = 1.0/adf['AuthorId'].nunique()

            if temporal:
                # temporal credit allocation - broken down by year

                # we need the temporal citations to the focus article
                focus_citations = groupby_count(pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values, np.sort([focus_pid]))], colgroupby='CitingYear', colcountby='CitingPublicationId', count_unique=True, show_progress=False)
                focus_citations={y:c for y,c in focus_citations[['CitingYear', 'CitingPublicationIdCount']].values}

                # when temporal is True, a temporal adj mat is returned where each key is the year
                years = np.sort(list(adj_mat.keys()))

                cocite_counts = np.zeros((years.shape[0], cocited_pubs.shape[0]), dtype=float)

                for iy, y in enumerate(years):
                    # set the off-diagonal to be the total co-citations from that year
                    cocite_counts[iy] = adj_mat[y].tocsr()[cited2int[focus_pid]].todense()
                    # set the diagonal to be the total citations from that year
                    # NOTE(review): assumes every year in adj_mat also appears in
                    # focus_citations; a year with co-citations but no direct
                    # citation would raise KeyError — confirm against cocitation_network
                    cocite_counts[iy, cited2int[focus_pid]] = focus_citations[y]

                # accumulate the counts so each row reflects all citations up to that year
                cocite_counts = cocite_counts.cumsum(axis=0)

            else:
                # just do credit allocation with the full cocitation matrix
                cocite_counts = adj_mat.tocsr()[cited2int[focus_pid]].todense()

                # the co-citation matrix misses the number of citations to the focus publication
                # so explicitly calculate the number of citations to the focus publication
                cocite_counts[0,cited2int[focus_pid]] = pub2ref_df.loc[isin_sorted(pub2ref_df['CitedPublicationId'].values, np.sort([focus_pid]))]['CitingPublicationId'].nunique()

            # credit share is the matrix product of the credit_allocation_mat with cocite_counts
            credit_share = np.squeeze(np.asarray(credit_allocation_mat.dot(cocite_counts.T)))

            # normalize the credit share vector to sum to 1
            if normed:
                credit_share = credit_share/credit_share.sum(axis=0)

            if temporal:
                return credit_share, author2int, years
            else:
                return credit_share, author2int
        else:
            # no co-cited publications: credit is undefined, return None placeholders
            if temporal:
                years = np.sort(pub2ref_df.loc[pub2ref_df['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
                return np.array([[None for y in years] for a in author2int]), author2int, years
            else:
                return np.array([None for a in author2int]), author2int

    elif focus_authors.shape[0] == 1:
        # a single author always receives full credit
        if temporal:
            years = np.sort(pub2ref_df.loc[pub2ref_df['CitedPublicationId'] == focus_pid]['CitingYear'].unique())
            return np.ones(shape=(1,years.shape[0])), author2int, years
        else:
            return np.array([1.0]), author2int
def compute_impact(self, preprocess=True, citation_horizons = [5,10], noselfcite = True):
    """
    Calculate several of the common citation indices.

    * 'Ctotal' : The total number of citations.
    * 'Ck' : The total number of citations within the first k years of publication, for each k value specified by `citation_horizons`.
    * 'Ctotal_noself' : The total number of citations with self-citations removed.
    * 'Ck_noself' : The total number of citations within the first k years of publication with self-citations removed, for each k value specified by `citation_horizons`.

    Parameters
    ----------
    preprocess : bool, default True, Optional
        If True then the impact measures are saved in preprocessed files
        (one HDF file per publication year under 'impact/') and nothing is
        returned; the DataFrame is returned only when preprocess is False.

    citation_horizons : list, default [5,10], Optional
        The citation time horizons (in years after publication) at which to
        count citations.

    noselfcite : bool, default True, Optional
        If True then the noselfcitation pub2ref files are also processed.

    Returns
    -------
    DataFrame or None
        The impact DataFrame with at least two columns:
        'PublicationId', 'Year', + citation columns
        (only returned when preprocess is False).
    """
    # first load the publication year information
    pub2year = self.load_pub2year()

    # now get the reference list and merge with year info
    pub2ref = self.pub2ref_df

    # drop all citations that happened before the publication year
    # (publications missing from pub2year are assigned year 0)
    pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) >= pub2year.get(citedpid, 0) for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

    # calculate the total citations
    citation_df = groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True)
    citation_df.rename(columns={'CitingPublicationIdCount':'Ctotal', 'CitedPublicationId':'PublicationId'}, inplace=True)

    # go from the largest k down: each pass keeps only citations within k years,
    # so the successively smaller horizons filter the already-filtered frame
    for k in np.sort(citation_horizons)[::-1]:
        # drop all citations that happened after the k-year horizon
        #pub2ref = pub2ref.loc[pub2ref['CitingPublicationYear'] <= pub2ref['CitedPublicationYear'] + k]
        pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) <= pub2year.get(citedpid, 0) + k for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

        # recalculate the impact
        k_citation_df = groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True)
        k_citation_df.rename(columns={'CitingPublicationIdCount':'C{}'.format(k), 'CitedPublicationId':'PublicationId'}, inplace=True)

        citation_df = citation_df.merge(k_citation_df, how='left', on='PublicationId')

    # get the Cited Year
    citation_df['Year'] = [pub2year.get(pid, 0) for pid in citation_df['PublicationId'].values]

    if noselfcite:
        # repeat the same computation on the self-citation-free reference list
        del pub2ref
        pub2ref = self.pub2refnoself_df

        # drop all citations that happened before the publication year
        pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) >= pub2year.get(citedpid, 0) for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

        # calculate the total citations
        citation_noself_df = groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True)
        citation_noself_df.rename(columns={'CitingPublicationIdCount':'Ctotal_noself', 'CitedPublicationId':'PublicationId'}, inplace=True)

        # go from the largest k down
        for k in np.sort(citation_horizons)[::-1]:
            # drop all citations that happened after the k-year horizon
            #pub2ref = pub2ref.loc[pub2ref['CitingPublicationYear'] <= pub2ref['CitedPublicationYear'] + k]
            pub2ref = pub2ref.loc[[pub2year.get(citingpid, 0) <= pub2year.get(citedpid, 0) + k for citingpid, citedpid in pub2ref[['CitingPublicationId', 'CitedPublicationId']].values]]

            # recalculate the impact
            k_citation_df = groupby_count(pub2ref, colgroupby='CitedPublicationId', colcountby='CitingPublicationId', count_unique=True)
            k_citation_df.rename(columns={'CitingPublicationIdCount':'C{}_noself'.format(k), 'CitedPublicationId':'PublicationId'}, inplace=True)

            citation_noself_df = citation_noself_df.merge(k_citation_df, how='left', on='PublicationId')

        citation_df = citation_df.merge(citation_noself_df, how='left', on='PublicationId')

    # set all nan to 0 (publications with no citations in a given horizon)
    citation_df.fillna(0, inplace=True)

    if preprocess:
        if not os.path.exists(os.path.join(self.path2database, 'impact')):
            os.mkdir(os.path.join(self.path2database, 'impact'))

        # one HDF file per publication year
        for y, cdf in citation_df.groupby('Year', sort=True):
            cdf.to_hdf(os.path.join(self.path2database, 'impact', 'impact{}.hdf'.format(y)), mode='w', key ='impact')

    else:
        return citation_df