def compute_sleepingbeauty(df, colgroupby, colcountby, show_progress=False):
    """
    Compute the sleeping beauty coefficient and awakening time per group.  See :cite:`ke2015beauty` for details.

    The per-group computation is delegated to :py:func:`sleepingbeauty.beauty_coefficient`,
    which is applied to the ``colcountby`` values of every ``colgroupby`` group.

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the citation information for each group.

    colgroupby : str
        The DataFrame column to group by.

    colcountby : str
        The DataFrame column whose per-group values are passed to ``beauty_coefficient``.

    show_progress : bool, default False
        If True, a tqdm progress bar labelled 'Beauty' is displayed.

    Returns
    -------
    DataFrame
        The applied result after ``to_frame().reset_index()``.  Columns named
        ``colcountby`` or '0' are renamed to ``colgroupby + 'Beauty'`` and a column
        named '1' is renamed to 'Awakening' (assuming ``zip2dict`` pairs its two
        lists positionally).

    """
    # hook tqdm into pandas so that progress_apply exists on groupby objects
    tqdm.pandas(desc='Beauty', disable=not show_progress)

    beauty_label = str(colgroupby) + 'Beauty'
    newname_dict = zip2dict([str(colcountby), '0', '1'],
                            [beauty_label, beauty_label, 'Awakening'])

    # groups keep their order of first appearance (sort=False)
    grouped_counts = df.groupby(colgroupby, sort=False)[colcountby]
    beauty_frame = grouped_counts.progress_apply(beauty_coefficient).to_frame()
    return beauty_frame.reset_index().rename(columns=newname_dict)
Beispiel #2
0
def compute_hindex(df, colgroupby, colcountby, show_progress=False):
    """
    Compute the h-index of every group in the DataFrame.  See :cite:`hirsch2005index` for the definition.

    The per-group computation is delegated to :py:func:`citationanalysis.author_hindex`,
    which is applied to the ``colcountby`` values of every ``colgroupby`` group.

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the citation information for each Author.

    colgroupby : str
        The DataFrame column with Author Ids.

    colcountby : str
        The DataFrame column with Citation counts for each publication.

    show_progress : bool, default False
        If True, a tqdm progress bar labelled 'Hindex' is displayed.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: ``colgroupby`` and the h-index column.  The
        latter is renamed from ``colcountby`` (or '0') to ``colgroupby + 'Hindex'``
        (assuming ``zip2dict`` pairs its two lists positionally).

    """
    # hook tqdm into pandas so that progress_apply exists on groupby objects
    tqdm.pandas(desc='Hindex', disable=not show_progress)

    hindex_label = str(colgroupby) + 'Hindex'
    newname_dict = zip2dict([str(colcountby), '0'],
                            [hindex_label, hindex_label])

    # groups keep their order of first appearance (sort=False)
    grouped_counts = df.groupby(colgroupby, sort=False)[colcountby]
    hindex_frame = grouped_counts.progress_apply(author_hindex).to_frame()
    return hindex_frame.reset_index().rename(columns=newname_dict)
Beispiel #3
0
    def author_yearly_productivity(self, df=None, colgroupby = 'AuthorId', datecol = 'Year', colcountby = 'PublicationId', show_progress=False):
        """
        Count the publications of each author in each year.

        The counting itself is done by ``groupby_count`` with ``count_unique=True``,
        grouping on the pair (``colgroupby``, ``datecol``).

        Parameters
        ----------
        df : DataFrame, default None
            A DataFrame with the author2publication information.  If None, ``self.author2pub_df`` is used.

        colgroupby : str, default 'AuthorId'
            The DataFrame column with Author Ids.

        datecol : str, default 'Year'
            The DataFrame column with Year information.

        colcountby : str, default 'PublicationId'
            The DataFrame column with Publication Ids.

        show_progress : bool, default False
            If truthy, the label 'Yearly Productivity' is forwarded to ``groupby_count``
            as its ``show_progress`` argument; otherwise the value is forwarded unchanged.

        Returns
        -------
        DataFrame
            The ``groupby_count`` output with the ``colcountby + 'Count'`` (or '0')
            column renamed to 'YearlyProductivity'.

        """
        source_df = self.author2pub_df if df is None else df

        # a truthy flag is swapped for the text shown on the progress bar
        progress_label = 'Yearly Productivity' if show_progress else show_progress

        newname_dict = zip2dict([str(colcountby)+'Count', '0'],
                                ['YearlyProductivity', 'YearlyProductivity'])
        yearly_counts = groupby_count(source_df, [colgroupby, datecol], colcountby,
                                      count_unique=True, show_progress=progress_label)
        return yearly_counts.rename(columns=newname_dict)
Beispiel #4
0
    def author_career_length(self, df = None, colgroupby = 'AuthorId', colrange = 'Year', show_progress=False):
        """
        Compute the career length of each author, i.e. the span between the
        author's first and last publication as measured by ``groupby_range``
        over the ``colrange`` column.

        Parameters
        ----------
        df : DataFrame, default None
            A DataFrame with the author2publication information.  If None, ``self.author2pub_df`` is used.

        colgroupby : str, default 'AuthorId'
            The DataFrame column with Author Ids.

        colrange : str, default 'Year'
            The DataFrame column with Date information.

        show_progress : bool, default False
            If truthy, the label 'Career Length' is forwarded to ``groupby_range``
            as its ``show_progress`` argument; otherwise the value is forwarded unchanged.

        Returns
        -------
        DataFrame
            The ``groupby_range`` output with the ``colrange + 'Range'`` (or '0')
            column renamed to 'CareerLength'.

        """
        source_df = self.author2pub_df if df is None else df

        # a truthy flag is swapped for the text shown on the progress bar
        progress_label = 'Career Length' if show_progress else show_progress

        newname_dict = zip2dict([str(colrange)+'Range', '0'],
                                ['CareerLength', 'CareerLength'])
        career_spans = groupby_range(source_df, colgroupby, colrange,
                                     show_progress=progress_label)
        return career_spans.rename(columns=newname_dict)
Beispiel #5
0
def author_productivity(pub2author_df, colgroupby = 'AuthorId', colcountby = 'PublicationId', show_progress=False):
    """
    Count the total number of publications of each author.

    The counting itself is done by ``groupby_count`` with ``count_unique=True``.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication information.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with Author Ids.

    colcountby : str, default 'PublicationId'
        The DataFrame column with Publication Ids.

    show_progress : bool, default False
        If truthy, the label 'Author Productivity' is forwarded to ``groupby_count``
        as its ``show_progress`` argument; otherwise the value is forwarded unchanged.

    Returns
    -------
    DataFrame
        The ``groupby_count`` output with the ``colcountby + 'Count'`` (or '0')
        column renamed to 'Productivity'.

    """
    # a truthy flag is swapped for the text shown on the progress bar
    progress_label = 'Author Productivity' if show_progress else show_progress

    newname_dict = zip2dict([str(colcountby)+'Count', '0'],
                            ['Productivity', 'Productivity'])
    pub_counts = groupby_count(pub2author_df, colgroupby, colcountby,
                               count_unique=True, show_progress=progress_label)
    return pub_counts.rename(columns=newname_dict)
Beispiel #6
0
def author_top_field(pub2author_df, colgroupby = 'AuthorId', colcountby = 'FieldId', fractional_field_counts = False, show_progress=False):
    """
    Calculate the most frequent field in each author's career.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication field information.  It must
        contain the columns ``colgroupby``, 'PublicationId' and ``colcountby``.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with Author Ids.

    colcountby : str, default 'FieldId'
        The DataFrame column with the Field Id of each publication-field assignment.

    fractional_field_counts : bool, default False
        How to count publications that are assigned to multiple fields:
            - If False, each publication-field assignment is counted once.
            - If True, each publication is counted once, contributing 1/#fields to each field.

    show_progress : bool, default False
        If True, a tqdm progress bar labelled 'Author Top Field' is displayed.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: ``colgroupby`` and ``'Top' + colcountby``
        ('AuthorId' and 'TopFieldId' with the defaults).

    """

    check4columns(pub2author_df, [colgroupby, 'PublicationId', colcountby])

    # register our pandas apply with tqdm for a progress bar
    tqdm.pandas(desc='Author Top Field', disable= not show_progress)

    if not fractional_field_counts:
        # Series.mode() returns the modes in sorted order, so ties resolve to the smallest value
        author2field = pub2author_df.groupby(colgroupby)[colcountby].progress_apply(lambda x: x.mode()[0])

    else:
        # first calculate how many fields each publication maps to
        pub2nfields = groupby_count(pub2author_df, colgroupby='PublicationId', colcountby=colcountby)

        # The count column is named after colcountby (colcountby + 'Count'), not after
        # the grouping column; the previously hard-coded 'PublicationIdCount' raised a
        # KeyError for the default colcountby='FieldId'.
        count_column = str(colcountby) + 'Count'

        # each pub2field mapping is weighted by the number of fields for the publication
        pub2nfields['PublicationWeight'] = 1.0/pub2nfields[count_column]
        del pub2nfields[count_column]

        # attach the per-publication weight to every author-publication-field row
        author2field = pub2author_df.merge(pub2nfields, how='left', on='PublicationId')

        # weighted mode: the field with the largest summed publication weight
        def weighted_mode(adf):
            p = adf.groupby(colcountby)['PublicationWeight'].sum()
            return p.idxmax()

        # now take the weighted mode for each groupby column
        author2field = author2field.groupby(colgroupby).progress_apply(weighted_mode)

    # Name the value column explicitly.  The fractional branch yields an unnamed
    # Series whose to_frame() column would be the integer 0, which a rename keyed
    # on the string '0' never matches.
    return author2field.to_frame(name='Top' + str(colcountby)).reset_index()
Beispiel #7
0
def yearly_productivity_traj(df,
                             colgroupby='AuthorId',
                             colx='Year',
                             coly='YearlyProductivity'):
    """
    Fit the piecewise linear yearly productivity trajectory originally studied in :cite:`way2017misleading`.

    Each ``colgroupby`` group is handed to ``_fit_piecewise_lineardf``; the pair
    ``(colx, coly)`` is forwarded to it as the keyword argument ``args``.

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the yearly productivity of each group.

    colgroupby : str, default 'AuthorId'
        The DataFrame column to group by.

    colx : str, default 'Year'
        The DataFrame column forwarded as the first element of ``args``.

    coly : str, default 'YearlyProductivity'
        The DataFrame column forwarded as the second element of ``args``.

    Returns
    -------
    DataFrame
        The applied result after ``reset_index()``, with the integer column
        labels 0, 1, 2, 3 renamed to 't_break', 'b', 'm1', 'm2'.

    """
    fit_labels = ['t_break', 'b', 'm1', 'm2']
    # the fit output is expected under integer column labels, hence int keys here
    newname_dict = zip2dict(list(range(4)), fit_labels)

    per_group = df.groupby(colgroupby, sort=False)
    fitted = per_group.apply(_fit_piecewise_lineardf, args=(colx, coly))
    return fitted.reset_index().rename(columns=newname_dict)
Beispiel #8
0
def compute_yearly_productivity_traj(df,
                                     colgroupby='AuthorId',
                                     colx='Year',
                                     coly='YearlyProductivity'):
    """
    Fit the piecewise linear yearly productivity trajectory originally studied in [w].

    Each ``colgroupby`` group is handed to ``_fit_piecewise_lineardf``; the pair
    ``(colx, coly)`` is forwarded to it as the keyword argument ``args``.

    Parameters
    ----------
    df : DataFrame
        A DataFrame with the yearly productivity of each group.

    colgroupby : str, default 'AuthorId'
        The DataFrame column to group by.

    colx : str, default 'Year'
        The DataFrame column forwarded as the first element of ``args``.

    coly : str, default 'YearlyProductivity'
        The DataFrame column forwarded as the second element of ``args``.

    Returns
    -------
    DataFrame
        The applied result after ``reset_index()``, with the integer column
        labels 0, 1, 2, 3 renamed to 't_break', 'b', 'm1', 'm2'.

    References
    ----------
    .. [w] Way, Morgan, Clauset, Larremore (2017): "The misleading narrative of the
           canonical faculty productivity trajectory", *PNAS*.
    """
    fit_labels = ['t_break', 'b', 'm1', 'm2']
    # the fit output is expected under integer column labels, hence int keys here
    newname_dict = zip2dict(list(range(4)), fit_labels)

    per_group = df.groupby(colgroupby, sort=False)
    fitted = per_group.apply(_fit_piecewise_lineardf, args=(colx, coly))
    return fitted.reset_index().rename(columns=newname_dict)
Beispiel #9
0
def longterm_impact(pub2ref_df, colgroupby = 'CitedPublicationId', coldate='CitingYear', show_progress=True):
    """
    Fit the long-term scientific impact model introduced in :cite:`Wang2013longterm`.

    For every ``colgroupby`` group the time elapsed since the group's earliest
    ``coldate`` value is stored in a 'DateDelta' column, rows with a zero delta are
    dropped, the remaining rows are sorted by 'DateDelta', and each group is passed
    to ``_fit_predicted_citations``, which performs the actual model fit.

    Parameters
    ----------
    pub2ref_df : DataFrame
        A DataFrame with one row per citation.  It is copied, not modified.

    colgroupby : str, default 'CitedPublicationId'
        The DataFrame column identifying the cited publication.

    coldate : str, default 'CitingYear'
        The DataFrame column with the time of each citation.  If the name contains
        'Year' the deltas are used as-is; otherwise, if it contains 'Date', the
        deltas are converted to a number of days.

    show_progress : bool, default True
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    DataFrame
        The applied result after ``reset_index()``, with the integer column
        labels 0, 1, 2, 3 renamed to 'lam', 'mu', 'sig', 'm'.

    Raises
    ------
    ValueError
        If ``coldate`` contains neither 'Year' nor 'Date'.

    """
    # Decide the unit handling first so an unsupported column fails fast with a
    # clear message instead of a later KeyError on the missing 'DateDelta' column.
    if 'Year' in coldate:
        convert_to_days = False
    elif 'Date' in coldate:
        convert_to_days = True
    else:
        raise ValueError(
            "Column Date Error: coldate must contain 'Year' or 'Date', got {!r}".format(coldate))

    pub2ref_df = pub2ref_df.copy()

    # time elapsed since the first citation of each cited publication
    elapsed = pub2ref_df.groupby(colgroupby, sort=False)[coldate].transform(lambda x: x - x.min())
    if convert_to_days:
        elapsed = elapsed / np.timedelta64(1, 'D')
    pub2ref_df['DateDelta'] = elapsed

    # Keep only strictly positive deltas; sort without inplace=True so we never
    # mutate a slice of the copied frame.
    pub2ref_df = pub2ref_df.loc[pub2ref_df['DateDelta'] > 0].sort_values(by=['DateDelta'])

    newname_dict = zip2dict(list(range(4)), ['lam', 'mu', 'sig', 'm' ])
    return pub2ref_df.groupby(colgroupby, sort=False).apply(_fit_predicted_citations).reset_index().rename(columns = newname_dict)
Beispiel #10
0
def author_endyear(pub2author_df = None, colgroupby = 'AuthorId', datecol = 'Year', show_progress=False):
    """
    Calculate the year of last publication for each author.

    Parameters
    ----------
    pub2author_df : DataFrame
        A DataFrame with the author2publication information.  Although the
        signature defaults to None, a DataFrame is required.

    colgroupby : str, default 'AuthorId'
        The DataFrame column with Author Ids.

    datecol : str, default 'Year'
        The DataFrame column with Date information.

    show_progress : bool, default False
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    DataFrame
        DataFrame with 2 columns: ``colgroupby`` and 'EndYear', the maximum of
        ``datecol`` within each group.

    Raises
    ------
    ValueError
        If ``pub2author_df`` is None.

    """
    # There is no fallback DataFrame at module level, so fail with a clear message
    # instead of an AttributeError on None.groupby.
    if pub2author_df is None:
        raise ValueError("author_endyear requires a DataFrame: pub2author_df must not be None")

    newname_dict = zip2dict([str(datecol), '0'], ['EndYear']*2)
    return pub2author_df.groupby(colgroupby)[datecol].max().to_frame().reset_index().rename(columns=newname_dict)