Example #1
def threshold_n_binom(params, p_value, thresh_range=None):
    """
    Determine a p-value threshold for a composite negative binomial
    and lognormal distribution based only on the value of the negative
    binomial.

    :param tuple params: Tuple of parameters for a combined negative \
            binomial/lognormal distribution (see :func:`~n_binom_plus_log_normal`)
    :param float p_value: P-value cut-off
    :param list thresh_range: Possible values to consider as a \
            cut-off (default is 0-499).
    :returns: Position above which the integral of the negative \
            binomial is equal to the P-value cut-off.
    """

    if thresh_range is None:
        thresh_range = list(range(500))

    bin_n, bin_p, nm_delta, nm_scale, size = params

    bin_mean, bin_var = nbinom.stats(bin_n, bin_p)

    cumulative_dist = nbinom.cdf(thresh_range, bin_n, bin_p)

    prob_dist = sum_to_1(un_cumulative(cumulative_dist))
    index = bisect_left(prob_dist[::-1], p_value)
    return thresh_range[::-1][index]
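
A minimal usage sketch (the parameter values below are hypothetical; in practice they come from fitting :func:`~n_binom_plus_log_normal` to data):

params = (5.0, 0.1, 1.2, 0.4, 0.3)  # hypothetical (bin_n, bin_p, nm_delta, nm_scale, size)
cutoff = threshold_n_binom(params, 0.05)  # position where the NB tail mass reaches the 5% cut-off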
Example #2
def plot_binom(breaks, counts, params):
    """
    Given the parameters of a composite negative binomial
    and lognormal distribution, plot only the contribution
    of the negative binomial.

    :param breaks: Array containing histogram bin edges.
    :type breaks: :class:`~numpy.ndarray`
    :param counts: Array containing the number of windows \
            falling into each histogram bin.
    :type counts: :class:`~numpy.ndarray`
    :param tuple params: Parameters of the composite \
            distribution (see  :func:`~n_binom_plus_log_normal`).
    """

    bin_n, bin_p, nm_delta, nm_scale, size = params

    bin_mean, bin_var = nbinom.stats(bin_n, bin_p)

    binom_y = neg_binomial(breaks, bin_n, bin_p)
    binom_y = binom_y * sum(counts) * (1. - abs(size))
    binom_y = mask_x_by_z(binom_y, counts)

    fit_x = get_fit_x(breaks, counts)

    return plt.plot(fit_x, binom_y, color='green')
Example #3
def n_binom_plus_log_normal(params, x):
    """
    Composite probability density function for the sum of a lognormal
    distribution and a negative binomial distribution.

    params is a tuple of all parameters for both underlying distributions:

    params = bin_n, bin_p, nm_delta, nm_scale, size

    Parameters for the negative binomial distribution are:

    bin_n = Negative binomial number of successes (see :data:`~scipy.stats.nbinom`)
    bin_p = Negative binomial probability of success (see :data:`~scipy.stats.nbinom`)

    Parameters for the lognormal distribution are:

    nm_delta = Absolute difference between the mean of the lognormal distribution
    and the mean of the negative binomial distribution
    nm_scale = Standard deviation of the lognormal distribution

    The lognormal is parameterized in this particular way because
    we don't want any solutions where the mean of the lognormal is
    less than the mean of the negative binomial. By parameterizing the function
    such that the position of the lognormal is given as a distance from the
    mean of the negative binomial (nm_delta), we can impose that nm_delta
    is always treated as positive.

    The final parameter, size, gives the ratio between the two underlying
    probability distributions.

    :param x: Edges of bins within which to calculate probability density
    :type x: :class:`~numpy.ndarray`
    :param tuple params: Parameters of the composite function
    :returns: Probability distribution over x, normalized to sum to 1. \
            The returned array has one fewer value than x, as the first \
            value of the returned array is the probability density between \
            x[0] and x[1].
    """

    bin_n, bin_p, nm_delta, nm_scale, size = params

    bin_mean, bin_var = nbinom.stats(bin_n, bin_p)

    nm_loc = np.log10(bin_mean) + np.abs(nm_delta)

    bin_y = neg_binomial(x, bin_n, bin_p)

    norm_y = normal(x, nm_loc, nm_scale)

    sum_y = (bin_y * (1. - abs(size))) + (norm_y * abs(size))

    return sum_y / sum(sum_y)
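
A minimal sketch of evaluating this composite density over histogram bin edges (the edges and parameter values here are hypothetical):

import numpy as np

edges = np.arange(0, 101)           # hypothetical histogram bin edges
params = (5.0, 0.1, 1.2, 0.4, 0.3)  # hypothetical (bin_n, bin_p, nm_delta, nm_scale, size)
density = n_binom_plus_log_normal(params, edges)
# per the docstring, density has one fewer value than edges and sums to 1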
Example #4
def _random_noise(df, noise_factor):
    r"""
    Generates random noise on an observable by a Negative Binomial :math:`NB`.
    References to the negative binomial can be found `here <https://ncss-wpengine.netdna-ssl.com/wp-content/themes/ncss/pdf/Procedures/NCSS/Negative_Binomial_Regression.pdf>`_
    .

    .. math::
        O &\sim NB(\mu=datapoint,\alpha)

    We keep the alpha parameter low to obtain a small variance which should than always be approximately the size of the mean.

    Parameters
    ----------
    df : pandas.DataFrame
        Observable (e.g. new_cases) on which we want to add the noise

    noise_factor : float
        Alpha factor :math:`\alpha` for the random number generation

    Returns
    -------
    pandas.DataFrame
        Observable with added noise
    """
    def convert(mu, alpha):
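        # Map (mu, alpha) to scipy's (r, p): with r = 1/alpha and success
        # probability r/(mu + r), nbinom has mean mu and variance mu + alpha*mu**2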
        r = 1 / alpha
        p = mu / (mu + r)
        return r, 1 - p

    # Apply noise on every column
    for column in df:
        # Get values
        array = df[column].to_numpy()

        for i in range(len(array)):
            if (array[i] == 0) or (np.isnan(array[i])):
                continue
            log.debug(f"Data {array[i]}")
            r, p = convert(array[i], noise_factor)
            log.info(f"n {r}, p {p}")
            mean, var = nbinom.stats(r, p, moments="mv")
            log.debug(f"mean {mean} var {var}")
            array[i] = nbinom.rvs(r, p)
            log.debug(f"Drawn {array[i]}")

        df[column] = array

    return df
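
A quick standalone sanity check of this (mu, alpha) parameterization (the values of mu and alpha are arbitrary):

from scipy.stats import nbinom

mu, alpha = 100.0, 0.1  # arbitrary datapoint and noise_factor
r = 1 / alpha
p = r / (mu + r)        # the same mapping as convert() above
mean, var = nbinom.stats(r, p, moments="mv")
print(mean, var)        # 100.0 and 1100.0, i.e. mu and mu + alpha*mu**2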
Example #5
 predR.append(RRest)
 testRRM=1.+infperiod*ln( gamma.ppf(0.99, a=alpha, scale=1./beta) )  # these are the boundaries of the 99% confidence interval for R_t
 if (testRRM <0.): testRRM=0.
 pstRRM.append(testRRM)
 testRRm=1.+infperiod*ln( gamma.ppf(0.01, a=alpha, scale=1./beta) )
 if (testRRm <0.): testRRm=0.
 pstRRm.append(testRRm)
 #print('estimated RR=',RRest,testRRm,testRRM) # to see the numbers for the evolution of Rt
 
 if (new_cases>0. and old_new_cases>0.):
     NewCases.append(new_cases)
     
     # Using a negative binomial as the posterior predictor of new cases, given the old ones
     # This takes parameters r, p which are functions of the new alpha, beta from the Gamma
     r, p = alpha, beta/(old_new_cases+beta)
     mean, var, skew, kurt = nbinom.stats(r, p, moments='mvsk')
     
     pred.append(mean) # the expected value of new cases
     testciM=nbinom.ppf(0.99, r, p) # these are the boundaries of the 99% confidence interval  for new cases
     pstdM.append(testciM)
     testcim=nbinom.ppf(0.01, r, p)
     pstdm.append(testcim)
     
     newp=p
     newr=r
     flag=0
     while (new_cases>testciM or new_cases<testcim):
         if (flag==0):
             anomalyday.append(dates[i+1]) # the first new cases are at i=2
             anomalypred.append(new_cases)
         
Example #6
File: td1.py Project: linkzl/insa
ax.vlines(x, 0, binom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)

rv = binom(n, p)
ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1, label='frozen pmf')
ax.legend(loc='best', frameon=False)



# ============================================= #
# ============ NEGATIVE BINOMIAL ============== #
# ============================================= #
fig, ax = plt.subplots(1, 1)


n, p = 50, 0.4
mean, var, skew, kurt = nbinom.stats(n, p, moments='mvsk')


x = np.arange(nbinom.ppf(0.01, n, p), nbinom.ppf(0.99, n, p))
ax.plot(x, nbinom.pmf(x, n, p), 'bo', ms=8, label='nbinom pmf')
ax.vlines(x, 0, nbinom.pmf(x, n, p), colors='b', lw=5, alpha=0.5)


rv = nbinom(n, p)
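# the frozen distribution fixes n and p once, so rv.pmf, rv.cdf and rv.rvs
# can be reused below without repeating the shape parameters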
ax.vlines(x, 0, rv.pmf(x), colors='k', linestyles='-', lw=1, label='frozen pmf')
ax.legend(loc='best', frameon=False)



# ============================================= #
# ================== GEOMETRIC ================ #
Example #7
def collect_and_plot_passes_nb(teams_list=None,
                               teams_dict=None,
                               plot_output=['single', 'all'],  # pass 'single' or 'all' to draw plots
                               teams_col_dict=None):

    team_sequences = {}
    dict_of_passing_stats = {}
    all_sequences = []

    for tm in teams_list:
        passing_stats = {}
        df = teams_dict[tm]
        list_of_dates = set(df['Date/Time'])

        date_sequences = {}
        for d in list_of_dates:
            df_filter = df[df['Date/Time'] == d]
            df_filter = df_filter[df_filter['Event Type'] != 'Cessation']
            opponent = df_filter['Opponent'].iloc[0]
            kee = str(d) + ' | ' + opponent
            date_sequences[kee] = get_sequences(df_filter)
        team_sequences[tm] = date_sequences
        counts = convert_date_sequences_to_list_and_count(date_sequences)
        all_sequences.extend(counts)

        x_values_for_barplot = [key for key, group in groupby(counts)]
        y_values_for_barplot = [
            i / sum([len(list(group)) for key, group in groupby(counts)])
            for i in [len(list(group)) for key, group in groupby(counts)]
        ]

        ## (GP) NB Estimation
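        # Method-of-moments fit for scipy's nbinom: mean = r*(1-p)/p and
        # variance = r*(1-p)/p**2, which invert to r = mu**2/(sigma**2 - mu)
        # and p = mu/sigma**2 (this requires overdispersion, sigma**2 > mu)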
        mu = sum(counts) / len(counts)
        sigma = math.sqrt(
            sum([(mu - float(i))**2
                 for i in counts]) / (len([(mu - float(i))**2
                                           for i in counts]) - 1))
        r = (mu**2) / (sigma**2 - mu)
        p = (mu) / (sigma**2)

        mean, var, skew, kurt = nbinom.stats(r, p, moments='mvsk')

        passing_stats['nb_probability'] = p
        passing_stats['nb_r'] = r
        passing_stats['avg_passes'] = mean
        passing_stats['var_passes'] = sigma**2
        passing_stats['nb_skew'] = skew
        passing_stats['nb_kurtosis'] = kurt

        dict_of_passing_stats[tm] = passing_stats

        if plot_output == 'single':
            x_values_for_nb = np.arange(nbinom.ppf(0.01, r, p),
                                        nbinom.ppf(0.9999, r, p))
            y_values_for_nb = nbinom.pmf(x_values_for_nb, r, p)

            fig = go.Figure(data=[
                go.Bar(x=x_values_for_barplot,
                       y=y_values_for_barplot,
                       marker_color=teams_col_dict[tm],
                       marker_line_color="black",
                       name="Passes Completed")
            ])

            fig.add_trace(
                go.Scatter(x=x_values_for_nb,
                           y=y_values_for_nb,
                           marker_color="black",
                           mode='lines',
                           name='Negative Binomial Approximation'))

            fig.update_layout(
                title="{}: Catch Counts, with Negative Binomial Estimation".
                format(tm),
                xaxis_title="n Number of Catches",
                yaxis_title="Frequency",
                boxmode='group',
                plot_bgcolor='rgb(220,220,220)')

            iplot(fig)

    all_sequences.sort()

    if plot_output == 'all':
        mu_a = sum(all_sequences) / len(all_sequences)
        sigma_a = math.sqrt(
            sum([(mu_a - float(i))**2 for i in all_sequences]) /
            (len([(mu_a - float(i))**2 for i in all_sequences]) - 1))
        r_a = (mu_a**2) / (sigma_a**2 - mu_a)
        p_a = (mu_a) / (sigma_a**2)

        mean_a, var_a, skew_a, kurt_a = nbinom.stats(r_a, p_a, moments='mvsk')

        x_values_for_barplot_a = [key for key, group in groupby(all_sequences)]
        y_values_for_barplot_a = [
            i /
            sum([len(list(group)) for key, group in groupby(all_sequences)])
            for i in
            [len(list(group)) for key, group in groupby(all_sequences)]
        ]

        x_values_for_nb_a = np.arange(nbinom.ppf(0.01, r_a, p_a),
                                      nbinom.ppf(0.9999, r_a, p_a))
        y_values_for_nb_a = nbinom.pmf(x_values_for_nb_a, r_a, p_a)

        fig = go.Figure(data=[
            go.Bar(x=x_values_for_barplot_a,
                   y=y_values_for_barplot_a,
                   marker_color="oldlace",
                   marker_line_color="black",
                   name="Passes Completed")
        ])

        fig.add_trace(
            go.Scatter(x=x_values_for_nb_a,
                       y=y_values_for_nb_a,
                       marker_color="black",
                       mode='lines',
                       name='Negative Binomial Approximation'))

        fig.update_layout(
            title=
            "League Wide Catch Counts Per Possession, with Negative Binomial Estimation",
            xaxis_title="n Number of Catches in a Possession",
            yaxis_title="Frequency",
            boxmode='group',
            plot_bgcolor='rgb(220,220,220)')

        iplot(fig)

    return (dict_of_passing_stats, team_sequences, all_sequences)
Example #8
def run_luis_model(df: pd.DataFrame, filepath: Path) -> None:

    infperiod = 4.5  # length of infectious period, adjust as needed

    def smooth(y, box_pts):
        box = np.ones(box_pts) / box_pts
        y_smooth = np.convolve(y, box, mode='same')
        return y_smooth

    # Loop through states
    states = df['state'].unique()

    returndf = pd.DataFrame()
    for state in states:

        from scipy.stats import gamma  # not sure why this needs to be re-imported for each state, but otherwise we get a type exception
        import numpy as np

        statedf = df[df['state'] == state].sort_values('date')

        confirmed = list(statedf['positive'])
        dates = list(statedf['date'])
        day = list(range(1, len(statedf['date']) + 1))

        if (confirmed[-1] < 10.):
            continue  # this skips the Rt analysis for states for which there are <10 total cases

        ##### estimation and prediction
        dconfirmed = np.diff(confirmed)
        for ii in range(len(dconfirmed)):
            if dconfirmed[ii] < 0.: dconfirmed[ii] = 0.
        xd = dates[1:]

        sdays = 15
        yy = smooth(
            dconfirmed, sdays
        )  # smoothing over an sdays-long moving window averages out the large reporting chunks that land on consecutive days
        yy[-2] = (
            dconfirmed[-4] + dconfirmed[-3] + dconfirmed[-2]
        ) / 3.  # these last two lines should not be necessary, but the data tend to be initially underreported and the smoother struggles.
        yy[-1] = (dconfirmed[-3] + dconfirmed[-2] + dconfirmed[-1]) / 3.

        #lyyy=np.cumsum(lwy)
        TotalCases = np.cumsum(
            yy
        )  # confirmed cases after smoothing: also tried a lowess smoother, but it was a bit more parameter-dependent from place to place.

        alpha = 3.  # shape parameter of gamma distribution
        beta = 2.  # rate parameter of gamma distribution see https://en.wikipedia.org/wiki/Gamma_distribution

        valpha = []
        vbeta = []

        pred = []
        pstdM = []
        pstdm = []
        xx = []
        NewCases = []

        predR = []
        pstRRM = []
        pstRRm = []

        anomalyday = []
        anomalypred = []

        for i in range(2, len(TotalCases)):
            new_cases = float(TotalCases[i] - TotalCases[i - 1])
            old_new_cases = float(TotalCases[i - 1] - TotalCases[i - 2])

            # This uses a conjugate prior as a Gamma distribution for b_t, with parameters alpha and beta
            alpha = alpha + new_cases
            beta = beta + old_new_cases
            valpha.append(alpha)
            vbeta.append(beta)

            mean = gamma.stats(a=alpha, scale=1 / beta, moments='m')

            RRest = 1. + infperiod * ln(mean)
            if (RRest < 0.): RRest = 0.
            predR.append(RRest)
            testRRM = 1. + infperiod * ln(
                gamma.ppf(0.99, a=alpha, scale=1. / beta)
            )  # these are the boundaries of the 99% confidence interval for R_t
            if (testRRM < 0.): testRRM = 0.
            pstRRM.append(testRRM)
            testRRm = 1. + infperiod * ln(
                gamma.ppf(0.01, a=alpha, scale=1. / beta))
            if (testRRm < 0.): testRRm = 0.
            pstRRm.append(testRRm)

            if (new_cases == 0. or old_new_cases == 0.):
                pred.append(0.)
                pstdM.append(10.)
                pstdm.append(0.)
                NewCases.append(0.)

            if (new_cases > 0. and old_new_cases > 0.):
                NewCases.append(new_cases)

                # Using a negative binomial as the posterior predictor of new cases, given the old ones
                # This takes parameters r, p which are functions of the new alpha, beta from the Gamma
                r, p = alpha, beta / (old_new_cases + beta)
                mean, var, skew, kurt = nbinom.stats(r, p, moments='mvsk')

                pred.append(mean)  # the expected value of new cases
                testciM = nbinom.ppf(
                    0.99, r, p
                )  # these are the boundaries of the 99% confidence interval  for new cases
                pstdM.append(testciM)
                testcim = nbinom.ppf(0.01, r, p)
                pstdm.append(testcim)

                newp = p
                newr = r
                flag = 0

                while (new_cases > testciM or new_cases < testcim):
                    if (flag == 0):
                        anomalypred.append(new_cases)
                        anomalyday.append(
                            dates[i + 1])  # the first new cases are at i=2

                    # annealing: increase the variance so as to encompass the anomalous observation and let the Bayesian code recover
                    # mean of the negative binomial = r*(1-p)/p, variance = r*(1-p)/p**2
                    # preserve the mean, increase the variance --> newp = 0.95*p (smaller), newr = r*(newp/p)*((1.-p)/(1.-newp))
                    # then re-test the anomaly against the widened interval

                    nnp = 0.95 * newp  # shrink p by 5% per pass, inflating the variance (variance = mean/p), which tends to be small after many Bayesian steps
                    newr = newr * (nnp / newp) * (
                        (1. - newp) / (1. - nnp)
                    )  # this assignment preserves the mean of expected cases
                    newp = nnp
                    mean, var, skew, kurt = nbinom.stats(newr,
                                                         newp,
                                                         moments='mvsk')
                    testciM = nbinom.ppf(0.99, newr, newp)
                    testcim = nbinom.ppf(0.01, newr, newp)

                    flag = 1
                else:
                    if (flag == 1):
                        alpha = newr  # update the R distribution with the new parameters that enclose the anomaly
                        beta = newp / (1. - newp) * old_new_cases

                        testciM = nbinom.ppf(0.99, newr, newp)
                        testcim = nbinom.ppf(0.01, newr, newp)

                        # annealing leaves the RR mean unchanged, but we need to adjust its widened CI:
                        testRRM = 1. + infperiod * ln(
                            gamma.ppf(0.99, a=alpha, scale=1. / beta)
                        )  # these are the boundaries of the 99% confidence interval for R_t
                        if (testRRM < 0.): testRRM = 0.
                        testRRm = 1. + infperiod * ln(
                            gamma.ppf(0.01, a=alpha, scale=1. / beta))
                        if (testRRm < 0.): testRRm = 0.

                        pstRRM = pstRRM[:
                                        -1]  # remove last element and replace by expanded CI for RRest
                        pstRRm = pstRRm[:-1]
                        pstRRM.append(testRRM)
                        pstRRm.append(testRRm)

        # visualization of the time evolution of R_t with confidence intervals
        x = []
        for i in range(len(predR)):
            x.append(i)
        days = dates[3:]
        xd = days
        dstr = []
        for xdd in xd:
            dstr.append(xdd.strftime("%Y-%m-%d"))

        appenddf = pd.DataFrame({
            'state': state,
            'date': days,
            'RR_pred_luis': predR,
            'RR_CI_lower_luis': pstRRm,
            'RR_CI_upper_luis': pstRRM
        })
        returndf = pd.concat([returndf, appenddf], axis=0)

    returndf.to_csv(filepath / "luis_code_estimates.csv", index=False)
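
The Bayesian core of the model above is compact enough to isolate: a Gamma prior on the growth rate, updated with consecutive case counts, yields a negative binomial posterior predictive for the next day's cases. A minimal sketch using the same update as the loop above, with hypothetical counts:

from scipy.stats import nbinom

alpha, beta = 3., 2.                           # Gamma prior, as in the model above
new_cases, old_new_cases = 120., 100.          # hypothetical consecutive daily counts
alpha, beta = alpha + new_cases, beta + old_new_cases
r, p = alpha, beta / (old_new_cases + beta)    # negative binomial posterior predictive
print(nbinom.stats(r, p, moments='mv'))        # expected new cases and their variance
print(nbinom.ppf([0.01, 0.99], r, p))          # bounds of the 98% predictive interval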
Example #9
# NEGATIVE BINOMIAL DISTRIBUTION

import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import nbinom

nbinom.pmf(k=5, n=2, p=0.1)

nbinom.pmf(k=5, n=2, p=0.1, loc=0)

nbinom.cdf(k=4, n=2, p=0.1)

1 - nbinom.cdf(k=4, n=2, p=0.1)
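nbinom.sf(k=4, n=2, p=0.1)  # equivalently, the survival function 1 - cdf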

nbinom.rvs(n=2, p=0.1, size=100)

params = nbinom.stats(n=2, p=0.1, moments='mv')

'E(X) = {} and Var(X) = {}'.format(params[0], params[1])

n, p = 10, 0.25
x = np.arange(nbinom.ppf(0.01, n, p), nbinom.ppf(0.99, n, p))
fig = plt.figure(figsize=(5, 2.7))
ax = fig.add_subplot(1, 2, 1)
ax.plot(x, nbinom.pmf(x, n, p), 'bo', ms=8, label="nbinom pmf")
ax.vlines(x, 0, nbinom.pmf(x, n, p), color="b", lw=5, alpha=0.5)
ax.tick_params(axis='both', labelsize=5)  # shrink the tick label font on both axes
ax = fig.add_subplot(1, 2, 2)
ax.plot(x, nbinom.cdf(x, n, p), 'bo', ms=8, label='nbinom cdf')