Example #1
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals):
    N_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain)
    theta = []
    ci_theta = []
    for i, j in enumerate(d_t):
        # i is the index in the list,
        # j is the global index of the rule

        if Ytrain.shape[-1] == 2:
            #theta ~ Dirichlet(N[j,:] + alpha)
            #E[theta] = (N[j,:] + alpha)/float(sum(N[j,:] + alpha))
            #NOTE this result is only for binary classification
            #theta = p(y=1)
            theta.append((N_t[i, 1] + alpha[1]) / float(sum(N_t[i, :] + alpha)))
            # And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
            if intervals:
                ci_theta.append(beta.interval(0.95, N_t[i, 1] + alpha[1], N_t[i, 0] + alpha[0]))

        else:
            # theta ~ Dirichlet(N[j,:] + alpha)
            # E[theta] = (N[j,:] + alpha) / float(sum(N[j,:] + alpha))
            theta.append((N_t[i, :] + alpha) / (N_t[i, :] + alpha).sum())

            # marginal of a dirichlet is beta.
            # X_i ~ Beta(alpha[i], alpha.sum() - alpha[i])
            if intervals:
                alpha_i = N_t[i, :] + alpha
                ci_theta.append(beta.interval(0.95, alpha_i, (N_t[i, :] + alpha).sum() - alpha_i))
    return theta, ci_theta
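# A minimal, self-contained sketch of the posterior math used above: in the
# binary case the rule consequent is Beta(N1 + alpha[1], N0 + alpha[0]), so its
# mean and central 95% interval follow directly. The counts and prior below are
# made up for illustration.
from scipy.stats import beta

N1, N0 = 30, 10                # hypothetical counts of y=1 and y=0 under one rule
alpha1, alpha0 = 1.0, 1.0      # uniform Beta/Dirichlet prior
theta_hat = (N1 + alpha1) / (N1 + N0 + alpha1 + alpha0)
lo, hi = beta.interval(0.95, N1 + alpha1, N0 + alpha0)
print('theta = {:.3f}, 95% CI = ({:.3f}, {:.3f})'.format(theta_hat, lo, hi))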
Example #2
    def get_distribution_limits(self) -> Tuple[float, float]:
        lower_limit_0, upper_limit_0 = beta.interval(SequenceAssociationLikelihood.DISTRIBUTION_PERCENTAGE_TO_SHOW,
                                                     self.method.alpha_0, self.method.beta_0)
        lower_limit_1, upper_limit_1 = beta.interval(SequenceAssociationLikelihood.DISTRIBUTION_PERCENTAGE_TO_SHOW,
                                                     self.method.alpha_1, self.method.beta_1)
        lower_limit = min(lower_limit_0, lower_limit_1)
        upper_limit = max(upper_limit_0, upper_limit_1)

        return lower_limit, upper_limit
Example #3
def switching_BER(data, **kwargs):
    """ Process data for BER experiment. """
    count_mat, start_stt = count_matrices_ber(data, **kwargs)
    switched_stt = int(1 - start_stt)
    mean = beta.mean(1 + count_mat[start_stt, switched_stt],
                     1 + count_mat[start_stt, start_stt])
    limit = beta.mean(
        1 + count_mat[start_stt, switched_stt] +
        count_mat[start_stt, start_stt], 1)
    ci68 = beta.interval(0.68, 1 + count_mat[start_stt, switched_stt],
                         1 + count_mat[start_stt, start_stt])
    ci95 = beta.interval(0.95, 1 + count_mat[start_stt, switched_stt],
                         1 + count_mat[start_stt, start_stt])
    return mean, limit, ci68, ci95
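# Hedged usage sketch: count_matrices_ber is external, so the 2x2 count matrix
# below is a made-up stand-in showing only the Beta-posterior step of switching_BER.
import numpy as np
from scipy.stats import beta

count_mat = np.array([[90, 10],    # hypothetical [start_state, end_state] counts
                      [5, 95]])
start_stt, switched_stt = 0, 1
mean = beta.mean(1 + count_mat[start_stt, switched_stt],
                 1 + count_mat[start_stt, start_stt])
ci95 = beta.interval(0.95, 1 + count_mat[start_stt, switched_stt],
                     1 + count_mat[start_stt, start_stt])
print(mean, ci95)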
Example #4
def avg_y_by_x(x, y):
    x = np.array(x)
    y = np.array(y)

    xs = sorted(list(set(x)))

    xv = []
    yv = []
    lcb = []
    ucb = []
    n_obs = []

    for v in xs:
        ys = [y[i] for i, e in enumerate(x) if e == v]
        if len(ys) > 0:
            xv.append(v)
            yv.append(sum(ys) / len(ys))
            n_obs.append(len(ys))

            unique, counts = np.unique(ys, return_counts=True)
            counts = dict(zip(unique, counts))

            if 0 not in counts:
                counts[0] = 0
            if 1 not in counts:
                counts[1] = 0

            ci = beta.interval(0.95, 0.5 + counts[1], 0.5 + counts[0])  # Jeffreys interval for the mean of ys
            lcb.append(ci[0])
            ucb.append(ci[1])

    return xv, yv, lcb, ucb, n_obs
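# Quick check of avg_y_by_x on tiny synthetic data (values are illustrative);
# assumes numpy and scipy.stats.beta are imported as in the snippet above.
xv, yv, lcb, ucb, n_obs = avg_y_by_x([0, 0, 0, 1, 1, 1],
                                     [0, 1, 1, 1, 1, 0])
print(xv, yv)      # unique x values and the mean outcome at each
print(lcb, ucb)    # per-value 95% bounds from the Beta interval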
Example #5
    def _interval(self, row):
        interval = beta.interval(
            self._interval_size,
            row[self._numerator_column] + self._alpha_prior,
            row[self._denominator_column] - row[self._numerator_column] +
            self._beta_prior)
        return interval
Example #6
def part_a3(fname=fname):
    fname = fname + '_a3'
    # m = 5
    # theta_set = np.linspace(1/float(m),1-1/float(m),m)
    # print(theta_set)
    a = 2
    b = 8
    n = 34
    y = 15
    a2 = y + a
    b2 = n + b - y
    x = np.linspace(0, 1, 100)
    fig = plt.figure()
    plt.plot(x, beta.pdf(x, a2, b2), lw=1.5)
    # hh = []
    # for i, theta in enumerate(theta_set):
    #     h, = plt.plot(x, binom.pmf(x, n, theta), lw=1.5, label=r'$\theta='+'{:.2f}'.format(theta)+'$')
    #     hh.append(h)
    plt.xlabel(r'$\theta$', fontsize=labelFontSize)
    plt.ylabel(r'$p(\theta \mid y)$', fontsize=labelFontSize)
    plt.title('3.4a' + r' $p(\theta \mid y)$', fontsize=titleFontSize)
    plt.xticks(fontsize=tickFontSize)
    plt.yticks(fontsize=tickFontSize)
    # plt.legend()
    plt.savefig(fname + '.' + imgFmt, format=imgFmt)
    print('95% CI = ', beta.interval(0.95, a2, b2))
Example #7
def calculate_proportion(_x, _n):
    x = _x.round()
    n = _n.round()
    ci_low, ci_upp = beta.interval(1 - 0.05, x + 0.5,
                                   n - x + 0.5)  # Jeffreys Interval
    est_proportion = _x / _n
    return est_proportion, ci_low, ci_upp
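# Example call with made-up numbers: 12 successes out of 40 trials. The
# function only needs array-likes with a .round() method, so numpy arrays work.
import numpy as np
est, lo, hi = calculate_proportion(np.array([12.0]), np.array([40.0]))
print(est, lo, hi)    # point estimate plus Jeffreys 95% bounds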
Example #8
def _calc_prob_interval(volume, probs, prob_vars):
    """Compute the confidence interval of probability."""
    if not isinstance(probs, np.ndarray):
        probs = np.asarray(probs)
    if not isinstance(prob_vars, np.ndarray):
        prob_vars = np.asarray(prob_vars)
    one_minus_probs = 1 - probs
    alpha_coef = (np.square(probs) * one_minus_probs / prob_vars) - probs
    beta_coef = alpha_coef * one_minus_probs / probs
    intervals = beta.interval(volume, alpha_coef, beta_coef)

    # avoid invalid results due to extremely small values of prob_vars
    lows = []
    highs = []
    for i, low in enumerate(intervals[0]):
        high = intervals[1][i]
        if prob_vars[i] <= 0 or \
                not np.isfinite(low) or low > probs[i] or \
                not np.isfinite(high) or high < probs[i]:
            low = probs[i]
            high = probs[i]
        lows.append(low)
        highs.append(high)

    return lows, highs
Example #9
def binomial_binning(dat,
                     grouping_variables,
                     ci=.95,
                     rule_of_succession=True,
                     bernoulli_column='correct'):
    """Bin trials based on grouping variables, returning a new data frame
    with binomial outcome columns (successes, N_trials, plus proportion correct)
    rather than each row being a single trial.
    This data format will significantly speed up model fitting.

    :param dat:
        a pandas dataframe containing the data. Must have
        grouping_variables columns and also a column corresponding to
        bernoulli outcomes (0, 1).
    :param grouping_variables:
        a string or list of strings containing the column names
        to group over.
    :param ci:
        the percentage of the confidence intervals.
    :param rule_of_succession:
        if true, apply a rule-of-succession correction to the data by
        adding one success and one failure to the total number of trials.
        This is essentially a prior acknowledging the possibility of both
        successes and failures, and is used to correct for values with
        proportions of 0 or 1 (i.e. allow estimation of beta errors).
    :param bernoulli_column:
        A string naming the column of the dataframe corresponding to bernoulli
        trial outcome. Defaults to "correct".
    :returns:
        a new pandas dataframe where each row is a binned (binomial) observation.

    Example
    ----------
    res = binomial_binning(dat, ['subj', 'surround', 'scale'])


    """
    grouped = dat.groupby(grouping_variables, as_index=False)
    res = grouped[bernoulli_column].agg(n_successes='sum', n_trials='size')

    if rule_of_succession:
        res.loc[:, 'n_successes'] += 1
        res.loc[:, 'n_trials'] += 2

    # compute some additional values:
    res.loc[:, 'prop_corr'] = res.n_successes / res.n_trials

    # confidence intervals from a beta distribution:
    cis = _beta.interval(ci, res.n_successes, (res.n_trials-res.n_successes))
    res.loc[:, 'ci_min'] = cis[0]
    res.loc[:, 'ci_max'] = cis[1]
    res.loc[:, 'error_min'] = _np.abs(res['ci_min'].values -
                                      res['prop_corr'].values)
    res.loc[:, 'error_max'] = _np.abs(res['ci_max'].values -
                                      res['prop_corr'].values)

    return res
Example #10
def acc_ci(confusion_matrix, alpha=0.05):
    """Takes an NxN confusion matrix and returns a confidence interval
    for the fraction of correct predictions."""

    x = confusion_matrix.diagonal().sum()
    N = confusion_matrix.sum()

    ci = beta.interval(1 - alpha, x, N - x)
    return ci
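# Toy usage with a hypothetical 3x3 confusion matrix. Note the interval is
# undefined when x == 0 or x == N, since Beta(x, N - x) needs positive shapes.
import numpy as np
cm = np.array([[50, 2, 1],
               [3, 40, 2],
               [0, 1, 45]])
print(acc_ci(cm))    # 95% interval by default (alpha=0.05)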
Example #12
def binomial_binning(dat,
                     y='correct',
                     grouping_variables='x',
                     ci=.95,
                     rule_of_succession=False):
    """Bin trials based on grouping variables, returning a new data frame
    with binomial outcome columns (successes, N_trials, plus proportion correct)
    rather than each row being a single trial.
    This data format can significantly speed up model fitting.

    :param dat:
        a pandas dataframe containing the data. Must have
        grouping_variables columns and also a column corresponding to
        bernoulli outcomes (0, 1).
    :param y:
        A string naming the column of the dataframe corresponding to bernoulli
        trial outcome. Defaults to "correct".
    :param grouping_variables:
        a string or list of strings containing the column names
        to group over. Defaults to 'x'.
    :param ci:
        the percentage of the confidence intervals.
    :param rule_of_succession:
        if true, apply a rule-of-succession correction to the data by
        adding one success and one failure to the total number of trials.
        This is essentially a prior acknowledging the possibility of both
        successes and failures, and is used to correct for values with
        proportions of 0 or 1 (to e.g. allow estimation of beta errors).
    :returns:
        a new pandas dataframe where each row is a binned (binomial) observation.

    Example
    ----------
    res = binomial_binning(dat, ['subj', 'surround', 'scale'])


    """
    grouped = dat.groupby(grouping_variables, as_index=False)
    res = grouped[y].agg(n_successes='sum', n_trials='size')

    if rule_of_succession:
        res.loc[:, 'n_successes'] += 1
        res.loc[:, 'n_trials'] += 2

    # compute some additional values:
    res.loc[:, 'prop_corr'] = res.n_successes / res.n_trials

    # confidence intervals from a beta distribution:
    cis = beta.interval(ci, res.n_successes, (res.n_trials - res.n_successes))
    res.loc[:, 'ci_min'] = cis[0]
    res.loc[:, 'ci_max'] = cis[1]
    res.loc[:, 'error_min'] = np.abs(res['ci_min'].values -
                                     res['prop_corr'].values)
    res.loc[:, 'error_max'] = np.abs(res['ci_max'].values -
                                     res['prop_corr'].values)
    return res
Example #13
def qq_plot(output_dir, file_path=None, dataset=None):

    if dataset is None:
        dataset = pd.read_table(file_path)
    dataset = dataset[~dataset['pvalue'].isna()]
    dataset['-log10_pvalue'] = -np.log10(dataset['pvalue'])
    gene_size = dataset.shape[0]

    exp = np.concatenate([
        np.arange(100) / gene_size,
        np.logspace(-np.log10(gene_size) + 2, 0, 200)
    ])
    obs = mquantiles(dataset['pvalue'], prob=exp, alphap=0, betap=1)

    lower = list()
    upper = list()
    for i in range(0, len(exp)):
        CI_values = beta.interval(0.95, gene_size * exp[i],
                                  gene_size - gene_size * exp[i])
        lower.append(CI_values[0])
        upper.append(CI_values[1])

    exp = -np.log10(exp)
    obs = -np.log10(obs)
    up = -np.log10(lower)
    low = -np.log10(upper)

    plt.close()
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(111)
    ax.fill_between(exp, up, low, color='grey', alpha=0.5)
    ax.set_xlim(np.nanmin(exp[exp != -np.inf]),
                np.nanmax(exp[exp != np.inf]) + 0.1)
    ax.set_ylim(
        np.nanmin(obs[obs != -np.inf]),
        max(np.nanmax(obs[obs != np.inf]), np.nanmax(up[up != np.inf])) + 0.5)
    ax.plot(ax.get_xlim(), ax.get_xlim(), linestyle='--', color='black')
    ax.scatter(exp, obs, s=3, c=(31 / 255., 119 / 255., 180 / 255.))

    if file_path is not None:
        # check file names
        if re.search(r".+\/(.+).tsv", file_path) == None:
            file_path = './' + file_path
        filename = re.search(r".+\/(.+).tsv", file_path).group(1)
        title = filename.replace('_', ' ')
        title = title.replace('.', ' ')
    else:
        filename = 'null'
        title = 'Null'
    ax.set_title(title + ' QQ-Plot', fontweight='bold', fontsize=24, y=1.02)
    ax.set_xlabel('expected -log\u2081\u2080 pvalue', fontsize=22)
    ax.set_ylabel('observed -log\u2081\u2080 pvalue', fontsize=22)
    ax.tick_params(labelsize=12)

    fig.savefig(output_dir + '/' + filename + '_qq_plot.png')
Example #14
def historical_prob_winning(seed1, seed2):
    wins, losses = Seed2DDistribution.lookup(seed1, seed2)

    if not ((wins == 0) and (losses == 0)):
        mean = (wins + 1) / (wins + losses + 2.0)
        lower_bound, upper_bound = beta.interval(0.95, wins + 1, losses + 1)
        result = [seed1, seed2, mean, -lower_bound + mean, upper_bound - mean]
    else:
        result = None

    return result
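# Sketch of the same posterior summary with hypothetical win/loss counts
# (Seed2DDistribution.lookup is external, so its output is stubbed here):
from scipy.stats import beta

wins, losses = 23, 9
mean = (wins + 1) / (wins + losses + 2.0)            # Laplace-smoothed estimate
lo, hi = beta.interval(0.95, wins + 1, losses + 1)
print(mean, mean - lo, hi - mean)                    # center and error-bar arms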
Example #16
def test_multivariate_equal_intensities():
    N = 100
    false_positives = 0
    alpha = 0.95
    for i in range(100):
        T = np.random.exponential(10, size=300)
        g = np.random.binomial(2, 0.5, size=300)
        s, _, result = multivariate_logrank_test(T, g, alpha=alpha, suppress_print=True)
        false_positives += result is not None
    bounds = beta.interval(0.95, 1 + false_positives, N - false_positives + 1)
    assert bounds[0] < 1 - alpha < bounds[1]
Example #17
    def is_confident(self, pi, percentage):
        """ Checks whether a given sample is consistent with the distribution at a given significance level

        :param pi: observed sample
        :param percentage: significance level
        :return: bool
        """

        # Upper end of the central interval containing 'percentage' of the distribution
        upper_confidence_bound = beta.interval(percentage, self._alpha,
                                               self._beta)[1]
        return pi < upper_confidence_bound
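# The same bound check outside the class, with made-up shape parameters
# standing in for self._alpha and self._beta:
from scipy.stats import beta

upper = beta.interval(0.95, 8.0, 2.0)[1]    # upper end of the central 95% mass
print(0.7 < upper)                          # a sample at 0.7 is within the bound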
Example #18
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals, Volume, Y2length):  # modified to handle volume
    N_t, Volume_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain, Volume, Y2length)
    theta = []
    ci_theta = []
    print("this is d_t", d_t)
    for i, j in enumerate(d_t):
        if Volume_t[i] > 0:
            theta.append((N_t[i] + alpha) / (float(sum(N_t[k] + alpha for k in range(N_t.size))) * Volume_t[i]))
        else:
            theta.append(0.0)
        # And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
        if intervals:
            ci_theta.append(beta.interval(0.95, N_t[i, 1] + alpha[1], N_t[i, 0] + alpha[0]))
    return theta, ci_theta
Example #19
def plot2(_list, chr_brps, centromere_brps, line_names=None):

    if not line_names:
        line_names = range(1, _list.shape[0]+1)

    inflated_table = np.vstack([inflate_tags(x[0, :], 25) for x in np.split(_list, _list.shape[0])])

    gs = gridspec.GridSpec(4, 4)

    ax1 = plt.subplot(gs[:-1, :])
    plt.imshow(inflated_table, interpolation='nearest', cmap='coolwarm')
    show_breakpoints([0] + chr_brps + [_list.shape[1]], 'k')
    show_breakpoints(list(set(centromere_brps) - set(chr_brps)), 'g')

    ax2 = plt.subplot(gs[-1, :], sharex=ax1)
    red_run = np.nanmean((_list > 0).astype(float), 0)
    blue_run = np.nanmean((_list < 0).astype(float), 0)

    stack = np.hstack((blue_run, red_run))
    mean = np.mean(stack)
    std = np.std(stack)
    _alpha = ((1 - mean)/std**2 - 1/mean)*mean**2
    _beta = _alpha*(1/mean-1)
    r = beta.rvs(_alpha, _beta, size=1000)
    _min, _max = beta.interval(0.95, _alpha, _beta)

    plt.plot(blue_run, 'b')
    plt.plot(red_run, 'r')
    plt.axhline(y=_min, color='g')
    plt.axhline(y=_max, color='g')
    show_breakpoints([0] + chr_brps + [_list.shape[1]], 'k')
    show_breakpoints(list(set(centromere_brps) - set(chr_brps)), 'g')

    chr_arm_locations, chr_arm_names = sf.align_chromosome_edges(chr_brps, centromere_brps)

    ax1.set_xticks(chr_arm_locations)
    ax1.set_xticklabels(chr_arm_names, rotation='vertical')

    ax1.set_yticks(range(0, _list.shape[0]*25+1, 25))
    ax1.set_yticklabels(line_names)

    ax2.set_xticks(chr_arm_locations)
    ax2.set_xticklabels(chr_arm_names, rotation='vertical')
    plt.show()

    smooth_histogram(r, 'b')
    smooth_histogram(stack)
    plt.axvline(x=_max, color='g')
    plt.axvline(x=_min, color='g')
    plt.show()
Example #20
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals):
    N_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain)
    theta = []
    ci_theta = []
    for i, j in enumerate(d_t):
        # theta ~ Dirichlet(N[j,:] + alpha)
        # E[theta] = (N[j,:] + alpha)/float(sum(N[j,:] + alpha))
        # NOTE this result is only for binary classification
        # theta = p(y=1)
        theta.append((N_t[i, 1] + alpha[1]) / float(sum(N_t[i, :] + alpha)))
        # And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
        if intervals:
            ci_theta.append(beta.interval(0.95, N_t[i, 1] + alpha[1], N_t[i, 0] + alpha[0]))
    return theta, ci_theta
Example #23
def test_equal_intensity():
    """
    This is the (I think) fact that 1-alpha == false positive rate.
    I use a Bayesian test to test that we achieve this rate.
    """
    N = 100
    false_positives = 0
    alpha = 0.95
    for i in range(100):
        data1 = np.random.exponential(5, size=(200, 1))
        data2 = np.random.exponential(5, size=(200, 1))
        summary, p_value, result = logrank_test(data1, data2, alpha=alpha, suppress_print=True)
        false_positives += result is not None
    bounds = beta.interval(0.95, 1 + false_positives, N - false_positives + 1)
    assert bounds[0] < 1 - alpha < bounds[1]
Example #24
def _calc_beta_intervals(means, variances, prob=0.95):
    """Calculate confidence interval of beta distributions."""
    if not isinstance(means, np.ndarray):
        means = np.array(means)
    if not isinstance(variances, np.ndarray):
        variances = np.array(variances)
    with np.errstate(divide='ignore'):
        coef_a = ((means ** 2) * (1 - means) / variances) - means
        coef_b = (coef_a * (1 - means)) / means
        itl_lows, itl_his = beta.interval(prob, coef_a, coef_b)
        sds = np.sqrt(variances)
    for i in range(itl_lows.shape[0]):
        if not np.isfinite(sds[i]) or not np.isfinite(itl_lows[i]) or not np.isfinite(itl_his[i]):
            itl_lows[i] = means[i]
            itl_his[i] = means[i]
            sds[i] = 0
    return itl_lows, itl_his, sds
Example #25
def get_rule_rhs(Xtrain, Ytrain, d_t, alpha, intervals):
    '''Compute the posterior consequent distributions
    (i.e., the label distribution for the points captured by each rule).
    '''
    N_t = compute_rule_usage(d_t, d_t.index(0), Xtrain, Ytrain)
    theta = []  # P(Y=1)
    ci_theta = []  # confidence interval for Y=1
    for i, j in enumerate(d_t):
        # theta ~ Dirichlet(N[j,:] + alpha)
        # E[theta] = (N[j,:] + alpha)/float(sum(N[j,:] + alpha))
        # NOTE this result is only for binary classification
        # theta = p(y=1)
        theta.append((N_t[i, 1] + alpha[1]) / float(sum(N_t[i, :] + alpha)))
        # And now the 95% interval, for Beta(N[j,1] + alpha[1], N[j,0] + alpha[0])
        if intervals:
            ci_theta.append(
                beta.interval(0.95, N_t[i, 1] + alpha[1],
                              N_t[i, 0] + alpha[0]))
    return theta, ci_theta
Example #26
    def interval(self, alpha: float) -> Array:
        """ Calculates the endpoints of a confidence interval range using alpha
        
        Parameters
        ----------
        alpha: float
            Percent of the distribution to be contained within the returned interval.
            Must be between 0.0 and 1.0.
            
        Returns
        -------
        Array:
            Array containing the interval range, first element is the low end of
            the range, second element is the high end of the range.
        """

        interval = beta_dist.interval(alpha, self.alpha, self.beta)
        interval = np.array([(val * self.range) + self.a for val in interval])
        return interval
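# Hedged standalone version of the affine rescaling performed above, with
# made-up shape parameters and a support of [a, a + range]:
import numpy as np
from scipy.stats import beta as beta_dist

a, width = 2.0, 10.0                        # hypothetical self.a and self.range
lo, hi = beta_dist.interval(0.9, 3.0, 5.0)
print(np.array([lo, hi]) * width + a)       # interval mapped onto [2.0, 12.0]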
Example #27
def get_sex(sample, Nx, Na, Lx, La):
    Rx = float(Nx) / (Nx + Na)

    # Beta CI with non-informative prior, aka the Jeffreys interval.
    # See Brown, Cai, and DasGupta (2001). doi:10.1214/ss/1009213286
    Rx_CI = beta.interval(0.99, Nx + 0.5, Na + 0.5)

    # expected ratios from the chromosome lengths
    Elx_X0 = float(Lx) / (Lx + 2 * La)
    Elx_XX = float(Lx) / (Lx + La)

    #ll_x0 = beta.logpdf(Elx_X0, Nx+0.5, Na+0.5)
    #ll_xx = beta.logpdf(Elx_XX, Nx+0.5, Na+0.5)
    ll_x0 = binom.logpmf(Nx, Nx + Na, Elx_X0)
    ll_xx = binom.logpmf(Nx, Nx + Na, Elx_XX)

    # likelihood ratio test
    alpha = 0.001
    if chi2.sf(2 * (ll_x0 - ll_xx), 1) < alpha:
        sex = 'M'
    elif chi2.sf(2 * (ll_xx - ll_x0), 1) < alpha:
        sex = 'F'
    else:
        # indeterminate
        sex = 'U'

    if ll_x0 > ll_xx:
        Elx = 2 * Elx_X0
    else:
        Elx = Elx_XX

    Mx = Rx / Elx
    Mx_CI = [Rx_CI[0] / Elx, Rx_CI[1] / Elx]

    if Mx < 0.4 or Mx > 1.2:
        #print("Warning: {} has unexpected Mx={:g}".format(sample, Mx), file=sys.stderr)
        pass

    if Mx > 0.6 and Mx < 0.8:
        # suspicious sample, may be contaminated
        sex = 'U'

    return Elx, Mx, Mx_CI, sex
Example #28
def qq(data, ax, color):
    xmax = 0
    ymax = 0
    alpha = 0.9
    n_quantiles = 100

    q_pos = np.concatenate([
        np.arange(99.) / len(data),
        np.logspace(-np.log10(len(data)) + 2, 0, n_quantiles)
    ])

    q_data = mquantiles(data, prob=q_pos, alphap=0, betap=1, limit=(0, 1))
    q_th = q_pos.copy()
    q_err = np.zeros([len(q_pos), 2])
    for i in range(len(q_pos)):
        q_err[i, :] = beta.interval(
            alpha,
            len(data) * q_pos[i],
            len(data) - len(data) * q_pos[i])
        q_err[i, q_err[i, :] < 0] = 1e-15
    slope, intercept, r_value, p_value, std_err = linregress(q_th, q_data)
    xmax = np.max([xmax, -np.log10(q_th[1])])
    ymax = np.max([ymax, -np.log10(q_data[0])])

    ax.plot(-np.log10(q_th[n_quantiles - 1:]),
            -np.log10(q_data[n_quantiles - 1:]),
            '-',
            color=color)
    ax.plot(-np.log10(q_th[:n_quantiles]),
            -np.log10(q_data[:n_quantiles]),
            '.',
            color=color,
            label='gf')
    ax.plot([0, xmax], [0, xmax], '--', color='#f42e30')
    ax.fill_between(
        -np.log10(q_th),
        -np.log10(q_err[:, 0]),
        -np.log10(q_err[:, 1]),
        color=color,
        alpha=0.1,
    )
Example #29
def BetaModel(data):

    def fitted(x, a, b):
        fx = gammaf(a+b)/gammaf(a)/gammaf(b)*x**(a-1)*(1-x)**(b-1)  # pdf of beta
        return fx

    data1 = MaxMinNormalization(data)

    a, b, loc, scale = beta.fit(data1)

    plt.hist(data1, bins=30, density=True)
    xx = np.linspace(0, max(data1), len(data1))
    plt.plot(xx, fitted(xx, a, b), 'g')
    plt.show()

    alpha = 0.95
    q1, q2 = beta.interval(alpha, a, b, loc=0, scale=1)

    d1 = q1*(max(data)-min(data))+min(data)
    d2 = q2*(max(data)-min(data))+min(data)
    return a, b, d1, d2
Example #30
def parse_sextable(filename):
    sextable = []
    with open(filename) as f:
        next(f)  # skip header
        for line in f:
            line = line.rstrip()
            fields = line.split("\t")
            if len(fields) < 5:
                continue
            sample = fields[0]
            Mx = int(fields[1])
            Lx = int(fields[2])
            Ma = int(fields[3])
            La = int(fields[4])

            if Ma < 1000:
                # don't bother
                continue

            Rl = float(Lx) / (Lx + La)
            Rx = float(Mx) / (Mx + Ma)

            # Beta 95% CI with non-informative prior, aka the Jeffreys interval.
            # See Brown, Cai, and DasGupta (2001). doi:10.1214/ss/1009213286
            Rx_95CI = beta.interval(0.95, Mx + 0.5, Ma + 0.5)

            sfields = sample.split("_")
            if sfields[-1].upper() == 'INF':
                age = float('inf')
            else:
                try:
                    age = float(int(sfields[-1]))
                except ValueError:
                    age = None

            sextable.append((sample, Mx, Lx, Ma, La, Rl, Rx, Rx_95CI, age))
    return sextable
Example #31
# Compute 95% CI for a beta distribution
import superimport

from scipy.stats import beta
import numpy as np

np.random.seed(42)

N1 = 2
N0 = 8
N = N0 + N1  # Sufficient statistics
aprior = 1
bprior = 1  # prior
apost = aprior + N1
bpost = bprior + N0  # posterior

alpha = 0.05
CI1 = beta.interval(1 - alpha, apost, bpost)
print('{:0.2f}--{:0.2f}'.format(CI1[0], CI1[1]))  # (0.06:0.52)

l = beta.ppf(alpha / 2, apost, bpost)
u = beta.ppf(1 - alpha / 2, apost, bpost)
CI2 = (l, u)
print('{:0.2f}--{:0.2f}'.format(CI2[0], CI2[1]))  # (0.06:0.52)

samples = beta.rvs(apost, bpost, size=1000)
samples = np.sort(samples)
CI3 = np.percentile(samples, 100 * np.array([alpha / 2, 1 - alpha / 2]))
print('{:0.2f}--{:0.2f}'.format(CI3[0], CI3[1]))  # (0.06:0.51)
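# Note: all three approaches target the same central 95% posterior interval.
# beta.interval computes it exactly, the two ppf calls spell out the same
# quantiles, and the Monte Carlo percentile estimate converges to them as the
# number of samples grows.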
Example #32
    def get_beta_confidence_interval(self, conf=0.68):
        betaparams = beta.fit(self.data)
        return beta.interval(conf, *betaparams)
Example #33
def QQPlot(data_table, p_value_column=None, maf_column=None, freq_bins=None,
           n_quantiles=1000, error_ci=0.95, min_p=1e-30, hide_hla=False,
           error_type='experimental', lambda_gc_scale=None):
    f, axis = plt.subplots(1, 1,figsize=(8,8))
    axis.spines['right'].set_visible(False)
    axis.spines['top'].set_visible(False)
    if p_value_column is None:
        p_value_column = 'P'
    if maf_column is None:
        if 'MAF' in data_table.columns:
            maf_column = 'MAF'
        else:
            data_table['MAF'] = np.zeros(len(data_table)) * np.nan
            maf_column = 'MAF'

    if hide_hla:
        chr6 = data_table.loc[data_table.CHROM == 6]
        excluded = chr6.index[np.logical_and(chr6.POS >= 28477797, chr6.POS <= 33448354)]
        p_maf_table = data_table.drop(excluded)[[maf_column, p_value_column]]
    else:
        p_maf_table = data_table[[maf_column, p_value_column]]

    assert error_type in ['experimental','theoretical'],"Error type must be in ['experimental','theoretical']"



    min_vals_obs=[]
    min_vals_exp=[]
    if freq_bins is None:
        p_input= p_maf_table[p_value_column].values
        p_input[p_input<min_p]=min_p
        quantile_thresholds = np.concatenate([np.arange(1,np.floor(0.5*n_quantiles))/p_input.shape[0], np.logspace(np.log10(np.floor(0.5*n_quantiles)/p_input.shape[0]), 0, int(np.ceil(0.5*n_quantiles))+1)[:-1]])
        obs_quantiles = mquantiles(p_input, prob=quantile_thresholds, alphap=0.0, betap=1.0, limit=(0.0, 1.0))
        axis.plot(-np.log10(quantile_thresholds),-np.log10(obs_quantiles),'.',color=color_list[0],ms=15)
        if lambda_gc_scale is not None:
            axis.text(1,5,r'$\lambda_{IF}$'+'={0:1.2f}'.format(LambdaGC(p_input)[0])+' ('+r'$\lambda^{'+'{0:d}'.format(lambda_gc_scale)+'}_{IF}$'+'={0:1.3f}'.format(LambdaGC(p_input,scaling_factor=lambda_gc_scale)[0])+')',fontsize=24,fontweight='bold',color=color_list[0])
        else:
            axis.text(1,5,r'$\lambda_{IF}$'+'={0:1.2f}'.format(LambdaGC(p_input)[0]),fontsize=24,fontweight='bold',color=color_list[0])

        min_vals_obs+=[obs_quantiles.min()]
        min_vals_exp+=[quantile_thresholds.min()]
        if error_type=='experimental':
            ci_vecs = beta.interval(error_ci, len(p_maf_table)*quantile_thresholds, len(p_maf_table) - len(p_maf_table)*quantile_thresholds)


            axis.fill_between( -np.log10(quantile_thresholds), -np.log10(obs_quantiles/quantile_thresholds*ci_vecs[0]), -np.log10(obs_quantiles/quantile_thresholds*ci_vecs[1]), color=color_list[0], alpha=0.25, label='{0:2d}% CI'.format(int(100*error_ci)))


    else:
        for i in range(len(freq_bins)-2):
            p_input= p_maf_table[np.logical_and(p_maf_table[maf_column]>=freq_bins[i],p_maf_table[maf_column]<freq_bins[i+1])][p_value_column].values
            p_input[p_input<min_p]=min_p
            quantile_thresholds = np.concatenate([np.arange(1,np.floor(0.5*n_quantiles))/p_input.shape[0], np.logspace(np.log10(np.floor(0.5*n_quantiles)/p_input.shape[0]), 0, int(np.ceil(0.5*n_quantiles))+1)[:-1]])
            obs_quantiles = mquantiles(p_input, prob=quantile_thresholds, alphap=0.0, betap=1.0, limit=(0.0, 1.0))
            axis.plot(-np.log10(quantile_thresholds),-np.log10(obs_quantiles),'.',ms=15,color=color_list[(i*2)%len(color_list)],label=r'{0:.1e}$\leq$ MAF$<${1:.1e}'.format(freq_bins[i],freq_bins[i+1]))
            if error_type=='experimental':
                ci_vecs = beta.interval(error_ci, len(p_maf_table)*quantile_thresholds, len(p_maf_table) - len(p_maf_table)*quantile_thresholds)
                axis.fill_between( -np.log10(quantile_thresholds), -np.log10(obs_quantiles/quantile_thresholds*ci_vecs[0]), -np.log10(obs_quantiles/quantile_thresholds*ci_vecs[1]), color=color_list[(i*2)%len(color_list)], alpha=0.25, label='{0:2d}% CI'.format(int(100*error_ci)))

            if lambda_gc_scale is not None:
                axis.text(1,5-i,r'$\lambda_{IF}$'+'={0:1.2f}'.format(LambdaGC(p_input)[0])+' ('+r'$\lambda^{'+'{0:d}'.format(lambda_gc_scale)+'}_{IF}$'+'={0:1.3f}'.format(LambdaGC(p_input,scaling_factor=lambda_gc_scale)[0])+')',fontsize=24,fontweight='bold',color=color_list[i*2])
            else:
                axis.text(1,5-i,r'$\lambda_{IF}$'+'={0:1.2f}'.format(LambdaGC(p_input)[0]),fontsize=24,fontweight='bold',color=color_list[i*2])

            min_vals_obs+=[obs_quantiles.min()]
            min_vals_exp+=[quantile_thresholds.min()]

        i+=1
        p_input= p_maf_table[np.logical_and(p_maf_table[maf_column]>=freq_bins[i],p_maf_table[maf_column]<=freq_bins[i+1])][p_value_column].values
        p_input[p_input<min_p]=min_p
        quantile_thresholds = np.concatenate([np.arange(1,np.floor(0.5*n_quantiles))/p_input.shape[0], np.logspace(np.log10(np.floor(0.5*n_quantiles)/p_input.shape[0]), 0, int(np.ceil(0.5*n_quantiles))+1)[:-1]])
        obs_quantiles = mquantiles(p_input, prob=quantile_thresholds, alphap=0.0, betap=1.0, limit=(0.0, 1.0))
        axis.plot(-np.log10(quantile_thresholds),-np.log10(obs_quantiles),'o',color=color_list[(i*2)%len(color_list)],mew=0.0,label=r'{0:.1e}$\leq$ MAF$\leq${1:.1e}'.format(freq_bins[i],0.5))
        if error_type=='experimental':
            ci_vecs = beta.interval(error_ci, len(p_maf_table)*quantile_thresholds, len(p_maf_table) - len(p_maf_table)*quantile_thresholds)

            axis.fill_between( -np.log10(quantile_thresholds), -np.log10(obs_quantiles/quantile_thresholds*ci_vecs[0]), -np.log10(obs_quantiles/quantile_thresholds*ci_vecs[1]), color=color_list[(i*2)%len(color_list)], alpha=0.25, label='{0:2d}% CI'.format(int(100*error_ci)))

        if lambda_gc_scale is not None:
            axis.text(1,5-i,r'$\lambda_{IF}$'+'={0:1.2f}'.format(LambdaGC(p_input)[0])+' ('+r'$\lambda^{'+'{0:d}'.format(lambda_gc_scale)+'}_{IF}$'+'={0:1.3f}'.format(LambdaGC(p_input,scaling_factor=lambda_gc_scale)[0])+')',fontsize=24,fontweight='bold',color=color_list[i*2])
        else:
            axis.text(1,5-i,r'$\lambda_{IF}$'+'={0:1.2f}'.format(LambdaGC(p_input)[0]),fontsize=24,fontweight='bold',color=color_list[i*2])
        min_vals_obs+=[obs_quantiles.min()]
        min_vals_exp+=[quantile_thresholds.min()]



    axis.set_xlim(0.0,np.ceil(-np.log10(min(min_vals_exp))))



    exp_p_vals = np.linspace(0,axis.get_xlim()[1],100)
    if error_type=='theoretical':
        ci_vecs = beta.interval(error_ci, len(p_maf_table)*(10**(-1.0*exp_p_vals)), len(p_maf_table) - len(p_maf_table)*(10**(-1.0*exp_p_vals)))
        axis.fill_between(exp_p_vals, -np.log10(ci_vecs[0]), -np.log10(ci_vecs[1]), color=grey_color, alpha=0.25, label='{0:2d}% CI'.format(int(100*error_ci)))

    axis.plot(exp_p_vals,exp_p_vals,'--',color=red_color,lw=3.0)

    axis.set_ylim(0.0, np.ceil(-np.log10(min(min(min_vals_obs), ci_vecs[0].min()))) + 1)

    axis.legend(loc='upper left',frameon=False,fontsize=14)
    axis.set_xlabel(r'$\log_{10}$(P-Value)'+'\nExpected',fontsize=24)
    axis.set_ylabel(r'$\log_{10}$(P-Value)'+'\nObserved',fontsize=24)
    return f,axis
Example #34
        final_state = state[1::2]
        switched = np.logical_xor(init_state, final_state)

        count_mat = np.zeros((2, 2), dtype=int)

        count_mat[0,0] = np.sum(np.logical_and(init_state == 0, np.logical_not(switched) ))
        count_mat[0,1] = np.sum(np.logical_and(init_state == 0, switched ))
        count_mat[1,0] = np.sum(np.logical_and(init_state == 1, switched ))
        count_mat[1,1] = np.sum(np.logical_and(init_state == 1, np.logical_not(switched) ))

        counts.append(count_mat)

    plt.figure()
    mean_PtoAP = [beta.mean(1+c[0,1], 1+c[0,0]) for c in counts]
    mean_APtoP = [beta.mean(1+c[1,0], 1+c[1,1]) for c in counts]
    ci68_PtoAP = [beta.interval(0.68, 1+c[0,1], 1+c[0,0]) for c in counts]
    ci68_APtoP = [beta.interval(0.68, 1+c[1,0], 1+c[1,1]) for c in counts]
    ci95_PtoAP = [beta.interval(0.95, 1+c[0,1], 1+c[0,0]) for c in counts]
    ci95_APtoP = [beta.interval(0.95, 1+c[1,0], 1+c[1,1]) for c in counts]
    current_palette = sns.color_palette()
    plt.plot(fall_times, mean_PtoAP)
    plt.fill_between(fall_times, [ci[0] for ci in ci68_PtoAP], [ci[1] for ci in ci68_PtoAP], color=current_palette[0], alpha=0.2, edgecolor="none")
    plt.fill_between(fall_times, [ci[0] for ci in ci95_PtoAP], [ci[1] for ci in ci95_PtoAP], color=current_palette[0], alpha=0.2, edgecolor="none")
    plt.plot(fall_times, mean_APtoP)
    plt.fill_between(fall_times, [ci[0] for ci in ci68_APtoP], [ci[1] for ci in ci68_APtoP], color=current_palette[1], alpha=0.2, edgecolor="none")
    plt.fill_between(fall_times, [ci[0] for ci in ci95_APtoP], [ci[1] for ci in ci95_APtoP], color=current_palette[1], alpha=0.2, edgecolor="none")
    plt.xlabel("nTron Pulse Fall Time (s)")
    plt.ylabel("Switching Probability")
    plt.legend(("P->AP", "AP->P"))

    plt.show()
Example #35
def calculate_dirichlet_abundances(ts_tv_file, p_values_file,
                                   total_fastq_reads, sample_abundance):
    """
    Function that calculates the mean posterior abundances of species in metagenomic samples/libraries.
    """

    assert os.stat(
        ts_tv_file).st_size, f"The ts_tv count file is empty {ts_tv_file}"
    assert os.stat(
        p_values_file
    ).st_size, f"The chi-square p values file is empty {p_values_file}"
    assert os.stat(
        total_fastq_reads
    ).st_size, f"The total fastq reads file is empty {total_fastq_reads}"

    # I calculate the coverage, fraction of the covered genome and evenness of coverage of each taxon
    # from reads in its bam/pileup file. Let's go there
    cov_val = pd.read_csv(
        p_values_file,
        sep="\t",
        names=[
            "species", "ref_bases_cov", "total_bases_cov", "coverage",
            "fraction_ref_cov", "cov_evenness"
        ],
        usecols=["species", "coverage", "fraction_ref_cov", "cov_evenness"],
    )

    evenness_vector = (cov_val[["species", "cov_evenness"
                                ]].fillna(value=0).groupby("species").apply(
                                    hmean).astype("float64").rename("Taxon"))
    evenness_vector["Dark_Matter"] = np.nan
    evenness_vector["Grey_Matter"] = np.nan

    fraction_vector = (cov_val[["species", "fraction_ref_cov"
                                ]].fillna(value=0).groupby("species").apply(
                                    hmean).astype("float64").rename("Taxon"))
    fraction_vector["Dark_Matter"] = np.nan
    fraction_vector["Grey_Matter"] = np.nan

    coverage_vector = (cov_val[["species", "coverage"
                                ]].fillna(value=0).groupby("species").apply(
                                    hmean).astype("float64").rename("Taxon"))
    coverage_vector["Dark_Matter"] = np.nan
    coverage_vector["Grey_Matter"] = np.nan

    ts_tv_matrix = pd.read_csv(
        ts_tv_file,
        sep=",",
        usecols=["Taxon", "Read_ID", "Dirichlet_Assignment"])

    aln_reads_vector = (ts_tv_matrix[["Taxon", "Read_ID"]]
                        .groupby("Taxon")
                        .count()
                        .squeeze(axis=1)
                        .rename("Taxon"))
    aln_reads_vector["Dark_Matter"] = 0
    aln_reads_vector["Grey_Matter"] = 0

    # Sum the Dirichlet Assignments per taxon and calculate the Dark Matter reads from the Dirichlet Assignment column
    ts_tv_group = ts_tv_matrix.groupby("Read_ID").sum().squeeze(axis=1)
    grey_matter = ts_tv_group.where(ts_tv_group == 0).replace(0, 1).fillna(0)

    if len(ts_tv_matrix.Taxon.unique()) > 1:
        a = ts_tv_matrix.groupby("Taxon").sum().squeeze().astype(float)
    else:
        a = ts_tv_matrix.groupby("Taxon").sum().iloc[:, 0].astype(float)
    a.loc["Grey_Matter"] = grey_matter.sum()

    # Add the non aligned filtered reads count in the Dark Matter category
    total_fastq_reads = float(open(total_fastq_reads, "r").read())
    reads_in_bams = len(ts_tv_matrix["Read_ID"].unique())

    remaining_dark_matter = total_fastq_reads - reads_in_bams

    a.loc["Dark_Matter"] = remaining_dark_matter

    print(a, file=sys.stderr)

    # Perform Alberto's formulas
    b = a.sum()

    posterior_abundance_mean = a.add(1).divide(b + len(a)).sort_values(
        ascending=False)

    # Prepare the dataframe that will be output and calculate the rest of the output columns.
    posterior_abundance = posterior_abundance_mean.to_frame().reset_index()
    posterior_abundance.rename(
        columns={"Dirichlet_Assignment": "Mean_Posterior_Abundance"},
        inplace=True)
    posterior_abundance["95_CI_lower"] = np.nan
    posterior_abundance["95_CI_upper"] = np.nan
    posterior_abundance["Minimum_Read_Num"] = np.nan
    posterior_abundance["Maximum_Read_Num"] = np.nan
    posterior_abundance["Dirichlet_Read_Num"] = np.nan
    posterior_abundance["Evenness_of_Coverage_Ratio"] = np.nan
    posterior_abundance["Fraction_of_Genome_Covered"] = np.nan
    posterior_abundance["Coverage"] = np.nan
    posterior_abundance["Aligned_Read_Num"] = np.nan

    for idx, row in posterior_abundance.iterrows():
        ai = a.loc[posterior_abundance.iloc[idx, 0]]

        print(ai, file=sys.stderr)

        ci = beta.interval(0.95, ai + 1, b + len(a) - ai - 1)

        print(len(a), file=sys.stderr)

        posterior_abundance.iloc[idx, 2] = ci[0]
        posterior_abundance.iloc[idx, 3] = ci[1]
        posterior_abundance.iloc[idx, 4] = round(ci[0] * b)
        posterior_abundance.iloc[idx, 5] = round(ci[1] * b)
        posterior_abundance.iloc[idx, 6] = a.loc[posterior_abundance.iloc[idx,
                                                                          0]]
        posterior_abundance.iloc[idx, 7] = evenness_vector.loc[
            posterior_abundance.iloc[idx, 0]]
        posterior_abundance.iloc[idx, 8] = fraction_vector.loc[
            posterior_abundance.iloc[idx, 0]]
        posterior_abundance.iloc[idx, 9] = coverage_vector.loc[
            posterior_abundance.iloc[idx, 0]]
        posterior_abundance.iloc[idx, 10] = aln_reads_vector.loc[
            posterior_abundance.iloc[idx, 0]]

    with open(sample_abundance, "w") as output_handle:
        posterior_abundance.to_csv(path_or_buf=output_handle,
                                   sep="\t",
                                   index=False,
                                   header=True)
Example #36
from scipy.stats import beta
import numpy as np

S = 47
N = 100
a = S + 1
b = (N - S) + 1
alpha = 0.05

CI1 = beta.interval(1 - alpha, a, b)

l = beta.ppf(alpha / 2, a, b)
u = beta.ppf(1 - alpha / 2, a, b)
CI2 = (l, u)

samples = beta.rvs(a, b, size=1000)
samples = np.sort(samples)
CI3 = np.percentile(samples, 100 * np.array([alpha / 2, 1 - alpha / 2]))

print(CI1)
print(CI2)
print(CI3)
Example #37
import numpy as np
from scipy.stats import beta
from scipy.stats import gamma
from numpy.random import normal
from scipy.optimize import fsolve

rs = np.random.RandomState(12345)
n_sample = 10000

######################
# parameters of beta distributions representing the proportion of the population sexually 
# active, by sex and age group
######################
# men, 16-24
[alpha_m_16_24, beta_m_16_24] = fsolve(
    lambda x: np.array(beta.interval(0.95, x[0], x[1], loc=0, scale=1))
    - (0.8023836019, 0.843403825),
    [1,1]
    )
# women, 16-24
[alpha_f_16_24, beta_f_16_24] = fsolve(
    lambda x: np.array(beta.interval(0.95, x[0], x[1], loc=0, scale=1))
    - (0.7998634469, 0.837979601),
    [1,1]
    )

######################
# sexually-active population:
######################

# Population, testing and diagnosis data is from http://www.chlamydiascreening.nhs.uk/ps/data.asp (downloaded 17 April 2015).
Example #38
def histogram_pair(value_vec,
                   binary_vec,
                   bins,
                   smoothing_const=.01,
                   prior_prob=.5,
                   rel_risk=False,
                   error_bar_alpha=.05,
                   figsize=(12, 6),
                   **kwargs):
    """This is a tool to explore the relationship between a numerical feature and a 1/0 binary outcome.

    Author: Brian Lucena

    It plots two histograms: one is of the values of the feature when the binary outcome is positive (1)
    and the other when it is negative (0).

    It then gives the marginal empirical probability of being a 1 given that the numerical feature
    is in a particular value range.

    In practice, it often takes some experimentation to find the appropriate bin endpoints for a
    particular feature.

    If the data contains 'NaN' values, it will also draw two small horizontal (dotted and dashed)
    lines, indicating the probabilities given NaN and not NaN respectively.
    """
    nan_mask = np.isnan(value_vec)
    num_nans = np.sum(nan_mask)
    if num_nans > 0:
        nan_binary_vec = binary_vec[nan_mask]
        binary_vec = binary_vec[~nan_mask]
        value_vec = value_vec[~nan_mask]
        nan_avg_value = np.mean(nan_binary_vec)
        reg_avg_value = np.mean(binary_vec)
    # digitized_value_vec = np.digitize(value_vec, bins)
    # x_pts_to_graph = np.array([np.mean(value_vec[digitized_value_vec==i]) for i in np.unique(digitized_value_vec)])
    # print(x_pts_to_graph)
    out0 = plt.hist(value_vec[binary_vec == 0], bins=bins, **kwargs)
    out1 = plt.hist(value_vec[binary_vec == 1], bins=bins, **kwargs)
    plt.close()
    plt.figure(figsize=figsize)
    plt.subplot(2, 1, 1)
    plt.hist((value_vec[binary_vec == 0], value_vec[binary_vec == 1]),
             stacked=True,
             bins=bins,
             **kwargs)
    bin_leftpts = (out1[1])[:-1]
    bin_rightpts = (out1[1])[1:]
    default_bin_centers = (bin_leftpts + bin_rightpts) / 2
    digitized_value_vec = np.digitize(value_vec, bins)
    bin_centers = np.array([
        np.mean(value_vec[digitized_value_vec == i])
        if i in np.unique(digitized_value_vec) else default_bin_centers[i - 1]
        for i in np.arange(len(bins) - 1) + 1
    ])
    prob_numer = out1[0]
    prob_denom = out1[0] + out0[0]
    probs = (prob_numer + prior_prob * smoothing_const) / (prob_denom +
                                                           smoothing_const)
    # print(bin_centers)
    # print(probs)
    plt.subplot(2, 1, 2)
    if rel_risk:
        plt.plot(bin_centers, np.log10(probs / prior_prob))
        # plt.errorbar(bin_centers, probs, yerr=1.96 * probs * (1 - probs) / np.sqrt(prob_denom), capsize=3)
        plt.xlim(bin_leftpts[0], bin_rightpts[-1])
    else:
        plt.plot(bin_centers[:len(probs)], probs)
        plt.xlim(bin_leftpts[0], bin_rightpts[-1])
        yerr_mat_temp = beta.interval(1 - error_bar_alpha, out1[0] + 1,
                                      out0[0] + 1)
        yerr_mat = np.vstack((yerr_mat_temp[0], yerr_mat_temp[1])) - probs
        yerr_mat[0, :] = -yerr_mat[0, :]
        plt.errorbar(bin_centers[:len(probs)], probs, yerr=yerr_mat, capsize=5)
        plt.xlim(bin_leftpts[0], bin_rightpts[-1])
        if num_nans > 0:
            plt.hlines(y=nan_avg_value,
                       xmin=bin_leftpts[0],
                       xmax=bin_leftpts[1],
                       linestyle='dotted')
            plt.hlines(y=reg_avg_value,
                       xmin=bin_leftpts[0],
                       xmax=bin_leftpts[1],
                       linestyle='dashed')
    return {
        'bin_centers': bin_centers,
        'probs': probs,
        'prob_numer': prob_numer,
        'prob_denom': prob_denom
    }
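# Illustrative call on synthetic data (all names here are made up); assumes
# numpy as np, matplotlib.pyplot as plt, and scipy.stats.beta are imported.
rng = np.random.default_rng(0)
vals = rng.normal(size=500)
outcome = (rng.uniform(size=500) < 1 / (1 + np.exp(-vals))).astype(int)
summary = histogram_pair(vals, outcome, bins=np.linspace(-3, 3, 13))
print(summary['probs'])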
Example #39
        count_mat = np.zeros((2, 2), dtype=int)

        count_mat[0, 0] = np.sum(
            np.logical_and(init_state == 0, np.logical_not(switched)))
        count_mat[0, 1] = np.sum(np.logical_and(init_state == 0, switched))
        count_mat[1, 0] = np.sum(np.logical_and(init_state == 1, switched))
        count_mat[1, 1] = np.sum(
            np.logical_and(init_state == 1, np.logical_not(switched)))

        counts.append(count_mat)

    plt.figure()
    mean_PtoAP = [beta.mean(1 + c[0, 1], 1 + c[0, 0]) for c in counts]
    mean_APtoP = [beta.mean(1 + c[1, 0], 1 + c[1, 1]) for c in counts]
    ci68_PtoAP = [
        beta.interval(0.68, 1 + c[0, 1], 1 + c[0, 0]) for c in counts
    ]
    ci68_APtoP = [
        beta.interval(0.68, 1 + c[1, 0], 1 + c[1, 1]) for c in counts
    ]
    ci95_PtoAP = [
        beta.interval(0.95, 1 + c[0, 1], 1 + c[0, 0]) for c in counts
    ]
    ci95_APtoP = [
        beta.interval(0.95, 1 + c[1, 0], 1 + c[1, 1]) for c in counts
    ]
    current_palette = sns.color_palette()
    plt.plot(fall_times, mean_PtoAP)
    plt.fill_between(fall_times, [ci[0] for ci in ci68_PtoAP],
                     [ci[1] for ci in ci68_PtoAP],
                     color=current_palette[0],