Python ECDF Examples, statsmodels.distributions.ECDF Python Examples

Example #1

0

Show file

File: test_group_lasso.py Project: sophial05/selective-inference

def main(nsim=500, n=200, p=50, target='full', sigma=3):
    """Run ``test_group_lasso`` ``nsim`` times and monitor the pooled p-values.

    Every iteration pools the returned null p-values into ``P0`` and the
    alternative p-values into ``PA``, prints running summaries, and on every
    third iteration (after the first) redraws the empirical CDFs of both
    pools against the diagonal and saves them to ``plot.pdf``.

    Parameters
    ----------
    nsim : int
        Number of simulated instances.
    n, p, target, sigma
        Forwarded unchanged to ``test_group_lasso``.
    """
    import matplotlib.pyplot as plt
    P0, PA = [], []
    from statsmodels.distributions import ECDF

    for i in range(nsim):
        try:
            p0, pA = test_group_lasso(n=n, p=p, target=target, sigma=sigma)
        except Exception:
            # A failed instance contributes nothing.  Without this reset the
            # names are unbound when the very first instance fails, and
            # otherwise still hold the previous iteration's p-values, which
            # would then be pooled a second time.
            p0, pA = [], []
        print(len(p0), len(pA))
        P0.extend(p0)
        PA.extend(pA)

        P0_clean = np.array(P0)

        # Null p-values at or below 1e-5 are excluded here; their share is
        # the last number printed below.
        P0_clean = P0_clean[P0_clean > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(np.array(PA) < 0.05),
              np.sum(np.array(PA) < 0.05) / (i + 1),
              np.mean(np.array(P0) < 0.05), np.mean(P0_clean < 0.05),
              np.mean(np.array(P0) < 1e-5), 'null pvalue + power + failure')

        if i % 3 == 0 and i > 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0_clean) > 0:
                plt.plot(U, ECDF(P0_clean)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()

Example #2

0

Show file

def main():
    """Overlay ECDFs of the stored 'known' p-values on the two ``marginal`` figures.

    Loads ``pval_20.npz``, draws the ECDF of all ``'known'`` p-values on the
    first figure and of the subset indexed by ``'hypotheses'`` on the second,
    then saves both figures as PDFs.
    """
    fig1, fig2, _ = marginal(20, 3., 3, nsim=1000)
    full = np.load('pval_20.npz')
    Ugrid = np.linspace(0, 1, 101)

    # Both overlays share one appearance and legend entry.
    overlay_style = dict(label=r'Selected using $i^*(Z)$',
                         c='green',
                         linewidth=5,
                         alpha=0.5)

    marginal_axes = fig1.gca()
    marginal_axes.plot(Ugrid, ECDF(full['known'])(Ugrid), **overlay_style)
    marginal_axes.legend(loc='lower right')

    conditional_axes = fig2.gca()
    selected = full['known'][full['hypotheses']]
    conditional_axes.plot(Ugrid, ECDF(selected)(Ugrid), **overlay_style)
    conditional_axes.legend(loc='lower right')

    for figure, target in ((fig1, 'splitting_marginal_1sparse.pdf'),
                           (fig2, 'splitting_conditional_1sparse.pdf')):
        figure.savefig(target)

Example #3

0

Show file

File: data_plotter.py Project: erssebaggala/nemo-traces-analyzer

def plot_ecdf_pair(data_0, data_1, x, label_0, label_1, unit):
    """Plot the ECDFs of two data sets on the current axes.

    Each curve is evaluated on ``x`` and its legend entry reports the sample
    median.  A dashed horizontal line marks the 0.5 level.

    Parameters
    ----------
    data_0, data_1
        Objects exposing a ``.values`` array (the samples).
    x
        Points at which the ECDFs are evaluated and plotted.
    label_0, label_1 : str
        Legend names; also combined into the x-axis label.
    unit
        Unit text appended to the medians and the x-axis label.
    """
    series = ((data_0, label_0, 'm'), (data_1, label_1, 'Orange'))
    for samples, name, colour in series:
        curve = ECDF(samples.values)
        centre = np.median(samples.values)
        legend_text = name + ': median {:.1f} {}'.format(centre, unit)
        plt.plot(x, curve(x), lw=2.0, c=colour, label=legend_text)

    # Reference line at F(x) = 0.5.
    half_level = 0.5 * np.ones(len(x))
    plt.plot(x, half_level, lw=2.0, ls='--', c='b', alpha=.3)

    plt.grid()
    plt.tick_params(axis='both', which='major')
    axis_text = label_0 + '/' + label_1 + ' [{}]'.format(unit)
    plt.xlabel(axis_text)
    plt.ylabel('ECDF')
    plt.ylim([0, 1.05])
    plt.legend(loc='upper left')
    plt.tight_layout()

Example #4

0

Show file

File: test_multiple_queries.py Project: abyanka/selective-inference

def main(nsim=500, n=500, p=100, sigma=3):
    """Run ``test_multiple_queries`` ``nsim`` times and monitor the pooled p-values.

    Every iteration pools the returned null (``P0``) and alternative (``PA``)
    p-values and prints running summaries.  On every third iteration (after
    the first) the empirical CDFs of both pools are redrawn against the
    diagonal and saved to ``plot.pdf``.

    Parameters
    ----------
    nsim : int
        Number of simulated instances.
    n, p, sigma
        Forwarded unchanged to ``test_multiple_queries``.
    """
    P0, PA = [], []
    from statsmodels.distributions import ECDF
    import matplotlib.pyplot as plt

    for i in range(nsim):
        # The former ``if True: ... else: p0, pA = [], []`` wrapper had an
        # unreachable else branch; the call is made unconditionally.
        p0, pA = test_multiple_queries(n=n, p=p, sigma=sigma)
        P0.extend(p0)
        PA.extend(pA)

        P0_clean = np.array(P0)

        # Null p-values at or below 1e-5 are excluded here; their share is
        # the last number printed below.
        P0_clean = P0_clean[P0_clean > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(np.array(PA) < 0.05), np.mean(np.array(P0) < 0.05),
              np.mean(P0_clean < 0.05), np.mean(np.array(P0) < 1e-5))

        if i % 3 == 0 and i > 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0_clean) > 0:
                plt.plot(U, ECDF(P0_clean)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()

Example #5

0

Show file

def median_summ_conn_pass_thresh(inSum,outSum,dmsoSum,matrixType,rnkpt_thresh=90,graph=True):
    """Count connections above a rank-point threshold and summarise them.

    For each column of ``inSum``, ``outSum`` and ``dmsoSum`` the number of
    entries greater than ``rnkpt_thresh`` is counted.  The ``inSum`` and
    ``outSum`` counts are then grouped by the ``pert_id`` index level and
    reduced to a median per group.

    Parameters
    ----------
    inSum, outSum, dmsoSum : pandas.DataFrame
        Matrices whose columns are counted; the column index of ``inSum``
        and ``outSum`` must expose a ``pert_id`` level.
    matrixType : str
        Text used in the axis labels of the graphs.
    rnkpt_thresh : number
        Threshold an entry must exceed to be counted.
    graph : bool
        When true, write a histogram and a CDF plot as PNG files into the
        module-level ``wkdir`` directory.

    Returns
    -------
    (dosMedConnect, dmsoSer)
        Median count per ``pert_id`` for ``inSum`` and the per-column counts
        for ``dmsoSum``.
    """
    passMask = np.zeros_like(inSum.values)
    passMask[np.where(inSum.values > rnkpt_thresh)] = 1
    # count connections passed theshold
    passSum = np.sum(passMask,axis=0)
    passSer = pd.Series(data=passSum,index=inSum.columns)
    passSer.name = 'number_of_connections_pass_' + str(rnkpt_thresh) + '_rnkpt'
    passGrped = passSer.groupby(level='pert_id')
    dosMedConnect = passGrped.median()
    dosMedConnect.name = 'median_number_of_connections_above_' + str(rnkpt_thresh) + '_rnkpt'
    # repeat calculation for DMSOs
    passMaskDMSO = np.zeros_like(dmsoSum.values)
    passMaskDMSO[np.where(dmsoSum.values > rnkpt_thresh)] = 1
    passSumDMSO = np.sum(passMaskDMSO,axis=0)
    dmsoSer = pd.Series(data=passSumDMSO,index=dmsoSum.columns)
    dmsoSer.name = 'number_of_connections_above_' + str(rnkpt_thresh) + '_rnkpt'
    # repeat calculation for non-dos compounds
    passMaskNon = np.zeros_like(outSum.values)
    passMaskNon[np.where(outSum.values > rnkpt_thresh)] = 1
    passSumNon = np.sum(passMaskNon,axis=0)
    nonSer = pd.Series(data=passSumNon,index=outSum.columns)
    nonSer.name = 'number_of_connections_pass_' + str(rnkpt_thresh) + '_rnkpt'
    nonSer.index.name = 'sig_id'
    nonGrped = nonSer.groupby(level='pert_id')
    nonMedConnect = nonGrped.median()
    if graph:
        # Shared plotting range across all three groups.  The upper bound
        # previously took np.min of the DMSO counts, which could clip the
        # range below the largest DMSO value.
        min1 = np.min([np.min(passSer.values),np.min(passSumNon),np.min(dmsoSer.values)])
        max1 = np.max([np.max(passSer.values),np.max(passSumNon),np.max(dmsoSer.values)])
        # NOTE(review): ``normed=`` is rejected by newer Matplotlib releases
        # in favour of ``density=``; left unchanged to match the Matplotlib
        # version this file targets - confirm before upgrading.
        plt.hist(dmsoSer,30,color='b',range=[min1,max1],label=['DMSO n=' + str(len(dmsoSer))],alpha=.4,normed=True)
        # h2 = plt.hist(nonMedConnect,30,color='g',range=[min1,max1],label=['non_DOS n=' + str(len(nonMedConnect))],alpha=.4,normed=True)
        plt.hist(dosMedConnect,30,color='r',range=[min1,max1],label=['DOS n=' + str(len(dosMedConnect))],alpha=.3,normed=True)
        plt.legend()
        plt.ylabel('normed freq',fontweight='bold')
        plt.xlabel('median counts ('+ matrixType + ' > ' + str(rnkpt_thresh) + ')',fontweight='bold')
        plt.title('median connections (compounds collapsed by pert_id) - pass rnkpt ' + str(rnkpt_thresh))
        outF = os.path.join(wkdir, 'median_summly_counts_pass_threshold.png')
        plt.savefig(outF, bbox_inches='tight',dpi=200)
        plt.close()
        ### make cdf graph ####
        vals = np.linspace(min1,max1,100)
        dosEcdf = ECDF(dosMedConnect)
        dmsoEcdf = ECDF(dmsoSer)
        # The non-DOS curve must come from the non-DOS medians; it was
        # previously built from dosMedConnect and so duplicated the DOS curve.
        nonEcdf = ECDF(nonMedConnect)
        obsDos = dosEcdf(vals)
        obsDmso = dmsoEcdf(vals)
        obsNon = nonEcdf(vals)
        plt.plot(vals,obsDos,color='b',label=['DOS n=' + str(len(dosMedConnect))])
        plt.plot(vals,obsNon,color='g',label=['non_DOS n=' + str(len(nonMedConnect))])
        plt.plot(vals,obsDmso,color='r',label=['DMSO n=' + str(len(dmsoSer))])
        # plt.legend()
        plt.ylabel('F(x)',fontweight='bold')
        plt.xlabel('median counts ('+ matrixType + ' > ' + str(rnkpt_thresh) + ')',fontweight='bold')
        # plt.title('median connections pass rnkpt ' + str(rnkpt_thresh))
        outF = os.path.join(wkdir, 'median_summly_counts_cdf.png')
        plt.savefig(outF, bbox_inches='tight',dpi=200)
        plt.close()
    return dosMedConnect, dmsoSer

Example #6

0

Show file

File: data_plotter.py Project: erssebaggala/nemo-traces-analyzer

def plot_ecdf_triplet(data_0,
                      data_1,
                      data_2,
                      x,
                      label_0=None,
                      label_1=None,
                      label_2=None,
                      unit=None,
                      plot_info=True):
    """Plot the ECDFs of three data sets on the current axes.

    Each curve is evaluated on ``x`` and its legend entry reports the sample
    median.  A dashed horizontal line marks the 0.5 level.

    Parameters
    ----------
    data_0, data_1, data_2
        Objects exposing a ``.values`` array (the samples).
    x
        Points at which the ECDFs are evaluated and plotted.
    label_0, label_1, label_2 : str or None
        Legend names.  A ``None`` label is omitted from the legend entry and
        from the x-axis label instead of raising ``TypeError`` on string
        concatenation, which is what the declared ``None`` defaults used to
        do.
    unit
        Unit text appended to the medians and the x-axis label.
    plot_info : bool
        When true, also set the axis labels, y-limits and legend.
    """

    def _legend_text(name, centre):
        # Same text as before for a real label; just the summary for None.
        summary = 'median {:.1f} {}'.format(centre, unit)
        return summary if name is None else name + ': ' + summary

    series = ((data_0, label_0, 'm'),
              (data_1, label_1, 'Blue'),
              (data_2, label_2, 'Orange'))
    for samples, name, colour in series:
        ecdf = ECDF(samples.values)
        median = np.median(samples.values)
        plt.plot(x,
                 ecdf(x),
                 lw=2.0,
                 c=colour,
                 label=_legend_text(name, median))

    # Reference line at F(x) = 0.5.
    plt.plot(x, 0.5 * np.ones(len(x)), lw=2.0, ls='--', c='b', alpha=.3)

    plt.grid()
    plt.tick_params(axis='both', which='major')
    if plot_info:
        names = [name for name in (label_0, label_1, label_2)
                 if name is not None]
        if names:
            plt.xlabel('/'.join(names) + ' [{}]'.format(unit))
        else:
            plt.xlabel('[{}]'.format(unit))
        plt.ylabel('ECDF')
        plt.ylim([0, 1.05])
        plt.legend(loc='upper left')
    plt.tight_layout()

Example #7

0

Show file

File: test_lasso.py Project: snigdhagit/selective-inference

def main(nsim=500, n=500, p=200, sqrt=False, target='full', sigma=3, AR=True):
    """Run one of three lasso tests ``nsim`` times and monitor the pooled p-values.

    The test is chosen per call: ``test_sqrt_highdim_lasso`` when ``sqrt`` is
    true, otherwise ``test_AR_randomization`` when ``AR`` is true, otherwise
    ``test_highdim_lasso``.  Null (``P0``) and alternative (``PA``) p-values
    are pooled across iterations, running summaries are printed, and on every
    third iteration (after the first) the empirical CDFs are redrawn against
    the diagonal and saved to ``plot.pdf``.

    Parameters
    ----------
    nsim : int
        Number of simulated instances.
    n, p, target
        Forwarded to the selected test.
    sigma
        Forwarded to the two non-sqrt tests; the sqrt test is called without
        it.
    sqrt, AR : bool
        Select which test function is run (see above).
    """
    import matplotlib.pyplot as plt
    P0, PA = [], []
    from statsmodels.distributions import ECDF

    for i in range(nsim):
        # The former ``if True: ... else: p0, pA = [], []`` wrapper had an
        # unreachable else branch; only the live dispatch remains.
        if sqrt:
            p0, pA = test_sqrt_highdim_lasso(n=n,
                                             p=p,
                                             target=target,
                                             compare_to_lasso=False)
        elif AR:
            p0, pA = test_AR_randomization(n=n,
                                           p=p,
                                           target=target,
                                           sigma=sigma)
        else:
            p0, pA = test_highdim_lasso(n=n,
                                        p=p,
                                        target=target,
                                        sigma=sigma)
        print(len(p0), len(pA))
        P0.extend(p0)
        PA.extend(pA)

        P0_clean = np.array(P0)

        # Null p-values at or below 1e-5 are excluded here; their share is
        # the last number printed below.
        P0_clean = P0_clean[P0_clean > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(np.array(PA) < 0.05),
              np.sum(np.array(PA) < 0.05) / (i + 1),
              np.mean(np.array(P0) < 0.05), np.mean(P0_clean < 0.05),
              np.mean(np.array(P0) < 1e-5), 'null pvalue + power + failure')

        if i % 3 == 0 and i > 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0_clean) > 0:
                plt.plot(U, ECDF(P0_clean)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()

Example #8

0

Show file

def cdf(path):
    """Plot empirical and binomial CDFs of the coverage counts stored at ``path``.

    For every target confidence level in ``params["cl"]`` the rows of
    ``vals`` whose second-to-last column equals that level are selected, and
    the ECDF of their last column is drawn together with the
    ``Binomial(ntest, cl)`` CDF in the same colour and line style.
    """
    params, colnames, vals = load_coverage_file(path)
    from matplotlib import pyplot as plt
    from scipy.stats import binom
    from statsmodels.distributions import ECDF

    # Common evaluation grid, from the smallest relevant count up to ntest.
    n_MC = params["ntest"]
    lowest = min(n_MC * min(params["cl"]), vals[:, -1].min())
    grid = np.arange(lowest, n_MC + 1)

    plt.figure()
    plt.title("coverage histogram {}".format(path))
    styles = cycle(product(("solid", "dashed", "dashdot", "dotted"), "rgbmk"))
    for targ_cl, (ls, col) in zip(params["cl"], styles):
        rows = vals[:, -2] == targ_cl
        observed = ECDF(vals[rows, -1])
        shared = dict(color=col, linestyle=ls)
        plt.plot(grid,
                 observed(grid),
                 linewidth=3,
                 label="{0:.5f}".format(targ_cl),
                 **shared)
        plt.plot(grid,
                 binom.cdf(grid, n_MC, targ_cl),
                 marker="o",
                 mew=0,
                 **shared)

    plt.legend(loc="best", title="target CL")
    plt.xlabel("#(covered)")
    plt.ylabel("cumulative frequency (CDF)")
    plt.show()

Example #9

0

Show file

File: methodDisc.py Project: LabUfjf/DiscretizationProcess3

def CDFm(data,
         nPoint,
         dist='normal',
         mu=0,
         sigma=1,
         analitica=False,
         lim=None):
    """Return ``nPoint`` abscissae equally spaced in cumulative probability.

    The probabilities run from ``eps`` to ``1 - eps`` (``eps = 5e-5``) and
    are mapped back to x either analytically or through the empirical CDF
    of ``data``.

    Parameters
    ----------
    data : array-like
        Samples; only used when ``analitica`` is false.
    nPoint : int
        Number of points to return.
    dist : {'normal', 'lognormal'}
        Distribution used when ``analitica`` is true.
    mu, sigma : float
        Distribution parameters used when ``analitica`` is true.
    analitica : bool
        Use the distribution's ``ppf`` instead of the empirical CDF.
    lim : sequence of two floats or None
        Range over which the empirical CDF is tabulated.  ``None`` now falls
        back to ``(min(data), max(data))``; previously it raised
        ``TypeError`` when indexed.

    Returns
    -------
    numpy.ndarray
        The ``nPoint`` abscissae.

    Raises
    ------
    ValueError
        If ``analitica`` is true and ``dist`` is not a supported name
        (previously an ``UnboundLocalError`` at the ``return``).
    """
    import numpy as np
    from scipy.stats import norm, lognorm

    eps = 5e-5
    y = np.linspace(eps, 1 - eps, nPoint)

    if analitica:
        if dist == 'normal':
            return norm.ppf(y, loc=mu, scale=sigma)
        if dist == 'lognormal':
            return lognorm.ppf(y, sigma, loc=0, scale=np.exp(mu))
        raise ValueError(
            "dist must be 'normal' or 'lognormal', got {!r}".format(dist))

    # Imported here so the analytic path does not require statsmodels.
    from scipy.interpolate import interp1d
    from statsmodels.distributions import ECDF

    if lim is None:
        lim = (np.min(data), np.max(data))

    ecdf = ECDF(data)
    xest = np.linspace(lim[0], lim[1], int(100e3))
    yest = ecdf(xest)
    # Invert the tabulated ECDF: look up x as a function of probability.
    interp = interp1d(yest, xest, fill_value='extrapolate', kind='nearest')
    return interp(y)

Example #10

0

Show file

File: ldpe.py Project: alexazhu/hdpy

    def pval_adjust_WY(self, cov, pvals, N=10000):
        """
        Purpose:
        multiple testing correction with a Westfall young-like procedure as
        in ridge projection method, http://arxiv.org/abs/1202.1377 P.Buehlmann
        ======================================================================
        :param cov: covariance matrix of your estimator
        :param pvals: single testing pvalues
        :param N: the number of samples to take for the empirical distribution
        :return pcorr: corrected p-values, i.e. the empirical CDF of the
                       simulated minimum p-values evaluated at ``pvals``
        ======================================================================
        Author: Ziyan Zhu, Date: April 10th, 2019
        Following R version by Ruben Dezeure, Date: 6 Feb 2014, 14:27
        """
        ncol = cov.shape[1]
        # zz has one simulated draw per row: shape (N, ncol).
        zz = np.random.multivariate_normal(mean=np.zeros(ncol),
                                           cov=cov,
                                           size=N)
        # Standardise each coordinate by its standard deviation.
        zz2 = zz / np.sqrt(np.diagonal(cov))
        # Two-sided p-value of every coordinate in every draw.
        gz = 2 * norm.sf(abs(zz2))
        # Minimum p-value across coordinates within each draw, giving N
        # values.  axis=0 would instead take the minimum over the draws and
        # return only ncol values, which is not the null distribution of the
        # minimum p-value.
        GZ = np.min(gz, axis=1)

        ecdf = ECDF(GZ)
        pcorr = ecdf(pvals)
        return pcorr

Example #11

0

Show file

File: test_lasso_iv.py Project: snigdhagit/selective-inference

def main(nsim=500):
    """Run ``test_lasso_iv_instance`` ``nsim`` times and plot the pooled p-value ECDF.

    P-values from all successful instances are pooled into ``P0``; their
    mean, standard deviation and the fraction below 0.05 are printed, and
    their empirical CDF is shown against the diagonal.

    Parameters
    ----------
    nsim : int
        Number of simulated instances.
    """
    P0 = []
    from statsmodels.distributions import ECDF

    n, p, s = 1000, 10, 3
    Sigma_12 = 0.8
    gsnr = 1.
    beta_star = 1.

    for i in range(nsim):
        try:
            p0 = test_lasso_iv_instance(n=n, p=p, s=s, Sigma_12=Sigma_12, gsnr=gsnr, beta_star=beta_star)
        except Exception:
            # Best effort: a failed instance contributes no p-values.  Only
            # ordinary errors are caught, so Ctrl-C and SystemExit still
            # stop the simulation instead of being swallowed.
            p0 = []
        P0.extend(p0)

    print(np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.05))

    U = np.linspace(0, 1, 101)
    #plt.clf()
    plt.plot(U, ECDF(P0)(U))
    plt.plot(U, U, 'r--')
    #plt.savefig("plot.pdf")
    plt.show()

Example #12

0

Show file

def L(muestra, alpha):
    """Return the ECDF heights of ``muestra`` lowered by ``epsilon`` and floored at 0.

    ``epsilon`` is ``sqrt(log(2 / alpha) / (2 * n))`` with
    ``n = len(muestra)``.  The result has one entry per element of
    ``ECDF(muestra).y``.
    """
    sample_size = len(muestra)
    epsilon = sqrt(log(2. / alpha) / (2 * sample_size))
    heights = ECDF(muestra).y
    out = zeros(len(heights))
    for pos, height in enumerate(heights):
        # Shift each step down by epsilon without going below zero.
        out[pos] = max(height - epsilon, 0)
    return out

Example #13

0

Show file

	def passive_aggressive_train(self):
		'''Trains passive aggressive classifier

		Fits the classifier on the term-document matrix and installs
		``self._proba``, a function mapping a signed distance from the
		hyperplane to a score in [0, 1]: positive distances map into
		[0.5, 1] through the ECDF of the non-negative training distances,
		negative distances map into [0, 0.5] through the ECDF of the
		non-positive training distances, and zero maps to 0.5.

		Returns ``self``.
		'''
		self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
		self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
		y_dist = self._clf.decision_function(self._term_doc_matrix._X)
		pos_ecdf = ECDF(y_dist[y_dist >= 0])
		neg_ecdf = ECDF(y_dist[y_dist <= 0])

		def proba_function(distance_from_hyperplane):
			if distance_from_hyperplane > 0:
				return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
			elif distance_from_hyperplane < 0:
				# Use the ECDF of the negative side.  pos_ecdf is built from
				# values >= 0 only, so it evaluates to 0 for every negative
				# distance and collapsed this branch to a constant 0.
				return neg_ecdf(distance_from_hyperplane) / 2.
			return 0.5

		self._proba = proba_function
		return self

Example #14

0

Show file

File: encoders.py Project: zqifdu/kaggletils

    def fit(self, x):
        """Build one ECDF per column of ``x`` and store them in ``self.ecdfs``.

        One-dimensional input is reshaped to a single column.  Columns are
        read by slicing when ``is_numpy(x)`` is true and through
        ``.iloc[...].values`` otherwise.  ``self.ecdfs`` maps the column
        position to its ECDF.  Returns ``self``.
        """
        self.ecdfs = {}
        if len(x.shape) == 1:
            x = x.reshape(-1, 1)
        width = x.shape[1]
        numpy_input = is_numpy(x)

        for col in range(width):
            if numpy_input:
                column = x[:, col]
            else:
                column = x.iloc[:, col].values
            self.ecdfs[col] = ECDF(column)
        return self

Example #15

0

Show file

File: utils.py Project: nnoll/basel_carbapenemases

def plotCDF(data, logScale=False):
    """Show the empirical CDF of ``data`` in a new figure.

    Parameters
    ----------
    data
        Samples passed to ``ECDF``.
    logScale : bool
        When true, the x-axis is drawn on a logarithmic scale.
    """
    from statsmodels.distributions.empirical_distribution import ECDF
    import matplotlib.pylab as plt

    empirical = ECDF(data)
    plt.figure()
    # Plot the step coordinates stored on the ECDF object directly.
    plt.plot(empirical.x, empirical.y, linewidth=2)
    plt.ylabel('CDF')
    if logScale:
        plt.xscale('log')
    plt.show()

Example #16

0

Show file

def cdf_dphase(delay, freq, title='Histogram of phase lag', xscale='f'):
    """
    Plot cdf of delay for given frequencies.

    Params:
    --------
    delay (ndarray)
        (n_freq,n_samples) Phase distance between two trajectories.
    freq (ndarray)
        Frequencies that are given.
    title (str)
    xscale (str)
        'f' means frequency scale and 't' means time scale

    Returns:
    --------
    fig, ax
        The created figure and axes.
    """
    from misc.plot import set_ticks_radian, colorcycle
    from statsmodels.distributions import ECDF

    fig, ax = plt.subplots(figsize=(7, 4))
    c = colorcycle(len(freq))
    for freqix, f in enumerate(freq):
        ecdf = ECDF(delay[freqix])
        if xscale == 't':
            # Convert phase (radians) to time by dividing by 2*pi*f.
            ax.plot(ecdf.x / (2 * np.pi) / f,
                    ecdf.y,
                    '-',
                    alpha=1,
                    c=next(c),
                    lw=2)
        else:
            ax.plot(ecdf.x, ecdf.y, '-', alpha=1, c=next(c), lw=2)

    if xscale == 't':
        xlim = [-1 / freq[0], 1 / freq[0]]
        xticks = np.arange(*xlim)
    else:
        xlim = [-np.pi, np.pi]
        # Symmetric ticks; the list previously repeated +pi/2 where -pi/2
        # belongs.
        xticks = [-np.pi, -np.pi / 2, 0, np.pi / 2, np.pi]
    ax.set(xlim=xlim,
           xticks=xticks,
           xlabel='Phase lag',
           ylabel='CDF',
           title=title)
    set_ticks_radian(ax, axis='x')
    ax.legend(['%1.1f Hz' % f for f in freq],
              numpoints=1,
              title='Frequency',
              fontsize='small',
              bbox_to_anchor=[1.4, 1.03],
              labelspacing=.1)
    ax.grid()
    return fig, ax

Example #17

0

Show file

File: utils.py Project: nnoll/basel_carbapenemases

def plotDistributions(data, title, lineC, f=None, ax=None):
    """
    Draw the empirical CDFs of the two rows of ``data`` on two stacked axes.

    Row 0 goes to ``ax[0]`` ('Read 0') and row 1 to ``ax[1]`` ('Read 1'),
    both as step curves in colour ``lineC``.  When ``f`` is None a new
    figure with two axes sharing x and y is created; otherwise the supplied
    ``f`` and ``ax`` are drawn on.  ``title`` is accepted but not used by
    this function; the figure title is fixed.

    Returns the figure and the axes.
    """

    from statsmodels.distributions import ECDF
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pylab as plt

    if f is None:
        f, ax = plt.subplots(2, sharey=True, sharex=True)

    curves = [ECDF(data[row, :]) for row in (0, 1)]
    for panel, curve in enumerate(curves):
        ax[panel].step(curve.x, curve.y, color=lineC, alpha=.5)
    for panel, heading in enumerate(('Read 0', 'Read 1')):
        ax[panel].set_title(heading)

    f.suptitle('Cumulative Distributions of Read Length')
    return f, ax

Example #18

0

Show file

def main():
    """Plot the two MLE sequences and the pivot ECDF returned by ``test_agreement``.

    Figure 1 shows ``MLE_cur`` and ``MLE_prev`` against ``beta_seq``;
    figure 2 shows the empirical CDF of ``pivot`` on [0, 1] together with
    the diagonal.
    """
    beta_seq, MLE_cur, MLE_prev, pivot = test_agreement()

    # Figure 1: both estimates over the beta grid.
    plt.figure(num=1)
    for estimates, extra_args, legend_name in (
            (MLE_cur, (), 'MLE now'),
            (MLE_prev, ('r--',), 'MLE prev')):
        plt.plot(beta_seq, np.array(estimates), *extra_args, label=legend_name)
    plt.legend()

    # Figure 2: pivot ECDF against the diagonal.
    plt.figure(num=2)
    unit_grid = np.linspace(0, 1, 101)
    pivot_cdf = ECDF(pivot)
    plt.plot(unit_grid, pivot_cdf(unit_grid))
    plt.plot([0, 1], [0, 1], 'k--')

Example #19

0

Show file

File: test_general_lasso_pval.py Project: snigdhagit/selective-inference

def main(nsim=500):
    """Run ``test_condition_subgrad`` ``nsim`` times and monitor the pooled p-values.

    Every iteration pools the returned null (``P0``) and alternative (``PA``)
    p-values and prints running summaries.  On every third iteration (after
    the first) the empirical CDFs of both pools are redrawn against the
    diagonal and saved to ``plot.pdf``.

    Parameters
    ----------
    nsim : int
        Number of simulated instances.
    """
    P0, PA = [], []
    from statsmodels.distributions import ECDF

    for i in range(nsim):
        try:
            p0, pA = test_condition_subgrad(n=200, p=10)
        except Exception:
            # Best effort: a failed instance contributes no p-values.  Only
            # ordinary errors are caught, so Ctrl-C and SystemExit still
            # stop the simulation instead of being swallowed.
            p0, pA = [], []
        P0.extend(p0)
        PA.extend(pA)
        print(np.mean(P0), np.std(P0), np.mean(np.array(PA) < 0.05))

        if i % 3 == 0 and i > 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0) > 0:
                plt.plot(U, ECDF(P0)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()

Example #20

0

Show file

def CDFm(data,nPoint):
    """Return ``nPoint`` abscissae equally spaced in empirical cumulative probability.

    The empirical CDF of ``data`` is tabulated on 100000 points between
    ``min(data)`` and ``max(data)`` and inverted by nearest-neighbour
    interpolation at probabilities running from ``eps`` to ``1 - eps``
    (``eps = 5e-5``).

    Parameters
    ----------
    data : array-like
        Samples.
    nPoint : int
        Number of points to return.

    Returns
    -------
    numpy.ndarray
        The ``nPoint`` abscissae.
    """
    import numpy as np
    from scipy.interpolate import interp1d
    from statsmodels.distributions import ECDF
    eps = 5e-5

    # (A first ``yest = np.linspace(...)`` assignment was removed: it was
    # overwritten below before ever being read.)
    ecdf = ECDF(data)
    inf,sup = min(data),max(data)
    xest = np.linspace(inf,sup,int(100e3))
    yest = ecdf(xest)
    # Invert the tabulated ECDF: look up x as a function of probability.
    interp = interp1d(yest,xest,fill_value = 'extrapolate', kind = 'nearest')
    y = np.linspace(eps,1-eps,nPoint)
    x = interp(y)

    return x

Example #21

0

Show file

File: test_selective_MLE_onedim.py Project: snigdhagit/selective-inference

def main():
    """Plot the two MLE sequences and the pivot ECDF returned by ``test_agreement``.

    Figure 1 shows ``MLE_cur`` and ``MLE_prev`` against ``beta_seq``;
    figure 2 shows the empirical CDF of ``pivot`` on [0, 1] together with
    the diagonal.
    """
    beta_seq, MLE_cur, MLE_prev, pivot = test_agreement()

    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF

    # Figure 1: both estimates over the beta grid.
    plt.figure(num=1)
    for estimates, extra_args, legend_name in (
            (MLE_cur, (), 'MLE now'),
            (MLE_prev, ('r--',), 'MLE prev')):
        plt.plot(beta_seq, np.array(estimates), *extra_args, label=legend_name)
    plt.legend()

    # Figure 2: pivot ECDF against the diagonal.
    plt.figure(num=2)
    unit_grid = np.linspace(0, 1, 101)
    pivot_cdf = ECDF(pivot)
    plt.plot(unit_grid, pivot_cdf(unit_grid))
    plt.plot([0, 1], [0, 1], 'k--')

Example #22

0

Show file

File: ScaledFScore.py Project: yangyang0477/scattertext

	def _get_scaler_function(scaler_algo):
		'''Return a function that rescales a numeric array, selected by name.

		Supported ``scaler_algo`` values: 'normcdf', 'lognormcdf',
		'percentile', 'percentiledense', 'ecdf' and 'none'.  Every returned
		function takes an array ``x`` and returns its scaled values ('none'
		returns ``x`` unchanged).

		Raises InvalidScalerException for any other name.
		'''
		scaler = None
		if scaler_algo == 'normcdf':
			scaler = lambda x: norm.cdf(x, x.mean(), x.std())
		elif scaler_algo == 'lognormcdf':
			scaler = lambda x: norm.cdf(np.log(x), np.log(x).mean(), np.log(x).std())
		elif scaler_algo == 'percentile':
			scaler = lambda x: rankdata(x).astype(np.float64) / len(x)
		elif scaler_algo == 'percentiledense':
			scaler = lambda x: rankdata(x, method='dense').astype(np.float64) / len(x)
		elif scaler_algo == 'ecdf':
			from statsmodels.distributions import ECDF
			# Evaluate the fitted ECDF at x so this scaler returns values like
			# the others; it previously returned the ECDF object itself.
			scaler = lambda x: ECDF(x)(x)
		elif scaler_algo == 'none':
			scaler = lambda x: x
		else:
			raise InvalidScalerException(
				"Invalid scaler algorithm.  Must be one of normcdf, lognormcdf, "
				"percentile, percentiledense, ecdf or none.")
		return scaler

Example #23

0

Show file

File: test_contrast.py Project: prakharnigotiya/selective-inference

def main(nsim=500):
    """Run ``test_selected_targets`` ``nsim`` times; report coverage and plot pivots.

    The running average of the returned coverage values is printed after
    every iteration.  At the end the empirical CDF of the collected pivots
    is plotted on [0, 1] against the diagonal.
    """
    coverage_total = 0.
    pivots = []

    for i in range(nsim):
        cover_, pivot_ = test_selected_targets()

        coverage_total += cover_
        pivots.append(pivot_)

        print("iteration completed ", i)
        runs_done = i + 1.
        print("coverage so far ", coverage_total / runs_done)

    plt.clf()
    unit_grid = np.linspace(0, 1, 101)
    pivot_cdf = ECDF(np.asarray(pivots))
    plt.plot(unit_grid, pivot_cdf(unit_grid), c='blue', marker='^')
    plt.plot(unit_grid, unit_grid, 'k--')
    plt.show()

Example #24

0

Show file

File: runner.py Project: huangy10/VehicleDensityEstimation

def lmda_estimator(data):
    """Estimate ``lmda`` by fitting a two-parameter CDF to the ECDF of ``data``.

    The model ``F(x) = 1 - lmda * (x - D + 1/lmda) * exp(-lmda * (x - D))``
    is fitted with ``curve_fit`` to the empirical CDF evaluated at the
    sample points.  Only the fitted ``lmda`` is returned; ``D`` and the
    covariance matrix are discarded.
    """
    empirical = ECDF(data)(data)

    def F(x, lmda, D):
        shifted = x - D
        return 1 - lmda * (shifted + 1 / lmda) * np.exp(-lmda * shifted)

    fitted, _covariance = curve_fit(F, data, empirical)
    # Disabled debug output (translated from the original comment):
    #     =================== λ estimation finished, results ===================
    #         λ: fitted[0]
    #         D: fitted[1]
    #         Covariance matrix: the 2x2 entries of _covariance
    #     ======================================================================

    return fitted[0]

Example #25

0

Show file

    def pval_adjust_WY(self, cov, pval, N=10000):
        ## Purpose:
        ## multiple testing correction with a Westfall young-like procedure as
        ## in ridge projection method, http://arxiv.org/abs/1202.1377 P.Buehlmann
        ## ----------------------------------------------------------------------
        ## Arguments:
        ## cov: covariance matrix of your estimator
        ## pval: the single testing p-values
        ## N: the number of samples to take for the empirical distribution
        ##    which is used to correct the p-values
        ## Returns:
        ## pcorr: the empirical CDF of the simulated minimum p-values,
        ##        evaluated at pval
        ## ----------------------------------------------------------------------
        ## R-version Author: Ruben Dezeure, Date: 6 Feb 2014, 14:27

        ncol = cov.shape[1]
        # zz has one simulated draw per row: shape (N, ncol).
        zz = np.random.multivariate_normal(mean=np.zeros(ncol),
                                           cov=cov,
                                           size=N)
        # Standardise each coordinate by its standard deviation.
        zz2 = zz / np.sqrt(np.diagonal(cov))
        # Two-sided p-value of every coordinate in every draw.
        gz = 2 * norm.sf(abs(zz2))
        # Minimum p-value across coordinates within each draw, giving N
        # values.  axis=0 would instead take the minimum over the draws and
        # return only ncol values, which is not the null distribution of the
        # minimum p-value.
        GZ = np.min(gz, axis=1)

        ecdf = ECDF(GZ)
        pcorr = ecdf(pval)
        return pcorr

Example #26

0

Show file

from statsmodels.distributions import ECDF

def empirical_cdf_plot(xs):
    """Plot the empirical CDF of ``xs`` on 100 points spanning its range.

    The range runs from ``np.nanmin(xs)`` to ``np.nanmax(xs)``.  Returns the
    axes the curve was drawn on.
    """
    cdf = ECDF(xs)
    grid = np.linspace(np.nanmin(xs), np.nanmax(xs), 100)
    axes = plt.axes()
    axes.plot(grid, cdf(grid))
    axes.set_ylabel('F(x)')
    return axes
ax1 = empirical_cdf_plot(passSer)

# Though something simple works too
# (dedented to top level: the statements were indented under a column-0
# comment, which is an IndentationError; the two legend labels were also
# swapped between the ridge and linear-regression curves)
ecdf_ridge = ECDF(ridge_r)
ecdf_linreg = ECDF(linreg_r)
vals = np.linspace(-1,1,100)
ax = plt.axes()
ax.plot(vals,ecdf_ridge(vals),label='Ridge',linewidth=2)
ax.plot(vals,ecdf_linreg(vals),label='LR',linewidth=2)



dosSer = sigSer.reindex(dosGold['sig_id'].values)

### make summary table:
# 1) pert_id
# 2) times_profiled_in_a2
# 3) times_gold_in_a2
# 4) is_gold_cell lines

Example #27

0

Show file

 rMed = rowMedian[pIds]
 fig = plt.figure(1, figsize=(10, 10))
 # make matrix of equal size using null
 nperm = 10000
 permDict = {}
 for iperm in range(nperm):
     iRand = np.random.choice(range(0, dmsoFrm.shape[1]), size=(len(pIds)))
     iRandCol = dmsoFrm.columns[iRand]  #random column names
     smDmso = dmsoFrm.reindex(index=pIds, columns=iRandCol)
     # remove identity cells and unstack
     uDmso = no_diagonal_unstack(smDmso)
     medDmso = uDmso.median()
     permDict[iperm] = medDmso
 nullSer = pd.Series(permDict)
 #two tailed p-value
 ecdf = ECDF(nullSer)
 arg1 = ecdf(medObs)
 arg2 = 1 - ecdf(medObs)
 pval = 2 * np.minimum(arg1, arg2)
 #set p-val min
 if pval == 0:
     pval = 1 / float(nperm)
 pvalDict[cName] = pval
 if graph:
     # graph heatmap of each
     plt.imshow(smFrm.values,
                interpolation='nearest',
                aspect='auto',
                vmin=-100,
                vmax=100,
                cmap=cm.RdBu_r)

Example #28

0

Show file

def diffArea(nest,
             outlier=0,
             data=0,
             kinds='all',
             axis='probability',
             ROI=20,
             mu=0,
             sigma=1,
             weight=False,
             interpolator='linear',
             distribuition='normal',
             seed=None,
             plot=True):
    """
    Return the error area between an analytic density and its interpolation
    from ``nest`` estimation points, split into regions of interest (ROI).

    For every requested kind of point placement the estimation abscissas are
    chosen, ``pdf`` is evaluated there and interpolated with ``interp1d``.
    The absolute difference between that interpolant and ``pdf`` on a fine
    grid (1e6 points) is summed times the grid step inside each of ``ROI``
    contiguous slices of the grid.

    Parameters
    ----------
    nest: int
        The number of estimation points.
    outlier: int, optional
        Distance by which the 'Linspace' range is widened, e.g. outlier = 50
        extends it by 50 on both sides for the normal distribution and only
        on the upper side for the lognormal one. It is also forwarded to
        PDF, dPDF and ddPDF.
        Default is 0.
    data: int, optional
        If data > 0, ``data`` random samples are drawn and used instead of
        the analytic quantiles.
        Default is 0.
    kinds: str or array, optional
        Specifies the kind(s) of point placement to analyze
        ('Linspace', 'CDFm', 'PDFm', 'iPDF1', 'iPDF2', 'all').
        Default is 'all'.
    axis: str, optional
        Specifies the quantity used to order the ROIs
        ('probability', 'derivative', '2nd_derivative', 'X').
        Default is 'probability'.
    ROI: int, optional
        Specifies the number of regions of interest. The 1e6-point grid is
        reshaped into ``ROI`` rows, so ``ROI`` must divide 1000000.
        Default is 20.
    mu: int, optional
        Specifies the mean of the distribution.
        Default is 0.
    sigma: int, optional
        Specifies the standard deviation of the distribution.
        Default is 1.
    weight: bool, optional
        If True, the plotted error of each ROI is multiplied by the number
        of estimation points falling in that ROI (at least 1).
        Default is False.
    interpolator: str, optional
        Specifies the kind of interpolation as a string
        ('linear', 'nearest', 'zero', 'slinear', 'quadratic', 'cubic'
        where 'zero', 'slinear', 'quadratic' and 'cubic' refer to a spline
        interpolation of zeroth, first, second or third order) or as an
        integer specifying the order of the spline interpolator to use.
        Default is 'linear'.
    distribuition: str, optional
        Select the distribution to analyze.
        ('normal', 'lognormal')
        Default is 'normal'.
    seed: optional
        If not None, it is passed to ``np.random.set_state`` before the
        random data are drawn, and it is forwarded to PDF, dPDF and ddPDF.
        NOTE(review): ``set_state`` expects a generator state such as the
        one returned by ``np.random.get_state()``, not an integer seed -
        confirm what callers pass.
        Default is None.
    plot: bool, optional
        If True, the per-ROI error is plotted for every kind.
        Default is True.

    Returns
    -------
    a, [b, c]: area, [probROIord, areaROIord]
        ``area`` holds the total error area of each kind, in the order of
        ``kinds``. ``probROIord`` and ``areaROIord`` are dicts keyed by kind
        with the sorted per-ROI axis quantity and the per-ROI error area in
        that same order.

    Raises
    ------
    ValueError
        If ``distribuition``, ``axis`` or an entry of ``kinds`` is not one
        of the supported names.
    """
    import numpy as np
    from scipy.stats import norm, lognorm
    from scipy.interpolate import interp1d
    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF
    from distAnalyze import pdf, dpdf, ddpdf, PDF, dPDF, ddPDF

    data = int(data)

    is_normal = distribuition == 'normal'
    if is_normal:
        outlier_inf = outlier_sup = outlier
    elif distribuition == 'lognormal':
        # Only the upper side is widened for the lognormal case.
        outlier_inf = 0
        outlier_sup = outlier
    else:
        raise ValueError("distribuition must be 'normal' or 'lognormal', "
                         "got %r" % (distribuition,))

    axis_functions = {
        'probability': pdf,
        'derivative': dpdf,
        '2nd_derivative': ddpdf,
        'X': lambda x, mu, sigma, distribuition: x,
    }
    if axis not in axis_functions:
        raise ValueError("axis must be one of %s, got %r" %
                         (sorted(axis_functions), axis))
    truth1 = axis_functions[axis]

    ngrid = int(1e6)
    area = []
    probROIord = {}
    areaROIord = {}
    div = {}

    if seed is not None:
        np.random.set_state(seed)
    if data:
        if is_normal:
            d = np.random.normal(mu, sigma, data)
        else:
            d = np.random.lognormal(mu, sigma, data)

    if isinstance(kinds, str):
        if kinds == 'all':
            kinds = ['Linspace', 'CDFm', 'PDFm', 'iPDF1', 'iPDF2']
        else:
            kinds = [kinds]

    # The fine evaluation grid does not depend on the kind, so it is built
    # once instead of once per loop iteration.
    if is_normal:
        grid_inf, grid_sup = norm.interval(0.9999, loc=mu, scale=sigma)
    else:
        grid_inf, grid_sup = lognorm.interval(0.9999,
                                              sigma,
                                              loc=0,
                                              scale=np.exp(mu))
        # The lower bound is replaced by the quantile whose probability
        # equals the density value at the upper bound.
        grid_inf = lognorm.pdf(grid_sup, sigma, loc=0, scale=np.exp(mu))
        grid_inf = lognorm.ppf(grid_inf, sigma, loc=0, scale=np.exp(mu))

    xgrid = np.linspace(grid_inf, grid_sup, ngrid)
    xgridROI = xgrid.reshape([ROI, ngrid // ROI])
    dx = xgrid[1] - xgrid[0]

    for kind in kinds:
        if kind == 'Linspace':
            # Evenly spaced points over the data range (when data were
            # drawn) or over the analytic interval, widened by the outlier.
            if data:
                inf, sup = d.min(), d.max()
            else:
                inf, sup = grid_inf, grid_sup
            xest = np.linspace(inf - outlier_inf, sup + outlier_sup, nest)

        elif kind == 'CDFm':
            # Points equally spaced in cumulative probability.
            eps = 5e-5
            probs = np.linspace(0 + eps, 1 - eps, nest)
            if not data:
                if is_normal:
                    xest = norm.ppf(probs, loc=mu, scale=sigma)
                else:
                    xest = lognorm.ppf(probs,
                                       sigma,
                                       loc=0,
                                       scale=np.exp(mu))
            else:
                # Invert the empirical CDF by nearest-neighbour lookup.
                ecdf = ECDF(d)
                inf, sup = d.min(), d.max()
                # NOTE(review): the normal branch samples the ECDF at
                # `data` points, the lognormal one at `nest` points; this
                # asymmetry is kept as found - confirm it is intended.
                npoints = data if is_normal else nest
                support = np.linspace(inf, sup, npoints)
                inverse = interp1d(ecdf(support),
                                   support,
                                   fill_value='extrapolate',
                                   kind='nearest')
                xest = inverse(probs)

        elif kind == 'PDFm':
            xest, _ = PDF(nest, mu, sigma, distribuition, outlier, data,
                          seed)
        elif kind == 'iPDF1':
            xest, _ = dPDF(nest, mu, sigma, distribuition, outlier, data,
                           10, seed)
        elif kind == 'iPDF2':
            xest, _ = ddPDF(nest, mu, sigma, distribuition, outlier, data,
                            10, seed)
        else:
            # Previously an unknown kind silently reused the points of the
            # preceding kind (or failed with an unbound local).
            raise ValueError("unknown kind %r" % (kind,))

        # Density at the estimation points and its interpolant; outside the
        # estimation range the first/last density value is used.
        YY = pdf(xest, mu, sigma, distribuition)
        fest = interp1d(xest,
                        YY,
                        kind=interpolator,
                        bounds_error=False,
                        fill_value=(YY[0], YY[-1]))

        yestGrid = []
        ytruthGrid = []
        ytruthGrid2 = []
        divi = []

        for roi in xgridROI:
            yestGrid.append([fest(roi)])
            ytruthGrid.append([pdf(roi, mu, sigma, distribuition)])
            ytruthGrid2.append([truth1(roi, mu, sigma, distribuition)])
            # Number of estimation points inside this ROI: [min, max).
            divi.append(
                np.count_nonzero((xest >= roi.min()) & (xest < roi.max())))

        diff2 = np.concatenate(
            abs((np.array(yestGrid) - np.array(ytruthGrid)) * dx))
        areaROI = np.sum(diff2, 1)

        divi = np.array(divi)
        divi[divi == 0] = 1  # an empty ROI keeps weight 1

        try:
            probROI = np.mean(np.sum(ytruthGrid2, 1), 1)
        except Exception:
            # Fallback kept from the original code for results that do not
            # have the expected three axes.
            probROI = np.mean(ytruthGrid2, 1)

        # Order the ROIs by the chosen axis quantity.
        index = np.argsort(probROI)
        probROIord[kind] = np.sort(probROI)
        areaROIord[kind] = areaROI[index]

        area = np.append(area, np.sum(areaROIord[kind]))
        div[kind] = divi[index]

        if plot:
            if weight:
                errors = areaROIord[kind] * div[kind]
            else:
                errors = areaROIord[kind]
            # pyplot has no `logy`; the log scale comes from yscale below.
            plt.plot(probROIord[kind], errors, '-o', label=kind, ms=3)

            plt.yscale('log')
            plt.xlabel(axis)
            plt.ylabel('Error')
            plt.legend()

    return area, [probROIord, areaROIord]

Example #29

0

Show file

    def ppplot(self,
               xlabel=None,
               ylabel=None,
               line=None,
               other=None,
               ax=None,
               **plotkwargs):
        """
        Draw a P-P plot: probabilities of this sample against the
        probabilities of a distribution or of a second sample.

        Parameters
        ----------
        xlabel : str or None, optional
            Label for the x-axis. If None (default), a label is chosen
            according to whether `other` was given.
        ylabel : str or None, optional
            Label for the y-axis. If None (default), a label is chosen
            according to whether `other` was given.
        line : str {'45', 's', 'r', 'q'} or None, optional
            Reference line to which the data is compared:

                - '45': 45-degree line
                - 's': standardized line, the expected order statistics are
                  scaled by the standard deviation of the given sample and
                  have the mean added to them
                - 'r': a regression line is fit
                - 'q': a line is fit through the quartiles
                - None: no reference line is added (default).

        other : ProbPlot, array-like, or None, optional
            If given, the ECDF built from the sample quantiles of `other`
            is evaluated at the sample quantiles of `self` and plotted
            against the theoretical percentiles of `self`. Anything that is
            not already a `ProbPlot` is wrapped in one with default
            parameters. If None (default), the sample percentiles of `self`
            are plotted against its theoretical percentiles.
        ax : Matplotlib AxesSubplot instance, optional
            If given, this subplot is used instead of creating a new
            figure.
        **plotkwargs : additional matplotlib arguments forwarded to the
            plotting helper.

        Returns
        -------
        fig : Matplotlib figure instance
            The figure returned by the plotting helper (the created figure
            when `ax` is None, otherwise the one `ax` belongs to).
        """
        if other is None:
            # One-sample case: compare against the theoretical distribution.
            horizontal = self.theoretical_percentiles
            vertical = self.sample_percentiles
            fallback_xlabel = "Theoretical Probabilities"
            fallback_ylabel = "Sample Probabilities"
        else:
            # Two-sample case: make sure `other` is a ProbPlot first.
            if not isinstance(other, ProbPlot):
                other = ProbPlot(other)
            horizontal = self.theoretical_percentiles
            empirical_cdf = ECDF(other.sample_quantiles)
            vertical = empirical_cdf(self.sample_quantiles)
            fallback_xlabel = 'Probabilities of 2nd Sample'
            fallback_ylabel = 'Probabilities of 1st Sample'

        fig, ax = _do_plot(horizontal,
                           vertical,
                           self.dist,
                           ax=ax,
                           line=line,
                           **plotkwargs)

        # User-supplied labels win; otherwise use the case-specific ones.
        ax.set_xlabel(fallback_xlabel if xlabel is None else xlabel)
        ax.set_ylabel(fallback_ylabel if ylabel is None else ylabel)

        # Probabilities live on the unit square.
        unit_interval = [0.0, 1.0]
        ax.set_xlim(list(unit_interval))
        ax.set_ylim(list(unit_interval))

        return fig

Example #30

0

Show file

def empirical_cdf_plot(xs):
    ecdf = ECDF(xs)