def main(nsim=500, n=200, p=50, target='full', sigma=3):
    """Repeatedly run ``test_group_lasso`` and plot ECDFs of the pooled p-values.

    Each replicate returns two p-value collections (``p0`` and ``pA``) that
    are pooled across replicates.  After every replicate a summary line is
    printed (tagged ``'null pvalue + power + failure'``); every third
    replicate the ECDFs are redrawn against the diagonal and written to
    ``plot.pdf``.  The figure is shown once all replicates have run.

    Parameters
    ----------
    nsim : int
        Number of replicates.
    n, p, target, sigma
        Forwarded unchanged to ``test_group_lasso``.
    """
    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF

    P0, PA = [], []

    for i in range(nsim):
        try:
            p0, pA = test_group_lasso(n=n, p=p, target=target, sigma=sigma)
        except Exception:
            # A failed replicate contributes nothing.  Previously the handler
            # was ``pass``, which left p0/pA unbound on a first-iteration
            # failure and silently re-pooled the previous replicate's values
            # on later ones.
            p0, pA = [], []
        print(len(p0), len(pA))
        P0.extend(p0)
        PA.extend(pA)

        # Null p-values at or below 1e-5 are excluded from the "clean" set;
        # their share is the last number in the summary line.
        P0_clean = np.array(P0)
        P0_clean = P0_clean[P0_clean > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(np.array(PA) < 0.05),
              np.sum(np.array(PA) < 0.05) / (i + 1),
              np.mean(np.array(P0) < 0.05),
              np.mean(P0_clean < 0.05),
              np.mean(np.array(P0) < 1e-5),
              'null pvalue + power + failure')

        if i % 3 == 0 and i > 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0_clean) > 0:
                plt.plot(U, ECDF(P0_clean)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()
def main():
    """Overlay stored p-value ECDFs on the two ``marginal`` figures and save them.

    ``marginal(20, 3., 3, nsim=1000)`` supplies two figures.  The arrays
    ``'known'`` and ``'hypotheses'`` are read from ``pval_20.npz``; the ECDF
    of ``known`` is drawn on the first figure and the ECDF of
    ``known[hypotheses]`` on the second, after which both are written to PDF.
    """
    fig1, fig2, dbn = marginal(20, 3., 3, nsim=1000)
    full = np.load('pval_20.npz')
    Ugrid = np.linspace(0, 1, 101)

    # Both overlays share one line style and legend entry.
    overlay = dict(label=r'Selected using $i^*(Z)$',
                   c='green',
                   linewidth=5,
                   alpha=0.5)

    marginal_axes = fig1.gca()
    marginal_curve = ECDF(full['known'])
    marginal_axes.plot(Ugrid, marginal_curve(Ugrid), **overlay)
    marginal_axes.legend(loc='lower right')

    conditional_axes = fig2.gca()
    conditional_curve = ECDF(full['known'][full['hypotheses']])
    conditional_axes.plot(Ugrid, conditional_curve(Ugrid), **overlay)
    conditional_axes.legend(loc='lower right')

    fig1.savefig('splitting_marginal_1sparse.pdf')
    fig2.savefig('splitting_conditional_1sparse.pdf')
def plot_ecdf_pair(data_0, data_1, x, label_0, label_1, unit):
    """Draw the ECDFs of two samples on the current matplotlib axes.

    ``data_0`` and ``data_1`` must expose ``.values``.  Each ECDF is
    evaluated on ``x`` and its legend entry carries the sample median
    followed by ``unit``.  A dashed horizontal line marks the 0.5 level.
    """
    curves = (
        (data_0, 'm', label_0),
        (data_1, 'Orange', label_1),
    )
    for sample, colour, name in curves:
        values = sample.values
        step = ECDF(values)
        centre = np.median(values)
        plt.plot(x, step(x), lw=2.0, c=colour,
                 label=name + ': median {:.1f} {}'.format(centre, unit))

    # Reference line at F(x) = 0.5, where each curve crosses its median.
    half_level = np.full(len(x), 0.5)
    plt.plot(x, half_level, lw=2.0, ls='--', c='b', alpha=.3)

    plt.grid()
    plt.tick_params(axis='both', which='major')
    plt.xlabel(label_0 + '/' + label_1 + ' [{}]'.format(unit))
    plt.ylabel('ECDF')
    plt.ylim([0, 1.05])
    plt.legend(loc='upper left')
    plt.tight_layout()
def main(nsim=500, n=500, p=100, sigma=3):
    """Repeatedly run ``test_multiple_queries`` and plot ECDFs of the pooled p-values.

    The two collections returned by each replicate are pooled; a summary
    line is printed after every replicate and, every third replicate, the
    ECDFs are redrawn against the diagonal and saved to ``plot.pdf``.
    """
    P0, PA = [], []
    from statsmodels.distributions import ECDF
    import matplotlib.pyplot as plt

    for i in range(nsim):
        new_p0, new_pA = test_multiple_queries(n=n, p=p, sigma=sigma)
        P0.extend(new_p0)
        PA.extend(new_pA)

        # Keep only pooled p0 values strictly above 1e-5 for the "clean" set.
        pooled_p0 = np.array(P0)
        P0_clean = pooled_p0[pooled_p0 > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(np.array(PA) < 0.05),
              np.mean(np.array(P0) < 0.05),
              np.mean(P0_clean < 0.05),
              np.mean(np.array(P0) < 1e-5))

        redraw = i > 0 and i % 3 == 0
        if not redraw:
            continue

        U = np.linspace(0, 1, 101)
        plt.clf()
        if len(P0_clean) > 0:
            plt.plot(U, ECDF(P0_clean)(U))
        if len(PA) > 0:
            plt.plot(U, ECDF(PA)(U), 'r')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.savefig("plot.pdf")
    plt.show()
def median_summ_conn_pass_thresh(inSum,outSum,dmsoSum,matrixType,rnkpt_thresh=90,graph=True):
    """Count, per column, the entries above ``rnkpt_thresh`` and summarise them.

    1) what is the number of connections past a given rnkpt threshold
    - what is the median for each unique pert

    Parameters
    ----------
    inSum, outSum : DataFrame
        Columns must carry a ``pert_id`` index level; per-column counts are
        collapsed to their median within each ``pert_id``.
    dmsoSum : DataFrame
        Counted per column, not grouped.
    matrixType : str
        Only used in the x-axis label of the plots.
    rnkpt_thresh : number
        Entries strictly greater than this value are counted.
    graph : bool
        When true, a histogram and an ECDF plot are written under the
        module-level ``wkdir``.

    Returns
    -------
    (dosMedConnect, dmsoSer)
        Median count per ``pert_id`` for ``inSum`` and the raw per-column
        counts for ``dmsoSum``.
    """
    def _count_above(frame):
        # Per-column number of entries strictly above the threshold.  The
        # mask keeps the frame's dtype, as the original zeros_like code did.
        mask = np.zeros_like(frame.values)
        mask[np.where(frame.values > rnkpt_thresh)] = 1
        return np.sum(mask,axis=0)

    # count connections passed threshold
    passSum = _count_above(inSum)
    passSer = pd.Series(data=passSum,index=inSum.columns)
    passSer.name = 'number_of_connections_pass_' + str(rnkpt_thresh) + '_rnkpt'
    passGrped = passSer.groupby(level='pert_id')
    dosMedConnect = passGrped.median()
    dosMedConnect.name = 'median_number_of_connections_above_' + str(rnkpt_thresh) + '_rnkpt'

    # repeat calculation for DMSOs
    passSumDMSO = _count_above(dmsoSum)
    dmsoSer = pd.Series(data=passSumDMSO,index=dmsoSum.columns)
    dmsoSer.name = 'number_of_connections_above_' + str(rnkpt_thresh) + '_rnkpt'

    # repeat calculation for non-dos compounds
    passSumNon = _count_above(outSum)
    nonSer = pd.Series(data=passSumNon,index=outSum.columns)
    nonSer.name = 'number_of_connections_pass_' + str(rnkpt_thresh) + '_rnkpt'
    nonSer.index.name = 'sig_id'
    nonGrped = nonSer.groupby(level='pert_id')
    nonMedConnect = nonGrped.median()

    if graph:
        min1 = np.min([np.min(passSer.values),np.min(passSumNon),np.min(dmsoSer.values)])
        # Upper end of the plotting range: the max over all three series
        # (the DMSO term previously used np.min by mistake).
        max1 = np.max([np.max(passSer.values),np.max(passSumNon),np.max(dmsoSer.values)])
        # NOTE(review): ``normed`` is the legacy spelling of ``density`` and
        # is rejected by recent matplotlib releases - confirm the target
        # version before switching.
        h1 = plt.hist(dmsoSer,30,color='b',range=[min1,max1],label=['DMSO n=' + str(len(dmsoSer))],alpha=.4,normed=True)
        # h2 = plt.hist(nonMedConnect,30,color='g',range=[min1,max1],label=['non_DOS n=' + str(len(nonMedConnect))],alpha=.4,normed=True)
        h3 = plt.hist(dosMedConnect,30,color='r',range=[min1,max1],label=['DOS n=' + str(len(dosMedConnect))],alpha=.3,normed=True)
        # plt.legend()
        plt.ylabel('normed freq',fontweight='bold')
        plt.xlabel('median counts ('+ matrixType + ' > ' + str(rnkpt_thresh) + ')',fontweight='bold')
        plt.title('median connections (compounds collapsed by pert_id) - pass rnkpt ' + str(rnkpt_thresh))
        outF = os.path.join(wkdir, 'median_summly_counts_pass_threshold.png')
        plt.savefig(outF, bbox_inches='tight',dpi=200)
        plt.close()

        ### make cdf graph ####
        vals = np.linspace(min1,max1,100)
        dosEcdf = ECDF(dosMedConnect)
        dmsoEcdf = ECDF(dmsoSer)
        # The non-DOS curve must come from the non-DOS medians; it was built
        # from dosMedConnect, so it exactly overlaid the DOS curve.
        nonEcdf = ECDF(nonMedConnect)
        obsDos = dosEcdf(vals)
        obsDmso = dmsoEcdf(vals)
        obsNon = nonEcdf(vals)
        a1 = plt.plot(vals,obsDos,color='b',label=['DOS n=' + str(len(dosMedConnect))])
        a2 = plt.plot(vals,obsNon,color='g',label=['non_DOS n=' + str(len(nonMedConnect))])
        a3 = plt.plot(vals,obsDmso,color='r',label=['DMSO n=' + str(len(dmsoSer))])
        # plt.legend()
        plt.ylabel('F(x)',fontweight='bold')
        plt.xlabel('median counts ('+ matrixType + ' > ' + str(rnkpt_thresh) + ')',fontweight='bold')
        # plt.title('median connections pass rnkpt ' + str(rnkpt_thresh))
        outF = os.path.join(wkdir, 'median_summly_counts_cdf.png')
        plt.savefig(outF, bbox_inches='tight',dpi=200)
        plt.close()
    return dosMedConnect, dmsoSer
def plot_ecdf_triplet(data_0, data_1, data_2, x, label_0=None, label_1=None, label_2=None, unit=None, plot_info=True):
    """Draw the ECDFs of three samples on the current matplotlib axes.

    Parameters
    ----------
    data_0, data_1, data_2
        Samples exposing ``.values``.
    x
        Grid on which each ECDF is evaluated.
    label_0, label_1, label_2 : str or None
        Legend prefixes.  A missing label is simply omitted (the legend
        entry then reads ``'median <value> <unit>'``); previously a ``None``
        label raised ``TypeError`` even though ``None`` is the default.
    unit
        Formatted into the legend entries and the x-axis label.
    plot_info : bool
        When true, axis labels, y-limits and the legend are added.
    """
    series = (
        (data_0, label_0, 'm'),
        (data_1, label_1, 'Blue'),
        (data_2, label_2, 'Orange'),
    )
    for sample, name, colour in series:
        values = sample.values
        ecdf = ECDF(values)
        summary = 'median {:.1f} {}'.format(np.median(values), unit)
        legend_text = summary if name is None else name + ': ' + summary
        plt.plot(x, ecdf(x), lw=2.0, c=colour, label=legend_text)

    # Dashed reference line at the 0.5 level.
    plt.plot(x, 0.5 * np.ones(len(x)), lw=2.0, ls='--', c='b', alpha=.3)
    plt.grid()
    plt.tick_params(axis='both', which='major')

    # NOTE(review): the extent of the ``plot_info`` guard was reconstructed
    # from whitespace-collapsed source - confirm against the original layout.
    if plot_info:
        supplied = [name for name in (label_0, label_1, label_2) if name is not None]
        if supplied:
            plt.xlabel('/'.join(supplied) + ' [{}]'.format(unit))
        else:
            plt.xlabel('[{}]'.format(unit))
        plt.ylabel('ECDF')
        plt.ylim([0, 1.05])
        plt.legend(loc='upper left')
    plt.tight_layout()
def main(nsim=500, n=500, p=200, sqrt=False, target='full', sigma=3, AR=True):
    """Repeatedly run one of three lasso test routines and plot p-value ECDFs.

    The routine is chosen per replicate: ``test_sqrt_highdim_lasso`` when
    ``sqrt`` is true, otherwise ``test_AR_randomization`` when ``AR`` is true,
    otherwise ``test_highdim_lasso``.  Returned p-values are pooled, a summary
    line is printed after each replicate and, every third replicate, the
    ECDFs are redrawn against the diagonal and saved to ``plot.pdf``.
    """
    import matplotlib.pyplot as plt
    P0, PA = [], []
    from statsmodels.distributions import ECDF

    for i in range(nsim):
        if sqrt:
            p0, pA = test_sqrt_highdim_lasso(n=n, p=p, target=target, compare_to_lasso=False)
        elif AR:
            p0, pA = test_AR_randomization(n=n, p=p, target=target, sigma=sigma)
        else:
            p0, pA = test_highdim_lasso(n=n, p=p, target=target, sigma=sigma)
        print(len(p0), len(pA))
        P0.extend(p0)
        PA.extend(pA)

        # Keep only pooled p0 values strictly above 1e-5 for the "clean" set.
        pooled_p0 = np.array(P0)
        P0_clean = pooled_p0[pooled_p0 > 1.e-5]
        print(np.mean(P0_clean), np.std(P0_clean),
              np.mean(np.array(PA) < 0.05),
              np.sum(np.array(PA) < 0.05) / (i + 1),
              np.mean(np.array(P0) < 0.05),
              np.mean(P0_clean < 0.05),
              np.mean(np.array(P0) < 1e-5),
              'null pvalue + power + failure')

        redraw = i > 0 and i % 3 == 0
        if not redraw:
            continue

        U = np.linspace(0, 1, 101)
        plt.clf()
        if len(P0_clean) > 0:
            plt.plot(U, ECDF(P0_clean)(U))
        if len(PA) > 0:
            plt.plot(U, ECDF(PA)(U), 'r')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.savefig("plot.pdf")
    plt.show()
def cdf(path):
    """Plot empirical and binomial CDFs of the counts stored in a coverage file.

    ``load_coverage_file(path)`` supplies ``params`` (with keys ``"ntest"``
    and ``"cl"``) and a 2-D array ``vals``.  For every target level in
    ``params["cl"]`` the rows whose second-to-last column equals that level
    are selected; the ECDF of their last column is drawn as a thick line and
    ``binom.cdf(., ntest, level)`` as a marker line in the same colour.
    """
    params, colnames, vals = load_coverage_file(path)

    from matplotlib import pyplot as plt
    from scipy.stats import binom
    from statsmodels.distributions import ECDF

    # Prepare binning
    n_MC = params["ntest"]
    lowest = min(n_MC * min(params["cl"]), vals[:, -1].min())
    bin_centers = np.arange(lowest, n_MC + 1)

    plt.figure()
    plt.title("coverage histogram {}".format(path))

    # Endless supply of (linestyle, colour) pairs, one per target level.
    line_styles = cycle(product(("solid", "dashed", "dashdot", "dotted"), "rgbmk"))
    for targ_cl, style in zip(params["cl"], line_styles):
        ls, col = style
        selected = vals[:, -2] == targ_cl
        observed = ECDF(vals[selected, -1])
        plt.plot(bin_centers, observed(bin_centers),
                 color=col, linestyle=ls, linewidth=3,
                 label="{0:.5f}".format(targ_cl))
        expected = binom.cdf(bin_centers, n_MC, targ_cl)
        plt.plot(bin_centers, expected,
                 color=col, linestyle=ls, marker="o", mew=0)

    plt.legend(loc="best", title="target CL")
    plt.xlabel("#(covered)")
    plt.ylabel("cumulative frequency (CDF)")
    plt.show()
def CDFm(data, nPoint, dist='normal', mu=0, sigma=1, analitica=False, lim=None):
    """Return ``nPoint`` x-values equally spaced in cumulative probability.

    Probabilities run from ``5e-5`` to ``1 - 5e-5``.

    Parameters
    ----------
    data : sequence
        Sample used to build the empirical CDF (ignored when ``analitica``).
    nPoint : int
        Number of points to return.
    dist : {'normal', 'lognormal'}
        Distribution used when ``analitica`` is true.
    mu, sigma : float
        Distribution parameters for the analytic case.
    analitica : bool
        If true, use the distribution's ``ppf``; otherwise invert the ECDF of
        ``data`` by nearest-neighbour interpolation on a 100 000-point grid.
    lim : (low, high) or None
        Range of the inversion grid.  ``None`` now falls back to
        ``(min(data), max(data))``; it previously raised ``TypeError``.

    Raises
    ------
    ValueError
        If ``analitica`` is true and ``dist`` is not a supported name
        (previously an ``UnboundLocalError`` at ``return``).
    """
    import numpy as np

    eps = 5e-5
    y = np.linspace(eps, 1 - eps, nPoint)

    if analitica:
        from scipy.stats import norm, lognorm
        if dist == 'normal':
            return norm.ppf(y, loc=mu, scale=sigma)
        if dist == 'lognormal':
            return lognorm.ppf(y, sigma, loc=0, scale=np.exp(mu))
        raise ValueError("dist must be 'normal' or 'lognormal', got %r" % (dist,))

    # Imported lazily so the analytic path does not require statsmodels.
    from scipy.interpolate import interp1d
    from statsmodels.distributions import ECDF

    if lim is None:
        lim = (min(data), max(data))

    ecdf = ECDF(data)
    xest = np.linspace(lim[0], lim[1], int(100e3))
    yest = ecdf(xest)
    # Swap the axes of the tabulated ECDF to obtain an approximate inverse.
    interp = interp1d(yest, xest, fill_value='extrapolate', kind='nearest')
    return interp(y)
def pval_adjust_WY(self, cov, pvals, N=10000):
    """
    Purpose:
    multiple testing correction with a Westfall young-like procedure as
    in ridge projection method, http://arxiv.org/abs/1202.1377 P.Buehlmann
    ======================================================================
    :param cov: covariance matrix of your estimator
    :param pvals: single testing pvalues
    :param N: the number of samples to take for the empirical distribution
    :return pcorr: corrected p-values
    ======================================================================
    Author: Ziyan Zhu, Date: April 10th, 2019
    Following R version by Ruben Dezeure, Date: 6 Feb 2014, 14:27
    """
    ncol = cov.shape[1]
    # N draws from N(0, cov); zz has one draw per row, shape (N, ncol).
    zz = np.random.multivariate_normal(mean=np.zeros(ncol), cov=cov, size=N)
    # Scale every coordinate by its standard deviation.
    zz2 = zz / np.sqrt(np.diagonal(cov))
    # Two-sided p-value of each simulated coordinate.
    gz = 2 * norm.sf(abs(zz2))
    # Minimum p-value within each draw, i.e. across coordinates (axis=1),
    # giving N values for the empirical distribution.  The previous axis=0
    # reduced over the N draws and left only ``ncol`` values.
    GZ = np.min(gz, axis=1)
    ecdf = ECDF(GZ)
    pcorr = ecdf(pvals)
    return pcorr
def main(nsim=500):
    """Pool p-values from repeated ``test_lasso_iv_instance`` runs and plot their ECDF.

    Replicates that raise contribute no p-values.  After all replicates the
    mean, standard deviation and the fraction of pooled p-values below 0.05
    are printed, and the ECDF is drawn against the diagonal.

    NOTE(review): the print/plot statements are placed after the loop; this
    was reconstructed from whitespace-collapsed source - confirm.
    """
    P0 = []
    from statsmodels.distributions import ECDF

    n, p, s = 1000, 10, 3
    Sigma_12 = 0.8
    gsnr = 1.
    beta_star = 1.

    for i in range(nsim):
        try:
            p0 = test_lasso_iv_instance(n=n, p=p, s=s, Sigma_12=Sigma_12, gsnr=gsnr, beta_star=beta_star)
        except Exception:
            # Best effort: skip failed replicates, but let KeyboardInterrupt
            # and SystemExit through (a bare ``except`` swallowed them too).
            p0 = []
        P0.extend(p0)

    print(np.mean(P0), np.std(P0), np.mean(np.array(P0) < 0.05))

    U = np.linspace(0, 1, 101)
    #plt.clf()
    plt.plot(U, ECDF(P0)(U))
    plt.plot(U, U, 'r--')
    #plt.savefig("plot.pdf")
    plt.show()
def L(muestra, alpha):
    """Return the ECDF heights of ``muestra`` lowered by a band half-width, floored at 0.

    The half-width is ``sqrt(log(2 / alpha) / (2 * n))`` with
    ``n = len(muestra)`` (the Dvoretzky-Kiefer-Wolfowitz form).  The result
    has one entry per element of ``ECDF(muestra).y``.
    """
    sample_size = len(muestra)
    half_width = sqrt(log(2. / alpha) / (2 * sample_size))
    heights = ECDF(muestra).y
    out = zeros(len(heights))
    for position, height in enumerate(heights):
        lowered = height - half_width
        out[position] = max(lowered, 0)
    return out
def passive_aggressive_train(self):
    '''Trains passive aggressive classifier

    Fits a ``PassiveAggressiveClassifier`` on the term-document matrix and
    installs ``self._proba``, a function mapping a signed distance from the
    hyperplane to a score in [0, 1]:

    * positive distances map to ``0.5 + ECDF_pos(d) / 2``,
    * negative distances map to ``ECDF_neg(d) / 2``,
    * zero maps to ``0.5``,

    where the ECDFs are built from the non-negative and non-positive
    training-set decision values respectively.

    Returns
    -------
    self
    '''
    self._clf = PassiveAggressiveClassifier(n_iter=50, C=0.2, n_jobs=-1, random_state=0)
    self._clf.fit(self._term_doc_matrix._X, self._term_doc_matrix._y)
    y_dist = self._clf.decision_function(self._term_doc_matrix._X)
    pos_ecdf = ECDF(y_dist[y_dist >= 0])
    neg_ecdf = ECDF(y_dist[y_dist <= 0])

    def proba_function(distance_from_hyperplane):
        if distance_from_hyperplane > 0:
            return pos_ecdf(distance_from_hyperplane) / 2. + 0.5
        elif distance_from_hyperplane < 0:
            # Use the negative-side ECDF here.  The positive-side ECDF is 0
            # for every negative distance, so all negatives used to score 0
            # and ``neg_ecdf`` was never used.
            return neg_ecdf(distance_from_hyperplane) / 2.
        return 0.5

    self._proba = proba_function
    return self
def fit(self, x):
    """Build one ECDF per column of ``x`` and store them in ``self.ecdfs``.

    A one-dimensional ``x`` is reshaped to a single column first.  Columns
    are read by slicing when ``is_numpy(x)`` is true and through
    ``.iloc[:, i].values`` otherwise.  ``self.ecdfs`` maps the column
    position to its ECDF.  Returns ``self``.
    """
    self.ecdfs = {}
    if len(x.shape) == 1:
        x = x.reshape(-1, 1)
    column_count = x.shape[1]
    array_input = is_numpy(x)
    for col in range(column_count):
        if array_input:
            column = x[:, col]
        else:
            column = x.iloc[:, col].values
        self.ecdfs[col] = ECDF(column)
    return self
def plotCDF(data, logScale=False):
    """Show the empirical CDF of ``data`` in a new matplotlib figure.

    With ``logScale`` true the x-axis is switched to a log scale before the
    figure is shown.
    """
    from statsmodels.distributions.empirical_distribution import ECDF
    import matplotlib.pylab as plt

    step_function = ECDF(data)
    support, heights = step_function.x, step_function.y

    plt.figure()
    plt.plot(support, heights, linewidth=2)
    plt.ylabel('CDF')
    if logScale:
        plt.xscale('log')
    plt.show()
def cdf_dphase(delay, freq, title='Histogram of phase lag', xscale='f'):
    """
    Plot cdf of delay for given frequencies.

    Params:
    --------
    delay (ndarray)
        (n_freq,n_samples) Phase distance between two trajectories.
    freq (ndarray)
        Frequencies that are given.
    title (str)
    xscale (str)
        'f' means frequency scale and 't' means time scale.  On the time
        scale each ECDF's x-values are divided by ``2*pi*freq``.

    Returns:
    --------
    fig, ax
    """
    from misc.plot import set_ticks_radian, colorcycle
    from statsmodels.distributions import ECDF

    fig, ax = plt.subplots(figsize=(7, 4))
    c = colorcycle(len(freq))
    for freqix in range(len(freq)):
        ecdf = ECDF(delay[freqix])
        if xscale == 't':
            ax.plot(ecdf.x / (2 * np.pi) / freq[freqix], ecdf.y, '-', alpha=1, c=next(c), lw=2)
        else:
            ax.plot(ecdf.x, ecdf.y, '-', alpha=1, c=next(c), lw=2)

    if xscale == 't':
        xlim = [-1 / freq[0], 1 / freq[0]]
        xticks = np.arange(*xlim)
    else:
        xlim = [-pi, pi]
        # Symmetric ticks; the second entry was +pi/2, duplicating the fourth.
        xticks = [-pi, -pi / 2, 0, pi / 2, pi]

    ax.set(xlim=xlim, xticks=xticks, xlabel='Phase lag', ylabel='CDF', title=title)
    set_ticks_radian(ax, axis='x')
    ax.legend(['%1.1f Hz' % f for f in freq],
              numpoints=1,
              title='Frequency',
              fontsize='small',
              bbox_to_anchor=[1.4, 1.03],
              labelspacing=.1)
    ax.grid()
    return fig, ax
def plotDistributions(data, title, lineC, f=None, ax=None):
    """
    Plots the distribution of reads lengths (or any other data defined over
    reads) over both paired end sets.

    Row 0 and row 1 of ``data`` are drawn as step ECDFs on ``ax[0]`` and
    ``ax[1]`` in colour ``lineC``.  When ``f`` is None a new two-row figure
    with shared axes is created.  ``title`` is accepted but not used.
    Returns ``(f, ax)``.
    """
    from statsmodels.distributions import ECDF
    import matplotlib
    matplotlib.use('agg')
    import matplotlib.pylab as plt

    if f is None:
        f, ax = plt.subplots(2, sharey=True, sharex=True)

    # One ECDF per paired-end read set.
    curves = [ECDF(data[row, :]) for row in (0, 1)]
    for row, curve in enumerate(curves):
        ax[row].step(curve.x, curve.y, color=lineC, alpha=.5)
    for row, heading in enumerate(('Read 0', 'Read 1')):
        ax[row].set_title(heading)

    f.suptitle('Cumulative Distributions of Read Length')
    return f, ax
def main():
    """Plot the outputs of ``test_agreement`` in two figures.

    Figure 1 shows the two MLE sequences against ``beta_seq``; figure 2 shows
    the ECDF of ``pivot`` on [0, 1] together with the diagonal.
    """
    beta_seq, MLE_cur, MLE_prev, pivot = test_agreement()

    # Figure 1: the two MLE paths.
    current_path = np.array(MLE_cur)
    previous_path = np.array(MLE_prev)
    plt.figure(num=1)
    plt.plot(beta_seq, current_path, label='MLE now')
    plt.plot(beta_seq, previous_path, 'r--', label='MLE prev')
    plt.legend()

    # Figure 2: ECDF of the pivots against the diagonal.
    plt.figure(num=2)
    U = np.linspace(0, 1, 101)
    pivot_ecdf = ECDF(pivot)
    plt.plot(U, pivot_ecdf(U))
    plt.plot([0, 1], [0, 1], 'k--')
def main(nsim=500):
    """Pool p-values from repeated ``test_condition_subgrad`` runs and plot their ECDFs.

    Replicates that raise contribute nothing.  A summary line is printed
    after each replicate; every third replicate the ECDFs of both pooled
    collections are redrawn against the diagonal and saved to ``plot.pdf``.
    """
    P0, PA = [], []
    from statsmodels.distributions import ECDF

    for i in range(nsim):
        try:
            p0, pA = test_condition_subgrad(n=200, p=10)
        except Exception:
            # Best effort: skip failed replicates, but let KeyboardInterrupt
            # and SystemExit through (a bare ``except`` swallowed them too).
            p0, pA = [], []
        P0.extend(p0)
        PA.extend(pA)
        print(np.mean(P0), np.std(P0), np.mean(np.array(PA) < 0.05))

        if i % 3 == 0 and i > 0:
            U = np.linspace(0, 1, 101)
            plt.clf()
            if len(P0) > 0:
                plt.plot(U, ECDF(P0)(U))
            if len(PA) > 0:
                plt.plot(U, ECDF(PA)(U), 'r')
            plt.plot([0, 1], [0, 1], 'k--')
            plt.savefig("plot.pdf")
    plt.show()
def CDFm(data,nPoint):
    """Return ``nPoint`` x-values equally spaced in empirical cumulative probability.

    The ECDF of ``data`` is tabulated on a 100 000-point grid between
    ``min(data)`` and ``max(data)`` and inverted by nearest-neighbour
    interpolation at probabilities from ``5e-5`` to ``1 - 5e-5``.
    """
    import numpy as np
    from scipy.interpolate import interp1d
    from statsmodels.distributions import ECDF

    eps = 5e-5
    empirical = ECDF(data)
    lowest, highest = min(data), max(data)
    grid = np.linspace(lowest, highest, int(100e3))
    cdf_on_grid = empirical(grid)
    # Swapping the roles of x and y turns the tabulated ECDF into a quantile lookup.
    quantile = interp1d(cdf_on_grid, grid, fill_value = 'extrapolate', kind = 'nearest')
    probabilities = np.linspace(eps, 1 - eps, nPoint)
    return quantile(probabilities)
def main():
    """Plot the outputs of ``test_agreement`` in two figures.

    Figure 1 shows the two MLE sequences against ``beta_seq``; figure 2 shows
    the ECDF of ``pivot`` on [0, 1] together with the diagonal.
    """
    beta_seq, MLE_cur, MLE_prev, pivot = test_agreement()

    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF

    # Figure 1: the two MLE paths.
    current_path = np.array(MLE_cur)
    previous_path = np.array(MLE_prev)
    plt.figure(num=1)
    plt.plot(beta_seq, current_path, label='MLE now')
    plt.plot(beta_seq, previous_path, 'r--', label='MLE prev')
    plt.legend()

    # Figure 2: ECDF of the pivots against the diagonal.
    plt.figure(num=2)
    U = np.linspace(0, 1, 101)
    pivot_ecdf = ECDF(pivot)
    plt.plot(U, pivot_ecdf(U))
    plt.plot([0, 1], [0, 1], 'k--')
def _get_scaler_function(scaler_algo): scaler = None if scaler_algo == 'normcdf': scaler = lambda x: norm.cdf(x, x.mean(), x.std()) elif scaler_algo == 'lognormcdf': scaler = lambda x: norm.cdf(np.log(x), np.log(x).mean(), np.log(x).std()) elif scaler_algo == 'percentile': scaler = lambda x: rankdata(x).astype(np.float64) / len(x) elif scaler_algo == 'percentiledense': scaler = lambda x: rankdata(x, method='dense').astype(np.float64) / len(x) elif scaler_algo == 'ecdf': from statsmodels.distributions import ECDF scaler = lambda x: ECDF(x) elif scaler_algo == 'none': scaler = lambda x: x else: raise InvalidScalerException("Invalid scaler alogrithm. Must be either percentile or normcdf.") return scaler
def main(nsim=500):
    """Run ``test_selected_targets`` ``nsim`` times and plot the ECDF of its pivots.

    Each call returns a coverage value and a pivot; the running average
    coverage is printed after every replicate.  Once all replicates are done
    the ECDF of the collected pivots is drawn against the diagonal.

    NOTE(review): the plotting statements are placed after the loop; this was
    reconstructed from whitespace-collapsed source - confirm.
    """
    total_cover = 0.
    pivots = []
    for i in range(nsim):
        cover_, pivot_ = test_selected_targets()
        total_cover += cover_
        pivots.append(pivot_)
        print("iteration completed ", i)
        print("coverage so far ", total_cover/(i+1.))

    plt.clf()
    grid = np.linspace(0, 1, 101)
    ecdf_MLE = ECDF(np.asarray(pivots))
    plt.plot(grid, ecdf_MLE(grid), c='blue', marker='^')
    plt.plot(grid, grid, 'k--')
    plt.show()
def lmda_estimator(data):
    """Estimate ``lmda`` by least-squares fitting a CDF model to the ECDF of ``data``.

    The model is ``F(x) = 1 - lmda * (x - D + 1/lmda) * exp(-lmda * (x - D))``
    with free parameters ``lmda`` and ``D``.  It is fitted with ``curve_fit``
    to the ECDF of ``data`` evaluated at the data points; only the fitted
    ``lmda`` is returned.
    """
    empirical = ECDF(data)
    observed = empirical(data)

    def model_cdf(x, lmda, D):
        shifted = x - D
        return 1 - lmda * (shifted + 1 / lmda) * np.exp(-lmda * shifted)

    fitted_params, covariance = curve_fit(model_cdf, data, observed)
    # print """
    # ===================lambda estimation finished, results below===================
    # lambda:%s
    # D: %s
    # Covariance matrix:
    # | %s\t, %s |
    # | %s\t, %s |
    # ======================================================
    # """ % (fitted_params[0], fitted_params[1], covariance[0][0], covariance[0][1], covariance[1][0], covariance[1][1])
    return fitted_params[0]
def pval_adjust_WY(self, cov, pval, N=10000):
    ## Purpose:
    ## multiple testing correction with a Westfall young-like procedure as
    ## in ridge projection method, http://arxiv.org/abs/1202.1377 P.Buehlmann
    ## ----------------------------------------------------------------------
    ## Arguments:
    ## cov: covariance matrix of your estimator
    ## pval: the single testing p-values
    ## N: the number of samples to take for the empirical distribution
    ##    which is used to correct the p-values
    ## Returns:
    ## pcorr: the empirical CDF of the simulated minimum p-values,
    ##    evaluated at pval
    ## ----------------------------------------------------------------------
    ## R-version Author: Ruben Dezeure, Date: 6 Feb 2014, 14:27
    ncol = cov.shape[1]
    ## N draws from N(0, cov); zz has one draw per row, shape (N, ncol)
    zz = np.random.multivariate_normal(mean=np.zeros(ncol), cov=cov, size=N)
    ## scale every coordinate by its standard deviation
    zz2 = zz / np.sqrt(np.diagonal(cov))
    ## two-sided p-value of each simulated coordinate
    gz = 2 * norm.sf(abs(zz2))
    ## minimum p-value within each draw, i.e. across coordinates (axis=1),
    ## giving N values; the previous axis=0 reduced over the N draws and
    ## left only ncol values for the empirical distribution
    GZ = np.min(gz, axis=1)
    ecdf = ECDF(GZ)
    pcorr = ecdf(pval)
    return pcorr
from statsmodels.distributions import ECDF


def empirical_cdf_plot(xs):
    """Plot the empirical CDF of ``xs`` on new axes and return the axes.

    The ECDF is evaluated on 100 evenly spaced points between the NaN-ignoring
    minimum and maximum of ``xs``.
    """
    ecdf = ECDF(xs)
    xmin = np.nanmin(xs)
    xmax = np.nanmax(xs)
    vals = np.linspace(xmin,xmax,100)
    ax = plt.axes()
    ax.plot(vals,ecdf(vals))
    ax.set_ylabel('F(x)')
    return ax


ax1 = empirical_cdf_plot(passSer)

# Though something simple works too
ecdf_ridge = ECDF(ridge_r)
ecdf_linreg = ECDF(linreg_r)
vals = np.linspace(-1,1,100)
ax = plt.axes()
# Labels now match the data they describe; they were swapped
# (the ridge curve was labelled 'LR' and the linear-regression curve 'Ridge').
ax.plot(vals,ecdf_ridge(vals),label='Ridge',linewidth=2)
ax.plot(vals,ecdf_linreg(vals),label='LR',linewidth=2)

dosSer = sigSer.reindex(dosGold['sig_id'].values)
### make summary table:
# 1) pert_id
# 2) times_profiled_in_a2
# 3) times_gold_in_a2
# 4) is_gold_cell lines
rMed = rowMedian[pIds] fig = plt.figure(1, figsize=(10, 10)) # make matrix of equal size using null nperm = 10000 permDict = {} for iperm in range(nperm): iRand = np.random.choice(range(0, dmsoFrm.shape[1]), size=(len(pIds))) iRandCol = dmsoFrm.columns[iRand] #random column names smDmso = dmsoFrm.reindex(index=pIds, columns=iRandCol) # remove identity cells and unstack uDmso = no_diagonal_unstack(smDmso) medDmso = uDmso.median() permDict[iperm] = medDmso nullSer = pd.Series(permDict) #two tailed p-value ecdf = ECDF(nullSer) arg1 = ecdf(medObs) arg2 = 1 - ecdf(medObs) pval = 2 * np.minimum(arg1, arg2) #set p-val min if pval == 0: pval = 1 / float(nperm) pvalDict[cName] = pval if graph: # graph heatmap of each plt.imshow(smFrm.values, interpolation='nearest', aspect='auto', vmin=-100, vmax=100, cmap=cm.RdBu_r)
def diffArea(nest, outlier=0, data=0, kinds='all', axis='probability', ROI=20, mu=0, sigma=1, weight=False, interpolator='linear', distribuition='normal', seed=None, plot=True):
    """
    Return the error area between an analytic pdf and its interpolation
    through a set of discretization points, per region of interest (ROI).

    Parameters
    ----------
    nest: int
        The number of estimation points.
    outlier: int, optional
        Position of an outlier event, e.g. outlier = 50 places an event at
        -50 and +50 if mu = 0. Default is 0.
    data: int, optional
        If data > 0, that many random samples are drawn and used instead of
        the analytic distribution. Default is 0.
    kinds: str or array, optional
        Discretization method(s) to analyze
        ('Linspace', 'CDFm', 'PDFm', 'iPDF1', 'iPDF2', 'all').
        Default is 'all'.
    axis: str, optional
        Quantity used to order the ROIs
        ('probability', 'derivative', '2nd_derivative', 'X').
        Default is 'probability'.
    ROI: int, optional
        Number of regions of interest. Default is 20.
    mu: int, optional
        Mean of the distribution. Default is 0.
    sigma: int, optional
        Standard deviation of the distribution. Default is 1.
    weight: bool, optional
        If True, each ROI's area is multiplied by the number of estimation
        points that fall in it before plotting. Default is False.
    interpolator: str, optional
        Kind of interpolation passed to ``interp1d`` ('linear', 'nearest',
        'zero', 'slinear', 'quadratic', 'cubic', or an integer spline order).
        Default is 'linear'.
    distribuition: str, optional
        Distribution to analyze ('normal', 'lognormal').
        Default is 'normal'.
    seed: optional
        If not None it is passed to ``np.random.set_state`` and forwarded to
        PDF/dPDF/ddPDF.
        NOTE(review): ``set_state`` expects a state object as returned by
        ``np.random.get_state``, not an integer - confirm what callers pass.
    plot: bool, optional
        If True, the per-ROI error is plotted. Default is True.

    Returns
    -------
    a, [b,c]: float and float of ndarray.
        area,[probROIord,areaROIord] returns the sum of total error area and
        the 'x' and 'y' values.

    NOTE(review): block nesting was reconstructed from whitespace-collapsed
    source - confirm against the original layout.
    """
    import numpy as np
    from scipy.stats import norm, lognorm
    from scipy.interpolate import interp1d
    from numpy import exp
    import matplotlib.pyplot as plt
    from statsmodels.distributions import ECDF
    from distAnalyze import pdf, dpdf, ddpdf, PDF, dPDF, ddPDF

    area = []
    n = []
    data = int(data)
    if distribuition == 'normal':
        outlier_inf = outlier_sup = outlier
    elif distribuition == 'lognormal':
        outlier_inf = 0
        outlier_sup = outlier

    ngrid = int(1e6)
    truth = pdf
    if axis == 'probability':
        truth1 = pdf
    elif axis == 'derivative':
        truth1 = dpdf
    elif axis == '2nd_derivative':
        truth1 = ddpdf
    elif axis == 'X':
        truth1 = lambda x, mu, sigma, distribuition: x
    #else: return 'No valid axis'

    probROIord = {}
    areaROIord = {}
    div = {}
    if seed is not None:
        np.random.set_state(seed)
    if data:
        if distribuition == 'normal':
            d = np.random.normal(mu, sigma, data)
        elif distribuition == 'lognormal':
            d = np.random.lognormal(mu, sigma, data)

    if kinds == 'all':
        kinds = ['Linspace', 'CDFm', 'PDFm', 'iPDF1', 'iPDF2']
    elif type(kinds) == str:
        kinds = [kinds]

    for kind in kinds:
        # Evaluation grid covering the central 99.99% of the distribution.
        if distribuition == 'normal':
            inf, sup = norm.interval(0.9999, loc=mu, scale=sigma)
        elif distribuition == 'lognormal':
            inf, sup = lognorm.interval(0.9999, sigma, loc=0, scale=exp(mu))
            inf = lognorm.pdf(sup, sigma, loc=0, scale=np.exp(mu))
            inf = lognorm.ppf(inf, sigma, loc=0, scale=np.exp(mu))

        xgrid = np.linspace(inf, sup, ngrid)
        xgridROI = xgrid.reshape([ROI, ngrid // ROI])

        dx = np.diff(xgrid)[0]

        # Choose the estimation points ``xest`` for this discretization kind.
        if kind == 'Linspace':
            if not data:
                xest = np.linspace(inf - outlier_inf, sup + outlier_sup, nest)
            else:
                if distribuition == 'normal':
                    #d = np.random.normal(loc = mu, scale = sigma, size = data)
                    inf, sup = min(d), max(d)
                    xest = np.linspace(inf - outlier_inf, sup + outlier_sup, nest)
                elif distribuition == 'lognormal':
                    #d = np.random.lognormal(mean = mu, sigma = sigma, size = data)
                    inf, sup = min(d), max(d)
                    xest = np.linspace(inf - outlier_inf, sup + outlier_sup, nest)
            yest = pdf(xest, mu, sigma, distribuition)

        elif kind == 'CDFm':
            eps = 5e-5
            yest = np.linspace(0 + eps, 1 - eps, nest)
            if distribuition == 'normal':
                if not data:
                    xest = norm.ppf(yest, loc=mu, scale=sigma)
                    yest = pdf(xest, mu, sigma, distribuition)
                else:
                    #d = np.random.normal(loc = mu, scale = sigma, size = data)
                    ecdf = ECDF(d)
                    inf, sup = min(d), max(d)
                    xest = np.linspace(inf, sup, data)
                    yest = ecdf(xest)
                    # Invert the tabulated ECDF by nearest-neighbour lookup.
                    interp = interp1d(yest, xest, fill_value='extrapolate', kind='nearest')
                    yest = np.linspace(eps, 1 - eps, nest)
                    xest = interp(yest)
            elif distribuition == 'lognormal':
                if not data:
                    xest = lognorm.ppf(yest, sigma, loc=0, scale=exp(mu))
                    yest = pdf(xest, mu, sigma, distribuition)
                else:
                    #d = np.random.lognormal(mean = mu, sigma = sigma, size = data)
                    ecdf = ECDF(d)
                    inf, sup = min(d), max(d)
                    xest = np.linspace(inf, sup, nest)
                    yest = ecdf(xest)
                    interp = interp1d(yest, xest, fill_value='extrapolate', kind='nearest')
                    yest = np.linspace(eps, 1 - eps, nest)
                    xest = interp(yest)

        elif kind == 'PDFm':
            xest, yest = PDF(nest, mu, sigma, distribuition, outlier, data, seed)
        elif kind == 'iPDF1':
            xest, yest = dPDF(nest, mu, sigma, distribuition, outlier, data, 10, seed)
        elif kind == 'iPDF2':
            xest, yest = ddPDF(nest, mu, sigma, distribuition, outlier, data, 10, seed)

        # Interpolate the true pdf through the estimation points; outside
        # their range the end values are held constant.
        YY = pdf(xest, mu, sigma, distribuition)
        fest = interp1d(xest, YY, kind=interpolator, bounds_error=False, fill_value=(YY[0], YY[-1]))
        #fest = lambda x: np.concatenate([fest1(x)[fest1(x) != -1],np.ones(len(fest1(x)[fest1(x) == -1]))*fest1(x)[fest1(x) != -1][-1]])

        yestGrid = []
        ytruthGrid = []
        ytruthGrid2 = []
        divi = []
        for i in range(ROI):
            yestGrid.append([fest(xgridROI[i])])
            ytruthGrid.append([truth(xgridROI[i], mu, sigma, distribuition)])
            ytruthGrid2.append([truth1(xgridROI[i], mu, sigma, distribuition)])
            # Number of estimation points that fall inside this ROI.
            divi.append(
                len(
                    np.intersect1d(
                        np.where(xest >= min(xgridROI[i]))[0],
                        np.where(xest < max(xgridROI[i]))[0])))

        diff2 = np.concatenate(
            abs((np.array(yestGrid) - np.array(ytruthGrid)) * dx))
        #diff2[np.isnan(diff2)] = 0
        areaROI = np.sum(diff2, 1)

        divi = np.array(divi)
        divi[divi == 0] = 1

        try:
            probROI = np.mean(np.sum(ytruthGrid2, 1), 1)
        except Exception:
            probROI = np.mean(ytruthGrid2, 1)

        # Order the ROIs by the chosen axis quantity.
        probROIord[kind] = np.sort(probROI)
        index = np.argsort(probROI)

        areaROIord[kind] = areaROI[index]
        #deletes = ~np.isnan(areaROIord[kind])
        #areaROIord[kind] = areaROIord[kind][deletes]
        #probROIord[kind] = probROIord[kind][deletes]

        area = np.append(area, np.sum(areaROIord[kind]))
        n = np.append(n, len(probROIord[kind]))
        div[kind] = divi[index]

        if plot:
            if weight:
                # pyplot has no ``logy``; ``semilogy`` draws with a log y-axis.
                plt.semilogy(probROIord[kind], areaROIord[kind] * div[kind], '-o', label=kind, ms=3)
            else:
                plt.plot(probROIord[kind], areaROIord[kind], '-o', label=kind, ms=3)
                plt.yscale('log')
            plt.xlabel(axis)
            plt.ylabel('Error')
            plt.legend()
            #plt.title('%s - Pontos = %d, div = %s - %s' %(j,nest, divs,interpolator))

    return area, [probROIord, areaROIord]
def ppplot(self, xlabel=None, ylabel=None, line=None, other=None, ax=None, **plotkwargs):
    """
    P-P plot of the percentiles (probabilities) of x versus the
    probabilities (percentiles) of a distribution.

    Parameters
    ----------
    xlabel : str or None, optional
        User-provided label for the x-axis. If None (default), a label is
        chosen depending on whether `other` was given.
    ylabel : str or None, optional
        User-provided label for the y-axis. If None (default), a label is
        chosen depending on whether `other` was given.
    line : str {'45', 's', 'r', 'q'} or None, optional
        Options for the reference line to which the data is compared:

        - '45': 45-degree line
        - 's': standardized line, the expected order statistics are
          scaled by the standard deviation of the given sample and have
          the mean added to them
        - 'r': A regression line is fit
        - 'q': A line is fit through the quartiles.
        - None: by default no reference line is added to the plot.

    other : ProbPlot, array-like, or None, optional
        If provided, the ECDF estimated from `other`'s sample quantiles is
        evaluated at the sample quantiles of `self` and plotted against
        `self.theoretical_percentiles`. Anything that is not already a
        `ProbPlot` is wrapped in one with default parameters. If not
        provided (default), `self.sample_percentiles` is plotted against
        `self.theoretical_percentiles`.
    ax : Matplotlib AxesSubplot instance, optional
        If given, this subplot is used to plot in instead of a new figure
        being created.
    **plotkwargs : additional matplotlib arguments to be passed to the
        `plot` command.

    Returns
    -------
    fig : Matplotlib figure instance
        If `ax` is None, the created figure.  Otherwise the figure to which
        `ax` is connected.
    """
    if other is None:
        # One-sample case: sample percentiles against theoretical ones.
        fig, ax = _do_plot(self.theoretical_percentiles,
                           self.sample_percentiles,
                           self.dist, ax=ax, line=line,
                           **plotkwargs)
        fallback_xlabel = "Theoretical Probabilities"
        fallback_ylabel = "Sample Probabilities"
    else:
        # Two-sample case: compare against the ECDF of the other sample.
        if not isinstance(other, ProbPlot):
            other = ProbPlot(other)

        p_x = self.theoretical_percentiles
        other_ecdf = ECDF(other.sample_quantiles)
        ecdf_x = other_ecdf(self.sample_quantiles)

        fig, ax = _do_plot(p_x, ecdf_x, self.dist, ax=ax, line=line,
                           **plotkwargs)
        fallback_xlabel = 'Probabilities of 2nd Sample'
        fallback_ylabel = 'Probabilities of 1st Sample'

    ax.set_xlabel(fallback_xlabel if xlabel is None else xlabel)
    ax.set_ylabel(fallback_ylabel if ylabel is None else ylabel)

    # Probabilities live on the unit square.
    ax.set_xlim([0.0, 1.0])
    ax.set_ylim([0.0, 1.0])

    return fig
def empirical_cdf_plot(xs): ecdf = ECDF(xs)