from pathlib import Path

import numpy as np
from scipy import stats

# p (plot helper module) and cl (classifier config) are expected from the
# enclosing module.
def anova_function(x_train, x_test, y_train, y_test, alpha, r_state, remove_nan):
    path = Path('./data/anova_{}_{}{}.npy'.format(
        r_state, alpha, "_no_nan" if remove_nan else ""))
    if path.is_file():
        result = np.load(path)
    else:
        result = []
        dfs_ad = x_train[y_train == 0, :]
        dfs_sq = x_train[y_train == 1, :]
        dfs_sane = x_train[y_train == 2, :]
        for column in range(x_train.shape[1]):
            F_statistic, p_value = stats.f_oneway(dfs_ad[:, column],
                                                  dfs_sq[:, column],
                                                  dfs_sane[:, column])
            if p_value >= alpha:
                # the CpG does not discriminate between the tumor classes
                result.append(0)
            else:
                # the CpG discriminates between the tumor classes
                result.append(1)
        result = np.asarray(result)
        np.save(path, result)
    # plot one significant and one non-significant feature, if both exist
    if x_train[:, result == 1].shape[1] != 0 and x_train[:, result == 0].shape[1] != 0:
        p.plot_histogram(x_train[:, result == 1][:, 0], y_train,
                         "Anova Significant Feature Distribution",
                         "{}_distribution_signif".format(cl.name), 3)
        p.plot_histogram(x_train[:, result == 0][:, 0], y_train,
                         "Anova Non-significant Feature Distribution",
                         "{}_distribution_nonsign".format(cl.name), 3)
    x_train = x_train[:, result == 1]
    x_test = x_test[:, result == 1]
    return x_train, x_test
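# A self-contained sketch of the per-feature one-way ANOVA filter used above,
# on synthetic data (shapes, seed, and effect size here are hypothetical):
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
X = rng.normal(size=(90, 4))
y = np.repeat([0, 1, 2], 30)
X[y == 2, 1] += 2.0  # make feature 1 differ across classes

alpha = 0.05
keep = np.array([
    stats.f_oneway(X[y == 0, j], X[y == 1, j], X[y == 2, j]).pvalue < alpha
    for j in range(X.shape[1])
])
X_filtered = X[:, keep]
print(keep, X_filtered.shape)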
def save_histogram_plots(self, serializer, cart_coords):
    config = serializer.read_config("config.yaml", path="stats")
    # remove stale histogram images from previous runs
    for file in glob.glob(os.path.join("stats", "hist*")):
        os.remove(file)
    for k in range(len(cart_coords)):
        X = np.array([cart_coords[k][i][0] for i in range(len(cart_coords[0]))])
        Y = np.array([cart_coords[k][i][1] for i in range(len(cart_coords[0]))])
        histogram_range = [[-3.1, 3.1], [-3.1, 3.1]]
        H, xedges, yedges = get_2d_histogram(X, Y, histogram_range,
                                             bins=config['num_bins'])
        Plot.plot_histogram(H, xedges, yedges, save=self.save, path="stats",
                            filename="hist" + str(k) + ".png")
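# get_2d_histogram is not shown here; assuming it mirrors numpy's histogram2d
# and the (H, xedges, yedges) return order used above, a minimal sketch:
import numpy as np

def get_2d_histogram(X, Y, histogram_range, bins=20):
    # hypothetical helper: thin wrapper over np.histogram2d
    H, xedges, yedges = np.histogram2d(X, Y, bins=bins, range=histogram_range)
    return H, xedges, yedges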
def main():
    pregs = survey.Pregnancies()
    pregs.ReadRecords()
    lengths = pregnancy_length_list(pregs)
    hist = Hist()
    for l in lengths:
        hist.Incr(l)
    print('The skewness of the pregnancy lengths is:', Skewness(lengths))
    print('The Pearson skewness of the pregnancy lengths is:',
          PearsonSkewness(lengths))
    plot_histogram(hist, xlabel='weeks', ylabel='number of births')
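# Skewness and PearsonSkewness are assumed helpers; under the usual
# definitions (third standardized moment, and Pearson's median skewness
# 3 * (mean - median) / std), a sketch could look like:
import numpy as np

def Skewness(xs):
    # sample skewness: third central moment over variance^(3/2)
    xs = np.asarray(xs, dtype=float)
    m2 = np.mean((xs - xs.mean()) ** 2)
    m3 = np.mean((xs - xs.mean()) ** 3)
    return m3 / m2 ** 1.5

def PearsonSkewness(xs):
    # Pearson's median skewness coefficient
    xs = np.asarray(xs, dtype=float)
    return 3 * (xs.mean() - np.median(xs)) / xs.std()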
def analyze(self, dataset, dataset_dir):
    data = self.data_class.load_dataset(dataset, train_size=100)
    (X, Y) = (data['x_train'], data['y_train'])
    print_score.print_breakdown(X, Y)
    if self.shall_plot:
        plot_scatter(X, Y, "%s-orig" % dataset,
                     filename=os.path.join(dataset_dir, "%s-orig.png" % dataset))
        plot_histogram(X, Y, "%s-hist" % dataset,
                       filename=os.path.join(dataset_dir, "%s-hist.png" % dataset))
    pca = PCA()
    pca.fit(X)
    plot_PCA_variance(pca.explained_variance_ratio_ * 100,
                      "%s-pca-#feature-vs-variance" % dataset,
                      filename=os.path.join(dataset_dir,
                                            "%s-pca-variance-ratio" % dataset))
def visualize(out, height, width):
    # posterior samples; Y is flipped into image coordinates
    outX = [i.X for i in out["samples"]]
    outY = [height - i.Y for i in out["samples"]]
    plot.plot_histogram(data=outX, bins=width,
                        title="X_histogram of posterior samples")
    plot.plot_histogram(data=outY, bins=height,
                        title="Y_histogram of posterior samples")
    plot.show_scatterplot(outX, outY, title="Scatter plot of posterior samples",
                          height=height, width=width)
    # inferred sources
    outsrcX = [src.X for src in out["src"]]
    outsrcY = [height - src.Y for src in out["src"]]
    plot.plot_histogram(data=outsrcX, bins=width, title="Xsrc")
    plot.plot_histogram(data=outsrcY, bins=height, title="Ysrc")
    plot.show_scatterplot(outsrcX, outsrcY, title="Scatter plot of sources",
                          height=height, width=width)
from pathlib import Path

import numpy as np

# p (plot helper module) and cl (classifier config) are expected from the
# enclosing module.
def fisher_function(x_train, x_test, y_train, y_test, r_state, best, n, remove_nan):
    n = int(n)
    path = Path('./data/fisher_{}{}.npy'.format(
        r_state, "_no_nan" if remove_nan else ""))
    if path.is_file():
        result = np.load(path)
    else:
        result = []
        dfs_ad = x_train[y_train == 0, :]
        dfs_sq = x_train[y_train == 1, :]
        for column in range(x_train.shape[1]):
            # Fisher score: squared mean difference over summed variances
            value = (np.mean(dfs_ad[:, column]) - np.mean(dfs_sq[:, column]))**2 / (
                np.var(dfs_ad[:, column]) + np.var(dfs_sq[:, column]))
            result.append(value)
        result = np.asarray(result)
        np.save(path, result)
    # locate the most and least discriminative features; computed from
    # `result` so this also works when loading from the cached file
    best_feat = int(np.argmax(result))
    worst_feat = int(np.argmin(result))
    print(best_feat)
    print(worst_feat)
    p.plot_histogram(x_train[:, best_feat], y_train,
                     "Fisher Best Feature Distribution",
                     "{}_distribution_best".format(cl.name), 2)
    p.plot_histogram(x_train[:, worst_feat], y_train,
                     "Fisher Worst Feature Distribution",
                     "{}_distribution_worst".format(cl.name), 2)
    if best:
        result_max = result[np.argsort(result)[-n:]]
        indice = np.where(result >= min(result_max))[0]
    else:
        result_max = result[np.argsort(result)[:n]]
        indice = np.where(result <= max(result_max))[0]
    x_train = x_train[:, indice]
    x_test = x_test[:, indice]
    return x_train, x_test
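# A self-contained illustration of the same Fisher-score selection rule on
# synthetic two-class data (shapes, seed, and effect size are hypothetical):
import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=(100, 5))
y = rng.integers(0, 2, size=100)
X[y == 1, 0] += 3.0  # make feature 0 strongly discriminative

mu0, mu1 = X[y == 0].mean(axis=0), X[y == 1].mean(axis=0)
var0, var1 = X[y == 0].var(axis=0), X[y == 1].var(axis=0)
scores = (mu0 - mu1) ** 2 / (var0 + var1)
top2 = np.argsort(scores)[-2:]  # indices of the two best features
print(scores.round(3), top2)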
def main():
    # step 1: load data
    from data_generator import load_data
    xy_train_df, forecast_dict = load_data()

    # step 2: obtain visual and statistical reports
    from plot import plot_df, plot_histogram
    plot_df(df=xy_train_df, title='history monthly data', subplots=True)
    plot_histogram(df=xy_train_df, title='history monthly histogram')

    # step 3: train model and make the forecast
    from xgboost_model import model_predict
    y_label = ['Actuals']
    x_label = list(set(xy_train_df.columns) - set(y_label))
    forecast_result = pd.DataFrame(columns=['Forecast', 'Predictive Power',
                                            'MSE Lower', 'MSE Upper',
                                            'Running Time'])
    for forecast_date in forecast_dict.keys():
        forecast_date_str = forecast_date.strftime('%d.%m.%Y')
        print('start forecasting on ' + forecast_date_str)
        forecast_start_time = datetime.now()
        saving_path = 'reports/model_' + forecast_date_str
        x_forecast = forecast_dict[forecast_date].iloc[[-1]][x_label]
        # bootstrap defaults are n_iterations=1000, confidence=95
        n_iterations = 500
        forecast_value, evaluation = model_predict(data_train=xy_train_df,
                                                   x_forecast=x_forecast,
                                                   saving_path=saving_path,
                                                   n_iterations=n_iterations)
        time_delta = str(datetime.now() - forecast_start_time)
        forecast_result.loc[forecast_date] = [forecast_value,
                                              evaluation['Predictive Power'],
                                              evaluation['MSE CI Lower Bound'],
                                              evaluation['MSE CI Upper Bound'],
                                              time_delta]
        print('forecasting for {} is finished, took {} for {} iterations'.format(
            forecast_date_str, time_delta, n_iterations))

    # step 4: save and visualize forecast result
    print(forecast_result)
    forecast_result.to_csv('reports/forecast_result.csv', index=True)
    plot_forecast = True
    if plot_forecast:
        from plot import vertical_plot
        vertical_plot()
from pathlib import Path

import h5py
import numpy as np
from tqdm import tqdm

# p (plot helper module) and cl (classifier config) are expected from the
# enclosing module.
def removeFeatures(x_train, x_test, cpg_r, chrms_pos, only_chrms_t, remove_nan,
                   y_train):
    features = None
    if remove_nan:
        path_h5 = Path("data/cpgs_nan.h5")
        if not path_h5.is_file():
            print("FILE ERROR")
            exit(0)
        with h5py.File(path_h5, 'r') as hf:
            features = np.array(hf["cpgs_nan"])
    with tqdm(total=len(cpg_r.keys())) as pbar:
        d = np.ones(0)
        for chrm in chrms_pos.keys():
            start = chrms_pos[chrm][0]
            end = chrms_pos[chrm][1]
            length = end - start
            if remove_nan:
                # shrink the window by the number of NaN-flagged CpGs
                length -= np.count_nonzero(features[start:end])
            if len(cpg_r[chrm]) == 0:
                d = np.concatenate((d, np.zeros(length)))
            else:
                if only_chrms_t:  # keep the whole chromosome
                    d = np.concatenate((d, np.ones(length)))
                else:
                    a = np.asarray(cpg_r[chrm])
                    d = np.concatenate((d, a))
            pbar.update()
    if x_train[:, d == 1].shape[1] != 0 and x_train[:, d == 0].shape[1] != 0:
        p.plot_histogram(x_train[:, d == 1][:, 0], y_train,
                         "T-test Significant Feature Distribution",
                         "{}_distribution_signif".format(cl.name), 2)
        p.plot_histogram(x_train[:, d == 0][:, 0], y_train,
                         "T-test Non-significant Feature Distribution",
                         "{}_distribution_nonsign".format(cl.name), 2)
    x_train = x_train[:, d == 1]
    x_test = x_test[:, d == 1]
    return x_train, x_test
def sys_error(data, pvals, d, lattice, par=0, path="./plots/", absolute=False):
    """Calculates the statistical and systematic error of an np-array of fit
    results on bootstrap samples of a quantity and the corresponding p-values.

    Args:
        data: A numpy array with three axes. The first axis is the bootstrap
            sample number, the second axis is the number of correlators, the
            third axis is the fit range index.
        pvals: The p-values indicating the quality of the fits.
        lattice: The name of the lattice, used for the output file.
        d: The total momentum of the reaction.
        par: which parameter to plot (second index of the data arrays)
        path: path where the plots are saved
        absolute: calculate with the absolute values of data

    Returns:
        res: The weighted median value on the original data.
        res_std: The standard deviation derived from the deviation of medians
            on the bootstrapped data.
        res_sys: The 1-sigma systematic uncertainty, i.e. the differences
            res - 16%-quantile and 84%-quantile - res, respectively.
        data_weight: the calculated weights for every bootstrap sample and
            fit range.
    """
    # check the depth of the list structure
    depth = lambda L: isinstance(L, list) and max(map(depth, L)) + 1
    deep = depth(data)
    if deep == 1:
        # initialize empty arrays
        data_weight = []
        res, res_std, res_sys = [], [], []
        # loop over principal correlators
        for i, p in enumerate(data):
            # append the necessary data arrays
            data_weight.append(np.zeros(p.shape[-1]))
            res.append(np.zeros(p.shape[0]))
            res_std.append(np.zeros((1,)))
            res_sys.append(np.zeros((2,)))
            # calculate the weight for the fit ranges using the standard
            # deviation and the p-values of the fit (axis=0 gives one std
            # per fit range, mirroring the deep == 2 branch below)
            if absolute:
                data_std = np.std(np.fabs(p[:, par]), axis=0)
            else:
                data_std = np.std(p[:, par], axis=0)
            data_weight[i] = (1. - 2. * np.fabs(pvals[i][0] - 0.5) *
                              np.amin(data_std) / data_std)**2
            # draw data in histogram
            plotlabel = 'hist_%d' % i
            label = ["", "", "principal correlator"]
            if absolute:
                plt.plot_histogram(np.fabs(p[0, par]), data_weight[i], lattice,
                                   d, label, path, plotlabel)
            else:
                plt.plot_histogram(p[0, par], data_weight[i], lattice, d,
                                   label, path, plotlabel)
            # using the weights, calculate the median over all fit intervals
            # for every bootstrap sample
            if absolute:
                for b in range(p.shape[0]):
                    res[i][b] = qlt.weighted_quantile(np.fabs(p[b, par]),
                                                      data_weight[i], 0.5)
            else:
                for b in range(p.shape[0]):
                    res[i][b] = qlt.weighted_quantile(p[b, par],
                                                      data_weight[i], 0.5)
            # the statistical error is the standard deviation of the medians
            # over the bootstrap samples
            res_std[i] = np.std(res[i])
            # the systematic error is the difference between the median on
            # the original data and the 16%- or 84%-quantile, respectively
            if absolute:
                res_sys[i][0] = res[i][0] - qlt.weighted_quantile(
                    np.fabs(p[0, par]), data_weight[i], 0.16)
                res_sys[i][1] = qlt.weighted_quantile(
                    np.fabs(p[0, par]), data_weight[i], 0.84) - res[i][0]
            else:
                res_sys[i][0] = res[i][0] - qlt.weighted_quantile(
                    p[0, par], data_weight[i], 0.16)
                res_sys[i][1] = qlt.weighted_quantile(
                    p[0, par], data_weight[i], 0.84) - res[i][0]
            # keep only the median of the original data
            res[i] = res[i][0]
    elif deep == 2:
        # initialize empty arrays
        data_weight = []
        res, res_std, res_sys = [], [], []
        # loop over principal correlators
        for i, p in enumerate(data):
            data_weight.append([])
            res.append([])
            res_std.append([])
            res_sys.append([])
            for j, q in enumerate(p):
                # append the necessary data arrays
                data_weight[i].append(np.zeros(q.shape[-2:]))
                res[i].append(np.zeros(q.shape[0]))
                res_std[i].append(np.zeros((1,)))
                res_sys[i].append(np.zeros((2,)))
                # calculate the weight for the fit ranges using the standard
                # deviation and the p-values of the fit
                if absolute:
                    data_std = np.std(np.fabs(q[:, par]), axis=0)
                else:
                    data_std = np.std(q[:, par], axis=0)
                data_weight[i][j] = (1. - 2. * np.fabs(pvals[i][j][0] - 0.5) *
                                     np.amin(data_std) / data_std)**2
                # draw data in histogram
                plotlabel = 'hist_%d_%d' % (i, j)
                label = ["", "", "principal correlator"]
                if absolute:
                    plt.plot_histogram(np.fabs(q[0, par]).ravel(),
                                       data_weight[i][j].ravel(), lattice, d,
                                       label, path, plotlabel)
                else:
                    # note: the non-absolute branch plots the raw values
                    plt.plot_histogram(q[0, par].ravel(),
                                       data_weight[i][j].ravel(), lattice, d,
                                       label, path, plotlabel)
                # using the weights, calculate the median over all fit
                # intervals for every bootstrap sample
                if absolute:
                    for b in range(q.shape[0]):
                        res[i][j][b] = qlt.weighted_quantile(
                            np.fabs(q[b, par]).ravel(),
                            data_weight[i][j].ravel(), 0.5)
                else:
                    for b in range(q.shape[0]):
                        res[i][j][b] = qlt.weighted_quantile(
                            q[b, par].ravel(), data_weight[i][j].ravel(), 0.5)
                # the statistical error is the standard deviation of the
                # medians over the bootstrap samples
                res_std[i][j] = np.std(res[i][j])
                # the systematic error is the difference between the median
                # on the original data and the 16%- or 84%-quantile
                if absolute:
                    res_sys[i][j][0] = res[i][j][0] - qlt.weighted_quantile(
                        np.fabs(q[0, par]).ravel(),
                        data_weight[i][j].ravel(), 0.16)
                    res_sys[i][j][1] = qlt.weighted_quantile(
                        np.fabs(q[0, par]).ravel(),
                        data_weight[i][j].ravel(), 0.84) - res[i][j][0]
                else:
                    res_sys[i][j][0] = res[i][j][0] - qlt.weighted_quantile(
                        q[0, par].ravel(), data_weight[i][j].ravel(), 0.16)
                    res_sys[i][j][1] = qlt.weighted_quantile(
                        q[0, par].ravel(), data_weight[i][j].ravel(),
                        0.84) - res[i][j][0]
                # keep only the median of the original data
                res[i][j] = res[i][j][0]
    else:
        print("made for lists of depth < 3")
        os.sys.exit(-10)
    return res, res_std, res_sys, data_weight
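# qlt.weighted_quantile is an external helper; a minimal sketch under the
# usual interpolation scheme (an assumption, not the project's actual code):
import numpy as np

def weighted_quantile(data, weights, quantile):
    # sort the data, accumulate normalized midpoint weights, and
    # interpolate at the target quantile
    data = np.asarray(data, dtype=float)
    weights = np.asarray(weights, dtype=float)
    order = np.argsort(data)
    data, weights = data[order], weights[order]
    cw = np.cumsum(weights) - 0.5 * weights
    cw /= np.sum(weights)
    return np.interp(quantile, cw, data)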
plot.plot_3d(ax, x_rng, y_rng, obj_func)
plt.savefig(fig_dir + "/3d.png", dpi=200)

plt.figure()
plot.plot_obj_func(x_rng, y_rng, obj_func)
plot.plot_trajs(x_rng, y_rng, trajs, [0])
plt.title("Trajs")
plt.savefig(fig_dir + "/trajs.png", dpi=200)

plt.figure()
plot.plot_obj_func(x_rng, y_rng, obj_func)
# plot.plot_quiver(x_rng, y_rng, trajs)
plot.plot_endpoint_counts(trajs)
plt.title("Endpoints")
plt.savefig(fig_dir + "/endpoints.png", dpi=200)

plt.figure()
plot.plot_histogram(x_rng, y_rng, trajs, 50)
plot.plot_endpoint_counts(trajs)
plt.title("Histogram")
plt.savefig(fig_dir + "/histogram.png", dpi=200)

# plt.show()
plt.close('all')

# Line search
plt.figure()
plot.plot_line_search(line_search_factors, traj, obj_func)
plt.title("Line search")
plt.savefig(fig_dir + "/line_search.png", dpi=200)
def makemetadataplot(folderdict, totreads, readsize):
    '''In PE, readsize is twice the read length.'''
    numtypes, replicates, datadir = folderdict['data']
    samples = max(1, numtypes)
    metadatadir = folderdict['metadata']
    genealldatadict = {}
    for i in range(1, samples + 1):
        exp_file = open('%s/expression_T%02dS01.txt' % (metadatadir, i))
        genedict = {}
        for lntxt in exp_file:
            ln = lntxt.rstrip('\n').split('\t')
            gene = ln[0].split(':')[0]
            txsize = int(ln[3])
            txexp = float(ln[4])
            numreads = int(ln[5])
            if gene in genedict:
                genedict[gene][0].append(txsize)
                genedict[gene][1].append(numreads)
                genedict[gene][2].append(txexp)
            else:
                genedict[gene] = [[txsize], [numreads], [txexp]]
        histbins = len(genedict) // 5
        genedatadict = {}
        for gene in genedict:
            numtx = len(genedict[gene][0])
            entropy = common.shannon_entropy(genedict[gene][2])
            rpkm = 1000000.0 / totreads * sum(
                genedict[gene][1][k] * 1.0 / genedict[gene][0][k]
                for k in range(numtx))
            coverage = sum(
                genedict[gene][1][k] * 1.0 / genedict[gene][0][k]
                for k in range(numtx)) * readsize
            genedatadict[gene] = [numtx, entropy, rpkm, coverage]
            genealldatadict[gene] = genealldatadict.get(gene, []) + [coverage]
        plot.plot_histogram([genedatadict[gene][0] for gene in genedatadict],
                            histbins, 'transcripts count', '# genes',
                            'Number of Transcripts',
                            '%s/plt_T%02d_transcript_count.jpg' % (metadatadir, i))
        plot.plot_histogram([genedatadict[gene][1] for gene in genedatadict
                             if genedatadict[gene][1] > 1],
                            histbins, 'entropy', '# genes',
                            'Entropy Distribution',
                            '%s/plt_T%02d_entropy_dist.jpg' % (metadatadir, i))
        plot.plot_histogram([genedatadict[gene][2] for gene in genedatadict],
                            histbins, 'RPKM', '# genes', 'RPKM Distribution',
                            '%s/plt_T%02d_rpkm_dist.jpg' % (metadatadir, i), 1)
        plot.plot_histogram([genedatadict[gene][3] for gene in genedatadict],
                            histbins, 'coverage', '# genes',
                            'Coverage distribution',
                            '%s/plt_T%02d_coverage_dist.jpg' % (metadatadir, i), 1)
    if samples == 2:
        jsd_file = open('%s/expression_jsd.txt' % metadatadir)
        for lntxt in jsd_file:
            ln = lntxt.rstrip('\n').split('\t')
            gene = ln[0]
            jsd = float(ln[5])
            genealldatadict[gene].append(jsd)
        todellist = []
        covlist1 = []
        covlist2 = []
        jsdlist = []
        mincovlist = []
        for gene in genealldatadict:
            # keep genes with enough coverage in both samples and a
            # non-trivial JSD
            if (min(genealldatadict[gene][0], genealldatadict[gene][1]) > 0.5
                    and genealldatadict[gene][2] > 0.02):
                covlist1.append(genealldatadict[gene][0])
                covlist2.append(genealldatadict[gene][1])
                mincovlist.append(min(genealldatadict[gene][0],
                                      genealldatadict[gene][1]))
                jsdlist.append(genealldatadict[gene][2])
            else:
                todellist.append(gene)
        numofgenestr = '%d of %d' % (len(genedatadict) - len(todellist),
                                     len(genedatadict))
        imagefilename = '%s/plt_jsd_coverage1.pdf' % metadatadir
        plot.plotscatterwithhistogram(jsdlist, covlist1, 'jsd', 'coverage1',
                                      'cov1-jsd\n%s' % numofgenestr,
                                      imagefilename, markertup=('.', 5),
                                      logscaleyflg=1, logscalexflg=0)
        imagefilename = '%s/plt_jsd_coverage2.pdf' % metadatadir
        plot.plotscatterwithhistogram(jsdlist, covlist2, 'jsd', 'coverage2',
                                      'cov2-jsd\n%s' % numofgenestr,
                                      imagefilename, markertup=('.', 5),
                                      logscaleyflg=1, logscalexflg=0)
        imagefilename = '%s/plt_coverage1_coverage2.pdf' % metadatadir
        plot.plotscatterwithhistogram(covlist1, covlist2, 'coverage1',
                                      'coverage2', 'cov1-cov2\n%s' % numofgenestr,
                                      imagefilename, markertup=('.', 5),
                                      logscaleyflg=1, logscalexflg=1)
        imagefilename = '%s/plt_jsd_mincoverage.pdf' % metadatadir
        plot.plotscatterwithhistogram(jsdlist, mincovlist, 'jsd', 'mincoverage',
                                      'mincov-jsd\n%s' % numofgenestr,
                                      imagefilename, markertup=('.', 5),
                                      logscaleyflg=1, logscalexflg=0)
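# common.shannon_entropy is a project helper; assuming the standard definition
# over a (possibly unnormalized) expression vector, a sketch:
import math

def shannon_entropy(values):
    # normalize to a probability vector, then H = -sum(p * log2 p)
    total = float(sum(values))
    if total == 0:
        return 0.0
    probs = [v / total for v in values if v > 0]
    return -sum(p * math.log(p, 2) for p in probs)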
def _worker_lif_stats(uf):
    # dt, tau_m, tref, xt, tau_syn, alpha, rng, tgt_outspikes, xlim_T,
    # fname, and close come from the enclosing module scope
    u = uf[0]
    rate = uf[1]
    fig = plt.figure(figsize=(16, 6))
    ax = [fig.add_subplot(4, 2, 1)]
    for i in range(1, 4):
        ax.append(fig.add_subplot(4, 2, i * 2 + 1))
    th_f = th_lif_fi(u, tau_m, tref, xt)
    T = 1.
    if th_f > 0:
        T = max(T, tgt_outspikes / th_f)
    nspikes = max(10, 2. * T * rate)
    spks_in = make_poisson_spikes(rate, nspikes, rng)
    plot_spike_raster(spks_in, ax=ax[0], yticks=[])
    t, u_in = filter_spikes(dt, T, spks_in, tau_syn)
    u_in *= alpha
    mean_uin = np.mean(u_in[t > 5 * tau_syn])
    plot_continuous(t, u_in, ax=ax[1], axhline=mean_uin,
                    axhlinep={'color': 'r'}, ylabel='syn',
                    ylabelp={'fontsize': 16})
    spk_t, state = run_lifsoma(dt, u_in, tau_m, tref, xt, ret_state=True)
    plot_continuous(t, state, ax=ax[2], ylabel='soma',
                    ylabelp={'fontsize': 16})
    t, rate_out = filter_spikes(dt, xlim_T, spk_t, tau_syn)
    ss_idx = t > 5 * tau_syn
    mean_rate_out = np.mean(rate_out[ss_idx])
    var_rate_out = np.var(rate_out[ss_idx])
    plot_continuous(t, rate_out, ax=ax[3], axhline=mean_rate_out,
                    axhlinep={'color': 'r'}, xlabel=r'$t$ (s)',
                    xlabelp={'fontsize': 20}, ylabel=r'filtered out',
                    ylabelp={'fontsize': 16})
    ax[3].axhline(th_f, color='r', linestyle=':')
    for a in ax[:-1]:
        plt.setp(a.get_xticklabels(), visible=False)
    match_xlims(ax, (0, xlim_T))
    ax[0].set_title(r'$\alpha=%.1e$, $E[u]=%.2f$, $f_{in}=%.1f$ Hz' %
                    (alpha, u, rate), fontsize=20)
    if len(spk_t) > 5:
        ax = fig.add_subplot(1, 2, 2)
        isi = np.diff(spk_t)
        # 'normed' was renamed to 'density' in recent matplotlib
        histp = dict(bins=int(tgt_outspikes / 10), density=True,
                     histtype='step')
        plot_histogram(
            isi, histp=histp, ax=ax, axvline=1. / mean_rate_out,
            axvlinep={'color': 'r', 'label': 'observed mean'},
            xlabel='output isi (s)', xlabelp={'fontsize': 20},
            xlim=(min(isi), max(isi)),
            title=r'$%d$ spikes, $E[a(u)]=%.1f$, $Var(a(u))=%.1f$' %
                  (len(spk_t), mean_rate_out, var_rate_out),
            titlep={'fontsize': 20})
        if th_f > 0:
            ax.axvline(1. / th_f, color='r', linestyle=':',
                       label='theoretical mean')
        ax.legend(loc='upper right')
    sub_fname = fname
    if fname is not None:
        sub_fname = fname + '_alpha%.1e_u%.2f_f%.1f_.png' % (alpha, u, rate)
    save_close_fig(fig, sub_fname, close)
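# make_poisson_spikes is assumed to produce a homogeneous Poisson spike train;
# one standard construction (a sketch, not necessarily the project's):
import numpy as np

def make_poisson_spikes(rate, nspikes, rng=np.random):
    # draw inter-spike intervals from Exp(rate), then take the running sum
    # to get spike times of a homogeneous Poisson process
    isi = rng.exponential(1. / rate, int(nspikes))
    return np.cumsum(isi)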
def model_predict(data_train, x_forecast, saving_path, n_iterations=1000,
                  confidence=95):
    if not os.path.exists(saving_path):
        os.mkdir(saving_path)
    '''
    with open(saving_path + '/model.sav', 'rb') as file1:
        best_model = pickle.load(file1)
    # there is a problem predicting from the saved model: feature names mismatch
    prediction_result = pd.read_csv(saving_path + '/prediction_results.csv')
    predictive_power = prediction_result['Predictive Power'].iloc[0]
    '''
    x_label = x_forecast.columns
    y_label = ['Actuals']
    data_train.sort_index(inplace=True)
    fitness = pd.DataFrame()
    prediction_result = pd.DataFrame(index=data_train.index)
    evaluation = {}
    model_ls = []

    # bootstrap: resample 80% of the data to train, test on the unused rows,
    # and score performance by MSE; this loop fills two dataframes, one for
    # predictions and one for fitness
    for i in range(n_iterations):
        sample_size = int(data_train.shape[0] * 0.8)
        # resample shuffles the training data
        train_sample = resample(data_train, n_samples=sample_size, replace=True)
        test_sample = data_train[~data_train.index.isin(train_sample.index)]
        x_train = train_sample[x_label]
        y_train = train_sample[y_label]
        label_mean = y_train.mean().values[0]
        x_test = test_sample[x_label]
        y_test = test_sample[y_label]
        training_time_start = time.time()
        model = build_model(label_mean)
        model.fit(x_train, y_train)
        training_time = time.time() - training_time_start
        model_ls.append(model)
        predict = model.predict(x_test)
        iterate_col_str = 'iterate_%d' % i
        prediction = pd.DataFrame(predict, index=y_test.index,
                                  columns=[iterate_col_str])
        prediction_result = pd.concat([prediction_result, prediction], axis=1)
        fitness.loc[iterate_col_str, 'mse'] = mean_squared_error(y_test, predict)
        fitness.loc[iterate_col_str, 'training_time'] = training_time
        fitness.loc[iterate_col_str, 'estimators'] = model.best_estimator_

    # evaluations
    prediction_result.sort_index(inplace=True)
    prediction_result_col = prediction_result.columns
    # confidence interval of the prediction results
    for index in prediction_result.index:
        prediction_row = prediction_result.loc[index].dropna()
        if prediction_row.empty:
            continue
        CI_low = np.percentile(prediction_row, (100 - confidence) / 2.)
        CI_up = np.percentile(prediction_row, 100 - (100 - confidence) / 2.)
        prediction_result.loc[index, 'CI_up'] = CI_up
        prediction_result.loc[index, 'CI_low'] = CI_low
    # fill NaN in CI_up and CI_low with the column means
    prediction_result[['CI_up', 'CI_low']] = prediction_result[
        ['CI_up', 'CI_low']].fillna(prediction_result[['CI_up', 'CI_low']].mean())
    prediction_result['In_CI'] = np.where(
        (data_train[y_label].values < prediction_result[['CI_up']].values) &
        (data_train[y_label].values > prediction_result[['CI_low']].values),
        1, 0)
    predictive_power = prediction_result['In_CI'].sum() / prediction_result.shape[0]
    prediction_result.to_csv(saving_path + '/prediction_results.csv')
    evaluation['Predictive Power'] = predictive_power
    print('At %d confidence, the prediction score is ' % confidence,
          predictive_power)

    # plot predictions
    prediction_result['val_mean'] = prediction_result[
        prediction_result_col].mean(axis=1)
    lineplotCI(prediction_result.index.strftime('%Y-%m').values,
               prediction_result['val_mean'].values,
               data_train[y_label].values,
               prediction_result['CI_low'], prediction_result['CI_up'],
               'Month', 'Actuals', saving_path, 'Predicted Actuals')

    # confidence interval of the mse
    plot_histogram(fitness[['mse']], folder=saving_path, title='mse_hist')
    mse_CI_low = np.percentile(fitness['mse'].values,
                               (100 - confidence) / 2.)
    mse_CI_up = np.percentile(fitness['mse'].values,
                              100 - (100 - confidence) / 2.)
    evaluation['MSE CI Lower Bound'] = mse_CI_low
    evaluation['MSE CI Upper Bound'] = mse_CI_up
    print('{}% confidence of mse is {} and {}'.format(confidence, mse_CI_low,
                                                      mse_CI_up))
    with open(saving_path + '/evaluation_dictionary.pkl', 'wb') as dict_file:
        pickle.dump(evaluation, dict_file, protocol=pickle.HIGHEST_PROTOCOL)

    # find the smallest mse and use the corresponding model to forecast
    fitness_min = fitness[['mse', 'training_time']].min()
    fitness_idxmin = fitness[['mse', 'training_time']].idxmin()
    print('the smallest mse is', fitness_min['mse'], ', training took',
          fitness.loc[fitness_idxmin['mse'], 'training_time'], 's.')
    fitness.to_csv(saving_path + '/fitness.csv')
    # we choose the estimator with the smallest validation mse
    best_model = model_ls[fitness.index.get_loc(fitness_idxmin['mse'])]
    with open(saving_path + '/model.sav', 'wb') as file:
        pickle.dump(best_model, file, protocol=pickle.HIGHEST_PROTOCOL)
    '''
    # only applies to Booster models
    plot_importance(best_model)
    plt.savefig(saving_path + '/feature_importance.png')
    plt.show()
    '''
    forecast = best_model.predict(x_forecast)
    return forecast[0], evaluation
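# The interval used above is the standard bootstrap percentile CI; a tiny
# self-contained illustration on synthetic scores (values are hypothetical):
import numpy as np

rng = np.random.default_rng(1)
mse_samples = rng.gamma(shape=2.0, scale=1.5, size=500)  # stand-in for fitness['mse']
confidence = 95
ci_low, ci_up = np.percentile(mse_samples,
                              [(100 - confidence) / 2.,
                               100 - (100 - confidence) / 2.])
print('%d%% CI: [%.3f, %.3f]' % (confidence, ci_low, ci_up))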
import plot

import numpy as np

X = np.random.randn(400)
plot.plot_histogram(X, 10, "Histogram demo")
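# The plot module itself is not shown; a minimal matplotlib-based sketch that
# is compatible with this demo's assumed (data, bins, title) signature:
import matplotlib.pyplot as plt

def plot_histogram(data, bins, title):
    # hypothetical minimal version of plot.plot_histogram for the demo above
    plt.figure()
    plt.hist(data, bins=bins)
    plt.title(title)
    plt.show()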
first_estimations(distributions, thetas)
strat_pairs = build_strategies_pairs()
strat_comparisons = init_strat_comparisons()
p = 0
while p < len(strat_pairs):
    cp = strat_pairs[p]
    d1 = distributions[str(cp[0])]
    d2 = distributions[str(cp[1])]
    conclusion = fe.formation_evaluator(d1.prior, d2.prior)
    if conclusion == 0:
        # tie on the priors: fall back to comparing variances
        var_d1 = d1.get_variance()
        var_d2 = d2.get_variance()
        if var_d1 < var_d2:
            conclusion = 1
        else:
            conclusion = -1
    strat_comparisons[cp[0]][cp[1]] = conclusion
    if conclusion is None:
        # undecided: gather more runs for this pair and move on
        simulate_pairs(distributions, str(cp[0]), str(cp[1]),
                       settings.N_ADDITIONAL_RUNS)
        p += 1
    else:
        # decided: drop the pair and step back to re-check the previous index
        del strat_pairs[p]
        if p != 0:
            p -= 1

# `i` is defined in the surrounding scope of the original script
plot.plot_distributions_validation(distributions, settings.NB_THETA, i)
fullfill_stratcomp(strat_comparisons)
get_classification(distributions, thetas, strat_comparisons, wellclas, misclas)
distances, ratios = prepare_histogram(wellclas, misclas)
plot.plot_histogram(distances, ratios)