def compute_novelty_stats_without_contrast(data_timeseries, baseline_bool=None):
    # remove the filler novel items (they were never repeated)
    data = data_timeseries[~((data_timeseries.event.data['isFirst']) & (data_timeseries.event.data['lag'] == 0))]

    # determine the mean and std of the baseline period for normalization
    # if baseline bool is not given, use all timepoints before 0
    if baseline_bool is None:
        baseline_bool = data.time.values < 0
    baseline_data = data[:, baseline_bool].mean(dim='time')
    m = np.mean(baseline_data)
    s = np.std(baseline_data)

    # compute the zscored data
    zdata = (data - m) / s

    # pull out the data for each condition
    novel_items = data.event.data['isFirst']
    zdata_novel = zdata[novel_items]
    zdata_repeated = zdata[~novel_items]

    # run stats at each timepoint
    ts, ps = ttest_ind(zdata_novel, zdata_repeated, axis=0)

    # return the statistics and the mean of each condition
    zdata_novel_mean = np.mean(zdata_novel, axis=0)
    zdata_novel_sem = sem(zdata_novel, axis=0)
    zdata_repeated_mean = np.mean(zdata_repeated, axis=0)
    zdata_repeated_sem = sem(zdata_repeated, axis=0)

    return zdata_novel_mean, zdata_repeated_mean, zdata_novel_sem, zdata_repeated_sem, ts, ps
def plot_approximations_equal_times():
    results_dir = 'experiments/results/approx-heuristic1/'
    figures_dir = 'experiments/figures/'
    file_names = \
        ['wikivote-approx-heuristic-prob_method-2-k_min-0.0010-k_max-0.0100-tau_scale-0.100-samples-5-bfs_samples-1000-init_samples-5-iter_samples-5',
         'wikivote-approx-heuristic-prob_method-3-k_min-0.0010-k_max-0.0100-tau_scale-0.100-samples-5-bfs_samples-1000-init_samples-5-iter_samples-5',
         'BA1000_dataset-approx-heuristic-prob_method-2-k_min-0.0010-k_max-0.0100-tau_scale-0.100-samples-5-bfs_samples-1000-init_samples-5-iter_samples-5',
         'BA1000_dataset-approx-heuristic-prob_method-3-k_min-0.0010-k_max-0.0100-tau_scale-0.100-samples-5-bfs_samples-1000-init_samples-5-iter_samples-5',
         'gnp08-1000-approx-heuristic-prob_method-2-k_min-0.0010-k_max-0.0100-tau_scale-0.100-samples-10-bfs_samples-1000-init_samples-30-iter_samples-30',
         'gnp08-1000-approx-heuristic-prob_method-2-k_min-0.0010-k_max-0.0050-tau_scale-0.100-samples-10-bfs_samples-1000-init_samples-10-iter_samples-10']
    titles = [''] * len(file_names)

    approx_errors_apx_d = defaultdict(list)
    approx_errors_seq_d = defaultdict(list)
    for i, fname in enumerate(file_names):
        print "fname: ", fname
        lines = [line.strip().split('\t') for line in open(results_dir + fname, 'r').readlines()]
        for line in lines:
            k_frac, value, apx_value, vanilla_value = float(line[0]), float(line[1]), float(line[2]), float(line[3])
            approx_errors_apx_d[k_frac].append(computeError(value, apx_value))
            approx_errors_seq_d[k_frac].append(computeError(value, vanilla_value))

        k_fracs = approx_errors_apx_d.keys()
        k_fracs.sort()
        errors_apx = [np.mean(approx_errors_apx_d[k_frac]) for k_frac in k_fracs]
        print "errors for apx: ", errors_apx
        sems_apx = [sem(approx_errors_apx_d[k_frac]) for k_frac in k_fracs]
        errors_seq = [np.mean(approx_errors_seq_d[k_frac]) for k_frac in k_fracs]
        print "errors for vanilla: ", errors_seq
        sems_seq = [sem(approx_errors_seq_d[k_frac]) for k_frac in k_fracs]

        plot2d(k_fracs, [errors_apx, errors_seq], [sems_apx, sems_seq],
               [r'INFEST^*', 'Capped MC'], [r'$k/n$', 'Estimation error'],
               titles[i], figures_dir + fname + '.pdf', location="upper left")
def _add_param_table_row(self, name, data):
    """
    Adds a parameter row to the output table.

    @param name Name of the parameter
    @param data Data values
    """
    data = data[np.isfinite(data)]
    mean = np.mean(data)
    weights = 1.0 / np.abs(np.repeat(mean, data.size) - data)
    weighted_data = data * weights / np.sum(weights)

    # In the event that a fit went wrong and all weights sum to zero
    try:
        weighted_mean = np.average(data, weights=weights)
        weighted_error = stats.sem(weighted_data)
    except ZeroDivisionError:
        weighted_mean = np.nan
        weighted_error = np.nan

    self._output_table.addRow([name, mean, np.std(data), stats.sem(data),
                               weighted_mean, weighted_error])
def cleandat(x, y):
    """
    This function takes a two row data matrix (x,y) and does the following:
    1- sorts the data in ascending form based on the x row.
    2- finds the unique values of x and put the average of corresponding y in the y column
    in the future:
    3- adds a third row with standard deviation for multiple data (indication of statistical error)
    """
    # check data form
    # assign variables
    # x = dat[1,:]
    # y = dat[2,:]
    x_sort_idx = np.argsort(x)
    x_srt = x[x_sort_idx]
    y_srt = y[x_sort_idx]

    # finding unique values
    x_uq, x_uq_idx = np.unique(x_srt, return_index=True)

    # the statistic loop
    l = len(x_uq_idx)
    y_out = np.ones(l)
    y_err = np.ones(l)
    for i in np.arange(l - 1):
        y_out[i] = np.mean(y_srt[x_uq_idx[i]:x_uq_idx[i + 1]])
        # y_err[i] = np.std(y_srt[x_uq_idx[i]:x_uq_idx[i+1]])
        y_err[i] = stat.sem(y_srt[x_uq_idx[i]:x_uq_idx[i + 1]])
    y_out[-1] = np.mean(y_srt[x_uq_idx[-1]:])
    # y_err[-1] = np.std(y_srt[x_uq_idx[-1]:])
    y_err[-1] = stat.sem(y_srt[x_uq_idx[-1]:])
    return x_uq, y_out, y_err
def plot_controller_run(data, trials, epochs, threshold):
    performance_array = []
    for n in range(len(data[0][0])):
        fitness_list = [item[0][n] for item in data]
        pop_list = [item[1][n] for item in data]
        performance = (fitness_list, np.mean(fitness_list), stats.sem(fitness_list) / 2,
                       np.std(fitness_list) / 2, pop_list, np.mean(pop_list),
                       stats.sem(pop_list) / 2, np.std(pop_list) / 2)
        performance_array.append(performance)

    y = [item[1] for item in performance_array]
    yerr = [item[2] for item in performance_array]
    x = np.arange(len(y))
    y1 = [item[5] for item in performance_array]
    y1err = [item[6] for item in performance_array]

    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].set_title("Average Performance of Neuro-Evolutionary controller \n "
                       "over n={0} trials, {1:.2f}>theta<{2:.2f} "
                       .format(trials, degrees(threshold[0]), degrees(threshold[1])))
    axarr[0].errorbar(x, y, yerr=yerr, label='Best Fitness')
    axarr[0].legend(loc="upper left", shadow=True, fancybox=True)
    axarr[0].set_ylabel('Fitness')
    axarr[1].errorbar(x, y1, yerr=y1err, label='Avg Pop. Fitness')
    axarr[1].legend(loc="upper left", shadow=True, fancybox=True)
    axarr[1].set_ylabel('Fitness')
    plt.xlabel('Epochs (e={0})'.format(epochs))
    plt.show()
def widerstand(R, V, A, a):
    print('Individual resistances:')
    for x in range(a):
        R[x] = V[x] / A[x]
        # print(x, ' = ', R[x])
    print('Mean = ', np.mean(R), ' +- ', stats.sem(R))
    return ufloat(np.mean(R), stats.sem(R))
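# A minimal usage sketch for widerstand, added for illustration (not part of the
# original snippet). It assumes numpy (np), scipy.stats (stats) and
# uncertainties.ufloat are imported at module level as the function above expects,
# and the voltage/current readings below are made up.
def _example_widerstand():
    V = np.array([10.1, 9.9, 10.0])   # hypothetical measured voltages
    A = np.array([0.52, 0.49, 0.50])  # hypothetical measured currents
    R = np.zeros_like(V)              # buffer that widerstand fills with V/A
    R_mean = widerstand(R, V, A, len(V))  # ufloat(mean resistance, standard error)
    return R_mean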
def plot_modulation_depth(arr_early, arr_late, sigma):
    arr_early = ss.zscored_fr(arr_early, sigma).max(axis=0)
    arr_early = np.nan_to_num(arr_early)
    arr_late = ss.zscored_fr(arr_late, sigma).max(axis=0)
    arr_late = np.nan_to_num(arr_late)
    if arr_early.size > arr_late.size:
        arr_early = np.random.choice(arr_early, size=arr_late.size, replace=False)
    if arr_late.size > arr_early.size:
        arr_late = np.random.choice(arr_late, size=arr_early.size, replace=False)
    early_sem = stats.sem(arr_early)
    early_mean = arr_early.mean()
    late_sem = stats.sem(arr_late)
    late_mean = arr_late.mean()
    p_val = stats.ttest_rel(arr_early, arr_late)
    print "p val is = " + str(p_val)
    # Pull the formatting out here
    width = 0.8
    bar_kwargs = {'width': width, 'color': 'g', 'linewidth': 2, 'zorder': 5}
    err_kwargs = {'zorder': 0, 'fmt': None, 'lw': 2, 'ecolor': 'k'}
    means = np.array([early_mean, late_mean])
    errs = np.array([early_sem, late_sem])
    idx = np.arange(2)
    X = idx + width / 2
    labels = ['E1 early', 'E1_late']
    plt.bar(idx, means, alpha=0.5, **bar_kwargs)
    plt.errorbar(X, means, yerr=errs, **err_kwargs)
    plt.xticks(X, labels)
    plt.ylabel('z-scored modulation depth')
    plt.title('Change in modulation depth from early in session to late in session')
    plt.show()
def get_data(self):
    directory = self.get_dir()
    shots = self.dic['shots']
    shots = shots.replace(' ', '')  # remove all the spaces
    # keys = " ".join([self.dic['X'], self.dic['Y']])
    # self.data, errmsg, raw_data = qrange.qrange(directory, shots, keys)
    keys = [self.dic['X'], self.dic['Y']]
    # print 'Before qrange.'
    self.data, errmsg, raw_data = qrange.qrange_eval(directory, shots, keys)
    # print 'After qrange.'
    s = ''
    for i in range(self.data.shape[1]):
        col = self.data[:, i]
        s00 = numpy.mean(col)
        s01 = stats.sem(col)
        s02 = numpy.std(col)
        s03 = numpy.max(col) - numpy.min(col)
        s = s + "Mean = %10.6f\n" % s00
        s = s + "Std. deviation = %10.6f\n" % s02
        s = s + "Std. Error of the mean = %10.6f\n" % s01
        s = s + "Pk-Pk = %10.6f\n" % s03
        s = s + '\n'
    raw_data = s + raw_data
    self.dic['data_str'] = raw_data

    self.sdata = None
    if self.dic['X'] == "SEQ:shot":
        s = [numpy.mean(self.data[:, 1]), numpy.std(self.data[:, 1]),
             stats.sem(self.data[:, 1]), numpy.max(self.data[:, 1]) - numpy.min(self.data[:, 1])]
        a = []
        for val in s:
            a.append([val for i in range(self.data[:, 1].size)])
        self.sdata = numpy.c_[self.data[:, 0], numpy.transpose(numpy.array(a))]
    else:
        self.sdata = statdat.statdat(self.data, 0, 1)
    return
def _compute_item_pair_diff(self, smoothed_spike_counts):
    data = smoothed_spike_counts[~((smoothed_spike_counts.event.data['isFirst']) &
                                   (smoothed_spike_counts.event.data['lag'] == 0))]
    item_names = data.event.data['item_name']

    novel_rep_diffs = []
    mean_item_frs = []
    novel_mean = []
    rep_mean = []
    for this_item in np.unique(item_names):
        data_item = data[item_names == this_item]
        if data_item.shape[0] == 2:
            novel_data_item = data_item[data_item.event.data['isFirst']].values
            rep_data_item = data_item[~data_item.event.data['isFirst']].values
            diff_due_to_cond = novel_data_item - rep_data_item
            novel_rep_diffs.append(diff_due_to_cond)
            novel_mean.append(novel_data_item)
            rep_mean.append(rep_data_item)
            mean_item_frs.append(np.mean(data_item.data))

    novel_mean = np.squeeze(np.stack(novel_mean))
    novel_sem = sem(novel_mean, axis=0)
    novel_trial_means = np.mean(novel_mean, axis=1)
    novel_mean = np.mean(novel_mean, axis=0)

    rep_mean = np.squeeze(np.stack(rep_mean))
    rep_sem = sem(rep_mean, axis=0)
    rep_trial_means = np.mean(rep_mean, axis=1)
    rep_mean = np.mean(rep_mean, axis=0)

    return np.squeeze(np.stack(novel_rep_diffs)), np.stack(mean_item_frs), novel_mean, rep_mean, novel_sem, \
        rep_sem, novel_trial_means, rep_trial_means
def delP(f, w, m, e, s, force):
    NEMD = read_log_nemd(f, w, m, e, s, force)
    # for now only pick the last 2 nanoseconds
    # should rerun with longer equilibration
    Pleft = NEMD[:, 8]
    Pright = NEMD[:, 9]

    # plot pressure to check convergence
    fig1 = plt.figure(figsize=(9, 7))
    ax1 = fig1.add_axes([0.1, 0.15, 0.8, 0.75])
    ax1.plot(NEMD[:, 0][::10], (NEMD[:, 8][::10] - NEMD[:, 9][::10]) * 1e-1)
    ax1.set_xlabel('Time')
    ax1.set_ylabel('$\Delta$P (MPa)')
    fig1.savefig('PLOTS/PDF/dP_{}_{}_{}_eps{}_s{}_f{}.pdf'.format(f, w, m, e, s, force))
    fig1.clear()

    Pleft_val = np.mean(Pleft)
    Pleft_err = stats.sem(Pleft)
    Pright_val = np.mean(Pright)
    Pright_err = stats.sem(Pright)
    deltaP = (Pright_val - Pleft_val) * 1e5
    deltaP_err = 1e5 * np.sqrt(Pleft_err**2 + Pright_err**2)
    print 'Pressure drop: ', deltaP * 1e-6, '+/-', deltaP_err * 1e-6, 'MPa.'
    return deltaP, deltaP_err
def get_data(self):
    directory = self.get_dir()
    shots = self.setup_dict['shots']
    if type(shots) == type([]):
        shots = ",".join(shots)
    keys = " ".join([self.setup_dict['X'], self.setup_dict['Y']])
    self.data, self.errmsg, self.raw_data = qrange.qrange(directory, shots, keys)
    s = ''
    for i in range(self.data.shape[1]):
        col = self.data[:, i]
        s00 = numpy.mean(col)
        s01 = stats.sem(col)
        s02 = numpy.std(col)
        s03 = numpy.max(col) - numpy.min(col)
        s = s + "Mean = %10.6f\n" % s00
        s = s + "Std. deviation = %10.6f\n" % s02
        s = s + "Std. Error of the mean = %10.6f\n" % s01
        s = s + "Pk-Pk = %10.6f\n" % s03
        s = s + '\n'
    self.raw_data = s + self.raw_data

    self.sdata = None
    if self.setup_dict['X'] == "SEQ:shot":
        s = [numpy.mean(self.data[:, 1]), numpy.std(self.data[:, 1]),
             stats.sem(self.data[:, 1]), numpy.max(self.data[:, 1]) - numpy.min(self.data[:, 1])]
        a = []
        for val in s:
            a.append([val for i in range(self.data[:, 1].size)])
        self.sdata = numpy.c_[self.data[:, 0], numpy.transpose(numpy.array(a))]
    else:
        self.sdata = statdat.statdat(self.data, 0, 1)
def partition_main(args):
    print(args, file=sys.stderr)
    base_prior = make_base_prior(args.het, GTYPE3)  # base genotype prior
    mm, mm0, mm1 = make_mut_matrix(args.mu, GTYPE3)  # substitution rate matrix, with non-diagonal set to 0, with diagonal set to 0

    vcffile, variants, DPRs, PLs = read_vcf(args.vcf, args.min_ev)
    n_site, n_smpl = PLs.shape[0:2]

    tree = Tree()
    if sem(PLs[..., 1], axis=1).mean() > sem(PLs[..., 2], axis=1).mean():
        partition(PLs[..., 0:2], tree, np.arange(n_smpl), args.min_ev)
    else:
        partition(PLs, tree, np.arange(n_smpl), args.min_ev)

    init_tree(tree)
    PLs = PLs.astype(np.longdouble)
    populate_tree_PL(tree, PLs, mm, 'PL')
    populate_tree_PL(tree, PLs, mm0, 'PL0')
    calc_mut_likelihoods(tree, mm0, mm1)

    print(tree)
    tree.write(outfile=args.output + '.pt0.nwk', format=5)

    best_tree, best_PL = recursive_NNI(tree, mm0, mm1, base_prior)
    best_tree, best_PL = recursive_reroot(best_tree, mm0, mm1, base_prior)
    print(best_tree)
    print('PL_per_site = %.4f' % (best_PL / n_site))
    best_tree.write(outfile=args.output + '.pt.nwk', format=5)
def count_totals_to_percents_weighted(count_totals):
    # Here's the return datatype that stores the percentage of occupancy
    # in a given channel/sf state which can be paired with the indices
    ion_count_percents = defaultdict(list)
    ion_count_indices = defaultdict(list)
    for traj_id, count_dict in count_totals.iteritems():
        traj_total_lines = float(sum(count_dict.values()))
        for ion_state, ion_count in count_dict.iteritems():
            ion_count_percents[traj_id].append(ion_count / traj_total_lines)
            ion_count_indices[traj_id].append(ion_state)

    # Append a little statistics, sorry if this is confusing...
    all_weighted_avgs = []
    weighted_avgs_by_occid = defaultdict(list)
    for traj_id, percents in ion_count_percents.iteritems():
        temp_weighted_avg = 0
        for occ_id, percent in enumerate(percents):
            x = ion_count_indices[traj_id][occ_id] * percent
            temp_weighted_avg += x
            weighted_avgs_by_occid[occ_id].append(x)
        all_weighted_avgs.append(temp_weighted_avg)

    for occ_id, weight_avg in weighted_avgs_by_occid.iteritems():
        ion_count_percents['MEAN'].append(mean(weight_avg))
        ion_count_indices['MEAN'].append(occ_id)
        ion_count_percents['STDERR'].append(sem(weight_avg))
        ion_count_indices['STDERR'].append(occ_id)

    ion_count_percents['MEAN'].append(mean(all_weighted_avgs))
    ion_count_indices['MEAN'].append('ALL')
    ion_count_percents['STDERR'].append(sem(all_weighted_avgs))
    ion_count_indices['STDERR'].append('ALL')

    return (dict(ion_count_percents), dict(ion_count_indices))
def get_phylo_depth_changes(fluc_levels, fluc_type, data_type):
    assert type(fluc_levels) == list
    assert type(fluc_type) == str
    assert fluc_type in ["sync", "stag", "lowhigh"]
    assert type(data_type) == str
    assert data_type in ["raw", "avg"]
    for fluc_level in fluc_levels:
        fluc_length = int(fluc_level)
        if data_type == "avg":
            start_slope_means = []
            start_slope_se = []
            end_slope_means = []
            end_slope_se = []
        else:
            start_slopes = [[] for i in range(30)]
            end_slopes = [[] for i in range(30)]
        for replicate in range(1, 31):
            avg_depth_for_updates = []
            start_inflow_slopes = []
            end_inflow_slopes = []
            averages_for_replicate = get_file_lines("../data_" + str(fluc_type) + "_" + str(fluc_level) + "/replicate_" + str(replicate) + "/average.dat")
            for line in averages_for_replicate:
                if len(line) != 0 and line[0] != "#":
                    temp = line.split(" ")
                    update = int(temp[0])
                    if update % fluc_length == 0:
                        depth = float(temp[11])
                        avg_depth_for_updates += [float(depth)]
            for i in range(len(avg_depth_for_updates) - 1):
                if i % 2 == 0:
                    start_inflow_slopes += [math.fabs(avg_depth_for_updates[i] - avg_depth_for_updates[i + 1])]
                else:
                    end_inflow_slopes += [math.fabs(avg_depth_for_updates[i] - avg_depth_for_updates[i + 1])]
            if data_type == "avg":
                start_slope_means += [stats.nanmean(start_inflow_slopes)]
                start_slope_se += [stats.sem(start_inflow_slopes)]
                end_slope_means += [stats.nanmean(end_inflow_slopes)]
                end_slope_se += [stats.sem(end_inflow_slopes)]
            else:
                start_slopes[replicate - 1] = list(start_inflow_slopes)
                end_slopes[replicate - 1] = list(end_inflow_slopes)
        if data_type == "avg":
            pickle.dump(start_slope_means, open("../plot_data/start_slope_mean_" + str(fluc_type) + "_" + str(fluc_level) + ".data", "wb"))
            pickle.dump(end_slope_means, open("../plot_data/end_slope_mean_" + str(fluc_type) + "_" + str(fluc_level) + ".data", "wb"))
            pickle.dump(start_slope_se, open("../plot_data/start_slope_se_" + str(fluc_type) + "_" + str(fluc_level) + ".data", "wb"))
            pickle.dump(end_slope_se, open("../plot_data/end_slope_se_" + str(fluc_type) + "_" + str(fluc_level) + ".data", "wb"))
        else:
            pickle.dump(start_slopes, open("../plot_data/start_slope_raw_" + str(fluc_type) + "_" + str(fluc_level) + ".data", "wb"))
            pickle.dump(end_slopes, open("../plot_data/end_slope_raw_" + str(fluc_type) + "_" + str(fluc_level) + ".data", "wb"))
    return "success"
def plot_avg_and_sem(npArray, axis=1):
    mean = npArray.mean(axis=axis)
    sem_plus = mean + stats.sem(npArray, axis=axis)
    sem_minus = mean - stats.sem(npArray, axis=axis)
    plt.figure()
    plt.fill_between(np.arange(mean.shape[0]), sem_plus, sem_minus, alpha=0.5)
    plt.plot(mean)
def semFinite(data, N):
    'Standard error of the mean with finite population correction'
    # print(N)
    # print(len(data))
    if len(data) < 0.05 * N:
        return sem(data)
    else:
        return sem(data) * ((N - len(data)) / (N - 1))**0.5
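# Hedged usage sketch for semFinite, added for illustration (not from the original
# source). It assumes `sem` is imported from scipy.stats at module level, as the
# snippet above does, and uses a made-up sample: when the sample is more than 5%
# of a finite population of size N, the plain SEM is scaled by the finite
# population correction sqrt((N - n) / (N - 1)).
def _example_semFinite():
    import numpy as np
    from scipy.stats import sem
    data = np.array([2.0, 2.5, 1.8, 2.2, 2.1])  # hypothetical sample (n = 5)
    N = 20                                       # hypothetical population size
    fpc = ((N - len(data)) / (N - 1)) ** 0.5
    corrected = semFinite(data, N)               # equals sem(data) * fpc here
    assert abs(corrected - sem(data) * fpc) < 1e-12
    return corrected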
def plot_cd_data(pre_arr, peri_arr, post_arr):
    # Custom function to draw the p-value bars
    def label_diff(i, j, text, X, Y):
        x = (X[i] + X[j]) / 2  ##center of the p-val bar
        y = max(Y[i], Y[j])
        props = {'connectionstyle': 'bar', 'arrowstyle': '-',
                 'shrinkA': 20, 'shrinkB': 20, 'lw': 2}
        ax.annotate(text, xy=(x, y + 0.1), zorder=10)
        ax.annotate('', xy=(X[i], y), xytext=(X[j], y), arrowprops=props)

    ##create a numpy array containing the mean vals for the bar chart
    means = np.array([pre_arr.mean(), peri_arr.mean(), post_arr.mean()])
    ##get the standard error values
    errs = np.array([stats.sem(pre_arr), stats.sem(peri_arr), stats.sem(post_arr)])
    ##calculate the p-values between each of the sets
    p_pre_peri = np.round(stats.ttest_rel(pre_arr, peri_arr)[1], 3)
    p_pre_post = np.round(stats.ttest_rel(pre_arr, post_arr)[1], 3)
    p_peri_post = np.round(stats.ttest_rel(peri_arr, post_arr)[1], 3)
    ##put all the arrays into one big array to plot the
    ##individual lines
    all_arr = np.zeros((3, pre_arr.size))
    all_arr[0, :] = pre_arr
    all_arr[1, :] = peri_arr
    all_arr[2, :] = post_arr
    ##formatting stuff
    idx = np.arange(3)  # the x locations for the groups
    width = 0.8
    labels = ('Pre', 'CD', 'Reinstatement')
    # Pull the formatting out here
    bar_kwargs = {'width': width, 'color': 'g', 'linewidth': 2, 'zorder': 5}
    err_kwargs = {'zorder': 0, 'fmt': None, 'lw': 2, 'ecolor': 'k'}
    X = idx + width / 2  ##position of the center of the bars
    fig, ax = plt.subplots()
    ax.p1 = plt.bar(idx, means, alpha=0.5, **bar_kwargs)
    ax.errs = plt.errorbar(X, means, yerr=errs, **err_kwargs)
    ##plot the individual lines on their own axis
    ax2 = ax.twinx()
    ax2.lines = plt.plot(np.linspace(0, 3, 3), all_arr)
    ax2.set_ylabel("Percent correct")
    # Call the function
    label_diff(0, 1, 'p=' + str(p_pre_peri), X, means)
    label_diff(0, 2, 'p=' + str(p_pre_post), X, means)
    label_diff(1, 2, 'p=' + str(p_peri_post), X, means)
    ax.set_ylim(ymax=means.max() + 0.3)
    plt.xticks(X, labels, color='k')
    plt.title("Performance during contingency degradation")
    ax.set_ylabel("Percent correct")
    plt.show()
def avg_scores_plot_multi(fit_results_no_CV, fit_results_CV, predictor_variable, CV_order):
    n_array_no_CV = []
    r2_array_no_CV = []
    rmse_array_no_CV = []
    n_array_CV = []
    r2_array_CV = []
    rmse_array_CV = []
    for key in fit_results_no_CV:
        value = fit_results_no_CV[key]
        i = 0
        sum_r2 = 0
        sum_rmse = 0
        while i < len(value):
            sum_r2 = float(sum_r2) + value[i][1]
            sum_rmse = float(sum_rmse) + value[i][2]
            i = i + 1
        avg_r2 = sum_r2 / len(value)
        avg_rmse = sum_rmse / len(value)
        n_array_no_CV.append(key)
        r2_array_no_CV.append(avg_r2)
        rmse_array_no_CV.append(avg_rmse)
        print 'For fit with All Data: For n = ' + str(key) + ': Average R^2 = ' + str(avg_r2) + ', Average RMSE = ' + str(avg_rmse)
    for key in fit_results_CV:
        value = fit_results_CV[key]
        i = 0
        sum_r2 = 0
        sum_rmse = 0
        while i < len(value):
            sum_r2 = float(sum_r2) + value[i][1]
            sum_rmse = float(sum_rmse) + value[i][2]
            i = i + 1
        avg_r2 = sum_r2 / len(value)
        avg_rmse = sum_rmse / len(value)
        n_array_CV.append(key)
        r2_array_CV.append(avg_r2)
        rmse_array_CV.append(avg_rmse)
        print 'For fit with CV: For n = ' + str(key) + ': Average R^2 = ' + str(avg_r2) + ', Average RMSE = ' + str(avg_rmse)

    # Plot Average Values of each against one another
    f, ax = plt.subplots()
    y_error_CV = stats.sem(rmse_array_CV, axis=None, ddof=4)
    y_error_no_CV = stats.sem(rmse_array_no_CV, axis=None, ddof=4)
    ax.errorbar(n_array_CV, rmse_array_CV, yerr=y_error_CV, fmt='o', color='r',
                label='Fit with ' + str(CV_order) + '-fold Cross Validation')
    ax.errorbar(n_array_no_CV, rmse_array_no_CV, yerr=y_error_no_CV, fmt='o', color='blue',
                label='Fit on All Data')
    ax.set_ylabel('Average RMSE Value')
    ax.xaxis.grid()
    ax.yaxis.grid()
    miny = min(rmse_array_no_CV) - 500
    maxy = max(rmse_array_no_CV) + 500
    plt.ylim([miny, maxy])
    plt.xlim([0, 6])
    plt.xlabel('Polynomial Fit Order')
    plt.title('Avg. RMSE in ' + str(CV_order) + '-fold Cross Validation and Polynomial Fit with All Data of '
              + str(predictor_variable) + ' vs No. 311 Incidents')
    plt.legend()
    plt.show()
def runQLearningTest():
    runs = 1500
    epochs = 500
    _epsilon = 0.1
    _alpha = 0.5
    _gamma = 0.9
    bandit = Explorer()
    fig, axs = plt.subplots(nrows=1, ncols=2, sharex=True)
    x = np.arange(epochs)

    ax = axs[0]
    accumulator = []
    errors = []
    y = []
    for i in range(runs):
        start = dt.datetime.now()
        accumulator.append(bandit.findPath(epochs, 0, _alpha, _gamma))
        print('Run 10-0 #' + str(i) + ' took ' + str((dt.datetime.now() - start).total_seconds()) + ' seconds')
    accumulator2 = np.array(accumulator)
    for i in range(len(accumulator[0])):
        errors.append(stats.sem(accumulator2[:, i], axis=None, ddof=0))
        y.append(np.mean(accumulator2[:, i]))
    ax.plot(x, y, 'b', label='Greedy', linewidth=2)
    ax.errorbar(x, y, yerr=errors, ecolor='g')  # , fmt='o')
    ax.legend(loc='upper left', shadow=True)
    ax.set_ylabel('Reward')
    ax.set_xlabel('Steps {0}, alpha {1}, gamma {2}'.format(epochs, _alpha, _gamma))

    # --------------------------------------------
    ax = axs[1]
    accumulator = []
    errors = []
    y = []
    for i in range(runs):
        start = dt.datetime.now()
        accumulator.append(bandit.findPath(epochs, _epsilon, _alpha, _gamma))
        print('Run 10-0 #' + str(i) + ' took ' + str((dt.datetime.now() - start).total_seconds()) + ' seconds')
    accumulator2 = np.array(accumulator)
    for i in range(len(accumulator[0])):
        errors.append(stats.sem(accumulator2[:, i] / 2, axis=None, ddof=0))
        y.append(np.mean(accumulator2[:, i]))
    ax.plot(x, y, 'r--', label='e-Greedy', linewidth=2)
    ax.errorbar(x, y, yerr=errors, ecolor='r')  # , fmt='o')
    ax.legend(loc='upper left', shadow=True)
    ax.set_xlabel('Steps {0}, alpha {1}, gamma {2}'.format(epochs, _alpha, _gamma))

    _title = 'Q-Learning - Greedy vs e-Greedy (e = {0} - {1} Runs)'.format(_epsilon, runs)
    fig.suptitle(_title)
    plt.show()
def make_plot_fig2(sim_results):
    rms_lc = sim_results[0, 2]
    lam_lc = sim_results[0, 0]
    rms_cv = sim_results[1, 2]
    lam_cv = sim_results[1, 0]
    fig = plt.figure(figsize=(12, 12), dpi=300)
    widths = [10]
    heights = [1, 1]
    gs = gridspec.GridSpec(2, 1, height_ratios=heights, width_ratios=widths, hspace=0.45, wspace=0.3)

    ax1 = plt.subplot(gs[0])
    if np.min(rms_cv) < np.min(rms_lc):
        trans = np.min(np.mean(rms_cv, axis=0))
    else:
        trans = np.min(np.mean(rms_lc, axis=0))
    mn_rms = np.mean(rms_lc, axis=0) - trans
    st_rms = st.sem(rms_lc, axis=0)
    plt.plot(noise_lvl, mn_rms, marker='o', color='blue', label='L-curve')
    plt.fill_between(noise_lvl, mn_rms - st_rms, mn_rms + st_rms, alpha=0.3, color='blue')
    mn_rms = np.mean(rms_cv, axis=0) - trans
    st_rms = st.sem(rms_cv, axis=0)
    plt.plot(noise_lvl, mn_rms, marker='o', color='green', label='cross-validation')
    plt.fill_between(noise_lvl, mn_rms - st_rms, mn_rms + st_rms, alpha=0.3, color='green')
    plt.ylabel('Estimation error')
    plt.xlabel('Relative noise level')
    ax1.spines['right'].set_visible(False)
    ax1.spines['top'].set_visible(False)
    set_axis(ax1, -0.05, 1.05, letter='A')
    ht, lh = ax1.get_legend_handles_labels()
    fig.legend(ht, lh, loc='center', ncol=2, frameon=False)

    '''second plot'''
    ax2 = plt.subplot(gs[1])
    mn_lam = np.mean(lam_lc, axis=0)
    st_lam = st.sem(lam_lc, axis=0)
    plt.plot(noise_lvl, mn_lam, marker='o', color='blue', label='L-curve')
    plt.fill_between(noise_lvl, mn_lam - st_lam, mn_lam + st_lam, alpha=0.3, color='blue')
    mn_lam = np.mean(lam_cv, axis=0)
    st_lam = st.sem(lam_cv, axis=0)
    plt.plot(noise_lvl, mn_lam, marker='o', color='green', label='cross-validation')
    plt.fill_between(noise_lvl, mn_lam - st_lam, mn_lam + st_lam, alpha=0.3, color='green')
    # ax2.set_yscale('log')
    ax2.ticklabel_format(style='sci', axis='y', scilimits=(0.0, 0.0))
    plt.ylabel('Lambda')
    plt.xlabel('Relative noise level')
    set_axis(ax2, -0.05, 1.05, letter='B')
    ht, lh = ax2.get_legend_handles_labels()
    fig.legend(ht, lh, loc='lower center', ncol=2, frameon=False)
    ax2.spines['right'].set_visible(False)
    ax2.spines['top'].set_visible(False)
    fig.savefig('stats.jpg')
def get_stats_all(dates, data, ydate):
    ix = []
    iy = []
    for i, date in enumerate(dates):
        if date == ydate:
            iy.append(i)
        else:
            ix.append(i)
    x0 = data[:, ix]
    x = x0.mean(axis=1)
    y0 = data[:, iy]
    y = y0.mean(axis=1)
    xe = st.sem(x0, axis=1)
    ye = st.sem(y0, axis=1)
    return x, y, xe, ye, x0, y0
def plot_fr_means(arrs1, arrs2, chunk1=(0, 10), chunk2=(35, 45), n=None):
    ##grab the specified chunks
    arrs1_early = arrs1[:, chunk1[0]*60*1000:chunk1[1]*60*1000]
    arrs1_late = arrs1[:, chunk2[0]*60*1000:chunk2[1]*60*1000]
    arrs2_early = arrs2[:, chunk1[0]*60*1000:chunk1[1]*60*1000]
    arrs2_late = arrs2[:, chunk2[0]*60*1000:chunk2[1]*60*1000]
    ##calculate the means across all the arrays
    means = np.array([arrs1_early.mean(), arrs2_early.mean(),
                      arrs1_late.mean(), arrs2_late.mean()])*1000
    ##get the across session means
    m_arrs1_early = arrs1_early.mean(axis=1)*1000
    m_arrs2_early = arrs2_early.mean(axis=1)*1000
    m_arrs1_late = arrs1_late.mean(axis=1)*1000
    m_arrs2_late = arrs2_late.mean(axis=1)*1000
    ##get an array of SEM measurements for the error bars
    errs = np.array([stats.sem(m_arrs1_early, axis=None), stats.sem(m_arrs2_early, axis=None),
                     stats.sem(m_arrs1_late, axis=None), stats.sem(m_arrs2_late, axis=None)])
    ##calculate the t-tests
    p_e1s = stats.ttest_rel(m_arrs1_early, m_arrs1_late)
    p_e2s = stats.ttest_rel(m_arrs2_early, m_arrs2_late)
    p_e12_early = stats.ttest_rel(m_arrs1_early, m_arrs2_early)
    p_e12_late = stats.ttest_rel(m_arrs1_late, m_arrs2_late)
    ##print the ttest results
    print "p_e1s = " + str(p_e1s)
    print "p_e2s = " + str(p_e2s)
    print "p_e12_early = " + str(p_e12_early)
    print "p_e12_late = " + str(p_e12_late)
    ##plot the bar graph
    ##formatting stuff
    idx = np.arange(4)  # the x locations for the groups
    width = 0.8
    labels = ('E1 early', 'E2_early', 'E1_late', 'E2_late')
    # Pull the formatting out here
    bar_kwargs = {'width': width, 'color': 'g', 'linewidth': 2, 'zorder': 5}
    err_kwargs = {'zorder': 0, 'fmt': None, 'lw': 2, 'ecolor': 'k'}
    X = idx + width/2  ##position of the center of the bars
    fig, ax = plt.subplots()
    ax.p1 = plt.bar(idx, means, alpha=0.5, **bar_kwargs)
    ax.errs = plt.errorbar(X, means, yerr=errs, **err_kwargs)
    ax.set_ylim(ymax=means.max() + means.max()/6.0)
    plt.xticks(X, labels, color='k')
    plt.title("Average firing rate within sessions")
    ax.set_ylabel("FR (Hz)")
    if n is not None:
        plt.text(0.2, means.max() + means.max()/10, "n= " + str(n) + " sessions")
    plt.show()
def calculate_tuning_curve_inputs(spikeTimeStamps, eventOnsetTimes, firstSort, secondSort, timeRange,
                                  baseRange=[-1.1, -0.1], errorType='sem', info='full'):
    fullTimeRange = [min(min(timeRange), min(baseRange)), max(max(timeRange), max(baseRange))]
    numFirst = np.unique(firstSort)
    numSec = np.unique(secondSort)
    duration = timeRange[1] - timeRange[0]
    spikeArray = np.zeros((len(numFirst), len(numSec)))
    errorArray = np.zeros_like(spikeArray)
    trialsEachCond = behavioranalysis.find_trials_each_combination(firstSort, numFirst, secondSort, numSec)
    spikeTimesFromEventOnset, trialIndexForEachSpike, indexLimitsEachTrial = spikesanalysis.eventlocked_spiketimes(
        spikeTimeStamps, eventOnsetTimes, fullTimeRange)
    spikeCountMat = spikesanalysis.spiketimes_to_spikecounts(spikeTimesFromEventOnset, indexLimitsEachTrial, timeRange)
    baseSpikeCountMat = spikesanalysis.spiketimes_to_spikecounts(spikeTimesFromEventOnset, indexLimitsEachTrial, baseRange)

    baselineSpikeRate = np.mean(baseSpikeCountMat) / (baseRange[1] - baseRange[0])
    if errorType == 'sem':
        baselineError = stats.sem(baseSpikeCountMat) / (baseRange[1] - baseRange[0])
    elif errorType == 'std':
        baselineError = np.std(baseSpikeCountMat) / (baseRange[1] - baseRange[0])

    for sec in range(len(numSec)):
        trialsThisSec = trialsEachCond[:, :, sec]
        for first in range(len(numFirst)):
            trialsThisFirst = trialsThisSec[:, first]
            if spikeCountMat.shape[0] != len(trialsThisFirst):
                spikeCountMat = spikeCountMat[:-1, :]
            if any(trialsThisFirst):
                thisFirstCounts = spikeCountMat[trialsThisFirst].flatten()
                spikeArray[first, sec] = np.mean(thisFirstCounts) / duration
                if errorType == 'sem':
                    errorArray[first, sec] = stats.sem(thisFirstCounts) / duration
                elif errorType == 'std':
                    errorArray[first, sec] = np.std(thisFirstCounts) / duration
            else:
                spikeArray[first, sec] = np.nan
                errorArray[first, sec] = np.nan

    if info == 'full':
        tuningDict = {'responseArray': spikeArray,
                      'errorArray': errorArray,
                      'baselineSpikeRate': baselineSpikeRate,
                      'baselineSpikeError': baselineError,
                      'spikeCountMat': spikeCountMat,
                      'trialsEachCond': trialsEachCond}
    elif info == 'plotting':
        tuningDict = {'responseArray': spikeArray,
                      'errorArray': errorArray,
                      'baselineSpikeRate': baselineSpikeRate,
                      'baselineSpikeError': baselineError}
    else:
        raise NameError('That is not an info type you degenerate')
    return tuningDict
def AnalyzeResultInfo(modelPredicts, testObs, reflectObs):
    print "FULL SET"
    sumInfo = DoSummaryInfo(testObs, modelPredicts)
    print "RMSE: %8.4f %8.4f" % (numpy.mean(sumInfo['rmse']), ss.sem(sumInfo['rmse']))
    print "MAE : %8.4f %8.4f" % (numpy.mean(sumInfo['mae']), ss.sem(sumInfo['mae']))
    print "CORR: %8.4f %8.4f" % (numpy.mean(sumInfo['corr']), ss.sem(sumInfo['corr']))

    print "\nZ < 40"
    belowCondition = reflectObs < 40
    belowSumInfo = DoSummaryInfo(numpy.where(belowCondition, testObs, numpy.NaN),
                                 numpy.where(belowCondition, modelPredicts, numpy.NaN))
    print "RMSE: %8.4f %8.4f" % (numpy.mean(belowSumInfo['rmse']), ss.sem(belowSumInfo['rmse']))
    print "MAE : %8.4f %8.4f" % (numpy.mean(belowSumInfo['mae']), ss.sem(belowSumInfo['mae']))
    print "CORR: %8.4f %8.4f" % (numpy.mean(belowSumInfo['corr']), ss.sem(belowSumInfo['corr']))

    print "\nZ >= 40"
    aboveSumInfo = DoSummaryInfo(numpy.where(belowCondition, numpy.NaN, testObs),
                                 numpy.where(belowCondition, numpy.NaN, modelPredicts))
    print "RMSE: %8.4f %8.4f" % (numpy.mean(aboveSumInfo['rmse']), ss.sem(aboveSumInfo['rmse']))
    print "MAE : %8.4f %8.4f" % (numpy.mean(aboveSumInfo['mae']), ss.sem(aboveSumInfo['mae']))
    print "CORR: %8.4f %8.4f" % (numpy.mean(aboveSumInfo['corr']), ss.sem(aboveSumInfo['corr']))
def plot_proportions(us, shortcuts, novels, savepath, savefig=True):
    """Plots proportion of each trajectory taken. Behavior only.

    Parameters
    ----------
    us : list of floats
        Proportion along the u trajectory for each session.
        len(us) == num_sessions evaluated
    shortcuts : list of floats
        Proportion along the shortcut trajectory for each session.
        len(shortcut) == num_sessions evaluated
    novels : list of floats
        Proportion along the novel trajectory for each session.
        len(novel) == num_sessions evaluated
    savepath : str
        Location and filename for the saved plot.
    savefig : boolean
        Default is True and will save the plot to the specified location.
        False shows the plot without saving it.

    """
    all_us = np.mean(us)
    us_sem = stats.sem(us)
    all_shortcuts = np.mean(shortcuts)
    shortcuts_sem = stats.sem(shortcuts)
    all_novels = np.mean(novels)
    novels_sem = stats.sem(novels)

    n_groups = list(range(3))
    colour = ['#5975a4', '#5f9e6e', '#b55d5f']
    data = [all_us, all_shortcuts, all_novels]
    sems = [us_sem, shortcuts_sem, novels_sem]

    fig = plt.figure()
    ax = fig.add_subplot(111)
    for i in list(range(len(data))):
        ax.bar(n_groups[i], data[i], align='center', yerr=sems[i], color=colour[i], ecolor='#525252')

    plt.xlabel('(sessions=' + str(len(us)) + ')')
    plt.ylabel('Proportion of trials')
    sns.despine()
    ax.yaxis.set_ticks_position('left')
    ax.xaxis.set_ticks_position('bottom')
    plt.xticks(n_groups, ['U', 'Shortcut', 'Novel'])
    # plt.tight_layout()

    if savefig:
        plt.savefig(savepath, dpi=300, bbox_inches='tight')
        plt.close()
    else:
        plt.show()
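# Hypothetical usage sketch for plot_proportions, added for illustration (not part
# of the original code). The per-session proportions below are made up, and it
# assumes numpy, scipy.stats, matplotlib and seaborn are imported as the function
# above expects; savefig=False just shows the figure instead of writing the file.
def _example_plot_proportions():
    us = [0.60, 0.55, 0.70]
    shortcuts = [0.30, 0.35, 0.20]
    novels = [0.10, 0.10, 0.10]
    plot_proportions(us, shortcuts, novels, savepath='proportions.png', savefig=False)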
def _sta_by_event_cond(spike_rel_times, phase_bin_start, phase_bin_stop, sta_buffer, eeg, filtered_eeg,
                       events, h_file=None):
    valid_samps = np.where((eeg.time > phase_bin_start) & (eeg.time < phase_bin_stop))[0]
    nsamples = int(np.ceil(float(eeg['samplerate']) * sta_buffer))

    # throw out novel items that were never repeated
    good_events = events[~((events['isFirst']) & (events['lag'] == 0))].index.values

    # loop over each event
    stas = []
    stas_filt = []
    is_novel = []
    for (index, e), spikes, eeg_data_event, eeg_filt_data_event in zip(events.iterrows(), spike_rel_times,
                                                                       eeg, filtered_eeg):
        if index in good_events:
            if len(spikes) > 0:
                valid_spikes = spikes[np.in1d(spikes, valid_samps)]
                if len(valid_spikes) > 0:
                    for this_spike in valid_spikes:
                        stas.append(eeg_data_event[this_spike - nsamples:this_spike + nsamples].data)
                        stas_filt.append(eeg_filt_data_event[this_spike - nsamples:this_spike + nsamples].data)
                        is_novel.append(e.isFirst)
    is_novel = np.array(is_novel)

    # sta by condition for raw eeg
    if len(stas) > 0:
        stas = np.stack(stas)
        novel_sta_mean = stas[is_novel].mean(axis=0)
        novel_sta_sem = sem(stas[is_novel], axis=0)
        rep_sta_mean = stas[~is_novel].mean(axis=0)
        rep_sta_sem = sem(stas[~is_novel], axis=0)
        sta_time = np.linspace(-sta_buffer, sta_buffer, novel_sta_mean.shape[0])

        # sta by condition for filtered eeg
        stas_filt = np.stack(stas_filt)
        novel_sta_filt_mean = stas_filt[is_novel].mean(axis=0)
        novel_sta_filt_sem = sem(stas_filt[is_novel], axis=0)
        rep_sta_filt_mean = stas_filt[~is_novel].mean(axis=0)
        rep_sta_filt_sem = sem(stas_filt[~is_novel], axis=0)

        if h_file is not None:
            add_to_hd5f_file(h_file, 'novel_sta_mean', novel_sta_mean)
            add_to_hd5f_file(h_file, 'novel_sta_sem', novel_sta_sem)
            add_to_hd5f_file(h_file, 'rep_sta_mean', rep_sta_mean)
            add_to_hd5f_file(h_file, 'rep_sta_sem', rep_sta_sem)
            add_to_hd5f_file(h_file, 'novel_sta_filt_mean', novel_sta_filt_mean)
            add_to_hd5f_file(h_file, 'novel_sta_filt_sem', novel_sta_filt_sem)
            add_to_hd5f_file(h_file, 'rep_sta_filt_mean', rep_sta_filt_mean)
            add_to_hd5f_file(h_file, 'rep_sta_filt_sem', rep_sta_filt_sem)
            add_to_hd5f_file(h_file, 'sta_time', sta_time)
        else:
            return novel_sta_mean, novel_sta_sem, rep_sta_mean, rep_sta_sem, novel_sta_filt_mean, \
                novel_sta_filt_sem, rep_sta_filt_mean, rep_sta_filt_sem, sta_time
def post_process():
    pattern = re.compile('lls_([0-9]+)_[0-9]+\.npz')
    data_dir = 'data'
    files = [join(data_dir, f) for f in listdir(data_dir)
             if isfile(join(data_dir, f)) and re.match(pattern, f)]
    all_lls = [np.load(f)['lls'] for f in files]
    all_lls = np.array(all_lls)
    avg_lls = np.average(all_lls, axis=0)
    lls_sem = 1.96 * stats.sem(all_lls)
    x = np.arange(all_lls.shape[1])
    plt.plot(x, avg_lls, linestyle='-.', color='b', label='PB_ungibbs')
    plt.fill_between(x, avg_lls - lls_sem, avg_lls + lls_sem, color='b', alpha=0.3)
    plt.xlabel('Iterations')
    plt.ylabel('Log Likelihood')
    plt.title('Average Log Likelihood and 95% confidence interval (normal)')

    '''
    pattern2 = re.compile('lls_amcmc_([0-9]+)_[0-9]+\.npz')
    files = [join(data_dir, f) for f in listdir(data_dir)
             if isfile(join(data_dir, f)) and re.match(pattern2, f)]
    all_lls = [np.load(f)['lls'] for f in files]
    all_lls = np.array(all_lls)
    avg_lls = np.average(all_lls, axis=0)
    lls_sem = 1.96 * stats.sem(all_lls)
    x = np.arange(all_lls.shape[1])
    plt.plot(x, avg_lls, linestyle='-.', color='g', label='PB_amcmc')
    plt.fill_between(x, avg_lls - lls_sem, avg_lls + lls_sem, color='g', alpha=0.3)
    '''

    # Gibbs ll
    gibbs_lls = np.array(pd.read_csv(join(data_dir, 'gibbs_ll'), sep=' ', header=None)).T
    avg_gibbs_lls = np.average(gibbs_lls, axis=0)
    gibbs_lls_sem = 1.96 * stats.sem(gibbs_lls)
    x_gibbs = np.arange(gibbs_lls.shape[1])
    plt.plot(x_gibbs, avg_gibbs_lls, linestyle='-', color='r', label='Collapsed Gibbs')
    plt.fill_between(x_gibbs, avg_gibbs_lls - gibbs_lls_sem, avg_gibbs_lls + gibbs_lls_sem, color='r', alpha=0.3)

    # VEM ll
    #vem_lls = np.array(pd.read_csv(join(data_dir, 'vem_ll'),
    #                               sep=' ', header=None)).T
    #avg_vem_lls = np.average(vem_lls, axis=0)
    #vem_lls_sem = 1.96*stats.sem(vem_lls)
    #x_vem = np.arange(vem_lls.shape[1])
    #plt.plot(x_vem, avg_vem_lls, label='VEM')
    #plt.fill_between(x_vem, avg_vem_lls-vem_lls_sem,
    #                 avg_vem_lls+vem_lls_sem, alpha=0.3)

    # save plot
    plt.legend(loc='lower right')
    plt.savefig('data/lls.pdf', format='pdf')
def plot_avg_and_sem(npArray, axis=1):
    """This routine takes a multidimensional numpy array and an axis and then plots
    the average over that axis on top of a band that represents the standard error
    of the mean.
    """
    mean = npArray.mean(axis=axis)
    sem_plus = mean + stats.sem(npArray, axis=axis)
    sem_minus = mean - stats.sem(npArray, axis=axis)
    plt.figure()
    plt.fill_between(np.arange(mean.shape[0]), sem_plus, sem_minus, alpha=0.5)
    plt.plot(mean)
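# Minimal sketch of calling plot_avg_and_sem on synthetic data, added for
# illustration (not from the original source); it assumes numpy (np), scipy.stats
# (stats) and matplotlib.pyplot (plt) are imported as the function above expects.
def _example_plot_avg_and_sem():
    rng = np.random.default_rng(0)
    traces = rng.normal(size=(100, 20))  # 100 timepoints x 20 trials of fake data
    plot_avg_and_sem(traces, axis=1)     # mean over the 20 trials, +/- SEM band
    plt.show()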
def display_grid_scores(grid_scores, top=None):
    """Helper function to format a report on a grid of scores"""
    grid_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)
    if top is not None:
        grid_scores = grid_scores[:top]

    # mark any candidate whose mean score + 2 SEM reaches within 2 SEM of the best mean
    _, best_mean, best_scores = grid_scores[0]
    threshold = best_mean - 2 * sem(best_scores)

    for params, mean_score, scores in grid_scores:
        append_star = mean_score + 2 * sem(scores) > threshold
        print(display_scores(params, scores, append_star=append_star))
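# Hedged sketch of the input display_grid_scores expects, added for illustration
# (not from the original source). Each entry mimics the (params, mean_score,
# per-fold scores) triples of scikit-learn's old ``grid_scores_`` attribute; the
# parameter values are made up, and ``display_scores`` is assumed to be defined
# alongside the function above.
def _example_display_grid_scores():
    grid_scores = [
        ({'C': 1.0}, 0.81, np.array([0.80, 0.82, 0.81])),
        ({'C': 10.0}, 0.84, np.array([0.83, 0.85, 0.84])),
    ]
    display_grid_scores(grid_scores, top=2)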
def dump_pre_post_stim_firing_rate(ffname, outprefix, window=10e-3):
    """Dump mean, median and standard deviation in population spike
    before and after stimulus.
    """
    dbcnt_flist_dict = get_dbcnt_dict(ffname)
    celltype_data_dict = defaultdict(list)
    for dbcnt, flist in dbcnt_flist_dict.items():
        for fname in flist:
            data = TraubData(makepath(fname))
            bgtimes, probetimes = get_stim_times(data, correct_tcr=True)
            times = np.concatenate((bgtimes, probetimes))
            times.sort()
            spiketrains = defaultdict(list)
            for cell, train in data.spikes.items():
                celltype = cell.partition('_')[0]
                spiketrains[celltype].append(train)
            for celltype, trains in spiketrains.items():
                popspikes = np.concatenate(trains)
                popspikes.sort()
                pre = []
                post = []
                for t in times:
                    npre = np.flatnonzero((popspikes <= t) & (popspikes > (t - window / 2))).shape[0]
                    pre.append(npre / (data.cellcounts._asdict()[celltype] * window / 2.0))
                    npost = np.flatnonzero((popspikes > t) & (popspikes < (t + window / 2))).shape[0]
                    post.append(npost / (data.cellcounts._asdict()[celltype] * window / 2.0))
                dstats = {
                    'filename': fname,
                    'dbcount': dbcnt,
                    'premean': np.mean(pre),
                    'premedian': np.median(pre),
                    'prestd': np.std(pre),
                    'presem': stats.sem(pre),
                    'postmean': np.mean(post),
                    'postmedian': np.median(post),
                    'poststd': np.std(post),
                    'postsem': stats.sem(post),
                    'nstim': len(times)}
                celltype_data_dict[celltype].append(dstats)
    for celltype, datalist in celltype_data_dict.items():
        df = pd.DataFrame(datalist,
                          columns=['filename', 'dbcount',
                                   'premean', 'premedian', 'prestd', 'presem',
                                   'postmean', 'postmedian', 'poststd', 'postsem',
                                   'nstim'])
        outfile = '{}_prepost_rates_{}_{}ms_window.csv'.format(outprefix, celltype, window * 1e3)
        df.to_csv(outfile)
def bin_and_mean(xdata, ydata, bins=10, distribution='normal', show_fig=True, fig=None, ax=None, figsize=None, dpi=100, show_bins=True, raw_data_label='raw data', mean_data_label='average', xlabel=None, ylabel=None, logx=False, logy=False, grid_on=True, error_bounds=True, err_bound_type='shade', legend_on=True, subsamp_thres=None, show_stats=True, show_SE=False, err_bound_shade_opacity=0.5): ''' Calculate the "bin-and-mean" results and optionally show the "bin-and-mean" plot. A "bin-and-mean" plot is a more salient way to show the dependency of ``ydata`` on ``xdata``. The data points (``xdata``, ``ydata``) are divided into different bins according to the values in ``xdata`` (via ``bins``), and within each bin, the mean values of x and y are calculated, and treated as the representative x and y values. "Bin-and-mean" is preferred when data points are highly skewed (e.g., a lot of data points for when x is small, but very few for large x). The data points when x is large are usually not noises, and could be even more valuable (think of the case where x is earthquake magnitude and y is the related economic loss). If we want to study the relationship between economic loss and earthquake magnitude, we need to bin-and-mean raw data and draw conclusions from the mean data points. The theory that enables this method is the assumption that the data points with similar x values follow the same distribution. Naively, we assume the data points are normally distributed, then y_mean is the arithmetic mean of the data points within a bin. We also often assume the data points follow log-normal distribution (if we want to assert that y values are all positive), then y_mean is the expected value of the log-normal distribution, while x_mean for any bins are still just the arithmetic mean. Notes: (1) For log-normal distribution, the expective value of y is: E(Y) = exp(mu + (1/2)*sigma^2) and the variance is: Var(Y) = [exp(sigma^2) - 1] * exp(2*mu + sigma^2) where mu and sigma are the two parameters of the distribution. (2) Knowing E(Y) and Var(Y), mu and sigma can be back-calculated:: ___________________ mu = ln[ E(Y) / V 1 + Var(Y)/E^2(Y) ] _________________________ sigma = V ln[ 1 + Var(Y)/E^2(Y) ] (Reference: https://en.wikipedia.org/wiki/Log-normal_distribution) Parameters ---------- xdata : list, numpy.ndarray, or pandas.Series X data. ydata : list, numpy.ndarray, or pandas.Series Y data. bins : int, list, numpy.ndarray, or pandas.Series Number of bins (an integer), or an array representing the actual bin edges. If ``bins`` means bin edges, the edges are inclusive on the lower bound, e.g., a value 2 shall fall into the bin [2, 3), but not the bin [1, 2). Note that the binning is done according to the X values. distribution : {'normal', 'lognormal'} Specifies which distribution the Y values within a bin follow. Use 'lognormal' if you want to assert all positive Y values. Only supports normal and log-normal distributions at this time. show_fig : bool Whether or not to show a bin-and-mean plot. fig : matplotlib.figure.Figure or ``None`` Figure object. If None, a new figure will be created. ax : matplotlib.axes._subplots.AxesSubplot or ``None`` Axes object. If None, a new axes will be created. figsize: (float, float) Figure size in inches, as a tuple of two numbers. The figure size of ``fig`` (if not ``None``) will override this parameter. dpi : float Figure resolution. The dpi of ``fig`` (if not ``None``) will override this parameter. 
show_bins : bool Whether or not to show the bin edges as vertical lines on the plots. raw_data_label : str The label name of the raw data to be shown in the legend (such as "raw data"). It has no effects if ``show_legend`` is ``False``. mean_data_label : str The label name of the mean data to be shown in the legend (such as "averaged data"). It has no effects if ``show_legend`` is ``False``. xlabel : str or ``None`` X axis label. If ``None`` and ``xdata`` is a pandas Series, use ``xdata``'s "name" attribute as ``xlabel``. ylabel : str of ``None`` Y axis label. If ``None`` and ``ydata`` is a pandas Series, use ``ydata``'s "name" attribute as ``ylabel``. logx : bool Whether or not to show the X axis in log scale. logy : bool Whether or not to show the Y axis in log scale. grid_on : bool Whether or not to show grids on the plot. error_bounds : bool Whether or not to show error bounds of each bin. err_bound_type : {'shade', 'bar'} Type of error bound: shaded area or error bars. It has no effects if error_bounds is set to ``False``. legend_on : bool Whether or not to show a legend. subsamp_thres : int A positive integer that defines the number of data points in each bin to show in the scatter plot. The smaller this number, the faster the plotting process. If larger than the number of data points in a bin, then all data points from that bin are plotted. If ``None``, then all data points from all bins are plotted. show_stats : bool Whether or not to show R^2 scores, correlation coefficients of the raw data and the binned averages on the plot. show_SE : bool If ``True``, show the standard error of y_mean (orange dots) of each bin as the shaded area beneath the mean value lines. If ``False``, show the standard deviation of raw Y values (gray dots) within each bin. err_bound_shade_opacity : float The opacity of the shaded area representing the error bound. 0 means completely transparent, and 1 means completely opaque. It has no effect if ``error_bound_type`` is ``'bar'``. Returns ------- fig : matplotlib.figure.Figure The figure object being created or being passed into this function. ``None``, if ``show_fig`` is set to ``False``. ax : matplotlib.axes._subplots.AxesSubplot The axes object being created or being passed into this function. ``None``, if ``show_fig`` is set to ``False``. x_mean : numpy.ndarray Mean X values of each data bin (in terms of X values). y_mean : numpy.ndarray Mean Y values of each data bin (in terms of X values). y_std : numpy.ndarray Standard deviation of Y values or each data bin (in terms of X values). y_SE : numpy.ndarray Standard error of ``y_mean``. It describes how far ``y_mean`` is from the population mean (or the "true mean value") within each bin, which is a different concept from ``y_std``. See https://en.wikipedia.org/wiki/Standard_error#Standard_error_of_mean_versus_standard_deviation for further information. stats_ : tuple<float> A tuple in the order of (r2_score_raw, corr_coeff_raw, r2_score_binned, corr_coeff_binned), which are the R^2 score and correlation coefficient of the raw data (``xdata`` and ``ydata``) and the binned averages (``x_mean`` and ``y_mean``). 
''' if not isinstance(xdata, hlp._array_like) or not isinstance( ydata, hlp._array_like): raise TypeError('`xdata` and `ydata` must be lists, numpy arrays, ' 'or pandas Series.') if len(xdata) != len(ydata): raise hlp.LengthError('`xdata` and `ydata` must have the same length.') if isinstance(xdata, list): xdata = np.array(xdata) # otherwise boolean if isinstance(ydata, list): ydata = np.array(ydata) # indexing won't work #------------Pre-process "bins"-------------------------------------------- if isinstance(bins, (int, np.integer)): # if user specifies number of bins if bins <= 0: raise ValueError('`bins` must be a positive integer.') else: nr = bins + 1 # create bins with percentiles in xdata x_uni = np.unique(xdata) bins = [ np.nanpercentile(x_uni, (j + 0.) / bins * 100) for j in range(nr) ] if not all(x <= y for x, y in zip(bins, bins[1:]) ): # https://stackoverflow.com/a/4983359/8892243 print( '\nWARNING: Resulting "bins" array is not monotonically ' 'increasing. Please use a smaller "bins" to avoid potential ' 'issues.\n') elif isinstance(bins, (list, np.ndarray)): # if user specifies array nr = len(bins) else: raise TypeError('`bins` must be either an integer or an array.') #-----------Pre-process xlabel and ylabel---------------------------------- if not xlabel and isinstance(xdata, pd.Series): # xdata has 'name' attr xlabel = xdata.name if not ylabel and isinstance(ydata, pd.Series): # ydata has 'name' attr ylabel = ydata.name #-----------Group data into bins------------------------------------------- inds = np.digitize(xdata, bins) x_mean = np.zeros(nr - 1) y_mean = np.zeros(nr - 1) y_std = np.zeros(nr - 1) y_SE = np.zeros(nr - 1) x_subs = [] # subsampled x data (for faster scatter plots) y_subs = [] for j in range(nr - 1): # loop over every bin x_in_bin = xdata[inds == j + 1] y_in_bin = ydata[inds == j + 1] #------------Calculate mean and std------------------------------------ if len(x_in_bin) == 0: # no point falls into current bin x_mean[j] = np.nan # this is to prevent numpy from throwing... y_mean[j] = np.nan #...confusing warning messages y_std[j] = np.nan y_SE[j] = np.nan else: x_mean[j] = np.nanmean(x_in_bin) if distribution == 'normal': y_mean[j] = np.nanmean(y_in_bin) y_std[j] = np.nanstd(y_in_bin) y_SE[j] = stats.sem(y_in_bin) elif distribution == 'lognormal': s, loc, scale = stats.lognorm.fit(y_in_bin, floc=0) estimated_mu = np.log(scale) estimated_sigma = s y_mean[j] = np.exp(estimated_mu + estimated_sigma**2.0 / 2.0) y_std[j] = np.sqrt(np.exp(2.*estimated_mu + estimated_sigma**2.) \ * (np.exp(estimated_sigma**2.) - 1) ) y_SE[j] = y_std[j] / np.sqrt(len(y_in_bin)) else: raise ValueError("Valid values of `distribution` are " "{'normal', 'lognormal'}. Not '%s'." % distribution) #------------Pick subsets of data, for faster plotting----------------- #------------Note that this does not affect mean and std--------------- if subsamp_thres is not None and show_fig: if not isinstance(subsamp_thres, (int, np.integer)) or subsamp_thres <= 0: raise TypeError( '`subsamp_thres` must be a positive integer or None.') if len(x_in_bin) > subsamp_thres: x_subs.extend( np.random.choice(x_in_bin, subsamp_thres, replace=False)) y_subs.extend( np.random.choice(y_in_bin, subsamp_thres, replace=False)) else: x_subs.extend(x_in_bin) y_subs.extend(y_in_bin) #-------------Calculate R^2 and corr. 
coeff.------------------------------- non_nan_indices = ~np.isnan(xdata) & ~np.isnan(ydata) xdata_without_nan = xdata[non_nan_indices] ydata_without_nan = ydata[non_nan_indices] r2_score_raw = hlp._calc_r2_score( ydata_without_nan, xdata_without_nan) # treat "xdata" as "y_pred" corr_coeff_raw = np.corrcoef(xdata_without_nan, ydata_without_nan)[0, 1] r2_score_binned = hlp._calc_r2_score(y_mean, x_mean) corr_coeff_binned = np.corrcoef(x_mean, y_mean)[0, 1] stats_ = (r2_score_raw, corr_coeff_raw, r2_score_binned, corr_coeff_binned) #-------------Plot data on figure------------------------------------------ if show_fig: fig, ax = hlp._process_fig_ax_objects(fig, ax, figsize, dpi) if subsamp_thres: xdata, ydata = x_subs, y_subs ax.scatter(xdata, ydata, c='gray', alpha=0.3, label=raw_data_label, zorder=1) if error_bounds: if err_bound_type == 'shade': ax.plot(x_mean, y_mean, '-o', c='orange', lw=2, label=mean_data_label, zorder=3) if show_SE: ax.fill_between(x_mean, y_mean + y_SE, y_mean - y_SE, label='$\pm$ S.E.', facecolor='orange', alpha=err_bound_shade_opacity, zorder=2.5) else: ax.fill_between(x_mean, y_mean + y_std, y_mean - y_std, label='$\pm$ std', facecolor='orange', alpha=err_bound_shade_opacity, zorder=2.5) # END IF-ELSE elif err_bound_type == 'bar': if show_SE: mean_data_label += '$\pm$ S.E.' ax.errorbar(x_mean, y_mean, yerr=y_SE, ls='-', marker='o', c='orange', lw=2, elinewidth=1, capsize=2, label=mean_data_label, zorder=3) else: mean_data_label += '$\pm$ std' ax.errorbar(x_mean, y_mean, yerr=y_std, ls='-', marker='o', c='orange', lw=2, elinewidth=1, capsize=2, label=mean_data_label, zorder=3) # END IF-ELSE else: raise ValueError('Valid "err_bound_type" name are {"bound", ' '"bar"}, not "%s".' % err_bound_type) else: ax.plot(x_mean, y_mean, '-o', c='orange', lw=2, label=mean_data_label, zorder=3) ax.set_axisbelow(True) if xlabel: ax.set_xlabel(xlabel) if ylabel: ax.set_ylabel(ylabel) if logx: ax.set_xscale('log') if logy: ax.set_yscale('log') if grid_on: ax.grid(ls=':') ax.set_axisbelow(True) if show_bins: ylims = ax.get_ylim() for k, edge in enumerate(bins): lab_ = 'bin edges' if k == 0 else None # only label 1st edge ec = cl.get_colors(N=1)[0] ax.plot([edge] * 2, ylims, '--', c=ec, lw=1.0, zorder=2, label=lab_) if legend_on: ax.legend(loc='best') if show_stats: stats_text = "$R^2_{\mathrm{raw}}$=%.2f, $r_{\mathrm{raw}}$=%.2f, " \ "$R^2_{\mathrm{avg}}$=%.2f, " \ "$r_{\mathrm{avg}}$=%.2f" % stats_ ax.set_title(stats_text) return fig, ax, x_mean, y_mean, y_std, y_SE, stats_ else: return None, None, x_mean, y_mean, y_std, y_SE, stats_
def SNRwavelets(epochs_condition, low, high, step, timewindow, snr_format, numrois, frqwindow, snr_format_name): ######################################################################################################################### # Based on SNR estimation in evoked responses described in Gonzale-Morino et al, (2014) # SNRwavelets performs single trial and evoked response wavelet transformation on the specified, epoched data, # and uses this information to provide an estimate of the SNR in the frequency range of interest, as well as # more broadly across all bands for induced and evoked power. # Inputs: # epochs_condition = epoched data (object) # low = lowest frequency to estimate # high = highest frequency to estimate # step = interval between frequencies # timewindow = samples of interest for evoked response # snr_format = dictionary of roi channel information (see SNRcreate_test_data) # numrois = number of roi channels # frqwindow = frequencies defining the evoked response of interest (for ASSR this would be between 38 and 42 for example) # snr_format_name = string with names to find in the snr_format dict # Returns: # dictionary of SNR values # roi_snr_ASSR - snr for each channel of the evoked response # roi_snr_EVOKEDbands - snr for each channel of the evoked bands # roi_snr_INDUCEDbands - snr for each channel of the induced bands ####### DEFINE ENVIRONMENT print('Importing additional modules') import scipy from scipy import stats import numpy as np import copy import mne from mne.time_frequency import tfr_multitaper, tfr_stockwell, tfr_morlet # Organise input and perform wavelet transform for single trials and average response data print('Beginning wavelet transforms') # frequency information for wavelets freqs = np.arange(low, high, step) n_cycles = freqs / 4. # plot data - whole head - single trials power = mne.time_frequency.tfr_morlet(epochs_condition, freqs=freqs, n_cycles=n_cycles, use_fft=False, return_itc=False, decim=3, n_jobs=1, average=False) power.apply_baseline( mode='ratio', baseline=(-.5, 0) ) # apply baseline correction using ratio method (power is divided by the mean baseline power) # plot data - whole head - average powerAV = mne.time_frequency.tfr_morlet(epochs_condition, freqs=freqs, n_cycles=n_cycles, use_fft=False, return_itc=False, decim=3, n_jobs=1, average=True) powerAV.apply_baseline( mode='ratio', baseline=(-.5, 0) ) # apply baseline correction using ratio method (power is divided by the mean baseline power) # organise rois print('Extracting information from region of interest sites') rois = np.zeros(numrois, dtype=np.int) for x in range( 0, np.shape(rois)[0] ): # what this loop is doing is to go through and get the name of the items to select from the snr_format dict. # this information is then used to find which items to use for the rois text1 = snr_format_name text2 = str(x + 1) text3 = text1 + text2 rois[x] = int(snr_format[text3]) eppower = copy.deepcopy( power.data[:, rois, :, ]) # trials,channels,freqs,time eppowerAV = copy.deepcopy(powerAV.data[rois, :, ]) # channels,freqs,time del power del powerAV # we've now got the roi wavelet data. 
the next steps are to apply the appropriate baseline # and then to estimate the total power from the average of 39:41hz in the evoked and single trials # following this we can estimate the snr for each channel, and globally over our roi ########################################################################################################################## # EVOKED RESPONSE SNR # create evoked power average total (39:41hz) # windAV = eppowerAV[...,[18:20],[starta:enda]] print('Estimating SNR for evoked response') chAVpower = np.zeros(np.shape(rois)[0]) for x in range(0, np.shape(rois)[0]): temp = eppowerAV[x, frqwindow, :] f = np.zeros(len(frqwindow)) for y in range(0, len(frqwindow)): f[y] = sum(temp[y, timewindow[0]:(timewindow[-1] + 1)]) f = np.mean(f, 0) del temp chAVpower[x] = f # create single trial power average total (39:41hz) chSTpower = np.zeros((np.shape(rois)[0], np.shape(eppower)[0])) for x in range(0, np.shape(eppower)[0]): temp1 = eppower[x, :, :, :] for y in range(0, np.shape(rois)[0]): temp2 = temp1[y, frqwindow, :] f = np.zeros(len(frqwindow)) for yy in range(0, len(frqwindow)): f[yy] = sum(temp2[yy, timewindow[0]:(timewindow[-1] + 1)]) f = np.mean(f, 0) chSTpower[y, x] = f del temp2 del temp1 # this is what we've all been waiting for, get channel snr chSNR_ASSR = np.zeros(np.shape(rois)[0]) for x in range(0, np.shape(rois)[0]): temp1 = chAVpower[x] temp2 = chSTpower[x, :] snr = temp1 / stats.sem(temp2) chSNR_ASSR[x] = snr del temp1 del temp2 del snr # what this section is doing: # the first loop goes through each of the roi channels and sums all of the power values in a given frq range over the specified time bin. # this is then averaged across the rois. # the second set of loops goes through the single trials and performs the same procedure. # the third loop estimates the SNR for each roi channel. ######################################################################################################### # Individual band SNR # this section will take all of the individual frequency bands and estimate the SNR # for evoked and induced power. Induced power is retained in the avergae response # by squaring individual power values (see Gonzale-Morino et al, 2014). # In this version the window can be set to include any given time window but should # be focussed on the task response period to allow for analysis of time and phase # locked properties of the stimulus. 
# Frequency bands definition delta = 0 theta = np.arange(1, 3, 1) alpha = np.arange(3, 7, 1) beta = np.arange(7, 20, 1) gamma = np.arange(20, 29, 1) # Evoked power # pt1 - average response print('Estimating SNR for evoked response per band') ch = 0 tempout = np.zeros((len(freqs), numrois)) while ch < numrois: data = eppowerAV[ch, :, :] for x in range(0, len(freqs)): temp = data[x, timewindow[0]:(timewindow[-1] + 1)] temp = np.sum(temp) tempout[x, ch] = temp del temp ch = ch + 1 del data evoked_bands_pt1 = np.zeros((5, numrois)) del ch for x in range(0, numrois): evoked_bands_pt1[0, x] = np.sum(tempout[delta, x]) evoked_bands_pt1[1, x] = np.sum(tempout[theta, x]) evoked_bands_pt1[2, x] = np.sum(tempout[alpha, x]) evoked_bands_pt1[3, x] = np.sum(tempout[beta, x]) evoked_bands_pt1[4, x] = np.sum(tempout[gamma, x]) # pt2 - single trials del tempout tempout = np.zeros((len(eppower), len(freqs), numrois)) ch = 0 tr = 0 while tr < len(eppower): data = eppower[tr, :, :, :] for y in range(0, numrois): temp = data[y, :, :] for x in range(0, len(freqs)): tempout[tr, x, y] = np.sum(temp[x, 167:500]) del temp del data tr = tr + 1 evoked_bands_pt2 = np.zeros((len(eppower), 5, numrois)) for x in range(0, len(eppower)): data = tempout[x, :, :] for y in range(0, numrois): evoked_bands_pt2[x, 0, y] = np.sum(data[delta, y]) evoked_bands_pt2[x, 1, y] = np.sum(data[theta, y]) evoked_bands_pt2[x, 2, y] = np.sum(data[alpha, y]) evoked_bands_pt2[x, 3, y] = np.sum(data[beta, y]) evoked_bands_pt2[x, 4, y] = np.sum(data[gamma, y]) # this is what we've all been waiting for, get channel snr del tempout chSNR_bands_evoked = np.zeros((5, numrois)) for x in range(0, numrois): temp1 = evoked_bands_pt1[:, x] temp2 = evoked_bands_pt2[:, :, x] chSNR_bands_evoked[0, x] = temp1[0] / stats.sem(temp2[:, 0]) chSNR_bands_evoked[1, x] = temp1[1] / stats.sem(temp2[:, 1]) chSNR_bands_evoked[2, x] = temp1[2] / stats.sem(temp2[:, 2]) chSNR_bands_evoked[3, x] = temp1[3] / stats.sem(temp2[:, 3]) chSNR_bands_evoked[4, x] = temp1[4] / stats.sem(temp2[:, 4]) del temp1 del temp2 del ch del tr del x del y # Induced power # pt1 - average response # create average response by first squaring the individual values ch = 0 print('Estimating SNR for induced response per band') tempout = np.zeros((len(freqs), numrois)) while ch < numrois: data = np.mean(np.square(eppower[:, ch, :, :]), 0) for x in range(0, len(freqs)): temp = data[x, timewindow[0]:(timewindow[-1] + 1)] temp = np.sum(temp) tempout[x, ch] = temp del temp ch = ch + 1 del data induced_bands_pt1 = np.zeros((5, numrois)) del ch for x in range(0, numrois): induced_bands_pt1[0, x] = np.sum(tempout[delta, x]) induced_bands_pt1[1, x] = np.sum(tempout[theta, x]) induced_bands_pt1[2, x] = np.sum(tempout[alpha, x]) induced_bands_pt1[3, x] = np.sum(tempout[beta, x]) induced_bands_pt1[4, x] = np.sum(tempout[gamma, x]) # pt2 - single trials # this is what we've all been waiting for, get channel snr chSNR_bands_induced = np.zeros((5, numrois)) for x in range(0, numrois): temp1 = induced_bands_pt1[:, x] temp2 = evoked_bands_pt2[:, :, x] chSNR_bands_induced[0, x] = temp1[0] / stats.sem(temp2[:, 0]) chSNR_bands_induced[1, x] = temp1[1] / stats.sem(temp2[:, 1]) chSNR_bands_induced[2, x] = temp1[2] / stats.sem(temp2[:, 2]) chSNR_bands_induced[3, x] = temp1[3] / stats.sem(temp2[:, 3]) chSNR_bands_induced[4, x] = temp1[4] / stats.sem(temp2[:, 4]) del temp1 del temp2 del x ######################################################################################## #### Output print('Complete. 
Returning output') return { 'roi_snr_ASSR': chSNR_ASSR, 'roi_snr_EVOKEDbands': chSNR_bands_evoked, 'roi_snr_INDUCEDbands': chSNR_bands_induced } # create dictionary of outputs
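# Illustrative toy sketch (not part of the MNE pipeline above): the core SNR idea
# used in SNRwavelets, i.e. time-summed evoked power in the band of interest divided
# by the SEM of the same quantity computed per single trial. All array shapes and
# selections here are hypothetical stand-ins for the wavelet power output.
import numpy as np
from scipy import stats

rng = np.random.default_rng(0)
n_trials, n_freqs, n_times = 40, 5, 200                      # hypothetical dimensions
single_trial_power = rng.gamma(2.0, 1.0, size=(n_trials, n_freqs, n_times))
evoked_power = single_trial_power.mean(axis=0)               # (trials, freqs, times) -> (freqs, times)

freq_sel = slice(1, 4)                                       # e.g. the 39-41 Hz bins
time_sel = slice(50, 150)                                    # post-stimulus samples

# evoked: sum power over time per frequency, then average across frequencies
evoked_val = evoked_power[freq_sel, time_sel].sum(axis=1).mean()
# single trials: same reduction applied per trial
trial_vals = single_trial_power[:, freq_sel, time_sel].sum(axis=2).mean(axis=1)
snr = evoked_val / stats.sem(trial_vals)
print(snr)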
# extra_features = ['room_per_hh', 'pop_per_hh', 'bedroom_per_room'] # cat_encoder = full_pipeline.named_transformers_['cat'] # cat_1hot = list(cat_encoder.categories_[0]) # features = num_features + extra_features + cat_1hot # pprint(sorted(zip(feature_importance, features), reverse=True)) # pick the best model final_model = rand_search.best_estimator_ # evaluate on test set X_test = strat_test_set.drop('median_house_value', axis=1) y_test = strat_test_set['median_house_value'].copy() X_test_prepared = full_pipeline.transform(X_test) final_prediction = final_model.predict(X_test_prepared) # final evaluation score final_mse = mean_squared_error(y_test, final_prediction) final_rmse = np.sqrt(final_mse) print(f'\nEvaluation score on test set: {round(final_rmse, 1)}') # confidence interval conf = 0.95 sqd_er = (final_prediction - y_test)**2 conf_int = np.sqrt( stats.t.interval(conf, len(sqd_er) - 1, loc=sqd_er.mean(), scale=stats.sem(sqd_er))) print(f'\n95% Confidence interval = {conf_int}')
def run_analysis(top_k, search_file_name, add_all, embed_type, model_dir): top_k = top_k with open(search_file_name, 'r') as f: data = json.load(f) queries = [] embeddings = [] all_matches = [] qids = {} query_2_matches = {} data_as_array = [] for obj in data: query = obj['query'] if query not in query_2_matches: query_2_matches[query] = [] query_2_matches[query].append(obj) for query, obj in query_2_matches.items(): queries.append(query) all_docs = [] for match in obj: qid = match['q_id'] # posts get repeated across queries sometimes - to avoid neural # embeddings reproducing the same result multiple times - ignore dups # across queries. if qid in qids: continue qids[match['q_id']] = 1 if add_all: content = embed_sentences( match['q_title'] + ' ' + match['q_text'] + ' ' + match['a_text'], embed_type, model_dir) else: content = embed_sentences(match['q_title'], embed_type, model_dir) # not performing this step seems to cause catastrophic # issues in the np.asarray(embeddings) step further down # suspect something is suboptimal about converting whatever # tensorflow returns into np arrays if embed_type == 'USE': content = np.asarray(content) embeddings.append(content) all_matches.append((query, match)) data_as_array.append((query, obj)) print('ALL LOADED') queries_embedding = np.asarray( embed_sentences(queries, embed_type, model_dir)) print('queries embedded') print(len(embeddings)) embeddings = np.asarray(embeddings).squeeze(1) print('numpy array created for main embeddings') num_queries = len(queries_embedding) faiss.normalize_L2(embeddings) faiss.normalize_L2(queries_embedding) index = faiss.IndexFlatIP(len(embeddings[0])) print(embeddings.shape) index.add(embeddings) query_distances, query_neighbors = index.search( queries_embedding, top_k) num_matches_to_text = 0 all_ndcg = [] ranks_avgs = [] overlap_avgs = [] recipRanks = [] for index, q in enumerate(query_neighbors): ranks = [] print(data_as_array[index][0]) search_matches = [] print('Actual matches (top-100):') for idx, m in enumerate(data_as_array[index][1]): try: if idx < 100: print(f"{idx} -- {m['q_id']}:{m['q_title']}") except: pass search_matches.append(m['q_id']) print(q) print('Returned matches:') y_true = [] y_pred = [] num_overlap = 0 for idx, k in enumerate(q): # print(all_matches[k][1]['q_title']) # print(all_matches[k][1]['q_id']) if all_matches[k][1]['q_id'] in search_matches: try: print( f"{idx} -- {all_matches[k][1]['q_id']}:{all_matches[k][1]['q_title']}" ) except: pass num_overlap += 1 num_matches_to_text += 1 rank = search_matches.index(all_matches[k][1]['q_id']) ''' # Corpus# True relevance score - scale from 0-10 true_relevance = {'d1': 10, 'd2': 9, 'd3':7, 'd4':6, 'd5':4}# Predicted relevence score predicted_relevance = {'d1': 8, 'd2': 9, 'd3':6, 'd4':6, 'd5':5}# relevance list processed as array true_rel = np.asarray([list(true_relevance.values())]) predicted_rel = np.asarray([list(predicted_relevance.values())]) >> ndcg_score(true_rel, predicted_rel) >> 0.9826797611735933 ''' y_true.append(rank + 1) #true scores of entities to be ranked. 
y_pred.append(idx + 1) reciprocal = 1 / (rank + 1) recipRanks.append(reciprocal) ranks.append(rank + 1) q_mrr = np.mean(np.asarray(ranks)) ranks_avgs.append(q_mrr) q_overlap = num_overlap / top_k overlap_avgs.append(q_overlap) if q_overlap < 0.1: print('Very low overlap: ', q_overlap) if len(y_true) > 0 and len(y_pred) > 0: print('y_true: ', y_true, ', y_pred: ', y_pred) q_ndcg = ndcg_score(np.asarray([y_true]), np.asarray([y_pred])) print(f'Question MRR: {q_mrr}, NDCG: {q_ndcg}') all_ndcg.append(q_ndcg) print('num of matches to text:' + str(num_matches_to_text)) print('num queries:' + str(num_queries)) print('average overlap with search:' + str(num_matches_to_text / (num_queries * top_k))) print('mean search rank:', np.mean(np.asarray(ranks_avgs))) meanRecipRank = sum(recipRanks) / len(recipRanks) print('MRR: standard error of the mean ', stat.sem(recipRanks)) print("Mean reciprocal rank is:", meanRecipRank) print("Average NDCG:", (sum(all_ndcg) / len(all_ndcg)))
def pumpProbe( data, background_data=None, norm=None, data_select=SelectorPP(spectra=0), background_select=SelectorPP(spectra=0), norm_select=SelectorPP(spectra=0), intensityE_select=SelectorPP(spectra=0), wavenumber=None, pp_delay=None, pixel=None, ): """Make pump-probe spectrum object taking the median over the scan axis. A pump-probe spectrum combines multiple vicotr `.dat` files into one `pysft.spectrum.PumpProbe` object. One must select a single `spectra` index. Thus the `SelectorPP(spectra=0)` in the default configuration. The selection of the data for `data` (data_select), the background (background_select) and the normalization (norm_select) are independent, but assignment will fail if background and norm can't be casted into the same shape as data. Arguments: data: A victor data dict as returned by `pysfg.vicotr.read.data_file`. background_data: Can be a constant number, or a numpy array with the same length as the pixel axis of `data`, or a `pysfg.read.victor.data_file` dictionary. norm: Can be a constant number, or a 1D array with the same length as pixel axis of `data` above, or a 2D array with the exact same shape as `data` above data_selectrion: `pysfg.SelectorPP` object. This is used to subselect data from the data['data'] entry of the passed data dict. The default is to take spectrum index 0 and leave the rest untouched. background_select: Same as `data_select` but for the background. Shapes of data and background must match or else a `ValueError` occurs. norm_select: Same as `data_select` but for the norm. Shape of data and norm must match. Else ValueError occurs. wavenumber: Not fully implemented, but if None. The calibration is read of the above passed data dict. pp_delay: Not fully impelemented, but if None, pp_delays is read of the `data` dict. Example: see `pysfg/test/pump_probe.py` for example usage. Returns: A `pysfg.PumpProbe` object """ if not isinstance(data, dict): raise NotImplementedError # Need to implement alternative default wavenumber # Need to implement alternative for pp_delay intensity = np.median( data['data'][data_select.select], axis=(1) # Median scans ) intensityE = sem(data['data'][intensityE_select.select], axis=(1)) # Handle various background data inputs if isinstance(background_data, dict): baseline = np.median(background_data['data'][background_select.select], axis=(1)) else: baseline = background_data # Assume norm is correct shape else it will fail # during assingment TODO: Add shape checking if isinstance(norm, PumpProbe): norm = norm.basesubed if isinstance(wavenumber, type(None)): wavenumber = from_victor_header(data).wavenumber[data_select.pixel] if len(wavenumber) != np.shape(intensity)[-1]: raise ValueError( "Shape of wavenumber doesn't match shape of intensity") if not isinstance(pp_delay, type(None)): raise NotImplementedError pp_delay = data['timedelay'] return PumpProbe( intensity=intensity, baseline=baseline, norm=norm, wavenumber=wavenumber, pp_delay=pp_delay, intensityE=intensityE, pixel=pixel, )
plt.ylabel('Frequency') ax = plt.gca() ax.yaxis.labelpad = -2 plt.xlim((-3.5,3.5)) plt.xticks([-3,-2,-1,0,1,2,3]) plt.savefig("Orient.jpg",dpi=400) plt.savefig("Orient.pdf",dpi=400, format ='pdf') plt.clf() print 'Here is the mean relative orientation: ' + str(np.mean(ors)) print 'Here is the std relative orientation: ' + str(np.std(ors)) allvals = ors conf_int = stats.t.interval(0.99, len(allvals)-1, loc=np.mean(allvals), scale=stats.sem(allvals) ) print 'Here is the confidence interval orientations' + str(conf_int) allvals = allvals - np.mean(allvals) x = stats.shapiro(allvals) print(x) fig = plt.figure(1) ax = fig.add_subplot(111) stats.probplot(allvals,plot=pylab) #stats.probplot(allvals,dist="t", sparams=(len(ors),), plot=pylab) pylab.xlabel('Quantiles')
parmfile = '/auto/data/daq/Tabor/TBR023/TBR023a14_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR025/TBR025a13_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR026/TBR026a16_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR027/TBR027a14_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR028/TBR028a08_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR030/TBR030a13_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR031/TBR031a13_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR034/TBR034a14_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR035/TBR035a15_p_OLP.m' parmfile = '/auto/data/daq/Tabor/TBR036/TBR036a14_p_OLP.m' regression_stuff(parmfile, plot=True, dataframe='none') Pred = regression_stuff(parmfile, plot=False, dataframe='none') neur_pred = np.mean(Pred, axis=1) neur_sem = stats.sem(Pred, axis=1) stim_pred = np.mean(Pred, axis=0) stim_sem = stats.sem(Pred, axis=0) fig, ax = plt.subplots(1,2, figsize=(7,3), sharey=True) ax[0].errorbar(range(Pred.shape[0]), neur_pred*-1, yerr=neur_sem, marker='.', ls='none', color='black') ax[1].errorbar(range(Pred.shape[1]), stim_pred*-1, yerr=stim_sem, marker='.', ls='none', color='black') ax[0].set_xlabel('Neuron', fontweight='bold', size=15) ax[0].set_ylabel('Mean Weight', fontweight='bold', size=15) ax[0].axhline(0, linestyle=':', color='black') ax[1].set_xlabel('Stimulus Pair', fontweight='bold', size=15) ax[1].axhline(0, linestyle=':', color='black') fig.tight_layout()
def aggregate_by_pos(meth_fi, aggfi, depth_thresh, mod_thresh, pos_list, control, verbose_results, gff, ref, plot, plotdir, plotsummary): pos_dict = {} if verbose_results: pos_dict_verbose = {} if pos_list: pos_set = make_pos_set(pos_list) values_dict = {} for line in open(meth_fi, 'r'): #try: #print line try: csome, read, pos, context, values, strand, label, prob = tuple( line.split('\t')) except: #for backwards compatibility; does not work with verbose results csome, read, pos, context, values, strand, label = tuple( line.split('\t')) nextpos = str(int(pos) + 1) if (pos_list and (csome, pos, nextpos, strand) not in pos_set) or (context[int(len(context) / 2)] != 'M'): continue if (csome, pos, nextpos, context, strand) not in pos_dict: pos_dict[(csome, pos, nextpos, context, strand)] = [] values_dict[(csome, pos, nextpos, context, strand)] = [] if verbose_results: pos_dict_verbose[(csome, pos, nextpos, context, strand)] = [] if (pos_list and (csome, pos, nextpos, strand) in pos_set) or (not pos_list and plot): values_dict[(csome, pos, nextpos, context, strand)].append([float(v) for v in values.split(',')][:-1]) if label[0] == 'm': pos_dict[(csome, pos, nextpos, context, strand)].append(1) else: pos_dict[(csome, pos, nextpos, context, strand)].append(0) if verbose_results: pos_dict_verbose[(csome, pos, nextpos, context, strand)].append(prob.strip()) #except: # pass print(values_dict) if plotsummary: print('plotting all current deviations...') num2lab = {0: 'A', 1: 'm6A'} curlab = [(val, num2lab[lab], lab) for pos_tup in values_dict for val, lab in zip(values_dict[pos_tup], pos_dict[pos_tup])] currents, labels, klabels = zip(*curlab) colours = {'m6A': '#B4656F', 'A': '#55B196'} plot_w_labels(klabels, labels, currents, 'classifierProb', 'allpos', 'allpos', plotdir, colours, alpha=0.3) print('finished plotting.') if plot: for locus in values_dict: cluster(values_dict[locus], locus[3], ['m6A' if x == 1 else 'A' for x in pos_dict[locus]], locus[0], locus[1], plot, plotdir) if pos_list: for locus in values_dict: values_df = pd.DataFrame(values_dict[locus]) tvals = [] pvals = [] for i in values_df.columns: #[:-1]: #for j in values_df.columns[i+1:]: ttest = stats.ttest_1samp(values_df[i], 0) #ttest = stats.ttest_rel(values_df[i],values_df[i+1]) #tvals.append(ttest[0]) pvals.append((ttest[1], ttest[0])) pval = (sum([-np.log10(x[0]) for x in pvals]), max([x[1] for x in pvals])) #min(pvals) values_dict[locus] = [np.round(x, 3) for x in [pval[1], pval[0]]] if ref: context_dict = ref2context(ref, pos_dict) count = 0 outfi = open(aggfi, 'w') for locus in pos_dict.keys(): a = (not pos_list) and check_thresh(pos_dict[locus], mod_thresh, depth_thresh, control) b = pos_list and (locus[0], locus[1], locus[2], locus[4]) in pos_set #and #'A' not in set(locus[4]) if ref: cx = context_dict[locus] else: cx = locus[3] if a or b: count += 1 frac = np.mean(pos_dict[locus]) if gff: deets = 'coverage=' + str( len(pos_dict[locus] )) + ';context=' + cx + ';IPDRatio=5;frac=' + str(frac) if verbose_results: probs = [float(x) for x in pos_dict_verbose[locus]] se_95 = 2 * stats.sem(probs) deets = deets + ';fracLow=' + str( frac - se_95) + ';fracUp=' + str( frac + se_95) + ';identificationQv=' + str( int(100 * np.mean([ float(x) for x in pos_dict_verbose[locus] ]))) gff_info = (locus[0], locus[2], locus[4], deets) write_gff(outfi, gff_info) else: print(aggfi) out_line = '\t'.join( list(locus)[:-1] + [str(np.mean(pos_dict[locus]))] + [locus[-1]] + [str(len(pos_dict[locus])) ]) #+[str(x) for x in values_dict[locus]]) if 
pos_list: out_line = out_line + '\t' + '\t'.join( [str(x) for x in values_dict[locus]]) if verbose_results: out_line = out_line + '\t' + ','.join( pos_dict_verbose[locus]) outfi.write(out_line + '\n') if not pos_list: if not control: print(count, 'methylated loci found with min depth', depth_thresh, 'reads') else: print(count, 'unmethylated loci found with min depth', depth_thresh, 'reads')
def CI_model(y, confidence=0.95):
    # half-width of the t-based confidence interval for the mean of y
    std_err_y = st.sem(y)
    n_y = len(y)
    h_y = std_err_y * st.t.ppf((1 + confidence) / 2, n_y - 1)
    return h_y
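# Hypothetical usage sketch: CI_model returns the half-width of the t-based
# interval, so the interval itself is mean(y) +/- h_y. The sample below is made
# up for illustration; scipy.stats is assumed to be imported as st.
import numpy as np
import scipy.stats as st

y = np.array([5.1, 4.8, 5.3, 5.0, 4.9, 5.2])
h = CI_model(y, confidence=0.95)
print("95% CI:", (np.mean(y) - h, np.mean(y) + h))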
def compute_stats(list_dict, pkey, skey): sample = [sam[pkey][skey] for sam in list_dict] mean = round(np.mean(sample), 3) sem = round(stats.sem(sample), 3) return mean, sem
# RMS noise level for all channels def RMS_calculation(data): RMS = np.zeros(num_ivm_channels) for i in range(num_ivm_channels): RMS[i] = np.sqrt((1 / len(data[i])) * np.sum(data[i]**2)) return RMS noise_rms = RMS_calculation(temp_filtered_uV) noise_rms_average = np.average(noise_rms) noise_rms_stdv = stats.sem(noise_rms) print('#------------------------------------------------------') print('RMS:' + str(noise_rms)) print('RMS_average:' + str(noise_rms_average)) print('RMS_average_stdv:' + str(noise_rms_stdv)) print('#------------------------------------------------------') filename_RMS = os.path.join(analysis_folder + '\\' + str(high_pass_freq) + 'noise_RMS' + '.npy') np.save(filename_RMS, noise_rms) #Protocol1 to calculate the stdv from noise MEDIAN noise_median = np.median(np.abs(temp_filtered_uV) / 0.6745, axis=1) noise_median_average = np.average(noise_median)
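# Side note (sketch): the per-channel loop in RMS_calculation above is equivalent
# to a vectorized root-mean-square over the sample axis, assuming `data` is a
# 2-D (channels x samples) array.
import numpy as np

def rms_vectorized(data):
    # mean of squares per channel, then square root
    return np.sqrt(np.mean(np.square(data), axis=1))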
curUserSimilarity) simRMSE = rmse(curUserSimPreiction, curTestUserItemMatrix) rmseList_cosine.append((simRMSE)) # In[95]: print("Sim_cosine") for simScore in rmseList_cosine: print("%.3lf" % (simScore)) print("the average is ", np.mean(rmseList_cosine)) print("The 95% CI for cosine is", (st.t.interval(0.95, len(rmseList_cosine) - 1, loc=np.mean(rmseList_cosine), scale=st.sem(rmseList_cosine)))) # In[91]: rmseList_eucd = [] for trainFileName, testFileName in datasetsFileNames: curTrainDF = pd.read_csv(os.path.join(MOVIELENS_DIR, trainFileName), sep='\t', names=fields) curTestDF = pd.read_csv(os.path.join(MOVIELENS_DIR, testFileName), sep='\t', names=fields) curTrainUserItemMatrix = buildUserItemMatrix(curTrainDF, numUsers, numItems) curTestUserItemMatrix = buildUserItemMatrix(curTestDF, numUsers, numItems)
from scipy.stats import f_oneway from statsmodels.stats.multicomp import pairwise_tukeyhsd testdf = pd.read_csv('test_qrt_pcr_2.csv') groups = testdf.Group.unique() group1 = groups[0] group2 = groups[1] meancontrol = (testdf['dCT'].where(testdf['Group'] == group1)) meancontrol = [ meancontrol_i for meancontrol_i in meancontrol if str(meancontrol_i) != 'nan' ] meancontrol controlsem = stats.sem(meancontrol) meancontrol = sum(meancontrol) / len(meancontrol) meancontrol testdf['Power'] = 2**-(testdf['dCT'] - meancontrol) i = len(groups) experimental_rqs = [] for x in range(1, i): group = groups[x] experimental = (testdf['dCT'].where(testdf['Group'] == group)) experimental = [ experimental_i for experimental_i in experimental if str(experimental_i) != 'nan'
avg = []
low_ci_95 = []
high_ci_95 = []
low_ci_99 = []
high_ci_99 = []

for step in df["Step"].unique():
    values = df[feature][df["Step"] == step]
    f_mean = values.mean()
    # t-based confidence intervals for the mean (n - 1 degrees of freedom)
    lci95, hci95 = sps.t.interval(0.95,
                                  len(values) - 1,
                                  loc=f_mean,
                                  scale=sps.sem(values))
    lci99, hci99 = sps.t.interval(0.99,
                                  len(values) - 1,
                                  loc=f_mean,
                                  scale=sps.sem(values))
    avg.append(f_mean)
    low_ci_95.append(lci95)
    high_ci_95.append(hci95)
    low_ci_99.append(lci99)
    high_ci_99.append(hci99)

df_stats = pd.DataFrame()
df_stats["Step"] = df["Step"].unique()
df_stats["mean"] = avg
df_stats["lci95"] = low_ci_95
df_stats["hci95"] = high_ci_95
df_stats["lci99"] = low_ci_99
df_stats["hci99"] = high_ci_99
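# Hypothetical follow-up (assuming matplotlib.pyplot is available as plt):
# draw the per-step mean of `feature` with the 95% band stored in df_stats.
import matplotlib.pyplot as plt

plt.plot(df_stats["Step"], df_stats["mean"], label="mean")
plt.fill_between(df_stats["Step"], df_stats["lci95"], df_stats["hci95"],
                 alpha=0.3, label="95% CI")
plt.xlabel("Step")
plt.legend()
plt.show()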
def calc_yield(kstd, lamb, ks, kt, temp, temp_dat, lifetime_exp_zero, lifetime_exp_res, lifetime_exp_high, J, j_exp): # Define variables, initial frame rad_fram_aniso_g1 = np.array([[0.0006, 0.0, 0.0], [0.0, 0.0001, 0.0], [0.0, 0.0, -0.0009]]) rad_fram_aniso_g2 = np.array([[0.0010, 0.0, 0.0], [0.0, 0.0007, 0.0], [0.0, 0.0, -0.0020]]) rad_fram_aniso_hyperfine_1 = np.zeros([19, 3, 3]) rad_fram_aniso_hyperfine_1[0] = array_construct(0.018394, 0.00575, -0.024144, 0.119167, -0.090257, -0.105530) rad_fram_aniso_hyperfine_1[1] = array_construct(-0.030255, 0.134767, -0.104512, 0.111178, 0.03952, 0.065691) rad_fram_aniso_hyperfine_1[2] = array_construct(0.041327, -0.039294, 0.002033, 0.017961, 0.78922, 0.025615) rad_fram_aniso_hyperfine_1[3] = array_construct(0.065617, -0.016154, -0.049462, 0.036655, 0.014217, 0.004047) rad_fram_aniso_hyperfine_1[4] = array_construct(0.069089, -0.054902, -0.014187, 0.013749, -0.075976, -0.006477) rad_fram_aniso_hyperfine_1[5] = array_construct(0.098308, -0.041108, -0.0572, -0.024641, 0.013959, 0.002803) rad_fram_aniso_hyperfine_1[6] = array_construct(0.017844, 0.006183, -0.024028, -00.119099, -0.090068, 0.105661) rad_fram_aniso_hyperfine_1[7] = array_construct(-0.030775, 0.135406, -0.104631, -0.110876, 0.039322, -0.065607) rad_fram_aniso_hyperfine_1[8] = array_construct(0.041235, -0.039174, -0.002061, -0.018150, 0.078901, -0.025838) rad_fram_aniso_hyperfine_1[9] = array_construct(0.065415, -0.015957, -0.049358, -0.036874, 0.014222, -0.004080) rad_fram_aniso_hyperfine_1[10] = array_construct(0.069102, -0.054901, -0.014201, -0.014035, -0.075981, 0.006618) rad_fram_aniso_hyperfine_1[11] = array_construct(0.098464, -0.041245, -0.0571219, 0.024346, 0.014054, -0.002814) rad_fram_aniso_hyperfine_1[12] = array_construct(0.036159, -0.00026, -0.035899, 0.038259, -0.007026, -0.004047) rad_fram_aniso_hyperfine_1[13] = array_construct(0.036159, -0.00026, -0.035899, 0.038259, -0.007026, -0.004047) rad_fram_aniso_hyperfine_1[14] = array_construct(0.036159, -0.00026, -0.035899, 0.038259, -0.007026, -0.004047) rad_fram_aniso_hyperfine_1[15] = array_construct(0.035983, -0.000104, -0.035879, -0.038338, -0.007021, 0.004066) rad_fram_aniso_hyperfine_1[16] = array_construct(0.035983, -0.000104, -0.035879, -0.038338, -0.007021, 0.004066) rad_fram_aniso_hyperfine_1[17] = array_construct(0.035983, -0.000104, -0.035879, -0.038338, -0.007021, 0.004066) rad_fram_aniso_hyperfine_1[18] = array_construct(-0.772676, -0.7811, 1.553776, 0.000000, -0.061480, 0.000443) rad_fram_aniso_hyperfine_2 = np.zeros([6, 3, 3]) rad_fram_aniso_hyperfine_2[0] = array_construct(0.011586, 0.032114, -0.0437, -0.101834, -0.000008, 0.000014) rad_fram_aniso_hyperfine_2[1] = array_construct(0.011586, 0.032114, -0.0437, -0.101834, 0.000014, 0.000008) rad_fram_aniso_hyperfine_2[2] = array_construct(0.011586, 0.032114, -0.0437, -0.101834, 0.000014, 0.000008) rad_fram_aniso_hyperfine_2[3] = array_construct(0.011586, 0.032114, -0.0437, -0.101834, -0.000008, 0.000014) rad_fram_aniso_hyperfine_2[4] = array_construct(0.0352, 0.034, -0.0692, 0.0, 0.0, 0.0) rad_fram_aniso_hyperfine_2[5] = array_construct(0.0352, 0.034, -0.0692, 0.0, 0.0, 0.0) # axis frames data_xyz = np.loadtxt('dmj-an-pe1p-ndi-opt.txt', delimiter=',') transform_mol = inertia_tensor(data_xyz) dmj_xyz = np.loadtxt('dmj_in_pe1p.txt', delimiter=',') transform_dmj = inertia_tensor(dmj_xyz) ndi_xyz = np.loadtxt('NDI_in_pe1p.txt', delimiter=',') transform_ndi = inertia_tensor(ndi_xyz) # Convert to molecular frame aniso_g1 = rad_tensor_mol_axis(transform_mol, 
transform_dmj, rad_fram_aniso_g1) aniso_g2 = rad_tensor_mol_axis(transform_mol, transform_ndi, rad_fram_aniso_g2) aniso_hyperfine_1 = rad_tensor_mol_axis(transform_mol, transform_dmj, rad_fram_aniso_hyperfine_1) aniso_hyperfine_2 = rad_tensor_mol_axis(transform_mol, transform_ndi, rad_fram_aniso_hyperfine_2) # for n=1 radius = 24.044e-10 cnst = (1.0e3 * 1.25663706e-6 * 1.054e-34 * 1.766086e11) / (4.0 * np.pi * radius**3) aniso_dipolar = np.array([[1.0, 0.0, 0.0], [0.0, 1.0, 0.0], [0.0, 0.0, -2.0]]) * cnst # Isotropic components g1_iso = 2.0031 g2_iso = 2.0040 # ISO h1 for the anti conformation iso_h1 = np.array([[ 2.308839, 0.903770, -0.034042, -0.077575, 1.071863, 0.258828, 2.308288, 0.0902293, -0.034202, 0.077648, 1.073569, 0.259878, -0.166563, -0.166563, -0.166563, -0.166487, -0.166487, -0.166487, 0.831260 ]]) iso_h2 = np.array([[-0.1927, -0.1927, -0.1927, -0.1927, -0.0963, -0.0963]]) spin_numbers_1 = np.array([[ 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 1.0 ]]) spin_numbers_2 = np.array([[0.5, 0.5, 0.5, 0.5, 1.0, 1.0]]) field = np.reshape(temp_dat[:, 0], (len(temp_dat[:, 0]))) data_y = np.reshape(temp_dat[:, 1], (len(temp_dat[:, 1]))) sampled_field = np.linspace(0.0, 100.0, 20) triplet_yield = np.zeros_like(sampled_field) standard_error = np.zeros_like(sampled_field) compound_error = np.zeros_like(sampled_field) num_samples = 300 samples = np.arange(1.0, np.float(num_samples)) trip = np.zeros_like(samples) #-------------------------------------------------------------------------------------------------------------------------------------- #zero field lifetime lifetime_zero = 0.0 zero = np.zeros_like(samples) # zero field lifetime for index, item in enumerate(samples): relaxation_0 = rotational_relaxation(aniso_dipolar, g1_iso, g2_iso, aniso_g1, aniso_g2, iso_h1, iso_h2, aniso_hyperfine_1, aniso_hyperfine_2, spin_numbers_1, spin_numbers_2, 0.0, J, ks, kt, lamb, temp, kstd) zero[index] = relaxation_0.lifetime() lifetime_zero += zero[index] lifetime_zero = np.float(lifetime_zero) / np.float(num_samples) lifetime_dif_zero = lifetime_zero - lifetime_exp_zero #-------------------------------------------------------------------------------------------------------------------------------------- #resonance field lifetime (B=2J) lifetime_res = 0.0 res = np.zeros_like(samples) # zero field lifetime for index, item in enumerate(samples): relaxation_0 = rotational_relaxation(aniso_dipolar, g1_iso, g2_iso, aniso_g1, aniso_g2, iso_h1, iso_h2, aniso_hyperfine_1, aniso_hyperfine_2, spin_numbers_1, spin_numbers_2, 2.0 * J, J, ks, kt, lamb, temp, kstd) res[index] = relaxation_0.lifetime() lifetime_res += res[index] lifetime_res = np.float(lifetime_res) / np.float(num_samples) lifetime_dif_res = lifetime_res - lifetime_exp_res #-------------------------------------------------------------------------------------------------------------------------------------- # High field lifetime lifetime_high = 0.0 high = np.zeros_like(samples) # zero field lifetime for index, item in enumerate(samples): relaxation_0 = rotational_relaxation(aniso_dipolar, g1_iso, g2_iso, aniso_g1, aniso_g2, iso_h1, iso_h2, aniso_hyperfine_1, aniso_hyperfine_2, spin_numbers_1, spin_numbers_2, 100.0, J, ks, kt, lamb, temp, kstd) high[index] = relaxation_0.lifetime() lifetime_high += high[index] lifetime_high = np.float(lifetime_high) / np.float(num_samples) lifetime_dif_high = lifetime_high - lifetime_exp_high 
#-------------------------------------------------------------------------------------------------------------------------------------- for index_field, item_field in enumerate(sampled_field): total_t = 0.0 for index, item in enumerate(samples): np.random.seed(index) # Define class relaxation = rotational_relaxation( aniso_dipolar, g1_iso, g2_iso, aniso_g1, aniso_g2, iso_h1, iso_h2, aniso_hyperfine_1, aniso_hyperfine_2, spin_numbers_1, spin_numbers_2, item_field, J, ks, kt, lamb, temp, kstd) # Calculate triplet yield trip[index] = relaxation.triplet_yield() total_t += trip[index] triplet_yield[index_field] = total_t standard_error[index_field] = sts.sem(trip) compound_error[index_field] = np.sqrt( standard_error[0] * standard_error[0] * ((1.0 / triplet_yield[0])**2 + (standard_error[index_field] * standard_error[index_field] * (triplet_yield[index_field] / triplet_yield[0])**2))) compound_error[0] = 0.0 triplet_yield = triplet_yield / (triplet_yield[0]) tck = interpolate.splrep(sampled_field, triplet_yield, s=0) xnew = field ynew = interpolate.splev(xnew, tck, der=0) mary = ((ynew) - (data_y - data_y[0] + 1.0)) * ((ynew) - (data_y - data_y[0] + 1.0)) mean_mary = (np.sum(ynew)) / np.float(len(ynew)) mary_var = (mean_mary - ynew) * (mean_mary - ynew) lt = np.array([lifetime_zero, lifetime_res, lifetime_high]) sq_lt_diff = np.array([(lifetime_dif_zero) * (lifetime_dif_zero), (lifetime_dif_res) * (lifetime_dif_res) + 4.0 * (J - j_exp) * (J - j_exp), (lifetime_dif_high) * (lifetime_dif_high)]) mean_lt = (np.sum(lt)) / np.float(len(lt)) lt_var = (mean_lt - lt) * (mean_lt - lt) chai_lifetime = np.sum(sq_lt_diff) / np.sum(lt_var) chai_yield = np.sum(mary) / np.sum(mary_var) return chai_lifetime, chai_yield
com_ran = df.commRange[index] energy = df.energy[index] energy_max = df.energy[index] prob = df.freq[index] node = Node(location=location, com_ran=com_ran, energy=energy, energy_max=energy_max, id=i, energy_thresh=0.4 * energy, prob=prob) list_node.append(node) mc = MobileCharger(energy=df.E_mc[index], capacity=df.E_max[index], e_move=df.e_move[index], e_self_charge=df.e_mc[index], velocity=df.velocity[index]) target = [int(item) for item in df.target[index].split(',')] net = Network(list_node=list_node, mc=mc, target=target) print(len(net.node), len(net.target), max(net.target)) q_learning = Q_learning(network=net) # inma = Inma() file_name = "log/q_learning_" + str(index) + ".csv" temp = net.simulate(optimizer=q_learning, file_name=file_name) life_time.append(temp) result.writerow({"nb run": nb_run, "lifetime": temp}) confidence = 0.95 h = sem(life_time) * t.ppf((1 + confidence) / 2, len(life_time) - 1) result.writerow({"nb run": mean(life_time), "lifetime": h})
def consensus_demultiplex(self): """ Takes a FASTQ file of consensus reads and identifies each by index. Handles writing demultiplexed FASTQ if user desired. """ self.log.info("Consensus Index Search") eof = False start_time = time.time() split_time = time.time() fastq_file_name_list = [] fastq_data_dict = collections.defaultdict(lambda: collections.defaultdict(list)) indexed_read_count = 0 key_counts = [] while not eof: # Debugging Code Block if self.args.Verbose == "DEBUG": read_limit = 1000000 if self.read_count > read_limit: if self.args.Demultiplex: for index_name in fastq_data_dict: r1_data = fastq_data_dict[index_name]["R1"] r1, r2 = self.fastq_outfile_dict[index_name] r1.write(r1_data) r1.close() if not self.args.PEAR: r2_data = fastq_data_dict[index_name]["R2"] r2.write(r2_data) r2.close() Tool_Box.debug_messenger("Limiting Reads Here to {}".format(read_limit)) eof = True fastq2_read = None try: fastq1_read = next(self.fastq1.seq_read()) if not self.args.PEAR: fastq2_read = next(self.fastq2.seq_read()) except StopIteration: if self.args.Demultiplex: for index_name in fastq_data_dict: r1_data = fastq_data_dict[index_name]["R1"] r1, r2 = self.fastq_outfile_dict[index_name] r1.write(r1_data) r1.close() if not self.args.PEAR: r2_data = fastq_data_dict[index_name]["R2"] r2.write(r2_data) r2.close() eof = True continue self.read_count += 1 if self.read_count % 100000 == 0: elapsed_time = int(time.time() - start_time) block_time = int(time.time() - split_time) split_time = time.time() self.log.info("Processed {} reads in {} seconds. Total elapsed time: {} seconds." .format(self.read_count, block_time, elapsed_time)) # Match read with library index. match_found, left_seq, right_seq, index_name, fastq1_read, fastq2_read = \ self.index_matching(fastq1_read, fastq2_read) if match_found: indexed_read_count += 1 locus = self.index_dict[index_name][7] phase_key = "{}+{}".format(index_name, locus) r2_found = False r1_found = False if self.args.Platform == "Illumina": # Score the phasing and place the reads in a dictionary. for r2_phase, r1_phase in zip(self.phase_dict[locus]["R2"], self.phase_dict[locus]["R1"]): r2_phase_name = r2_phase[1] r1_phase_name = r1_phase[1] # Tag reads that should not have any phasing. if not r1_phase[0]: self.phase_count[phase_key]["Phase " + r1_phase_name] = -1 self.phase_count[phase_key]["Phase " + r2_phase_name] = -1 continue else: self.phase_count[phase_key]["Phase " + r1_phase_name] += 0 self.phase_count[phase_key]["Phase " + r2_phase_name] += 0 # The phasing is the last N nucleotides of the consensus. if r2_phase[0] == Sequence_Magic.rcomp(fastq1_read.seq[-len(r2_phase[0]):]) and not r2_found: self.phase_count[phase_key]["Phase "+r2_phase_name] += 1 r2_found = True if r1_phase[0] == fastq1_read.seq[:len(r1_phase[0])] and not r1_found: self.phase_count[phase_key]["Phase "+r1_phase_name] += 1 r1_found = True # if no phasing is found then note that. if not r2_found: self.phase_count[phase_key]["No Read 2 Phasing"] += 1 if not r1_found: self.phase_count[phase_key]["No Read 1 Phasing"] += 1 # The adapters on Gupta Lab AAVS1.1 are reversed causing the reads to be reversed. if locus == "AAVS1.1": self.sequence_dict[index_name].append(fastq1_read.seq) else: self.sequence_dict[index_name].append(fastq1_read.seq) elif self.args.Platform == "TruSeq": self.sequence_dict[index_name].append(right_seq) elif self.args.Platform == "Ramsden": self.sequence_dict[index_name].append(Sequence_Magic.rcomp(fastq1_read.seq)) else: self.log.error("--Platform {} not correctly defined. 
Edit parameter file and try again" .format(self.args.Platform)) raise SystemExit(1) if self.args.Demultiplex: fastq_data_dict[index_name]["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual]) if not self.args.PEAR: fastq_data_dict[index_name]["R2"].append([fastq2_read.name, fastq2_read.seq, fastq2_read.qual]) fastq_file_name_list.append("{}{}_{}_Consensus.fastq" .format(self.args.WorkingFolder, self.args.Job_Name, index_name)) elif self.args.Demultiplex and not match_found: fastq_data_dict['Unknown']["R1"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual]) fastq_data_dict['Unknown']["R2"].append([fastq1_read.name, fastq1_read.seq, fastq1_read.qual]) fastq_file_name_list.append("{}{}_Unknown_Consensus.fastq" .format(self.args.WorkingFolder, self.args.Job_Name)) if self.args.Demultiplex: self.fastq_compress(list(set(fastq_file_name_list))) for key in self.sequence_dict: key_counts.append(len(self.sequence_dict[key])) # The lower limit is used when plotting the data. Generally the lowest values are just noise. if len(key_counts) == 0: self.log.error("No Scar Patterns Found") raise SystemExit(1) lower, upper_limit = stats.norm.interval(0.9, loc=statistics.mean(key_counts), scale=stats.sem(key_counts)) lower_limit = statistics.mean(key_counts)-lower return indexed_read_count, lower_limit
def mean_ci(x): import scipy.stats as st mn = np.mean(x) ci = st.t.interval(0.95, len(x) - 1, loc=np.mean(x), scale=st.sem(x)) return (mn, ci[0], ci[1])
def SNRpsdEPOCH_GRANDAV(epoch_condition, starta, enda, snr_format, fstart, fend, snr_format_name, tmin, tmax, baseline, cond_events, cond_events_id, reject): print('Importing additional modules') import scipy from scipy import stats from scipy import signal import numpy as np import copy import mne import matplotlib as mpl from matplotlib import mlab ################### PRE-ANALYSIS CHECKS # Does the data require epochs creating? check = np.shape(epoch_condition._data) check = np.size(check) if check < 3: print('creating temporary epochs object') temp = mne.Epochs( epoch_condition, cond_events, cond_events_id, tmin, tmax, proj=False, #picks=picks, baseline=baseline, preload=True, reject=reject, add_eeg_ref=False) time = np.linspace(tmin, tmax, np.shape(temp._data)[2]) epoch_condition = copy.deepcopy(temp) del temp else: time = np.linspace(tmin, tmax, np.shape(epoch_condition._data)[2]) # Create timewindow s = mlab.find(time == starta) e = mlab.find(time == enda) ee = e + 1 timewindow = np.arange(s, ee, 1) # Get sampling frequency fs = epoch_condition.info['sfreq'] ######### PART 1 --- FRQ WINDOW OF INTEREST SNR ga = epoch_condition._data ga = ga[:, snr_format[snr_format_name], :] ga = np.mean(ga, 0) ga = np.mean(ga, 0) ga = ga - np.mean(ga) f, Pxx = signal.welch(ga[timewindow], fs=fs, nperseg=1001, noverlap=np.round(1001 / 2), detrend='linear', scaling='density') gapsd = Pxx del Pxx del ga ff = np.round(f) frqwindow = np.arange((mlab.find(ff == fstart)), (mlab.find(ff == fend + 1)), 1) delta = np.arange((mlab.find(ff == 1)), (mlab.find(ff == 3 + 1)), 1) theta = np.arange((mlab.find(ff == 4)), (mlab.find(ff == 7 + 1)), 1) alpha = np.arange((mlab.find(ff == 8)), (mlab.find(ff == 14 + 1)), 1) beta = np.arange((mlab.find(ff == 15)), (mlab.find(ff == 31 + 1)), 1) gamma = np.arange((mlab.find(ff == 32)), (mlab.find(ff == 58 + 1)), 1) ######### STEP 2 - PSD FROM EACH ROI CHANNEL (SINGLE TRIALS) print('Estimating PSD for each ROI channel, single trials') psdmatrix2 = np.zeros([len(epoch_condition._data), int((fs / 2) + 1)]) for y in range(0, len(epoch_condition._data)): ga = epoch_condition._data ga = ga[y, snr_format[snr_format_name], :] ga = np.mean(ga, 0) ga = ga - np.mean(ga) f, Pxx = signal.welch(ga[timewindow], fs=fs, nperseg=1001, noverlap=np.round(1001 / 2), detrend='linear', scaling='density') psdmatrix2[y, :] = Pxx ######### STEP 3 - SNR print('Estimating SNR') temp1 = gapsd[frqwindow] temp1 = np.sum(temp1) temp2 = np.sum(psdmatrix2[:, frqwindow], 1) snr = temp1 / stats.sem(temp2) deltasnr = np.sum(gapsd[delta]) / stats.sem(np.sum(psdmatrix2[:, delta], 1)) thetasnr = np.sum(gapsd[theta]) / stats.sem(np.sum(psdmatrix2[:, theta], 1)) alphasnr = np.sum(gapsd[alpha]) / stats.sem(np.sum(psdmatrix2[:, alpha], 1)) betasnr = np.sum(gapsd[beta]) / stats.sem(np.sum(psdmatrix2[:, beta], 1)) gammasnr = np.sum(gapsd[gamma]) / stats.sem(np.sum(psdmatrix2[:, gamma], 1)) return { 'roisnrGA': snr, 'deltasnr': deltasnr, 'thetasnr': thetasnr, 'alphasnr': alphasnr, 'betasnr': betasnr, 'gammasnr': gammasnr }
# Program file Pex4_14_2.py
import numpy as np
import scipy.stats as ss
from scipy import stats

a = np.array([
    506, 508, 499, 503, 504, 510, 497, 512, 514, 505, 493, 496, 506, 502,
    509, 496
])
alpha = 0.95
df = len(a) - 1
ci = ss.t.interval(alpha, df, loc=a.mean(), scale=ss.sem(a))
print("Confidence interval:", ci)
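# Small check (illustrative): the same interval can be written explicitly as
# mean +/- t * SEM, reusing a, alpha and df from above.
h = ss.sem(a) * ss.t.ppf((1 + alpha) / 2, df)   # half-width of the interval
print("Manual check:", (a.mean() - h, a.mean() + h))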
def SNRpsdEPOCH(epoch_condition, starta, enda, snr_format, numrois, fstart, fend, snr_format_name, blchange, tmin, tmax, baseline, cond_events, cond_events_id, reject): # SNR check for epoched MEG data using PSD estimation of frequency information. # Inputs # epoch_condition = data (raw or epoched, if raw event information will be used to create an epoched data set) # starta = period of interest start (seconds) # enda = period of interest end (seconds) # snr_format = snr channel information (dict) # numrois = number of roi channels (int) # fstart = frequency window of interest start (int) # fend = frequency window of interest end (int) # snr_format_name = name of channel type (e.g. ASSRnum - str) # blchange = measure percent change from baseline (1 = yes, 2 = no) # tmin = event epoch minimum time (float) # tmax = event epoch maximum time (float) # baseline = mne baseline period (e.g (None,0),obj) # cond_events = events for condition of interest # cond_events_id = event ids for conditions to highlight cond of interest # Outputs # chSNR_ASSR (roiSNR) = snr for period of interest at each roi channel # perchangeSNR = snr for percentage change between baseline and active period print('Importing additional modules') import scipy from scipy import stats from scipy import signal import numpy as np import copy import mne import matplotlib as mpl from matplotlib import mlab ################### PRE-ANALYSIS CHECKS # Does the data require epochs creating? check = np.shape(epoch_condition._data) check = np.size(check) if check < 3: print('creating temporary epochs object') temp = mne.Epochs( epoch_condition, cond_events, cond_events_id, tmin, tmax, proj=False, #picks=picks, baseline=baseline, preload=True, reject=reject, add_eeg_ref=False) time = np.linspace(tmin, tmax, np.shape(temp._data)[2]) epoch_condition = copy.deepcopy(temp) del temp else: time = np.linspace(tmin, tmax, np.shape(epoch_condition._data)[2]) # Create timewindow s = mlab.find(time == starta) e = mlab.find(time == enda) ee = e + 1 timewindow = np.arange(s, ee, 1) # Get sampling frequency fs = epoch_condition.info['sfreq'] ######### PART 1 --- FRQ WINDOW OF INTEREST SNR ######### STEP 1 - PSD FROM EACH ROI CHANNEL (MEAN) # psd is performed on the signal for the time window of interest (mean centred) # in this version the appropriate window and overlap should be pre-determined # for the resolution. future versions will automate this procedure to determine # the windowing properties needed for a given resolution. 
# Example - signal detrended and psd estimated for ~1hz resolution # f,Pxx = signal.welch((teste[timewindow]-mean(teste[timewindow])), # fs=1000,nperseg = 1001,noverlap=np.round(1001/2), # detrend = 'linear',scaling='density') print('Extracting information from region of interest site') rois = np.zeros(numrois, dtype=np.int) for x in range(0, np.shape(rois)[0]): text1 = snr_format_name text2 = str(x + 1) text3 = text1 + text2 rois[x] = int(snr_format[text3]) numfs = (fs / 2) + 1 psdmatrix = np.zeros([numfs, numrois]) print('Estimating grand average PSD for each ROI channel') for x in range(0, numrois): tempo = np.mean(epoch_condition._data[:, rois[x], timewindow], 0) f, Pxx = signal.welch((tempo - np.mean(tempo)), fs=fs, nperseg=1001, noverlap=np.round(1001 / 2), detrend='linear', scaling='density') psdmatrix[:, x] = Pxx ff = np.round(f) frqwindow = np.arange((mlab.find(ff == fstart)), (mlab.find(ff == fend + 1)), 1) ######### STEP 2 - PSD FROM EACH ROI CHANNEL (SINGLE TRIALS) print('Estimating PSD for each ROI channel, single trials') psdmatrix2 = np.zeros([len(epoch_condition._data), (fs / 2) + 1, numrois]) for x in range(0, numrois): data = epoch_condition._data[:, rois[x], timewindow] for y in range(0, len(epoch_condition._data)): tempo = data[y, :] f, Pxx = signal.welch((tempo - np.mean(tempo)), fs=fs, nperseg=1001, noverlap=np.round(1001 / 2), detrend='linear', scaling='density') psdmatrix2[y, :, x] = Pxx ######### STEP 3 - SNR print('Estimating SNR') chSNR_ASSR = np.zeros(np.shape(rois)[0]) for x in range(0, np.shape(rois)[0]): temp1 = psdmatrix[frqwindow, x] temp1 = np.sum(temp1) temp2 = np.sum(psdmatrix2[:, frqwindow, x], 1) snr = temp1 / stats.sem(temp2) chSNR_ASSR[x] = snr del temp1 del temp2 del snr ######### STEP 4 - if requested, snr from %signal change (baseline to window --- mean) if blchange == 1: print( 'Estimating SNR for percentage change between baseline and active period' ) bl = np.arange(0, (mlab.find(time == 0) + 1), 1) bldpsd = np.zeros([(fs / 2) + 1, numrois]) perchangeMean = np.zeros(numrois) bldpsdST = np.zeros( [len(epoch_condition._data), (fs / 2) + 1, numrois]) perchangeST = np.zeros([len(epoch_condition._data), numrois]) percSNR = np.zeros(numrois) for x in range(0, numrois): bldat = np.mean(epoch_condition._data[:, rois[x], bl], 0) f, Pxx = signal.welch((bldat - np.mean(bldat)), nfft=1001, nperseg=500, noverlap=250, fs=fs, detrend='linear', scaling='density') bldpsd[:, x] = Pxx bldpsd = np.sum(bldpsd[frqwindow, :], 0) for x in range(0, numrois): temp1 = np.sum(psdmatrix[frqwindow, x], 0) perchangeMean[x] = ((temp1 - bldpsd[x]) / temp1) * 100 for y in range(0, numrois): bldata = epoch_condition._data[:, rois[y], bl] for x in range(0, len(epoch_condition._data)): temp1 = bldata[x, :] f, Pxx = signal.welch((temp1 - np.mean(temp1)), nfft=1001, nperseg=500, noverlap=250, fs=fs, detrend='linear', scaling='density') bldpsdST[x, :, y] = Pxx temp1 = np.sum(psdmatrix2[x, frqwindow, y], 0) perchangeST[ x, y] = (temp1 - (np.sum(bldpsdST[x, frqwindow, y], 0)) / temp1) * 100 for x in range(0, np.shape(rois)[0]): temp1 = perchangeMean[x] temp2 = perchangeST[x] snr = temp1 / stats.sem(temp2) percSNR[x] = snr print('Finished') return {'roiSNR': chSNR_ASSR, 'perchangeSNR': percSNR} else: print('Finished') return chSNR_ASSR
def plotStandardErrorOfMean(x,methods,drawBarPlot = False, drawPointPlot = False, title="", width=0.10, colors=['b', 'g', 'r', 'c', 'm', 'y', 'k'], log_scale_y=False, log_scale_x=False, legend=True, x_title="X Label", y_title="Y Label"): ''' Plots Mean and Standard Error of the mean for Methods with multiple runs Example ------- x = np.array([[1, 3, 4, 5], [1, 3, 4, 5], [1, 3, 4, 6]]) method_1 = np.array([[1,4,5,2], [3,4,3,6] , [2,5,5,8]]) method_2 = np.array([[8,7,5,9], [7,3,9,1] , [3,2,9,4]]) method_3 = np.array([[10,13,9,11], [9,12,10,10] , [11,14,18,6]]) methods = [method_1, method_2, method_3] plot = plotStandardErrorOfMean(x,methods,drawBarPlot = True) plot.show() Parameters ----------- x : numpy array For each curve, contains the x-coordinates. Each entry corresponds to one method. methods : list of numpy arrays A list of numpy arrays of methods. Each method contains a numpy array of several run of that corresponding method. drawBarPlot : Bool Should be True if a Bar Plot is expected. drawPointPlot : Bool Should be True if a Point Plot is expected. title : string Title of the graph width : float Width of the bars. colors : string array Color of the curve. Each entry corresponds to one curve log_scale_y : Boolean If set to true, changes the y-axis to log scale. log_scale_x: Boolean If set to true, change the x-axis to log scale. legend : Boolean If set to true, displays the legend. x_title : String X label string y_title : String Y label string Retrun ---------- plt : object Plot Object ''' curves = [] for index,method in enumerate(methods): mean = [] sem = [] for j in range(0,len(x[index])): valueArray = np.array([el[j] for el in method]) meanValue = np.mean(valueArray) semValue = stats.sem(valueArray) #Standard Error of Mean mean.append(meanValue) sem.append(semValue) curves.append(np.array([mean,sem])) if(drawBarPlot): barPlot = bar_plot(x,curves, title=title, width=width, colors=colors, log_scale_y=log_scale_y, log_scale_x=log_scale_x, legend=legend, x_title=x_title, y_title=y_title) return barPlot elif (drawPointPlot): pointPlot = point_plot(x,curves,title=title, colors=colors, log_scale_y=log_scale_y, log_scale_x=log_scale_x, legend=legend, x_title=x_title, y_title=y_title) return pointPlot else: raise NameError('Please select the type of the plot')
#coarse-grained metric: #statistics of the rmses (w.r.t. ensembles) for predicted position on the prediction interval mean_rmse[k] = np.mean(rmse_vec) std_rmse[k] = np.std(rmse_vec) print('Mean of rmse:', mean_rmse[k]) print('Std deviation of rmse:', std_rmse[k]) #true position, multiple predicted positions, the averaged prediction and the 90 percent confidence interval sonn = [] ax4 = plt.figure(figsize=(6, 3)) plt.plot(t_res, data_orig[trainlen:trainlen + future], 'r^') for i in range(n_ens): sonn.append(sol[i, trainlen - trainbeg:trainlen + future - trainbeg]) plt.plot(t_res, sonn[i], alpha=0.2) stderr = sem( sonn, axis=0 ) #std error of the mean (sem) provides a simple measure of uncertainty in a value #Remark: Confidence interval is calculated assuming the samples are drawn from a Gaussian distribution #Justification: As the sample size tends to infinity the central limit theorem guarantees that the sampling # distribution of the mean is asymptotically normal plt.plot(t_res, np.mean(sonn, axis=0), 'b-o') y1 = np.mean(sonn, axis=0) - 1.645 * stderr y2 = np.mean(sonn, axis=0) + 1.645 * stderr plt.plot(t_res, y1, '--') plt.plot(t_res, y2, '--') plt.fill_between(t_res, y1, y2, facecolor='blue', alpha=0.2) ax4.text(0.1, 0.96, '(b)', fontsize=12, verticalalignment='top') #plt.grid(False) #plt.title('true position, multiple predicted positions, the averaged prediction and the 90 percent confidence interval') plt.show()
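# Note (illustrative): as the Remark above states, the band assumes the sampling
# distribution of the mean is (asymptotically) normal; the 1.645 factor is the
# two-sided 90% normal quantile.
from scipy.stats import norm
print(norm.ppf(0.95))   # ~1.645, the multiplier used for the 90% interval above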
plt.close() def plot_scatter(x, y, title, x_axis, y_axis, file_name): plt.plot(x,y,color=colors.pop()) plt.scatter(x,y,color=colors.pop()) plt.title(title) plt.xlabel(x_axis) plt.ylabel(y_axis) plt.savefig(file_name) plt.show() plt.close() arms = K_array arm_mean = [] arm_err = [] arms_mistakes = [] sampleSize = len(arms) freedom_degree = sampleSize-1 for ele in arrayOfNumberOfSamplesForEveryK: arm_mean.append(np.mean(ele[1:len(ele)-1])) arm_err.append(ss.t.ppf(0.95, freedom_degree)*ss.sem(ele[1:len(ele)-1])) arms_mistakes.append(1.0*ele[len(ele)-1]/sampleSize) plot_errorbar(arms, arm_mean, arm_err, "K vs Sample Complexity", "K", "Sample Complexity", "KL_UCB_Sample_complexity.png") plot_scatter(arms, arms_mistakes, "Mistakes Probability vs K", "K", "Mistakes Probability", "KL_UCB_Mistake_probablity.png") #############################
def confidence_interval(x): lower, upper = stats.t.interval(0.95, len(x) - 1, loc=np.mean(x), scale=stats.sem(x)) return lower, upper
def main(): fdir = "data/processed" df_flux = pd.read_csv(os.path.join(fdir, "g1_fluxnet_screened.csv")) df_leaf = pd.read_csv(os.path.join(fdir, "g1_leaf_gas_exchange.csv")) df_isotope = pd.read_csv(os.path.join(fdir, "g1_isotope_screened.csv")) sns.set_style("ticks") sns.set_style({"xtick.direction": "in","ytick.direction": "in"}) fig = plt.figure(figsize=(12,12)) fig.subplots_adjust(hspace=0.05) fig.subplots_adjust(wspace=0.05) plt.rcParams['text.usetex'] = False plt.rcParams['font.family'] = "sans-serif" plt.rcParams['font.sans-serif'] = "Helvetica" plt.rcParams['axes.labelsize'] = 14 plt.rcParams['font.size'] = 12 plt.rcParams['legend.fontsize'] = 12 plt.rcParams['xtick.labelsize'] = 12 plt.rcParams['ytick.labelsize'] = 12 almost_black = '#262626' # change the tick colors also to the almost black plt.rcParams['ytick.color'] = almost_black plt.rcParams['xtick.color'] = almost_black # change the text colors also to the almost black plt.rcParams['text.color'] = almost_black # Change the default axis colors from black to a slightly lighter black, # and a little thinner (0.5 instead of 1) plt.rcParams['axes.edgecolor'] = almost_black plt.rcParams['axes.labelcolor'] = almost_black colour_list = brewer2mpl.get_map('Set2', 'qualitative', 8).mpl_colors #colour_list = sns.palplot(sns.color_palette("colorblind", 10)) #colour_list = sns.color_palette("Set2", 10) # CB palette with grey: # from http://jfly.iam.u-tokyo.ac.jp/color/image/pallete.jpg #colour_list = ["#56B4E9", "#009E73", "#0072B2", "#F0E442",\ # "#E69F00", "#D55E00", "#CC79A7", "#999999"] colour_list = sns.color_palette("Accent", 10) ax1 = fig.add_subplot(231) ax2 = fig.add_subplot(232) ax3 = fig.add_subplot(233) ax4 = fig.add_subplot(234) ax5 = fig.add_subplot(235) ax6 = fig.add_subplot(236) pft_order = ['ENF', 'EBF', 'DBF', 'TRF'] for i, pft in enumerate(pft_order): leaf = df_leaf[df_leaf.PFT == pft] isotope = df_isotope[df_isotope.PFT == pft] flux = df_flux[df_flux.PFT == pft] if i >= 3: cidx = i+1 else: cidx = i for lat in np.unique(leaf.latitude): data_lat = leaf[leaf.latitude == lat] ax1.errorbar(np.mean(data_lat.g1), np.mean(data_lat.latitude), xerr=stats.sem(data_lat.g1), ls=" ", marker="o", color=colour_list[cidx], markeredgecolor="lightgrey", alpha=0.8, capsize=False) for lat in np.unique(isotope.latitude): data_lat = isotope[isotope.latitude == lat] ax2.errorbar(np.mean(data_lat.g1), np.mean(data_lat.latitude), xerr=stats.sem(data_lat.g1), ls=" ", marker="o", color=colour_list[cidx], markeredgecolor="lightgrey", alpha=0.8, capsize=False) for lat in np.unique(flux.latitude): data_lat = flux[flux.latitude == lat] ax3.errorbar(np.mean(data_lat.g1), np.mean(data_lat.latitude), xerr=stats.sem(data_lat.g1), ls=" ", marker="o", color=colour_list[cidx], markeredgecolor="lightgrey", alpha=0.8, capsize=False) for i, pft in enumerate(pft_order): if i >= 3: cidx = i+1 else: cidx = i ax1.plot(np.nan, np.nan, ls=" ", marker="o", color=colour_list[cidx], markeredgecolor="lightgrey", label=pft, alpha=0.9) ax1.legend(numpoints=1, ncol=1, loc="best", frameon=False) pft_order = ['SAV', 'SHB', 'C3G', 'C4G', 'C3C', 'C4C'] for i, pft in enumerate(pft_order): leaf = df_leaf[df_leaf.PFT == pft] isotope = df_isotope[df_isotope.PFT == pft] flux = df_flux[df_flux.PFT == pft] if i >= 3: cidx = i+1 else: cidx = i for lat in np.unique(leaf.latitude): data_lat = leaf[leaf.latitude == lat] ax4.errorbar(np.mean(data_lat.g1), np.mean(data_lat.latitude), xerr=stats.sem(data_lat.g1), ls=" ", marker="D", color=colour_list[cidx], markeredgecolor="lightgrey", 
alpha=0.8, capsize=False) for lat in np.unique(isotope.latitude): data_lat = isotope[isotope.latitude == lat] ax5.errorbar(np.mean(data_lat.g1), np.mean(data_lat.latitude), xerr=stats.sem(data_lat.g1), ls=" ", marker="D", color=colour_list[cidx], markeredgecolor="lightgrey", label=pft, alpha=0.8, capsize=False) for lat in np.unique(flux.latitude): data_lat = flux[flux.latitude == lat] ax6.errorbar(np.mean(data_lat.g1), np.mean(data_lat.latitude), xerr=stats.sem(data_lat.g1), ls=" ", marker="D", color=colour_list[cidx], markeredgecolor="lightgrey", alpha=0.8, capsize=False) for i, pft in enumerate(pft_order): if i >= 3: cidx = i+1 else: cidx = i ax4.plot(np.nan, np.nan, ls=" ", marker="D", color=colour_list[cidx], markeredgecolor="lightgrey", label=pft, alpha=0.9) ax4.legend(numpoints=1, ncol=1, loc="best", frameon=False) labels = ["(a)", "(b)", "(c)", "(d)", "(e)", "(f)"] props = dict(boxstyle='round', facecolor='white', alpha=1.0, ec="white") for i, ax in enumerate([ax1, ax2, ax3, ax4, ax5, ax6]): ax.set_xlim(0, 14) ax.set_ylim(-60, 90) ax.locator_params(nbins=6, axis="x") ax.locator_params(nbins=6, axis="y") ax.axhline(y=0.0, c='grey', lw=1.0, ls='--') ax.axhline(y=-23.43723, c='grey', lw=1.0, ls='-.') ax.axhline(y=23.43723, c='grey', lw=1.0, ls='-.') ax.text(0.03, 0.98, labels[i], transform=ax.transAxes, fontsize=12, verticalalignment='top', bbox=props) for ax in [ax1, ax2, ax3]: plt.setp(ax.get_xticklabels(), visible=False) for ax in [ax2, ax3, ax5, ax6]: plt.setp(ax.get_yticklabels(), visible=False) ax1.set_title("Leaf gas exchange") ax2.set_title("Leaf isotope") ax3.set_title("FLUXNET") ax1.set_ylabel("Latitude (degrees)", position=(0.5, 0.0)) ax5.set_xlabel("Estimated $g_1$ (kPa$^{0.5}$)") odir = "/Users/%s/Dropbox/g1_leaf_ecosystem_paper/figures/figs/" % \ (os.getlogin()) plt.savefig(os.path.join(odir, "g1_vs_latitude.pdf"), bbox_inches='tight', pad_inches=0.1)
def tf_edge_delta_out( crc_folder, bam_list, analysis_name, edge_table_path_1, edge_table_path_2, group1_list, group2_list, output="", ): """Calculates changes in group out degree at each predicted motif occurrence (by subpeaks).""" crc_folder = utils.format_folder(crc_folder, True) edge_path = merge_edge_tables( edge_table_path_1, edge_table_path_2, os.path.join(crc_folder, "{}_EDGE_TABLE.txt".format(analysis_name)), ) # make a gff of the edge table edge_table = utils.parse_table(edge_path, "\t") edge_gff = [] for line in edge_table[1:]: gff_line = [ line[2], "{}_{}".format(line[0], line[1]), "", line[3], line[4], "", ".", "", "{}_{}".format(line[0], line[1]), ] edge_gff.append(gff_line) edge_gff_path = os.path.join(crc_folder, "{}_EDGE_TABLE.gff".format(analysis_name)) utils.unparse_table(edge_gff, edge_gff_path, "\t") # direct the output to the crc folder signal_path = os.path.join( crc_folder, "{}_EDGE_TABLE_signal.txt".format(analysis_name)) all_group_list = group1_list + group2_list if not utils.check_output(signal_path, 0, 0): signal_table_list = pipeline_utils.map_regions( bam_list, [edge_gff_path], crc_folder, crc_folder, all_group_list, True, signal_path, extend_reads_to=100, ) print(signal_table_list) else: print("Found previous signal table at {}".format(signal_path)) # now bring in the signal table as a dictionary using the locus line as the id print("making log2 group1 vs group2 signal table at edges") signal_table = utils.parse_table(signal_path, "\t") # figure out columns for group1 and group2 group1_columns = [signal_table[0].index(name) for name in group1_list] group2_columns = [signal_table[0].index(name) for name in group2_list] group1_signal_vector = [] group2_signal_vector = [] for line in signal_table[1:]: group1_signal = numpy.mean( [float(line[col]) for col in group1_columns]) group2_signal = numpy.mean( [float(line[col]) for col in group2_columns]) group1_signal_vector.append(group1_signal) group2_signal_vector.append(group2_signal) group1_median = numpy.median(group1_signal_vector) group2_median = numpy.median(group2_signal_vector) print("group1 median signal") print(group1_median) print("group2 median signal") print(group2_median) # now that we have the median, we can take edges where at least 1 edge is above the median # and both are above zero and generate a new table w/ the fold change signal_filtered_path = signal_path.replace(".txt", "_filtered.txt") if utils.check_output(signal_filtered_path, 0, 0): print("Found filtered signal table for edges at {}".format( signal_filtered_path)) signal_table_filtered = utils.parse_table(signal_filtered_path, "\t") else: signal_table_filtered = [ signal_table[0] + ["GROUP1_MEAN", "GROUP2_MEAN", "GROUP1_vs_GROUP2_LOG2"] ] for line in signal_table[1:]: group1_signal = numpy.mean( [float(line[col]) for col in group1_columns]) group2_signal = numpy.mean( [float(line[col]) for col in group2_columns]) if (group1_signal > group1_median or group2_signal > group2_median ) and min(group1_signal, group2_signal) > 0: delta = numpy.log2(group1_signal / group2_signal) new_line = line + [group1_signal, group2_signal, delta] signal_table_filtered.append(new_line) utils.unparse_table(signal_table_filtered, signal_filtered_path, "\t") # now get a list of all TFs in the system tf_list = utils.uniquify( [line[0].split("_")[0] for line in signal_table_filtered[1:]]) tf_list.sort() print(tf_list) out_degree_table = [[ "TF_NAME", "EDGE_COUNT", "DELTA_MEAN", "DELTA_MEDIAN", "DELTA_STD", "DELTA_SEM", ]] for tf_name in tf_list: print(tf_name) 
        edge_vector = [
            float(line[-1]) for line in signal_table_filtered[1:]
            if line[0].split("_")[0] == tf_name
        ]

        edge_count = len(edge_vector)
        delta_mean = round(numpy.mean(edge_vector), 4)
        delta_median = round(numpy.median(edge_vector), 4)
        delta_std = round(numpy.std(edge_vector), 4)
        delta_sem = round(stats.sem(edge_vector), 4)
        tf_out_line = [
            tf_name,
            edge_count,
            delta_mean,
            delta_median,
            delta_std,
            delta_sem,
        ]
        out_degree_table.append(tf_out_line)

    # set final output
    if not output:
        output_path = os.path.join(
            crc_folder, "{}_EDGE_DELTA_OUT.txt".format(analysis_name))
    else:
        output_path = output

    utils.unparse_table(out_degree_table, output_path, "\t")
    print(output_path)
    return output_path
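# Hedged sketch (illustration only, not part of tf_edge_delta_out): the final per-TF
# aggregation above boils down to grouping the log2 delta column by the TF prefix of
# the edge ID and summarising each group. The toy rows below are made up; the column
# layout (edge ID in column 0, delta in the last column) mirrors the filtered table.
from collections import defaultdict
import numpy
from scipy import stats

toy_rows = [
    ["TFA_GENE1", 0.8], ["TFA_GENE2", 1.1], ["TFA_GENE3", 0.5],
    ["TFB_GENE1", -0.3], ["TFB_GENE4", -0.7],
]

deltas_by_tf = defaultdict(list)
for row in toy_rows:
    deltas_by_tf[row[0].split("_")[0]].append(float(row[-1]))

out_degree_toy = [["TF_NAME", "EDGE_COUNT", "DELTA_MEAN",
                   "DELTA_MEDIAN", "DELTA_STD", "DELTA_SEM"]]
for tf_name in sorted(deltas_by_tf):
    vec = deltas_by_tf[tf_name]
    out_degree_toy.append([
        tf_name, len(vec),
        round(numpy.mean(vec), 4), round(numpy.median(vec), 4),
        round(numpy.std(vec), 4), round(stats.sem(vec), 4),
    ])
print(out_degree_toy)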
import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as sc

TEST_DATA = np.array([
    [1, 2, 3, 2, 1, 2, 3, 4, 2, 3, 2, 1, 2, 3, 4, 4, 3, 2, 3, 2, 3, 2, 1],
    [5, 6, 5, 4, 5, 6, 7, 7, 6, 7, 7, 2, 8, 7, 6, 5, 5, 6, 7, 7, 7, 6, 5],
    [9, 8, 7, 8, 8, 7, 4, 6, 6, 5, 4, 3, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 1],
    [3, 2, 3, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 5, 6, 6, 7, 8, 9, 8, 5],
])

# find the mean for each of our observations
y = np.mean(TEST_DATA, axis=1, dtype=np.float64)
# and the half-width of the 95% confidence interval (1.96 * standard error of the mean)
ci95 = 1.96 * sc.sem(TEST_DATA, axis=1)

# each set is one try
tries = np.arange(0, len(y), 1.0)

# tweak grid and set up labels, limits
plt.grid(True, alpha=0.5)
plt.gca().set_xlabel('Observation #')
plt.gca().set_ylabel('Mean (+- 95% CI)')
plt.title("Observations with corresponding 95% CI as error bar.")
plt.bar(tries, y, align='center', alpha=0.2)
plt.errorbar(tries, y, yerr=ci95, fmt='none')
plt.show()
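# Optional refinement (sketch, not from the original snippet): with only 23 samples per
# row, a Student-t critical value gives a slightly wider, more conservative interval than
# the normal-approximation 1.96. This reuses TEST_DATA and the scipy.stats alias `sc`
# from the snippet above; ci95_t is a drop-in replacement for ci95.
n = TEST_DATA.shape[1]
t_crit = sc.t.ppf(0.975, df=n - 1)           # ~2.07 for 22 degrees of freedom
ci95_t = t_crit * sc.sem(TEST_DATA, axis=1)  # per-row 95% CI half-width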
def evaluate_embeddings(self, algo, global_step, datasets):
    """Labeled evaluation."""
    # TODO(debidatta): Move these hard coded params to config after expts.
    num_labeled_list = list(range(1, 11))
    num_episodes = 50
    # Set random seed to ensure same samples are generated for each evaluation.
    np.random.seed(seed=42)

    train_embs = np.concatenate(datasets['train_dataset']['embs'])
    val_embs = np.concatenate(datasets['val_dataset']['embs'])

    if train_embs.shape[0] == 0 or val_embs.shape[0] == 0:
        logging.warning(
            'All embeddings are NAN. Something is wrong with model.')
        return 0.0

    val_labels = np.concatenate(datasets['val_dataset']['labels'])

    report_val_accs = []

    train_dataset = datasets['train_dataset']
    num_samples = len(train_dataset['embs'])

    # Also add half of the train dataset.
    num_labeled_list += [int(0.5 * num_samples)]

    # Create episode list.
    episodes_list = []
    for num_labeled in num_labeled_list:
        episodes = []
        for _ in range(num_episodes):
            episodes.append(
                np.random.permutation(num_samples)[:num_labeled])
        episodes_list.append(episodes)

    def indi_worker(episode):
        """Executes single episode for a particular k-shot task."""
        train_embs = np.concatenate(np.take(train_dataset['embs'], episode))
        train_labels = np.concatenate(
            np.take(train_dataset['labels'], episode))
        train_acc, val_acc = fit_linear_models(train_embs, train_labels,
                                               val_embs, val_labels)
        return train_acc, val_acc

    def worker(episodes):
        """Executes all episodes for a particular k-shot task."""
        with cf.ThreadPoolExecutor() as executor:
            results = executor.map(indi_worker, episodes)
            results = list(zip(*results))
        train_accs = results[0]
        val_accs = results[1]
        return train_accs, val_accs

    with cf.ThreadPoolExecutor() as executor:
        results = executor.map(worker, episodes_list)

    for (num_labeled, (train_accs, val_accs)) in zip(num_labeled_list, results):
        prefix = '%s_%s' % (datasets['name'], str(num_labeled))

        # Get average accuracy over all episodes.
        train_acc = np.mean(np.mean(train_accs))
        val_acc = np.mean(np.mean(val_accs))

        # Get 95% Confidence Intervals.
        train_ci = st.t.interval(
            0.95,
            len(train_accs) - 1,
            loc=train_acc,
            scale=st.sem(train_accs))[1] - train_acc
        val_ci = st.t.interval(
            0.95,
            len(val_accs) - 1,
            loc=val_acc,
            scale=st.sem(val_accs))[1] - val_acc

        logging.info('[Global step: {}] Classification {} Shot '
                     'Train Accuracy: {:.4f},'.format(
                         global_step.numpy(), prefix, train_acc))
        logging.info('[Global step: {}] Classification {} Shot '
                     'Val Accuracy: {:.4f},'.format(
                         global_step.numpy(), prefix, val_acc))
        logging.info('[Global step: {}] Classification {} Shot '
                     'Train Confidence Interval: {:.4f},'.format(
                         global_step.numpy(), prefix, train_ci))
        logging.info('[Global step: {}] Classification {} Shot '
                     'Val Confidence Interval: {:.4f},'.format(
                         global_step.numpy(), prefix, val_ci))

        tf.summary.scalar('few_shot_cxn/train_%s_accuracy' % prefix,
                          train_acc, step=global_step)
        tf.summary.scalar('few_shot_cxn/val_%s_accuracy' % prefix,
                          val_acc, step=global_step)
        tf.summary.scalar('few_shot_cxn/train_%s_ci' % prefix,
                          train_ci, step=global_step)
        tf.summary.scalar('few_shot_cxn/val_%s_ci' % prefix,
                          val_ci, step=global_step)

        report_val_accs.append(val_acc)

    return report_val_accs[-1]
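# Hedged side note (not part of the evaluation code): the CI half-widths above are the
# distance from the mean to the upper bound of a two-sided 95% t-interval over the
# per-episode accuracies, which is equivalent to t_crit * SEM. Illustrated here with
# made-up accuracy values.
import numpy as np
import scipy.stats as st

accs = np.array([0.62, 0.58, 0.65, 0.60, 0.63, 0.59, 0.61, 0.64])
mean_acc = np.mean(accs)
half_width = st.t.interval(0.95, len(accs) - 1,
                           loc=mean_acc, scale=st.sem(accs))[1] - mean_acc
assert np.isclose(half_width, st.t.ppf(0.975, len(accs) - 1) * st.sem(accs))
print("mean={:.4f}, 95% CI half-width={:.4f}".format(mean_acc, half_width))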
import numpy as np
from scipy import stats


def mean_and_se(data):
    """Return the sample mean and the standard error of the mean."""
    mu = np.mean(data)
    se = stats.sem(data)
    return mu, se
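# Minimal usage sketch for mean_and_se; the sample values below are made up.
sample = np.array([2.1, 2.5, 1.9, 2.8, 2.2, 2.6])
mu, se = mean_and_se(sample)
print("mean = {:.3f}, standard error = {:.3f}".format(mu, se))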