def find_scores_Z(N, old_seeds, degrees):
    scores = []
    diffs = []
    timespents = []
    avgdegs = []
    for i in range(len(N)):
        seeds = old_seeds + [i]
        diffscore = 0.0
        timespentscore = 0.0
        for j in range(len(seeds)):
            seed = seeds[j]
            for other_seed in seeds[j:]:
                diffscore += float(abs(degrees[seed] - degrees[other_seed]))
            for other_seed in seeds:
                if seed == other_seed:
                    continue
                timespentscore += N[seed][other_seed]
        diffs.append(diffscore)
        timespents.append(timespentscore)
        avgdegsum = 0
        for seed in seeds:
            avgdegsum += degrees[seed]
        avgdeg = avgdegsum / len(seeds)
        avgdegs.append(avgdeg)
    # debug prints for two hard-coded candidate indices
    print(timespents[76])
    print(timespents[58])
    diffs = stats.zscore(diffs)
    timespents = stats.zscore(timespents)
    avgdegs = stats.zscore(avgdegs)
    for i in range(len(diffs)):
        scores.append(diffs[i] + timespents[i] - avgdegs[i])
    return scores
def collaspe_fclusters(data=None, t=None, row_labels=None, col_labels=None,
                       linkage='average', pdist='euclidean', standardize=3, log=False):
    """a function to collapse flat clusters by averaging the vectors within
    each flat cluster achieved from hierarchical clustering"""
    ## preprocess data
    if log:
        data = np.log2(data + 1.0)
    if standardize == 1:  # Standardize along the columns of data
        data = zscore(data, axis=0)
    elif standardize == 2:  # Standardize along the rows of data
        data = zscore(data, axis=1)
    if row_labels is not None and col_labels is None:  ## only get fclusters for rows
        d = dist.pdist(data, metric=pdist)
        axis = 1  ##!!! haven't checked whether this is correct yet
    elif row_labels is None and col_labels is not None:  ## only get fclusters for cols
        d = dist.pdist(data.T, metric=pdist)
        axis = 0
    D = dist.squareform(d)
    Y = sch.linkage(D, method=linkage, metric=pdist)
    fclusters = sch.fcluster(Y, t, 'distance')
    fcluster_set = set(fclusters)
    data_cf = []
    for fc in fcluster_set:
        mask = np.where(fclusters == fc)
        data_t = data.T
        vector_avg = np.average(data_t[mask], axis=axis)
        data_cf.append(vector_avg)
    data_cf = np.array(data_cf).T
    return data_cf
def main(argv):
    category_predictions = {}
    for cat in categories:
        file_reg = classifier_reg + "/" + cat + "/predictions"
        fo = open(file_reg)
        reg_predictions = [float(x) for x in fo.read().split('\n')[:-1]]
        reg_zscores = stats.zscore(reg_predictions)
        file_red = classifier_red + "/" + cat + "/predictions"
        fo = open(file_red)
        red_predictions = [float(x) for x in fo.read().split('\n')[:-1]]
        red_zscores = stats.zscore(red_predictions)
        max_zscore = []
        # Take zscore that has the greatest deviation out of the two classifiers
        for i in range(len(reg_predictions)):
            reg_greater = abs(reg_zscores[i]) >= abs(red_zscores[i])
            val = reg_zscores[i] if reg_greater else red_zscores[i]
            max_zscore.append(val)
        category_predictions[cat] = max_zscore
    results_list = output_predictions(categories, category_predictions)
    calculate_accuracies(results_list)
def algorithm(w1, w2, w3, w4, G1, G2, G3, G4):
    try:
        cc = np.array([nx.average_clustering(G1, weight='weight'),
                       nx.average_clustering(G2, weight='weight'),
                       nx.average_clustering(G3, weight='weight'),
                       nx.average_clustering(G4, weight='weight')])
        spl = np.array([nx.average_shortest_path_length(G1, weight='weight'),
                        nx.average_shortest_path_length(G2, weight='weight'),
                        nx.average_shortest_path_length(G3, weight='weight'),
                        nx.average_shortest_path_length(G4, weight='weight')])
        nds = np.array([nx.number_of_nodes(G1), nx.number_of_nodes(G2),
                        nx.number_of_nodes(G3), nx.number_of_nodes(G4)])
        edgs = np.array([nx.number_of_edges(G1), nx.number_of_edges(G2),
                         nx.number_of_edges(G3), nx.number_of_edges(G4)])
        if valid(cc):
            cc = stats.zscore(cc)
        else:
            cc = np.array([.1, .1, .1, .1])
        cc = cc - min(cc) + .1
        if valid(spl):
            spl = stats.zscore(spl)
        else:
            spl = np.array([.1, .1, .1, .1])
        spl = spl - min(spl) + .1
        if valid(nds):
            nds = stats.zscore(nds)
        else:
            nds = np.array([.1, .1, .1, .1])
        nds = nds - min(nds) + .1
        if valid(edgs):
            edgs = stats.zscore(edgs)
        else:
            edgs = np.array([.1, .1, .1, .1])
        edgs = edgs - min(edgs) + .1
        r1 = (w1*cc[0] + w2*spl[0] + w3*nds[0] + w4*edgs[0]) * 1000
        r2 = (w1*cc[1] + w2*spl[1] + w3*nds[1] + w4*edgs[1]) * 1000
        r3 = (w1*cc[2] + w2*spl[2] + w3*nds[2] + w4*edgs[2]) * 1000
        r4 = (w1*cc[3] + w2*spl[3] + w3*nds[3] + w4*edgs[3]) * 1000
        d = {'Player 1:': r1, 'Player 2:': r2, 'Player 3:': r3, 'Player 4:': r4}
        rank = sorted(d.items(), key=lambda x: x[1], reverse=True)
        return ["USAU RANKINGS",
                str(rank[0][0]) + " " + str(int(rank[0][1])),
                str(rank[1][0]) + " " + str(int(rank[1][1])),
                str(rank[2][0]) + " " + str(int(rank[2][1])),
                str(rank[3][0]) + " " + str(int(rank[3][1]))]
    except Exception:
        return ["Unable to compute rankings! Need data",
                "Player 1", "Player 2", "Player 3", "Player 4"]
def correct_covariates(Dtrait, Dcov, variables):
    Dcomb = pd.merge(Dtrait.T, Dcov.T, left_index=True, right_index=True).T
    Dcorr = Dtrait.copy()
    traits = Dtrait.columns.values.tolist()
    print('Correcting for %s' % variables)
    for idx, i in enumerate(Dtrait.index):
        sys.stdout.write('\rTrait %d of %d' % (idx, Dtrait.shape[0]))
        sys.stdout.flush()
        if len(variables) == 1:
            rlm_model = sm.RLM(Dcomb.loc[i, :],
                               zscore(array(Dcomb.loc[variables, :]).T))
        else:
            rlm_model = sm.RLM(Dcomb.loc[i, :],
                               zscore(array(Dcomb.loc[variables, :]).T, axis=0))
        rlm_results = rlm_model.fit()
        Dcorr.loc[i, :] = rlm_results.resid
        """
        if idx > 1:
            f, axarr = subplots(3, 2)
            axarr[0, 0].scatter(Dtrait.loc['EIF1AY', :], Dtrait.loc['OSBP', :])
            axarr[0, 1].scatter(Dcorr.loc['EIF1AY', :], Dcorr.loc['OSBP', :])
            axarr[1, 0].hist([x for x in Dtrait.loc['EIF1AY', :] if not isnan(x)])
            axarr[1, 1].hist([x for x in Dcorr.loc['EIF1AY', :] if not isnan(x)])
            axarr[2, 0].hist([x for x in Dtrait.loc['OSBP', :] if not isnan(x)])
            axarr[2, 1].hist([x for x in Dcorr.loc['OSBP', :] if not isnan(x)])
            f2, axarr2 = subplots(3, 2)
            axarr2[0, 0].scatter(Dcomb.loc['gender', :], Dcomb.loc['OSBP', :])
            axarr2[1, 0].scatter(Dcomb.loc['age', :], Dcomb.loc['OSBP', :])
            axarr2[2, 0].scatter(Dcomb.loc['site', :], Dcomb.loc['OSBP', :])
            axarr2[0, 1].scatter(Dcomb.loc['gender', :], Dcomb.loc['EIF1AY', :])
            axarr2[1, 1].scatter(Dcomb.loc['age', :], Dcomb.loc['EIF1AY', :])
            axarr2[2, 1].scatter(Dcomb.loc['site', :], Dcomb.loc['EIF1AY', :])
            show()
            exit()
        """
    return Dcorr
def predict_loo(transformed_data, args):
    print('mysseg loo')
    sys.stdout.flush()
    (ndim, nsample, nsubjs) = transformed_data.shape
    tst_subj = args.loo
    win_size = args.winsize
    nseg = nsample - win_size
    # mysseg prediction
    trn_data = np.zeros((ndim * win_size, nseg))
    # the trn data also include the tst data, but will be subtracted when
    # calculating A
    for m in range(nsubjs):
        for w in range(win_size):
            trn_data[w*ndim:(w+1)*ndim, :] += transformed_data[:, w:(w+nseg), m]
    tst_data = np.zeros((ndim * win_size, nseg))
    for w in range(win_size):
        tst_data[w*ndim:(w+1)*ndim, :] = transformed_data[:, w:(w+nseg), tst_subj]
    A = stats.zscore((trn_data - tst_data), axis=0, ddof=1)
    B = stats.zscore(tst_data, axis=0, ddof=1)
    corr_mtx = B.T.dot(A)
    for i in range(nseg):
        for j in range(nseg):
            if abs(i - j) < win_size and i != j:
                corr_mtx[i, j] = -np.inf
    rank = np.argmax(corr_mtx, axis=1)
    accu = sum(rank == range(nseg)) / float(nseg)
    return accu
def hist_and_smooth_data(spike_data):
    max_spike_ts = 0
    for i in range(len(spike_data)):
        if np.amax(spike_data[i]) > max_spike_ts:
            max_spike_ts = np.amax(spike_data[i])
    max_bin_num = int(np.ceil(max_spike_ts) / float(bin_size) * 1000)
    hist_data = np.zeros((len(spike_data), max_bin_num))
    hist_bins = np.zeros((len(spike_data), max_bin_num))
    for i in range(len(spike_data)):
        total_bin_range = np.arange(0, int(np.ceil(spike_data[i].max())), bin_size / 1000.0)
        hist, bins = np.histogram(spike_data[i], bins=total_bin_range,
                                  range=(0, int(np.ceil(spike_data[i].max()))),
                                  normed=False, density=False)
        # pdb.set_trace()
        hist_data[i, 0:len(hist)] = hist
        hist_bins[i, 0:len(bins)] = bins
    # TODO fix so gaussian divides by bin size and converts to firing rate before smoothing
    # TODO make option for zscore and gaussian together
    if zscore_bool and gaussian_bool:
        smoothed = stats.zscore(hist_data, axis=1)
        smoothed = ndimage.filters.gaussian_filter1d(smoothed, gauss_sigma, axis=1)
    elif zscore_bool:
        smoothed = stats.zscore(hist_data, axis=1)
    elif gaussian_bool:
        smoothed = ndimage.filters.gaussian_filter1d(hist_data, gauss_sigma, axis=1)
    else:
        smoothed = {}
    return_dict = {'hist_data': hist_data, 'hist_bins': hist_bins, 'smoothed': smoothed}
    return return_dict
def plot_features_distribution(feature_set, feature_set_permutation, save_path,
                               prename='features', n_features=90, n_bins=20):
    plt.figure()
    h_values_p, _ = np.histogram(feature_set_permutation.flatten(),
                                 bins=np.arange(0, n_features + 1))
    plt.hist(zscore(h_values_p), bins=n_bins)
    fname = "%s_features_set_permutation_distribution.png" % (prename)
    plt.savefig(os.path.join(save_path, fname))

    plt.figure()
    h_values_, _ = np.histogram(feature_set.flatten(),
                                bins=np.arange(0, n_features + 1))
    plt.plot(zscore(h_values_))
    fname = "%s_features_set_cross_validation.png" % (prename)
    plt.savefig(os.path.join(save_path, fname))

    plt.close('all')
def knnClassifier(training_data, test_data, training_target, test_target, k=5):
    # normalize the data
    # calculate the z-score of the data
    # print training_data
    training_data = training_data
    new_training_data = stats.zscore(training_data.astype(int), axis=0)
    new_test_data = stats.zscore(test_data.astype(int), axis=0)
    # find the k nearest neighbors for each test data
    # print 'test', new_test_data
    predictions = []
    for test in new_test_data:
        # print test
        # find the euclidean distance between the test case and all training cases
        distances = []
        neighbors = []
        neighbor_predictions = []
        for train in new_training_data:
            # print train
            distances.append(np.linalg.norm(train - test))
        # print distances
        for i in range(k):
            neighb_i = distances.index(min(distances))
            neighbors.append(neighb_i)
            distances[neighb_i] = 1000000
        # print neighbors
        for neighb in neighbors:
            neighbor_predictions.append(training_target[neighb])
        predictions.append(stats.mode(neighbor_predictions)[0][0])
    return predictions
def ExtractDataVer2(all_relevant_channels, marker_positions, target, ms_before, ms_after):
    target_idx = marker_positions[np.where(target == 1)[0]] - 1
    all_target_transpose = np.asarray(all_relevant_channels).T
    number_positive_of_samples = len(target_idx)
    before_trigger = int((ms_before * 1.0) / 5)
    after_trigger = int((ms_after * 1.0) / 5)
    all_target_data = extractTimeWindowFast(all_target_transpose, target_idx,
                                            before_trigger, after_trigger)
    non_target_idx = marker_positions[np.where(target == 0)[0]] - 1
    number_positive_of_samples = len(non_target_idx)
    all_non_target_data = extractTimeWindowFast(all_target_transpose, non_target_idx,
                                                before_trigger, after_trigger)
    # normalize the data over the time axis
    all_data = np.vstack((stats.zscore(all_target_data, axis=1).astype('float32'),
                          stats.zscore(all_non_target_data, axis=1).astype('float32')))
    all_tags = np.vstack((np.ones((all_target_data.shape[0], 1), dtype='int8'),
                          np.zeros((all_non_target_data.shape[0], 1), dtype='int8')))
    return all_data, all_tags
def data_parser(theta, kappa, tt, ch, tt_ch):
    theta_r = np.array([[resample(theta.values.squeeze()[i, 950:1440], 50)]
                        for i in range(0, theta.shape[0])])
    theta_r = zscore(theta_r.squeeze(), axis=None)
    kappa_r = np.array([[resample(kappa.values.squeeze()[i, 950:1440], 50)]
                        for i in range(0, kappa.shape[0])])
    kappa_r = zscore(kappa_r.squeeze(), axis=None)
    kappa_df = pd.DataFrame(kappa_r)
    theta_df = pd.DataFrame(theta_r)
    both_df = pd.concat([theta_df, kappa_df], axis=1)
    if tt_ch == 'tt':  # trial type
        clean = np.nan_to_num(tt) != 0
        tt_c = tt[clean.squeeze()].values
    else:  # choice
        clean = np.nan_to_num(ch) != 0
        tt_c = ch[clean.squeeze()].values
    # tt_c = tt[tt.values != 0 | 3].values
    both = both_df.values
    # both_c = both[clean.squeeze(), :]
    both_c = both[clean.squeeze(), :]
    # keeping one-hot vector for now (in case we want it later)
    # labs = np.eye(3)[tt_c.astype(int) - 1]
    # y[np.arange(3), a] = 1
    # labs = labs.squeeze()
    return both_c, tt_c, clean
def significantly_unenriched(xs, ys, zthresh=2., scale='linear'):
    assert scale in ['linear', 'log']
    if scale == 'log':
        xs = np.log2(xs)
        ys = np.log2(ys)
    xs = stats.zscore(xs)
    ys = stats.zscore(ys)
    return [x < -zthresh or y < -zthresh for x, y in zip(xs, ys)]
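# A minimal usage sketch for significantly_unenriched above (hypothetical data,
# assuming numpy as np and scipy.stats as stats are imported): the tenth point is
# strongly depleted in xs, so only its flag comes back True.
xs = np.array([8., 8., 8., 8., 8., 8., 8., 8., 8., 0.5])
ys = np.array([7., 8., 9., 7.5, 8.5, 8., 7., 9., 8., 8.])
print(significantly_unenriched(xs, ys, zthresh=2., scale='linear'))
# -> [False, False, False, False, False, False, False, False, False, True]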
def simulate(self, beta0, beta, x):
    if (self.distr == 'poisson'):
        y = np.random.poisson(self.lmb(beta0, beta, zscore(x)))
    if (self.distr == 'normal'):
        y = np.random.normal(self.lmb(beta0, beta, zscore(x)))
    if (self.distr == 'binomial'):
        y = np.random.binomial(1, self.lmb(beta0, beta, zscore(x)))
    return y
def _normalize_network(self, x, square=True):
    if square:
        norm_col = zscore(x, axis=0)
        return (norm_col + norm_col.T) / math.sqrt(2)
    else:
        norm_col = zscore(x, axis=0)
        norm_row = zscore(x, axis=1)
        return (norm_col + norm_row) / math.sqrt(2)
def combineTT(data1, data2):
    iA, iB = findOLIndices(data1.geneList, data2.geneList)
    newGeneList = [data1.geneList[i] for i in iA]
    newSampleList = data1.cleanSampIDs + data2.cleanSampIDs
    newCnData = np.column_stack((data1.cnData[iA, :], data2.cnData[iB, :]))
    newRnaData = np.column_stack((ss.zscore(data1.rnaData[iA, :], 1),
                                  ss.zscore(data2.rnaData[iB, :], 1)))
    newData = combinedData(newGeneList, newSampleList, newRnaData, newCnData)
    return newData
def five_second_z_score(start):
    # walk the signal in 5000-sample chunks and z-score each chunk separately
    for interval in range(len(matrix[start]) // 5000 + 1):
        j = 5000 * interval
        # print interval, j
        if interval == len(matrix[start]) // 5000:
            # last (possibly partial) chunk
            z_score_five_sec.append(stats.zscore(matrix[start][j:]))
        else:
            z_score_five_sec.append(stats.zscore(matrix[start][j:j + 5000]))
def np_fisherZ(x, y, r):
    z = 0.5 * (np.log(1.0 + r) - np.log(1.0 - r))
    w = np.sqrt(len(x)) * z
    x_ = zscore(x)
    y_ = zscore(y)
    t2 = moment22(x_, y_)
    t = np.sqrt(t2)
    p = 2. * (1. - norm.cdf(np.abs(w), 0.0, t))
    return p
def simulate(self, beta0, beta, X):
    """Simulate data."""
    if self.distr == 'poisson':
        y = np.random.poisson(self.lmb(beta0, beta, zscore(X)))
    if self.distr == 'normal':
        y = np.random.normal(self.lmb(beta0, beta, zscore(X)))
    if self.distr == 'binomial':
        y = np.random.binomial(1, self.lmb(beta0, beta, zscore(X)))
    return y
def euclid(v1, v2):
    '''Euclidean distance between two z-scored, equally matched vectors
    USAGE: d = euclid(v1,v2)'''
    from scipy.stats import zscore
    v1 = zscore(v1.flatten())
    v2 = zscore(v2.flatten())
    d2 = np.sqrt(np.sum((v1 - v2) ** 2))
    return d2
def plot(pred, y):
    predm, ym = stats.zscore(pred, axis=0, ddof=1), stats.zscore(y, axis=0, ddof=1)
    # predm, ym = pp.normalize(pred), pp.normalize(y)
    print(predm.shape, ym.shape)
    times = range(0, len(pred))
    # print times.shape
    print(len(times))
    plt.plot(times, predm, "r-", times, ym, "b-")
    plt.xlabel("Time (s)")
    plt.ylabel("BOLD Response")
    plt.savefig("../figure/lassoplot.jpg")
def iClustergram(data=None, row_labels=None, col_labels=None,
                 row_groups=None, col_groups=None,
                 row_linkage='average', col_linkage='average',
                 row_pdist='euclidean', col_pdist='euclidean',
                 standardize=None, log=False, display_range=3,
                 username='******', apikey='fmnoxd2t2u'):
    ## preprocess data
    if log:
        data = np.log2(data + 1.0)
    if standardize == 1:  # Standardize along the columns of data
        data = zscore(data, axis=0)
    elif standardize == 2:  # Standardize along the rows of data
        data = zscore(data, axis=1)
    ## cluster data:
    ## compute pdist for rows
    d1 = dist.pdist(data, metric=row_pdist)
    D1 = dist.squareform(d1)
    Y1 = sch.linkage(D1, method=row_linkage, metric=row_pdist)
    Z1 = sch.dendrogram(Y1, orientation='right')
    idx1 = Z1['leaves']
    ## compute pdist for cols
    d2 = dist.pdist(data.T, metric=col_pdist)
    D2 = dist.squareform(d2)
    Y2 = sch.linkage(D2, method=col_linkage, metric=col_pdist)
    Z2 = sch.dendrogram(Y2)
    idx2 = Z2['leaves']
    ## transform the orders of data to clustered data
    data_clustered = data
    data_clustered = data_clustered[:, idx2]
    data_clustered = data_clustered[idx1, :]
    data_to_plot = data_clustered.tolist()
    ## transform the orders of row and col labels
    new_row_labels = []
    new_col_labels = []
    for i in range(data.shape[0]):
        new_row_labels.append(row_labels[idx1[i]])
    for i in range(data.shape[1]):
        new_col_labels.append(col_labels[idx2[i]])
    ## plot clustered data using plotly
    py = plotly.plotly(username, apikey)
    d = {}
    d['x'] = new_row_labels
    d['y'] = new_col_labels
    d['z'] = data_to_plot
    d['type'] = 'heatmap'
    py.plot([d])
    return
def plot_fclusters(data=None, row_labels=None, col_labels=None, linkage='average',
                   pdist='euclidean', standardize=3, log=False):
    """a function to plot the relationship between thresholds and the number of
    flat clusters achieved from hierarchical clustering; aims to find the optimal
    threshold for forming clusters"""
    ## preprocess data
    if log:
        data = np.log2(data + 1.0)
    if standardize == 1:  # Standardize along the columns of data
        data = zscore(data, axis=0)
    elif standardize == 2:  # Standardize along the rows of data
        data = zscore(data, axis=1)
    fig = plt.figure()
    ax1 = fig.add_subplot(121)
    ax2 = fig.add_subplot(122)
    if row_labels is not None and col_labels is None:  ## only get fclusters for rows
        d = dist.pdist(data, metric=pdist)
    elif row_labels is None and col_labels is not None:  ## only get fclusters for cols
        d = dist.pdist(data.T, metric=pdist)
    D = dist.squareform(d)
    Y = sch.linkage(D, method=linkage, metric=pdist)
    space1 = np.linspace(d.min(), d.max(), num=5, endpoint=False)
    space2 = np.linspace(d.max(), 1., num=30, endpoint=True)
    thresholds = np.concatenate((space1, space2))
    num_clusters = []
    num_singles = []
    for t in thresholds:
        fclusters = sch.fcluster(Y, t, 'distance')
        c = Counter(fclusters)
        num_cluster = len(c.keys())
        num_single = list(c.values()).count(1)
        num_clusters.append(num_cluster)
        num_singles.append(num_single)
        print('threshold=', t, 'clusters:', num_cluster, 'singles:', num_single)
        if num_cluster < 290:
            print(c)
    ax1.plot(thresholds, num_clusters, label='# of flat clusters')
    ax1.plot(thresholds, num_singles, label='# of singles', c='r')
    ax1.plot(thresholds, np.array(num_clusters) - np.array(num_singles),
             label='# of non-singles', c='g')
    ax1.legend(loc='upper right')
    ax1.set_xlabel('threshold for forming flat clusters')
    ax2.plot(thresholds, num_clusters, label='# of flat clusters')
    ax2.plot(thresholds, num_singles, label='# of singles', c='r')
    ax2.plot(thresholds, np.array(num_clusters) - np.array(num_singles),
             label='# of non-singles', c='g')
    ax2.legend(loc='upper right')
    ax2.set_xlabel('threshold for forming flat clusters')
    ax2.set_yscale('log')
    plt.show()
    return
def _divide_to_regions(info, add_stim=True):
    """Divide channels to regions by positions."""
    from scipy.stats import zscore
    picks = _pick_data_channels(info, exclude=[])
    chs_in_lobe = len(picks) // 4
    pos = np.array([ch['loc'][:3] for ch in info['chs']])
    x, y, z = pos.T
    frontal = picks[np.argsort(y[picks])[-chs_in_lobe:]]
    picks = np.setdiff1d(picks, frontal)
    occipital = picks[np.argsort(y[picks])[:chs_in_lobe]]
    picks = np.setdiff1d(picks, occipital)
    temporal = picks[np.argsort(z[picks])[:chs_in_lobe]]
    picks = np.setdiff1d(picks, temporal)
    lt, rt = _divide_side(temporal, x)
    lf, rf = _divide_side(frontal, x)
    lo, ro = _divide_side(occipital, x)
    lp, rp = _divide_side(picks, x)  # Parietal lobe from the remaining picks.
    # Because of the way the sides are divided, there may be outliers in the
    # temporal lobes. Here we switch the sides for these outliers. For other
    # lobes it is not a big problem because of the vicinity of the lobes.
    with np.errstate(invalid='ignore'):  # invalid division, greater compare
        zs = np.abs(zscore(x[rt]))
        outliers = np.array(rt)[np.where(zs > 2.)[0]]
    rt = list(np.setdiff1d(rt, outliers))
    with np.errstate(invalid='ignore'):  # invalid division, greater compare
        zs = np.abs(zscore(x[lt]))
        outliers = np.append(outliers, (np.array(lt)[np.where(zs > 2.)[0]]))
    lt = list(np.setdiff1d(lt, outliers))
    l_mean = np.mean(x[lt])
    r_mean = np.mean(x[rt])
    for outlier in outliers:
        if abs(l_mean - x[outlier]) < abs(r_mean - x[outlier]):
            lt.append(outlier)
        else:
            rt.append(outlier)
    if add_stim:
        stim_ch = _get_stim_channel(None, info, raise_error=False)
        if len(stim_ch) > 0:
            for region in [lf, rf, lo, ro, lp, rp, lt, rt]:
                region.append(info['ch_names'].index(stim_ch[0]))
    return {'Left-frontal': lf, 'Right-frontal': rf, 'Left-parietal': lp,
            'Right-parietal': rp, 'Left-occipital': lo, 'Right-occipital': ro,
            'Left-temporal': lt, 'Right-temporal': rt}
def clustergram(data, rids, cids, row_linkage='average', col_linkage='average',
                row_pdist='euclidean', col_pdist='euclidean', standardize=3, log=False):
    ## preprocess data
    if log:
        data = np.log2(data + 1.0)
    if standardize == 1:  # Standardize along the columns of data
        data = zscore(data, axis=0)
    elif standardize == 2:  # Standardize along the rows of data
        data = zscore(data, axis=1)
    ## perform hierarchical clustering for rows and cols
    ## compute pdist for rows:
    d1 = dist.pdist(data, metric=row_pdist)
    D1 = dist.squareform(d1)
    Y1 = sch.linkage(D1, method=row_linkage, metric=row_pdist)
    Z1 = sch.dendrogram(Y1, orientation='right')
    idx1 = Z1['leaves']
    ## compute pdist for cols
    d2 = dist.pdist(data.T, metric=col_pdist)
    D2 = dist.squareform(d2)
    Y2 = sch.linkage(D2, method=col_linkage, metric=col_pdist)
    Z2 = sch.dendrogram(Y2)
    idx2 = Z2['leaves']
    row_nodes = []
    rids = np.array(rids)[np.array(idx1)]
    for idx, rid in enumerate(rids):
        row_nodes.append({'sort': idx, 'name': rid})
    col_nodes = []
    cids = np.array(cids)[np.array(idx2)]
    for idx, cid in enumerate(cids):
        col_nodes.append({'sort': idx, 'name': cid})
    links = []
    for i in range(len(rids)):
        for j in range(len(cids)):
            links.append({'source': i, 'target': j, 'value': data[i, j]})
    json_data = {'row_nodes': row_nodes, 'col_nodes': col_nodes, 'links': links}
    return json_data
def trend_zscore(self, sym, date, window):
    slice = self.trends[sym][-window:]
    if slice[-1] == slice[-2]:
        z = self.zscores[sym][-1]
    else:
        z = zscore(slice)[-1]
    return z
def find_outliers(X, threshold=3.0):
    """Find outliers based on iterated Z-scoring.

    Parameters
    ----------
    X : np.ndarray of float, shape (n_elements,)
        The scores for which to find outliers.
    threshold : float
        The value above which a feature is classified as outlier.

    Returns
    -------
    bad_idx : np.ndarray of int, shape (n_features)
        The outlier indices.
    """
    max_iter = 2
    my_mask = np.zeros(len(X), dtype=bool)
    X = np.abs(X)
    for _ in range(max_iter):
        X = np.ma.masked_array(X, my_mask)
        this_z = stats.zscore(X)
        local_bad = this_z > threshold
        my_mask = np.max([my_mask, local_bad], 0)
        if not np.any(local_bad):
            break
    bad_idx = np.where(my_mask)[0]
    return bad_idx
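# A minimal usage sketch for find_outliers above (synthetic data, assuming numpy as
# np and scipy.stats as stats are imported): two planted extreme scores should be
# returned among the outlier indices.
rng = np.random.RandomState(0)
scores = rng.randn(100)
scores[[10, 42]] += 15.0                       # plant two obvious outliers
print(find_outliers(scores, threshold=3.0))    # expected to contain 10 and 42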
def align(movie_data, options, args, lrh):
    print('pPCA(scikit-learn)')
    nvoxel = movie_data.shape[0]
    nTR = movie_data.shape[1]
    nsubjs = movie_data.shape[2]
    align_algo = args.align_algo
    nfeature = args.nfeature
    # zscore the data
    # (np.nan is not callable; preallocate with zeros, every row is overwritten below)
    bX = np.zeros((nsubjs * nvoxel, nTR))
    for m in range(nsubjs):
        bX[m*nvoxel:(m+1)*nvoxel, :] = stats.zscore(movie_data[:, :, m].T, axis=0, ddof=1).T
    del movie_data
    U, s, VT = np.linalg.svd(bX, full_matrices=False)
    bW = np.zeros((nsubjs * nvoxel, nfeature))
    for m in range(nsubjs):
        bW[m*nvoxel:(m+1)*nvoxel, :] = U[m*nvoxel:(m+1)*nvoxel, :nfeature]
    niter = 10
    # initialization when first time run the algorithm
    np.savez_compressed(options['working_path'] + align_algo + '_' + lrh + '_' + str(niter) + '.npz',
                        bW=bW, niter=niter)
    return niter
def _generate_noise_system(dimensions_tr):
    """Generate the scanner noise

    Generate the noise that is typical of a scanner. This is comprised of
    two types of noise: Rician and Gaussian.

    Parameters
    ----------
    dimensions_tr : n length array, int
        What are the dimensions of the volume you wish to insert
        noise into. This can be a volume of any size

    Returns
    ----------
    system_noise : multidimensional array, float
        Create a volume with system noise
    """
    # Generate the Rician noise
    noise_rician = stats.rice.rvs(1, 1, size=dimensions_tr)
    # Generate the Gaussian noise
    noise_gaussian = np.random.normal(0, 1, size=dimensions_tr)
    # Combine these two noise types
    noise_system = noise_rician + noise_gaussian
    # Normalize
    noise_system = stats.zscore(noise_system)
    return noise_system
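# A minimal usage sketch for _generate_noise_system above (hypothetical volume
# shape, assuming numpy as np and scipy.stats as stats are imported): the result
# is z-scored, so it has roughly zero mean and unit standard deviation.
noise = _generate_noise_system((64, 64, 36))
print(noise.shape, round(noise.mean(), 3), round(noise.std(), 3))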
def triplet_data_collection(data, tags, batch_size, select=3, outof=10):
    from scipy import stats
    stimuli_category_size = 30
    number_of_repetition = select
    magic_number = number_of_repetition * stimuli_category_size
    number_of_samples = data.shape[0]
    time_samples_dim_size = data.shape[2]
    channel_dim_size = data.shape[3]
    all_combination = _get_all_possible_combination(np.arange(number_of_samples), outof, select)
    shuffled_combination = np.random.permutation(all_combination)
    batch_data = np.zeros((magic_number, batch_size, time_samples_dim_size, channel_dim_size),
                          dtype=np.float32)
    counter = 0
    for i in range(0, len(shuffled_combination), batch_size):
        batch_tags = np.zeros((batch_size, stimuli_category_size), dtype=np.int8)
        for single_combination in shuffled_combination[i:min(i + batch_size, len(shuffled_combination))]:
            batch_data[:, counter, :, :] = np.vstack([data[item] for item in single_combination])
            batch_tags[counter] = np.mean(np.vstack([tags[item] for item in single_combination]), axis=0)
            counter += 1
            if counter == batch_size:
                input_dict = dict(
                    [["positive_item_input_{}".format(i), stats.zscore(batch_data[i], axis=1)]
                     for i in range(90)])
                input_dict['triplet_loss'] = batch_tags
                return input_dict
def plot_transition_clustermap(data_array, gene_names, pseudotimes, n_clusters=10, gradient=False):
    if gradient:
        data_to_plot = zscore(np.gradient(data_array)[1].T, axis=0)
        scale = None
        metric = 'seuclidean'
        row_linkage = linkage(pdist(abs(data_to_plot), metric=metric), method='complete')
    else:
        data_to_plot = data_array.T
        scale = 0
        metric = 'correlation'
        row_linkage = linkage(pdist(data_to_plot, metric=metric), method='complete')
    assignments = fcluster(row_linkage, n_clusters, criterion='maxclust')
    cm = sns.clustermap(data_to_plot, col_cluster=False, standard_scale=scale,
                        yticklabels=gene_names, row_linkage=row_linkage,
                        row_colors=[settings.STATE_COLORS[i] for i in assignments])
    # integer step so r can be used to index pseudotimes
    r = np.arange(10, data_array.shape[0], data_array.shape[0] // 10)
    plt.setp(cm.ax_heatmap.get_yticklabels(), fontsize=5)
    cm.ax_heatmap.set_xticks(r)
    cm.ax_heatmap.set_xticklabels(['%.1f' % x for x in pseudotimes[r]])
    cm.ax_heatmap.set_xlabel('Pseudotime')
    cm.ax_heatmap.set_ylabel('Gene')
    gene_clusters = defaultdict(list)
    for i, cl in enumerate(assignments):
        gene_clusters[settings.STATE_COLORS[cl]].append(gene_names[i])
    return gene_clusters
wine_ds[col] = np.log1p(wine_ds[col])


# In[75]:

wine_ds.skew()


# In[76]:

sns.pairplot(wine_ds)
plt.show()


# In[77]:

from scipy.stats import zscore

z_score = abs(zscore(wine_ds))
print(wine_ds.shape)
wine_ds_final = wine_ds.loc[(z_score < 3).all(axis=1)]
print(wine_ds_final.shape)


# In[78]:

# Separating target and input variables
df_x = wine_ds_final.drop(columns=['Class'])
y = wine_ds_final['Class']


# In[79]:

df_x
np.random.seed(42)
n_splits = 10

# Read data
#print("Start loading data @ %.5f\n" % (time.time()-elapsed))
cwd = os.getcwd()
file_name_feature = cwd + "/../dataset/bank-additional-full_new_features.csv"
file_name_label = cwd + "/../dataset/bank-additional-full_new_labels.csv"
#print("End loading data @ %.5f\n" % (time.time()-elapsed))

#print("Start shuffling and sampling @ %.5f\n" % (time.time()-elapsed))
features, header_ele = readData(file_name_feature)
features = shuffle(features, random_state=41)[:5000]
features[:, :9] = zscore(features[:, :9])
label, label_names = loadLabels(file_name_label)
label = shuffle(label, random_state=41)[:5000]
#print("End shuffling and sampling @ %.5f\n" % (time.time()-elapsed))

#print("Start splitting dataset with 10-folds @ %.5f\n" % (time.time()-elapsed))
Kfold = StratifiedKFold(n_splits=n_splits)
#print("End splitting dataset with 10-folds @ %.5f\n" % (time.time()-elapsed))

accuracy_training_log = np.zeros(n_splits)
accuracy_testing_log = np.zeros(n_splits)
nlpd_t = np.zeros(n_splits)
nlpd_v = np.zeros(n_splits)

for i, (train_index, test_index) in enumerate(Kfold.split(features, label)):
    })  # S marker size

# Set title
plt.title('plot between Garage Area and SalPrice')
plt.xlim(-200, 1600)
# Set x-axis label
plt.xlabel('GarageArea')
# Set y-axis label
plt.ylabel('SalePrice')
plt.show()

# Removing the anomalies using z-score:
# if a value is more than 3 standard deviations above or below the mean,
# it is considered an outlier
df = pd.read_csv('train.csv', sep=',', usecols=(62, 80))
z = np.abs(stats.zscore(df))
threshold = 3
print(np.where(z > 3))
modified_df = df[(z < 3).all(axis=1)]
print(df.shape)
print(modified_df.shape)

# Create scatterplot of the dataframe after removing the anomalies in the data
sns.lmplot(
    'GarageArea',      # Horizontal axis
    'SalePrice',       # Vertical axis
    data=modified_df,  # Data source
    fit_reg=False,     # Don't fit a regression line
    scatter_kws={
        "marker": "o",  # Set marker style
        "s": 80
import warnings
warnings.filterwarnings('ignore')

df_train = pd.read_csv(
    'C:/Users/Sushu/Documents/Python/ICP6/Python_Lesson6/train.csv')
df_train.describe()

var = 'GarageArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))
data.shape

data_remana = np.abs(stats.zscore(data))
data_remana[:5, :5]
data1 = data[(data_remana < 3).all(axis=1)]

var = 'GarageArea'
data = pd.concat([df_train['SalePrice'], df_train[var]], axis=1)
data1.plot.scatter(x=var, y='SalePrice', ylim=(0, 800000))

df = data[~data.SalePrice.isin(data[data_remana > 3].SalePrice)]
df.describe()
data1.shape
def runPatterns(actmat, method='ica', nullhyp='mp', nshu=1000, percentile=99, tracywidom=False):
    '''
    INPUTS

        actmat:     activity matrix - numpy array (neurons, time bins)

        nullhyp:    defines how to generate statistical threshold for assembly detection.
                        'bin' - bin shuffling, will shuffle time bins of each neuron independently
                        'circ' - circular shuffling, will shift time bins of each neuron independently
                                 obs: maintains (virtually) autocorrelations
                        'mp' - Marcenko-Pastur distribution - analytical threshold

        nshu:       defines how many shuffling controls will be done (n/a if nullhyp is 'mp')

        percentile: defines which percentile to be used when shuffling methods are employed
                    (n/a if nullhyp is 'mp')

        tracywidom: determines if Tracy-Widom is used. See Peyrache et al 2010.
                    (n/a if nullhyp is NOT 'mp')

    OUTPUTS

        patterns:     co-activation patterns (assemblies) - numpy array (assemblies, neurons)
        significance: object containing general information about significance tests
        zactmat:      returns z-scored actmat
    '''
    nneurons = np.size(actmat, 0)
    nbins = np.size(actmat, 1)
    silentneurons = np.var(actmat, axis=1) == 0
    actmat_ = actmat[~silentneurons, :]

    # z-scoring activity matrix
    zactmat_ = stats.zscore(actmat_, axis=1)

    # running significance (estimating number of assemblies)
    significance = PCA()
    significance.fit(zactmat_.T)
    significance.nneurons = nneurons
    significance.nbins = nbins
    significance.nshu = nshu
    significance.percentile = percentile
    significance.tracywidom = tracywidom
    significance.nullhyp = nullhyp
    significance = runSignificance(zactmat_, significance)
    if np.isnan(significance.nassemblies):
        return

    if significance.nassemblies < 1:
        print('WARNING !')
        print('    no assembly detected!')
        patterns = []
    else:
        # extracting co-activation patterns
        patterns_ = extractPatterns(zactmat_, significance, method)
        if patterns_ is np.nan:
            return
        # putting eventual silent neurons back (their assembly weights are defined as zero)
        patterns = np.zeros((np.size(patterns_, 0), nneurons))
        patterns[:, ~silentneurons] = patterns_

    zactmat = np.copy(actmat)
    zactmat[~silentneurons, :] = zactmat_

    return patterns, significance, zactmat
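# A minimal usage sketch for runPatterns above (synthetic data, assuming numpy as
# np is imported and the helpers it calls - runSignificance, extractPatterns, PCA -
# are available): a random Poisson spike-count matrix of 20 neurons x 1000 bins with
# the Marcenko-Pastur threshold; pure noise should yield few or no assemblies.
actmat = np.random.poisson(1.0, size=(20, 1000))
out = runPatterns(actmat, method='ica', nullhyp='mp')
if out is not None:
    patterns, significance, zactmat = out
    print(len(patterns), 'assemblies detected')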
def demand_graphs(branch='A', product_line='Electronic accessories', date=datetime.date(2019, 3, 25)):
    pd.options.plotting.backend = "plotly"
    fig = make_subplots(rows=1, cols=2, shared_xaxes=False,
                        specs=[[{"type": "xy"}, {"type": "xy"}]],
                        subplot_titles=('Day vs Demand', 'Predicted Values for a Week'))
    fig['layout'].update(xaxis_title='Day', yaxis_title='Quantity',
                         paper_bgcolor='rgba(0,0,0,0)', plot_bgcolor='rgba(0,0,0,0)')

    # Gets the earliest date in the database
    with connection.cursor() as crs:
        earliest_date_query = 'SELECT min(Date) FROM Sales'
        earliest_date = crs.execute(earliest_date_query).fetchval()

    df = pd.read_sql(
        demand_query % (earliest_date, earliest_date, date, branch, product_line, earliest_date),
        connection)
    date_difference = date - earliest_date

    # fills in days where there are no sales
    s = df['Day'].tolist()
    for i in range(0, date_difference.days):
        if i not in s:
            df.loc[-1] = [i, 0]
            df.index = df.index + 1

    # drops data that has a z-score with an absolute value greater than or equal to 3, then sorts
    z_scores = np.abs(stats.zscore(df['Demand']))
    df = df[(z_scores < 3)]
    df = df.sort_values('Day')

    # makes a linear regression model, then predicts the values and stores them
    model = linear_model.LinearRegression()
    weight = np.ones(len(df)) * 10
    weight[-7:] *= 1.5
    x = df[['Day']]
    y = df[['Demand']]
    model.fit(x, y, weight)
    df['bestfit'] = model.predict(df[['Day']])

    # makes a prediction of the next week's demand
    days_predicted = list(range(date_difference.days, date_difference.days + 7))
    predicted_values = []
    total_predicted = 0
    for i in days_predicted:
        predicted = float(model.predict([[i]]))
        predicted_values.append(predicted)
        total_predicted += predicted
    predicted_values.append(total_predicted)

    # makes a scatter plot with a line for the linear regression
    fig.add_trace(
        go.Scatter(name='data points', x=df['Day'], y=df['Demand'].values, mode='markers'), 1, 1)
    fig.add_trace(
        go.Scatter(name='regression line', x=df['Day'], y=df['bestfit'], mode='lines'), 1, 1)

    # makes a bar chart for the predicted values for the week
    # (list.append returns None, so build the x labels with concatenation instead)
    fig.add_trace(
        go.Bar(name='predicted values', x=days_predicted + ['total'], y=predicted_values), 1, 2)

    # finds the explained variance score, r2 score, and mean absolute error
    evs = sm.explained_variance_score(df['Demand'], df['bestfit'])
    r2 = sm.r2_score(df['Demand'], df['bestfit'])
    mae = sm.mean_absolute_error(df['Demand'], df['bestfit'])

    new_date = True
    with connection.cursor() as crs:
        crs.execute(check_prediction_query, (date, product_line, branch))
        if crs.fetchone() != None:
            new_date = False
    if new_date:
        with connection.cursor() as crs:
            crs.execute(update_prediction_log, (branch, product_line, date, evs, r2, mae))

    return fig, evs, r2, mae
ix = obj1.meta.type.isin(['iPSC'])
obj1.filter_samples(ix)

obj2 = copy(ref_obj)
ix = obj2.meta.type == 'ESC'
obj2.filter_samples(ix)

dend = plot_dendrogram([obj1, obj2], qn_method=quantile_norm, n_by_mad=n_gene_by_mad)
dend['fig'].savefig(os.path.join(outdir, "cluster_ipsc_esc.png"), dpi=200)

# 3. iPSC, ESC, Ruiz signature (only)
the_obj = loader.MultipleBatchLoader([obj1, obj2])
dat_r_z = pd.DataFrame(np.log2(the_obj.data + eps))
dat_r_z = dat_r_z.reindex(gene_sign_ens.values).dropna()
for r in dat_r_z.index:
    dat_r_z.loc[r] = zscore(dat_r_z.loc[r])
dat_r_z.index = gene_sign_ens.index[gene_sign_ens.isin(dat_r_z.index)]
cg = clustering.plot_clustermap(dat_r_z, show_gene_labels=True, cmap='RdBu_r')
cg.gs.update(bottom=0.2)
cg.savefig(os.path.join(outdir, "clustermap_ruiz_ipsc_esc_ztrans.png"), dpi=200)

# 4. HipSci, iPSC, ESC, FB
obj1 = copy(obj)
ix = obj1.meta.type.isin(['iPSC', 'FB'])
obj1.filter_samples(ix)
dend = plot_dendrogram([obj1, ref_obj, hip_obj], qn_method=quantile_norm, n_by_mad=n_gene_by_mad)
dend['fig'].savefig(os.path.join(outdir, "cluster_ipsc_esc_fb_with_hipsci%d.png" % n_hipsci), dpi=200)
def Price_Main(data: pd.DataFrame):
    # Remove price and term outliers (out of 3 sigmas)
    data = data[((np.abs(stats.zscore(data.price)) < 2.5) &
                 (np.abs(stats.zscore(data.term)) < 2.5) &
                 (np.abs(stats.zscore(data.full_sq)) < 2.5))]

    # Fill NaN if it appears after merging
    data[['term']] = data[['term']].fillna(data[['term']].mean())

    # Fix year
    data = data[((data.yyyy_announce == 19) | (data.yyyy_announce == 20))]

    # Log transformation
    data["longitude"] = np.log1p(data["longitude"])
    data["latitude"] = np.log1p(data["latitude"])
    data["full_sq"] = np.log1p(data["full_sq"])
    data["life_sq"] = np.log1p(data["life_sq"])
    data["kitchen_sq"] = np.log1p(data["kitchen_sq"])
    data["to_center"] = np.log1p(data["to_center"])
    data["price"] = np.log1p(data["price"])

    X = data[['life_sq', 'to_center', 'mm_announce', 'rooms', 'renovation', 'has_elevator',
              'longitude', 'latitude', 'full_sq', 'kitchen_sq', 'time_to_metro',
              'floor_last', 'floor_first', 'clusters', 'is_rented', 'rent_quarter',
              'rent_year']]
    y = data[['price']].values.ravel()
    print(X.shape, y.shape, flush=True)

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    # GBR model
    gbr_model = GradientBoostingRegressor(n_estimators=350, max_depth=8, verbose=1, random_state=42)
    print(10*'-', '> GBR Spb started fitting...')
    gbr_model.fit(X_train, y_train)
    gbr_preds = gbr_model.predict(X_test)
    print('Spb GBR R2_score: ', r2_score(y_test, gbr_preds), flush=True)
    print('Spb GBR RMSE : ', mean_squared_error(y_test, gbr_preds), flush=True)

    print('Train GBR on full spb dataset: ', flush=True)
    gbr_model.fit(X, y)
    dump(gbr_model, PATH_TO_PRICE_MODEL_GBR_D)
    print('GBR Spb model saved !', flush=True)

    # RANDOM FOREST REGRESSOR
    RF = RandomForestRegressor(n_estimators=300, verbose=1, n_jobs=-1)
    print(10*'-', '> RF Spb started fitting...')
    RF.fit(X_train, y_train)
    rf_predicts = RF.predict(X_test)
    print('Spb RF R2_score: ', r2_score(y_test, rf_predicts), flush=True)
    print('Spb RF RMSE: ', mean_squared_error(y_test, rf_predicts), flush=True)

    print('Train RF on full spb dataset: ', flush=True)
    RF.fit(X, y)
    dump(RF, PATH_TO_PRICE_MODEL_RF_D)
    print('RF Spb model saved !', flush=True)

    # LGBM model
    lgbm_model = LGBMRegressor(objective='regression', learning_rate=0.05, n_estimators=1250,
                               max_depth=7, min_child_samples=1, verbose=0)
    print(10*'-', '> LGBM Spb started fitting...')
    lgbm_model.fit(X_train, y_train)
    lgbm_preds = lgbm_model.predict(X_test)
    print('Spb LGBM R2_score: ', r2_score(y_test, lgbm_preds), flush=True)
    print('Spb LGBM RMSE: ', mean_squared_error(y_test, lgbm_preds), flush=True)

    print('Train LGBM on full spb dataset: ', flush=True)
    lgbm_model.fit(X, y)
    dump(lgbm_model, PATH_TO_PRICE_MODEL_LGBM_D)
    print('LGBM Spb model saved !', flush=True)
import numpy as np
import pandas as pd
from scipy import stats

# load predicted ligand-Ki matrix
ki_matrix = np.loadtxt("..//..//02_drug_model//03_predicted_ki_matrix.txt", delimiter=",")

# generate z-score matrix
z_score_matrix = []

# calculate z-score for each protein row, zeroing out entries below 2
counter = 1
for prot in ki_matrix:
    print(counter)
    z_score = stats.zscore(prot)
    z_score[z_score < 2] = 0
    z_score_matrix.append(z_score)
    counter += 1

z_score_matrix = np.array(z_score_matrix)
np.savetxt(".//protein_ki_zscore_2_matrix.txt", z_score_matrix, fmt="%.5e", delimiter=",")
# result = [result1,result2,result3]
fig = plt.figure()
for j in range(len(result)):
    dataset = pd.concat([base, result[j]])
    n_comp = 2
    pca = PCA(n_components=n_comp)
    principalComponents = pca.fit_transform(dataset)
    columns = ['principal component ' + str(i) for i in range(1, n_comp + 1)]
    principalDf = pd.DataFrame(data=principalComponents, columns=columns)
    finalDf = pd.concat([principalDf, classe], axis=1)
    finalDf = finalDf.dropna()
    z = np.abs(stats.zscore(finalDf.iloc[:, :2]))
    finalDf = finalDf[(z < 3).all(axis=1)]
    finalDf.reset_index(drop=True)
    loadings = pca.components_.T * np.sqrt(pca.explained_variance_)
    for k in range(n_comp):
        loadings_ = loadings[:, k]
        # smooth2(smoo, loadings_)
        smoothed = smooth2(smoo, loadings_)
        peaksf, _ = find_peaks(smoothed, distance=10)
        plt.plot(xData, smoothed,
                 label='L' + str(1 + k) + ' ' + label[i] + ' vs ' + classe[0][dataset.index[-1]]
                       + '(var: %d%%)' % (pca.explained_variance_ratio_[k] * 100),
                 color=colors[1:len(colors)][j], alpha=(n_comp - k) / (n_comp))

plt.hlines(0, xmin=xData.min(), xmax=xData.max(), ls='dotted', linewidth=1)
plt.xlabel('Raman Shift (cm$^{-1}$)', fontsize=16)
plt.ylabel('PCA Loadings', fontsize=16)
Using vectorized functions

When performance is paramount, you should avoid using .apply() and .map() because
those constructs perform Python for-loops over the data stored in a pandas Series or
DataFrame. By using vectorized functions instead, you can loop over the data at the
same speed as compiled code (C, Fortran, etc.)! NumPy, SciPy and pandas come with a
variety of vectorized functions (called Universal Functions or UFuncs in NumPy). You
can even write your own vectorized functions, but for now we will focus on the ones
distributed by NumPy and pandas.

In this exercise you're going to import the zscore method from scipy.stats and use it
to compute the deviation in voter turnout in Pennsylvania from the mean in fractions
of the standard deviation. In statistics, the z-score is the number of standard
deviations by which an observation is above the mean - so if it is negative, it means
the observation is below the mean.

Instead of using .apply() as you did in the earlier exercises, the zscore UFunc will
take a pandas Series as input and return a NumPy array. You will then assign the
values of the NumPy array to a new column in the DataFrame. You will be working with
the election DataFrame - it has been pre-loaded for you.

- Import zscore from scipy.stats.
- Call zscore with election['turnout'] as input.
- Print the output of type(turnout_zscore). This has been done for you.
- Assign turnout_zscore to a new column in election as 'turnout_zscore'.
- Print the output of election.head(). This has been done for you, so hit
  'Submit Answer' to view the result.
'''
# Import zscore from scipy.stats
from scipy.stats import zscore

# Call zscore with election['turnout'] as input: turnout_zscore
turnout_zscore = zscore(election['turnout'])

# Print the type of turnout_zscore
print(type(turnout_zscore))

# Assign turnout_zscore to a new column: election['turnout_zscore']
election['turnout_zscore'] = turnout_zscore

# Print the output of election.head()
print(election.head())
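# For comparison, a hedged sketch of the non-vectorized alternative the exercise text
# warns about: the same z-score computed row by row with .apply(). scipy's zscore
# defaults to the population standard deviation, hence ddof=0 below.
turnout_mean = election['turnout'].mean()
turnout_std = election['turnout'].std(ddof=0)
election['turnout_zscore_apply'] = election['turnout'].apply(
    lambda t: (t - turnout_mean) / turnout_std)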
def prepareForBoxplots(data, gender=True):
    """
    brings data into the correct format for generating a boxplot over all students
    from all years

    in this particular case we use year of university entrance as query_id,
    psu scores as features and notas as rank

    writes one dataset with protection_status "gender" and one with
    protection_status "highschool_type"
    """
    data = data[data['sem'] == 1]
    data = data[data['inactivo'] != 1]

    # drop all lines where values are missing
    data = data.dropna(
        subset=['nem', 'psu_mat', 'psu_len', 'psu_cie', 'notas_', 'uds_i_'])

    # drop all columns that are not needed
    if (gender):
        keep_cols = ['hombre', 'psu_mat', 'psu_len', 'psu_cie', 'nem',
                     'notas_', 'uds_i_', 'uds_r_', 'uds_e_']
    else:
        keep_cols = ['highschool_type', 'psu_mat', 'psu_len', 'psu_cie', 'nem',
                     'notas_', 'uds_i_', 'uds_r_', 'uds_e_']
    data = data[keep_cols]

    # replace NaNs with zeros
    data['uds_r_'].fillna(0)
    data['uds_e_'].fillna(0)

    # add new column for ranking scores
    data['score'] = np.zeros(data.shape[0])

    # calculate score based on grades and credits
    for idx, row in data.iterrows():
        grades = row.loc['notas_']
        credits_taken = row.loc['uds_i_']
        credits_failed = row.loc['uds_r_']
        credits_dropped = row.loc['uds_e_']
        score = grades * (credits_taken - credits_failed - credits_dropped) / credits_taken
        data.loc[idx, 'score'] = score

    # don't need these columns anymore
    data = data.drop(columns=['notas_', 'uds_i_', 'uds_r_', 'uds_e_'])

    # zscore psu scores and normalize scores
    data['psu_mat'] = stats.zscore(data['psu_mat'])
    data['psu_len'] = stats.zscore(data['psu_len'])
    data['psu_cie'] = stats.zscore(data['psu_cie'])
    data['nem'] = stats.zscore(data['nem'])
    data['score'] = stats.zscore(data['score'])

    # rename protected column to prot_attr
    data.columns = ['prot\_attr', 'psu\_mat', 'psu\_len', 'psu\_cie', 'nem', 'score']

    return data
def time_segment_matching(data, win_size=10):
    """
    Performs the time segment matching experiment (code inspired by the brainiak
    tutorials at
    https://brainiak.org/events/ohbm2018/brainiak_sample_tutorials/10-func-align.html)

    Parameters
    ----------
    data: array of shape (n_subjects, n_components, n_timeframes)
        Input shared responses

    Returns
    -------
    cv_score: np array of shape (n_subjects)
        Per-subject accuracy
    """
    # Pull out shape information
    n_subjs = len(data)
    (n_features, n_TR) = data[0].shape  # Voxel/feature by timepoint

    # How many segments are there (account for edges)
    n_seg = n_TR - win_size

    # mysseg prediction
    train_data = np.zeros((n_features * win_size, n_seg))

    # Concatenate the data across participants
    for ppt_counter in range(n_subjs):
        for window_counter in range(win_size):
            train_data[window_counter * n_features:(window_counter + 1) * n_features, :] += \
                data[ppt_counter][:, window_counter:window_counter + n_seg]

    # Iterate through the participants, leaving one out
    accuracy = np.zeros(shape=n_subjs)
    for ppt_counter in range(n_subjs):
        # Preset
        test_data = np.zeros((n_features * win_size, n_seg))
        for window_counter in range(win_size):
            test_data[window_counter * n_features:(window_counter + 1) * n_features, :] = \
                data[ppt_counter][:, window_counter:(window_counter + n_seg)]

        # Take this participant's data away
        train_ppts = stats.zscore((train_data - test_data), axis=0, ddof=1)
        test_ppts = stats.zscore(test_data, axis=0, ddof=1)

        # Correlate the two data sets
        corr_mtx = test_ppts.T.dot(train_ppts)

        # If two segments are within the window size of each other and they aren't
        # the same segment, set the value to negative infinity
        for seg_1 in range(n_seg):
            for seg_2 in range(n_seg):
                if abs(seg_1 - seg_2) < win_size and seg_1 != seg_2:
                    corr_mtx[seg_1, seg_2] = -np.inf

        # Find the segment with the max value
        rank = np.argmax(corr_mtx, axis=1)

        # Find the number of segments that were matched for this participant
        accuracy[ppt_counter] = sum(rank == range(n_seg)) / float(n_seg)

    return accuracy
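# A minimal usage sketch for time_segment_matching above (synthetic data, assuming
# numpy as np and scipy.stats as stats are imported): a shared signal plus small
# subject-specific noise, so per-subject accuracy should sit well above chance.
rng = np.random.RandomState(0)
shared = rng.randn(50, 300)                            # components x timeframes
data = [shared + 0.1 * rng.randn(50, 300) for _ in range(5)]
print(time_segment_matching(data, win_size=10))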
def demo_rsHRF(input_file, mask_file, output_dir, para, p_jobs,
               file_type=".nii", mode="bids", wiener=False, temporal_mask=[]):
    # book-keeping w.r.t parameter values
    if 'localK' not in para or para['localK'] == None:
        if para['TR'] <= 2:
            para['localK'] = 1
        else:
            para['localK'] = 2
    # creating the output-directory if not already present
    if not os.path.isdir(output_dir):
        os.mkdir(output_dir)
    # for four-dimensional input
    if mode != 'time-series':
        if mode == 'bids' or mode == 'bids w/ atlas':
            name = input_file.filename.split('/')[-1].split('.')[0]
            v1 = spm_dep.spm.spm_vol(input_file.filename)
        else:
            name = input_file.split('/')[-1].split('.')[0]
            v1 = spm_dep.spm.spm_vol(input_file)
        if mask_file != None:
            if mode == 'bids':
                mask_name = mask_file.filename.split('/')[-1].split('.')[0]
                v = spm_dep.spm.spm_vol(mask_file.filename)
            else:
                mask_name = mask_file.split('/')[-1].split('.')[0]
                v = spm_dep.spm.spm_vol(mask_file)
            if file_type == ".nii" or file_type == ".nii.gz":
                brain = spm_dep.spm.spm_read_vols(v)
            else:
                brain = v.agg_data().flatten(order='F')
            if ((file_type == ".nii" or file_type == ".nii.gz") and
                    v1.header.get_data_shape()[:-1] != v.header.get_data_shape()) or \
               ((file_type == ".gii" or file_type == ".gii.gz") and
                    v1.agg_data().shape[0] != v.agg_data().shape[0]):
                raise ValueError('Inconsistency in input-mask dimensions' +
                                 '\n\tinput_file == ' + name + file_type +
                                 '\n\tmask_file == ' + mask_name + file_type)
            else:
                if file_type == ".nii" or file_type == ".nii.gz":
                    data = v1.get_data()
                else:
                    data = v1.agg_data()
        else:
            print('No atlas provided! Generating mask file...')
            if file_type == ".nii" or file_type == ".nii.gz":
                data = v1.get_data()
                brain = np.nanvar(data.reshape(-1, data.shape[3]), -1, ddof=0)
            else:
                data = v1.agg_data()
                brain = np.nanvar(data, -1, ddof=0)
            print('Done')
        voxel_ind = np.where(brain > 0)[0]
        mask_shape = data.shape[:-1]
        nobs = data.shape[-1]
        data1 = np.reshape(data, (-1, nobs), order='F').T
        bold_sig = stats.zscore(data1[:, voxel_ind], ddof=1)
    # for time-series input
    else:
        name = input_file.split('/')[-1].split('.')[0]
        data1 = (np.loadtxt(input_file, delimiter=","))
        if data1.ndim == 1:
            data1 = np.expand_dims(data1, axis=1)
        nobs = data1.shape[0]
        bold_sig = stats.zscore(data1, ddof=1)
    if len(temporal_mask) > 0 and len(temporal_mask) != nobs:
        raise ValueError('Inconsistency in temporal_mask dimensions.\n' +
                         'Size of mask: ' + str(len(temporal_mask)) + '\n' +
                         'Size of time-series: ' + str(nobs))
    bold_sig = np.nan_to_num(bold_sig)
    bold_sig_deconv = processing. \
        rest_filter. \
        rest_IdealFilter(bold_sig, para['TR'], para['passband_deconvolve'])
    bold_sig = processing. \
        rest_filter. \
        rest_IdealFilter(bold_sig, para['TR'], para['passband'])
    data_deconv = np.zeros(bold_sig.shape)
    event_number = np.zeros((1, bold_sig.shape[1]))
    print('Retrieving HRF ...')

    # Estimate HRF for the fourier / hanning / gamma / canon basis functions
    if not (para['estimation'] == 'sFIR' or para['estimation'] == 'FIR'):
        bf = basis_functions.basis_functions.get_basis_function(bold_sig.shape, para)
        beta_hrf, event_bold = utils.hrf_estimation.compute_hrf(bold_sig, para, temporal_mask, p_jobs, bf=bf)
        hrfa = np.dot(bf, beta_hrf[np.arange(0, bf.shape[1]), :])
    # Estimate HRF for FIR and sFIR
    else:
        beta_hrf, event_bold = utils.hrf_estimation.compute_hrf(bold_sig, para, temporal_mask, p_jobs)
        hrfa = beta_hrf[:-1, :]

    nvar = hrfa.shape[1]
    PARA = np.zeros((3, nvar))
    for voxel_id in range(nvar):
        hrf1 = hrfa[:, voxel_id]
        PARA[:, voxel_id] = parameters.wgr_get_parameters(hrf1, para['TR'] / para['T'])
    print('Done')

    print('Deconvolving HRF ...')
    if para['T'] > 1:
        hrfa_TR = signal.resample_poly(hrfa, 1, para['T'])
    else:
        hrfa_TR = hrfa
    for voxel_id in range(nvar):
        hrf = hrfa_TR[:, voxel_id]
        if not wiener:
            H = np.fft.fft(np.append(hrf, np.zeros((nobs - max(hrf.shape), 1))), axis=0)
            M = np.fft.fft(bold_sig_deconv[:, voxel_id])
            data_deconv[:, voxel_id] = np.fft.ifft(
                H.conj() * M / (H * H.conj() + .1 * np.mean((H * H.conj()))))
        else:
            data_deconv[:, voxel_id] = iterative_wiener_deconv.rsHRF_iterative_wiener_deconv(
                bold_sig_deconv[:, voxel_id], hrf)
        event_number[:, voxel_id] = np.amax(event_bold[voxel_id].shape)

    # setting the output-path
    if mode == 'bids' or mode == 'bids w/ atlas':
        try:
            sub_save_dir = os.path.join(
                output_dir, 'sub-' + input_file.subject,
                'session-' + input_file.session,
                input_file.modality
            )
        except AttributeError as e:
            sub_save_dir = os.path.join(
                output_dir, 'sub-' + input_file.subject,
                input_file.modality
            )
    else:
        sub_save_dir = output_dir
    if not os.path.isdir(sub_save_dir):
        os.makedirs(sub_save_dir, exist_ok=True)

    dic = {'para': para, 'hrfa': hrfa, 'event_bold': event_bold, 'PARA': PARA}
    ext = '_hrf.mat'
    if mode == "time-series":
        dic["event_number"] = event_number
        dic["data_deconv"] = data_deconv
        ext = '_hrf_deconv.mat'
    sio.savemat(os.path.join(sub_save_dir, name + ext), dic)

    HRF_para_str = ['Height', 'Time2peak', 'FWHM']
    if mode != "time-series":
        mask_data = np.zeros(mask_shape).flatten(order='F')
        for i in range(3):
            fname = os.path.join(sub_save_dir, name + '_' + HRF_para_str[i])
            mask_data[voxel_ind] = PARA[i, :]
            mask_data = mask_data.reshape(mask_shape, order='F')
            spm_dep.spm.spm_write_vol(v1, mask_data, fname, file_type)
            mask_data = mask_data.flatten(order='F')
        fname = os.path.join(sub_save_dir, name + '_event_number.nii')
        mask_data[voxel_ind] = event_number
        mask_data = mask_data.reshape(mask_shape, order='F')
        spm_dep.spm.spm_write_vol(v1, mask_data, fname, file_type)
        mask_data = np.zeros(data.shape)
        dat3 = np.zeros(data.shape[:-1]).flatten(order='F')
        for i in range(nobs):
            fname = os.path.join(sub_save_dir, name + '_deconv')
            dat3[voxel_ind] = data_deconv[i, :]
            dat3 = dat3.reshape(data.shape[:-1], order='F')
            if file_type == ".nii" or file_type == ".nii.gz":
                mask_data[:, :, :, i] = dat3
            else:
                mask_data[:, i] = dat3
            dat3 = dat3.flatten(order='F')
        spm_dep.spm.spm_write_vol(v1, mask_data, fname, file_type)
    pos = 0
    while pos < hrfa_TR.shape[1]:
        if np.any(hrfa_TR[:, pos]):
            break
        pos += 1
    event_plot = lil_matrix((1, nobs))
    if event_bold.size:
        event_plot[:, event_bold[pos]] = 1
    else:
        print("No Events Detected!")
        return 0
    event_plot = np.ravel(event_plot.toarray())
    plt.figure()
    plt.plot(para['TR'] *
             np.arange(1, np.amax(hrfa_TR[:, pos].shape) + 1),
             hrfa_TR[:, pos], linewidth=1)
    plt.xlabel('time (s)')
    plt.savefig(os.path.join(sub_save_dir, name + '_plot_1.png'))

    plt.figure()
    plt.plot(para['TR'] * np.arange(1, nobs + 1),
             np.nan_to_num(stats.zscore(bold_sig[:, pos], ddof=1)),
             linewidth=1)
    plt.plot(para['TR'] * np.arange(1, nobs + 1),
             np.nan_to_num(stats.zscore(data_deconv[:, pos], ddof=1)),
             color='r', linewidth=1)
    markerline, stemlines, baseline = \
        plt.stem(para['TR'] * np.arange(1, nobs + 1), event_plot)
    plt.setp(baseline, 'color', 'k', 'markersize', 1)
    plt.setp(stemlines, 'color', 'k')
    plt.setp(markerline, 'color', 'k', 'markersize', 3, 'marker', 'd')
    plt.legend(['BOLD', 'deconvolved', 'events'])
    plt.xlabel('time (s)')
    plt.savefig(os.path.join(sub_save_dir, name + '_plot_2.png'))
    print('Done')
    return 0
""" Created on Mon Nov 27 19:11:38 2017 @author: Jacob """ import numpy as np from matplotlib.pyplot import (figure, imshow, bar, title, xticks, yticks, cm, subplot, show) from scipy.stats.kde import gaussian_kde from toolbox_02450 import gausKernelDensity from sklearn.neighbors import NearestNeighbors from scipy import stats import dataSetup X = dataSetup.numbersData.values X = stats.zscore(X) N,M = X.shape """ # OUTLIER DETECTION # Compute kernel density estimate kde = gaussian_kde(X.ravel(), 'silverman') scoresKDE = kde.evaluate(X.ravel()) idxKDE = scoresKDE.argsort() scoresKDE.sort() print('The index of the lowest density object: {0}'.format(idxKDE[0])) # Plot kernel density estimate figure()
def main():
    """ load data """
    train_set = pd.read_csv('../data/train.csv')
    test_set = pd.read_csv('../data/test.csv')
    # Without outlier remover, with basic nanRemover 0.12416413124809748

    """ Remove Outliers """
    outliers = [197, 523, 691, 854, 1182, 1298]
    print(outliers)
    z = np.abs(zscore(train_set[get_numeric_columns(train_set)]))
    row, col = np.where(z > 4)
    df = pd.DataFrame({"row": row, "col": col})
    rows_count = df.groupby(['row']).count()
    outliers = rows_count[rows_count.col > 2].index
    print(outliers)
    train_set.drop(outliers, inplace=True)

    """ fix salePrice skewness """
    train_set["SalePrice"] = np.log1p(train_set["SalePrice"])
    y_train_values = train_set["SalePrice"].values

    """ prepare combined data. """
    train_set_id = train_set['Id']
    test_set_id = test_set['Id']
    train_set_rows = train_set.shape[0]
    test_set_rows = test_set.shape[0]
    train_set.drop('Id', axis=1, inplace=True)
    test_set.drop('Id', axis=1, inplace=True)
    train_set.drop('SalePrice', axis=1, inplace=True)
    combined_data = pd.concat((train_set, test_set))

    """ create data transform pipeline """
    transform_pipeline = Pipeline(steps=[
        ('OutlierRemover', OutlierRemover()),
        ('NaNImputer', NaNImputer()),
        ('NaNRemover', NaNRemover()),
        ('AdditionalFeatureGenerator', AdditionalFeatureGenerator()),
        ('TypeTransformer', TypeTransformer()),
        ('ErrorImputer', ErrorImputer()),
        ('SkewFixer', SkewFixer()),
        ('Scaler', Scaler()),
        ('FeatureDropper', FeatureDropper()),
        ('Dummyfier', Dummyfier()),
    ])
    transformed_data = transform_pipeline.transform(combined_data)
    train_data = transformed_data[:train_set_rows]
    predict_data = transformed_data[train_set_rows:]
    transformed_data.to_csv('transformed_Data.csv', index=False)

    """ try various regressors """
    rf_param = {
        # 'bootstrap': [True],
        'max_depth': [3, 4, 5],
        'min_samples_leaf': [3, 4, 5],
        'n_estimators': [5, 7, 10]
    }
    ls_param = {'alpha': [0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008],
                'max_iter': [10000], "normalize": [False]}
    elnet_param = {'alpha': [0.0003, 0.0004, 0.0005],
                   'l1_ratio': [0.9, 0.95, 0.99, 1],
                   'max_iter': [10000]}
    ridge_param = {'alpha': [10, 10.1, 10.2, 10.3, 10.4, 10.5]}
    svr_param = {'gamma': [1e-08, 1e-09],
                 'C': [100000, 110000],
                 'epsilon': [1, 0.1, 0.01]}

    rf = get_best_estimator(train_data, y_train_values, estimator=RandomForestRegressor(),
                            params=rf_param, n_jobs=4)
    elnet = get_best_estimator(train_data, y_train_values, estimator=ElasticNet(),
                               params=elnet_param, n_jobs=4)
    lso = get_best_estimator(train_data, y_train_values, estimator=Lasso(),
                             params=ls_param, n_jobs=4)
    rdg = get_best_estimator(train_data, y_train_values, estimator=Ridge(),
                             params=ridge_param, n_jobs=4)
    svr = get_best_estimator(train_data, y_train_values, estimator=SVR(),
                             params=svr_param, n_jobs=4)

    def cv_rmse(model):
        kfolds = KFold(n_splits=5, shuffle=True, random_state=42)
        rmse = np.sqrt(-cross_val_score(model, train_data, y_train_values,
                                        scoring="neg_mean_squared_error", cv=kfolds))
        return (rmse)

    """
    print("Randomforest model rmse : ", cv_rmse(rf).mean())
    print("elastic model rmse : ", cv_rmse(elnet).mean())
    print("lasso model rmse : ", cv_rmse(lso).mean())
    print("ridge model rmse : ", cv_rmse(rdg).mean())
    print("svr model rmse : ", cv_rmse(svr).mean())
    """

    model = StackingRegressor(
        regressors=[rf, elnet, lso, rdg, svr],
        meta_regressor=Lasso(alpha=0.0005)
        # meta_regressor=SVR(kernel='rbf')
    )

    # Fit the model on our data
    model.fit(train_data, y_train_values)
    # print("StackingRegressor model rmse : ", cv_rmse(model).mean())
    # y_pred = model.predict(train_data)
    # print(sqrt(mean_squared_error(y_train_values, y_pred)))
    # Predict test set
    ensembled = np.expm1(model.predict(predict_data))
    # sns.scatterplot(np.expm1(rf.predict(train_data), np.expm1(y_train_values)))
    # plt.show()
    # ensembled = np.expm1(rf.predict(predict_data))

    """ export submission data """
    submission = pd.DataFrame({
        "Id": test_set_id,
        "SalePrice": ensembled
    })
    submission.to_csv('submission_jiwon.csv', index=False)
import pandas as pd
import seaborn as sns
import numpy as np
from scipy import stats

#%%

accidents = pd.read_csv(
    r'C:\Users\rcf004\Documents\Python Scripts\US_Accidents_Dec19.csv')
accidents.dropna(
    subset=['Start_Lng', 'Start_Lat', 'Severity', 'Visibility(mi)'],
    inplace=True)

#%%

no_out = accidents[
    np.abs(stats.zscore(accidents['Visibility(mi)'].astype(int))) < 3]

#%%

df1 = no_out[no_out['Start_Time'].astype(str).str.contains('2016-')]
df2 = no_out[no_out['Start_Time'].astype(str).str.contains('2017-')]
df3 = no_out[no_out['Start_Time'].astype(str).str.contains('2018-')]
df4 = no_out[no_out['Start_Time'].astype(str).str.contains('2019-')]

#%%

a = 0.5
sz = (1, 10)
lw = 0
kwargs = {'marker': "."}
cmap = sns.cubehelix_palette(start=1.1,
        elif seq[nucpos:nucpos + 3] == "---":
            nucpos += 3
            continue
        else:
            dnlist[codon] += 1
            nucpos += 3
    except KeyError:
        nucpos += 3
        continue
    nucpos += 3

calclist = []
for i in range(0, len(dslist)):
    calclist.append(dnlist[i] - dslist[i])

zmin = stats.zscore(calclist)

plt.figure()
for i, z in enumerate(zmin):
    if z > 3:
        plt.scatter(i, z, color='orange')
    else:
        plt.scatter(i, z, color='blue')
plt.ylabel("zscore")
plt.xlabel("codon num")
plt.savefig("scatter.png")
plt.close()
    date_bucket = np.hstack((date_bucket, np.asarray(tsd['timestamp'][i])))
date_bucket = np.unique(date_bucket)
date_bucket = date_bucket[date_bucket > datetime.date(2020, 1, 1)]

interpolated_time_series = []
for i in range(0, len(tsd)):
    series = pd.DataFrame(tsd['data'][i], tsd['timestamp'][i])
    series = series[~series.index.duplicated(keep='first')]
    series = series[series.index > datetime.date(2020, 1, 1)]
    series = series.reindex(
        date_bucket, fill_value=0).sort_index().mask(series == 0).interpolate()
    if series.isna().sum()[0] > len(series) / 2 or \
            series.var()[0] < variance_threshold:
        continue
    elif series.isna().sum()[0] > 0:
        series = series.fillna(0)
    series['zscore'] = st.zscore(series)
    interpolated_time_series.append({
        "path": tsd['path'][i],
        "node": tsd['node'][i],
        "slot": tsd['slot'][i],
        "port": tsd['port'][i],
        "pm": tsd['pm'][i],
        "raw_data": np.asarray(series[0]),
        "z-score": np.asarray(series['zscore']),
        "timestamp": np.asarray(series.index)
    })
    print(i)

f = open("vodafone_data_oct30_not_pm_filtered_interpolated.pkl", "wb")
pickle.dump(pd.DataFrame(interpolated_time_series), f)
f.close()
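# A small, self-contained illustration of the reindex/interpolate idea used in
# the loop above (toy values and dates; the real code also masks zero
# placeholders before interpolating):
import pandas as pd

bucket = pd.to_datetime(["2020-02-01", "2020-02-02", "2020-02-03", "2020-02-04"])
s = pd.DataFrame({"value": [1.0, 3.0]},
                 index=pd.to_datetime(["2020-02-01", "2020-02-04"]))
# Reindex onto the shared date bucket (missing dates become NaN), then fill
# the gaps by linear interpolation so every series shares the same timestamps.
filled = s.reindex(bucket).sort_index().interpolate()
print(filled["value"].tolist())  # [1.0, 1.666..., 2.333..., 3.0]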
## comparison to shortest path analysis
ALL = set(INH_PLC_list)
HIT = PET_TG & SPL_TG
pval = stats.hypergeom.sf(len(HIT), len(ALL), len(PET_TG), len(SPL_TG))
print("Petrinet & shortest path", pval, [entz_dic[entz] for entz in HIT])

## get gold standard from CTD database
gene_df = pd.read_table('./CTD/CTD_genes_related_to_atrophy.txt', sep='\t')

## get z-score of inference score in CTD
infScore_ds = gene_df['InferenceScore']
scoreList = list(infScore_ds.dropna())
scoreDic = {}
zScoreList = stats.zscore(scoreList)
for ii in range(len(scoreList)):
    scoreDic[scoreList[ii]] = zScoreList[ii]

GS_dic = {}
for index, row in gene_df.iterrows():
    if pd.isnull(row['DirectEvidence']) == False:
        if str(row['GeneID']) == '3479':  # use IGF1R instead of IGF1
            GS_dic['3480'] = 100
        else:
            GS_dic[str(row['GeneID'])] = 100
for index, row in gene_df.iterrows():
    if pd.isnull(row['DirectEvidence']) == True:
        if str(row['GeneID']) in GS_dic:
            continue
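# For reference, a toy example of the hypergeometric enrichment test used
# above (the numbers are made up): out of a universe of 100 genes, list A has
# 20 genes, list B has 30, and they overlap in 12.
from scipy import stats

overlap, universe, listA, listB = 12, 100, 20, 30
# hypergeom.sf(k, M, n, N) gives P(X > k); sf(k - 1) would give P(X >= k).
# The call above follows the sf(len(HIT), ...) convention.
p = stats.hypergeom.sf(overlap, universe, listA, listB)
print(p)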
import numpy as np
import pandas as pd
from copy import deepcopy
from scipy import stats
from mpl_toolkits.mplot3d import Axes3D

get_ipython().magic('matplotlib inline')
#%matplotlib inline
from matplotlib import pyplot as plt
plt.rcParams['figure.figsize'] = (16, 9)
plt.style.use('ggplot')

# Importing the dataset
data = pd.read_csv('C:\\Users\\himverma\\AnacondaProjects\\KMeans\\xclaraOriginal.csv')
print("Input Data and Shape")
print(data.shape)
data.head()

# Getting the values and plotting it
f1 = data['V1'].values
#f2 = data['V2'].values
#X = np.array(list(zip(f1, f2)))
X = np.array(f1)
stats.zscore(X)  # note: the z-scores are computed here but not stored
fit = stats.norm.pdf(X, np.mean(X), np.std(X))
plt.plot(X, fit, '-o')
plt.hist(X, 30, density=True)
plt.show()
print(X.mean())
# "standard representation" for the course, is the number of classes, C: C = len(classNames) # Add offset attribute X = np.concatenate((np.ones((X.shape[0],1)),X),1) #attributeNames = [u'Offset']+attributeNames M = M+1 #attributeNames = ('Offset', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline') #attributeNames = ('Offset', 'Ash', 'Magnesium', 'Color intensity', 'Hue' 'Proline') attributeNames = ('Offset', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline') #Standalize the data to zero mean and standard deviation of 1 X_standarized = zscore(X, ddof=1) #Do the standalization ############################################ #Use exercise 8.1.1 ########################################### ## Crossvalidation # Create crossvalidation partition for evaluation K = 10 CV = model_selection.KFold(K, shuffle=True) #CV = model_selection.KFold(K, shuffle=False) # Values of lambda
def create_model(self, train_X, train_y, val_X, val_y):
    """
    Args:
        train_X (pandas dataframe)
        train_y (pandas dataframe)
    Returns:
        trained Keras Sequential (LSTM) model
    """
    # Build the model inputs: z-score the features and reshape to
    # (samples, timesteps=1, features) for the LSTM layer
    train_X = train_X[self.feature_columns]
    train_X = stats.zscore(train_X)
    train_X = train_X.reshape((train_X.shape[0], 1, train_X.shape[1]))
    val_X = val_X[self.feature_columns]
    val_X = stats.zscore(val_X)
    val_X = val_X.reshape((val_X.shape[0], 1, val_X.shape[1]))

    model = Sequential()
    model.add(LSTM(512, input_shape=(train_X.shape[1], train_X.shape[2])))
    model.add(BatchNormalization())
    model.add(Dropout(.2))
    model.add(Dense(256))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.1))
    model.add(Dense(256))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.1))
    model.add(Dense(128))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.05))
    model.add(Dense(64))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.05))
    model.add(Dense(32))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.05))
    model.add(Dense(16))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(.05))
    model.add(Dense(1))

    # Compile the network
    model.compile(loss='mse',
                  optimizer=optimizers.Adam(0.001),
                  metrics=['mse'])
    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=7,
                          verbose=1, epsilon=1e-4, mode='min')
    ]
    model.fit(x=train_X, y=train_y, epochs=80,
              validation_data=(val_X, val_y),
              callbacks=callbacks)
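# A minimal illustration (with made-up shapes) of the z-score + reshape step
# that feeds the LSTM above: scipy.stats.zscore on a 2-D table returns a plain
# ndarray standardized column-wise, which is then reshaped to
# (samples, timesteps=1, features).
import numpy as np
from scipy import stats

toy = np.random.rand(8, 5)               # 8 samples, 5 features
toy_z = stats.zscore(toy)                # column-wise z-scores, shape (8, 5)
toy_lstm = toy_z.reshape((toy_z.shape[0], 1, toy_z.shape[1]))
print(toy_lstm.shape)                    # (8, 1, 5)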
def impute_markers(model_path, data_path, *, save_path=None, start_frame=None,
                   n_frames=None, stride=1, markers_to_fix=None,
                   error_diff_thresh=.25, model=None):
    """Imputes the position of missing markers.

    :param model_path: Path to model to use for prediction.
    :param data_path: Path to marker and bad_frames data. Can be hdf5 or
                      mat -v7.3.
    :param save_path: Path to .mat file where predictions will be saved.
    :param start_frame: Frame at which to begin imputation.
    :param n_frames: Number of frames to impute.
    :param stride: stride length between frames for faster imputation.
    :param markers_to_fix: Markers for which to override suspicious MoCap
                           measurements.
    :param error_diff_thresh: Z-scored difference threshold marking suspicious
                              frames.
    :param model: Model to be used in prediction. Overrides model_path.
    :return: preds
    """
    # Check data extensions
    filename, file_extension = os.path.splitext(data_path)
    accepted_extensions = {'.h5', '.hdf5', '.mat'}
    if file_extension not in accepted_extensions:
        raise ValueError('Improper extension: hdf5 or mat -v7.3 file required.')

    # Load data
    print('Loading data')
    f = h5py.File(data_path, 'r')
    if file_extension in {'.h5', '.hdf5'}:
        markers = np.array(f['markers'][:]).T
        marker_means = np.array(f['marker_means'][:]).T
        marker_stds = np.array(f['marker_stds'][:]).T
        bad_frames = np.array(f['bad_frames'][:]).T
    else:
        # Get the markers data from the struct
        dset = 'markers_aligned_preproc'
        marker_names = list(f[dset].keys())
        n_frames_tot = f[dset][marker_names[0]][:].T.shape[0]
        n_dims = f[dset][marker_names[0]][:].T.shape[1]

        markers = np.zeros((n_frames_tot, len(marker_names) * n_dims))
        for i in range(len(marker_names)):
            marker = f[dset][marker_names[i]][:].T
            for j in range(n_dims):
                markers[:, i * n_dims + j] = marker[:, j]

        # Z-score the marker data
        marker_means = np.mean(markers, axis=0)
        marker_means = marker_means[None, ...]
        marker_stds = np.std(markers, axis=0)
        marker_stds = marker_stds[None, ...]
        print(marker_means)
        print(marker_stds)
        markers = stats.zscore(markers)

        # Get the bad_frames data from the cell
        dset = 'bad_frames_agg'
        n_markers = f[dset][:].shape[0]
        bad_frames = np.zeros((markers.shape[0], n_markers))
        for i in range(n_markers):
            reference = f[dset][i][0]
            bad_frames[np.squeeze(f[reference][:]).astype('int32') - 1, i] = 1

    # Set number of frames to impute
    if n_frames is None:
        n_frames = markers.shape[0]
    if start_frame is None:
        start_frame = 0
    print('Predicting %d frames starting at frame %d.'
          % (n_frames, start_frame))

    # Exceptions
    if n_frames > markers.shape[0]:
        raise ValueError("Improper n_frames to predict: likely asked to " +
                         "predict a greater number of frames than were " +
                         "available.")
    if (n_frames + start_frame) > markers.shape[0]:
        raise ValueError('start_frame + n_frames exceeds matrix dimensions.')
    if n_frames < 0:
        raise ValueError("Improper n_frames to predict: likely too few input" +
                         " frames.")
    if n_frames == 0:
        raise ValueError("Improper n_frames to predict: likely asked to " +
                         "predict zero frames.")

    markers = markers[start_frame:(start_frame + n_frames):stride, :]
    bad_frames = bad_frames[start_frame:(start_frame + n_frames):stride, :]

    # Load model
    if model is None:
        print('Loading model')
        model = load_model(model_path)

    # Check how many outputs the model has, and how many members if returning
    # member data.
    n_outputs = len(model.output_shape)
    if n_outputs == 2:
        return_member_data = True
    else:
        return_member_data = False
        member_predsF = [None]
        member_predsR = [None]

    # Set markers to fix
    if markers_to_fix is None:
        markers_to_fix = np.zeros((markers.shape[1])) > 1
        # TODO(Skeleton): Automate this by including the skeleton.
        markers_to_fix[30:36] = True
        markers_to_fix[42:] = True

    # If the model can return the member predictions, do so.
    if return_member_data:
        # Forward predict
        print('Imputing markers: forward pass')
        predsF, bad_framesF, member_predsF = \
            predict_markers(model, markers, bad_frames,
                            markers_to_fix=markers_to_fix,
                            error_diff_thresh=error_diff_thresh,
                            return_member_data=return_member_data)
        # Reverse predict
        print('Imputing markers: reverse pass')
        predsR, bad_framesR, member_predsR = \
            predict_markers(model, markers[::-1, :], bad_frames[::-1, :],
                            markers_to_fix=markers_to_fix,
                            error_diff_thresh=error_diff_thresh,
                            return_member_data=return_member_data)
    else:
        # Forward predict
        print('Imputing markers: forward pass')
        predsF, bad_framesF = \
            predict_markers(model, markers, bad_frames,
                            markers_to_fix=markers_to_fix,
                            error_diff_thresh=error_diff_thresh,
                            return_member_data=return_member_data)
        # Reverse predict
        print('Imputing markers: reverse pass')
        predsR, bad_framesR = \
            predict_markers(model, markers[::-1, :], bad_frames[::-1, :],
                            markers_to_fix=markers_to_fix,
                            error_diff_thresh=error_diff_thresh,
                            return_member_data=return_member_data)

    # Convert to real world coordinates
    markers_world = np.zeros((markers.shape))
    predsF_world = np.zeros((predsF.shape))
    predsR_world = np.zeros((predsR.shape))
    for i in range(markers_world.shape[1]):
        markers_world[:, i] = \
            markers[:, i]*marker_stds[0, i] + marker_means[0, i]
        predsF_world[:, i] = \
            predsF[:, i]*marker_stds[0, i] + marker_means[0, i]
        predsR_world[:, i] = \
            predsR[:, i]*marker_stds[0, i] + marker_means[0, i]
    predsR_world = predsR_world[::-1, :]
    bad_framesR = bad_framesR[::-1, :]

    # This is not necessarily all of the error frames from
    # multiple_predict_recording_with_replacement, but if they overlap,
    # we would just take the weighted average.
    for i in range(bad_frames.shape[1]):
        bad_frames[:, i] = np.any(bad_framesF[:, (i * 3):(i * 3) + 3] &
                                  bad_framesR[:, (i * 3):(i * 3) + 3], axis=1)

    # Compute the weighted average of the forward and reverse predictions
    # using a logistic function
    print('Computing weighted average')
    preds_world = np.zeros(predsF_world.shape)
    for i in range(bad_frames.shape[1] * 3):
        is_bad = bad_frames[:, np.floor(i / 3).astype('int32')]
        CC = measure.label(is_bad, background=0)
        num_CC = len(np.unique(CC)) - 1
        preds_world[:, i] = predsF_world[:, i]
        for j in range(num_CC):
            length_CC = np.sum(CC == (j + 1))
            x_0 = np.round(length_CC / 2)
            k = 1
            weightR = sigmoid(np.arange(length_CC), x_0, k)
            weightF = 1 - weightR
            preds_world[CC == (j+1), i] = \
                (predsF_world[CC == (j+1), i]*weightF) + \
                (predsR_world[CC == (j+1), i]*weightR)

    # Save predictions to a matlab file.
    if save_path is not None:
        s = 'Saving to %s' % (save_path)
        print(s)
        savemat(save_path, {
            'preds': preds_world,
            'markers': markers_world,
            'badFrames': bad_frames,
            'member_predsF': member_predsF,
            'member_predsR': member_predsR
        })
    return preds_world
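# `sigmoid` is used above to blend the forward and reverse passes but is not
# defined in this excerpt. A plausible sketch, assuming a standard logistic
# curve with midpoint x_0 and steepness k (the exact definition is an
# assumption, not the author's code):
import numpy as np

def sigmoid(x, x_0, k):
    """Logistic weight ramp: near 0 at the start of a bad stretch, near 1 at
    the end, so the reverse-pass prediction gradually takes over."""
    return 1.0 / (1.0 + np.exp(-k * (x - x_0)))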
def prepare_confounders(
        sm,
        confounders=tuple(),
        hcp_confounders=False,
        hcp_confounder_software_version=True,
        squared_confounders=False,
        impute0=True,
        # headmotion_features=('movement_AbsoluteRMS_mean',
        #                      'movement_RelativeRMS_mean')
):
    """Prepare the confounder matrix.

    Parameters
    ----------
    sm : pd.DataFrame (n_samples, n_features)
        behavioral data matrix
    confounders : tuple of str
        column-names in ``sm`` to be used as confounders. If some are not
        found a ``ValueError`` is raised.
    hcp_confounders : bool
        if ``True`` 'Weight', 'Height', 'BPSystolic', 'BPDiastolic', 'HbA1C'
        as well as the cubic roots of 'FS_BrainSeg_Vol', 'FS_IntraCranial_Vol'
        are included as confounders
    hcp_confounder_software_version : bool
        if ``True`` and ``hcp_confounders`` is also ``True``, then the feature
        'fMRI_3T_ReconVrs' (encoded as a dummy variable) is used as confounder
    squared_confounders : bool
        if ``True`` the squares of all confounders (except software version,
        if used) are used as additional confounders
    impute0 : bool
        if True, missing confound values are imputed with 0 (after an inverse
        normal transformation)

    Returns
    -------
    confounders : np.ndarray (n_samples, n_features)
        confounder data matrix, if impute0 is ``False`` it can have ``NaN``s

    Raises
    ------
    ValueError
        if confounders couldn't be found in ``sm``
    """
    _confounders = [f for f in confounders if f in sm]
    if len(_confounders) != len(confounders):
        missing_confounders = [f for f in confounders if f not in sm]
        raise ValueError('Confounders not found: '
                         '{}'.format(missing_confounders))

    confounders_matrix = sm[_confounders].values

    if hcp_confounders:
        sm_confounders = sm[[
            'Weight', 'Height', 'BPSystolic', 'BPDiastolic', 'HbA1C',
        ]].values
        fs_confounders = sm[['FS_BrainSeg_Vol',
                             'FS_IntraCranial_Vol']].values**(1. / 3)
        confounders_matrix = np.hstack(
            [confounders_matrix, sm_confounders, fs_confounders])

    if squared_confounders:
        confounders_matrix = np.hstack(
            [confounders_matrix, confounders_matrix**2])

    if hcp_confounders and hcp_confounder_software_version:
        # software reconstruction version
        reconvrs = sm['fMRI_3T_ReconVrs'].values
        used_reconvrss = np.unique(reconvrs)
        print('used fMRI 3T reconstruction software versions are:',
              used_reconvrss)
        assert set(used_reconvrss.tolist()) == {'r177', 'r177 r227', 'r227'}
        # dummy-coding: r177 -> 0, r227 -> 1, "r177 r227" -> 1
        reconvrs = np.where(reconvrs == 'r177', 0, 1).reshape(-1, 1)
        confounders_matrix = np.hstack([confounders_matrix, reconvrs])

    if confounders_matrix.shape[1] > 0:
        # inverse normal transform (this also results in mean 0)
        confounders_matrix = \
            rank_based_inverse_normal_trafo(confounders_matrix)

        if impute0:
            # impute 0 for missing values
            print('{:.2f}% of values in confounders missing, imputing 0 for '
                  'these'.format(
                      100 * (1 - np.isfinite(confounders_matrix).mean())))
            confounders_matrix[~np.isfinite(confounders_matrix)] = 0
        else:
            print('{:.2f}% of values in confounders missing'.format(
                100 * (1 - np.isfinite(confounders_matrix).mean())))

        # normalise
        confounders_matrix = zscore(confounders_matrix, nan_policy='omit')

    return confounders_matrix
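# `rank_based_inverse_normal_trafo` is called above but not shown in this
# excerpt. A minimal sketch of a column-wise rank-based inverse normal
# transform (the exact variant, e.g. Blom offsets, is an assumption):
import numpy as np
from scipy import stats

def rank_based_inverse_normal_trafo_sketch(x):
    """Map each column's ranks onto standard-normal quantiles, keeping NaNs."""
    x = np.asarray(x, dtype=float)
    out = np.full_like(x, np.nan)
    for j in range(x.shape[1]):
        col = x[:, j]
        finite = np.isfinite(col)
        ranks = stats.rankdata(col[finite])
        out[finite, j] = stats.norm.ppf((ranks - 0.5) / finite.sum())
    return out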
def train_price_model(self, data: pd.DataFrame):
    df = data
    # Drop rows more than 2.8 standard deviations away on price, term or full_sq
    df = df[((np.abs(stats.zscore(df.price)) < 2.8) &
             (np.abs(stats.zscore(df.term)) < 2.8) &
             (np.abs(stats.zscore(df.full_sq)) < 2.8))]
    # !!!!!!!! ADD 'was_opened'

    # Keep only 2019 and 2020 announcements
    df = df[(df.yyyy_announce.isin([19, 20]))]

    df = df[[
        'price', 'to_center', 'full_sq', 'kitchen_sq', 'life_sq', 'rooms',
        'is_apartment', 'renovation', 'has_elevator', 'time_to_metro',
        'floor_first', 'floor_last', 'is_rented', 'rent_quarter',
        'rent_year', 'mm_announce', 'yyyy_announce', 'clusters'
    ]]

    # Save the remaining columns to a variable
    columns = list(df.columns)

    # Log transformation
    df["full_sq"] = np.log1p(df["full_sq"])
    df["life_sq"] = np.log1p(df["life_sq"])
    df["kitchen_sq"] = np.log1p(df["kitchen_sq"])
    df["price"] = np.log1p(df["price"])
    df["to_center"] = np.log1p(df["to_center"])

    # Create predictor features
    X = df.drop(['price'], axis=1)

    # Target feature
    y = df[['price']].values.ravel()

    # Split into train and test
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                        random_state=1)

    # Define Gradient Boosting Machine model
    lgbm_model = LGBMRegressor(objective='regression',
                               learning_rate=0.07,
                               n_estimators=1250,
                               max_depth=10,
                               min_child_samples=1,
                               verbose=0)
    # RF = RandomForestRegressor(n_estimators=300, verbose=1, n_jobs=-1)

    # Train GBM on the train split
    lgbm_model.fit(X_train, y_train)
    lgbm_preds = lgbm_model.predict(X_test)
    print('The R2_score of the Gradient boost is',
          r2_score(y_test, lgbm_preds), flush=True)
    print('RMSE is: \n', mean_squared_error(y_test, lgbm_preds), flush=True)

    # Retrain GBM on the full dataset
    lgbm_model.fit(X, y)

    return lgbm_model, columns
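# Note on using the returned model: the target was fit on np.log1p(price), and
# several predictors (full_sq, life_sq, kitchen_sq, to_center) were also
# log1p-transformed. New rows therefore need the same transforms before
# prediction, and the output must be inverted, e.g. (hypothetical `X_new`
# prepared with the same columns and transforms):
#     predicted_price = np.expm1(lgbm_model.predict(X_new))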
# # Preliminary data analysis

# ## Missing values

# In[6]:

null_data = seeds[seeds.isnull().any(axis=1)]
display(null_data)

# No missing values here.

# ## Outliers

# In[7]:

"""Clean outliers here by computing the Z-score of each value in the column;
if the absolute Z-score is bigger than 3, delete that row."""
data = seeds[(np.abs(stats.zscore(seeds)) < 3).all(axis=1)]
data.info()

# Only 2 rows were deleted, so dropping the outlier rows directly is an easy
# way to handle outliers.

# ## Correlation

# In[8]:

plt.figure(figsize=(20, 7))
sns.heatmap(data.corr(), cmap='BrBG', annot=True)
plt.title('Variables Correlation', fontsize=18)
plt.show()

# As can be seen, asym is the attribute least correlated with the others.
plt.savefig('KDE_MINIMUM_PAYMENTS.png')

data['MINIMUM_PAYMENTS'] = data['MINIMUM_PAYMENTS'].fillna(
    data['MINIMUM_PAYMENTS'].median())
pd.isnull(data).sum()

#%%
'''
Outliers Treatment
'''
# calculate z-score
from scipy import stats

# drop string feature and features with a meaningful range
data1 = data.drop(columns=['CUST_ID', 'TENURE'])
z_score = pd.DataFrame(np.abs(stats.zscore(data1)), columns=data1.columns)

# Find features with more than 2% outliers (absolute z-score > 3)
z_score3 = []
over3_index = []
for i in z_score.columns:
    indexs = z_score.index[z_score[i] > 3].tolist()
    ans = i, "{:.3f}".format(len(indexs) / len(z_score)), indexs
    z_score3.append(ans)
    if len(indexs) / len(z_score) > 0.02:
        over3_index.append(i)

# remove 'BALANCE' and 'CASH_ADVANCE' since they are regarded as highly
# discriminative features
del over3_index[0]
del over3_index[1]
int(team["L"]), "is_world_series_winner": team["WSWin"], "attendance": float(team["attendance"]), "avg_salary": float(round(average(players_in_team, "salary"), 2)), "batting_avg": float(round(average(map(batting_average, players_in_team)), 3)), "era": float(round(average(pitchers_in_team, "ERA"), 3)) }) grouped_by_year = group_by(flat_franchise_year, "year") for year, records in grouped_by_year.iteritems(): z_scores_salary = np.round(stats.zscore(pluck("avg_salary", records)), 2) z_scores_wins = np.round(stats.zscore(pluck("wins", records)), 2) z_scores_batting_avg = np.round( stats.zscore(pluck("batting_avg", records)), 2) z_scores_losses = np.round(stats.zscore(pluck("losses", records)), 2) z_scores_attendance = np.round(stats.zscore(pluck("attendance", records)), 2) z_scores_era = np.round(stats.zscore(pluck("era", records)), 2) for i, record in enumerate(records): record["z_avg_salary"] = z_scores_salary[i] record["z_wins"] = z_scores_wins[i] record["z_batting_avg"] = z_scores_batting_avg[i] record["z_losses"] = z_scores_losses[i] record["z_attendance"] = z_scores_attendance[i] record["z_era"] = z_scores_era[i]
def removeOutliersZScore(data):
    outlierColumns = data[[IncomeColumn]].copy()
    z = numpy.abs(stats.zscore(outlierColumns))
    newData = data[(z < 12).all(axis=1)]
    return newData