def NB_PCAclassification(train_set, test_set, train_ann, test_ann):
    global path, train_perc
    # Create z-scored data (note: currently unused; the PCA below is fit on the raw
    # data and this variable is immediately overwritten by the PCA projection)
    normalized_train = zscore(train_set)
    # create classifier object
    gaussian_nb = GaussianNB()
    # Create PCA object
    pca = PCA(n_components=20)
    pca.fit(train_set)
    normalized_train = pca.transform(train_set)
    # train the NB classifier
    gaussian_nb.fit(normalized_train, train_ann)
    # store the classifier and the pca object
    if train_perc < 1.0:
        pickle.dump(gaussian_nb, open(path + "GaussianNB_classifier.p", "wb+"))
        pickle.dump(pca, open(path + "PCA_object.p", "wb+"))
    # convert test data to suitable format and test the NB classifier
    normalized_test = zscore(test_set)  # likewise unused; the raw test data are projected
    test = pca.transform(test_set)
    results = gaussian_nb.predict(test)
    cm = confusion_matrix(test_ann, results)
    print('CONFUSION MATRIX = {}'.format(cm))
    return metrics(cm)
def clean_confound(RS, COG, confmat):
    '''Z-score RS and COG and regress the confounds out of both.'''
    # regress out confounds: z-score them and add squared terms to help
    # account for potentially nonlinear effects of these confounds
    z_confound = zscore(confmat)
    z2_confound = z_confound ** 2
    conf_mat = np.hstack((z_confound, z2_confound))
    # Handle NaN in z-scores
    conf_mat = np.nan_to_num(conf_mat)
    # clean signal
    RS_clean = clean(zscore(RS), confounds=conf_mat,
                     detrend=False, standardize=False)
    COG_clean = clean(zscore(COG), confounds=conf_mat,
                      detrend=False, standardize=False)
    return RS_clean, COG_clean, conf_mat
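
# A minimal usage sketch for clean_confound (illustrative only; assumes numpy is
# imported as np and that zscore and nilearn's clean are imported as the function
# above requires; the random matrices are stand-ins for real data).
# RS = np.random.rand(50, 10)         # e.g. resting-state features, subjects x features
# COG = np.random.rand(50, 3)         # e.g. cognitive scores
# confounds = np.random.rand(50, 2)   # e.g. age and motion summaries
# RS_clean, COG_clean, conf_mat = clean_confound(RS, COG, confounds)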
def stadardize(data, drop_vars, drop):
    # Optionally drop the requested variables, then z-score and rebuild the DataFrame
    # so the standardized values are actually returned (pandas imported as po)
    if drop == 1:
        temp_data = data.drop(drop_vars, axis=1, inplace=False)
        data = po.DataFrame(zscore(temp_data), columns=temp_data.columns)
    else:
        data = po.DataFrame(zscore(data), columns=data.columns)
    return data
def manifold_plot(man, fpkmMatrix, samples, standardize=3, log=True, show_text=False,
                  sep='_', legend_loc='best', legend_size=14):
    # man: the instance of a manifold algorithm
    ## preprocessing of the fpkmMatrix
    if log:
        fpkmMatrix = np.log10(fpkmMatrix + 1.)
    if standardize == 2:  # standardize along rows/genes
        fpkmMatrix = zscore(fpkmMatrix, axis=1)
    elif standardize == 1:  # standardize along cols/samples
        fpkmMatrix = zscore(fpkmMatrix, axis=0)

    fpkmMatrix = man.fit_transform(fpkmMatrix.T)

    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111)
    scatter_proxies = []
    labels_show = []
    groups = {}
    conditions = list(set([s.split(sep)[0] for s in samples]))
    for row, label in zip(fpkmMatrix, samples):
        label_show = label.split(sep)[0]
        idx = conditions.index(label_show)
        ax.scatter(row[0], row[1], label='label', color=COLORS10[idx],
                   visible=not show_text, s=50, marker='o')
        if label_show not in labels_show:
            labels_show.append(label_show)
            scatter1_proxy = Line2D([0], [0], ls="none", c=COLORS10[idx], marker='o')
            scatter_proxies.append(scatter1_proxy)
        if show_text:
            ax.text(row[0], row[1], label,
                    ha='center', va='center', rotation=0, color=COLORS10[idx], size='large')

    ax.legend(scatter_proxies, labels_show, numpoints=1, frameon=True,
              loc=legend_loc, prop={'size': legend_size})
    ax.set_xlabel('M1', fontsize=20)
    ax.set_ylabel('M2', fontsize=20)
    enlarge_tick_fontsize(ax, 14)
    fig.tight_layout()
    plt.show()
    return
def export_for_check_intercoder(data, field, folder, trinairize=True, trinary_cutoff=0.5,
                                beta=1, errors=False, normalization='zscore'):
    # Load support libraries
    import sklearn.metrics
    import sklearn.utils
    from scipy.stats.mstats import zscore

    # Define helper function to parse results
    def parse_PRFS_result(predicted, true, beta):
        result = {}
        output = sklearn.metrics.precision_recall_fscore_support(true, predicted, beta=beta)
        fbeta_label = "f{beta}".format(beta=beta)
        # Parse the output
        labels = sklearn.utils.multiclass.unique_labels(true, predicted)
        n_predicted = [sum(predicted == label) for label in labels]
        output = (*output, n_predicted)
        output_fields = ["precision", "recall", fbeta_label, "support", 'n_predicted']
        for output_field, row in zip(output_fields, output):
            result[output_field] = dict(zip(labels, row))
        result['precision'].update({'global': sklearn.metrics.precision_score(true, predicted, average='weighted')})
        result['recall'].update({'global': sklearn.metrics.recall_score(true, predicted, average='weighted')})
        result[fbeta_label].update({'global': sklearn.metrics.fbeta_score(true, predicted, beta=beta, average='weighted')})
        result['support'].update({'global': len(predicted[~predicted.isnull()])})
        return result

    if normalization == "min-max":
        # min-max scale metrics to push them into a [-1,1] interval,
        # assuming a minimum upper-value bound of 1
        colnames = [name for name in data.columns if field + '_' in name and not '_err' in name]
        data = data[colnames] / data[colnames].abs().max().map(lambda x: max(x, 1))

    # Select appropriate subset of data for quality metrics
    gold_field = "{field}_gold".format(field=field)
    data = data[~data[gold_field].isnull()]

    # Run the quality report function over each column
    if not errors:
        cols = [col for col in data.columns if field + "_" in col and not '_gold' in col and not "_err" in col]
        df_results = {}
        for col in cols:
            df_results[col + '_gold'] = data[~data[col].isnull() & ~data[gold_field].isnull()][gold_field]
            df_results[col] = data[~data[col].isnull() & ~data[gold_field].isnull()][col]
            # Trinarize if so required to cast the task
            # as a three-class classification problem
            if trinairize:
                df_results[col + '_gold'] = pandas.Series(zscore(df_results[col + '_gold'])).map(make_trinary)
                if normalization == 'zscore':
                    df_results[col] = pandas.Series(zscore(df_results[col])).map(make_trinary).map(int)
                else:
                    df_results[col] = df_results[col].map(make_trinary).map(int)
            print(col)
        df_results = pandas.DataFrame(df_results)
        df_results.to_csv(folder + '/' + field + '_for_intercoder.csv')
    return
def dtw(x, y, dist, l=1, warp=1, z_normalize=False):
    if z_normalize:
        x = zscore(x)
        y = zscore(y)
    series_len = len(x)
    distance_cost = np.full((series_len + 1, series_len + 1), np.inf)
    distance_cost[0, 0] = 0
    ident = int(l * series_len)
    pairs = distance_cost[1:, 1:]
    for i in range(series_len):
        for j in range(max(0, i - ident), min(series_len, i + ident + 1)):
            pairs[i, j] = dist(x[i], y[j])
    pairwise_distances = pairs.copy()
    for i in range(1, series_len + 1):
        for j in range(max(1, i - ident), min(series_len + 1, i + ident + 1)):
            min_list = []
            for k in range(1, warp + 1):
                i_k = max(i - k, 0)
                j_k = max(j - k, 0)
                min_list += [distance_cost[i_k, j], distance_cost[i, j_k], distance_cost[i_k, j_k]]
            distance_cost[i, j] += min(min_list)
    path, path_cost = _traceback(distance_cost)
    return path_cost, path, distance_cost[1:, 1:], pairwise_distances
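
# A minimal usage sketch for dtw (illustrative only; assumes numpy is imported as np
# and that the accompanying _traceback helper is available). The toy series and the
# absolute-difference distance below are stand-ins, not part of the original code.
# x = np.array([0.0, 1.0, 2.0, 3.0, 2.0, 1.0])
# y = np.array([0.0, 0.5, 2.0, 3.0, 3.0, 1.0])
# cost, path, acc_cost, pairwise = dtw(x, y, dist=lambda a, b: abs(a - b),
#                                      l=1, warp=1, z_normalize=True)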
def dtw_improved(x, y, dist, warp=1, l=0.3, zscr=False):
    if zscr:
        # assign the z-scored series back, otherwise the normalization has no effect
        x = zscore(x)
        y = zscore(y)
    r, c = len(x), len(y)
    lc = int(round(c * l))
    D0 = zeros((r + 1, c + 1))
    D0[0, 1:] = inf
    D0[1:, 0] = inf
    # D0[0, 0] = 0
    D = D0[1:, 1:]  # view
    D[0:, 0:] = inf
    a1, a2 = 0, 0
    for i in range(r + c - 1):
        t1 = threading.Thread(target=count_lines, args=(0, 3, x, y, copy.copy(a1), copy.copy(a2), dist, l, D, D0, warp))
        t2 = threading.Thread(target=count_lines, args=(1, 3, x, y, copy.copy(a1), copy.copy(a2), dist, l, D, D0, warp))
        t3 = threading.Thread(target=count_lines, args=(2, 3, x, y, copy.copy(a1), copy.copy(a2), dist, l, D, D0, warp))
        t1.start()
        t2.start()
        t3.start()
        t1.join()
        t2.join()
        t3.join()
        a1 = min(a1 + 1, r - 1)
        a2 = max(0, a1 - lc) + max((i + 2) - r, 0)

    # for i in range(r):
    #     for j in range(max(i - lc, 0), min(i + lc, c)):
    #         # if (c >= r - lc and c <= r + lc):
    #         D[i, j] = dist(x[i], y[j])
    #         # else:
    #         #     D1[i, j] = inf
    # print(D0)
    # print("-----")
    # print(D)
    C = D.copy()
    # for i in range(r):
    #     for j in range(max(i - lc, 0), min(i + lc, c)):
    #         min_list = [D0[i, j]]
    #         for k in range(1, warp + 1):
    #             i_k = min(i + k, r - 1)
    #             j_k = min(j + k, c - 1)
    #             min_list += [D0[i_k, j], D0[i, j_k]]
    #         D[i, j] += min(min_list)
    if len(x) == 1:
        path = zeros(len(y)), range(len(y))
    elif len(y) == 1:
        path = range(len(x)), zeros(len(x))
    else:
        path = _traceback(D0)
    return D[-1, -1] / sum(D.shape), C, D, path
def zscore_patient_adjmats():
    M_S1 = zscore(np.loadtxt('/home/despoB/kaihwang/Rest/NotBackedUp/ParMatrices/Tha_176_Gordon_333_cortical_corrmat'), None)
    M_S2 = zscore(np.loadtxt('/home/despoB/kaihwang/Rest/NotBackedUp/ParMatrices/Tha_128_Gordon_333_cortical_corrmat'), None)
    M_S3 = zscore(np.loadtxt('/home/despoB/kaihwang/Rest/NotBackedUp/ParMatrices/Tha_168_Gordon_333_cortical_corrmat'), None)
    M_S4 = zscore(np.loadtxt('/home/despoB/kaihwang/Rest/NotBackedUp/ParMatrices/Tha_163_Gordon_333_cortical_corrmat'), None)
    Patient_AdjMats = np.dstack((M_S1, M_S2, M_S3, M_S4))
    return Patient_AdjMats
def tra_linear_regression(self):
    tree_prepared = self.pipeline_processing()
    tree_labeled = self.outcome_processing()
    print(type(tree_prepared))
    model = sm.OLS(tree_labeled, tree_prepared).fit()
    z_model = sm.OLS(zscore(tree_labeled), zscore(tree_prepared)).fit()
    print(model.summary())
    print(z_model.summary())
def scipy_z_transfer(df_input, direct):
    df = df_input.copy()
    if direct == 'c':
        for k in range(len(df.columns)):
            df.iloc[:, k] = mt.zscore(df.iloc[:, k], ddof=1)
    else:
        for k in range(len(df.index)):
            df.iloc[k, :] = mt.zscore(df.iloc[k, :], ddof=1)
    return df
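
# A minimal usage sketch for scipy_z_transfer (illustrative only; assumes pandas is
# imported as pd and scipy.stats.mstats as mt, as the function above requires).
# direct='c' z-scores every column with ddof=1; any other value z-scores every row.
# df = pd.DataFrame({'a': [1.0, 2.0, 3.0], 'b': [10.0, 20.0, 40.0]})
# by_column = scipy_z_transfer(df, direct='c')
# by_row = scipy_z_transfer(df, direct='r')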
def PCA_3d_plot(fpkmMatrix, samples, standardize=3, log=True, show_text=False,
                sep='_', legend_loc='best', legend_size=14):
    # standardize: whether to apply a zscore transformation to the log10-transformed FPKM
    pca = PCA(n_components=None)
    ## preprocessing of the fpkmMatrix
    if log:
        fpkmMatrix = np.log10(fpkmMatrix + 1.)
    if standardize == 2:  # standardize along rows/genes
        fpkmMatrix = zscore(fpkmMatrix, axis=1)
    elif standardize == 1:  # standardize along cols/samples
        fpkmMatrix = zscore(fpkmMatrix, axis=0)

    ## remove genes with NaNs
    fpkmMatrix = fpkmMatrix[~np.isnan(np.sum(fpkmMatrix, axis=1))]

    ## get variance captured
    pca.fit(fpkmMatrix.T)
    variance_explained = pca.explained_variance_ratio_[0:3]
    variance_explained *= 100

    ## compute PCA and plot
    pca = PCA(n_components=3)
    pca_transformed = pca.fit_transform(fpkmMatrix.T)
    fig = plt.figure(figsize=(9, 9))
    ax = fig.add_subplot(111, projection='3d')

    labels_show = []
    scatter_proxies = []
    groups = {}
    conditions = list(set([s.split(sep)[0] for s in samples]))
    colors = COLORS10
    if len(conditions) > 10:
        colors = COLORS20
    if len(conditions) > 20:
        r = lambda: random.randint(0, 255)
        colors = ['#%02X%02X%02X' % (r(), r(), r()) for i in range(len(conditions))]

    for row, label in zip(pca_transformed, samples):
        label_show = label.split(sep)[0]
        idx = conditions.index(label_show)
        ax.scatter(row[0], row[1], row[2], label='label', color=colors[idx], s=50, marker='o')
        if label_show not in labels_show:
            labels_show.append(label_show)
            scatter1_proxy = Line2D([0], [0], ls="none", c=colors[idx], marker='o')
            scatter_proxies.append(scatter1_proxy)
        if show_text:
            ax.text(row[0], row[1] - 5, row[2] - 5, label.split(sep)[1],
                    ha='center', va='center', rotation=0, color=colors[idx], size='large')

    ax.set_xlabel('PC1 (%.2f' % variance_explained[0] + '%' + ' variance captured)', fontsize=16)
    ax.set_ylabel('PC2 (%.2f' % variance_explained[1] + '%' + ' variance captured)', fontsize=16)
    ax.set_zlabel('PC3 (%.2f' % variance_explained[2] + '%' + ' variance captured)', fontsize=16)
    ax.legend(scatter_proxies, labels_show, numpoints=1, frameon=True,
              loc='upper left', prop={'size': legend_size})
    fig.tight_layout()
    plt.show()
def standardize_pow_mat(stripped_pow_mat, events, sessions, outsample_session=None, outsample_list=None):
    zpow_mat = np.array(stripped_pow_mat)
    outsample_mask = None
    for session in sessions:
        sess_event_mask = (events.session == session)
        if session == outsample_session:
            outsample_mask = (events.list == outsample_list) & sess_event_mask
            insample_mask = ~outsample_mask & sess_event_mask
            zpow_mat[outsample_mask] = zmap(zpow_mat[outsample_mask], zpow_mat[insample_mask], axis=0, ddof=1)
            zpow_mat[insample_mask] = zscore(zpow_mat[insample_mask], axis=0, ddof=1)
        else:
            zpow_mat[sess_event_mask] = zscore(zpow_mat[sess_event_mask], axis=0, ddof=1)
    return zpow_mat, outsample_mask
def cleanClusters(faces, similarityMatrix, labels):
    # first remove outlying clusters
    indices = groupLabels(labels)
    inter_cluster_variances = list()
    toRemove = list()
    for k, v in indices.items():
        print('cluster {0}'.format(k))
        inter_cluster_variances.append(
            sum(sum(np.power(similarityMatrix[:, v][v, :], 2), 1)) / (len(v) - 1))
    inter_cluster_zscores = zscore(inter_cluster_variances)
    toRemove_inter_cluster = list()
    for index in range(0, len(inter_cluster_zscores)):
        if inter_cluster_zscores[index] <= (-1) or inter_cluster_zscores[index] >= 1:
            toRemove_inter_cluster.append(index)
    for i in toRemove_inter_cluster:
        toRemove.extend(indices.pop(i, None))
    similarityMatrix = np.delete(similarityMatrix, toRemove, 0)
    similarityMatrix = np.delete(similarityMatrix, toRemove, 1)
    labels = np.delete(labels, toRemove)
    faces = np.delete(np.array(faces), toRemove, 0)
    print(inter_cluster_zscores, toRemove_inter_cluster)

    # then remove the individual images
    silhouetteSamples = zscore(
        silhouette_samples(similarityMatrix, labels, metric='precomputed'))
    print(silhouetteSamples)
    below = (silhouetteSamples <= (-1))
    above = (silhouetteSamples >= 1)
    toRemove = list()
    for index in range(0, len(below)):
        if below[index]:
            toRemove.append(index)
    for index in range(0, len(above)):
        if above[index]:
            toRemove.append(index)
    toRemove.sort()
    print('toRemove', toRemove)
    similarityMatrix = np.delete(similarityMatrix, toRemove, 0)
    similarityMatrix = np.delete(similarityMatrix, toRemove, 1)
    labels = np.delete(labels, toRemove)
    faces = np.delete(np.array(faces), toRemove, 0)
    return (faces, similarityMatrix, labels)
def discard_outliers(Hshifts, Cshifts, H_thresh=15.0, C_thresh=10.0):
    """
    Discard outlying chemical-shift pairs.

    Z-scores the H and C shifts, converts them to two-sided p-values, applies
    Benjamini-Hochberg correction, and drops any pair flagged as an outlier in
    either dimension. (H_thresh and C_thresh are accepted for compatibility but
    are not used; the cutoffs below are fixed adjusted-p-value thresholds.)

    Parameters
    ----------
    Hshifts, Cshifts : array-like
        Observed H and C chemical shifts.

    Returns
    -------
    (new_Hshifts, new_Cshifts) : tuple of numpy arrays with outliers removed.

    References
    ----------
    Boris Iglewicz and David Hoaglin (1993), "Volume 16: How to Detect and
    Handle Outliers", The ASQC Basic References in Quality Control:
    Statistical Techniques, Edward F. Mykytka, Ph.D., Editor.
    """
    Hzscores = zscore(Hshifts)
    Hp_values = norm.sf(abs(Hzscores)) * 2
    adj_Hp_values = p_adjust_bh(Hp_values)
    final_discard_H = adj_Hp_values < 10e-100

    Czscores = zscore(Cshifts)
    Cp_values = norm.sf(abs(Czscores)) * 2
    adj_Cp_values = p_adjust_bh(Cp_values)
    final_discard_C = adj_Cp_values < 10e-10

    if len(Cshifts.shape) == 1:
        Cshifts = Cshifts[:, None]
    if len(Hshifts.shape) == 1:
        Hshifts = Hshifts[:, None]

    # boolean array, True if this value either in Cshifts or Hshifts is an outlier
    outliers = final_discard_C | final_discard_H
    i = 0
    new_Cshifts_list, new_Hshifts_list = [], []
    for C, H in zip(Cshifts, Hshifts):
        if outliers[i] == False:
            new_Cshifts_list.append(C[0])
            new_Hshifts_list.append(H[0])
        i += 1
    return np.array(new_Hshifts_list), np.array(new_Cshifts_list)
def embed_hierarchy(x, yy):
    # mean feature by level of abstraction
    [x1, y1] = group_mean_x(x, yy['1'])
    [x2, y2] = group_mean_x(x, yy['2'])
    [x3, y3] = group_mean_x(x, yy['3'])
    # x4 = x.todense()
    # y4 = np.array(yy['1'])

    # concatenate x,y for embedding, h is the level (1, 2, or 3)
    xh = np.concatenate((x1, x2, x3))
    yh = np.concatenate((y1, y2, y3))
    h = np.concatenate((np.ones(len(y1)) * 1, np.ones(len(y2)) * 2, np.ones(len(y3)) * 3))

    # low-D embedding
    print('start embedding')
    xh_ld = embed(xh)
    # zscore for plotting
    xh_ld_z = zscore(xh_ld, axis=0)

    # coloring scheme
    mycolor = gen_distinct_color(len(y1))
    y_to_c = dict(zip([y[0] for y in y1], mycolor))  # y to color dictionary, based on top level
    ch = [y_to_c[i] for i in [y[0] for y in yh]]     # get color for every data point

    embed_results = {'xh_ld': xh_ld, 'xh_ld_z': xh_ld_z, 'xh': xh, 'yh': yh,
                     'h': h, 'ch': ch, 'y1': y1}
    return embed_results
def LDAclassification(train_set, test_set, train_ann, test_ann):
    global path, train_perc
    # Create z-scored data (note: currently unused; the classifier below is fit on the raw data)
    normalized_train = zscore(train_set)
    classifier = lda.LDA('lsqr')
    # train the LDA classifier
    classifier.fit(train_set, train_ann)
    # store the trained classifier
    if train_perc < 1.0:
        pickle.dump(classifier, open(path + "LDA_classifier.p", "wb+"))
    results = classifier.predict(test_set)
    # res2 = classifier.predict()
    cm = confusion_matrix(test_ann, results)
    print('CONFUSION MATRIX = {}'.format(cm))
    return metrics(cm)
def perform_regression(self, regressor_list):
    regressor_list = zscore(regressor_list, axis=1)
    regression_results = RegressionModel.load(regressor_list, "linear").fit(self.data)
    b = regression_results.select("betas").pack()
    rsq = regression_results.select("stats").pack()
    return regression_results, b, rsq
def data_zcore_norm(data):
    '''z-score normalise the ['High', 'Open', 'Low', 'Close', 'Volume'] columns of the stock data'''
    data_copy = pd.DataFrame.copy(data, deep=True)
    for col in ['High', 'Open', 'Low', 'Close', 'Volume']:
        data_copy[col] = zscore(data[col])
    return data_copy
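
# A minimal usage sketch for data_zcore_norm (illustrative only; assumes pandas is
# imported as pd and zscore as in the function above; the toy OHLCV frame is a stand-in).
# prices = pd.DataFrame({'High': [11.0, 12.5, 13.0], 'Open': [10.0, 11.5, 12.0],
#                        'Low': [9.5, 11.0, 11.8], 'Close': [11.0, 12.0, 12.5],
#                        'Volume': [1000, 1500, 900]})
# normalized = data_zcore_norm(prices)   # the original frame is left untouched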
def forward_model(folder_out, folder_audio, model, feature_extractor):
    if not os.path.exists(os.path.join(folder_out, "clusters")):
        os.makedirs(os.path.join(folder_out, "clusters"))

    LOGGER.info("Saving results")
    for root, _, files in os.walk(folder_audio):
        LOGGER.info("Saving in: " + folder_out)
        for file in files:
            path_to_file = os.path.join(root, file)
            try:
                features_file = feature_extractor.get_feature_from_file(path_to_file)
            except Exception as exception:
                LOGGER.warning("There is a problem with: " + path_to_file)
                LOGGER.warning(exception)
                continue

            # normalize the features
            features = mstats.zscore(features_file, axis=1, ddof=1)
            features = np.transpose(features)
            clusters = model.predic_clusters(features)

            # save results
            file_out, _ = os.path.splitext(file)
            path_out_forwarded = os.path.join(folder_out, "clusters/" + root.replace(folder_audio, ''))
            if not os.path.exists(path_out_forwarded):
                os.makedirs(path_out_forwarded)
            path_out_forwarded = os.path.join(path_out_forwarded, file_out + ".txt")
            np.savetxt(path_out_forwarded, clusters, delimiter=" ", fmt='%i',)
def resample(data, source_to_target_ratio, ZSCORE, resample_method='sinc_best', N_channels_max=128):
    ######################
    # If downsampling by an integer, just anti-alias and subsample??
    ######################

    # 128 is the max for the underlying library
    N_channels_max = min(N_channels_max, 128)
    N_channels = data.shape[1]
    data_mat = None
    for i0 in np.arange(0, N_channels, N_channels_max):
        iF = np.min((i0 + N_channels_max, N_channels))
        resampler = samplerate.Resampler(resample_method, channels=iF - i0)
        data_chunk = resampler.process(data[:, i0:iF], 1 / source_to_target_ratio, end_of_input=True)
        data_mat = (data_chunk if data_mat is None else np.concatenate((data_mat, data_chunk), axis=1))

    if ZSCORE:
        data_mat = zscore(data_mat)
    return data_mat
def test_grad_fourier(x, in_channels, filter_sz, n_filters, t, X):
    sz2 = filter_sz ** 2
    x_in = copy.deepcopy(x)
    x_shape = x.shape
    x = np.float32(x.reshape((in_channels * (filter_sz ** 2), n_filters)))
    x = zscore(x, axis=0)
    x = x.reshape(x_shape)

    t_start = time.time()

    ################ fourier
    grad_f = np.zeros((in_channels, sz2, sz2, n_filters))
    Xx_sum = np.zeros(sz2)
    l = 0
    for channel in range(in_channels):
        for filter in range(n_filters):
            x = x_in.reshape((in_channels, sz2, n_filters))[channel][:, filter]
            Xx = np.dot(X, x)
            Xx_sum += Xx
            l += np.abs(Xx)
            sign_mat = np.ones_like(Xx) - 2 * (Xx < 0)
            grad_f[channel][:, :, filter] = X * sign_mat[:, np.newaxis]
    sign_mat2 = np.ones(sz2) - 2 * (t > l)
    grad_f = (grad_f * sign_mat2[np.newaxis][:, :, np.newaxis, np.newaxis]).sum(1).ravel()
    fourier_loss = np.sum(np.abs(t - l))

    #########
    grad = grad_f
    loss = fourier_loss
    # print loss, fourier_loss, np.max(x_in)
    return np.double(loss), np.double(grad)
def zscore_signals(signalArray):
    signalArray_zscore = np.zeros_like(signalArray)
    signalArray_zscore = mstats.zscore(signalArray)
    return signalArray_zscore
def _preprocess_data(self):
    """ process the raw data according to epoch info

    This is done in rank 0 which has the raw_data read in.
    Average the activity within epochs and z-score within subject.
    Write the results to self.processed_data,
    which is a 4D array of averaged epoch by epoch processed data.
    Also write the labels to self.label as a 1D numpy array.
    """
    logger.info("mask size: %d" % np.sum(self.mask))
    num_epochs = len(self.epoch_info)
    (d1, d2, d3, _) = self.raw_data[0].shape
    self.processed_data_ = np.empty([d1, d2, d3, num_epochs])
    self.labels_ = np.empty(num_epochs)
    subject_count = [0]  # counting the epochs per subject for z-scoring
    cur_sid = -1
    # averaging
    for idx, epoch in enumerate(self.epoch_info):
        self.labels_[idx] = epoch[0]
        if cur_sid != epoch[1]:
            subject_count.append(0)
            cur_sid = epoch[1]
        subject_count[-1] += 1
        self.processed_data_[:, :, :, idx] = \
            np.mean(self.raw_data[cur_sid][:, :, :, epoch[2]:epoch[3]], axis=3)
    # z-scoring
    cur_epoch = 0
    for i in subject_count:
        if i > 1:
            self.processed_data_[:, :, :, cur_epoch:cur_epoch + i] = zscore(
                self.processed_data_[:, :, :, cur_epoch:cur_epoch + i], axis=3, ddof=0)
        cur_epoch += i
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    self.processed_data_ = np.nan_to_num(self.processed_data_)
def remove_outliers(self, points_sr, thresh=2.5, window_length=5, polyorder=3, tz='Asia/Shanghai'):
    """
    Description: remove outliers by savgol_filter

    Parameters:
        points_sr: pandas.Series
        thresh: float
        window_length: int, odd number
        polyorder: int
        tz: str

    Returns:
        pandas.Series
    """
    points = points_sr.values
    points_filtered = savgol_filter(points, window_length, polyorder, mode='nearest')
    points_zscored = zscore(points - points_filtered)
    for i, score in enumerate(points_zscored):
        if abs(score) > thresh:
            points_sr[i] = np.nan
    return points_sr
def tra_linear_regression(self):
    tree_prepared = self.convert_dataframe_to_ndarray()
    tree_labeled = self.outcome()
    # tree_prepared = sm.add_constant(tree_prepared)
    model = sm.OLS(tree_labeled, tree_prepared).fit()
    z_model = sm.OLS(zscore(tree_labeled), zscore(tree_prepared)).fit()
    print(model.summary())
    with open('testt_PL.txt', 'wt') as f:
        print(z_model.summary(), file=f)
    tree_predict = model.predict(tree_prepared)
    z_tree_predict = z_model.predict(zscore(tree_prepared))
    res = tree_labeled - tree_predict
    # return zscore(tree_labeled), z_tree_predict
    return tree_labeled, tree_predict, res
def check_cluster(cluster):
    n = len(cluster)
    if n < 2:
        return True, []
    # Run k_means on two centers
    children, labels, _ = k_means(cluster, 2)
    # Let v = c1 - c2 be a d-dimensional vector that connects the two centers.
    # This is the direction that k-means believes to be important for clustering.
    v = children[1] - children[0]
    # Then project X onto v: x'_i = <x_i, v> / ||v||^2. X' is a 1-dimensional
    # representation of the data projected onto v.
    x_prime = [np.dot(point, v) for point in cluster]
    # Transform X' so that it has mean 0 and variance 1.
    x_prime = zscore(x_prime)
    # Let z_i = F(x'_(i)). If A^2_*(Z) is in the range of non-critical values at
    # confidence level alpha, then accept H0, keep the original center, and discard
    # {c1, c2}. Otherwise, reject H0 and keep {c1, c2} in place of the original center.
    a2, critical, sig = anderson(x_prime)
    a2 *= (1 + 4.0 / n - 25.0 / (n ** 2))
    return a2 < critical[0], children
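
# A minimal usage sketch for check_cluster (illustrative only; assumes numpy as np,
# scipy.stats.anderson and zscore imported as above, and that the accompanying
# k_means helper returns (centers, labels, inertia); the Gaussian blobs are stand-ins).
# blob = np.random.randn(200, 2)                      # a single Gaussian cluster
# keep_parent, children = check_cluster(blob)         # usually True: no split needed
# two_blobs = np.vstack([blob, blob + [8.0, 8.0]])    # clearly bimodal data
# keep_parent, children = check_cluster(two_blobs)    # usually False: keep the two children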
def apply_stcs(method='dSPM', event='LLst'):
    '''
    Normalize the individual STCs and average them across subjects.

    Parameters
    ----------
    method: string
        'dSPM' or 'MNE'.
    event: string
        the event name in the experimental conditions.
    '''
    import glob
    from scipy.signal import detrend
    from scipy.stats.mstats import zscore

    fn_list = glob.glob(subjects_dir + '/fsaverage/%s_ROIs/*/*,evtW_%s_bc-lh.stc' % (method, event))
    stcs = []
    for fname in fn_list:
        stc = mne.read_source_estimate(fname)
        # stc = stc.crop(tmin, tmax)
        cal_data = stc.data
        dt_data = detrend(cal_data, axis=-1)
        zc_data = zscore(dt_data, axis=-1)
        stc.data.setfield(zc_data, np.float32)
        stcs.append(stc)
    stcs = np.array(stcs)
    stc_avg = np.sum(stcs, axis=0) / stcs.shape[0]
    fn_avg = subjects_dir + '/fsaverage/%s_ROIs/%s' % (method, event)
    stc_avg.save(fn_avg, ftype='stc')
def get_similarity_timeserie(path, name, condition, time, **kwargs):
    TR = 1.
    for arg in kwargs:
        if arg == 'TR':
            TR = np.float(kwargs[arg])

    file_list = os.listdir(path)
    file_list = [f for f in file_list if f.find(name) != -1
                 and f.find('_' + condition) != -1
                 and f.find(time) != -1]

    total_data = []
    for f in file_list:
        print(os.path.join(path, f))
        data = np.loadtxt(os.path.join(path, f), delimiter=',')
        data = np.sqrt(data.T)
        data_z = zscore(data, axis=1)
        total_data.append(data_z)

    ts = TimeSeries(np.vstack(total_data), sampling_interval=TR)
    return ts
def filter_by_pvalue_strand_lag(ratios, pcutoff, pvalues, output, no_correction, name, singlestrand):
    """Filter DPs by strand lag and p-value"""
    if not singlestrand:
        zscore_ratios = zscore(ratios)
        ratios_pass = np.where(np.bitwise_and(zscore_ratios > -2, zscore_ratios < 2) == True, True, False)
    if not no_correction:
        pv_pass = [True] * len(pvalues)
        pvalues = map(lambda x: 10**-x, pvalues)
        _output_BED(name + '-uncor', output, pvalues, pv_pass)
        _output_narrowPeak(name + '-uncor', output, pvalues, pv_pass)
        pv_pass, pvalues = multiple_test_correction(pvalues, alpha=pcutoff)
    else:
        pv_pass = np.where(np.asarray(pvalues) >= -log10(pcutoff), True, False)

    if not singlestrand:
        filter_pass = np.bitwise_and(ratios_pass, pv_pass)
        assert len(pv_pass) == len(ratios_pass)
    else:
        filter_pass = pv_pass

    assert len(output) == len(pvalues)
    assert len(filter_pass) == len(pvalues)

    return output, pvalues, filter_pass
def test_zscore(self):
    # This is not in R, so tested by using:
    # (testcase[i] - mean(testcase, axis=0)) / sqrt(var(testcase) * 3/4)
    y = mstats.zscore(self.testcase)
    desired = ma.fix_invalid([-1.3416407864999, -0.44721359549996,
                              0.44721359549996, 1.3416407864999, np.nan])
    assert_almost_equal(desired, y, decimal=12)
def _normalize_for_correlation(data, axis):
    """normalize the data before computing correlation

    The data will be z-scored and divided by sqrt(n)
    along the assigned axis

    Parameters
    ----------
    data: 2D array

    axis: int
        specify which dimension of the data should be normalized

    Returns
    -------
    data: 2D array
        the normalized data
    """
    shape = data.shape
    data = zscore(data, axis=axis, ddof=0)
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    data = np.nan_to_num(data)
    data = data / math.sqrt(shape[axis])
    return data
def _normalize_for_correlation(data, axis, return_nans=False):
    """normalize the data before computing correlation

    The data will be z-scored and divided by sqrt(n)
    along the assigned axis

    Parameters
    ----------
    data: 2D array

    axis: int
        specify which dimension of the data should be normalized

    return_nans: bool, default: False
        If False, return zeros for NaNs; if True, return NaNs

    Returns
    -------
    data: 2D array
        the normalized data
    """
    shape = data.shape
    data = zscore(data, axis=axis, ddof=0)
    # if zscore fails (standard deviation is zero),
    # optionally set all values to be zero
    if not return_nans:
        data = np.nan_to_num(data)
    data = data / math.sqrt(shape[axis])
    return data
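
# Why the sqrt(n) division in _normalize_for_correlation: after z-scoring with ddof=0
# and scaling by 1/sqrt(n), a plain dot product along the normalized axis equals the
# Pearson correlation. A small illustrative check (assumes numpy as np; not part of
# the original code):
# a = np.random.rand(4, 10)
# b = np.random.rand(4, 10)
# an = _normalize_for_correlation(a, axis=1)
# bn = _normalize_for_correlation(b, axis=1)
# corr = an.dot(bn.T)   # corr[i, j] matches np.corrcoef(a[i], b[j])[0, 1]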
def __init__(self, data, bgdata, crop=None, fill=False, normalize=True, down_sampling=0):
    super(DataPrep, self).__init__()
    self.gap_position = None
    self.gap_length = None
    self._data = data
    self._bgdata = bgdata
    dmin = data.position.min()
    self.position = data.position.copy() - dmin
    self.counts = data.counts.copy()
    if crop is not None:
        if len(crop) != 2:
            raise ValueError(("Cropping parameter requires a sequence "
                              "of length 2"))
        self._crop(*crop)
    if normalize:
        self.counts = zscore(self.counts)
    if fill:
        self._fill_missing()
    if down_sampling != 0:
        self._down_sample(down_sampling)
    if self.gap_length is None:
        self._gap_lenght(self.position)
def zscore_mag_div_matrix(mat):
    # z-score each column, then rescale it to unit Euclidean norm
    for col in range(mat.shape[1]):
        if col % 50000 == 0:
            print(' {}% done'.format(np.round(float(col) / float(mat.shape[1]), 2) * 100))
        mat[:, col] = zscore(mat[:, col])
        mat[:, col] = mat[:, col] / np.sqrt(sum([x**2 for x in mat[:, col]]))
    return mat
def regress_subject(f, gradients, n_jobs=1):
    """
    This function estimates coefficients of a linear model that fits gradients
    to individual volumes. It will fit all gradients to each volume.

    Inputs:
        f: volume time series (V x T)
        gradients: matrix containing gradients in columns (V x Ng)

    Outputs:
        Ng x T matrix with linear regression coefficients
    """
    from sklearn.linear_model import LinearRegression
    from scipy.stats.mstats import zscore

    # Load the data
    d = nib.load(f).get_data()
    # Z-score
    d_z = zscore(d, axis=0)
    # Regress
    m = LinearRegression(fit_intercept=True, n_jobs=n_jobs)
    m.fit(d_z.T, gradients)
    return m.coef_.T
def export_scatterplots(data, field, tool_order, folder, standardization=False):
    from scipy.stats.mstats import zscore
    import seaborn as sns
    import matplotlib.pyplot as plt

    data = data[[field + "_" + t for t in tool_order] + [field + "_gold", field + "_top3"]].dropna()
    cols = data.columns
    if standardization:
        colnames = [name for name in data.columns if field + '_' in name and not '_err' in name]
        if standardization == "min-max":
            data_std = ((data[colnames] - data[colnames].min()) /
                        (data[colnames].max() - data[colnames].min())) * 2 - 1
        elif standardization == "z-score":
            data_std = data.copy()
            for col in colnames:
                data_std[col] = zscore(data[col])
        else:
            data_std = data
        if 'ID' in data.columns:
            # for normal data
            data_std['ID'] = data['ID']
        else:
            # for aggregated data, create placeholder IDs
            data_std['ID'] = [i for i in range(len(data_std))]
        data_std['%s_recessie' % field] = data['%s_recessie' % field]
        print(data_std.describe().transpose())
        data_std.to_csv(folder + '/' + field + '_' + standardization + '_for_scatterplots.csv')
    else:
        print(data.describe().transpose())
        data.to_csv(folder + '/' + field + '_' + 'unstandardized' + '_for_scatterplots.csv')
    return
def _findTriggerEnd(reference_signal, window=101, prominence=1, zscoring=True):
    """Uses z-scoring of rolling standard deviation to find the end trigger
    from the camera to synch audio and video.

    :param reference_signal: reference signal in audio data
    :type reference_signal: numpy.ndarray
    :param window: window size, defaults to 101
    :type window: int, optional
    :param prominence: prominence for peak finding, defaults to 1
    :type prominence: int, optional
    :param zscoring: enables z-scoring of data before peak finding, defaults to True
    :type zscoring: bool, optional
    :return: peak location and std signal
    :rtype: tuple(int, numpy.ndarray)
    """
    std = _rolling_std_numba(reference_signal, window)
    if zscoring:
        std = zscore(std)
    else:
        std = (std - std.min()) / (std.max() - std.min())
    peaks = find_peaks(std, prominence=prominence)[0]
    if len(peaks):
        return peaks[0], std
    else:
        return False
def test_F(F):
    sparsity = 0
    sse = 0
    batch = 9001
    for batch in range(9001, 9001 + 3):  # 9011):
        batch_data = np.load('/storage/batch128_img138_full/data_batch_' + str(batch))['data'].reshape(
            (3, 138, 138, 128)).transpose((3, 1, 2, 0))[:, 66:66 + 7, 66:66 + 7]
        for step in range(128):
            # note: the original indexed batch_data with an undefined `img`;
            # the loop variable `step` is assumed to be the intended index
            patch = batch_data[step].ravel()
            patch = zscore(patch)
            Ft = pinv(F)
            sse += np.sum((patch - np.dot(Ft, np.dot(F, patch))) ** 2)
            sparsity += np.sum(np.abs(np.dot(F, patch)))
    l, g = test_grad_transpose(F.T, 3, 7, n_out)
    lf, gf = test_grad_fourier_l1(F.T, 3, 7, n_out, X, X2)
    lc, gc = test_grad_channel_corr(F.T, 3, 7, n_out)
    ls, gs = test_grad_second_order(F.T, 3, 7, n_out, c_mat_input)
    loss = sse + lambds * sparsity + lambdt * l + lambdf * lf + lambdc * lc
    backprop_corr = pearsonr(1 - pdist(F.T, 'correlation'), backprop_rdm)[0]
    print('recon:', sse, 'sparsity:', lambdf * sparsity, 'transpose:', lambdt * l,
          'fourier:', lambdf * lf, 'loss:', loss,
          'transpose: ', np.mean(np.abs(1 - pdist(F.T, 'correlation'))), img_t)
    print('channel corr:', lambdc * lc, 'second order:', lambds * ls, 'backprop corr:', backprop_corr)
    sparsities.append(sparsity)
    sses.append(sse)
    transposes.append(l)
    fouriers.append(lf)
    losses.append(loss)
    channel_corrs.append(lc)
    second_orders.append(ls)
    backprop_corrs.append(backprop_corr)
def frequency_harmonies(x, settings):
    """
    This was used by Michael Hills for the seizure detection competition in 2014 in Kaggle.
    See https://github.com/MichaelHills/seizure-detection/raw/master/seizure-detection.pdf

    :param x: the input signal. Its size is (number of channels, samples).
    :param settings: a dictionary including the "freq_harmonies_max_freq".
    :return:
    """
    time = [0, 0, 0]
    t = timer()
    m_x = x - np.mean(x, axis=1, keepdims=True)
    x_mgn = np.log10(
        np.absolute(
            np.fft.rfft(m_x, axis=1)[:, 1:settings["freq_harmonies_max_freq"]]))
    time[0] = timer() - t

    x_zscored = mstats.zscore(x_mgn, axis=1)
    channels_correlations = fch.calc_corr(x_zscored)
    eigs = fch.calc_eigens(channels_correlations)
    time[1] = timer() - t
    time[2] = time[1]

    channels_corrs_eig_values = eigs["lambda"]
    channels_corrs_eigs_vectors = eigs["vectors"]
    results = fch.fill_results(
        ["frequency_harmonies", "lambdas", "eigen_vectors"],
        [x_mgn, channels_corrs_eig_values, channels_corrs_eigs_vectors],
        "frequency_harmonise", time, settings["is_normalised"])
    return results
def __distance(self):
    """
    Compute the average euclidean distance from members of the cluster
    taking the weighted label into account
    """
    self._data_distance = []
    for x in range(0, len(self._data)):
        cluster = self._pred_cluster[x]
        n = len(self._cluster_member[cluster])
        population = self._cluster_member[cluster]
        if n > 10:
            # cap the comparison set at a random sample of 10 members
            population = random.sample(self._cluster_member[cluster], 10)
            n = 10
        dist = 0
        # iterate over the (possibly sampled) population so the divisor n matches
        for y in population:
            vx = np.array(self._data[x] + [self._labels[x] * self._label_weight])
            vy = np.array(self._data[y] + [self._labels[y] * self._label_weight])
            d = euclidean(vx, vy)
            dist += d * d
        self._data_distance.append(dist / n)
    self._data_distance = zscore(self._data_distance)
def word_party_correlations(folder='model'):
    stopwords = codecs.open("stopwords.txt", "r", "utf-8").readlines()[5:]
    stops = map(lambda x: x.lower().strip(), stopwords)

    # using now stopwords and filtering out digits
    bow = TfidfVectorizer(min_df=2)
    datafn = folder + '/textdata/rawtext.pickle'
    data = cPickle.load(open(datafn))
    bow = bow.fit(chain.from_iterable(data.values()))

    # create numerical labels
    Y = hstack(map((lambda x: ones(len(data[data.keys()[x]])) * x), range(len(data))))

    # create data matrix
    for key in data.keys():
        data[key] = bow.transform(data[key])
    X = vstack(data.values())

    # map sentiment vector to bow space
    words = load_sentiment()
    sentiment_vec = zeros(X.shape[1])
    for key in words.keys():
        if bow.vocabulary_.has_key(key):
            sentiment_vec[bow.vocabulary_[key]] = words[key]

    # do sentiment analysis
    sentiments = X.dot(sentiment_vec)

    # compute label-BoW-tfidf-feature correlation
    lb = LabelBinarizer()
    partylabels = zscore(lb.fit_transform(Y), axis=0)

    # sentiment vs party correlation
    sentVsParty = corrcoef(partylabels.T, sentiments)[-1, :-1]
    fn = folder + '/sentiment_vs_party.json'
    for key in range(len(data.keys())):
        print("Sentiment vs Party %s: %0.2f" % (data.keys()[key], sentVsParty[key]))
    json.dump(dict(zip(data.keys(), sentVsParty)), open(fn, 'wb'))

    wordidx2word = dict(zip(bow.vocabulary_.values(), bow.vocabulary_.keys()))
    allcors = dict(zip(data.keys(), [[]] * len(data.keys())))
    # this is extremely cumbersome and slow, ...
    # but computing the correlations naively on the matrices
    # requires densifying the matrix X, which is memory intense
    for partyidx in range(len(data.keys())):
        cors_words = []
        print('Computing correlations for %s' % data.keys()[partyidx])
        for wordidx in range(X.shape[-1]):
            cors = corrcoef(X[:, wordidx].todense().flatten(), partylabels[:, partyidx])[1, 0]
            if abs(cors) > .01:
                cors_words.append((wordidx2word[wordidx], cors))
        allcors[data.keys()[partyidx]] = dict(cors_words)
    fn = folder + '/words_correlations.json'
    json.dump(dict(allcors), open(fn, 'wb'))
def zscorenormalize(values):
    return zscore(filter_one_d_array(values, 3))


# # tests and chill
# arr = np.array([0, 2, 3, 3, 4, 6, 10, 15, 97])
# arr2 = [2, 80, 6, 3]
# print (median_filter(arr, 5))
# print (filter_one_d_array(arr, 5))
# print (zscorenormalize(arr))
def create_epoch():
    row = 12
    col = 5
    mat = prng.rand(row, col).astype(np.float32)
    mat = zscore(mat, axis=0, ddof=0)
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    mat = np.nan_to_num(mat)
    mat = mat / math.sqrt(mat.shape[0])
    return mat
def kpca_cluster(data, nclusters=100, ncomponents=40, topwhat=10, zscored=False):
    '''
    Computes clustering of bag-of-words vectors of articles

    INPUT
    data        the articles (raw text) to cluster
    nclusters   number of clusters
    '''
    from sklearn.cluster import KMeans
    # filtering out some noise words
    stops = map(lambda x: x.lower().strip(), open('stopwords.txt').readlines()[6:])

    # vectorize non-stopwords
    bow = TfidfVectorizer(min_df=2, stop_words=stops)
    X = bow.fit_transform(data)

    # creating bow-index-to-word map
    idx2word = dict(zip(bow.vocabulary_.values(), bow.vocabulary_.keys()))

    # using now stopwords and filtering out digits
    print('Computing pairwise distances')
    K = pairwise_distances(X, metric='l2', n_jobs=1)
    perc = 50.0
    width = percentile(K.flatten(), perc)

    # KPCA transform bow vectors
    Xc = KernelPCA(n_components=ncomponents, kernel='rbf', gamma=width).fit_transform(X)

    if zscored:
        Xc = zscore(Xc)

    # compute clusters
    km = KMeans(n_clusters=nclusters).fit(Xc)
    Xc = km.predict(Xc)

    clusters = []
    for icluster in range(nclusters):
        nmembers = (Xc == icluster).sum()
        if True:  # nmembers < len(data) / 5.0 and nmembers > 1:
            # only group clusters big enough but not too big
            members = (Xc == icluster).nonzero()[0]
            topwordidx = array(X[members, :].sum(axis=0))[0].argsort()[-topwhat:][::-1]
            topwords = ' '.join([idx2word[wi] for wi in topwordidx])
            meanDist = triu(pairwise_distances(X[members, :], metric='l2', n_jobs=1)).sum()
            meanDist = meanDist / (len(members) + (len(members)**2 - len(members)) / 2.0)
            # print u'Cluster %d'%icluster + u' %d members'%nmembers + u' mean Distance %f'%meanDist + u'\n\t'+topwords
            clusters.append({
                'name': 'Cluster-%d' % icluster,
                'description': topwords,
                'members': list(members),
                'meanL2Distances': meanDist
            })
    return clusters
def soma_lfp(ns, ts, N, T, tau=0.002, dt=.001, norm=True):
    """Simulate LFP (1d) by convolution with an 'alpha' kernel.

    Parameters
    ----------
    ns : array-list (1d)
        Neuron codes (integers)
    ts : array-list (1d, seconds)
        Spikes times
    N : int
        Number of neurons
    T : numeric (seconds)
        Total simulation time
    tau : numeric (default: 0.002)
        The alpha estimate time constant
    dt : numeric (default: 0.001, seconds)
        Step time
    norm : bool (default: True)
        Z-score the simulated LFP before returning
    """
    spikes = to_spikes(ns, ts, T, N, dt)

    if spikes.ndim > 2:
        raise ValueError("spikes must be 1 or 2d")
    if tau < 0:
        raise ValueError("tau must be > 0")
    if dt < 0:
        raise ValueError("dt must be > 0")

    # Enforce col orientation if 1d
    if spikes.ndim == 1:
        spikes = spikes[:, np.newaxis]

    # 10 x tau (10 half lives) should be enough to span the
    # interesting parts of g, the alpha function we are
    # using to convert broadband firing to LFP
    # a technique we are borrowing from:
    #
    # http://www.ncbi.nlm.nih.gov/pubmed/20463210
    #
    # then abusing a bit (too much?).
    #
    # We want 10*tau but we have to resample to dt time first
    n_alpha_samples = ((tau * 10) / dt)
    t0 = np.linspace(0, tau * 10, n_alpha_samples)

    # Define the alpha (g notation borrowed from BV's initial code)
    gmax = 0.1
    g = gmax * (t0 / tau) * np.exp(-(t0 - tau) / tau)

    # make LFP
    spsum = spikes.astype(np.float).sum(1)
    spsum /= spsum.max()
    lfps = np.convolve(spsum, g)[0:spikes.shape[0]]

    if norm:
        lfps = zscore(lfps)
    return lfps
def prepare_mvpa_data(data_dir, extension, mask_file, epoch_file):
    """ obtain the data for activity-based model training and prediction

    Average the activity within epochs and z-score within subject.

    Parameters
    ----------
    data_dir: str
        the path to all subject files
    extension: str
        the file extension, usually nii.gz or nii
    mask_file: str
        the absolute path of the mask file,
        we apply the mask right after reading a file for saving memory
    epoch_file: str
        the absolute path of the epoch file

    Returns
    -------
    processed_data: 2D array in shape [num_voxels, num_epochs]
        averaged epoch by epoch processed data
    labels: 1D array
        contains labels of the data
    """
    activity_data = read_activity_data(data_dir, extension, mask_file)
    epoch_list = np.load(epoch_file)
    epoch_info = generate_epochs_info(epoch_list)
    num_epochs = len(epoch_info)
    (d1, _) = activity_data[0].shape
    processed_data = np.empty([d1, num_epochs])
    labels = np.empty(num_epochs)
    subject_count = [0]  # counting the epochs per subject for z-scoring
    cur_sid = -1
    # averaging
    for idx, epoch in enumerate(epoch_info):
        labels[idx] = epoch[0]
        if cur_sid != epoch[1]:
            subject_count.append(0)
            cur_sid = epoch[1]
        subject_count[-1] += 1
        processed_data[:, idx] = np.mean(activity_data[cur_sid][:, epoch[2]:epoch[3]], axis=1)
    # z-scoring
    cur_epoch = 0
    for i in subject_count:
        if i > 1:
            processed_data[:, cur_epoch:cur_epoch + i] = zscore(
                processed_data[:, cur_epoch:cur_epoch + i], axis=1, ddof=0)
        cur_epoch += i
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    processed_data = np.nan_to_num(processed_data)

    return processed_data, labels
def perform_PCA(fpkmMatrix, standardize=3, log=True):
    ## preprocessing of the fpkmMatrix
    if log:
        fpkmMatrix = np.log10(fpkmMatrix + 1.)
    if standardize == 2:  # standardize along rows/genes
        fpkmMatrix = zscore(fpkmMatrix, axis=1)
    elif standardize == 1:  # standardize along cols/samples
        fpkmMatrix = zscore(fpkmMatrix, axis=0)

    ## remove genes with NaNs
    fpkmMatrix = fpkmMatrix[~np.isnan(np.sum(fpkmMatrix, axis=1))]

    pca = PCA(n_components=None)
    ## get variance captured
    pca.fit(fpkmMatrix.T)
    variance_explained = pca.explained_variance_ratio_[0:3]
    variance_explained *= 100
    ## compute PCA and plot
    pca_transformed = pca.transform(fpkmMatrix.T)
    return variance_explained, pca_transformed
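
# A minimal usage sketch for perform_PCA (illustrative only; assumes numpy as np,
# sklearn.decomposition.PCA and zscore imported as above; the random matrix is a
# stand-in for a real genes x samples FPKM matrix).
# fpkm = np.random.rand(500, 6) * 100        # 500 genes, 6 samples
# var_explained, transformed = perform_PCA(fpkm, standardize=2, log=True)
# print(var_explained)                       # % variance captured by the first 3 PCs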
def get_bold_signals(image, mask, TR, normalize=True, ts_extraction='mean',
                     filter_par=None, roi_values=None):
    '''
    Image and mask must be in nibabel format
    '''
    mask_data = np.int_(mask.get_data())
    if roi_values is None:
        labels = np.unique(mask_data)[1:]
    else:
        labels = np.int_(roi_values)

    final_data = []
    # print labels
    for v in labels[:]:
        # print str(v)
        data = image.get_data()[mask_data == v]
        if normalize == True:
            data = zscore(data, axis=1)
            data[np.isnan(data)] = 0

        if ts_extraction == 'mean':
            # assert np.mean(data, axis=0) == data.mean(axis=0)
            data = data.mean(axis=0)
        elif ts_extraction == 'pca':
            if data.shape[0] > 0:
                data = PCA(n_components=1).fit_transform(data.T)
                data = np.squeeze(data)
            else:
                data = data.mean(axis=0)

        ts = TimeSeries(data, sampling_interval=float(TR))
        if filter_par is not None:
            upperf = filter_par['ub']
            lowerf = filter_par['lb']
            F = FilterAnalyzer(ts, ub=upperf, lb=lowerf)
            ts = TimeSeries(F.fir.data, sampling_interval=float(TR))
            del F

        final_data.append(ts.data)
        del data

    del mask_data
    del ts
    return TimeSeries(np.vstack(final_data), sampling_interval=float(TR))
def get_arc(self, word):
    '''
    Implements the neighbourhood size/density algorithm described in [1].

    The algorithm simply extends the proposal of Shaoul & Westbury (2006) by
    relativising the threshold for EACH word. That is, we simply ask if the
    potential neighbour stands closer to the target word or further away than
    the average pairing of the target with any other word.

    In our implementation of the calculation of the semantic distances we take
    A ⋅ w_i for each word (see get_neighbourhood method with topn=0), where A is
    the ∣V∣ × D or the ∣V∣ × ∣V∣ vocabulary matrix and w_i is the target word.
    Having normalised each vector in the matrix to unit length (see the _init
    method), this operation is equivalent to taking the cosine similarity
    between the word in question and all the other words in the vocabulary. The
    resulting ∣V∣-dimensional vector effectively contains the similarity values
    between the word in question and all the other words in the matrix. From
    this point it is a trivial task to obtain descriptive statistics for these
    distributions. Converting, thus, the vector of similarities into z-scores,
    we are able to keep all the scores above a predefined threshold, obtaining
    neighbourhood size and density for each word while taking into account its
    similarity to all the other words in the lexicon. The number of stdevs above
    which a word is considered a neighbour does not have to be explicitly set;
    the list_ variable reports the size and the density of the neighbourhood in
    predefined steps.

    [1] Alikaniotis D. (2014) Approximating semantic structures using
    high-dimensional lexical spaces
    '''
    list_ = np.arange(-5, 10, 1)  ## range and step of reporting
    ans_list = ARCObject(word, list_)
    most_similar = self.get_neighbourhood(word, topn=0)  ## get all
    zscores = zscore([sim for (w, sim) in most_similar])
    self.nl = zip(most_similar, zscores)
    ans_ncount = ans_arc = 0
    it = iter(self.nl)
    high_ind = -1
    high_val = list_[high_ind]
    try:
        low_ind = -2
        low_val = list_[low_ind]
    except IndexError:
        print("Please use a list that contains more than two items")
        raise
    for i, (k, v) in enumerate(it):
        if v < list_[0]:
            return ans_list
        while v < high_val and v < low_val:
            ans_list[low_val] = tuple([ans_ncount, ans_arc / i if ans_ncount != 0 else 0])
            ans_ncount = 0
            low_ind -= 1
            high_ind -= 1
            low_val, high_val = list_[low_ind], list_[high_ind]
        ans_ncount += 1
        ans_arc += k[1]
    return ans_list
def _separate_epochs(activity_data, epoch_list):
    """ create data epoch by epoch

    Separate data into epochs of interest specified in epoch_list
    and z-score them for computing correlation

    Parameters
    ----------
    activity_data: list of 2D array in shape [nVoxels, nTRs]
        the masked activity data organized in voxel*TR formats of all subjects
    epoch_list: list of 3D array in shape [condition, nEpochs, nTRs]
        specification of epochs and conditions
        assuming all subjects have the same number of epochs
        len(epoch_list) equals the number of subjects

    Returns
    -------
    raw_data: list of 2D array in shape [epoch length, nVoxels]
        the data organized in epochs
        and z-scored in preparation of correlation computation
        len(raw_data) equals the number of epochs
    labels: list of 1D array
        the condition labels of the epochs
        len(labels) equals the number of epochs
    """
    time1 = time.time()
    raw_data = []
    labels = []
    for sid in range(len(epoch_list)):
        epoch = epoch_list[sid]
        for cond in range(epoch.shape[0]):
            sub_epoch = epoch[cond, :, :]
            for eid in range(epoch.shape[1]):
                r = np.sum(sub_epoch[eid, :])
                if r > 0:   # there is an epoch in this condition
                    # mat is row-major
                    # regardless of the order of activity_data[sid]
                    mat = activity_data[sid][:, sub_epoch[eid, :] == 1]
                    mat = np.ascontiguousarray(mat.T)
                    mat = zscore(mat, axis=0, ddof=0)
                    # if zscore fails (standard deviation is zero),
                    # set all values to be zero
                    mat = np.nan_to_num(mat)
                    mat = mat / math.sqrt(r)
                    raw_data.append(mat)
                    labels.append(cond)
    time2 = time.time()
    logger.debug(
        'epoch separation done, takes %.2f s' %
        (time2 - time1)
    )
    return raw_data, labels
def prepare_mvpa_data(images, conditions, mask):
    """Prepare data for activity-based model training and prediction.

    Average the activity within epochs and z-score within subject.

    Parameters
    ----------
    images: Iterable[SpatialImage]
        Data.
    conditions: List[UniqueLabelConditionSpec]
        Condition specification.
    mask: np.ndarray
        Mask to apply to each image.

    Returns
    -------
    processed_data: 2D array in shape [num_voxels, num_epochs]
        averaged epoch by epoch processed data
    labels: 1D array
        contains labels of the data
    """
    activity_data = list(mask_images(images, mask, np.float32))
    epoch_info = generate_epochs_info(conditions)
    num_epochs = len(epoch_info)
    (d1, _) = activity_data[0].shape
    processed_data = np.empty([d1, num_epochs])
    labels = np.empty(num_epochs)
    subject_count = [0]  # counting the epochs per subject for z-scoring
    cur_sid = -1
    # averaging
    for idx, epoch in enumerate(epoch_info):
        labels[idx] = epoch[0]
        if cur_sid != epoch[1]:
            subject_count.append(0)
            cur_sid = epoch[1]
        subject_count[-1] += 1
        processed_data[:, idx] = \
            np.mean(activity_data[cur_sid][:, epoch[2]:epoch[3]], axis=1)
    # z-scoring
    cur_epoch = 0
    for i in subject_count:
        if i > 1:
            processed_data[:, cur_epoch:cur_epoch + i] = \
                zscore(processed_data[:, cur_epoch:cur_epoch + i], axis=1, ddof=0)
        cur_epoch += i
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    processed_data = np.nan_to_num(processed_data)
    return processed_data, labels
def create_epoch(idx, num_voxels):
    row = 12
    col = num_voxels
    mat = prng.rand(row, col).astype(np.float32)
    # impose a pattern to even epochs
    if idx % 2 == 0:
        mat = np.sort(mat, axis=0)
    mat = zscore(mat, axis=0, ddof=0)
    # if zscore fails (standard deviation is zero),
    # set all values to be zero
    mat = np.nan_to_num(mat)
    mat = mat / math.sqrt(mat.shape[0])
    return mat