def _spearman(a, b):
    """Return the Spearman rank-correlation coefficient between a and b.

    Only the first element of the ``spearmanr`` result (the coefficient)
    is returned; the p-value is dropped.
    """
    rho, _pvalue = spearmanr(a, b)
    return rho
def test_ranker(output, client, listen_port, group):
    """Compare a Dask-trained LightGBM ranker against a locally trained one.

    Trains ``lgb.DaskLGBMRanker`` on the distributed collections and
    ``lgb.LGBMRanker`` on the matching local data with the same ``params``,
    then asserts that:

    * the distributed scores rank-correlate with the labels (> 0.6) and with
      the local model's scores (> 0.8);
    * the Dask model and its ``to_local()`` copy produce equal predictions;
    * ``pred_leaf=True`` output has one column per tree and plausible
      leaf indices;
    * for ``'dataframe-with-categorical'`` input, at least one split uses a
      categorical column with decision type ``'=='``.

    ``output``, ``client``, ``listen_port`` and ``group`` are supplied by the
    test harness (their definitions are not visible in this block).
    """
    if output == 'dataframe-with-categorical':
        # single informative feature for the categorical case
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=group,
            n_features=1,
            n_informative=1
        )
    else:
        X, y, w, g, dX, dy, dw, dg = _create_ranking_data(
            output=output,
            group=group,
        )

    # rebalance small dask.Array dataset for better performance.
    if output == 'array':
        dX = dX.persist()
        dy = dy.persist()
        dw = dw.persist()
        dg = dg.persist()
        # block until the persisted collections are ready before rebalancing
        _ = wait([dX, dy, dw, dg])
        client.rebalance()

    # use many trees + leaves to overfit, help ensure that Dask data-parallel strategy matches that of
    # serial learner. See https://github.com/microsoft/LightGBM/issues/3292#issuecomment-671288210.
    params = {
        "random_state": 42,
        "n_estimators": 50,
        "num_leaves": 20,
        "min_child_samples": 1
    }
    dask_ranker = lgb.DaskLGBMRanker(
        client=client,
        time_out=5,
        local_listen_port=listen_port,
        tree_learner_type='data_parallel',
        **params
    )
    dask_ranker = dask_ranker.fit(dX, dy, sample_weight=dw, group=dg)
    rnkvec_dask = dask_ranker.predict(dX)
    rnkvec_dask = rnkvec_dask.compute()
    # kept lazy here; computed further down when the leaf indices are checked
    p1_pred_leaf = dask_ranker.predict(dX, pred_leaf=True)
    rnkvec_dask_local = dask_ranker.to_local().predict(X)

    # reference model: same params, trained on the local (non-Dask) data
    local_ranker = lgb.LGBMRanker(**params)
    local_ranker.fit(X, y, sample_weight=w, group=g)
    rnkvec_local = local_ranker.predict(X)

    # distributed ranker should be able to rank decently well and should
    # have high rank correlation with scores from serial ranker.
    dcor = spearmanr(rnkvec_dask, y).correlation
    assert dcor > 0.6
    assert spearmanr(rnkvec_dask, rnkvec_local).correlation > 0.8
    assert_eq(rnkvec_dask, rnkvec_dask_local)

    # pred_leaf values should have the right shape
    # and values that look like valid tree nodes
    pred_leaf_vals = p1_pred_leaf.compute()
    assert pred_leaf_vals.shape == (
        X.shape[0],
        dask_ranker.booster_.num_trees()
    )
    assert np.max(pred_leaf_vals) <= params['num_leaves']
    assert np.min(pred_leaf_vals) >= 0
    assert len(np.unique(pred_leaf_vals)) <= params['num_leaves']

    # be sure LightGBM actually used at least one categorical column,
    # and that it was correctly treated as a categorical feature
    if output == 'dataframe-with-categorical':
        cat_cols = [
            col for col in dX.columns
            if dX.dtypes[col].name == 'category'
        ]
        tree_df = dask_ranker.booster_.trees_to_dataframe()
        node_uses_cat_col = tree_df['split_feature'].isin(cat_cols)
        assert node_uses_cat_col.sum() > 0
        assert tree_df.loc[node_uses_cat_col, "decision_type"].unique()[0] == '=='

    client.close(timeout=CLIENT_CLOSE_TIMEOUT)
def spearmanr(a, b):
    """Return only the coefficient from ``stats.spearmanr(a, b)``.

    The p-value (second element of the result) is discarded.
    """
    result = stats.spearmanr(a, b)
    coefficient = result[0]
    return coefficient
def main(data_dir, model_file, out_format, word_analogy, word_similarity, entity_similarity, lowercase,
         batch_size, vocab_size):
    """Evaluate a Wikipedia2Vec model on benchmark files and print the results.

    Depending on the boolean flags, up to three evaluations are run; each
    appends ``(dataset_name, score, oov_count)`` tuples to ``results``:

    * ``word_similarity``: for every ``.txt`` file in
      ``<data_dir>/word/similarity``, Spearman correlation between gold
      scores and ``1 - cosine`` of the two word vectors.
    * ``word_analogy``: for every file in ``<data_dir>/word/analogy``,
      fraction of analogy lines whose predicted fourth word matches.
    * ``entity_similarity``: for every file in
      ``<data_dir>/entity/similarity``; ``KORE.txt`` has its own format,
      other files are read as tab-separated rows with a header line.

    ``out_format`` selects ``'text'`` or ``'csv'`` printing; any other value
    prints nothing. ``lowercase`` lower-cases words before lookup.
    ``batch_size`` is the number of analogy queries scored per matrix
    product. ``vocab_size`` is not used in this body.
    """
    model = Wikipedia2Vec.load(model_file)

    results = []

    if word_similarity:
        base_dir = os.path.join(os.path.join(data_dir, 'word'), 'similarity')
        for filename in os.listdir(base_dir):
            if not filename.endswith('.txt'):
                continue

            oov_count = 0
            with open(os.path.join(base_dir, filename)) as f:
                gold = []
                estimated = []
                for line in f:
                    # each line: word1 word2 gold_score (whitespace separated)
                    (w1, w2, val) = line.split()
                    val = float(val)
                    if lowercase:
                        (w1, w2) = (w1.lower(), w2.lower())
                    # a pair is skipped (and counted as OOV) if either word is missing
                    try:
                        v1 = model.get_word_vector(w1)
                    except KeyError:
                        oov_count += 1
                        continue
                    try:
                        v2 = model.get_word_vector(w2)
                    except KeyError:
                        oov_count += 1
                        continue

                    gold.append(val)
                    estimated.append(1.0 - cosine(v1, v2))

                # filename[:-4] strips the '.txt' suffix
                results.append((filename[:-4], spearmanr(gold, estimated)[0], oov_count))

    if word_analogy:
        base_dir = os.path.join(os.path.join(data_dir, 'word'), 'analogy')
        for filename in os.listdir(base_dir):
            with open(os.path.join(base_dir, filename)) as f:
                (A_ind, B_ind, C_ind, D_ind) = ([], [], [], [])
                oov_count = 0
                for (n, line) in enumerate(f):
                    # lines starting with ':' are skipped (not analogy rows)
                    if not line.startswith(':'):
                        if lowercase:
                            words = list(map(model.get_word, line.lower().split()))
                        else:
                            words = list(map(model.get_word, line.split()))
                        # get_word yields None for unknown words; drop the whole row then
                        if not all(w is not None for w in words):
                            oov_count += 1
                            continue

                        (a_ind, b_ind, c_ind, d_ind) = map(lambda o: o.index, words)
                        A_ind.append(a_ind)
                        B_ind.append(b_ind)
                        C_ind.append(c_ind)
                        D_ind.append(d_ind)

                # rows of syn0 before entity_offset are used as the word embeddings;
                # each row is L2-normalised
                offset = model.dictionary.entity_offset
                word_emb = model.syn0[:offset] / np.linalg.norm(model.syn0[:offset], 2, axis=1, keepdims=True)

                # query vector for "A is to B as C is to ?" is B - A + C
                (A, B, C) = (word_emb[A_ind], word_emb[B_ind], word_emb[C_ind])
                D = (B - A + C)
                del A, B, C

                predictions = []

                for i in trange(0, D.shape[0], batch_size, desc=filename[:-4]):
                    D_batch = D[i:i+batch_size]
                    dot_ret = np.dot(word_emb, D_batch.T)
                    # the three query words themselves must not be returned as the answer
                    for (j, indices) in enumerate(zip(A_ind[i:i+batch_size], B_ind[i:i+batch_size],
                                                      C_ind[i:i+batch_size])):
                        dot_ret[indices, j] = float('-inf')
                    predictions.append(np.argmax(dot_ret, 0))

                # score is the mean of exact matches against the expected word index
                results.append((filename[:-4], np.mean(np.hstack(predictions) == D_ind), oov_count))

    if entity_similarity:
        # invert KORE_CATEGORIES (category -> entities) into entity -> category
        category_mapping = {e: c for (c, l) in KORE_CATEGORIES.items() for e in l}

        base_dir = os.path.join(os.path.join(data_dir, 'entity'), 'similarity')
        for filename in os.listdir(base_dir):
            with open(os.path.join(base_dir, filename)) as f:
                if filename == 'KORE.txt':
                    # non-indented line = query title; tab-indented lines = its candidates
                    data = defaultdict(list)
                    title = None
                    for line in f:
                        line = line.rstrip()
                        if line.startswith('\t'):
                            data[title].append(line[1:])
                        else:
                            title = line

                    kore_results = defaultdict(list)
                    oov_count = 0
                    for (title, title_list) in data.items():
                        try:
                            v1 = model.get_entity_vector(title)
                        except KeyError:
                            # missing query entity: all of its candidates count as OOV
                            oov_count += len(title_list)
                            continue

                        estimated = []
                        for title2 in title_list:
                            try:
                                v2 = model.get_entity_vector(title2)
                            except KeyError:
                                oov_count += 1
                                continue
                            estimated.append(1.0 - cosine(v1, v2))

                        # gold ranking is the file order: earlier candidates get higher scores
                        gold = list(reversed(range(len(estimated))))
                        kore_results[category_mapping[title]].append(spearmanr(gold, estimated)[0])

                    # mean over all per-query correlations, pooled across categories
                    results.append((filename[:-4], np.mean(list(chain(*kore_results.values()))), oov_count))

                else:
                    gold = []
                    estimated = []
                    oov_count = 0
                    for (n, line) in enumerate(f):
                        # first line is skipped
                        if n == 0:
                            continue
                        line = line.rstrip()
                        # seven tab-separated fields; only the two titles and the score are used
                        (_, _, title1, _, _, title2, score) = line.split('\t')

                        try:
                            v1 = model.get_entity_vector(title1.replace('_', ' '))
                        except KeyError:
                            oov_count += 1
                            continue
                        try:
                            v2 = model.get_entity_vector(title2.replace('_', ' '))
                        except KeyError:
                            oov_count += 1
                            continue

                        gold.append(float(score))
                        estimated.append(1.0 - cosine(v1, v2))

                    results.append((filename[:-4], spearmanr(gold, estimated)[0], oov_count))

    if out_format == 'text':
        # NOTE(review): the label says "Spearman score" for every entry,
        # including the analogy results, whose score is a match rate.
        for (name, score, oov_count) in results:
            print('%s: ' % name)
            print('  Spearman score: %.4f' % score)
            print('  OOV instances: %d' % oov_count)

    elif out_format == 'csv':
        # three rows (name / score / oov), one column per dataset
        print('name,' + ','.join([o[0] for o in results]))
        print('score,' + ','.join(['%.4f' % o[1] for o in results]))
        print('oov,' + ','.join(['%d' % o[2] for o in results]))
# 독립 샘플에 대한 평균 비교 two independent samples of scores dt = np.array([ 24, 43, 58, 71, 43, 49, 61, 44, 67, 49, 53, 56, 59, 52, 62, 54, 57, 33, 46, 43, 57 ]) dc = np.array([ 42, 43, 55, 26, 62, 37, 33, 41, 19, 54, 20, 85, 46, 10, 17, 60, 53, 42, 37, 42, 55, 28, 48 ]) print(stats.ttest_ind(dt, dc)) # pvalue=0.02 print(stats.ttest_ind(dt, dc, equal_var=False)) #등분산성이 아닌 경우 print(stats.jarque_bera(dt)) #정규분포와 일치하는가 print(stats.jarque_bera(dc)) print(stats.pearsonr(dt, dc)) #피어슨 상관계수 print(stats.spearmanr(dt, dc)) print(stat.kendalltau(dt, dc)) import matplotlib.pyplot as plt read_file = pd.read_csv('play_13_14_top30.csv', skiprows=1) read_file.describe() read_file.head() a = read_file.describe() a.boxplot() plt.show() re_file = read_file.rename( columns={ 'P': 'points',
from scipy import stats
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
import matplotlib.pyplot as plt

# Load the headerless, tab-separated training and test tables.
trainingDataSet = pd.read_csv("A3_training_dataset.tsv", delimiter="\t", header=None)
testDataSet = pd.read_csv("A3_test_dataset.tsv", delimiter="\t", header=None)

# The last column holds the class label; everything before it is a feature.
classLabel = trainingDataSet.iloc[:, -1]
trainingCorrData = trainingDataSet.iloc[:, :-1]

# Pairwise Spearman correlation between all feature columns.
correlation, pValue = stats.spearmanr(trainingCorrData)

# Drop feature j when any earlier feature i < j has correlation[i, j] > 0.2.
# The strict upper triangle (k=1) holds exactly the pairs with i < j.
n_features = correlation.shape[0]
too_correlated = np.triu(correlation > 0.2, k=1).any(axis=0)
columns = ~too_correlated

columns_Selected = trainingCorrData.columns[columns]
print("Columns selected:", len(columns_Selected))

# Rebuild the training frame from the surviving features and append the
# class label as the last column.
trainingData = pd.DataFrame(trainingCorrData[columns_Selected])
trainingData.insert(loc=len(columns_Selected), column="class", value=classLabel)
def _rank(x, y):
    """Return the Spearman rank-correlation coefficient of x and y.

    Reads the ``correlation`` attribute of the ``spearmanr`` result.
    """
    result = spearmanr(x, y)
    return result.correlation
def correlation(a, b, method=EnumMethod.kendall):
    '''(list|ndarray, list|ndarray, enumeration) -> dict

    Compute a correlation between a and b and return
    {'teststat': teststat, 'p': pval}.

    method:
        Member of EnumMethod selecting kendall, pearson or spearman.

    a and b must both be numpy arrays (of identical shape) or both be
    lists (of identical length); anything else raises ValueError.

    For arrays, NaNs that both inputs contain are removed through
    _arraylib.np_delete_paired_nans_flattened before the test runs.
    Any NaN still present afterwards raises ValueError.
    '''
    a_is_array = isinstance(a, _np.ndarray)
    b_is_array = isinstance(b, _np.ndarray)

    if a_is_array or b_is_array:
        if not (a_is_array and b_is_array):
            raise ValueError(
                'If numpy arrays are used, both must be ndarray types')

        if a.shape != b.shape:
            raise ValueError('Numpy array shapes must match exactly')

        # The pairwise NaN removal only runs when both arrays contain NaNs.
        both_have_nans = (_arraylib.np_contains_nan(a)
                          and _arraylib.np_contains_nan(b))
        if both_have_nans:
            dic = _arraylib.np_delete_paired_nans_flattened(a, b)
        else:
            dic = {'a': a, 'b': b}

        # Whatever NaNs are left are unmatched (a NaN paired with a real
        # value); that is treated as an error rather than silently dropped.
        if _arraylib.np_contains_nan(dic['a']):
            raise ValueError('Numpy array a contains NaNs')
        if _arraylib.np_contains_nan(dic['b']):
            raise ValueError('Numpy array b contains NaNs')

        lst_a = dic['a'].flatten().tolist()
        lst_b = dic['b'].flatten().tolist()
    else:
        if not (isinstance(a, list) and isinstance(b, list)):
            raise ValueError('If lists are used, both must be list types')
        # work on copies so the caller's lists are never touched
        lst_a = copy.deepcopy(a)
        lst_b = copy.deepcopy(b)

    if len(lst_a) != len(lst_b):
        raise ValueError('Array lengths must match exactly')

    assert isinstance(lst_a, list)
    assert isinstance(lst_b, list)

    # Dispatch on the requested method; the argument-less case() is the
    # fall-through for values that match no known member.
    for case in _baselib.switch(method):
        if case(EnumMethod.kendall):
            teststat, pval = _stats.kendalltau(lst_a, lst_b)
            break
        if case(EnumMethod.pearson):
            teststat, pval = _stats.pearsonr(lst_a, lst_b)
            break
        if case(EnumMethod.spearman):
            teststat, pval = _stats.spearmanr(lst_a, lst_b)
            break
        if case():
            raise ValueError('Enumeration member not in e_method')

    return {'teststat': teststat, 'p': pval}
def SFC_by_tissue_seg(structure_file_path, function_file_path, electrode_localization_by_atlas_file_path,
                      electrode_localization_by_classification_atlas_file_path, outputfile):
    """Compute structure-function correlation separately for two electrode groups.

    Electrodes are split by the value merged in from the classification
    atlas file (== 0 versus > 0). For each group, the functional
    connectivity (FC) matrices are averaged per atlas region and correlated
    (Spearman) with the region-by-region structural connectivity at each
    time index. One pickle is written per group:
    ``outputfile + '_inside_correlation.pickle'`` and
    ``outputfile + '_outside_correlation.pickle'``.

    :param structure_file_path: .mat file read with ``loadmat``; its
        ``'connectivity'`` entry is the structural matrix.
    :param function_file_path: pickle holding, in order, five FC arrays
        (broadband, alphatheta, beta, lowgamma, highgamma), the electrode
        names and the matrix-order table. The FC arrays are indexed as
        (electrode, electrode, time) below.
    :param electrode_localization_by_atlas_file_path: CSV whose columns 0 and
        4 are used; they must be named ``electrode_name`` and
        ``region_number``.
    :param electrode_localization_by_classification_atlas_file_path: CSV whose
        columns 0 and 4 are used; column 0 must be ``electrode_name``.
    :param outputfile: path prefix for the two output pickles.
    :return: None; results are written to disk.
    """
    #Get functional connectivity data in pickle file format
    # NOTE(review): pickle.load executes arbitrary code from the file; only use with trusted inputs.
    with open(function_file_path, 'rb') as f:
        broadband, alphatheta, beta, lowgamma, highgamma, electrode_row_and_column_names, order_of_matrices_in_pickle_file = pickle.load(
            f)
    FC_list_global = [broadband, alphatheta, beta, lowgamma, highgamma]

    # set up the dataframe of electrodes to analyze
    # func_index records each electrode's row/column position in the FC matrices
    final_electrodes = pd.DataFrame(electrode_row_and_column_names,
                                    columns=['electrode_name'])
    final_electrodes = final_electrodes.reset_index()
    final_electrodes = final_electrodes.rename(columns={"index": "func_index"})

    #Get Structural Connectivity data in mat file format. Output from DSI studio
    structural_connectivity_array_global = np.array(
        pd.DataFrame(loadmat(structure_file_path)['connectivity']))

    #Get electrode localization by atlas csv file data. From get_electrode_localization.py
    electrode_localization_by_atlas = pd.read_csv(
        electrode_localization_by_atlas_file_path)

    # Get electrode localization by classification atlas
    electrode_localization_by_class_atlas = pd.read_csv(
        electrode_localization_by_classification_atlas_file_path)

    # normalizing and log-scaling the structural matrices
    structural_connectivity_array_global[structural_connectivity_array_global == 0] = 1
    structural_connectivity_array_global = np.log10(
        structural_connectivity_array_global
    )  # log-scaling. Converting 0s to 1 to avoid taking log of zeros
    structural_connectivity_array_global = structural_connectivity_array_global / np.max(
        structural_connectivity_array_global)  # normalization

    #Only consider electrodes that are in both the localization and the pickle file
    final_electrodes = final_electrodes.merge(
        electrode_localization_by_atlas.iloc[:, [0, 4]], on='electrode_name')
    # Remove electrodes in the Functional Connectivity matrices that have a region of 0
    final_electrodes = final_electrodes[final_electrodes['region_number'] != 0]
    # now join in the classification region number
    # after this merge: column 2 = atlas region, column 3 = classification value
    final_electrodes = final_electrodes.merge(
        electrode_localization_by_class_atlas.iloc[:, [0, 4]],
        on='electrode_name')

    # perm 0 and perm 1 are the two electrode groups; each gets its own output file
    for perm in range(0, 2):
        # fresh copies per group: the list entries and the structural array are reassigned below
        FC_list = FC_list_global.copy()
        structural_connectivity_array = structural_connectivity_array_global.copy(
        )
        if (perm == 0):
            #we will first compute electrodes that are inside the classification atlas
            # grey matter
            # NOTE(review): the selection is classification value == 0 while the
            # comment says "inside" — confirm the label matches the condition.
            final_electrodes_cur = final_electrodes[final_electrodes.iloc[:, 3]
                                                    == 0]
            # adjust the output dir
            outputfile_adj = outputfile + '_inside_correlation.pickle'
        else:
            #we will next compute electrodes that are outside the classification atlas
            # white matter
            final_electrodes_cur = final_electrodes[
                final_electrodes.iloc[:, 3] > 0]
            # adjust the output dir
            outputfile_adj = outputfile + '_outside_correlation.pickle'

        # keep only the rows and columns of the selected electrodes
        for i in range(len(FC_list)):
            FC_list[i] = FC_list[i][final_electrodes_cur['func_index'], :, :]
            FC_list[i] = FC_list[i][:, final_electrodes_cur['func_index'], :]

        #Fisher z-transform of functional connectivity data. This is to take means of correlations and do correlations to the structural connectivity
        #Fisher z transform is just arctanh
        for i in range(len(FC_list)):
            FC_list[i] = np.arctanh(FC_list[i])

        # Remove structural ROIs not in electrode_localization ROIs
        electrode_ROIs = np.unique(np.array(final_electrodes_cur.iloc[:, 2]))
        electrode_ROIs = electrode_ROIs[~(electrode_ROIs == 0)]  #remove region 0
        structural_index = electrode_ROIs - 1  #subtract 1 because of python's zero indexing
        structural_connectivity_array = structural_connectivity_array[
            structural_index, :]
        structural_connectivity_array = structural_connectivity_array[:,
                                                                      structural_index]

        #taking average functional connectivity for those electrodes in same atlas regions
        for i in range(len(FC_list)):
            ROIs = np.array(final_electrodes_cur.iloc[:, 2])
            for r in range(len(electrode_ROIs)):
                index_logical = (ROIs == electrode_ROIs[r])
                index_first = np.where(index_logical)[0][0]
                index_second_to_end = np.where(index_logical)[0][1:]
                mean = np.mean(FC_list[i][index_logical, :, :], axis=0)
                # Fill in with mean.
                # the region's first electrode becomes the region's representative row/column
                FC_list[i][index_first, :, :] = mean
                FC_list[i][:, index_first, :] = mean
                #delete the other rows and columns belonging to same region.
                FC_list[i] = np.delete(FC_list[i], index_second_to_end, axis=0)
                FC_list[i] = np.delete(FC_list[i], index_second_to_end, axis=1)
                #keeping track of which electrode labels correspond to which rows and columns
                ROIs = np.delete(ROIs, index_second_to_end, axis=0)
            #remove electrodes in the ROI labeled as zero
            index_logical = (ROIs == 0)
            index = np.where(index_logical)[0]
            FC_list[i] = np.delete(FC_list[i], index, axis=0)
            FC_list[i] = np.delete(FC_list[i], index, axis=1)
            ROIs = np.delete(ROIs, index, axis=0)

        #order FC matrices by ROIs
        # ROIs here is the array left over from the last matrix processed above
        order = np.argsort(ROIs)
        for i in range(len(FC_list)):
            FC_list[i] = FC_list[i][order, :, :]
            FC_list[i] = FC_list[i][:, order, :]

        #un-fisher z-transform
        for i in range(len(FC_list)):
            FC_list[i] = np.tanh(FC_list[i])

        #initialize correlation arrays
        Corrrelation_list = [None] * len(FC_list)
        for i in range(len(FC_list)):
            Corrrelation_list[i] = np.zeros([FC_list[0].shape[2]], dtype=float)

        correlation_type = 'spearman'
        #calculate Structure-Function Correlation.
        for i in range(len(FC_list)):
            # NOTE(review): the loop stops at shape[2] - 1, so the last entry of each
            # correlation array keeps its initial 0 — confirm this is intended.
            for t in range(FC_list[i].shape[2] - 1):
                #Spearman Rank Correlation: functional connectivity and structural connectivity are non-normally distributed. So we should use spearman
                if correlation_type == 'spearman':
                    Corrrelation_list[i][t] = spearmanr(
                        np.ndarray.flatten(FC_list[i][:, :, t]),
                        np.ndarray.flatten(
                            structural_connectivity_array)).correlation
                    #print("spearman")
                # Pearson Correlation: This is calculated bc past studies use Pearson Correlation and we want to see if these results are comparable.
                if correlation_type == 'pearson':
                    Corrrelation_list[i][t] = pearsonr(
                        np.ndarray.flatten(FC_list[i][:, :, t]),
                        np.ndarray.flatten(structural_connectivity_array))[0]

        # the value loaded from the input pickle is replaced by this fixed table
        order_of_matrices_in_pickle_file = pd.DataFrame(
            ["broadband", "alphatheta", "beta", "lowgamma", "highgamma"],
            columns=["Order of matrices in pickle file"])
        with open(outputfile_adj, 'wb') as f:
            pickle.dump([
                Corrrelation_list[0], Corrrelation_list[1], Corrrelation_list[2],
                Corrrelation_list[3], Corrrelation_list[4],
                order_of_matrices_in_pickle_file
            ], f)
def SFC_for_null_model(structure_file_path, FC_list, electrode_row_and_column_names,
                       electrode_localization_by_atlas_file_path):
    """Compute structure-function correlation for already-loaded FC matrices.

    The functional connectivity (FC) matrices are restricted to electrodes
    that have a non-zero atlas region, averaged per region (in Fisher-z
    space), ordered by region number and then correlated (Spearman) with the
    log-scaled, max-normalised structural connectivity of the same regions
    at each time index.

    The caller's ``FC_list`` is not modified, so the same list can be passed
    to repeated calls (for example, once per atlas permutation).

    :param structure_file_path: .mat file read with ``loadmat``; its
        ``'connectivity'`` entry is the structural matrix.
    :param FC_list: list of FC arrays indexed as (electrode, electrode, time).
    :param electrode_row_and_column_names: electrode names, in the row/column
        order of the FC arrays.
    :param electrode_localization_by_atlas_file_path: CSV whose columns 0 and
        4 are used; they must be named ``electrode_name`` and
        ``region_number``.
    :return: list with one 1-D float array per FC matrix, of length
        ``FC_list[0].shape[2]``; an empty list if ``FC_list`` is empty.

    Example paths (from the original author's setup)::

        structure:  .../connectivity_matrices/structural/<atlas>/sub-<ID>_..._Perm0001.count.pass.connectivity.mat
        electrodes: .../electrode_localization_by_atlas/sub-<ID>_electrode_coordinates_mni_<atlas>_Perm0001.csv
    """
    # Work on a private list: every entry is reassigned below, and doing that
    # on the caller's list would hand already-reduced matrices to the next call.
    FC_list = list(FC_list)
    if not FC_list:
        return []

    # set up the dataframe of electrodes to analyze;
    # func_index records each electrode's position in the FC matrices
    final_electrodes = pd.DataFrame(electrode_row_and_column_names,
                                    columns=['electrode_name'])
    final_electrodes = final_electrodes.reset_index()
    final_electrodes = final_electrodes.rename(columns={"index": "func_index"})

    # Structural connectivity in mat file format (output from DSI studio)
    structural_connectivity_array = np.array(
        pd.DataFrame(loadmat(structure_file_path)['connectivity']))

    # Electrode localization by atlas (from get_electrode_localization.py)
    electrode_localization_by_atlas = pd.read_csv(
        electrode_localization_by_atlas_file_path)

    # log-scale and normalise the structural matrix; zeros become 1 first so
    # that log10 maps them to 0 instead of -inf
    structural_connectivity_array[structural_connectivity_array == 0] = 1
    structural_connectivity_array = np.log10(structural_connectivity_array)
    structural_connectivity_array = structural_connectivity_array / np.max(
        structural_connectivity_array)

    # Only consider electrodes that are in both the localization file and the FC data
    final_electrodes = final_electrodes.merge(
        electrode_localization_by_atlas.iloc[:, [0, 4]], on='electrode_name')
    # Remove electrodes that have a region of 0
    final_electrodes = final_electrodes[final_electrodes['region_number'] != 0]

    func_index = final_electrodes['func_index'].values
    for i in range(len(FC_list)):
        # integer-array indexing copies, so the caller's arrays stay untouched
        FC_list[i] = FC_list[i][func_index, :, :]
        FC_list[i] = FC_list[i][:, func_index, :]
        # Fisher z-transform (arctanh) so that correlations can be averaged
        FC_list[i] = np.arctanh(FC_list[i])

    # Remove structural ROIs not in electrode_localization ROIs
    electrode_ROIs = np.unique(np.array(final_electrodes.iloc[:, 2]))
    electrode_ROIs = electrode_ROIs[~(electrode_ROIs == 0)]  # remove region 0
    structural_index = electrode_ROIs - 1  # region numbers are 1-based
    structural_connectivity_array = structural_connectivity_array[
        structural_index, :]
    structural_connectivity_array = structural_connectivity_array[:,
                                                                  structural_index]

    # average functional connectivity over electrodes in the same atlas region
    for i in range(len(FC_list)):
        ROIs = np.array(final_electrodes.iloc[:, 2])
        for r in range(len(electrode_ROIs)):
            index_logical = (ROIs == electrode_ROIs[r])
            member_positions = np.where(index_logical)[0]
            index_first = member_positions[0]
            index_second_to_end = member_positions[1:]
            mean = np.mean(FC_list[i][index_logical, :, :], axis=0)
            # the region's first electrode becomes its representative row/column
            FC_list[i][index_first, :, :] = mean
            FC_list[i][:, index_first, :] = mean
            # delete the other rows and columns belonging to the same region
            FC_list[i] = np.delete(FC_list[i], index_second_to_end, axis=0)
            FC_list[i] = np.delete(FC_list[i], index_second_to_end, axis=1)
            # keep the region labels aligned with the remaining rows/columns
            ROIs = np.delete(ROIs, index_second_to_end, axis=0)
        # remove electrodes in the ROI labeled as zero
        index = np.where(ROIs == 0)[0]
        FC_list[i] = np.delete(FC_list[i], index, axis=0)
        FC_list[i] = np.delete(FC_list[i], index, axis=1)
        ROIs = np.delete(ROIs, index, axis=0)

    # order FC matrices by ROI so they line up with the structural matrix
    order = np.argsort(ROIs)
    for i in range(len(FC_list)):
        FC_list[i] = FC_list[i][order, :, :]
        FC_list[i] = FC_list[i][:, order, :]
        # undo the Fisher z-transform
        FC_list[i] = np.tanh(FC_list[i])

    # initialize correlation arrays
    n_windows = FC_list[0].shape[2]
    correlation_list = [np.zeros([n_windows], dtype=float) for _ in FC_list]

    correlation_type = 'spearman'
    structural_flat = np.ndarray.flatten(structural_connectivity_array)
    # calculate Structure-Function Correlation.
    for i in range(len(FC_list)):
        # NOTE(review): the loop stops at shape[2] - 1, so the last entry of each
        # correlation array keeps its initial 0 — kept as in the original; confirm intent.
        for t in range(FC_list[i].shape[2] - 1):
            functional_flat = np.ndarray.flatten(FC_list[i][:, :, t])
            # Spearman: functional and structural connectivity are non-normally distributed.
            if correlation_type == 'spearman':
                correlation_list[i][t] = spearmanr(
                    functional_flat, structural_flat).correlation
            # Pearson: kept for comparison with past studies.
            if correlation_type == 'pearson':
                correlation_list[i][t] = pearsonr(
                    functional_flat, structural_flat)[0]

    return correlation_list
def plot_corr_heatmap(df, color_threshold=0.6, cmap=None, figsize=None, value_fontsize=8,
                      label_fontsize=9, precision=2, xrot=80):
    """
    Draw the Spearman correlation matrix of the columns of df as a heatmap.

    Only the strict upper triangle is shown. Each cell is annotated with the
    signed correlation (rounded via `myround` to `precision`), while the cell
    background is driven by the absolute value: anything not above
    color_threshold is drawn white. Diagonal cells are marked with 'x'.

    Spearman's correlation is the Pearson correlation of the rank-transformed
    variables, so it captures monotonic (not only linear) relationships.

    cmap may be None (a slice of 'coolwarm' starting at color_threshold is
    used), a colormap name, or a colormap object; the object is copied
    before its "under" color is set.

    SAMPLE CODE

        from rfpimp import plot_corr_heatmap
        viz = plot_corr_heatmap(df_train, figsize=(7,5),
                                label_fontsize=13, value_fontsize=11)
        viz.view() # or just viz in notebook
    """
    corr = spearmanr(df).correlation
    if corr.ndim == 0:
        # two columns only: spearmanr gives a scalar, expand it to a 2x2 matrix
        corr = np.array([[1.0, corr], [corr, 1.0]])

    # Color by magnitude; push the lower triangle and diagonal far below
    # vmin so they are drawn with the "under" color.
    filtered = np.abs(copy(corr))
    filtered[np.tril_indices_from(filtered)] = -9999

    if cmap is None:
        coolwarm = plt.get_cmap('coolwarm')
        shades = [coolwarm(level) for level in np.arange(color_threshold, .85, 0.01)]
        cmap = ListedColormap(shades)
    elif isinstance(cmap, str):
        cmap = plt.get_cmap(cmap)
    cm = copy(cmap)
    cm.set_under(color='white')

    if figsize:
        plt.figure(figsize=figsize)
    im = plt.imshow(filtered, cmap=cm, vmin=color_threshold, vmax=1, aspect='equal')

    width, height = filtered.shape
    annotation_style = dict(horizontalalignment='center',
                            verticalalignment='center',
                            fontsize=value_fontsize,
                            color=GREY)
    for row in range(width):
        if row < height:
            plt.annotate('x', xy=(row, row), **annotation_style)
        for col in range(row + 1, height):
            plt.annotate(myround(corr[row, col], precision), xy=(col, row), **annotation_style)

    midpoint = color_threshold + (1 - color_threshold) / 2
    cb = plt.colorbar(im, fraction=0.046, pad=0.04,
                      ticks=[color_threshold, midpoint, 1.0])
    cb.ax.tick_params(
        labelsize=label_fontsize,
        labelcolor=GREY,
    )
    cb.outline.set_edgecolor('white')

    tick_positions = range(width)
    plt.xticks(tick_positions, df.columns,
               rotation=xrot, horizontalalignment='right',
               fontsize=label_fontsize, color=GREY)
    plt.yticks(tick_positions, df.columns,
               verticalalignment='center',
               fontsize=label_fontsize, color=GREY)

    ax = plt.gca()
    for side in ('top', 'right', 'left', 'bottom'):
        ax.spines[side].set_linewidth(.3)

    plt.tight_layout()
    return PimpViz()
def predictedToActualSimilarity(predictions, actual, labels):
    """
    A cross-validated predicted-to-actual similarity analysis; serves to quantify
    how much information has transferred. Similar to a cross-validated RSA.

    For every fold (row of `labels`) and every condition, the predicted pattern of
    the held-out miniblock is compared, via Fisher-z transformed Spearman
    correlation, with the prototype of each condition. A prototype is the mean
    actual pattern over that condition's miniblocks in the remaining rows.

    Parameters:
        predictions - predicted activation patterns. A sample X features matrix.
        actual      - actual activation patterns to compare predictions against.
                      A sample X features matrix.
        labels      - a (folds X conditions) matrix of miniblock (row) indices into
                      `predictions` / `actual`; columns indicate task condition,
                      rows the cross-validation fold. Values are cast to int, so
                      float-typed matrices holding whole numbers are accepted.

    Returns:
        ite_mean - mean over folds of the matched-condition similarity minus mean
                   over folds of the mismatched-condition similarity.
    """
    # Cast once: the indices are used both for the held-out miniblock and for
    # the training miniblocks, and both need integer indices.
    labels = np.asarray(labels).astype(int)
    ncvs, nrules = labels.shape  # folds, task conditions

    correct_matches = []
    incorrect_matches = []
    # Hold out one sample of each condition in each cross-validation fold
    for cv in range(ncvs):
        testset_ind = labels[cv, :]
        trainset_ind = np.delete(labels, cv, axis=0)  # all rows except the held-out one

        # Prototypes depend only on the fold and the condition, so build them
        # once per fold instead of once per (cond1, cond2) pair.
        prototypes = [
            np.mean(actual[trainset_ind[:, cond], :], axis=0)
            for cond in range(nrules)
        ]

        corr = []
        err = []
        for cond1 in range(nrules):
            predicted_miniblock = predictions[testset_ind[cond1], :]
            for cond2 in range(nrules):
                similarity = np.arctanh(
                    stats.spearmanr(predicted_miniblock, prototypes[cond2])[0])
                if cond1 == cond2:
                    corr.append(similarity)
                else:
                    err.append(similarity)

        # Average matches / mismatches for this cross-validation fold
        correct_matches.append(np.mean(corr))
        incorrect_matches.append(np.mean(err))

    ite_mean = np.mean(correct_matches) - np.mean(incorrect_matches)
    return ite_mean
# =============================================================================
# transform
# =============================================================================
# Forward-fill gaps, then compute log returns from the adjusted close.
# DataFrame.ffill() replaces the deprecated fillna(method='ffill') spelling.
gsem = gsem.ffill()
gsem['log_return'] = np.log(gsem['Adj Close'] / gsem['Adj Close'].shift(1))

gscef = gscef.ffill()
gscef['log_return'] = np.log(gscef['Adj Close'] / gscef['Adj Close'].shift(1))

# =============================================================================
# OLS Regression
# =============================================================================
#df = pd.DataFrame({'A': gsem['log_return'], 'B': gscef['log_return']})
#result = sm.ols(formula = 'A ~ B', data = df).fit()
#print(result.summary())

# correlation
# The Spearman result used to be computed and thrown away; print it so it is
# reported alongside the Pearson matrix below.
print(stats.spearmanr(gsem['Adj Close'], gscef['Adj Close']))
print(np.corrcoef(gsem['Adj Close'], gscef['Adj Close']))

# =============================================================================
## plotting
## =============================================================================
#plt.plot(gsem['log_return'], gscef['log_return'], 'r.')
#ax = plt.axis()  # grab x-axis values
#x = np.linspace(ax[0], ax[1] + 0.01)
#plt.plot(x, -0.0003 + x * 0.9303, 'b')
#plt.grid(True)
#plt.xlabel('Goldman Sachs Emerging Market Index')
#plt.ylabel('Goldman Sachs China Equity Fund')
#plt.title('Scatter of log returns and regression line')
#upper = plt.subplot(2, 1, 1)
def compute_spearman(predicts, labels):
    """Return 100 x the Spearman correlation between labels and predicts.

    With fewer than two predictions the result is NaN.
    """
    if len(predicts) < 2:
        return np.nan
    rho = spearmanr(labels, predicts)[0]
    return 100.0 * rho
if m_match_flag[i] > -1: m_match_bin_count[m_mass_bin_index[i], m_z_bin_index[i]] += 1 complete_bin_mass = np.flipud(np.array(m_match_bin_count).astype("float") / np.array(m_bin_count2).astype("float")) ################ # SPEARMAN RHO # ################ for i in range(n_z_bins): m_index_1 = mock.rich[match_index][m_z_bin_index[match_index] == i] m_index_2 = mock.rich[match_index][c_z_bin_index[m_match_flag[match_index]] == i] c_index_1 = cluster.rich[m_match_flag[match_index]][m_z_bin_index[match_index] == i] c_index_2 = cluster.rich[m_match_flag[match_index]][c_z_bin_index[m_match_flag[match_index]] == i] if m_index_1.size > 1: m_rho, p1 = ss.spearmanr(m_index_1, c_index_1) m_rho_err = 0.6325 / (len(m_index_1) - 1) ** 0.5 else: m_rho = 0.0 m_rho_err = 0.0 if m_index_2.size > 1: c_rho, p2 = ss.spearmanr(m_index_2, c_index_2) c_rho_err = 0.6325 / (len(m_index_2) - 1) ** 0.5 else: c_rho = 0.0 c_rho_err = 0.0 if i == 0: m_rhos = np.array(m_rho) m_rhos_err = np.array(m_rho_err) c_rhos = np.array(c_rho) c_rhos_err = np.array(c_rho_err)
from statsmodels.stats import multitest
import funs_fi as fi

# Output goes to an 'output' folder under the current working directory.
dir_base = os.getcwd()
dir_output = os.path.join(dir_base, 'output')

################################################
# ----------- (1) LOAD IN THE DATA ----------- #

# Three processed tables, joined on the shared ('tt', 'ID') key pair.
tmp_FI = pd.read_csv(os.path.join('processed', 'df_FI.csv'))
tmp_inf = pd.read_csv(os.path.join('processed', 'df_inf.csv'))
tmp_res = pd.read_csv(os.path.join('processed', 'df_res.csv'))
df = tmp_inf.merge(tmp_FI, on=['tt', 'ID']).merge(tmp_res, on=['tt', 'ID'])
# the intermediate frames are no longer needed once merged
del tmp_FI, tmp_inf, tmp_res

##################################################
# ----------- (2) REVERSE FI RESULTS ----------- #

# Keep only the rows whose 'tt' value is 'neg'.
df_neg = df[df.tt == 'neg'].reset_index(drop=True)
# FQ = FI divided by the sum of num1 and num2
df_neg['FQ'] = df_neg.FI / (df_neg.num1 + df_neg.num2)
print(np.round(df_neg.FI.describe(), 1))
print(np.round(df_neg.FQ.describe(), 2))

# Rank correlation between the FI and IF columns; rho_IF is (coefficient, p-value).
rho_IF = stats.spearmanr(df_neg.FI, df_neg.IF)
print('Rho: %0.3f (p-val: %0.3f) for IF-FI' % (rho_IF[0], rho_IF[1]))

# Count rows above / at-or-below the FI threshold.
thresh = 12
print('A total of %i studies have a RFI>%i and %i have <=%i' %
      (sum(df_neg.FI > thresh), thresh, sum(df_neg.FI <= thresh), thresh))
# Load the mtcars table and assign explicit column names.
cars_data = pd.read_csv("mtcars.csv")
cars_data.columns = ['car_names', 'mpg', 'cyl', 'disp', 'hp', 'drat', 'wt', 'qsec', 'vs' , 'am','gear' ,'carb']
cars_data.head()

# Predictors: columns 2 and 4, i.e. 'cyl' and 'hp' under the names assigned above.
cars_sub = cars_data.iloc[:, [2, 4]].values
cars_data_names = ['cyl', 'hp']

# Target: column 9, i.e. 'am' under the names assigned above.
y = cars_data.iloc[:, 9].values

# Scatter plot of the two predictors with a fitted regression line.
sb.regplot(x='cyl', y='hp', data=cars_data, scatter=True)

# Rank correlation between the two predictors.
cyl = cars_data['cyl']
hp = cars_data['hp']
spearmanr_coefficient, p_value = spearmanr(cyl, hp)

# Count plot of the target values.
sb.countplot(x='am', data=cars_data, palette="hls")

# Standardise the predictors and fit a logistic regression.
X = scale(cars_sub)
LogReg = LogisticRegression()
LogReg.fit(X, y)
# score is computed on the same data the model was fitted on
print(LogReg.score(X, y))

'''Predictors'''
# Predictions on the training data and the per-class report.
y_pred = LogReg.predict(X)
print(classification_report(y, y_pred))
def eval(self, splt):
    """
    Score the embedder + projection head on one data split.

    Iterates over ``self.get_iterator(splt)``, collects predictions and gold
    labels from ``self.data[splt]['y']``, and computes accuracy / F1 /
    Matthews correlation for classification tasks, or Pearson / Spearman
    correlation otherwise (all scaled by 100). Sentence pairs are
    concatenated using the 'en' language id. The scores are logged as a
    JSON line and returned as an OrderedDict that also holds the epoch.

    Note that every score key contains the literal ``valid`` regardless of
    the value of ``splt``.
    """
    params = self.params

    # switch both modules to evaluation mode
    self.embedder.eval()
    self.proj.eval()

    task = self.task.lower()
    scores = OrderedDict([('epoch', self.epoch)])
    lang_id = params.lang2id['en']

    predictions = []  # one array of predicted values per batch
    targets = []  # one array of gold values per batch

    for batch in self.get_iterator(splt):

        # unpack the batch (inputs are not truncated to params.max_len here)
        if self.n_sent != 1:
            (sent1, len1), (sent2, len2), idx = batch
            x, lengths, _, _ = concat_batches(
                sent1, len1, lang_id,
                sent2, len2, lang_id,
                params.pad_index, params.eos_index,
                reset_positions=False,
            )
        else:
            (x, lengths), idx = batch
        y = self.data[splt]['y'][idx]

        # move tensors to the GPU
        x, y, lengths = to_cuda(x, y, lengths)

        # forward pass through the embedder and the projection head
        embeddings = self.embedder.get_embeddings(x, lengths, positions=None, langs=None)
        output = self.proj(embeddings)

        # classification: index of the largest output along dim 1;
        # otherwise: the output with dim 1 squeezed out
        if self.is_classif:
            p = output.data.max(1)[1]
        else:
            p = output.squeeze(1)

        predictions.append(p.cpu().numpy())
        targets.append(y.cpu().numpy())

    gold = np.concatenate(targets)
    pred = np.concatenate(predictions)

    if not self.is_classif:
        scores['%s_valid_prs' % task] = 100. * pearsonr(pred, gold)[0]
        scores['%s_valid_spr' % task] = 100. * spearmanr(pred, gold)[0]
    else:
        # binary F1 for two output classes, micro-averaged F1 otherwise
        f1_average = 'binary' if params.out_features == 2 else 'micro'
        scores['%s_valid_acc' % task] = 100. * (pred == gold).sum() / len(pred)
        scores['%s_valid_f1' % task] = 100. * f1_score(gold, pred, average=f1_average)
        scores['%s_valid_mc' % task] = 100. * matthews_corrcoef(gold, pred)

    logger.info("__log__:%s" % json.dumps(scores))
    return scores
# NOTE(review): the original indentation of this fragment was lost. The first
# five statements use `i`, `var`, `posthoc`, `test_type`, `test` and `p_var`,
# which are bound by a per-variable loop whose header is outside this view --
# confirm the nesting against the full file.

# Store the post-hoc result and the summary row for the current variable.
posthoc_tests['posthoc_' + str(var)] = posthoc
stats_tests.loc[i, 'variable'] = var
stats_tests.loc[i, 'test_type'] = test_type
stats_tests.loc[i, 'p_value'] = test[1]
stats_tests.loc[i, 'p_value_variance'] = p_var

# Correct for multiple tests (method 'fdr_bh'); element [1] of the
# multipletests result holds the corrected p-values.
stats_tests['p_value'] = multipletests(stats_tests['p_value'], method='fdr_bh')[1]
stats_tests['p_value_variance'] = multipletests(
    stats_tests['p_value_variance'], method='fdr_bh')[1]

# Pick the correlation test from the normality of both variables. Each
# normality test is evaluated once, and an if/else guarantees that exactly one
# branch runs: the previous pair of independent `if`s assigned nothing when a
# p-value was exactly 0.05 (or NaN), leaving correlation_coef / correlation_p
# undefined.
n_trials_normal_p = stats.normaltest(learned['n_trials'])[1]
reaction_time_normal_p = stats.normaltest(learned['reaction_time'])[1]
if n_trials_normal_p < 0.05 or reaction_time_normal_p < 0.05:
    # At least one variable departs from normality: rank correlation.
    test_type = 'spearman'
    correlation_coef, correlation_p = stats.spearmanr(learned['reaction_time'],
                                                      learned['n_trials'])
else:
    test_type = 'pearson'
    correlation_coef, correlation_p = stats.pearsonr(learned['reaction_time'],
                                                     learned['n_trials'])

# Add all mice to dataframe separately for plotting
learned_no_all = learned.copy()
learned_no_all.loc[learned_no_all.shape[0] + 1, 'lab_number'] = 'All'
learned_2 = learned.copy()
learned_2['lab_number'] = 'All'
# DataFrame.append was removed in pandas 2.0; pd.concat with default arguments
# stacks the two frames the same way (original index labels are kept).
import pandas as pd
learned_2 = pd.concat([learned, learned_2])

# %%
seaborn_style()
# NOTE(review): the original indentation of this fragment was lost. `i` and
# `directory` are bound outside this view (presumably by a per-image loop), and
# the statements from `srocc = ...` onward presumably run once after that loop
# -- confirm the nesting against the full file.

# Load the image and score Num_Patch random 128x128 crops with the network.
im = np.asarray(cv2.imread(directory))
# The image size does not change between patches, so read it once.
x = im.shape[0]
y = im.shape[1]
for j in range(Num_Patch):
    # randint's upper bound is exclusive: `+ 1` makes the last valid offset
    # (dimension - 128) reachable and lets an image that is exactly 128 px on
    # a side be cropped at offset 0 instead of raising ValueError.
    x_p = np.random.randint(x - 128 + 1)
    y_p = np.random.randint(y - 128 + 1)
    # Crop, then move the last axis to the front (axes reordered to 2, 0, 1).
    temp = im[x_p:x_p + 128, y_p:y_p + 128, :].transpose([2, 0, 1])
    out = net.forward_all(data=np.asarray([temp]))
    feat[i, j] = out[ft][0]
    pre[i] += out[ft][0]
# Mean and median of the per-patch outputs for this image.
pre[i] /= Num_Patch
med[i] = np.median(feat[i, :])

# Correlation of the mean-pooled predictions with the reference scores.
srocc = stats.spearmanr(pre, scores)[0]
lcc = stats.pearsonr(pre, scores)[0]
# Parenthesised single-argument print behaves the same on Python 2 and 3.
print('% LCC of mean : {}'.format(lcc))
print('% SROCC of mean: {}'.format(srocc))
srocc_file.write('%6.3f\n' % (srocc))
lcc_file.write('%6.3f\n' % (lcc))
srocc_file.close()
lcc_file.close()

# Same correlations for the median-pooled predictions (printed only; both
# result files are already closed at this point).
srocc = stats.spearmanr(med, scores)[0]
lcc = stats.pearsonr(med, scores)[0]
print('% LCC of median: {}'.format(lcc))
print('% SROCC of median: {}'.format(srocc))
# NOTE(review): the original indentation of this fragment was lost; the extent
# of the loop body below (in particular whether the plotting runs once per
# item) is reconstructed -- confirm against the full file.

# Clear `model` (presumably to release an earlier model object -- confirm) and
# load the saved network from disk.
model = None
best_model = load_model('BLSTM_pretrained_FE.hdf5')

# Per-item spearmanr results for channel 1 (intensity) and channel 0 (pitch).
Recon_Spear_Intensity = []
Recon_Spear_Pitch = []
for i in range(len(Test_clean_audio)):
    # Spec Feature (second return value is discarded)
    clean_Spec, _ = Sp_and_Phase(Test_clean_audio[i], Noisy=False)
    # Praat Feature, then extended using the spectral feature
    # (presumably to align the two -- confirm against Exten_prosodic_feat)
    clean_Prosodic = Prosodic_feat_process(Test_clean_prosodic[i], Normalize=False)
    clean_Prosodic = Exten_prosodic_feat(clean_Spec, clean_Prosodic)
    # model prediction from the spectral feature
    pred_Prosodic = best_model.predict(clean_Spec)
    # Spearman corr. evaluation metric: reference vs. prediction, flattened,
    # for last-axis channel 0 (stored as pitch) and channel 1 (stored as
    # intensity). Each stored item is the full spearmanr result.
    p_spear_corr = spearmanr(clean_Prosodic[:, :, 0].reshape(-1), pred_Prosodic[:, :, 0].reshape(-1))
    Recon_Spear_Pitch.append(p_spear_corr)
    e_spear_corr = spearmanr(clean_Prosodic[:, :, 1].reshape(-1), pred_Prosodic[:, :, 1].reshape(-1))
    Recon_Spear_Intensity.append(e_spear_corr)
    # Plot Reconstruction Results: channel 0 only, reference in blue and
    # prediction in red.
    plt.rc('font', family='Times New Roman')
    plt.plot(clean_Prosodic[:, :, 0].reshape(-1), color='blue', linewidth=3.5)
    plt.plot(pred_Prosodic[:, :, 0].reshape(-1), color='red', linewidth=3.5)
    plt.xticks(fontsize=15)
    plt.yticks(fontsize=15)
    plt.show()

# Average the collected results over items (axis 0) and report element [0],
# the correlation coefficient. Recon_Spear_Pitch is collected but not reported
# in the visible code.
print('Avg. Reconstruct Intensity SpearCorr: ' + str(np.mean(Recon_Spear_Intensity, axis=0)[0]))