import os
import pickle
from collections import defaultdict

import numpy as np
from scipy.stats import ks_2samp, scoreatpercentile

def compareDistributions(bins_type, nb_exp, folder="../resultData/features_on_films"):
    '''
    Compare the distribution of a feature across different draws of nb_exp films.
    The goal is ultimately to find the smallest nb_exp for which the distribution
    is stable; this tells us how far the computation of the distance between a
    film and its controls can be parallelized.
    '''
    pvalues = defaultdict(list)
    # featuresNumeriques is expected to be defined at module level
    for feature in featuresNumeriques:
        print(feature)
        # Keep files for this feature/binning whose experiment count is within 25% of nb_exp
        l = [x for x in os.listdir(folder)
             if feature in x and bins_type in x
             and 0.75 * nb_exp < int(x.split('_')[2]) < 1.25 * nb_exp]
        for file_ in l:
            iter_ = int(file_.split('_')[-1].split('.')[0])
            print('----', iter_)
            # Compare each draw against every later draw to avoid duplicate pairs
            for other_file in [x for x in l if int(x.split('_')[-1].split('.')[0]) > iter_]:
                iter2 = int(other_file.split('_')[-1].split('.')[0])
                print(iter2)
                with open(os.path.join(folder, file_), 'rb') as f:
                    l1 = pickle.load(f)
                with open(os.path.join(folder, other_file), 'rb') as f:
                    l2 = pickle.load(f)
                if bins_type == 'quantile':
                    ks, pval = ks_2samp(l1[1], l2[1])
                else:
                    ks, pval = ks_2samp(l1, l2)
                pvalues[feature].append(pval)

    for feature in featuresNumeriques:
        print(feature, np.mean(pvalues[feature]), scoreatpercentile(pvalues[feature], 90))

    with open(os.path.join(folder, 'pvalues_{}_{}'.format(bins_type, nb_exp)), 'wb') as f:
        pickle.dump(pvalues, f)
    return
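# Hypothetical usage sketch for compareDistributions, assuming featuresNumeriques
# is defined and the pickled per-draw distribution files already exist under the
# folder; the 'quantile' binning and the draw size 2000 are placeholders.
compareDistributions('quantile', 2000, folder="../resultData/features_on_films")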
import pandas as pd
from scipy.stats import ks_2samp, pearsonr

def different_expression(expression='data/sample_expr', path_to_genes='data/genes_set.txt',
                         cutoff=0.05, save=None):
    """
    Determine whether the specified genes differ in expression between 2 classes,
    using the Kolmogorov-Smirnov criterion.
    NOTE: takes a dataframe where genes are columns and observations are rows.

    :param expression: str - path to expression data in csv
    :param path_to_genes: str - path to subset of genes, one gene per row
    :param cutoff: float - p-value threshold
    :param save: str - save fig in 'save' location if provided
    :return: boolean - whether expression of the genes is altered
    """
    # Load data
    if not is_dataframe(expression):
        expression = pd.read_csv(expression)
    with open(path_to_genes, 'r') as source:
        genes = [gene.strip() for gene in source.readlines()]

    # Prepare df for the subset of genes
    expression_subset = expression.loc[:, expression.columns.isin(genes)]
    expression_subset.loc[:, 'Description'] = expression.loc[:, 'Description']

    # Correlations between class (phenotype) and expression of each gene,
    # and between class and the genes from the subset
    corrs_all = [pearsonr(expression.loc[:, gene], expression.loc[:, 'Description'])[0]
                 for gene in expression]
    corrs_gene_set = [pearsonr(expression.loc[:, gene], expression.loc[:, 'Description'])[0]
                      for gene in expression_subset]

    # Plot functions
    plot_curves(corrs_all, corrs_gene_set, save)

    # Compute p-value
    p_value = ks_2samp(corrs_gene_set, corrs_all)[1]
    return p_value <= cutoff
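# Minimal usage sketch for different_expression, assuming the project helpers
# is_dataframe and plot_curves are importable, a CSV at 'data/sample_expr' with
# gene columns plus a binary 'Description' column, and a gene list in
# 'data/genes_set.txt'; all paths here are placeholders.
altered = different_expression(expression='data/sample_expr',
                               path_to_genes='data/genes_set.txt',
                               cutoff=0.05,
                               save='curves.png')
print('Gene set differentially expressed:', altered)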
import numpy as np
from itertools import combinations
from scipy import stats

def KS_Testing(Databases, conditions):
    """
    Run pairwise two-sample KS tests on the complexity metrics of each database
    and report the percentage of pairs whose distributions are similar.
    """
    columns = ["AppEn", "SampEn", "DFA", "HFD", "SD_ratio"]
    ks_test = list()
    for Data in Databases:
        for cond in conditions:
            print("Database:", cond)
            for col in columns:
                metric = np.array(Data[[col]])
                print("Metric:", col)
                # All possible pairs of observations for this metric
                comb = list(combinations(metric, 2))
                for pair in comb:
                    # Bin each member of the pair before comparing the counts
                    X = np.histogram(np.ravel(pair[0]), bins='auto')
                    Y = np.histogram(np.ravel(pair[1]), bins='auto')
                    ks_r = stats.ks_2samp(X[0], Y[0], alternative='two-sided')
                    p_val = ks_r[1]
                    if p_val < 0.05:
                        ks_test.append(0)  # distributions differ
                    else:
                        ks_test.append(1)  # no evidence of a difference
    prob = np.sum(ks_test) / len(ks_test) * 100
    print("Similarity percentage: {} %".format(prob))
    print("\n")
from scipy.stats import ks_2samp

def calc_kolmogorov_smirnov(columns, prior_data, target_data):
    """Calculate the Kolmogorov-Smirnov statistic for each column."""
    ks_param = {}
    for dkey in sorted(columns):
        ks_val = ks_2samp(prior_data[dkey], target_data[dkey])
        ks_param[dkey] = ks_val[0]  # keep the KS D statistic, not the p-value
    return ks_param
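# Usage sketch for calc_kolmogorov_smirnov with made-up data: two DataFrames
# sharing the listed columns; the column names and distributions are hypothetical.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
prior = pd.DataFrame({"age": rng.normal(40, 10, 500), "income": rng.normal(50, 5, 500)})
target = pd.DataFrame({"age": rng.normal(45, 10, 500), "income": rng.normal(50, 5, 500)})
drift = calc_kolmogorov_smirnov(["age", "income"], prior, target)
print(drift)  # larger D statistics indicate stronger distribution shift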
import GEOparse
import numpy as np
import pandas as pd
from scipy.stats import ks_2samp

def GSEA(geo_ID, gene_list):
    gse = GEOparse.get_GEO(geo=geo_ID, destdir="./")
    expression = gse.pivot_samples('VALUE').T

    # Label each sample: 1 if "control" appears in its description, else 0
    experiments = {}
    for i, (idx, row) in enumerate(gse.phenotype_data.iterrows()):
        tmp = {}
        tmp["Type"] = 1 if "control" in row["description"] else 0
        experiments[i] = tmp
    experiments = pd.DataFrame(experiments).T

    counter = 0
    all_genes_set = []
    all_corr_set = []
    genes_corr_set = []
    for gene in expression:
        counter += 1
        if counter <= 3:  # skip the first three (non-expression) columns
            continue
        all_genes_set.append(gene)
        # Correlation between the phenotype labels and this gene's expression
        corr_matrix = np.corrcoef(
            [list(experiments['Type']), list(expression[gene])])
        all_corr_set.append(corr_matrix[0, 1])
        if gene in gene_list:
            genes_corr_set.append(corr_matrix[0, 1])

    # Compare gene-set correlations against the background of all genes
    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    return str(p_value)
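# Hypothetical call sketch for GSEA: the GEO accession and gene list below are
# placeholders, not a real analysis; the call downloads the series via GEOparse.
p = GSEA("GSE0000", ["BRCA1", "TP53", "MYC"])
print("KS p-value:", p)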
from scipy.stats import ks_2samp, pearsonr, spearmanr

def compute_correlations_ks(mirnas):
    """
    Compute Pearson, Spearman, and KS for 5p/3p expr data of miRs.
    """
    print("miRNA\tPearson_r\tPearson_pval\tSpearman_rho\tSpearman_pval\t"
          "KS_D\tKS_pval")
    for mir in mirnas:
        if len(mirnas[mir]) > 1:
            pears_r, pears_pval = pearsonr(mirnas[mir]["5p"], mirnas[mir]["3p"])
            spear_rho, spear_pval = spearmanr(mirnas[mir]["5p"], mirnas[mir]["3p"])
            ks_d, ks_pval = ks_2samp(mirnas[mir]["5p"], mirnas[mir]["3p"])
            mir_name = mir.rstrip('-')
            print("{0}\t{1:f}\t{2:f}\t{3:f}\t{4:f}\t{5:f}\t{6:f}".format(
                mir_name, pears_r, pears_pval, spear_rho, spear_pval, ks_d, ks_pval))
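# Hypothetical input sketch for compute_correlations_ks: a dict mapping each
# miRNA name to paired 5p/3p expression vectors over the same samples; the
# names and values below are made up for illustration.
mirnas = {
    "hsa-mir-21-": {"5p": [5.1, 6.3, 4.8, 7.0], "3p": [1.2, 1.9, 0.8, 2.1]},
    "hsa-mir-155-": {"5p": [2.4, 3.1, 2.8, 3.5], "3p": [2.2, 3.0, 2.6, 3.4]},
}
compute_correlations_ks(mirnas)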
def pushButton_clicked(self):
    # Read the text entered in the accession field and the genes field
    gse_acc = self.lineEdit.text()
    mytext = self.textEdit.toPlainText()
    # Turn the genes text into a list
    genes = mytext.split()
    # Download the dataset by accession
    gse = GEOparse.get_GEO(geo=gse_acc, destdir="./")
    # Get the expression matrix over genes and samples
    expression = gse.pivot_samples('VALUE').T
    # Build the phenotype list: if the description contains the word
    # "control", treat the sample as a control and assign 1
    experiments = {}
    for i, (idx, row) in enumerate(gse.phenotype_data.iterrows()):
        tmp = {}
        tmp["Type"] = 1 if "control" in row["description"] else 0
        experiments[i] = tmp
    experiments = pd.DataFrame(experiments).T
    phen = list(experiments['Type'])
    # Build the correlation matrices (as in class)
    counter = 0
    all_genes_set = []
    all_corr_set = []
    genes_corr_set = []
    for column in expression:
        counter += 1
        if counter <= 3:
            continue
        expressions = list(expression[column])
        gene = column
        all_genes_set.append(column)
        corr_matrix = np.corrcoef([phen, expressions])
        all_corr_set.append(corr_matrix[0, 1])
        if gene in genes:
            genes_corr_set.append(corr_matrix[0, 1])
    # Get the p-value from the Kolmogorov-Smirnov test
    p_value = ks_2samp(genes_corr_set, all_corr_set)[1]
    # Display it in the label
    self.label_3.setText('{:.3f}'.format(p_value))
pearsonsWnt = list()
for wnt1 in wntGenes:
    if wnt1 in geneDict:
        for wnt2 in wntGenes:
            if wnt2 in geneDict and wnt1 != wnt2:
                corr, pVal = pearsonr(geneDict[wnt1], geneDict[wnt2])
                if corr == 1.0:
                    corr = 0.99999  # atanh(1.0) is infinite, so clamp perfect correlations
                pearsonsWnt.append(math.atanh(corr))  # Fisher z-transform
# END Non Specific Wnt Pearson Correlations-----------------------------------------------

# Determine if this is statistically significant------------------------------------------
ks, pVal = ks_2samp(pearsons, pearsonsWnt)
sigFile.write(file + " " + str(pVal) + "\n")

# BUILD HISTOGRAM--------------------------------------------------------------------------
if pVal < (0.05 / 769.0):  # Bonferroni correction over 769 tests
    try:
        # the histogram of the random data
        plt.hist(pearsons, 75, density=True, facecolor='b', alpha=0.25)
        n, bins, patches = plt.hist(pearsonsWnt, 75, density=True, facecolor='g', alpha=0.25)
        plt.xlabel('Correlation')
sns.distplot(df[df["source"] == "aa12"][type_to_check], label="aa12", kde=False, rug=True) sns.distplot(df[df["source"] == "aa1"][type_to_check], label="aa1", kde=False, rug=True) ax = sns.distplot(df[df["source"] == "aa2"][type_to_check], label="aa2", kde=False, rug=True) ax.set(xlabel='Total length') plt.legend() sns.despine(offset=10, trim=True) path_to_read = "/Users/alessandrozonta/PycharmProjects/astar/output/" if save_figure is True: plt.savefig("{}/total_length_astar.pdf".format(path_to_read)) else: plt.show() plt.close() total = [] total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa012"]['fitness']).pvalue) total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa01"]['fitness']).pvalue) total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa02"]['fitness']).pvalue) total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa0"]['fitness']).pvalue) total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa12"]['fitness']).pvalue) total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa1"]['fitness']).pvalue) total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa2"]['fitness']).pvalue) total.append(stats.ks_2samp(df[df["source"] == "a"]['fitness'], df[df["source"] == "aa"]['fitness']).pvalue) # small p -> two different distributions logger.info(total) logger.info(np.mean(np.array(total))) logger.info(np.std(np.array(total))) total = []
def ks_feature_distribution(self, threshold=0.1, show_plots=True):
    """
    Uses the Kolmogorov-Smirnov test to see if the distributions in the
    training and test sets are similar.

    Credit: https://www.kaggle.com/nanomathias/distribution-of-test-vs-training-data#1.-t-SNE-Distribution-Overview

    Parameters
    ----------
    threshold : float, optional
        KS statistic threshold, by default 0.1

    show_plots : bool, optional
        True to show histograms of feature distributions, by default True

    Returns
    -------
    DataFrame
        Columns that are significantly different in the train and test set.

    Examples
    --------
    >>> data.ks_feature_distribution()
    >>> data.ks_feature_distribution(threshold=0.2)
    """

    if self.x_test is None:
        raise ValueError(
            "Data must be split into train and test set. Please set the `x_test` variable."
        )

    report_info = technique_reason_repo["stats"]["dist_compare"]["ks"]

    diff_data = []
    diff_df = None

    for col in tqdm(self.x_train.columns):
        statistic, pvalue = ks_2samp(
            self.x_train[col].values, self.x_test[col].values
        )

        if pvalue <= 0.05 and np.abs(statistic) > threshold:
            diff_data.append(
                {
                    "feature": col,
                    "p": np.round(pvalue, 5),
                    "statistic": np.round(np.abs(statistic), 2),
                }
            )

    if diff_data:
        diff_df = pd.DataFrame(diff_data).sort_values(
            by=["statistic"], ascending=False
        )

        if show_plots:
            n_cols = 4
            n_rows = int(len(diff_df) / n_cols) + 1

            _, ax = plt.subplots(n_rows, n_cols, figsize=(40, 8 * n_rows))
            ax = np.asarray(ax).flatten()  # allow indexing axes with one subscript

            for i, (_, row) in enumerate(diff_df.iterrows()):
                if i >= len(ax):
                    break

                extreme = np.max(
                    np.abs(
                        self.x_train[row.feature].tolist()
                        + self.x_test[row.feature].tolist()
                    )
                )
                self.x_train.loc[:, row.feature].swifter.apply(np.log1p).hist(
                    ax=ax[i],
                    alpha=0.6,
                    label="Train",
                    density=True,
                    bins=np.arange(-extreme, extreme, 0.25),
                )
                self.x_test.loc[:, row.feature].swifter.apply(np.log1p).hist(
                    ax=ax[i],
                    alpha=0.6,
                    label="Test",
                    density=True,
                    bins=np.arange(-extreme, extreme, 0.25),
                )
                ax[i].set_title(f"Statistic = {row.statistic}, p = {row.p}")
                ax[i].set_xlabel(f"Log({row.feature})")
                ax[i].legend()

            plt.tight_layout()
            plt.show()

    if self.report is not None:
        self.report.report_technique(report_info, [])

    return diff_df
import numpy as np
import scipy.stats as st

def KS2samp_normtest(x, alpha):
    # Compare the sample against a normal draw with the same mean, std, and size
    normal_samp = np.random.normal(np.mean(x), np.std(x), len(x))
    KS, pKS = st.ks_2samp(x, normal_samp)
    if pKS > alpha:
        print("dist is normal; KStest pval=", round(pKS, 2))
    else:
        print("dist is NOT normal; KStest pval=", round(pKS, 2))
    return pKS
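# Usage sketch for KS2samp_normtest with synthetic data; note the comparison is
# against a single random normal sample, so the reported p-value is stochastic.
x = np.random.normal(0, 1, 1000)
KS2samp_normtest(x, 0.05)  # usually reports "dist is normal" here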
from scipy import stats

def compute(train_scores, validate_scores):
    """
    train/validate scores: predicted scores on train/validate set
    """
    return stats.ks_2samp(train_scores, validate_scores).pvalue
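# Usage sketch for compute, assuming model scores on both splits are at hand;
# the arrays below are made up for illustration.
import numpy as np

train_scores = np.random.beta(2, 5, 1000)
validate_scores = np.random.beta(2, 5, 500)
p = compute(train_scores, validate_scores)
print(p)  # a small p-value suggests the train/validate score distributions differ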
from scipy import stats

def ks_test(observation_pdf, pdf):
    # observ_cdf = np.cumsum(observation_pdf)
    # cdf = np.cumsum(pdf)
    # Flatten both arrays and compare the value distributions directly
    ks_stat, p_value = stats.ks_2samp(observation_pdf.reshape(-1), pdf.reshape(-1))
    return p_value
plt.savefig("{}/total_length_random_walk.pdf".format(path_to_read)) # plt.show() plt.close() to_check = ["fitness", "no_overlapping", "direction"] for c in to_check: for f in fit: here_list = copy.deepcopy(fit) here_list.remove(f) total = [] # for el in here_list: total.append( stats.ks_2samp( df[df["source"] == "fitness_no_visited_seed_pd0_"][c], df[df["source"] == "fitness_no_visited_seed_pd1_"][c]).pvalue) logger.info(f) logger.info(total) logger.info(np.mean(np.array(total))) logger.info(np.std(np.array(total))) logger.info("---") logger.info("------------------------") # # path = "/Users/alessandrozonta/Desktop/output_random_walk/" # folders = sorted_nicely(glob.glob("{}*/".format(path))) # # for f in folders: # name_folder = f.split("/")[-1]
# plt.close()
# ax = sns.boxplot(x="source", y="direction", data=df)
# ax.set(xlabel='sources', ylabel='directions')
# sns.despine(offset=10, trim=True)
# path_to_read = "/Users/alessandrozonta/PycharmProjects/NEAT/output/"
# plt.savefig("{}/directions_neat.pdf".format(path_to_read))
# plt.show()
# plt.close()

to_check = ["fitness", "no_overlapping", "direction"]
for c in to_check:
    total = []
    # Compare "neat2" against each of the other NEAT variants
    for variant in ["neat012", "neat01", "neat02", "neat0", "neat12", "neat1"]:
        total.append(
            stats.ks_2samp(df[df["source"] == "neat2"][c],
                           df[df["source"] == variant][c]).pvalue)
from scipy.stats import ks_2samp

def ks_test(x):
    # Split the series in half and compare the two halves' distributions
    test_values, ref_values = x[:len(x) // 2], x[len(x) // 2:]
    return ks_2samp(test_values, ref_values)[0]
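# Usage sketch for ks_test on a made-up series: comparing the first and second
# halves gives a crude drift check over time-ordered values.
import numpy as np

x = np.concatenate([np.random.normal(0, 1, 500), np.random.normal(0.5, 1, 500)])
print(ks_test(x))  # the KS D statistic; large values suggest the halves differ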
# Specific Wnt Pearson Correlations--------------------------------------------------------
pearsonsPairedWnt = list()
for pair in wntPairs:
    if pair[0] in geneDict and pair[1] in geneDict:
        corr, pVal = pearsonr(geneDict[pair[0]], geneDict[pair[1]])
        if corr == 1.0:
            corr = 0.99999  # atanh(1.0) is infinite, so clamp perfect correlations
        pearsonsPairedWnt.append(math.atanh(corr))
# END Specific Wnt Pearson Correlations-----------------------------------------------------

# Determine if this is statistically significant--------------------------------------------
ks, pValNon = ks_2samp(pearsons, pearsonsWnt)
ks, pValSpec = ks_2samp(pearsons, pearsonsPairedWnt)
sigFile.write(file + " " + str(pValNon) + "," + str(pValSpec) + "\n")
print(file + " " + str(pValNon) + "," + str(pValSpec) + "\n")

# BUILD HISTOGRAM----------------------------------------------------------------------------
if pValNon < (0.05 / 769.0) or pValSpec < (0.05 / 769.0):  # Bonferroni correction over 769 tests
    print("Number of Pairwise Wnt Data Points: " + str(len(pearsonsPairedWnt)))
    try:
        # the histogram of the random data
        plt.hist(pearsons, 75, density=True, range=[-2, 2], facecolor='b', alpha=0.25)           # Blue: random background data
        plt.hist(pearsonsWnt, 75, density=True, range=[-2, 2], facecolor='g', alpha=0.25)        # Green: non-specific Wnt data
        plt.hist(pearsonsPairedWnt, 75, density=True, range=[-2, 2], facecolor='r', alpha=0.25)  # Red: specific Wnt data
        plt.xlabel('Correlation')
# real_names = ["RWFBNV", "RWFB", "neat02", "aa"] # for name in real_names: # ax = sns.distplot(df[df["source"] == name]["total_length"], label=name, kde=False, rug=True) # ax.set(xlabel='total length') # plt.legend() # sns.despine(offset=10, trim=True) # plt.savefig("{}/total_length_neat.pdf".format(path_to_read)) # plt.show() # plt.close() to_check = ["fitness", "no_overlapping", "direction"] for c in to_check: total = [] total.append( stats.ks_2samp(df[df["source"] == "neat02"][c], df[df["source"] == "aa"][c]).pvalue) total.append( stats.ks_2samp(df[df["source"] == "neat02"][c], df[df["source"] == "RWFB"][c]).pvalue) total.append( stats.ks_2samp(df[df["source"] == "neat02"][c], df[df["source"] == "RWFBNV"][c]).pvalue) logger.info(total) logger.info(np.mean(np.array(total))) logger.info(np.std(np.array(total))) total = [] total.append( stats.ks_2samp(df[df["source"] == "aa"][c], df[df["source"] == "RWFB"][c]).pvalue) total.append(
numsamples = 500
samp_size = 10
alpha = 0.05
print("\n", lith[f], round(np.mean(curRh), 0))
smd.append(SMD_analysis(curUCS, numsamples, samp_size, alpha))
curax = plt.gca()
if f == len(lith) - 1:
    curax.set_xlabel("UCS, MPa")
plt.setp(curax.get_yticklabels(), visible=False)
plt.tight_layout()

# Pairwise t-test and KS test between the SMD samples of each pair of lithologies
for f in range(len(lith)):
    cursmd = smd[f]
    print("")
    for g in range(f + 1, len(lith)):
        comsmd = smd[g]
        T, pT = st.ttest_ind(cursmd, comsmd)
        KS, pKS = st.ks_2samp(cursmd, comsmd)
        print([lith[f], lith[g], round(pT, 3), round(pKS, 3)])
plt.show()