def save_enrichment(x):
    """Run Enrichr on every cluster of six result files and save the hits.

    The list of 'Human' library names is fetched through ``gp``
    (presumably gseapy -- confirm against the file's imports), dumped to
    ``gensets.txt`` as "index name" lines, and the library at index 53 is
    used for all queries.  For each of the six ``gcn-*``/``gae-*`` CSV files
    under ``x``, every cluster returned by ``read_file_2`` is submitted to
    ``gp.enrichr``; the result tables are tagged with the model number and
    cluster key, concatenated, filtered to rows with ``P-value < 0.05`` and
    written to ``enrich-cluster/full-results.xlsx`` (sheet ``sheet1``).

    Args:
        x: Directory (path prefix) that contains the six cluster CSV files.
    """
    libraries = gp.get_library_name('Human')
    with open('gensets.txt', 'w') as f:
        f.writelines(
            "%s %s\n" % (index, name) for index, name in enumerate(libraries)
        )
    # lib = lib[49: 54]
    lib = libraries[53]
    files = [
        (1, x + "/gcn-hom-hom.csv"),
        (2, x + "/gcn-hom-onto.csv"),
        (3, x + "/gcn-onto-onto.csv"),
        (4, x + "/gae-hom-hom.csv"),
        (5, x + "/gae-hom-onto.csv"),
        (6, x + "/gae-onto-onto.csv"),
    ]
    frames = []
    for key, file in files:
        print(file)
        cluster_data = read_file_2(file)
        for i in cluster_data:
            try:
                enr = gp.enrichr(
                    gene_list=list(cluster_data[i][2]),
                    gene_sets=lib,
                    organism='Human',
                    cutoff=0.05,
                ).results
            except Exception as err:
                # Best effort: skip this cluster.  Falling through would
                # either hit an unbound ``enr`` or re-append the previous
                # cluster's table under this cluster's labels.
                print("enrichr failed for model {} cluster {}: {}".format(
                    key, i, err))
                continue
            enr['model'] = key
            enr['cluster'] = i
            frames.append(enr)
    if frames:
        # pd.concat replaces the removed DataFrame.append and avoids the
        # quadratic re-copying of appending inside the loop.
        df = pd.concat(frames)
        df = df[df['P-value'] < 0.05]
    else:
        df = pd.DataFrame()
    # The context manager saves and closes the workbook; ExcelWriter.save()
    # no longer exists in pandas 2.x.
    with pd.ExcelWriter('enrich-cluster/full-results.xlsx') as writer:
        df.to_excel(writer, sheet_name="sheet1")
def scatter_stats(perms):
    """Collect regression r-values between pairs of result files.

    For each of ``perms`` rounds, 50 keys are sampled at random from
    ``../enrich_red/selected_genesets.csv``.  For every entry of the
    module-level ``files``, the two CSVs named by ``entry[0]`` and
    ``entry[1]`` are loaded and restricted to the sampled keys; ``res`` is
    called in both argument orders for each key and its output (presumably
    (x, y) pairs -- confirm against ``res``) is fed to
    ``spy.stats.linregress``.  The resulting ``rvalue`` is stored under
    ``"<entry[0]>-<entry[1]>"`` and the whole table is written to
    ``../perms2/<lib>.csv`` at the end.

    Args:
        perms: Number of random sampling rounds to run.
    """
    pair_names = (
        "gae-hom-hom-gae-hom-onto",
        "gae-hom-hom-gae-onto-onto",
        "gae-onto-onto-jcd-onto",
        "gae-hom-hom-jcd-hom",
        "gae-hom-hom-jcd-onto",
        "gae-hom-onto-gae-onto-onto",
        "gae-hom-onto-jcd-hom",
        "gae-hom-onto-jcd-onto",
        "gae-onto-onto-jcd-hom",
        "jcd-hom-jcd-onto",
    )
    r_values = {name: [] for name in pair_names}

    for round_no in range(perms):
        print("********** permutation no {} *********".format(round_no))
        genesets = read_file_2("../enrich_red/selected_genesets.csv")
        sampled = random.sample(list(genesets.keys()), 50)

        for entry in files:
            print("file is {}".format(entry))
            path_one = "../enrich_red/" + entry[0] + ".csv"
            path_two = "../enrich_red/" + entry[1] + ".csv"
            table_one = read_file_2(path_one)
            table_two = read_file_2(path_two)

            # Restrict both tables to the same randomly sampled keys.
            subset_one = {key: table_one[key] for key in sampled}
            subset_two = {key: table_two[key] for key in sampled}

            points = []
            for key_one, key_two in zip(subset_one, subset_two):
                left = list(subset_one[key_one])
                right = list(subset_two[key_two])
                # Compare in both directions.
                points.extend(res(left, right))
                points.extend(res(right, left))

            xs, ys = zip(*points)
            fit = spy.stats.linregress(xs, ys)
            r_values[entry[0] + "-" + entry[1]].append(fit.rvalue)

    write_file("../perms2/" + lib + ".csv", r_values)
def one_correlation():
    """Show a scatter plot with a fitted regression line for each file.

    Loads ``../enrich_red/selected_genesets.csv`` once, then for every name
    in the module-level ``files`` loads ``../enrich_red/<name>.csv``, feeds
    the paired entries to ``res`` (with ``top_n`` = 20) and plots the
    returned points (presumably (x, y) pairs -- confirm against ``res``)
    together with the ``spy.stats.linregress`` fit.  The legend reports the
    coefficient of determination and the slope.  Each figure is displayed
    with ``plt.show()``.
    """
    selected = read_file_2("../enrich_red/selected_genesets.csv")
    top_n = 20
    for i in files:
        similar = read_file_2("../enrich_red/" + i + ".csv")
        array = []
        # NOTE(review): zip pairs the two tables by iteration position, not
        # by key -- this assumes both files list their keys in the same order.
        for x, y in zip(selected, similar):
            array.extend(res(list(selected[x]), list(similar[y]), top_n))
        if len(array) < 2:
            # Neither the regression nor stdev is defined for < 2 points.
            print("Skipping {}: fewer than two points to correlate".format(i))
            continue
        x, y = zip(*array)

        fig, ax = plt.subplots()
        ax.scatter(x, y)
        linreg = spy.stats.linregress(x, y)
        slope = float(linreg.slope)
        fitted = [linreg.intercept + slope * k for k in x]
        plt.plot(x, fitted, 'r')

        # Leave one standard deviation of head-room beyond the largest point.
        plt.xlim(min(x), max(x) + statistics.stdev(x))
        plt.ylim(min(y), max(y) + statistics.stdev(y))
        plt.ylabel("Corresponding P-value")
        plt.xlabel("Actual P-value")

        # rvalue is Pearson's r; the legend is labelled R2, so square it.
        r_squared = linreg.rvalue ** 2
        # Invisible handle so the legend shows text only.
        extra = Rectangle((0, 0), 1, 1, fc="w", fill=False,
                          edgecolor='none', linewidth=0)
        ax.legend(
            [extra, extra],
            ('R2 = %0.2f' % r_squared, 'Slope = %0.2f' % slope))
        plt.title("Correlation plot for " + i)
        plt.show()
def stat_correlation(perms, top_n):
    """Collect regression r-values of each result file against the selection.

    For each of ``perms`` rounds, 50 keys are sampled at random from
    ``../enrich_red/selected_genesets.csv``.  The selected gene sets and
    every ``../enrich_red/<name>.csv`` named in the module-level ``files``
    are restricted to those keys, ``res`` is called per key with ``top_n``,
    and the ``rvalue`` of ``spy.stats.linregress`` over its output
    (presumably (x, y) pairs -- confirm against ``res``) is stored under the
    file's name.  The table is written to ``../perms/<lib>.csv`` at the end.

    Args:
        perms: Number of random sampling rounds to run.
        top_n: Forwarded unchanged as the third argument of ``res``.
    """
    method_names = (
        "gae-hom-hom",
        "gae-hom-onto",
        "gae-onto-onto",
        "jcd-hom",
        "jcd-onto",
    )
    r_values = {name: [] for name in method_names}
    selection_path = "../enrich_red/selected_genesets.csv"

    for round_no in range(perms):
        print("********** permutation no {} *********".format(round_no))
        candidate_keys = list(read_file_2(selection_path).keys())
        sampled = random.sample(candidate_keys, 50)
        all_selected = read_file_2(selection_path)
        reference = {key: all_selected[key] for key in sampled}

        for name in files:
            print("file is {}".format(name))
            table = read_file_2("../enrich_red/" + name + ".csv")
            candidate = {key: table[key] for key in sampled}

            points = []
            for ref_key, cand_key in zip(reference, candidate):
                ref_values = list(reference[ref_key])
                cand_values = list(candidate[cand_key])
                points.extend(res(ref_values, cand_values, top_n))

            xs, ys = zip(*points)
            fit = spy.stats.linregress(xs, ys)
            r_values[name].append(fit.rvalue)

    write_file("../perms/" + lib + ".csv", r_values)
def distribution(confidence=0.95):
    """Print mean and t-based confidence interval for each method.

    Reads ``../perms2/GO_Biological_Process_2018.csv`` through
    ``read_file_2`` and, for every method in it, computes the sample mean
    and a two-sided Student-t confidence interval of the mean.  A Q-Q plot
    is saved per method via ``qq_plot`` to ``../qqplots/qq-<method>.png``.
    The summary table is printed both as a DataFrame and as LaTeX.

    Args:
        confidence: Confidence level of the interval; defaults to 0.95.

    Raises:
        statistics.StatisticsError: If a method has fewer than two values
            (raised by ``statistics.stdev``).
    """
    mthds = read_file_2("../perms2/GO_Biological_Process_2018.csv")
    # Two-sided interval: half of the remaining probability in each tail.
    tail = (1 - confidence) / 2
    excel_rows = [['Method', 'PT. Est', 'lower CI', 'upper CI']]
    for i in mthds:
        values = mthds[i]
        n = len(values)
        mean = statistics.mean(values)
        std = statistics.stdev(values)
        # Degrees of freedom follow the actual sample size (n - 1) instead
        # of a fixed 23, which was only right for exactly 24 samples.
        t_score = abs(t.ppf(tail, n - 1))
        margin = t_score * std / math.sqrt(n)
        x = pd.Series(values)
        mtd = i
        # The lower bound is clipped at 0, as in the original computation.
        lower_ci = max(mean - margin, 0)
        upper_ci = mean + margin
        qq_plot(x, mtd, "../qqplots/qq-" + i + ".png")
        excel_rows.append(
            [mtd, round(mean, 3), round(lower_ci, 3), round(upper_ci, 3)])
    df = pd.DataFrame.from_records(excel_rows[1:], columns=excel_rows[0])
    print(df)
    print(df.to_latex(index=True))
sample = read_file("enrich_red/gae-hom-hom.csv") red = read_file("enrich_red/selected_genesets.csv") temp = {} for i in red: if i in sample: temp[i] = red[i] write_file("enrich_red/selected_genesets.csv", temp) # reduce_genesets() lib = gp.get_library_name('Human')[53] files = [("gae-hom-hom", 1, 1), ("gae-hom-onto", 1, 2), ("gae-onto-onto", 2, 3), ("jcd-hom-hom", 1, 4), ("jcd-onto-onto", 2, 5)] data_desc = read_file_2("data\ms-project\data-description.csv") for i in files: file_name = "enrich_red/" + i[0] + ".csv" file = read_file(file_name) # if i[1] == 1: # neigh = read_file_2("data/ms-project/neig_len_hom.csv") # else: # neigh = read_file_2("data/ms-project/neig_len_onto.csv") if i[2] == 1: rank = read_file_2( "ranking_results/ -- GAE -- Homology -- Homology.csv") elif i[2] == 2: rank = read_file_2( "ranking_results/ -- GAE -- Homology -- Ontology.csv")