def reduce_genesets():
    """Prune the selected gene sets against the gae-hom-hom file.

    Loads ``enrich_red/gae-hom-hom.csv`` and
    ``enrich_red/selected_genesets.csv``, keeps every entry of the latter
    whose key passes an ``in`` test against the former, and overwrites
    ``enrich_red/selected_genesets.csv`` with the entries that were kept.
    """
    reference = read_file("enrich_red/gae-hom-hom.csv")
    selected = read_file("enrich_red/selected_genesets.csv")

    # Entries are visited in the iteration order of the loaded data, so the
    # surviving entries keep their relative order.
    kept = {name: selected[name] for name in selected if name in reference}

    write_file("enrich_red/selected_genesets.csv", kept)
def save_enrichment_set():
    """Query enrichment terms for every cluster and save them in one file.

    For each of the six ``enrich/*.csv`` cluster files, every cluster is
    sent to ``gp.enrichr`` (at most the first 1000 entries of the value
    stored at index 2 of the cluster record, with ``cutoff=0.05``).  The
    ``Term`` column of each result is stored under the key
    ``"<file key>-<cluster id>"`` and the collected dictionary is written to
    ``enrich-cluster/full_result_dic.csv``.

    The run is best-effort: a cluster whose query or result handling raises
    is reported on stdout and skipped, so one failure does not abort the
    whole run.
    """
    # NOTE(review): 53 is a positional index into the list of library names
    # returned for 'Human'; which library that is depends on the order of
    # that list -- confirm it still selects the intended one.
    lib = gp.get_library_name('Human')[53]

    files = [
        ("gcn-hom-hom", "enrich/gcn-hom-hom.csv"),
        ("gcn-hom-onto", "enrich/gcn-hom-onto.csv"),
        ("gcn-onto-onto", "enrich/gcn-onto-onto.csv"),
        ("gae-hom-hom", "enrich/gae-hom-hom.csv"),
        ("gae-hom-onto", "enrich/gae-hom-onto.csv"),
        ("gae-onto-onto", "enrich/gae-onto-onto.csv"),
    ]

    enrich_set = {}
    for key, file in files:
        print(file)
        cluster_data = read_file(file)
        for i in cluster_data:
            genes = cluster_data[i][2]
            print(len(genes))
            name = key + "-" + str(i)
            try:
                enr = gp.enrichr(
                    gene_list=list(genes)[:1000],
                    gene_sets=lib,
                    organism='Human',
                    cutoff=0.05).results
                enrich_set[name] = enr['Term'].to_list()
                print(enr)
            except Exception as exc:
                # Catch Exception rather than using a bare ``except`` so that
                # Ctrl-C / SystemExit still stop the run, and report the
                # skipped cluster instead of dropping it silently.
                print("Skipping %s: %r" % (name, exc))

    write_file("enrich-cluster/full_result_dic.csv", enrich_set)
def line_plots():
    """Plot how many top terms selected pairs of result files share.

    For each (first, second) pair, the matching files under
    ``top_4_bio_process/`` are loaded.  For every entry of the module-level
    ``keys``, the values stored under ``str(entry)`` in both files are
    intersected (an empty set is used when a file has no such key) and the
    size of the intersection is counted into one of five buckets:
    0 (no overlap), 5 (1-4), 10 (5-9), 15 (10-14) and 20 (15 or more).
    The bucket counts of all pairs are shown as a grouped bar chart.
    """
    pairs = [
        ('gae-hom-hom', 'jcd-hom'),
        ('gae-hom-onto', 'jcd-hom'),
        ('gae-onto-onto', 'jcd-onto'),
        ('gae-hom-onto', 'jcd-onto'),
    ]
    # (exclusive upper limit, bucket label); anything at or above the last
    # limit falls into bucket 20.
    bucket_limits = ((1, 0), (5, 5), (10, 10), (15, 15))

    histogram = {}
    for first, second in pairs:
        file_one = read_file("top_4_bio_process/" + first + ".csv")
        file_two = read_file("top_4_bio_process/" + second + ".csv")

        counts = {0: 0, 5: 0, 10: 0, 15: 0, 20: 0}
        histogram[first + "-" + second] = counts

        for j in keys:
            lookup = str(j)
            x = file_one[lookup] if lookup in file_one else set()
            y = file_two[lookup] if lookup in file_two else set()
            shared = len(x & y)

            bucket = 20
            for upper, label in bucket_limits:
                if shared < upper:
                    bucket = label
                    break
            counts[bucket] += 1

    # Counts are plotted raw; no normalisation is applied.
    pd.DataFrame(histogram).plot(kind='bar')
    plt.xlabel("# of nearest x")
    plt.ylabel("# of Occurrence")
    plt.title("# of Nearest-x-genesets in common between selected graphs")
    plt.show()
def scatter_plots():
    """Scatter-plot paired values from two result files with a fitted line.

    Iterates over the module-level ``files``.  Each entry is indexed as
    ``(name_one, name_two, text_x, text_y_r2, text_y_slope)``: the first two
    are file stems under ``enrich_red/`` and the last three position the
    text annotations on the plot.
    NOTE(review): this expects string stems at index 0 and 1 -- confirm that
    the ``files`` value in scope when this is called has that layout.

    The keys of both files are walked in lockstep; for each key pair the
    two value lists are passed to ``res`` in both argument orders and the
    returned items are collected as the (x, y) points.  A linear regression
    is fitted to the points and drawn in red, and R^2 and the slope are
    written onto the figure.

    Pairs that yield fewer than two points are reported and skipped, because
    neither the regression nor ``statistics.stdev`` is defined for them.
    """
    for i in files:
        file_one = "enrich_red/" + i[0] + ".csv"
        file_two = "enrich_red/" + i[1] + ".csv"
        one = read_file(file_one)
        two = read_file(file_two)

        array = []
        for x, y in zip(one, two):
            x_one = list(one[x])
            y_one = list(two[y])
            # Collect the comparison in both directions.
            array.extend(res(x_one, y_one))
            array.extend(res(y_one, x_one))

        if len(array) < 2:
            print("Skipping %s and %s: fewer than two points to plot"
                  % (i[0], i[1]))
            continue

        x, y = zip(*array)
        plt.scatter(x, y)

        linreg = spy.stats.linregress(x, y)
        temp = [linreg.intercept + linreg.slope.item() * k for k in x]
        plt.plot(x, temp, 'r')

        # Leave one standard deviation of head-room above the largest value.
        x_min = min(x)
        x_max = max(x) + statistics.stdev(x)
        y_min = min(y)
        y_max = max(y) + statistics.stdev(y)
        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)

        plt.ylabel("Corresponding P-value")
        plt.xlabel("Actual P-value")
        # ``rvalue`` is the correlation coefficient r; the label promises
        # R^2, so square it.
        plt.text(i[2], i[3], 'R2 = %0.2f' % (linreg.rvalue ** 2))
        plt.text(i[2], i[4], 'Slope = %0.2f' % linreg.slope)
        plt.title(i[0] + " and " + i[1])
        plt.show()
def save_top4_csvs():
    """Save the leading enrichment terms of every cluster, one file per input.

    Iterates over the module-level ``files``; each entry is used as a file
    stem, reading ``enrich/<stem>.csv`` and writing
    ``top_4_bio_process/<stem>.csv``.
    NOTE(review): this expects plain string stems -- confirm that the
    ``files`` value in scope when this is called has that layout.

    For every cluster in the input, the value at index 2 of the cluster
    record is sent to ``gp.enrichr`` using the module-level ``lib`` and
    ``cutoff=0.05``; the ``Term`` values of the first 10 result rows are
    stored as a set under the cluster's key.
    NOTE(review): the function and output directory say "top 4" while
    ``head(10)`` keeps ten rows -- confirm which is intended.

    The run is best-effort: a cluster whose query raises is reported on
    stdout and left out of the output file.
    """
    for i in files:
        file_name = "enrich/" + i + ".csv"
        file = read_file(file_name)

        temp = {}
        for j in file:
            gene_list = list(file[j][2])
            try:
                top_terms = gp.enrichr(
                    gene_list=gene_list,
                    gene_sets=lib,
                    organism='Human',
                    cutoff=0.05).results.head(10)['Term'].tolist()
                temp[j] = set(top_terms)
            except Exception as exc:
                # Catch Exception rather than using a bare ``except`` so that
                # Ctrl-C / SystemExit still stop the run, and report the
                # skipped cluster instead of dropping it silently.
                print("Skipping %s cluster %s: %r" % (i, j, exc))

        write_file("top_4_bio_process/" + i + ".csv", temp)
try: enr = gp.enrichr(gene_list=list(cluster_data[i][2])[:1000], gene_sets=lib, organism='Human', cutoff=0.05).results name = key + "-" + str(i) term = enr['Term'].to_list() enrich_set[name] = term # print(i) print(enr) except: pass write_file("enrich-cluster/full_result_dic.csv", enrich_set) save_enrichment_set() x = read_file("enrich-cluster/full_result_dic.csv") l1 = {} for i in x.keys(): l2 = [] for j in x.keys(): l2.append(jac_sim(x[i], x[j])) l1[i] = l2 df = pd.DataFrame.from_dict(l1, orient='index').transpose() col = {} col_name = list(df.columns.values) for i in range(len(col_name)): col[col_name[i]] = i print(col)
if i in sample: temp[i] = red[i] write_file("enrich_red/selected_genesets.csv", temp) # reduce_genesets() lib = gp.get_library_name('Human')[53] files = [("gae-hom-hom", 1, 1), ("gae-hom-onto", 1, 2), ("gae-onto-onto", 2, 3), ("jcd-hom-hom", 1, 4), ("jcd-onto-onto", 2, 5)] data_desc = read_file_2("data\ms-project\data-description.csv") for i in files: file_name = "enrich_red/" + i[0] + ".csv" file = read_file(file_name) # if i[1] == 1: # neigh = read_file_2("data/ms-project/neig_len_hom.csv") # else: # neigh = read_file_2("data/ms-project/neig_len_onto.csv") if i[2] == 1: rank = read_file_2( "ranking_results/ -- GAE -- Homology -- Homology.csv") elif i[2] == 2: rank = read_file_2( "ranking_results/ -- GAE -- Homology -- Ontology.csv") elif i[2] == 3: rank = read_file_2( "ranking_results/ -- GAE -- Ontology -- Ontology.csv") elif i[2] == 4: