Esempio n. 1
0
def reduce_genesets():
    """Restrict the selected gene sets to keys present in the gae-hom-hom file.

    Loads ``enrich_red/gae-hom-hom.csv`` and
    ``enrich_red/selected_genesets.csv``, keeps only those entries of the
    latter whose key also appears in the former, and overwrites
    ``enrich_red/selected_genesets.csv`` with the filtered mapping.
    """
    reference = read_file("enrich_red/gae-hom-hom.csv")
    selected = read_file("enrich_red/selected_genesets.csv")
    # Keep an entry only when its key is also found in the reference data.
    kept = {key: selected[key] for key in selected if key in reference}
    write_file("enrich_red/selected_genesets.csv", kept)
Esempio n. 2
0
def save_enrichment_set():
    """Run an enrichment query for every cluster and save the returned terms.

    For each of the six ``enrich/*.csv`` files, every entry read from the
    file is queried through ``gp.enrichr`` using at most the first 1000
    items of the entry's element at index 2. The ``Term`` column of each
    result is stored under the key ``"<file key>-<entry key>"`` and the
    whole mapping is written to ``enrich-cluster/full_result_dic.csv``.

    A query that fails is reported and skipped, so one bad cluster does not
    abort the run. Only ``Exception`` is caught: a bare ``except`` would
    also swallow ``KeyboardInterrupt`` and make the loop impossible to stop.
    """
    # NOTE(review): index 53 depends on the order of the list returned by
    # gp.get_library_name -- confirm which library it is meant to select.
    lib = gp.get_library_name('Human')[53]

    files = [("gcn-hom-hom", "enrich/gcn-hom-hom.csv"),
             ("gcn-hom-onto", "enrich/gcn-hom-onto.csv"),
             ("gcn-onto-onto", "enrich/gcn-onto-onto.csv"),
             ("gae-hom-hom", "enrich/gae-hom-hom.csv"),
             ("gae-hom-onto", "enrich/gae-hom-onto.csv"),
             ("gae-onto-onto", "enrich/gae-onto-onto.csv")]

    enrich_set = {}
    for key, file in files:
        print(file)
        cluster_data = read_file(file)
        for i in cluster_data:
            print(len(cluster_data[i][2]))
            name = key + "-" + str(i)
            try:
                enr = gp.enrichr(gene_list=list(cluster_data[i][2])[:1000],
                                 gene_sets=lib,
                                 organism='Human',
                                 cutoff=0.05).results
                term = enr['Term'].to_list()
            except Exception as exc:
                # Best effort: report the failed cluster and carry on.
                print("enrichment failed for %s: %r" % (name, exc))
                continue
            enrich_set[name] = term
            print(enr)

    write_file("enrich-cluster/full_result_dic.csv", enrich_set)
Esempio n. 3
0
def line_plots():
    """Plot how many terms selected pairs of result files have in common.

    For each pair of files under ``top_4_bio_process/`` and for every entry
    of the module-level ``keys``, the two term sets stored under ``str(key)``
    are intersected (a missing key counts as an empty set). The size of the
    intersection is tallied into one of five buckets:

    * ``0``  -- no common term
    * ``5``  -- 1 to 4 common terms
    * ``10`` -- 5 to 9 common terms
    * ``15`` -- 10 to 14 common terms
    * ``20`` -- 15 or more common terms

    The tallies of all pairs are shown together as a grouped bar chart.
    """
    graph_pairs = [('gae-hom-hom', 'jcd-hom'), ('gae-hom-onto', 'jcd-hom'),
                   ('gae-onto-onto', 'jcd-onto'), ('gae-hom-onto', 'jcd-onto')]

    histogram = {}
    for first, second in graph_pairs:
        terms_first = read_file("top_4_bio_process/" + first + ".csv")
        terms_second = read_file("top_4_bio_process/" + second + ".csv")

        counts = {0: 0, 5: 0, 10: 0, 15: 0, 20: 0}
        histogram[first + "-" + second] = counts

        for j in keys:
            lookup = str(j)
            left = terms_first[lookup] if lookup in terms_first else set()
            right = terms_second[lookup] if lookup in terms_second else set()
            overlap = len(left & right)

            if overlap == 0:
                bucket = 0
            else:
                # First upper bound the overlap stays below; 20 is the
                # catch-all bucket for everything from 15 upwards.
                bucket = next(
                    (bound for bound in (5, 10, 15) if overlap < bound), 20)
            counts[bucket] += 1

    pd.DataFrame(histogram).plot(kind='bar')
    plt.xlabel("# of nearest x")
    plt.ylabel("# of Occurrence")
    plt.title("# of Nearest-x-genesets in common between selected graphs")
    plt.show()
Esempio n. 4
0
def scatter_plots():
    """Scatter-plot paired values of two result files with a regression line.

    Iterates the module-level ``files``; each entry is indexed as
    ``(first name, second name, text x, text y for R2, text y for slope)``.
    For every entry the two ``enrich_red/<name>.csv`` files are read, their
    keys are paired positionally, and ``res`` is applied in both directions
    to collect ``(x, y)`` points. The points are drawn together with the
    least-squares line, and the figure is annotated with the coefficient of
    determination and the slope.

    NOTE(review): ``zip(one, two)`` pairs keys by iteration order, not by
    equality -- confirm both files list their keys in the same order.
    """
    for i in files:
        file_one = "enrich_red/" + i[0] + ".csv"
        file_two = "enrich_red/" + i[1] + ".csv"
        one = read_file(file_one)
        two = read_file(file_two)

        array = []
        for x, y in zip(one, two):
            x_one = list(one[x])
            y_one = list(two[y])

            # Compare in both directions so the point cloud is symmetric in
            # the two inputs.
            array.extend(res(x_one, y_one))
            array.extend(res(y_one, x_one))

        x, y = zip(*array)

        plt.scatter(x, y)
        linreg = spy.stats.linregress(x, y)
        # Plain arithmetic works whether the slope is a NumPy scalar or a
        # Python float; calling .item() on it would fail for the latter.
        temp = [linreg.intercept + linreg.slope * k for k in x]
        plt.plot(x, temp, 'r')

        # Pad the upper limits by one standard deviation so the annotations
        # have room above/right of the data.
        x_min = min(x)
        x_max = max(x) + statistics.stdev(x)

        y_min = min(y)
        y_max = max(y) + statistics.stdev(y)

        plt.xlim(x_min, x_max)
        plt.ylim(y_min, y_max)

        plt.ylabel("Corresponding P-value")
        plt.xlabel("Actual P-value")
        # rvalue is the Pearson correlation coefficient r; the value labelled
        # "R2" must be its square (the coefficient of determination).
        plt.text(i[2], i[3], 'R2 = %0.2f' % (linreg.rvalue ** 2))
        plt.text(i[2], i[4], 'Slope = %0.2f' % linreg.slope)
        plt.title(i[0] + " and " + i[1])
        plt.show()
Esempio n. 5
0
def save_top4_csvs():
    """Save the leading enrichment terms of every cluster, one file per input.

    For each name in the module-level ``files``, ``enrich/<name>.csv`` is
    read and every entry's element at index 2 is sent to ``gp.enrichr``
    with the module-level ``lib``. The ``Term`` values of the first 10
    result rows are stored as a set under the entry's key, and the mapping
    is written to ``top_4_bio_process/<name>.csv``.

    A failed query is reported and that entry is left out of the output.
    Only ``Exception`` is caught, so ``KeyboardInterrupt`` still stops the
    run.

    NOTE(review): the function and directory names say "top 4" while the
    code keeps the first 10 rows -- confirm which is intended.
    """
    for i in files:
        file_name = "enrich/" + i + ".csv"
        file = read_file(file_name)

        temp = {}
        for j in file:
            gene_list = list(file[j][2])

            try:
                terms = gp.enrichr(
                    gene_list=gene_list,
                    gene_sets=lib,
                    organism='Human',
                    cutoff=0.05).results.head(10)['Term'].tolist()
            except Exception as exc:
                # Best effort: report the failed entry and carry on.
                print("enrichment failed for %s entry %s: %r" % (i, j, exc))
                continue
            temp[j] = set(terms)
        write_file("top_4_bio_process/" + i + ".csv", temp)
Esempio n. 6
0
              try:
                  enr = gp.enrichr(gene_list=list(cluster_data[i][2])[:1000], gene_sets=lib, organism='Human', cutoff=0.05).results
                  name = key + "-" + str(i)
                  term = enr['Term'].to_list()
                  enrich_set[name] = term
                  # print(i)
                  print(enr)
              except:
                   pass


     write_file("enrich-cluster/full_result_dic.csv", enrich_set)
save_enrichment_set()


x = read_file("enrich-cluster/full_result_dic.csv")

# For every key i, the list of jac_sim(x[i], x[j]) taken over all keys j,
# in the iteration order of x.
l1 = {i: [jac_sim(x[i], x[j]) for j in x.keys()] for i in x.keys()}


# Keys of l1 become rows with orient='index'; the transpose turns them into
# columns.
df = pd.DataFrame.from_dict(l1, orient='index').transpose()
col_name = list(df.columns.values)
# Map each column label to its position in the frame.
col = {label: position for position, label in enumerate(col_name)}
print(col)
Esempio n. 7
0
        if i in sample:
            temp[i] = red[i]
    write_file("enrich_red/selected_genesets.csv", temp)


# reduce_genesets()

# NOTE(review): index 53 depends on the order of the list returned by
# gp.get_library_name -- confirm which library it is meant to select.
lib = gp.get_library_name('Human')[53]
# Each entry: (file stem under enrich_red/, neighbour-file selector used only
# by commented-out code, selector for the ranking-results file).
files = [("gae-hom-hom", 1, 1), ("gae-hom-onto", 1, 2),
         ("gae-onto-onto", 2, 3), ("jcd-hom-hom", 1, 4),
         ("jcd-onto-onto", 2, 5)]
# Forward slashes, like every other path in this file: the previous literal
# used "\m" and "\d", which are invalid escape sequences in a normal string
# and only resolved as a path on Windows.
data_desc = read_file_2("data/ms-project/data-description.csv")

for i in files:
    file_name = "enrich_red/" + i[0] + ".csv"
    file = read_file(file_name)
    # if i[1] == 1:
    #     neigh = read_file_2("data/ms-project/neig_len_hom.csv")
    # else:
    #     neigh = read_file_2("data/ms-project/neig_len_onto.csv")

    if i[2] == 1:
        rank = read_file_2(
            "ranking_results/ -- GAE -- Homology -- Homology.csv")
    elif i[2] == 2:
        rank = read_file_2(
            "ranking_results/ -- GAE -- Homology -- Ontology.csv")
    elif i[2] == 3:
        rank = read_file_2(
            "ranking_results/ -- GAE -- Ontology -- Ontology.csv")
    elif i[2] == 4: