Exemple #1
0
def build_hierarcy():
    """Write the PPI edges that connect two leaf GO terms to a text file.

    The edge scores come from ``fetch_string_ppi_edges()``, a mapping whose
    keys look like ``"<term>=<term>"``.  An edge is kept when both terms are
    vertices of the hierarchy extracted under ``GO:0005575``, both are flagged
    ``isleaf`` and the score is greater than 1000.  Every kept edge becomes one
    ``"<edge>\\t<score>"`` line of ``GO_edges_ppi_filtered.txt`` inside
    ``constants.OUTPUT_GLOBAL_DIR``.

    Returns:
        None.  The function only prints progress and writes files.
    """
    root_term = 'GO:0005575'

    # print("...") with a single argument behaves the same on Python 2 and 3.
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()

    go2geneids, _geneids2go = fetch_go_hierarcy()
    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    hier_path = os.path.join(constants.BASE_PROFILE, "output",
                             "go_hierarcy.txt")
    dict_result = {}
    # The handle is passed to extract_hier_all (presumably as its report
    # stream); the with-block guarantees it is closed afterwards.
    with open(hier_path, "w+") as out:
        for cur_term in [root_term]:
            vertices, edges = extract_hier_all(gosubdag, out, cur_term,
                                               go2geneids)
            dict_result[cur_term] = {"vertices": vertices, "edges": edges}

    go_vertices = dict_result[root_term]['vertices']

    def is_leaf(term):
        # True only for terms that are part of the hierarchy and are leaves.
        return term in go_vertices and go_vertices[term]['isleaf']

    lines = []
    for cur_edges, score in go_edges.items():
        endpoints = cur_edges.split("=")
        if is_leaf(endpoints[0]) and is_leaf(endpoints[1]) and score > 1000:
            lines.append("{}\t{}\n".format(cur_edges, score))

    print("about to write filtered ppi go edges to file ({} lines)".format(
        len(lines)))
    target = os.path.join(constants.OUTPUT_GLOBAL_DIR,
                          "GO_edges_ppi_filtered.txt")
    with open(target, "w+") as f:
        f.writelines(lines)
Exemple #2
0
def build_hierarcy():
    """Store the PPI edges that connect two leaf GO terms in a Neo4j database.

    The edge scores come from ``fetch_string_ppi_edges()``, a mapping whose
    keys look like ``"<term>=<term>"``.  An edge is written when both terms are
    vertices of the hierarchy extracted under ``GO:0005575``, both are flagged
    ``isleaf`` and the score is greater than 100000.  Each such edge is merged
    as ``(:GO {term})-[:SCR {score}]->(:GO {term})`` in its own write
    transaction, and the number of written edges is printed at the end.

    Returns:
        None.
    """
    root_term = 'GO:0005575'

    # print("...") with a single argument behaves the same on Python 2 and 3.
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()

    go2geneids, _geneids2go = fetch_go_hierarcy()

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    hier_path = os.path.join(constants.BASE_PROFILE, "output",
                             "go_hierarcy.txt")
    dict_result = {}
    # Close the hierarchy file deterministically once extraction is done.
    with open(hier_path, "w+") as out:
        for cur_term in [root_term]:
            vertices, edges = extract_hier_all(gosubdag, out, cur_term,
                                               go2geneids)
            dict_result[cur_term] = {"vertices": vertices, "edges": edges}

    go_vertices = dict_result[root_term]['vertices']

    def is_leaf(term):
        # True only for terms that are part of the hierarchy and are leaves.
        return term in go_vertices and go_vertices[term]['isleaf']

    def add_edge(tx, src, dst, score):
        # NOTE(review): the statement is assembled with str.format, so a term
        # containing a double quote would break it.  Query parameters
        # ($src / $dst / $score) would be safer, but the runtime type of
        # `score` is not visible here, so the text is left unchanged.
        query = ("MERGE (n1: GO{{term:\"{TERM1}\"}})"
                 "MERGE (n2: GO{{term:\"{TERM2}\"}})"
                 "MERGE (n1)-[r:SCR {{ score: {SCORE} }}]->(n2)")
        tx.run(query.format(TERM1=src, TERM2=dst, SCORE=score))

    # NOTE(review): endpoint and credentials are hard-coded in source; they
    # should come from configuration or the environment.
    driver = GraphDatabase.driver("bolt://localhost:7687",
                                  auth=("neo4j", "Hh123456"))
    try:
        with driver.session() as session:
            count = 0
            for cur_edges, score in go_edges.items():
                endpoints = cur_edges.split("=")
                if (is_leaf(endpoints[0]) and is_leaf(endpoints[1])
                        and score > 100000):
                    count += 1
                    session.write_transaction(add_edge, endpoints[0],
                                              endpoints[1], score)
            print("total edges: {}".format(count))
    finally:
        # Release the driver's connections even if a transaction fails.
        driver.close()
Exemple #3
0
def build_hierarcy(go_folder, roots=None,
                   ev_exclude=None):  #  0008150 0005575 0003674
    """Extract the GO hierarchy below each requested root term.

    Args:
        go_folder: forwarded unchanged to ``fetch_go_hierarcy``.
        roots: iterable of GO root term ids; defaults to ``['GO:0008150']``.
        ev_exclude: forwarded to ``fetch_go_hierarcy`` (presumably evidence
            codes to skip -- confirm against that helper); defaults to a fresh
            empty set.

    Returns:
        A 4-tuple ``(dict_result, go2geneids, geneids2go, entrez2ensembl)``
        where ``dict_result`` maps every root to
        ``{"vertices": ..., "edges": ...}`` as returned by
        ``extract_hier_all`` and ``entrez2ensembl`` is the value of
        ``get_entrez2ensembl_dictionary()``.
    """
    # Fresh containers per call: the previous mutable defaults were shared
    # between calls and handed to a helper that is free to mutate them.
    if roots is None:
        roots = ['GO:0008150']
    if ev_exclude is None:
        ev_exclude = set()

    go2geneids, geneids2go = fetch_go_hierarcy(go_folder, ev_exclude)
    # NOTE(review): the DAG path comes from constants.GO_DIR, not from
    # go_folder -- confirm this is intended.
    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    tic = timeit.default_timer()
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)
    toc = timeit.default_timer()

    # The timing only covers loading the DAG, so the message is the same for
    # every root; it is still emitted once per root, as before.
    msg = "Elapsed HMS: {}\n\n".format(
        str(datetime.timedelta(seconds=(toc - tic))))

    dict_result = {}
    for cur_term in roots:
        vertices, edges = extract_hier_all(gosubdag, cur_term, go2geneids)
        sys.stdout.write(msg)
        dict_result[cur_term] = {"vertices": vertices, "edges": edges}
    return dict_result, go2geneids, geneids2go, get_entrez2ensembl_dictionary()
def get_highest_ic():
    """Return the content of the JSON file at ``HIGHEST_IC_FILE_PATH``.

    When that file does not exist yet, the GO DAG is loaded from
    ``GO_DAG_FILE_PATH`` (with its progress output sent to ``os.devnull``) and
    ``compute_highest_inc_parallel`` is called with all DAG keys; that call is
    expected to create the file -- confirm against its implementation.

    Returns:
        The object decoded from the JSON file.
    """
    if not os.path.isfile(HIGHEST_IC_FILE_PATH):
        # Close the devnull sink once loading is done instead of leaking it.
        # Assumes GODag only writes to `prt` while being constructed.
        with open(os.devnull, 'w') as devnull:
            go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
        compute_highest_inc_parallel(list(go_dag.keys()))

    # The with-block closes the file even when json.load raises.
    with open(HIGHEST_IC_FILE_PATH, 'r') as ic_file:
        return json.load(ic_file)
Exemple #5
0
def test_all():
    """Load the configured GO data, then print the hierarchy of the mini OBO.

    The full DAG and the gene2go associations are loaded from
    ``constants.GO_DIR`` but not used afterwards; only the bundled
    ``data/mini_obo.obo`` file is written to stdout via ``write_hier_all``.
    """
    go_dir = constants.GO_DIR

    full_obo_path = os.path.join(go_dir, constants.GO_FILE_NAME)
    _full_dag = GODag(full_obo_path)

    assoc_path = os.path.join(go_dir, constants.GO_ASSOCIATION_FILE_NAME)
    _associations = read_ncbi_gene2go(assoc_path, no_top=True)

    # The mini OBO file lives next to this module.
    here = os.path.dirname(os.path.abspath(__file__))
    mini_dag = GODag(os.path.join(here, "data/mini_obo.obo"))
    mini_subdag = GoSubDag(mini_dag.keys(), mini_dag)

    write_hier_all(mini_subdag, sys.stdout)
Exemple #6
0
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148, Lin Similarity if a term has no annotations

    Args:
        do_plt: when true, ``_do_plt`` is called with the term counts and the
            DAG before the similarity values are calculated.
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)

    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)

    if do_plt:
        _do_plt(tcntobj, godag)

    goids = list(godag.keys())

    # Calculate Resnik values for every unordered pair, self-pairs included
    p2r = {
        frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Resnik', goids, p2r)

    # Calculate Lin values for the same pairs and check them
    p2l = {
        frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)
    # The copy of the two calculations that used to follow a bare `return`
    # here could never run and has been removed.
Exemple #7
0
def test_all():
    """Print every hierarchy report for the bundled mini OBO file.

    Only the loading of the DAG is timed; the elapsed time is written to
    stdout after all reports.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    obo_path = os.path.join(here, "data/mini_obo.obo")

    started = timeit.default_timer()
    godag = GODag(obo_path)
    gosubdag = GoSubDag(godag.keys(), godag)
    finished = timeit.default_timer()

    out = sys.stdout
    # Same reports, same order as before.
    report_writers = (
        write_hier_all,
        write_hier_norep,
        write_hier_lim,
        write_hier_mrk_lst,
        write_hier_mrk_dct,
        write_hier_up,
    )
    for write_report in report_writers:
        write_report(gosubdag, out)

    elapsed = datetime.timedelta(seconds=(finished - started))
    sys.stdout.write("Elapsed HMS: {}\n\n".format(str(elapsed)))
Exemple #8
0
def build_hierarcy(roots=None):
    """Extract the GO hierarchy below each requested root term.

    Args:
        roots: iterable of GO root term ids; defaults to ``['GO:0005575']``.

    Returns:
        A 4-tuple ``(dict_result, go2geneids, geneids2go, entrez2ensembl)``
        where ``dict_result`` maps every root to
        ``{"vertices": ..., "edges": ...}`` as returned by
        ``extract_hier_all`` and ``entrez2ensembl`` is the value of
        ``get_entrez2ensembl_dictionary()``.
    """
    # A fresh list per call instead of a mutable default shared across calls.
    if roots is None:
        roots = ['GO:0005575']

    go2geneids, geneids2go = fetch_go_hierarcy()
    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    tic = timeit.default_timer()
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)
    toc = timeit.default_timer()

    # The timing only covers loading the DAG, so the message is the same for
    # every root; it is still emitted once per root, as before.
    msg = "Elapsed HMS: {}\n\n".format(
        str(datetime.timedelta(seconds=(toc - tic))))

    hier_path = os.path.join(constants.BASE_PROFILE, "output",
                             "go_hierarcy.txt")
    dict_result = {}
    # open() works on Python 2 and 3; the with-block closes the handle that
    # the old file(...) call leaked.
    with open(hier_path, "w+") as out:
        for cur_term in roots:
            vertices, edges = extract_hier_all(gosubdag, out, cur_term,
                                               go2geneids)
            sys.stdout.write(msg)
            dict_result[cur_term] = {"vertices": vertices, "edges": edges}
    return dict_result, go2geneids, geneids2go, get_entrez2ensembl_dictionary()
Exemple #9
0
def build_hierarcy():
    """Pick a sub-network of leaf GO terms with ``pcst_fast`` and plot it.

    Steps, in order:

    1. Fetch the PPI edge scores, the GO dictionary and the prize dictionary,
       then extract the hierarchy (vertices, edges, layers) under
       ``GO:0005575``.
    2. Collect every vertex flagged ``isleaf``.  Its raw prize is
       ``(len(layers) - i) / len(layers)`` where ``i`` is the index of the
       first layer containing the term, or 0 when no layer contains it.
    3. Keep the PPI edges whose two endpoints are collected leaves and whose
       score is greater than 10000.  The edge cost is the product of the two
       terms' ``"ENSP"`` list lengths divided by the score; edges with a zero
       product are dropped.
    4. Divide all costs by ``max_cost - min_cost``, scale the prizes by the
       60th percentile of the costs, and call ``pcst_fast.pcst_fast``.
    5. Draw the selected vertices and edges with networkx and save the figure
       as ``PCST_<timestamp>.png`` in ``constants.OUTPUT_GLOBAL_DIR``.

    Returns:
        None.

    Raises:
        ValueError: if no PPI edge passes the filter of step 3.
    """
    root_term = 'GO:0005575'

    # print("...") with a single argument behaves the same on Python 2 and 3.
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()
    go_dict = get_go_dict()
    prize_dictionaty = get_prize_dict()

    go2geneids, _geneids2go = fetch_go_hierarcy()
    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    hier_path = os.path.join(constants.BASE_PROFILE, "output",
                             "go_hierarcy.txt")
    dict_result = {}
    # Close the hierarchy file deterministically once extraction is done.
    with open(hier_path, "w+") as out:
        for cur_term in [root_term]:
            vertices, edges, _hier_graph, layers = extract_hier_all(
                gosubdag, out, cur_term, go2geneids, prize_dictionaty)
            dict_result[cur_term] = {"vertices": vertices, "edges": edges}
            print(layers)

    go_vertices = dict_result[root_term]['vertices']
    n_layers = len(layers)

    def first_layer_index(term):
        # Index of the first layer containing `term`, or None if none does.
        return next(
            (i for i, layer in enumerate(layers) if term in layer), None)

    # --- vertices: every leaf term, with a prize that shrinks with depth ---
    vertices_grid = []
    vertices_grid_values = []
    vertices_prizes = []
    for term, attrs in go_vertices.items():
        if not attrs['isleaf']:
            continue
        vertices_grid.append(term)
        vertices_grid_values.append(attrs)
        layer_idx = first_layer_index(term)
        if layer_idx is None:
            vertices_prizes.append(0)
        else:
            vertices_prizes.append((n_layers - layer_idx) / float(n_layers))
    print("total vartices: {}".format(len(vertices_grid)))

    # Position of every leaf term in vertices_grid: an O(1) lookup instead of
    # a list.index() scan per edge.  Membership here also means "is a
    # hierarchy vertex and is a leaf".
    leaf_index = {term: i for i, term in enumerate(vertices_grid)}

    # --- edges: PPI links between two leaves, cost inverse to the score ---
    edges_grid = []
    edges_costs = []
    for cur_edges, score in go_edges.items():
        endpoints = cur_edges.split("=")
        if not (endpoints[0] in leaf_index and endpoints[1] in leaf_index
                and score > 10000):
            continue
        cost = (len(go_dict[endpoints[0]]["ENSP"]) *
                len(go_dict[endpoints[1]]["ENSP"]))
        if cost != 0:
            edges_grid.append(
                [leaf_index[endpoints[0]], leaf_index[endpoints[1]]])
            edges_costs.append(cost / float(score))
    print("total edges: {}".format(len(edges_costs)))

    if not edges_costs:
        # np.min / np.max would fail on an empty array with a far less
        # helpful message.
        raise ValueError("no PPI edge between leaf GO terms passed the filter")

    edges_costs = np.array(edges_costs, dtype=np.float64)
    cost_range = np.max(edges_costs) - np.min(edges_costs)
    if cost_range > 0:
        edges_costs /= cost_range
    # else: all costs are equal; dividing by zero would turn every cost into
    # inf, so the costs are left unscaled.

    percentiles = [np.percentile(edges_costs, x * 10) for x in range(11)]
    print("edge precentiles: {}".format(percentiles))
    print(np.min(edges_costs))

    # Prizes are expressed relative to the 60th percentile of the edge costs.
    vertices_prizes = (np.array(vertices_prizes, dtype=np.float64) *
                       percentiles[6])
    edges_grid = np.array(edges_grid, dtype=np.int64)

    root = -1
    num_clusters = 1
    pruning = 'strong'  # 'none'
    verbosity_level = 0
    picked_vertices, picked_edges = pcst_fast.pcst_fast(
        edges_grid, vertices_prizes, edges_costs, root, num_clusters, pruning,
        verbosity_level)

    # --- plot the selected sub-network, coloured by hierarchy level ---
    G = nx.Graph()
    c_values = {}
    labels = {}
    for cur_v in picked_vertices:
        term = vertices_grid[cur_v]
        term_name = vertices_grid_values[cur_v]["name"]
        level = first_layer_index(term)
        if level is None:
            # Terms found in no layer are placed below the deepest level.
            level = n_layers
        G.add_node(term, name=term_name, level=level)
        c_values[term] = 1 - level / float(n_layers)
        labels[term] = term_name
    c_list = [c_values[x] for x in G.nodes()]
    # nodes(data=True) yields (node, attrs) pairs in every networkx release,
    # whereas the Graph.node attribute no longer exists in recent ones.
    print([attrs for _, attrs in G.nodes(data=True)])

    for cur_e in picked_edges:
        G.add_edge(vertices_grid[edges_grid[cur_e][0]],
                   vertices_grid[edges_grid[cur_e][1]])

    nx.draw_networkx(G,
                     cmap=plt.get_cmap('jet'),
                     node_color=c_list,
                     labels=labels,
                     font_size=8)
    plt.savefig(
        os.path.join(constants.OUTPUT_GLOBAL_DIR,
                     "PCST_{}.png".format(time.time())))
Exemple #10
0
def build_hierarcy():
    # Builds a graph of leaf GO terms (vertices) and PPI links between them
    # (edges), runs pcst_fast.pcst_fast on it and prints the selected vertices
    # and edges.  Nothing is returned in the part of the function shown here.
    #
    # NOTE(review): Python 2 only as written (print statement, file(),
    # dict.has_key, dict.iteritems).
    print "fetching ppi"
    # Mapping of "<term>=<term>" keys to a numeric score (see the loop below).
    go_edges = fetch_string_ppi_edges()
    go_dict = get_go_dict()

    go2geneids, geneids2go = fetch_go_hierarcy()
    # NOTE(review): the string below is a no-op expression statement, not a
    # docstring, and its text does not describe this function.
    """Run numerous tests for various reports."""
    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    # tic/toc time the DAG loading but are not read again in this block.
    tic = timeit.default_timer()
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)
    toc = timeit.default_timer()
    # NOTE(review): this handle is passed to extract_hier_all and is never
    # closed in the visible code.
    out = file(
        os.path.join(constants.BASE_PROFILE, "output", "go_hierarcy.txt"),
        "w+")  # sys.stdout
    dict_result = {}
    for cur_term in ['GO:0005575']:
        vertices, edges = extract_hier_all(gosubdag, out, cur_term, go2geneids)
        dict_result[cur_term] = {"vertices": vertices, "edges": edges}

    # Collect every hierarchy vertex flagged 'isleaf'.  The three lists are
    # index-aligned: term id, vertex attributes, prize.
    count = 0
    vertices_grid = []
    vertices_grid_values = []
    vertices_prizes = []
    for k, v in dict_result['GO:0005575']['vertices'].iteritems():
        # has_key(k) is always true here because k comes from the same dict.
        if dict_result['GO:0005575']['vertices'].has_key(k) \
                        and dict_result['GO:0005575']['vertices'][k]['isleaf']:
            vertices_grid.append(k)
            vertices_grid_values.append(v)
            # Placeholder value: every prize is replaced by 0.0005 further down.
            vertices_prizes.append(1 / float(10000000))  #
            count += 1
    print "total vartices: {}".format(count)

    # Keep the PPI edges whose two endpoints are leaf vertices and whose score
    # is greater than 10000.
    count = 0
    edges_grid = []
    edges_costs = []
    for cur_edges, score in go_edges.iteritems():

        vertices = cur_edges.split("=")
        if dict_result['GO:0005575']['vertices'].has_key(vertices[0]) and dict_result['GO:0005575'][
            'vertices'].has_key(vertices[1]) and score > 10000 \
                and dict_result['GO:0005575']['vertices'][vertices[0]]['isleaf'] and \
                dict_result['GO:0005575']['vertices'][vertices[1]]['isleaf']:

            # Raw cost: product of the lengths of the two terms' "ENSP" entries.
            cost = (len(go_dict[vertices[0]]["ENSP"]) *
                    len(go_dict[vertices[1]]["ENSP"]))
            # cost_alt is computed but not used in the visible code.
            cost_alt = (len(go2geneids[vertices[0]]) *
                        len(go2geneids[vertices[1]]))
            # print "cost/alt: {}, {}:".format(cost, cost_alt)
            if cost != 0:
                # A higher score gives a cheaper edge.
                cost = cost / float(score)
                # Edges are stored as pairs of positions in vertices_grid.
                # NOTE(review): list.index is a linear scan per endpoint.
                edges_grid.append([
                    vertices_grid.index(vertices[0]),
                    vertices_grid.index(vertices[1])
                ])
                # print cost
                edges_costs.append(cost)
                count += 1
    print "total edges: {}".format(count)
    edges_costs = np.array(edges_costs, dtype=np.float64)
    # NOTE(review): np.min/np.max raise on an empty array, i.e. when no edge
    # passed the filter above.
    min_cost = np.min(edges_costs)
    max_cost = np.max(edges_costs)
    # Every cost is divided by the cost range; the minimum is not subtracted.
    # NOTE(review): the divisor is zero when all costs are equal.
    for i, cur_cost in enumerate(edges_costs):
        edges_costs[i] = cur_cost / (max_cost - min_cost)
    # 0th, 10th, ..., 100th percentile of the scaled costs (printed only).
    percentiles = [np.percentile(edges_costs, x * 10) for x in range(11)]
    print "edge precentiles: {}".format(percentiles)
    print np.min(edges_costs)
    # Uniform prize for every vertex; discards the values appended above.
    vertices_prizes = [0.0005 for x in vertices_prizes]
    edges_grid = np.array(edges_grid).astype(np.int64)
    vertices_prizes = np.array(vertices_prizes).astype(np.float64)
    edges_costs = np.array(edges_costs).astype(np.float64)
    # presumably root = -1 requests an unrooted solution -- confirm against
    # the pcst_fast documentation
    root = -1
    num_clusters = 1
    pruning = 'strong'  # 'none'
    verbosity_level = 0
    # Judging by how they are used below, the results are positions into
    # vertices_grid and into edges_grid respectively.
    vertices, edges = pcst_fast.pcst_fast(edges_grid, vertices_prizes,
                                          edges_costs, root, num_clusters,
                                          pruning, verbosity_level)
    print vertices_prizes
    print edges_costs
    # Selected vertices: first as term ids, then as names.
    print "vertices"
    print[vertices_grid[x] for x in vertices]
    print[vertices_grid_values[x]["name"] for x in vertices]
    # Selected edges: first as "id=id", then as "name = name".
    print "edges"
    print[
        "{}={}".format(vertices_grid[edges_grid[x][0]],
                       vertices_grid[edges_grid[x][1]]) for x in edges
    ]
    print[
        "{} = {}".format(vertices_grid_values[int(edges_grid[x][0])]["name"],
                         vertices_grid_values[int(edges_grid[x][1])]["name"])
        for x in edges
    ]