def build_hierarcy():
    """Write the high-scoring leaf-to-leaf GO PPI edges to a text file.

    Steps:
      1. Fetch the PPI edge scores (``fetch_string_ppi_edges``) and the
         GO-to-gene mapping (``fetch_go_hierarcy``).
      2. Load the GO DAG and extract the hierarchy below ``GO:0005575``
         with ``extract_hier_all``; the file ``go_hierarcy.txt`` is handed to
         that helper as its output stream.
      3. Keep every edge whose two endpoints are both vertices of that
         hierarchy, both flagged ``isleaf``, and whose score exceeds 1000.
      4. Write the surviving edges, one ``<edge>\\t<score>`` line each, to
         ``GO_edges_ppi_filtered.txt`` under ``constants.OUTPUT_GLOBAL_DIR``.

    Edge keys are split on ``"="`` to obtain the two endpoint ids.

    Returns:
        None.  The results are the files written as a side effect.
    """
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()
    go2geneids, _ = fetch_go_hierarcy()

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    root_term = 'GO:0005575'
    hierarchy_path = os.path.join(
        constants.BASE_PROFILE, "output", "go_hierarcy.txt")
    # The context manager guarantees the hierarchy file is flushed and closed
    # even if the extraction raises.
    with open(hierarchy_path, "w+") as out:
        go_vertices, _ = extract_hier_all(
            gosubdag, out, root_term, go2geneids)

    lines = []
    for cur_edges, score in go_edges.items():
        endpoints = cur_edges.split("=")
        # The checks keep their original left-to-right order so that
        # short-circuiting protects the later lookups.
        if (endpoints[0] in go_vertices
                and endpoints[1] in go_vertices
                and score > 1000
                and go_vertices[endpoints[0]]['isleaf']
                and go_vertices[endpoints[1]]['isleaf']):
            lines.append("{}\t{}\n".format(cur_edges, score))

    print("about to write filtered ppi go edges to file ({} lines)".format(
        len(lines)))
    filtered_path = os.path.join(
        constants.OUTPUT_GLOBAL_DIR, "GO_edges_ppi_filtered.txt")
    with open(filtered_path, "w+") as f:
        f.writelines(lines)
def build_hierarcy():
    """Load the high-scoring leaf-to-leaf GO PPI edges into Neo4j.

    Fetches the PPI edge scores and the GO-to-gene mapping, extracts the
    hierarchy below ``GO:0005575`` (``go_hierarcy.txt`` is handed to
    ``extract_hier_all`` as its output stream), and then, for every edge
    whose two endpoints are both ``isleaf`` vertices of that hierarchy and
    whose score exceeds 100000, merges two ``GO`` nodes and an ``SCR``
    relationship carrying the score into the database at
    ``bolt://localhost:7687``.

    Edge keys are split on ``"="`` to obtain the two endpoint ids.

    Returns:
        None.  The number of merged edges is printed.
    """
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()
    go2geneids, _ = fetch_go_hierarcy()

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    root_term = 'GO:0005575'
    hierarchy_path = os.path.join(
        constants.BASE_PROFILE, "output", "go_hierarcy.txt")
    with open(hierarchy_path, "w+") as out:
        go_vertices, _ = extract_hier_all(
            gosubdag, out, root_term, go2geneids)

    def add_edge(tx, src, dst, score):
        """Merge both GO nodes and the scored relationship between them."""
        # The terms travel as query parameters so that quotes or other
        # special characters in an id cannot alter the Cypher statement.
        # The score is still rendered into the query text, exactly as
        # before, so it keeps being interpreted as a Cypher literal.
        query = ("MERGE (n1: GO{{term: $term1}}) "
                 "MERGE (n2: GO{{term: $term2}}) "
                 "MERGE (n1)-[r:SCR {{ score: {SCORE} }}]->(n2)").format(
                     SCORE=score)
        tx.run(query, term1=src, term2=dst)

    # NOTE(review): the connection URI and credentials are hard-coded here;
    # consider moving them to configuration.
    driver = GraphDatabase.driver(
        "bolt://localhost:7687", auth=("neo4j", "Hh123456"))
    try:
        count = 0
        with driver.session() as session:
            for cur_edges, score in go_edges.items():
                endpoints = cur_edges.split("=")
                # Original left-to-right order kept so short-circuiting
                # protects the later lookups.
                if (endpoints[0] in go_vertices
                        and endpoints[1] in go_vertices
                        and score > 100000
                        and go_vertices[endpoints[0]]['isleaf']
                        and go_vertices[endpoints[1]]['isleaf']):
                    count += 1
                    session.write_transaction(
                        add_edge, endpoints[0], endpoints[1], score)
        print("total edges: {}".format(count))
    finally:
        # Release the driver's connections even when a transaction fails.
        driver.close()
def build_hierarcy(go_folder, roots=None, ev_exclude=None):
    """Extract the GO hierarchy below each requested root term.

    Args:
        go_folder: forwarded unchanged to ``fetch_go_hierarcy``.
        roots: iterable of GO ids to use as hierarchy roots.  Defaults to
            ``['GO:0008150']`` (the original author also noted
            ``GO:0005575`` and ``GO:0003674`` as candidates).
        ev_exclude: forwarded to ``fetch_go_hierarcy``; defaults to a fresh
            empty set.

    Returns:
        A 4-tuple ``(dict_result, go2geneids, geneids2go, entrez2ensembl)``
        where ``dict_result`` maps each root id to
        ``{"vertices": ..., "edges": ...}`` as returned by
        ``extract_hier_all``, the two mappings come from
        ``fetch_go_hierarcy``, and the last item is the result of
        ``get_entrez2ensembl_dictionary()``.
    """
    # Fresh objects per call: mutable defaults in the signature would be
    # shared between calls (and with whatever the callees do to them).
    if roots is None:
        roots = ['GO:0008150']
    if ev_exclude is None:
        ev_exclude = set()

    go2geneids, geneids2go = fetch_go_hierarcy(go_folder, ev_exclude)

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    tic = timeit.default_timer()
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)
    toc = timeit.default_timer()

    # The timing covers only the DAG load above, so the message is the same
    # for every root; build it once.
    msg = "Elapsed HMS: {}\n\n".format(
        str(datetime.timedelta(seconds=(toc - tic))))

    dict_result = {}
    for cur_term in roots:
        vertices, edges = extract_hier_all(gosubdag, cur_term, go2geneids)
        sys.stdout.write(msg)
        dict_result[cur_term] = {"vertices": vertices, "edges": edges}

    return dict_result, go2geneids, geneids2go, get_entrez2ensembl_dictionary()
def get_highest_ic():
    """Return the contents of the highest-IC JSON file, creating it if needed.

    When ``HIGHEST_IC_FILE_PATH`` does not exist, the GO DAG is loaded from
    ``GO_DAG_FILE_PATH`` (with its progress output sent to the null device)
    and ``compute_highest_inc_parallel`` is run over all of its ids;
    presumably that call writes the file -- confirm against its definition.

    Returns:
        The object decoded by ``json.load`` from ``HIGHEST_IC_FILE_PATH``.
    """
    if not os.path.isfile(HIGHEST_IC_FILE_PATH):
        # Close the null-device handle once the DAG has been constructed
        # instead of leaking it.
        # NOTE(review): assumes GODag writes to `prt` only while loading.
        with open(os.devnull, 'w') as devnull:
            go_dag = GODag(GO_DAG_FILE_PATH, prt=devnull)
        compute_highest_inc_parallel(list(go_dag.keys()))

    # `with` closes the file even when the JSON is malformed.
    with open(HIGHEST_IC_FILE_PATH, 'r') as ic_file:
        return json.load(ic_file)
def test_all():
    """Load the project GO data, then print the mini-OBO hierarchy report.

    The full GO DAG and the NCBI gene2go associations are loaded first;
    their results are not used afterwards.  The report itself is produced
    from ``data/mini_obo.obo`` next to this file and written to stdout by
    ``write_hier_all``.
    """
    go_dir = constants.GO_DIR
    # Loaded but not consulted below; kept so the test still exercises
    # both loaders.
    GODag(os.path.join(go_dir, constants.GO_FILE_NAME))
    read_ncbi_gene2go(
        os.path.join(go_dir, constants.GO_ASSOCIATION_FILE_NAME),
        no_top=True)

    here = os.path.dirname(os.path.abspath(__file__))
    mini_dag = GODag(os.path.join(here, "data/mini_obo.obo"))
    mini_subdag = GoSubDag(mini_dag.keys(), mini_dag)
    write_hier_all(mini_subdag, sys.stdout)
def test_i148b_semsim_lin(do_plt=False):
    """Test for issue 148, Lin Similarity if a term has no annotations.

    Loads the ``fig2a`` OBO file and the ``fig2a_nonleaf0`` GAF file from
    ``tests/data/yangRWC`` under ``REPO``, builds term counts from the
    ``'CC'`` associations, and then computes Resnik and Lin similarity for
    every pair produced by ``combo_w_rplc(goids, 2)``.  Both tables are
    printed with ``_prt_values`` and the Lin table is validated with
    ``_chk_lin``.

    Args:
        do_plt: when true, ``_do_plt`` is called with the term counts and
            the DAG before the similarities are computed.
    """
    fin_gaf = os.path.join(REPO, 'tests/data/yangRWC/fig2a_nonleaf0.gaf')
    godag = GODag(os.path.join(REPO, "tests/data/yangRWC/fig2a.obo"))
    annoobj = GafReader(fin_gaf, godag=godag)
    associations = annoobj.get_id2gos('CC')
    tcntobj = TermCounts(godag, associations)

    if do_plt:
        _do_plt(tcntobj, godag)

    goids = list(godag.keys())

    # Calculate Resnik values; frozenset keys make (a, b) and (b, a) the
    # same entry.
    p2r = {
        frozenset([a, b]): resnik_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Resnik', goids, p2r)

    # Calculate Lin values
    p2l = {
        frozenset([a, b]): lin_sim(a, b, godag, tcntobj)
        for a, b in combo_w_rplc(goids, 2)
    }
    _prt_values('Lin', goids, p2l)
    _chk_lin(p2l)
def test_all():
    """Run numerous tests for various reports.

    Builds a ``GoSubDag`` over every id of ``data/mini_obo.obo`` (located
    next to this file), passes it to each ``write_hier_*`` report writer in
    turn with stdout as the stream, and finally prints how long loading the
    DAG and building the sub-DAG took.
    """
    here = os.path.dirname(os.path.abspath(__file__))
    obo_path = os.path.join(here, "data/mini_obo.obo")

    # Only the load/build step is timed, not the report writers.
    started = timeit.default_timer()
    godag = GODag(obo_path)
    gosubdag = GoSubDag(godag.keys(), godag)
    finished = timeit.default_timer()

    stream = sys.stdout
    report_writers = (
        write_hier_all,
        write_hier_norep,
        write_hier_lim,
        write_hier_mrk_lst,
        write_hier_mrk_dct,
        write_hier_up,
    )
    for write_report in report_writers:
        write_report(gosubdag, stream)

    elapsed = datetime.timedelta(seconds=(finished - started))
    sys.stdout.write("Elapsed HMS: {}\n\n".format(str(elapsed)))
def build_hierarcy(roots=None):
    """Extract the GO hierarchy below each requested root term.

    The file ``go_hierarcy.txt`` under ``constants.BASE_PROFILE/output`` is
    opened (truncating any previous content) and handed to
    ``extract_hier_all`` as its output stream for every root.

    Args:
        roots: iterable of GO ids to use as hierarchy roots.  Defaults to
            ``['GO:0005575']``.

    Returns:
        A 4-tuple ``(dict_result, go2geneids, geneids2go, entrez2ensembl)``
        where ``dict_result`` maps each root id to
        ``{"vertices": ..., "edges": ...}`` as returned by
        ``extract_hier_all``, the two mappings come from
        ``fetch_go_hierarcy``, and the last item is the result of
        ``get_entrez2ensembl_dictionary()``.
    """
    # A fresh list per call; a list literal in the signature would be one
    # object shared by every call.
    if roots is None:
        roots = ['GO:0005575']

    go2geneids, geneids2go = fetch_go_hierarcy()

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    tic = timeit.default_timer()
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)
    toc = timeit.default_timer()

    # The timing covers only the DAG load above, so the message is the same
    # for every root; build it once.
    msg = "Elapsed HMS: {}\n\n".format(
        str(datetime.timedelta(seconds=(toc - tic))))

    hierarchy_path = os.path.join(
        constants.BASE_PROFILE, "output", "go_hierarcy.txt")
    dict_result = {}
    # The context manager closes (and flushes) the hierarchy file, which
    # the previous code left open.
    with open(hierarchy_path, "w+") as out:
        for cur_term in roots:
            vertices, edges = extract_hier_all(
                gosubdag, out, cur_term, go2geneids)
            sys.stdout.write(msg)
            dict_result[cur_term] = {"vertices": vertices, "edges": edges}

    return dict_result, go2geneids, geneids2go, get_entrez2ensembl_dictionary()
def build_hierarcy():
    """Solve a PCST instance over leaf GO terms and save a plot of the result.

    Steps:
      1. Fetch the PPI edge scores, the GO dictionary, the prize dictionary
         and the GO-to-gene mapping.
      2. Extract the hierarchy below ``GO:0005575`` with
         ``extract_hier_all``, which here also returns ``layers``.
      3. Every ``isleaf`` vertex becomes a PCST vertex.  Its prize is
         ``(len(layers) - i) / len(layers)`` where ``i`` is the index of the
         first layer containing it, or 0 when no layer contains it.
      4. Every edge between two leaf vertices with score above 10000 becomes
         a PCST edge whose cost is
         ``len(ENSP of a) * len(ENSP of b) / score``.
      5. Costs are divided by ``max - min``; prizes are scaled by the 60th
         percentile of the resulting costs.
      6. ``pcst_fast`` is run (unrooted, one cluster, strong pruning) and
         the selected sub-graph is drawn with networkx and saved as
         ``PCST_<timestamp>.png`` under ``constants.OUTPUT_GLOBAL_DIR``.

    Returns:
        None.  Progress is printed and the plot is written to disk.

    Raises:
        ValueError: if no edge passes the filter, because there is then no
            PCST instance to solve.
    """
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()
    go_dict = get_go_dict()
    prize_dictionary = get_prize_dict()
    go2geneids, _ = fetch_go_hierarcy()

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    root_term = 'GO:0005575'
    hierarchy_path = os.path.join(
        constants.BASE_PROFILE, "output", "go_hierarcy.txt")
    # Close the hierarchy file deterministically; it used to be left open.
    with open(hierarchy_path, "w+") as out:
        go_vertices, _, _, layers = extract_hier_all(
            gosubdag, out, root_term, go2geneids, prize_dictionary)
    print(layers)

    num_layers = float(len(layers))

    def first_layer_index(term):
        """Return the index of the first layer containing `term`, or None."""
        for idx, layer in enumerate(layers):
            if term in layer:
                return idx
        return None

    # --- PCST vertices: one per leaf term -------------------------------
    vertices_grid = []         # GO id of each PCST vertex
    vertices_grid_values = []  # attribute dict of each PCST vertex
    vertices_prizes = []
    for term, attrs in go_vertices.items():
        if not attrs['isleaf']:
            continue
        vertices_grid.append(term)
        vertices_grid_values.append(attrs)
        layer_idx = first_layer_index(term)
        if layer_idx is None:
            vertices_prizes.append(0)
        else:
            vertices_prizes.append((len(layers) - layer_idx) / num_layers)
    print("total vartices: {}".format(len(vertices_grid)))

    # O(1) id -> position lookup instead of list.index() for every edge.
    vertex_index = {term: pos for pos, term in enumerate(vertices_grid)}

    # --- PCST edges -----------------------------------------------------
    edges_grid = []
    edges_costs = []
    for cur_edges, score in go_edges.items():
        endpoints = cur_edges.split("=")
        # Original left-to-right order kept so short-circuiting protects
        # the later lookups.
        if not (endpoints[0] in go_vertices
                and endpoints[1] in go_vertices
                and score > 10000
                and go_vertices[endpoints[0]]['isleaf']
                and go_vertices[endpoints[1]]['isleaf']):
            continue
        src, dst = endpoints[0], endpoints[1]
        cost = len(go_dict[src]["ENSP"]) * len(go_dict[dst]["ENSP"])
        # NOTE(review): the collapsed source does not show whether the
        # appends were nested under `if cost != 0:`.  Zero-cost edges (a
        # term without ENSP entries) are skipped here -- confirm.
        if cost == 0:
            continue
        edges_grid.append([vertex_index[src], vertex_index[dst]])
        edges_costs.append(cost / float(score))
    print("total edges: {}".format(len(edges_costs)))

    if not edges_costs:
        raise ValueError(
            "no PPI edge passed the filter; cannot build a PCST instance")

    edges_costs = np.array(edges_costs, dtype=np.float64)
    # NOTE(review): costs are scaled by the range but not shifted by the
    # minimum, and a zero range (all costs equal) yields inf/nan -- kept as
    # in the original.
    edges_costs = edges_costs / (np.max(edges_costs) - np.min(edges_costs))

    percentiles = [np.percentile(edges_costs, x * 10) for x in range(11)]
    print("edge precentiles: {}".format(percentiles))
    print(np.min(edges_costs))

    # Prizes are expressed relative to the 60th percentile of the edge costs.
    vertices_prizes = percentiles[6] * np.array(
        vertices_prizes, dtype=np.float64)
    edges_grid = np.array(edges_grid).astype(np.int64)

    root = -1
    num_clusters = 1
    pruning = 'strong'  # 'none'
    verbosity_level = 0
    vertices, edges = pcst_fast.pcst_fast(
        edges_grid, vertices_prizes, edges_costs, root, num_clusters,
        pruning, verbosity_level)

    # --- Draw the selected sub-graph -------------------------------------
    G = nx.Graph()
    c_values = {}
    labels = {}
    for cur_v in vertices:
        term = vertices_grid[cur_v]
        name = vertices_grid_values[cur_v]["name"]
        layer_idx = first_layer_index(term)
        # Terms found in no layer are placed one past the last layer.
        level = len(layers) if layer_idx is None else layer_idx
        G.add_node(term, name=name, level=level)
        c_values[term] = 1 - level / num_layers
        labels[term] = name

    c_list = [c_values[x] for x in G.nodes()]
    # nodes(data=True) replaces the `G.node[...]` accessor, which newer
    # networkx releases no longer provide.
    print([attrs for _, attrs in G.nodes(data=True)])

    for cur_e in edges:
        G.add_edge(vertices_grid[edges_grid[cur_e][0]],
                   vertices_grid[edges_grid[cur_e][1]])

    nx.draw_networkx(
        G,
        cmap=plt.get_cmap('jet'),
        node_color=c_list,
        labels=labels,
        font_size=8)
    plt.savefig(
        os.path.join(constants.OUTPUT_GLOBAL_DIR,
                     "PCST_{}.png".format(time.time())))
def build_hierarcy():
    """Solve a PCST instance over leaf GO terms and print the solution.

    Steps:
      1. Fetch the PPI edge scores, the GO dictionary and the GO-to-gene
         mapping.
      2. Extract the hierarchy below ``GO:0005575`` with
         ``extract_hier_all``.
      3. Every ``isleaf`` vertex becomes a PCST vertex with a uniform prize
         of 0.0005.
      4. Every edge between two leaf vertices with score above 10000 becomes
         a PCST edge whose cost is
         ``len(ENSP of a) * len(ENSP of b) / score``; costs are then divided
         by ``max - min``.
      5. ``pcst_fast`` is run (unrooted, one cluster, strong pruning) and
         the prizes, costs, selected vertices and selected edges are
         printed, both as GO ids and as term names.

    Returns:
        None.  Everything is reported on stdout.

    Raises:
        ValueError: if no edge passes the filter, because there is then no
            PCST instance to solve.
    """
    print("fetching ppi")
    go_edges = fetch_string_ppi_edges()
    go_dict = get_go_dict()
    go2geneids, _ = fetch_go_hierarcy()

    dag_fin = os.path.join(constants.GO_DIR, constants.GO_FILE_NAME)
    godag = GODag(dag_fin, optional_attrs=['relationship'])
    gosubdag = GoSubDag(godag.keys(), godag)

    root_term = 'GO:0005575'
    hierarchy_path = os.path.join(
        constants.BASE_PROFILE, "output", "go_hierarcy.txt")
    # Close the hierarchy file deterministically; it used to be left open.
    with open(hierarchy_path, "w+") as out:
        go_vertices, _ = extract_hier_all(
            gosubdag, out, root_term, go2geneids)

    # --- PCST vertices: one per leaf term -------------------------------
    vertices_grid = []         # GO id of each PCST vertex
    vertices_grid_values = []  # attribute dict of each PCST vertex
    for term, attrs in go_vertices.items():
        if attrs['isleaf']:
            vertices_grid.append(term)
            vertices_grid_values.append(attrs)
    # NOTE(review): in the collapsed source the counter increment may have
    # been commented out (always printing 0); the real count is reported.
    print("total vartices: {}".format(len(vertices_grid)))

    # O(1) id -> position lookup instead of list.index() for every edge.
    vertex_index = {term: pos for pos, term in enumerate(vertices_grid)}

    # --- PCST edges -----------------------------------------------------
    edges_grid = []
    edges_costs = []
    for cur_edges, score in go_edges.items():
        endpoints = cur_edges.split("=")
        # Original left-to-right order kept so short-circuiting protects
        # the later lookups.
        if not (endpoints[0] in go_vertices
                and endpoints[1] in go_vertices
                and score > 10000
                and go_vertices[endpoints[0]]['isleaf']
                and go_vertices[endpoints[1]]['isleaf']):
            continue
        src, dst = endpoints[0], endpoints[1]
        cost = len(go_dict[src]["ENSP"]) * len(go_dict[dst]["ENSP"])
        # NOTE(review): the collapsed source does not show whether the
        # appends were nested under `if cost != 0:`.  Zero-cost edges (a
        # term without ENSP entries) are skipped here -- confirm.
        if cost == 0:
            continue
        edges_grid.append([vertex_index[src], vertex_index[dst]])
        edges_costs.append(cost / float(score))
    print("total edges: {}".format(len(edges_costs)))

    if not edges_costs:
        raise ValueError(
            "no PPI edge passed the filter; cannot build a PCST instance")

    edges_costs = np.array(edges_costs, dtype=np.float64)
    # NOTE(review): costs are scaled by the range but not shifted by the
    # minimum, and a zero range (all costs equal) yields inf/nan -- kept as
    # in the original.
    edges_costs = edges_costs / (np.max(edges_costs) - np.min(edges_costs))

    percentiles = [np.percentile(edges_costs, x * 10) for x in range(11)]
    print("edge precentiles: {}".format(percentiles))
    print(np.min(edges_costs))

    # Uniform prize for every vertex; the per-vertex placeholder values the
    # old code computed first were always overwritten by this constant.
    vertices_prizes = np.array(
        [0.0005 for _ in vertices_grid]).astype(np.float64)
    edges_grid = np.array(edges_grid).astype(np.int64)

    root = -1
    num_clusters = 1
    pruning = 'strong'  # 'none'
    verbosity_level = 0
    vertices, edges = pcst_fast.pcst_fast(
        edges_grid, vertices_prizes, edges_costs, root, num_clusters,
        pruning, verbosity_level)

    print(vertices_prizes)
    print(edges_costs)

    print("vertices")
    print([vertices_grid[x] for x in vertices])
    print([vertices_grid_values[x]["name"] for x in vertices])

    print("edges")
    print([
        "{}={}".format(vertices_grid[edges_grid[x][0]],
                       vertices_grid[edges_grid[x][1]])
        for x in edges
    ])
    print([
        "{} = {}".format(vertices_grid_values[int(edges_grid[x][0])]["name"],
                         vertices_grid_values[int(edges_grid[x][1])]["name"])
        for x in edges
    ])