def test_sampling_graph1():
    """Smoke-test path sampling on a graph that was known to be problematic.

    In this graph the operation on (1, 3) would prune out (3, 3), the node
    causing the cycle, except that (3, 3) is retained because a non-cyclic
    path through it still exists via (1, 1). In subsequent steps, pruning of
    downstream nodes (i.e., (2, 4)) eliminates any acyclic paths through
    (1, 3). Consequently, sampling the resulting graph could step into
    (1, 3) and find no successors permitted by the tags. The solution was
    to repeat the sampling process iteratively until convergence.

    The test makes no assertion on the sampled paths; it only requires that
    drawing 100 samples completes without raising.
    """
    edge_list = [
        (0, 1), (0, 3), (0, 4), (0, 5),
        (1, 4),
        (2, 4), (2, 5),
        (3, 0), (3, 2), (3, 4), (3, 5),
        (4, 2), (4, 3), (4, 5),
    ]
    digraph = nx.DiGraph()
    digraph.add_edges_from(edge_list)

    source, target, length = 0, 5, 5
    fwd_sets, back_sets = pg.get_reachable_sets(digraph, source, target,
                                                max_depth=length)
    pre_cfpg = pg.PreCFPG.from_graph(digraph, source, target, length,
                                     fwd_sets, back_sets)
    # The return value is irrelevant here; we only care that sampling works.
    pre_cfpg.sample_paths(100)
def scaling_random_graphs(num_samples, min_size, max_size, edge_prob=0.5):
    """Time path computations on random directed graphs of growing size.

    For every node count from ``min_size`` to ``max_size`` (inclusive),
    ``num_samples`` Erdos-Renyi directed graphs are generated with edge
    probability ``edge_prob``. Node 0 is the source and the highest-numbered
    node is the target. Three stages are timed with ``time.time()``:

    1. enumerating all simple paths with networkx and building a
       ``PathsTree`` with path probabilities;
    2. building one ``PathsGraph`` per length, combining them and counting
       paths;
    3. converting every ``PathsGraph`` to a ``CFPG``. This time is measured
       from the start of stage 2, so it includes the stage 2 time.

    Returns
    -------
    tuple of numpy.ndarray
        ``(times_nx_paths, times_pg, times_cfpg)``, each of shape
        ``(max_size - min_size + 1, num_samples)``, holding elapsed seconds.
    """
    data_shape = (max_size - min_size + 1, num_samples)
    times_nx_paths = np.empty(data_shape)
    times_pg = np.empty(data_shape)
    times_cfpg = np.empty(data_shape)

    for row, num_nodes in enumerate(range(min_size, max_size + 1)):
        print(f'Number of nodes in network: {num_nodes}')
        for col in range(num_samples):
            print(f'Sample {col}')
            rg = nx.erdos_renyi_graph(num_nodes, edge_prob, directed=True)
            # First and last nodes serve as the endpoints
            source, target = 0, num_nodes - 1

            # Stage 1: explicit enumeration plus path-probability tree
            nx_start = time.time()
            simple_paths = list(
                map(tuple, nx.all_simple_paths(rg, source, target)))
            PathsTree(simple_paths).path_probabilities()
            times_nx_paths[row, col] = time.time() - nx_start

            # Stage 2: paths graphs for every length, then the combined graph
            pg_start = time.time()
            f_level, b_level = get_reachable_sets(rg, source, target,
                                                  num_nodes)
            pg_list = [
                PathsGraph.from_graph(rg, source, target, length,
                                      f_level, b_level)
                for length in range(1, num_nodes)
            ]
            # NOTE (carried over from the original author): "no count_paths
            # method" -- presumably about CombinedPathsGraph; verify.
            total_paths = CombinedPathsGraph(pg_list).count_paths()
            print(f'Total paths (with cycles): {total_paths}')
            times_pg[row, col] = time.time() - pg_start

            # Stage 3: cycle-free paths graphs; the clock still runs from
            # pg_start, so this figure is cumulative with stage 2.
            for paths_graph in pg_list:
                CFPG.from_pg(paths_graph)
            times_cfpg[row, col] = time.time() - pg_start

    return times_nx_paths, times_pg, times_cfpg
def run_pg_vs_nx(graph, source, target, depth, num_samples):
    """Build paths graphs up to ``depth`` and sample paths from their union.

    Parameters
    ----------
    graph
        Graph handed to ``get_reachable_sets`` and ``PathsGraph.from_graph``.
    source, target
        Endpoints of the paths of interest.
    depth : int
        Maximum path length; one paths graph is built for each length in
        ``1..depth``.
    num_samples : int
        Number of paths to collect from the combined paths graph.

    Returns
    -------
    dict
        Keys ``'pg_list'``, ``'pg_paths'``, ``'nx_paths'``,
        ``'nx_paths_sampled'``, ``'pg_time'`` and ``'nx_time'``. The networkx
        comparison is currently disabled, so the ``nx_*`` lists are empty and
        ``'nx_time'`` is 0.
    """
    # PG sampling
    start = time.time()
    f_level, b_level = get_reachable_sets(graph, source, target, depth)
    pg_list = [
        PathsGraph.from_graph(graph, source, target, i, f_level, b_level)
        for i in range(1, depth + 1)
    ]
    combined_pg = CombinedPathsGraph(pg_list)
    print("Sampling from PG")
    cf_paths = []
    while len(cf_paths) < num_samples:
        print(f'{len(cf_paths)} / {num_samples}')
        # Sample in chunks of at most 100, never asking for more than needed
        remaining = num_samples - len(cf_paths)
        cf_path_chunk = combined_pg.sample_paths(min(100, remaining))
        if not cf_path_chunk:
            # Nothing can be sampled (e.g. no paths exist); without this
            # guard the loop would never terminate.
            break
        # Previously the chunk was discarded, so cf_paths never grew and the
        # loop spun forever for any num_samples > 0.
        cf_paths.extend(cf_path_chunk)
    end = time.time()
    print("Done generating PGs")
    pg_elapsed = end - start

    # Networkx enumeration is disabled; keep the result keys populated with
    # neutral values so callers see a stable schema.
    nx_paths = []
    nx_sampled_paths = []
    nx_elapsed = 0
    return {
        'pg_list': pg_list,
        'pg_paths': cf_paths,
        'nx_paths': nx_paths,
        'nx_paths_sampled': nx_sampled_paths,
        'pg_time': pg_elapsed,
        'nx_time': nx_elapsed,
    }
def test_from_graph_with_levels_bad_depth():
    """Build a CFPG from reach sets that were computed only to depth 2.

    The reach sets come from ``get_reachable_sets`` with ``max_depth=2``,
    while the CFPG is requested for the module-level ``length`` (presumably
    larger than 2 -- confirm against the module fixtures). The test passes
    when the resulting CFPG graph is empty; no exception is expected.
    """
    reach_sets = pg.get_reachable_sets(g_uns, source, target, max_depth=2)
    fwd_sets, back_sets = reach_sets
    shallow_cfpg = pg.CFPG.from_graph(
        g_uns, source, target, length,
        fwd_reachset=fwd_sets, back_reachset=back_sets,
    )
    assert not shallow_cfpg.graph
def test_from_pg():
    """Convert a PathsGraph to a CFPG and check its paths and node count.

    Uses the module-level ``g_uns``, ``source``, ``target`` and ``length``.
    Exactly two paths are expected, the CFPG graph must hold 8 nodes, and
    the node named 'D' must appear twice (i.e. it has been split).
    """
    fwd_sets, back_sets = pg.get_reachable_sets(g_uns, source, target,
                                                max_depth=length)
    paths_graph = pg.PathsGraph.from_graph(g_uns, source, target, length,
                                           fwd_sets, back_sets)
    cfpg = pg.CFPG.from_pg(paths_graph)

    paths = cfpg.enumerate_paths()
    assert len(paths) == 2
    for expected_path in (('A', 'B', 'D', 'C', 'E'),
                          ('A', 'C', 'D', 'B', 'E')):
        assert expected_path in paths

    assert len(cfpg.graph) == 8
    # The D node should be split into two nodes
    d_nodes = [node for node in cfpg.graph.nodes() if node[1] == 'D']
    assert len(d_nodes) == 2
def test_from_graph_with_levels_bad_depth():
    """Build a PreCFPG for length 2 from reach sets computed to depth 1.

    The requested path length (2) exceeds the depth of the supplied reach
    sets (1). The test passes when the resulting PreCFPG graph is empty; no
    exception is expected.
    """
    edge_pairs = ((0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1))
    g4_uns = nx.DiGraph()
    g4_uns.add_edges_from(edge_pairs)

    source, target, length = 0, 2, 2
    max_depth = 1
    reach_sets = pg.get_reachable_sets(g4_uns, source, target,
                                       max_depth=max_depth)
    fwd_sets, back_sets = reach_sets
    pre_cfpg = pg.PreCFPG.from_graph(
        g4_uns, source, target, length,
        fwd_reachset=fwd_sets, back_reachset=back_sets,
    )
    assert not pre_cfpg.graph
def test_prune():
    """Check that ``pcf.prune`` removes a node and everything it strands.

    A length-4 paths graph from 'S' to 'T' is built and node ``(2, 'S')`` is
    pruned. The test verifies that

    * the ``nodes_to_prune`` list passed in is left unmodified,
    * the edges of the input paths graph are left unmodified, and
    * the pruned graph contains exactly the chain S -> B -> C -> D -> T.
    """
    g = nx.DiGraph()
    g.add_edges_from((('S', 'A'), ('S', 'B'), ('A', 'S'), ('B', 'C'),
                      ('C', 'D'), ('D', 'T'), ('B', 'T')))
    length = 4
    (f_level, b_level) = pg.get_reachable_sets(g, 'S', 'T', max_depth=length)
    pg_raw = pg.PathsGraph.from_graph(g, 'S', 'T', length, f_level, b_level)
    # Take a real snapshot of the edges. In networkx >= 2.0, edges() returns
    # a live view of the graph, so keeping the view itself would make the
    # "unchanged" assertion below compare the graph against itself.
    pg_raw_edges = set(pg_raw.graph.edges())
    nodes_to_prune = [(2, 'S')]
    # Prune the graph
    pg_pruned = pcf.prune(pg_raw.graph, nodes_to_prune, (0, 'S'),
                          (length, 'T'))
    # Make sure we didn't change the original graphs or node lists
    assert nodes_to_prune == [(2, 'S')]
    assert set(pg_raw.graph.edges()) == pg_raw_edges
    # The correctly pruned structure
    assert set(pg_pruned.edges()) == {
        ((0, 'S'), (1, 'B')),
        ((1, 'B'), (2, 'C')),
        ((2, 'C'), (3, 'D')),
        ((3, 'D'), (4, 'T')),
    }
def test_from_pg():
    """Create a PreCFPG from a PathsGraph and check its edges and tags.

    The source graph is the complete directed graph on nodes 0, 1, 2; paths
    of length 2 from 0 to 2 are requested. The expected PreCFPG is the
    single chain (0, 0) -> (1, 1) -> (2, 2), with each node tagged by the
    nodes of that chain up to and including itself.
    """
    edge_pairs = ((0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1))
    g4_uns = nx.DiGraph()
    g4_uns.add_edges_from(edge_pairs)

    source, target, length = 0, 2, 2
    fwd_sets, back_sets = pg.get_reachable_sets(g4_uns, source, target,
                                                max_depth=length)
    paths_graph = pg.PathsGraph.from_graph(g4_uns, source, target, length,
                                           fwd_sets, back_sets)
    pre_cfpg = pg.PreCFPG.from_pg(paths_graph)

    expected_edges = {((0, 0), (1, 1)), ((1, 1), (2, 2))}
    expected_tags = {
        (0, 0): [(0, 0)],
        (1, 1): [(0, 0), (1, 1)],
        (2, 2): [(0, 0), (1, 1), (2, 2)],
    }
    assert isinstance(pre_cfpg, pg.PreCFPG)
    assert pre_cfpg.graph
    assert set(pre_cfpg.graph.edges()) == expected_edges
    assert pre_cfpg.tags == expected_tags
def test_from_graph_with_levels():
    """Create a PreCFPG from precomputed reach sets deeper than the length.

    Reach sets are computed to depth 5 on the complete directed graph over
    nodes 0, 1, 2, and a PreCFPG for paths of length 2 from 0 to 2 is built
    from them. The expected result is the single chain
    (0, 0) -> (1, 1) -> (2, 2), with each node tagged by the nodes of that
    chain up to and including itself.
    """
    edge_pairs = ((0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1))
    g4_uns = nx.DiGraph()
    g4_uns.add_edges_from(edge_pairs)

    source, target, length = 0, 2, 2
    max_depth = 5
    reach_sets = pg.get_reachable_sets(g4_uns, source, target,
                                       max_depth=max_depth)
    fwd_sets, back_sets = reach_sets
    pre_cfpg = pg.PreCFPG.from_graph(
        g4_uns, source, target, length,
        fwd_reachset=fwd_sets, back_reachset=back_sets,
    )

    expected_edges = {((0, 0), (1, 1)), ((1, 1), (2, 2))}
    expected_tags = {
        (0, 0): [(0, 0)],
        (1, 1): [(0, 0), (1, 1)],
        (2, 2): [(0, 0), (1, 1), (2, 2)],
    }
    assert isinstance(pre_cfpg, pg.PreCFPG)
    assert pre_cfpg.graph
    assert set(pre_cfpg.graph.edges()) == expected_edges
    assert pre_cfpg.tags == expected_tags
# Load a pickled graph from the local cache directory.
# NOTE(review): pickle.load can execute arbitrary code from the file; this is
# only safe if the cache file comes from a trusted source -- confirm.
print("Loading network")
with open('_cache/nx_dir_graph_db_dump_20190417.pkl', 'rb') as f:
    g = pickle.load(f)
print("Done loading network")

# Endpoints and search parameters for the paths-graph construction
source = 'NCKAP1'
target = 'TEAD1'
max_depth = 5
num_samples = 20000

# Reach sets are computed once, up to max_depth, on the unsigned graph and
# shared by every per-length paths graph below.
print("Getting reachable sets")
fwd_reach, back_reach = get_reachable_sets(g, source, target, max_depth, signed=False)

print("Building PG")
pg_list = []
# One paths graph per path length 1..max_depth
for cur_length in range(1, max_depth + 1):
    print("Building paths graph for length %d" % cur_length)
    # NOTE(review): within the lines visible here, pg is never appended to
    # pg_list -- the loop body may continue beyond this view; verify.
    pg = PathsGraph.from_graph(g, source, target, cur_length, fwd_reach, back_reach, signed=False, target_polarity=0)
def run_pg_cfpg(rg, source, target):
    """Time a sequential hypothesis test on PG-sampled vs CFPG-sampled paths.

    Paths graphs are built for every length from 1 to ``len(rg) - 1`` and
    combined. Paths are then drawn in batches of 10, first from the combined
    paths graph (via ``sample_cf_paths``) and then from the combined CFPG
    (via ``sample_paths``). Each path is scored with
    ``exists_property(p, 5)`` and the accumulated scores are passed to a
    ``HypothesisTester`` until it returns a non-``None`` verdict or a batch
    comes back empty (verdict 0).

    Returns
    -------
    tuple
        ``(pg_elapsed, cfpg_elapsed)`` in seconds. Both are measured from
        the same start time, so the CFPG figure includes the PG stage.
    """
    def _sequential_test(draw_batch):
        # Keep drawing batches until the tester decides or sampling dries up.
        tester = HypothesisTester(0.5, 0.1, 0.1, 0.05)
        verdict = None
        outcomes = []
        drawn = 0
        batch = 10
        while verdict is None:
            new_paths = draw_batch(batch)
            if not new_paths:
                verdict = 0
                break
            outcomes += [exists_property(p, 5) for p in new_paths]
            drawn += batch
            verdict = tester.test(outcomes)
        return verdict, drawn

    num_nodes = len(rg)

    # Paths graphs for every length, then the combined graph
    pg_start = time.time()
    f_level, b_level = get_reachable_sets(rg, source, target, num_nodes)
    pg_list = [
        PathsGraph.from_graph(rg, source, target, length, f_level, b_level)
        for length in range(1, num_nodes)
    ]
    combined_pg = CombinedPathsGraph(pg_list)
    tf, nsamples = _sequential_test(combined_pg.sample_cf_paths)
    print(f'PG: {tf} based on {nsamples} samples')
    pg_elapsed = time.time() - pg_start
    print(f'PG: {pg_elapsed:.2f}s')

    # Now the cycle-free paths graphs
    cfpg_list = [CFPG.from_pg(paths_graph) for paths_graph in pg_list]
    ccfpg = CombinedCFPG(cfpg_list)
    print('Sampling CFPG')
    tf, nsamples = _sequential_test(ccfpg.sample_paths)
    print(f'CFPG: {tf} based on {nsamples} samples')
    cfpg_elapsed = time.time() - pg_start
    print(f'CFPG: {cfpg_elapsed:.2f}s')
    return pg_elapsed, cfpg_elapsed
def _sample_paths(self, input_rule_set, obs_name, target_polarity, max_paths=1, max_path_length=5):
    """Sample paths from the input rules to an observable via a combined CFPG.

    A signed digraph is derived from the influence map, a dummy
    'SOURCE_NODE' is connected to every rule in ``input_rule_set``, and one
    CFPG per path length (1..``max_path_length``) is built and combined.
    Edge weights are set from the model's monomer initial conditions before
    ``max_paths`` paths are sampled.

    Parameters
    ----------
    input_rule_set : iterable
        Nodes that receive an edge (with ``sign=0``) from the dummy source.
    obs_name
        Target node passed to the paths-graph construction.
    target_polarity : int
        Positive values map to paths-graph polarity 0, anything else to 1.
    max_paths : int
        Number of paths to sample; must not be 0.
    max_path_length : int
        Largest path length for which a CFPG is built.

    Returns
    -------
    PathResult
        With ``path_metrics`` set to None and ``paths`` set to the sampled
        paths (dummy source node stripped) or an empty list.

    Raises
    ------
    ValueError
        If ``max_paths`` is 0.
    ImportError
        If the module-level ``has_pg`` flag is false.
    """
    if max_paths == 0:
        raise ValueError("max_paths cannot be 0 for path sampling.")
    if not has_pg:
        raise ImportError("Paths Graph is not imported")

    # Convert path polarity representation from 0/1 to 1/-1
    # NOTE(review): the expression below maps a positive polarity to 0 and
    # everything else to 1, which does not obviously match the comment
    # above -- confirm the intended direction of the conversion.
    def convert_polarities(path_list):
        return [
            tuple((n[0], 0 if n[1] > 0 else 1) for n in path)
            for path in path_list
        ]

    pg_polarity = 0 if target_polarity > 0 else 1
    nx_graph = self._im_to_signed_digraph(self.get_im())
    # Add edges from dummy node to input rules
    source_node = 'SOURCE_NODE'
    for rule in input_rule_set:
        nx_graph.add_edge(source_node, rule, sign=0)
    # -------------------------------------------------
    # Create combined paths_graph
    f_level, b_level = pg.get_reachable_sets(nx_graph, source_node, obs_name, max_path_length, signed=True)
    pg_list = []
    # One cycle-free paths graph per path length
    for path_length in range(1, max_path_length + 1):
        cfpg = pg.CFPG.from_graph(nx_graph, source_node, obs_name, path_length, f_level, b_level, signed=True, target_polarity=pg_polarity)
        pg_list.append(cfpg)
    combined_pg = pg.CombinedCFPG(pg_list)
    # Make sure the combined paths graph is not empty
    if not combined_pg.graph:
        pr = PathResult(False, 'NO_PATHS_FOUND', max_paths, max_path_length)
        pr.path_metrics = None
        pr.paths = []
        return pr
    # Get a dict of rule objects
    rule_obj_dict = {}
    for ann in self.model.annotations:
        if ann.predicate == 'rule_has_object':
            rule_obj_dict[ann.subject] = ann.object
    # Get monomer initial conditions
    ic_dict = {}
    for mon in self.model.monomers:
        # FIXME: A hack that depends on the _0 convention
        ic_name = '%s_0' % mon.name
        # TODO: Wrap this in try/except?
        # (a missing '<monomer>_0' parameter raises here as written)
        ic_param = self.model.parameters[ic_name]
        ic_value = ic_param.value
        ic_dict[mon.name] = ic_value
    # Set weights in PG based on model initial conditions
    for cur_node in combined_pg.graph.nodes():
        edge_weights = {}
        rule_obj_list = []
        edge_weights_by_gene = {}
        for u, v in combined_pg.graph.out_edges(cur_node):
            # presumably v is (level, (rule_name, polarity)) -- TODO confirm
            v_rule = v[1][0]
            # Get the object of the rule (a monomer name)
            rule_obj = rule_obj_dict.get(v_rule)
            if rule_obj:
                # Add to list so we can count instances by gene
                rule_obj_list.append(rule_obj)
                # Get the abundance of rule object from the initial
                # conditions
                # TODO: Wrap in try/except?
                ic_value = ic_dict[rule_obj]
            else:
                # Rules without an annotated object get a neutral weight
                ic_value = 1.0
            edge_weights[(u, v)] = ic_value
            # Keyed by rule object, so several edges sharing an object
            # (including the None key for unannotated rules) count once
            # in the sum below.
            edge_weights_by_gene[rule_obj] = ic_value
        # Get frequency of different rule objects
        rule_obj_ctr = Counter(rule_obj_list)
        # Normalize results by weight sum and gene frequency at this level
        # NOTE(review): if every initial-condition value at this node is 0,
        # edge_weight_sum is 0 and the division below raises -- confirm
        # whether that can occur.
        edge_weight_sum = sum(edge_weights_by_gene.values())
        edge_weights_norm = {}
        for e, v in edge_weights.items():
            v_rule = e[1][1][0]
            rule_obj = rule_obj_dict.get(v_rule)
            if rule_obj:
                rule_obj_count = rule_obj_ctr[rule_obj]
            else:
                rule_obj_count = 1
            edge_weights_norm[e] = ((v / float(edge_weight_sum)) / float(rule_obj_count))
        # Add edge weights to paths graph
        nx.set_edge_attributes(combined_pg.graph, name='weight', values=edge_weights_norm)
    # Sample from the combined CFPG
    paths = combined_pg.sample_paths(max_paths)
    # -------------------------------------------------
    if paths:
        pr = PathResult(True, 'PATHS_FOUND', max_paths, max_path_length)
        pr.path_metrics = None
        # Convert path polarity representation from 0/1 to 1/-1
        pr.paths = convert_polarities(paths)
        # Strip off the SOURCE_NODE prefix
        pr.paths = [p[1:] for p in pr.paths]
    else:
        # NOTE(review): this assert makes the NO_PATHS_FOUND result below
        # unreachable in normal runs (it only executes when asserts are
        # disabled, e.g. under -O) -- confirm which behavior is intended.
        assert False
        pr = PathResult(False, 'NO_PATHS_FOUND', max_paths, max_path_length)
        pr.path_metrics = None
        pr.paths = []
    return pr
print(edge_count)
# Build a directed graph from the signed edge list prepared earlier
pb_signed = nx.DiGraph()
pb_signed.add_edges_from(pb_sign_edges)
# Connect a dummy 'root' node to every source node; each generated edge
# tuple carries the attribute dict {'sign': 0}.
src_edges = list(itertools.product(['root'], src_nodes, [{'sign': 0}]))
# graph is an alias of pb_signed, so the root edges are added to the same
# graph object.
graph = pb_signed
graph.add_edges_from(src_edges)
source = 'root'
target = chek2_node
depth = 6
num_samples = 1000
# Reach sets are computed once on the signed graph and reused per length
f_level, b_level = get_reachable_sets(graph, source, target, depth, signed=True)
pg_list = []
# One signed paths graph per path length 1..depth, all with target polarity 1
for i in range(1, depth + 1):
    pg = PathsGraph.from_graph(graph, source, target, i, f_level, b_level, signed=True, target_polarity=1)
    pg_list.append(pg)
combined_pg = CombinedPathsGraph(pg_list)
#if ag_ns == 'HGNC': # ag_id = hgnc_client.get_hgnc_id(ag_id) source_list.append((ag_ns, ag_id)) # Add a dummy source graph_file = '../input/july_2018_pa_HGNC_FPLX_typed_directional_pairs.tsv' graph = load_stmt_graph(graph_file) dummy_edges = [('SOURCE', src[1]) for src in source_list] dummy_edges += [(tgt[1], 'TARGET') for tgt in target_list] graph.add_edges_from(dummy_edges) max_depth = 8 pg_list = [] lengths = [] stmt_counts = [] f_level, b_level = get_reachable_sets(graph, 'SOURCE', 'TARGET', max_depth) for length in range(3, max_depth + 1): pg = PathsGraph.from_graph(graph, 'SOURCE', 'TARGET', length, fwd_reachset=f_level, back_reachset=b_level) stmt_hashes = get_stmt_hashes_from_pg(graph, pg) print("%d stmts for paths of length %d" % (len(stmt_hashes), length - 2)) pg_list.append(pg) lengths.append(length - 2) stmt_counts.append(len(stmt_hashes)) plt.ion()
def test_initialize():
    """Exercise ``pcf._initialize_pre_cfpg`` on four small graphs.

    Cases covered, in order:

    1. ``g1_uns``: no cycles through source or target; the initialized graph
       must equal the raw paths graph and every node is tagged with the
       source only.
    2. ``g2_uns``: a cycle through the source and no acyclic paths; both the
       graph and the tags must be empty.
    3. ``g3_uns``: a cycle through the source plus one acyclic path; only
       the chain A -> B -> C -> D survives.
    4. A fully connected 3-node graph taken from a randomly generated
       network in which no paths were found; the initialized graph and tags
       must be non-empty.
    """
    def _initialize(graph, src, tgt, path_len):
        # Raw paths graph plus its initialized pre-CFPG graph and tags
        f_level, b_level = pg.get_reachable_sets(graph, src, tgt,
                                                 max_depth=path_len)
        raw = pg.PathsGraph.from_graph(graph, src, tgt, path_len,
                                       f_level, b_level)
        init_graph, init_tags = pcf._initialize_pre_cfpg(raw)
        return raw, init_graph, init_tags

    source, target, length = 'A', 'D', 3
    source_only_tags = {
        (0, 'A'): [(0, 'A')],
        (1, 'B'): [(0, 'A')],
        (2, 'C'): [(0, 'A')],
        (3, 'D'): [(0, 'A')],
    }

    # Simple graph with no cycles involving the source or target. Because no
    # nodes are pruned, the initialized "cycle free" paths graph is the same
    # as the paths graph we started with.
    pg_raw, pg_0, tags = _initialize(g1_uns, source, target, length)
    assert pg_0 == pg_raw.graph
    assert tags == source_only_tags

    # Cycle passing through the source node, A, and no acyclic paths
    pg_raw, pg_0, tags = _initialize(g2_uns, source, target, length)
    assert not pg_0
    assert not tags

    # Cycle passing through the source node, A, with one acyclic path
    pg_raw, pg_0, tags = _initialize(g3_uns, source, target, length)
    assert set(pg_0.edges()) == {
        ((0, 'A'), (1, 'B')),
        ((1, 'B'), (2, 'C')),
        ((2, 'C'), (3, 'D')),
    }
    assert tags == source_only_tags

    # This case stems from a randomly-generated network where no paths were
    # found -- it guarantees that the problem is NOT that pg_0 is empty.
    g4_uns = nx.DiGraph()
    g4_uns.add_edges_from(((0, 1), (1, 0), (0, 2), (2, 0), (1, 2), (2, 1)))
    source, target, length = 0, 2, 2
    pg_raw, pg_0, tags = _initialize(g4_uns, source, target, length)
    assert pg_0
    assert tags
graph = nx.DiGraph() graph.add_edges_from(edges) draw(graph, join(output_dir, 'toy_%s_graph.pdf' % direction)) if __name__ == '__main__': output_dir = sys.argv[1] # Draw G draw(g, join(output_dir, 'toy_g.pdf')) depth = 4 source = 'S' target = 'T' f_level, b_level = get_reachable_sets(g, source, target, depth) draw_reachset(g, f_level, 'forward', depth, output_dir) draw_reachset(g, b_level, 'backward', depth, output_dir) print("f_level", f_level) print("b_level", b_level) pg = PathsGraph.from_graph(g, source, target, depth) draw(pg.graph, join(output_dir, 'toy_pg_%d.pdf' % depth)) # Combined paths graph pg_list = [] for i in range(1, 4+1): pg_list.append(PathsGraph.from_graph(g, source, target, i)) cpg = CombinedPathsGraph(pg_list) draw(cpg.graph, join(output_dir, 'toy_combined_pg.pdf'))
def test_on_random_graphs():
    """Compare CFPG paths with networkx simple paths on a random graph.

    Random graphs are loaded from the pickle file ``random_graph_pkl``; the
    loop currently runs over ``range(1)``, i.e. only the first stored graph
    is checked. For each path length from 5 to 10 the cycle-free paths
    graph is built and validated against ``nx.all_simple_paths``:

    * the path count must match the number of simple paths of that length;
    * CF1: every source-to-target path in the CFPG is cycle free;
    * CF2: the set of CFPG paths equals the set of simple paths of that
      length in the original graph;
    * CF3: the correspondence is 1-1, i.e. the CFPG contains no redundant
      representation of a path.
    """
    with open(random_graph_pkl, 'rb') as f:
        rg_dict = pickle.load(f)

    min_depth, max_depth = 5, 10
    for i in range(1):
        edges, source, target = rg_dict[i]
        G_i = nx.DiGraph()
        G_i.add_edges_from(edges)
        print("graph# %d, %d nodes, %d edges" %
              (i, len(G_i.nodes()), len(G_i.edges())))
        f_reach, b_reach = pg.get_reachable_sets(
            G_i, source, target, max_depth=max_depth, signed=False)

        # Try different path lengths
        for length in range(min_depth, max_depth + 1):
            print("Checking paths of length %d" % length)
            # Reference set: simple paths in the original graph that have
            # exactly this length (length + 1 nodes).
            node_count = length + 1
            P_correct = [
                tuple(p)
                for p in nx.all_simple_paths(G_i, source, target, node_count)
                if len(p) == node_count
            ]
            # Build the cycle-free paths graph for this length
            G_cf = pg.CFPG.from_graph(G_i, source, target, length,
                                      f_reach, b_reach)
            # Check the path count
            assert len(P_correct) == G_cf.count_paths()
            # Paths as node tuples, and projected down to node names
            P_cf_pruned = G_cf.enumerate_paths(names_only=False)
            P_cf_pruned_names = G_cf.enumerate_paths(names_only=True)
            print("# of paths: %d" % len(P_cf_pruned_names))

            # CF1: no path may visit the same node twice
            for p in P_cf_pruned_names:
                if len(set(p)) != len(p):
                    print("cycle!")
                    print(p)
                    assert False

            # CF2: the named paths must be exactly the reference paths
            if set(P_cf_pruned_names) != set(P_correct):
                print("Paths do not match reference set from networkx")
                print("graph, length", (i, length))
                assert False

            # CF3: no two CFPG paths may project to the same named path
            if len(set(P_cf_pruned_names)) != len(P_cf_pruned):
                print("redundant representation!")
                print("graph, length", (i, length))
                assert False
target = random.choice(genes) print("depth", depth, "rep", rep_ix + 1, "source", source, "target", target) index = 0 start = time.time() for p in nx.all_simple_paths(graph, source, target, cutoff=depth): if index % 10000 == 0: print(index) index += 1 end = time.time() nx_elapsed = end - start results[0, depth_ix, rep_ix] = nx_elapsed print("done") start = time.time() f_level, b_level = get_reachable_sets(graph, source, target, MAX_DEPTH) total_paths = 0 for i in range(1, depth + 1): print(i) cfpg = CFPG.from_graph(graph, source, target, i, f_level, b_level) path_count = cfpg.count_paths() print(path_count, "paths") total_paths += path_count print("total paths", total_paths) print("nx paths", index) end = time.time() pg_elapsed = end - start results[1, depth_ix, rep_ix] = pg_elapsed print("NX time", nx_elapsed) print("CFPG time", pg_elapsed)