def hrg_clique_tree(G):
    """Build a rooted, binarized tree decomposition for ``G``.

    Graphs with fewer than 500 nodes are decomposed directly.  Larger graphs
    are not decomposed whole; instead every subgraph yielded by
    ``gs.rwr_sample(G, 2, 300)`` is decomposed in turn.  Each tree is produced
    by ``td.quickbb``, rooted at its first element with ``td.make_rooted``,
    binarized with ``phrg.binarize`` and then handed to ``td.new_visit``
    together with the *full* graph ``G`` and a scratch rule dictionary that is
    discarded when the function returns.

    Returns ``None`` when ``G`` is ``None``; otherwise the ``(root, children)``
    pair unpacked from the last tree that was processed.
    """
    if G is None:
        return

    # ------------------ ##
    # tree decomposition
    # ------------------ ##
    scratch_rules = {}
    if G.number_of_nodes() >= 500:
        graphs = gs.rwr_sample(G, 2, 300)
    else:
        graphs = [G]

    for graph in graphs:
        tree = td.quickbb(graph)
        tree = td.make_rooted(tree, list(tree)[0])
        tree = phrg.binarize(tree)
        # The binarized tree unpacks into exactly two parts.
        root, children = tree
        td.new_visit(tree, G, scratch_rules)

    return root, children
def tree_decomposition(g):
    """Rule extraction procedure: return the rooted tree decompositions of ``g``.

    Self-loops are removed from ``g`` in place, and the work then continues on
    the largest connected component.  Graphs with fewer than 500 nodes yield a
    single rooted (not binarized) tree.  For larger graphs, one rooted *and
    binarized* tree is produced per subgraph yielded by
    ``gs.rwr_sample(g, 2, 300)``.

    Returns a list of trees; the list is empty when ``g`` is ``None``.
    """
    if g is None:
        return []

    g.remove_edges_from(g.selfloop_edges())
    largest = max(nx.connected_component_subgraphs(g), key=len)
    g = nx.subgraph(g, largest)
    graph_checks(g)

    def _rooted(graph):
        # Decompose ``graph`` and root the result at its first element.
        tree = td.quickbb(graph)
        return td.make_rooted(tree, list(tree)[0])

    if g.number_of_nodes() < 500:
        # Binarization is deliberately skipped on this path (it was commented
        # out in the original code).
        return [_rooted(g)]

    return [binarize(_rooted(sample)) for sample in gs.rwr_sample(g, 2, 300)]
def probabilistic_hrg_deriving_prod_rules(G, K=1, n=None, gname=""):
    """Rule extraction procedure: dump sampled subgraphs of ``G`` to /tmp.

    Self-loops are removed from ``G`` in place and the largest connected
    component is kept.  When the node count (``n`` if given, otherwise the
    size of that component) is at least 500, every subgraph yielded by
    ``gs.rwr_sample(G, K, num_nodes)`` is written as a tab-separated edge list
    without edge data.  Smaller graphs produce no output.

    Parameters
    ----------
    G : networkx graph, or None (in which case nothing happens)
    K : second argument forwarded to ``gs.rwr_sample``
    n : optional override for the node count used both for the size test and
        as the third ``gs.rwr_sample`` argument
    gname : file-name prefix; when empty the files are named
        ``/tmp/sampled_subgraph_200_<j>.tsv``, otherwise ``/tmp/<gname><j>.tsv``

    Returns
    -------
    None
    """
    if G is None:
        return

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)

    num_nodes = G.number_of_nodes() if n is None else n

    graph_checks(G)

    if DEBUG:
        print("")
        print("--------------------")
        print("-Tree Decomposition-")
        print("--------------------")

    if num_nodes >= 500:
        for j, Gprime in enumerate(gs.rwr_sample(G, K, num_nodes)):
            # Compare by value: ``gname is ""`` relied on string interning.
            if gname == "":
                out_path = '/tmp/sampled_subgraph_200_{}.tsv'.format(j)
            else:
                out_path = '/tmp/{}{}.tsv'.format(gname, j)
            nx.write_edgelist(Gprime, out_path, delimiter="\t", data=False)
            # Report the path that was actually written, including the
            # default name used when no prefix is given.
            print("... files written: {}".format(out_path))
    return
def derive_production_rules(G):
    """Derive normalized production rules from a graph.

    Parameters
    ----------
    G : input graph.  Its self-loops are removed in place; the remaining work
        uses the largest connected component.

    Returns
    -------
    dict mapping each left-hand side to a dict of right-hand sides whose
    counts (filled in by ``td.new_visit``) have been divided by the per-LHS
    total, so the values for one left-hand side sum to 1.

    Progress banners and every left-hand side are printed unconditionally.
    """
    from PHRG import graph_checks, binarize

    rule_counts = {}

    G.remove_edges_from(G.selfloop_edges())
    largest = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, largest)
    num_nodes = G.number_of_nodes()

    graph_checks(G)

    print("")
    print("--------------------")
    print("-Tree Decomposition-")
    print("--------------------")

    # Large graphs are processed through sampled subgraphs, small ones whole.
    graphs = gs.rwr_sample(G, 2, 100) if num_nodes >= 500 else [G]
    for graph in graphs:
        tree = td.quickbb(graph)
        tree = td.make_rooted(tree, list(tree)[0])
        tree = binarize(tree)
        root, children = tree
        # Rules are always read off against the full graph G.
        td.new_visit(tree, G, rule_counts)

    print("")
    print("--------------------")
    print("- Production Rules -")
    print("--------------------")

    for lhs in rule_counts:
        print(lhs)
        total = float(sum(rule_counts[lhs].values()))
        for rhs in rule_counts[lhs]:
            # Normalization step: turn counts into probabilities.
            rule_counts[lhs][rhs] = float(rule_counts[lhs][rhs]) / total

    return rule_counts
def get_hrg_production_rules(fname):
    """Load an edge list and run ``td`` on sampled subgraphs of large graphs.

    The graph read by ``load_edgelist(fname)`` has its self-loops removed and
    is reduced to its largest connected component.  The node count is always
    logged through ``Info``.  Only when that count is at least 500 is
    ``td([sample])`` called for every subgraph yielded by
    ``gs.rwr_sample(G, 2, 300)``; smaller graphs are left untouched.

    Returns None.
    """
    import graph_sampler as gs

    G = load_edgelist(fname)
    G.remove_edges_from(G.selfloop_edges())
    largest = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, largest)

    node_count = G.number_of_nodes()
    Info(str(node_count))
    if node_count < 500:
        return

    Info('Grande')
    for sample in gs.rwr_sample(G, 2, 300):
        td([sample])
def nx_edges_to_nddgo_graph_sampling(graph):
    """Collect sampled subgraphs of ``graph`` into ``gprime_lst``.

    ``K = ceil(0.25 * |V| / 200)`` is passed to ``gs.rwr_sample`` together with
    a sample size of 200; every subgraph it yields is appended to
    ``gprime_lst``, which is returned at the end.

    NOTE(review): ``gprime_lst`` and ``gname`` are not defined inside this
    function -- presumably they are module-level names; confirm against the
    rest of the file.
    NOTE(review): the progress message still says "files written" although the
    ``nx.write_edgelist`` calls were commented out; the message text is kept
    unchanged here.
    """
    G = graph
    nbr_nodes = 200
    K = int(math.ceil(.25 * G.number_of_nodes() / nbr_nodes))

    for j, Gprime in enumerate(gs.rwr_sample(G, K, nbr_nodes)):
        # The former ``if gname is "": ... else: ...`` had two identical
        # branches and compared a string by identity; a single append is
        # equivalent and avoids the fragile ``is`` test.
        gprime_lst.append(Gprime)
        print("... files written: /tmp/{}{}.tsv".format(gname, j))
    return gprime_lst
def stochastic_hrg(G, num_graphs=1):
    """Learn production rules from ``G`` and grow ``num_graphs`` new graphs.

    A disconnected ``G`` is replaced by the first component returned by
    ``nx.connected_component_subgraphs``; self-loops are removed in place.
    Graphs with more than 500 nodes are parsed through the subgraphs yielded
    by ``gs.rwr_sample(G, 2, 300)``; ``pr.prod_rules`` is reset before each
    sample and the local rule set is rebound on every iteration, so the value
    returned for the last sample is the one used for growing.

    Returns ``(Gstar, Dstar)``: two parallel lists holding the first and second
    element of each ``sg.grow(prod_rules, num_nodes, 1)`` result.
    """
    print("++" * 4 + " " + str(num_graphs))

    # Graph must be connected
    if not nx.is_connected(G):
        print("Graph must be connected")
        G = list(nx.connected_component_subgraphs(G))[0]

    # Graph must be simple
    G.remove_edges_from(G.selfloop_edges())
    if G.number_of_selfloops() > 0:
        print("Graph must be not contain self-loops")
        exit()

    num_nodes = G.number_of_nodes()

    # Parsing a whole large graph is possible, but the approximate
    # decomposition method is still quite slow, so large graphs are sampled.
    # (An older note here spoke of 10 samples of size 500; the call below
    # passes 2 and 300.)
    if num_nodes <= 500:
        tree = td.quickbb(G)
        prod_rules = pr.learn_production_rules(G, tree)
    else:
        for sample in gs.rwr_sample(G, 2, 300):
            pr.prod_rules = {}
            tree = td.quickbb(sample)
            prod_rules = pr.learn_production_rules(sample, tree)

    print(" -- stochastic hrg -> Rule Induction Complete")

    Gstar, Dstar = [], []
    for _ in range(num_graphs):
        grown_graph, grown_derivation = sg.grow(prod_rules, num_nodes, 1)
        Gstar.append(grown_graph)
        Dstar.append(grown_derivation)

    return Gstar, Dstar
def learn_grammars_production_rules(input_graph):
    """Fill ``prod_rules`` with rule counts read off ``input_graph``.

    The node count that selects between whole-graph and sampled processing is
    taken *before* self-loops are removed and the largest connected component
    is extracted.  With 500 or more nodes, each subgraph yielded by
    ``gs.rwr_sample(G, 2, 100)`` is decomposed; otherwise the component itself
    is.  Every tree is rooted, binarized and passed to ``tw.new_visit``.

    NOTE(review): ``prod_rules`` and ``dbg`` are not defined in this function
    -- presumably module-level names; confirm against the rest of the file.

    Returns ``prod_rules`` (counts are not normalized here).
    """
    G = input_graph
    num_nodes = G.number_of_nodes()

    G.remove_edges_from(G.selfloop_edges())
    largest = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, largest)

    graph_checks(G)

    if dbg:
        print("")
        print("--------------------")
        print("-Tree Decomposition-")
        print("--------------------")

    graphs = gs.rwr_sample(G, 2, 100) if num_nodes >= 500 else [G]
    for graph in graphs:
        tree = tw.quickbb(graph)
        tree = tw.make_rooted(tree, list(tree)[0])
        tree = binarize(tree)
        root, children = tree
        # Rules are always read off against the full component G.
        tw.new_visit(tree, G, prod_rules)

    return prod_rules
def nx_edges_to_nddgo_graph_sampling(graph, n, m, peo_h):
    """Write sampled subgraphs of ``graph`` as DIMACS edge files.

    One file ``datasets/<graph.name>_<peo_h>_<j>.dimacs`` is written per
    subgraph yielded by ``gs.rwr_sample(G, K, 256)``, where
    ``K = ceil(0.25 * |V| / 256)``.  Each file holds a ``c`` comment line with
    the graph name, a ``p edge`` line carrying the caller-supplied ``n`` and
    ``m``, and one ``e`` line per edge with integer endpoints, ordered by the
    first endpoint.

    Parameters
    ----------
    graph : networkx graph with a ``name`` attribute and integer-convertible
        node labels
    n, m : values written verbatim into the ``p edge`` header; when both are
        ``None`` nothing is written
    peo_h : tag embedded in the output file names

    Returns
    -------
    The common file-name prefix ``datasets/<graph.name>_<peo_h>``, or ``None``
    when both ``n`` and ``m`` are ``None``.
    """
    G = graph
    if n is None and m is None:
        return

    nbr_nodes = 256
    basefname = 'datasets/{}_{}'.format(G.name, peo_h)
    K = int(math.ceil(.25 * G.number_of_nodes() / nbr_nodes))
    print("-- {} {} --".format(nbr_nodes, K))

    for j, Gprime in enumerate(gs.rwr_sample(G, K, nbr_nodes)):
        # Sort plain tuples instead of round-tripping through a DataFrame:
        # writing from inside DataFrame.apply is a side effect that older
        # pandas versions may run twice for the first row, and sorting an
        # empty frame by column 0 fails outright.
        edges = sorted(((int(e[0]), int(e[1])) for e in Gprime.edges()),
                       key=lambda edge: edge[0])

        ofname = basefname + "_{}.dimacs".format(j)
        with open(ofname, 'w') as f:
            f.write('c {}\n'.format(G.name))
            f.write('p edge\t{}\t{}\n'.format(n, m))
            f.writelines("e\t{}\t{}\n".format(u, v) for u, v in edges)

        if os.path.exists(ofname):
            print('Wrote: {}'.format(ofname))

    return basefname
def probabilistic_hrg_learning(G, num_samples=1, n=None, prod_rules=None):
    """Extract a list of probabilistic production rules from ``G``.

    Self-loops are removed from ``G`` in place and the largest connected
    component is kept.  When the node count (``n`` if given, otherwise the
    size of that component) is at least 500, every subgraph yielded by
    ``gs.rwr_sample(G, 2, 300)`` is decomposed; otherwise the component itself
    is.  Each tree is rooted, binarized and passed to ``td.new_visit``, which
    fills ``prod_rules``.  The counts are then normalized per left-hand side.

    Parameters
    ----------
    G : networkx graph
    num_samples : unused; kept for interface compatibility
    n : optional override for the node count used in the size test
    prod_rules : optional dict to accumulate into (it is modified in place,
        including the normalization); a fresh dict is used when omitted

    Returns
    -------
    list of ``(rule_id, lhs, rhs, probability)`` tuples, where ``rule_id`` is
    ``"r<i>.<j>"``, ``lhs`` is the first parenthesis-free token of the rule
    key and ``rhs`` is the list of parenthesis-free tokens of the right-hand
    side key.
    """
    # The default used to be passed on as None and crashed on first use.
    if prod_rules is None:
        prod_rules = {}

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)

    # Call the method: the bare attribute made the size test below compare a
    # bound method with an integer.
    num_nodes = G.number_of_nodes() if n is None else n

    graph_checks(G)

    graphs = gs.rwr_sample(G, 2, 300) if num_nodes >= 500 else [G]
    for graph in graphs:
        T = td.quickbb(graph)
        T = td.make_rooted(T, list(T)[0])
        T = binarize(T)
        # Rules are always read off against the full component G.
        td.new_visit(T, G, prod_rules)

    # Normalization step: turn counts into probabilities.
    for lhs in prod_rules:
        total = float(sum(prod_rules[lhs].values()))
        for rhs_key in prod_rules[lhs]:
            prod_rules[lhs][rhs_key] = float(prod_rules[lhs][rhs_key]) / total

    rules = []
    for rule_no, lhs in enumerate(prod_rules):
        lhs_symbol = "%s" % re.findall("[^()]+", lhs)[0]
        for sub_no, rhs_key in enumerate(prod_rules[lhs]):
            rhs = re.findall("[^()]+", rhs_key)
            rules.append(("r%d.%d" % (rule_no, sub_no), lhs_symbol, rhs,
                          prod_rules[lhs][rhs_key]))

    return rules
def probabilistic_hrg(G, n=None):
    """Rule extraction procedure.

    Self-loops are removed from ``G`` in place and the largest connected
    component is kept.  When the node count (``n`` if given, otherwise the
    size of that component) is at least 500, every subgraph yielded by
    ``gs.rwr_sample(G, 2, 300)`` is decomposed; otherwise the component itself
    is.  Rule counts collected by ``td.new_visit`` are normalized per
    left-hand side and flattened into a list.

    Returns ``None`` when ``G`` is ``None``; otherwise a list of
    ``(rule_id, lhs, rhs, probability)`` tuples with ``rule_id`` of the form
    ``"r<i>.<j>"``.  Diagnostic output is printed only when ``DEBUG`` is set.
    """
    if G is None:
        return

    G.remove_edges_from(G.selfloop_edges())
    largest = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, largest)

    num_nodes = G.number_of_nodes() if n is None else n

    graph_checks(G)

    if DEBUG:
        print("")
        print("--------------------")
        print("-Tree Decomposition-")
        print("--------------------")

    rule_counts = {}
    graphs = gs.rwr_sample(G, 2, 300) if num_nodes >= 500 else [G]
    for graph in graphs:
        tree = td.quickbb(graph)
        tree = td.make_rooted(tree, list(tree)[0])
        tree = binarize(tree)
        root, children = tree
        # Rules are always read off against the full component G.
        td.new_visit(tree, G, rule_counts)

    if DEBUG:
        print("")
        print("--------------------")
        print("- Production Rules -")
        print("--------------------")

    for lhs in rule_counts:
        if DEBUG:
            print(lhs)
        total = float(sum(rule_counts[lhs].values()))
        for rhs_key in rule_counts[lhs]:
            # Normalization step: turn counts into probabilities.
            rule_counts[lhs][rhs_key] = float(rule_counts[lhs][rhs_key]) / total
            if DEBUG:
                print(" ".join(['\t -> ', str(rhs_key),
                                str(rule_counts[lhs][rhs_key])]))

    rules = []
    for rule_no, lhs in enumerate(rule_counts):
        for sub_no, rhs_key in enumerate(rule_counts[lhs]):
            rhs = re.findall("[^()]+", rhs_key)
            rule = ("r%d.%d" % (rule_no, sub_no),
                    "%s" % re.findall("[^()]+", lhs)[0],
                    rhs,
                    rule_counts[lhs][rhs_key])
            rules.append(rule)
            if DEBUG:
                print(rule)

    return rules
print G.number_of_nodes() print G.number_of_edges() G.remove_edges_from(G.selfloop_edges()) giant_nodes = max(nx.connected_component_subgraphs(G), key=len) G = nx.subgraph(G, giant_nodes) graph_checks(G) print print "--------------------" print "-Tree Decomposition-" print "--------------------" if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 2, 100): T = tw.quickbb(Gprime) root = list(T)[0] T = tw.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T tw.new_visit(T, G, prod_rules) else: T = tw.quickbb(G) root = list(T)[0] T = tw.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T tw.new_visit(T, G, prod_rules)
def probabilistic_hrg(G, num_samples=1, n=None):
    '''
    Learn production rules from G and grow synthetic graphs from them.

    Self-loops are removed from G in place and the largest connected
    component is kept.  When the node count (n if given, otherwise the size
    of that component) is at least 500, every subgraph yielded by
    gs.rwr_sample(G, 2, 300) is decomposed; otherwise the component itself
    is.  Rule counts collected by td.new_visit are normalized per left-hand
    side, loaded into a pcfg.Grammar with start symbol 'S', and sampled.

    Args:
    ------------
    G: input graph (nx obj)
    num_samples: (int) in the 'grow' process, this is number of
                 synthetic graphs to generate
    n: (int) num_nodes; number of nodes in the resulting graphs

    Returns: List of synthetic graphs (H^stars)

    NOTE(review): TD is not defined in this function -- presumably a
    module-level name; confirm against the rest of the file.
    '''
    if DEBUG:
        print(G.number_of_nodes())
        print(G.number_of_edges())

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)

    num_nodes = G.number_of_nodes() if n is None else n

    if DEBUG:
        print(G.number_of_nodes())
        print(G.number_of_edges())

    graph_checks(G)

    if DEBUG:
        print("")
        print("--------------------")
        print("-Tree Decomposition-")
        print("--------------------")

    prod_rules = {}
    graphs = gs.rwr_sample(G, 2, 300) if num_nodes >= 500 else [G]
    for graph in graphs:
        T = td.quickbb(graph)
        T = td.make_rooted(T, list(T)[0])
        T = binarize(T)
        # Rules are always read off against the full component G.
        td.new_visit(T, G, prod_rules, TD)

    if DEBUG:
        print("")
        print("--------------------")
        print("- Production Rules -")
        print("--------------------")

    for lhs in prod_rules:
        if DEBUG:
            print(lhs)
        total = float(sum(prod_rules[lhs].values()))
        for rhs_key in prod_rules[lhs]:
            # Normalization step: turn counts into probabilities.
            prod_rules[lhs][rhs_key] = float(prod_rules[lhs][rhs_key]) / total
            if DEBUG:
                print(" ".join(['\t -> ', str(rhs_key),
                                str(prod_rules[lhs][rhs_key])]))

    rules = []
    for rule_no, lhs in enumerate(prod_rules):
        for sub_no, rhs_key in enumerate(prod_rules[lhs]):
            rhs = re.findall("[^()]+", rhs_key)
            rule = ("r%d.%d" % (rule_no, sub_no),
                    "%s" % re.findall("[^()]+", lhs)[0],
                    rhs,
                    prod_rules[lhs][rhs_key])
            rules.append(rule)
            if DEBUG:
                print(rule)

    # A leftover exit() used to terminate the interpreter at this point,
    # which made the graph generation below (and the documented return
    # value) unreachable.
    g = pcfg.Grammar('S')
    for (rule_id, lhs, rhs, prob) in rules:
        g.add_rule(pcfg.Rule(rule_id, lhs, rhs, prob))

    if DEBUG:
        print("Starting max size")
    g.set_max_size(num_nodes)
    if DEBUG:
        print("Done with max size")

    Hstars = []
    for _ in range(num_samples):
        rule_list = g.sample(num_nodes)
        # grow() returns a sequence; its first element is the graph.
        Hstars.append(grow(rule_list, g)[0])

    return Hstars
G.remove_edges_from(G.selfloop_edges()) if G.number_of_selfloops() > 0: print "Graph must be not contain self-loops" exit() num_nodes = G.number_of_nodes() print "Number of Nodes:\t" + str(num_nodes) num_edges = G.number_of_edges() print "Number of Edges:\t" + str(num_edges) # To parse a large graph we use 10 samples of size 500 each. It is # possible to parse the whole graph, but the approximate # decomposition method we use is still quite slow. if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 10, 500): pr.prod_rules = {} T = td.quickbb(Gprime) prod_rules = pr.learn_production_rules(Gprime, T) else: T = td.quickbb(G) prod_rules = pr.learn_production_rules(G, T) print "Rule Induction Complete" Gstar = [] Dstar = [] Gstargl = [] Ggl = [] for run in range(0, 20): if num_nodes < 100:
def get_hrg_production_rules(edgelist_data_frame, graph_name, tw=False, n_subg=2, n_nodes=300, nstats=False): from growing import derive_prules_from t_start = time.time() df = edgelist_data_frame if df.shape[1] == 4: G = nx.from_pandas_dataframe(df, 'src', 'trg', edge_attr=True) # whole graph elif df.shape[1] == 3: G = nx.from_pandas_dataframe(df, 'src', 'trg', ['ts']) # whole graph else: G = nx.from_pandas_dataframe(df, 'src', 'trg') G.name = graph_name print "==> read in graph took: {} seconds".format(time.time() - t_start) G.remove_edges_from(G.selfloop_edges()) giant_nodes = max(nx.connected_component_subgraphs(G), key=len) G = nx.subgraph(G, giant_nodes) num_nodes = G.number_of_nodes() phrg.graph_checks(G) if DBG: print if DBG: print "--------------------" if not DBG: print "-Tree Decomposition-" if DBG: print "--------------------" prod_rules = {} K = n_subg n = n_nodes if num_nodes >= 500: print 'Grande' t_start = time.time() for Gprime in gs.rwr_sample(G, K, n): T = td.quickbb(Gprime) root = list(T)[0] T = td.make_rooted(T, root) T = phrg.binarize(T) root = list(T)[0] root, children = T # td.new_visit(T, G, prod_rules, TD) td.new_visit(T, G, prod_rules) Process(target=td.new_visit, args=( T, G, prod_rules, )).start() else: T = td.quickbb(G) root = list(T)[0] T = td.make_rooted(T, root) T = phrg.binarize(T) root = list(T)[0] root, children = T # td.new_visit(T, G, prod_rules, TD) td.new_visit(T, G, prod_rules) print_treewidth(T) exit() if DBG: print if DBG: print "--------------------" if DBG: print "- Production Rules -" if DBG: print "--------------------" for k in prod_rules.iterkeys(): if DBG: print k s = 0 for d in prod_rules[k]: s += prod_rules[k][d] for d in prod_rules[k]: prod_rules[k][d] = float(prod_rules[k][d]) / float( s) # normailization step to create probs not counts. 
if DBG: print '\t -> ', d, prod_rules[k][d] rules = [] id = 0 for k, v in prod_rules.iteritems(): sid = 0 for x in prod_rules[k]: rhs = re.findall("[^()]+", x) rules.append( ("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x])) if DBG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x]) sid += 1 id += 1 df = pd.DataFrame(rules) '''print "++++++++++" df.to_csv('ProdRules/{}_prs.tsv'.format(G.name), header=False, index=False, sep="\t") if os.path.exists('ProdRules/{}_prs.tsv'.format(G.name)): print 'Saved', 'ProdRules/{}_prs.tsv'.format(G.name) else: print "Trouble saving" print "-----------" print [type(x) for x in rules[0]] ''' ''' Graph Generation of Synthetic Graphs Grow graphs usigng the union of rules from sampled sugbgraphs to predict the target order of the original graph ''' hStars = grow_exact_size_hrg_graphs_from_prod_rules( rules, graph_name, G.number_of_nodes(), 10) print '... hStart graphs:', len(hStars) d = {graph_name + "_hstars": hStars} with open(r"Results/{}_hstars.pickle".format(graph_name), "wb") as output_file: cPickle.dump(d, output_file) if os.path.exists(r"Results/{}_hstars.pickle".format(graph_name)): print "File saved" '''if nstats:
def probabilistic_hrg(G, num_samples=1):
    """Learn production rules from ``G`` and grow ``num_samples`` graphs.

    Self-loops are removed from ``G`` in place and the largest connected
    component is kept.  With 500 or more nodes, every subgraph yielded by
    ``gs.rwr_sample(G, 2, 100)`` is decomposed; otherwise the component itself
    is.  Rule counts collected by ``td.new_visit`` are normalized per
    left-hand side, loaded into a ``pcfg.Grammar`` with start symbol ``'S'``
    and sampled ``num_samples`` times; each sample is expanded with ``grow``.

    NOTE(review): ``prod_rules`` is not defined in this function -- presumably
    a module-level dict; if so it is normalized in place here, so confirm how
    repeated calls are meant to behave.

    Returns the list of generated graphs.  Progress banners are printed
    unconditionally.
    """
    G.remove_edges_from(G.selfloop_edges())
    largest = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, largest)

    num_nodes = G.number_of_nodes()

    graph_checks(G)

    print("")
    print("--------------------")
    print("-Tree Decomposition-")
    print("--------------------")

    graphs = gs.rwr_sample(G, 2, 100) if num_nodes >= 500 else [G]
    for graph in graphs:
        tree = td.quickbb(graph)
        tree = td.make_rooted(tree, list(tree)[0])
        tree = binarize(tree)
        root, children = tree
        # Rules are always read off against the full component G.
        td.new_visit(tree, G, prod_rules)

    print("")
    print("--------------------")
    print("- Production Rules -")
    print("--------------------")

    for lhs in prod_rules:
        total = float(sum(prod_rules[lhs].values()))
        for rhs_key in prod_rules[lhs]:
            # Normalization step: turn counts into probabilities.
            prod_rules[lhs][rhs_key] = float(prod_rules[lhs][rhs_key]) / total

    rules = []
    for rule_no, lhs in enumerate(prod_rules):
        for sub_no, rhs_key in enumerate(prod_rules[lhs]):
            rhs = re.findall("[^()]+", rhs_key)
            rules.append(("r%d.%d" % (rule_no, sub_no),
                          "%s" % re.findall("[^()]+", lhs)[0],
                          rhs,
                          prod_rules[lhs][rhs_key]))

    grammar = pcfg.Grammar('S')
    for (rule_id, lhs, rhs, prob) in rules:
        grammar.add_rule(pcfg.Rule(rule_id, lhs, rhs, prob))

    print("Starting max size")
    grammar.set_max_size(num_nodes)
    print("Done with max size")

    Hstars = []
    for _ in range(num_samples):
        rule_list = grammar.sample(num_nodes)
        # grow() returns a sequence; its first element is the graph.
        Hstars.append(grow(rule_list, grammar)[0])

    return Hstars
print "Graph must be not contain self-loops"; exit() num_nodes = G.number_of_nodes() print "Number of Nodes:\t" + str(num_nodes) num_edges = G.number_of_edges() print "Number of Edges:\t" + str(num_edges) # To parse a large graph we use 10 samples of size 500 each. It is # possible to parse the whole graph, but the approximate # decomposition method we use is still quite slow. if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 10, 500): pr.prod_rules = {} T = td.quickbb(Gprime) prod_rules = pr.learn_production_rules(Gprime, T) else: T = td.quickbb(G) prod_rules = pr.learn_production_rules(G, T) print "Rule Induction Complete" Gstar = [] Dstar = [] Gstargl = [] Ggl = [] for run in range(0, 20): if num_nodes < 100: