def hrg_clique_tree(G):
    """
    Tree-decompose G (or RWR samples of it when large), collecting HRG
    production rules as a side effect, and return the last rooted,
    binarized clique tree as a (root, children) pair.

    Returns None when G is None.
    """
    if G is None:
        return

    prod_rules = {}

    def decompose(graph):
        # quickbb -> rooted -> binarized tree; new_visit records the
        # production rules into prod_rules as a side effect.
        tree = td.quickbb(graph)
        tree = td.make_rooted(tree, list(tree)[0])
        tree = phrg.binarize(tree)
        td.new_visit(tree, G, prod_rules)
        return tree

    # Large graphs are decomposed via random-walk-with-restart samples;
    # only the last sample's tree is kept for the return value.
    if G.number_of_nodes() >= 500:
        for sampled in gs.rwr_sample(G, 2, 300):
            T = decompose(sampled)
    else:
        T = decompose(G)

    root, children = T
    return root, children
def tree_decomposition(g):
    """ Rule extraction procedure """
    # Returns a list of rooted tree decompositions: one per RWR sample for
    # large graphs, a single (non-binarized) tree otherwise.
    if g is None:
        return []

    # restrict to the simple graph's giant component
    g.remove_edges_from(g.selfloop_edges())
    biggest = max(nx.connected_component_subgraphs(g), key=len)
    g = nx.subgraph(g, biggest)
    graph_checks(g)

    decompositions = []
    is_large = g.number_of_nodes() >= 500
    pieces = gs.rwr_sample(g, 2, 300) if is_large else [g]
    for piece in pieces:
        tree = td.quickbb(piece)
        tree = td.make_rooted(tree, list(tree)[0])
        if is_large:
            # the small-graph path historically skipped binarize
            tree = binarize(tree)
        decompositions.append(tree)
    return decompositions
def derive_production_rules(G): """ Parameters ---------- G : input graph """ from PHRG import graph_checks, binarize prod_rules = {} G.remove_edges_from(G.selfloop_edges()) giant_nodes = max(nx.connected_component_subgraphs(G), key=len) G = nx.subgraph(G, giant_nodes) num_nodes = G.number_of_nodes() graph_checks(G) print print "--------------------" print "-Tree Decomposition-" print "--------------------" if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 2, 100): T = td.quickbb(Gprime) root = list(T)[0] T = td.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules) else: T = td.quickbb(G) root = list(T)[0] T = td.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules) print print "--------------------" print "- Production Rules -" print "--------------------" for k in prod_rules.iterkeys(): print k s = 0 for d in prod_rules[k]: s += prod_rules[k][d] for d in prod_rules[k]: prod_rules[k][d] = float(prod_rules[k][d]) / float( s) # normailization step to create probs not counts. #print '\t -> ', d, prod_rules[k][d] return prod_rules
def stochastic_hrg(G, num_graphs=1): print "++" * 4, num_graphs # Graph much be connected if not nx.is_connected(G): print "Graph must be connected" G = list(nx.connected_component_subgraphs(G))[0] # Graph must be simple G.remove_edges_from(G.selfloop_edges()) if G.number_of_selfloops() > 0: print "Graph must be not contain self-loops" exit() num_nodes = G.number_of_nodes() #print "Number of Nodes:\t" + str(num_nodes) num_edges = G.number_of_edges() #print "Number of Edges:\t" + str(num_edges) # To parse a large graph we use 10 samples of size 500 each. It is # possible to parse the whole graph, but the approximate # decomposition method we use is still quite slow. if num_nodes > 500: for Gprime in gs.rwr_sample(G, 2, 300): pr.prod_rules = {} T = td.quickbb(Gprime) prod_rules = pr.learn_production_rules(Gprime, T) else: T = td.quickbb(G) prod_rules = pr.learn_production_rules(G, T) print " -- stochastic hrg -> Rule Induction Complete" Gstar = [] Dstar = [] for run in range(0, num_graphs): nG, nD = sg.grow(prod_rules, num_nodes, 1) Gstar.append(nG) Dstar.append(nD) return Gstar, Dstar
def get_clique_tree(g):
    """
    Build a rooted, binarized clique tree (tree decomposition) of g's
    giant component and hand it to unfold_2wide_tuple.

    Parameters
    ----------
    g : networkx graph

    Returns
    -------
    None
    """
    # work on the simple graph restricted to its giant component
    g.remove_edges_from(g.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(g), key=len)
    g = nx.subgraph(g, giant_nodes)
    prod_rules = {}
    # BUG FIX: the original called td.quickbb(G) with an undefined global
    # `G`; the decomposition must run on the local parameter `g`.
    T = td.quickbb(g)
    root = list(T)[0]
    T = td.make_rooted(T, root)
    T = phrg.binarize(T)
    unfold_2wide_tuple(T)
    return
def probabilistic_hrg(G, num_samples=1): graphletG = [] #print G.number_of_nodes() #print G.number_of_edges() G.remove_edges_from(G.selfloop_edges()) giant_nodes = max(nx.connected_component_subgraphs(G), key=len) G = nx.subgraph(G, giant_nodes) num_nodes = G.number_of_nodes() # print G.number_of_nodes() # print G.number_of_edges() graph_checks(G) print print "--------------------" print "-Tree Decomposition-" print "--------------------" if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 2, 100): T = td.quickbb(Gprime) root = list(T)[0] T = td.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules) else: T = td.quickbb(G) root = list(T)[0] T = td.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules) print print "--------------------" print "- Production Rules -" print "--------------------" for k in prod_rules.iterkeys(): #print k s = 0 for d in prod_rules[k]: s += prod_rules[k][d] for d in prod_rules[k]: prod_rules[k][d] = float(prod_rules[k][d]) / float(s) # normailization step to create probs not counts. #print '\t -> ', d, prod_rules[k][d] rules = [] id = 0 for k, v in prod_rules.iteritems(): sid = 0 for x in prod_rules[k]: rhs = re.findall("[^()]+", x) rules.append(("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x])) #print ("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x]) sid += 1 id += 1 g = pcfg.Grammar('S') for (id, lhs, rhs, prob) in rules: g.add_rule(pcfg.Rule(id, lhs, rhs, prob)) print "Starting max size" g.set_max_size(num_nodes) print "Done with max size" Hstars = [] for i in range(0, num_samples): rule_list = g.sample(num_nodes) # print rule_list hstar = grow(rule_list, g)[0] # print "H* nodes: " + str(hstar.number_of_nodes()) # print "H* edges: " + str(hstar.number_of_edges()) Hstars.append(hstar) return (Hstars)
def get_hrg_production_rules(edgelist_data_frame, graph_name, tw=False, n_subg=2, n_nodes=300, nstats=False): from growing import derive_prules_from t_start = time.time() df = edgelist_data_frame if df.shape[1] == 4: G = nx.from_pandas_dataframe(df, 'src', 'trg', edge_attr=True) # whole graph elif df.shape[1] == 3: G = nx.from_pandas_dataframe(df, 'src', 'trg', ['ts']) # whole graph else: G = nx.from_pandas_dataframe(df, 'src', 'trg') G.name = graph_name print "==> read in graph took: {} seconds".format(time.time() - t_start) G.remove_edges_from(G.selfloop_edges()) giant_nodes = max(nx.connected_component_subgraphs(G), key=len) G = nx.subgraph(G, giant_nodes) num_nodes = G.number_of_nodes() phrg.graph_checks(G) if DBG: print if DBG: print "--------------------" if not DBG: print "-Tree Decomposition-" if DBG: print "--------------------" prod_rules = {} K = n_subg n = n_nodes if num_nodes >= 500: print 'Grande' t_start = time.time() for Gprime in gs.rwr_sample(G, K, n): T = td.quickbb(Gprime) root = list(T)[0] T = td.make_rooted(T, root) T = phrg.binarize(T) root = list(T)[0] root, children = T # td.new_visit(T, G, prod_rules, TD) td.new_visit(T, G, prod_rules) Process(target=td.new_visit, args=( T, G, prod_rules, )).start() else: T = td.quickbb(G) root = list(T)[0] T = td.make_rooted(T, root) T = phrg.binarize(T) root = list(T)[0] root, children = T # td.new_visit(T, G, prod_rules, TD) td.new_visit(T, G, prod_rules) print_treewidth(T) exit() if DBG: print if DBG: print "--------------------" if DBG: print "- Production Rules -" if DBG: print "--------------------" for k in prod_rules.iterkeys(): if DBG: print k s = 0 for d in prod_rules[k]: s += prod_rules[k][d] for d in prod_rules[k]: prod_rules[k][d] = float(prod_rules[k][d]) / float( s) # normailization step to create probs not counts. 
if DBG: print '\t -> ', d, prod_rules[k][d] rules = [] id = 0 for k, v in prod_rules.iteritems(): sid = 0 for x in prod_rules[k]: rhs = re.findall("[^()]+", x) rules.append( ("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x])) if DBG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x]) sid += 1 id += 1 df = pd.DataFrame(rules) '''print "++++++++++" df.to_csv('ProdRules/{}_prs.tsv'.format(G.name), header=False, index=False, sep="\t") if os.path.exists('ProdRules/{}_prs.tsv'.format(G.name)): print 'Saved', 'ProdRules/{}_prs.tsv'.format(G.name) else: print "Trouble saving" print "-----------" print [type(x) for x in rules[0]] ''' ''' Graph Generation of Synthetic Graphs Grow graphs usigng the union of rules from sampled sugbgraphs to predict the target order of the original graph ''' hStars = grow_exact_size_hrg_graphs_from_prod_rules( rules, graph_name, G.number_of_nodes(), 10) print '... hStart graphs:', len(hStars) d = {graph_name + "_hstars": hStars} with open(r"Results/{}_hstars.pickle".format(graph_name), "wb") as output_file: cPickle.dump(d, output_file) if os.path.exists(r"Results/{}_hstars.pickle".format(graph_name)): print "File saved" '''if nstats:
G.remove_edges_from(G.selfloop_edges()) giant_nodes = max(nx.connected_component_subgraphs(G), key=len) G = nx.subgraph(G, giant_nodes) num_nodes = G.number_of_nodes() graph_checks(G) print print "--------------------" print "-Tree Decomposition-" print "--------------------" if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 2, 100): T = td.quickbb(Gprime) root = list(T)[0] T = td.make_rooted(T, root) T = hrg.binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules) else: T = td.quickbb(G) root = list(T)[0] T = td.make_rooted(T, root) T = hrg.binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules)
def probabilistic_hrg_learning(G, num_samples=1, n=None, prod_rules=None):
    """
    Learn HRG production rules from G and return them as a list of
    (rule_id, lhs, rhs, probability) tuples.

    Parameters
    ----------
    G : networkx graph
    num_samples : unused here; kept for interface compatibility
    n : optional override for the node count that picks the sampling path
    prod_rules : optional dict to accumulate rules into; a fresh dict is
        used when None

    Returns
    -------
    list of (id, lhs, rhs, prob) tuples
    """
    graphletG = []

    # simple graph restricted to its giant component
    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)

    if n is None:
        # BUG FIX: the original assigned the bound method (missing call
        # parentheses), so `num_nodes >= 500` compared a method object to
        # an int instead of the actual node count.
        num_nodes = G.number_of_nodes()
    else:
        num_nodes = n

    graph_checks(G)

    # ROBUSTNESS: td.new_visit and the normalization below require a dict;
    # default to a fresh one instead of failing on None.
    if prod_rules is None:
        prod_rules = {}

    if num_nodes >= 500:
        for Gprime in gs.rwr_sample(G, 2, 300):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)

    for k in prod_rules.iterkeys():
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)  # normalization step to create probs not counts.

    # flatten into (rule_id, lhs, rhs, probability) tuples
    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x]))
            sid += 1
        id += 1
    return rules
def probabilistic_hrg(G, n=None): ''' Rule extraction procedure ''' if G is None: return G.remove_edges_from(G.selfloop_edges()) giant_nodes = max(nx.connected_component_subgraphs(G), key=len) G = nx.subgraph(G, giant_nodes) if n is None: num_nodes = G.number_of_nodes() else: num_nodes = n graph_checks(G) if DEBUG: print if DEBUG: print "--------------------" if DEBUG: print "-Tree Decomposition-" if DEBUG: print "--------------------" prod_rules = {} if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 2, 300): T = td.quickbb(Gprime) root = list(T)[0] T = td.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules) else: T = td.quickbb(G) root = list(T)[0] T = td.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules) if DEBUG: print if DEBUG: print "--------------------" if DEBUG: print "- Production Rules -" if DEBUG: print "--------------------" for k in prod_rules.iterkeys(): if DEBUG: print k s = 0 for d in prod_rules[k]: s += prod_rules[k][d] for d in prod_rules[k]: prod_rules[k][d] = float(prod_rules[k][d]) / float( s) # normailization step to create probs not counts. if DEBUG: print '\t -> ', d, prod_rules[k][d] # pp.pprint(prod_rules) rules = [] id = 0 for k, v in prod_rules.iteritems(): sid = 0 for x in prod_rules[k]: rhs = re.findall("[^()]+", x) rules.append( ("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x])) if DEBUG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x]) sid += 1 id += 1 return rules
def probabilistic_hrg (G, num_samples=1, n=None): ''' Args: ------------ G: input graph (nx obj) num_samples: (int) in the 'grow' process, this is number of synthetic graphs to generate n: (int) num_nodes; number of nodes in the resulting graphs Returns: List of synthetic graphs (H^stars) ''' graphletG = [] if DEBUG: print G.number_of_nodes() if DEBUG: print G.number_of_edges() G.remove_edges_from(G.selfloop_edges()) giant_nodes = max(nx.connected_component_subgraphs(G), key=len) G = nx.subgraph(G, giant_nodes) if n is None: num_nodes = G.number_of_nodes() else: num_nodes = n if DEBUG: print G.number_of_nodes() if DEBUG: print G.number_of_edges() graph_checks(G) if DEBUG: print if DEBUG: print "--------------------" if DEBUG: print "-Tree Decomposition-" if DEBUG: print "--------------------" prod_rules = {} if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 2, 300): T = td.quickbb(Gprime) root = list(T)[0] T = td.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules, TD) else: T = td.quickbb(G) root = list(T)[0] T = td.make_rooted(T, root) T = binarize(T) root = list(T)[0] root, children = T td.new_visit(T, G, prod_rules, TD) if DEBUG: print if DEBUG: print "--------------------" if DEBUG: print "- Production Rules -" if DEBUG: print "--------------------" for k in prod_rules.iterkeys(): if DEBUG: print k s = 0 for d in prod_rules[k]: s += prod_rules[k][d] for d in prod_rules[k]: prod_rules[k][d] = float(prod_rules[k][d]) / float(s) # normailization step to create probs not counts. 
if DEBUG: print '\t -> ', d, prod_rules[k][d] rules = [] id = 0 for k, v in prod_rules.iteritems(): sid = 0 for x in prod_rules[k]: rhs = re.findall("[^()]+", x) rules.append(("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x])) if DEBUG: print ("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x]) sid += 1 id += 1 # print rules exit() g = pcfg.Grammar('S') for (id, lhs, rhs, prob) in rules: # print type(id), type(lhs), type(rhs), type(prob) g.add_rule(pcfg.Rule(id, lhs, rhs, prob)) if DEBUG: print "Starting max size" num_nodes = num_nodes num_samples = num_samples g.set_max_size(num_nodes) if DEBUG: print "Done with max size" Hstars = [] for i in range(0, num_samples): rule_list = g.sample(num_nodes) # print rule_list hstar = grow(rule_list, g)[0] # print "H* nodes: " + str(hstar.number_of_nodes()) # print "H* edges: " + str(hstar.number_of_edges()) Hstars.append(hstar) return Hstars
num_nodes = G.number_of_nodes() print "Number of Nodes:\t" + str(num_nodes) num_edges = G.number_of_edges() print "Number of Edges:\t" + str(num_edges) # To parse a large graph we use 10 samples of size 500 each. It is # possible to parse the whole graph, but the approximate # decomposition method we use is still quite slow. if num_nodes >= 500: for Gprime in gs.rwr_sample(G, 10, 500): pr.prod_rules = {} T = td.quickbb(Gprime) prod_rules = pr.learn_production_rules(Gprime, T) else: T = td.quickbb(G) prod_rules = pr.learn_production_rules(G, T) print "Rule Induction Complete" Gstar = [] Dstar = [] Gstargl = [] Ggl = [] for run in range(0, 20): if num_nodes < 100: nG, nD = sg.grow(prod_rules, num_nodes, 1) else: