def hrg_clique_tree(G):
    if G is None:
        return

    # --------------------
    # Tree decomposition
    # --------------------
    num_nodes = G.number_of_nodes()
    prod_rules = {}

    if num_nodes >= 500:
        for Gprime in gs.rwr_sample(G, 2, 300):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = phrg.binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = phrg.binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)

    # pprint.pprint(children)
    return root, children
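
# Illustrative sketch (not part of the pipeline): inspect the root bag and the
# subtrees of the binarized clique tree for a small stand-in graph;
# nx.karate_club_graph() is an assumption for demo purposes.
def _demo_hrg_clique_tree():
    G = nx.karate_club_graph()
    root, children = hrg_clique_tree(G)
    print 'root bag size:', len(root), '| # subtrees:', len(children)
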
def derive_production_rules(G):
    """
    Parameters
    ----------
    G : input graph
    """
    from PHRG import graph_checks, binarize

    prod_rules = {}

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)
    num_nodes = G.number_of_nodes()

    graph_checks(G)

    print
    print "--------------------"
    print "-Tree Decomposition-"
    print "--------------------"

    if num_nodes >= 500:
        for Gprime in gs.rwr_sample(G, 2, 100):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)

    print
    print "--------------------"
    print "- Production Rules -"
    print "--------------------"

    for k in prod_rules.iterkeys():
        print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: convert counts to probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            # print '\t -> ', d, prod_rules[k][d]

    return prod_rules
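
# Illustrative sketch (not part of the pipeline): run the extractor on a small
# stand-in graph; nx.karate_club_graph() is an assumption for demo purposes.
# After normalization, the probabilities under each LHS should sum to ~1.0.
def _demo_derive_production_rules():
    G = nx.karate_club_graph()
    prs = derive_production_rules(G)
    for lhs in prs:
        print lhs, '-> sum of probs:', sum(prs[lhs].values())
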
def probabilistic_hrg(G, num_samples=1):
    graphletG = []

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)

    num_nodes = G.number_of_nodes()

    graph_checks(G)

    prod_rules = {}  # new_visit accumulates rule counts here

    print
    print "--------------------"
    print "-Tree Decomposition-"
    print "--------------------"

    if num_nodes >= 500:
        for Gprime in gs.rwr_sample(G, 2, 100):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)

    print
    print "--------------------"
    print "- Production Rules -"
    print "--------------------"

    for k in prod_rules.iterkeys():
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: convert counts to probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x]))
            sid += 1
        id += 1

    g = pcfg.Grammar('S')
    for (id, lhs, rhs, prob) in rules:
        g.add_rule(pcfg.Rule(id, lhs, rhs, prob))

    print "Starting max size"
    g.set_max_size(num_nodes)
    print "Done with max size"

    Hstars = []
    for i in range(0, num_samples):
        rule_list = g.sample(num_nodes)
        hstar = grow(rule_list, g)[0]
        # print "H* nodes: " + str(hstar.number_of_nodes())
        # print "H* edges: " + str(hstar.number_of_edges())
        Hstars.append(hstar)

    return Hstars
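
# Illustrative sketch: sample two synthetic graphs; each H* is grown to the
# same order as the (giant component of the) input graph.
def _demo_probabilistic_hrg_sampling():
    G = nx.karate_club_graph()
    for hstar in probabilistic_hrg(G, num_samples=2):
        print 'H*:', hstar.number_of_nodes(), 'nodes,', hstar.number_of_edges(), 'edges'
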
def get_hrg_production_rules(edgelist_data_frame, graph_name, tw=False,
                             n_subg=2, n_nodes=300, nstats=False):
    from growing import derive_prules_from

    t_start = time.time()
    df = edgelist_data_frame
    if df.shape[1] == 4:
        G = nx.from_pandas_dataframe(df, 'src', 'trg', edge_attr=True)  # whole graph
    elif df.shape[1] == 3:
        G = nx.from_pandas_dataframe(df, 'src', 'trg', ['ts'])  # whole graph
    else:
        G = nx.from_pandas_dataframe(df, 'src', 'trg')
    G.name = graph_name
    print "==> read in graph took: {} seconds".format(time.time() - t_start)

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)
    num_nodes = G.number_of_nodes()

    phrg.graph_checks(G)

    if DBG: print
    if DBG: print "--------------------"
    if DBG: print "-Tree Decomposition-"
    if DBG: print "--------------------"

    prod_rules = {}
    K = n_subg
    n = n_nodes
    if num_nodes >= 500:
        print 'Grande'
        t_start = time.time()
        for Gprime in gs.rwr_sample(G, K, n):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = phrg.binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
            # Process(target=td.new_visit, args=(T, G, prod_rules,)).start()
            # (a spawned process cannot update this prod_rules dict)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = phrg.binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)
        if tw:
            print_treewidth(T)

    if DBG: print
    if DBG: print "--------------------"
    if DBG: print "- Production Rules -"
    if DBG: print "--------------------"

    for k in prod_rules.iterkeys():
        if DBG: print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: convert counts to probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            if DBG: print '\t -> ', d, prod_rules[k][d]

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x]))
            if DBG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x])
            sid += 1
        id += 1

    df = pd.DataFrame(rules)
    # df.to_csv('ProdRules/{}_prs.tsv'.format(G.name), header=False,
    #           index=False, sep="\t")  # optionally persist the rules as TSV

    # Graph generation of synthetic graphs: grow graphs using the union of
    # rules from sampled subgraphs to match the order of the original graph.
    hStars = grow_exact_size_hrg_graphs_from_prod_rules(
        rules, graph_name, G.number_of_nodes(), 10)
    print '... hStars graphs:', len(hStars)

    d = {graph_name + "_hstars": hStars}
    with open(r"Results/{}_hstars.pickle".format(graph_name), "wb") as output_file:
        cPickle.dump(d, output_file)
    if os.path.exists(r"Results/{}_hstars.pickle".format(graph_name)):
        print "File saved"
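
# Usage sketch (path and column names are assumptions): a two-column src/trg
# edgelist DataFrame falls through to the plain nx.from_pandas_dataframe
# branch above.
#
#   df = pd.read_csv('datasets/out.contact', sep=r'\s+', header=None,
#                    names=['src', 'trg'])
#   get_hrg_production_rules(df, 'contact')
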
def extract_prod_rules_fragment(G, num_nodes, prod_rules):
    # NOTE: this block arrived without its enclosing `def`; the name and
    # signature above are assumed, mirroring the identical decomposition
    # step used by the other extractors in this module.
    graph_checks(G)

    print
    print "--------------------"
    print "-Tree Decomposition-"
    print "--------------------"

    if num_nodes >= 500:
        for Gprime in gs.rwr_sample(G, 2, 100):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = hrg.binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = hrg.binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)


def flatten(tup):
    if type(tup) == frozenset:
        print type(tup)
    else:
        print type(tup[0]), type(tup[1])
def probabilistic_hrg_learning(G, num_samples=1, n=None, prod_rules=None):
    graphletG = []

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)

    if n is None:
        num_nodes = G.number_of_nodes()
    else:
        num_nodes = n

    graph_checks(G)

    if prod_rules is None:
        prod_rules = {}  # iterated below, so it must be a dict

    # print
    # print "--------------------"
    # print "-Tree Decomposition-"
    # print "--------------------"

    if num_nodes >= 500:
        for Gprime in gs.rwr_sample(G, 2, 300):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)

    # print
    # print "--------------------"
    # print "- Production Rules -"
    # print "--------------------"

    for k in prod_rules.iterkeys():
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: convert counts to probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x]))
            sid += 1
        id += 1

    return rules
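
# Illustrative sketch: prod_rules is caller-owned here, so the accumulated
# rule dictionary can be inspected alongside the returned rule tuples.
def _demo_probabilistic_hrg_learning():
    G = nx.karate_club_graph()
    pooled = {}
    rules = probabilistic_hrg_learning(G, prod_rules=pooled)
    print len(rules), 'rules from', len(pooled), 'LHS keys'
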
def probabilistic_hrg(G, n=None):
    '''
    Rule extraction procedure
    '''
    if G is None:
        return

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)

    if n is None:
        num_nodes = G.number_of_nodes()
    else:
        num_nodes = n

    graph_checks(G)

    if DEBUG: print
    if DEBUG: print "--------------------"
    if DEBUG: print "-Tree Decomposition-"
    if DEBUG: print "--------------------"

    prod_rules = {}
    if num_nodes >= 500:
        for Gprime in gs.rwr_sample(G, 2, 300):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)

    if DEBUG: print
    if DEBUG: print "--------------------"
    if DEBUG: print "- Production Rules -"
    if DEBUG: print "--------------------"

    for k in prod_rules.iterkeys():
        if DEBUG: print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: convert counts to probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            if DEBUG: print '\t -> ', d, prod_rules[k][d]

    # pp.pprint(prod_rules)

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x]))
            if DEBUG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                            rhs, prod_rules[k][x])
            sid += 1
        id += 1

    return rules
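
# Illustrative sketch: the (id, lhs, rhs, prob) tuples returned above can be
# fed straight into the PCFG machinery, mirroring the grammar-building step
# in the sampling variant further below.
def _demo_rules_to_grammar():
    G = nx.karate_club_graph()
    rules = probabilistic_hrg(G)
    g = pcfg.Grammar('S')
    for (rid, lhs, rhs, prob) in rules:
        g.add_rule(pcfg.Rule(rid, lhs, rhs, prob))
    print len(rules), 'rules loaded into the grammar'
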
def sampled_subgraphs_cliquetree(orig, tree_path):
    files = glob(tree_path + "*.dimacs.tree")
    prod_rules = {}
    graph_name = orig

    for fname in files:
        print '... input file:', fname

        df = Pandas_DataFrame_From_Edgelist([orig])[0]
        if df.shape[1] == 3:
            G = nx.from_pandas_dataframe(df, 'src', 'trg', ['ts'])
        else:
            G = nx.from_pandas_dataframe(df, 'src', 'trg')
        print nx.info(G)

        with open(fname, 'r') as f:  # read tree decomp from INDDGO
            lines = f.readlines()
            lines = [x.rstrip('\r\n') for x in lines]

        cbags = {}
        bags = [x.split() for x in lines if x.startswith('B')]

        for b in bags:
            cbags[int(b[1])] = [int(x) for x in b[3:]]  # what to do with bag size?

        edges = [x.split()[1:] for x in lines if x.startswith('e')]
        edges = [[int(k) for k in x] for x in edges]

        tree = defaultdict(set)
        for s, t in edges:
            tree[frozenset(cbags[s])].add(frozenset(cbags[t]))
        if DEBUG: print '.. # of keys in `tree`:', len(tree.keys())
        if DEBUG: print tree.keys()

        root = frozenset(cbags[1])
        if DEBUG: print '.. Root:', root
        T = td.make_rooted(tree, root)
        if DEBUG: print '.. T rooted:', len(T)

        # nfld.unfold_2wide_tuple(T)  # lets me display the tree's frozen sets
        T = phrg.binarize(T)
        # ToDo: here is where something funny is going on.
        td.new_visit(T, G, prod_rules)

        if DEBUG: print "--------------------"
        if DEBUG: print "- Production Rules -"
        if DEBUG: print "--------------------"

        for k in prod_rules.iterkeys():
            if DEBUG: print k
            s = 0
            for d in prod_rules[k]:
                s += prod_rules[k][d]
            for d in prod_rules[k]:
                # normalization step: convert counts to probabilities
                prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
                if DEBUG: print '\t -> ', d, prod_rules[k][d]

        print '... prod_rules size', len(prod_rules.keys())  # production rules count

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x]))
            if DEBUG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                            rhs, prod_rules[k][x])
            sid += 1
        id += 1

    df = pd.DataFrame(rules)
    print graph_name
    graph_name = os.path.basename(graph_name)
    print graph_name
    outdf_fname = "./ProdRules/" + graph_name + ".prules"
    if not os.path.isfile(outdf_fname + ".bz2"):
        df.to_csv(outdf_fname + ".bz2", compression="bz2")
        print '...', outdf_fname, "written"
    else:
        print '...', outdf_fname, "file exists"
    return
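
# Usage sketch (paths are assumptions): `tree_path` is a prefix that globs to
# a set of INDDGO *.dimacs.tree files for subgraphs sampled from `orig`.
#
#   sampled_subgraphs_cliquetree('datasets/out.contact',
#                                'TreeDecomps/contact_')
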
def dimacs_td_ct(tdfname):
    """ tree decomp to clique-tree """
    print '... input file:', tdfname
    fname = tdfname
    graph_name = os.path.basename(fname)
    gname = graph_name.split('.')[0]
    gfname = "datasets/out." + gname
    tdh = os.path.basename(fname).split('.')[1]  # tree decomp heuristic
    tfname = gname + "." + tdh

    G = load_edgelist(gfname)
    if DEBUG: print nx.info(G)
    print

    with open(fname, 'r') as f:  # read tree decomp from INDDGO
        lines = f.readlines()
        lines = [x.rstrip('\r\n') for x in lines]

    cbags = {}
    bags = [x.split() for x in lines if x.startswith('B')]

    for b in bags:
        cbags[int(b[1])] = [int(x) for x in b[3:]]  # what to do with bag size?

    edges = [x.split()[1:] for x in lines if x.startswith('e')]
    edges = [[int(k) for k in x] for x in edges]

    tree = defaultdict(set)
    for s, t in edges:
        tree[frozenset(cbags[s])].add(frozenset(cbags[t]))
    if DEBUG: print '.. # of keys in `tree`:', len(tree.keys())
    if DEBUG: print tree.keys()

    root = list(tree)[0]
    if DEBUG: print '.. Root:', root
    root = frozenset(cbags[1])  # use INDDGO's bag 1, not an arbitrary dict key
    if DEBUG: print '.. Root:', root
    T = td.make_rooted(tree, root)
    if DEBUG: print '.. T rooted:', len(T)

    # nfld.unfold_2wide_tuple(T)  # lets me display the tree's frozen sets
    T = phrg.binarize(T)

    prod_rules = {}
    td.new_visit(T, G, prod_rules)

    if DEBUG: print "--------------------"
    if DEBUG: print "- Production Rules -"
    if DEBUG: print "--------------------"

    for k in prod_rules.iterkeys():
        if DEBUG: print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: convert counts to probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            if DEBUG: print '\t -> ', d, prod_rules[k][d]

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x]))
            if DEBUG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                            rhs, prod_rules[k][x])
            sid += 1
        id += 1

    df = pd.DataFrame(rules)
    outdf_fname = "./ProdRules/" + tfname + ".prules"
    if not os.path.isfile(outdf_fname + ".bz2"):
        df.to_csv(outdf_fname + ".bz2", compression="bz2")
        print '...', outdf_fname, "written"
    else:
        print '...', outdf_fname, "file exists"
    return
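
# Usage sketch (filename is an assumption): the basename is parsed as
# <gname>.<heuristic>..., so the matching edgelist is loaded from
# datasets/out.<gname> and rules are written to
# ./ProdRules/<gname>.<heuristic>.prules.
#
#   dimacs_td_ct('contact.mcs.dimacs.tree')
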
def dimacs_td_ct_fast(oriG, tdfname):
    """ tree decomp to clique-tree

    Parameters
    ----------
    oriG:    original (input) graph, as an nx object
    tdfname: filepath to the tree decomposition from INDDGO
    synthg:  when the input graph is a synthetic (orig) graph

    ToDo: currently not handling synthg in this version of dimacs_td_ct
    """
    G = oriG
    if G is None:
        return 1

    prod_rules = {}

    t_basename = os.path.basename(tdfname)
    out_tdfname = os.path.basename(t_basename) + ".prs"
    if os.path.exists("../ProdRules/" + out_tdfname):
        # print "==> exists:", out_tdfname
        return out_tdfname
    print "../ProdRules/" + out_tdfname, tdfname

    with open(tdfname, 'r') as f:  # read tree decomp from INDDGO
        lines = f.readlines()
        lines = [x.rstrip('\r\n') for x in lines]

    cbags = {}
    bags = [x.split() for x in lines if x.startswith('B')]

    for b in bags:
        cbags[int(b[1])] = [int(x) for x in b[3:]]  # what to do with bag size?

    edges = [x.split()[1:] for x in lines if x.startswith('e')]
    edges = [[int(k) for k in x] for x in edges]

    tree = defaultdict(set)
    for s, t in edges:
        tree[frozenset(cbags[s])].add(frozenset(cbags[t]))
    if DEBUG: print '.. # of keys in `tree`:', len(tree.keys())

    root = frozenset(cbags[1])  # INDDGO's bag 1, not an arbitrary dict key
    T = td.make_rooted(tree, root)
    # nfld.unfold_2wide_tuple(T)  # lets me display the tree's frozen sets
    T = phrg.binarize(T)
    root = list(T)[0]
    root, children = T

    td.new_visit(T, G, prod_rules)

    if DEBUG: print "--------------------"
    if DEBUG: print "- Production Rules -"
    if DEBUG: print "--------------------"

    for k in prod_rules.iterkeys():
        if DEBUG: print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: convert counts to probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            if DEBUG: print '\t -> ', d, prod_rules[k][d]

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x]))
            if DEBUG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                            rhs, prod_rules[k][x])
            sid += 1
        id += 1

    if DEBUG: print "--------------------"
    if DEBUG: print '- P. Rules', len(rules)
    if DEBUG: print "--------------------"

    # ToDo: save these rules to file, e.g.:
    # write_prod_rules_to_tsv(rules, out_tdfname)
    # g = pcfg.Grammar('S')
    # for (id, lhs, rhs, prob) in rules:
    #     g.add_rule(pcfg.Rule(id, lhs, rhs, prob))

    # Synthetic graphs:
    # hStars = grow_exact_size_hrg_graphs_from_prod_rules(rules, graph_name, G.number_of_nodes(), 20)
    # metricx = ['degree', 'hops', 'clust', 'assort', 'kcore', 'gcd']  # 'eigen'
    # metricx = ['gcd', 'avgdeg']
    # metrics.network_properties([G], metricx, hStars, name=graph_name, out_tsv=True)

    return out_tdfname
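
# Usage sketch (paths are assumptions): unlike dimacs_td_ct above, the graph
# is passed in already loaded, so one graph object can be reused across many
# tree decompositions.
#
#   G = load_edgelist('datasets/out.contact')
#   prs_file = dimacs_td_ct_fast(G, 'TreeDecomps/contact.mcs.dimacs.tree')
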
def probabilistic_hrg(G, num_samples=1, n=None):
    '''
    Args:
    ------------
    G:           input graph (nx obj)
    num_samples: (int) number of synthetic graphs to generate in the 'grow' step
    n:           (int) num_nodes; number of nodes in the resulting graphs

    Returns: list of synthetic graphs (H^stars)
    '''
    graphletG = []

    if DEBUG: print G.number_of_nodes()
    if DEBUG: print G.number_of_edges()

    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)

    if n is None:
        num_nodes = G.number_of_nodes()
    else:
        num_nodes = n

    if DEBUG: print G.number_of_nodes()
    if DEBUG: print G.number_of_edges()

    graph_checks(G)

    if DEBUG: print
    if DEBUG: print "--------------------"
    if DEBUG: print "-Tree Decomposition-"
    if DEBUG: print "--------------------"

    prod_rules = {}
    if num_nodes >= 500:
        for Gprime in gs.rwr_sample(G, 2, 300):
            T = td.quickbb(Gprime)
            root = list(T)[0]
            T = td.make_rooted(T, root)
            T = binarize(T)
            root = list(T)[0]
            root, children = T
            td.new_visit(T, G, prod_rules)
    else:
        T = td.quickbb(G)
        root = list(T)[0]
        T = td.make_rooted(T, root)
        T = binarize(T)
        root = list(T)[0]
        root, children = T
        td.new_visit(T, G, prod_rules)

    if DEBUG: print
    if DEBUG: print "--------------------"
    if DEBUG: print "- Production Rules -"
    if DEBUG: print "--------------------"

    for k in prod_rules.iterkeys():
        if DEBUG: print k
        s = 0
        for d in prod_rules[k]:
            s += prod_rules[k][d]
        for d in prod_rules[k]:
            # normalization step: convert counts to probabilities
            prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
            if DEBUG: print '\t -> ', d, prod_rules[k][d]

    rules = []
    id = 0
    for k, v in prod_rules.iteritems():
        sid = 0
        for x in prod_rules[k]:
            rhs = re.findall("[^()]+", x)
            rules.append(("r%d.%d" % (id, sid),
                          "%s" % re.findall("[^()]+", k)[0],
                          rhs, prod_rules[k][x]))
            if DEBUG: print("r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0],
                            rhs, prod_rules[k][x])
            sid += 1
        id += 1

    g = pcfg.Grammar('S')
    for (id, lhs, rhs, prob) in rules:
        g.add_rule(pcfg.Rule(id, lhs, rhs, prob))

    if DEBUG: print "Starting max size"
    g.set_max_size(num_nodes)
    if DEBUG: print "Done with max size"

    Hstars = []
    for i in range(0, num_samples):
        rule_list = g.sample(num_nodes)
        hstar = grow(rule_list, g)[0]
        # print "H* nodes: " + str(hstar.number_of_nodes())
        # print "H* edges: " + str(hstar.number_of_edges())
        Hstars.append(hstar)

    return Hstars
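
# Illustrative sketch: grow a synthetic graph larger than the input by passing
# n; the grammar's max size is then set to n rather than |V(G)|.
def _demo_grow_larger():
    G = nx.karate_club_graph()
    hstars = probabilistic_hrg(G, num_samples=1, n=2 * G.number_of_nodes())
    print hstars[0].number_of_nodes(), 'nodes grown'
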
def isomorphic_test_from_dimacs_tree(orig, tdfname, gname=""):
    """
    orig:    path to original/reference input graph
    tdfname: path fragment for a set of td prod rules
    gname:   graph name (str)
    returns:
    """
    # if given the whole tree path, use it; else assume a path fragment
    print '... input graph  :', os.path.basename(orig)
    print '... td path frag :', tdfname

    G = load_edgelist(orig)  # load edgelist into a graph obj
    N = G.number_of_nodes()
    M = G.number_of_edges()

    # +++ graph checks
    if G is None:
        sys.exit(1)
    G.remove_edges_from(G.selfloop_edges())
    giant_nodes = max(nx.connected_component_subgraphs(G), key=len)
    G = nx.subgraph(G, giant_nodes)
    graph_checks(G)
    # --- graph checks

    G.name = gname

    files = glob(tdfname + "*.dimacs.tree")
    prod_rules = {}
    stacked_df = pd.DataFrame()

    mat_dict = {}
    for i, x in enumerate(sorted(files)):
        mat_dict[os.path.basename(x).split(".")[0].split("_")[-1]] = i
        if DBG: print os.path.basename(x).split(".")[0].split("_")[-1]

    for tfname in sorted(files):
        tname = os.path.basename(tfname).split(".")
        tname = "_".join(tname[:2])

        with open(tfname, 'r') as f:  # read tree decomp from INDDGO
            lines = f.readlines()
            lines = [x.rstrip('\r\n') for x in lines]

        cbags = {}
        bags = [x.split() for x in lines if x.startswith('B')]

        for b in bags:
            cbags[int(b[1])] = [int(x) for x in b[3:]]  # what to do with bag size?

        edges = [x.split()[1:] for x in lines if x.startswith('e')]
        edges = [[int(k) for k in x] for x in edges]

        tree = defaultdict(set)
        for s, t in edges:
            tree[frozenset(cbags[s])].add(frozenset(cbags[t]))
        if DBG: print '.. # of keys in `tree`:', len(tree.keys())

        root = frozenset(cbags[1])
        T = td.make_rooted(tree, root)
        # nfld.unfold_2wide_tuple(T)  # lets me display the tree's frozen sets
        T = phrg.binarize(T)

        td.new_visit(T, G, prod_rules)
        # from json import dumps
        # print dumps(prod_rules, indent=4, sort_keys=True)

        for k in prod_rules.iterkeys():
            if DBG: print k
            s = 0
            for d in prod_rules[k]:
                s += prod_rules[k][d]
            for d in prod_rules[k]:
                # normalization step: convert counts to probabilities
                prod_rules[k][d] = float(prod_rules[k][d]) / float(s)
                if DBG: print '\t -> ', d, prod_rules[k][d]

        if DBG: print "--------------------"
        if DBG: print '- Prod. Rules'
        if DBG: print "--------------------"

        rules = []
        id = 0
        for k, v in prod_rules.iteritems():
            sid = 0
            for x in prod_rules[k]:
                rhs = re.findall("[^()]+", x)
                rules.append(("r%d.%d" % (id, sid),
                              "%s" % re.findall("[^()]+", k)[0],
                              rhs, prod_rules[k][x]))
                if DBG: print "r%d.%d" % (id, sid), "%s" % re.findall("[^()]+", k)[0], rhs, prod_rules[k][x]
                sid += 1
            id += 1

        df = pd.DataFrame(rules)
        df['cate'] = tname
        stacked_df = pd.concat([df, stacked_df])

    print "\nStacked prod rules\n", "~" * 20
    print "  ", stacked_df.shape
    if args['verb']: print stacked_df.to_string()
    stacked_df.to_csv("../Results/{}_stacked_df.tsv".format(gname), sep="\t")
    if os.path.exists("../Results/{}_stacked_df.tsv".format(gname)):
        print 'Wrote:', "../Results/{}_stacked_df.tsv".format(gname)

    print "\nIsomorphic union of the rules (_mod probs)\n", "~" * 20
    stacked_df.columns = ['rnbr', 'lhs', 'rhs', 'pr', df['cate'].name]
    iso_union, iso_interx = isomorph_intersection_2dfstacked(stacked_df)
    print "  ", iso_union.shape
    if args['verb']: print iso_union.to_string()

    print "\nIsomorphic intersection of the prod rules\n", "~" * 20
    print "  ", iso_interx.shape
    iso_interx.to_csv('../Results/{}_isom_interxn.tsv'.format(gname))
    if os.path.exists('../Results/{}_isom_interxn.tsv'.format(gname)):
        print 'Wrote:', '../Results/{}_isom_interxn.tsv'.format(gname)
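
# Usage sketch (paths are assumptions): `tdfname` is a prefix that globs to a
# set of *.dimacs.tree decompositions of the same input graph; the stacked
# rules and their isomorphic intersection land under ../Results/.
#
#   isomorphic_test_from_dimacs_tree('datasets/out.contact',
#                                    'TreeDecomps/contact', gname='contact')
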
def main(add_edge_events={}, return_dict={}):
    start = time()
    del_edge_events = {}
    print(add_edge_events, file=open(logfile, 'a'))

    g_prev = nx.DiGraph()
    g_next = nx.DiGraph()
    events = sorted(list(set(add_edge_events.keys() + del_edge_events.keys())))
    name = None
    shrg_rules = {}
    i = 0
    for t in events[:-1]:
        decomp_time = time()
        if t in add_edge_events:
            for u, v in add_edge_events[t]:
                g_next.add_edge(u, v, label='e')
        if t in del_edge_events:
            for u, v in del_edge_events[t]:
                if (u, v) in g_next.edges():
                    g_next.remove_edge(u, v)
        nx.set_node_attributes(g_next, 'label', 'u')

        # keep only the largest weakly connected component
        if not nx.is_weakly_connected(g_next):
            g_next = max(nx.weakly_connected_component_subgraphs(g_next), key=len)

        g_union = union_graph(g_prev, g_next)
        tree_decomp_l = tree_decomposition(g_union)
        i += 1
        tree_decomp = tree_decomp_l[0]
        tree_decomp = prune(tree_decomp, frozenset())
        tree_decomp = binarize(tree_decomp)
        tree_decomp = prune(tree_decomp, frozenset())
        td.new_visit(tree_decomp, g_prev, g_next, shrg_rules, i)
        g_prev = g_next.copy()
        print('tree decomp #{} done in {} sec'.format(t, time() - decomp_time),
              file=open(logfile, 'a'))

    prev_rules = []
    next_rules = []
    anchor_candidates = []
    # collect anchor candidates: nodes of terminal rules (no nonterminals on
    # the RHS) created at the latest timestep
    for lhs_set in shrg_rules.values():
        for rule_tuple in lhs_set:
            nonterm = False
            for n in rule_tuple[0].rhs.nodes(data=True):
                if isinstance(n[1]['label'], grammar.Nonterminal):
                    nonterm = True
                    break
            if not nonterm and rule_tuple[1].time == i and not rule_tuple[1].iso:
                for n in rule_tuple[0].rhs.nodes(data=True):
                    if 'external' not in n[1] and not isinstance(n[1]['label'], grammar.Nonterminal):
                        anchor_candidates.append((n[1]['oid'], rule_tuple))

    print('Number of Anchors', len(anchor_candidates), file=open(logfile, 'a'))

    anchors = random.sample(anchor_candidates, len(anchor_candidates))
    for anchor in anchors:
        oid, rule = anchor
        prev, next = rule
        for n in prev.rhs.nodes(data=True):
            if 'oid' in n[1] and n[1]['oid'] == oid:
                n[1]['label'] = oid
        for n in next.rhs.nodes(data=True):
            if 'oid' in n[1] and n[1]['oid'] == oid:
                n[1]['label'] = oid
                print('label changed to oid', rule[1].id, rule[1].time, n,
                      file=open(logfile, 'a'))
        for n in g_next.nodes(data=True):
            if n[0] == oid:
                n[1]['label'] = oid
        for n in g_prev.nodes(data=True):
            if n[0] == oid:
                n[1]['label'] = oid

    # normalize rule weights per LHS and split into prev/next rule lists
    for lhs_set in shrg_rules.values():
        s = 0
        for rule_tuple in lhs_set:
            prev, next = rule_tuple
            s += prev.weight
        for rule_tuple in lhs_set:
            rule_tuple[1].weight /= float(s)
            next_rules.append(rule_tuple[1])
            rule_tuple[0].weight /= float(s)
            prev_rules.append(rule_tuple[0])

    assert len(prev_rules) == len(next_rules)

    print('Parse start, time elapsed: {} sec'.format(time() - start), file=open(logfile, 'a'))
    print('Number of Rules ', len(prev_rules), file=open(logfile, 'a'))
    forest = p.parse(prev_rules, [grammar.Nonterminal('0')], g_next)
    print('Parse end, time elapsed: {} sec'.format(time() - start), file=open(logfile, 'a'))

    try:
        new_g = p.derive(p.viterbi(forest), next_rules)
    except KeyError:
        print('Goal error!', file=open(logfile, 'a'))
        return_dict['status'] = 'fail'
        return_dict['graph'] = None
        return_dict['shrg_rules'] = shrg_rules
        return_dict['time'] = time() - start
        return 'fail', None, shrg_rules, time() - start

    h_shrg = nx.DiGraph()
    for e in hypergraphs.edges(new_g):
        h_shrg.add_edge(e.h[0], e.h[1])

    return_dict['status'] = 'pass'
    return_dict['graph'] = h_shrg
    return_dict['shrg_rules'] = shrg_rules
    return_dict['time'] = time() - start
    return 'pass', h_shrg, shrg_rules, time() - start
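
# Usage sketch: drive main() with a tiny synthetic edge stream; keys are
# timestamps, values are lists of (u, v) edges. Note that the decomposition
# loop consumes events[:-1], so the final timestamp is never decomposed.
#
#   events = {0: [(1, 2), (2, 3)], 1: [(3, 4)], 2: [(4, 1)]}
#   status, h_star, rules, secs = main(add_edge_events=events)
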