def perform_rpq(graph, regex_automaton, start_lst, end_lst, use_tc_method_adj=False):
    """Evaluate a regular path query over a labelled graph.

    The graph and the query automaton are intersected label by label with a
    Kronecker product, the transitive closure of that intersection is taken,
    and every closure entry that goes from a (graph vertex, start state)
    index to a (graph vertex, final state) index is projected back onto the
    pair of graph vertices.

    Args:
        graph: object exposing ``graph_dict`` (label -> matrix) and
            ``num_vert``.
        regex_automaton: object exposing ``to_GrB_matrix()`` (used here as a
            label -> matrix mapping), ``num_vert``, ``start_states`` and
            ``final_states``.
        start_lst: accepted for interface compatibility.
            NOTE(review): this function never reads it, so the answer is not
            restricted to these vertices.
        end_lst: same remark as for ``start_lst``.
        use_tc_method_adj: when true the closure is computed by
            ``get_transitive_closure_adj``, otherwise by
            ``get_transitive_closure``.

    Returns:
        A tuple ``(reachability_matrix, closure_nvals)`` where
        ``reachability_matrix`` is a BOOL matrix of size
        ``graph.num_vert x graph.num_vert`` and ``closure_nvals`` is the
        ``nvals`` of the filtered closure of the intersection.

    Raises:
        KeyError: if a label of the query is missing from ``graph.graph_dict``.
    """
    query_dict = regex_automaton.to_GrB_matrix()
    graph_dict = graph.graph_dict

    # Intersection of graph and automaton: one Kronecker product per label
    # that occurs in the query.
    intersection = LabelGraph()
    intersection.graph_dict = {
        label: graph_dict[label].kronecker(query_dict[label])
        for label in query_dict
    }
    # Size of the product: the first non-zero column count, 0 if there is none.
    intersection.num_vert = next(
        (m.ncols for m in intersection.graph_dict.values() if m.ncols != 0),
        0,
    )
    product_matrix = intersection.to_GrB_matrix()

    regex_size = regex_automaton.num_vert

    def coord_to_index(coord):
        # Flatten (graph vertex, automaton state) into a product-matrix index.
        v_graph, v_regex = coord
        return v_graph * regex_size + v_regex

    graph_vertices = range(graph.num_vert)
    start_states = set(
        map(coord_to_index, product(graph_vertices, regex_automaton.start_states)))
    final_states = set(
        map(coord_to_index, product(graph_vertices, regex_automaton.final_states)))

    closure = (get_transitive_closure_adj if use_tc_method_adj
               else get_transitive_closure)
    # The closure is filtered once here; filtering the already filtered
    # matrix a second time before iterating would be redundant.
    reachability_matrix_ = closure(product_matrix).select(lib.GxB_NONZERO)

    reachability_matrix = Matrix.sparse(BOOL, graph.num_vert, graph.num_vert)
    print("Started r_m\n")
    for v_i, v_j, _ in zip(*reachability_matrix_.to_lists()):
        if v_i in start_states and v_j in final_states:
            # Integer division recovers the graph vertex from a product index.
            reachability_matrix[v_i // regex_size, v_j // regex_size] = True

    return (reachability_matrix, reachability_matrix_.nvals)
def test_rpq():
    """Check ``perform_rpq`` against the stored reachability files.

    For every test index ``i`` below ``n_tests`` the regex and graph are
    loaded from ``DATA_DIR``, the query is run with the adjacency-based
    closure method over all graph vertices, and the resulting matrix is
    compared with the pairs listed in ``reachability_{i}.txt``.
    """
    for i in range(n_tests):
        # Read the regex through a context manager so the handle is closed.
        with open(os.path.join(DATA_DIR, f'regex_{i}.txt'), 'r') as regex_file:
            regex_string = regex_file.read()
        regex_automaton = RegexAutomaton(regex_string)

        graph = LabelGraph().from_txt(os.path.join(DATA_DIR, f'graph_{i}.txt'))

        start_lst = list(range(graph.num_vert))
        end_lst = list(range(graph.num_vert))
        res_matrix = perform_rpq(
            graph, regex_automaton, start_lst, end_lst, True)[0]

        # Expected pairs: one "from to" pair per line.
        reachability = set()
        with open(os.path.join(DATA_DIR, f'reachability_{i}.txt'), 'r') as f:
            for line in f:
                v, to = line.split(' ')
                reachability.add((int(v), int(to)))

        assert res_matrix.nvals == len(reachability)

        for v, to, _ in zip(*res_matrix.to_lists()):
            if res_matrix[v, to] is True:
                assert (v, to) in reachability
            else:
                assert (v, to) not in reachability
def test_cfpq_grammar_2():
    """Check ``cfpq_matrix_mult`` with the grammar at ``TEST_GRAMMARS[1]``.

    Each of the ``NUM_GRAPHS`` graphs in ``DATA_DIR`` is queried and the
    reachable pairs of the result are compared with the pairs stored in the
    matching ``expected_1_{i}.txt`` file.
    """
    # The grammar does not depend on the graph index, so it is parsed once
    # instead of inside a loop over the graphs.
    g_2 = GrammarCNF.from_text(TEST_GRAMMARS[1])

    for i in range(NUM_GRAPHS):
        graph = LabelGraph().from_txt(
            os.path.join(DATA_DIR, f'graph_{i}.txt'))
        result = cfpq_matrix_mult(graph, g_2)

        # Expected pairs: one "from to" pair per line.
        expected = set()
        with open(os.path.join(DATA_DIR, f'expected_{1}_{i}.txt'), 'r') as f:
            for line in f:
                v, to = line.split(' ')
                expected.add((int(v), int(to)))

        edges = set(LabelGraph.get_reachable(result))
        assert edges == expected
def test_cfpq_brackets():
    """Check ``cfpq_tensor_product`` with the grammar at ``TEST_GRAMMARS[0]``.

    Each of the ``NUM_GRAPHS`` graphs in ``DATA_DIR`` is queried and the
    reachable pairs of the result are compared with the pairs stored in the
    matching ``expected_0_{i}.txt`` file.
    """
    # The grammar does not depend on the graph index, so it is parsed once
    # instead of inside a loop over the graphs.
    brackets = CFGWrapper.from_text(TEST_GRAMMARS[0])

    for i in range(NUM_GRAPHS):
        graph = LabelGraph().from_txt(
            os.path.join(DATA_DIR, f'graph_{i}.txt'))
        result = cfpq_tensor_product(graph, brackets)

        # Expected pairs: one "from to" pair per line.
        expected = set()
        with open(os.path.join(DATA_DIR, f'expected_{0}_{i}.txt'), 'r') as f:
            for line in f:
                v, to = line.split(' ')
                expected.add((int(v), int(to)))

        edges = set(LabelGraph.get_reachable(result))
        assert edges == expected
def cfpq_matrix_mult(g: LabelGraph, cfg: GrammarCNF):
    """Answer a context-free path query by repeated matrix multiplication.

    One BOOL matrix is kept per grammar variable. Matrices are seeded from
    the graph edges through the single-symbol productions, then every pair
    production ``head -> left right`` adds ``M[left] @ M[right]`` to
    ``M[head]`` until no matrix gains a stored value.

    Args:
        g: labelled graph exposing ``num_vert``, ``graph_dict``,
            ``get_edges(label)`` and ``vertices``.
        cfg: grammar exposing ``start_symbol``, ``variables``,
            ``productions``, ``pair_productions`` and ``generate_epsilon()``.

    Returns:
        The matrix stored for the start symbol; an empty 0 x 0 BOOL matrix
        when the graph has no vertices.
    """
    num_vert = g.num_vert
    if num_vert == 0:
        return Matrix.sparse(BOOL, num_vert, num_vert)

    result = LabelGraph()
    start_sym = cfg.start_symbol
    result.num_vert = num_vert

    for variable in cfg.variables:
        result.graph_dict[variable] = Matrix.sparse(BOOL, num_vert, num_vert)

    for label in g.graph_dict:
        term = Terminal(label)
        result.graph_dict[term] = g.graph_dict[label].dup()

        # Which heads derive this terminal does not depend on the edge, so
        # the productions are filtered once per label, not once per edge.
        heads = [
            production.head
            for production in cfg.productions
            if len(production.body) == 1 and production.body[0] == term
        ]
        for v_from, v_to in g.get_edges(label):
            for head in heads:
                result.graph_dict[head][v_from, v_to] = True

    # If the grammar generates the empty word, every vertex reaches itself.
    if cfg.generate_epsilon():
        for v in g.vertices:
            result.graph_dict[start_sym][v, v] = True

    matrix_changing = True
    with semiring.LOR_LAND_BOOL:
        while matrix_changing:
            matrix_changing = False
            for production in cfg.pair_productions:
                head = production.head
                body = production.body
                prev_nvals = result.graph_dict[head].nvals
                tmp = result.graph_dict[body[0]] @ result.graph_dict[body[1]]
                result.graph_dict[head] = result.graph_dict[head] + tmp
                # A changed number of stored values means something was
                # added, so another pass over the productions is needed.
                if prev_nvals != result.graph_dict[head].nvals:
                    matrix_changing = True

    return result.graph_dict[start_sym]
def main():
    """Command-line entry point: load a graph and a regex, then run the query.

    Arguments ``--graph`` and ``--regex`` are required file paths. The
    optional ``--start`` and ``--end`` are paths to files with one vertex
    number per line; when one is omitted, all graph vertices are used.
    The value returned by ``perform_rpq`` is not used here.
    """
    parser = argparse.ArgumentParser(description='Basic graph DB')
    parser.add_argument(
        '--graph', required=True, type=str, help='path to graph file')
    parser.add_argument(
        '--regex', required=True, type=str, help='path to regex file')
    parser.add_argument(
        '--start', required=False, type=str,
        help='path to given starting vertices')
    parser.add_argument(
        '--end', required=False, type=str,
        help='path to given end vertices')
    args = parser.parse_args()

    # Read the regex through a context manager so the handle is closed.
    with open(args.regex, 'r') as regex_file:
        regex_string = regex_file.read()
    regex_automaton = RegexAutomaton(regex_string)

    graph = LabelGraph().from_txt(args.graph)

    def read_vertices(path):
        # No file given: fall back to every vertex of the graph.
        if path is None:
            return list(range(graph.num_vert))
        with open(path, 'r') as f:
            # Blank lines (e.g. a trailing newline) are skipped rather than
            # being passed to int().
            return [int(line) for line in f if line.strip()]

    start = read_vertices(args.start)
    end = read_vertices(args.end)

    perform_rpq(graph, regex_automaton, start, end)
def test_cfpq_empty_graph():
    """``cfpq_matrix_mult`` on a fresh ``LabelGraph()`` yields no reachable pairs."""
    grammar = GrammarCNF.from_text(TEST_GRAMMARS[0])
    answer = cfpq_matrix_mult(LabelGraph(), grammar)
    reachable_pairs = set(LabelGraph.get_reachable(answer))
    assert reachable_pairs == set()
def cfpq_hellings(g: LabelGraph, cfg: GrammarCNF):
    """Answer a context-free path query with a worklist over labelled edges.

    One BOOL matrix is kept per grammar variable (plus a copy of each graph
    label matrix under its ``Terminal``). Every stored edge is queued; each
    dequeued edge is combined with the edges ending at its source and with
    the edges starting at its target through the pair productions, and any
    edge that is new for the production head is stored and queued.

    Args:
        g: labelled graph exposing ``num_vert``, ``graph_dict`` and
            ``get_edges(label)``.
        cfg: grammar exposing ``start_symbol``, ``variables``,
            ``productions``, ``pair_productions`` and ``generate_epsilon()``.

    Returns:
        The matrix stored for the start symbol.
    """
    num_vert = g.num_vert
    start_sym = cfg.start_symbol
    result = LabelGraph()
    result.num_vert = num_vert
    worklist = deque()

    for nonterminal in cfg.variables:
        result.graph_dict[nonterminal] = Matrix.sparse(BOOL, num_vert, num_vert)

    # If the grammar generates the empty word, every vertex reaches itself.
    if cfg.generate_epsilon():
        for vertex in range(num_vert):
            result.graph_dict[start_sym][vertex, vertex] = True

    # Seed the variable matrices from graph edges via single-symbol rules.
    for label in g.graph_dict:
        term = Terminal(label)
        result.graph_dict[term] = g.graph_dict[label].dup()
        for src, dst in g.get_edges(label):
            for rule in cfg.productions:
                if len(rule.body) != 1:
                    continue
                if rule.body[0] == term:
                    result.graph_dict[rule.head][src, dst] = True

    # Every edge known so far has to be processed at least once.
    for label in result.graph_dict:
        worklist.extend((label, i, j) for i, j in result.get_edges(label))

    while worklist:
        var, v, u = worklist.popleft()

        # Edges (v_new --var_left--> v) placed in front of the current edge.
        for var_left in result.graph_dict:
            for v_new, v_ in result.get_edges(var_left):
                if v_ != v:
                    continue
                for rule in cfg.pair_productions:
                    if not (rule.body[1] == var and rule.body[0] == var_left):
                        continue
                    if (v_new, u) not in result.get_edges(rule.head):
                        result.graph_dict[rule.head][v_new, u] = True
                        worklist.append((rule.head, v_new, u))

        # Edges (u --var_right--> u_new) placed after the current edge.
        for var_right in result.graph_dict:
            for u_, u_new in result.get_edges(var_right):
                if u_ != u:
                    continue
                for rule in cfg.pair_productions:
                    if not (rule.body[1] == var_right and rule.body[0] == var):
                        continue
                    if (v, u_new) not in result.get_edges(rule.head):
                        result.graph_dict[rule.head][v, u_new] = True
                        worklist.append((rule.head, v, u_new))

    return result.graph_dict[start_sym]
def cfpq_tensor_product(g: LabelGraph, cfg: GrammarCNF):
    """Answer a context-free path query via Kronecker products with an RFA.

    The grammar is converted with ``RFA().from_cfg``. In a loop, the current
    label matrices are multiplied (Kronecker product) with the RFA matrices
    label by label, the transitive closure of the combined matrix is taken,
    and every closure entry that joins an RFA start state to an RFA final
    state adds an edge labelled with the variable registered for that state
    pair. The loop ends when the closure's ``nvals`` stops changing.

    Args:
        g: labelled graph exposing ``num_vert``, ``graph_dict`` and
            ``vertices``.
        cfg: grammar exposing ``terminals``, ``productions`` and
            ``start_symbol``; it is passed to ``RFA().from_cfg``.

    Returns:
        The matrix stored under ``cfg.start_symbol.value``; an empty 0 x 0
        BOOL matrix when the graph has no vertices.
    """
    rfa = RFA().from_cfg(cfg)

    result = LabelGraph()
    result.num_vert = g.num_vert

    # Nothing to compute on a graph without vertices.
    if g.num_vert == 0:
        return Matrix.sparse(BOOL, g.num_vert, g.num_vert)

    # Start from copies of the graph matrices, then make sure every label
    # used by the RFA or by the grammar terminals has a matrix.
    result.graph_dict = {
        label: g.graph_dict[label].dup()
        for label in g.graph_dict
    }
    for rfa_label in rfa.graph_dict:
        if rfa_label not in result.graph_dict:
            result.graph_dict[rfa_label] = Matrix.sparse(
                BOOL, g.num_vert, g.num_vert)
    for terminal in cfg.terminals:
        if terminal.value not in result.graph_dict:
            result.graph_dict[terminal.value] = Matrix.sparse(
                BOOL, g.num_vert, g.num_vert)

    # A production with an empty body puts a loop on every vertex.
    for rule in cfg.productions:
        if rule.body == []:
            for vertex in g.vertices:
                result.graph_dict[rule.head.value][vertex, vertex] = True

    # nvals of the previous closure; 0 before the first one is computed.
    prev_nvals = 0
    while True:
        # Label-wise Kronecker product of the current matrices with the RFA.
        layers = {
            label: result.graph_dict[label].kronecker(rfa.graph_dict[label])
            for label in rfa.graph_dict
        }
        # Size of the product: first non-zero column count, 0 if none.
        product_size = next(
            (layer.ncols for layer in layers.values() if layer.ncols != 0),
            0,
        )

        combined = LabelGraph()
        combined.graph_dict = layers
        combined.num_vert = product_size
        intersection = combined.to_GrB_matrix()

        tc = get_transitive_closure(intersection)

        for s, o in LabelGraph.get_reachable(tc):
            # Split each product index into (graph vertex, RFA state).
            s_m, s_rfa = divmod(s, rfa.num_vert)
            o_m, o_rfa = divmod(o, rfa.num_vert)
            if s_rfa in rfa.start_states and o_rfa in rfa.final_states:
                label = rfa.var_by_vertices[(s_rfa, o_rfa)]
                result.graph_dict[label][s_m, o_m] = True

        current_nvals = tc.nvals
        if prev_nvals == current_nvals:
            break
        prev_nvals = current_nvals

    return result.graph_dict[cfg.start_symbol.value]
def test_cfpq_empty_graph():
    """``cfpq_tensor_product`` on a fresh ``LabelGraph()`` yields no reachable pairs."""
    grammar = CFGWrapper.from_text(TEST_GRAMMARS[0])
    answer = cfpq_tensor_product(LabelGraph(), grammar)
    reachable_pairs = set(LabelGraph.get_reachable(answer))
    assert reachable_pairs == set()
for d_name in GRAPH_DIRS: with open(f'query_benchmarks/{d_name}_bench.csv', 'w') as res_f: graph_filename = glob.glob(f"{bench_prefix}/{d_name}/*.txt")[0] for regex_filename in os.listdir(f'{bench_prefix}/{d_name}/regexes/'): if regex_filename in collected: print(f'{regex_filename} already done') else: print(f'Running {d_name} -- {regex_filename}...') regex_ = os.path.join(f'{bench_prefix}/{d_name}/regexes/', regex_filename) # read Regex from file regex_string = open(regex_, 'r').read() regex_automaton = RegexAutomaton(regex_string) # read GrB matrix from file graph = LabelGraph().from_txt(graph_filename) # benchmarking 2 methods of transitive closure for tc_method in range(2): # read start and end vertices start = list(range(graph.num_vert)) end = list(range(graph.num_vert)) times = [] nvals = 0 print(f'Running method {tc_method}...\n') for i in range(5): print(f'Running {i} time...\n') start = time.time_ns() nvals = perform_rpq(graph, regex_automaton, start, end, bool(tc_method))[1]