def predict_and_compute(self):
    assert self.predictor is not None
    input_instances, output_graphs = self.manager.run()
    if len(output_graphs) > 0 and isinstance(output_graphs[0], tuple):
        # ignore conllu graphs here
        output_graphs = [x[0] for x in output_graphs]
    input_graphs = [inst.fields['graph'].metadata for inst in input_instances]
    input_sents = [inst.fields['src_tokens_str'].metadata
                   for inst in input_instances]

    if self.pred_args.save_pred_path is not None:
        # save serialized graphs to pkl file
        try:
            self.save_graphs([nx.adjacency_data(x) for x in input_graphs],
                             [nx.adjacency_data(x) for x in output_graphs],
                             self.pred_args.save_pred_path)
        except AttributeError:
            self.save_graphs([nx.adjacency_data(x) for x in input_graphs],
                             [nx.adjacency_data(x[0]) for x in output_graphs],
                             self.pred_args.save_pred_path)

    return compute_s_metric(input_graphs,
                            output_graphs,
                            input_sents,
                            semantics_only=self.semantics_only,
                            drop_syntax=self.drop_syntax,
                            include_attribute_scores=self.include_attribute_scores)
def save_qrep(fn, cur_qrep):
    assert ".pkl" in fn
    qrep = copy.deepcopy(cur_qrep)
    # convert the networkx graphs to JSON-serializable dicts before pickling
    qrep["join_graph"] = nx.adjacency_data(qrep["join_graph"])
    qrep["subset_graph"] = nx.adjacency_data(qrep["subset_graph"])
    with open(fn, "wb") as f:
        pickle.dump(qrep, f)
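# A matching loader is not part of the snippet above. A minimal sketch,
# assuming the pickle layout written by save_qrep (the load_qrep name is an
# assumption, not taken from the original code):
import pickle
import networkx as nx

def load_qrep(fn):
    with open(fn, "rb") as f:
        qrep = pickle.load(f)
    # nx.adjacency_graph is the documented inverse of nx.adjacency_data
    qrep["join_graph"] = nx.adjacency_graph(qrep["join_graph"])
    qrep["subset_graph"] = nx.adjacency_graph(qrep["subset_graph"])
    return qrep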
def parse_sql(sql, user, db_name, db_host, port, pwd, timeout=False,
              compute_ground_truth=True, subset_cache_dir="./subset_cache/"):
    '''
    @sql: sql query string.

    @ret: python dict with the keys:
        sql: original sql string
        join_graph: networkX graph representing the query and its join edges.
            Properties include:
                Nodes:
                    - table
                    - alias
                    - predicate matches
                Edges:
                    - join_condition
            Note: this is the only place where these strings are stored. Each
            subplan is represented by its nodes within the join_graph, and
            these properties can be used to reconstruct the appropriate query
            for each subplan.
        subset_graph: networkX graph representing each subplan as a node.
            Properties of each subplan include all the cardinality data that
            needs to be computed:
                - true_count
                - pg_count
                - total_count
    '''
    start = time.time()
    join_graph = extract_join_graph(sql)
    subset_graph = generate_subset_graph(join_graph)

    print("query has", len(join_graph.nodes), "relations,",
          len(join_graph.edges), "joins, and", len(subset_graph),
          "possible subplans.", "took:", time.time() - start)

    ret = {}
    ret["sql"] = sql
    # json-ify the graphs so the returned dict is directly serializable
    ret["join_graph"] = nx.adjacency_data(join_graph)
    ret["subset_graph"] = nx.adjacency_data(subset_graph)
    return ret
def write_nx_to_file(nx_graph, filename):
    print(f"writing networkx output file: {filename}")
    json_out = nx.adjacency_data(nx_graph)
    with open(filename, "w", encoding="utf-8") as f:
        json.dump(json_out, f)
def from_ud_lines(cls, path):
    with open(path) as f1:
        lines = f1.readlines()
    graphs = {}
    for i, line in enumerate(lines):
        try:
            sent, tags = line.split("\t")
        except ValueError:
            pdb.set_trace()
        tags = tags.strip()
        tags = tags.split(",")
        sentence = sent

        empty_graph = nx.DiGraph()
        empty_graph.add_node("-root-0")
        empty_graph.nodes["-root-0"]['type'] = 'root'
        empty_graph.nodes["-root-0"]['domain'] = 'semantics'
        empty_graph.nodes["-root-0"]['frompredpatt'] = False
        empty_graph.nodes["-root-0"]['sentence'] = sentence
        empty_graph.nodes["-root-0"]['pos_tags'] = tags

        name = f"test_graph_{i}"
        graph_data = nx.adjacency_data(empty_graph)
        g = UDSSentenceGraph.from_dict(graph_data, name)
        graphs[name] = g
    return cls(graphs)
def ba_graph(name, n):
    print("***")
    # n = random.randint(10, 15)
    # n: number of nodes, m=2: number of edges to attach from a new node to
    # existing nodes
    G = nx.barabasi_albert_graph(n, 2)
    for l in G.edges():
        G.edges[l]["weight"] = random.randint(weight[0], weight[1])
        G.edges[l]["sp"] = 1
    mapping = dict(zip(G.nodes(), range(1, n + 1)))
    G1 = nx.relabel_nodes(G, mapping)  # relabel nodes as 1..n

    nx.draw(G1, with_labels=True, font_weight='bold')
    plt.savefig("graph_" + name + ".png")  # save as png
    plt.close()

    print(G1.nodes())
    print(G1.nodes().data())
    G1_nl_format = nx.node_link_data(G1)  # graph in node-link format
    G1_ad_format = nx.adjacency_data(G1)  # graph in adjacency format
    print("G1_nl_format:", G1_nl_format)
    print("G1_adj_format:", G1_ad_format)
    with open('topo_' + name + '.json', 'w') as json_file:
        json.dump(G1_ad_format, json_file)
    return G1
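# The topology file written by ba_graph can be loaded back into a graph with
# nx.adjacency_graph, the inverse of nx.adjacency_data. A minimal sketch (the
# read_topo name is an assumption, not part of the original code):
import json
import networkx as nx

def read_topo(name):
    with open('topo_' + name + '.json') as json_file:
        data = json.load(json_file)
    return nx.adjacency_graph(data)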
def fix_qrep(qrep):
    # json-ify the graphs
    # note: nx.OrderedDiGraph requires NetworkX < 3.0 (the Ordered* graph
    # classes were removed in 3.0)
    qrep["subset_graph"] = nx.adjacency_data(
        nx.OrderedDiGraph(qrep["subset_graph"]))

    for nd in qrep["join_graph"].nodes(data=True):
        data = nd[1]
        for i, col in enumerate(data["pred_cols"]):
            # add pred related feature
            cmp_op = data["pred_types"][i]
            if cmp_op == "in" or "like" in cmp_op or cmp_op == "eq":
                val = data["pred_vals"][i]
                if isinstance(val, dict):
                    val = [val["literal"]]
                elif not hasattr(val, "__len__"):
                    val = [val]
                elif isinstance(val[0], dict):
                    val = val[0]["literal"]
                val = set(val)
                data["pred_vals"][i] = val

    qrep["join_graph"] = nx.adjacency_data(qrep["join_graph"])
def save_file(self, path: str, file_type: str):
    logger.debug(locals())
    if not self.graph:
        raise ValueError("No graph to save!")
    if "json" in file_type.lower():
        if "node link graph" in file_type.lower():
            with pathlib.Path(path).open("w") as file_p:
                data = networkx.node_link_data(self.graph)
                json.dump(data, file_p)
            del data
        elif "adjacency graph" in file_type.lower():
            with pathlib.Path(path).open("w") as file_p:
                data = networkx.adjacency_data(self.graph)
                json.dump(data, file_p)
            del data
        else:
            raise NotImplementedError()
    else:
        raise NotImplementedError()
def from_single_line(cls, line):
    def tokenize(sent):
        # TODO: real tokenization here
        return sent.split(" ")

    if isinstance(line, str):
        lines = [line]
    else:
        lines = line

    graphs = {}
    for i, line in enumerate(lines):
        sentence = line.strip()
        empty_graph = nx.DiGraph()
        empty_graph.add_node("test-root-0")
        empty_graph.nodes["test-root-0"]['type'] = 'root'
        empty_graph.nodes["test-root-0"]['domain'] = 'semantics'
        empty_graph.nodes["test-root-0"]['frompredpatt'] = False
        empty_graph.nodes["test-root-0"]['sentence'] = sentence
        # use j for the token index so it does not shadow the line index i
        for j, node_name in enumerate(tokenize(sentence)):
            empty_graph.add_node(f"test-syntax-{j+1}")
            empty_graph.nodes[f"test-syntax-{j+1}"]["form"] = node_name
            empty_graph.nodes[f"test-syntax-{j+1}"]["domain"] = 'syntax'
            empty_graph.nodes[f"test-syntax-{j+1}"]["type"] = 'token'
            empty_graph.nodes[f"test-syntax-{j+1}"]["position"] = j + 1

        name = f"test_graph_{i}"
        graph_data = nx.adjacency_data(empty_graph)
        g = UDSSentenceGraph.from_dict(graph_data, name)
        for node in g.nodes:
            if 'type' not in g.nodes[node].keys():
                g.nodes[node]['type'] = None
            if 'domain' not in g.nodes[node].keys():
                g.nodes[node]['domain'] = 'syntax'
        g.nodes["test-root-0"]['type'] = 'root'
        g.nodes["test-root-0"]['sentence'] = sentence
        g.nodes["test-root-0"]['domain'] = "semantics"
        graphs[name] = g
    return cls(graphs)
def serialize(self):
    return nx.adjacency_data(self.arbor_graph)
def to_dict(self) -> Dict:
    """Convert the graph to a dictionary"""
    return adjacency_data(self.graph)
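# The inverse operation is not shown above. A minimal sketch, assuming the
# dict comes from to_dict() (the helper name is an assumption, not part of
# the original code):
from typing import Dict
from networkx.readwrite.json_graph import adjacency_graph

def graph_from_dict(data: Dict):
    """Rebuild the networkx graph from an adjacency_data dictionary."""
    return adjacency_graph(data)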
def before_request():
    # store empty graph if none exists yet
    if 'graph' not in session:
        session['graph'] = nx.adjacency_data(nx.Graph())
def setSessionGraph(graph):
    session['graph'] = nx.adjacency_data(graph)
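# A matching getter is not part of the snippet above. Assuming the session
# value was written by setSessionGraph, a sketch of the inverse (the
# getSessionGraph name is an assumption):
def getSessionGraph():
    return nx.adjacency_graph(session['graph'])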
import torch
import os.path as osp
import torch.nn.functional as F
from torch.nn import ModuleList
from torch_geometric.datasets import KarateClub
from torch_geometric.datasets import Planetoid
import torch_geometric.transforms as T
from torch_geometric.nn import GCNConv, ChebConv  # noqa
from torch_geometric.utils import convert
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import networkx as nx

G = nx.karate_club_graph()
adj = nx.adjacency_data(G)
colors = [
    '#ffc0cb', '#bada55', '#008080', '#420420', '#7fe5f0', '#065535', '#ffd700'
]
label = [
    0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1
]
# recover the ground-truth community from the 'club' node attribute, which
# nx.adjacency_data keeps in its 'nodes' list
real_label = []
for c in adj['nodes']:
    if c['club'] == 'Mr. Hi':
        real_label += [0]
    else:
        real_label += [1]
import networkx as nx
import matplotlib.pyplot as plt

G = nx.Graph()
G.add_edge("a", "b", weight=0.31)
G.add_edge("q", "b", weight=0.50)
print(G.get_edge_data("a", "b"))
nx.complete_graph(3)  # creates a K3 graph; result unused in this example
print(nx.adjacency_data(G))

# nx.draw has no with_weights argument; draw the edge weights explicitly
pos = nx.spring_layout(G)
nx.draw(G, pos, with_labels=True)
nx.draw_networkx_edge_labels(G, pos,
                             edge_labels=nx.get_edge_attributes(G, "weight"))
plt.show()
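# Round-trip sketch (not part of the original script): nx.adjacency_graph is
# the inverse of nx.adjacency_data, so the dict can be turned back into an
# equivalent weighted graph.
data = nx.adjacency_data(G)
G2 = nx.adjacency_graph(data)
print(G2.get_edge_data("a", "b"))  # expected: {'weight': 0.31}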
def parse_sql(sql, user, db_name, db_host, port, pwd, timeout=False,
              compute_ground_truth=True, subset_cache_dir="./subset_cache/"):
    '''
    @sql: sql query string.

    @ret: python dict with the keys:
        sql: original sql string
        join_graph: networkX graph representing the query and its join edges.
            Properties include:
                Nodes:
                    - table
                    - alias
                    # FIXME: matches, or separate it out into ops AND predicates
                    - matches
                Edges:
                    - join_condition
            Note: this is the only place where these strings are stored. Each
            of the subqueries is represented by its nodes within the
            join_graph, and these properties can be used to reconstruct the
            appropriate query for the subsets.
        subset_graph: networkX graph representing each subquery. Properties
            include all the ground truth data that needs to be computed:
                - true_count
                - pg_count
                - total_count
    '''
    start = time.time()
    join_graph = extract_join_graph(sql)
    subset_graph = generate_subset_graph(join_graph)

    print("query has", len(join_graph.nodes), "relations,",
          len(join_graph.edges), "joins, and", len(subset_graph),
          "possible subsets.", "took:", time.time() - start)

    ret = {}
    ret["sql"] = sql
    ret["join_graph"] = join_graph
    ret["subset_graph"] = subset_graph

    if not compute_ground_truth:
        ret["join_graph"] = nx.adjacency_data(ret["join_graph"])
        ret["subset_graph"] = nx.adjacency_data(ret["subset_graph"])
        return ret

    assert user is not None
    make_dir(subset_cache_dir)
    subset_cache_file = subset_cache_dir + get_subset_cache_name(sql)

    # Check which cardinalities of the subset graph we already know. Note that
    # we have to cache at this level because the maximal matching might make
    # arbitrary choices each time.
    with shelve.open(subset_cache_file) as cache:
        if sql in cache:
            currently_stored = cache[sql]
        else:
            currently_stored = {}

    unknown_subsets = subset_graph.copy()
    unknown_subsets = unknown_subsets.subgraph(subset_graph.nodes -
                                               currently_stored.keys())

    print(len(unknown_subsets.nodes), "/", len(subset_graph.nodes),
          "subsets still unknown (", len(currently_stored), "known )")

    # let us update the ground truth values
    edges = get_optimal_edges(unknown_subsets)
    paths = list(reconstruct_paths(edges))
    for p in paths:
        for el1, el2 in zip(p, p[1:]):
            assert len(el1) > len(el2)

    # ensure the paths we constructed cover every possible path
    sanity_check_unknown_subsets = unknown_subsets.copy()
    for n1, n2 in edges.items():
        if n1 in sanity_check_unknown_subsets.nodes:
            sanity_check_unknown_subsets.remove_node(n1)
        if n2 in sanity_check_unknown_subsets.nodes:
            sanity_check_unknown_subsets.remove_node(n2)
    assert len(sanity_check_unknown_subsets.nodes) == 0

    subset_sqls = []
    for path in paths:
        join_order = [tuple(sorted(x)) for x in path_to_join_order(path)]
        join_order.reverse()
        sql_to_exec = nodes_to_sql(join_order, join_graph)
        if compute_ground_truth:
            prefix = "explain (analyze, timing off, format json) "
        else:
            prefix = "explain (analyze off, timing off, format json) "
        sql_to_exec = prefix + sql_to_exec
        subset_sqls.append(sql_to_exec)

    print("computing all", len(unknown_subsets),
          "unknown subset cardinalities with", len(subset_sqls), "queries")

    pre_exec_sqls = []
    # TODO: if we use the min #queries approach, maybe greedy approach and
    # letting pg choose join order is better?
    pre_exec_sqls.append("set join_collapse_limit to 1")
    pre_exec_sqls.append("set from_collapse_limit to 1")
    if timeout:
        pre_exec_sqls.append("set statement_timeout = {}".format(timeout))

    sanity_check_unknown_subsets = unknown_subsets.copy()
    for idx, path_sql in enumerate(bar(subset_sqls)):
        res = execute_query(path_sql, user, db_host, port, pwd, db_name,
                            pre_exec_sqls)
        if res is None:
            print("Query failed to execute, ignoring.")
            breakpoint()
            continue
        plan = res[0][0][0]
        plan_tree = plan["Plan"]
        results = list(analyze_plan(plan_tree))
        for result in results:
            # this assertion is invalid because PG may choose to use an
            # implicit join predicate, for example, if a.c1 = b.c1 and
            # b.c1 = c.c1, then PG may choose to join on a.c1 = c.c1
            # assert nx.is_connected(join_graph.subgraph(result["aliases"])), \
            #     (result["aliases"], plan_tree)
            aliases_key = tuple(sorted(result["aliases"]))
            if compute_ground_truth:
                currently_stored[aliases_key] = {
                    "expected": result["expected"],
                    "actual": result["actual"]
                }
            else:
                currently_stored[aliases_key] = {
                    "expected": result["expected"]
                }
            if aliases_key in sanity_check_unknown_subsets.nodes:
                sanity_check_unknown_subsets.remove_node(aliases_key)

        if idx % 5 == 0:
            with shelve.open(subset_cache_file) as cache:
                cache[sql] = currently_stored
            print(len(currently_stored), "total subsets now known")

    assert len(sanity_check_unknown_subsets.nodes) == 0

    with shelve.open(subset_cache_file) as cache:
        cache[sql] = currently_stored

    for node in subset_graph.nodes:
        subset_graph.nodes[node]["cardinality"] = currently_stored[node]

    print("total time:", time.time() - start)

    # json-ify the graphs
    ret["join_graph"] = nx.adjacency_data(ret["join_graph"])
    ret["subset_graph"] = nx.adjacency_data(ret["subset_graph"])

    return ret