def get_strengthening_constraint(self, input_graph: Graph) -> Graph:
    """
    Build the constraint graph shared by every check at the current depth.

    For each plan in ``self._checks`` the strengthening stored at ``self.depth`` is embedded into
    ``input_graph`` once per subgraph mapping that extends one of the plan's partial mappings.
    Within a plan only the (tagged) edges present in every embedding survive; across plans only
    the (tagged) edges and tags present for every plan survive. The survivors are assembled into
    the returned graph.

    NOTE(review): if ``self._checks`` is empty the accumulators below stay ``None`` and the node
    collection fails - presumably callers guarantee at least one check; confirm.
    """
    shared_tags = None
    shared_edges = None
    shared_tagged_edges = None

    for plan, partial_mappings in self._checks.items():
        strengthening, s_mapping = plan.strengthenings[self.depth]
        pattern_edges = strengthening.get_all_edges()
        pattern_tagged_edges = set(strengthening.iter_tagged_edges())
        tags_of_plan = set(strengthening.iter_tags())

        # Intersection, over all embeddings of this plan, of the pattern's edges expressed in
        # terms of the nodes they were mapped to. Stays None if no embedding is found.
        edges_of_plan = None
        tagged_edges_of_plan = None

        for partial_mapping in partial_mappings:
            seed_mapping = partial_mapping.apply_mapping(s_mapping, only_keys=True)
            for m in strengthening.get_subgraph_mappings(
                    input_graph, partial_mapping=seed_mapping):
                node_of = m.m_node
                mapped_tagged = {
                    TaggedEdge(node_of[e.src], node_of[e.dst], e.tag)
                    for e in pattern_tagged_edges
                }
                mapped_edges = {
                    Edge(node_of[e.src], node_of[e.dst], e.label)
                    for e in pattern_edges
                }

                if tagged_edges_of_plan is None:
                    tagged_edges_of_plan = mapped_tagged
                    edges_of_plan = mapped_edges
                    continue

                tagged_edges_of_plan.intersection_update(mapped_tagged)
                edges_of_plan.intersection_update(mapped_edges)

        # A plan without any embedding (or with empty results) contributes empty sets.
        if shared_tags is None:
            shared_tags = tags_of_plan or set()
            shared_tagged_edges = tagged_edges_of_plan or set()
            shared_edges = edges_of_plan or set()
            continue

        shared_tags.intersection_update(tags_of_plan or set())
        shared_tagged_edges.intersection_update(tagged_edges_of_plan or set())
        shared_edges.intersection_update(edges_of_plan or set())

    # Every endpoint of a surviving edge becomes a node of the constraint graph.
    endpoints = set()
    for surviving in (shared_tagged_edges, shared_edges):
        endpoints.update(e.src for e in surviving)
        endpoints.update(e.dst for e in surviving)

    constraint = Graph.from_nodes_and_edges(nodes=endpoints, edges=shared_edges)
    constraint.add_tagged_edges(shared_tagged_edges)
    constraint.add_tags(shared_tags)
    return constraint
def gen_filter(df: pd.DataFrame, g_df: DataFrameGraph, datagen: bool = False):
    """
    FILTER
    ------
    Example:
        filter(df, 'C1 > 3')
    ---------------
        df                    result
           C1 C2 C3              C1 C2 C3
        0   3  a  d    -->    0   4  b  e
        1   4  b  e           1   5  c  f
        2   5  c  f
    ---------------
    Graph Abstraction:
        - EQUAL edges from every input column node to its output column node, and from every
          preserved input cell (through an intermediate node) to its output cell.
        - Dependency edges from the cells of the column used in the condition (C1 in the example)
          to those intermediate nodes.
        - DELETE edges from the cells of removed rows to the output deletion node.

    Returns the tuple ``(result, call_str, graph, g_res)``.
    """
    cands_column = g_df.columns
    mode = SelectConst(["equality-inequality", "relop"], uid="filter_mode")
    eq_mode = mode == "equality-inequality"

    # The two modes only differ in the candidate pools and in the uids of the choices.
    if eq_mode:
        column_uid, value_uid, op_uid = "filter_column_eq", "filter_value_eq", "filter_eq_op"
        op_choices = ["==", "!="]
        column_pool = cands_column
    else:
        column_uid, value_uid, op_uid = "filter_column_relop", "filter_value_relop", "filter_relop"
        op_choices = ["<", ">"]
        numeric_cols = set(df.select_dtypes('number').columns)
        column_pool = [c for c in cands_column if c.value in numeric_cols]

    column = SelectNode(column_pool, uid=column_uid)
    all_values = set(df[column])
    value = SelectConst(list(all_values), uid=value_uid)
    op = SelectConst(op_choices, uid=op_uid)
    filter_expr = f"{column} {op} {value!r}"

    # Keep the original index around long enough to know which rows survived.
    result = RInterpreter.filter_(df, filter_expr, reset_index=False)
    filtered_indices = list(result.index)
    removed_indices = list(set(df.index) - set(filtered_indices))
    result = result.reset_index(drop=True)

    call_str = f"filter({{inp1}}, {filter_expr!r})"

    # ------------------------------------------------------------------------------------------ #
    #  Graph Construction
    # ------------------------------------------------------------------------------------------ #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df, g_res])

    added_edges: List[Edge] = []
    col_map_df = {c.value: c for c in g_df.columns}  # column name -> input column node
    col_map_res = {c.value: c for c in g_res.columns}  # column name -> output column node

    dep_label = {
        "==": ELabel.DEPENDENT_EQ,
        "!=": ELabel.DEPENDENT_INEQ,
        "<": ELabel.DEPENDENT_LT,
    }.get(op, ELabel.DEPENDENT_GT)

    deletion_node_out = g_res.deletion_node

    for col_name, in_col_node in col_map_df.items():
        # Columns are all preserved.
        added_edges.append(Edge(in_col_node, col_map_res[col_name], ELabel.EQUAL))

        # Kept rows: input cell -> intermediate -> output cell, plus the dependency on the
        # cell of the filtered column in the same row.
        kept = zip(filtered_indices, g_df.loc[filtered_indices, col_name], g_res.loc[:, col_name])
        for row_label, in_cell, out_cell in kept:
            via = graph.create_intermediate_node(out_cell.value)
            added_edges.extend([
                Edge(g_df.loc[row_label, column], via, dep_label),
                Edge(in_cell, via, ELabel.EQUAL),
                Edge(via, out_cell, ELabel.EQUAL),
            ])

        # Dropped rows are marked as deleted.
        added_edges.extend(
            Edge(cell, deletion_node_out, ELabel.DELETE)
            for cell in g_df.loc[removed_indices, col_name])

    # Input deletion node maps onto the output deletion node.
    added_edges.append(Edge(g_df.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    # ------------------------------------------------------------------------------------------ #
    #  Add information about arguments
    # ------------------------------------------------------------------------------------------ #
    # NOTE(review): in relop mode every column is tagged here although only numeric columns were
    # offered to SelectNode - confirm this is intended.
    tagged_edges: List[TaggedEdge] = [
        TaggedEdge(
            c_node, c_node,
            f"SELECTED@{column_uid}" if column == c_node.value else f"NOT_SELECTED@{column_uid}")
        for c_node in cands_column
    ]

    for cand_op in op_choices:
        prefix = "SELECTED" if cand_op == op else "NOT_SELECTED"
        graph.add_tags([f"{prefix}@{cand_op}@{op_uid}"])

    graph.add_tagged_edges(tagged_edges)

    for cand_mode in ["equality-inequality", "relop"]:
        prefix = "SELECTED" if cand_mode == mode else "NOT_SELECTED"
        graph.add_tags([f"{prefix}@{cand_mode}@filter_mode"])

    return result, call_str, graph, g_res
def gen_mutate(df: pd.DataFrame, g_df: DataFrameGraph, datagen: bool = False):
    """
    MUTATE
    ------
    Example:
        mutate(df, new_col_name='C4', operation='normalize', col_args='C1')
    ---------------
        df                    result
           C1 C2 C3              C1 C2 C3        C4
        0   3  a  d    -->    0   3  a  d  0.250000
        1   4  b  e           1   4  b  e  0.333333
        2   5  c  f           2   5  c  f  0.416666

        mutate(df, new_col_name='C4', operation='div', col_args=['C1', 'C2'])
    ---------------
        df                    result
           C1 C2 C3              C1 C2 C3    C4
        0   3   5  d   -->    0   3   5  d  0.60
        1   4  10  e          1   4  10  e  0.40
        2   5  20  f          2   5  20  f  0.25
    ---------------
    Graph Abstraction:
        - EQUAL edges from every input column/cell to the corresponding output node.
        - normalize: SUM and DIV edges describing the computation. A separate sum intermediate
          node is created for each cell of the normalized column.
        - div: DIV edges from both operand cells to an intermediate node that is EQUAL to the
          new cell.

    Returns the tuple ``(result, call_str, graph, g_res)``.
    """
    cands_cols = g_df.columns
    operation = SelectConst(["normalize", "div"], uid="mutate_operation")
    new_col_name = FreshColumn(uid="mutate_new_col_name")
    normalizing = operation == "normalize"

    if normalizing:
        args_uid = "mutate_col_args_normalize"
        col_arg = SelectNode(cands_cols, uid=args_uid)
        col_args = [col_arg]
        passed_col_args = col_arg  # the interpreter receives the bare column for normalize
    else:
        args_uid = "mutate_col_args_div"
        numerator, denominator = OrderedSubsetNode(cands_cols, uid=args_uid, min_len=2, max_len=2)
        col_args = [numerator, denominator]
        passed_col_args = col_args

    result = RInterpreter.mutate(df,
                                 new_col_name=new_col_name,
                                 operation=operation,
                                 col_args=passed_col_args)
    call_str = (f"mutate({{inp1}}, new_col_name={new_col_name!r}, "
                f"operation={operation!r}, col_args={passed_col_args!r})")

    # ------------------------------------------------------------------------------------------ #
    #  Graph Construction
    # ------------------------------------------------------------------------------------------ #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df, g_res])

    added_edges: List[Edge] = []
    col_map_df = {c.value: c for c in g_df.columns}  # column name -> input column node
    col_map_res = {c.value: c for c in g_res.columns}  # column name -> output column node

    # All input columns and their cells are carried over unchanged.
    for col_name, in_col_node in col_map_df.items():
        added_edges.append(Edge(in_col_node, col_map_res[col_name], ELabel.EQUAL))
        added_edges.extend(
            Edge(in_cell, out_cell, ELabel.EQUAL)
            for in_cell, out_cell in zip(g_df.loc[:, col_name], g_res.loc[:, col_name]))

    new_cells = g_res.loc[:, new_col_name]
    if normalizing:
        source_col = col_args[0]
        summation = df[source_col].sum()
        source_cells = list(g_df.loc[:, source_col])
        for own_cell, new_cell in zip(source_cells, new_cells):
            # Every output cell gets its own sum node fed by the whole column ...
            sum_node = graph.create_intermediate_node(summation)
            added_edges.extend(Edge(cell, sum_node, ELabel.SUM) for cell in source_cells)

            # ... which is then combined with the row's own cell.
            div_node = graph.create_intermediate_node(new_cell.value)
            added_edges.extend([
                Edge(sum_node, div_node, ELabel.DIV),
                Edge(own_cell, div_node, ELabel.DIV),
                Edge(div_node, new_cell, ELabel.EQUAL),
            ])
    else:
        for operand_cells, new_cell in zip(g_df.loc[:, col_args].values, new_cells):
            div_node = graph.create_intermediate_node(new_cell.value)
            added_edges.append(Edge(div_node, new_cell, ELabel.EQUAL))
            added_edges.extend(Edge(cell, div_node, ELabel.DIV) for cell in operand_cells)

    # Input deletion node maps onto the output deletion node.
    added_edges.append(Edge(g_df.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    # ------------------------------------------------------------------------------------------ #
    #  Add information about arguments
    # ------------------------------------------------------------------------------------------ #
    tagged_edges: List[TaggedEdge] = [
        TaggedEdge(
            c_node, c_node,
            f"SELECTED@{args_uid}" if c_node.value in col_args else f"NOT_SELECTED@{args_uid}")
        for c_node in cands_cols
    ]

    operation_tags = []
    for cand in ["normalize", "div"]:
        prefix = "SELECTED" if cand == operation else "NOT_SELECTED"
        operation_tags.append(f"{prefix}@{cand}@mutate_operation")
    graph.add_tags(operation_tags)

    graph.add_tagged_edges(tagged_edges)

    return result, call_str, graph, g_res
def gen_select(df: pd.DataFrame, g_df: DataFrameGraph, datagen: bool = False):
    """
    SELECT
    ------
    Example:
        select(df, columns_keep=None, columns_remove=['C3'])
    ---------------
        df                    result
           C1 C2 C3              C1 C2
        0   3  a  d    -->    0   3  a
        1   4  b  e           1   4  b
        2   5  c  f           2   5  c
    ---------------
    Graph Abstraction:
        - EQUAL edges from the preserved columns and their cells to the corresponding output nodes.
        - DELETE edges from the removed columns and their cells to the output deletion node.

    Returns the tuple ``(result, call_str, graph, g_res)``.
    """
    cands_cols = list(g_df.columns)
    choice = SelectConst([True, False], uid="select_keep_or_remove")

    # Exactly one of columns_keep / columns_remove is populated; the other stays None.
    subset_uid = "select_columns_keep" if choice else "select_columns_remove"
    chosen = list(SubsetNode(cands_cols, uid=subset_uid, allow_empty=False))
    columns_keep, columns_remove = (chosen, None) if choice else (None, chosen)

    result = RInterpreter.select(df, columns_keep=columns_keep, columns_remove=columns_remove)
    call_str = f"select({{inp1}}, columns_keep={columns_keep!r}, columns_remove={columns_remove!r})"

    # ------------------------------------------------------------------------------------------ #
    #  Graph Construction
    # ------------------------------------------------------------------------------------------ #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df, g_res])

    added_edges: List[Edge] = []
    col_map_df = {c.value: c for c in g_df.columns}  # column name -> input column node
    col_map_res = {c.value: c for c in g_res.columns}  # column name -> output column node

    # Split the frame's columns into the chosen ones and the rest.
    chosen_set = set(chosen)
    rest = [c for c in df.columns if c not in chosen_set]
    kept_cols, removed_cols = (chosen_set, rest) if choice else (rest, chosen_set)

    # Preserved columns and their cells.
    for col_name in kept_cols:
        added_edges.append(Edge(col_map_df[col_name], col_map_res[col_name], ELabel.EQUAL))
        added_edges.extend(
            Edge(in_cell, out_cell, ELabel.EQUAL)
            for in_cell, out_cell in zip(g_df.loc[:, col_name], g_res.loc[:, col_name]))

    # Dropped columns and their cells.
    for col_name in removed_cols:
        added_edges.append(Edge(col_map_df[col_name], g_res.deletion_node, ELabel.DELETE))
        added_edges.extend(
            Edge(cell, g_res.deletion_node, ELabel.DELETE) for cell in g_df.loc[:, col_name])

    # Input deletion node maps onto the output deletion node.
    added_edges.append(Edge(g_df.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    # ------------------------------------------------------------------------------------------ #
    #  Add information about arguments
    # ------------------------------------------------------------------------------------------ #
    tagged_edges: List[TaggedEdge] = [
        TaggedEdge(
            src=c_node,
            dst=c_node,
            tag=f"SELECTED@{subset_uid}" if c_node.value in chosen else f"NOT_SELECTED@{subset_uid}")
        for c_node in cands_cols
    ]

    for cand_choice in [True, False]:
        prefix = "SELECTED" if cand_choice == choice else "NOT_SELECTED"
        graph.add_tags([f"{prefix}@{cand_choice}@select_keep_or_remove"])

    graph.add_tagged_edges(tagged_edges)

    return result, call_str, graph, g_res
def gen_spread(df: pd.DataFrame, g_df: DataFrameGraph, datagen: bool = False):
    """
    SPREAD
    ------
    Example:
        spread(columns='var', values='value')
    ---------------
        df                         result
           C1 var value               C1 C2   C3
        0   c  C2     b     -->    0   a  d    e
        1   a  C2     d            1   c  b  NaN
        2   a  C3     e
    ---------------
    Graph Abstraction:
        - EQUAL edge between the nodes in the `columns` column and the column node of the result.
        - EQUAL edge between the `index` column and the corresponding column node of the result.
        - EQUAL edge between the cells of `index` columns and the corresponding cells in the result.
        - EQUAL edge between the cells of `values` column and the corresponding cells in the result.
        - EQUAL edge between the input deletion node and the output deletion node.

    Args:
        df: The input frame.
        g_df: The graph view of ``df``.
        datagen: Not used in this function.

    Returns:
        The tuple ``(result, call_str, graph, g_res)``.
    """
    cands_columns = g_df.columns
    columns = SelectNode(cands_columns, uid="spread_columns")

    # The values column must differ from the one supplying the new column names.
    cands_values = [c for c in g_df.columns if c.value != columns]
    values = SelectNode(cands_values, uid="spread_values")

    # The columns that will remain as is.
    index = [c for c in df.columns if c != columns and c != values]

    result = RInterpreter.spread(df, columns=columns, values=values)
    call_str = f"spread({{inp1}}, columns={columns!r}, values={values!r})"

    # ------------------------------------------------------------------------------------------ #
    #  Graph Construction
    # ------------------------------------------------------------------------------------------ #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df, g_res])

    added_edges: List[Edge] = []
    col_map_df = {c.value: c for c in g_df.columns}  # Map from df's columns to their column nodes
    col_map_res = {c.value: c for c in g_res.columns}  # Map from result's columns to their column nodes

    # - EQUAL edge between the nodes in the `columns` column and the column node of the result.
    for cell in g_df.loc[:, columns]:
        added_edges.append(Edge(cell, col_map_res[cell.value], ELabel.EQUAL))

    # - EQUAL edge between the `index` column and the corresponding column node of the result.
    # - EQUAL edge between the cells of `index` column and the corresponding cells in the result.
    for c in index:
        added_edges.append(Edge(col_map_df[c], col_map_res[c], ELabel.EQUAL))
        # Output cells are looked up by value within the same column.
        value_map = {n.value: n for n in g_res.loc[:, c]}
        for cell in g_df.loc[:, c]:
            added_edges.append(Edge(cell, value_map[cell.value], ELabel.EQUAL))

    # - EQUAL edge between the cells of `values` column and the corresponding cells in the result.
    # With index columns, each input row is located in the result through its index values;
    # without any, the input row label itself is used as the result row label.
    # (The original condition `len(index) > 0 is not None` was an accidental chained
    # comparison; plain truthiness gives the same result without the SyntaxWarning.)
    row_keys = df[index].values if index else df.index
    for idx_vals, col_val, df_val_node in zip(row_keys, df.loc[:, columns], g_df.loc[:, values]):
        if not index:
            filtered = [g_res.loc[idx_vals, col_val]]
        else:
            idx_mask = True
            for idx_val, idx in zip(idx_vals, index):
                idx_mask = idx_mask & (result[idx] == idx_val)

            filtered = list(g_res.loc[idx_mask][col_val])

        # Exactly one result cell must correspond to every input value cell.
        assert len(filtered) == 1
        res_val_node = filtered[0]
        added_edges.append(Edge(df_val_node, res_val_node, ELabel.EQUAL))

    # - EQUAL edge between the input deletion node and the output deletion node.
    added_edges.append(Edge(g_df.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    # ------------------------------------------------------------------------------------------ #
    #  Add information about arguments
    # ------------------------------------------------------------------------------------------ #
    tagged_edges: List[TaggedEdge] = []
    for c_node in cands_columns:
        tag = "SELECTED@spread_columns" if c_node.value == columns else "NOT_SELECTED@spread_columns"
        tagged_edges.append(TaggedEdge(src=c_node, dst=c_node, tag=tag))

    for c_node in cands_values:
        tag = "SELECTED@spread_values" if c_node.value == values else "NOT_SELECTED@spread_values"
        tagged_edges.append(TaggedEdge(src=c_node, dst=c_node, tag=tag))

    graph.add_tagged_edges(tagged_edges)

    return result, call_str, graph, g_res
def gen_separate(df: pd.DataFrame, g_df: DataFrameGraph, datagen: bool = False):
    """
    SEPARATE
    ------
    Example:
        separate(split_col='C2', into=["C3", "C4"])
    ---------------
        df                  result
           C1   C2             C1 C3 C4
        0   3  a_d   -->    0   3  a  d
        1   4  b_e          1   4  b  e
        2   5  c_f          2   5  c  f
    ---------------
    Graph Abstraction:
        - EQUAL edges between the columns, cells of the preserved columns and the corresponding
          cells in the output.
        - STR_SPLIT edges between concerned cells.

    Args:
        df: The input frame.
        g_df: The graph view of ``df``.
        datagen: Not used in this function.

    Returns:
        The tuple ``(result, call_str, graph, g_res)``.

    Raises:
        ExceptionAsContinue: If no value of the chosen column splits into more than one piece.
    """
    cands_cols = g_df.columns
    split_col = SelectNode(cands_cols, min_len=2, uid="separate_split_col")

    # Pieces are delimited by runs of non-alphanumeric characters. The number of new columns
    # is the largest number of pieces any cell splits into.
    separator = re.compile(r"[^a-zA-Z0-9]+")
    max_into_len = max([len(separator.split(str(x))) for x in df[split_col]])
    if max_into_len <= 1:
        raise ExceptionAsContinue

    into = [FreshColumn(uid="separate_into") for _ in range(max_into_len)]

    result = RInterpreter.separate(df, split_col=split_col, into=into)
    call_str = f"separate({{inp1}}, split_col={split_col!r}, into={into!r})"

    # ------------------------------------------------------------------------------------------ #
    #  Graph Construction
    # ------------------------------------------------------------------------------------------ #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df, g_res])

    added_edges: List[Edge] = []
    col_map_df = {c.value: c for c in g_df.columns}  # Map from df's columns to their column nodes
    col_map_res = {c.value: c for c in g_res.columns}  # Map from result's columns to their column nodes

    # - EQUAL edges between columns that are unused and their cells to their counterparts.
    #   (A previously computed but never read `unused_cols` set was dropped: it was built with
    #   `difference_update(split_col)`, which treats a string column name as characters.)
    for c in col_map_df:
        if c == split_col:
            continue
        added_edges.append(Edge(col_map_df[c], col_map_res[c], ELabel.EQUAL))
        for v1, v2 in zip(g_df.loc[:, c], g_res.loc[:, c]):
            added_edges.append(Edge(v1, v2, ELabel.EQUAL))

    # - STR_SPLIT edge from the split cell to one intermediate node per produced piece, each of
    #   which is EQUAL to the piece's cell in the output.
    for df_node, res_nodes in zip(g_df.loc[:, split_col], g_res.loc[:, into].values):
        for n in res_nodes:
            interm_node = graph.create_intermediate_node(n.value)
            added_edges.append(Edge(interm_node, n, ELabel.EQUAL))
            added_edges.append(Edge(df_node, interm_node, ELabel.STR_SPLIT))

    # - EQUAL edge between the input deletion node and the output deletion node.
    added_edges.append(Edge(g_df.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    # ------------------------------------------------------------------------------------------ #
    #  Add information about arguments
    # ------------------------------------------------------------------------------------------ #
    tagged_edges: List[TaggedEdge] = []
    for c_node in cands_cols:
        if c_node.value == split_col:
            tagged_edges.append(TaggedEdge(c_node, c_node, "SELECTED@separate_split_col"))
        else:
            tagged_edges.append(TaggedEdge(c_node, c_node, "NOT_SELECTED@separate_split_col"))

    graph.add_tagged_edges(tagged_edges)

    return result, call_str, graph, g_res
def gen_unite(df: pd.DataFrame, g_df: DataFrameGraph, datagen: bool = False):
    """
    UNITE
    ------
    Example:
        unite(cols=['C2', 'C3'], new_col_name='C4')
    ---------------
        df                    result
           C1 C2 C3              C1   C4
        0   3  a  d    -->    0   3  a_d
        1   4  b  e           1   4  b_e
        2   5  c  f           2   5  c_f
    ---------------
    Graph Abstraction:
        - EQUAL edges between the columns, cells of the preserved columns and the corresponding
          cells in the output.
        - STR_JOIN edges between concerned cells.

    Args:
        df: The input frame.
        g_df: The graph view of ``df``.
        datagen: Not used in this function.

    Returns:
        The tuple ``(result, call_str, graph, g_res)``.
    """
    cands_cols = g_df.columns
    cols = list(OrderedSubsetNode(cands_cols, min_len=2, uid="unite_cols"))
    new_col_name = FreshColumn(uid="unite_new_col_name")

    result = RInterpreter.unite(df, cols=cols, new_col_name=new_col_name)
    # The original template closed the call right after `cols=...`, leaving the parentheses
    # unbalanced; both keyword arguments now sit inside a single call.
    call_str = f"unite({{inp1}}, cols={cols!r}, new_col_name={new_col_name!r})"

    # ------------------------------------------------------------------------------------------ #
    #  Graph Construction
    # ------------------------------------------------------------------------------------------ #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df, g_res])

    added_edges: List[Edge] = []
    col_map_df = {c.value: c for c in g_df.columns}  # Map from df's columns to their column nodes
    col_map_res = {c.value: c for c in g_res.columns}  # Map from result's columns to their column nodes

    # - EQUAL edges between columns that are unused and their cells to their counterparts.
    #   Iterating the column map (instead of a set) keeps the edge order independent of hashing.
    united = set(cols)
    for c in col_map_df:
        if c in united:
            continue
        added_edges.append(Edge(col_map_df[c], col_map_res[c], ELabel.EQUAL))
        for v1, v2 in zip(g_df.loc[:, c], g_res.loc[:, c]):
            added_edges.append(Edge(v1, v2, ELabel.EQUAL))

    # - STR_JOIN edges from the united cells of a row to one intermediate node, which is EQUAL
    #   to the new cell of that row.
    for df_nodes, res_node in zip(g_df.loc[:, cols].values, g_res.loc[:, new_col_name]):
        interm_node = graph.create_intermediate_node(res_node.value)
        for n in df_nodes:
            added_edges.append(Edge(n, interm_node, ELabel.STR_JOIN))

        added_edges.append(Edge(interm_node, res_node, ELabel.EQUAL))

    # - EQUAL edge between the input deletion node and the output deletion node.
    added_edges.append(Edge(g_df.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    # ------------------------------------------------------------------------------------------ #
    #  Add information about arguments
    # ------------------------------------------------------------------------------------------ #
    tagged_edges: List[TaggedEdge] = []
    for c_node in cands_cols:
        if c_node.value in cols:
            tagged_edges.append(TaggedEdge(c_node, c_node, "SELECTED@unite_cols"))
        else:
            tagged_edges.append(TaggedEdge(c_node, c_node, "NOT_SELECTED@unite_cols"))

    graph.add_tagged_edges(tagged_edges)

    return result, call_str, graph, g_res
def gen_gather(df: pd.DataFrame, g_df: DataFrameGraph, datagen: bool = False):
    """
    GATHER
    ------
    Example:
        gather(id_vars=['C1'], value_vars=['C2', 'C3'], var_name='var', value_name='value')

        df                    result
           C1 C2 C3              C1 var value
        0   a  b  e    -->    0   a  C2     b
        1   c  d  f           1   c  C2     d
                              2   a  C3     e
                              3   c  C3     f
    ---------------
    Graph Abstraction:
        - EQUAL edges from id_var columns to the corresponding output columns.
        - EQUAL edges from value_var columns to the cells of the `var` column naming them.
        - EQUAL edges from the cells of id_var and value_var columns to the corresponding
          output cells.
        - EQUAL edge from the input deletion node to the output deletion node.
        - DELETE edges from columns in neither id_vars nor value_vars, and from their cells,
          to the output deletion node.

    Returns the tuple ``(result, call_str, graph, g_res)``.
    """
    cands_id_vars = g_df.columns
    id_vars = list(SubsetNode(cands_id_vars, uid="gather_id_vars", allow_empty=True))

    # A column cannot be both an identifier and a gathered value.
    cands_value_vars = [c for c in g_df.columns if c.value not in id_vars]
    value_vars = list(SubsetNode(cands_value_vars, uid="gather_value_vars"))

    var_name = FreshColumn(uid="gather_var_name")
    value_name = FreshColumn(uid="gather_value_name")

    result = RInterpreter.gather(df,
                                 id_vars=id_vars,
                                 value_vars=value_vars,
                                 var_name=var_name,
                                 value_name=value_name)
    call_str = (f"gather({{inp1}}, id_vars={id_vars!r}, value_vars={value_vars!r}, "
                f"var_name={var_name!r}, value_name={value_name!r})")

    # ------------------------------------------------------------------------------------------ #
    #  Graph Construction
    # ------------------------------------------------------------------------------------------ #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df, g_res])

    added_edges: List[Edge] = []
    col_map_df = {c.value: c for c in g_df.columns}  # column name -> input column node
    col_map_res = {c.value: c for c in g_res.columns}  # column name -> output column node

    # Identifier columns keep their column node.
    added_edges.extend(
        Edge(col_map_df[name], col_map_res[name], ELabel.EQUAL) for name in id_vars)

    # Each cell of the `var` column is the name of the input column it came from.
    added_edges.extend(
        Edge(col_map_df[var_node.value], var_node, ELabel.EQUAL)
        for var_node in g_res.loc[:, var_name])

    # Identifier cells are repeated once per gathered column in the output.
    repeats = len(value_vars)
    for name in id_vars:
        repeated_cells = list(g_df.loc[:, name]) * repeats
        added_edges.extend(
            Edge(in_cell, out_cell, ELabel.EQUAL)
            for in_cell, out_cell in zip(repeated_cells, g_res.loc[:, name]))

    # Gathered cells appear column after column in the `value` column.
    stacked_cells = [cell for name in value_vars for cell in g_df.loc[:, name]]
    added_edges.extend(
        Edge(in_cell, out_cell, ELabel.EQUAL)
        for in_cell, out_cell in zip(stacked_cells, g_res.loc[:, value_name]))

    # Input deletion node maps onto the output deletion node.
    added_edges.append(Edge(g_df.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Columns that are neither identifiers nor gathered are dropped, cells included.
    for name in df.columns:
        if name in id_vars or name in value_vars:
            continue
        added_edges.append(Edge(col_map_df[name], g_res.deletion_node, ELabel.DELETE))
        added_edges.extend(
            Edge(cell, g_res.deletion_node, ELabel.DELETE) for cell in g_df.loc[:, name])

    # Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    # ------------------------------------------------------------------------------------------ #
    #  Add information about arguments
    # ------------------------------------------------------------------------------------------ #
    tagged_edges: List[TaggedEdge] = [
        TaggedEdge(
            src=c_node,
            dst=c_node,
            tag="SELECTED@gather_id_vars" if c_node.value in id_vars else "NOT_SELECTED@gather_id_vars")
        for c_node in cands_id_vars
    ]
    tagged_edges.extend(
        TaggedEdge(
            src=c_node,
            dst=c_node,
            tag="SELECTED@gather_value_vars" if c_node.value in value_vars
            else "NOT_SELECTED@gather_value_vars")
        for c_node in cands_value_vars)

    graph.add_tagged_edges(tagged_edges)

    return result, call_str, graph, g_res
def gen_group_by_summarise(df: pd.DataFrame, g_df: DataFrameGraph, datagen: bool = False):
    """
    GROUP_BY (+ SUMMARISE)
    ------
    Example:
        group_by(group_cols=['C1']).summarise(summaries={"C3": ("C2", "mean")})

        df                  result
           C1   C2             C1   C3
        0   A  100   -->    0   A  150
        1   A  200          1   B  300
        2   B  300
    ---------------
    Graph Abstraction:
        - EQUAL edges from group columns to the corresponding output columns.
        - EQUAL edges from the cells of the group columns to the corresponding output cells.
        - SUM/MEAN/COUNT edges from the cells of the aggregated column (or of the group columns
          for 'count') to an intermediate node that is EQUAL to the resulting output cell.
        - EQUAL edge from the input deletion node to the output deletion node.
        - DELETE edges from every non-group column to the output deletion node; the cells of
          those columns get DELETE edges too, except the cells of the aggregated column.

    Returns the tuple ``(result, call_str, graph, g_res)``.
    """
    cands_group_cols = g_df.columns
    group_cols = list(SubsetNode(cands_group_cols, uid="group_by_group_cols"))
    new_col = FreshColumn(uid="summarise_new_col")

    agg_cands = ["count", "mean", "sum"]
    agg = SelectConst(agg_cands, uid="summarise_agg")

    # 'count' needs no column of its own; the other aggregations pick one outside the group.
    if agg == "count":
        cands_agg_col = None
        agg_col = None
    else:
        cands_agg_col = [c for c in g_df.columns if c.value not in group_cols]
        agg_col = SelectNode(cands_agg_col, uid="summarise_col")

    summaries = {new_col: (agg_col, agg)}

    result_groupby = RInterpreter.group_by(df, group_cols)
    result = RInterpreter.summarise(result_groupby, summaries)
    call_str = f"summarise(group_by({{inp1}}, group_cols={group_cols!r}), summaries={{{summaries!r}}})"

    # ------------------------------------------------------------------------------------------ #
    #  Graph Construction
    # ------------------------------------------------------------------------------------------ #
    g_res = DataFrameGraph(result)
    graph = GraphRLang.assemble([g_df, g_res])

    added_edges: List[Edge] = []
    col_map_df = {c.value: c for c in g_df.columns}  # column name -> input column node
    col_map_res = {c.value: c for c in g_res.columns}  # column name -> output column node

    # Group columns keep their column node.
    added_edges.extend(
        Edge(col_map_df[name], col_map_res[name], ELabel.EQUAL) for name in group_cols)

    # Columns whose cells feed the aggregate of a group.
    agg_cols = group_cols if agg == "count" else [agg_col]

    for out_row, key_values in zip(result.index, result.loc[:, group_cols].values):
        # Single-column groups are keyed by the scalar, multi-column ones by a tuple.
        group_key = key_values[0] if len(key_values) == 1 else tuple(key_values)
        member_rows = result_groupby.groups[group_key]  # rows of df belonging to this group

        out_cell = g_res.loc[out_row, new_col]
        via = graph.create_intermediate_node(out_cell.value)
        added_edges.append(Edge(via, out_cell, ELabel.EQUAL))

        # Aggregation edges
        for name in agg_cols:
            added_edges.extend(
                Edge(cell, via, getattr(ELabel, agg.upper()))
                for cell in g_df.loc[member_rows, name])

        # Every member's group cell equals the single group cell of the output row.
        for name in group_cols:
            out_group_cell = g_res.loc[out_row, name]
            added_edges.extend(
                Edge(cell, out_group_cell, ELabel.EQUAL)
                for cell in g_df.loc[member_rows, name])

    # Input deletion node maps onto the output deletion node.
    added_edges.append(Edge(g_df.deletion_node, g_res.deletion_node, ELabel.EQUAL))

    # Non-group columns disappear. Their cells are deleted as well, unless they were consumed
    # by the aggregation.
    for name in df.columns:
        if name in group_cols:
            continue
        added_edges.append(Edge(col_map_df[name], g_res.deletion_node, ELabel.DELETE))
        if name == agg_col:
            continue
        added_edges.extend(
            Edge(cell, g_res.deletion_node, ELabel.DELETE) for cell in g_df.loc[:, name])

    # Add all the edges to the graph in one go.
    graph.add_nodes_and_edges(edges=added_edges)

    # ------------------------------------------------------------------------------------------ #
    #  Add information about arguments
    # ------------------------------------------------------------------------------------------ #
    tagged_edges: List[TaggedEdge] = [
        TaggedEdge(
            src=c_node,
            dst=c_node,
            tag="SELECTED@group_by_group_cols" if c_node.value in group_cols
            else "NOT_SELECTED@group_by_group_cols")
        for c_node in cands_group_cols
    ]

    if cands_agg_col is not None:
        tagged_edges.extend(
            TaggedEdge(
                src=c_node,
                dst=c_node,
                tag="SELECTED@summarise_col" if c_node.value == agg_col
                else "NOT_SELECTED@summarise_col")
            for c_node in cands_agg_col)

    agg_tags = []
    for cand in agg_cands:
        prefix = "SELECTED" if cand == agg else "NOT_SELECTED"
        agg_tags.append(f"{prefix}@{cand}@summarise_agg")
    graph.add_tags(agg_tags)

    graph.add_tagged_edges(tagged_edges)

    return result, call_str, graph, g_res
def test_greatest_common_universal_subgraph_1(self):
    """Exercise ``get_greatest_common_universal_supergraph`` on a tiny query.

    Fixtures built here:

    * ``chain``  - linear graph 0 -> 1 -> 2 -> 3 (edge labels 10, 11, 12),
      tagged with TAG_1/TAG_2 and carrying self-loop tagged edges
      TAG_L1 (on the label-1 node) and TAG_L2 (on the label-2 node).
    * ``fan``    - 0 -> 1, then the label-1 node points at five label-2
      nodes; only the last of those continues to a label-3 node.  Tagged
      with TAG_2/TAG_3, with TAG_L1 on the label-1 node and TAG_L2 on the
      first label-2 node.
    * ``query``  - the single edge 0 -> 1 with label 10.

    The query is first extended against ``[chain]`` alone and then against
    ``[chain, fan]``; the expected sizes, labels and tags of each result
    are asserted below.
    """

    def fresh_node(label):
        # Every node in this test shares the default entity and a symbolic value.
        return Node(label=label, entity=DEFAULT_ENTITY, value=SYMBOLIC_VALUE)

    # ---- Target 1: linear chain 0 -> 1 -> 2 -> 3 ---------------------------
    chain = Graph()
    c0, c1, c2, c3 = [fresh_node(lbl) for lbl in (0, 1, 2, 3)]
    chain.add_nodes_and_edges(nodes=[c0, c1, c2, c3])
    chain.add_tags(["TAG_1", "TAG_2"])
    chain.add_tagged_edges(
        [TaggedEdge(c1, c1, "TAG_L1"),
         TaggedEdge(c2, c2, "TAG_L2")])
    for tail, head, edge_label in ((c0, c1, 10), (c1, c2, 11), (c2, c3, 12)):
        chain.add_edge(Edge(src=tail, dst=head, label=edge_label))

    # ---- Target 2: fan-out from the label-1 node ---------------------------
    fan = Graph()
    root = fresh_node(0)
    hub = fresh_node(1)
    leaves = [fresh_node(2) for _ in range(5)]
    sink = fresh_node(3)
    fan.add_nodes_and_edges(nodes=[root, hub, *leaves, sink])
    fan.add_tags(["TAG_2", "TAG_3"])
    fan.add_tagged_edges(
        [TaggedEdge(hub, hub, "TAG_L1"),
         TaggedEdge(leaves[0], leaves[0], "TAG_L2")])
    fan.add_edge(Edge(src=root, dst=hub, label=10))
    for leaf in leaves:
        fan.add_edge(Edge(src=hub, dst=leaf, label=11))
    # Only one of the label=2 nodes (the last one) has an edge to label=3.
    fan.add_edge(Edge(src=leaves[-1], dst=sink, label=12))

    # ---- Query: single edge 0 -> 1 -----------------------------------------
    query = Graph()
    q_src = fresh_node(0)
    q_dst = fresh_node(1)
    query.add_nodes_and_edges(nodes=[q_src, q_dst])
    query.add_edge(Edge(q_src, q_dst, 10))

    def check(targets, n_edges, n_nodes, node_labels, edge_labels, tags,
              edge_tags):
        # Compute the supergraph for `targets` and compare it to the expectation.
        supergraph, mapping = query.get_greatest_common_universal_supergraph(
            targets)
        self.assertEqual(n_edges, supergraph.get_num_edges())
        self.assertEqual(n_nodes, supergraph.get_num_nodes())
        self.assertSetEqual(node_labels,
                            {n.label for n in supergraph.iter_nodes()})
        self.assertSetEqual(edge_labels,
                            {e.label for e in supergraph.iter_edges()})
        self.assertSetEqual(tags, set(supergraph.iter_tags()))
        self.assertEqual(edge_tags,
                         {e.tag for e in supergraph.iter_tagged_edges()})
        # Every key of the returned node mapping must be a node of the query.
        for node in mapping.m_node:
            self.assertIn(node, query.get_all_nodes())

    # With the chain alone, we expect the supergraph to be equivalent to it.
    check([chain],
          n_edges=3,
          n_nodes=4,
          node_labels={0, 1, 2, 3},
          edge_labels={10, 11, 12},
          tags={"TAG_1", "TAG_2"},
          edge_tags={"TAG_L1", "TAG_L2"})

    # With both targets, we expect just the linear chain 0 -> 1 -> 2.
    check([chain, fan],
          n_edges=2,
          n_nodes=3,
          node_labels={0, 1, 2},
          edge_labels={10, 11},
          tags={"TAG_2"},
          edge_tags={"TAG_L1"})