def test_index_name_nodes(self):
    """Check the index-name node counts for a frame and its pivot.

    The graph built from ``frame`` (input) and ``frame.pivot(...)`` (output)
    is expected to contain exactly one INDEX_NAME node and exactly one
    COL_INDEX_NAME node.
    """
    frame = pd.DataFrame({
        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
        'baz': [1, 2, 3, 4, 5, 6],
    })
    pivoted = frame.pivot(index='foo', columns='bar', values='baz')

    # Flags are applied in the listed order.
    options = GraphOptions()
    for flag, enabled in (
        ('COLUMN_NODES', True),
        ('INDEX_NODES', True),
        ('INDEX_NAME_NODES', True),
        ('ADJACENCY_EDGES', True),
        ('EQUALITY_EDGES', True),
        ('NODE_TYPES', True),
        ('INDEX_EDGES', False),
    ):
        setattr(options, flag, enabled)

    graph: RelationGraph = RelationGraph(options)
    graph.from_input_output([frame], pivoted)

    def count_nodes(kind):
        # Number of graph nodes whose type equals ``kind``.
        return len([n for n in graph.nodes if n.ntype == kind])

    index_name_count = count_nodes(GraphNodeType.INDEX_NAME)
    column_name_count = count_nodes(GraphNodeType.COL_INDEX_NAME)

    self.assertEqual(index_name_count, 1)
    self.assertEqual(column_name_count, 1)
def test_index_name_equality_edges(self):
    """Check that pivot source columns are linked to the output index names.

    After pivoting with ``index='foo'`` and ``columns='bar'``, the graph must
    contain an edge (in either direction) between the input column node
    ``[-1,0]`` and the output INDEX_NAME node ``[-1,-1]``, and between the
    input column node ``[-1,1]`` and the output COL_INDEX_NAME node
    ``[-1,-1]``.
    """
    frame = pd.DataFrame({
        'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
        'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
        'baz': [1, 2, 3, 4, 5, 6],
    })
    pivoted = frame.pivot(index='foo', columns='bar', values='baz')

    # Adjacency edges are switched off here; equality edges stay on.
    options = GraphOptions()
    for flag, enabled in (
        ('COLUMN_NODES', True),
        ('INDEX_NODES', True),
        ('INDEX_NAME_NODES', True),
        ('ADJACENCY_EDGES', False),
        ('EQUALITY_EDGES', True),
        ('NODE_TYPES', True),
        ('INDEX_EDGES', False),
    ):
        setattr(options, flag, enabled)

    rel_graph: RelationGraph = RelationGraph(options)
    rel_graph.from_input_output([frame], pivoted)

    def nodes_of(kind, source_prefix):
        # Graph nodes of the given type whose source starts with the prefix.
        return [
            n for n in rel_graph.nodes
            if n.ntype == kind and n.source.startswith(source_prefix)
        ]

    def pick(candidates, identifier):
        # First candidate with the identifier; IndexError when none matches.
        return [c for c in candidates if c.identifier == identifier][0]

    def linked(first: GraphNode, second: GraphNode) -> bool:
        # True when some edge joins the two nodes, regardless of direction.
        return any(
            (edge.node1 == first and edge.node2 == second)
            or (edge.node1 == second and edge.node2 == first)
            for edge in rel_graph.edges
        )

    input_columns = nodes_of(GraphNodeType.COLUMN, "I")
    output_index_names = nodes_of(GraphNodeType.INDEX_NAME, "O")
    output_column_index_names = nodes_of(GraphNodeType.COL_INDEX_NAME, "O")

    foo_in = pick(input_columns, '[-1,0]')
    bar_in = pick(input_columns, '[-1,1]')
    foo_out = pick(output_index_names, '[-1,-1]')
    bar_out = pick(output_column_index_names, '[-1,-1]')

    self.assertTrue(linked(foo_in, foo_out))
    self.assertTrue(linked(bar_in, bar_out))
def __init__(self, obj: pd.DataFrame, type_str: str, source: str,
             options: GraphOptions, **kwargs):
    """Initialise the parent with index/column-related options disabled.

    The flags are cleared on a shallow copy of ``options``; the parent
    constructor receives that copy rather than the caller's object.

    Args:
        obj: Data frame forwarded to the parent constructor.
        type_str: Forwarded unchanged to the parent constructor.
        source: Forwarded unchanged to the parent constructor.
        options: Graph options; copied before any flag is changed.
        **kwargs: Extra keyword arguments forwarded to the parent.
    """
    restricted = copy.copy(options)

    # Flags are cleared in the listed order.
    for flag in (
        'INDEX_NODES',
        'COLUMN_NODES',
        'INDEX_NAME_NODES',
        'INDEX_EDGES',
        'INDEX_NAME_EDGES',
    ):
        setattr(restricted, flag, False)

    super().__init__(obj, type_str, source, restricted, **kwargs)
def test_index_name_nodes_multiindex(self):
    """Check index-name node counts for a frame with two-level indexes.

    The frame has a two-level row index and a two-level column index, each
    with both levels named, and it is used as both the input and the output
    of the graph. The graph is expected to hold four INDEX_NAME nodes and
    four COL_INDEX_NAME nodes.
    """
    row_index = pd.MultiIndex.from_tuples(
        [
            ('bird', 'falcon'),
            ('bird', 'parrot'),
            ('mammal', 'lion'),
            ('mammal', 'monkey'),
        ],
        names=['class', 'name'],
    )
    column_index = pd.MultiIndex.from_tuples(
        [('speed', 'max'), ('species', 'type')]
    )
    rows = [(389.0, 'fly'), (24.0, 'fly'), (80.5, 'run'), (np.nan, 'jump')]

    frame = pd.DataFrame(rows, index=row_index, columns=column_index)
    frame.columns.names = ['name1', 'name2']

    # Flags are applied in the listed order.
    options = GraphOptions()
    for flag, enabled in (
        ('COLUMN_NODES', True),
        ('INDEX_NODES', True),
        ('INDEX_NAME_NODES', True),
        ('ADJACENCY_EDGES', True),
        ('EQUALITY_EDGES', True),
        ('NODE_TYPES', True),
        ('INDEX_EDGES', False),
    ):
        setattr(options, flag, enabled)

    graph: RelationGraph = RelationGraph(options)
    graph.from_input_output([frame], frame)

    def count_nodes(kind):
        # Number of graph nodes whose type equals ``kind``.
        return len([n for n in graph.nodes if n.ntype == kind])

    index_name_count = count_nodes(GraphNodeType.INDEX_NAME)
    column_name_count = count_nodes(GraphNodeType.COL_INDEX_NAME)

    # Two names per axis, present in both the input and the output: 2 x 2.
    self.assertEqual(index_name_count, 4)
    self.assertEqual(column_name_count, 4)