コード例 #1
0
    def test_index_name_nodes(self):
        df = pd.DataFrame({
            'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
            'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
            'baz': [1, 2, 3, 4, 5, 6]
        })
        output = df.pivot(index='foo', columns='bar', values='baz')

        options = RelationGraphOptions()
        options.COLUMN_NODES = True
        options.INDEX_NODES = True
        options.INDEX_NAME_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = False

        rel_graph: RelationGraph = RelationGraph.build_relation_graph([df],
                                                                      output,
                                                                      options)
        index_name_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.INDEX_NAME
        ]
        column_name_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.COL_INDEX_NAME
        ]

        self.assertEqual(len(index_name_nodes), 1)
        self.assertEqual(len(column_name_nodes), 1)
コード例 #2
0
    def test_groupby_has_artifacts(self):
        df = pd.DataFrame([[5, 2], [2, 3], [2, 0]], columns=["A", "B"])
        output = df.groupby(by="A")

        options = RelationGraphOptions()
        options.COLUMN_NODES = True
        options.INDEX_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = True

        rel_graph: RelationGraph = RelationGraph.build_relation_graph([df],
                                                                      output,
                                                                      options)

        index_type_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.INDEX
        ]
        column_type_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.COLUMN
        ]

        self.assertEqual(len(index_type_nodes), 6)
        self.assertEqual(len(column_type_nodes), 6)
コード例 #3
0
    def test_series_has_idx_and_cols(self):
        df = pd.DataFrame([[5, 2], [2, 3], [2, 0]], columns=["A", "B"])

        options = RelationGraphOptions()
        options.COLUMN_NODES = True
        options.INDEX_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = True

        rel_graph: RelationGraph = RelationGraph.build_relation_graph([df],
                                                                      df["A"],
                                                                      options)

        index_type_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.INDEX
        ]
        column_type_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.COLUMN
        ]

        self.assertEqual(len(index_type_nodes), 6)
        self.assertEqual(len(column_type_nodes), 3)
コード例 #4
0
    def test_no_spurious_for_list_arg(self):
        df = pd.DataFrame([[5, 2], [2, 3], [2, 0]], columns=["A", "B"])

        options = RelationGraphOptions()
        options.COLUMN_NODES = True
        options.INDEX_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = True

        rel_graph: RelationGraph = RelationGraph.build_relation_graph(
            [df, [1, 3, 4]], df, options)

        index_type_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.INDEX
        ]
        column_type_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.COLUMN
        ]

        self.assertEqual(len(index_type_nodes), 6)
        self.assertEqual(len(column_type_nodes), 4)
コード例 #5
0
    def test_index_name_equality_edges(self):
        df = pd.DataFrame({
            'foo': ['one', 'one', 'one', 'two', 'two', 'two'],
            'bar': ['A', 'B', 'C', 'A', 'B', 'C'],
            'baz': [1, 2, 3, 4, 5, 6]
        })
        output = df.pivot(index='foo', columns='bar', values='baz')

        options = RelationGraphOptions()
        options.COLUMN_NODES = True
        options.INDEX_NODES = True
        options.INDEX_NAME_NODES = True
        options.ADJACENCY_EDGES = False
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = False
        rel_graph: RelationGraph = RelationGraph.build_relation_graph([df],
                                                                      output,
                                                                      options)
        inp_col_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.COLUMN
            and node.dfindex.startswith("I")
        ]
        out_idx_name_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.INDEX_NAME
            and node.dfindex.startswith("O")
        ]
        out_col_idx_name_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.COL_INDEX_NAME
            and node.dfindex.startswith("O")
        ]

        def check_edge_exists(in_node: RelationGraphNode,
                              out_node: RelationGraphNode,
                              graph: RelationGraph):
            for e in graph.edges:
                if (e.node1 == in_node
                        and e.node2 == out_node) or (e.node1 == out_node
                                                     and e.node2 == in_node):
                    return True

            return False

        inp_foo_node = [i for i in inp_col_nodes if i.pos == (-1, 0)][0]
        inp_bar_node = [i for i in inp_col_nodes if i.pos == (-1, 1)][0]
        out_foo_node = [i for i in out_idx_name_nodes if i.pos == (-1, -1)][0]
        out_bar_node = [
            i for i in out_col_idx_name_nodes if i.pos == (-1, -1)
        ][0]

        self.assertTrue(
            check_edge_exists(inp_foo_node, out_foo_node, rel_graph))
        self.assertTrue(
            check_edge_exists(inp_bar_node, out_bar_node, rel_graph))
コード例 #6
0
    def test_index_name_nodes_multiindex(self):
        df = pd.DataFrame(
            [(389.0, 'fly'), (24.0, 'fly'), (80.5, 'run'), (np.nan, 'jump')],
            index=pd.MultiIndex.from_tuples([('bird', 'falcon'),
                                             ('bird', 'parrot'),
                                             ('mammal', 'lion'),
                                             ('mammal', 'monkey')],
                                            names=['class', 'name']),
            columns=pd.MultiIndex.from_tuples([('speed', 'max'),
                                               ('species', 'type')]))
        df.columns.names = ['name1', 'name2']

        options = RelationGraphOptions()
        options.COLUMN_NODES = True
        options.INDEX_NODES = True
        options.INDEX_NAME_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = False

        rel_graph: RelationGraph = RelationGraph.build_relation_graph([df], df,
                                                                      options)
        index_name_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.INDEX_NAME
        ]
        column_name_nodes = [
            node for node in rel_graph.nodes
            if node._type == RelationGraphNodeType.COL_INDEX_NAME
        ]

        self.assertEqual(len(index_name_nodes),
                         4)  # Both in the input and output, so x2
        self.assertEqual(len(column_name_nodes),
                         4)  # Both in the input and output, so x2
コード例 #7
0
    def test_column_multi(self):
        column_labels = [['bar', 'bar', 'baz', 'baz'],
                         ['one', 'two', 'one', 'two']]
        tuples = list(zip(*column_labels))
        col_index = pd.MultiIndex.from_tuples(tuples)
        data = [[0, 1, 2, 3], [4, 5, 6, 7]]
        input_df = pd.DataFrame(data, columns=col_index)
        #   bar     baz
        #   one two one two
        # 0   0   1   2   3
        # 1   4   5   6   7
        output_df = input_df.stack().reset_index()
        #    level_0 level_1  bar  baz
        # 0        0     one    0    2
        # 1        0     two    1    3
        # 2        1     one    4    6
        # 3        1     two    5    7

        options = RelationGraphOptions()
        options.COLUMN_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = True
        rel_graph: RelationGraph = RelationGraph.build_relation_graph(
            [input_df], output_df, options)
        rel_graph_edges = rel_graph.edges

        col_nodes = [
            [
                RelationGraphNode("I0", (-2, 0), RelationGraphNodeType.COLUMN),
                RelationGraphNode("I0", (-2, 1), RelationGraphNodeType.COLUMN),
                RelationGraphNode("I0", (-2, 2), RelationGraphNodeType.COLUMN),
                RelationGraphNode("I0", (-2, 3), RelationGraphNodeType.COLUMN)
            ],
            [
                RelationGraphNode("I0", (-1, 0), RelationGraphNodeType.COLUMN),
                RelationGraphNode("I0", (-1, 1), RelationGraphNodeType.COLUMN),
                RelationGraphNode("I0", (-1, 2), RelationGraphNodeType.COLUMN),
                RelationGraphNode("I0", (-1, 3), RelationGraphNodeType.COLUMN)
            ],
        ]

        adjacency_edges = [
            RelationGraphEdge(col_nodes[0][0], col_nodes[1][0],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[0][0], col_nodes[0][1],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[1][0], col_nodes[1][1],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[1][1], col_nodes[1][2],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[0][1], col_nodes[1][1],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[0][1], col_nodes[0][2],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[0][2], col_nodes[1][2],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[0][2], col_nodes[0][3],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[1][2], col_nodes[1][3],
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(col_nodes[0][3], col_nodes[1][3],
                              RelationGraphEdgeType.ADJACENCY)
        ]

        for edge in adjacency_edges:
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))

        # indexing edges
        input_coli_elems = [[
            RelationGraphNode("I0", (0, 0), RelationGraphNodeType.INT),
            RelationGraphNode("I0", (1, 0), RelationGraphNodeType.INT)
        ],
                            [
                                RelationGraphNode("I0", (0, 1),
                                                  RelationGraphNodeType.INT),
                                RelationGraphNode("I0", (1, 1),
                                                  RelationGraphNodeType.INT)
                            ],
                            [
                                RelationGraphNode("I0", (0, 2),
                                                  RelationGraphNodeType.INT),
                                RelationGraphNode("I0", (1, 2),
                                                  RelationGraphNodeType.INT)
                            ],
                            [
                                RelationGraphNode("I0", (0, 3),
                                                  RelationGraphNodeType.INT),
                                RelationGraphNode("I0", (1, 3),
                                                  RelationGraphNodeType.INT)
                            ]]

        def check_edges(in_nodes, out_nodes, edge_type):
            for in_node in in_nodes:
                for out_node in out_nodes:
                    edge = RelationGraphEdge(in_node, out_node, edge_type)
                    self.assertTrue(
                        edge in rel_graph_edges,
                        "Could not find edge %s in set of edges:\n%s" %
                        (edge, rel_graph_edges))

        for i in range(4):
            in_nodes = [col_nodes[0][i], col_nodes[1][i]]
            out_nodes = input_coli_elems[i]
            check_edges(in_nodes, out_nodes, RelationGraphEdgeType.INDEX)

        # equality_edges
        bars = [col_nodes[0][0], col_nodes[0][1]]
        bazs = [col_nodes[0][2], col_nodes[0][3]]
        ones = [col_nodes[1][0], col_nodes[1][2]]
        twos = [col_nodes[1][1], col_nodes[1][3]]

        out_01 = RelationGraphNode("O0", (0, 1), RelationGraphNodeType.STR)
        out_11 = RelationGraphNode("O0", (1, 1), RelationGraphNodeType.STR)
        out_21 = RelationGraphNode("O0", (2, 1), RelationGraphNodeType.STR)
        out_31 = RelationGraphNode("O0", (3, 1), RelationGraphNodeType.STR)

        out_col_2 = RelationGraphNode("O0", (-1, 2),
                                      RelationGraphNodeType.COLUMN)
        out_col_3 = RelationGraphNode("O0", (-1, 3),
                                      RelationGraphNodeType.COLUMN)

        check_edges(bars, [out_col_2], RelationGraphEdgeType.EQUALITY)
        check_edges(bazs, [out_col_3], RelationGraphEdgeType.EQUALITY)

        check_edges(ones, [out_01, out_21], RelationGraphEdgeType.EQUALITY)
        check_edges(twos, [out_11, out_31], RelationGraphEdgeType.EQUALITY)
コード例 #8
0
    def test_idx_multi(self):
        tuples = [("bar", "one"), ("bar", "two")]
        index = pd.MultiIndex.from_tuples(tuples)
        data = [[0], [1]]
        input_df = pd.DataFrame(data, index=index)
        #          0
        # bar one  0
        #     two  1
        output_df = input_df.unstack()
        #       0
        #     one two
        # bar   0   1
        options = RelationGraphOptions()
        options.COLUMN_NODES = True
        options.INDEX_NODES = True
        options.ADJACENCY_EDGES = True
        options.EQUALITY_EDGES = True
        options.NODE_TYPES = True
        options.INDEX_EDGES = True
        rel_graph: RelationGraph = RelationGraph.build_relation_graph(
            [input_df], output_df, options)
        rel_graph_edges = rel_graph.edges

        bar_in_0 = RelationGraphNode("I0", (0, -2),
                                     RelationGraphNodeType.INDEX)
        bar_in_1 = RelationGraphNode("I0", (1, -2),
                                     RelationGraphNodeType.INDEX)
        bar_out = RelationGraphNode("O0", (0, -1), RelationGraphNodeType.INDEX)

        one_in = RelationGraphNode("I0", (0, -1), RelationGraphNodeType.INDEX)
        two_in = RelationGraphNode("I0", (1, -1), RelationGraphNodeType.INDEX)

        one_out = RelationGraphNode("O0", (-1, 0),
                                    RelationGraphNodeType.COLUMN)
        two_out = RelationGraphNode("O0", (-1, 1),
                                    RelationGraphNodeType.COLUMN)

        in_0 = RelationGraphNode("I0", (0, 0), RelationGraphNodeType.INT)
        in_1 = RelationGraphNode("I0", (1, 0), RelationGraphNodeType.INT)

        out_0 = RelationGraphNode("O0", (0, 0), RelationGraphNodeType.INT)
        out_1 = RelationGraphNode("O0", (0, 1), RelationGraphNodeType.INT)

        adjacency_edges = [
            RelationGraphEdge(bar_in_0, bar_in_1,
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(bar_in_0, one_in,
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(bar_in_1, two_in,
                              RelationGraphEdgeType.ADJACENCY),
            RelationGraphEdge(one_in, two_in, RelationGraphEdgeType.ADJACENCY)
        ]

        for edge in adjacency_edges:
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))
        indexing_edges = [
            RelationGraphEdge(bar_in_0, in_0, RelationGraphEdgeType.INDEX),
            RelationGraphEdge(one_in, in_0, RelationGraphEdgeType.INDEX),
            RelationGraphEdge(bar_in_1, in_1, RelationGraphEdgeType.INDEX),
            RelationGraphEdge(two_in, in_1, RelationGraphEdgeType.INDEX),
            RelationGraphEdge(bar_out, out_0, RelationGraphEdgeType.INDEX),
            RelationGraphEdge(bar_out, out_1, RelationGraphEdgeType.INDEX)
        ]

        for edge in indexing_edges:
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))

        equality_edges = [
            RelationGraphEdge(bar_in_0, bar_out,
                              RelationGraphEdgeType.EQUALITY),
            RelationGraphEdge(bar_in_1, bar_out,
                              RelationGraphEdgeType.EQUALITY),
            RelationGraphEdge(one_in, one_out, RelationGraphEdgeType.EQUALITY),
            RelationGraphEdge(two_in, two_out, RelationGraphEdgeType.EQUALITY)
        ]

        for edge in equality_edges:
            self.assertTrue(
                edge in rel_graph_edges,
                "Could not find edge %s in set of edges:\n%s" %
                (edge, rel_graph_edges))