def test_serialization_fit_model(self):
        """Round-trip a fitted regular tree through ``to_dict``/``from_dict``.

        The tree rebuilt from the dictionary must serialize to the same
        dictionary as the tree it was created from.
        """
        # Setup
        data = pd.DataFrame(data=[
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1]
        ])
        tau_matrix = data.corr(method='kendall').values

        # One column per input column, holding the values returned by the
        # fitted KDE's ``cumulative_distribution`` for that column.
        univariates_matrix = np.empty(data.shape)
        for position, name in enumerate(data):
            kde = GaussianKDE()
            kde.fit(data[name])
            univariates_matrix[:, position] = kde.cumulative_distribution(data[name])

        tree = Tree(TreeTypes.REGULAR)
        tree.fit(0, data.shape[1], tau_matrix, univariates_matrix)

        # Run
        restored = Tree.from_dict(tree.to_dict())

        # Check
        assert restored.to_dict() == tree.to_dict()
    def test_second_tree_likelihood(self):
        """Check the likelihood reported by a second-level center tree.

        A first center tree is fitted on 'data/iris.data.csv'; the second
        output of its ``get_likelihood`` call is then fed to the second tree.
        """
        # Setup
        # First tree: fitted on the Kendall matrix and per-column KDE values.
        iris = pd.read_csv('data/iris.data.csv')
        kendall = iris.corr(method='kendall').values
        univariates = np.empty(iris.shape)

        for position, name in enumerate(iris):
            kde = GaussianKDE()
            kde.fit(iris[name])
            univariates[:, position] = kde.cumulative_distribution(iris[name])

        first_tree = Tree(TreeTypes.CENTER)
        first_tree.fit(0, 4, kendall, univariates)
        observation = np.array([[0.1, 0.2, 0.3, 0.4]])
        _, conditional_uni_first = first_tree.get_likelihood(observation)
        tau = first_tree.get_tau_matrix()

        # Second tree: fitted from the first tree and its tau matrix.
        second_tree = Tree(TreeTypes.CENTER)
        second_tree.fit(1, 3, tau, first_tree)
        expected = 0.4888802429313932

        # Run
        actual, _ = second_tree.get_likelihood(conditional_uni_first)

        # Check
        assert compare_values_epsilon(actual, expected)
    def test_second_tree_likelihood(self):
        """Fit a second regular tree on ``self.tree`` and evaluate its likelihood.

        NOTE(review): nothing is asserted after the likelihood calls, so this
        only verifies that fitting and evaluation run without raising.
        """
        first_tree = self.tree
        tau_matrix = first_tree.get_tau_matrix()

        second_tree = Tree(TreeTypes.REGULAR)
        second_tree.fit(1, 3, tau_matrix, first_tree)

        observation = np.array([[0.1, 0.2, 0.3, 0.4]])
        _, conditional = first_tree.get_likelihood(observation)
        _, _ = second_tree.get_likelihood(conditional)
Example #4
0
class TestDirectTree(TestCase):
    """Tests for a ``TreeTypes.DIRECT`` tree fitted on 'data/iris.data.csv'."""

    def setUp(self):
        """Load the CSV, build the tau and univariate matrices, fit the tree."""
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)
        for position, name in enumerate(self.data):
            kde = KDEUnivariate()
            kde.fit(self.data[name])
            # ``cumulative_distribution`` is called once per value here.
            self.u_matrix[:, position] = [
                kde.cumulative_distribution(x) for x in self.data[name]
            ]
        self.tree = Tree(TreeTypes.DIRECT)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)

    def test_first_tree(self):
        """The first edge of the fitted tree has node 0 as its ``L`` end."""
        first_edge = self.tree.edges[0]
        assert first_edge.L == 0

    def test_first_tree_likelihood(self):
        """The first-tree likelihood is within 0.01 of the reference value."""
        observation = np.array([[0.1, 0.2, 0.3, 0.4]])
        reference = -0.1207611551427385

        value, _ = self.tree.get_likelihood(observation)

        # 10E-3 evaluates to 0.01.
        assert abs(value - reference) < 10E-3

    def test_get_constraints(self):
        """``_get_constraints`` sets the expected neighbors on edges 0 and 1."""
        self.tree._get_constraints()

        edges = self.tree.edges
        assert edges[0].neighbors == [1]
        assert edges[1].neighbors == [0, 2]

    def test_get_tau_matrix(self):
        """The tau matrix is not made up entirely of NaN values."""
        self.tau = self.tree.get_tau_matrix()

        all_nan = np.isnan(self.tau).all()

        self.assertFalse(all_nan)

    def test_second_tree_likelihood(self):
        """The second-tree likelihood is within 0.01 of the reference value."""
        first_tree = self.tree
        tau_matrix = first_tree.get_tau_matrix()

        second_tree = Tree(TreeTypes.DIRECT)
        second_tree.fit(1, 3, tau_matrix, first_tree)

        observation = np.array([[0.1, 0.2, 0.3, 0.4]])

        _, conditional = first_tree.get_likelihood(observation)
        second_value, _ = second_tree.get_likelihood(conditional)

        # 10E-3 evaluates to 0.01.
        reference = 0.7184205492690413
        assert abs(second_value - reference) < 10E-3
Example #5
0
    def train_vine(self, tree_type):
        """Build the vine.

        1. For the construction of the first tree :math:`T_1`, assign one node to each
           variable and then couple them by maximizing the measure of association
           considered. Different vines impose different constraints on this construction.
           When those are applied, different trees are obtained at this level.

        2. Select the copula that best fits the pair of variables coupled by each edge in
           :math:`T_1`.

        3. Let :math:`C_{ij}(u_i, u_j)` be the copula for a given edge :math:`(u_i, u_j)`
           in :math:`T_1`. Then for every edge in :math:`T_1`, compute either

           .. math:: {v^1}_{j|i} = \\frac{\\partial C_{ij}(u_i, u_j)}{\\partial u_j}

           or similarly :math:`{v^1}_{i|j}`, which are conditional cdfs. When finished with
           all the edges, construct the new matrix :math:`v^1`, which has one column less
           than u.

        4. Set k = 2.

        5. Assign one node of :math:`T_k` to each edge of :math:`T_{k-1}`. The structure of
           :math:`T_{k-1}` imposes a set of constraints on which edges of :math:`T_k` are
           realizable. Hence the next step is to get a linked list of the accessible nodes
           for every node in :math:`T_k`.

        6. As in step 1, nodes of :math:`T_k` are coupled maximizing the measure of
           association considered and satisfying the constraints imposed by the kind of vine
           employed plus the set of constraints imposed by tree :math:`T_{k-1}`.

        7. Select the copula that best fits each edge created in :math:`T_k`.

        8. Recompute matrix :math:`v_k` as in step 3, but taking :math:`T_k` and
           :math:`v_{k-1}` instead of :math:`T_1` and u.

        9. Set :math:`k = k + 1` and repeat from (5) until all the trees are constructed.

        Every tree built is appended to ``self.trees``. Trees after the first are built
        for ``k`` in ``range(1, min(self.n_var - 1, self.truncated))``.

        Args:
            tree_type: value passed to ``Tree`` for every tree that is created.
        """
        LOGGER.debug('start building tree : 0')
        # 1
        first_tree = Tree(tree_type)
        first_tree.fit(0, self.n_var, self.tau_mat, self.u_matrix)
        self.trees.append(first_tree)
        LOGGER.debug('finish building tree : 0')

        upper_bound = min(self.n_var - 1, self.truncated)
        for k in range(1, upper_bound):
            previous_tree = self.trees[k - 1]
            # get constraints from the previous tree, then read its tau matrix
            previous_tree._get_constraints()
            tau = previous_tree.get_tau_matrix()
            LOGGER.debug('start building tree: {0}'.format(k))
            tree_k = Tree(tree_type)
            tree_k.fit(k, self.n_var - k, tau, previous_tree)
            self.trees.append(tree_k)
            LOGGER.debug('finish building tree: {0}'.format(k))
Example #6
0
    def test_second_tree_likelihood(self):
        """The second center tree's likelihood is within 0.01 of the reference."""
        first_tree = self.tree
        tau_matrix = first_tree.get_tau_matrix()
        second_tree = Tree(TreeTypes.CENTER)
        second_tree.fit(1, 3, tau_matrix, first_tree)
        observation = np.array([[0.1, 0.2, 0.3, 0.4]])

        _, conditional = first_tree.get_likelihood(observation)
        second_value, _ = second_tree.get_likelihood(conditional)

        # 10E-3 evaluates to 0.01.
        reference = 0.540089320412914
        assert abs(second_value - reference) < 10E-3
    def test_second_tree_likelihood(self):
        """The second direct tree's likelihood is within 0.01 of the reference."""
        first_tree = self.tree
        tau_matrix = first_tree.get_tau_matrix()

        second_tree = Tree(TreeTypes.DIRECT)
        second_tree.fit(1, 3, tau_matrix, first_tree)

        observation = np.array([[0.1, 0.2, 0.3, 0.4]])

        _, conditional = first_tree.get_likelihood(observation)
        second_value, _ = second_tree.get_likelihood(conditional)

        # 10E-3 evaluates to 0.01.
        reference = 0.24428294700258632
        assert abs(second_value - reference) < 10E-3
Example #8
0
    def train_vine(self, tree_type):
        """Fit the vine's trees one level at a time and append them to ``self.trees``.

        The first tree is fitted on ``self.tau_mat`` and ``self.u_matrix``. Each
        later tree is fitted on the tau matrix of the tree stored at index
        ``k - 1`` and on that tree itself, for ``k`` in
        ``range(1, min(self.n_var - 1, self.truncated))``.

        Args:
            tree_type: value passed to ``Tree`` for every tree that is created.
        """
        LOGGER.debug('start building tree : 0')
        first_tree = Tree(tree_type)
        first_tree.fit(0, self.n_var, self.tau_mat, self.u_matrix)
        self.trees.append(first_tree)
        LOGGER.debug('finish building tree : 0')

        upper_bound = min(self.n_var - 1, self.truncated)
        for k in range(1, upper_bound):
            previous_tree = self.trees[k - 1]
            # get constraints from the previous tree, then read its tau matrix
            previous_tree._get_constraints()
            tau = previous_tree.get_tau_matrix()
            LOGGER.debug('start building tree: {0}'.format(k))
            tree_k = Tree(tree_type)
            tree_k.fit(k, self.n_var - k, tau, previous_tree)
            self.trees.append(tree_k)
            LOGGER.debug('finish building tree: {0}'.format(k))
class TestDirectTree(TestCase):
    """Tests for a ``TreeTypes.DIRECT`` tree fitted on 'data/iris.data.csv'."""

    def setUp(self):
        """Load the CSV, build the tau and univariate matrices, fit the tree."""
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)
        for position, name in enumerate(self.data):
            kde = GaussianKDE()
            kde.fit(self.data[name])
            self.u_matrix[:, position] = kde.cumulative_distribution(self.data[name])
        self.tree = Tree(TreeTypes.DIRECT)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)

    def test_first_tree(self):
        """The first edge of the fitted tree has node 0 as its ``L`` end."""
        first_edge = self.tree.edges[0]
        assert first_edge.L == 0

    @pytest.mark.xfail
    def test_first_tree_likelihood(self):
        """The first-tree likelihood is within 0.01 of the reference (marked xfail)."""
        observation = np.array([[0.1, 0.2, 0.3, 0.4]])
        reference = -0.1207611551427385

        value, _ = self.tree.get_likelihood(observation)

        # 10E-3 evaluates to 0.01.
        assert abs(value - reference) < 10E-3

    def test_get_constraints(self):
        """``_get_constraints`` sets the expected neighbors on edges 0 and 1."""
        self.tree._get_constraints()

        edges = self.tree.edges
        assert edges[0].neighbors == [1]
        assert edges[1].neighbors == [0, 2]

    def test_get_tau_matrix_no_edges_empty(self):
        """``get_tau_matrix`` returns a (0, 0)-shaped result when there are no edges."""
        # Setup
        empty_tree = Tree(TreeTypes.DIRECT)
        empty_tree.edges = []

        # Run
        tau = empty_tree.get_tau_matrix()

        # Check
        assert tau.shape == (0, 0)

    def test_get_tau_matrix(self):
        """The tau matrix is not made up entirely of NaN values."""
        self.tau = self.tree.get_tau_matrix()

        all_nan = np.isnan(self.tau).all()

        self.assertFalse(all_nan)

    @pytest.mark.xfail
    def test_second_tree_likelihood(self):
        """The second-tree likelihood is within 0.01 of the reference (marked xfail)."""
        first_tree = self.tree
        tau_matrix = first_tree.get_tau_matrix()

        second_tree = Tree(TreeTypes.DIRECT)
        second_tree.fit(1, 3, tau_matrix, first_tree)

        observation = np.array([[0.1, 0.2, 0.3, 0.4]])

        _, conditional = first_tree.get_likelihood(observation)
        second_value, _ = second_tree.get_likelihood(conditional)

        # 10E-3 evaluates to 0.01.
        reference = 0.24428294700258632
        assert abs(second_value - reference) < 10E-3
Example #10
0
class TestRegularTree(TestCase):
    """Tests for a ``TreeTypes.REGULAR`` tree fitted on 'data/iris.data.csv'."""

    def setUp(self):
        """Load the CSV, build the tau and univariate matrices, fit the tree."""
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)
        for position, name in enumerate(self.data):
            kde = GaussianKDE()
            kde.fit(self.data[name])
            self.u_matrix[:, position] = kde.cumulative_distribution(self.data[name])
        self.tree = Tree(TreeTypes.REGULAR)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)

    def test_first_tree(self):
        """The first three sorted edges are (0, 2), (1, 2) and (2, 3).

        The first tree should be:
                   1
                   |
                0--2--3
        """
        expected_ends = [(0, 2), (1, 2), (2, 3)]

        ordered = Edge.sort_edge(self.tree.edges)

        for position, (left, right) in enumerate(expected_ends):
            assert ordered[position].L == left
            assert ordered[position].R == right

    @pytest.mark.xfail
    def test_first_tree_likelihood(self):
        """The first-tree likelihood is within 0.01 of the reference (marked xfail)."""
        observation = np.array([[0.1, 0.2, 0.3, 0.4]])
        reference = 0.9545348664739628

        value, _ = self.tree.get_likelihood(observation)

        # 10E-3 evaluates to 0.01.
        assert abs(value - reference) < 10E-3

    def test_get_constraints(self):
        """``_get_constraints`` sets the expected neighbors on edges 0 and 1."""
        self.tree._get_constraints()

        edges = self.tree.edges
        assert edges[0].neighbors == [1, 2]
        assert edges[1].neighbors == [0, 2]

    def test_get_tau_matrix(self):
        """The tau matrix is not made up entirely of NaN values."""
        self.tau = self.tree.get_tau_matrix()

        all_nan = np.isnan(self.tau).all()

        self.assertFalse(all_nan)

    def test_second_tree_likelihood(self):
        """Fit a second regular tree on ``self.tree`` and evaluate its likelihood.

        NOTE(review): nothing is asserted after the likelihood calls, so this
        only verifies that fitting and evaluation run without raising.
        """
        first_tree = self.tree
        tau_matrix = first_tree.get_tau_matrix()

        second_tree = Tree(TreeTypes.REGULAR)
        second_tree.fit(1, 3, tau_matrix, first_tree)

        observation = np.array([[0.1, 0.2, 0.3, 0.4]])
        _, conditional = first_tree.get_likelihood(observation)
        _, _ = second_tree.get_likelihood(conditional)
Example #11
0
    def test_to_dict_fit_model(self):
        """``to_dict`` on a fitted regular tree matches the expected nested dict."""
        # Setup
        data = pd.DataFrame(data=[
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1]
        ])
        tau_matrix = data.corr(method='kendall').values

        univariates_matrix = np.empty(data.shape)
        for position, name in enumerate(data):
            kde = GaussianKDE()
            kde.fit(data[name])
            univariates_matrix[:, position] = kde.cumulative_distribution(data[name])

        tree = Tree(TreeTypes.REGULAR)
        tree.fit(0, data.shape[1], tau_matrix, univariates_matrix)

        # Literals that repeat throughout the expected structure.
        tau = -0.49999999999999994
        theta = -5.736282443655552
        diagonal = 0.8230112726144534
        off_diagonal = 0.3384880496294825
        high = 0.7969535322648066
        middle = 0.6887525261721343
        low = 0.12077958383821545

        def expected_edge(index, left, right, conditionals):
            """Build the expected dictionary of one edge."""
            return {
                'index': index,
                'D': set(),
                'L': left,
                'R': right,
                'U': conditionals,
                'likelihood': None,
                'name': CopulaTypes.FRANK,
                'neighbors': [],
                'parents': None,
                'tau': tau,
                'theta': theta
            }

        expected_result = {
            'type': 'copulas.multivariate.tree.RegularTree',
            'fitted': True,
            'level': 1,
            'n_nodes': 3,
            'previous_tree': [
                [diagonal, off_diagonal, off_diagonal],
                [off_diagonal, diagonal, off_diagonal],
                [off_diagonal, off_diagonal, diagonal]
            ],
            'tau_matrix': [
                [1.0, tau, tau],
                [tau, 1.0, tau],
                [tau, tau, 1.0]
            ],
            'tree_type': TreeTypes.REGULAR,
            'edges': [
                expected_edge(0, 0, 1, [
                    [high, middle, low],
                    [middle, high, low]
                ]),
                expected_edge(1, 1, 2, [
                    [low, high, middle],
                    [low, middle, high]
                ])
            ],
        }

        # Run
        result = tree.to_dict()

        # Check
        compare_nested_dicts(result, expected_result)
Example #12
0
class TestCenterTree(TestCase):
    """Tests for a ``TreeTypes.CENTER`` tree fitted on 'data/iris.data.csv'."""

    def setUp(self):
        """Load the CSV, build the tau and univariate matrices, fit the tree."""
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)
        for position, name in enumerate(self.data):
            kde = GaussianKDE()
            kde.fit(self.data[name])
            self.u_matrix[:, position] = kde.cumulative_distribution(self.data[name])
        self.tree = Tree(TreeTypes.CENTER)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)

    def test_first_tree(self):
        """The first edge of the fitted tree has node 0 as its ``L`` end."""
        first_edge = self.tree.edges[0]
        assert first_edge.L == 0

    @pytest.mark.xfail
    def test_first_tree_likelihood(self):
        """The first-tree likelihood is within 0.01 of the reference (marked xfail)."""
        observation = np.array([[0.1, 0.2, 0.3, 0.4]])
        reference = -0.19988720707143634

        value, _ = self.tree.get_likelihood(observation)

        # 10E-3 evaluates to 0.01.
        assert abs(value - reference) < 10E-3

    def test_get_constraints(self):
        """``_get_constraints`` sets the expected neighbors on edges 0 and 1."""
        self.tree._get_constraints()

        edges = self.tree.edges
        assert edges[0].neighbors == [1, 2]
        assert edges[1].neighbors == [0, 2]

    def test_get_tau_matrix(self):
        """The tau matrix is not made up entirely of NaN values."""
        self.tau = self.tree.get_tau_matrix()

        all_nan = np.isnan(self.tau).all()

        self.assertFalse(all_nan)

    @pytest.mark.xfail
    def test_second_tree_likelihood(self):
        """Check the likelihood reported by a second-level center tree (marked xfail).

        The first tree is rebuilt locally from the CSV rather than taken from
        ``setUp``; the second output of its ``get_likelihood`` call is then
        fed to the second tree.
        """
        # Setup
        # First tree: fitted on the Kendall matrix and per-column KDE values.
        iris = pd.read_csv('data/iris.data.csv')
        kendall = iris.corr(method='kendall').values
        univariates = np.empty(iris.shape)

        for position, name in enumerate(iris):
            kde = GaussianKDE()
            kde.fit(iris[name])
            univariates[:, position] = kde.cumulative_distribution(iris[name])

        first_tree = Tree(TreeTypes.CENTER)
        first_tree.fit(0, 4, kendall, univariates)
        observation = np.array([[0.1, 0.2, 0.3, 0.4]])
        _, conditional_uni_first = first_tree.get_likelihood(observation)
        tau = first_tree.get_tau_matrix()

        # Second tree: fitted from the first tree and its tau matrix.
        second_tree = Tree(TreeTypes.CENTER)
        second_tree.fit(1, 3, tau, first_tree)
        expected = 0.4888802429313932

        # Run
        actual, _ = second_tree.get_likelihood(conditional_uni_first)

        # Check
        assert compare_values_epsilon(actual, expected)
Example #13
0
    def test_to_dict_fit_model(self):
        """``to_dict`` on a fitted regular tree matches the expected nested dict."""
        # Setup
        data = pd.DataFrame(data=[
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1]
        ])
        tau_matrix = data.corr(method='kendall').values

        univariates_matrix = np.empty(data.shape)
        for position, name in enumerate(data):
            kde = KDEUnivariate()
            kde.fit(data[name])
            # ``cumulative_distribution`` is called once per value here.
            univariates_matrix[:, position] = [
                kde.cumulative_distribution(x) for x in data[name]
            ]

        tree = Tree(TreeTypes.REGULAR)
        tree.fit(0, data.shape[1], tau_matrix, univariates_matrix)

        # Literals that repeat throughout the expected structure.
        tau = -0.49999999999999994
        theta = -5.736282443655552
        diagonal = 0.8230112726144534
        off_diagonal = 0.3384880496294825
        high = 6.533235975920359
        middle = 6.425034969827687
        low = 5.857062027493768

        def expected_edge(left, right, conditionals):
            """Build the expected dictionary of one edge."""
            return {
                'D': set(),
                'L': left,
                'R': right,
                'U': conditionals,
                'likelihood': None,
                'name': CopulaTypes.FRANK,
                'neighbors': [],
                'parents': None,
                'tau': tau,
                'theta': theta
            }

        expected_result = {
            'type': 'copulas.multivariate.tree.RegularTree',
            'fitted': True,
            'level': 1,
            'n_nodes': 3,
            'previous_tree': [
                [diagonal, off_diagonal, off_diagonal],
                [off_diagonal, diagonal, off_diagonal],
                [off_diagonal, off_diagonal, diagonal]
            ],
            'tau_matrix': [
                [1.0, tau, tau],
                [tau, 1.0, tau],
                [tau, tau, 1.0]
            ],
            'tree_type': TreeTypes.REGULAR,
            'edges': [
                expected_edge(0, 1, [
                    [high, middle, low],
                    [middle, high, low]
                ]),
                expected_edge(1, 2, [
                    [low, high, middle],
                    [low, middle, high]
                ])
            ],
        }

        # Run
        result = tree.to_dict()

        # Check
        compare_nested_dicts(result, expected_result)