Esempio n. 1
0
    def test_second_tree_likelihood(self):
        """Check the likelihood returned by a second-level CENTER tree.

        A first-level tree is fitted on the CDF-transformed columns of the
        iris CSV, a second-level tree is fitted on top of it, and the
        likelihood of the second tree is compared against a stored value.
        """
        # Setup: fit the first-level tree.
        iris = pd.read_csv('data/iris.data.csv')
        kendall_tau = iris.corr(method='kendall').values

        cdf_values = np.empty(iris.shape)
        for position, column in enumerate(iris):
            series = iris[column]
            marginal = GaussianKDE()
            marginal.fit(series)
            cdf_values[:, position] = marginal.cumulative_distribution(series)

        level_one = get_tree(TreeTypes.CENTER)
        level_one.fit(0, 4, kendall_tau, cdf_values)

        sample = np.array([[0.1, 0.2, 0.3, 0.4]])
        # Only the conditional matrix is needed to feed the next level.
        _, conditional_sample = level_one.get_likelihood(sample)
        level_one_tau = level_one.get_tau_matrix()

        # Setup: fit the second-level tree on top of the first one.
        level_two = get_tree(TreeTypes.CENTER)
        level_two.fit(1, 3, level_one_tau, level_one)
        expected = 0.4888802429313932

        # Run
        actual, _ = level_two.get_likelihood(conditional_sample)

        # Check
        assert compare_values_epsilon(actual, expected)
Esempio n. 2
0
    def train_vine(self, tree_type):
        """Build the vine.

        1. For the construction of the first tree :math:`T_1`, assign one node to each
           variable and then couple them by maximizing the measure of association
           considered. Different vines impose different constraints on this construction.
           When those are applied, different trees are achieved at this level.

        2. Select the copula that best fits the pair of variables coupled by each edge in
           :math:`T_1`.

        3. Let :math:`C_{ij}(u_i , u_j )` be the copula for a given edge :math:`(u_i, u_j)`
           in :math:`T_1`. Then for every edge in :math:`T_1`, compute either

           .. math:: {v^1}_{j|i} = \\frac{\\partial C_{ij}(u_i, u_j)}{\\partial u_j}

           or similarly :math:`{v^1}_{i|j}`, which are conditional cdfs. When finished with
           all the edges, construct the new matrix :math:`v^1`, which has one column less
           than u.

        4. Set k = 2.

        5. Assign one node of :math:`T_k` to each edge of :math:`T_{k-1}`. The structure of
           :math:`T_{k-1}` imposes a set of constraints on which edges of :math:`T_k` are
           realizable. Hence the next step is to get a linked list of the accessible nodes
           for every node in :math:`T_k`.

        6. As in step 1, nodes of :math:`T_k` are coupled maximizing the measure of
           association considered and satisfying the constraints imposed by the kind of
           vine employed plus the set of constraints imposed by tree :math:`T_{k-1}`.

        7. Select the copula that best fits each edge created in :math:`T_k`.

        8. Recompute matrix :math:`v^k` as in step 3, but taking :math:`T_k` and
           :math:`v^{k-1}` instead of :math:`T_1` and u.

        9. Set :math:`k = k + 1` and repeat from (5) until all the trees are constructed.

        The fitted trees are appended, in order, to ``self.trees``.

        Args:
            tree_type (str or TreeTypes):
                Type of trees to use.
        """
        # First tree: fitted directly on the tau and univariate matrices held by the vine.
        LOGGER.debug('start building tree : 0')
        first_tree = get_tree(tree_type)
        first_tree.fit(0, self.n_var, self.tau_mat, self.u_matrix)
        self.trees.append(first_tree)
        LOGGER.debug('finish building tree : 0')

        # Remaining trees: each one is fitted on top of the tree built just before it.
        level_stop = min(self.n_var - 1, self.truncated)
        for level in range(1, level_stop):
            previous_tree = self.trees[level - 1]

            # The previous tree's constraints are computed before its tau matrix is read.
            previous_tree._get_constraints()
            previous_tau = previous_tree.get_tau_matrix()

            LOGGER.debug('start building tree: {0}'.format(level))
            next_tree = get_tree(tree_type)
            next_tree.fit(level, self.n_var - level, previous_tau, previous_tree)
            self.trees.append(next_tree)
            LOGGER.debug('finish building tree: {0}'.format(level))
Esempio n. 3
0
    def test_serialization_fit_model(self):
        """A fitted REGULAR tree is unchanged by a to_dict/from_dict round trip."""
        # Setup
        tree = get_tree(TreeTypes.REGULAR)
        frame = pd.DataFrame(data=[
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1]
        ])
        kendall_tau = frame.corr(method='kendall').values

        # One column of CDF values per column of the input frame.
        cdf_values = np.empty(frame.shape)
        for position, name in enumerate(frame):
            series = frame[name]
            marginal = GaussianKDE()
            marginal.fit(series)
            cdf_values[:, position] = marginal.cumulative_distribution(series)

        tree.fit(0, frame.shape[1], kendall_tau, cdf_values)

        # Run
        restored = Tree.from_dict(tree.to_dict())

        # Check
        assert restored.to_dict() == tree.to_dict()
Esempio n. 4
0
    def test_second_tree_likelihood(self):
        """Run the likelihood computation of a second-level REGULAR tree.

        NOTE(review): this test makes no assertion; it only verifies that
        fitting the second tree and computing both likelihoods does not raise.
        """
        # Setup: fit the second-level tree on top of the fixture tree.
        first_tau = self.tree.get_tau_matrix()
        level_two = get_tree(TreeTypes.REGULAR)
        level_two.fit(1, 3, first_tau, self.tree)
        sample = np.array([[0.1, 0.2, 0.3, 0.4]])

        # Run: the conditional matrix of the first tree feeds the second one.
        _, conditional_sample = self.tree.get_likelihood(sample)
        level_two.get_likelihood(conditional_sample)
Esempio n. 5
0
    def test_serialization_unfitted_model(self):
        """An unfitted REGULAR tree is unchanged by a to_dict/from_dict round trip."""
        # Setup
        tree = get_tree(TreeTypes.REGULAR)

        # Run
        restored = Tree.from_dict(tree.to_dict())

        # Check
        expected = tree.to_dict()
        actual = restored.to_dict()
        assert expected == actual
Esempio n. 6
0
    def test_get_tau_matrix_no_edges_empty(self):
        """With no edges, get_tau_matrix yields an array of shape (0, 0)."""
        # Setup
        edgeless_tree = get_tree(TreeTypes.DIRECT)
        edgeless_tree.edges = []

        # Run
        tau_matrix = edgeless_tree.get_tau_matrix()

        # Check
        assert tau_matrix.shape == (0, 0)
Esempio n. 7
0
    def setUp(self):
        """Fit a first-level DIRECT tree on the iris CSV for the tests to reuse.

        Stores the raw data, its Kendall tau matrix, the matrix of per-column
        CDF values and the fitted tree on the test instance.
        """
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)

        # Fill one column of the U matrix per column of the data set.
        for position, column in enumerate(self.data):
            series = self.data[column]
            marginal = GaussianKDE()
            marginal.fit(series)
            self.u_matrix[:, position] = marginal.cumulative_distribution(series)

        self.tree = get_tree(TreeTypes.DIRECT)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)
Esempio n. 8
0
    def test_second_tree_likelihood(self):
        """Check the likelihood returned by a second-level DIRECT tree."""
        # Setup: fit the second-level tree on top of the fixture tree.
        first_tau = self.tree.get_tau_matrix()

        level_two = get_tree(TreeTypes.DIRECT)
        level_two.fit(1, 3, first_tau, self.tree)

        sample = np.array([[0.1, 0.2, 0.3, 0.4]])

        # Run: the conditional matrix of the first tree feeds the second one.
        _, conditional_sample = self.tree.get_likelihood(sample)
        likelihood, _ = level_two.get_likelihood(conditional_sample)

        # Check: absolute tolerance of 0.01 around the stored value.
        expected = 0.24428294700258632
        deviation = abs(likelihood - expected)
        assert deviation < 10E-3
Esempio n. 9
0
    def test_prepare_next_tree_regular_level(self, bivariate_mock, conditional_mock):
        """prepare_next_tree computes the conditional U matrices on its edges.

        Covers a tree whose level is 2. ``bivariate_mock`` and
        ``conditional_mock`` are presumably injected by patch decorators that
        are not visible here -- verify against the decorated definition.
        """
        # Setup
        tree = get_tree(TreeTypes.REGULAR)
        tree.level = 2

        mocked_edge = MagicMock(spec=Edge)
        mocked_edge.parents = ['first_parent', 'second_parent']
        mocked_edge.name = 'copula_type'
        mocked_edge.theta = 'copula_theta'
        tree.edges = [mocked_edge]

        copula = bivariate_mock.return_value
        copula.partial_derivative.return_value = np.array([0.0, 0.25, 0.5, 0.75, 1.0])

        conditional_mock.return_value = (
            ['left_u_1', 'left_u_2'],
            ['right_u_1', 'right_u_2']
        )

        # The mocked derivative's 0.0 and 1.0 are expected as EPSILON and 1 - EPSILON.
        expected_row = [EPSILON, 0.25, 0.50, 0.75, 1 - EPSILON]
        expected_u = np.array([expected_row, expected_row])

        paired_univariates = np.array([
            ['left_u_1', 'right_u_1'],
            ['left_u_2', 'right_u_2']
        ])
        swapped_columns = np.argsort([1, 0])
        expected_calls = [
            ((paired_univariates,), {}),
            ((paired_univariates[:, swapped_columns],), {})
        ]

        # Run
        tree.prepare_next_tree()

        # Check
        compare_nested_iterables(tree.edges[0].U, expected_u)

        bivariate_mock.assert_called_once_with(copula_type='copula_type')

        conditional_mock.assert_called_once_with('first_parent', 'second_parent')

        assert copula.theta == 'copula_theta'
        actual_calls = copula.partial_derivative.call_args_list
        compare_nested_iterables(actual_calls, expected_calls)
Esempio n. 10
0
    def test_prepare_next_tree_first_level(self, bivariate_mock):
        """prepare_next_tree computes the conditional U matrices on its edges.

        Covers a tree whose level is 1 and which carries its own ``u_matrix``.
        ``bivariate_mock`` is presumably injected by a patch decorator that is
        not visible here -- verify against the decorated definition.
        """
        # Setup
        tree = get_tree(TreeTypes.REGULAR)
        tree.level = 1
        tree.u_matrix = np.array([
            [0.1, 0.2],
            [0.3, 0.4]
        ])

        mocked_edge = MagicMock(spec=Edge)
        mocked_edge.L = 0
        mocked_edge.R = 1
        mocked_edge.name = 'copula_type'
        mocked_edge.theta = 'copula_theta'
        tree.edges = [mocked_edge]

        copula = bivariate_mock.return_value
        copula.partial_derivative.return_value = np.array([0.0, 0.25, 0.5, 0.75, 1.0])

        # The mocked derivative's 0.0 and 1.0 are expected as EPSILON and 1 - EPSILON.
        expected_row = [EPSILON, 0.25, 0.50, 0.75, 1 - EPSILON]
        expected_u = np.array([expected_row, expected_row])

        swapped_columns = np.argsort([1, 0])
        expected_calls = [
            ((tree.u_matrix,), {}),
            ((tree.u_matrix[:, swapped_columns],), {})
        ]

        # Run
        tree.prepare_next_tree()

        # Check
        compare_nested_iterables(tree.edges[0].U, expected_u)

        bivariate_mock.assert_called_once_with(copula_type='copula_type')

        assert copula.theta == 'copula_theta'
        actual_calls = copula.partial_derivative.call_args_list
        compare_nested_iterables(actual_calls, expected_calls)
Esempio n. 11
0
    def test_to_dict_fit_model(self):
        """to_dict on a fitted REGULAR tree matches the stored dictionary."""
        # Setup
        tree = get_tree(TreeTypes.REGULAR)
        frame = pd.DataFrame(data=[
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1]
        ])
        kendall_tau = frame.corr(method='kendall').values

        # One column of CDF values per column of the input frame.
        cdf_values = np.empty(frame.shape)
        for position, name in enumerate(frame):
            series = frame[name]
            marginal = GaussianKDE()
            marginal.fit(series)
            cdf_values[:, position] = marginal.cumulative_distribution(series)

        tree.fit(0, frame.shape[1], kendall_tau, cdf_values)

        # Values that repeat throughout the expected dictionary.
        on_diagonal = 0.8230112726144534
        off_diagonal = 0.3384880496294825
        pair_tau = -0.49999999999999994
        pair_theta = -5.736282443655552

        first_edge = {
            'index': 0,
            'D': set(),
            'L': 0,
            'R': 1,
            'U': [
                [0.7969636014074211, 0.6887638642325501, 0.12078520049364487],
                [0.6887638642325501, 0.7969636014074211, 0.12078520049364487]
            ],
            'likelihood': None,
            'name': CopulaTypes.FRANK,
            'neighbors': [],
            'parents': None,
            'tau': pair_tau,
            'theta': pair_theta
        }
        second_edge = {
            'index': 1,
            'D': set(),
            'L': 1,
            'R': 2,
            'U': [
                [0.12078520049364491, 0.7969636014074213, 0.6887638642325501],
                [0.12078520049364491, 0.6887638642325503, 0.7969636014074211]
            ],
            'likelihood': None,
            'name': CopulaTypes.FRANK,
            'neighbors': [],
            'parents': None,
            'tau': pair_tau,
            'theta': pair_theta
        }

        expected = {
            'type': 'copulas.multivariate.tree.RegularTree',
            'fitted': True,
            'level': 1,
            'n_nodes': 3,
            'previous_tree': [
                [on_diagonal, off_diagonal, off_diagonal],
                [off_diagonal, on_diagonal, off_diagonal],
                [off_diagonal, off_diagonal, on_diagonal]
            ],
            'tau_matrix': [
                [1.0, pair_tau, pair_tau],
                [pair_tau, 1.0, pair_tau],
                [pair_tau, pair_tau, 1.0]
            ],
            'tree_type': TreeTypes.REGULAR,
            'edges': [first_edge, second_edge],
        }

        # Run
        actual = tree.to_dict()

        # Check
        compare_nested_dicts(actual, expected)