def test_second_tree_likelihood(self):
    """Check the likelihood of a second-level center tree against a known value."""
    # Setup: fit the first-level tree on the iris dataset.
    iris = pd.read_csv('data/iris.data.csv')
    kendall_tau = iris.corr(method='kendall').values

    cdf_values = np.empty(iris.shape)
    for position, column in enumerate(iris):
        kde = GaussianKDE()
        kde.fit(iris[column])
        cdf_values[:, position] = kde.cumulative_distribution(iris[column])

    level_one = get_tree(TreeTypes.CENTER)
    level_one.fit(0, 4, kendall_tau, cdf_values)

    # Only the conditional values of the first tree feed the second tree.
    sample = np.array([[0.1, 0.2, 0.3, 0.4]])
    _, conditional_sample = level_one.get_likelihood(sample)
    level_one_tau = level_one.get_tau_matrix()

    # Setup: fit the second-level tree on top of the first one.
    level_two = get_tree(TreeTypes.CENTER)
    level_two.fit(1, 3, level_one_tau, level_one)
    expected = 0.4888802429313932

    # Run
    observed, _ = level_two.get_likelihood(conditional_sample)

    # Check
    assert compare_values_epsilon(observed, expected)
def train_vine(self, tree_type):
    """Build the vine.

    1. For the construction of the first tree :math:`T_1`, assign one node to each
       variable and then couple them by maximizing the measure of association
       considered. Different vines impose different constraints on this construction.
       When those are applied different trees are achieved at this level.

    2. Select the copula that best fits the pair of variables coupled by each edge
       in :math:`T_1`.

    3. Let :math:`C_{ij}(u_i , u_j )` be the copula for a given edge
       :math:`(u_i, u_j)` in :math:`T_1`. Then for every edge in :math:`T_1`,
       compute either

       .. math:: {v^1}_{j|i} = \\frac{\\partial C_{ij}(u_i, u_j)}{\\partial u_j}

       or similarly :math:`{v^1}_{i|j}`, which are conditional cdfs. When finished
       with all the edges, construct the new matrix with :math:`v^1` that has one
       less column u.

    4. Set k = 2.

    5. Assign one node of :math:`T_k` to each edge of :math:`T_{k-1}`. The structure
       of :math:`T_{k-1}` imposes a set of constraints on which edges of :math:`T_k`
       are realizable. Hence the next step is to get a linked list of the accessible
       nodes for every node in :math:`T_k`.

    6. As in step 1, nodes of :math:`T_k` are coupled maximizing the measure of
       association considered and satisfying the constraints imposed by the kind of
       vine employed plus the set of constraints imposed by tree :math:`T_{k-1}`.

    7. Select the copula that best fits each edge created in :math:`T_k`.

    8. Recompute matrix :math:`v_k` as in step 3, but taking :math:`T_k` and
       :math:`v_{k-1}` instead of :math:`T_1` and u.

    9. Set :math:`k = k + 1` and repeat from (5) until all the trees are constructed.

    Every fitted tree is appended to ``self.trees``. At most
    ``min(self.n_var - 1, self.truncated)`` trees are built.

    Args:
        tree_type (str or TreeTypes):
            Type of trees to use.
    """
    LOGGER.debug('start building tree : 0')
    # Steps 1-3: the first tree is fitted directly on the input matrices.
    tree_1 = get_tree(tree_type)
    tree_1.fit(0, self.n_var, self.tau_mat, self.u_matrix)
    self.trees.append(tree_1)
    LOGGER.debug('finish building tree : 0')

    for k in range(1, min(self.n_var - 1, self.truncated)):
        # Each further tree is built from the previous one: its constraints must
        # be computed before its tau matrix is read.
        previous_tree = self.trees[k - 1]
        previous_tree._get_constraints()
        tau = previous_tree.get_tau_matrix()

        # Lazy %-style arguments: the message is only formatted when the DEBUG
        # level is actually enabled.
        LOGGER.debug('start building tree: %s', k)
        tree_k = get_tree(tree_type)
        tree_k.fit(k, self.n_var - k, tau, previous_tree)
        self.trees.append(tree_k)
        LOGGER.debug('finish building tree: %s', k)
def test_serialization_fit_model(self):
    """A fitted regular tree is unchanged by a to_dict / from_dict round trip."""
    # Setup
    tree = get_tree(TreeTypes.REGULAR)
    frame = pd.DataFrame(data=[
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]
    ])
    kendall_tau = frame.corr(method='kendall').values

    # One column of cumulative-distribution values per input column.
    cdf_values = np.empty(frame.shape)
    for position, name in enumerate(frame):
        kde = GaussianKDE()
        kde.fit(frame[name])
        cdf_values[:, position] = kde.cumulative_distribution(frame[name])

    tree.fit(0, frame.shape[1], kendall_tau, cdf_values)

    # Run
    restored = Tree.from_dict(tree.to_dict())

    # Check
    assert restored.to_dict() == tree.to_dict()
def test_second_tree_likelihood(self):
    """Compute the likelihood of a second-level regular tree on one sample.

    NOTE(review): nothing is asserted about the resulting value; as written this
    test only fails if one of the calls raises.
    """
    first_tau = self.tree.get_tau_matrix()

    level_two = get_tree(TreeTypes.REGULAR)
    level_two.fit(1, 3, first_tau, self.tree)

    sample = np.array([[0.1, 0.2, 0.3, 0.4]])

    # The conditional values returned by the first tree are the input of the second.
    _, conditional_sample = self.tree.get_likelihood(sample)
    _likelihood, _conditional = level_two.get_likelihood(conditional_sample)
def test_serialization_unfitted_model(self):
    """An unfitted regular tree is unchanged by a to_dict / from_dict round trip."""
    # Setup
    tree = get_tree(TreeTypes.REGULAR)

    # Run
    serialized = tree.to_dict()
    restored = Tree.from_dict(serialized)

    # Check
    assert tree.to_dict() == restored.to_dict()
def test_get_tau_matrix_no_edges_empty(self):
    """With an empty edge list, get_tau_matrix yields a (0, 0)-shaped result."""
    # Setup
    edgeless_tree = get_tree(TreeTypes.DIRECT)
    edgeless_tree.edges = []

    # Run
    tau_matrix = edgeless_tree.get_tau_matrix()

    # Check
    expected_shape = (0, 0)
    assert tau_matrix.shape == expected_shape
def setUp(self):
    """Fit a first-level direct tree on the iris dataset for the tests to use.

    Stores on ``self`` the loaded data (``data``), its Kendall correlation matrix
    (``tau_mat``), the matrix of per-column ``GaussianKDE`` cumulative-distribution
    values (``u_matrix``) and the fitted tree (``tree``).
    """
    self.data = pd.read_csv('data/iris.data.csv')
    self.tau_mat = self.data.corr(method='kendall').values
    self.u_matrix = np.empty(self.data.shape)

    # enumerate pairs each column with its position, so no manual counter is needed.
    for index, col in enumerate(self.data):
        uni = GaussianKDE()
        uni.fit(self.data[col])
        self.u_matrix[:, index] = uni.cumulative_distribution(self.data[col])

    self.tree = get_tree(TreeTypes.DIRECT)
    self.tree.fit(0, 4, self.tau_mat, self.u_matrix)
def test_second_tree_likelihood(self):
    """Check the likelihood of a second-level direct tree against a known value."""
    expected = 0.24428294700258632
    tolerance = 10E-3  # evaluates to 0.01

    first_tau = self.tree.get_tau_matrix()

    level_two = get_tree(TreeTypes.DIRECT)
    level_two.fit(1, 3, first_tau, self.tree)

    sample = np.array([[0.1, 0.2, 0.3, 0.4]])

    # The conditional values returned by the first tree are the input of the second.
    _, conditional_sample = self.tree.get_likelihood(sample)
    observed, _ = level_two.get_likelihood(conditional_sample)

    assert abs(observed - expected) < tolerance
def test_prepare_next_tree_regular_level(self, bivariate_mock, conditional_mock):
    """prepare_next_tree at level 2 derives each edge's U from its parents.

    The copula's partial derivative is mocked to return values that include 0.0
    and 1.0; the U stored on the edge is expected to hold EPSILON and 1 - EPSILON
    in those positions.
    """
    # Setup: a level-2 tree holding a single mocked edge.
    tree = get_tree(TreeTypes.REGULAR)
    tree.level = 2

    mocked_edge = MagicMock(spec=Edge)
    mocked_edge.parents = ['first_parent', 'second_parent']
    mocked_edge.name = 'copula_type'
    mocked_edge.theta = 'copula_theta'
    tree.edges = [mocked_edge]

    copula = bivariate_mock.return_value
    copula.partial_derivative.return_value = np.array([0.0, 0.25, 0.5, 0.75, 1.0])

    left_values = ['left_u_1', 'left_u_2']
    right_values = ['right_u_1', 'right_u_2']
    conditional_mock.return_value = (left_values, right_values)

    clipped_row = [EPSILON, 0.25, 0.50, 0.75, 1 - EPSILON]
    expected_u = np.array([clipped_row, clipped_row])

    stacked = np.array([
        ['left_u_1', 'right_u_1'],
        ['left_u_2', 'right_u_2']
    ])
    swapped = stacked[:, np.argsort([1, 0])]
    expected_calls = [
        ((stacked,), {}),
        ((swapped,), {})
    ]

    # Run
    tree.prepare_next_tree()

    # Check
    compare_nested_iterables(tree.edges[0].U, expected_u)

    bivariate_mock.assert_called_once_with(copula_type='copula_type')
    conditional_mock.assert_called_once_with('first_parent', 'second_parent')

    assert copula.theta == 'copula_theta'
    compare_nested_iterables(copula.partial_derivative.call_args_list, expected_calls)
def test_prepare_next_tree_first_level(self, bivariate_mock):
    """prepare_next_tree at level 1 derives each edge's U from the tree's u_matrix.

    The copula's partial derivative is mocked to return values that include 0.0
    and 1.0; the U stored on the edge is expected to hold EPSILON and 1 - EPSILON
    in those positions.
    """
    # Setup: a level-1 tree holding a single mocked edge between columns 0 and 1.
    tree = get_tree(TreeTypes.REGULAR)
    tree.level = 1
    tree.u_matrix = np.array([
        [0.1, 0.2],
        [0.3, 0.4]
    ])

    mocked_edge = MagicMock(spec=Edge)
    mocked_edge.L = 0
    mocked_edge.R = 1
    mocked_edge.name = 'copula_type'
    mocked_edge.theta = 'copula_theta'
    tree.edges = [mocked_edge]

    copula = bivariate_mock.return_value
    copula.partial_derivative.return_value = np.array([0.0, 0.25, 0.5, 0.75, 1.0])

    clipped_row = [EPSILON, 0.25, 0.50, 0.75, 1 - EPSILON]
    expected_u = np.array([clipped_row, clipped_row])

    swapped = tree.u_matrix[:, np.argsort([1, 0])]
    expected_calls = [
        ((tree.u_matrix,), {}),
        ((swapped,), {})
    ]

    # Run
    tree.prepare_next_tree()

    # Check
    compare_nested_iterables(tree.edges[0].U, expected_u)

    bivariate_mock.assert_called_once_with(copula_type='copula_type')

    assert copula.theta == 'copula_theta'
    compare_nested_iterables(copula.partial_derivative.call_args_list, expected_calls)
def test_to_dict_fit_model(self):
    """to_dict on a fitted regular tree matches the stored reference dictionary."""
    # Setup
    tree = get_tree(TreeTypes.REGULAR)
    frame = pd.DataFrame(data=[
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]
    ])
    kendall_tau = frame.corr(method='kendall').values

    # One column of cumulative-distribution values per input column.
    cdf_values = np.empty(frame.shape)
    for position, name in enumerate(frame):
        kde = GaussianKDE()
        kde.fit(frame[name])
        cdf_values[:, position] = kde.cumulative_distribution(frame[name])

    tree.fit(0, frame.shape[1], kendall_tau, cdf_values)

    # Reference values that recur throughout the expected dictionary.
    on_diag = 0.8230112726144534
    off_diag = 0.3384880496294825
    tau = -0.49999999999999994
    theta = -5.736282443655552

    first_edge = {
        'index': 0,
        'D': set(),
        'L': 0,
        'R': 1,
        'U': [
            [0.7969636014074211, 0.6887638642325501, 0.12078520049364487],
            [0.6887638642325501, 0.7969636014074211, 0.12078520049364487]
        ],
        'likelihood': None,
        'name': CopulaTypes.FRANK,
        'neighbors': [],
        'parents': None,
        'tau': tau,
        'theta': theta
    }
    second_edge = {
        'index': 1,
        'D': set(),
        'L': 1,
        'R': 2,
        'U': [
            [0.12078520049364491, 0.7969636014074213, 0.6887638642325501],
            [0.12078520049364491, 0.6887638642325503, 0.7969636014074211]
        ],
        'likelihood': None,
        'name': CopulaTypes.FRANK,
        'neighbors': [],
        'parents': None,
        'tau': tau,
        'theta': theta
    }
    expected = {
        'type': 'copulas.multivariate.tree.RegularTree',
        'fitted': True,
        'level': 1,
        'n_nodes': 3,
        'previous_tree': [
            [on_diag, off_diag, off_diag],
            [off_diag, on_diag, off_diag],
            [off_diag, off_diag, on_diag]
        ],
        'tau_matrix': [
            [1.0, tau, tau],
            [tau, 1.0, tau],
            [tau, tau, 1.0]
        ],
        'tree_type': TreeTypes.REGULAR,
        'edges': [first_edge, second_edge],
    }

    # Run
    actual = tree.to_dict()

    # Check
    compare_nested_dicts(actual, expected)