def test_second_tree_likelihood(self):
    """Assert second tree likelihood is correct."""
    # Setup
    # Build the first tree from the iris dataset: kendall taus plus a matrix
    # of per-column empirical CDF values obtained from univariate KDEs.
    data = pd.read_csv('data/iris.data.csv')
    tau_mat = data.corr(method='kendall').values
    u_matrix = np.empty(data.shape)

    for position, column in enumerate(data):
        marginal = KDEUnivariate()
        marginal.fit(data[column])
        u_matrix[:, position] = [
            marginal.cumulative_distribution(value)
            for value in data[column]
        ]

    first_tree = Tree(TreeTypes.CENTER)
    first_tree.fit(0, 4, tau_mat, u_matrix)
    uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])
    likelihood_first_tree, conditional_uni_first = first_tree.get_likelihood(uni_matrix)
    tau = first_tree.get_tau_matrix()

    # Build the second tree on top of the first tree's tau matrix.
    second_tree = Tree(TreeTypes.CENTER)
    second_tree.fit(1, 3, tau, first_tree)
    expected_likelihood_second_tree = 0.4888802429313932

    # Run
    likelihood_second_tree, out_u = second_tree.get_likelihood(conditional_uni_first)

    # Check
    assert compare_values_epsilon(likelihood_second_tree, expected_likelihood_second_tree)
def test_serialization_fit_model(self):
    """A fitted tree survives a to_dict / from_dict round trip."""
    # Setup
    instance = Tree(TreeTypes.REGULAR)
    X = pd.DataFrame(data=[
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]
    ])
    index = 0
    n_nodes = X.shape[1]
    tau_matrix = X.corr(method='kendall').values

    # Per-column empirical CDF values computed with univariate KDEs.
    univariates_matrix = np.empty(X.shape)
    for position, column in enumerate(X):
        marginal = KDEUnivariate()
        marginal.fit(X[column])
        univariates_matrix[:, position] = [
            marginal.cumulative_distribution(value) for value in X[column]
        ]

    instance.fit(index, n_nodes, tau_matrix, univariates_matrix)

    # Run
    result = Tree.from_dict(instance.to_dict())

    # Check
    assert result.to_dict() == instance.to_dict()
def setUp(self):
    """Load the iris dataset and fit a DIRECT-type vine tree on it.

    Builds the kendall-tau matrix and a matrix of per-column empirical
    CDF values (via univariate KDEs), then fits ``self.tree`` so each
    test method starts from the same fitted state.
    """
    self.data = pd.read_csv('data/iris.data.csv')
    self.tau_mat = self.data.corr(method='kendall').values
    self.u_matrix = np.empty(self.data.shape)

    # Idiom fix: enumerate() replaces the original manual `count` counter.
    for index, col in enumerate(self.data):
        uni = KDEUnivariate()
        uni.fit(self.data[col])
        self.u_matrix[:, index] = [
            uni.cumulative_distribution(x) for x in self.data[col]
        ]

    self.tree = Tree(TreeTypes.DIRECT)
    self.tree.fit(0, 4, self.tau_mat, self.u_matrix)
class TestKDEUnivariate(TestCase):
    """Unit tests for the KDEUnivariate distribution wrapper."""

    def setup_norm(self):
        """Set up the model to fit standard norm data."""
        self.kde = KDEUnivariate()
        # use 42 as a fixed random seed
        np.random.seed(42)
        column = np.random.normal(0, 1, 1000)
        self.kde.fit(column)

    def test___init__(self):
        """On init, model are set to None."""
        self.kde = KDEUnivariate()
        assert self.kde.model is None

    def test_fit(self):
        """On fit, kde model is instantiated with intance of gaussian_kde."""
        self.kde = KDEUnivariate()
        data = [1, 2, 3, 4, 5]
        self.kde.fit(data)
        self.assertIsInstance(self.kde.model, scipy.stats.gaussian_kde)

    def test_fit_uniform(self):
        """On fit, kde model is instantiated with passed data."""
        self.kde = KDEUnivariate()
        data = [1, 2, 3, 4, 5]
        self.kde.fit(data)
        assert self.kde.model

    def test_fit_empty_data(self):
        """If fitting kde model with empty data it will raise ValueError."""
        self.kde = KDEUnivariate()
        with self.assertRaises(ValueError):
            self.kde.fit([])

    def test_probability_density(self):
        """probability_density evaluates with the model."""
        self.setup_norm()
        x = self.kde.probability_density(0.5)
        expected = 0.35206532676429952
        # Fixed: assertAlmostEquals is a deprecated alias (removed in
        # Python 3.12); assertAlmostEqual is the supported spelling.
        self.assertAlmostEqual(x, expected, places=1)

    def test_cumulative_distribution(self):
        """cumulative_distribution evaluates with the model."""
        self.setup_norm()
        x = self.kde.cumulative_distribution(0.5)
        expected = 0.69146246127401312
        self.assertAlmostEqual(x, expected, places=1)

    def test_percent_point(self):
        """percent_point evaluates with the model."""
        self.setup_norm()
        x = self.kde.percent_point(0.5)
        expected = 0.0
        self.assertAlmostEqual(x, expected, places=1)

    def test_percent_point_invalid_value(self):
        """Evaluating an invalid value will raise ValueError."""
        self.setup_norm()
        with self.assertRaises(ValueError):
            self.kde.percent_point(2)

    def test_from_dict(self):
        """From_dict sets the values of a dictionary as attributes of the instance."""
        # Setup
        distribution = KDEUnivariate()
        parameters = {
            'd': 1,
            'n': 10,
            'dataset': [[
                0.4967141530112327,
                -0.13826430117118466,
                0.6476885381006925,
                1.5230298564080254,
                -0.23415337472333597,
                -0.23413695694918055,
                1.5792128155073915,
                0.7674347291529088,
                -0.4694743859349521,
                0.5425600435859647
            ]],
            'covariance': [[0.2081069604419522]],
            'factor': 0.6309573444801932,
            'inv_cov': [[4.805221304834407]]
        }

        # Run
        distribution = KDEUnivariate.from_dict(parameters)

        # Check
        assert distribution.model.d == 1
        assert distribution.model.n == 10
        assert distribution.model.covariance == np.array([[0.2081069604419522]])
        assert distribution.model.factor == 0.6309573444801932
        assert distribution.model.inv_cov == np.array([[4.805221304834407]])
        assert (distribution.model.dataset == np.array([[
            0.4967141530112327,
            -0.13826430117118466,
            0.6476885381006925,
            1.5230298564080254,
            -0.23415337472333597,
            -0.23413695694918055,
            1.5792128155073915,
            0.7674347291529088,
            -0.4694743859349521,
            0.5425600435859647
        ]])).all()

    def test_to_dict(self):
        """To_dict returns the defining parameters of a distribution in a dict."""
        # Setup
        distribution = KDEUnivariate()
        column = np.array([[
            0.4967141530112327,
            -0.13826430117118466,
            0.6476885381006925,
            1.5230298564080254,
            -0.23415337472333597,
            -0.23413695694918055,
            1.5792128155073915,
            0.7674347291529088,
            -0.4694743859349521,
            0.5425600435859647
        ]])
        distribution.fit(column)
        expected_result = {
            'd': 1,
            'n': 10,
            'dataset': [[
                0.4967141530112327,
                -0.13826430117118466,
                0.6476885381006925,
                1.5230298564080254,
                -0.23415337472333597,
                -0.23413695694918055,
                1.5792128155073915,
                0.7674347291529088,
                -0.4694743859349521,
                0.5425600435859647
            ]],
            'covariance': [[0.20810696044195218]],
            'factor': 0.6309573444801932,
            'inv_cov': [[4.805221304834407]]
        }

        # Run
        result = distribution.to_dict()

        # Check
        compare_nested_dicts(result, expected_result)
def test_to_dict_fit_model(self):
    """to_dict of a fitted REGULAR tree matches the exact expected values.

    NOTE(review): the expected dict pins exact floats produced by fitting
    KDE marginals on the identity-matrix frame — any change to the fitting
    code will legitimately invalidate these numbers.
    """
    # Setup
    instance = Tree(TreeTypes.REGULAR)
    X = pd.DataFrame(data=[
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1]
    ])
    index = 0
    n_nodes = X.shape[1]
    tau_matrix = X.corr(method='kendall').values

    # Build the matrix of per-column empirical CDF values via univariate KDEs.
    univariates_matrix = np.empty(X.shape)
    for i, column in enumerate(X):
        distribution = KDEUnivariate()
        distribution.fit(X[column])
        univariates_matrix[:, i] = [
            distribution.cumulative_distribution(x) for x in X[column]
        ]

    instance.fit(index, n_nodes, tau_matrix, univariates_matrix)
    expected_result = {
        'type': 'copulas.multivariate.tree.RegularTree',
        'fitted': True,
        'level': 1,
        'n_nodes': 3,
        'previous_tree': [
            [0.8230112726144534, 0.3384880496294825, 0.3384880496294825],
            [0.3384880496294825, 0.8230112726144534, 0.3384880496294825],
            [0.3384880496294825, 0.3384880496294825, 0.8230112726144534]
        ],
        'tau_matrix': [
            [1.0, -0.49999999999999994, -0.49999999999999994],
            [-0.49999999999999994, 1.0, -0.49999999999999994],
            [-0.49999999999999994, -0.49999999999999994, 1.0]
        ],
        'tree_type': TreeTypes.REGULAR,
        'edges': [
            {
                'index': 0,
                'D': set(),
                'L': 0,
                'R': 1,
                'U': [
                    [0.7969535322648066, 0.6887525261721343, 0.12077958383821545],
                    [0.6887525261721343, 0.7969535322648066, 0.12077958383821545]
                ],
                'likelihood': None,
                'name': CopulaTypes.FRANK,
                'neighbors': [],
                'parents': None,
                'tau': -0.49999999999999994,
                'theta': -5.736282443655552
            },
            {
                'index': 1,
                'D': set(),
                'L': 1,
                'R': 2,
                'U': [
                    [0.12077958383821545, 0.7969535322648066, 0.6887525261721343],
                    [0.12077958383821545, 0.6887525261721343, 0.7969535322648066]
                ],
                'likelihood': None,
                'name': CopulaTypes.FRANK,
                'neighbors': [],
                'parents': None,
                'tau': -0.49999999999999994,
                'theta': -5.736282443655552
            }
        ],
    }

    # Run
    result = instance.to_dict()

    # Check
    compare_nested_dicts(result, expected_result)
class TestKDEUnivariate(TestCase):
    """Unit tests for KDEUnivariate, including constant-value degeneration."""

    def setup_norm(self):
        """Set up the model to fit standard norm data."""
        self.kde = KDEUnivariate()
        # use 42 as a fixed random seed
        np.random.seed(42)
        column = np.random.normal(0, 1, 1000)
        self.kde.fit(column)

    def test___init__(self):
        """On init, model are set to None."""
        # Setup / Run
        instance = KDEUnivariate()

        # Check
        # BUG FIX: these checks were bare expressions whose boolean results
        # were silently discarded, so this test could never fail. They are
        # now real assertions of the intended post-init state.
        assert instance.model is None
        assert instance.fitted is False
        assert instance.constant_value is None
        assert instance.sample_size == 10

    @patch('copulas.univariate.kde.scipy.stats.gaussian_kde', autospec=True)
    def test_fit(self, kde_mock):
        """On fit, a new instance of gaussian_kde is fitted."""
        # Setup
        instance = KDEUnivariate()
        X = np.array([1, 2, 3, 4, 5])
        kde_instance_mock = kde_mock.return_value

        # Run
        instance.fit(X)

        # Check
        assert instance.model == kde_instance_mock
        assert instance.fitted is True
        assert instance.constant_value is None
        kde_mock.assert_called_once_with(X)

    def test_fit_constant(self):
        """If fit data is constant, no gaussian_kde model is created."""
        # Setup
        instance = KDEUnivariate()
        X = np.array([1, 1, 1, 1, 1])

        # Run
        instance.fit(X)

        # Check
        assert instance.model is None
        assert instance.constant_value == 1
        assert instance.fitted is True

    @patch('copulas.univariate.kde.scipy.stats.gaussian_kde', autospec=True)
    def test_probability_density(self, kde_mock):
        """Probability_density evaluates with the model."""
        # Setup
        kde_instance = kde_mock.return_value
        # evaluate returns a 1-element sequence; probability_density is
        # expected to unwrap it to the scalar value.
        kde_instance.evaluate.return_value = ('probability_density_from_model',)

        instance = KDEUnivariate()
        X = np.array([1, 2, 3, 4, 5])
        instance.fit(X)

        # Run
        result = instance.probability_density(0.5)

        # Check
        assert result == 'probability_density_from_model'
        kde_instance.evaluate.assert_called_once_with(0.5)

    def test_cumulative_distribution(self):
        """cumulative_distribution evaluates with the model."""
        self.setup_norm()
        x = self.kde.cumulative_distribution(0.5)
        expected = 0.69146246127401312
        self.assertAlmostEqual(x, expected, places=1)

    def test_percent_point(self):
        """percent_point evaluates with the model."""
        # Setup
        instance = KDEUnivariate(sample_size=1000)
        np.random.seed(42)
        X = np.random.normal(0, 1, 1000)
        instance.fit(X)
        expected_result = 0.0

        # Run
        result = instance.percent_point(0.5)

        # Check
        np.testing.assert_allclose(result, expected_result, atol=1e-1)
        # TODO: Discuss if this loss of precision is acceptable

    def test_percent_point_invalid_value(self):
        """Evaluating an invalid value will raise ValueError."""
        self.setup_norm()
        with self.assertRaises(ValueError):
            self.kde.percent_point(2)

    @patch('copulas.univariate.kde.scipy.stats.gaussian_kde', autospec=True)
    def test_sample(self, kde_mock):
        """When fitted, we are able to use the model to get samples."""
        # Setup
        model_mock = kde_mock.return_value
        model_mock.resample.return_value = np.array([0, 1, 0, 1, 0])

        instance = KDEUnivariate()
        X = np.array([1, 2, 3, 4, 5])
        instance.fit(X)

        expected_result = np.array([0, 1, 0, 1, 0])

        # Run
        result = instance.sample(5)

        # Check
        compare_nested_iterables(result, expected_result)
        assert instance.model == model_mock
        kde_mock.assert_called_once_with(X)
        model_mock.resample.assert_called_once_with(5)

    def test_sample_constant(self):
        """If constant_value is set, all the sample have the same value."""
        # Setup
        instance = KDEUnivariate()
        instance.fitted = True
        instance.constant_value = 3
        instance._replace_constant_methods()

        expected_result = np.array([3, 3, 3, 3, 3])

        # Run
        result = instance.sample(5)

        # Check
        compare_nested_iterables(result, expected_result)

    def test_sample_random_state(self):
        """If random_state is set, samples will generate the exact same values."""
        # Setup
        instance = KDEUnivariate(random_seed=0)
        X = np.array([1, 2, 3, 4, 5])
        instance.fit(X)

        expected_result_random_state = np.array([
            [5.02156389, 5.45857107, 6.12161148, 4.56801267, 6.14017901]
        ])

        # Run
        result = instance.sample(5)

        # Check
        compare_nested_iterables(result, expected_result_random_state)

    def test_from_dict(self):
        """From_dict sets the values of a dictionary as attributes of the instance."""
        # Setup
        parameters = {
            'fitted': True,
            'dataset': [[
                0.4967141530112327,
                -0.13826430117118466,
                0.6476885381006925,
                1.5230298564080254,
                -0.23415337472333597,
                -0.23413695694918055,
                1.5792128155073915,
                0.7674347291529088,
                -0.4694743859349521,
                0.5425600435859647
            ]],
        }

        # Run
        distribution = KDEUnivariate.from_dict(parameters)

        # Check
        assert distribution.model.d == 1
        assert distribution.model.n == 10
        assert distribution.model.covariance == np.array([[0.20810696044195226]])
        assert distribution.model.factor == 0.6309573444801932
        assert distribution.model.inv_cov == np.array([[4.805221304834406]])
        assert (distribution.model.dataset == np.array([[
            0.4967141530112327,
            -0.13826430117118466,
            0.6476885381006925,
            1.5230298564080254,
            -0.23415337472333597,
            -0.23413695694918055,
            1.5792128155073915,
            0.7674347291529088,
            -0.4694743859349521,
            0.5425600435859647
        ]])).all()

    def test_to_dict(self):
        """To_dict returns the defining parameters of a distribution in a dict."""
        # Setup
        distribution = KDEUnivariate()
        column = np.array([[
            0.4967141530112327,
            -0.13826430117118466,
            0.6476885381006925,
            1.5230298564080254,
            -0.23415337472333597,
            -0.23413695694918055,
            1.5792128155073915,
            0.7674347291529088,
            -0.4694743859349521,
            0.5425600435859647
        ]])
        distribution.fit(column)
        expected_result = {
            'type': 'copulas.univariate.kde.KDEUnivariate',
            'fitted': True,
            'dataset': [[
                0.4967141530112327,
                -0.13826430117118466,
                0.6476885381006925,
                1.5230298564080254,
                -0.23415337472333597,
                -0.23413695694918055,
                1.5792128155073915,
                0.7674347291529088,
                -0.4694743859349521,
                0.5425600435859647
            ]],
        }

        # Run
        result = distribution.to_dict()

        # Check
        compare_nested_dicts(result, expected_result)

    def test_valid_serialization_unfit_model(self):
        """For a unfitted model to_dict and from_dict are opposites."""
        # Setup
        instance = KDEUnivariate()

        # Run
        result = KDEUnivariate.from_dict(instance.to_dict())

        # Check
        assert instance.to_dict() == result.to_dict()

    def test_valid_serialization_fit_model(self):
        """For a fitted model to_dict and from_dict are opposites."""
        # Setup
        instance = KDEUnivariate()
        X = np.array([1, 2, 3, 4])
        instance.fit(X)

        # Run
        result = KDEUnivariate.from_dict(instance.to_dict())

        # Check
        assert instance.to_dict() == result.to_dict()

    @patch('copulas.univariate.base.Univariate._constant_probability_density', autospec=True)
    def test_probability_density_constant(self, pdf_mock):
        """If constant_value, probability_density uses the degenerate version."""
        # Setup
        instance = KDEUnivariate()
        instance.fitted = True
        instance.constant_value = 3
        instance._replace_constant_methods()

        X = np.array([0, 1, 2, 3, 4, 5])
        expected_result = np.array([0, 0, 1, 0, 0])
        pdf_mock.return_value = np.array([0, 0, 1, 0, 0])

        # Run
        result = instance.probability_density(X)

        # Check
        compare_nested_iterables(result, expected_result)
        pdf_mock.assert_called_once_with(instance, X)

    @patch('copulas.univariate.base.Univariate._constant_percent_point', autospec=True)
    def test_percent_point_constant_raises(self, ppf_mock):
        """If constant_value, percent_point uses the degenerate version."""
        # Setup
        instance = KDEUnivariate()
        instance.fitted = True
        instance.constant_value = 3
        instance._replace_constant_methods()

        X = np.array([0.1, 0.5, 0.75])
        expected_result = np.array([3, 3, 3])
        ppf_mock.return_value = np.array([3, 3, 3])

        # Run
        result = instance.percent_point(X)

        # Check
        compare_nested_iterables(result, expected_result)
        ppf_mock.assert_called_once_with(instance, X)