def test_cumulative_distribution(self, kde_mock):
    """Check cumulative_distribution against a mocked KDE model.

    The mocked model's ``integrate_box_1d`` yields one value per evaluated
    point; the test verifies the returned array and the arguments of each
    ``integrate_box_1d`` call.
    """
    # Setup
    mocked_model = kde_mock.return_value
    mocked_model.integrate_box_1d.side_effect = [0.0, 0.5, 1.0]
    mocked_model.dataset = MagicMock()
    mocked_model.dataset.mean.return_value = 1
    mocked_model.dataset.std.return_value = 0.1

    training_values = np.array([1, 2, 3, 4, 5])
    kde = GaussianKDE()
    kde.fit(training_values)

    query_points = np.array([-10, 0, 10])
    expected_cdf = np.array([0.0, 0.5, 1.0])

    # The first positional argument of every call is the lower bound,
    # which with the mocked mean/std above is 1 - 0.1*5 = 0.5.
    lower_bound = 0.5
    expected_calls = [
        ((lower_bound, upper_bound), {})
        for upper_bound in (-10, 0, 10)
    ]

    # Run
    actual_cdf = kde.cumulative_distribution(query_points)

    # Check
    compare_nested_iterables(actual_cdf, expected_cdf)
    kde_mock.assert_called_once_with(training_values)
    assert mocked_model.integrate_box_1d.call_args_list == expected_calls
def test_second_tree_likelihood(self):
    """Verify the likelihood reported by a second-level center tree.

    A first center tree is fitted on the iris data, and its tau matrix and
    conditional univariates feed the second tree whose likelihood is checked.
    """
    # Setup: first tree
    iris = pd.read_csv('data/iris.data.csv')
    kendall_tau = iris.corr(method='kendall').values

    cdf_matrix = np.empty(iris.shape)
    for position, name in enumerate(iris):
        kde = GaussianKDE()
        kde.fit(iris[name])
        cdf_matrix[:, position] = kde.cumulative_distribution(iris[name])

    tree_one = get_tree(TreeTypes.CENTER)
    tree_one.fit(0, 4, kendall_tau, cdf_matrix)

    evaluation_point = np.array([[0.1, 0.2, 0.3, 0.4]])
    # Only the conditional univariates are needed from the first tree.
    _, conditional_uni_first = tree_one.get_likelihood(evaluation_point)
    first_tree_tau = tree_one.get_tau_matrix()

    # Setup: second tree
    tree_two = get_tree(TreeTypes.CENTER)
    tree_two.fit(1, 3, first_tree_tau, tree_one)

    expected_likelihood = 0.4888802429313932

    # Run
    actual_likelihood, _ = tree_two.get_likelihood(conditional_uni_first)

    # Check
    assert compare_values_epsilon(actual_likelihood, expected_likelihood)
def test_serialization_fit_model(self):
    """Round-trip a fitted regular tree through to_dict / from_dict."""
    # Setup
    tree = get_tree(TreeTypes.REGULAR)
    rows = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]
    frame = pd.DataFrame(data=rows)

    kendall_tau = frame.corr(method='kendall').values
    cdf_matrix = np.empty(frame.shape)
    for position, name in enumerate(frame):
        kde = GaussianKDE()
        kde.fit(frame[name])
        cdf_matrix[:, position] = kde.cumulative_distribution(frame[name])

    # Tree index 0, with one node per column of the frame.
    tree.fit(0, frame.shape[1], kendall_tau, cdf_matrix)

    # Run
    restored = Tree.from_dict(tree.to_dict())

    # Check
    assert restored.to_dict() == tree.to_dict()
def test_to_dict(self, kde_mock):
    """Check the dict produced by to_dict when the KDE model is mocked."""
    # Setup
    values = [
        0.4967141530112327,
        -0.13826430117118466,
        0.6476885381006925,
        1.5230298564080254,
        -0.23415337472333597,
        -0.23413695694918055,
        1.5792128155073915,
        0.7674347291529088,
        -0.4694743859349521,
        0.5425600435859647,
    ]
    column = np.array([values])

    mocked_model = kde_mock.return_value
    mocked_model.dataset = column
    mocked_model.resample.return_value = column

    kde = GaussianKDE()
    kde.fit(column)

    expected = {
        'type': 'copulas.univariate.gaussian_kde.GaussianKDE',
        'fitted': True,
        'dataset': [list(values)],
    }

    # Run
    serialized = kde.to_dict()

    # Check
    compare_nested_dicts(serialized, expected)
def test_percent_point(self, kde_mock, brentq_mock, cdf_mock):
    """Check percent_point with the KDE model, brentq and the cdf helper mocked.

    The value returned by the brentq mock must be what percent_point
    returns, and brentq must receive the cdf mock's return value together
    with the bounds -1000 and 1000.
    """
    # Setup
    mocked_model = kde_mock.return_value
    brentq_mock.return_value = -250.0
    cdf_mock.return_value = 'a nice scalar bounded method'

    training_values = np.array([1, 2, 3, 4, 5])
    kde = GaussianKDE()
    kde.fit(training_values)

    expected = np.array([-250.0])

    # Run
    actual = kde.percent_point([0.5])

    # Check
    assert actual == expected

    kde_mock.assert_called_once_with(training_values)
    # The model itself must not be called, nor any of its methods.
    mocked_model.assert_not_called()
    assert not mocked_model.method_calls
    brentq_mock.assert_called_once_with('a nice scalar bounded method', -1000, 1000)
def test_percent_point_invalid_value(self):
    """percent_point must raise ValueError when asked to evaluate 2.0."""
    kde = GaussianKDE()
    kde.fit(np.array([1, 2, 3, 4, 5]))

    invalid_input = np.array([2.])
    self.assertRaises(ValueError, kde.percent_point, invalid_input)
def test_fit_sample(self):
    """Fit on ``self.data`` and check that sampling yields a 1-D array of 50 values."""
    kde = GaussianKDE()
    kde.fit(self.data)

    draws = kde.sample(50)

    assert isinstance(draws, np.ndarray)
    assert draws.shape == (50,)
def test_to_dict_sample_size(self):
    """With ``sample_size=10`` the serialized dataset must hold 10 entries."""
    kde = GaussianKDE(sample_size=10)
    kde.fit(self.constant)

    serialized = kde.to_dict()

    assert serialized['type'] == 'copulas.univariate.gaussian_kde.GaussianKDE'
    assert len(serialized['dataset']) == 10
def test_fit_empty_data(self):
    """Fitting on an empty array must raise ValueError."""
    # Setup
    kde = GaussianKDE()
    empty = np.array([])

    # Run / Check
    self.assertRaises(ValueError, kde.fit, empty)
def test_pdf(self):
    """The density evaluated at sampled points must be strictly positive."""
    kde = GaussianKDE()
    kde.fit(self.data)
    draws = kde.sample(50)

    densities = kde.probability_density(draws)

    assert (densities > 0).all()
def test_percent_point_bisect(self):
    """Check percent_point with ``method='bisect'`` at low, middle and high probabilities."""
    kde = GaussianKDE()
    kde.fit(np.array([0.5, 1.0, 1.5]))

    probabilities = np.array([0.001, 0.5, 0.999])
    low, middle, high = kde.percent_point(probabilities, method='bisect')

    assert low < 0.0, "The 0.001th percentile should be small."
    assert abs(middle - 1.0) < 0.1, "The 50% percentile should be the median."
    assert high > 2.0, "The 0.999th percentile should be large."
def test_to_dict_constant(self):
    """Check the exact dict produced after fitting on ``self.constant``."""
    kde = GaussianKDE()
    kde.fit(self.constant)

    expected = {
        'type': 'copulas.univariate.gaussian_kde.GaussianKDE',
        'dataset': [5] * 100,
    }

    assert kde.to_dict() == expected
def test_fit_sample_constant(self):
    """After fitting on ``self.constant``, every sample must equal 5."""
    kde = GaussianKDE()
    kde.fit(self.constant)

    draws = kde.sample(50)

    assert isinstance(draws, np.ndarray)
    assert draws.shape == (50,)

    assert kde._constant_value == 5
    all_fives = np.full(50, 5)
    np.testing.assert_equal(all_fives, kde.sample(50))
def test_valid_serialization_fit_model(self):
    """A fitted model rebuilt with from_dict must serialize to the same dict."""
    # Setup
    original = GaussianKDE()
    original.fit(np.array([1, 2, 3, 4]))

    # Run
    rebuilt = GaussianKDE.from_dict(original.to_dict())

    # Check
    assert original.to_dict() == rebuilt.to_dict()
def test_sample(self, kde_mock):
    """sample must call the mocked model's resample and flatten its output."""
    kde = GaussianKDE()
    kde.fit(np.array([1, 2, 3, 4]))

    mocked_model = kde_mock.return_value
    # resample is mocked to return a 2-D array with a single row.
    mocked_model.resample.return_value = np.array([[1, 2, 3]])

    drawn = kde.sample(3)

    kde._model.resample.assert_called_once_with(3)
    np.testing.assert_equal(drawn, np.array([1, 2, 3]))
def setUp(self):
    """Load the iris data and fit a direct tree on its univariate CDF values.

    Stores the data frame, its Kendall tau matrix, the matrix of per-column
    cumulative distribution values and the fitted tree on the test instance.
    """
    self.data = pd.read_csv('data/iris.data.csv')
    self.tau_mat = self.data.corr(method='kendall').values
    self.u_matrix = np.empty(self.data.shape)

    for position, name in enumerate(self.data):
        kde = GaussianKDE()
        kde.fit(self.data[name])
        self.u_matrix[:, position] = kde.cumulative_distribution(self.data[name])

    self.tree = Tree(TreeTypes.DIRECT)
    self.tree.fit(0, 4, self.tau_mat, self.u_matrix)
def test_cumulative_distribution(self):
    """Check CDF values of a model fitted on data clustered around 1.0."""
    kde = GaussianKDE()
    kde.fit(np.array([0.9, 1.0, 1.1]))

    # Each query point is paired with the CDF value expected there.
    query_points = np.array([
        0.0,   # No data lies below this point.
        1.0,   # Half of the data lies below this point.
        2.0,   # All of the data lies below this point.
        -1.0,  # No data lies below this point.
    ])
    expected = np.array([0.0, 0.5, 1.0, 0.0])

    actual = kde.cumulative_distribution(query_points)

    assert np.isclose(actual, expected, atol=1e-3).all()
def test_fit_constant(self):
    """Fitting constant data must leave ``model`` unset and record the constant."""
    # Setup
    kde = GaussianKDE()
    all_ones = np.array([1, 1, 1, 1, 1])

    # Run
    kde.fit(all_ones)

    # Check
    assert kde.model is None
    assert kde.constant_value == 1
    assert kde.fitted is True
def test_probability_density(self, kde_mock):
    """probability_density must call the mocked model's evaluate exactly once.

    The array handed to ``evaluate`` and the array returned to the caller
    are both checked.
    """
    kde = GaussianKDE()
    kde.fit(np.array([1, 2, 3, 4]))

    mocked_model = kde_mock.return_value
    mocked_model.evaluate.return_value = np.array([0.1, 0.2, 0.3])

    densities = kde.probability_density(np.array([1, 2, 3]))

    evaluate = kde._model.evaluate
    assert evaluate.call_count == 1

    # call_args[0] holds the positional arguments of the last call.
    positional_args = evaluate.call_args[0]
    np.testing.assert_equal(positional_args[0], np.array([1, 2, 3]))
    np.testing.assert_equal(densities, np.array([0.1, 0.2, 0.3]))
def test_cdf(self):
    """CDF values must lie in [0, 1] and be non-decreasing over sorted inputs."""
    kde = GaussianKDE()
    kde.fit(self.data)
    draws = kde.sample(50)

    # Range check
    cdf_values = kde.cumulative_distribution(draws)
    assert (cdf_values >= 0).all()
    assert (cdf_values <= 1).all()

    # Monotonicity check on the sorted samples
    ordered_draws = sorted(draws)
    ordered_cdf = kde.cumulative_distribution(ordered_draws)
    assert (np.diff(ordered_cdf) >= 0).all()
def test_to_dict_from_dict(self):
    """A model rebuilt via to_dict / from_dict must give close PDF and CDF values."""
    original = GaussianKDE()
    original.fit(self.data)
    draws = original.sample(50)

    rebuilt = GaussianKDE.from_dict(original.to_dict())

    original_pdf = original.probability_density(draws)
    rebuilt_pdf = rebuilt.probability_density(draws)
    assert np.isclose(original_pdf, rebuilt_pdf, atol=0.01).all()

    original_cdf = original.cumulative_distribution(draws)
    rebuilt_cdf = rebuilt.cumulative_distribution(draws)
    assert np.isclose(original_cdf, rebuilt_cdf, atol=0.01).all()
def test_save_load(self):
    """A model written with save and read back with load must give close PDF and CDF values."""
    original = GaussianKDE()
    original.fit(self.data)
    draws = original.sample(50)

    # The model file lives inside the test's temporary directory.
    model_path = os.path.join(self.test_dir.name, "model.pkl")
    original.save(model_path)
    reloaded = GaussianKDE.load(model_path)

    original_pdf = original.probability_density(draws)
    reloaded_pdf = reloaded.probability_density(draws)
    assert np.isclose(original_pdf, reloaded_pdf, atol=0.01).all()

    original_cdf = original.cumulative_distribution(draws)
    reloaded_cdf = reloaded.cumulative_distribution(draws)
    assert np.isclose(original_cdf, reloaded_cdf, atol=0.01).all()
def test_to_dict_from_dict_constant(self):
    """A constant model and its from_dict copy must give the same samples, PDF and CDF."""
    original = GaussianKDE()
    original.fit(self.constant)

    draws = original.sample(50)
    original_pdf = original.probability_density(draws)
    original_cdf = original.cumulative_distribution(draws)

    rebuilt = GaussianKDE.from_dict(original.to_dict())

    all_fives = np.full(50, 5)
    all_ones = np.full(50, 1)

    # Samples
    np.testing.assert_equal(all_fives, draws)
    np.testing.assert_equal(all_fives, rebuilt.sample(50))

    # Probability density
    np.testing.assert_equal(all_ones, original_pdf)
    np.testing.assert_equal(all_ones, rebuilt.probability_density(draws))

    # Cumulative distribution
    np.testing.assert_equal(all_ones, original_cdf)
    np.testing.assert_equal(all_ones, rebuilt.cumulative_distribution(draws))
def test_fit(self, kde_mock):
    """fit must build the mocked KDE from the data and expose it on the instance."""
    # Setup
    kde = GaussianKDE()
    training_values = np.array([1, 2, 3, 4, 5])

    # The fake model's ``evaluate`` is a plain marker string so that it can
    # be compared against ``probability_density`` below.
    fake_model = MagicMock(evaluate='pdf')
    kde_mock.return_value = fake_model

    # Run
    kde.fit(training_values)

    # Check
    assert kde.model == fake_model
    assert kde.fitted is True
    assert kde.constant_value is None
    assert kde.probability_density == 'pdf'

    kde_mock.assert_called_once_with(training_values)
def test_sample(self, kde_mock):
    """sample must return the flattened output of the mocked model's resample."""
    # Setup
    mocked_model = kde_mock.return_value
    mocked_model.resample.return_value = np.array([[0, 1, 0, 1, 0]])

    kde = GaussianKDE()
    training_values = np.array([1, 2, 3, 4, 5])
    kde.fit(training_values)

    expected = np.array([0, 1, 0, 1, 0])

    # Run
    drawn = kde.sample(5)

    # Check
    compare_nested_iterables(drawn, expected)

    assert kde.model == mocked_model
    kde_mock.assert_called_once_with(training_values)
    mocked_model.resample.assert_called_once_with(5)
def test_probability_density(self, kde_mock):
    """probability_density must return what the mocked model's evaluate returns."""
    # Setup
    mocked_model = kde_mock.return_value
    mocked_model.evaluate.return_value = np.array([0.0, 0.5, 1.0])

    training_values = np.array([1, 2, 3, 4, 5])
    kde = GaussianKDE()
    kde.fit(training_values)

    query_points = np.array([-10, 0, 10])
    expected = np.array([0.0, 0.5, 1.0])

    # Run
    densities = kde.probability_density(query_points)

    # Check
    compare_nested_iterables(densities, expected)

    kde_mock.assert_called_once_with(training_values)
    mocked_model.evaluate.assert_called_once_with(query_points)
def test_to_dict(self):
    """Check every key of the dict produced by to_dict for a fitted model."""
    # Setup
    kde = GaussianKDE()
    values = [
        0.4967141530112327,
        -0.13826430117118466,
        0.6476885381006925,
        1.5230298564080254,
        -0.23415337472333597,
        -0.23413695694918055,
        1.5792128155073915,
        0.7674347291529088,
        -0.4694743859349521,
        0.5425600435859647,
    ]
    column = np.array([values])
    kde.fit(column)

    expected = {
        'type': 'copulas.univariate.gaussian_kde.GaussianKDE',
        'fitted': True,
        'constant_value': None,
        'd': 1,
        'n': 10,
        'dataset': [list(values)],
        'covariance': [[0.20810696044195218]],
        'factor': 0.6309573444801932,
        'inv_cov': [[4.805221304834407]],
    }

    # Run
    serialized = kde.to_dict()

    # Check
    compare_nested_dicts(serialized, expected)
def test_to_dict_fit_model(self):
    """Check the full dict produced by to_dict for a fitted regular tree."""
    # Setup
    tree = get_tree(TreeTypes.REGULAR)
    rows = [
        [1, 0, 0],
        [0, 1, 0],
        [0, 0, 1],
    ]
    frame = pd.DataFrame(data=rows)

    kendall_tau = frame.corr(method='kendall').values
    cdf_matrix = np.empty(frame.shape)
    for position, name in enumerate(frame):
        kde = GaussianKDE()
        kde.fit(frame[name])
        cdf_matrix[:, position] = kde.cumulative_distribution(frame[name])

    # Tree index 0, with one node per column of the frame.
    tree.fit(0, frame.shape[1], kendall_tau, cdf_matrix)

    # Values that repeat throughout the expected structure.
    diagonal = 0.8230112726144534
    off_diagonal = 0.3384880496294825
    tau = -0.49999999999999994
    theta = -5.736282443655552

    def expected_edge(index, left, right, conditional_values):
        # Each edge gets its own fresh set and list; key order follows the
        # layout of the expected edge dictionaries.
        return {
            'index': index,
            'D': set(),
            'L': left,
            'R': right,
            'U': conditional_values,
            'likelihood': None,
            'name': CopulaTypes.FRANK,
            'neighbors': [],
            'parents': None,
            'tau': tau,
            'theta': theta,
        }

    expected = {
        'type': 'copulas.multivariate.tree.RegularTree',
        'fitted': True,
        'level': 1,
        'n_nodes': 3,
        'previous_tree': [
            [diagonal, off_diagonal, off_diagonal],
            [off_diagonal, diagonal, off_diagonal],
            [off_diagonal, off_diagonal, diagonal],
        ],
        'tau_matrix': [
            [1.0, tau, tau],
            [tau, 1.0, tau],
            [tau, tau, 1.0],
        ],
        'tree_type': TreeTypes.REGULAR,
        'edges': [
            expected_edge(0, 0, 1, [
                [0.7969636014074211, 0.6887638642325501, 0.12078520049364487],
                [0.6887638642325501, 0.7969636014074211, 0.12078520049364487],
            ]),
            expected_edge(1, 1, 2, [
                [0.12078520049364491, 0.7969636014074213, 0.6887638642325501],
                [0.12078520049364491, 0.6887638642325503, 0.7969636014074211],
            ]),
        ],
    }

    # Run
    serialized = tree.to_dict()

    # Check
    compare_nested_dicts(serialized, expected)
def test__is_constant_false(self):
    """_is_constant must be falsy after fitting on varying data."""
    kde = GaussianKDE()
    varying = np.array([1, 2, 3, 4])
    kde.fit(varying)

    assert not kde._is_constant()
def test__is_constant_true(self):
    """_is_constant must be truthy after fitting on identical values."""
    kde = GaussianKDE()
    repeated = np.array([1, 1, 1, 1])
    kde.fit(repeated)

    assert kde._is_constant()