    def test_cdf(self):
        model = GaussianKDE()
        model.fit(self.data)

        sampled_data = model.sample(50)

        # Test the CDF
        cdf = model.cumulative_distribution(sampled_data)
        assert (0 <= cdf).all() and (cdf <= 1).all()

        # Test that the CDF is non-decreasing
        sorted_data = sorted(sampled_data)
        cdf = model.cumulative_distribution(sorted_data)
        assert (np.diff(cdf) >= 0).all()
    def test_second_tree_likelihood(self):
        """Assert second tree likelihood is correct."""
        # Setup
        # Build first tree
        data = pd.read_csv('data/iris.data.csv')
        tau_mat = data.corr(method='kendall').values
        u_matrix = np.empty(data.shape)

        for index, col in enumerate(data):
            uni = GaussianKDE()
            uni.fit(data[col])
            u_matrix[:, index] = uni.cumulative_distribution(data[col])

        first_tree = get_tree(TreeTypes.CENTER)
        first_tree.fit(0, 4, tau_mat, u_matrix)
        uni_matrix = np.array([[0.1, 0.2, 0.3, 0.4]])
        likelihood_first_tree, conditional_uni_first = first_tree.get_likelihood(uni_matrix)
        tau = first_tree.get_tau_matrix()

        # Build second tree
        second_tree = get_tree(TreeTypes.CENTER)
        second_tree.fit(1, 3, tau, first_tree)
        expected_likelihood_second_tree = 0.4888802429313932

        # Run
        likelihood_second_tree, out_u = second_tree.get_likelihood(conditional_uni_first)

        # Check
        assert compare_values_epsilon(likelihood_second_tree, expected_likelihood_second_tree)
    def test_serialization_fit_model(self):
        # Setup
        instance = get_tree(TreeTypes.REGULAR)
        X = pd.DataFrame(data=[
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1]
        ])
        index = 0
        n_nodes = X.shape[1]
        tau_matrix = X.corr(method='kendall').values

        univariates_matrix = np.empty(X.shape)
        for i, column in enumerate(X):
            distribution = GaussianKDE()
            distribution.fit(X[column])
            univariates_matrix[:, i] = distribution.cumulative_distribution(X[column])

        instance.fit(index, n_nodes, tau_matrix, univariates_matrix)

        # Run
        result = Tree.from_dict(instance.to_dict())

        # Check
        assert result.to_dict() == instance.to_dict()
    def test_cumulative_distribution(self, kde_mock):
        """cumulative_distribution evaluates with the model."""
        # Setup
        model_mock = kde_mock.return_value
        model_mock.integrate_box_1d.side_effect = [0.0, 0.5, 1.0]

        model_mock.dataset = MagicMock()
        model_mock.dataset.mean.return_value = 1
        model_mock.dataset.std.return_value = 0.1

        fit_data = np.array([1, 2, 3, 4, 5])
        instance = GaussianKDE()
        instance.fit(fit_data)

        call_data = np.array([-10, 0, 10])
        expected_result = np.array([0.0, 0.5, 1.0])

        expected_integrate_1d_box_call_args_list = [
            # The first argument is the lower_bound: mean - 5 * std = 1 - 0.1 * 5 = 0.5
            ((0.5, -10), {}),
            ((0.5, 0), {}),
            ((0.5, 10), {}),
        ]

        # Run
        result = instance.cumulative_distribution(call_data)

        # Check
        compare_nested_iterables(result, expected_result)

        kde_mock.assert_called_once_with(fit_data)
        assert (model_mock.integrate_box_1d.call_args_list ==
                expected_integrate_1d_box_call_args_list)
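
For reference, the mocked integrate_box_1d calls above mirror how a KDE-backed CDF can be computed directly with scipy.stats.gaussian_kde. The sketch below is an illustration only: the lower integration bound of mean - 5 * std is an assumption read off the expected call args, not necessarily the library's exact implementation.

import numpy as np
from scipy.stats import gaussian_kde

# Hypothetical standalone version of the CDF computation exercised by the mock above.
train = np.array([1, 2, 3, 4, 5])
kde = gaussian_kde(train)

# Assumption: integrate from mean - 5 * std of the training data up to each point.
lower_bound = train.mean() - 5 * train.std()

points = np.array([-10, 0, 10])
cdf = np.array([kde.integrate_box_1d(lower_bound, x) for x in points])
print(cdf)  # values near 0 well below the data and near 1 well above it
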
    def setUp(self):
        self.data = pd.read_csv('data/iris.data.csv')
        self.tau_mat = self.data.corr(method='kendall').values
        self.u_matrix = np.empty(self.data.shape)

        for index, col in enumerate(self.data):
            uni = GaussianKDE()
            uni.fit(self.data[col])
            self.u_matrix[:, index] = uni.cumulative_distribution(self.data[col])

        self.tree = Tree(TreeTypes.DIRECT)
        self.tree.fit(0, 4, self.tau_mat, self.u_matrix)
    def test_cumulative_distribution(self):
        """cumulative_distribution evaluates with the model."""
        instance = GaussianKDE()
        instance.fit(np.array([0.9, 1.0, 1.1]))

        cdf = instance.cumulative_distribution(np.array([
            0.0,  # There is no data below this (cdf = 0.0).
            1.0,  # Half the data is below this (cdf = 0.5).
            2.0,  # All the data is below this (cdf = 1.0).
            -1.0  # There is no data below this (cdf = 0.0).
        ]))

        assert np.all(np.isclose(cdf, np.array([0.0, 0.5, 1.0, 0.0]), atol=1e-3))
    def test_to_dict_from_dict(self):
        model = GaussianKDE()
        model.fit(self.data)

        sampled_data = model.sample(50)

        params = model.to_dict()
        model2 = GaussianKDE.from_dict(params)

        pdf = model.probability_density(sampled_data)
        pdf2 = model2.probability_density(sampled_data)
        assert np.all(np.isclose(pdf, pdf2, atol=0.01))

        cdf = model.cumulative_distribution(sampled_data)
        cdf2 = model2.cumulative_distribution(sampled_data)
        assert np.all(np.isclose(cdf, cdf2, atol=0.01))
    def test_save_load(self):
        model = GaussianKDE()
        model.fit(self.data)

        sampled_data = model.sample(50)

        path_to_model = os.path.join(self.test_dir.name, "model.pkl")
        model.save(path_to_model)
        model2 = GaussianKDE.load(path_to_model)

        pdf = model.probability_density(sampled_data)
        pdf2 = model2.probability_density(sampled_data)
        assert np.all(np.isclose(pdf, pdf2, atol=0.01))

        cdf = model.cumulative_distribution(sampled_data)
        cdf2 = model2.cumulative_distribution(sampled_data)
        assert np.all(np.isclose(cdf, cdf2, atol=0.01))
    def test_to_dict_from_dict_constant(self):
        model = GaussianKDE()
        model.fit(self.constant)

        sampled_data = model.sample(50)
        pdf = model.probability_density(sampled_data)
        cdf = model.cumulative_distribution(sampled_data)

        params = model.to_dict()
        model2 = GaussianKDE.from_dict(params)

        np.testing.assert_equal(np.full(50, 5), sampled_data)
        np.testing.assert_equal(np.full(50, 5), model2.sample(50))
        np.testing.assert_equal(np.full(50, 1), pdf)
        np.testing.assert_equal(np.full(50, 1), model2.probability_density(sampled_data))
        np.testing.assert_equal(np.full(50, 1), cdf)
        np.testing.assert_equal(np.full(50, 1), model2.cumulative_distribution(sampled_data))
    def test_to_dict_fit_model(self):
        # Setup
        instance = get_tree(TreeTypes.REGULAR)
        X = pd.DataFrame(data=[
            [1, 0, 0],
            [0, 1, 0],
            [0, 0, 1]
        ])
        index = 0
        n_nodes = X.shape[1]
        tau_matrix = X.corr(method='kendall').values

        univariates_matrix = np.empty(X.shape)
        for i, column in enumerate(X):
            distribution = GaussianKDE()
            distribution.fit(X[column])
            univariates_matrix[:, i] = distribution.cumulative_distribution(X[column])

        instance.fit(index, n_nodes, tau_matrix, univariates_matrix)
        expected_result = {
            'type': 'copulas.multivariate.tree.RegularTree',
            'fitted': True,
            'level': 1,
            'n_nodes': 3,
            'previous_tree': [
                [0.8230112726144534, 0.3384880496294825, 0.3384880496294825],
                [0.3384880496294825, 0.8230112726144534, 0.3384880496294825],
                [0.3384880496294825, 0.3384880496294825, 0.8230112726144534]
            ],
            'tau_matrix': [
                [1.0, -0.49999999999999994, -0.49999999999999994],
                [-0.49999999999999994, 1.0, -0.49999999999999994],
                [-0.49999999999999994, -0.49999999999999994, 1.0]
            ],
            'tree_type': TreeTypes.REGULAR,
            'edges': [
                {
                    'index': 0,
                    'D': set(),
                    'L': 0,
                    'R': 1,
                    'U': [
                        [0.7969636014074211, 0.6887638642325501, 0.12078520049364487],
                        [0.6887638642325501, 0.7969636014074211, 0.12078520049364487]
                    ],
                    'likelihood': None,
                    'name': CopulaTypes.FRANK,
                    'neighbors': [],
                    'parents': None,
                    'tau': -0.49999999999999994,
                    'theta': -5.736282443655552
                },
                {
                    'index': 1,
                    'D': set(),
                    'L': 1,
                    'R': 2,
                    'U': [
                        [0.12078520049364491, 0.7969636014074213, 0.6887638642325501],
                        [0.12078520049364491, 0.6887638642325503, 0.7969636014074211]
                    ],
                    'likelihood': None,
                    'name': CopulaTypes.FRANK,
                    'neighbors': [],
                    'parents': None,
                    'tau': -0.49999999999999994,
                    'theta': -5.736282443655552
                }
            ],
        }

        # Run
        result = instance.to_dict()

        # Check
        compare_nested_dicts(result, expected_result)
    def test_percent_point_invertibility(self):
        """percent_point inverts cumulative_distribution within tolerance."""
        instance = GaussianKDE()
        instance.fit(sample_univariate_bimodal())

        cdf = np.random.random(size=1000)
        x = instance.percent_point(cdf)

        assert np.abs(instance.cumulative_distribution(x) - cdf).max() < 1e-6
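
The same round-trip idea can be reproduced outside the library by numerically inverting a plain scipy.stats.gaussian_kde CDF. The sketch below uses brentq for the inversion; the choice of integration bounds well outside the data range is an assumption made for illustration, not the library's method.

import numpy as np
from scipy.optimize import brentq
from scipy.stats import gaussian_kde

data = np.random.normal(size=500)
kde = gaussian_kde(data)
lower, upper = data.min() - 10, data.max() + 10

def kde_cdf(value):
    # Approximate the CDF by integrating the KDE from a point far below the data.
    return kde.integrate_box_1d(lower, value)

u = np.random.random(size=20)
# Invert the CDF point by point; brentq needs the root bracketed in [lower, upper].
x = np.array([brentq(lambda v: kde_cdf(v) - ui, lower, upper) for ui in u])

assert np.abs(np.array([kde_cdf(xi) for xi in x]) - u).max() < 1e-6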