Ejemplo n.º 1
0
    def test_em_with_close_priors(self):
        """If the priors are close to real parameters, the result is very accurate"""
        df, sm, node_states, true_lv_values = naive_bayes_plus_parents(
            p_c=0.6,
            parents=1,
            children=2,
            percentage_not_missing=0,
        )
        correct_cpds = get_correct_cpds(df, sm, node_states, true_lv_values)
        priors = EMSingleLatentVariable.get_default_priors(sm, node_states, "z")
        cte = 200  # prior strength: scales the true CPDs into pseudo-counts

        # Setting priors (the original comment said "boxes", but this loop
        # fills the *prior* tables with scaled copies of the true CPDs)
        for el in ["c_1", "c_0", "z"]:
            priors[el].loc[:] = correct_cpds[el] * cte

        em = EMSingleLatentVariable(
            data=df,
            sm=sm,
            node_states=node_states,
            lv_name="z",
            priors=priors,
        )
        em.run(n_runs=20, stopping_delta=0.01, verbose=2)

        max_error, rmse_error = compare_result_with_ideal(
            em.cpds, sm, df, true_lv_values, node_states
        )
        assert max_error < 0.01
        assert rmse_error < 4e-3
Ejemplo n.º 2
0
    def test_em_with_close_box_constraints(self):
        """
        Test EM with box constraints that are close to real parameters.
        The result should be very accurate
        """
        df, sm, node_states, true_lv_values = naive_bayes_plus_parents(
            p_c=0.6,
            parents=1,
            children=2,
            percentage_not_missing=0,
            samples=5000,
        )
        correct_cpds = get_correct_cpds(df, sm, node_states, true_lv_values)
        box = EMSingleLatentVariable.get_default_box(sm, node_states, "z")

        # Pin every parameter to a tight interval around its true value
        eps = 0.0001
        for node in ("c_1", "c_0", "z"):
            lower, upper = box[node]
            lower.loc[:] = correct_cpds[node] - eps
            upper.loc[:] = correct_cpds[node] + eps

        em = EMSingleLatentVariable(
            data=df, sm=sm, node_states=node_states, lv_name="z", box_constraints=box
        )
        em.run(n_runs=20, stopping_delta=0.01, verbose=2)

        max_error, rmse_error = compare_result_with_ideal(
            em.cpds, sm, df, true_lv_values, node_states
        )
        assert max_error < 0.0002
        assert rmse_error < 1e-4
Ejemplo n.º 3
0
    def test_initial_params_provided(self):
        """
        When initial parameters close to the real ones are provided,
        EM should converge to values close to the real ones — though still
        not identical to fitting with the missing values fully observed.
        """
        df, sm, node_states, true_lv_values = naive_bayes_plus_parents(
            p_c=0.6,
            parents=1,
            children=2,
            percentage_not_missing=0,
            samples=5000,
        )
        initial = get_correct_cpds(df, sm, node_states, true_lv_values)
        initial.pop("p_0", None)  # exclude p_0 from the provided initialisation

        em = EMSingleLatentVariable(
            data=df,
            sm=sm,
            node_states=node_states,
            lv_name="z",
            initial_params=initial,
        )
        em.run(n_runs=20, stopping_delta=0.001, verbose=2)

        max_error, rmse_error = compare_result_with_ideal(
            em.cpds, sm, df, true_lv_values, node_states
        )
        assert max_error < 0.01
        assert rmse_error < 4e-3
Ejemplo n.º 4
0
    def test_em_with_priors(self):
        """Test some specific priors chosen"""
        df, sm, node_states, true_lv_values = naive_bayes_plus_parents(
            p_c=0.6,
            parents=1,
            children=2,
            percentage_not_missing=0,
        )
        # Hand-picked prior distributions for the LV and its children
        priors = EMSingleLatentVariable.get_default_priors(sm, node_states, "z")
        priors["c_0"].loc[:] = [[0.61, 0.0, 0.34], [0.39, 0.6, 0.0], [0.0, 0.4, 0.66]]
        priors["c_1"].loc[:] = [[0.61, 0.0, 0.4], [0.39, 0.6, 0.0], [0.0, 0.4, 0.6]]
        priors["z"].loc[:] = [[0.91, 0, 0.08], [0.09, 0.89, 0], [0, 0.11, 0.92]]

        # Apply a common prior strength to every table
        strength = 70
        for node in ("c_1", "c_0", "z"):
            priors[node] = priors[node] * strength

        em = EMSingleLatentVariable(
            data=df, sm=sm, node_states=node_states, lv_name="z", priors=priors
        )
        em.run(n_runs=20, stopping_delta=0.01)

        max_error, rmse_error = compare_result_with_ideal(
            em.cpds, sm, df, true_lv_values, node_states
        )
        assert max_error < 0.02
        assert rmse_error < 1e-2
Ejemplo n.º 5
0
    def test_em_likelihood_always_go_up(self, n_jobs):
        """Test convergence properties of EM algorithm: likelihood never decreases"""
        df, sm, node_states, _ = naive_bayes_plus_parents(
            parents=2,
            percentage_not_missing=0.1,
            samples=500,
            categories=2,
        )
        em = EMSingleLatentVariable(
            data=df,
            sm=sm,
            node_states=node_states,
            lv_name="z",
            n_jobs=n_jobs,
        )
        previous_likelihood = -np.inf

        for _ in range(50):
            current_likelihood = em.compute_total_likelihood()
            # Each EM alternation must strictly improve the total likelihood
            assert current_likelihood > previous_likelihood
            previous_likelihood = current_likelihood

            # One full alternation, then check for convergence
            em.e_step()
            em.m_step()
            em.apply_box_constraints()
            delta = em._stopping_criteria()  # pylint: disable=protected-access

            if delta < 0.01:
                break
Ejemplo n.º 6
0
    def test_default_boxes_do_not_affect_result(self):
        """Test EM with box constraints that do not affect the end results"""
        df, sm, node_states, _ = naive_bayes_plus_parents(
            p_c=0.6,
            parents=1,
            children=2,
            percentage_not_missing=0,
        )
        common_kwargs = {
            "data": df,
            "sm": sm,
            "node_states": node_states,
            "lv_name": "z",
        }

        # Run once with the (trivial) default box constraints...
        box = EMSingleLatentVariable.get_default_box(sm, node_states, "z")
        em_box = EMSingleLatentVariable(box_constraints=box, **common_kwargs)
        em_box.run(n_runs=20, stopping_delta=0.06)

        # ...and once without any constraints at all
        em_no_box = EMSingleLatentVariable(**common_kwargs)
        em_no_box.run(n_runs=20, stopping_delta=0.06)

        # Both runs must produce numerically identical CPDs
        for node, cpd in em_box.cpds.items():
            assert np.all((cpd - em_no_box.cpds[node]).abs() < 1e-15)
Ejemplo n.º 7
0
 def test_invalid_priors(self, priors):
     """Test EM with invalid priors type"""
     # Build valid fixtures *before* entering pytest.raises so that only the
     # constructor under test can raise the expected ValueError
     df, sm, node_states, _ = naive_bayes_plus_parents()

     with pytest.raises(ValueError, match=r"Invalid priors *"):
         EMSingleLatentVariable(
             data=df, sm=sm, node_states=node_states, lv_name="z", priors=priors
         )
Ejemplo n.º 8
0
    def test_em_no_missing_data(self, n_jobs):
        """If all data for the latent variable is provided, the result is the same as running bn.fit_cpds"""
        df, sm, node_states, true_lv_values = naive_bayes_plus_parents(
            percentage_not_missing=1
        )
        estimator = EMSingleLatentVariable(
            data=df, sm=sm, node_states=node_states, lv_name="z", n_jobs=n_jobs
        )
        estimator.run(n_runs=50, stopping_delta=0.001, verbose=2)

        max_error, _ = compare_result_with_ideal(
            estimator.cpds, sm, df, true_lv_values, node_states
        )
        # Fully observed latent variable => exact match with the ideal fit
        assert max_error == 0
Ejemplo n.º 9
0
    def test_get_default_priors(self):
        """`get_default_priors` should return an all-zero prior table for each relevant node"""
        _, sm, node_states, _ = naive_bayes_plus_parents(
            p_c=0.6, parents=3, percentage_not_missing=0.02
        )
        default_priors = EMSingleLatentVariable.get_default_priors(sm, node_states, "z")
        # One table per node whose CPD is (re-)estimated by EM
        assert default_priors.keys() == {"c_0", "c_1", "c_2", "z"}

        # Default priors are uninformative: every pseudo-count is zero
        for k in default_priors:
            assert np.all(default_priors[k] == 0)
Ejemplo n.º 10
0
    def test_em_no_parents(self, n_jobs):
        """Test EM algorithm on pure naive Bayes structure without parents"""
        df, sm, node_states, true_lv_values = naive_bayes_plus_parents(
            p_c=0.6, parents=0, percentage_not_missing=0.02
        )
        estimator = EMSingleLatentVariable(
            data=df, sm=sm, node_states=node_states, lv_name="z", n_jobs=n_jobs
        )
        estimator.run(n_runs=50, stopping_delta=0.001, verbose=2)

        max_error, rmse_error = compare_result_with_ideal(
            estimator.cpds, sm, df, true_lv_values, node_states
        )
        # Loose tolerances: only a small fraction of the LV values are observed
        assert max_error < 0.02
        assert rmse_error < 1e-2
Ejemplo n.º 11
0
 def test_invalid_box_constraints(self, box_constraints):
     """Test EM with invalid box constraint type"""
     # Build valid fixtures *before* entering pytest.raises so that only the
     # constructor under test can raise the expected ValueError
     df, sm, node_states, _ = naive_bayes_plus_parents()

     with pytest.raises(ValueError, match=r"Invalid box constraints *"):
         EMSingleLatentVariable(
             data=df,
             sm=sm,
             node_states=node_states,
             lv_name="z",
             box_constraints=box_constraints,
         )
Ejemplo n.º 12
0
    def test_default_boxes(self):
        """Test EM with default box constraints: every parameter is boxed to [0, 1]"""
        _, sm, node_states, _ = naive_bayes_plus_parents(
            p_c=0.6,
            parents=3,
            percentage_not_missing=0.02,
        )
        # Renamed from `priors`: get_default_box returns box constraints, not priors
        boxes = EMSingleLatentVariable.get_default_box(sm, node_states, "z")
        assert boxes.keys() == {"c_0", "c_1", "c_2", "z"}

        # Each entry is a (min, max) pair of tables; defaults are the trivial box
        for min_df, max_df in boxes.values():
            assert np.all(min_df == 0)
            assert np.all(max_df == 1)
Ejemplo n.º 13
0
 def test_invalid_initial_params_dict(self, initial_params):
     """An error should be raised if `initial_params` is a dict that does not map valid nodes to CPTs"""
     # Build valid fixtures *before* entering pytest.raises so that only the
     # constructor under test can raise the expected ValueError
     df, sm, node_states, _ = naive_bayes_plus_parents()

     with pytest.raises(
         ValueError,
         match=r"If `initial_params` is a dictionary, it has to map `valid nodes` to corresponding CPTs. .*",
     ):
         EMSingleLatentVariable(
             data=df,
             sm=sm,
             node_states=node_states,
             lv_name="z",
             initial_params=initial_params,
         )
Ejemplo n.º 14
0
 def test_invalid_initial_params(self, initial_params):
     """An error should be raised when `initial_params` is neither a dict nor a recognised keyword"""
     # Build valid fixtures *before* entering pytest.raises so that only the
     # constructor under test can raise the expected ValueError
     df, sm, node_states, _ = naive_bayes_plus_parents()

     with pytest.raises(
         ValueError,
         match=r"`initial_params` must be a dictionary or one of .*",
     ):
         EMSingleLatentVariable(
             data=df,
             sm=sm,
             node_states=node_states,
             lv_name="z",
             initial_params=initial_params,
         )
Ejemplo n.º 15
0
    def test_em_missing_data(self, n_jobs):
        """Test EM algorithm given some "missing" data"""
        df, sm, node_states, true_lv_values = naive_bayes_plus_parents(
            p_c=0.6,
            p_z=0.6,
            parents=1,
            percentage_not_missing=0.25,
            samples=5000,
            categories=2,
        )
        estimator = EMSingleLatentVariable(
            data=df, sm=sm, node_states=node_states, lv_name="z", n_jobs=n_jobs
        )
        estimator.run(n_runs=50, stopping_delta=0.001, verbose=2)

        max_error, rmse_error = compare_result_with_ideal(
            estimator.cpds, sm, df, true_lv_values, node_states
        )
        assert max_error < 0.02
        assert rmse_error < 1e-2
Ejemplo n.º 16
0
    def fit_latent_cpds(  # pylint: disable=too-many-arguments
        self,
        lv_name: str,
        lv_states: List,
        data: pd.DataFrame,
        box_constraints: Optional[Dict[str, Tuple[pd.DataFrame, pd.DataFrame]]] = None,
        priors: Optional[Dict[str, pd.DataFrame]] = None,
        initial_params: Union[str, Dict[str, pd.DataFrame]] = "random",
        non_missing_data_factor: int = 1,
        n_runs: int = 20,
        stopping_delta: float = 0.0,
    ) -> "BayesianNetwork":
        """
        This runs the EM algorithm to estimate the CPDs of latent variables and their corresponding Markov blanket

        Args:
            lv_name: Latent variable name
            lv_states: the states the LV can assume
            data: dataframe, must contain all variables in the Markov Blanket of the latent variable. Include one column
                with the latent variable name, filled with np.nan for missing info about LV.
                If some data is present about the LV, create complete columns.
            n_runs: max number of EM alternations
            stopping_delta: if max difference in current - last iteration CPDS < stopping_delta => convergence reached
            initial_params: way to initialise parameters. Can be:
                - "random": random values (default)
                - "avg": uniform distributions everywhere. Not advised, as it may be a stationary point itself
                - if a dictionary of dataframes is provided, it will be used as the initialisation
            box_constraints: minimum and maximum values for each model parameter. Specified with a dictionary mapping:
                - Node
                - two dataframes, in order: Min(P(Node|Par(Node))) and Max(P(Node|Par(Node)))
            priors: priors, provided as a mapping Node -> dataframe with Dirichlet priors for P(Node|Par(Node))
            non_missing_data_factor:
                This is a weight added to the non-missing data samples. The effect is as if the amount of data provided
                was bigger. Empirically, helps to set the factor to 10 if the non missing data is ~1% of the dataset

        Returns:
            self

        Raises:
            ValueError: if the latent variable is not a string or
                if the latent variable cannot be found in the network or
                if the latent variable is present/observed in the data
                if the latent variable states are empty
        """
        # Validate inputs before mutating any state on the network
        if not isinstance(lv_name, str):
            raise ValueError(f"Invalid latent variable name '{lv_name}'")
        if lv_name not in self._structure:
            raise ValueError(f"Latent variable '{lv_name}' not added to the network")
        if not isinstance(lv_states, list) or len(lv_states) == 0:
            raise ValueError(f"Latent variable '{lv_name}' contains no states")

        # Register states for the latent variable (sorted for a deterministic state -> index mapping)
        self._node_states[lv_name] = {v: k for k, v in enumerate(sorted(lv_states))}

        # Run EM algorithm; node states are sorted so CPT row/column order is deterministic
        estimator = EMSingleLatentVariable(
            sm=self.structure,
            data=data,
            lv_name=lv_name,
            node_states={n: sorted(s) for n, s in self.node_states.items()},
            initial_params=initial_params,
            box_constraints=box_constraints,
            priors=priors,
            non_missing_data_factor=non_missing_data_factor,
        )
        estimator.run(n_runs=n_runs, stopping_delta=stopping_delta)

        # Add the estimated CPDs into the underlying model (converted to tabular form)
        tab_cpds = [pd_to_tabular_cpd(el) for el in estimator.cpds.values()]
        self._model.add_cpds(*tab_cpds)

        return self