def sample(self, start_state=None, size=1, return_type="dataframe"):
        """
        Run the chain and collect the visited states as samples.

        The first returned sample is the state the chain starts from; each
        following sample is obtained by re-drawing every variable once, in
        the order in which the variables appear in ``self.state``.

        Parameters
        ----------
        start_state: dict or array-like iterable
            Representing the starting states of the variables. It is passed
            to ``set_start_state``. If None is passed, the current state of
            the sampler is kept, or a random one is chosen when the sampler
            has no state yet.
        size: int
            Number of samples to be generated.
        return_type: string (dataframe | recarray)
            Return type for samples, either of 'dataframe' or 'recarray'.
            Defaults to 'dataframe'

        Returns
        -------
        sampled: A pandas.DataFrame or a numpy.recarray object depending upon return_type argument
            the generated samples

        Examples
        --------
        >>> from ProbabilityModel.factors import DiscreteFactor
        >>> from ProbabilityModel.sampling import GibbsSampling
        >>> from ProbabilityModel.models import MarkovModel
        >>> model = MarkovModel([('A', 'B'), ('C', 'B')])
        >>> factor_ab = DiscreteFactor(['A', 'B'], [2, 2], [1, 2, 3, 4])
        >>> factor_cb = DiscreteFactor(['C', 'B'], [2, 2], [5, 6, 7, 8])
        >>> model.add_factors(factor_ab, factor_cb)
        >>> gibbs = GibbsSampling(model)
        >>> gibbs.sample(size=4, return_type='dataframe')
           A  B  C
        0  0  1  1
        1  1  0  0
        2  1  1  0
        3  1  1  1
        """
        # An explicit start state always wins; otherwise only initialise the
        # chain when it has never been given a state.
        if start_state is not None:
            self.set_start_state(start_state)
        elif self.state is None:
            self.state = self.random_state()

        dtype_spec = [(name, "int") for name in self.variables]
        sampled = np.zeros(size, dtype=dtype_spec).view(np.recarray)

        # Row 0 records the state the chain starts from.
        sampled[0] = tuple(value for _, value in self.state)

        for step in tqdm(range(size - 1)):
            # One sweep: every variable is re-drawn in turn, using the
            # transition model entry selected by the other variables' states.
            for position, (var, _) in enumerate(self.state):
                others = tuple(
                    value for name, value in self.state if var != name)
                candidates = list(range(self.cardinalities[var]))
                distribution = self.transition_models[var][others]
                drawn = sample_discrete(candidates, distribution)[0]
                self.state[position] = State(var, drawn)
            sampled[step + 1] = tuple(value for _, value in self.state)

        return _return_samples(return_type, sampled)
    def forward_sample(self, size=1, return_type="dataframe"):
        """
        Generates sample(s) from joint distribution of the bayesian network.

        Nodes are filled in one at a time following ``self.topological_order``,
        so the columns a node's CPD depends on are already populated when the
        node itself is drawn.

        Parameters
        ----------
        size: int
            size of sample to be generated

        return_type: string (dataframe | recarray)
            Return type for samples, either of 'dataframe' or 'recarray'.
            Defaults to 'dataframe'

        Returns
        -------
        sampled: A pandas.DataFrame or a numpy.recarray object depending upon return_type argument
            the generated samples


        Examples
        --------
        >>> from ProbabilityModel.models.BayesianModel import BayesianModel
        >>> from ProbabilityModel.factors.discrete import TabularCPD
        >>> from ProbabilityModel.sampling import BayesianModelSampling
        >>> student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
        >>> cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
        >>> cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
        >>> cpd_g = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25,
        ...                0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
        ...                ['intel', 'diff'], [2, 2])
        >>> student.add_cpds(cpd_d, cpd_i, cpd_g)
        >>> inference = BayesianModelSampling(student)
        >>> inference.forward_sample(size=2, return_type='recarray')
        rec.array([(0, 0, 1), (1, 0, 2)], dtype=
                  [('diff', '<i8'), ('intel', '<i8'), ('grade', '<i8')])
        """
        dtype_spec = [(name, "int") for name in self.topological_order]
        sampled = np.zeros(size, dtype=dtype_spec).view(np.recarray)

        progress = tqdm(self.topological_order)
        for node in progress:
            progress.set_description(f"Generating for node: {node}")
            cpd = self.model.get_cpds(node)
            states = range(self.cardinality[node])
            # Every variable of the CPD except the first one, in reverse order.
            parents = cpd.variables[:0:-1]
            if not parents:
                weights = cpd.values
            else:
                cached_values = self.pre_compute_reduce(variable=node)
                # One row per parent, one column per sample being generated.
                parent_samples = np.vstack([sampled[p] for p in parents])
                weights = [
                    cached_values[tuple(column)]
                    for column in parent_samples.T
                ]
            sampled[node] = sample_discrete(states, weights, size)

        return _return_samples(return_type, sampled, self.state_names_map)
Example #3
0
    def sample(
        self,
        initial_pos,
        num_adapt,
        num_samples,
        stepsize=None,
        return_type="dataframe",
    ):
        """
        Returns samples using No U Turn Sampler with dual averaging

        The first returned sample is ``initial_pos`` itself. During the first
        ``num_adapt`` iterations the stepsize is updated after every draw;
        afterwards the averaged stepsize is used.

        Parameters
        ----------
        initial_pos: A 1d array like object
            Vector representing values of parameter position, the starting
            state in markov chain.

        num_adapt: int
            The number of iterations to run the adaptation of stepsize.
            If it is 1 or less, no adaptation is done and sampling is
            delegated to ``NoUTurnSampler``.

        num_samples: int
            Number of samples to be generated

        stepsize: float , defaults to None
            The stepsize for proposing new values of position and momentum in simulate_dynamics
            If None, then will be chosen suitably

        return_type: string (dataframe | recarray)
            Return type for samples, either of 'dataframe' or 'recarray'.
            Defaults to 'dataframe'

        Returns
        -------
        sampled: A pandas.DataFrame or a numpy.recarray object depending upon return_type argument


        Examples
        ---------
        >>> from ProbabilityModel.sampling import NoUTurnSamplerDA as NUTSda, GradLogPDFGaussian, LeapFrog
        >>> from ProbabilityModel.factors.continuous import GaussianDistribution as JGD
        >>> import numpy as np
        >>> mean = np.array([10, -13])
        >>> covariance = np.array([[16, -3], [-3, 13]])
        >>> model = JGD(['x', 'y'], mean, covariance)
        >>> sampler = NUTSda(model=model, grad_log_pdf=GradLogPDFGaussian, simulate_dynamics=LeapFrog)
        >>> samples = sampler.sample(initial_pos=np.array([12, -4]), num_adapt=10, num_samples=10,
        ...                          stepsize=0.1, return_type='dataframe')
        >>> samples
                   x          y
        0  12.000000  -4.000000
        1  11.864821  -3.696109
        2  10.546986  -4.892169
        3   8.526596 -21.555793
        4   8.526596 -21.555793
        5  11.343194  -6.353789
        6  -1.583269 -12.802931
        7  12.411957 -11.704859
        8  13.253336 -20.169492
        9  11.295901  -7.665058
        """
        initial_pos = _check_1d_array_object(initial_pos, "initial_pos")
        _check_length_equal(initial_pos, self.model.variables, "initial_pos",
                            "model.variables")

        if stepsize is None:
            stepsize = self._find_reasonable_stepsize(initial_pos)

        if num_adapt <= 1:
            # Nothing to adapt: use the plain sampler. return_type has to be
            # forwarded, otherwise the caller's choice is silently replaced
            # by the delegate's default.
            return NoUTurnSampler(self.model, self.grad_log_pdf,
                                  self.simulate_dynamics).sample(
                                      initial_pos, num_samples, stepsize,
                                      return_type=return_type)

        # Starting values handed to _adapt_params on the first iteration.
        mu = np.log(10.0 * stepsize)
        stepsize_bar = 1.0
        h_bar = 0.0

        types = [(var_name, "float") for var_name in self.model.variables]
        samples = np.zeros(num_samples, dtype=types).view(np.recarray)
        # Assigned as a tuple so each value lands in its own named field.
        samples[0] = tuple(initial_pos)
        position_m = initial_pos

        for i in tqdm(range(1, num_samples)):

            position_m, alpha, n_alpha = self._sample(position_m, stepsize)
            samples[i] = tuple(position_m)

            # Adapt the stepsize for the first num_adapt iterations, then
            # keep using the averaged value.
            if i <= num_adapt:
                stepsize, stepsize_bar, h_bar = self._adapt_params(
                    stepsize, stepsize_bar, h_bar, mu, i, alpha, n_alpha)
            else:
                stepsize = stepsize_bar

        return _return_samples(return_type, samples)
Example #4
0
    def sample(self,
               initial_pos,
               num_samples,
               stepsize=None,
               return_type="dataframe"):
        """
        Method to return samples using No U Turn Sampler

        The first returned sample is ``initial_pos`` itself; every following
        sample is produced by one call to ``self._sample`` starting from the
        previous position.

        Parameters
        ----------
        initial_pos: A 1d array like object
            Vector representing values of parameter position, the starting
            state in markov chain.

        num_samples: int
            Number of samples to be generated

        stepsize: float , defaults to None
            The stepsize for proposing new values of position and momentum in simulate_dynamics
            If None, then will be chosen suitably

        return_type: string (dataframe | recarray)
            Return type for samples, either of 'dataframe' or 'recarray'.
            Defaults to 'dataframe'

        Returns
        -------
        sampled: A pandas.DataFrame or a numpy.recarray object depending upon return_type argument


        Examples
        ---------
        >>> from ProbabilityModel.sampling import NoUTurnSampler as NUTS, GradLogPDFGaussian, LeapFrog
        >>> from ProbabilityModel.factors.continuous import GaussianDistribution as JGD
        >>> import numpy as np
        >>> mean = np.array([0, 0, 0])
        >>> covariance = np.array([[6, 0.7, 0.2], [0.7, 3, 0.9], [0.2, 0.9, 1]])
        >>> model = JGD(['x', 'y', 'z'], mean, covariance)
        >>> sampler = NUTS(model=model, grad_log_pdf=GradLogPDFGaussian, simulate_dynamics=LeapFrog)
        >>> samples = sampler.sample(initial_pos=np.array([1, 1, 1]), num_samples=10,
        ...                          stepsize=0.4, return_type='dataframe')
        >>> samples
                  x         y         z
        0  1.000000  1.000000  1.000000
        1  1.760756  0.271543 -0.613309
        2  1.883387  0.990745 -0.611720
        3  0.980812  0.340336 -0.916283
        4  0.781338  0.647220 -0.948640
        5  0.040308 -1.391406  0.412201
        6  1.179549 -1.450552  1.105216
        7  1.100320 -1.313926  1.207815
        8  1.484520 -1.349247  0.768599
        9  0.934942 -1.894589  0.471772
        """
        initial_pos = _check_1d_array_object(initial_pos, "initial_pos")
        _check_length_equal(
            initial_pos, self.model.variables, "initial_pos",
            "model.variables")

        stepsize = (self._find_reasonable_stepsize(initial_pos)
                    if stepsize is None else stepsize)

        # One float field per model variable.
        dtype_spec = [(name, "float") for name in self.model.variables]
        chain = np.zeros(num_samples, dtype=dtype_spec).view(np.recarray)

        current = initial_pos
        chain[0] = tuple(current)

        for step in tqdm(range(1, num_samples)):
            # Generating the next sample from the current position
            current = self._sample(current, stepsize)
            chain[step] = tuple(current)

        return _return_samples(return_type, chain)
Example #5
0
    def sample(
        self,
        initial_pos,
        num_adapt,
        num_samples,
        trajectory_length,
        stepsize=None,
        return_type="dataframe",
    ):
        """
        Method to return samples using Hamiltonian Monte Carlo

        The first returned sample is ``initial_pos`` itself. During the first
        ``num_adapt`` iterations the stepsize is updated after every draw;
        afterwards the averaged stepsize is used. ``self.acceptance_rate`` is
        set before returning.

        Parameters
        ----------
        initial_pos: A 1d array like object
            Vector representing values of parameter position, the starting
            state in markov chain.

        num_adapt: int
            The number of iterations to run the adaptation of stepsize.
            If it is 1 or less, no adaptation is done and sampling is
            delegated to ``HamiltonianMC.sample``.

        num_samples: int
            Number of samples to be generated

        trajectory_length: int or float
            Target trajectory length, stepsize * number of steps(L),
            where L is the number of steps taken per HMC iteration,
            and stepsize is step size for splitting time method.

        stepsize: float , defaults to None
            The stepsize for proposing new values of position and momentum in simulate_dynamics
            If None, then will be chosen suitably

        return_type: string (dataframe | recarray)
            Return type for samples, either of 'dataframe' or 'recarray'.
            Defaults to 'dataframe'

        Returns
        -------
        sampled: A pandas.DataFrame or a numpy.recarray object depending upon return_type argument

        Examples
        ---------
        >>> from ProbabilityModel.sampling import HamiltonianMCDA as HMCda, GradLogPDFGaussian as GLPG, LeapFrog
        >>> from ProbabilityModel.factors.continuous import GaussianDistribution as JGD
        >>> import numpy as np
        >>> mean = np.array([1, 1])
        >>> covariance = np.array([[1, 0.7], [0.7, 3]])
        >>> model = JGD(['x', 'y'], mean, covariance)
        >>> sampler = HMCda(model=model, grad_log_pdf=GLPG, simulate_dynamics=LeapFrog)
        >>> samples = sampler.sample(np.array([1, 1]), num_adapt=10000, num_samples = 10000,
        ...                          trajectory_length=2, stepsize=None, return_type='recarray')
        >>> samples_array = np.array([samples[var_name] for var_name in model.variables])
        >>> np.cov(samples_array)
        array([[ 0.98432155,  0.66517394],
               [ 0.66517394,  2.95449533]])

        """

        self.accepted_proposals = 1.0

        initial_pos = _check_1d_array_object(initial_pos, "initial_pos")
        _check_length_equal(initial_pos, self.model.variables, "initial_pos",
                            "model.variables")

        if stepsize is None:
            stepsize = self._find_reasonable_stepsize(initial_pos)

        if num_adapt <= 1:
            # Return samples generated using the simple HMC algorithm.
            # return_type has to be forwarded, otherwise the caller's choice
            # is silently replaced by the delegate's default.
            return HamiltonianMC.sample(self, initial_pos, num_samples,
                                        trajectory_length, stepsize,
                                        return_type=return_type)

        # stepsize is epsilon
        # freely chosen point, after each iteration xt(/position) is shrunk towards it
        mu = np.log(10.0 * stepsize)
        # log(10 * stepsize) large values to save computation
        # stepsize_bar is epsilon_bar
        stepsize_bar = 1.0
        h_bar = 0.0
        # See equation (6) section 3.2.1 for details

        types = [(var_name, "float") for var_name in self.model.variables]
        samples = np.zeros(num_samples, dtype=types).view(np.recarray)
        # Assigned as a tuple so each value lands in its own named field.
        samples[0] = tuple(initial_pos)
        position_m = initial_pos

        for i in tqdm(range(1, num_samples)):

            # Generating sample
            position_m, alpha = self._sample(position_m, trajectory_length,
                                             stepsize)
            samples[i] = tuple(position_m)

            # Adaptation of stepsize till num_adapt iterations
            if i <= num_adapt:
                stepsize, stepsize_bar, h_bar = self._adapt_params(
                    stepsize, stepsize_bar, h_bar, mu, i, alpha)
            else:
                stepsize = stepsize_bar

        self.acceptance_rate = self.accepted_proposals / num_samples

        return _return_samples(return_type, samples)
Example #6
0
    def sample(
        self,
        initial_pos,
        num_samples,
        trajectory_length,
        stepsize=None,
        return_type="dataframe",
    ):
        """
        Method to return samples using Hamiltonian Monte Carlo

        The first returned sample is ``initial_pos`` itself; every following
        sample is produced by one call to ``self._sample`` starting from the
        previous position. ``self.acceptance_rate`` is set before returning.

        Parameters
        ----------
        initial_pos: A 1d array like object
            Vector representing values of parameter position, the starting
            state in markov chain.

        num_samples: int
            Number of samples to be generated

        trajectory_length: int or float
            Target trajectory length, stepsize * number of steps(L),
            where L is the number of steps taken per HMC iteration,
            and stepsize is step size for splitting time method.

        stepsize: float , defaults to None
            The stepsize for proposing new values of position and momentum in simulate_dynamics
            If None, then will be chosen suitably

        return_type: string (dataframe | recarray)
            Return type for samples, either of 'dataframe' or 'recarray'.
            Defaults to 'dataframe'

        Returns
        -------
        sampled: A pandas.DataFrame or a numpy.recarray object depending upon return_type argument

        Examples
        --------
        >>> from ProbabilityModel.sampling import HamiltonianMC as HMC, GradLogPDFGaussian, ModifiedEuler
        >>> from ProbabilityModel.factors.continuous import GaussianDistribution as JGD
        >>> import numpy as np
        >>> mean = np.array([1, -1])
        >>> covariance = np.array([[1, 0.2], [0.2, 1]])
        >>> model = JGD(['x', 'y'], mean, covariance)
        >>> sampler = HMC(model=model, grad_log_pdf=GradLogPDFGaussian, simulate_dynamics=ModifiedEuler)
        >>> samples = sampler.sample(np.array([1, 1]), num_samples = 5,
        ...                          trajectory_length=6, stepsize=0.25, return_type='dataframe')
        >>> samples
                       x              y
        0   1.000000e+00   1.000000e+00
        1   1.592133e+00   1.152911e+00
        2   1.608700e+00   1.315349e+00
        3   1.608700e+00   1.315349e+00
        4   6.843856e-01   6.237043e-01
        >>> mean = np.array([4, 1, -1])
        >>> covariance = np.array([[1, 0.7 , 0.8], [0.7, 1, 0.2], [0.8, 0.2, 1]])
        >>> model = JGD(['x', 'y', 'z'], mean, covariance)
        >>> sampler = HMC(model=model, grad_log_pdf=GradLogPDFGaussian)
        >>> samples = sampler.sample(np.array([1, 1, 1]), num_samples = 10000,
        ...                          trajectory_length=6, stepsize=0.25, return_type='dataframe')
        >>> np.cov(samples.values.T)
        array([[ 1.00795398,  0.71384233,  0.79802097],
               [ 0.71384233,  1.00633524,  0.21313767],
               [ 0.79802097,  0.21313767,  0.98519017]])
        """

        self.accepted_proposals = 1.0
        initial_pos = _check_1d_array_object(initial_pos, "initial_pos")
        _check_length_equal(
            initial_pos, self.model.variables, "initial_pos",
            "model.variables")

        stepsize = (self._find_reasonable_stepsize(initial_pos)
                    if stepsize is None else stepsize)

        # One float field per model variable.
        dtype_spec = [(name, "float") for name in self.model.variables]
        chain = np.zeros(num_samples, dtype=dtype_spec).view(np.recarray)

        # The position is stored as a tuple, one value per named field.
        current = initial_pos
        chain[0] = tuple(current)

        # Number of steps per iteration: nearest integer to
        # trajectory_length / stepsize, but never fewer than one.
        n_steps = int(max(1, round(trajectory_length / stepsize, 0)))
        for step in tqdm(range(1, num_samples)):
            # Generating sample; the second returned value is not needed here
            current, _ = self._sample(
                current, trajectory_length, stepsize, n_steps)
            chain[step] = tuple(current)

        self.acceptance_rate = self.accepted_proposals / num_samples

        return _return_samples(return_type, chain)
    def likelihood_weighted_sample(self,
                                   evidence=None,
                                   size=1,
                                   return_type="dataframe"):
        """
        Generates weighted sample(s) from joint distribution of the bayesian
        network, that comply with the given evidence.
        'Probabilistic Graphical Model Principles and Techniques', Koller and
        Friedman, Algorithm 12.2 pp 493.

        Nodes are processed following ``self.topological_order``. A node that
        appears in the evidence is fixed to its observed state and the
        sample's ``_weight`` is multiplied by the probability of that state;
        any other node is drawn with ``sample_discrete``.

        Parameters
        ----------
        evidence: list of `ProbabilityModel.factor.State` namedtuples
            None (the default) or an empty list if no evidence
        size: int
            size of sample to be generated
        return_type: string (dataframe | recarray)
            Return type for samples, either of 'dataframe' or 'recarray'.
            Defaults to 'dataframe'

        Returns
        -------
        sampled: A pandas.DataFrame or a numpy.recarray object depending upon return_type argument
            the generated samples with corresponding weights

        Examples
        --------
        >>> from ProbabilityModel.factors.discrete import State
        >>> from ProbabilityModel.models.BayesianModel import BayesianModel
        >>> from ProbabilityModel.factors.discrete import TabularCPD
        >>> from ProbabilityModel.sampling import BayesianModelSampling
        >>> student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
        >>> cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
        >>> cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
        >>> cpd_g = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25,
        ...         0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
        ...         ['intel', 'diff'], [2, 2])
        >>> student.add_cpds(cpd_d, cpd_i, cpd_g)
        >>> inference = BayesianModelSampling(student)
        >>> evidence = [State('diff', 0)]
        >>> inference.likelihood_weighted_sample(evidence=evidence, size=2, return_type='recarray')
        rec.array([(0, 0, 1, 0.6), (0, 0, 2, 0.6)], dtype=
                  [('diff', '<i8'), ('intel', '<i8'), ('grade', '<i8'), ('_weight', '<f8')])
        """
        # None means "no evidence"; avoids a shared mutable default argument.
        if evidence is None:
            evidence = []

        # Convert evidence state names to state numbers, keyed by variable
        evidence_dict = {
            var: self.model.get_cpds(var).get_state_no(var, state)
            for var, state in evidence
        }

        # Prepare the return array: one int column per node plus the weight
        types = [(var_name, "int") for var_name in self.topological_order]
        types.append(("_weight", "float"))
        sampled = np.zeros(size, dtype=types).view(np.recarray)
        sampled["_weight"] = np.ones(size)

        # Do the sampling
        for node in self.topological_order:
            cpd = self.model.get_cpds(node)
            states = range(self.cardinality[node])
            # Variables this node's CPD is conditioned on. Kept under its own
            # name so the `evidence` argument is not overwritten.
            parents = cpd.get_evidence()

            if parents:
                # One row per parent, one column per sample being generated.
                parent_values = np.vstack([sampled[p] for p in parents])
                cached_values = self.pre_compute_reduce(node)
                weights = [
                    cached_values[tuple(column)]
                    for column in parent_values.T
                ]
                if node in evidence_dict:
                    observed = evidence_dict[node]
                    sampled[node] = observed
                    for i in range(size):
                        sampled["_weight"][i] *= weights[i][observed]
                else:
                    # size is passed for consistency with the other
                    # sample_discrete calls in this class.
                    sampled[node] = sample_discrete(states, weights, size)
            else:
                if node in evidence_dict:
                    observed = evidence_dict[node]
                    sampled[node] = observed
                    for i in range(size):
                        sampled["_weight"][i] *= cpd.values[observed]
                else:
                    sampled[node] = sample_discrete(states, cpd.values, size)

        # Postprocess the samples: Correct return type and change state numbers to names
        return _return_samples(return_type, sampled, self.state_names_map)
    def rejection_sample(self, evidence=None, size=1, return_type="dataframe"):
        """
        Generates sample(s) from joint distribution of the bayesian network,
        given the evidence.

        Batches are drawn with ``forward_sample`` and the rows that disagree
        with the evidence are discarded, until ``size`` rows have been kept.

        Parameters
        ----------
        evidence: list of `ProbabilityModel.factor.State` namedtuples
            None (the default) or an empty list if no evidence; in that case
            the call is equivalent to ``forward_sample``.
        size: int
            size of sample to be generated
        return_type: string (dataframe | recarray)
            Return type for samples, either of 'dataframe' or 'recarray'.
            Defaults to 'dataframe'

        Returns
        -------
        sampled: A pandas.DataFrame or a numpy.recarray object depending upon return_type argument
            the generated samples

        Examples
        --------
        >>> from ProbabilityModel.models.BayesianModel import BayesianModel
        >>> from ProbabilityModel.factors.discrete import TabularCPD
        >>> from ProbabilityModel.factors.discrete import State
        >>> from ProbabilityModel.sampling import BayesianModelSampling
        >>> student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
        >>> cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
        >>> cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
        >>> cpd_g = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25,
        ...                0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
        ...                ['intel', 'diff'], [2, 2])
        >>> student.add_cpds(cpd_d, cpd_i, cpd_g)
        >>> inference = BayesianModelSampling(student)
        >>> evidence = [State(var='diff', state=0)]
        >>> inference.rejection_sample(evidence=evidence, size=2, return_type='dataframe')
                intel       diff       grade
        0         0          0          1
        1         0          0          1
        """
        # None means "no evidence"; avoids a shared mutable default argument.
        if evidence is None:
            evidence = []

        # Convert evidence state names to state numbers
        evidence = [(var, self.model.get_cpds(var).get_state_no(var, state))
                    for var, state in evidence]

        # If no evidence is given, it is equivalent to forward sampling.
        # return_type is forwarded so the caller's choice is honoured.
        if not evidence:
            return self.forward_sample(size, return_type)

        # Setup array to be returned
        types = [(var_name, "int") for var_name in self.topological_order]
        sampled = np.zeros(0, dtype=types).view(np.recarray)
        prob = 1
        i = 0

        # Do the sampling by generating samples from forward sampling and rejecting the
        # samples which do not match our evidence. Keep doing until we have enough
        # samples.
        pbar = tqdm(total=size)
        try:
            while i < size:
                # Over-draw by 50% relative to the acceptance ratio seen in
                # the previous batch.
                _size = int(((size - i) / prob) * 1.5)
                _sampled = self.forward_sample(_size, "recarray")

                for var, state_no in evidence:
                    _sampled = _sampled[_sampled[var] == state_no]

                accepted = len(_sampled)
                # Floor the ratio so the next batch size stays bounded.
                prob = max(accepted / _size, 0.01)
                sampled = np.append(sampled, _sampled)[:size]

                # Only report the rows that are actually kept, so the bar
                # never runs past its total.
                pbar.update(min(accepted, size - i))
                i += accepted
        finally:
            # Release the progress bar even if sampling raises.
            pbar.close()

        # Post process: Correct return type and replace state numbers with names.
        return _return_samples(return_type, sampled, self.state_names_map)