def sample(self, start_state=None, size=1, return_type="dataframe"):
    """
    Draw consecutive states from the Markov chain with Gibbs updates.

    Parameters
    ----------
    start_state: dict or array-like iterable
        Starting states of the variables; forwarded to ``set_start_state``.
        If None and the sampler has no current state, ``random_state()``
        provides one. If None and a current state exists, sampling
        continues from that state.

    size: int
        Number of samples to be generated. The first sample is the
        starting state itself.

    return_type: string (dataframe | recarray)
        Return type for samples, either of 'dataframe' or 'recarray'.
        Defaults to 'dataframe'

    Returns
    -------
    sampled: A pandas.DataFrame or a numpy.recarray object depending upon
        return_type argument, holding the generated samples

    Examples
    --------
    >>> from ProbabilityModel.factors import DiscreteFactor
    >>> from ProbabilityModel.sampling import GibbsSampling
    >>> from ProbabilityModel.models import MarkovModel
    >>> model = MarkovModel([('A', 'B'), ('C', 'B')])
    >>> factor_ab = DiscreteFactor(['A', 'B'], [2, 2], [1, 2, 3, 4])
    >>> factor_cb = DiscreteFactor(['C', 'B'], [2, 2], [5, 6, 7, 8])
    >>> model.add_factors(factor_ab, factor_cb)
    >>> gibbs = GibbsSampling(model)
    >>> gibbs.sample(size=4, return_type='dataframe')
       A  B  C
    0  0  1  1
    1  1  0  0
    2  1  1  0
    3  1  1  1
    """
    if start_state is not None:
        self.set_start_state(start_state)
    elif self.state is None:
        self.state = self.random_state()

    dtype_spec = [(name, "int") for name in self.variables]
    draws = np.zeros(size, dtype=dtype_spec).view(np.recarray)

    # Row 0 records the state the chain starts from.
    draws[0] = tuple(value for _, value in self.state)

    for row in tqdm(range(1, size)):
        # One sweep: resample each variable in turn given the current
        # values of all the other variables.
        for position, (current_var, _) in enumerate(self.state):
            others = tuple(
                value for name, value in self.state if name != current_var
            )
            support = list(range(self.cardinalities[current_var]))
            kernel = self.transition_models[current_var][others]
            drawn = sample_discrete(support, kernel)[0]
            self.state[position] = State(current_var, drawn)
        draws[row] = tuple(value for _, value in self.state)

    return _return_samples(return_type, draws)
def forward_sample(self, size=1, return_type="dataframe"):
    """
    Generates sample(s) from joint distribution of the bayesian network.

    Nodes are visited in ``self.topological_order``; each node's column is
    filled for all ``size`` samples at once before moving to the next node.

    Parameters
    ----------
    size: int
        size of sample to be generated

    return_type: string (dataframe | recarray)
        Return type for samples, either of 'dataframe' or 'recarray'.
        Defaults to 'dataframe'

    Returns
    -------
    sampled: A pandas.DataFrame or a numpy.recarray object depending upon
        return_type argument, holding the generated samples

    Examples
    --------
    >>> from ProbabilityModel.models.BayesianModel import BayesianModel
    >>> from ProbabilityModel.factors.discrete import TabularCPD
    >>> from ProbabilityModel.sampling import BayesianModelSampling
    >>> student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
    >>> cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
    >>> cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
    >>> cpd_g = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25,
    ...                0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
    ...                ['intel', 'diff'], [2, 2])
    >>> student.add_cpds(cpd_d, cpd_i, cpd_g)
    >>> inference = BayesianModelSampling(student)
    >>> inference.forward_sample(size=2, return_type='recarray')
    rec.array([(0, 0, 1), (1, 0, 2)], dtype=
              [('diff', '<i8'), ('intel', '<i8'), ('grade', '<i8')])
    """
    dtype_spec = [(name, "int") for name in self.topological_order]
    draws = np.zeros(size, dtype=dtype_spec).view(np.recarray)

    progress = tqdm(self.topological_order)
    for node in progress:
        progress.set_description(f"Generating for node: {node}")
        cpd = self.model.get_cpds(node)
        states = range(self.cardinality[node])

        # Every CPD variable except the first one, in reversed order.
        evidence_vars = cpd.variables[:0:-1]
        if not evidence_vars:
            weights = cpd.values
        else:
            lookup = self.pre_compute_reduce(variable=node)
            # One row per evidence variable, one column per sample.
            evidence_states = np.vstack([draws[name] for name in evidence_vars])
            weights = [lookup[tuple(column)] for column in evidence_states.T]

        draws[node] = sample_discrete(states, weights, size)

    return _return_samples(return_type, draws, self.state_names_map)
def sample(
    self,
    initial_pos,
    num_adapt,
    num_samples,
    stepsize=None,
    return_type="dataframe",
):
    """
    Returns samples using No U Turn Sampler with dual averaging

    Parameters
    ----------
    initial_pos: A 1d array like object
        Vector representing values of parameter position, the starting
        state in markov chain.

    num_adapt: int
        The number of iterations to run the adaptation of stepsize.
        If ``num_adapt <= 1`` no adaptation is done and sampling is
        delegated to ``NoUTurnSampler.sample`` with the same
        ``return_type``.

    num_samples: int
        Number of samples to be generated. The first sample is
        ``initial_pos`` itself.

    stepsize: float , defaults to None
        The stepsize for proposing new values of position and momentum in
        simulate_dynamics. If None, it is chosen by
        ``self._find_reasonable_stepsize(initial_pos)``.

    return_type: string (dataframe | recarray)
        Return type for samples, either of 'dataframe' or 'recarray'.
        Defaults to 'dataframe'

    Returns
    -------
    sampled: A pandas.DataFrame or a numpy.recarray object depending upon
        return_type argument

    Examples
    ---------
    >>> from ProbabilityModel.sampling import NoUTurnSamplerDA as NUTSda, GradLogPDFGaussian, LeapFrog
    >>> from ProbabilityModel.factors.continuous import GaussianDistribution as JGD
    >>> import numpy as np
    >>> mean = np.array([10, -13])
    >>> covariance = np.array([[16, -3], [-3, 13]])
    >>> model = JGD(['x', 'y'], mean, covariance)
    >>> sampler = NUTSda(model=model, grad_log_pdf=GradLogPDFGaussian, simulate_dynamics=LeapFrog)
    >>> samples = sampler.sample(initial_pos=np.array([12, -4]), num_adapt=10, num_samples=10,
    ...                          stepsize=0.1, return_type='dataframe')
    >>> samples
               x          y
    0  12.000000  -4.000000
    1  11.864821  -3.696109
    2  10.546986  -4.892169
    3   8.526596 -21.555793
    4   8.526596 -21.555793
    5  11.343194  -6.353789
    6  -1.583269 -12.802931
    7  12.411957 -11.704859
    8  13.253336 -20.169492
    9  11.295901  -7.665058
    """
    initial_pos = _check_1d_array_object(initial_pos, "initial_pos")
    _check_length_equal(
        initial_pos, self.model.variables, "initial_pos", "model.variables"
    )

    if stepsize is None:
        stepsize = self._find_reasonable_stepsize(initial_pos)

    if num_adapt <= 1:
        # Nothing to adapt: fall back to the plain sampler. return_type is
        # forwarded so the caller's requested container type is honoured
        # (it was previously dropped, always yielding the default).
        plain_sampler = NoUTurnSampler(
            self.model, self.grad_log_pdf, self.simulate_dynamics
        )
        return plain_sampler.sample(
            initial_pos, num_samples, stepsize, return_type=return_type
        )

    # Dual-averaging state, updated by self._adapt_params.
    mu = np.log(10.0 * stepsize)
    stepsize_bar = 1.0
    h_bar = 0.0

    types = [(var_name, "float") for var_name in self.model.variables]
    samples = np.zeros(num_samples, dtype=types).view(np.recarray)
    samples[0] = tuple(initial_pos)
    position_m = initial_pos

    for i in tqdm(range(1, num_samples)):
        position_m, alpha, n_alpha = self._sample(position_m, stepsize)
        samples[i] = tuple(position_m)

        if i <= num_adapt:
            # Still adapting: refine the step size from this iteration.
            stepsize, stepsize_bar, h_bar = self._adapt_params(
                stepsize, stepsize_bar, h_bar, mu, i, alpha, n_alpha
            )
        else:
            # Adaptation finished: use the averaged step size from now on.
            stepsize = stepsize_bar

    return _return_samples(return_type, samples)
def sample(self, initial_pos, num_samples, stepsize=None, return_type="dataframe"):
    """
    Method to return samples using No U Turn Sampler

    Parameters
    ----------
    initial_pos: A 1d array like object
        Vector representing values of parameter position, the starting
        state in markov chain.

    num_samples: int
        Number of samples to be generated. The first sample is
        ``initial_pos`` itself.

    stepsize: float , defaults to None
        The stepsize for proposing new values of position and momentum in
        simulate_dynamics. If None, it is chosen by
        ``self._find_reasonable_stepsize(initial_pos)``.

    return_type: string (dataframe | recarray)
        Return type for samples, either of 'dataframe' or 'recarray'.
        Defaults to 'dataframe'

    Returns
    -------
    sampled: A pandas.DataFrame or a numpy.recarray object depending upon
        return_type argument

    Examples
    ---------
    >>> from ProbabilityModel.sampling import NoUTurnSampler as NUTS, GradLogPDFGaussian, LeapFrog
    >>> from ProbabilityModel.factors.continuous import GaussianDistribution as JGD
    >>> import numpy as np
    >>> mean = np.array([0, 0, 0])
    >>> covariance = np.array([[6, 0.7, 0.2], [0.7, 3, 0.9], [0.2, 0.9, 1]])
    >>> model = JGD(['x', 'y', 'z'], mean, covariance)
    >>> sampler = NUTS(model=model, grad_log_pdf=GradLogPDFGaussian, simulate_dynamics=LeapFrog)
    >>> samples = sampler.sample(initial_pos=np.array([1, 1, 1]), num_samples=10,
    ...                          stepsize=0.4, return_type='dataframe')
    >>> samples
              x         y         z
    0  1.000000  1.000000  1.000000
    1  1.760756  0.271543 -0.613309
    2  1.883387  0.990745 -0.611720
    3  0.980812  0.340336 -0.916283
    4  0.781338  0.647220 -0.948640
    5  0.040308 -1.391406  0.412201
    6  1.179549 -1.450552  1.105216
    7  1.100320 -1.313926  1.207815
    8  1.484520 -1.349247  0.768599
    9  0.934942 -1.894589  0.471772
    """
    initial_pos = _check_1d_array_object(initial_pos, "initial_pos")
    _check_length_equal(
        initial_pos, self.model.variables, "initial_pos", "model.variables"
    )

    if stepsize is None:
        stepsize = self._find_reasonable_stepsize(initial_pos)

    dtype_spec = [(name, "float") for name in self.model.variables]
    chain = np.zeros(num_samples, dtype=dtype_spec).view(np.recarray)

    # Row 0 is the starting position; every later row is one transition.
    chain[0] = tuple(initial_pos)
    current = initial_pos
    for row in tqdm(range(1, num_samples)):
        current = self._sample(current, stepsize)
        chain[row] = tuple(current)

    return _return_samples(return_type, chain)
def sample(
    self,
    initial_pos,
    num_adapt,
    num_samples,
    trajectory_length,
    stepsize=None,
    return_type="dataframe",
):
    """
    Method to return samples using Hamiltonian Monte Carlo with
    step size adaptation

    Parameters
    ----------
    initial_pos: A 1d array like object
        Vector representing values of parameter position, the starting
        state in markov chain.

    num_adapt: int
        The number of iterations to run the adaptation of stepsize.
        If ``num_adapt <= 1`` no adaptation is done and sampling is
        delegated to ``HamiltonianMC.sample`` with the same
        ``return_type``.

    num_samples: int
        Number of samples to be generated. The first sample is
        ``initial_pos`` itself.

    trajectory_length: int or float
        Target trajectory length, stepsize * number of steps(L),
        where L is the number of steps taken per HMC iteration,
        and stepsize is step size for splitting time method.

    stepsize: float , defaults to None
        The stepsize for proposing new values of position and momentum in
        simulate_dynamics. If None, it is chosen by
        ``self._find_reasonable_stepsize(initial_pos)``.

    return_type: string (dataframe | recarray)
        Return type for samples, either of 'dataframe' or 'recarray'.
        Defaults to 'dataframe'

    Returns
    -------
    sampled: A pandas.DataFrame or a numpy.recarray object depending upon
        return_type argument

    Examples
    ---------
    >>> from ProbabilityModel.sampling import HamiltonianMCDA as HMCda, GradLogPDFGaussian as GLPG, LeapFrog
    >>> from ProbabilityModel.factors.continuous import GaussianDistribution as JGD
    >>> import numpy as np
    >>> mean = np.array([1, 1])
    >>> covariance = np.array([[1, 0.7], [0.7, 3]])
    >>> model = JGD(['x', 'y'], mean, covariance)
    >>> sampler = HMCda(model=model, grad_log_pdf=GLPG, simulate_dynamics=LeapFrog)
    >>> samples = sampler.sample(np.array([1, 1]), num_adapt=10000, num_samples = 10000,
    ...                          trajectory_length=2, stepsize=None, return_type='recarray')
    >>> samples_array = np.array([samples[var_name] for var_name in model.variables])
    >>> np.cov(samples_array)
    array([[ 0.98432155,  0.66517394],
           [ 0.66517394,  2.95449533]])
    """
    self.accepted_proposals = 1.0

    initial_pos = _check_1d_array_object(initial_pos, "initial_pos")
    _check_length_equal(
        initial_pos, self.model.variables, "initial_pos", "model.variables"
    )

    if stepsize is None:
        stepsize = self._find_reasonable_stepsize(initial_pos)

    if num_adapt <= 1:
        # Nothing to adapt: return samples generated by the simple HMC
        # algorithm. return_type is forwarded so the caller's requested
        # container type is honoured (it was previously dropped).
        return HamiltonianMC.sample(
            self,
            initial_pos,
            num_samples,
            trajectory_length,
            stepsize,
            return_type=return_type,
        )

    # stepsize is epsilon
    # mu is a freely chosen point; after each iteration xt(/position) is
    # shrunk towards it. log(10 * stepsize): large values to save computation
    mu = np.log(10.0 * stepsize)
    # stepsize_bar is epsilon_bar
    stepsize_bar = 1.0
    h_bar = 0.0
    # See equation (6) section 3.2.1 for details

    types = [(var_name, "float") for var_name in self.model.variables]
    samples = np.zeros(num_samples, dtype=types).view(np.recarray)
    samples[0] = tuple(initial_pos)
    position_m = initial_pos

    for i in tqdm(range(1, num_samples)):
        # Generating sample
        position_m, alpha = self._sample(position_m, trajectory_length, stepsize)
        samples[i] = tuple(position_m)

        # Adaptation of stepsize till num_adapt iterations
        if i <= num_adapt:
            stepsize, stepsize_bar, h_bar = self._adapt_params(
                stepsize, stepsize_bar, h_bar, mu, i, alpha
            )
        else:
            # Adaptation finished: use the averaged step size from now on.
            stepsize = stepsize_bar

    self.acceptance_rate = self.accepted_proposals / num_samples

    return _return_samples(return_type, samples)
def sample(
    self,
    initial_pos,
    num_samples,
    trajectory_length,
    stepsize=None,
    return_type="dataframe",
):
    """
    Method to return samples using Hamiltonian Monte Carlo

    Also resets ``self.accepted_proposals`` at the start and stores
    ``self.acceptance_rate`` (accepted proposals divided by
    ``num_samples``) at the end.

    Parameters
    ----------
    initial_pos: A 1d array like object
        Vector representing values of parameter position, the starting
        state in markov chain.

    num_samples: int
        Number of samples to be generated. The first sample is
        ``initial_pos`` itself.

    trajectory_length: int or float
        Target trajectory length, stepsize * number of steps(L),
        where L is the number of steps taken per HMC iteration,
        and stepsize is step size for splitting time method.

    stepsize: float , defaults to None
        The stepsize for proposing new values of position and momentum in
        simulate_dynamics. If None, it is chosen by
        ``self._find_reasonable_stepsize(initial_pos)``.

    return_type: string (dataframe | recarray)
        Return type for samples, either of 'dataframe' or 'recarray'.
        Defaults to 'dataframe'

    Returns
    -------
    sampled: A pandas.DataFrame or a numpy.recarray object depending upon
        return_type argument

    Examples
    --------
    >>> from ProbabilityModel.sampling import HamiltonianMC as HMC, GradLogPDFGaussian, ModifiedEuler
    >>> from ProbabilityModel.factors.continuous import GaussianDistribution as JGD
    >>> import numpy as np
    >>> mean = np.array([1, -1])
    >>> covariance = np.array([[1, 0.2], [0.2, 1]])
    >>> model = JGD(['x', 'y'], mean, covariance)
    >>> sampler = HMC(model=model, grad_log_pdf=GradLogPDFGaussian, simulate_dynamics=ModifiedEuler)
    >>> samples = sampler.sample(np.array([1, 1]), num_samples = 5,
    ...                          trajectory_length=6, stepsize=0.25, return_type='dataframe')
    >>> samples
                   x              y
    0   1.000000e+00   1.000000e+00
    1   1.592133e+00   1.152911e+00
    2   1.608700e+00   1.315349e+00
    3   1.608700e+00   1.315349e+00
    4   6.843856e-01   6.237043e-01

    >>> mean = np.array([4, 1, -1])
    >>> covariance = np.array([[1, 0.7 , 0.8], [0.7, 1, 0.2], [0.8, 0.2, 1]])
    >>> model = JGD(['x', 'y', 'z'], mean, covariance)
    >>> sampler = HMC(model=model, grad_log_pdf=GradLogPDFGaussian)
    >>> samples = sampler.sample(np.array([1, 1, 1]), num_samples = 10000,
    ...                          trajectory_length=6, stepsize=0.25, return_type='dataframe')
    >>> np.cov(samples.values.T)
    array([[ 1.00795398,  0.71384233,  0.79802097],
           [ 0.71384233,  1.00633524,  0.21313767],
           [ 0.79802097,  0.21313767,  0.98519017]])
    """
    self.accepted_proposals = 1.0

    initial_pos = _check_1d_array_object(initial_pos, "initial_pos")
    _check_length_equal(
        initial_pos, self.model.variables, "initial_pos", "model.variables"
    )

    if stepsize is None:
        stepsize = self._find_reasonable_stepsize(initial_pos)

    dtype_spec = [(name, "float") for name in self.model.variables]
    chain = np.zeros(num_samples, dtype=dtype_spec).view(np.recarray)

    # Stored as a tuple: when assigned directly the stored value was
    # observed to change after assignment (cause unknown).
    chain[0] = tuple(initial_pos)
    current = initial_pos

    # Number of steps per iteration, at least one.
    lsteps = int(max(1, round(trajectory_length / stepsize, 0)))

    for row in tqdm(range(1, num_samples)):
        # Generating sample; the second returned value is not needed here.
        current, _ = self._sample(current, trajectory_length, stepsize, lsteps)
        chain[row] = tuple(current)

    self.acceptance_rate = self.accepted_proposals / num_samples

    return _return_samples(return_type, chain)
def likelihood_weighted_sample(self, evidence=None, size=1, return_type="dataframe"):
    """
    Generates weighted sample(s) from joint distribution of the bayesian
    network, that comply with the given evidence.
    'Probabilistic Graphical Model Principles and Techniques', Koller and
    Friedman, Algorithm 12.2 pp 493.

    Parameters
    ----------
    evidence: list of `ProbabilityModel.factor.State` namedtuples
        None (or an empty list) if no evidence

    size: int
        size of sample to be generated

    return_type: string (dataframe | recarray)
        Return type for samples, either of 'dataframe' or 'recarray'.
        Defaults to 'dataframe'

    Returns
    -------
    sampled: A pandas.DataFrame or a numpy.recarray object depending upon
        return_type argument, holding the generated samples with
        corresponding weights in the ``_weight`` column

    Examples
    --------
    >>> from ProbabilityModel.factors.discrete import State
    >>> from ProbabilityModel.models.BayesianModel import BayesianModel
    >>> from ProbabilityModel.factors.discrete import TabularCPD
    >>> from ProbabilityModel.sampling import BayesianModelSampling
    >>> student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
    >>> cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
    >>> cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
    >>> cpd_g = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25,
    ...         0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
    ...         ['intel', 'diff'], [2, 2])
    >>> student.add_cpds(cpd_d, cpd_i, cpd_g)
    >>> inference = BayesianModelSampling(student)
    >>> evidence = [State('diff', 0)]
    >>> inference.likelihood_weighted_sample(evidence=evidence, size=2, return_type='recarray')
    rec.array([(0, 0, 1, 0.6), (0, 0, 2, 0.6)], dtype=
              [('diff', '<i8'), ('intel', '<i8'), ('grade', '<i8'), ('_weight', '<f8')])
    """
    # The documented "no evidence" value is None; treat it as empty.
    if evidence is None:
        evidence = []

    # Convert evidence state names to state numbers, keyed by variable.
    evidence_dict = {
        var: self.model.get_cpds(var).get_state_no(var, state)
        for var, state in evidence
    }

    # Prepare the return array: one int column per node plus the weight.
    types = [(var_name, "int") for var_name in self.topological_order]
    types.append(("_weight", "float"))
    sampled = np.zeros(size, dtype=types).view(np.recarray)
    sampled["_weight"] = np.ones(size)

    # Do the sampling, node by node in topological order.
    for node in self.topological_order:
        cpd = self.model.get_cpds(node)
        states = range(self.cardinality[node])
        cpd_evidence = cpd.get_evidence()
        is_observed = node in evidence_dict

        if cpd_evidence:
            # One row per evidence variable of the CPD, one column per sample.
            evidence_values = np.vstack([sampled[i] for i in cpd_evidence])
            cached_values = self.pre_compute_reduce(node)
            weights = [cached_values[tuple(t)] for t in evidence_values.T]

            if is_observed:
                observed_state = evidence_dict[node]
                sampled[node] = observed_state
                # Scale each sample's weight by the entry for the observed
                # state in that sample's own weight vector.
                sampled["_weight"] *= np.array(
                    [w[observed_state] for w in weights], dtype=float
                )
            else:
                # size passed for consistency with forward_sample.
                sampled[node] = sample_discrete(states, weights, size)
        elif is_observed:
            observed_state = evidence_dict[node]
            sampled[node] = observed_state
            # Same factor for every sample, so scale the whole column once.
            sampled["_weight"] *= cpd.values[observed_state]
        else:
            sampled[node] = sample_discrete(states, cpd.values, size)

    # Postprocess the samples: Correct return type and change state numbers to names
    return _return_samples(return_type, sampled, self.state_names_map)
def rejection_sample(self, evidence=None, size=1, return_type="dataframe"):
    """
    Generates sample(s) from joint distribution of the bayesian network,
    given the evidence.

    Parameters
    ----------
    evidence: list of `ProbabilityModel.factor.State` namedtuples
        None (or an empty list) if no evidence

    size: int
        size of sample to be generated

    return_type: string (dataframe | recarray)
        Return type for samples, either of 'dataframe' or 'recarray'.
        Defaults to 'dataframe'

    Returns
    -------
    sampled: A pandas.DataFrame or a numpy.recarray object depending upon
        return_type argument, holding the generated samples

    Examples
    --------
    >>> from ProbabilityModel.models.BayesianModel import BayesianModel
    >>> from ProbabilityModel.factors.discrete import TabularCPD
    >>> from ProbabilityModel.factors.discrete import State
    >>> from ProbabilityModel.sampling import BayesianModelSampling
    >>> student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
    >>> cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
    >>> cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
    >>> cpd_g = TabularCPD('grade', 3, [[0.3, 0.05, 0.9, 0.5], [0.4, 0.25,
    ...                0.08, 0.3], [0.3, 0.7, 0.02, 0.2]],
    ...                ['intel', 'diff'], [2, 2])
    >>> student.add_cpds(cpd_d, cpd_i, cpd_g)
    >>> inference = BayesianModelSampling(student)
    >>> evidence = [State(var='diff', state=0)]
    >>> inference.rejection_sample(evidence=evidence, size=2, return_type='dataframe')
            intel       diff       grade
    0         0          0          1
    1         0          0          1
    """
    # The documented "no evidence" value is None; treat it as empty.
    if evidence is None:
        evidence = []

    # Convert evidence state names to state numbers.
    evidence = [
        (var, self.model.get_cpds(var).get_state_no(var, state))
        for var, state in evidence
    ]

    # If no evidence is given, it is equivalent to forward sampling.
    # return_type is forwarded so the caller's requested container type is
    # honoured (it was previously dropped, always yielding the default).
    if not evidence:
        return self.forward_sample(size, return_type)

    # Setup array to be returned
    types = [(var_name, "int") for var_name in self.topological_order]
    sampled = np.zeros(0, dtype=types).view(np.recarray)
    prob = 1
    i = 0

    # Do the sampling by generating samples from forward sampling and
    # rejecting the samples which do not match our evidence. Keep doing
    # until we have enough samples. The context manager closes the
    # progress bar even if sampling raises.
    with tqdm(total=size) as pbar:
        while i < size:
            # Oversample by 1.5x the number expected to be needed at the
            # acceptance rate seen in the previous batch.
            _size = int(((size - i) / prob) * 1.5)
            _sampled = self.forward_sample(_size, "recarray")

            for var, state in evidence:
                _sampled = _sampled[_sampled[var] == state]

            # Floor the estimate so the next batch size stays bounded.
            prob = max(len(_sampled) / _size, 0.01)
            sampled = np.append(sampled, _sampled)[:size]

            # Count only the samples actually kept so the progress bar
            # never runs past its total.
            kept = min(len(_sampled), size - i)
            i += kept
            pbar.update(kept)

    # Post process: Correct return type and replace state numbers with names.
    return _return_samples(return_type, sampled, self.state_names_map)