def generate_datasets(networks, folder, nb_samples=2000):
    for network in networks:
        dataset_out_path = os.path.join(folder, 'datasets', network + '.csv')
        inference = BayesianModelSampling(networks[network])
        samples = inference.forward_sample(size=nb_samples)
        samples.to_csv(dataset_out_path)
def sampling(model, n=1000, verbose=3):
    '''Forward-sample from the joint distribution of the Bayesian network.

    Parameters
    ----------
    model : dict
        Contains model and adjmat.
    n : int
        Number of samples to generate. n=1000 (default).
    verbose : int
        Print messages to screen.
        0: NONE, 1: ERROR, 2: WARNING, 3: INFO (default), 4: DEBUG, 5: TRACE

    Returns
    -------
    pandas DataFrame
    '''
    assert n > 0, 'n must be 1 or larger'
    assert 'BayesianModel' in str(type(model['model'])), (
        'Model must contain DAG from BayesianModel. '
        'Note that the <miserables> example does not include a DAG.')
    # http://pgmpy.org/sampling.html
    inference = BayesianModelSampling(model['model'])
    # inference = GibbsSampling(model)
    # Forward sampling and make dataframe
    df = inference.forward_sample(size=n, return_type='dataframe')
    return df
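A minimal usage sketch for the function above, following the bnlearn-style workflow that the later variants of this same function document ('sprinkler' is the example DAG named in their docstrings):

import bnlearn

model = bnlearn.import_DAG('sprinkler')  # dict holding 'model' (BayesianModel) and 'adjmat'
df = sampling(model, n=1000, verbose=3)  # pandas DataFrame with one column per node
print(df.head())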
def sample(self, nb_sample=1):
    # pgmpy's sampler returns the *index* of each state;
    # convert those indices back to the actual state values.
    def convert(samples):
        for col in samples.columns:
            _, states = self.get_state_space(col)
            samples[col] = samples[col].apply(lambda x: states[x])
        return samples

    inference = BayesianModelSampling(self.bn)
    samples = inference.forward_sample(size=nb_sample)
    return convert(samples)
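A self-contained sketch of the same index-to-state conversion, with a hypothetical state map standing in for `self.get_state_space`:

import pandas as pd

state_map = {'rain': ['no', 'yes']}             # hypothetical states per column
samples = pd.DataFrame({'rain': [0, 1, 1, 0]})  # pgmpy-style state indices
for col in samples.columns:
    samples[col] = samples[col].apply(lambda i: state_map[col][i])
print(samples)  # indices replaced by their state labels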
def sample(self, n_samples=1):
    """
    Sample n data points from the Bayesian network.

    :param n_samples: int, number of data points to generate.
    :return: DataFrame of new data points, shape (n_samples, n_features)
    """
    np.random.seed(self.random_state)
    inference = BayesianModelSampling(self.model)
    Y = inference.forward_sample(size=n_samples, return_type='dataframe')
    Y = Y[sorted(Y.columns)]
    return Y
def getDataset(self, size=1000, return_type='dataframe'):
    """
    Method: return a set of samples generated from the Bayesian network
    (simply using forward sampling).

    Parameters
    ----------
    size: size of the dataset to be generated (default: 1000)
    return_type: return type of the dataset (default: 'dataframe', i.e. pandas.DataFrame)
    """
    # For alternatives, see: likelihood-weighted, rejection, or Gibbs sampling
    from pgmpy.sampling import BayesianModelSampling
    inference = BayesianModelSampling(self.__covid_model)
    dataset = inference.forward_sample(size=size, return_type=return_type)
    return dataset
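For the alternatives named in the comment, a hedged sketch of the evidence-conditioned samplers (signatures as exercised by the pgmpy test class later on this page; `model` stands in for `self.__covid_model`, and the variable name is illustrative):

from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling

inference = BayesianModelSampling(model)
evidence = [State('symptomatic', 1)]  # hypothetical variable name
rejected = inference.rejection_sample(evidence, 1000)            # exact, but can be slow
weighted = inference.likelihood_weighted_sample(evidence, 1000)  # adds a '_weight' column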
def sampling(DAG, n=1000, verbose=3):
    """Generate sample(s) using forward sampling from the joint distribution of the Bayesian network.

    Parameters
    ----------
    DAG : dict
        Contains model and adjmat of the DAG.
    n : int, optional
        Number of samples to generate. The default is 1000.
    verbose : int, optional
        Print progress to screen. The default is 3.
        0: None, 1: ERROR, 2: WARN, 3: INFO (default), 4: DEBUG, 5: TRACE

    Returns
    -------
    df : pd.DataFrame
        Dataframe containing sampled data from the input DAG model.

    Example
    -------
    >>> import bnlearn
    >>> DAG = bnlearn.import_DAG('sprinkler')
    >>> df = bnlearn.sampling(DAG, n=1000)

    """
    if n <= 0:
        raise ValueError('n must be 1 or larger')
    if 'BayesianModel' not in str(type(DAG['model'])):
        raise ValueError('DAG must contain BayesianModel.')
    if verbose >= 3:
        print('[bnlearn] >Forward sampling for %.0d samples..' % (n))
    if len(DAG['model'].get_cpds()) == 0:
        print('[bnlearn] >This seems like a DAG containing only edges, and no CPDs. '
              'Tip: use bn.parameter_learning.fit(DAG, df) to learn the CPDs first.')
        return
    # http://pgmpy.org/sampling.html
    infer_model = BayesianModelSampling(DAG['model'])
    # inference = GibbsSampling(model['model'])
    # Forward sampling and make dataframe
    df = infer_model.forward_sample(size=n, return_type='dataframe')
    return df
def sampling(model, n=1000, verbose=3):
    """Sample based on DAG.

    Parameters
    ----------
    model : dict
        Contains model and adjmat.
    n : int, optional
        Number of samples to generate. The default is 1000.
    verbose : int, optional
        Print progress to screen. The default is 3.
        0: NONE, 1: ERROR, 2: WARNING, 3: INFO (default), 4: DEBUG, 5: TRACE

    Returns
    -------
    df : pd.DataFrame

    Example
    -------
    >>> import bnlearn
    >>> model = bnlearn.import_DAG('sprinkler')
    >>> df = bnlearn.sampling(model, n=1000)

    """
    assert n > 0, 'n must be 1 or larger'
    assert 'BayesianModel' in str(type(model['model'])), (
        'Model must contain DAG from BayesianModel. '
        'Note that the <miserables> example does not include a DAG.')
    if verbose >= 3:
        print('[BNLEARN][sampling] Forward sampling for %.0d samples..' % (n))
    # http://pgmpy.org/sampling.html
    inference = BayesianModelSampling(model['model'])
    # inference = GibbsSampling(model)
    # Forward sampling and make dataframe
    df = inference.forward_sample(size=n, return_type='dataframe')
    return df
def sample(N):
    bn_generate = BayesianModel([('D', 'G'), ('I', 'G'), ('E', 'L'), ('G', 'L')])
    cpd_d = TabularCPD('D', 2, [[0.6], [0.4]])
    cpd_i = TabularCPD('I', 2, [[0.7], [0.3]])
    cpd_g = TabularCPD('G', 3,
                       [[0.3, 0.9, 0.05, 0.5],
                        [0.4, 0.08, 0.25, 0.3],
                        [0.3, 0.02, 0.7, 0.2]],
                       ['D', 'I'], [2, 2])
    cpd_e = TabularCPD('E', 2, [[0.5], [0.5]])
    cpd_l = TabularCPD('L', 2,
                       [[0.1, 0.3, 0.4, 0.25, 0.8, 0.99],
                        [0.9, 0.7, 0.6, 0.75, 0.2, 0.01]],
                       ['G', 'E'], [3, 2])
    bn_generate.add_cpds(cpd_d, cpd_i, cpd_g, cpd_e, cpd_l)
    infer = BayesianModelSampling(bn_generate)
    data = infer.forward_sample(N)
    return data, bn_generate
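A usage sketch for the function above: draw samples and keep the generating model alongside them, e.g. to compare learned CPDs against the ground truth.

data, bn = sample(1000)
print(data.head())       # columns D, I, G, E, L with sampled state indices
print(bn.get_cpds('G'))  # the ground-truth CPD used to generate the data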
class DynamicBayesianNetwork(Process):
    defaults = {
        'nodes': [],
        'edges': [],
        'conditional_probabilities': {
            'node_id': []}}

    def __init__(self, parameters=None):
        super().__init__(parameters)

        # set up the network based on the parameters
        self.model = DBN()
        self.model.add_nodes_from(self.parameters['nodes'])
        self.model.add_edges_from(self.parameters['edges'])
        print(f'EDGES: {sorted(self.model.edges())}')

        # TODO -- add 'evidence' -- get from network?
        cpds = [
            TabularCPD(variable=node_id,
                       variable_card=len(values),
                       values=values,
                       evidence=[])
            for node_id, values in self.parameters['conditional_probabilities'].items()]
        self.model.add_cpds(*cpds)

        # make an inference instance for sampling the model
        self.inference = BayesianModelSampling(self.model)

        # get a sample
        sample = self.inference.forward_sample(size=2)

    def ports_schema(self):
        return {}

    def next_update(self, timestep, states):
        return {}
def sample_dag(dag, num):
    # Building BayesianModel(dag.edges()) would lose disconnected nodes,
    # so add nodes and edges explicitly.
    bayesmod = BayesianModel()
    bayesmod.add_nodes_from(dag.nodes())
    bayesmod.add_edges_from(dag.edges())
    tab_cpds = []
    cards = {node: len(dag.nodes[node]['cpd']) for node in dag.nodes()}
    for node in dag.nodes():
        parents = list(dag.predecessors(node))
        cpd = dag.nodes[node]['cpd']
        if parents:
            parent_cards = [cards[par] for par in parents]
            logging.debug("TabularCPD({}, {}, {}, {}, {})".format(
                node, cards[node], cpd, parents, parent_cards))
            tab_cpds.append(
                TabularCPD(node, cards[node], cpd, parents, parent_cards))
        else:
            logging.debug("TabularCPD({}, {}, {})".format(node, cards[node], cpd))
            tab_cpds.append(TabularCPD(node, cards[node], cpd))
    logging.debug("cpds add: {}".format(tab_cpds))
    print("model variables:", bayesmod.nodes())
    for tab_cpd in tab_cpds:
        print("cpd variables:", tab_cpd.variables)
    bayesmod.add_cpds(*tab_cpds)
    logging.debug("cpds get: {}".format(bayesmod.get_cpds()))
    inference = BayesianModelSampling(bayesmod)
    logging.debug("generating data")
    recs = inference.forward_sample(size=num, return_type='recarray')
    return recs
cpd1.append(p_21)
cpd1.append(p_52)
cpd1.append(p_14)
cpd1.append(p_64)
cpd1.append(p_36)
cpd1.append(p4)
model1.add_cpds(*cpd1)

print("------------------------------------------")
print("Edges of model1:", model1.edges())
print("Checking Model1:", model1.check_model())
print("------------------------------------------")

# generate data for model1
inference = BayesianModelSampling(model1)
data = inference.forward_sample(size=3000, return_type='dataframe')
print("Data for model1:")
print(data)

k2 = K2Score(data)
print('Model1 K2 Score: ' + str(k2.score(model1)))

# Inference
from pgmpy.inference import VariableElimination
infer = VariableElimination(model1)
print("Inference of x3:")
print(infer.query(['x3'])['x3'])
print("Inference of x5|x2:")
print(infer.query(['x5'], evidence={'x2': 1})['x5'])

# Model 2
class TestBayesianModelSampling(unittest.TestCase):
    def setUp(self):
        self.bayesian_model = BayesianModel(
            [('A', 'J'), ('R', 'J'), ('J', 'Q'), ('J', 'L'), ('G', 'L')])
        cpd_a = TabularCPD('A', 2, [[0.2], [0.8]])
        cpd_r = TabularCPD('R', 2, [[0.4], [0.6]])
        cpd_j = TabularCPD('J', 2,
                           [[0.9, 0.6, 0.7, 0.1], [0.1, 0.4, 0.3, 0.9]],
                           ['R', 'A'], [2, 2])
        cpd_q = TabularCPD('Q', 2, [[0.9, 0.2], [0.1, 0.8]], ['J'], [2])
        cpd_l = TabularCPD('L', 2,
                           [[0.9, 0.45, 0.8, 0.1], [0.1, 0.55, 0.2, 0.9]],
                           ['G', 'J'], [2, 2])
        cpd_g = TabularCPD('G', 2, [[0.6], [0.4]])
        self.bayesian_model.add_cpds(cpd_a, cpd_g, cpd_j, cpd_l, cpd_q, cpd_r)
        self.sampling_inference = BayesianModelSampling(self.bayesian_model)
        self.markov_model = MarkovModel()

    def test_init(self):
        with self.assertRaises(TypeError):
            BayesianModelSampling(self.markov_model)

    def test_forward_sample(self):
        sample = self.sampling_inference.forward_sample(25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 6)
        for var in ('A', 'J', 'R', 'Q', 'G', 'L'):
            self.assertIn(var, sample.columns)
            self.assertTrue(set(sample[var]).issubset({0, 1}))

    def test_rejection_sample_basic(self):
        sample = self.sampling_inference.rejection_sample(
            [State('A', 1), State('J', 1), State('R', 1)], 25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 6)
        # evidence variables are fixed to their observed state
        for var in ('A', 'J', 'R'):
            self.assertIn(var, sample.columns)
            self.assertTrue(set(sample[var]).issubset({1}))
        for var in ('Q', 'G', 'L'):
            self.assertIn(var, sample.columns)
            self.assertTrue(set(sample[var]).issubset({0, 1}))

    @patch("pgmpy.sampling.BayesianModelSampling.forward_sample", autospec=True)
    def test_rejection_sample_less_arg(self, forward_sample):
        # with no evidence, rejection sampling reduces to forward sampling
        sample = self.sampling_inference.rejection_sample(size=5)
        forward_sample.assert_called_once_with(self.sampling_inference, 5)
        self.assertEqual(sample, forward_sample.return_value)

    def test_likelihood_weighted_sample(self):
        sample = self.sampling_inference.likelihood_weighted_sample(
            [State('A', 0), State('J', 1), State('R', 0)], 25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 7)
        self.assertIn('_weight', sample.columns)
        for var in ('A', 'J', 'R', 'Q', 'G', 'L'):
            self.assertIn(var, sample.columns)
            self.assertTrue(set(sample[var]).issubset({0, 1}))

    def tearDown(self):
        del self.sampling_inference
        del self.bayesian_model
        del self.markov_model
def generateWysiwygDataDI(samplesize=4000):
    '''
    Same principle as generateWysiwygData(), but with 3 continuous variables and
    3 discrete C and X variables. This distribution was used for the DI removal
    experiment, because IBM AIF360's DI removal only impacts continuous variables.
    '''
    wysiwygmodel = BayesianModel([('A', 'C1'), ('A', 'C2'), ('A', 'C3'),
                                  ('C1', 'Y'), ('Y', 'C2'), ('Y', 'C3'),
                                  ('A', 'X1'), ('A', 'X2'), ('A', 'X3'),
                                  ('Y', 'X1'), ('Y', 'X2'), ('Y', 'X3')])
    cpd_a = TabularCPD(variable='A', variable_card=2, values=[[0.5], [0.5]])
    cpd_y = TabularCPD(variable='Y', variable_card=2,
                       values=[[0.7], [0.35], [0.3], [0.65]],
                       evidence=['C1'], evidence_card=[2])
    cpd_c1 = TabularCPD(variable='C1', variable_card=2,
                        values=[[0.65, 0.3], [0.35, 0.7]],
                        evidence=['A'], evidence_card=[2])
    cpd_c2 = TabularCPD(variable='C2', variable_card=4,
                        values=[[0.24, 0.27, 0.25, 0.24],
                                [0.28, 0.23, 0.24, 0.22],
                                [0.24, 0.27, 0.25, 0.26],
                                [0.24, 0.23, 0.26, 0.28]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_c3 = TabularCPD(variable='C3', variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.37],
                                [0.23, 0.25, 0.26, 0.21],
                                [0.23, 0.25, 0.25, 0.22],
                                [0.32, 0.25, 0.24, 0.20]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x1 = TabularCPD(variable='X1', variable_card=2,
                        values=[[0.54, 0.48, 0.52, 0.45],
                                [0.46, 0.52, 0.48, 0.55]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x2 = TabularCPD(variable='X2', variable_card=4,
                        values=[[0.25, 0.27, 0.26, 0.23],
                                [0.30, 0.23, 0.24, 0.23],
                                [0.23, 0.27, 0.26, 0.23],
                                [0.22, 0.23, 0.24, 0.31]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x3 = TabularCPD(variable='X3', variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.30],
                                [0.23, 0.25, 0.26, 0.24],
                                [0.23, 0.25, 0.24, 0.24],
                                [0.32, 0.25, 0.25, 0.22]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    wysiwygmodel.add_cpds(cpd_a, cpd_c1, cpd_c2, cpd_c3,
                          cpd_x1, cpd_x2, cpd_x3, cpd_y)

    datasamples = BayesianModelSampling(wysiwygmodel)
    discframe = datasamples.forward_sample(samplesize)
    AY = discframe[["A", "Y"]]
    C4 = samplecontinuous(AY, samplesize=samplesize, contatt="C4",
                          meana0=5, meana1=6, covy0=[1], covy1=[1.8])
    C5 = samplecontinuous(AY, samplesize=samplesize, contatt="C5",
                          meana0=1, meana1=2, covy0=[1], covy1=[0.9])
    C6 = samplecontinuous(AY, samplesize=samplesize, contatt="C6",
                          meana0=4, meana1=5.3, covy0=[1], covy1=[0.95])
    X4 = samplecontinuous(AY, samplesize=samplesize, contatt="X4",
                          meana0=5.5, meana1=6, covy0=[1.2], covy1=[1.4])
    X5 = samplecontinuous(AY, samplesize=samplesize, contatt="X5",
                          meana0=1.1, meana1=1.7, covy0=[1.1], covy1=[1.0])
    X6 = samplecontinuous(AY, samplesize=samplesize, contatt="X6",
                          meana0=4.5, meana1=5.1, covy0=[1], covy1=[1.1])
    discframe = pd.concat([discframe, C4, C5, C6, X4, X5, X6], axis=1)
    discframe.to_csv(path_or_buf="data/wysiwygdata5.csv")
model = BayesianModel([('A', 'B'), ('C', 'B'), ('C', 'D'), ('B', 'E')])
model.fit(values)
predict_data = predict_data.copy()
predict_data.drop('E', axis=1, inplace=True)
# print(predict_data)
y_pred = model.predict(predict_data)
y_prob = model.predict_probability(predict_data)

from pgmpy.sampling import BayesianModelSampling

model = BayesianModel([('D', 'G'), ('I', 'G')])
cpd_d = TabularCPD('D', 2, [[0.6], [0.4]])
cpd_i = TabularCPD('I', 2, [[0.7], [0.3]])
cpd_g = TabularCPD('G', 3,
                   [[0.3, 0.05, 0.9, 0.5],
                    [0.4, 0.25, 0.08, 0.3],
                    [0.3, 0.7, 0.02, 0.2]],
                   ['D', 'I'], [2, 2])
model.add_cpds(cpd_d, cpd_i, cpd_g)
infer = BayesianModelSampling(model)
data = infer.forward_sample(500)
# print(data)
model.fit(data, estimator=MaximumLikelihoodEstimator)
for cpd in model.get_cpds():
    print("CPD of {variable}:".format(variable=cpd.variable))
    print(cpd)
                     variable_card=2,
                     values=[[0.95, 0.2], [0.05, 0.8]],
                     evidence=['M'], evidence_card=[2])

# Associating the CPDs with the network
model.add_cpds(cpd_d, cpd_m, cpd_r, cpd_l, cpd_e)

# check_model checks the network structure and CPDs and verifies that the
# CPDs are correctly defined and sum to 1.
model.check_model()

# Forward-sample, then iterate and count strong musician / good letter / both
inference = BayesianModelSampling(model)
numSamples = 10000
samples = inference.forward_sample(size=numSamples, return_type='recarray')

part1 = 0
strongLetter = 0
weakMusician = 0
strongLetterWeakMuscician = 0

# Samples have structure (M, E, D, R, L)
for sample in samples:
    # P(m = strong) P(d = low) P(r = **|m = strong, d = low)
    # P(e = high|m = strong) P(letter = weak|**)
    if sample[0] and not sample[2] and sample[3] == 2 and sample[1] and not sample[4]:
        part1 += 1
    # P(letter = strong)
    if sample[4]:
        strongLetter += 1
# In[19]:

# infer1 = BayesianModelSampling(Mental_health_model)
# evidence2 = [State('treatment', 1)]
# np.mean(infer1.likelihood_weighted_sample(evidence2, 5))

# In[30]:

infer1 = BayesianModelSampling(Mental_health_model)
evidence1 = [State('treatment', 1)]
sample1 = infer1.forward_sample(5)
sample1

# In[31]:

m = np.mean(sample1)
print("Mean: ", m)

# In[33]:

scipy.stats.entropy(sample1)

# In[71]:
def generateWysiwygFIDataOld(samplesize=4000, filename="data/preFIData.csv"):
    '''
    Old version of the Bayesian model for the Fair Inference experiment. Here Y
    still influences X, to make modelling Y simpler. This is not suitable for FI;
    the model is unused in the experiments in the final thesis.
    '''
    wysiwygmodel = BayesianModel([('A', 'C1'), ('A', 'C2'), ('A', 'C3'), ('A', 'C4'),
                                  ('C1', 'Y'), ('Y', 'C2'), ('Y', 'C3'), ('Y', 'C4'),
                                  ('A', 'X1'), ('A', 'X2'), ('A', 'X3'), ('A', 'X4'),
                                  ('Y', 'X1'), ('Y', 'X2'), ('Y', 'X3'), ('Y', 'X4'),
                                  ('D1', 'X1'), ('D1', 'X2'), ('D2', 'X3'), ('D3', 'X4')])
    cpd_a = TabularCPD(variable='A', variable_card=2, values=[[0.5], [0.5]])
    cpd_d1 = TabularCPD(variable='D1', variable_card=2, values=[[0.45], [0.55]])
    cpd_d2 = TabularCPD(variable='D2', variable_card=4,
                        values=[[0.22], [0.24], [0.28], [0.26]])
    cpd_d3 = TabularCPD(variable='D3', variable_card=2, values=[[0.54], [0.46]])
    cpd_y = TabularCPD(variable='Y', variable_card=2,
                       values=[[0.7], [0.3], [0.3], [0.7]],
                       evidence=['C1'], evidence_card=[2])
    cpd_c1 = TabularCPD(variable='C1', variable_card=2,
                        values=[[0.85, 0.2], [0.15, 0.8]],
                        evidence=['A'], evidence_card=[2])
    cpd_c2 = TabularCPD(variable='C2', variable_card=4,
                        values=[[0.23, 0.27, 0.25, 0.20],
                                [0.35, 0.23, 0.24, 0.15],
                                [0.22, 0.27, 0.25, 0.25],
                                [0.20, 0.23, 0.26, 0.40]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_c3 = TabularCPD(variable='C3', variable_card=2,
                        values=[[0.52, 0.49, 0.5, 0.45],
                                [0.48, 0.51, 0.5, 0.55]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_c4 = TabularCPD(variable='C4', variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.37],
                                [0.23, 0.25, 0.26, 0.21],
                                [0.23, 0.25, 0.25, 0.22],
                                [0.32, 0.25, 0.24, 0.20]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x1 = TabularCPD(variable='X1', variable_card=2,
                        values=[[0.38, 0.40, 0.42, 0.44, 0.57, 0.59, 0.60, 0.62],  # GOOD
                                [0.62, 0.60, 0.58, 0.56, 0.43, 0.41, 0.40, 0.38]],
                        evidence=['A', 'Y', 'D1'], evidence_card=[2, 2, 2])
    cpd_x2 = TabularCPD(variable='X2', variable_card=4,
                        values=[[0.30, 0.28, 0.27, 0.25, 0.17, 0.16, 0.15, 0.14],
                                [0.24, 0.26, 0.26, 0.27, 0.29, 0.31, 0.30, 0.32],  # GOOD 2
                                [0.16, 0.18, 0.20, 0.22, 0.35, 0.37, 0.38, 0.40],  # GOOD 1
                                [0.30, 0.28, 0.27, 0.26, 0.19, 0.16, 0.17, 0.14]],
                        evidence=['A', 'Y', 'D1'], evidence_card=[2, 2, 2])
    cpd_x3 = TabularCPD(variable='X3', variable_card=2,
                        values=[[0.64, 0.62, 0.62, 0.63, 0.60, 0.58, 0.58, 0.59,
                                 0.40, 0.39, 0.39, 0.38, 0.38, 0.35, 0.35, 0.37],
                                [0.36, 0.38, 0.38, 0.37, 0.40, 0.42, 0.42, 0.41,
                                 0.60, 0.61, 0.61, 0.62, 0.62, 0.65, 0.65, 0.63]],  # GOOD
                        evidence=['A', 'Y', 'D2'], evidence_card=[2, 2, 4])
    cpd_x4 = TabularCPD(variable='X4', variable_card=4,
                        values=[[0.25, 0.27, 0.21, 0.23, 0.10, 0.12, 0.07, 0.09],
                                [0.36, 0.34, 0.42, 0.40, 0.60, 0.58, 0.64, 0.62],  # GOOD 1
                                [0.25, 0.27, 0.21, 0.23, 0.10, 0.12, 0.07, 0.09],
                                [0.14, 0.12, 0.16, 0.14, 0.20, 0.18, 0.22, 0.20]],  # GOOD 2
                        evidence=['A', 'Y', 'D3'], evidence_card=[2, 2, 2])
    wysiwygmodel.add_cpds(cpd_a, cpd_c1, cpd_c2, cpd_c3, cpd_c4,
                          cpd_x1, cpd_x2, cpd_x3, cpd_x4,
                          cpd_y, cpd_d1, cpd_d2, cpd_d3)

    datasamples = BayesianModelSampling(wysiwygmodel)
    discframe = datasamples.forward_sample(samplesize)
    AY = discframe[["A", "Y"]]
    C5 = samplecontinuous(AY, samplesize=samplesize, contatt="C5",
                          meana0=1, meana1=1.2, covy0=[1], covy1=[0.9])
    C6 = samplecontinuous(AY, samplesize=samplesize, contatt="C6",
                          meana0=2, meana1=1.8, covy0=[1], covy1=[0.95])
    X5 = samplecontinuous(AY, samplesize=samplesize, contatt="X5",
                          meana0=1.1, meana1=1.4, covy0=[1.1], covy1=[0.95])
    X6 = samplecontinuous(AY, samplesize=samplesize, contatt="X6",
                          meana0=1.9, meana1=1.5, covy0=[1], covy1=[1.1])
    discframe = pd.concat([discframe, C5, C6, X5, X6], axis=1)
    ndf = discframe.reindex(axis=1, labels=['A', 'Y', 'C1', 'C2', 'C3', 'C4',
                                            'C5', 'C6', 'X1', 'X2', 'X3', 'X4',
                                            'X5', 'X6', 'D1', 'D2', 'D3'])
    ndf.to_csv(path_or_buf=filename)
def generateWysiwygFIData(samplesize=4000, filename="data/preFIData.csv"):
    '''
    The Bayesian network that was used in the FI experiment. The edges between X
    and Y are flipped relative to the previous models, so X causally influences Y.
    The D variables are added to more closely approximate the experiments from
    the 'Fair Inference on Outcomes' paper.
    '''
    wysiwygmodel = BayesianModel([('A', 'C1'), ('A', 'C2'), ('A', 'C3'), ('A', 'C4'),
                                  ('Y', 'C2'), ('Y', 'C3'), ('Y', 'C4'),
                                  ('A', 'X1'), ('A', 'X2'), ('A', 'X3'), ('A', 'X4'),
                                  ('X1', 'Y'), ('X2', 'Y'), ('X3', 'Y'), ('X4', 'Y'),
                                  ('D1', 'X1'), ('D1', 'X2'), ('D2', 'X3'), ('D3', 'X4')])
    cpd_a = TabularCPD(variable='A', variable_card=2, values=[[0.5], [0.5]])
    cpd_d1 = TabularCPD(variable='D1', variable_card=2, values=[[0.45], [0.55]])
    cpd_d2 = TabularCPD(variable='D2', variable_card=4,
                        values=[[0.22], [0.24], [0.28], [0.26]])
    cpd_d3 = TabularCPD(variable='D3', variable_card=2, values=[[0.54], [0.46]])
    ydists = computeYDist()
    cpd_y = TabularCPD(variable='Y', variable_card=2,
                       values=[ydists[0], ydists[1]],
                       evidence=['X1', 'X3', 'X2', 'X4'],
                       evidence_card=[2, 2, 4, 4])
    cpd_c1 = TabularCPD(variable='C1', variable_card=2,
                        values=[[0.85, 0.2], [0.15, 0.8]],
                        evidence=['A'], evidence_card=[2])
    cpd_c2 = TabularCPD(variable='C2', variable_card=4,
                        values=[[0.23, 0.27, 0.25, 0.20],
                                [0.35, 0.23, 0.24, 0.15],
                                [0.22, 0.27, 0.25, 0.25],
                                [0.20, 0.23, 0.26, 0.40]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_c3 = TabularCPD(variable='C3', variable_card=2,
                        values=[[0.52, 0.49, 0.5, 0.45],
                                [0.48, 0.51, 0.5, 0.55]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_c4 = TabularCPD(variable='C4', variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.37],
                                [0.23, 0.25, 0.26, 0.21],
                                [0.23, 0.25, 0.25, 0.22],
                                [0.32, 0.25, 0.24, 0.20]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x1 = TabularCPD(variable='X1', variable_card=2,
                        values=[[0.38, 0.40, 0.60, 0.62],  # GOOD
                                [0.62, 0.60, 0.40, 0.38]],
                        evidence=['A', 'D1'], evidence_card=[2, 2])
    cpd_x2 = TabularCPD(variable='X2', variable_card=4,
                        values=[[0.30, 0.28, 0.15, 0.14],
                                [0.24, 0.26, 0.30, 0.32],  # GOOD 2
                                [0.16, 0.18, 0.38, 0.40],  # GOOD 1
                                [0.30, 0.28, 0.17, 0.14]],
                        evidence=['A', 'D1'], evidence_card=[2, 2])
    cpd_x3 = TabularCPD(variable='X3', variable_card=2,
                        values=[[0.64, 0.62, 0.62, 0.63, 0.38, 0.35, 0.35, 0.37],
                                [0.36, 0.38, 0.38, 0.37, 0.62, 0.65, 0.65, 0.63]],  # GOOD
                        evidence=['A', 'D2'], evidence_card=[2, 4])
    cpd_x4 = TabularCPD(variable='X4', variable_card=4,
                        values=[[0.25, 0.27, 0.07, 0.09],
                                [0.36, 0.34, 0.64, 0.62],  # GOOD 1
                                [0.25, 0.27, 0.07, 0.09],
                                [0.14, 0.12, 0.22, 0.20]],  # GOOD 2
                        evidence=['A', 'D3'], evidence_card=[2, 2])
    wysiwygmodel.add_cpds(cpd_a, cpd_c1, cpd_c2, cpd_c3, cpd_c4,
                          cpd_x1, cpd_x2, cpd_x3, cpd_x4,
                          cpd_y, cpd_d1, cpd_d2, cpd_d3)

    datasamples = BayesianModelSampling(wysiwygmodel)
    discframe = datasamples.forward_sample(samplesize)
    AY = discframe[["A", "Y"]]
    C5 = samplecontinuous(AY, samplesize=samplesize, contatt="C5",
                          meana0=1, meana1=1.2, covy0=[1], covy1=[0.9])
    C6 = samplecontinuous(AY, samplesize=samplesize, contatt="C6",
                          meana0=2, meana1=1.8, covy0=[1], covy1=[0.95])
    X5 = samplecontinuous(AY, samplesize=samplesize, contatt="X5",
                          meana0=1.1, meana1=1.4, covy0=[1.1], covy1=[0.95])
    X6 = samplecontinuous(AY, samplesize=samplesize, contatt="X6",
                          meana0=1.9, meana1=1.5, covy0=[1], covy1=[1.1])
    discframe = pd.concat([discframe, C5, C6, X5, X6], axis=1)
    ndf = discframe.reindex(axis=1, labels=['A', 'Y', 'C1', 'C2', 'C3', 'C4',
                                            'C5', 'C6', 'X1', 'X2', 'X3', 'X4',
                                            'X5', 'X6', 'D1', 'D2', 'D3'])
    ndf.to_csv(path_or_buf=filename)
            if node in parents:
                model.add_cpds(TabularCPD(node, variableCard[node], cpds[node],
                                          parents[node], parentsCardList[node]))
            else:
                model.add_cpds(TabularCPD(node, variableCard[node], cpds[node]))
    except Exception as e:
        tempException2 = 0
        # print("Tabular cpds added to model")

    print("Creating samples using Bayesian model forward sampling")
    inference = BayesianModelSampling(model)
    normalSamples = inference.forward_sample(size=5000, return_type='dataframe')
    # print("length ", normalSamples.shape)
    print("Some of the samples are as follows")
    print(normalSamples[1:2])
    print(" ")

    print("Calculating relative entropies between different sampling models")
    smean = {}
    sentropy = {}
    for i in range(normalSamples.shape[1]):
        sentropy[list(normalSamples[[i]])[0]] = -1 * np.sum(
            norm.logpdf(normalSamples[[i]])) / normalSamples.shape[0]
        smean[list(normalSamples[[i]])[0]] = np.mean(normalSamples[[i]])
    relEntropy = {}
cpd_x6x2.normalize(True)
cpd_x3x5.normalize(True)

# ##### Creating Models and generating data

# In[31]:

# First Model
model1 = BayesianModel()
model1.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])
model1.add_edges_from([('x1', 'x2'), ('x1', 'x4'), ('x1', 'x6'),
                       ('x2', 'x3'), ('x2', 'x5')])
model1.add_cpds(cpd_x1, cpd_x1x2, cpd_x1x4, cpd_x1x6, cpd_x2x3, cpd_x2x5)
inference = BayesianModelSampling(model1)
data1 = inference.forward_sample(size=1000, return_type='dataframe')

# Second Model
model2 = BayesianModel()
model2.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])
model2.add_edges_from([('x1', 'x2'), ('x1', 'x4'), ('x6', 'x1'),
                       ('x2', 'x3'), ('x2', 'x5')])
model2.add_cpds(cpd_x6, cpd_x1x2, cpd_x1x4, cpd_x6x1, cpd_x2x3, cpd_x2x5)
inference = BayesianModelSampling(model2)
data2 = inference.forward_sample(size=1000, return_type='dataframe')

# Third Model
model3 = BayesianModel()
model3.add_nodes_from(['x1', 'x2', 'x3', 'x4', 'x5', 'x6'])
model3.add_edges_from([('x2', 'x3'), ('x2', 'x5'), ('x3', 'x6'), ('x6', 'x4'),
def start(self):
    cpd_difficulty = TabularCPD(variable='Difficulty', variable_card=2,
                                values=[[0.6], [0.4]])
    cpd_musicianship = TabularCPD(variable='Musicianship', variable_card=2,
                                  values=[[0.7], [0.3]])
    cpd_Rating = TabularCPD(variable='Rating', variable_card=3,
                            values=[[0.3, 0.05, 0.9, 0.5],
                                    [0.4, 0.25, 0.08, 0.3],
                                    [0.3, 0.7, 0.02, 0.2]],
                            evidence=['Difficulty', 'Musicianship'],
                            evidence_card=[2, 2])
    cpd_Exam = TabularCPD(variable='Exam', variable_card=2,
                          values=[[0.95, 0.2], [0.05, 0.8]],
                          evidence=['Musicianship'], evidence_card=[2])
    cpd_Letter = TabularCPD(variable='Letter', variable_card=2,
                            values=[[0.1, 0.4, 0.99], [0.9, 0.6, 0.01]],
                            evidence=['Rating'], evidence_card=[3])
    self.musicModel.add_cpds(cpd_difficulty, cpd_musicianship, cpd_Rating,
                             cpd_Exam, cpd_Letter)
    print(self.musicModel.check_model())

    inference = BayesianModelSampling(self.musicModel)

    # Part one: P(m = strong) P(d = low) P(r = **|m = strong, d = low)
    #           P(e = high|m = strong) P(letter = weak|**)
    resultOne = inference.forward_sample(size=10000, return_type='recarray')

    # Field order: Musicianship = 0, Exam = 1, Difficulty = 2, Rating = 3, Letter = 4
    musSum = 0
    for sample in resultOne:
        if sample[0] == 1:
            musSum += 1
    musProb = musSum / 10000
    print('music prob: ', musProb)

    difSum = 0
    for sample in resultOne:
        if sample[2] == 0:
            difSum += 1
    diffProb = difSum / 10000
    print('difficulty prob: ', diffProb)

    examSum = 0
    for sample in resultOne:
        if sample[1] == 1 and sample[0] == 1:
            examSum += 1
    examProb = (examSum / 10000) / musProb
    print('exam prob: ', examProb)

    ratingSum = 0
    for sample in resultOne:
        if sample[3] == 1 and sample[0] == 1 and sample[2] == 0:
            ratingSum += 1
    ratingProb = (ratingSum / 10000) / (diffProb * musProb)
    print('rating prob: ', ratingProb)

    letterSum = 0
    for sample in resultOne:
        if sample[4] == 0 and sample[3] == 1:
            letterSum += 1
    letterProb = (letterSum / 10000) / ratingProb
    print('letter prob: ', letterProb)

    letterStrongSum = 0
    for sample in resultOne:
        if sample[4] == 1:
            letterStrongSum += 1
    letterStrongSumProb = letterStrongSum / 10000
    print('letter strong no evidence prob: ', letterStrongSumProb)

    letterStrongGivenMusicianshipSum = 0
    for sample in resultOne:
        if sample[4] == 1 and sample[0] == 0:
            letterStrongGivenMusicianshipSum += 1
    letterStrongGivenProb = (letterStrongGivenMusicianshipSum / 10000) / (1 - musProb)
    print('letter strong given weak music prob: ', letterStrongGivenProb)
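The per-event counting loops above can be collapsed with numpy boolean masks; a sketch under the same recarray field order (Musicianship = 0, Exam = 1, Difficulty = 2, Rating = 3, Letter = 4):

import numpy as np

arr = np.array([tuple(s) for s in resultOne], dtype=float)  # recarray -> plain 2-D array
musProb = (arr[:, 0] == 1).mean()                           # P(m = strong)
examProb = ((arr[:, 1] == 1) & (arr[:, 0] == 1)).mean() / musProb  # P(e = high | m = strong)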
# Next, let us define another objective function, i.e. the KL divergence.

# ------------------------------------
# Method 2: Training Markov Network
# ------------------------------------

# Generate samples from the Bayesian network
bn_sampler = BayesianModelSampling(grass_model)
# make sure the topological order is consistent with the NODES order
bn_sampler.topological_order = NODES

kld_temp = 10.
i = 1
while kld_temp > 0.001:
    print('Iteration %d, kld %f' % (i, kld_temp))
    i += 1
    bn_samps = bn_sampler.forward_sample(size=NUM_READS, return_type='dataframe')
    # calculate true data stats: per-node means, plus pairwise products over the moral edges
    data_stats = np.zeros(shape=(len(NODES) + len(MORAL_EDGES),))
    np.copyto(data_stats[:len(NODES)], np.mean(bn_samps, axis=0))
    np.copyto(data_stats[len(NODES):],
              np.dot(bn_samps.T, bn_samps)[np.array(MORAL_EDGES)[:, 0],
                                           np.array(MORAL_EDGES)[:, 1]] / (NUM_READS * 1.))
    # pandas >= 1.0: use to_numpy() rather than the removed DataFrame.as_matrix()
    p_data_bn = calculate_histogram(bn_samps.to_numpy())
    kld_temp = kld(p_true, p_data_bn)

plt.bar(range(16), p_true, width=0.4)
plt.bar(np.array(range(16)) + 0.4, p_data_bn, width=0.4)
print('KLD from true distribution to generated data distribution:', kld_temp)
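The snippet above calls `calculate_histogram` and `kld` without defining them. A minimal sketch of compatible helpers, assuming binary nodes whose joint state space has 16 configurations (matching the `range(16)` bar plots, i.e. 4 nodes):

import numpy as np

def calculate_histogram(samples):
    # Empirical distribution over joint states: encode each row of 0/1 values
    # as an integer index, then normalize the state counts.
    idx = samples.dot(1 << np.arange(samples.shape[1] - 1, -1, -1))
    counts = np.bincount(idx.astype(int), minlength=2 ** samples.shape[1])
    return counts / counts.sum()

def kld(p, q, eps=1e-12):
    # KL(p || q), smoothed so unseen states don't produce log(0).
    p = np.asarray(p) + eps
    q = np.asarray(q) + eps
    return float(np.sum(p * np.log(p / q)))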
                 cpd_self_harm)

# check_model checks the network structure and CPDs and verifies that the
# CPDs are correctly defined and sum to 1.
pg_model.check_model()

# examine conditional independence relationships:
pg_model.local_independencies("parent_education")
pg_model.local_independencies("child_obesity")
pg_model.local_independencies("child_screen_time")
pg_model.local_independencies("child_physical_activity")

# sample data from the network:
inference = BayesianModelSampling(pg_model)
sim_n = 50_000
simulated_sample = inference.forward_sample(size=sim_n)
for colname_j in simulated_sample.columns:
    simulated_sample[colname_j] = (
        simulated_sample[colname_j] == "high").astype(int)

# draw a correlation plot of the variables:
corr_mat = simulated_sample.corr()
corr_mat.style.background_gradient(cmap="coolwarm").set_precision(2)

# example: if we condition on "child_screen_time",
# then "child_physical_activity" becomes independent of "parent_education":
corr_mat = simulated_sample.query("child_screen_time==1").drop(
    "child_screen_time", axis=1).corr()
corr_mat.style.background_gradient(cmap="coolwarm").set_precision(2)
corr_mat = simulated_sample.query("child_screen_time==0").drop(
                       'difficulty': 2,
                       'Q9': 3}))
# print(belpro.map_query(variables=['Q25', 'Q18', 'Q16'], evidence={'instr': 1}))
print(belpro.map_query(variables=['attendance', 'Q9', 'difficulty'],
                       evidence={'class': 7}))

# Commented out some queries because they take a long time to run
# print(belpro.map_query(variables=['Q28', 'Q11'], evidence={'instr': 2, 'class': 10}))
# print(belpro.map_query(variables=['Q18', 'Q26', 'Q13'], evidence={'instr': 2}))
# print(belpro.map_query(variables=['Q23', 'Q21', 'Q17'], evidence={'instr': 2}))

inference = BayesianModelSampling(bayesmodel)
df = inference.forward_sample(5)
# print(df.shape)
print(df)
print(np.mean(df))
# print(scipy.stats.entropy(df))

dataarray = df.to_numpy()  # pandas >= 1.0: DataFrame.as_matrix() was removed
print(dataarray)
arr = dataarray.astype(float)
print(arr)

sum1 = []
total = 0
count = 0
for j in range(0, 18):
    for i in arr:
from pgmpy.models.BayesianModel import BayesianModel
from pgmpy.factors.discrete import TabularCPD
from pgmpy.sampling import BayesianModelSampling

student = BayesianModel([('diff', 'grade'), ('intel', 'grade')])
cpd_d = TabularCPD('diff', 2, [[0.6], [0.4]])
cpd_i = TabularCPD('intel', 2, [[0.7], [0.3]])
cpd_g = TabularCPD('grade', 3,
                   [[0.3, 0.05, 0.9, 0.5],
                    [0.4, 0.25, 0.08, 0.3],
                    [0.3, 0.7, 0.02, 0.2]],
                   ['intel', 'diff'], [2, 2])
student.add_cpds(cpd_d, cpd_i, cpd_g)
inference = BayesianModelSampling(student)
print(inference.forward_sample(size=3, return_type='recarray'))
def generateWysiwygData(samplesize=4000, filename="data/wysiwygdata4.csv"):
    '''
    We define a Bayesian model based on the WYSIWYG model from the thesis. There
    are 6 C variables and 6 X variables. For both C and X the first four are
    discrete variables, the other two continuous. The variable C1 causally
    influences Y to ensure a certain level of group unfairness in the data.
    '''
    wysiwygmodel = BayesianModel([('A', 'C1'), ('A', 'C2'), ('A', 'C3'), ('A', 'C4'),
                                  ('C1', 'Y'), ('Y', 'C2'), ('Y', 'C3'), ('Y', 'C4'),
                                  ('A', 'X1'), ('A', 'X2'), ('A', 'X3'), ('A', 'X4'),
                                  ('Y', 'X1'), ('Y', 'X2'), ('Y', 'X3'), ('Y', 'X4')])
    cpd_a = TabularCPD(variable='A', variable_card=2, values=[[0.5], [0.5]])
    cpd_y = TabularCPD(variable='Y', variable_card=2,
                       values=[[0.65], [0.4], [0.35], [0.6]],
                       evidence=['C1'], evidence_card=[2])
    cpd_c1 = TabularCPD(variable='C1', variable_card=2,
                        values=[[0.85, 0.2], [0.15, 0.8]],
                        evidence=['A'], evidence_card=[2])
    cpd_c2 = TabularCPD(variable='C2', variable_card=4,
                        values=[[0.23, 0.27, 0.25, 0.20],
                                [0.35, 0.23, 0.24, 0.15],
                                [0.22, 0.27, 0.25, 0.25],
                                [0.20, 0.23, 0.26, 0.40]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_c3 = TabularCPD(variable='C3', variable_card=2,
                        values=[[0.52, 0.49, 0.5, 0.45],
                                [0.48, 0.51, 0.5, 0.55]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_c4 = TabularCPD(variable='C4', variable_card=4,
                        values=[[0.22, 0.25, 0.25, 0.37],
                                [0.23, 0.25, 0.26, 0.21],
                                [0.23, 0.25, 0.25, 0.22],
                                [0.32, 0.25, 0.24, 0.20]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x1 = TabularCPD(variable='X1', variable_card=2,
                        values=[[0.57, 0.48, 0.52, 0.38],
                                [0.43, 0.52, 0.48, 0.62]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x2 = TabularCPD(variable='X2', variable_card=4,
                        values=[[0.24, 0.28, 0.26, 0.19],
                                [0.38, 0.22, 0.24, 0.15],
                                [0.20, 0.28, 0.26, 0.23],
                                [0.18, 0.22, 0.24, 0.43]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x3 = TabularCPD(variable='X3', variable_card=2,
                        values=[[0.54, 0.48, 0.52, 0.4],
                                [0.46, 0.52, 0.48, 0.6]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    cpd_x4 = TabularCPD(variable='X4', variable_card=4,
                        values=[[0.20, 0.25, 0.24, 0.40],
                                [0.21, 0.25, 0.28, 0.21],
                                [0.21, 0.25, 0.24, 0.21],
                                [0.38, 0.25, 0.24, 0.18]],
                        evidence=['A', 'Y'], evidence_card=[2, 2])
    wysiwygmodel.add_cpds(cpd_a, cpd_c1, cpd_c2, cpd_c3, cpd_c4,
                          cpd_x1, cpd_x2, cpd_x3, cpd_x4, cpd_y)

    datasamples = BayesianModelSampling(wysiwygmodel)
    discframe = datasamples.forward_sample(samplesize)
    AY = discframe[["A", "Y"]]
    C5 = samplecontinuous(AY, samplesize=samplesize, contatt="C5",
                          meana0=1, meana1=1.2, covy0=[1], covy1=[0.9])
    C6 = samplecontinuous(AY, samplesize=samplesize, contatt="C6",
                          meana0=2, meana1=1.8, covy0=[1], covy1=[0.95])
    X5 = samplecontinuous(AY, samplesize=samplesize, contatt="X5",
                          meana0=1.1, meana1=1.4, covy0=[1.1], covy1=[0.95])
    X6 = samplecontinuous(AY, samplesize=samplesize, contatt="X6",
                          meana0=1.9, meana1=1.5, covy0=[1], covy1=[1.1])
    discframe = pd.concat([discframe, C5, C6, X5, X6], axis=1)
    discframe.to_csv(path_or_buf=filename)