Example #1
import typing

import pandas as pd
from IPython.utils import io
from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling
from tqdm import notebook


def generate_time_series(
    sampler: BayesianModelSampling,
    length: int,
    labels: typing.List[str],
    seed: int = 42,
):
    # Initialize progress bar
    pbar = notebook.tqdm(total=length)

    # Generate first sample given no evidence
    with io.capture_output() as captured:
        # With no evidence, rejection_sample falls back to plain forward sampling under the hood
        sample = sampler.rejection_sample(seed=seed)
    sample = sample.reindex(sorted(sample.columns), axis=1)

    # Split sample in 'current' and 'next' slices:
    # - the 'current' slice will be the first row of the generated time series
    # - the 'next' slice is added as the second row, and will be used as
    # evidence for subsequent predictions
    df_synth = sample.filter(regex="_T$")
    next_slice = sample.filter(regex=r"_T\+1").iloc[0].values.tolist()
    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
    df_synth = pd.concat(
        [df_synth, pd.Series(next_slice, index=df_synth.columns).to_frame().T],
        ignore_index=True)
    evidence = [
        State(n, v) for n, v in zip(df_synth.columns.values, next_slice)
    ]

    # Update progress bar
    pbar.update(2)

    for _ in range(2, length):
        # Generate new data
        with io.capture_output() as captured:
            sample = sampler.rejection_sample(evidence=evidence)
        sample = sample.reindex(sorted(sample.columns), axis=1)

        # Append 'next' slice to the generated time series, and use it as new evidence
        next_slice = sample.filter(regex=r"_T\+1").iloc[0].values.tolist()
        df_synth = pd.concat(
            [df_synth,
             pd.Series(next_slice, index=df_synth.columns).to_frame().T],
            ignore_index=True)
        evidence = [
            State(n, v) for n, v in zip(df_synth.columns.values, next_slice)
        ]

        # Update progress bar
        pbar.update(1)
    # Close progress bar
    pbar.close()
    # Update column names
    df_synth.columns = labels
    return df_synth
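A minimal usage sketch. The two-slice model below is hypothetical: any discrete network whose node names end in "_T" and "_T+1" would work the same way.

# Hypothetical two-slice model for illustration only
from pgmpy.models import BayesianModel  # renamed BayesianNetwork in newer pgmpy
from pgmpy.factors.discrete import TabularCPD

dbn = BayesianModel([("X_T", "X_T+1")])
dbn.add_cpds(
    TabularCPD("X_T", 2, [[0.6], [0.4]]),
    TabularCPD("X_T+1", 2, [[0.7, 0.2], [0.3, 0.8]],
               evidence=["X_T"], evidence_card=[2]),
)
df = generate_time_series(BayesianModelSampling(dbn), length=10, labels=["X"])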
Example #2
import matplotlib.pyplot as plt
from pgmpy.factors.discrete import State
from pgmpy.sampling import BayesianModelSampling


def rejection_estimate(n):
    # 'disease_model' is assumed to be defined at module level (see sketch below)
    inferences = BayesianModelSampling(disease_model)
    evidences = [
        State(var='Fatigue', state=0),
        State(var='Fever', state=0),
        State(var='FluShot', state=0)
    ]

    p = inferences.rejection_sample(evidences, n)
    i = 0

    for t in range(n):
        if p['Flu'][t] == 0:
            i += 1
            plt.plot(t, (i / n), 'bo')
    plt.ylabel('Evolving estimate')
    plt.xlabel('Number of samples')
    plt.show()
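For context, a toy disease_model that makes the snippet above runnable; the structure and CPD values here are hypothetical:

from pgmpy.models import BayesianModel
from pgmpy.factors.discrete import TabularCPD

disease_model = BayesianModel([('FluShot', 'Flu'), ('Flu', 'Fever'),
                               ('Flu', 'Fatigue')])
disease_model.add_cpds(
    TabularCPD('FluShot', 2, [[0.6], [0.4]]),
    TabularCPD('Flu', 2, [[0.3, 0.9], [0.7, 0.1]],
               evidence=['FluShot'], evidence_card=[2]),
    TabularCPD('Fever', 2, [[0.2, 0.8], [0.8, 0.2]],
               evidence=['Flu'], evidence_card=[2]),
    TabularCPD('Fatigue', 2, [[0.3, 0.7], [0.7, 0.3]],
               evidence=['Flu'], evidence_card=[2]),
)
disease_model.check_model()
rejection_estimate(200)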
Example #3
def sample_slots(model_info_file, mr_slot_names):
    # 'helpers' is a project-specific module; the pickle is assumed to hold
    # a dict with a fitted pgmpy model and the full slot list
    model_info = helpers.load_from_pickle(model_info_file)
    model = model_info['model']
    inference = BayesianModelSampling(model)
    # use the missing mr slots as evidence
    all_slots = model_info['all_slots']
    missing_slots = [mr for mr in all_slots if mr not in mr_slot_names]
    evidence = [State(mr, 0) for mr in missing_slots]
    # don't allow empty samples
    sampled_slots = []
    while not sampled_slots:
        sample = inference.rejection_sample(evidence=evidence,
                                            size=1,
                                            return_type='recarray')
        # return a list of the column names which had presence
        sampled_slots = [
            name for var, name in zip(sample.view('<i8'), sample.dtype.names)
            if var == 1
        ]
    return sampled_slots
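A hypothetical call; the file name, pickle layout, and slot names are placeholders:

# Hypothetical usage: model_info.pkl is assumed to contain
# {'model': <fitted pgmpy model>, 'all_slots': ['name', 'area', ...]}
present = sample_slots('model_info.pkl', mr_slot_names=['name', 'area'])
print(present)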
Example #4
# 'simulated_sample', 'inference' and 'pg_model' are assumed to come from
# earlier cells: a fitted pgmpy model and samples drawn from it
corr_mat = simulated_sample.corr()
# Styler.set_precision was removed in pandas 2.0; format(precision=...) replaces it
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

# example: if we condition on "child_screen_time",
# then "child_physical_activity" becomes independent of "parent_education":
corr_mat = simulated_sample.query("child_screen_time==1").drop(
    "child_screen_time", axis=1).corr()
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

corr_mat = simulated_sample.query("child_screen_time==0").drop(
    "child_screen_time", axis=1).corr()
corr_mat.style.background_gradient(cmap="coolwarm").format(precision=2)

# suppose that we are interested in measuring the average causal effect of "child_screen_time" on "obesity"
# we can estimate this by simulating from the system:
simulated_sample_lowScreentime = inference.rejection_sample(
    evidence=[State(var='child_screen_time', state="low")], size=10_000)
simulated_sample_highScreentime = inference.rejection_sample(
    evidence=[State(var='child_screen_time', state="high")], size=10_000)
# the observed effect (risk ratio) of high screen time on the probability of high child obesity:
((simulated_sample_highScreentime["child_obesity"] == "high").sum() /
 len(simulated_sample_highScreentime)) / (
     (simulated_sample_lowScreentime["child_obesity"] == "high").sum() /
     len(simulated_sample_lowScreentime))
# i.e. around 2x

infer_adjusted = CausalInference(pg_model)
print(
    infer_adjusted.query(variables=["child_obesity"],
                         do={"child_screen_time": "high"}))

# we can estimate this effect from the observed data using a logistic regression model:
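A sketch of that regression step, assuming the observed data sits in a frame shaped like simulated_sample and using scikit-learn (hypothetical; the original code for this step is not shown):

# Hypothetical sketch: model P(child_obesity == "high") as a function of
# screen time plus the other covariates in the observational sample
import pandas as pd
from sklearn.linear_model import LogisticRegression

X = pd.get_dummies(
    simulated_sample.drop(columns=["child_obesity"]), drop_first=True)
y = (simulated_sample["child_obesity"] == "high").astype(int)
logit = LogisticRegression(max_iter=1000).fit(X, y)
print(dict(zip(X.columns, logit.coef_[0])))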
Example #5
import unittest
from unittest.mock import patch

from pgmpy.factors.discrete import State, TabularCPD
from pgmpy.models import BayesianModel, MarkovModel
from pgmpy.sampling import BayesianModelSampling


class TestBayesianModelSampling(unittest.TestCase):
    def setUp(self):
        self.bayesian_model = BayesianModel([('A', 'J'), ('R', 'J'),
                                             ('J', 'Q'), ('J', 'L'),
                                             ('G', 'L')])
        cpd_a = TabularCPD('A', 2, [[0.2], [0.8]])
        cpd_r = TabularCPD('R', 2, [[0.4], [0.6]])
        cpd_j = TabularCPD('J', 2,
                           [[0.9, 0.6, 0.7, 0.1], [0.1, 0.4, 0.3, 0.9]],
                           ['R', 'A'], [2, 2])
        cpd_q = TabularCPD('Q', 2, [[0.9, 0.2], [0.1, 0.8]], ['J'], [2])
        cpd_l = TabularCPD('L', 2,
                           [[0.9, 0.45, 0.8, 0.1], [0.1, 0.55, 0.2, 0.9]],
                           ['G', 'J'], [2, 2])
        cpd_g = TabularCPD('G', 2, [[0.6], [0.4]])
        self.bayesian_model.add_cpds(cpd_a, cpd_g, cpd_j, cpd_l, cpd_q, cpd_r)
        self.sampling_inference = BayesianModelSampling(self.bayesian_model)
        self.markov_model = MarkovModel()

    def test_init(self):
        with self.assertRaises(TypeError):
            BayesianModelSampling(self.markov_model)

    def test_forward_sample(self):
        sample = self.sampling_inference.forward_sample(25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 6)
        self.assertIn('A', sample.columns)
        self.assertIn('J', sample.columns)
        self.assertIn('R', sample.columns)
        self.assertIn('Q', sample.columns)
        self.assertIn('G', sample.columns)
        self.assertIn('L', sample.columns)
        self.assertTrue(set(sample.A).issubset({0, 1}))
        self.assertTrue(set(sample.J).issubset({0, 1}))
        self.assertTrue(set(sample.R).issubset({0, 1}))
        self.assertTrue(set(sample.Q).issubset({0, 1}))
        self.assertTrue(set(sample.G).issubset({0, 1}))
        self.assertTrue(set(sample.L).issubset({0, 1}))

    def test_rejection_sample_basic(self):
        sample = self.sampling_inference.rejection_sample(
            [State('A', 1), State('J', 1),
             State('R', 1)], 25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 6)
        self.assertIn('A', sample.columns)
        self.assertIn('J', sample.columns)
        self.assertIn('R', sample.columns)
        self.assertIn('Q', sample.columns)
        self.assertIn('G', sample.columns)
        self.assertIn('L', sample.columns)
        self.assertTrue(set(sample.A).issubset({1}))
        self.assertTrue(set(sample.J).issubset({1}))
        self.assertTrue(set(sample.R).issubset({1}))
        self.assertTrue(set(sample.Q).issubset({0, 1}))
        self.assertTrue(set(sample.G).issubset({0, 1}))
        self.assertTrue(set(sample.L).issubset({0, 1}))

    @patch("pgmpy.sampling.BayesianModelSampling.forward_sample",
           autospec=True)
    def test_rejection_sample_less_arg(self, forward_sample):
        sample = self.sampling_inference.rejection_sample(size=5)
        forward_sample.assert_called_once_with(self.sampling_inference, 5)
        self.assertEqual(sample, forward_sample.return_value)

    def test_likelihood_weighted_sample(self):
        sample = self.sampling_inference.likelihood_weighted_sample(
            [State('A', 0), State('J', 1),
             State('R', 0)], 25)
        self.assertEqual(len(sample), 25)
        self.assertEqual(len(sample.columns), 7)
        self.assertIn('A', sample.columns)
        self.assertIn('J', sample.columns)
        self.assertIn('R', sample.columns)
        self.assertIn('Q', sample.columns)
        self.assertIn('G', sample.columns)
        self.assertIn('L', sample.columns)
        self.assertIn('_weight', sample.columns)
        self.assertTrue(set(sample.A).issubset({0, 1}))
        self.assertTrue(set(sample.J).issubset({0, 1}))
        self.assertTrue(set(sample.R).issubset({0, 1}))
        self.assertTrue(set(sample.Q).issubset({0, 1}))
        self.assertTrue(set(sample.G).issubset({0, 1}))
        self.assertTrue(set(sample.L).issubset({0, 1}))

    def tearDown(self):
        del self.sampling_inference
        del self.bayesian_model
        del self.markov_model
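Adding the standard unittest entry point lets the suite run directly as a script:

if __name__ == '__main__':
    unittest.main(verbosity=2)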
Example #6
from pgmpy.estimators import BayesianEstimator
from pgmpy.factors.discrete import State, TabularCPD
from pgmpy.inference import VariableElimination
from pgmpy.models import BayesianModel
from pgmpy.sampling import BayesianModelSampling


def bayesian_net():
    alarm_model = BayesianModel([
        ('Burglary',
         'Alarm'),  # Alarm has two parents, so it appears as a child twice
        ('Earthquake', 'Alarm'),
        ('Alarm', 'JohnCalls'),
        ('Alarm', 'MaryCalls')
    ])
    for i in alarm_model.get_parents('Alarm'):
        print(i)

    # variable_card indicates the number of possible values this variable can take.

    cpd_burglary = TabularCPD(
        variable='Burglary',
        variable_card=2,  # 0->True, 1->False
        values=[
            [0.001],  # true probabilities of the table
            [0.999]
        ])  # false probabilities of the table
    cpd_earthquake = TabularCPD(
        variable='Earthquake',
        variable_card=2,  # 0->True 1->False
        values=[
            [0.002],  # true probabilities of the table
            [0.998]
        ])  # false probabilities of the table

    # evidence_card indicates the number of possible values the parents of the variable can take

    cpd_alarm = TabularCPD(
        variable='Alarm',
        variable_card=2,  # 0->True 1->False
        values=[
            [0.95, 0.94, 0.29, 0.001],  # true probabilities of the table
            [0.05, 0.06, 0.71, 0.999]
        ],  # false probabilities of the table
        evidence=['Burglary', 'Earthquake'],
        evidence_card=[2, 2])
    cpd_john_calls = TabularCPD(
        variable='JohnCalls',
        variable_card=2,  # 0->True 1->False
        values=[[0.95, 0.05], [0.05, 0.95]],
        evidence=['Alarm'],
        evidence_card=[2])

    cpd_mary_calls = TabularCPD(
        variable='MaryCalls',
        variable_card=2,  # 0->True 1->False
        values=[
            [0.7, 0.1],  # true probabilities of the table
            [0.3, 0.9]
        ],  # false probabilities of the table
        evidence=['Alarm'],
        evidence_card=[2])
    for i in [
            cpd_burglary, cpd_earthquake, cpd_alarm, cpd_john_calls,
            cpd_mary_calls
    ]:
        print(i)

    alarm_model.add_cpds(cpd_burglary, cpd_earthquake, cpd_alarm,
                         cpd_john_calls, cpd_mary_calls)
    alarm_model.check_model()

    infer = VariableElimination(alarm_model)

    # Uncomment to obtain the result before normalization
    # infer = SimpleInference(alarm_model)

    print(
        infer.query(
            ['JohnCalls'],
            evidence={
                'Burglary': 1,
                'Earthquake': 1,
                'Alarm': 0,
                'MaryCalls': 0
            },
        )['JohnCalls'])

    print(
        infer.query(['Burglary'], evidence={
            'JohnCalls': 0,
            'MaryCalls': 0
        })['Burglary'])

    # Variable order can be specified if necessary

    print(
        infer.query(['Burglary'],
                    evidence={
                        'JohnCalls': 0,
                        'MaryCalls': 0
                    },
                    elimination_order=['Alarm', 'Earthquake'])['Burglary'])

    sampling = BayesianModelSampling(alarm_model)
    data = sampling.rejection_sample(evidence=[],
                                     size=20,
                                     return_type="dataframe")
    print(data)

    data = sampling.rejection_sample(evidence=[State('JohnCalls', 0),
                                               State('MaryCalls', 0)],
                                     size=20,
                                     return_type='dataframe')
    print(data)

    data = sampling.rejection_sample(evidence=[],
                                     size=5000,
                                     return_type="dataframe")
    approx_alarm_model = BayesianModel([('Burglary', 'Alarm'),
                                        ('Earthquake', 'Alarm'),
                                        ('Alarm', 'JohnCalls'),
                                        ('Alarm', 'MaryCalls')])
    approx_alarm_model.fit(data, estimator=BayesianEstimator)
    approx_alarm_model.check_model()

    for cpd in approx_alarm_model.get_cpds():
        print("CPD of {variable}:".format(variable=cpd.variable))
        print(cpd)

    infer = VariableElimination(approx_alarm_model)

    print(
        infer.query(
            ['JohnCalls'],
            evidence={
                'Burglary': 1,
                'Earthquake': 1,
                'Alarm': 0,
                'MaryCalls': 0
            },
        )['JohnCalls'])

    print(
        infer.query(['Burglary'], evidence={
            'JohnCalls': 0,
            'MaryCalls': 0
        })['Burglary'])

    print(
        alarm_model.predict_probability(
            data[['Burglary', 'Earthquake', 'Alarm', 'JohnCalls']]))
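Calling the function directly exercises the whole pipeline (note that SimpleInference, referenced in the commented-out line above, is a custom class from the original project):

if __name__ == '__main__':
    bayesian_net()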