Beispiel #1
0
 def test_state_count(self):
     """State counts of the d1 fixture match the known tallies."""
     estimator = BaseEstimator(self.d1)
     unconditional = estimator.state_counts("A").values.tolist()
     self.assertEqual(unconditional, [[2], [1]])
     conditional = estimator.state_counts("C", ["A", "B"]).values.tolist()
     self.assertEqual(conditional, [[0.0, 0.0, 1.0, 0.0], [1.0, 1.0, 0.0, 0.0]])
    def test_test_conditional_independence(self):
        """Chi-square CI test separates independent from dependent pairs.

        Fix: seed numpy's RNG so the sampled data -- and therefore the
        p-values -- are reproducible.  The original drew fresh random data
        on every run, making the statistical assertions flaky.
        """
        np.random.seed(0)
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 4)), columns=list('ABCD'))
        data['E'] = data['A'] + data['B'] + data['C']
        est = BaseEstimator(data)

        # A and C are sampled independently: do not reject independence.
        self.assertGreater(est.test_conditional_independence('A', 'C')[1], 0.01)  # independent
        self.assertGreater(est.test_conditional_independence('A', 'B', 'D')[1], 0.01)  # independent
        # Conditioning on E = A + B + C induces dependence between A and B.
        self.assertLess(est.test_conditional_independence('A', 'B', ['D', 'E'])[1], 0.01)  # dependent
 def test_missing_data(self):
     """State counting with NaNs, with and without row-wise completion."""
     est = BaseEstimator(self.d2, state_names={'C': [0, 1]}, complete_samples_only=False)
     # Restricting to fully observed rows leaves no usable samples for A.
     strict = est.state_counts('A', complete_samples_only=True)
     self.assertEqual(strict.values.tolist(), [[0], [0]])
     self.assertEqual(est.state_counts('A').values.tolist(), [[1], [1]])
     strict_c = est.state_counts('C', parents=['A', 'B'], complete_samples_only=True)
     self.assertEqual(strict_c.values.tolist(), [[0, 0, 0, 0], [0, 0, 0, 0]])
     lenient_c = est.state_counts('C', parents=['A', 'B'])
     self.assertEqual(lenient_c.values.tolist(), [[0, 0, 0, 0], [1, 0, 0, 0]])
    def test_test_conditional_independence_titanic(self):
        """Chi-square CI results on the Titanic data match recorded values."""
        est = BaseEstimator(self.titanic_data)

        result = est.test_conditional_independence('Embarked', 'Sex')
        np.testing.assert_almost_equal(
            result, (13.355630515001746, 0.020264556044311655, True))

        result = est.test_conditional_independence('Pclass', 'Survived', ['Embarked'])
        np.testing.assert_almost_equal(
            result, (96.403283942888635, 4.1082315854166553e-13, True))

        result = est.test_conditional_independence('Embarked', 'Survived', ["Sex", "Pclass"])
        np.testing.assert_almost_equal(
            result, (21.537481934494085, 0.96380273702382602, True))
Beispiel #5
0
    def test_test_conditional_independence_titanic(self):
        """Independence decisions on the Titanic data."""
        est = BaseEstimator(self.titanic_data)

        independent = est.test_conditional_independence("Embarked", "Sex")
        self.assertTrue(independent)

        dependent = est.test_conditional_independence("Pclass", "Survived",
                                                      ["Embarked"])
        self.assertFalse(dependent)

        cond_independent = est.test_conditional_independence(
            "Embarked", "Survived", ["Sex", "Pclass"])
        self.assertTrue(cond_independent)
Beispiel #6
0
    def test_test_conditional_independence_titanic(self):
        """Chi-square CI statistics on the Titanic data match recorded values.

        Fix: the original compared floating-point chi2 statistics and
        p-values with assertEqual; exact float equality is fragile across
        platforms and library versions.  Compare element-wise with
        assertAlmostEqual instead, matching the np.testing-based variant
        of this test elsewhere in the file.
        """
        est = BaseEstimator(self.titanic_data)

        chi2, p_value, sufficient = est.test_conditional_independence('Embarked', 'Sex')
        self.assertAlmostEqual(chi2, 13.355630515001746)
        self.assertAlmostEqual(p_value, 0.020264556044311655)
        self.assertTrue(sufficient)

        chi2, p_value, sufficient = est.test_conditional_independence(
            'Pclass', 'Survived', ['Embarked'])
        self.assertAlmostEqual(chi2, 96.403283942888635)
        self.assertAlmostEqual(p_value, 4.1082315854166553e-13)
        self.assertTrue(sufficient)

        chi2, p_value, sufficient = est.test_conditional_independence(
            'Embarked', 'Survived', ["Sex", "Pclass"])
        self.assertAlmostEqual(chi2, 21.537481934494085)
        self.assertAlmostEqual(p_value, 0.96380273702382602)
        self.assertTrue(sufficient)
Beispiel #7
0
    def test_test_conditional_independence(self):
        """CI decisions: sampled-independent pairs pass, induced dependence fails.

        Fix: seed numpy's RNG so the sampled data is reproducible; the
        original drew fresh random data on every run, so the statistical
        decisions could flake.
        """
        np.random.seed(0)
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 4)),
                            columns=list("ABCD"))
        data["E"] = data["A"] + data["B"] + data["C"]
        est = BaseEstimator(data)

        # A and C are sampled independently: accept independence.
        self.assertTrue(est.test_conditional_independence("A",
                                                          "C"))  # independent
        self.assertTrue(est.test_conditional_independence("A", "B",
                                                          "D"))  # independent
        # E = A + B + C makes A and B dependent once E is conditioned on.
        self.assertFalse(
            est.test_conditional_independence("A", "B",
                                              ["D", "E"]))  # dependent
Beispiel #8
0
    def test_test_conditional_independence(self):
        """p-values of the chi-square CI test separate independent from
        dependent variable pairs.

        Fix: seed numpy's RNG so the sampled data -- and the resulting
        p-values -- are reproducible; the original drew fresh random data
        on every run, making the statistical assertions flaky.
        """
        np.random.seed(0)
        data = pd.DataFrame(np.random.randint(0, 2, size=(1000, 4)),
                            columns=list('ABCD'))
        data['E'] = data['A'] + data['B'] + data['C']
        est = BaseEstimator(data)

        # A and C are sampled independently: do not reject independence.
        self.assertGreater(
            est.test_conditional_independence('A', 'C')[1],
            0.01)  # independent
        self.assertGreater(
            est.test_conditional_independence('A', 'B', 'D')[1],
            0.01)  # independent
        # Conditioning on E = A + B + C induces dependence between A and B.
        self.assertLess(
            est.test_conditional_independence('A', 'B', ['D', 'E'])[1],
            0.01)  # dependent
Beispiel #9
0
 def test_missing_data(self):
     """Counting states with NaNs: drop incomplete rows only when asked."""
     est = BaseEstimator(self.d2,
                         state_names={'C': [0, 1]},
                         complete_samples_only=False)
     # No row of d2 is fully observed, so complete-sample counting is empty.
     strict_a = est.state_counts('A', complete_samples_only=True)
     self.assertEqual(strict_a.values.tolist(), [[0], [0]])
     self.assertEqual(est.state_counts('A').values.tolist(), [[1], [1]])
     strict_c = est.state_counts('C', parents=['A', 'B'],
                                 complete_samples_only=True)
     self.assertEqual(strict_c.values.tolist(),
                      [[0, 0, 0, 0], [0, 0, 0, 0]])
     lenient_c = est.state_counts('C', parents=['A', 'B'])
     self.assertEqual(lenient_c.values.tolist(),
                      [[0, 0, 0, 0], [1, 0, 0, 0]])
Beispiel #10
0
 def test_missing_data(self):
     """State counts respect complete_samples_only both at construction and per call."""
     est = BaseEstimator(self.d2,
                         state_names={"C": [0, 1]},
                         complete_samples_only=False)
     # With only complete samples allowed, nothing is counted for A.
     counts = est.state_counts("A", complete_samples_only=True).values.tolist()
     self.assertEqual(counts, [[0], [0]])
     counts = est.state_counts("A").values.tolist()
     self.assertEqual(counts, [[1], [1]])
     counts = est.state_counts("C", parents=["A", "B"],
                               complete_samples_only=True).values.tolist()
     self.assertEqual(counts, [[0, 0, 0, 0], [0, 0, 0, 0]])
     counts = est.state_counts("C", parents=["A", "B"]).values.tolist()
     self.assertEqual(counts, [[0, 0, 0, 0], [1, 0, 0, 0]])
Beispiel #11
0
    def process(self):
        """Fit a three-time-frame Bayesian network to the CSV data in
        segments, infer distributions for all unobserved nodes, print
        expected values and variances, and plot the result.

        NOTE(review): communicates through module-level globals
        (pr, pr2, pr3, nodes, Distribution) shared with the nested
        helpers below -- the method is not reentrant; confirm no two
        calls can overlap.
        """
        def calculate_distribution_nodes_input():
            """Turn each observed node in `pr` into a one-hot distribution
            and remove it (and its time-frame-2/3 copies) from the query
            lists so it is treated as evidence, not queried."""
            for key in pr.keys():
                # One-hot over 5 states: 1 at the observed value, 0 elsewhere.
                # Assumes pr[key] is an integer in range(5) -- TODO confirm.
                Distribution[key] = np.array(
                    [1 - abs(np.sign(pr[key] - i)) for i in range(5)])
                Distribution[key + '2'] = Distribution[key]
                Distribution[key + '3'] = Distribution[key]
                nodes.remove(key)
                nodes2.remove(key + '2')
                nodes3.remove(key + '3')

        def query_time_frame_1(infer):
            """Query time-frame-1 nodes with `pr` as evidence and collect
            the resulting per-segment distributions."""
            print('query 1', pr, nodes)
            query = infer.query(nodes, evidence=pr)
            for key, value in query.items():
                Distribution[key].append(value.values)

        def query_time_frame_2(infer):
            """Query time-frame-2 nodes, copying the evidence from `pr`
            onto the '2'-suffixed node names."""
            global pr2
            for key, value in pr.items():
                pr2[key + '2'] = pr[key]

            print('query 2', pr2, nodes2)
            query = infer.query(nodes2, evidence=pr2)
            for key, value in query.items():
                Distribution[key].append(value.values)

        def query_time_frame_3(infer):
            """Query time-frame-3 nodes, copying the evidence from `pr`
            onto the '3'-suffixed node names."""
            global pr3
            for key, value in pr.items():
                pr3[key + '3'] = pr[key]
            print('query 3', pr3, nodes3)
            query = infer.query(nodes3, evidence=pr3)
            for key, value in query.items():
                Distribution[key].append(value.values)

        def stretch_distributions(max_value_di):
            """Extend each node's state axis (and pad its distribution
            with zeros) so all plotted axes reach max_value_di + 1."""
            # sketch number axis with max values = max values DI + 1
            remove_nodes = [
                'DPQ', 'C', 'TQ', 'OU', 'DPQ2', 'C2', 'TQ2', 'OU2', 'DPQ3',
                'C3', 'TQ3', 'OU3'
            ]
            for key in [
                    x for x in list(nodes + nodes2 + nodes3)
                    if x not in remove_nodes
            ]:
                # state_names[key] is assumed sorted ascending (see caller).
                if self.state_names[key][-1] == max_value_di:
                    self.state_names[key].append(max_value_di + 1)
                    Distribution[key] = np.append(Distribution[key], [0])
                elif self.state_names[key][-1] < max_value_di:
                    self.state_names[key].extend(
                        [self.state_names[key][-1] + 1, max_value_di + 1])
                    Distribution[key] = np.append(Distribution[key], [0, 0])

        def process_segments(size):
            """Fit the model on `size`-row segments of the data, query all
            three time frames per segment, then average the per-segment
            distributions (last, partial segment weighted by its share)."""
            global Distribution
            loop = int(np.ceil(float(self.data_size) / size))
            last_size = self.data_size - size * (loop - 1)
            print('size: ', size, ' | last_size ', last_size, ' | loop: ',
                  loop)
            for i in range(loop):
                print('process: ', i)
                # NOTE(review): .loc slicing is end-inclusive, so adjacent
                # segments appear to share one boundary row -- confirm
                # whether that overlap is intended.
                self.model.fit(self.data.loc[i * size:(i + 1) * size],
                               estimator_type=BayesianEstimator,
                               prior_type="BDeu",
                               equivalent_sample_size=1,
                               state_names=self.state_names)
                infer = VariableElimination(self.model)
                query_time_frame_1(infer)
                query_time_frame_2(infer)
                query_time_frame_3(infer)

            # Combine per-segment distributions into one weighted average:
            # full segments count fully, the last one by last_size / size.
            for node in list(nodes + nodes2 + nodes3):
                temp = [0] * len(self.state_names[node])
                length_distribution = len(Distribution[node])
                length_state_name = len(self.state_names[node])
                # Sum all segments except the last...
                for distribution_index in range(0, length_distribution - 1):
                    for value_distr_index in range(length_state_name):
                        temp[value_distr_index] += Distribution[node][
                            distribution_index][value_distr_index]
                # ...then add the last segment scaled by its relative size.
                percent = float(last_size) / size
                for value_distr_index in range(length_state_name):
                    temp[value_distr_index] += (
                        Distribution[node][-1][value_distr_index] * percent)
                Distribution[node] = [x * size / self.data_size for x in temp]

        def calculate_expected_value_and_variance():
            """Print E(X) and Var(X) for every queried node, using the
            node's state values as the support of its distribution."""
            print('Expected value and Variance: ')
            expected_value = {}
            variance = {}
            for key in list(nodes + nodes2 + nodes3):
                expected_value[key] = sum([
                    value * prob for value, prob in zip(
                        self.state_names[key], Distribution[key])
                ])
                # https://en.wikipedia.org/wiki/Variance#Discrete_random_variable
                variance[key] = sum([
                    prob * (value - expected_value[key]) *
                    (value - expected_value[key]) for prob, value in zip(
                        Distribution[key], self.state_names[key])
                ])
                print('E(', key, ') = ', expected_value[key], ' | Var(', key,
                      ') = ', variance[key])

        # Shared state for the nested helpers above.
        global nodes
        global Distribution
        global pr
        global pr2
        global pr3
        pr = self.process_box()  # observed evidence; presumably {node: value}
        pr2 = {}
        pr3 = {}
        nodes = ['DPQ', 'C', 'TQ', 'DI', 'DFT', 'RD', 'OU', 'DFO']
        nodes2 = ['DPQ2', 'C2', 'TQ2', 'DI2', 'DFT2', 'RD2', 'OU2', 'DFO2']
        nodes3 = ['DPQ3', 'C3', 'TQ3', 'DI3', 'DFT3', 'RD3', 'OU3', 'DFO3']
        Distribution = {}

        # Reload data and rebuild the model only when the file changed.
        if self.history_file != self.file_path:
            self.data = pd.read_csv(self.file_path)  # "fisrm.csv"
            self.data_size = len(self.data)
            self.history_file = self.file_path

            # Same DAG repeated across three time frames; frame k feeds
            # frame k+1 through the DPQ/C/TQ/OU copies and RD -> DI(k+1).
            self.model = BayesianModel([('TQ', 'DFT'), ('DPQ', 'DI'),
                                        ('C', 'DI'), ('DI', 'DFT'),
                                        ('DI', 'RD'), ('DFT', 'RD'),
                                        ('RD', 'DFO'), ('OU', 'DFO'),
                                        ('DPQ', 'DPQ2'), ('C', 'C2'),
                                        ('TQ', 'TQ2'), ('OU', 'OU2'),
                                        ('RD', 'DI2'), ('DI2', 'DFT2'),
                                        ('DI2', 'RD2'), ('DFT2', 'RD2'),
                                        ('RD2', 'DFO2'), ('OU2', 'DFO2'),
                                        ('DPQ2', 'DPQ3'), ('C2', 'C3'),
                                        ('TQ2', 'TQ3'), ('OU2', 'OU3'),
                                        ('RD2', 'DI3'), ('DI3', 'DFT3'),
                                        ('DI3', 'RD3'), ('DFT3', 'RD3'),
                                        ('RD3', 'DFO3'), ('OU3', 'DFO3')])
        self.state_names = BaseEstimator(self.data).state_names

        calculate_distribution_nodes_input()

        # Per-segment results are appended here for every unobserved node.
        for node in nodes:
            Distribution[node] = []
            Distribution[node + '2'] = []
            Distribution[node + '3'] = []

        # Segment size grows with the amount of evidence supplied.
        # NOTE(review): no else branch -- if more than 4 evidence nodes are
        # given, `size` is unbound and process_segments raises; confirm the
        # UI limits evidence to at most 4 nodes.
        length = len(pr.keys())
        if length == 0:
            size = 40
        elif length == 1:
            size = 150
        elif length == 2:
            size = 500
        elif length == 3:
            size = 1500
        elif length == 4:
            size = 5000
        process_segments(size)

        calculate_expected_value_and_variance()
        # Draw
        max_value_di = self.state_names['DI'][-1]  # array has been sorted
        stretch_distributions(max_value_di)

        self.draw_subplots(Distribution, max_value_di)
        plt.show()
Beispiel #12
0
 def test_state_count(self):
     """State counts for the d1 fixture match the known tallies."""
     est = BaseEstimator(self.d1)
     self.assertEqual(est.state_counts('A').values.tolist(), [[2], [1]])
     counts = est.state_counts('C', ['A', 'B']).values.tolist()
     self.assertEqual(counts, [[0., 0., 1., 0.], [1., 1., 0., 0.]])
 def test_state_count(self):
     """Unconditional and conditional state counts on the d1 fixture."""
     estimator = BaseEstimator(self.d1)
     unconditional = estimator.state_counts('A').values.tolist()
     self.assertEqual(unconditional, [[2], [1]])
     conditional = estimator.state_counts('C', ['A', 'B']).values.tolist()
     self.assertEqual(conditional, [[0., 0., 1., 0.], [1., 1., 0., 0.]])