Example #1
0
    def test_transform(self):
        """Continuous columns are mapped onto discrete bin indices."""
        # Setup: two evenly spaced integer columns, discretized into two bins.
        values = pd.DataFrame({
            'A': list(range(10)),
            'B': list(range(0, 20, 2)),
        }).values
        lower_half = [[0, 0]] * 5
        upper_half = [[1, 1]] * 5
        expected = np.array(lower_half + upper_half)

        transformer = DiscretizeTransformer(n_bins=2)
        transformer.fit(values)

        # Run
        binned = transformer.transform(values)

        # Check
        np.testing.assert_equal(binned, expected)
Example #2
0
    def test___init__(self):
        """The constructor stores ``n_bins`` and leaves the other attributes as None."""
        # Run
        transformer = DiscretizeTransformer(n_bins=5)

        # Check
        assert transformer.n_bins == 5
        for attribute in ('meta', 'column_index', 'discretizer'):
            assert getattr(transformer, attribute) is None
Example #3
0
    def test_fit(self, kbins_mock):
        """Fit records per-column metadata and fits the discretizer exactly once."""
        # Setup
        values = pd.DataFrame({
            'A': [1 / (i + 1) for i in range(10)],
            'B': list(range(10)),
        }).values
        transformer = DiscretizeTransformer(n_bins=2)
        fitted_discretizer = kbins_mock.return_value

        # Run
        transformer.fit(values, [], [])

        # Check: attributes populated by fit.
        expected_meta = [
            {'name': 0, 'type': 'continuous', 'min': 0.1, 'max': 1.0},
            {'name': 1, 'type': 'continuous', 'min': 0.0, 'max': 9.0},
        ]
        assert transformer.column_index == [0, 1]
        assert transformer.discretizer == fitted_discretizer
        assert transformer.meta == expected_meta

        # Check: the discretizer was built with the expected configuration.
        kbins_mock.assert_called_once_with(
            n_bins=2, encode='ordinal', strategy='uniform')

        # Check: it was fitted once, positionally, with the input array.
        fit_calls = fitted_discretizer.fit.call_args_list
        assert len(fit_calls) == 1
        positional, keyword = fit_calls[0]
        assert keyword == {}
        assert len(positional) == 1
        np.testing.assert_equal(positional[0], values)
Example #4
0
    def test_inverse_transform(self):
        """Discrete bin indices are mapped back into the original value space."""
        # Setup
        values = pd.DataFrame({
            'A': [1 / (i + 1) for i in range(10)],
            'B': list(range(10)),
        }).values
        transformer = DiscretizeTransformer(n_bins=2)
        transformer.fit(values)
        binned = transformer.transform(values)

        # Only the first 'A' value lands in the upper bin; 'B' splits evenly.
        expected = pd.DataFrame({
            'A': [0.775] + [0.325] * 9,
            'B': [2.25] * 5 + [6.75] * 5,
        })

        # Run
        recovered = transformer.inverse_transform(binned)

        # Check
        np.testing.assert_allclose(recovered, expected)
Example #5
0
class CLBN(LegacySingleTableBaseline):
    """Chow-Liu Bayesian network synthesizer.

    The input table is discretized with a ``DiscretizeTransformer``, a
    Bayesian network is learned over the discretized table with the
    ``'chow-liu'`` algorithm, and rows sampled from the network are mapped
    back through the transformer's ``inverse_transform``.
    """

    def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
        """Fit the discretizer and learn the Bayesian network.

        Args:
            data: Table to learn from; passed as-is to the discretizer.
            categorical_columns: Forwarded to ``DiscretizeTransformer.fit``.
            ordinal_columns: Forwarded to ``DiscretizeTransformer.fit``.
        """
        self.discretizer = DiscretizeTransformer(n_bins=15)
        self.discretizer.fit(data, categorical_columns, ordinal_columns)
        discretized_data = self.discretizer.transform(data)
        self.model = BayesianNetwork.from_samples(discretized_data,
                                                  algorithm='chow-liu')

    @staticmethod
    def _processing_order(nodes_parents):
        """Return node indices ordered so each node follows all its parents."""
        order = []
        placed = set()  # mirrors ``order`` for O(1) membership tests

        while len(order) != len(nodes_parents):
            progressed = False

            for node, parents in enumerate(nodes_parents):
                if node in placed:
                    continue

                if all(parent in placed for parent in parents):
                    order.append(node)
                    placed.add(node)
                    progressed = True

            if not progressed:
                # Raised explicitly rather than via ``assert`` so the guard
                # survives ``python -O``; without it a cyclic structure would
                # spin in this loop forever.
                raise AssertionError(
                    'No node could be scheduled; the structure is not acyclic.')

        return order

    @staticmethod
    def _conditional_table(distribution, output_size):
        """Map each parent-value tuple to a probability vector over outputs."""
        rows = json.loads(distribution.to_json())['table']
        table = {}

        for row in rows:
            # Each row is laid out as: parent values..., output value, probability.
            key = tuple(np.asarray(row[:-2], dtype='int'))
            output = int(row[-2])
            probability = float(row[-1])

            if key not in table:
                table[key] = np.zeros(output_size)

            table[key][output] = probability

        return table

    def bn_sample(self, num_samples):
        """Sample from the bayesian network.

        Nodes are sampled parents-first, so the values a node is conditioned
        on are already present in the output when that node is drawn.

        Args:
            num_samples(int): Number of samples to generate.

        Returns:
            numpy.ndarray: ``int32`` array of shape
            ``(num_samples, number of nodes)`` holding the sampled values.

        Raises:
            AssertionError: If the network structure cannot be ordered
                parents-first, or a node's distribution is neither a
                ``DiscreteDistribution`` nor a ``ConditionalProbabilityTable``.
        """
        nodes_parents = self.model.structure
        processing_order = self._processing_order(nodes_parents)

        data = np.zeros((num_samples, len(nodes_parents)), dtype='int32')
        for current in processing_order:
            distribution = self.model.states[current].distribution
            if isinstance(distribution, DiscreteDistribution):
                data[:, current] = distribution.sample(num_samples)
                continue

            assert isinstance(distribution, ConditionalProbabilityTable)
            output_size = max(int(key) for key in distribution.keys()) + 1
            table = self._conditional_table(distribution, output_size)

            outcomes = np.arange(output_size)  # loop-invariant, built once
            conds = data[:, nodes_parents[current]]
            for row_index, cond in enumerate(conds):
                data[row_index, current] = np.random.choice(
                    outcomes, p=table[tuple(cond)])

        return data

    def sample(self, num_samples):
        """Generate ``num_samples`` rows in the original data space.

        Args:
            num_samples(int): Number of rows to generate.

        Returns:
            The network samples after ``DiscretizeTransformer.inverse_transform``.
        """
        data = self.bn_sample(num_samples)
        return self.discretizer.inverse_transform(data)
Example #6
0
 def fit(self, data, categorical_columns=tuple(), ordinal_columns=tuple()):
     """Discretize ``data`` into 15 bins and learn a Chow-Liu Bayesian network.

     The fitted transformer is stored on ``self.discretizer`` and the learned
     network on ``self.model``. ``categorical_columns`` and
     ``ordinal_columns`` are forwarded to the transformer's ``fit``.
     """
     transformer = DiscretizeTransformer(n_bins=15)
     self.discretizer = transformer
     transformer.fit(data, categorical_columns, ordinal_columns)
     self.model = BayesianNetwork.from_samples(
         transformer.transform(data), algorithm='chow-liu')