Beispiel #1
0
    def test_normalize_feature_map_enum(self):
        """Check normalize_feature_map output for two ENUM features and one
        CONTINUOUS feature.

        Each ENUM feature is expected to come back as a one-hot matrix whose
        columns follow the order of the value list handed to
        NormalizationParameters; a MISSING_VALUE entry is expected to become
        an all-zero row. The CONTINUOUS feature is expected to come back with
        its input values unchanged, as a single-row 2-D array.
        """
        enum_a, continuous, enum_b = 'f1', 'f2', 'f3'

        # Positional arguments are passed through exactly as given; the field
        # names of NormalizationParameters are not visible in this file.
        normalization_parameters = {
            enum_a: NormalizationParameters(
                identify_types.ENUM, None, None, None, None, [12.0, 4.2, 2.1]
            ),
            continuous: NormalizationParameters(
                identify_types.CONTINUOUS, None, 0, 0, 1, None
            ),
            enum_b: NormalizationParameters(
                identify_types.ENUM, None, None, None, None, [15.1, -3.2]
            ),
        }

        feature_value_map = {
            enum_a: np.array([2.1, 4.2, 12.0, 12.0], dtype=np.float32),
            continuous: np.array([1.9, 2.2, 5.0, 1.0], dtype=np.float32),
            enum_b: np.array(
                [-3.2, -3.2, 15.1, normalization.MISSING_VALUE],
                dtype=np.float32
            ),
        }

        features = list(feature_value_map.keys())
        norm_net = core.Net("net")
        blobname_template = '{}_blob'
        blob_map = prepare_normalization(
            norm_net, normalization_parameters, features, blobname_template,
            False
        )
        normalized_features = normalize_feature_map(
            feature_value_map, norm_net, features, blob_map, blobname_template
        )

        # No output may contain NaN or infinity.
        for normalized in normalized_features.values():
            self.assertTrue(np.all(np.isfinite(normalized)))

        # (feature name, expected normalized output), checked in this order.
        expectations = [
            (
                enum_a,
                np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 0, 0]]),
            ),
            (
                continuous,
                np.array([[1.9, 2.2, 5.0, 1.0]], dtype=np.float32),
            ),
            (
                enum_b,
                np.array([
                    [0, 1],
                    [0, 1],
                    [1, 0],
                    [0, 0]  # Missing value should go to all 0
                ]),
            ),
        ]
        for name, expected in expectations:
            np.testing.assert_array_equal(expected, normalized_features[name])
Beispiel #2
0
    def test_prepare_normalization_and_normalize(self):
        """Normalize the data from preprocessing_util.read_data() end to end
        and check per-feature-type invariants on the result.

        Steps: identify feature types, derive normalization parameters from
        them, build the normalization net, run normalize_feature_map, then
        verify:

        * every parameter has a finite mean and stddev;
        * every normalized value is finite;
        * PROBABILITY features map strictly inside (0, 1) under a sigmoid;
        * ENUM features are one-hot rows matching the original value;
        * all other features have (approximately) zero mean and a stddev of
          one or zero, unless identified as BINARY, and have a Box-Cox lambda
          exactly when identified as CONTINUOUS.
        """
        feature_value_map = preprocessing_util.read_data()

        types = identify_types.identify_types(feature_value_map)
        types_dict = identify_types.identify_types_dict(feature_value_map)
        normalization_parameters = normalization.identify_parameters(
            feature_value_map, types_dict)

        features = list(feature_value_map.keys())
        norm_net = core.Net("net")
        blobname_template = '{}_blob'
        blob_map = prepare_normalization(norm_net, normalization_parameters,
                                         features, blobname_template, False)

        normalized_features = normalize_feature_map(feature_value_map,
                                                    norm_net, features,
                                                    blob_map,
                                                    blobname_template)

        # Checked per parameter so a failure names the offending feature.
        for name, parameter in six.iteritems(normalization_parameters):
            self.assertTrue(
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean),
                'non-finite stddev/mean for feature {}: {}'.format(
                    name, parameter))

        for k, v in six.iteritems(normalized_features):
            self.assertTrue(
                np.all(np.isfinite(v)),
                'normalized feature {} has non-finite values'.format(k))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(np.greater(sigmoidv, 0),
                                       np.less(sigmoidv, 1))),
                    'sigmoid of feature {} leaves (0, 1)'.format(k))
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                # Column index expected to be hot for each raw value.
                possible_value_map = {
                    possible_value: i
                    for i, possible_value in enumerate(possible_values)
                }

                # Row counts were asserted equal above, so zip drops nothing.
                for original_feature, row in zip(feature_value_map[k], v):
                    self.assertEqual(possible_value_map[original_feature],
                                     np.where(row == 1)[0][0])
            else:
                # Sample standard deviation, computed once and reused.
                sample_stddev = np.std(v, ddof=1)
                mean = np.mean(v)
                one_stddev = np.isclose(sample_stddev, 1, atol=0.00001)
                zero_stddev = np.isclose(sample_stddev, 0, atol=0.00001)
                zero_mean = np.isclose(mean, 0, atol=0.00001)
                is_binary = types[k] == identify_types.BINARY
                self.assertTrue(
                    np.all(np.logical_or(zero_mean, is_binary)),
                    'mean of feature {} is {}, not 0'.format(k, mean))
                self.assertTrue(
                    np.all(
                        np.logical_or(np.logical_or(one_stddev, zero_stddev),
                                      is_binary)),
                    'stddev of feature {} is {}, not 0 or 1'.format(
                        k, sample_stddev))

                has_boxcox = normalization_parameters[
                    k].boxcox_lambda is not None
                is_ctd = types[k] == identify_types.CONTINUOUS
                # This should be true at the moment
                self.assertEqual(
                    is_ctd, has_boxcox,
                    'feature {}: continuous={} but has_boxcox={}'.format(
                        k, is_ctd, has_boxcox))
    def test_prepare_normalization_and_normalize(self):
        """Identify per-feature normalization parameters for the data from
        preprocessing_util.read_data(), normalize it, and check the result.

        Verifies that:

        * the 'normal' feature is CONTINUOUS without Box-Cox parameters and
          the 'boxcox' feature is CONTINUOUS with them; every other feature
          is identified as the type its key is named after;
        * every parameter has a finite mean and stddev;
        * every normalized value is finite;
        * PROBABILITY features map strictly inside (0, 1) under a sigmoid;
        * ENUM features are one-hot rows matching the original value;
        * QUANTILE features equal the fraction of quantile boundaries at or
          below the original value (to 2 decimal places);
        * CONTINUOUS features have ~zero mean and a stddev of ~one or ~zero.

        Raises:
            NotImplementedError: if a feature type has no check here.
        """
        feature_value_map = preprocessing_util.read_data()

        # NOTE(review): the meaning of the second argument (10) is defined by
        # normalization.identify_parameter, which is not visible here.
        normalization_parameters = {
            name: normalization.identify_parameter(values, 10)
            for name, values in feature_value_map.items()
        }
        for k, v in normalization_parameters.items():
            if k == 'normal':
                self.assertEqual(v.feature_type, 'CONTINUOUS')
                self.assertIsNone(v.boxcox_lambda)
                self.assertIsNone(v.boxcox_shift)
            elif k == 'boxcox':
                self.assertEqual(v.feature_type, 'CONTINUOUS')
                self.assertIsNotNone(v.boxcox_lambda)
                self.assertIsNotNone(v.boxcox_shift)
            else:
                # Remaining features are keyed by their expected type name.
                self.assertEqual(v.feature_type, k)

        features = list(feature_value_map.keys())
        norm_net = core.Net("net")
        blobname_template = '{}_blob'
        blob_map = prepare_normalization(
            norm_net, normalization_parameters, features, blobname_template,
            False
        )

        normalized_features = normalize_feature_map(
            feature_value_map, norm_net, features, blob_map, blobname_template
        )

        # Checked per parameter so a failure names the offending feature.
        for name, parameter in six.iteritems(normalization_parameters):
            self.assertTrue(
                np.isfinite(parameter.stddev) and
                np.isfinite(parameter.mean),
                'non-finite stddev/mean for feature {}: {}'.format(
                    name, parameter
                )
            )

        for k, v in six.iteritems(normalized_features):
            self.assertTrue(
                np.all(np.isfinite(v)),
                'normalized feature {} has non-finite values'.format(k)
            )
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(
                            np.greater(sigmoidv, 0), np.less(sigmoidv, 1)
                        )
                    ),
                    'sigmoid of feature {} leaves (0, 1)'.format(k)
                )
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                # Column index expected to be hot for each raw value.
                possible_value_map = {
                    possible_value: i
                    for i, possible_value in enumerate(possible_values)
                }

                # Row counts were asserted equal above, so zip drops nothing.
                for original_feature, row in zip(feature_value_map[k], v):
                    self.assertEqual(
                        possible_value_map[original_feature],
                        np.where(row == 1)[0][0]
                    )
            elif feature_type == identify_types.QUANTILE:
                quantiles = normalization_parameters[k].quantiles
                num_quantiles = float(len(quantiles))
                for i, feature in enumerate(v[0]):
                    original_feature = feature_value_map[k][i]
                    # Fraction of quantile boundaries at or below the value.
                    reached = sum(
                        1 for quantile in quantiles
                        if original_feature >= quantile
                    )
                    self.assertAlmostEqual(
                        feature, reached / num_quantiles, 2
                    )
            elif feature_type == identify_types.BINARY:
                # Nothing beyond the finiteness check above is verified.
                pass
            elif feature_type == identify_types.CONTINUOUS:
                # Sample standard deviation, computed once and reused.
                sample_stddev = np.std(v, ddof=1)
                mean = np.mean(v)
                one_stddev = np.isclose(sample_stddev, 1, atol=0.01)
                zero_stddev = np.isclose(sample_stddev, 0, atol=0.01)
                zero_mean = np.isclose(mean, 0, atol=0.01)
                self.assertTrue(
                    np.all(zero_mean),
                    'mean of feature {} is {}, not 0'.format(k, mean)
                )
                self.assertTrue(
                    np.all(np.logical_or(one_stddev, zero_stddev)),
                    'stddev of feature {} is {}, not 0 or 1'.format(
                        k, sample_stddev
                    )
                )
            else:
                raise NotImplementedError()