def test_preprocessing_network(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        name_preprocessed_blob_map = {}

        for feature_name, feature_values in feature_value_map.items():
            normalization_parameters[
                feature_name] = normalization.identify_parameter(
                    feature_name,
                    feature_values,
                    feature_type=self._feature_type_override(feature_name),
                )
            feature_values[
                0] = MISSING_VALUE  # Set one entry to MISSING_VALUE to test that

            preprocessor = Preprocessor(
                {feature_name: normalization_parameters[feature_name]}, False)
            feature_values_matrix = torch.from_numpy(
                np.expand_dims(feature_values, -1))
            normalized_feature_values = preprocessor(
                feature_values_matrix,
                (feature_values_matrix != MISSING_VALUE))
            name_preprocessed_blob_map[
                feature_name] = normalized_feature_values.numpy()

        test_features = NumpyFeatureProcessor.preprocess(
            feature_value_map, normalization_parameters)

        for feature_name in feature_value_map:
            normalized_features = name_preprocessed_blob_map[feature_name]
            if feature_name != ENUM_FEATURE_ID:
                normalized_features = np.squeeze(normalized_features, -1)

            tolerance = 0.01
            if feature_name == BOXCOX_FEATURE_ID:
                # At the limit, boxcox has some numerical instability
                tolerance = 0.5
            non_matching = np.where(
                np.logical_not(
                    np.isclose(
                        normalized_features.flatten(),
                        test_features[feature_name].flatten(),
                        rtol=tolerance,
                        atol=tolerance,
                    )))
            self.assertTrue(
                np.all(
                    np.isclose(
                        normalized_features.flatten(),
                        test_features[feature_name].flatten(),
                        rtol=tolerance,
                        atol=tolerance,
                    )),
                "{} does not match: {} \n!=\n {}".format(
                    feature_name,
                    normalized_features.flatten()[non_matching],
                    test_features[feature_name].flatten()[non_matching],
                ),
            )
    def test_type_override(self):
        # Take a feature that should be identified as probability
        feature_value_map = read_data()
        probability_values = feature_value_map[PROBABILITY_FEATURE_ID]

        # And ask for a binary anyways
        parameter = normalization.identify_parameter(
            "_", probability_values, feature_type=identify_types.BINARY)
        self.assertEqual(parameter.feature_type, "BINARY")
Exemple #3
0
    def test_type_override_quantile(self):
        # Take a feature that should be identified as CONTINUOUS
        feature_value_map = read_data()
        probability_values = feature_value_map[BOXCOX_FEATURE_ID]

        # And ask for a QUANTILE anyways
        parameter = normalization.identify_parameter(
            "_", probability_values, feature_type=identify_types.QUANTILE
        )
        self.assertEqual(parameter.feature_type, "QUANTILE")
Exemple #4
0
    def test_type_override_boxcox(self):
        # Take a feature that should be identified as CONTINUOUS
        feature_value_map = read_data()
        probability_values = feature_value_map[CONTINUOUS_FEATURE_ID]

        # And ask for a BOXCOX anyways
        parameter = normalization.identify_parameter(
            "_", probability_values, feature_type=identify_types.BOXCOX
        )
        self.assertEqual(parameter.feature_type, "BOXCOX")
Exemple #5
0
    def test_type_override_continuous(self):
        # Take a feature that should be identified as BOXCOX
        feature_value_map = read_data()
        probability_values = feature_value_map[BOXCOX_FEATURE_ID]

        # And ask for a CONTINUOUS anyways
        parameter = normalization.identify_parameter(
            "_", probability_values, feature_type=identify_types.CONTINUOUS
        )
        self.assertEqual(parameter.feature_type, "CONTINUOUS")
    def test_persistency(self):
        feature_value_map = read_data()
        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                name, values, feature_type=self._feature_type_override(name))
            values[
                0] = MISSING_VALUE  # Set one entry to MISSING_VALUE to test that

        s = normalization.serialize(normalization_parameters)
        read_parameters = normalization.deserialize(s)
        # Unfortunately, Thrift serializatin seems to lose a bit of precision.
        # Using `==` will be false.
        self.assertEqual(read_parameters.keys(),
                         normalization_parameters.keys())
        for k in normalization_parameters:
            self.assertEqual(
                read_parameters[k].feature_type,
                normalization_parameters[k].feature_type,
            )
            self.assertEqual(
                read_parameters[k].possible_values,
                normalization_parameters[k].possible_values,
            )
            for field in [
                    "boxcox_lambda",
                    "boxcox_shift",
                    "mean",
                    "stddev",
                    "quantiles",
                    "min_value",
                    "max_value",
            ]:
                if getattr(normalization_parameters[k], field) is None:
                    self.assertEqual(
                        getattr(read_parameters[k], field),
                        getattr(normalization_parameters[k], field),
                    )
                else:
                    npt.assert_allclose(
                        getattr(read_parameters[k], field),
                        getattr(normalization_parameters[k], field),
                    )
    def test_prepare_normalization_and_normalize(self):
        feature_value_map = read_data()

        normalization_parameters = {}
        for name, values in feature_value_map.items():
            normalization_parameters[name] = normalization.identify_parameter(
                name,
                values,
                10,
                feature_type=self._feature_type_override(name))
        for k, v in normalization_parameters.items():
            if id_to_type(k) == CONTINUOUS:
                self.assertEqual(v.feature_type, CONTINUOUS)
                self.assertIs(v.boxcox_lambda, None)
                self.assertIs(v.boxcox_shift, None)
            elif id_to_type(k) == BOXCOX:
                self.assertEqual(v.feature_type, BOXCOX)
                self.assertIsNot(v.boxcox_lambda, None)
                self.assertIsNot(v.boxcox_shift, None)
            else:
                assert v.feature_type == id_to_type(k)

        preprocessor = Preprocessor(normalization_parameters, False)
        sorted_features, _ = sort_features_by_normalization(
            normalization_parameters)
        input_matrix = torch.zeros([10000, len(sorted_features)])
        for i, feature in enumerate(sorted_features):
            input_matrix[:, i] = torch.from_numpy(feature_value_map[feature])
        normalized_feature_matrix = preprocessor(
            input_matrix, (input_matrix != MISSING_VALUE))

        normalized_features = {}
        on_column = 0
        for feature in sorted_features:
            norm = normalization_parameters[feature]
            if norm.feature_type == ENUM:
                column_size = len(norm.possible_values)
            else:
                column_size = 1
            normalized_features[
                feature] = normalized_feature_matrix[:,
                                                     on_column:(on_column +
                                                                column_size)]
            on_column += column_size

        self.assertTrue(
            all([
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]))
        for k, v in six.iteritems(normalized_features):
            v = v.numpy()
            self.assertTrue(np.all(np.isfinite(v)))
            feature_type = normalization_parameters[k].feature_type
            if feature_type == identify_types.PROBABILITY:
                sigmoidv = special.expit(v)
                self.assertTrue(
                    np.all(
                        np.logical_and(np.greater(sigmoidv, 0),
                                       np.less(sigmoidv, 1))))
            elif feature_type == identify_types.ENUM:
                possible_values = normalization_parameters[k].possible_values
                self.assertEqual(v.shape[0], len(feature_value_map[k]))
                self.assertEqual(v.shape[1], len(possible_values))

                possible_value_map = {}
                for i, possible_value in enumerate(possible_values):
                    possible_value_map[possible_value] = i

                for i, row in enumerate(v):
                    original_feature = feature_value_map[k][i]
                    if abs(original_feature - MISSING_VALUE) < 0.01:
                        self.assertEqual(0.0, np.sum(row))
                    else:
                        self.assertEqual(
                            possible_value_map[original_feature],
                            np.where(row == 1)[0][0],
                        )
            elif feature_type == identify_types.QUANTILE:
                for i, feature in enumerate(v[0]):
                    original_feature = feature_value_map[k][i]
                    expected = NumpyFeatureProcessor.value_to_quantile(
                        original_feature,
                        normalization_parameters[k].quantiles)
                    self.assertAlmostEqual(feature, expected, 2)
            elif feature_type == identify_types.BINARY:
                pass
            elif (feature_type == identify_types.CONTINUOUS
                  or feature_type == identify_types.BOXCOX):
                one_stddev = np.isclose(np.std(v, ddof=1), 1, atol=0.01)
                zero_stddev = np.isclose(np.std(v, ddof=1), 0, atol=0.01)
                zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
                self.assertTrue(
                    np.all(zero_mean),
                    "mean of feature {} is {}, not 0".format(k, np.mean(v)),
                )
                self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
            elif feature_type == identify_types.CONTINUOUS_ACTION:
                less_than_max = v < 1
                more_than_min = v > -1
                self.assertTrue(
                    np.all(less_than_max),
                    "values are not less than 1: {}".format(
                        v[less_than_max == False]),
                )
                self.assertTrue(
                    np.all(more_than_min),
                    "values are not more than -1: {}".format(
                        v[more_than_min == False]),
                )
            else:
                raise NotImplementedError()