def test_preprocessing_network(self):
    # Normalize every feature on its own through a single-feature torch
    # Preprocessor and compare the result, within a tolerance, against the
    # output of NumpyFeatureProcessor.preprocess for the same feature.
    feature_value_map = read_data()
    normalization_parameters = {}
    torch_outputs = {}

    for feature_name, feature_values in feature_value_map.items():
        parameter = normalization.identify_parameter(
            feature_name,
            feature_values,
            feature_type=self._feature_type_override(feature_name),
        )
        normalization_parameters[feature_name] = parameter

        # Overwrite one entry with MISSING_VALUE (after the parameters were
        # identified) so the presence mask below contains an absent entry.
        feature_values[0] = MISSING_VALUE

        single_feature_preprocessor = Preprocessor(
            {feature_name: parameter}, False
        )
        column = torch.from_numpy(np.expand_dims(feature_values, -1))
        normalized_column = single_feature_preprocessor(
            column, (column != MISSING_VALUE)
        )
        torch_outputs[feature_name] = normalized_column.numpy()

    test_features = NumpyFeatureProcessor.preprocess(
        feature_value_map, normalization_parameters
    )

    for feature_name in feature_value_map:
        actual = torch_outputs[feature_name]
        if feature_name != ENUM_FEATURE_ID:
            # Every feature except the enum one has its last axis squeezed.
            actual = np.squeeze(actual, -1)
        actual = actual.flatten()
        expected = test_features[feature_name].flatten()

        # At the limit, boxcox has some numerical instability, so it is
        # compared with a much looser tolerance.
        tolerance = 0.5 if feature_name == BOXCOX_FEATURE_ID else 0.01

        close = np.isclose(actual, expected, rtol=tolerance, atol=tolerance)
        mismatched = np.where(np.logical_not(close))
        self.assertTrue(
            np.all(close),
            "{} does not match: {} \n!=\n {}".format(
                feature_name, actual[mismatched], expected[mismatched]
            ),
        )
def test_type_override(self):
    # An explicitly requested feature_type must be the one reported back.
    # Start from a feature that should be identified as probability ...
    observed_values = read_data()[PROBABILITY_FEATURE_ID]

    # ... and ask for BINARY anyway.
    overridden = normalization.identify_parameter(
        "_",
        observed_values,
        feature_type=identify_types.BINARY,
    )

    self.assertEqual(overridden.feature_type, "BINARY")
def test_type_override_quantile(self):
    # An explicitly requested QUANTILE feature_type must be reported back.
    # Start from a feature that should be identified as CONTINUOUS ...
    observed_values = read_data()[BOXCOX_FEATURE_ID]

    # ... and ask for QUANTILE anyway.
    overridden = normalization.identify_parameter(
        "_",
        observed_values,
        feature_type=identify_types.QUANTILE,
    )

    self.assertEqual(overridden.feature_type, "QUANTILE")
def test_type_override_boxcox(self):
    # An explicitly requested BOXCOX feature_type must be reported back.
    # Start from a feature that should be identified as CONTINUOUS ...
    observed_values = read_data()[CONTINUOUS_FEATURE_ID]

    # ... and ask for BOXCOX anyway.
    overridden = normalization.identify_parameter(
        "_",
        observed_values,
        feature_type=identify_types.BOXCOX,
    )

    self.assertEqual(overridden.feature_type, "BOXCOX")
def test_type_override_continuous(self):
    # An explicitly requested CONTINUOUS feature_type must be reported back.
    # Start from the values stored under BOXCOX_FEATURE_ID ...
    observed_values = read_data()[BOXCOX_FEATURE_ID]

    # ... and ask for CONTINUOUS anyway.
    overridden = normalization.identify_parameter(
        "_",
        observed_values,
        feature_type=identify_types.CONTINUOUS,
    )

    self.assertEqual(overridden.feature_type, "CONTINUOUS")
def test_identification(self):
    # Run automatic type identification over every feature and compare the
    # outcome with the types established through manual inspection.
    identified = {
        name: identify_types.identify_type(values)
        for name, values in read_data().items()
    }

    self.assertEqual(identified[BINARY_FEATURE_ID], identify_types.BINARY)
    self.assertEqual(
        identified[CONTINUOUS_FEATURE_ID], identify_types.CONTINUOUS
    )

    # The boxcox type is not known yet at identification time.
    self.assertEqual(identified[BOXCOX_FEATURE_ID], identify_types.CONTINUOUS)

    # The quantile type is not known yet at identification time either.
    self.assertEqual(
        identified[QUANTILE_FEATURE_ID], identify_types.CONTINUOUS
    )

    self.assertEqual(identified[ENUM_FEATURE_ID], identify_types.ENUM)
    self.assertEqual(
        identified[PROBABILITY_FEATURE_ID], identify_types.PROBABILITY
    )
def test_persistency(self):
    # Round-trip the identified normalization parameters through
    # normalization.serialize / normalization.deserialize and check that
    # every field survives.
    feature_value_map = read_data()

    normalization_parameters = {}
    for name, values in feature_value_map.items():
        normalization_parameters[name] = normalization.identify_parameter(
            name, values, feature_type=self._feature_type_override(name)
        )
        # Overwrite one entry with MISSING_VALUE once the parameters for
        # this feature have been identified.
        values[0] = MISSING_VALUE

    serialized = normalization.serialize(normalization_parameters)
    restored = normalization.deserialize(serialized)

    self.assertEqual(restored.keys(), normalization_parameters.keys())

    # Unfortunately, Thrift serialization seems to lose a bit of precision,
    # so comparing these fields with `==` would fail; they are compared with
    # assert_allclose instead (unless the expected value is None).
    numeric_fields = (
        "boxcox_lambda",
        "boxcox_shift",
        "mean",
        "stddev",
        "quantiles",
        "min_value",
        "max_value",
    )

    for key, expected in normalization_parameters.items():
        actual = restored[key]
        self.assertEqual(actual.feature_type, expected.feature_type)
        self.assertEqual(actual.possible_values, expected.possible_values)

        for field in numeric_fields:
            expected_value = getattr(expected, field)
            actual_value = getattr(actual, field)
            if expected_value is None:
                self.assertEqual(actual_value, expected_value)
            else:
                npt.assert_allclose(actual_value, expected_value)
def test_prepare_normalization_and_normalize(self):
    # End-to-end check: identify normalization parameters for every feature,
    # push all features through a single Preprocessor, then verify
    # type-specific properties of each feature's normalized columns.
    feature_value_map = read_data()

    normalization_parameters = {}
    for name, values in feature_value_map.items():
        normalization_parameters[name] = normalization.identify_parameter(
            name, values, 10, feature_type=self._feature_type_override(name)
        )

    # The identified type must agree with the type implied by the feature id;
    # boxcox parameters must be set only for BOXCOX features.
    for k, v in normalization_parameters.items():
        expected_type = id_to_type(k)
        if expected_type == CONTINUOUS:
            self.assertEqual(v.feature_type, CONTINUOUS)
            self.assertIs(v.boxcox_lambda, None)
            self.assertIs(v.boxcox_shift, None)
        elif expected_type == BOXCOX:
            self.assertEqual(v.feature_type, BOXCOX)
            self.assertIsNot(v.boxcox_lambda, None)
            self.assertIsNot(v.boxcox_shift, None)
        else:
            # Use a unittest assertion (not a bare `assert`) so the check is
            # not stripped under `python -O` and reports both values.
            self.assertEqual(v.feature_type, expected_type)

    preprocessor = Preprocessor(normalization_parameters, False)
    sorted_features, _ = sort_features_by_normalization(
        normalization_parameters
    )

    # One column per feature, in sorted order.
    # NOTE(review): the row count is hard-coded; this assumes every feature
    # array returned by read_data() has 10000 entries -- confirm.
    input_matrix = torch.zeros([10000, len(sorted_features)])
    for i, feature in enumerate(sorted_features):
        input_matrix[:, i] = torch.from_numpy(feature_value_map[feature])
    normalized_feature_matrix = preprocessor(
        input_matrix, (input_matrix != MISSING_VALUE)
    )

    # Slice the output back into per-feature blocks: an ENUM feature takes
    # one column per possible value, every other type takes one column.
    normalized_features = {}
    on_column = 0
    for feature in sorted_features:
        norm = normalization_parameters[feature]
        if norm.feature_type == ENUM:
            column_size = len(norm.possible_values)
        else:
            column_size = 1
        normalized_features[feature] = normalized_feature_matrix[
            :, on_column:(on_column + column_size)
        ]
        on_column += column_size

    self.assertTrue(
        all(
            np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
            for parameter in normalization_parameters.values()
        )
    )

    for k, v in normalized_features.items():
        v = v.numpy()
        self.assertTrue(np.all(np.isfinite(v)))
        feature_type = normalization_parameters[k].feature_type

        if feature_type == identify_types.PROBABILITY:
            # After a sigmoid the values must lie strictly inside (0, 1).
            sigmoidv = special.expit(v)
            self.assertTrue(
                np.all(
                    np.logical_and(
                        np.greater(sigmoidv, 0), np.less(sigmoidv, 1)
                    )
                )
            )
        elif feature_type == identify_types.ENUM:
            possible_values = normalization_parameters[k].possible_values
            original_values = feature_value_map[k]
            self.assertEqual(v.shape[0], len(original_values))
            self.assertEqual(v.shape[1], len(possible_values))

            possible_value_map = {
                possible_value: i
                for i, possible_value in enumerate(possible_values)
            }

            for i, row in enumerate(v):
                original_feature = original_values[i]
                if abs(original_feature - MISSING_VALUE) < 0.01:
                    # A missing value must produce an all-zero row.
                    self.assertEqual(0.0, np.sum(row))
                else:
                    # Otherwise the first 1 in the row must sit at the
                    # index of the original value.
                    self.assertEqual(
                        possible_value_map[original_feature],
                        np.where(row == 1)[0][0],
                    )
        elif feature_type == identify_types.QUANTILE:
            # NOTE(review): `v` is a row-by-column slice, so `v[0]` is only
            # the first row and this loop appears to compare a single
            # sample; confirm whether `v[:, 0]` was intended.
            for i, feature in enumerate(v[0]):
                original_feature = feature_value_map[k][i]
                expected = NumpyFeatureProcessor.value_to_quantile(
                    original_feature, normalization_parameters[k].quantiles
                )
                self.assertAlmostEqual(feature, expected, 2)
        elif feature_type == identify_types.BINARY:
            pass
        elif feature_type in (
            identify_types.CONTINUOUS,
            identify_types.BOXCOX,
        ):
            # Normalized output must have mean ~0 and a sample standard
            # deviation of ~1 (or ~0).
            stddev = np.std(v, ddof=1)
            mean = np.mean(v)
            one_stddev = np.isclose(stddev, 1, atol=0.01)
            zero_stddev = np.isclose(stddev, 0, atol=0.01)
            zero_mean = np.isclose(mean, 0, atol=0.01)
            self.assertTrue(
                np.all(zero_mean),
                "mean of feature {} is {}, not 0".format(k, mean),
            )
            self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
        elif feature_type == identify_types.CONTINUOUS_ACTION:
            # Values must lie strictly inside (-1, 1); on failure the
            # message lists the offending values.
            less_than_max = v < 1
            more_than_min = v > -1
            self.assertTrue(
                np.all(less_than_max),
                "values are not less than 1: {}".format(v[~less_than_max]),
            )
            self.assertTrue(
                np.all(more_than_min),
                "values are not more than -1: {}".format(v[~more_than_min]),
            )
        else:
            raise NotImplementedError()