def test_preprocessing_network(self):
    """Compare PreprocessorNet output with NumpyFeatureProcessor.preprocess.

    Parameters are identified for every feature from ``read_data()``, each
    feature is pushed through a Caffe2 preprocessing net, and the fetched
    blobs must be element-wise close to the reference values.
    """
    feature_value_map = read_data()

    normalization_parameters = {
        name: normalization.identify_parameter(
            name, values, feature_type=self._feature_type_override(name)
        )
        for name, values in feature_value_map.items()
    }
    test_features = NumpyFeatureProcessor.preprocess(
        feature_value_map, normalization_parameters
    )

    net = core.Net("PreprocessingTestNet")
    C2.set_net(net)
    preprocessor = PreprocessorNet()
    name_preprocessed_blob_map = {}
    for feature_name in feature_value_map:
        blob_name = str(feature_name)
        # Placeholder value; the real data is fed after CreateNet below.
        workspace.FeedBlob(blob_name, np.array([0], dtype=np.int32))
        output_blob, _ = preprocessor.preprocess_blob(
            blob_name, [normalization_parameters[feature_name]]
        )
        name_preprocessed_blob_map[feature_name] = output_blob

    workspace.CreateNet(net)

    for feature_name, feature_value in six.iteritems(feature_value_map):
        workspace.FeedBlob(str(feature_name), np.expand_dims(feature_value, -1))
    workspace.RunNetOnce(net)

    for feature_name in feature_value_map:
        actual = workspace.FetchBlob(name_preprocessed_blob_map[feature_name])
        if feature_name != ENUM_FEATURE_ID:
            actual = np.squeeze(actual, -1)
        expected = test_features[feature_name]

        # At the limit, boxcox has some numerical instability
        tolerance = 0.5 if feature_name == BOXCOX_FEATURE_ID else 0.01

        is_close = np.isclose(actual, expected, rtol=tolerance, atol=tolerance)
        non_matching = np.where(np.logical_not(is_close))
        self.assertTrue(
            np.all(is_close),
            "{} does not match: {} {}".format(
                feature_name,
                actual[non_matching].tolist(),
                expected[non_matching].tolist(),
            ),
        )
def test_preprocessing_network(self):
    """Compare per-feature Preprocessor output with ``self.preprocess``.

    Each feature gets its own single-feature ``Preprocessor`` (clamping
    disabled); the flattened outputs must be close to the reference values.
    """
    feature_value_map = read_data()

    normalization_parameters = {}
    name_preprocessed_blob_map = {}

    for feature_name, feature_values in feature_value_map.items():
        parameter = normalization.identify_parameter(
            feature_values, feature_type=self._feature_type_override(feature_name)
        )
        normalization_parameters[feature_name] = parameter
        # Overwrite the first entry with MISSING_VALUE after the parameters
        # were identified, so the preprocessor input contains a missing value.
        feature_values[0] = MISSING_VALUE

        preprocessor = Preprocessor({feature_name: parameter}, False)
        preprocessor.clamp = False
        normalized = preprocessor.forward(np.expand_dims(feature_values, -1))
        name_preprocessed_blob_map[feature_name] = normalized.numpy()

    test_features = self.preprocess(feature_value_map, normalization_parameters)

    for feature_name in feature_value_map:
        actual = name_preprocessed_blob_map[feature_name]
        if feature_name != ENUM_FEATURE_ID:
            actual = np.squeeze(actual, -1)

        # At the limit, boxcox has some numerical instability
        tolerance = 0.5 if feature_name == BOXCOX_FEATURE_ID else 0.01

        actual_flat = actual.flatten()
        expected_flat = test_features[feature_name].flatten()
        is_close = np.isclose(
            actual_flat, expected_flat, rtol=tolerance, atol=tolerance
        )
        non_matching = np.where(np.logical_not(is_close))
        self.assertTrue(
            np.all(is_close),
            "{} does not match: {} \n!=\n {}".format(
                feature_name,
                actual_flat[non_matching],
                expected_flat[non_matching],
            ),
        )
def test_type_override(self):
    """An explicit ``feature_type`` is reflected in the identified parameter."""
    _, values_by_feature = preprocessing_util.read_data()

    # Start from the feature stored under the PROBABILITY key ...
    probability_values = values_by_feature[identify_types.PROBABILITY]

    # ... but request BINARY explicitly.
    overridden = normalization.identify_parameter(
        probability_values, feature_type=identify_types.BINARY
    )

    self.assertEqual(overridden.feature_type, "BINARY")
def test_type_override(self):
    """An explicit ``feature_type`` is reflected in the identified parameter."""
    values_by_feature = read_data()

    # Start from the feature stored under PROBABILITY_FEATURE_ID ...
    probability_values = values_by_feature[PROBABILITY_FEATURE_ID]

    # ... but request BINARY explicitly ("_" is passed as the feature name).
    overridden = normalization.identify_parameter(
        "_", probability_values, feature_type=identify_types.BINARY
    )

    self.assertEqual(overridden.feature_type, "BINARY")
def test_persistency(self):
    """Serialized normalization parameters deserialize to an equal mapping."""
    _, feature_value_map = preprocessing_util.read_data()

    normalization_parameters = {
        name: normalization.identify_parameter(values)
        for name, values in feature_value_map.items()
    }

    serialized = normalization.serialize(normalization_parameters)
    read_parameters = normalization.deserialize(serialized)

    self.assertEqual(read_parameters, normalization_parameters)
def test_identification(self):
    """Spot-check the types identified for three known feature ids."""
    feature_value_map = preprocessing_util.read_data()
    detected = identify_types.identify_types(feature_value_map)

    # Examples through manual inspection
    expected_types = (
        ("179", identify_types.BINARY),
        ("124", identify_types.CONTINUOUS),
        ("74", identify_types.PROBABILITY),
    )
    for feature_id, expected_type in expected_types:
        self.assertEqual(detected[feature_id], expected_type)
def test_preprocessing_network(self):
    """Compare the Caffe2 preprocessing net with ``self.preprocess``.

    Each feature is preprocessed by ``PreprocessorNet`` and the blob named
    ``<feature>_preprocessed`` must be close to the reference values.
    """
    feature_value_map = preprocessing_util.read_data()

    normalization_parameters = {
        name: normalization.identify_parameter(values)
        for name, values in feature_value_map.items()
    }
    test_features = self.preprocess(feature_value_map, normalization_parameters)

    net = core.Net("PreprocessingTestNet")
    preprocessor = PreprocessorNet(net, False)
    for feature_name in feature_value_map:
        # Placeholder value; the real data is fed after CreateNet below.
        workspace.FeedBlob(feature_name, np.array([0], dtype=np.int32))
        preprocessor.preprocess_blob(
            feature_name, normalization_parameters[feature_name]
        )

    workspace.CreateNet(net)

    for feature_name, feature_values in feature_value_map.items():
        workspace.FeedBlob(feature_name, feature_values)
    workspace.RunNetOnce(net)

    for feature_name in feature_value_map:
        actual = workspace.FetchBlob(feature_name + "_preprocessed")
        expected = test_features[feature_name]

        # At the limit, boxcox has some numerical instability
        tolerance = 0.1 if feature_name == "boxcox" else 0.01

        is_close = np.isclose(actual, expected, rtol=tolerance, atol=tolerance)
        non_matching = np.where(np.logical_not(is_close))
        self.assertTrue(
            np.all(is_close),
            "{} does not match: {} {}".format(
                feature_name,
                actual[non_matching].tolist(),
                expected[non_matching].tolist(),
            ),
        )
def test_persistency(self):
    """Serialized normalization parameters deserialize to an equal mapping."""
    normalization_parameters = normalization.identify_parameters(
        preprocessing_util.read_data()
    )

    round_tripped = normalization.deserialize(
        normalization.serialize(normalization_parameters)
    )

    self.assertEqual(round_tripped, normalization_parameters)
def test_persistency(self):
    """Serialized normalization parameters deserialize to an equal mapping."""
    feature_value_map = read_data()

    normalization_parameters = {
        name: normalization.identify_parameter(
            values, feature_type=self._feature_type_override(name)
        )
        for name, values in feature_value_map.items()
    }

    serialized = normalization.serialize(normalization_parameters)
    read_parameters = normalization.deserialize(serialized)

    self.assertEqual(read_parameters, normalization_parameters)
def test_type_override(self):
    """Requesting BINARY for the probability feature yields a BINARY parameter."""
    probability_values = read_data()[PROBABILITY_FEATURE_ID]

    # The explicit feature_type is passed instead of relying on detection;
    # "_" is passed as the feature name.
    requested_type = identify_types.BINARY
    parameter = normalization.identify_parameter(
        "_", probability_values, feature_type=requested_type
    )

    self.assertEqual(parameter.feature_type, "BINARY")
def test_preprocessing_network(self):
    """Run the Caffe2 preprocessing net twice and compare with ``self.preprocess``.

    Feature ``'186'`` is fed as a single MISSING_VALUE entry and is expected
    to come out as 0; every other feature is fed as float32.
    """
    feature_value_map = preprocessing_util.read_data()
    types = identify_types.identify_types_dict(feature_value_map)
    normalization_parameters = normalization.identify_parameters(
        feature_value_map, types
    )
    test_features = self.preprocess(feature_value_map, normalization_parameters)
    test_features[u"186"] = 0

    net = core.Net("PreprocessingTestNet")
    preprocessor = PreprocessorNet(net, False)
    for feature_name in feature_value_map:
        # Placeholder value; the real data is fed after CreateNet below.
        workspace.FeedBlob(feature_name, np.array([0], dtype=np.int32))
        preprocessor.preprocess_blob(
            feature_name, normalization_parameters[feature_name]
        )

    workspace.CreateNet(net)

    # The feed / run / verify cycle is executed two times in a row.
    for _ in range(2):
        for feature_name in feature_value_map:
            if feature_name == u"186":
                blob_value = normalization.MISSING_VALUE * np.ones(
                    1, dtype=np.float32
                )
            else:
                blob_value = feature_value_map[feature_name].astype(np.float32)
            workspace.FeedBlob(feature_name, blob_value)
        workspace.RunNetOnce(net)

        for feature_name in feature_value_map:
            actual = workspace.FetchBlob(feature_name + "_preprocessed")
            matches = np.isclose(actual, test_features[feature_name])
            self.assertTrue(np.all(matches))
def test_preprocessing_network_onnx(self):
    """Compare Preprocessor.forward with its Caffe2 netdef conversion.

    For every feature a single-feature ``Preprocessor`` is converted with
    ``PytorchCaffe2Converter.pytorch_net_to_caffe2_netdef`` and both outputs
    must agree within 1e-4.
    """
    tolerance = 0.0001
    feature_value_map = read_data()

    for feature_name, feature_values in feature_value_map.items():
        parameter = normalization.identify_parameter(
            feature_name,
            feature_values,
            feature_type=self._feature_type_override(feature_name),
        )
        # Overwrite the first entry with MISSING_VALUE after the parameters
        # were identified, so the preprocessor input contains a missing value.
        feature_values[0] = MISSING_VALUE

        feature_values_matrix = np.expand_dims(feature_values, -1)
        preprocessor = Preprocessor({feature_name: parameter}, False)
        torch_output = preprocessor.forward(feature_values_matrix)

        (
            input_blob,
            output_blob,
            netdef,
        ) = PytorchCaffe2Converter.pytorch_net_to_caffe2_netdef(
            preprocessor, 1, False, float_input=True
        )

        preproc_workspace = netdef.workspace
        preproc_workspace.FeedBlob(input_blob, feature_values_matrix)
        preproc_workspace.RunNetOnce(core.Net(netdef.init_net))
        preproc_workspace.RunNetOnce(core.Net(netdef.predict_net))
        onnx_output = netdef.workspace.FetchBlob(output_blob)

        is_close = np.isclose(
            torch_output, onnx_output, rtol=tolerance, atol=tolerance
        )
        non_matching = np.where(np.logical_not(is_close))
        self.assertTrue(
            np.all(is_close),
            "{} does not match: {} \n!=\n {}".format(
                feature_name,
                torch_output[non_matching].tolist()[0:10],
                onnx_output[non_matching].tolist()[0:10],
            ),
        )
def test_preprocessing_network(self):
    """Compare per-feature Preprocessor output with ``self.preprocess``.

    Each feature gets its own single-feature ``Preprocessor`` (clamping
    disabled); at most ten mismatching values are reported on failure.
    """
    _, feature_value_map = preprocessing_util.read_data()

    normalization_parameters = {}
    name_preprocessed_blob_map = {}

    for feature_name, feature_values in feature_value_map.items():
        parameter = normalization.identify_parameter(feature_values)
        normalization_parameters[feature_name] = parameter

        preprocessor = Preprocessor({feature_name: parameter}, False)
        preprocessor.clamp = False
        normalized = preprocessor.forward(np.expand_dims(feature_values, -1))
        name_preprocessed_blob_map[feature_name] = normalized.numpy()

    test_features = self.preprocess(feature_value_map, normalization_parameters)

    for feature_name in feature_value_map:
        actual = name_preprocessed_blob_map[feature_name]
        if feature_name != identify_types.ENUM:
            actual = np.squeeze(actual, -1)
        expected = test_features[feature_name]

        # At the limit, boxcox has some numerical instability
        tolerance = 0.5 if feature_name == BOXCOX else 0.01

        is_close = np.isclose(actual, expected, rtol=tolerance, atol=tolerance)
        non_matching = np.where(np.logical_not(is_close))
        self.assertTrue(
            np.all(is_close),
            "{} does not match: {} {}".format(
                feature_name,
                actual[non_matching].tolist()[0:10],
                expected[non_matching].tolist()[0:10],
            ),
        )
def test_persistency(self):
    """Serialized normalization parameters deserialize to an equal mapping."""
    feature_value_map = read_data()

    normalization_parameters = {}
    for name, values in feature_value_map.items():
        parameter = normalization.identify_parameter(values)
        normalization_parameters[name] = parameter
        # Overwrite the first entry with MISSING_VALUE (done after the
        # parameter for this feature has been identified).
        values[0] = MISSING_VALUE

    serialized = normalization.serialize(normalization_parameters)
    read_parameters = normalization.deserialize(serialized)

    self.assertEqual(read_parameters, normalization_parameters)
def test_persistency(self):
    """Parameters written to a text buffer load back as an equal mapping."""
    feature_value_map = preprocessing_util.read_data()
    normalization_parameters = normalization.identify_parameters(
        feature_value_map,
        identify_types.identify_types_dict(feature_value_map),
    )

    with io.StringIO() as buffer:
        normalization.write_parameters(buffer, normalization_parameters)
        # Rewind so the load starts from the beginning of what was written.
        buffer.seek(0)
        read_parameters = normalization.load_parameters(buffer)

    self.assertEqual(read_parameters, normalization_parameters)
def test_feature_parsing(self):
    """read_data() yields more than three features with the expected samples."""
    feature_value_map = preprocessing_util.read_data()

    # More than three features must have been read.
    self.assertTrue(len(feature_value_map) > 3)

    # A few samples based on manual inspection
    expected_samples = (
        ("413", [473.763927022, 65.0, 65.0, 1.0, 50.0, 50.0, 2.0, 2.0, 1.0, 23.0]),
        ("186", [0.0, 0.0, 0.0, 11.0, 1.0, 1.0, 7.0, 7.0, 14.0, 0.0]),
    )
    for feature_id, expected_values in expected_samples:
        difference = feature_value_map[feature_id] - expected_values
        # Any non-zero difference means the parsed values deviate.
        self.assertFalse(difference.any())
def test_identification(self):
    """Check the type identified for each known feature key."""
    feature_value_map = preprocessing_util.read_data()
    detected = identify_types.identify_types(feature_value_map)

    # Examples through manual inspection: (feature key, expected type)
    expectations = (
        (identify_types.BINARY, identify_types.BINARY),
        ("normal", identify_types.CONTINUOUS),
        ("boxcox", identify_types.CONTINUOUS),
        # We don't yet know the quantile type
        (identify_types.QUANTILE, identify_types.CONTINUOUS),
        (identify_types.ENUM, identify_types.ENUM),
        (identify_types.PROBABILITY, identify_types.PROBABILITY),
    )
    for feature_key, expected_type in expectations:
        self.assertEqual(detected[feature_key], expected_type)
def test_identification(self):
    """Check the type identified for each known feature id."""
    feature_value_map = read_data()

    detected = {
        name: identify_types.identify_type(values)
        for name, values in feature_value_map.items()
    }

    # Examples through manual inspection: (feature id, expected type)
    expectations = (
        (BINARY_FEATURE_ID, identify_types.BINARY),
        (CONTINUOUS_FEATURE_ID, identify_types.CONTINUOUS),
        # We don't yet know the boxcox type
        (BOXCOX_FEATURE_ID, identify_types.CONTINUOUS),
        # We don't yet know the quantile type
        (QUANTILE_FEATURE_ID, identify_types.CONTINUOUS),
        (ENUM_FEATURE_ID, identify_types.ENUM),
        (PROBABILITY_FEATURE_ID, identify_types.PROBABILITY),
    )
    for feature_id, expected_type in expectations:
        self.assertEqual(detected[feature_id], expected_type)
def test_persistency(self):
    """Round-trip parameters through serialize/deserialize, field by field.

    Keys, ``feature_type`` and ``possible_values`` must match exactly;
    numeric fields are compared with ``npt.assert_allclose``.
    """
    numeric_fields = (
        "boxcox_lambda",
        "boxcox_shift",
        "mean",
        "stddev",
        "quantiles",
        "min_value",
        "max_value",
    )

    feature_value_map = read_data()
    normalization_parameters = {}
    for name, values in feature_value_map.items():
        parameter = normalization.identify_parameter(
            values, feature_type=self._feature_type_override(name)
        )
        normalization_parameters[name] = parameter
        # Overwrite the first entry with MISSING_VALUE (done after the
        # parameter for this feature has been identified).
        values[0] = MISSING_VALUE

    serialized = normalization.serialize(normalization_parameters)
    read_parameters = normalization.deserialize(serialized)

    # Unfortunately, Thrift serialization seems to lose a bit of precision,
    # so comparing the whole mappings with `==` would be false.
    self.assertEqual(read_parameters.keys(), normalization_parameters.keys())
    for k, original in normalization_parameters.items():
        restored = read_parameters[k]
        self.assertEqual(restored.feature_type, original.feature_type)
        self.assertEqual(restored.possible_values, original.possible_values)

        for field in numeric_fields:
            expected_value = getattr(original, field)
            actual_value = getattr(restored, field)
            if expected_value is None:
                self.assertEqual(actual_value, expected_value)
            else:
                npt.assert_allclose(actual_value, expected_value)
def test_persistency(self):
    """Round-trip parameters through serialize/deserialize, field by field.

    Keys, ``feature_type`` and ``possible_values`` must match exactly;
    numeric fields are compared with ``npt.assert_allclose``.
    """
    numeric_fields = (
        "boxcox_lambda",
        "boxcox_shift",
        "mean",
        "stddev",
        "quantiles",
        "min_value",
        "max_value",
    )

    feature_value_map = read_data()
    normalization_parameters = {}
    for name, values in feature_value_map.items():
        parameter = normalization.identify_parameter(
            name, values, feature_type=self._feature_type_override(name)
        )
        normalization_parameters[name] = parameter
        # Overwrite the first entry with MISSING_VALUE (done after the
        # parameter for this feature has been identified).
        values[0] = MISSING_VALUE

    serialized = normalization.serialize(normalization_parameters)
    read_parameters = normalization.deserialize(serialized)

    # Unfortunately, Thrift serialization seems to lose a bit of precision,
    # so comparing the whole mappings with `==` would be false.
    self.assertEqual(read_parameters.keys(), normalization_parameters.keys())
    for k, original in normalization_parameters.items():
        restored = read_parameters[k]
        self.assertEqual(restored.feature_type, original.feature_type)
        self.assertEqual(restored.possible_values, original.possible_values)

        for field in numeric_fields:
            expected_value = getattr(original, field)
            actual_value = getattr(restored, field)
            if expected_value is None:
                self.assertEqual(actual_value, expected_value)
            else:
                npt.assert_allclose(actual_value, expected_value)
def test_identification(self):
    """Check the type identified for each known feature key."""
    _, feature_value_map = preprocessing_util.read_data()

    detected = {
        name: identify_types.identify_type(values)
        for name, values in feature_value_map.items()
    }

    # Examples through manual inspection: (feature key, expected type)
    expectations = (
        (identify_types.BINARY, identify_types.BINARY),
        (identify_types.CONTINUOUS, identify_types.CONTINUOUS),
        # We don't yet know the boxcox type
        (identify_types.BOXCOX, identify_types.CONTINUOUS),
        # We don't yet know the quantile type
        (identify_types.QUANTILE, identify_types.CONTINUOUS),
        (identify_types.ENUM, identify_types.ENUM),
        (identify_types.PROBABILITY, identify_types.PROBABILITY),
    )
    for feature_key, expected_type in expectations:
        self.assertEqual(detected[feature_key], expected_type)
def test_prepare_normalization_and_normalize(self):
    """Normalize a dense matrix with PreprocessorNet and check each feature type.

    Parameters are identified for every feature, the features are packed
    into one matrix in the order given by ``sort_features_by_normalization``,
    and the per-feature slices of the output are validated by feature type.
    """
    feature_value_map = read_data()

    normalization_parameters = {
        name: normalization.identify_parameter(
            name, values, 10, feature_type=self._feature_type_override(name)
        )
        for name, values in feature_value_map.items()
    }
    for k, v in normalization_parameters.items():
        expected_type = id_to_type(k)
        if expected_type == CONTINUOUS:
            self.assertEqual(v.feature_type, CONTINUOUS)
            self.assertIs(v.boxcox_lambda, None)
            self.assertIs(v.boxcox_shift, None)
        elif expected_type == BOXCOX:
            self.assertEqual(v.feature_type, BOXCOX)
            self.assertIsNot(v.boxcox_lambda, None)
            self.assertIsNot(v.boxcox_shift, None)
        else:
            assert v.feature_type == expected_type

    sorted_features, _ = sort_features_by_normalization(normalization_parameters)

    norm_net = core.Net("net")
    C2.set_net(norm_net)
    preprocessor = PreprocessorNet()

    input_matrix = np.zeros([10000, len(sorted_features)], dtype=np.float32)
    for column, feature in enumerate(sorted_features):
        input_matrix[:, column] = feature_value_map[feature]

    input_matrix_blob = "input_matrix_blob"
    # Empty placeholder; the real matrix is fed once the net has been built.
    workspace.FeedBlob(input_matrix_blob, np.array([], dtype=np.float32))
    output_blob, _ = preprocessor.normalize_dense_matrix(
        input_matrix_blob, sorted_features, normalization_parameters, "", False
    )
    workspace.FeedBlob(input_matrix_blob, input_matrix)
    workspace.RunNetOnce(norm_net)
    normalized_feature_matrix = workspace.FetchBlob(output_blob)

    # Split the output columns back into one slice per feature; ENUM
    # features take one column per possible value, all others one column.
    normalized_features = {}
    on_column = 0
    for feature in sorted_features:
        norm = normalization_parameters[feature]
        column_size = len(norm.possible_values) if norm.feature_type == ENUM else 1
        end_column = on_column + column_size
        normalized_features[feature] = normalized_feature_matrix[
            :, on_column:end_column
        ]
        on_column = end_column

    self.assertTrue(
        all(
            [
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]
        )
    )

    for k, v in six.iteritems(normalized_features):
        self.assertTrue(np.all(np.isfinite(v)))
        parameter = normalization_parameters[k]
        feature_type = parameter.feature_type

        if feature_type == identify_types.PROBABILITY:
            sigmoidv = special.expit(v)
            in_open_unit_interval = np.logical_and(
                np.greater(sigmoidv, 0), np.less(sigmoidv, 1)
            )
            self.assertTrue(np.all(in_open_unit_interval))
        elif feature_type == identify_types.ENUM:
            possible_values = parameter.possible_values
            self.assertEqual(v.shape[0], len(feature_value_map[k]))
            self.assertEqual(v.shape[1], len(possible_values))

            possible_value_map = {
                value: index for index, value in enumerate(possible_values)
            }
            for i, row in enumerate(v):
                original_feature = feature_value_map[k][i]
                self.assertEqual(
                    possible_value_map[original_feature], np.where(row == 1)[0][0]
                )
        elif feature_type == identify_types.QUANTILE:
            # NOTE(review): v[0] is the first row of this feature's slice, so
            # only as many samples as the slice has columns are compared --
            # confirm whether v[:, 0] was intended.
            for i, feature in enumerate(v[0]):
                original_feature = feature_value_map[k][i]
                expected = NumpyFeatureProcessor.value_to_quantile(
                    original_feature, parameter.quantiles
                )
                self.assertAlmostEqual(feature, expected, 2)
        elif feature_type == identify_types.BINARY:
            pass
        elif feature_type in (identify_types.CONTINUOUS, identify_types.BOXCOX):
            sample_std = np.std(v, ddof=1)
            one_stddev = np.isclose(sample_std, 1, atol=0.01)
            zero_stddev = np.isclose(sample_std, 0, atol=0.01)
            zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
            self.assertTrue(
                np.all(zero_mean),
                "mean of feature {} is {}, not 0".format(k, np.mean(v)),
            )
            self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
        elif feature_type == identify_types.CONTINUOUS_ACTION:
            less_than_max = v < 1
            more_than_min = v > -1
            self.assertTrue(
                np.all(less_than_max),
                "values are not less than 1: {}".format(
                    v[np.logical_not(less_than_max)]
                ),
            )
            self.assertTrue(
                np.all(more_than_min),
                "values are not more than -1: {}".format(
                    v[np.logical_not(more_than_min)]
                ),
            )
        else:
            raise NotImplementedError()
def test_prepare_normalization_and_normalize(self):
    """Normalize a dense matrix with Preprocessor and check each feature type.

    Parameters are identified for every feature, the features are packed
    into one matrix in the order given by ``sort_features_by_normalization``,
    and the per-feature slices of the output are validated by feature type.
    """
    feature_value_map = read_data()

    normalization_parameters = {
        name: normalization.identify_parameter(values, 10)
        for name, values in feature_value_map.items()
    }
    for k, v in normalization_parameters.items():
        expected_type = id_to_type(k)
        if expected_type == CONTINUOUS:
            self.assertEqual(v.feature_type, CONTINUOUS)
            self.assertIs(v.boxcox_lambda, None)
            self.assertIs(v.boxcox_shift, None)
        elif expected_type == BOXCOX:
            self.assertEqual(v.feature_type, BOXCOX)
            self.assertIsNot(v.boxcox_lambda, None)
            self.assertIsNot(v.boxcox_shift, None)
        else:
            assert v.feature_type == expected_type

    preprocessor = Preprocessor(normalization_parameters, False)
    sorted_features, _ = sort_features_by_normalization(normalization_parameters)
    preprocessor.clamp = False

    input_matrix = np.zeros([10000, len(sorted_features)], dtype=np.float32)
    for column, feature in enumerate(sorted_features):
        input_matrix[:, column] = feature_value_map[feature]
    normalized_feature_matrix = preprocessor.forward(input_matrix)

    # Split the output columns back into one slice per feature; ENUM
    # features take one column per possible value, all others one column.
    normalized_features = {}
    on_column = 0
    for feature in sorted_features:
        norm = normalization_parameters[feature]
        column_size = len(norm.possible_values) if norm.feature_type == ENUM else 1
        end_column = on_column + column_size
        normalized_features[feature] = normalized_feature_matrix[
            :, on_column:end_column
        ]
        on_column = end_column

    self.assertTrue(
        all(
            [
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]
        )
    )

    for k, v in six.iteritems(normalized_features):
        v = v.numpy()
        self.assertTrue(np.all(np.isfinite(v)))
        parameter = normalization_parameters[k]
        feature_type = parameter.feature_type

        if feature_type == identify_types.PROBABILITY:
            sigmoidv = special.expit(v)
            in_open_unit_interval = np.logical_and(
                np.greater(sigmoidv, 0), np.less(sigmoidv, 1)
            )
            self.assertTrue(np.all(in_open_unit_interval))
        elif feature_type == identify_types.ENUM:
            possible_values = parameter.possible_values
            self.assertEqual(v.shape[0], len(feature_value_map[k]))
            self.assertEqual(v.shape[1], len(possible_values))

            possible_value_map = {
                value: index for index, value in enumerate(possible_values)
            }
            for i, row in enumerate(v):
                original_feature = feature_value_map[k][i]
                if abs(original_feature - MISSING_VALUE) < 0.01:
                    # A missing input is expected to produce an all-zero row.
                    self.assertEqual(0.0, np.sum(row))
                else:
                    self.assertEqual(
                        possible_value_map[original_feature],
                        np.where(row == 1)[0][0],
                    )
        elif feature_type == identify_types.QUANTILE:
            # NOTE(review): v[0] is the first row of this feature's slice, so
            # only as many samples as the slice has columns are compared --
            # confirm whether v[:, 0] was intended.
            for i, feature in enumerate(v[0]):
                original_feature = feature_value_map[k][i]
                expected = self._value_to_quantile(
                    original_feature, parameter.quantiles
                )
                self.assertAlmostEqual(feature, expected, 2)
        elif feature_type == identify_types.BINARY:
            pass
        elif feature_type in (identify_types.CONTINUOUS, identify_types.BOXCOX):
            sample_std = np.std(v, ddof=1)
            one_stddev = np.isclose(sample_std, 1, atol=0.01)
            zero_stddev = np.isclose(sample_std, 0, atol=0.01)
            zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
            self.assertTrue(
                np.all(zero_mean),
                "mean of feature {} is {}, not 0".format(k, np.mean(v)),
            )
            self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
        else:
            raise NotImplementedError()
def test_preprocessing_network(self):
    """Compare per-feature Preprocessor output with NumpyFeatureProcessor.

    Each feature gets its own single-feature ``Preprocessor``; the flattened
    outputs must be close to ``NumpyFeatureProcessor.preprocess``.
    """
    feature_value_map = read_data()

    normalization_parameters = {}
    name_preprocessed_blob_map = {}

    for feature_name, feature_values in feature_value_map.items():
        parameter = normalization.identify_parameter(
            feature_name,
            feature_values,
            feature_type=self._feature_type_override(feature_name),
        )
        normalization_parameters[feature_name] = parameter
        # Overwrite the first entry with MISSING_VALUE after the parameters
        # were identified, so the preprocessor input contains a missing value.
        feature_values[0] = MISSING_VALUE

        preprocessor = Preprocessor({feature_name: parameter}, False)
        normalized = preprocessor.forward(np.expand_dims(feature_values, -1))
        name_preprocessed_blob_map[feature_name] = normalized.numpy()

    test_features = NumpyFeatureProcessor.preprocess(
        feature_value_map, normalization_parameters
    )

    for feature_name in feature_value_map:
        actual = name_preprocessed_blob_map[feature_name]
        if feature_name != ENUM_FEATURE_ID:
            actual = np.squeeze(actual, -1)

        # At the limit, boxcox has some numerical instability
        tolerance = 0.5 if feature_name == BOXCOX_FEATURE_ID else 0.01

        actual_flat = actual.flatten()
        expected_flat = test_features[feature_name].flatten()
        is_close = np.isclose(
            actual_flat, expected_flat, rtol=tolerance, atol=tolerance
        )
        non_matching = np.where(np.logical_not(is_close))
        self.assertTrue(
            np.all(is_close),
            "{} does not match: {} \n!=\n {}".format(
                feature_name,
                actual_flat[non_matching],
                expected_flat[non_matching],
            ),
        )
def test_prepare_normalization_and_normalize(self):
    """Normalize every feature through a Caffe2 net and check the results.

    Uses ``prepare_normalization`` / ``normalize_feature_map`` and then
    validates each normalized feature according to its feature type.
    """
    feature_value_map = preprocessing_util.read_data()
    types = identify_types.identify_types(feature_value_map)
    types_dict = identify_types.identify_types_dict(feature_value_map)
    normalization_parameters = normalization.identify_parameters(
        feature_value_map, types_dict
    )

    features = list(feature_value_map.keys())
    norm_net = core.Net("net")
    blobname_template = "{}_blob"
    blob_map = prepare_normalization(
        norm_net, normalization_parameters, features, blobname_template, False
    )
    normalized_features = normalize_feature_map(
        feature_value_map, norm_net, features, blob_map, blobname_template
    )

    self.assertTrue(
        all(
            [
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]
        )
    )

    for k, v in six.iteritems(normalized_features):
        self.assertTrue(np.all(np.isfinite(v)))
        parameter = normalization_parameters[k]
        feature_type = parameter.feature_type

        if feature_type == identify_types.PROBABILITY:
            sigmoidv = special.expit(v)
            in_open_unit_interval = np.logical_and(
                np.greater(sigmoidv, 0), np.less(sigmoidv, 1)
            )
            self.assertTrue(np.all(in_open_unit_interval))
        elif feature_type == identify_types.ENUM:
            possible_values = parameter.possible_values
            self.assertEqual(v.shape[0], len(feature_value_map[k]))
            self.assertEqual(v.shape[1], len(possible_values))

            possible_value_map = {
                value: index for index, value in enumerate(possible_values)
            }
            for i, row in enumerate(v):
                original_feature = feature_value_map[k][i]
                self.assertEqual(
                    possible_value_map[original_feature], np.where(row == 1)[0][0]
                )
        else:
            # NOTE(review): the nesting of this whole branch (including the
            # boxcox check at the end) was reconstructed from flattened
            # source text -- confirm against the original layout.
            sample_std = np.std(v, ddof=1)
            one_stddev = np.isclose(sample_std, 1, atol=0.00001)
            zero_stddev = np.isclose(sample_std, 0, atol=0.00001)
            zero_mean = np.isclose(np.mean(v), 0, atol=0.00001)
            is_binary = types[k] == identify_types.BINARY

            # Features identified as BINARY are exempt from the mean/stddev checks.
            self.assertTrue(np.all(np.logical_or(zero_mean, is_binary)))
            unit_or_zero_stddev = np.logical_or(one_stddev, zero_stddev)
            self.assertTrue(np.all(np.logical_or(unit_or_zero_stddev, is_binary)))

            has_boxcox = parameter.boxcox_lambda is not None
            is_ctd = types[k] == identify_types.CONTINUOUS
            # This should be true at the moment
            self.assertTrue(is_ctd == has_boxcox)
def test_prepare_normalization_and_normalize(self):
    """Normalize every feature through a Caffe2 net and check the results.

    Uses ``prepare_normalization`` / ``normalize_feature_map`` and then
    validates each normalized feature according to its feature type.
    """
    feature_value_map = preprocessing_util.read_data()

    normalization_parameters = {
        name: normalization.identify_parameter(values, 10)
        for name, values in feature_value_map.items()
    }
    for k, v in normalization_parameters.items():
        if k == "normal":
            self.assertEqual(v.feature_type, "CONTINUOUS")
            self.assertIs(v.boxcox_lambda, None)
            self.assertIs(v.boxcox_shift, None)
        elif k == "boxcox":
            self.assertEqual(v.feature_type, "CONTINUOUS")
            self.assertIsNot(v.boxcox_lambda, None)
            self.assertIsNot(v.boxcox_shift, None)
        else:
            # Every other feature is keyed by its own feature type.
            self.assertEqual(v.feature_type, k)

    features = list(feature_value_map.keys())
    norm_net = core.Net("net")
    blobname_template = "{}_blob"
    blob_map = prepare_normalization(
        norm_net, normalization_parameters, features, blobname_template, False
    )
    normalized_features = normalize_feature_map(
        feature_value_map, norm_net, features, blob_map, blobname_template
    )

    self.assertTrue(
        all(
            [
                np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
                for parameter in normalization_parameters.values()
            ]
        )
    )

    for k, v in six.iteritems(normalized_features):
        self.assertTrue(np.all(np.isfinite(v)))
        parameter = normalization_parameters[k]
        feature_type = parameter.feature_type

        if feature_type == identify_types.PROBABILITY:
            sigmoidv = special.expit(v)
            in_open_unit_interval = np.logical_and(
                np.greater(sigmoidv, 0), np.less(sigmoidv, 1)
            )
            self.assertTrue(np.all(in_open_unit_interval))
        elif feature_type == identify_types.ENUM:
            possible_values = parameter.possible_values
            self.assertEqual(v.shape[0], len(feature_value_map[k]))
            self.assertEqual(v.shape[1], len(possible_values))

            possible_value_map = {
                value: index for index, value in enumerate(possible_values)
            }
            for i, row in enumerate(v):
                original_feature = feature_value_map[k][i]
                self.assertEqual(
                    possible_value_map[original_feature], np.where(row == 1)[0][0]
                )
        elif feature_type == identify_types.QUANTILE:
            quantiles = parameter.quantiles
            # NOTE(review): only v[0] is iterated; the shape returned by
            # normalize_feature_map is not visible here -- confirm that this
            # covers all samples.
            for i, feature in enumerate(v[0]):
                original_feature = feature_value_map[k][i]
                # Fraction of quantile boundaries at or below the raw value.
                reached = sum(
                    1 for quantile in quantiles if original_feature >= quantile
                )
                count = reached / float(len(quantiles))
                self.assertAlmostEqual(feature, count, 2)
        elif feature_type == identify_types.BINARY:
            pass
        elif feature_type == identify_types.CONTINUOUS:
            sample_std = np.std(v, ddof=1)
            one_stddev = np.isclose(sample_std, 1, atol=0.01)
            zero_stddev = np.isclose(sample_std, 0, atol=0.01)
            zero_mean = np.isclose(np.mean(v), 0, atol=0.01)
            self.assertTrue(
                np.all(zero_mean),
                "mean of feature {} is {}, not 0".format(k, np.mean(v)),
            )
            self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
        else:
            raise NotImplementedError()
def test_prepare_normalization_and_normalize(self):
    """Identify normalization parameters, normalize a dense matrix, and check both.

    Steps:
      1. Identify normalization parameters for every feature returned by
         ``preprocessing_util.read_data()`` and check the identified types.
      2. Pack the raw features into one dense float32 matrix and run it
         through ``PreprocessorNet.normalize_dense_matrix``.
      3. Slice the normalized matrix back into per-feature columns (ENUM
         features take one column per possible value) and check a
         per-feature-type property of each slice.

    Raises:
        NotImplementedError: if a feature has a type this test has no
            check for.
    """
    features, feature_value_map = preprocessing_util.read_data()
    normalization_parameters = {
        name: normalization.identify_parameter(values, 10)
        for name, values in feature_value_map.items()
    }

    for name, parameter in normalization_parameters.items():
        if name == CONTINUOUS:
            self.assertEqual(parameter.feature_type, CONTINUOUS)
            self.assertIsNone(parameter.boxcox_lambda)
            self.assertIsNone(parameter.boxcox_shift)
        elif name == BOXCOX:
            self.assertEqual(parameter.feature_type, BOXCOX)
            self.assertIsNotNone(parameter.boxcox_lambda)
            self.assertIsNotNone(parameter.boxcox_shift)
        else:
            # The previous check was
            #   assert feature_type == k or feature_type + "_2" + k
            # whose right operand is a non-empty string, so it could never
            # fail. Compare against the "<TYPE>_2" name instead.
            # NOTE(review): presumably secondary features are named
            # "<TYPE>_2" -- confirm against preprocessing_util.read_data().
            self.assertIn(
                name, (parameter.feature_type, parameter.feature_type + "_2")
            )

    norm_net = core.Net("net")
    preprocessor = PreprocessorNet(norm_net, False)

    # NOTE(review): the row count is hard-coded; the column assignment below
    # needs each feature's values to fit 10000 rows -- confirm against
    # read_data().
    input_matrix = np.zeros([10000, len(features)], dtype=np.float32)
    for column, feature in enumerate(features):
        input_matrix[:, column] = feature_value_map[feature]

    input_matrix_blob = "input_matrix_blob"
    # Feed an empty placeholder so the blob exists while the net is built;
    # the real matrix is fed right before the net runs.
    workspace.FeedBlob(input_matrix_blob, np.array([], dtype=np.float32))
    output_blob, _ = preprocessor.normalize_dense_matrix(
        input_matrix_blob, features, normalization_parameters, ""
    )
    workspace.FeedBlob(input_matrix_blob, input_matrix)
    workspace.RunNetOnce(norm_net)
    normalized_feature_matrix = workspace.FetchBlob(output_blob)

    # Split the normalized matrix back into one slice per feature.
    normalized_features = {}
    on_column = 0
    for feature in features:
        norm = normalization_parameters[feature]
        column_size = len(norm.possible_values) if norm.feature_type == ENUM else 1
        normalized_features[feature] = normalized_feature_matrix[
            :, on_column:on_column + column_size
        ]
        on_column += column_size

    self.assertTrue(
        all(
            np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
            for parameter in normalization_parameters.values()
        )
    )

    for name, normalized in normalized_features.items():
        self.assertTrue(np.all(np.isfinite(normalized)))
        parameter = normalization_parameters[name]
        feature_type = parameter.feature_type

        if feature_type == identify_types.PROBABILITY:
            # Applying the sigmoid must give values strictly inside (0, 1).
            sigmoidv = special.expit(normalized)
            self.assertTrue(
                np.all(np.logical_and(np.greater(sigmoidv, 0), np.less(sigmoidv, 1)))
            )
        elif feature_type == identify_types.ENUM:
            possible_values = parameter.possible_values
            self.assertEqual(normalized.shape[0], len(feature_value_map[name]))
            self.assertEqual(normalized.shape[1], len(possible_values))
            possible_value_map = {
                value: index for index, value in enumerate(possible_values)
            }
            # The first column equal to 1 in each row must be the column of
            # the raw enum value.
            for row_index, row in enumerate(normalized):
                original_feature = feature_value_map[name][row_index]
                self.assertEqual(
                    possible_value_map[original_feature], np.where(row == 1)[0][0]
                )
        elif feature_type == identify_types.QUANTILE:
            # NOTE(review): `normalized` is an (N, 1) slice here, so
            # `normalized[0]` holds a single value and only sample 0 is
            # checked. Iterating `normalized[:, 0]` would check every
            # sample -- left unchanged pending confirmation.
            for index, value in enumerate(normalized[0]):
                original_feature = feature_value_map[name][index]
                expected = self._value_to_quantile(
                    original_feature, parameter.quantiles
                )
                self.assertAlmostEqual(value, expected, 2)
        elif feature_type == identify_types.BINARY:
            pass
        elif feature_type in (identify_types.CONTINUOUS, identify_types.BOXCOX):
            sample_std = np.std(normalized, ddof=1)
            sample_mean = np.mean(normalized)
            one_stddev = np.isclose(sample_std, 1, atol=0.01)
            zero_stddev = np.isclose(sample_std, 0, atol=0.01)
            zero_mean = np.isclose(sample_mean, 0, atol=0.01)
            self.assertTrue(
                np.all(zero_mean),
                "mean of feature {} is {}, not 0".format(name, sample_mean),
            )
            self.assertTrue(np.all(np.logical_or(one_stddev, zero_stddev)))
        else:
            raise NotImplementedError()
def test_prepare_normalization_and_normalize(self):
    """Identify normalization parameters, normalize a dense matrix, and check both.

    Features are ordered with ``sort_features_by_normalization``, packed
    into one float32 matrix, normalized through ``PreprocessorNet``, and
    sliced back into per-feature columns. Each slice is then checked
    against a property specific to its feature type.
    """
    feature_value_map = read_data()
    normalization_parameters = {
        feature_id: normalization.identify_parameter(
            feature_id,
            feature_values,
            10,
            feature_type=self._feature_type_override(feature_id),
        )
        for feature_id, feature_values in feature_value_map.items()
    }

    for feature_id, parameter in normalization_parameters.items():
        expected_type = id_to_type(feature_id)
        if expected_type == CONTINUOUS:
            self.assertEqual(parameter.feature_type, CONTINUOUS)
            self.assertIsNone(parameter.boxcox_lambda)
            self.assertIsNone(parameter.boxcox_shift)
        elif expected_type == BOXCOX:
            self.assertEqual(parameter.feature_type, BOXCOX)
            self.assertIsNotNone(parameter.boxcox_lambda)
            self.assertIsNotNone(parameter.boxcox_shift)
        else:
            assert parameter.feature_type == expected_type

    sorted_features, _ = sort_features_by_normalization(normalization_parameters)

    norm_net = core.Net("net")
    C2.set_net(norm_net)
    preprocessor = PreprocessorNet()

    input_matrix = np.zeros([10000, len(sorted_features)], dtype=np.float32)
    for column, feature_id in enumerate(sorted_features):
        input_matrix[:, column] = feature_value_map[feature_id]

    input_matrix_blob = "input_matrix_blob"
    # An empty placeholder is fed first so the blob exists while the net is
    # built; the real matrix replaces it just before the run.
    placeholder = np.array([], dtype=np.float32)
    workspace.FeedBlob(input_matrix_blob, placeholder)
    output_blob, _ = preprocessor.normalize_dense_matrix(
        input_matrix_blob, sorted_features, normalization_parameters, "", False
    )
    workspace.FeedBlob(input_matrix_blob, input_matrix)
    workspace.RunNetOnce(norm_net)
    normalized_feature_matrix = workspace.FetchBlob(output_blob)

    # Split the output back into one slice per feature; ENUM features take
    # one column per possible value, everything else a single column.
    normalized_features = {}
    start = 0
    for feature_id in sorted_features:
        parameter = normalization_parameters[feature_id]
        width = (
            len(parameter.possible_values) if parameter.feature_type == ENUM else 1
        )
        stop = start + width
        normalized_features[feature_id] = normalized_feature_matrix[:, start:stop]
        start = stop

    finite_flags = [
        np.isfinite(parameter.stddev) and np.isfinite(parameter.mean)
        for parameter in normalization_parameters.values()
    ]
    self.assertTrue(all(finite_flags))

    for feature_id, normalized in normalized_features.items():
        self.assertTrue(np.all(np.isfinite(normalized)))
        parameter = normalization_parameters[feature_id]
        feature_type = parameter.feature_type

        if feature_type == identify_types.PROBABILITY:
            # Applying the sigmoid must give values strictly inside (0, 1).
            squashed = special.expit(normalized)
            above_zero = np.greater(squashed, 0)
            below_one = np.less(squashed, 1)
            self.assertTrue(np.all(np.logical_and(above_zero, below_one)))
        elif feature_type == identify_types.ENUM:
            possible_values = parameter.possible_values
            self.assertEqual(normalized.shape[0], len(feature_value_map[feature_id]))
            self.assertEqual(normalized.shape[1], len(possible_values))
            index_of_value = {
                value: index for index, value in enumerate(possible_values)
            }
            for row_index, row in enumerate(normalized):
                raw_value = feature_value_map[feature_id][row_index]
                expected_column = index_of_value[raw_value]
                hot_column = np.where(row == 1)[0][0]
                self.assertEqual(expected_column, hot_column)
        elif feature_type == identify_types.QUANTILE:
            # NOTE(review): `normalized` is an (N, 1) slice, so `normalized[0]`
            # holds a single value and only sample 0 is compared -- confirm
            # whether every sample was meant to be checked.
            for position, normalized_value in enumerate(normalized[0]):
                raw_value = feature_value_map[feature_id][position]
                expected = NumpyFeatureProcessor.value_to_quantile(
                    raw_value, parameter.quantiles
                )
                self.assertAlmostEqual(normalized_value, expected, 2)
        elif feature_type == identify_types.BINARY:
            pass
        elif feature_type in (identify_types.CONTINUOUS, identify_types.BOXCOX):
            sample_std = np.std(normalized, ddof=1)
            sample_mean = np.mean(normalized)
            unit_spread = np.isclose(sample_std, 1, atol=0.01)
            no_spread = np.isclose(sample_std, 0, atol=0.01)
            centered = np.isclose(sample_mean, 0, atol=0.01)
            self.assertTrue(
                np.all(centered),
                "mean of feature {} is {}, not 0".format(feature_id, sample_mean),
            )
            self.assertTrue(np.all(np.logical_or(unit_spread, no_spread)))
        elif feature_type == identify_types.CONTINUOUS_ACTION:
            # All values must lie strictly inside (-1, 1); the failure
            # messages list the offending values.
            less_than_max = normalized < 1
            more_than_min = normalized > -1
            self.assertTrue(
                np.all(less_than_max),
                "values are not less than 1: {}".format(
                    normalized[np.logical_not(less_than_max)]
                ),
            )
            self.assertTrue(
                np.all(more_than_min),
                "values are not more than -1: {}".format(
                    normalized[np.logical_not(more_than_min)]
                ),
            )
        else:
            raise NotImplementedError()
def test_preprocessing_network(self):
    """Compare the preprocessing net's output with the NumPy reference.

    One preprocessing blob is built per feature, the net is run once on the
    raw feature values, and each fetched result is compared with
    ``NumpyFeatureProcessor.preprocess`` within a tolerance.
    """
    feature_value_map = read_data()
    normalization_parameters = {
        feature_name: normalization.identify_parameter(
            feature_name,
            feature_values,
            feature_type=self._feature_type_override(feature_name),
        )
        for feature_name, feature_values in feature_value_map.items()
    }
    test_features = NumpyFeatureProcessor.preprocess(
        feature_value_map, normalization_parameters
    )

    net = core.Net("PreprocessingTestNet")
    C2.set_net(net)
    preprocessor = PreprocessorNet()

    # Build one preprocessing blob per feature. A dummy int32 array is fed
    # first so the input blob exists while the net is being built.
    name_preprocessed_blob_map = {}
    for feature_name in feature_value_map:
        blob_name = str(feature_name)
        workspace.FeedBlob(blob_name, np.array([0], dtype=np.int32))
        output_blob, _ = preprocessor.preprocess_blob(
            blob_name, [normalization_parameters[feature_name]]
        )
        name_preprocessed_blob_map[feature_name] = output_blob

    workspace.CreateNet(net)

    # Feed the real values, with a trailing axis added, then run the net.
    for feature_name, raw_values in feature_value_map.items():
        workspace.FeedBlob(str(feature_name), np.expand_dims(raw_values, -1))
    workspace.RunNetOnce(net)

    for feature_name in feature_value_map:
        actual = workspace.FetchBlob(name_preprocessed_blob_map[feature_name])
        if feature_name != ENUM_FEATURE_ID:
            # Every output except the ENUM feature's has its trailing axis
            # dropped before the comparison.
            actual = np.squeeze(actual, -1)
        expected = test_features[feature_name]

        # At the limit, boxcox has some numerical instability
        tolerance = 0.5 if feature_name == BOXCOX_FEATURE_ID else 0.01

        close_enough = np.isclose(actual, expected, rtol=tolerance, atol=tolerance)
        non_matching = np.where(np.logical_not(close_enough))
        self.assertTrue(
            np.all(close_enough),
            "{} does not match: {} {}".format(
                feature_name,
                actual[non_matching].tolist(),
                expected[non_matching].tolist(),
            ),
        )