def get_norm_metadata(dataset, norm_params, norm_col):
    """Collect per-feature samples from ``dataset`` and build serialized
    normalization metadata.

    Batches are read until every observed feature has accumulated at least
    ``norm_params["num_samples"]`` values (or the dataset is exhausted);
    normalization metadata is then computed per feature and serialized.

    Args:
        dataset: Source exposing ``read_batch(astype=...)`` that returns a
            DataFrame-like batch, or None / an empty batch when exhausted.
        norm_params: Dict of normalization settings; must contain
            "num_samples".
        norm_col: Name of the batch column holding per-row feature mappings.

    Returns:
        The serialized mapping of feature name -> normalization metadata.
    """
    done = False
    # BUG FIX: the initial read previously called read_batch() with no
    # astype, while the in-loop re-read used astype="df". The body requires
    # a DataFrame batch (batch[norm_col].apply(pd.Series)), so the first
    # batch must also be read as a DataFrame.
    batch = dataset.read_batch(astype="df")
    samples_per_feature, samples = defaultdict(int), defaultdict(list)
    while not done:
        if batch is None or len(batch[norm_col]) == 0:
            logger.info("No more data in training data. Breaking.")
            break
        # Expand the per-row feature dicts into one column per feature.
        feature_df = batch[norm_col].apply(pd.Series)
        for feature in feature_df:
            values = feature_df[feature].dropna().values
            samples_per_feature[feature] += len(values)
            samples[feature].extend(values)
        done = check_samples_per_feature(
            samples_per_feature, norm_params["num_samples"]
        )
        logger.info("Samples per feature: {}".format(samples_per_feature))
        if done:
            logger.info(
                "Collected sufficient sample size for all features. Breaking.")
        batch = dataset.read_batch(astype="df")
    output = {}
    for feature, values in samples.items():
        output[feature] = get_feature_norm_metadata(feature, values, norm_params)
    return serialize(output)
def get_norm_metadata(dataset, norm_params, norm_col):
    """Gather samples for every feature in ``dataset`` until each has enough,
    then compute and serialize per-feature normalization metadata.

    Args:
        dataset: Source exposing ``read_batch(astype="df")``; yields None or
            an empty batch once exhausted.
        norm_params: Normalization settings dict; "num_samples" gives the
            per-feature sample target.
        norm_col: Batch column whose rows hold per-feature value mappings.

    Returns:
        Serialized feature-name -> normalization-metadata mapping.
    """
    counts = defaultdict(int)
    collected = defaultdict(list)
    have_enough = False
    batch = dataset.read_batch(astype="df")
    while not have_enough:
        if batch is None or len(batch[norm_col]) == 0:
            logger.info("No more data in training data. Breaking.")
            break
        # One column per feature, produced from the per-row mappings.
        expanded = batch[norm_col].apply(pd.Series)
        for name in expanded:
            vals = expanded[name].dropna().values
            counts[name] += len(vals)
            collected[name].extend(vals)
        have_enough = check_samples_per_feature(
            counts, norm_params["num_samples"]
        )
        logger.info("Samples per feature: {}".format(counts))
        if have_enough:
            logger.info("Collected sufficient sample size for all features. Breaking.")
        batch = dataset.read_batch(astype="df")
    return serialize(
        {
            name: get_feature_norm_metadata(name, vals, norm_params)
            for name, vals in collected.items()
        }
    )
def test_persistency(self):
    """Normalization parameters must survive a serialize/deserialize cycle."""
    _, feature_value_map = preprocessing_util.read_data()
    expected = {
        name: normalization.identify_parameter(values)
        for name, values in feature_value_map.items()
    }
    blob = normalization.serialize(expected)
    restored = normalization.deserialize(blob)
    self.assertEqual(restored, expected)
def test_persistency(self):
    """Normalization parameters must round-trip through serialization."""
    feature_value_map = read_data()
    expected = {}
    for name, values in feature_value_map.items():
        # Feature type may be overridden per test subclass.
        expected[name] = normalization.identify_parameter(
            values, feature_type=self._feature_type_override(name)
        )
    blob = normalization.serialize(expected)
    restored = normalization.deserialize(blob)
    self.assertEqual(restored, expected)
def test_persistency(self):
    """Identified normalization parameters must survive serialization."""
    feature_value_map = preprocessing_util.read_data()
    expected = normalization.identify_parameters(feature_value_map)
    restored = normalization.deserialize(normalization.serialize(expected))
    self.assertEqual(restored, expected)
def test_persistency(self):
    """Normalization parameters must round-trip through serialization."""
    feature_value_map = read_data()
    expected = {}
    for name, values in feature_value_map.items():
        expected[name] = normalization.identify_parameter(values)
        # After identification, overwrite one entry with MISSING_VALUE so the
        # round-trip below runs against data containing missing values.
        # (The comment in the original was truncated mid-sentence.)
        values[0] = MISSING_VALUE
    blob = normalization.serialize(expected)
    restored = normalization.deserialize(blob)
    self.assertEqual(restored, expected)
def test_persistency(self):
    """Parameters must round-trip through Thrift serialization; numeric
    fields are compared approximately because serialization loses a little
    float precision (typo "serializatin" in the original comment fixed)."""
    feature_value_map = read_data()
    expected = {}
    for name, values in feature_value_map.items():
        expected[name] = normalization.identify_parameter(
            name, values, feature_type=self._feature_type_override(name)
        )
        # After identification, inject a MISSING_VALUE so the round-trip is
        # exercised against data containing missing entries.
        values[0] = MISSING_VALUE
    blob = normalization.serialize(expected)
    restored = normalization.deserialize(blob)
    # A plain `==` would fail on the slightly-perturbed floats, so exact
    # fields are compared exactly and numeric fields approximately.
    self.assertEqual(restored.keys(), expected.keys())
    for k in expected:
        self.assertEqual(
            restored[k].feature_type,
            expected[k].feature_type,
        )
        self.assertEqual(
            restored[k].possible_values,
            expected[k].possible_values,
        )
        for field in [
            "boxcox_lambda",
            "boxcox_shift",
            "mean",
            "stddev",
            "quantiles",
            "min_value",
            "max_value",
        ]:
            want = getattr(expected[k], field)
            got = getattr(restored[k], field)
            if want is None:
                self.assertEqual(got, want)
            else:
                npt.assert_allclose(got, want)
def test_persistency(self):
    """Verify normalization parameters survive Thrift serialization.

    Exact-valued fields (feature type, categorical values) are compared with
    ``==``; numeric fields use ``npt.assert_allclose`` because serialization
    perturbs float precision slightly (original comment's "serializatin"
    typo fixed).
    """
    feature_value_map = read_data()
    expected = {}
    for name, values in feature_value_map.items():
        expected[name] = normalization.identify_parameter(
            name, values, feature_type=self._feature_type_override(name)
        )
        # Plant a MISSING_VALUE after identification so the round-trip is
        # checked against data that contains missing entries.
        values[0] = MISSING_VALUE
    restored = normalization.deserialize(normalization.serialize(expected))
    self.assertEqual(restored.keys(), expected.keys())
    approx_fields = [
        "boxcox_lambda",
        "boxcox_shift",
        "mean",
        "stddev",
        "quantiles",
        "min_value",
        "max_value",
    ]
    for key in expected:
        want, got = expected[key], restored[key]
        self.assertEqual(
            got.feature_type,
            want.feature_type,
        )
        self.assertEqual(
            got.possible_values,
            want.possible_values,
        )
        for field in approx_fields:
            want_val = getattr(want, field)
            got_val = getattr(got, field)
            if want_val is None:
                self.assertEqual(got_val, want_val)
            else:
                npt.assert_allclose(got_val, want_val)