def test_identification(self):
    """Run identify_type over every feature from read_data() and check known ones."""
    identified = {
        name: identify_types.identify_type(values)
        for name, values in read_data().items()
    }

    # Expected types were established through manual inspection.  The Box-Cox
    # and quantile features are expected to come back as CONTINUOUS because
    # those refined types are not yet known at this stage.
    expectations = (
        (BINARY_FEATURE_ID, identify_types.BINARY),
        (CONTINUOUS_FEATURE_ID, identify_types.CONTINUOUS),
        (BOXCOX_FEATURE_ID, identify_types.CONTINUOUS),
        (QUANTILE_FEATURE_ID, identify_types.CONTINUOUS),
        (ENUM_FEATURE_ID, identify_types.ENUM),
        (PROBABILITY_FEATURE_ID, identify_types.PROBABILITY),
    )
    for feature_id, expected_type in expectations:
        self.assertEqual(identified[feature_id], expected_type)
def identify_parameter(
    feature_name,
    values,
    max_unique_enum_values=DEFAULT_MAX_UNIQUE_ENUM,
    quantile_size=DEFAULT_MAX_QUANTILE_SIZE,
    quantile_k2_threshold=DEFAULT_QUANTILE_K2_THRESHOLD,
    skip_box_cox=False,
    skip_quantiles=False,
    feature_type=None,
):
    """Derive the normalization parameters for one feature from sample values.

    Args:
        feature_name: Name of the feature; only used in log messages.
        values: Sample values for the feature.  Must support NumPy reductions
            and array arithmetic (and ``.astype`` for ENUM features).
        max_unique_enum_values: Forwarded to ``identify_types.identify_type``
            when ``feature_type`` is not supplied.
        quantile_size: Number of quantile buckets; ``quantile_size + 1``
            evenly spaced probabilities are evaluated.
        quantile_k2_threshold: K2 statistic (from ``scipy.stats.normaltest``)
            above which the original data is handled with quantiles, and at
            or below which a Box-Cox result counts as normal enough.
        skip_box_cox: When true, a Box-Cox transform is never selected and the
            sample values are left untransformed.
        skip_quantiles: When true, the QUANTILE type is never selected.
        feature_type: Pre-determined feature type; identified from ``values``
            when ``None``.

    Returns:
        A ``NormalizationParameters`` instance; the result of
        ``no_op_feature()`` for a CONTINUOUS feature whose samples are all
        equal; or ``None`` when the computed standard deviation is not finite.

    Raises:
        AssertionError: If the feature type is not one of the supported types
            or fewer than ``MINIMUM_SAMPLES_TO_IDENTIFY`` samples are given.
    """
    if feature_type is None:
        feature_type = identify_types.identify_type(values, max_unique_enum_values)

    boxcox_lambda = None
    boxcox_shift = 0.0
    mean = 0.0
    stddev = 1.0
    possible_values = None
    quantiles = None

    assert feature_type in [
        identify_types.CONTINUOUS,
        identify_types.PROBABILITY,
        identify_types.BINARY,
        identify_types.ENUM,
        identify_types.CONTINUOUS_ACTION,
        identify_types.DO_NOT_PREPROCESS,
    ], "unknown type {}".format(feature_type)
    assert (
        len(values) >= MINIMUM_SAMPLES_TO_IDENTIFY
    ), "insufficient information to identify parameter"

    min_value = float(np.min(values))
    max_value = float(np.max(values))

    if feature_type == identify_types.DO_NOT_PREPROCESS:
        mean = float(np.mean(values))
        # Std is taken on the centred samples and floored at 1.0.
        stddev = max(float(np.std(values - mean, ddof=1)), 1.0)

    if feature_type == identify_types.CONTINUOUS:
        if min_value == max_value:
            # A constant feature carries no information to normalize.
            return no_op_feature()
        k2_original, p_original = stats.normaltest(values)

        # shift can be estimated but not in scipy
        boxcox_shift = float(min_value * -1)
        # Box-Cox needs strictly positive input, hence the margin floor.
        candidate_values, lambda_ = stats.boxcox(
            np.maximum(values + boxcox_shift, BOX_COX_MARGIN)
        )
        k2_boxcox, p_boxcox = stats.normaltest(candidate_values)
        logger.info(
            "Feature stats. Original K2: {} P: {} Boxcox K2: {} P: {}".format(
                k2_original, p_original, k2_boxcox, p_boxcox
            )
        )

        # Lambda must be far enough from 1.0 to be worth doing boxcox, and the
        # boxcox output must be significantly more normally distributed than
        # the original data and normal enough to apply effectively.
        boxcox_worthwhile = (
            (lambda_ < 0.9 or lambda_ > 1.1)
            and k2_original > k2_boxcox * 10
            and k2_boxcox <= quantile_k2_threshold
        )
        if boxcox_worthwhile and not skip_box_cox:
            # Kept in its own local so a rejected candidate cannot leak its
            # standard deviation into the returned parameters.
            boxcox_stddev = float(np.std(candidate_values, ddof=1))
            # Unclear whether this happens in practice or not
            if (
                np.isfinite(boxcox_stddev)
                and boxcox_stddev < BOX_COX_MAX_STDDEV
                and not np.isclose(boxcox_stddev, 0)
            ):
                # Only adopt the transformed samples when Box-Cox is actually
                # selected; otherwise later statistics must describe the
                # original data.
                values = candidate_values
                boxcox_lambda = float(lambda_)

        if boxcox_lambda is not None:
            feature_type = identify_types.BOXCOX
        else:
            boxcox_shift = None
            if k2_original > quantile_k2_threshold and (not skip_quantiles):
                feature_type = identify_types.QUANTILE
                probabilities = np.arange(
                    quantile_size + 1, dtype=np.float64
                ) / float(quantile_size)
                quantiles = (
                    np.unique(
                        mquantiles(values, probabilities, alphap=0.0, betap=1.0)
                    )
                    .astype(float)
                    .tolist()
                )
                logger.info(
                    "Feature is non-normal, using quantiles: {}".format(quantiles)
                )

    if feature_type in (
        identify_types.CONTINUOUS,
        identify_types.BOXCOX,
        identify_types.CONTINUOUS_ACTION,
    ):
        mean = float(np.mean(values))
        stddev = max(float(np.std(values - mean, ddof=1)), 1.0)
        # max() keeps a nan/inf first argument, so this check is reachable.
        if not np.isfinite(stddev):
            logger.info("Std. dev not finite for feature {}".format(feature_name))
            return None

    if feature_type == identify_types.ENUM:
        possible_values = np.unique(values.astype(int)).astype(int).tolist()

    return NormalizationParameters(
        feature_type,
        boxcox_lambda,
        boxcox_shift,
        mean,
        stddev,
        possible_values,
        quantiles,
        min_value,
        max_value,
    )