Example #1
    def test_identification(self):
        feature_value_map = preprocessing_util.read_data()

        types = {}
        for name, values in feature_value_map.items():
            types[name] = identify_types.identify_type(values)

        # Examples through manual inspection
        self.assertEqual(types[identify_types.BINARY], identify_types.BINARY)
        self.assertEqual(types['normal'], identify_types.CONTINUOUS)
        self.assertEqual(types['boxcox'], identify_types.CONTINUOUS)

        # We don't yet know the quantile type
        self.assertEqual(types[identify_types.QUANTILE],
                         identify_types.CONTINUOUS)
        self.assertEqual(types[identify_types.ENUM], identify_types.ENUM)
        self.assertEqual(types[identify_types.PROBABILITY],
                         identify_types.PROBABILITY)
Example #2
    def test_identification(self):
        feature_value_map = read_data()

        types = {}
        for name, values in feature_value_map.items():
            types[name] = identify_types.identify_type(values)

        # Examples through manual inspection
        self.assertEqual(types[BINARY_FEATURE_ID], identify_types.BINARY)
        self.assertEqual(types[CONTINUOUS_FEATURE_ID], identify_types.CONTINUOUS)

        # We don't yet know the boxcox type
        self.assertEqual(types[BOXCOX_FEATURE_ID], identify_types.CONTINUOUS)

        # We don't yet know the quantile type
        self.assertEqual(types[QUANTILE_FEATURE_ID], identify_types.CONTINUOUS)
        self.assertEqual(types[ENUM_FEATURE_ID], identify_types.ENUM)
        self.assertEqual(types[PROBABILITY_FEATURE_ID], identify_types.PROBABILITY)
Example #3
    def test_identification(self):
        feature_value_map = read_data()

        types = {}
        for name, values in feature_value_map.items():
            types[name] = identify_types.identify_type(values)

        # Examples through manual inspection
        self.assertEqual(types[BINARY_FEATURE_ID], identify_types.BINARY)
        self.assertEqual(types[CONTINUOUS_FEATURE_ID],
                         identify_types.CONTINUOUS)

        # We don't yet know the boxcox type
        self.assertEqual(types[BOXCOX_FEATURE_ID], identify_types.CONTINUOUS)

        # We don't yet know the quantile type
        self.assertEqual(types[QUANTILE_FEATURE_ID], identify_types.CONTINUOUS)
        self.assertEqual(types[ENUM_FEATURE_ID], identify_types.ENUM)
        self.assertEqual(types[PROBABILITY_FEATURE_ID],
                         identify_types.PROBABILITY)
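
The three test variants above all exercise the same entry point, identify_types.identify_type(values). As a rough, hedged sketch of the behavior they rely on, the assertions below should hold on small synthetic arrays; the constants (BINARY, CONTINUOUS, ENUM) and the call with a single values argument are taken from the examples, while the import path is an assumption rather than something shown in the snippets.

# Hypothetical smoke test; the import path is an assumption, the constants and
# the identify_type(values) call are taken from the examples above.
import numpy as np

from ml.rl.preprocessing import identify_types  # assumed module path

# Values drawn only from {0, 1} should be classified as BINARY.
binary_values = np.array([0, 1, 0, 1, 1], dtype=np.float32)
# Real-valued data outside [0, 1] with many distinct values should be CONTINUOUS.
continuous_values = np.random.normal(loc=10.0, scale=2.0, size=1000).astype(np.float32)
# Integer codes with only a handful of distinct values should be ENUM.
enum_values = np.array([2, 5, 7, 2, 5, 7, 2] * 100, dtype=np.float32)

assert identify_types.identify_type(binary_values) == identify_types.BINARY
assert identify_types.identify_type(continuous_values) == identify_types.CONTINUOUS
assert identify_types.identify_type(enum_values) == identify_types.ENUM
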
Example #4
def identify_parameter(
    feature_name,
    values,
    max_unique_enum_values=DEFAULT_MAX_UNIQUE_ENUM,
    quantile_size=DEFAULT_MAX_QUANTILE_SIZE,
    quantile_k2_threshold=DEFAULT_QUANTILE_K2_THRESHOLD,
    skip_box_cox=False,
    skip_quantiles=False,
    feature_type=None,
):
    if feature_type is None:
        feature_type = identify_types.identify_type(values,
                                                    max_unique_enum_values)

    boxcox_lambda = None
    boxcox_shift = 0.0
    mean = 0.0
    stddev = 1.0
    possible_values = None
    quantiles = None
    assert feature_type in [
        identify_types.CONTINUOUS,
        identify_types.PROBABILITY,
        identify_types.BINARY,
        identify_types.ENUM,
        identify_types.CONTINUOUS_ACTION,
    ], "unknown type {}".format(feature_type)
    assert (len(values) >= MINIMUM_SAMPLES_TO_IDENTIFY
            ), "insufficient information to identify parameter"

    min_value = np.min(values)
    max_value = np.max(values)
    if feature_type == identify_types.CONTINUOUS:
        if min_value == max_value:
            return no_op_feature()
        k2_original, p_original = stats.normaltest(values)

        # shift can be estimated but not in scipy
        boxcox_shift = float(min_value * -1)
        candidate_values, lmbda = stats.boxcox(
            np.maximum(values + boxcox_shift, BOX_COX_MARGIN))
        k2_boxcox, p_boxcox = stats.normaltest(candidate_values)
        logger.info(
            "Feature stats.  Original K2: {} P: {} Boxcox K2: {} P: {}".format(
                k2_original, p_original, k2_boxcox, p_boxcox))
        if lmbda < 0.9 or lmbda > 1.1:
            # Lambda is far enough from 1.0 to be worth doing boxcox
            if k2_original > k2_boxcox * 10 and k2_boxcox <= quantile_k2_threshold:
                # The boxcox output is significantly more normally distributed
                # than the original data and is normal enough to apply
                # effectively.

                stddev = np.std(candidate_values, ddof=1)
                # Unclear whether this happens in practice or not
                if (np.isfinite(stddev) and stddev < BOX_COX_MAX_STDDEV
                        and not np.isclose(stddev, 0)):
                    values = candidate_values
                    boxcox_lambda = float(lmbda)
        if boxcox_lambda is None or skip_box_cox:
            boxcox_shift = None
            boxcox_lambda = None
        if boxcox_lambda is not None:
            feature_type = identify_types.BOXCOX
        if (boxcox_lambda is None and k2_original > quantile_k2_threshold
                and (not skip_quantiles)):
            feature_type = identify_types.QUANTILE
            quantiles = (np.unique(
                mquantiles(
                    values,
                    np.arange(quantile_size + 1, dtype=np.float64) /
                    float(quantile_size),
                    alphap=0.0,
                    betap=1.0,
                )).astype(float).tolist())
            logger.info(
                "Feature is non-normal, using quantiles: {}".format(quantiles))

    if (feature_type == identify_types.CONTINUOUS
            or feature_type == identify_types.BOXCOX
            or feature_type == identify_types.CONTINUOUS_ACTION):
        mean = float(np.mean(values))
        values = values - mean
        stddev = max(float(np.std(values, ddof=1)), 1.0)
        if not np.isfinite(stddev):
            logger.info(
                "Std. dev not finite for feature {}".format(feature_name))
            return None
        values /= stddev

    if feature_type == identify_types.ENUM:
        possible_values = np.unique(values.astype(int)).tolist()

    return NormalizationParameters(
        feature_type,
        boxcox_lambda,
        boxcox_shift,
        mean,
        stddev,
        possible_values,
        quantiles,
        min_value,
        max_value,
    )
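
As a hedged usage sketch (not part of the original example), identify_parameter above can be called directly on a NumPy array of raw feature values. It assumes the module-level names from the snippet (np, stats, identify_types, NormalizationParameters, the DEFAULT_* constants) are in scope, and that NormalizationParameters exposes its fields as attributes; the feature name "spend" is purely illustrative.

# Hypothetical call, assuming the module-level names used in the example above
# (identify_types, NormalizationParameters, DEFAULT_* constants) are in scope.
import numpy as np

# Heavily skewed positive data, with enough samples to satisfy
# MINIMUM_SAMPLES_TO_IDENTIFY.
raw_values = np.random.lognormal(mean=0.0, sigma=1.0, size=10000).astype(np.float64)

params = identify_parameter("spend", raw_values)
# Skewed data typically ends up as BOXCOX or QUANTILE rather than plain
# CONTINUOUS; inspect the returned parameters instead of assuming one outcome.
print(params.feature_type, params.boxcox_lambda, params.quantiles is not None)
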
Example #5
def identify_parameter(
    values,
    max_unique_enum_values=DEFAULT_MAX_UNIQUE_ENUM,
    quantile_size=DEFAULT_MAX_QUANTILE_SIZE,
    quantile_k2_threshold=DEFAULT_QUANTILE_K2_THRESHOLD,
):
    feature_type = identify_types.identify_type(values, max_unique_enum_values)

    boxcox_lambda = None
    boxcox_shift = 0
    mean = 0
    stddev = 1
    possible_values = None
    quantiles = None
    assert feature_type in [
        identify_types.CONTINUOUS, identify_types.PROBABILITY,
        identify_types.BINARY, identify_types.ENUM, identify_types.QUANTILE
    ], "unknown type {}".format(feature_type)
    assert len(
        values
    ) >= MINIMUM_SAMPLES_TO_IDENTIFY, "insufficient information to identify parameter"

    min_value = np.min(values)
    max_value = np.max(values)
    if feature_type == identify_types.CONTINUOUS:
        assert min_value < max_value, "Binary feature marked as continuous"
        k2_original, p_original = stats.normaltest(values)

        # shift can be estimated but not in scipy
        boxcox_shift = float(min_value * -1)
        candidate_values, lmbda = stats.boxcox(
            np.maximum(values + boxcox_shift, BOX_COX_MARGIN))
        k2_boxcox, p_boxcox = stats.normaltest(candidate_values)
        logger.info(
            "Feature stats.  Original K2: {} P: {} Boxcox K2: {} P: {}".format(
                k2_original, p_original, k2_boxcox, p_boxcox))
        if lmbda < 0.9 or lmbda > 1.1:
            # Lambda is far enough from 1.0 to be worth doing boxcox
            if k2_original > k2_boxcox * 10 and k2_boxcox <= quantile_k2_threshold:
                # The boxcox output is significantly more normally distributed
                # than the original data and is normal enough to apply
                # effectively.

                stddev = np.std(candidate_values, ddof=1)
                # Unclear whether this happens in practice or not
                if np.isfinite(stddev) and stddev < BOX_COX_MAX_STDDEV and \
                   not np.isclose(stddev, 0):
                    values = candidate_values
                    boxcox_lambda = float(lmbda)
        if boxcox_lambda is None:
            boxcox_shift = None
        if boxcox_lambda is None and k2_original > quantile_k2_threshold:
            feature_type = identify_types.QUANTILE
            quantiles = mquantiles(
                values,
                np.arange(quantile_size, dtype=np.float32) /
                float(quantile_size)).astype(float).tolist()
            logger.info(
                "Feature is non-normal, using quantiles: {}".format(quantiles))

    if feature_type == identify_types.CONTINUOUS:
        mean = float(np.mean(values))
        values = values - mean
        stddev = float(np.std(values, ddof=1))
        if np.isclose(stddev, 0) or not np.isfinite(stddev):
            stddev = 1
        values /= stddev

    if feature_type == identify_types.ENUM:
        possible_values = np.unique(values).astype(float).tolist()

    return NormalizationParameters(feature_type, boxcox_lambda, boxcox_shift,
                                   mean, stddev, possible_values, quantiles)
Example #6
def identify_parameter(
    feature_name,
    values,
    max_unique_enum_values=DEFAULT_MAX_UNIQUE_ENUM,
    quantile_size=DEFAULT_MAX_QUANTILE_SIZE,
    quantile_k2_threshold=DEFAULT_QUANTILE_K2_THRESHOLD,
    skip_box_cox=False,
    skip_quantiles=False,
    feature_type=None,
):
    if feature_type is None:
        feature_type = identify_types.identify_type(values, max_unique_enum_values)

    boxcox_lambda = None
    boxcox_shift = 0.0
    mean = 0.0
    stddev = 1.0
    possible_values = None
    quantiles = None
    assert feature_type in [
        identify_types.CONTINUOUS,
        identify_types.PROBABILITY,
        identify_types.BINARY,
        identify_types.ENUM,
        identify_types.CONTINUOUS_ACTION,
    ], "unknown type {}".format(feature_type)
    assert (
        len(values) >= MINIMUM_SAMPLES_TO_IDENTIFY
    ), "insufficient information to identify parameter"

    min_value = np.min(values)
    max_value = np.max(values)
    if feature_type == identify_types.CONTINUOUS:
        if min_value == max_value:
            return no_op_feature()
        k2_original, p_original = stats.normaltest(values)

        # shift can be estimated but not in scipy
        boxcox_shift = float(min_value * -1)
        candidate_values, lmbda = stats.boxcox(
            np.maximum(values + boxcox_shift, BOX_COX_MARGIN)
        )
        k2_boxcox, p_boxcox = stats.normaltest(candidate_values)
        logger.info(
            "Feature stats.  Original K2: {} P: {} Boxcox K2: {} P: {}".format(
                k2_original, p_original, k2_boxcox, p_boxcox
            )
        )
        if lmbda < 0.9 or lmbda > 1.1:
            # Lambda is far enough from 1.0 to be worth doing boxcox
            if k2_original > k2_boxcox * 10 and k2_boxcox <= quantile_k2_threshold:
                # The boxcox output is significantly more normally distributed
                # than the original data and is normal enough to apply
                # effectively.

                stddev = np.std(candidate_values, ddof=1)
                # Unclear whether this happens in practice or not
                if (
                    np.isfinite(stddev)
                    and stddev < BOX_COX_MAX_STDDEV
                    and not np.isclose(stddev, 0)
                ):
                    values = candidate_values
                    boxcox_lambda = float(lmbda)
        if boxcox_lambda is None or skip_box_cox:
            boxcox_shift = None
            boxcox_lambda = None
        if boxcox_lambda is not None:
            feature_type = identify_types.BOXCOX
        if (
            boxcox_lambda is None
            and k2_original > quantile_k2_threshold
            and (not skip_quantiles)
        ):
            feature_type = identify_types.QUANTILE
            quantiles = (
                np.unique(
                    mquantiles(
                        values,
                        np.arange(quantile_size + 1, dtype=np.float64)
                        / float(quantile_size),
                        alphap=0.0,
                        betap=1.0,
                    )
                )
                .astype(float)
                .tolist()
            )
            logger.info("Feature is non-normal, using quantiles: {}".format(quantiles))

    if (
        feature_type == identify_types.CONTINUOUS
        or feature_type == identify_types.BOXCOX
        or feature_type == identify_types.CONTINUOUS_ACTION
    ):
        mean = float(np.mean(values))
        values = values - mean
        stddev = max(float(np.std(values, ddof=1)), 1.0)
        if not np.isfinite(stddev):
            logger.info("Std. dev not finite for feature {}".format(feature_name))
            return None
        values /= stddev

    if feature_type == identify_types.ENUM:
        possible_values = np.unique(values.astype(int)).tolist()

    return NormalizationParameters(
        feature_type,
        boxcox_lambda,
        boxcox_shift,
        mean,
        stddev,
        possible_values,
        quantiles,
        min_value,
        max_value,
    )