def test_are_conventional_bools():
    """Check which value lists `strings_utils.are_conventional_bools` accepts.

    Each case pairs a list of string values with whether the helper is
    expected to report it as a conventional boolean encoding. Cases are
    evaluated in order, so the first mismatch is the one reported.
    """
    cases = [
        (["True", "False"], True),
        (["T", "F"], True),
        (["t", "f"], True),
        (["True", "Fails"], False),
        (["0", "1"], True),
        (["0", "2"], False),
        (["1.0", "0.0"], True),
        (["high", "low"], False),
        (["human", "bot"], False),
    ]
    for values, expected in cases:
        outcome = strings_utils.are_conventional_bools(values)
        # Compare on truthiness only, so any truthy/falsy return value is accepted.
        assert bool(outcome) == expected
Example #2
0
def infer_type(
    field: FieldInfo,
    missing_value_percent: float,
) -> str:
    """Infer the feature type of a single field.

    The checks run in a fixed order and the first one that matches decides
    the result: empty field -> CATEGORY, conventional booleans -> BINARY,
    image values -> IMAGE, few distinct non-numeric or sequential-integer
    values -> CATEGORY, wordy values -> TEXT, non-numerical leading values
    -> CATEGORY, otherwise NUMERICAL.

    # Inputs
    :param field: (FieldInfo) object describing field
    :param missing_value_percent: (float) percent of missing values in the column

    # Return
    :return: (str) feature type
    """
    n_unique = field.num_distinct_values
    if n_unique == 0:
        # Nothing to inspect.
        return CATEGORY

    values = field.distinct_values

    # BINARY requires at most two distinct values, no missing entries, and
    # values that all look like conventional booleans.
    could_be_binary = n_unique <= 2 and missing_value_percent == 0
    if could_be_binary and strings_utils.are_conventional_bools(values):
        return BINARY

    if field.image_values >= 3:
        return IMAGE

    # With only a few distinct values, prefer CATEGORY when they are not all
    # numerical, or when they are sequential integers (which suggests labels).
    # TODO (tgaddair): come up with something better than this, maybe attempt to fit to Gaussian
    # NOTE (ASN): edge case -- there are less than SMALL_DISTINCT_COUNT samples in dataset
    if n_unique < SMALL_DISTINCT_COUNT:
        if not strings_utils.are_all_numericals(values):
            return CATEGORY
        if strings_utils.are_sequential_integers(values):
            return CATEGORY

    # Word-count criterion: a truthy average above two words means TEXT.
    mean_words = field.avg_words
    if mean_words and mean_words > 2:
        return TEXT

    # TODO (ASN): add other modalities (image, etc. )

    # Look at the first two distinct values: a missing value may be coded as
    # NaN (which is numerical) even in an otherwise string-valued field, so a
    # single sample is not enough. The second value is only read when the
    # first one is numerical.
    if n_unique > 1:
        leading_pair_numerical = strings_utils.is_numerical(values[0]) and strings_utils.is_numerical(values[1])
        if not leading_pair_numerical:
            return CATEGORY

    return NUMERICAL
Example #3
0
def infer_type(field: FieldInfo, missing_value_percent: float,
               row_count: int) -> str:
    """Infer the feature type of a single field.

    The checks run in a fixed order and the first one that matches decides
    the result: DATE dtype -> DATE, empty field -> CATEGORY, conventional
    booleans -> BINARY, image values -> IMAGE, audio values -> AUDIO,
    relatively few distinct values that are non-numeric or sequential
    integers -> CATEGORY, all numbers -> NUMBER, otherwise TEXT.

    # Inputs
    :param field: (FieldInfo) object describing field
    :param missing_value_percent: (float) percent of missing values in the column
    :param row_count: (int) total number of entries in original dataset

    # Return
    :return: (str) feature type
    """
    if field.dtype == DATE:
        return DATE

    n_unique = field.num_distinct_values
    if n_unique == 0:
        # Nothing to inspect.
        return CATEGORY

    values = field.distinct_values

    # BINARY requires at most two distinct values, no missing entries, and
    # values that all look like conventional booleans.
    could_be_binary = n_unique <= 2 and missing_value_percent == 0
    if could_be_binary and strings_utils.are_conventional_bools(values):
        return BINARY

    if field.image_values >= 3:
        return IMAGE

    if field.audio_values >= 3:
        return AUDIO

    # CATEGORY applies when the distinct count is below a fraction of the row
    # count AND the values are either not all numbers or form a sequential
    # list of integers (which suggests they are category labels).
    distinct_cutoff = row_count * CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF
    if n_unique < distinct_cutoff:
        if not strings_utils.are_all_numbers(values):
            return CATEGORY
        if strings_utils.are_sequential_integers(values):
            return CATEGORY

    # TODO (ASN): add other modalities (image, etc. )
    # NUMBER when every distinct value is a number; TEXT is the fallback.
    return NUMBER if strings_utils.are_all_numbers(values) else TEXT
Example #4
0
def infer_type(
    field: FieldInfo,
    missing_value_percent: float,
) -> str:
    """Infer the feature type of a single field.

    The checks run in a fixed order and the first one that matches decides
    the result: empty field -> CATEGORY, conventional booleans -> BINARY,
    image values -> IMAGE, few distinct non-numeric or sequential-integer
    values -> CATEGORY, all numerical -> NUMERICAL, otherwise TEXT.

    # Inputs
    :param field: (FieldInfo) object describing field
    :param missing_value_percent: (float) percent of missing values in the column

    # Return
    :return: (str) feature type
    """
    n_unique = field.num_distinct_values
    if n_unique == 0:
        # Nothing to inspect.
        return CATEGORY

    values = field.distinct_values

    # BINARY requires at most two distinct values, no missing entries, and
    # values that all look like conventional booleans.
    could_be_binary = n_unique <= 2 and missing_value_percent == 0
    if could_be_binary and strings_utils.are_conventional_bools(values):
        return BINARY

    if field.image_values >= 3:
        return IMAGE

    # With only a few distinct values, prefer CATEGORY when they are not all
    # numerical, or when they are sequential integers (which suggests labels).
    # TODO (tgaddair): come up with something better than this, maybe attempt to fit to Gaussian
    # NOTE (ASN): edge case -- there are less than SMALL_DISTINCT_COUNT samples in dataset
    if n_unique < SMALL_DISTINCT_COUNT:
        if not strings_utils.are_all_numericals(values):
            return CATEGORY
        if strings_utils.are_sequential_integers(values):
            return CATEGORY

    # TODO (ASN): add other modalities (image, etc. )
    # NUMERICAL when every distinct value is numerical; TEXT is the fallback.
    return NUMERICAL if strings_utils.are_all_numericals(values) else TEXT