def test_are_conventional_bools():
    """Exercise ``strings_utils.are_conventional_bools`` on accepted and rejected value lists."""
    # Each entry pairs a list of distinct values with the verdict the helper
    # is expected to give for it. Cases are checked in the listed order.
    cases = [
        (["True", "False"], True),
        (["T", "F"], True),
        (["t", "f"], True),
        (["True", "Fails"], False),
        (["0", "1"], True),
        (["0", "2"], False),
        (["1.0", "0.0"], True),
        (["high", "low"], False),
        (["human", "bot"], False),
    ]
    for values, expected in cases:
        verdict = strings_utils.are_conventional_bools(values)
        # Compare on truthiness so the check matches a plain `assert x` / `assert not x`.
        assert bool(verdict) == expected
def infer_type(
    field: FieldInfo,
    missing_value_percent: float,
) -> str:
    """Infer the feature type of a single field.

    The checks run in a fixed order and the first one that matches decides the
    result: CATEGORY (no distinct values), BINARY, IMAGE, CATEGORY, TEXT,
    CATEGORY (non-numerical samples), and finally NUMERICAL.

    # Inputs
    :param field: (FieldInfo) object describing field
    :param missing_value_percent: (float) percent of missing values in the column

    # Return
    :return: (str) feature type
    """
    distinct_count = field.num_distinct_values
    if distinct_count == 0:
        return CATEGORY

    values = field.distinct_values

    # BINARY: at most two distinct values, nothing missing, and every value is
    # a conventional bool.
    if (
        distinct_count <= 2
        and missing_value_percent == 0
        and strings_utils.are_conventional_bools(values)
    ):
        return BINARY

    if field.image_values >= 3:
        return IMAGE

    # CATEGORY: few distinct values that are either not all numerical or form a
    # sequential list of integers, which suggests the values represent categories.
    # TODO (tgaddair): come up with something better than this, maybe attempt to fit to Gaussian
    # NOTE (ASN): edge case -- there are less than SMALL_DISTINCT_COUNT samples in dataset
    if distinct_count < SMALL_DISTINCT_COUNT:
        looks_categorical = (
            not strings_utils.are_all_numericals(values)
        ) or strings_utils.are_sequential_integers(values)
        if looks_categorical:
            return CATEGORY

    # TEXT: entries average more than two words.
    if field.avg_words and field.avg_words > 2:
        return TEXT

    # TODO (ASN): add other modalities (image, etc. )
    # CATEGORY if either of the first two distinct values is not numerical. Two
    # values are examined because missing values can be coded as NaN, which is
    # numerical, even for fields that are otherwise strings.
    if distinct_count > 1:
        first_two_numerical = strings_utils.is_numerical(values[0]) and strings_utils.is_numerical(values[1])
        if not first_two_numerical:
            return CATEGORY

    return NUMERICAL
def infer_type(field: FieldInfo, missing_value_percent: float, row_count: int) -> str:
    """Infer the feature type of a single field.

    The checks run in a fixed order and the first one that matches decides the
    result: DATE, CATEGORY (no distinct values), BINARY, IMAGE, AUDIO, CATEGORY,
    NUMBER, and finally TEXT as the fallback.

    # Inputs
    :param field: (FieldInfo) object describing field
    :param missing_value_percent: (float) percent of missing values in the column
    :param row_count: (int) total number of entries in original dataset

    # Return
    :return: (str) feature type
    """
    if field.dtype == DATE:
        return DATE

    distinct_count = field.num_distinct_values
    if distinct_count == 0:
        return CATEGORY

    values = field.distinct_values

    # BINARY: at most two distinct values, nothing missing, and every value is
    # a conventional bool.
    if (
        distinct_count <= 2
        and missing_value_percent == 0
        and strings_utils.are_conventional_bools(values)
    ):
        return BINARY

    if field.image_values >= 3:
        return IMAGE

    if field.audio_values >= 3:
        return AUDIO

    # CATEGORY when the number of distinct values is below the configured
    # fraction of the row count AND the values are either
    #   - not all numbers, or
    #   - all numbers that form a perfectly sequential list of integers,
    #     which suggests the values represent categories.
    if distinct_count < row_count * CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF:
        looks_categorical = (
            not strings_utils.are_all_numbers(values)
        ) or strings_utils.are_sequential_integers(values)
        if looks_categorical:
            return CATEGORY

    # NUMBER when every distinct value is a number.
    if strings_utils.are_all_numbers(values):
        return NUMBER

    # TODO (ASN): add other modalities (image, etc. )
    # Nothing else matched: fall back to TEXT.
    return TEXT
def infer_type(
    field: FieldInfo,
    missing_value_percent: float,
) -> str:
    """Infer the feature type of a single field.

    The checks run in a fixed order and the first one that matches decides the
    result: CATEGORY (no distinct values), BINARY, IMAGE, CATEGORY, NUMERICAL,
    and finally TEXT as the fallback.

    # Inputs
    :param field: (FieldInfo) object describing field
    :param missing_value_percent: (float) percent of missing values in the column

    # Return
    :return: (str) feature type
    """
    distinct_count = field.num_distinct_values
    if distinct_count == 0:
        return CATEGORY

    values = field.distinct_values

    # BINARY: at most two distinct values, nothing missing, and every value is
    # a conventional bool.
    if (
        distinct_count <= 2
        and missing_value_percent == 0
        and strings_utils.are_conventional_bools(values)
    ):
        return BINARY

    if field.image_values >= 3:
        return IMAGE

    # CATEGORY: a small number of distinct values that are either not all
    # numerical or comprise a perfectly sequential list of integers, which
    # suggests the values represent categories.
    # TODO (tgaddair): come up with something better than this, maybe attempt to fit to Gaussian
    # NOTE (ASN): edge case -- there are less than SMALL_DISTINCT_COUNT samples in dataset
    if distinct_count < SMALL_DISTINCT_COUNT:
        looks_categorical = (
            not strings_utils.are_all_numericals(values)
        ) or strings_utils.are_sequential_integers(values)
        if looks_categorical:
            return CATEGORY

    # TODO (ASN): add other modalities (image, etc. )
    # NUMERICAL when every distinct value is numerical; otherwise fall back to TEXT.
    return NUMERICAL if strings_utils.are_all_numericals(values) else TEXT