def test_infer_type(num_distinct_values, distinct_values, img_values, missing_vals, expected):
    """Check infer_type against the expected result for an object-dtype field.

    A FieldInfo named "foo" is built from the supplied distinct-value count,
    distinct values and image values, then passed to infer_type together with
    the missing-value argument and the module-level ROW_COUNT.
    """
    field_kwargs = dict(
        name="foo",
        dtype="object",
        num_distinct_values=num_distinct_values,
        distinct_values=distinct_values,
        image_values=img_values,
    )
    field_under_test = FieldInfo(**field_kwargs)

    inferred = infer_type(field_under_test, missing_vals, ROW_COUNT)

    assert inferred == expected
def test_infer_type(distinct_values, avg_words, img_values, expected):
    """Check infer_type against the expected result for an object-dtype field.

    A FieldInfo named 'foo' is built from the supplied distinct values,
    average word count and image values; infer_type is called with that
    field as its only argument.
    """
    field_kwargs = dict(
        name='foo',
        dtype='object',
        distinct_values=distinct_values,
        avg_words=avg_words,
        image_values=img_values,
    )
    field_under_test = FieldInfo(**field_kwargs)

    inferred = infer_type(field_under_test)

    assert inferred == expected
def get_dataset_info_from_source(source: DataSource) -> DatasetInfo:
    """Build a DatasetInfo describing every column of ``source``.

    Args:
        source: Object exposing ``columns``, ``__len__`` and the per-column
            accessors used below (``get_dtype``, ``get_distinct_values``,
            ``get_nonnull_values``, ``get_image_values``, ``is_string_type``,
            ``get_avg_num_tokens``).

    Returns:
        A DatasetInfo holding one FieldInfo per column, in column order, and
        the row count taken from ``len(source)``.
    """

    def _describe(column):
        # Gather the per-column statistics in a fixed order.
        column_dtype = source.get_dtype(column)
        distinct = source.get_distinct_values(column)
        nonnull = source.get_nonnull_values(column)
        images = source.get_image_values(column)
        # The average token count is only requested for string-typed columns;
        # every other column keeps avg_words as None.
        words = source.get_avg_num_tokens(column) if source.is_string_type(column_dtype) else None
        return FieldInfo(
            name=column,
            dtype=column_dtype,
            distinct_values=distinct,
            nonnull_values=nonnull,
            image_values=images,
            avg_words=words,
        )

    total_rows = len(source)
    described = [_describe(column) for column in source.columns]
    return DatasetInfo(fields=described, row_count=total_rows)
def test_should_exclude(idx, distinct_values, dtype, name, expected):
    """Check should_exclude against the expected result for one field.

    A FieldInfo is built from the given name, dtype and distinct values and
    passed to should_exclude along with the field index, the dtype and the
    module-level ROW_COUNT and TARGET_NAME.
    """
    field_under_test = FieldInfo(
        name=name,
        dtype=dtype,
        distinct_values=distinct_values,
    )

    excluded = should_exclude(idx, field_under_test, dtype, ROW_COUNT, TARGET_NAME)

    assert excluded == expected