def test_column_cardinality_functions(cardinality_validator):
    """Check the cardinality labels produced by the profiler and by
    OrderedProfilerCardinality.get_basic_column_cardinality."""
    profiler = UserConfigurableProfiler(cardinality_validator)

    # Column name -> cardinality label the profiler is expected to record.
    # NOTE: "col_none" -> "NONE" is deliberately not checked; that assertion
    # is disabled (the reason is not recorded here).
    expected_by_column = (
        ("col_one", "ONE"),
        ("col_two", "TWO"),
        ("col_very_few", "VERY_FEW"),
        ("col_few", "FEW"),
        ("col_many", "MANY"),
        ("col_very_many", "VERY_MANY"),
    )
    for column_name, expected_label in expected_by_column:
        observed_label = profiler.column_info.get(column_name).get("cardinality")
        assert observed_label == expected_label

    classify = OrderedProfilerCardinality.get_basic_column_cardinality

    # Only a unique count is supplied.
    assert classify(num_unique=10).name == "VERY_FEW"

    # Only a unique proportion is supplied, and every value is unique.
    assert classify(pct_unique=1.0).name == "UNIQUE"

    # Neither a count nor a proportion is supplied.
    assert classify().name == "NONE"

    # Only a unique proportion of 0.5 is supplied.
    assert classify(pct_unique=0.5).name == "NONE"
def _get_column_cardinality(self, dataset, column):
    """
    Determines the cardinality of a column using the get_basic_column_cardinality method from
    OrderedProfilerCardinality

    Args:
        dataset: A GE Dataset
        column: The column for which to get cardinality

    Returns:
        The name of the cardinality category of the specified column
    """
    num_unique = None
    pct_unique = None

    try:
        # Both expectations are called with (None, None) bounds; only the
        # "observed_value" entry of each result is used.
        num_unique = dataset.expect_column_unique_value_count_to_be_between(
            column, None, None
        ).result["observed_value"]
        pct_unique = dataset.expect_column_proportion_of_unique_values_to_be_between(
            column, None, None
        ).result["observed_value"]
    except KeyError:
        # observed_value is not set. Whichever statistic was not read stays
        # None and is passed through as such below.
        # The column is handed to the logger as a lazy %s argument so that
        # non-string column identifiers (e.g. integer labels) cannot make the
        # error path itself raise while formatting the message.
        logger.error(
            "Failed to get cardinality of column %s - continuing...", column
        )

    # Previously, if we had 25 possible categories out of 1000 rows, this would come up as many,
    # because of its percentage, so it was tweaked here, but is still experimental.
    cardinality = OrderedProfilerCardinality.get_basic_column_cardinality(
        num_unique, pct_unique
    )

    return cardinality.name