def _handle_convert_table_evrs( self, profile: DatasetProfileClass, table_evrs: Iterable[ExpectationValidationResult], pretty_name: str, ) -> None: # TRICKY: This method mutates the profile directly. for evr in table_evrs: exp: str = evr.expectation_config.expectation_type res: dict = evr.result if exp == "expect_table_row_count_to_be_between": profile.rowCount = res["observed_value"] elif exp == "expect_table_columns_to_match_ordered_list": profile.columnCount = len(res["observed_value"]) else: self.report.report_warning(f"profile of {pretty_name}", f"unknown table mapper {exp}")
def generate_dataset_profile(  # noqa: C901 (complexity)
    self,
) -> DatasetProfileClass:
    """Run the full profiling pipeline and return a populated profile.

    The work is batched into three query-combiner stages:
      1. row count + list of columns to profile,
      2. per-column type and cardinality,
      3. per-column statistics, chosen by type and cardinality.

    Returns:
        A ``DatasetProfileClass`` with ``rowCount``, ``columnCount``,
        and one ``DatasetFieldProfileClass`` per table column (stats are
        only filled in for the columns selected for profiling).
    """
    self.dataset.set_default_expectation_argument(
        "catch_exceptions", self.config.catch_exceptions
    )

    profile = DatasetProfileClass(timestampMillis=get_sys_time())
    if self.partition:
        profile.partitionSpec = PartitionSpecClass(partition=self.partition)
    profile.fieldProfiles = []

    # Stage 1: row count + column discovery.
    self._get_dataset_rows(profile)

    all_columns = self.dataset.get_table_columns()
    profile.columnCount = len(all_columns)
    # Set for O(1) membership tests in the loop below.
    columns_to_profile = set(self._get_columns_to_profile())

    logger.debug(f"profiling {self.dataset_name}: flushing stage 1 queries")
    self.query_combiner.flush()

    # Stage 2: type and cardinality for each profiled column. Every
    # column gets a field profile; only selected columns get stats.
    columns_profiling_queue: List[_SingleColumnSpec] = []
    for column in all_columns:
        column_profile = DatasetFieldProfileClass(fieldPath=column)
        profile.fieldProfiles.append(column_profile)

        if column in columns_to_profile:
            column_spec = _SingleColumnSpec(column, column_profile)
            columns_profiling_queue.append(column_spec)

            self._get_column_type(column_spec, column)
            self._get_column_cardinality(column_spec, column)

    logger.debug(f"profiling {self.dataset_name}: flushing stage 2 queries")
    self.query_combiner.flush()

    # Stage 1 must have populated the row count before we can compute
    # null/unique proportions below.
    assert profile.rowCount is not None
    row_count: int = profile.rowCount

    telemetry.telemetry_instance.ping(
        "profile_sql_table",
        # bucket by taking floor of log of the number of rows scanned
        {
            "rows_profiled": 10 ** int(log10(row_count + 1)),
        },
    )

    # Cardinalities low enough that per-value frequency counts are
    # worthwhile; shared by every type branch below (was previously
    # duplicated inline four times).
    frequency_cardinalities = [
        Cardinality.ONE,
        Cardinality.TWO,
        Cardinality.VERY_FEW,
        Cardinality.FEW,
    ]

    # Stage 3: per-column statistics.
    for column_spec in columns_profiling_queue:
        column = column_spec.column
        column_profile = column_spec.column_profile
        type_ = column_spec.type_
        cardinality = column_spec.cardinality

        non_null_count = column_spec.nonnull_count
        unique_count = column_spec.unique_count

        if self.config.include_field_null_count and non_null_count is not None:
            # Approximate queries can make this negative; clamp to 0.
            null_count = max(0, row_count - non_null_count)
            column_profile.nullCount = null_count
            if row_count > 0:
                # Approximate queries can push this above 1; clamp.
                column_profile.nullProportion = min(1, null_count / row_count)

        if unique_count is not None:
            column_profile.uniqueCount = unique_count
            if non_null_count is not None and non_null_count > 0:
                # Approximate queries can push this above 1; clamp.
                column_profile.uniqueProportion = min(
                    1, unique_count / non_null_count
                )

        self._get_dataset_column_sample_values(column_profile, column)

        if (
            type_ == ProfilerDataType.INT
            or type_ == ProfilerDataType.FLOAT
            or type_ == ProfilerDataType.NUMERIC
        ):
            if cardinality == Cardinality.UNIQUE:
                # All-unique numeric column: aggregates add no value.
                pass
            elif cardinality in [
                # NOTE: Cardinality.UNIQUE was listed here too, but it is
                # unreachable (handled by the branch above) and has been
                # removed.
                Cardinality.ONE,
                Cardinality.TWO,
                Cardinality.VERY_FEW,
                Cardinality.FEW,
                Cardinality.MANY,
                Cardinality.VERY_MANY,
            ]:
                self._get_dataset_column_min(column_profile, column)
                self._get_dataset_column_max(column_profile, column)
                self._get_dataset_column_mean(column_profile, column)
                self._get_dataset_column_median(column_profile, column)

                if type_ == ProfilerDataType.INT:
                    self._get_dataset_column_stdev(column_profile, column)

                self._get_dataset_column_quantiles(column_profile, column)
                self._get_dataset_column_histogram(column_profile, column)
                if cardinality in frequency_cardinalities:
                    self._get_dataset_column_distinct_value_frequencies(
                        column_profile,
                        column,
                    )
            else:  # unknown cardinality - skip
                pass

        elif type_ == ProfilerDataType.STRING:
            if cardinality in frequency_cardinalities:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

        elif type_ == ProfilerDataType.DATETIME:
            self._get_dataset_column_min(column_profile, column)
            self._get_dataset_column_max(column_profile, column)

            # FIXME: Re-add histogram once kl_divergence has been modified to support datetimes

            if cardinality in frequency_cardinalities:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

        else:
            if cardinality in frequency_cardinalities:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

    logger.debug(f"profiling {self.dataset_name}: flushing stage 3 queries")
    self.query_combiner.flush()

    return profile