def test_df_value_meta(
    self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
):
    """End-to-end check of DataFrameValueType.get_value_meta with all options enabled."""
    # Schema we expect get_value_meta to derive from the frame itself.
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values
        ),
        descriptive_stats=pandas_data_frame_stats,
        histograms=pandas_data_frame_histograms,
    )

    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf
    )

    # Field-by-field comparison first for clearer failure messages.
    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
        expected_value_meta.data_schema
    )
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions

    # "std" is compared after rounding to 2 decimals (fixture precision).
    actual_std = df_value_meta.descriptive_stats["Births"].pop("std")
    expected_std = expected_value_meta.descriptive_stats["Births"].pop("std")
    assert round(actual_std, 2) == expected_std
    # "top" of a categorical column is not stable between runs - drop before comparing.
    df_value_meta.descriptive_stats["Names"].pop("top")
    assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

    actual_counts, actual_values = df_value_meta.histograms.pop("Names")
    expected_counts, expected_values = expected_value_meta.histograms.pop("Names")
    assert actual_counts == expected_counts
    assert set(actual_values) == set(expected_values)  # order changes in each run

    # histograms are tested in histogram tests and they change a lot,
    # no need to test also here
    df_value_meta.histograms = expected_value_meta.histograms = None
    expected_value_meta.histogram_system_metrics = (
        df_value_meta.histogram_system_metrics
    )
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)
def test_spark_df_value_meta(self, spark_data_frame):
    """get_value_meta on a Spark frame reports preview, schema and dimensions.

    Fix: meta_conf was built (and used to construct the expected preview) but
    never passed to get_value_meta, so actual and expected metadata were
    produced under different configurations. It is now passed explicitly,
    consistent with the other spark/pandas tests in this file.
    """
    # count() triggers a Spark job - run it once and reuse the result.
    rows = spark_data_frame.count()
    cols = len(spark_data_frame.columns)
    expected_data_schema = {
        "type": SparkDataFrameValueType.type_str,
        "columns": list(spark_data_frame.schema.names),
        "size": int(rows * cols),
        "shape": (rows, cols),
        "dtypes": {f.name: str(f.dataType) for f in spark_data_frame.schema.fields},
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=SparkDataFrameValueType().to_preview(
            spark_data_frame, meta_conf.get_preview_size()
        ),
        data_dimensions=(rows, cols),
        data_schema=expected_data_schema,
        data_hash=None,
    )

    df_value_meta = SparkDataFrameValueType().get_value_meta(
        spark_data_frame, meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta == expected_value_meta
def get_value_meta(self, value: SqlOperation, meta_conf):
    """Build a ValueMeta for a SQL operation from its recorded dtypes and counts."""
    data_schema = {}
    data_dimensions = None
    if meta_conf.log_schema:
        data_schema = {"type": self.type_str, "dtypes": value.dtypes}
    if meta_conf.log_size:
        data_dimensions = [value.records_count, value.columns_count]
        data_schema["shape"] = data_dimensions
        # todo: size?

    # currently columns_stats and histogram are not supported
    return ValueMeta(
        value_preview=None,
        data_dimensions=data_dimensions,
        data_schema=data_schema,
        data_hash=str(hash(self.to_signature(value))),
        columns_stats=[],
        histogram_system_metrics=None,
        histograms={},
    )
def get_value_meta(self, value, meta_conf):
    # type: (list, ValueMetaConf) -> ValueMeta
    """Collect schema, preview and hash metadata for a plain python list."""
    data_schema = self.get_list_metrics(value, meta_conf)
    data_dimensions = data_schema.get("shape")
    if meta_conf.log_size:
        data_schema["size.bytes"] = value.__sizeof__()

    value_preview = None
    data_hash = None
    if meta_conf.log_preview:
        value_preview = self.to_preview(
            value, preview_size=self.get_preview_size(meta_conf)
        )
        try:
            data_hash = hash(json.dumps(value))
        except Exception as e:
            # Best effort: lists with unserializable contents simply get no hash.
            logger.warning("Could not hash list %s! Exception: %s", value, e)

    # calculating stats, metrics and histograms are out of scope at the moment
    return ValueMeta(
        value_preview=value_preview,
        data_dimensions=data_dimensions,
        data_schema=data_schema if meta_conf.log_schema else None,
        data_hash=data_hash,
        columns_stats=[],
        histogram_system_metrics=None,
        histograms={},
    )
def test_df_value_meta(self, pandas_data_frame):
    """get_value_meta on a pandas frame matches a hand-built ValueMeta."""
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values
        ),
    )

    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf
    )

    # Field-by-field first for readable failures, then the full comparison.
    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
        expected_value_meta.data_schema
    )
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta == expected_value_meta
def get_value_meta(self, value, meta_conf):
    # type: (PostgresTable, ValueMetaConf) -> ValueMeta
    """Gather preview/schema/stats metadata for a postgres table via PostgresController."""
    data_schema = None
    data_preview = None
    with PostgresController(value.connection_string, value.table_name) as postgres:
        stats, histograms = {}, {}
        hist_sys_metrics = None
        if meta_conf.log_histograms or meta_conf.log_stats:
            start_time = time.time()
            stats, histograms = postgres.get_histograms_and_stats(meta_conf)
            # record how long the DB-side stats calculation took
            hist_sys_metrics = {
                "histograms_and_stats_calc_time": time.time() - start_time
            }
        if meta_conf.log_preview:
            data_preview = postgres.to_preview()
        if meta_conf.log_schema:
            data_schema = {
                "type": self.type_str,
                "column_types": postgres.get_column_types(),
            }

    return ValueMeta(
        value_preview=data_preview,
        data_dimensions=None,
        data_schema=data_schema,
        data_hash=self.to_signature(value),
        descriptive_stats=stats,
        histogram_system_metrics=hist_sys_metrics,
        histograms=histograms,
    )
def get_value_meta(self, value, meta_conf):
    # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
    """Describe a pandas DataFrame: schema, size, preview and content hash."""
    data_schema = {}
    if meta_conf.log_schema:
        data_schema.update(
            {
                "type": self.type_str,
                "columns": list(value.columns),
                "shape": value.shape,
                "dtypes": {col: str(type_) for col, type_ in value.dtypes.items()},
            }
        )
    if meta_conf.log_size:
        data_schema["size"] = int(value.size)

    value_preview = None
    data_hash = None
    if meta_conf.log_preview:
        value_preview = self.to_preview(
            value, preview_size=meta_conf.get_preview_size()
        )
        # hash over the row-wise pandas hashes (index included) for a stable content hash
        data_hash = fast_hasher.hash(hash_pandas_object(value, index=True).values)

    return ValueMeta(
        value_preview=value_preview,
        data_dimensions=value.shape,
        data_schema=data_schema,
        data_hash=data_hash,
    )
def get_value_meta(self, value, meta_conf):
    # type: (SnowflakeTable, ValueMetaConf) -> ValueMeta
    """Build table metadata (preview, schema, size) over a snowflake connection."""
    data_schema = {}
    data_preview = None
    data_dimensions = None
    with self.get_snowflake(value) as snowflake:
        if meta_conf.log_preview:
            data_preview = snowflake.to_preview(value)
        if meta_conf.log_schema:
            data_schema = {
                "type": self.type_str,
                "column_types": snowflake.get_column_types(value),
            }
        if meta_conf.log_size:
            dimensions = snowflake.get_dimensions(value)
            data_dimensions = [dimensions["rows"], dimensions["cols"]]
            data_schema["size"] = humanize_bytes(dimensions["bytes"])

    # stats/histograms are not collected for snowflake tables here
    return ValueMeta(
        value_preview=data_preview,
        data_dimensions=data_dimensions,
        data_schema=data_schema,
        data_hash=self.to_signature(value),
        descriptive_stats={},
        histogram_system_metrics=None,
        histograms={},
    )
def get_value_meta(self, value, meta_conf):
    # type: (SnowflakeTable, ValueMetaConf) -> ValueMeta
    """Describe a snowflake table through its attached controller."""
    ctrl = value.snowflake_ctrl
    data_schema = {}
    data_preview = None
    data_dimensions = None
    if meta_conf.log_preview:
        data_preview = ctrl.to_preview(value)
    if meta_conf.log_schema:
        data_schema = {
            "type": self.type_str,
            "dtypes": ctrl.get_column_types(value),
        }
    if meta_conf.log_size:
        dimensions = ctrl.get_dimensions(value)
        data_dimensions = [dimensions["rows"], dimensions["cols"]]
        data_schema["size.bytes"] = dimensions["bytes"]

    # currently columns_stats and histogram are not supported
    return ValueMeta(
        value_preview=data_preview,
        data_dimensions=data_dimensions,
        data_schema=data_schema,
        data_hash=str(hash(self.to_signature(value))),
        columns_stats=[],
        histogram_system_metrics=None,
        histograms={},
    )
def test_str_value_meta(self):
    """A string previews as itself, with schema {"type": "str"} and a fast hash."""
    actual = StrValueType().get_value_meta("foo", ValueMetaConf.enabled())
    expected = ValueMeta(
        value_preview="foo",
        data_dimensions=None,
        data_schema={"type": "str"},
        data_hash=fast_hasher.hash("foo"),
    )
    assert actual == expected
def test_spark_df_value_meta(
    self, spark_data_frame, spark_data_frame_histograms, spark_data_frame_stats
):
    """End-to-end spark get_value_meta, including histogram system metric keys."""
    expected_data_schema = {
        "type": SparkDataFrameValueType.type_str,
        "columns": list(spark_data_frame.schema.names),
        "size.bytes": int(
            spark_data_frame.count() * len(spark_data_frame.columns)
        ),
        "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
        "dtypes": {f.name: str(f.dataType) for f in spark_data_frame.schema.fields},
    }
    expected_hist_sys_metrics = {
        "boolean_histograms_and_stats_calc_time",
        "histograms_and_stats_calc_time",
        "numeric_histograms_and_stats_calc_time",
        "string_histograms_and_stats_calc_time",
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=SparkDataFrameValueType().to_preview(
            spark_data_frame, meta_conf.get_preview_size()
        ),
        data_dimensions=(spark_data_frame.count(), len(spark_data_frame.columns)),
        data_hash=SparkDataFrameValueType().to_signature(spark_data_frame),
        data_schema=expected_data_schema,
        descriptive_stats=spark_data_frame_stats,
        histograms=spark_data_frame_histograms,
    )

    df_value_meta = SparkDataFrameValueType().get_value_meta(
        spark_data_frame, meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    # it changes all the time, it has different formats, and it's already tested in histogram tests
    # assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

    # histogram_system_metrics values are too dynamic, so checking only keys, but not values
    assert (
        set(df_value_meta.histogram_system_metrics.keys())
        == expected_hist_sys_metrics
    )
    df_value_meta.histogram_system_metrics = None
    # assert df_value_meta.histograms == expected_value_meta.histograms
    # assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)

    # NOTE(review): result is unused - presumably a smoke check that the pandas
    # path digests the same data; confirm whether assertions were intended here.
    pandas_data_frame = spark_data_frame.toPandas()
    pandas_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf
    )
def test_target_value_meta(self):
    """A target path previews as its quoted path and hashes via fast_hasher."""
    v = target("a")
    meta_conf = ValueMetaConf.enabled()
    actual = TargetPathLibValueType().get_value_meta(v, meta_conf=meta_conf)
    expected = ValueMeta(
        value_preview='"a"',
        data_dimensions=None,
        data_schema={"type": "Path"},
        data_hash=fast_hasher.hash(v),
    )
    assert actual == expected
def _calc_meta_data(self, data, meta_conf):
    # type: (Any, ValueMetaConf) -> ValueMeta
    """Compute ValueMeta for data, falling back to an empty meta on any failure."""
    if data is None or meta_conf is None:
        return ValueMeta("")
    data_meta = None
    # Combine meta_conf with the config settings
    try:
        data_meta = get_value_meta(
            data, meta_conf, tracking_config=self.settings.tracking
        )
    except Exception as e:
        # best effort - metadata extraction must never break the caller
        log_exception_to_server(e)
    return data_meta if data_meta is not None else ValueMeta("")
def get_value_meta(self, value, meta_conf):
    # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
    """Full pandas metadata: schema, size, preview, hash, stats and histograms."""
    data_schema = {}
    if meta_conf.log_schema:
        data_schema.update(
            {
                "type": self.type_str,
                "columns": list(value.columns),
                "shape": value.shape,
                "dtypes": {col: str(type_) for col, type_ in value.dtypes.items()},
            }
        )
    if meta_conf.log_size:
        data_schema["size.bytes"] = int(value.size)

    value_preview = None
    data_hash = None
    if meta_conf.log_preview:
        value_preview = self.to_preview(
            value, preview_size=meta_conf.get_preview_size()
        )
        try:
            data_hash = fast_hasher.hash(
                hash_pandas_object(value, index=True).values
            )
        except Exception as e:
            # best effort - some frames cannot be hashed row-wise
            logger.warning(
                "Could not hash dataframe object %s! Exception: %s", value, e
            )

    stats, histograms = {}, {}
    hist_sys_metrics = None
    if meta_conf.log_histograms:
        start_time = time.time()
        stats, histograms = PandasHistograms(
            value, meta_conf
        ).get_histograms_and_stats()
        # record how long the stats/histogram calculation took
        hist_sys_metrics = {
            "histograms_and_stats_calc_time": time.time() - start_time
        }

    return ValueMeta(
        value_preview=value_preview,
        data_dimensions=value.shape,
        data_schema=data_schema,
        data_hash=data_hash,
        descriptive_stats=stats,
        histogram_system_metrics=hist_sys_metrics,
        histograms=histograms,
    )
def get_value_meta(self, value, meta_conf):
    # type: (spark.DataFrame, ValueMetaConf) -> ValueMeta
    """Spark DataFrame metadata: schema, preview, shape and optional histograms."""
    data_schema = None
    if meta_conf.log_schema:
        data_schema = {
            "type": self.type_str,
            "columns": list(value.schema.names),
            "dtypes": {f.name: str(f.dataType) for f in value.schema.fields},
        }

    data_preview = None
    if meta_conf.log_preview:
        data_preview = self.to_preview(value, meta_conf.get_preview_size())

    data_dimensions = None
    if meta_conf.log_size:
        # size may be requested without schema logging - make sure the dict exists
        data_schema = data_schema or {}
        rows = value.count()
        cols = len(value.columns)
        data_dimensions = (rows, cols)
        data_schema.update({"size": int(rows * cols), "shape": (rows, cols)})

    df_stats, histogram_dict = {}, {}
    hist_sys_metrics = None
    if meta_conf.log_histograms or meta_conf.log_stats:
        spark_histograms = SparkHistograms(value, meta_conf)
        df_stats, histogram_dict = spark_histograms.get_histograms_and_stats()
        hist_sys_metrics = spark_histograms.system_metrics

    return ValueMeta(
        value_preview=data_preview,
        data_dimensions=data_dimensions,
        data_schema=data_schema,
        data_hash=self.to_signature(value),
        descriptive_stats=df_stats,
        histogram_system_metrics=hist_sys_metrics,
        histograms=histogram_dict,
    )
def validate_numeric_histogram_and_stats(
    self, value_meta: ValueMeta, column_name: str
) -> None:
    """assuming numbers fixture is used"""
    assert column_name in value_meta.histograms
    histogram = value_meta.histograms[column_name]
    # a (counts, bin-edges) pair: 20 bins means 21 edges
    assert len(histogram) == 2
    counts, edges = histogram
    assert len(counts) == 20
    assert len(edges) == 21
    assert sum(counts) == 8  # matches non_null_count below - nulls are not binned

    col_stats = value_meta.get_column_stats_by_col_name(column_name)
    assert col_stats.records_count == 10
    assert col_stats.non_null_count == 8
    assert col_stats.distinct_count == 4
    assert col_stats.min_value == 1
    assert col_stats.max_value == 5
def get_value_meta(self, value, meta_conf):
    # type: (spark.DataFrame, ValueMetaConf) -> ValueMeta
    """Spark DataFrame metadata: schema, preview, summary stats, shape, histograms.

    Bug fix: with log_stats enabled but log_schema disabled, data_schema was
    still None and ``data_schema["stats"] = ...`` raised TypeError. The stats
    branch now materializes the dict first, the same guard the log_size branch
    already used.
    """
    if meta_conf.log_schema:
        data_schema = {
            "type": self.type_str,
            "columns": list(value.schema.names),
            "dtypes": {f.name: str(f.dataType) for f in value.schema.fields},
        }
    else:
        data_schema = None

    if meta_conf.log_preview:
        data_preview = self.to_preview(value, meta_conf.get_preview_size())
    else:
        data_preview = None

    if meta_conf.log_stats:
        # FIX: ensure the schema dict exists even when log_schema is off
        data_schema = data_schema or {}
        data_schema["stats"] = self.to_preview(
            value.summary(), meta_conf.get_preview_size()
        )

    if meta_conf.log_size:
        data_schema = data_schema or {}
        # count() triggers a spark job - run it once
        rows = value.count()
        data_dimensions = (rows, len(value.columns))
        data_schema.update(
            {
                "size": int(rows * len(value.columns)),
                "shape": (rows, len(value.columns)),
            }
        )
    else:
        data_dimensions = None

    df_stats, histograms = None, None
    if meta_conf.log_df_hist:
        df_stats, histograms = self.get_histograms(value)

    return ValueMeta(
        value_preview=data_preview,
        data_dimensions=data_dimensions,
        data_schema=data_schema,
        data_hash=self.to_signature(value),
        descriptive_stats=df_stats,
        histograms=histograms,
    )
def get_value_meta(self, value, meta_conf):
    # type: (Any, ValueMetaConf) -> ValueMeta
    """Generic fallback metadata: preview and hash only; schema is the type name."""
    preview = None
    data_hash = None
    if meta_conf.log_preview:
        preview = self.to_preview(value, preview_size=meta_conf.get_preview_size())
        data_hash = _safe_hash(value)

    return ValueMeta(
        value_preview=preview,
        data_dimensions=None,
        data_schema={"type": self.type_str},
        data_hash=data_hash,
    )
def get_value_meta(self, value: RedshiftOperation, meta_conf: ValueMetaConf):
    """Metadata for a redshift operation, using a captured dataframe for stats when present."""
    dimensions = value.schema["shape"] if meta_conf.log_size else None
    data_schema = value.schema if meta_conf.log_schema else None

    column_stats = {}
    if meta_conf.log_stats:
        if value.dataframe is not None:
            # recompute column stats from the dataframe when one was captured
            column_stats, _ = PandasHistograms(
                value.dataframe, meta_conf
            ).get_histograms_and_stats()
        else:
            column_stats = value.column_stats

    preview = value.preview if meta_conf.log_preview else ""

    # currently, histograms are not supported
    return ValueMeta(
        value_preview=preview,
        data_dimensions=dimensions,
        data_schema=data_schema,
        data_hash=str(hash(self.to_signature(value))),
        columns_stats=column_stats,
        histogram_system_metrics=None,
        histograms={},
        query=value.query,
    )
def test_df_value_meta(self, pandas_data_frame):
    """get_value_meta with everything enabled: core fields plus stats/histogram keys.

    Fix: the data_schema equality was asserted twice; the duplicate is removed.
    """
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size.bytes": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values
        ),
    )

    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions

    # histograms and stats are tested in histogram tests and they change a lot,
    # no need to test also here - only check which columns were covered
    assert {
        col_stats.column_name for col_stats in df_value_meta.columns_stats
    } == {"Names", "Births"}
    assert set(df_value_meta.histograms.keys()) == {"Names", "Births"}