def calc_meta_conf_for_value_type(tracking_level, value_type, target=None):
    # type: (ValueTrackingLevel, ValueType, Optional[Target]) -> ValueMetaConf
    """
    Compute value-logging restrictions for *value_type* based on the tracking level,
    so that lazy-evaluated types (e.g. spark dataframes) are not tracked expensively.

    IMPORTANT - the result is a ValueMetaConf holding restrictions only!
    It is expected to be merged into a full ValueMetaConf by the caller.
    """
    if tracking_level == ValueTrackingLevel.ALL:
        # fully permissive - nothing to restrict
        return ValueMetaConf()

    if tracking_level == ValueTrackingLevel.SMART:
        if value_type.is_lazy_evaluated:
            # restrict only for lazy evaluated values
            return ValueMetaConf.disabled_expensive()

        restricted = ValueMetaConf()
        if target is not None and not value_type.support_fast_count(target):
            # we don't set it to True cause there might
            # be different configuration that will want it to be False
            restricted = attr.evolve(restricted, log_size=False)
        return restricted

    if tracking_level == ValueTrackingLevel.NONE:
        # restrict anything expensive
        return ValueMetaConf.disabled_expensive()
def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
    """Histogram metrics produced by log_data are readable back from the file store.

    FIX: removed the `expected_preview` local - it was built but never asserted
    against anything (dead code left over from an older version of the test).
    """
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    # force a fully-enabled meta conf so histograms are actually calculated
    tr_tracker.settings.tracking.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )
    tr_tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

    hist_metrics = TaskRunMetricsFileStoreReader(
        metrics_folder
    ).get_all_metrics_values(MetricSource.histograms)

    # std value varies in different py versions due to float precision fluctuation
    df_births_std = hist_metrics["df.Births.std"]
    assert df_births_std == pytest.approx(428.4246)
def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
    """End-to-end: metrics and a dataframe logged through TaskRunTracker land in
    the file store and read back with the expected keys and coerced values."""
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    # force fully-enabled value meta so preview/shape/schema are all written
    tr_tracker.settings.features.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled())
    tr_tracker.log_metric("a", 1)
    tr_tracker.log_metric("a_string", "1")
    tr_tracker.log_metric("a_list", [1, 3])
    tr_tracker.log_metric("a_tuple", (1, 2))
    tr_tracker.log_dataframe("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())
    actual = TaskRunMetricsFileStoreReader(
        metrics_folder).get_all_metrics_values()
    print(actual)
    # schema content is large/unstable - assert presence only, then drop it
    # so the equality check below stays exact
    assert "df.schema" in actual
    del actual["df.schema"]
    # note: numeric-looking strings come back as floats ("1" -> 1.0),
    # containers come back stringified - this pins the store's coercion rules
    assert actual == {
        "a": 1.0,
        "a_list": "[1, 3]",
        "a_string": 1.0,
        "a_tuple": "(1, 2)",
        "df.preview": "Names Births",
        "df.shape": "(5, 2)",
        "df.shape_0_": 5.0,
        "df.shape_1_": 2.0,
    }
class TestValueMetaConf(object):
    """Unit tests for ValueMetaConf.merge_if_none - pairwise and left-fold merging."""

    @pytest.mark.parametrize(
        "left, right, expected",
        [
            # a fully-unset left takes everything from the right, and vice versa
            (ALL_NONE, ALL_TRUE, ALL_TRUE),
            (ALL_TRUE, ALL_NONE, ALL_TRUE),
            (
                # only the unset (None) histograms flag is filled in from the right
                ValueMetaConf(
                    log_schema=True, log_size=False, log_preview=True, log_stats=True
                ),
                ALL_FALSE,
                ValueMetaConf(
                    log_schema=True,
                    log_size=False,
                    log_preview=True,
                    log_stats=True,
                    log_histograms=False,
                ),
            ),
            # a fully-set left side is never overridden
            (ALL_FALSE, ALL_TRUE, ALL_FALSE),
            (ALL_TRUE, ALL_FALSE, ALL_TRUE),
        ],
    )
    def test_merging_2(self, left, right, expected):
        # merge_if_none only fills attributes that are None on the left side
        assert left.merge_if_none(right) == expected

    @pytest.mark.parametrize(
        "meta_conf_list, expected",
        [
            ([ALL_NONE, ALL_TRUE, ALL_FALSE], ALL_TRUE),
            ([ALL_NONE, ALL_NONE, ALL_FALSE], ALL_FALSE),
            (
                [
                    ALL_NONE,
                    ValueMetaConf(
                        log_preview=True,
                        log_schema=True,
                        log_size=True,
                        log_stats=False,
                    ),
                    ALL_FALSE,
                ],
                ValueMetaConf(
                    log_preview=True,
                    log_schema=True,
                    log_size=True,
                    log_stats=False,
                    log_histograms=False,
                ),
            ),
        ],
    )
    def test_summing(self, meta_conf_list, expected):
        # left-fold over the list: earlier confs take precedence over later ones
        assert reduce(lambda x, y: x.merge_if_none(y), meta_conf_list) == expected
def test_spark_df_value_meta(self, spark_data_frame):
    """SparkDataFrameValueType.get_value_meta equals a manually assembled ValueMeta."""
    expected_data_schema = {
        "type": SparkDataFrameValueType.type_str,
        "columns": list(spark_data_frame.schema.names),
        # rows * columns, matching how the value type reports "size"
        "size": int(spark_data_frame.count() * len(spark_data_frame.columns)),
        "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
        "dtypes": {f.name: str(f.dataType) for f in spark_data_frame.schema.fields},
    }

    meta_conf = ValueMetaConf.enabled()

    expected_value_meta = ValueMeta(
        value_preview=SparkDataFrameValueType().to_preview(
            spark_data_frame, meta_conf.get_preview_size()
        ),
        data_dimensions=(spark_data_frame.count(), len(spark_data_frame.columns)),
        data_schema=expected_data_schema,
        # spark dataframes are not hashed here
        data_hash=None,
    )

    # NOTE(review): meta_conf is built above but NOT passed to get_value_meta -
    # presumably the default conf yields an identical preview; confirm intentional.
    df_value_meta = SparkDataFrameValueType().get_value_meta(spark_data_frame)

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta == expected_value_meta
def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
    """Metrics logged via log_metric round-trip through the file store (user source)."""
    folder = target(str(tmpdir))
    run_mock = Mock()
    run_mock.meta_files = TaskRunMetaFiles(folder)
    store = FileTrackingStore()
    tracker = TaskRunTracker(task_run=run_mock, tracking_store=store)
    tracker.settings.features.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )

    for name, metric_value in [
        ("a", 1),
        ("a_string", "1"),
        ("a_list", [1, 3]),
        ("a_tuple", (1, 2)),
    ]:
        tracker.log_metric(name, metric_value)

    user_metrics = TaskRunMetricsFileStoreReader(folder).get_all_metrics_values(
        MetricSource.user
    )
    # numeric-looking values come back as floats, containers stringified
    assert user_metrics == {
        "a": 1.0,
        "a_list": "[1, 3]",
        "a_string": 1.0,
        "a_tuple": "(1, 2)",
    }
def test_get_value_meta_empty(self, snowflake_table):
    """With preview/schema/size logging disabled, the controller is never queried
    and only the masked connection-string hash is reported."""
    # Arrange
    with mock.patch(
        "dbnd_snowflake.snowflake_values.SnowflakeController",
        new_callable=snowflake_controller_mock,
    ) as snowflake:
        # Act
        value_meta = SnowflakeTableValueType().get_value_meta(
            snowflake_table,
            meta_conf=(ValueMetaConf(log_preview=False, log_schema=False, log_size=False)),
        )

        # Assert
        assert value_meta.value_preview is None
        assert value_meta.data_dimensions is None
        assert value_meta.data_schema == {}
        # credentials are masked in the hash/connection string
        assert (
            value_meta.data_hash
            == "snowflake://*****:*****@SNOWFLAKE_ACCOUNT/SNOWFLAKE_SAMPLE_DATA.TPCDS_SF100TCL/CUSTOMER"
        )
        # nothing expensive should have been queried on the controller
        assert not snowflake.get_column_types.called
        assert not snowflake.get_dimensions.called
        assert not snowflake.to_preview.called
def _log_inputs(task_run):
    """
    For tracking mode. Logs InMemoryTarget inputs.

    Every failure is reported as non-critical so tracking never breaks the run.
    """
    try:
        params = task_run.task._params
        for param in params.get_params(input_only=True):
            value = params.get_value(param.name)
            # only in-memory targets are logged here
            if not isinstance(value, InMemoryTarget):
                continue
            try:
                logged_param = param.modify(
                    value_meta_conf=ValueMetaConf(log_preview=True, log_schema=True)
                )
                task_run.tracker.log_parameter_data(
                    parameter=logged_param,
                    target=value,
                    value=value._obj,
                    operation_type=DbndTargetOperationType.read,
                    operation_status=DbndTargetOperationStatus.OK,
                )
            except Exception as ex:
                log_exception(
                    "Failed to log input param to tracking store.",
                    ex=ex,
                    non_critical=True,
                )
    except Exception as ex:
        log_exception(
            "Failed to log input params to tracking store.",
            ex=ex,
            non_critical=True,
        )
def log_dataframe(key, value, with_preview=True, with_size=True, with_schema=True, with_stats=False):
    # type: (str, Union[pd.DataFrame, spark.DataFrame], bool,bool, bool) -> None
    """Log a dataframe's metadata (preview/schema/size/stats) under *key*.

    Prefers the active tracker; without one, falls back to a local info log.
    """
    meta_conf = ValueMetaConf(
        log_preview=with_preview,
        log_schema=with_schema,
        log_size=with_size,
        log_stats=with_stats,
    )

    tracker = _get_tracker()
    if tracker:
        tracker.log_dataframe(key, value, meta_conf=meta_conf)
        return

    # no active tracker - compute the meta locally and log a short summary
    from dbnd._core.task_run.task_run_tracker import get_value_meta_for_metric

    value_meta = get_value_meta_for_metric(key, value, meta_conf=meta_conf)
    if value_meta:
        logger.info(
            "Log DataFrame '{}': shape='{}'".format(key, value_meta.data_dimensions)
        )
    else:
        logger.info(
            "Log DataFrame '{}': {} is not supported".format(key, type(value))
        )
def log_data(
    key,  # type: str
    value=None,  # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable]
    path=None,  # type: Optional[str]
    operation_type=DbndTargetOperationType.read,  # type: DbndTargetOperationType
    with_preview=None,  # type: Optional[bool]
    with_size=None,  # type: Optional[bool]
    with_schema=None,  # type: Optional[bool]
    with_stats=None,  # type: Optional[Union[bool, str, List[str], LogDataRequest]]
    with_histograms=None,  # type: Optional[Union[bool, str, List[str], LogDataRequest]]
    raise_on_error=False,  # type: bool
):  # type: (...) -> None
    """Report data information (preview/schema/size/stats/histograms) for *value*."""
    tracker = _get_tracker()
    if not tracker:
        # tracking is not active - nothing to report
        return

    tracker.log_data(
        key,
        value,
        meta_conf=ValueMetaConf(
            log_preview=with_preview,
            log_schema=with_schema,
            log_size=with_size,
            log_stats=with_stats,
            log_histograms=with_histograms,
        ),
        path=path,
        operation_type=operation_type,
        raise_on_error=raise_on_error,
    )
def test_df_value_meta(self, pandas_data_frame):
    """DataFrameValueType.get_value_meta equals a manually assembled ValueMeta."""
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }

    meta_conf = ValueMetaConf.enabled()

    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        # hash derives from the full pandas object hash, index included
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values
        ),
    )

    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    # compare schemas via json dumps to normalize value representations
    assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
        expected_value_meta.data_schema
    )
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta == expected_value_meta
def test_get_value_meta(self, snowflake_table):
    """A fully-enabled conf queries the (mocked) SnowflakeController for preview,
    dimensions and column types."""
    # Arrange
    with mock.patch(
        "dbnd_snowflake.snowflake_values.SnowflakeController",
        new_callable=snowflake_controller_mock,
    ) as snowflake:
        # Act
        value_meta = SnowflakeTableValueType().get_value_meta(
            snowflake_table, meta_conf=(ValueMetaConf.enabled())
        )

        # Assert - values below come from the controller mock's canned returns
        assert value_meta.value_preview == "test preview"
        assert value_meta.data_dimensions == [42, 12]
        assert value_meta.data_schema == {
            "type": "SnowflakeTable",
            "column_types": {"name": "varchar"},
            "size": "500 B",
        }
        # credentials are masked in the hash/connection string
        assert (
            value_meta.data_hash
            == "snowflake://*****:*****@SNOWFLAKE_ACCOUNT/SNOWFLAKE_SAMPLE_DATA.TPCDS_SF100TCL/CUSTOMER"
        )
        assert snowflake.get_column_types.called
        assert snowflake.get_dimensions.called
        assert snowflake.to_preview.called
def _log_parameter_value(task_run, parameter_definition, target, value):
    """Register a produced *value* in target-origin tracking and log it to the
    tracking store; every failure is logged as non-critical so the run continues."""
    # make sure it will be logged correctly
    parameter_definition = parameter_definition.modify(
        value_meta_conf=ValueMetaConf(log_preview=True, log_schema=True, )
    )

    try:
        # case what if result is Proxy
        value_type = get_value_type_of_obj(value, parameter_definition.value_type)
        task_run.run.target_origin.add(target, value, value_type)
    except Exception as ex:
        log_exception(
            "Failed to register result to target tracking.", ex=ex, non_critical=True
        )

    try:
        task_run.tracker.log_parameter_data(
            parameter=parameter_definition,  # was: task_run.task.task_definition.task_class.result,
            target=target,
            value=value,
            operation_type=DbndTargetOperationType.write,  # is it write? (or log?)
            operation_status=DbndTargetOperationStatus.OK,
        )
    except Exception as ex:
        log_exception(
            "Failed to log result to tracking store.", ex=ex, non_critical=True
        )
def meta_conf(self):
    """Build a ValueMetaConf from this object's with_* logging flags.

    FIX: log_size was wired to self.with_schema (copy-paste error - two
    consecutive lines both read with_schema); it now reads self.with_size,
    matching the with_*/log_* pairing used by every other call site in this
    codebase (e.g. log_dataframe). TODO confirm the enclosing class defines
    with_size - it is not visible in this view.
    """
    return ValueMetaConf(
        log_preview=self.with_preview,
        log_schema=self.with_schema,
        log_size=self.with_size,
        log_stats=self.with_stats,
        log_histograms=self.with_histograms,
    )
def test_str_value_meta(self):
    """StrValueType produces the expected ValueMeta for a plain string."""
    actual = StrValueType().get_value_meta("foo", ValueMetaConf.enabled())

    expected = ValueMeta(
        value_preview="foo",
        data_dimensions=None,
        data_schema={"type": "str"},
        data_hash=fast_hasher.hash("foo"),
    )
    assert actual == expected
def _build_parameter(self, context="inline"):
    """Finalize a ParameterDefinition: resolve value type, validator, defaults,
    value-meta conf and description, then return the updated parameter.

    FIX: the original did `update_kwargs["description"] = description()` - calling
    a string (TypeError on every invocation) - and then ALSO passed
    `description=description` to self.modify(**update_kwargs), which would raise
    "got multiple values for keyword argument". The update_kwargs assignment is
    removed; description is passed once, explicitly.
    """
    s = self.parameter  # type: ParameterDefinition
    update_kwargs = {}

    value_type = self._build_value_type(context)

    validator = s.validator
    if s.choices:
        validator = ChoiceValidator(s.choices)

    if is_not_defined(s.default):
        if s.empty_default:
            update_kwargs["default"] = value_type._generate_empty_default()

    if not is_defined(s.load_on_build):
        update_kwargs["load_on_build"] = value_type.load_on_build

    # create value meta
    if s.value_meta_conf is None:
        update_kwargs["value_meta_conf"] = ValueMetaConf(
            log_preview=s.log_preview,
            log_preview_size=s.log_preview_size,
            log_schema=s.log_schema,
            log_size=s.log_size,
            log_stats=s.log_stats,
            log_histograms=s.log_histograms,
        )

    # Whether different values for this parameter will differentiate otherwise equal tasks
    # NOTE(review): `or ""` makes description always "defined", so the default-
    # description branch below may be dead - confirm is_defined("") semantics.
    description = s.description or ""
    if not is_defined(description):
        if s.is_output() and s.default_output_description:
            description = s.default_output_description
        elif not s.load_on_build and s.default_input_description:
            description = s.default_input_description
        else:
            description = s.default_description

    if s.validator:
        description = _add_description(description, validator.description)

    # We need to keep track of this to get the order right (see Task class)
    ParameterDefinition._total_counter += 1

    if s.kind == _ParameterKind.task_output:
        update_kwargs["significant"] = False

    updated = self.modify(
        value_type=value_type,
        value_type_defined=value_type,
        validator=validator,
        description=description,
        parameter_id=ParameterDefinition._total_counter,
        **update_kwargs
    )
    return updated.parameter
def test_spark_df_value_meta(self, spark_data_frame, spark_data_frame_histograms, spark_data_frame_stats):
    """Full get_value_meta check for spark dataframes including stats/histograms
    plumbing; volatile pieces (stats values, histogram contents) are not compared."""
    expected_data_schema = {
        "type": SparkDataFrameValueType.type_str,
        "columns": list(spark_data_frame.schema.names),
        # rows * columns, under the "size.bytes" key this value type reports
        "size.bytes": int(spark_data_frame.count() * len(spark_data_frame.columns)),
        "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
        "dtypes": {f.name: str(f.dataType) for f in spark_data_frame.schema.fields},
    }

    # only the KEYS of histogram_system_metrics are stable enough to assert
    expected_hist_sys_metrics = {
        "boolean_histograms_and_stats_calc_time",
        "histograms_and_stats_calc_time",
        "numeric_histograms_and_stats_calc_time",
        "string_histograms_and_stats_calc_time",
    }

    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=SparkDataFrameValueType().to_preview(
            spark_data_frame, meta_conf.get_preview_size()
        ),
        data_dimensions=(spark_data_frame.count(), len(spark_data_frame.columns)),
        data_hash=SparkDataFrameValueType().to_signature(spark_data_frame),
        data_schema=expected_data_schema,
        descriptive_stats=spark_data_frame_stats,
        histograms=spark_data_frame_histograms,
    )

    df_value_meta = SparkDataFrameValueType().get_value_meta(
        spark_data_frame, meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    # it changes all the time, it has different formats, and it's already tested in histogram tests
    # assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

    # histogram_system_metrics values are too dynamic, so checking only keys, but not values
    assert (
        set(df_value_meta.histogram_system_metrics.keys()) == expected_hist_sys_metrics
    )
    df_value_meta.histogram_system_metrics = None
    # assert df_value_meta.histograms == expected_value_meta.histograms
    # assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)

    # NOTE(review): the pandas meta below is computed but never asserted within
    # this view - presumably compared further down; confirm it is actually used.
    pandas_data_frame = spark_data_frame.toPandas()
    pandas_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf
    )
def test_df_value_meta(
    self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
):
    """Full get_value_meta check for pandas dataframes including stats/histograms;
    float-precision-sensitive and order-sensitive pieces are normalized first."""
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }

    meta_conf = ValueMetaConf.enabled()

    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values
        ),
        descriptive_stats=pandas_data_frame_stats,
        histograms=pandas_data_frame_histograms,
    )

    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    # compare schemas via json dumps to normalize value representations
    assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
        expected_value_meta.data_schema
    )
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions

    # std is float-precision sensitive: pop both sides and compare rounded
    std = df_value_meta.descriptive_stats["Births"].pop("std")
    expected_std = expected_value_meta.descriptive_stats["Births"].pop("std")
    assert round(std, 2) == expected_std
    # "top" of a categorical column is not deterministic - drop it
    df_value_meta.descriptive_stats["Names"].pop("top")
    assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

    # Names histogram values come back in varying order - compare as sets
    counts, values = df_value_meta.histograms.pop("Names")
    expected_counts, expected_values = expected_value_meta.histograms.pop("Names")
    assert counts == expected_counts
    assert set(values) == set(expected_values)  # order changes in each run

    # histograms are tested in histogram tests and they change a lot, no need to test also here
    df_value_meta.histograms = expected_value_meta.histograms = None
    # system metrics are too dynamic to compare - copy them over before equality
    expected_value_meta.histogram_system_metrics = (
        df_value_meta.histogram_system_metrics
    )

    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)
def test_task_run_sync_local_multi_target(
    self,
    monkeypatch,
    my_multitarget,
    test_task,
    create_local_multitarget,
    mock_fs_download,
    mock_file_metadata_registry,
    mock_target_move_from,
):
    """Multi-target inputs are downloaded to local paths on sync_pre_execute and
    restored to the original remote targets on sync_post_execute."""
    test_task = test_task.t(my_multitarget)
    task_run = test_task.dbnd_run().root_task_run
    sync_local = task_run.sync_local

    # exactly one input should be registered for syncing
    assert len(sync_local.inputs_to_sync) == 1
    task_param, old_multitarget = sync_local.inputs_to_sync[0]
    # pin a fully-specified meta conf so comparison below is deterministic
    task_param.value_meta_conf = ValueMetaConf(
        log_preview=True,
        log_preview_size=10000,
        log_schema=True,
        log_size=True,
        log_stats=LogDataRequest(
            include_all_boolean=True,
            include_all_numeric=True,
            include_all_string=True,
        ),
        log_histograms=LogDataRequest(),
    )
    assert task_param == test_task._params.get_param("input_")
    assert old_multitarget == my_multitarget

    local_multitarget = create_local_multitarget()

    with mock_fs_download as mocked_fs_download, mock_file_metadata_registry, mock_target_move_from as mock_target_move_from:
        monkeypatch.setattr(FileTarget, "tmp", mock_tmp)

        # only pre_execute is checked because post_execute code is unreachable for MultiTargets
        sync_local.sync_pre_execute()
        # one download per sub-target of the multitarget
        assert mocked_fs_download.call_count == 2
        mocked_fs_download.assert_has_calls([
            call(remote_subtarget.path, TMP_FILE_PATH)
            for remote_subtarget, local_subtarget in zip(
                my_multitarget.targets, local_multitarget.targets)
        ])

        # check if test_task.input_ was changed to local after sync_pre_execute
        self.compare_multitargets(test_task.input_, local_multitarget)

        sync_local.sync_post_execute()
        # check if test_task.input_ was set back to original target
        self.compare_multitargets(test_task.input_, my_multitarget)
def test_get_value_meta_preview_small_size(self, value, value_type, target, expected_value_preview):
    """Previews produced under the ALL reporting strategy match expectations."""
    config = TrackingConfig.from_databand_context()
    # ALL = no restrictions, so previews are always computed
    config.value_reporting_strategy = ValueTrackingLevel.ALL

    meta = get_value_meta(
        value,
        ValueMetaConf(),
        config,
        value_type=value_type,
        target=target,
    )

    assert meta.value_preview == expected_value_preview
def test_target_value_meta(self):
    """TargetPathLibValueType produces the expected meta for a simple path target."""
    path_target = target("a")

    actual = TargetPathLibValueType().get_value_meta(
        path_target, meta_conf=ValueMetaConf.enabled()
    )

    expected = ValueMeta(
        value_preview='"a"',
        data_dimensions=None,
        data_schema={"type": "Path"},
        data_hash=fast_hasher.hash(path_target),
    )
    assert actual == expected
def _build_meta_conf(self):
    # type: () -> ValueMetaConf
    """
    Translate this configuration into value meta conf
    WE EXPECT IT TO HAVE ALL THE INNER VALUES SET WITHOUT NONES
    """
    conf_kwargs = dict(
        log_preview=self.log_value_preview,
        log_preview_size=self.log_value_preview_max_len,
        log_schema=self.log_value_schema,
        log_size=self.log_value_size,
        log_stats=self.log_value_stats,
        log_histograms=self.log_histograms,
    )
    return ValueMetaConf(**conf_kwargs)
def log_data(
    key,  # type: str
    value=None,  # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable]
    path=None,  # type: Optional[str]
    operation_type=DbndTargetOperationType.read,  # type: DbndTargetOperationType
    with_preview=None,  # type: Optional[bool]
    with_size=None,  # type: Optional[bool]
    with_schema=None,  # type: Optional[bool]
    with_stats=None,  # type: Optional[Union[bool, str, List[str], LogDataRequest]]
    with_histograms=None,  # type: Optional[Union[bool, str, List[str], LogDataRequest]]
    raise_on_error=False,  # type: bool
):  # type: (...) -> None
    """
    Log data information to dbnd.

    @param key: Name of the data.
    @param value: Value of the data, currently supporting only dataframes and tables view.
    @param path: Optional target or path representing a target to connect the data to.
    @param operation_type: Type of the operation doing with the target - reading or writing the data?
    @param with_preview: True if should log a preview of the data.
    @param with_size: True if should log the size of the data.
    @param with_schema: True if should log the schema of the data.
    @param with_stats: True if should calculate and log stats of the data.
    @param with_histograms: True if should calculate and log histogram of the data.
    @param raise_on_error: raise if error occur.
    """
    tracker = _get_tracker()
    if not tracker:
        # warn once per process so inactive tracking doesn't spam the log
        message = TRACKER_MISSING_MESSAGE % ("log_data",)
        get_one_time_logger().log_once(message, "log_data", logging.WARNING)
        return

    tracker.log_data(
        key,
        value,
        meta_conf=ValueMetaConf(
            log_preview=with_preview,
            log_schema=with_schema,
            log_size=with_size,
            log_stats=with_stats,
            log_histograms=with_histograms,
        ),
        path=path,
        operation_type=operation_type,
        raise_on_error=raise_on_error,
    )
def log_dataframe(
    self,
    key,
    df,
    with_preview=True,
    with_schema=True,
    with_size=True,
    with_stats=False,
):
    """Forward dataframe *df* to the underlying tracker with the selected meta flags."""
    self.tracker.log_data(
        key,
        df,
        meta_conf=ValueMetaConf(
            log_preview=with_preview,
            log_schema=with_schema,
            log_size=with_size,
            log_stats=with_stats,
        ),
    )
def test_get_value_meta_empty(self, snowflake_table):
    """With preview/schema/size disabled only the signature hash is produced
    and the controller is never queried."""
    disabled_conf = ValueMetaConf(log_preview=False, log_schema=False, log_size=False)

    value_meta = SnowflakeTableValueType().get_value_meta(
        snowflake_table, meta_conf=disabled_conf
    )

    # Assert
    assert value_meta.value_preview is None
    assert value_meta.data_dimensions is None
    assert value_meta.data_schema == {}
    assert value_meta.data_hash == EXPECTED_SNOWFLAKE_TABLE_SIGNATURE

    ctrl = snowflake_table.snowflake_ctrl
    assert not ctrl.get_column_types.called
    assert not ctrl.get_dimensions.called
    assert not ctrl.to_preview.called
def test_log_schema(
    self, tracking_config, param_log_schema, config_log_schema, expected_log_schema
):
    # type: (Callable[[], TrackingConfig], bool, bool, bool) -> None
    """log_schema resolution between the parameter-level conf and tracking config."""
    # Arrange
    config = tracking_config()
    if config_log_schema is not None:
        config.log_value_schema = config_log_schema

    # Act
    resolved = config.get_value_meta_conf(
        ValueMetaConf(log_schema=param_log_schema), ObjectValueType()
    )

    # Assert
    assert resolved.log_schema == expected_log_schema
def test_task_run_sync_local_file_target(
    self,
    monkeypatch,
    test_task,
    my_target,
    mock_fs_download,
    mock_file_metadata_registry,
    mock_target_move_from,
):
    """A single file-target input is downloaded to a local cache path on
    sync_pre_execute and restored to the original target on sync_post_execute."""
    test_task = test_task.t(my_target)
    task_run = test_task.dbnd_run().root_task_run
    sync_local = task_run.sync_local

    # exactly one input should be registered for syncing
    assert len(sync_local.inputs_to_sync) == 1
    task_param, old_target = sync_local.inputs_to_sync[0]
    # pin a fully-specified meta conf so the param comparison is deterministic
    task_param.value_meta_conf = ValueMetaConf(
        log_preview=True,
        log_preview_size=10000,
        log_schema=True,
        log_size=True,
        log_stats=LogDataRequest(
            include_all_boolean=True,
            include_all_numeric=True,
            include_all_string=True,
        ),
        log_histograms=LogDataRequest(),
    )
    assert task_param == test_task._params.get_param("input_")
    assert old_target == my_target

    # expected local cache location mirrors the remote path under DBND_LOCAL_ROOT
    local_target = target(
        os.path.join(DBND_LOCAL_ROOT, LOCAL_SYNC_CACHE_NAME),
        my_target.path.lstrip("/"),
    )

    with mock_fs_download as mocked_fs_download, mock_file_metadata_registry, mock_target_move_from as mock_target_move_from:
        monkeypatch.setattr(FileTarget, "tmp", mock_tmp)

        sync_local.sync_pre_execute()
        mocked_fs_download.assert_called_once_with(my_target.path, TMP_FILE_PATH)
        # input_ is swapped to the local cache target while executing
        assert test_task.input_ == local_target

        sync_local.sync_post_execute()
        # input_ is restored to the original remote target afterwards
        assert test_task.input_ == my_target
def test_get_histograms_and_stats(self):
    """PostgresController builds column stats and histograms from pg catalog data."""
    with mock.patch(
        "dbnd_postgres.postgres_values.PostgresController._query"
    ) as query_patch:
        # Arrange: canned rows for the three catalog queries the controller issues
        pg_stats_data = [{
            "attname": "customer",
            "null_frac": 0.5,
            "n_distinct": 8,
            "most_common_vals": "{customerA, customerB}",
            "most_common_freqs": [0.2, 0.2],
        }]
        pg_class_data = [{"reltuples": 10}]
        information_schema_columns_data = [{
            "column_name": "customer",
            "data_type": "varchar"
        }]
        # NOTE: side_effect order must match the controller's query order:
        # pg_stats, then pg_class, then information_schema.columns
        query_patch.side_effect = [
            pg_stats_data,
            pg_class_data,
            information_schema_columns_data,
        ]

        expected_columns_stats = [
            ColumnStatsArgs(
                column_name="customer",
                column_type="varchar",
                records_count=10,
                # derived from canned rows: n_distinct=8, null_frac=0.5 of 10 rows
                distinct_count=8,
                null_count=5,
            )
        ]
        # most-common values become buckets; remainder is folded into "_others"
        expected_histograms = {
            "customer": ([2, 2, 1], ["customerA", "customerB", "_others"])
        }

        # Act
        postgres = PostgresController("user@database", "data_table")
        meta_conf = ValueMetaConf.enabled()
        columns_stats, histograms = postgres.get_histograms_and_stats(meta_conf)

        # Assert
        assert columns_stats == expected_columns_stats
        assert histograms == expected_histograms
def test_get_value_meta(self, snowflake_table):
    """A fully-enabled conf pulls preview, dimensions and schema via the controller."""
    value_meta = SnowflakeTableValueType().get_value_meta(
        snowflake_table, meta_conf=ValueMetaConf.enabled()
    )

    # Assert - values come from the fixture's canned controller returns
    assert value_meta.value_preview == "test preview"
    assert value_meta.data_dimensions == [42, 12]
    assert value_meta.data_schema == {
        "type": "SnowflakeTable",
        "column_types": {"name": "varchar"},
        "size.bytes": 500,
    }
    assert value_meta.data_hash == EXPECTED_SNOWFLAKE_TABLE_SIGNATURE

    ctrl = snowflake_table.snowflake_ctrl
    assert ctrl.get_column_types.called
    assert ctrl.get_dimensions.called
    assert ctrl.to_preview.called
def test_log_preview_size(
    self,
    tracking_config,
    param_log_preview_size,
    config_log_preview_size,
    expected_log_preview_size,
):
    # type: (Callable[[], TrackingConfig], int, int, int) -> None
    """log_preview_size resolution between parameter conf and tracking config."""
    # Arrange
    config = tracking_config()
    if config_log_preview_size is not None:
        config.log_value_preview_max_len = config_log_preview_size

    # Act
    resolved = config.get_value_meta_conf(
        ValueMetaConf(log_preview_size=param_log_preview_size)
    )

    # Assert
    assert resolved.log_preview_size == expected_log_preview_size