def test_df_value_meta(self, pandas_data_frame):
    """get_value_meta on a DataFrame should match a manually built ValueMeta."""
    df = pandas_data_frame
    schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(df.columns),
        "size": int(df.size),
        "shape": df.shape,
        "dtypes": {name: str(dtype) for name, dtype in df.dtypes.items()},
    }
    conf = ValueMetaConf.enabled()
    expected = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            df, preview_size=conf.get_preview_size()
        ),
        data_dimensions=df.shape,
        data_schema=schema,
        data_hash=fast_hasher.hash(hash_pandas_object(df, index=True).values),
    )

    actual = DataFrameValueType().get_value_meta(df, meta_conf=conf)

    assert actual.value_preview == expected.value_preview
    assert actual.data_hash == expected.data_hash
    # schemas are dicts; compare their canonical JSON renderings
    assert json_utils.dumps(actual.data_schema) == json_utils.dumps(
        expected.data_schema
    )
    assert actual.data_dimensions == expected.data_dimensions
    assert actual == expected
def test_df_value_meta(
    self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
):
    """get_value_meta should produce the expected preview, schema, hash,
    descriptive stats and histograms for a DataFrame."""
    df = pandas_data_frame
    conf = ValueMetaConf.enabled()
    schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(df.columns),
        "size": int(df.size),
        "shape": df.shape,
        "dtypes": {name: str(dtype) for name, dtype in df.dtypes.items()},
    }
    expected = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            df, preview_size=conf.get_preview_size()
        ),
        data_dimensions=df.shape,
        data_schema=schema,
        data_hash=fast_hasher.hash(hash_pandas_object(df, index=True).values),
        descriptive_stats=pandas_data_frame_stats,
        histograms=pandas_data_frame_histograms,
    )

    actual = DataFrameValueType().get_value_meta(df, meta_conf=conf)

    assert actual.value_preview == expected.value_preview
    assert actual.data_hash == expected.data_hash
    assert json_utils.dumps(actual.data_schema) == json_utils.dumps(
        expected.data_schema
    )
    assert actual.data_dimensions == expected.data_dimensions

    # std is a float: compare rounded value rather than exact equality
    actual_std = actual.descriptive_stats["Births"].pop("std")
    expected_std = expected.descriptive_stats["Births"].pop("std")
    assert round(actual_std, 2) == expected_std
    actual.descriptive_stats["Names"].pop("top")
    assert actual.descriptive_stats == expected.descriptive_stats

    actual_counts, actual_values = actual.histograms.pop("Names")
    expected_counts, expected_values = expected.histograms.pop("Names")
    assert actual_counts == expected_counts
    assert set(actual_values) == set(expected_values)  # order changes in each run

    # histograms are tested in histogram tests and they change a lot, no need to test also here
    actual.histograms = expected.histograms = None
    expected.histogram_system_metrics = actual.histogram_system_metrics

    assert actual.data_schema == expected.data_schema
    assert attr.asdict(actual) == attr.asdict(expected)
def log_target(
    self,
    task_run,
    target,
    target_meta,  # type: ValueMeta
    operation_type,  # type: DbndTargetOperationType
    operation_status,  # type: DbndTargetOperationStatus
    param_name,
    task_def_uid,
):
    """Report one target operation to the tracker.

    Serializes the target metadata into a LogTargetArgs record, submits it via
    log_targets, and additionally logs dataframe histograms when the meta
    carries both descriptive stats and histograms.
    """
    if target_meta.data_schema is None:
        schema_json = None
    else:
        schema_json = json_utils.dumps(target_meta.data_schema)

    target_info = LogTargetArgs(
        run_uid=task_run.run.run_uid,
        task_run_uid=task_run.task_run_uid,
        task_run_name=task_run.job_name,
        task_run_attempt_uid=task_run.task_run_attempt_uid,
        task_def_uid=task_def_uid,
        param_name=param_name,
        target_path=str(target),
        operation_type=operation_type,
        operation_status=operation_status,
        value_preview=target_meta.value_preview,
        data_dimensions=target_meta.data_dimensions,
        data_schema=schema_json,
        data_hash=target_meta.data_hash,
    )
    res = self.log_targets(targets_info=[target_info])

    # getattr: older ValueMeta objects may not have these attributes at all
    has_stats = getattr(target_meta, "descriptive_stats", None)
    has_histograms = getattr(target_meta, "histograms", None)
    if has_stats and has_histograms:
        self.log_dataframe_histograms(target, target_meta, target_info)
    return res
def f_struct(self, structure):
    """Format *structure* as an indented JSON string.

    Converts the structure's leaves to strings first; when the result is a
    dict, the ``_hjson_optimizer`` regex is applied to compact the rendered
    output.
    """
    structure_str = traverse_to_str(structure)
    dumped = json_utils.dumps(structure_str, indent=2)
    if isinstance(structure_str, dict):
        # raw string: "\g<1>" in a plain literal is an invalid escape sequence
        # (DeprecationWarning today, a SyntaxError in future Python versions)
        dumped = self._hjson_optimizer.sub(r"\g<1>", dumped)
    return dumped
def log_target(
    self,
    task_run,
    target,
    target_meta,  # type: ValueMeta
    operation_type,  # type: DbndTargetOperationType
    operation_status,  # type: DbndTargetOperationStatus
    param_name=None,  # type: Optional[str]
    task_def_uid=None,  # type: Optional[UUID]
):
    """Report one target operation to the tracker.

    Builds a LogTargetArgs record from the target metadata, submits it via
    log_targets, and logs histograms separately when the meta has any.
    """
    if target_meta.data_schema is None:
        schema_json = None
    else:
        schema_json = json_utils.dumps(target_meta.data_schema)

    target_info = LogTargetArgs(
        run_uid=task_run.run.run_uid,
        task_run_uid=task_run.task_run_uid,
        task_run_name=task_run.job_name,
        task_run_attempt_uid=task_run.task_run_attempt_uid,
        task_def_uid=task_def_uid,
        param_name=param_name,
        target_path=str(target),
        operation_type=operation_type,
        operation_status=operation_status,
        value_preview=target_meta.value_preview,
        data_dimensions=target_meta.data_dimensions,
        data_schema=schema_json,
        data_hash=target_meta.data_hash,
    )
    res = self.log_targets(targets_info=[target_info])

    if target_meta.histograms:
        self.log_histograms(task_run, param_name, target_meta, utcnow())
    return res
def log_target(
    self,
    task_run,
    target,
    target_meta,
    operation_type,
    operation_status,
    param_name,
    task_def_uid,
):
    """Report a single target operation by building a LogTargetArgs record
    from the metadata and submitting it through log_targets."""
    if target_meta.data_schema is None:
        schema_json = None
    else:
        schema_json = json_utils.dumps(target_meta.data_schema)

    target_info = LogTargetArgs(
        run_uid=task_run.run.run_uid,
        task_run_uid=task_run.task_run_uid,
        task_run_name=task_run.job_name,
        task_run_attempt_uid=task_run.task_run_attempt_uid,
        task_def_uid=task_def_uid,
        param_name=param_name,
        target_path=str(target),
        operation_type=operation_type,
        operation_status=operation_status,
        value_preview=target_meta.value_preview,
        data_dimensions=target_meta.data_dimensions,
        data_schema=schema_json,
        data_hash=target_meta.data_hash,
    )
    return self.log_targets(targets_info=[target_info])
def get_data_schema(self, df):
    # type: (pd.DataFrame) -> str
    """Serialize the DataFrame's basic schema (columns, size, shape, dtypes)
    into a JSON string."""
    dtypes = {}
    for column, dtype in df.dtypes.items():
        dtypes[column] = str(dtype)
    return json_utils.dumps(
        {
            "type": self.type_str,
            "columns": list(df.columns),
            "size": int(df.size),
            "shape": df.shape,
            "dtypes": dtypes,
        }
    )
def f_io(self, structure):
    """Render *structure* as indented JSON, truncating very long string
    values to keep the output readable."""

    def _shorten(value):
        # keep short/empty values intact; otherwise show a prefix plus a
        # count of comma-separated entries (presumably file paths)
        if not value or len(value) <= 600:
            return value
        return "%s... (%s files)" % (value[:400], len(value.split(",")))

    as_str = traverse_to_str(structure)
    as_str = traverse(as_str, _shorten)
    return json_utils.dumps(as_str, indent=2)
def f_io(self, structure):
    """Render *structure* as indented JSON, truncating very long string
    values; dict output is compacted with the ``_hjson_optimizer`` regex."""
    structure_str = traverse_to_str(structure)
    structure_str = traverse(
        structure_str,
        # keep short/empty values intact; otherwise show a prefix plus a
        # count of comma-separated entries
        lambda x: x
        if not x or len(x) <= 600
        else ("%s... (%s files)" % (x[:400], len(x.split(",")))),
    )
    dumped = json_utils.dumps(structure_str, indent=2)
    if isinstance(structure_str, dict):
        # raw string: "\g<1>" in a plain literal is an invalid escape sequence
        # (DeprecationWarning today, a SyntaxError in future Python versions)
        dumped = self._hjson_optimizer.sub(r"\g<1>", dumped)
    return dumped
def test_df_value_meta(self, pandas_data_frame):
    """get_value_meta should match manual expectations; stats/histograms are
    only checked for the expected column keys."""
    df = pandas_data_frame
    schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(df.columns),
        "size": int(df.size),
        "shape": df.shape,
        "dtypes": {name: str(dtype) for name, dtype in df.dtypes.items()},
    }
    conf = ValueMetaConf.enabled()
    expected = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            df, preview_size=conf.get_preview_size()
        ),
        data_dimensions=df.shape,
        data_schema=schema,
        data_hash=fast_hasher.hash(hash_pandas_object(df, index=True).values),
    )

    actual = DataFrameValueType().get_value_meta(df, meta_conf=conf)

    assert actual.value_preview == expected.value_preview
    assert actual.data_hash == expected.data_hash
    assert json_utils.dumps(actual.data_schema) == json_utils.dumps(
        expected.data_schema
    )
    assert actual.data_dimensions == expected.data_dimensions
    assert actual.data_schema == expected.data_schema

    # histograms and stats are tested in histogram tests and they change a lot,
    # no need to test also here
    assert set(actual.descriptive_stats.keys()) == {"Names", "Births"}
    assert set(actual.histograms.keys()) == {"Names", "Births"}
def test_data_shape(self, pandas_data_frame):
    """get_data_schema should return a JSON string describing the frame."""
    df = pandas_data_frame
    expected = json_utils.dumps(
        {
            "type": "DataFrame",
            "columns": list(df.columns),
            "size": int(df.size),
            "dtypes": {"Births": "int64", "Names": "object"},
            "shape": df.shape,
        }
    )

    actual = DataFrameValueType().get_data_schema(df)

    assert isinstance(actual, six.string_types)
    assert actual == expected
def f_struct(self, structure):
    """Render *structure* as an indented JSON string after converting its
    leaves to strings."""
    as_str = traverse_to_str(structure)
    return json_utils.dumps(as_str, indent=2)
def non_completed_outputs_to_str(non_completed_outputs):
    """JSON-serialize the collection of outputs that are not yet completed."""
    dumped = json_utils.dumps(non_completed_outputs)
    return dumped
def get_data_schema(self, value):
    # type: (Any) -> str
    """Return a minimal JSON schema containing only this value type's name."""
    schema = {"type": self.type_str}
    return json_utils.dumps(schema)