Example #1
0
    def test_df_value_meta(self, pandas_data_frame):
        """get_value_meta on a DataFrame should match a hand-built ValueMeta."""
        df = pandas_data_frame
        # Schema the value type is expected to report for this frame.
        schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(df.columns),
            "size": int(df.size),
            "shape": df.shape,
            "dtypes": {column: str(dtype) for column, dtype in df.dtypes.items()},
        }

        conf = ValueMetaConf.enabled()
        expected = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                df, preview_size=conf.get_preview_size()
            ),
            data_dimensions=df.shape,
            data_schema=schema,
            data_hash=fast_hasher.hash(hash_pandas_object(df, index=True).values),
        )

        actual = DataFrameValueType().get_value_meta(df, meta_conf=conf)

        assert actual.value_preview == expected.value_preview
        assert actual.data_hash == expected.data_hash
        assert json_utils.dumps(actual.data_schema) == json_utils.dumps(
            expected.data_schema
        )
        assert actual.data_dimensions == expected.data_dimensions
        assert actual == expected
    def test_df_value_meta(
        self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
    ):
        """get_value_meta should also carry stats and histograms when enabled."""
        df = pandas_data_frame
        schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(df.columns),
            "size": int(df.size),
            "shape": df.shape,
            "dtypes": {column: str(dtype) for column, dtype in df.dtypes.items()},
        }

        conf = ValueMetaConf.enabled()
        expected = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                df, preview_size=conf.get_preview_size()
            ),
            data_dimensions=df.shape,
            data_schema=schema,
            data_hash=fast_hasher.hash(hash_pandas_object(df, index=True).values),
            descriptive_stats=pandas_data_frame_stats,
            histograms=pandas_data_frame_histograms,
        )

        actual = DataFrameValueType().get_value_meta(df, meta_conf=conf)

        assert actual.value_preview == expected.value_preview
        assert actual.data_hash == expected.data_hash
        assert json_utils.dumps(actual.data_schema) == json_utils.dumps(
            expected.data_schema
        )
        assert actual.data_dimensions == expected.data_dimensions

        # std carries float noise, so compare it rounded and drop it from both
        # dicts before the full comparison.
        actual_std = actual.descriptive_stats["Births"].pop("std")
        expected_std = expected.descriptive_stats["Births"].pop("std")
        assert round(actual_std, 2) == expected_std
        actual.descriptive_stats["Names"].pop("top")
        assert actual.descriptive_stats == expected.descriptive_stats

        counts, values = actual.histograms.pop("Names")
        expected_counts, expected_values = expected.histograms.pop("Names")
        assert counts == expected_counts
        assert set(values) == set(expected_values)  # order changes in each run
        # Remaining histograms are covered by the dedicated histogram tests and
        # vary between runs, so drop them from both sides.
        actual.histograms = expected.histograms = None

        expected.histogram_system_metrics = actual.histogram_system_metrics
        assert actual.data_schema == expected.data_schema
        assert attr.asdict(actual) == attr.asdict(expected)
Example #3
0
 def log_target(
     self,
     task_run,
     target,
     target_meta,  # type: ValueMeta
     operation_type,  # type: DbndTargetOperationType
     operation_status,  # type: DbndTargetOperationStatus
     param_name,
     task_def_uid,
 ):
     """Report one target operation to the tracker, then any histograms."""
     if target_meta.data_schema is None:
         schema_json = None
     else:
         schema_json = json_utils.dumps(target_meta.data_schema)
     target_info = LogTargetArgs(
         run_uid=task_run.run.run_uid,
         task_run_uid=task_run.task_run_uid,
         task_run_name=task_run.job_name,
         task_run_attempt_uid=task_run.task_run_attempt_uid,
         task_def_uid=task_def_uid,
         param_name=param_name,
         target_path=str(target),
         operation_type=operation_type,
         operation_status=operation_status,
         value_preview=target_meta.value_preview,
         data_dimensions=target_meta.data_dimensions,
         data_schema=schema_json,
         data_hash=target_meta.data_hash,
     )
     result = self.log_targets(targets_info=[target_info])
     # getattr with a default tolerates ValueMeta objects without these
     # attributes; histograms are sent only when both pieces were collected.
     has_stats = getattr(target_meta, "descriptive_stats", None)
     has_histograms = getattr(target_meta, "histograms", None)
     if has_stats and has_histograms:
         self.log_dataframe_histograms(target, target_meta, target_info)
     return result
Example #4
0
 def f_struct(self, structure):
     """Serialize *structure* to an indented JSON string for logging.

     Values are first coerced to strings via ``traverse_to_str``; when the
     result is a dict, the compiled ``_hjson_optimizer`` pattern rewrites the
     dump (presumably to compact it — pattern is defined elsewhere).
     """
     structure_str = traverse_to_str(structure)
     dumped = json_utils.dumps(structure_str, indent=2)
     if isinstance(structure_str, dict):
         # Raw string: "\g<1>" is an invalid escape sequence and raises a
         # SyntaxWarning on modern Python; r"\g<1>" denotes the same
         # backreference replacement text without the warning.
         dumped = self._hjson_optimizer.sub(r"\g<1>", dumped)
     return dumped
Example #5
0
 def log_target(
     self,
     task_run,
     target,
     target_meta,  # type: ValueMeta
     operation_type,  # type: DbndTargetOperationType
     operation_status,  # type: DbndTargetOperationStatus
     param_name=None,  # type: Optional[str]
     task_def_uid=None,  # type: Optional[UUID]
 ):
     """Report one target operation, then log histograms when present."""
     schema_json = None
     if target_meta.data_schema is not None:
         schema_json = json_utils.dumps(target_meta.data_schema)
     target_info = LogTargetArgs(
         run_uid=task_run.run.run_uid,
         task_run_uid=task_run.task_run_uid,
         task_run_name=task_run.job_name,
         task_run_attempt_uid=task_run.task_run_attempt_uid,
         task_def_uid=task_def_uid,
         param_name=param_name,
         target_path=str(target),
         operation_type=operation_type,
         operation_status=operation_status,
         value_preview=target_meta.value_preview,
         data_dimensions=target_meta.data_dimensions,
         data_schema=schema_json,
         data_hash=target_meta.data_hash,
     )
     result = self.log_targets(targets_info=[target_info])
     if target_meta.histograms:
         self.log_histograms(task_run, param_name, target_meta, utcnow())
     return result
Example #6
0
 def log_target(
     self,
     task_run,
     target,
     target_meta,
     operation_type,
     operation_status,
     param_name,
     task_def_uid,
 ):
     """Report a single target operation to the tracking backend."""
     if target_meta.data_schema is None:
         schema_json = None
     else:
         schema_json = json_utils.dumps(target_meta.data_schema)
     target_info = LogTargetArgs(
         run_uid=task_run.run.run_uid,
         task_run_uid=task_run.task_run_uid,
         task_run_name=task_run.job_name,
         task_run_attempt_uid=task_run.task_run_attempt_uid,
         task_def_uid=task_def_uid,
         param_name=param_name,
         target_path=str(target),
         operation_type=operation_type,
         operation_status=operation_status,
         value_preview=target_meta.value_preview,
         data_dimensions=target_meta.data_dimensions,
         data_schema=schema_json,
         data_hash=target_meta.data_hash,
     )
     return self.log_targets(targets_info=[target_info])
Example #7
0
 def get_data_schema(self, df):  # type: (pd.DataFrame) -> str
     """Return a JSON string describing *df*: type, columns, size, shape, dtypes.

     Key insertion order matches the historical output so serialized schemas
     stay byte-identical.
     """
     dtype_names = {}
     for column, dtype in df.dtypes.items():
         dtype_names[column] = str(dtype)
     schema = {
         "type": self.type_str,
         "columns": list(df.columns),
         "size": int(df.size),
         "shape": df.shape,
         "dtypes": dtype_names,
     }
     return json_utils.dumps(schema)
Example #8
0
    def f_io(self, structure):
        """Render an I/O structure as indented JSON, clipping very long strings."""
        as_str = traverse_to_str(structure)

        def _clip(value):
            # Empty/short values pass through; long comma-joined file lists are
            # truncated to 400 chars plus a file count.
            if not value or len(value) <= 600:
                return value
            return "%s... (%s files)" % (value[:400], len(value.split(",")))

        as_str = traverse(as_str, _clip)
        return json_utils.dumps(as_str, indent=2)
Example #9
0
    def f_io(self, structure):
        """Render an I/O structure as indented JSON, clipping very long strings.

        Long comma-joined file lists are truncated to 400 chars plus a file
        count; when the result is a dict, the compiled ``_hjson_optimizer``
        pattern rewrites the dump (pattern is defined elsewhere).
        """
        structure_str = traverse_to_str(structure)
        structure_str = traverse(
            structure_str,
            lambda x: x if not x or len(x) <= 600 else
            ("%s... (%s files)" % (x[:400], len(x.split(",")))),
        )
        dumped = json_utils.dumps(structure_str, indent=2)
        if isinstance(structure_str, dict):
            # Raw string: "\g<1>" is an invalid escape sequence and raises a
            # SyntaxWarning on modern Python; r"\g<1>" denotes the same
            # backreference replacement text without the warning.
            dumped = self._hjson_optimizer.sub(r"\g<1>", dumped)
        return dumped
Example #10
0
    def test_df_value_meta(self, pandas_data_frame):
        """get_value_meta should match a hand-built ValueMeta and expose stats keys."""
        df = pandas_data_frame
        schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(df.columns),
            "size": int(df.size),
            "shape": df.shape,
            "dtypes": {column: str(dtype) for column, dtype in df.dtypes.items()},
        }

        conf = ValueMetaConf.enabled()
        expected = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                df, preview_size=conf.get_preview_size()
            ),
            data_dimensions=df.shape,
            data_schema=schema,
            data_hash=fast_hasher.hash(hash_pandas_object(df, index=True).values),
        )

        actual = DataFrameValueType().get_value_meta(df, meta_conf=conf)

        assert actual.value_preview == expected.value_preview
        assert actual.data_hash == expected.data_hash
        assert json_utils.dumps(actual.data_schema) == json_utils.dumps(
            expected.data_schema
        )
        assert actual.data_dimensions == expected.data_dimensions
        assert actual.data_schema == expected.data_schema

        # histograms and stats are tested in histogram tests and they change a
        # lot, so only their key sets are checked here.
        assert set(actual.descriptive_stats.keys()) == {"Names", "Births"}
        assert set(actual.histograms.keys()) == {"Names", "Births"}
Example #11
0
 def test_data_shape(self, pandas_data_frame):
     """get_data_schema should return the expected JSON string for the frame."""
     # Key order matches the serializer's historical output.
     expected_schema = json_utils.dumps(
         {
             "type": "DataFrame",
             "columns": list(pandas_data_frame.columns),
             "size": int(pandas_data_frame.size),
             "dtypes": {"Births": "int64", "Names": "object"},
             "shape": pandas_data_frame.shape,
         }
     )
     schema = DataFrameValueType().get_data_schema(pandas_data_frame)
     assert isinstance(schema, six.string_types)
     assert schema == expected_schema
Example #12
0
 def f_struct(self, structure):
     """Serialize *structure* to indented JSON after coercing values to str."""
     as_str = traverse_to_str(structure)
     return json_utils.dumps(as_str, indent=2)
Example #13
0
def non_completed_outputs_to_str(non_completed_outputs):
    """Render the collection of non-completed outputs as a JSON string."""
    rendered = json_utils.dumps(non_completed_outputs)
    return rendered
Example #14
0
 def get_data_schema(self, value):  # type: (Any) -> str
     """Return a minimal JSON schema holding only this value type's name."""
     schema = {"type": self.type_str}
     return json_utils.dumps(schema)