def log_parameter_data(self, parameter, target, value, operation_type,
                       operation_status):
        # type: (TaskRunTracker, ParameterDefinition, Target, Any, DbndTargetOperationType, DbndTargetOperationStatus) -> None
        """Build value meta for a parameter value and report it as a target operation.

        Nothing is done when ``settings.features.log_value_meta`` is falsy or
        when ``value`` is None.

        :param parameter: parameter definition; its ``value_meta_conf``,
            ``value_type``, ``name`` and ``task_definition_uid`` are read.
        :param target: target the value belongs to; its ``target_meta``
            attribute is overwritten with the computed value meta.
        :param value: the parameter value to describe.
        :param operation_type: forwarded to ``tracking_store.log_target``.
        :param operation_status: forwarded to ``tracking_store.log_target``.

        Any exception raised while computing or reporting the meta is handed
        to ``log_exception`` with ``non_critical=True`` and is not re-raised
        by this method.
        """
        features = self.settings.features
        if not features.log_value_meta or value is None:
            return

        try:
            value_meta_conf = features.get_value_meta_conf(
                parameter.value_meta_conf,
                value_type=parameter.value_type,
                target=target,
            )
            # The meta key is "<task name>.<parameter name>".
            meta_key = "{}.{}".format(self.task_run.task.task_name,
                                      parameter.name)
            target.target_meta = get_value_meta_from_value(
                meta_key, value, value_meta_conf
            )

            self.tracking_store.log_target(
                task_run=self.task_run,
                target=target,
                target_meta=target.target_meta,
                operation_type=operation_type,
                operation_status=operation_status,
                param_name=parameter.name,
                task_def_uid=parameter.task_definition_uid,
            )
        except Exception as ex:
            error_message = "Error occurred during target logging for %s" % (target, )
            log_exception(error_message, ex, non_critical=True)
Exemple #2
0
    def test_histogram_others(self, meta_conf):
        """Check the "_others" bucket of a string histogram with many distinct values.

        The column holds "str-i" repeated i times for i in 1..100, i.e. 5050
        values with 100 distinct strings. The assertions expect a 50-entry
        histogram whose last entry is "_others" carrying the combined count
        of "str-1" through "str-51".
        """
        strings = [
            "str-{}".format(i) for i in range(1, 101) for _ in range(i)
        ]
        column_value = self.data_to_value([strings])

        value_meta = get_value_meta_from_value(
            "string_with_others", column_value, meta_conf
        )

        histogram = value_meta.histograms["test_column_0"]
        counts = histogram[0]
        labels = histogram[1]
        assert len(counts) == 50
        assert len(labels) == 50
        assert counts[0] == 100
        assert labels[0] == "str-100"
        assert counts[10] == 90
        assert labels[10] == "str-90"
        assert counts[-2] == 52
        assert labels[-2] == "str-52"
        # Everything below the top 49 values is folded into one bucket.
        assert counts[-1] == sum(range(1, 52))
        assert labels[-1] == "_others"

        column_stats = value_meta.descriptive_stats["test_column_0"]
        assert column_stats["count"] == 5050 == sum(counts)
        assert column_stats["non-null"] == 5050
        assert column_stats["null-count"] == 0
        assert column_stats["distinct"] == 100
        assert column_stats["type"] in ("str", "string")
Exemple #3
0
    def log_data(
        self,
        key,  # type: str
        data,  # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable]
        meta_conf,  # type: ValueMetaConf
        path=None,  # type: Optional[Union[Target,str]]
        operation_type=DbndTargetOperationType.read,  # type: DbndTargetOperationType
        operation_status=DbndTargetOperationStatus.OK,  # type: DbndTargetOperationStatus
        raise_on_error=False,  # type: bool
    ):  # type: (...) -> None
        """Compute value meta for ``data`` and report it to the tracking store.

        Steps, in order:
          1. merge ``meta_conf`` with ``settings.tracking``;
          2. build the value meta; if none is produced, warn and return;
          3. when ``path`` is truthy, log a target operation for it;
          4. log the "user" metrics and the histograms, when present;
          5. emit an info message when nothing at all was logged.

        Any exception is handed to ``log_exception`` and is re-raised only
        when ``raise_on_error`` is true.
        """
        try:
            # Merge the caller-supplied meta_conf with the config settings.
            meta_conf = self.settings.tracking.get_value_meta_conf(meta_conf)
            value_meta = get_value_meta_from_value(
                key, data, meta_conf, raise_on_error
            )
            if not value_meta:
                warning_text = (
                    "Couldn't log the wanted data {name}, reason - can't log objects of type {value_type} "
                ).format(name=key, value_type=type(data))
                logger.warning(warning_text)
                return

            # Taken before any store call; used for the histograms below.
            timestamp = utcnow()

            if path:
                self.tracking_store.log_target(
                    task_run=self.task_run,
                    target=path,
                    target_meta=value_meta,
                    operation_type=operation_type,
                    operation_status=operation_status,
                    param_name=key,
                )

            metrics = value_meta.build_metrics_for_key(key, meta_conf)

            if metrics["user"]:
                self._log_metrics(metrics["user"])

            if metrics["histograms"]:
                self.tracking_store.log_histograms(
                    task_run=self.task_run,
                    key=key,
                    value_meta=value_meta,
                    timestamp=timestamp,
                )

            if not metrics["user"] and not metrics["histograms"] and not path:
                logger.info(
                    "No metrics to log_data(key={}, data={})".format(key, data)
                )

        except Exception as ex:
            log_exception(
                "Error occurred during log_dataframe for %s" % (key, ),
                ex,
                non_critical=not raise_on_error,
            )
            if raise_on_error:
                raise
Exemple #4
0
    def test_boolean_histogram(self, meta_conf, booleans_value):
        """Check histogram counts/values and basic stats for a boolean column."""
        value_meta = get_value_meta_from_value(
            "booleans", booleans_value, meta_conf
        )

        histogram = value_meta.histograms["test_column_0"]
        counts = histogram[0]
        values = histogram[1]
        assert counts == [30, 20, 10]
        assert values == [True, False, None]

        column_stats = value_meta.descriptive_stats["test_column_0"]
        assert column_stats["count"] == 60
        assert column_stats["type"] in ("bool", "boolean")
Exemple #5
0
    def test_complex_column(self, spark_session, meta_conf, numbers):
        """Check that a list-valued column is skipped without breaking anything.

        A list is a complex value and cannot have a histogram, so only the
        numeric column is expected in the histograms and descriptive stats.
        """
        rows = []
        for number in numbers:
            if number:
                rows.append((number, [str(number), str(number + 1)]))
            else:
                # Falsy entries become a row of nulls.
                rows.append([None] * 2)
        # Transpose the rows into per-column sequences.
        columns = list(zip(*rows))
        df = self.data_to_value(columns)
        value_meta = get_value_meta_from_value("complex", df, meta_conf)

        expected_columns = ["test_column_0"]
        assert list(value_meta.histograms.keys()) == expected_columns
        assert list(value_meta.descriptive_stats.keys()) == expected_columns
        self.validate_numeric_histogram_and_stats(value_meta, "test_column_0")
Exemple #6
0
    def test_multiple_columns(self, meta_conf, numbers):
        """Check histograms for a frame with int, float and two string columns."""
        rows = []
        for number in numbers:
            if number:
                rows.append((number, float(number), str(number), str(number)))
            else:
                # Falsy entries become a row of nulls.
                rows.append([None] * 4)
        # Transpose the rows into per-column sequences.
        columns = list(zip(*rows))
        df = self.data_to_value(columns)
        value_meta = get_value_meta_from_value("multi_column", df, meta_conf)

        for numeric_column in ("test_column_0", "test_column_1"):
            self.validate_numeric_histogram_and_stats(value_meta,
                                                      numeric_column)

        first_str_histogram = value_meta.histograms["test_column_2"]
        second_str_histogram = value_meta.histograms["test_column_3"]
        assert first_str_histogram[0] == [4, 3, 2, 1]
        assert first_str_histogram[1] == ["1", "5", None, "3"]
        # Both string columns hold the same data, so the histograms must match.
        assert first_str_histogram == second_str_histogram
Exemple #7
0
    def test_null_column(self, meta_conf, numbers_value):
        """Check histogram and stats for a column made of 20 nulls only."""
        null_column = [None] * 20
        df = self.data_to_value([null_column])
        value_meta = get_value_meta_from_value("nulls", df, meta_conf)

        histogram = value_meta.histograms["test_column_0"]
        counts = histogram[0]
        values = histogram[1]
        assert counts == [20]
        assert values == [None]

        column_stats = value_meta.descriptive_stats["test_column_0"]
        expected_stats = [
            ("count", 20),
            ("non-null", 0),
            ("null-count", 20),
            ("distinct", 1),
            ("type", "object"),
        ]
        for stat_name, expected in expected_stats:
            assert column_stats[stat_name] == expected
Exemple #8
0
    def test_strings_histogram(self, meta_conf, strings_value):
        """Check histogram counts/values and stats for a string column with nulls."""
        value_meta = get_value_meta_from_value(
            "strings", strings_value, meta_conf
        )

        histogram = value_meta.histograms["test_column_0"]
        counts = histogram[0]
        values = histogram[1]
        assert counts == [30, 20, 15, 5]
        assert values == ["Ola Mundo!", "Shalom Olam!", "Hello World!", None]

        column_stats = value_meta.descriptive_stats["test_column_0"]
        assert column_stats["count"] == 70
        assert column_stats["non-null"] == 65
        assert column_stats["null-count"] == 5
        assert column_stats["distinct"] == 4
        assert column_stats["type"] in ("str", "string")
    def log_data(
        self,
        key,  # type: str
        data,  # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable]
        meta_conf,  # type: ValueMetaConf
        path=None,  # type: Optional[Union[Target,str]]
        operation_type=DbndTargetOperationType.read,  # type: DbndTargetOperationType
        operation_status=DbndTargetOperationStatus.OK,  # type: DbndTargetOperationStatus
    ):  # type: (...) -> None
        """Compute value meta for ``data`` and report it to the tracking store.

        ``meta_conf`` is first merged with ``settings.features``. If no value
        meta is produced the method returns silently. Otherwise a target
        operation is logged when ``path`` is truthy, followed by the "user"
        metrics and the histograms when present.

        Any exception is handed to ``log_exception`` with
        ``non_critical=True`` and is not re-raised by this method.
        """
        try:
            # Merge the caller-supplied meta_conf with the config settings.
            meta_conf = self.settings.features.get_value_meta_conf(meta_conf)
            value_meta = get_value_meta_from_value(
                key, data, meta_conf=meta_conf
            )
            if not value_meta:
                return

            # Taken before any store call; used for the histograms below.
            timestamp = utcnow()

            if path:
                self.tracking_store.log_target(
                    task_run=self.task_run,
                    target=path,
                    target_meta=value_meta,
                    operation_type=operation_type,
                    operation_status=operation_status,
                )

            metrics = value_meta.build_metrics_for_key(key, meta_conf)

            if metrics["user"]:
                self._log_metrics(metrics["user"])

            if metrics["histograms"]:
                self.tracking_store.log_histograms(
                    task_run=self.task_run,
                    key=key,
                    value_meta=value_meta,
                    timestamp=timestamp,
                )

        except Exception as ex:
            error_message = "Error occurred during log_dataframe for %s" % (key, )
            log_exception(error_message, ex, non_critical=True)
Exemple #10
0
 def test_float_column(self, meta_conf, floats_value):
     """Validate the numeric histogram and stats of a float column.

     Returns the computed value meta so callers can make further checks.
     """
     floats_meta = get_value_meta_from_value(
         "floats", floats_value, meta_conf
     )
     self.validate_numeric_histogram_and_stats(floats_meta, "test_column_0")
     return floats_meta