Example 1
    def test_log_metric(self, mock_channel_tracker, value, attribute):
        @task()
        def task_with_log_metric():
            log_metric(key="test", value=value)

        task_with_log_metric()

        # one() will raise if there is not exactly one metric
        metric_info = one(get_log_metrics(mock_channel_tracker))
        metric = metric_info["metric"]

        assert metric.value == getattr(metric, attribute)
        assert metric.value == value
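
Note: value and attribute are supplied by a parametrize decorator that the
snippet does not include, and one() behaves like more_itertools.one, returning
the single element and raising otherwise. A hypothetical decorator along these
lines would drive the test; value_int is confirmed by Example 2, while the
other typed accessors are assumptions:

    import pytest

    # Hypothetical parametrization -- the original decorator is not part of
    # the snippet. Only value_int is confirmed by Example 2; value_float and
    # value_str are assumed to follow the same naming scheme.
    @pytest.mark.parametrize(
        "value, attribute",
        [(1, "value_int"), (0.5, "value_float"), ("foo", "value_str")],
    )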
Example 2
    def test_log_metrics(self, mock_channel_tracker):
        @task()
        def task_with_log_metrics():
            # all lowercase ASCII letters -> {"a": 97, ..., "z": 122}
            log_metrics({chr(i): i for i in range(97, 123)})

        task_with_log_metrics()
        metrics_info = list(get_log_metrics(mock_channel_tracker))
        assert len(metrics_info) == 26

        for metric_info in metrics_info:
            metric = metric_info["metric"]
            assert metric.value == metric.value_int
            assert chr(metric.value) == metric.key
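
Outside a test, the same call works inside any tracked function. A minimal
sketch, assuming log_metrics and task are the top-level dbnd exports used
above:

    from dbnd import log_metrics, task

    @task()
    def compute():
        # Report several metrics in one call; dict keys become metric names.
        log_metrics({"rows_read": 1024, "rows_written": 1000})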
Example 3
    def test_path_with_data_meta(self, mock_channel_tracker,
                                 pandas_data_frame):
        @task()
        def task_with_log_datasets():
            log_dataset_op(
                "/path/to/value.csv",
                DbndDatasetOperationType.read,
                data=pandas_data_frame,
                with_preview=True,
                with_schema=True,
            )

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))

        assert log_dataset_arg.operation_path == "/path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
        assert log_dataset_arg.value_preview is not None
        assert log_dataset_arg.data_dimensions == (5, 3)
        assert set(log_dataset_arg.data_schema.as_dict().keys()) == {
            "columns",
            "dtypes",
            "shape",
            "size.bytes",
            "type",
        }

        log_metrics_args = get_log_metrics(mock_channel_tracker)
        metrics_names = {
            metric_row["metric"].key
            for metric_row in log_metrics_args
        }
        assert metrics_names.issuperset({
            "path.to.value.csv.schema",
            "path.to.value.csv.shape0",
            "path.to.value.csv.shape1",
            "path.to.value.csv.rows",
            "path.to.value.csv.columns",
            "path.to.value.csv",
        })
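
The asserted keys suggest that the operation path is normalized into a dotted
metric prefix. A hypothetical reconstruction of that convention, not the
library's actual code:

    def metric_prefix(op_path):
        # "/path/to/value.csv" -> "path.to.value.csv", matching the keys
        # asserted above; the real normalization may differ (e.g. for
        # URI schemes such as "location://").
        return op_path.strip("/").replace("/", ".")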
Example 4
    def test_with_actual_op_path(self, mock_channel_tracker):
        @task()
        def task_with_log_datasets():
            a_target = target("/path/to/value.csv")
            log_dataset_op(a_target,
                           DbndDatasetOperationType.read,
                           with_schema=False)

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "/path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
        assert log_dataset_arg.value_preview == ""
        assert log_dataset_arg.data_dimensions is None
        assert log_dataset_arg.data_schema is None

        log_metrics_args = get_log_metrics(mock_channel_tracker)
        assert len(list(log_metrics_args)) == 0
Example 5
    def test_log_dataset(self, mock_channel_tracker):
        @task()
        def task_with_log_datasets():
            log_dataset_op(
                "location://path/to/value.csv",
                DbndDatasetOperationType.read,
                with_schema=False,
            )

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "location://path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
        assert log_dataset_arg.value_preview == ""
        assert log_dataset_arg.data_dimensions is None
        assert log_dataset_arg.data_schema is None

        # no metrics reported
        log_metrics_args = list(get_log_metrics(mock_channel_tracker))
        assert len(log_metrics_args) == 0
Example 6
    def test_failed_target(self, mock_channel_tracker):
        @task()
        def task_with_log_datasets():
            log_dataset_op(
                "location://path/to/value.csv",
                "read",  # Check passing str values too
                success=False,
                with_schema=False,
            )

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "location://path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.NOK
        assert log_dataset_arg.value_preview == ""
        assert log_dataset_arg.data_dimensions is None
        assert log_dataset_arg.data_schema is None

        log_metrics_args = get_log_metrics(mock_channel_tracker)
        assert len(list(log_metrics_args)) == 0
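
Examples 4-6 together show that log_dataset_op accepts either a plain string
or a target() for the path, and either the DbndDatasetOperationType enum or
its string name for the operation type, while success=False records the
operation with a NOK status. A condensed sketch, assuming log_dataset_op is
exported from the top-level dbnd package like log_metric/log_metrics:

    from dbnd import log_dataset_op  # assumed top-level export

    # The operation type may be the enum or its string name; success=False
    # marks the operation as failed (status NOK).
    log_dataset_op("s3://bucket/in.csv", "read", with_schema=False)
    log_dataset_op("s3://bucket/out.csv", "write", success=False,
                   with_schema=False)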
Example 7
    def test_log_dataset_op_histograms_stats_flags(self, mock_channel_tracker,
                                                   with_histograms,
                                                   with_stats):
        # Test the with_histograms/with_stats flags for a pandas DataFrame

        with open(THIS_DIR + "/nested_data.json", encoding="utf-8-sig") as f:
            nested_json = pd.json_normalize(json.load(f))

        @task()
        def task_log_dataset_op_nested_json_data():
            log_dataset_op(
                op_path="/my/path/to/nested_data.json",
                op_type=DbndDatasetOperationType.write,
                data=nested_json,
                with_histograms=with_histograms,
                with_stats=with_stats,
            )

        task_log_dataset_op_nested_json_data()

        log_dataset_arg: LogDatasetArgs = one(
            get_log_datasets(mock_channel_tracker))
        metrics_info = list(get_log_metrics(mock_channel_tracker))
        histograms_metrics = list(
            filter(lambda m: m["metric"].key.endswith("histograms"),
                   metrics_info))
        if with_histograms and with_stats:
            assert histograms_metrics
            assert log_dataset_arg.columns_stats
        elif with_histograms:
            assert histograms_metrics
            assert not log_dataset_arg.columns_stats
        elif with_stats:
            assert not histograms_metrics
            assert log_dataset_arg.columns_stats
        else:
            assert not histograms_metrics
            assert not log_dataset_arg.columns_stats
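
The with_histograms and with_stats arguments are presumably supplied by
stacked parametrize decorators omitted from the snippet, along these lines:

    import pytest

    # Hypothetical decorators -- not shown in the snippet -- producing the
    # four flag combinations exercised by the if/elif branches above.
    @pytest.mark.parametrize("with_stats", [True, False])
    @pytest.mark.parametrize("with_histograms", [True, False])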
Example 8
    def test_log_dataset_op_nested_json_data(self, mock_channel_tracker,
                                             preview, schema):
        with open(THIS_DIR + "/nested_data.json", encoding="utf-8-sig") as f:
            nested_json = pd.json_normalize(json.load(f))

        @task()
        def task_log_dataset_op_nested_json_data():
            log_dataset_op(
                op_path="/my/path/to/nested_data.json",
                op_type=DbndDatasetOperationType.read,
                data=nested_json,
                with_schema=schema,
                with_preview=preview,
                with_histograms=True,
                with_partition=False,
            )

        task_log_dataset_op_nested_json_data()
        metrics_info = list(get_log_metrics(mock_channel_tracker))
        map_metrics = {
            metric_info["metric"].key: metric_info["metric"]
            for metric_info in metrics_info
        }

        print("preview={preview}, schema={schema}".format(preview=preview,
                                                          schema=schema))

        for m in metrics_info:
            print(m["metric"], m["metric"].value)

        assert "my.path.to.nested_data.json.shape0" in map_metrics
        assert map_metrics["my.path.to.nested_data.json.shape0"].value == 3
        assert "my.path.to.nested_data.json.shape1" in map_metrics
        assert map_metrics["my.path.to.nested_data.json.shape1"].value == 22

        # Tests for schema
        # ------------------
        # The schema is reported only when the schema flag is on; its source is "user".
        assert if_and_only_if(
            schema,
            ("my.path.to.nested_data.json.schema" in map_metrics
             and map_metrics["my.path.to.nested_data.json.schema"].source
             == "user"),
        )

        #
        # a size.bytes entry is included in the schema value whenever
        # the schema flag is on
        assert if_and_only_if(
            schema,
            ("my.path.to.nested_data.json.schema" in map_metrics
             and "size.bytes"
             in map_metrics["my.path.to.nested_data.json.schema"].value),
        )

        # Tests for preview
        # ------------------
        # When preview is on we expect the value to be sent with a preview
        assert if_and_only_if(
            preview,
            ("my.path.to.nested_data.json" in map_metrics and "value_preview"
             in map_metrics["my.path.to.nested_data.json"].value),
        )

        #
        # When both preview and schema are on we expect a schema section inside the value metric
        assert if_and_only_if(
            (preview and schema),
            ("my.path.to.nested_data.json" in map_metrics
             and "schema" in map_metrics["my.path.to.nested_data.json"].value),
        )
        #
        # With both preview and schema on, the schema embedded in the value
        # metric should include a size.bytes entry
        assert if_and_only_if(
            (preview and schema),
            ("my.path.to.nested_data.json" in map_metrics
             and "schema" in map_metrics["my.path.to.nested_data.json"].value
             and "size.bytes"
             in map_metrics["my.path.to.nested_data.json"].value["schema"]),
        )

        #
        # Columns are reported only when the schema flag is on; these are the expected columns.
        expected_columns = {
            "greeting",
            "longitude",
            "eyeColor",
            "address",
            "name",
            "age",
            "isActive",
            "tags",
            "guid",
            "about",
            "index",
            "balance",
            "email",
            "phone",
            "registered",
            "latitude",
            "_id",
            "favoriteFruit",
            "picture",
            "gender",
            "friends",
            "company",
        }
        if schema:
            assert set(map_metrics["my.path.to.nested_data.json.schema"].
                       value["columns"]) == set(expected_columns)
Example 9
    def test_log_dataset_op_flat_json_data(self, mock_channel_tracker, preview,
                                           schema):
        with open(THIS_DIR + "/flat_data.json", encoding="utf-8-sig") as f:
            flat_json = json.load(f)

        @task()
        def task_log_dataset_op_flat_json_data():
            log_dataset_op(
                op_path="/my/path/to/flat_data.json",
                op_type=DbndDatasetOperationType.read,
                data=flat_json,
                with_schema=schema,
                with_preview=preview,
                with_histograms=False,
                with_partition=False,
            )

        task_log_dataset_op_flat_json_data()
        metrics_info = list(get_log_metrics(mock_channel_tracker))
        map_metrics = {
            metric_info["metric"].key: metric_info["metric"]
            for metric_info in metrics_info
        }

        print("preview={preview}, schema={schema}".format(preview=preview,
                                                          schema=schema))

        for m in metrics_info:
            print(m["metric"], m["metric"].value)

        assert "my.path.to.flat_data.json.shape0" in map_metrics
        assert map_metrics["my.path.to.flat_data.json.shape0"].value == 6
        assert "my.path.to.flat_data.json.shape1" in map_metrics
        assert map_metrics["my.path.to.flat_data.json.shape1"].value == 5

        # Tests for schema
        # ------------------
        # The schema is reported only when the schema flag is on; its source is "user".
        assert if_and_only_if(
            schema,
            ("my.path.to.flat_data.json.schema" in map_metrics and
             map_metrics["my.path.to.flat_data.json.schema"].source == "user"),
        )
        #
        # a size.bytes entry is included in the schema value whenever
        # the schema flag is on
        assert if_and_only_if(
            schema,
            ("my.path.to.flat_data.json.schema" in map_metrics and "size.bytes"
             in map_metrics["my.path.to.flat_data.json.schema"].value),
        )

        # Tests for preview
        # ------------------
        # When preview is on we expect the value to be sent with a preview
        assert if_and_only_if(
            preview,
            ("my.path.to.flat_data.json" in map_metrics and "value_preview"
             in map_metrics["my.path.to.flat_data.json"].value),
        )
        #
        # When both preview and schema are on we expect a schema section inside the value metric
        assert if_and_only_if(
            (preview and schema),
            ("my.path.to.flat_data.json" in map_metrics
             and "schema" in map_metrics["my.path.to.flat_data.json"].value),
        )
        #
        # With both preview and schema on, the schema embedded in the value
        # metric should include a size.bytes entry
        assert if_and_only_if(
            (preview and schema),
            ("my.path.to.flat_data.json" in map_metrics
             and "schema" in map_metrics["my.path.to.flat_data.json"].value
             and "size.bytes"
             in map_metrics["my.path.to.flat_data.json"].value["schema"]),
        )

        # Columns are reported only when the schema flag is on; these are the expected columns.
        expected_columns = {
            "Leave",
            "Serial Number",
            "Employee Markme",
            "Description",
            "Company Name",
        }
        if schema:
            assert (set(map_metrics["my.path.to.flat_data.json.schema"].
                        value["columns"]) == expected_columns)
Example 10
    def test_log_data(
        self,
        mock_channel_tracker,
        pandas_data_frame,
        preview,
        size,
        schema,
        stats,
        histograms,
        path,
    ):
        """
        This test is a bit involved, but it covers (almost) every flag this function takes.
        For each scenario we want to be sure that the expected output is sent to the tracker.
        This means that when a specific flag is set to False, the relevant metrics must *not* be sent.

        This is not a test of whether the histogram calculation, preview, or schema output is correct.
        It only tests the API of log_data/log_dataframe.

        !! This test helps us see that this interface is not very intuitive !!
        """
        @task()
        def task_with_log_data():
            log_data(
                key="df",
                value=pandas_data_frame,
                with_preview=preview,
                with_size=size,
                with_schema=schema,
                with_stats=stats,
                with_histograms=histograms,
                path="/my/path/to_file.txt" if path else None,
            )

        task_with_log_data()
        metrics_info = list(get_log_metrics(mock_channel_tracker))
        map_metrics = {
            metric_info["metric"].key: metric_info["metric"]
            for metric_info in metrics_info
        }

        # note: This is a test helper to use when debugging
        # side-note: I wish we didn't have to support py2 and could use f-strings
        # side-note: I wish we could use the f-string debug syntax available from py3.8
        # https://tirkarthi.github.io/programming/2019/05/08/f-string-debugging.html
        # >>> print(f"{preview=}, {size=}, {schema=}, {stats=},{histograms=}")
        print(
            "preview={preview}, size={size}, schema={schema}, stats={stats}, histograms={histograms}, path={path}"
            .format(
                preview=preview,
                size=size,
                schema=schema,
                stats=stats,
                histograms=histograms,
                path=path,
            ))

        for m in metrics_info:
            print(m["metric"], m["metric"].value)

        # no matter which configuration is set we expect to log the shape:
        assert "df.shape0" in map_metrics
        assert map_metrics["df.shape0"].value == 5
        assert "df.shape1" in map_metrics
        assert map_metrics["df.shape1"].value == 3

        # Tests for schema
        # ------------------
        # The schema is reported only when the schema flag is on; its source is "user".
        assert if_and_only_if(
            schema,
            ("df.schema" in map_metrics
             and map_metrics["df.schema"].source == "user"),
        )
        #
        # The size flag only takes effect together with the schema flag;
        # together they add a size.bytes entry to the schema value
        assert if_and_only_if(
            (schema and size),
            ("df.schema" in map_metrics
             and "size.bytes" in map_metrics["df.schema"].value),
        )

        # Tests for preview
        # ------------------
        # When preview is on we expect the value to be sent with a preview
        assert if_and_only_if(
            preview,
            ("df" in map_metrics
             and "value_preview" in map_metrics["df"].value),
        )
        #
        # When both preview and schema are on we expect a schema section inside the value metric
        assert if_and_only_if(
            (preview and schema),
            ("df" in map_metrics and "schema" in map_metrics["df"].value),
        )
        #
        # With preview, schema and size all on, the schema embedded in the
        # value metric should include a size.bytes entry
        assert if_and_only_if(
            (preview and schema and size),
            ("df" in map_metrics and "schema" in map_metrics["df"].value
             and "size.bytes" in map_metrics["df"].value["schema"]),
        )

        # Tests for histograms
        # ---------------------
        # We only log the histogram metrics when we use the histogram flag
        assert if_and_only_if(
            histograms,
            ("df.histograms" in map_metrics
             and map_metrics["df.histograms"].source == "histograms"
             and "df.histogram_system_metrics" in map_metrics
             and map_metrics["df.histogram_system_metrics"].source
             == "histograms"),
        )
        #
        # This is a tricky one - when stats is on, multiple histogram
        # metrics are created for each of the columns.
        assert if_and_only_if(
            stats,
            all(
                any(header in metric_name for metric_name in map_metrics)
                for header in pandas_data_frame.columns),
        )

        if path:
            log_target = first_true(
                get_log_targets(mock_channel_tracker),
                pred=lambda t: not t.target_path.startswith("memory://"),
            )
            assert log_target.target_path == "/my/path/to_file.txt"
            # the data dimensions are taken from the data frame
            assert log_target.data_dimensions == (5, 3)

            has_data_schema = bool(eval(log_target.data_schema))
            assert if_and_only_if(schema or size, has_data_schema)
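
As in the earlier examples, the six boolean arguments are presumably driven
by stacked parametrize decorators (64 combinations) omitted from the snippet,
and first_true matches the semantics of more_itertools.first_true (the first
item satisfying the predicate):

    import pytest

    # Hypothetical parametrization -- the original decorators are not
    # included in the snippet.
    @pytest.mark.parametrize("preview", [True, False])
    @pytest.mark.parametrize("size", [True, False])
    @pytest.mark.parametrize("schema", [True, False])
    @pytest.mark.parametrize("stats", [True, False])
    @pytest.mark.parametrize("histograms", [True, False])
    @pytest.mark.parametrize("path", [True, False])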