Beispiel #1
0
    def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
        """Log scalar/collection metrics and a dataframe, then read all
        metric values back from the file store and compare exactly
        (except the volatile ``df.schema`` payload, which is only checked
        for presence)."""
        folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(folder)
        store = FileTrackingStore()
        tracker = TaskRunTracker(task_run=task_run, tracking_store=store)
        tracker.settings.features.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )

        for key, value in [
            ("a", 1),
            ("a_string", "1"),
            ("a_list", [1, 3]),
            ("a_tuple", (1, 2)),
        ]:
            tracker.log_metric(key, value)
        tracker.log_dataframe(
            "df", pandas_data_frame, meta_conf=ValueMetaConf.enabled()
        )

        actual = TaskRunMetricsFileStoreReader(folder).get_all_metrics_values()

        print(actual)
        # schema payload is large/volatile — assert presence only, then drop it
        assert "df.schema" in actual
        del actual["df.schema"]
        expected = {
            "a": 1.0,
            "a_list": "[1, 3]",
            "a_string": 1.0,
            "a_tuple": "(1, 2)",
            "df.preview": "Names  Births",
            "df.shape": "(5, 2)",
            "df.shape_0_": 5.0,
            "df.shape_1_": 2.0,
        }
        assert actual == expected
    def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
        """Log a dataframe via ``log_data`` and read histogram-sourced
        metrics back from the file store."""
        # tmpdir-backed target acts as the root of the file metrics store
        metrics_folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        t = FileTrackingStore()
        tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
        # NOTE(review): this variant patches settings.tracking, while the
        # sibling tests patch settings.features — confirm which is current.
        tr_tracker.settings.tracking.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )
        tr_tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

        # only metrics whose source is the histogram calculation
        hist_metrics = TaskRunMetricsFileStoreReader(
            metrics_folder
        ).get_all_metrics_values(MetricSource.histograms)

        # NOTE(review): expected_preview is unused in the visible span — the
        # tail of this test appears to be truncated in this snippet.
        expected_preview = (
            "   Names  Births  Married\n"
            "     Bob     968     True\n"
            " Jessica     155    False\n"
            "    Mary      77     True\n"
            "    John     578    False\n"
            "     Mel     973     True"
        )

        # std value varies in different py versions due to float precision fluctuation
        df_births_std = hist_metrics["df.Births.std"]
        assert df_births_std == pytest.approx(428.4246)
Beispiel #3
0
    def test_get_value_meta(self, snowflake_table):
        """get_value_meta on a Snowflake table should produce preview,
        dimensions, schema and hash, driving the (mocked) controller."""
        # Arrange: replace the real controller with the mock factory
        patcher = mock.patch(
            "dbnd_snowflake.snowflake_values.SnowflakeController",
            new_callable=snowflake_controller_mock,
        )
        with patcher as snowflake_ctrl:
            # Act
            value_meta = SnowflakeTableValueType().get_value_meta(
                snowflake_table, meta_conf=ValueMetaConf.enabled()
            )

        # Assert
        assert value_meta.value_preview == "test preview"
        assert value_meta.data_dimensions == [42, 12]
        expected_schema = {
            "type": "SnowflakeTable",
            "column_types": {"name": "varchar"},
            "size": "500 B",
        }
        assert value_meta.data_schema == expected_schema
        expected_hash = (
            "snowflake://*****:*****@SNOWFLAKE_ACCOUNT"
            "/SNOWFLAKE_SAMPLE_DATA.TPCDS_SF100TCL/CUSTOMER"
        )
        assert value_meta.data_hash == expected_hash
        # the controller mock must have been exercised for each meta part
        assert snowflake_ctrl.get_column_types.called
        assert snowflake_ctrl.get_dimensions.called
        assert snowflake_ctrl.to_preview.called
Beispiel #4
0
    def test_df_value_meta(self, pandas_data_frame):
        """DataFrameValueType.get_value_meta should equal a ValueMeta built
        by hand from the same frame (preview, schema, dims, hash)."""
        meta_conf = ValueMetaConf.enabled()

        # Build the expectation directly from the frame
        dtype_map = {c: str(t) for c, t in pandas_data_frame.dtypes.items()}
        expected_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(pandas_data_frame.columns),
            "size": int(pandas_data_frame.size),
            "shape": pandas_data_frame.shape,
            "dtypes": dtype_map,
        }
        preview = DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        )
        row_hashes = hash_pandas_object(pandas_data_frame, index=True).values
        expected_value_meta = ValueMeta(
            value_preview=preview,
            data_dimensions=pandas_data_frame.shape,
            data_schema=expected_schema,
            data_hash=fast_hasher.hash(row_hashes),
        )

        df_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf=meta_conf
        )

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        # compare schemas via JSON dump to normalize tuple/list differences
        assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
            expected_value_meta.data_schema
        )
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta == expected_value_meta
Beispiel #5
0
    def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
        """User-sourced metrics logged through the tracker round-trip exactly
        through the file metrics store."""
        folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(folder)
        tracker = TaskRunTracker(
            task_run=task_run, tracking_store=FileTrackingStore()
        )
        tracker.settings.features.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )
        for name, value in (
            ("a", 1),
            ("a_string", "1"),
            ("a_list", [1, 3]),
            ("a_tuple", (1, 2)),
        ):
            tracker.log_metric(name, value)

        reader = TaskRunMetricsFileStoreReader(folder)
        user_metrics = reader.get_all_metrics_values(MetricSource.user)

        # numbers come back as floats, collections as their repr strings
        assert user_metrics == {
            "a": 1.0,
            "a_list": "[1, 3]",
            "a_string": 1.0,
            "a_tuple": "(1, 2)",
        }
    def test_spark_df_value_meta(self, spark_data_frame):
        """Spark DF value meta (no histograms variant) should match a
        hand-built ValueMeta: preview, schema, dimensions, and no hash."""
        expected_data_schema = {
            "type": SparkDataFrameValueType.type_str,
            "columns": list(spark_data_frame.schema.names),
            "size":
            int(spark_data_frame.count() * len(spark_data_frame.columns)),
            "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
            "dtypes":
            {f.name: str(f.dataType)
             for f in spark_data_frame.schema.fields},
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=SparkDataFrameValueType().to_preview(
                spark_data_frame, meta_conf.get_preview_size()),
            data_dimensions=(spark_data_frame.count(),
                             len(spark_data_frame.columns)),
            data_schema=expected_data_schema,
            # no hash expected for the Spark frame in this variant
            data_hash=None,
        )

        # NOTE(review): get_value_meta is called WITHOUT meta_conf although
        # the expectation was built with ValueMetaConf.enabled() — presumably
        # the default conf matches; confirm, or pass meta_conf explicitly.
        df_value_meta = SparkDataFrameValueType().get_value_meta(
            spark_data_frame)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta == expected_value_meta
Beispiel #7
0
 def test_str_value_meta(self):
     """StrValueType should preview, type-tag and hash a plain string."""
     actual = StrValueType().get_value_meta("foo", ValueMetaConf.enabled())
     assert actual == ValueMeta(
         value_preview="foo",
         data_dimensions=None,
         data_schema={"type": "str"},
         data_hash=fast_hasher.hash("foo"),
     )
    def test_spark_df_value_meta(self, spark_data_frame,
                                 spark_data_frame_histograms,
                                 spark_data_frame_stats):
        """Spark DF value meta with histograms/stats enabled.

        Compares get_value_meta output to a hand-built ValueMeta; the parts
        that fluctuate between runs (stats format, histogram timings) are
        only partially asserted.
        """
        expected_data_schema = {
            "type":
            SparkDataFrameValueType.type_str,
            "columns":
            list(spark_data_frame.schema.names),
            "size.bytes":
            int(spark_data_frame.count() * len(spark_data_frame.columns)),
            "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
            "dtypes":
            {f.name: str(f.dataType)
             for f in spark_data_frame.schema.fields},
        }

        # only the key set of these timing metrics is stable across runs
        expected_hist_sys_metrics = {
            "boolean_histograms_and_stats_calc_time",
            "histograms_and_stats_calc_time",
            "numeric_histograms_and_stats_calc_time",
            "string_histograms_and_stats_calc_time",
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=SparkDataFrameValueType().to_preview(
                spark_data_frame, meta_conf.get_preview_size()),
            data_dimensions=(spark_data_frame.count(),
                             len(spark_data_frame.columns)),
            data_hash=SparkDataFrameValueType().to_signature(spark_data_frame),
            data_schema=expected_data_schema,
            descriptive_stats=spark_data_frame_stats,
            histograms=spark_data_frame_histograms,
        )

        df_value_meta = SparkDataFrameValueType().get_value_meta(
            spark_data_frame, meta_conf)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        # it changes all the time, it has different formats, and it's already tested in histogram tests
        # assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

        # histogram_system_metrics values are too dynamic, so checking only keys, but not values
        assert (set(df_value_meta.histogram_system_metrics.keys()) ==
                expected_hist_sys_metrics)
        df_value_meta.histogram_system_metrics = None

        # assert df_value_meta.histograms == expected_value_meta.histograms
        # assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)

        # NOTE(review): pandas_value_meta is never asserted in the visible
        # span — the tail of this test appears to be truncated in this snippet.
        pandas_data_frame = spark_data_frame.toPandas()
        pandas_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf)
    def test_df_value_meta(
        self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
    ):
        """Pandas DF value meta with stats/histograms enabled.

        Unstable pieces (std float precision, most-frequent-value ties,
        histogram value order, timing metrics) are popped or normalized
        before the final full-equality comparison.
        """
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(pandas_data_frame.columns),
            "size": int(pandas_data_frame.size),
            "shape": pandas_data_frame.shape,
            "dtypes": {
                col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
            },
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                pandas_data_frame, preview_size=meta_conf.get_preview_size()
            ),
            data_dimensions=pandas_data_frame.shape,
            data_schema=expected_data_schema,
            data_hash=fast_hasher.hash(
                hash_pandas_object(pandas_data_frame, index=True).values
            ),
            descriptive_stats=pandas_data_frame_stats,
            histograms=pandas_data_frame_histograms,
        )

        df_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf=meta_conf
        )

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        # JSON dump normalizes tuple/list differences in the schema
        assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
            expected_value_meta.data_schema
        )
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions

        # std precision fluctuates across versions — compare rounded, not exact
        std = df_value_meta.descriptive_stats["Births"].pop("std")
        expected_std = expected_value_meta.descriptive_stats["Births"].pop("std")
        assert round(std, 2) == expected_std
        # "top" for Names is a tie between unique values — drop before compare
        df_value_meta.descriptive_stats["Names"].pop("top")
        assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

        counts, values = df_value_meta.histograms.pop("Names")
        expected_counts, expected_values = expected_value_meta.histograms.pop("Names")
        assert counts == expected_counts
        assert set(values) == set(expected_values)  # order changes in each run
        # histograms are tested in histogram tests and they change a lot, no need to test also here
        df_value_meta.histograms = expected_value_meta.histograms = None

        # timing metrics are run-dependent; copy them over so equality can pass
        expected_value_meta.histogram_system_metrics = (
            df_value_meta.histogram_system_metrics
        )
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)
Beispiel #10
0
    def test_target_value_meta(self):
        """A target path value previews as its JSON-quoted path, carries a
        Path schema, and hashes the target itself."""
        path_value = target("a")
        actual = TargetPathLibValueType().get_value_meta(
            path_value, meta_conf=ValueMetaConf.enabled()
        )

        assert actual == ValueMeta(
            value_preview='"a"',
            data_dimensions=None,
            data_schema={"type": "Path"},
            data_hash=fast_hasher.hash(path_value),
        )
Beispiel #11
0
    def test_get_histograms_and_stats(self):
        """PostgresController should build column stats and histograms from
        the three catalog queries (pg_stats, pg_class, information_schema)."""
        with mock.patch(
                "dbnd_postgres.postgres_values.PostgresController._query"
        ) as query_patch:
            # Arrange: canned rows for the three queries, in issue order
            query_patch.side_effect = [
                # pg_stats
                [
                    {
                        "attname": "customer",
                        "null_frac": 0.5,
                        "n_distinct": 8,
                        "most_common_vals": "{customerA, customerB}",
                        "most_common_freqs": [0.2, 0.2],
                    }
                ],
                # pg_class
                [{"reltuples": 10}],
                # information_schema.columns
                [{"column_name": "customer", "data_type": "varchar"}],
            ]

            # Act
            postgres = PostgresController("user@database", "data_table")
            columns_stats, histograms = postgres.get_histograms_and_stats(
                ValueMetaConf.enabled()
            )

            # Assert: null_count = reltuples * null_frac; histogram counts are
            # freqs * reltuples with the remainder bucketed as "_others"
            assert columns_stats == [
                ColumnStatsArgs(
                    column_name="customer",
                    column_type="varchar",
                    records_count=10,
                    distinct_count=8,
                    null_count=5,
                )
            ]
            assert histograms == {
                "customer": ([2, 2, 1], ["customerA", "customerB", "_others"])
            }
Beispiel #12
0
    def test_get_value_meta(self, snowflake_table):
        """Snowflake value meta exposes preview, dimensions, byte-size schema
        and the table signature, exercising the controller on the fixture."""
        value_meta = SnowflakeTableValueType().get_value_meta(
            snowflake_table, meta_conf=ValueMetaConf.enabled()
        )

        # Assert
        assert value_meta.value_preview == "test preview"
        assert value_meta.data_dimensions == [42, 12]
        expected_schema = {
            "type": "SnowflakeTable",
            "column_types": {"name": "varchar"},
            "size.bytes": 500,
        }
        assert value_meta.data_schema == expected_schema
        assert value_meta.data_hash == EXPECTED_SNOWFLAKE_TABLE_SIGNATURE
        ctrl = snowflake_table.snowflake_ctrl
        assert ctrl.get_column_types.called
        assert ctrl.get_dimensions.called
        assert ctrl.to_preview.called
Beispiel #13
0
    def test_df_value_meta(self):
        """get_value_meta on a Postgres table should drive all three
        controller hooks: column types, histograms/stats, and preview."""
        # Arrange
        postgres_table = PostgresTable(
            table_name="test_table", connection_string="*****@*****.**"
        )

        with mock.patch(
                "dbnd_postgres.postgres_values.PostgresController",
                new_callable=postgres_controller_mock,
        ) as postgres:
            # Act
            PostgresTableValueType().get_value_meta(
                postgres_table, meta_conf=ValueMetaConf.enabled()
            )

            # Assert
            assert postgres.columns_types.called
            assert postgres.get_histograms_and_stats.called
            assert postgres.to_preview.called
    def test_df_value_meta(self, pandas_data_frame):
        """get_value_meta fully describes a DataFrame: preview, hash, schema
        (size in bytes), dimensions, plus per-column stats and histograms.

        Histogram/stat values themselves are covered by the histogram tests,
        so only the covered column names are checked here.
        """
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(pandas_data_frame.columns),
            "size.bytes": int(pandas_data_frame.size),
            "shape": pandas_data_frame.shape,
            "dtypes": {
                col: str(type_)
                for col, type_ in pandas_data_frame.dtypes.items()
            },
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                pandas_data_frame, preview_size=meta_conf.get_preview_size()),
            data_dimensions=pandas_data_frame.shape,
            data_schema=expected_data_schema,
            data_hash=fast_hasher.hash(
                hash_pandas_object(pandas_data_frame, index=True).values),
        )

        df_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf=meta_conf)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        # (the schema was asserted twice in the original; once is enough)
        assert df_value_meta.data_schema == expected_value_meta.data_schema

        # histograms and stats are tested in histogram tests and they change a lot,
        # no need to test their values also here — only the covered columns
        assert {
            col_stats.column_name for col_stats in df_value_meta.columns_stats
        } == {"Names", "Births"}
        assert set(df_value_meta.histograms.keys()) == {"Names", "Births"}
Beispiel #15
0
def test_pandas_v0_histograms():
    """Pandas histograms calculation should be stable across Pandas v1 & v0.

    Checks the full per-column stats list and the histogram buckets;
    "str_column" value order is unstable, so it is asserted via sets.
    """
    meta_conf = ValueMetaConf.enabled()
    columns_stats, histograms = PandasHistograms(
        diverse_df, meta_conf).get_histograms_and_stats()

    # fmt: off
    # BUG FIX: the result of this comparison was previously discarded
    # (missing `assert`), so the column stats were never actually checked.
    assert columns_stats == [
        ColumnStatsArgs(
            column_name="bool_column",
            column_type="bool",
            records_count=100,
            distinct_count=3,
            null_count=35,
            most_freq_value=False,
            most_freq_value_count=33,
            unique_count=2,
        ),
        ColumnStatsArgs(
            column_name="float_column",
            column_type="float64",
            records_count=100,
            distinct_count=11,
            null_count=6,
            quartile_1=2.0,
            quartile_2=5.0,
            quartile_3=7.0,
            max_value=9.0,
            mean_value=4.7127659574,
            min_value=0.0,
            std_value=2.8572576537,
        ),
        ColumnStatsArgs(
            column_name="int_column",
            column_type="float64",
            records_count=100,
            distinct_count=11,
            null_count=8,
            quartile_1=2.0,
            quartile_2=5.0,
            quartile_3=7.0,
            max_value=9.0,
            mean_value=4.8804347826,
            min_value=0.0,
            std_value=2.7449950111,
        ),
        ColumnStatsArgs(
            column_name="str_column",
            column_type="str",
            records_count=100,
            distinct_count=5,
            null_count=21,
            most_freq_value="foo",
            most_freq_value_count=22,
            unique_count=4,
        ),
        ColumnStatsArgs(
            column_name="multi_data_types",
            column_type="str",
            records_count=100,
            distinct_count=8,
            null_count=10,
            most_freq_value="foo",
            most_freq_value_count=11,
            unique_count=18,
        ),
    ]
    # "str_column" calculation is unstable hence these unpacked assertions
    assert set(histograms.keys()) == {
        "bool_column", "float_column", "int_column", "str_column",
        "multi_data_types"
    }
    assert histograms["bool_column"] == [[35, 33, 32], [None, False, True]]
    assert histograms["float_column"] == [
        [6, 0, 9, 0, 13, 0, 8, 0, 10, 0, 0, 8, 0, 6, 0, 15, 0, 8, 0, 11],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["int_column"] == [
        [2, 0, 13, 0, 9, 0, 9, 0, 7, 0, 0, 8, 0, 15, 0, 11, 0, 6, 0, 12],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["str_column"][0] == [22, 21, 20, 20, 17]
    # "str_column" calculation is unstable
    assert set(histograms["str_column"][1]) == {"foo", None, "", "baz", "bar"}
Beispiel #16
0
 def meta_conf(self):
     """Fixture: a ValueMetaConf with every tracking option switched on."""
     conf = ValueMetaConf.enabled()
     return conf
Beispiel #17
0
def test_pandas_v0_histograms():
    """Pandas histograms calculation should be stable across Pandas v1 & v0.

    Dict-shaped stats variant: compares the full per-column stats dict and
    the histogram buckets; "str_column" value order is unstable, so it is
    asserted via sets.
    """
    meta_conf = ValueMetaConf.enabled()
    stats, histograms = PandasHistograms(diverse_df,
                                         meta_conf).get_histograms_and_stats()

    # fmt: off
    assert stats == {
        "bool_column": {
            "count": 100,
            "distinct": 3,
            "freq": 33,
            "non-null": 65,
            "null-count": 35,
            "top": False,
            "type": "bool",
            "unique": 2,
        },
        "float_column": {
            "25%": 2.0,
            "50%": 5.0,
            "75%": 7.0,
            "count": 100,
            "distinct": 11,
            "max": 9.0,
            "mean": 4.7127659574,
            "min": 0.0,
            "non-null": 94,
            "null-count": 6,
            "std": 2.8572576537,
            "type": "float64",
        },
        "int_column": {
            "25%": 2.0,
            "50%": 5.0,
            "75%": 7.0,
            "count": 100,
            "distinct": 11,
            "max": 9.0,
            "mean": 4.8804347826,
            "min": 0.0,
            "non-null": 92,
            "null-count": 8,
            "std": 2.7449950111,
            "type": "float64",
        },
        "str_column": {
            "count": 100,
            "distinct": 5,
            "freq": 22,
            "non-null": 79,
            "null-count": 21,
            "top": "foo",
            "type": "str",
            "unique": 4,
        },
    }
    # "str_column" calculation is unstable hence these unpacked assertions
    assert set(histograms.keys()) == {
        "bool_column", "float_column", "int_column", "str_column"
    }
    assert histograms["bool_column"] == [[35, 33, 32], [None, False, True]]
    assert histograms["float_column"] == [
        [6, 0, 9, 0, 13, 0, 8, 0, 10, 0, 0, 8, 0, 6, 0, 15, 0, 8, 0, 11],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["int_column"] == [
        [2, 0, 13, 0, 9, 0, 9, 0, 7, 0, 0, 8, 0, 15, 0, 11, 0, 6, 0, 12],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["str_column"][0] == [22, 21, 20, 20, 17]
    # "str_column" calculation is unstable
    assert set(histograms["str_column"][1]) == {"foo", None, "", "baz", "bar"}
Beispiel #18
0
def test_pandas_histograms_work_with_NaNs_and_nonseq_index(pandas_data_frame):
    """Histograms/stats must survive NaN cells and a non-sequential index."""
    # Arrange: drop the string column, force a non-sequential index and add a
    # row that only has a new "foo" column (all other cells become NaN).
    # FIX: DataFrame.append was removed in pandas 2.0; pd.concat with a
    # one-row frame is the equivalent (same default index 0, same NaN fill).
    pandas_data_frame = pd.concat(
        [
            pandas_data_frame.drop(columns="Names").set_index(
                [pd.Index([90, 30, 50, 70, 10])]  # emulate real world DF indices
            ),
            pd.DataFrame([{"foo": 42}]),
        ]
    )
    meta_conf = ValueMetaConf.enabled()

    # Act
    stats, histograms = PandasHistograms(pandas_data_frame,
                                         meta_conf).get_histograms_and_stats()

    # Assert
    assert sorted(stats.keys()) == sorted(["Births", "foo"])  # noqa
    assert sorted(histograms.keys()) == sorted(["Births", "foo"])  # noqa
    assert stats == {
        "Births": {
            "25%": 155.0,
            "50%": 578.0,
            "75%": 968.0,
            "count": 6,
            "distinct": 6,
            "max": 973.0,
            "mean": 550.2,
            "min": 77.0,
            "non-null": 5,
            "null-count": 1,
            "std": 428.4246724921,
            "type": "float64",
        },
        "foo": {
            "25%": 42.0,
            "50%": 42.0,
            "75%": 42.0,
            "count": 6,
            "distinct": 2,
            "max": 42.0,
            "mean": 42.0,
            "min": 42.0,
            "non-null": 1,
            "null-count": 5,
            "type": "float64",
        },
    }
    # fmt: off
    assert histograms == {
        "Births": [
            [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2],
            [
                77.0, 121.8, 166.6, 211.39999999999998, 256.2, 301.0,
                345.79999999999995, 390.59999999999997, 435.4, 480.2, 525.0,
                569.8, 614.5999999999999, 659.4, 704.1999999999999, 749.0,
                793.8, 838.5999999999999, 883.4, 928.1999999999999, 973.0
            ],
        ],
        "foo": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [
                    41.5, 41.55, 41.6, 41.65, 41.7, 41.75, 41.8, 41.85, 41.9,
                    41.95, 42.0, 42.05, 42.1, 42.15, 42.2, 42.25, 42.3, 42.35,
                    42.4, 42.45, 42.5
                ]]
    }
Beispiel #19
0
    def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
        """End-to-end: log a dataframe with histograms enabled and verify
        everything the file metrics store recorded, after stripping the
        values that fluctuate between runs (std precision, most-frequent
        ties, numeric/string histograms)."""
        metrics_folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        t = FileTrackingStore()
        tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
        tr_tracker.settings.features.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )
        tr_tracker.log_data(
            "df", pandas_data_frame, meta_conf=ValueMetaConf.enabled(),
        )

        # read back only the histogram-sourced metrics
        hist_metrics = TaskRunMetricsFileStoreReader(
            metrics_folder
        ).get_all_metrics_values(MetricSource.histograms)

        expected_preview = (
            "   Names  Births  Married\n"
            "     Bob     968     True\n"
            " Jessica     155    False\n"
            "    Mary      77     True\n"
            "    John     578    False\n"
            "     Mel     973     True"
        )

        # std value varies in different py versions due to float precision fluctuation
        df_births_std = hist_metrics["df.Births.std"]
        assert df_births_std == pytest.approx(428.4246)
        # strip the unstable entries before the exact comparison below:
        # numeric/string histograms and most-frequent ("top") values vary
        hist_metrics["df.histograms"].pop("Names")
        hist_metrics["df.histograms"].pop("Births")
        hist_metrics.pop("df.Married.top")
        hist_metrics.pop("df.Names.top")
        hist_metrics["df.stats"]["Names"].pop("top")
        hist_metrics["df.stats"]["Married"].pop("top")
        assert hist_metrics == {
            "df.Births.type": "int64",
            "df.Births.25%": 155.0,
            "df.Births.50%": 578.0,
            "df.Births.75%": 968.0,
            "df.Births.count": 5.0,
            "df.Births.distinct": 5,
            "df.Births.std": df_births_std,
            "df.Births.max": 973.0,
            "df.Births.mean": 550.2,
            "df.Births.min": 77.0,
            "df.Births.non-null": 5,
            "df.Births.null-count": 0,
            "df.Married.count": 5,
            "df.Married.distinct": 2,
            "df.Married.freq": 3,
            "df.Married.non-null": 5,
            "df.Married.null-count": 0,
            "df.Married.type": "bool",
            "df.Married.unique": 2,
            "df.Names.count": 5,
            "df.Names.distinct": 5,
            "df.Names.freq": 1,
            "df.Names.non-null": 5,
            "df.Names.null-count": 0,
            "df.Names.type": "object",
            "df.Names.unique": 5,
            "df.histograms": {"Married": [[3, 2], [True, False]],},
            "df.preview": expected_preview,
            "df.schema": {
                "columns": ["Names", "Births", "Married"],
                "dtypes": {"Births": "int64", "Names": "object", "Married": "bool"},
                "shape": [5, 3],
                "size": 15,
                "type": "DataFrame",
            },
            "df.shape": [5, 3],
            "df.shape0": 5,
            "df.shape1": 3,
            "df.stats": {
                "Births": {
                    "type": "int64",
                    "25%": 155.0,
                    "50%": 578.0,
                    "75%": 968.0,
                    "count": 5.0,
                    "distinct": 5,
                    "max": 973.0,
                    "mean": 550.2,
                    "min": 77.0,
                    "non-null": 5,
                    "null-count": 0,
                    "std": df_births_std,
                },
                "Married": {
                    "count": 5,
                    "distinct": 2,
                    "freq": 3,
                    "non-null": 5,
                    "null-count": 0,
                    "type": "bool",
                    "unique": 2,
                },
                "Names": {
                    "count": 5,
                    "distinct": 5,
                    "freq": 1,
                    "non-null": 5,
                    "null-count": 0,
                    "type": "object",
                    "unique": 5,
                },
            },
        }
Beispiel #20
0
 def meta_conf(self):
     """Fixture: a fully-enabled ValueMetaConf."""
     return ValueMetaConf.enabled()
class LazyValueType(DataValueType):
    """A data value type whose evaluation is deferred."""

    # marks values of this type as lazily evaluated for the tracking layer
    is_lazy_evaluated = True

    def support_fast_count(self, target):
        """Fast row counting is only available for parquet file targets."""
        from targets import FileTarget

        if isinstance(target, FileTarget):
            from targets.target_config import FileFormat

            return target.config.format == FileFormat.parquet
        return False


# Shared ValueMetaConf fixtures for the parametrized merge tests below.
ALL_NONE = ValueMetaConf()  # default-constructed; presumably all flags undecided — confirm
ALL_TRUE = ValueMetaConf.enabled()  # every tracking feature switched on
ALL_FALSE = ValueMetaConf(  # every tracking feature explicitly disabled
    log_size=False,
    log_preview=False,
    log_schema=False,
    log_stats=False,
    log_histograms=False,
)


class TestValueMetaConf(object):
    @pytest.mark.parametrize(
        "left, right, expected",
        [
            (ALL_NONE, ALL_TRUE, ALL_TRUE),
            (ALL_TRUE, ALL_NONE, ALL_TRUE),