Example #1
0
    def test_collect_feature_data_and_vis_attributes(self, df):
        """Unit test that attributes pre-logging are the correct format."""
        input_df = df.iloc[:, :-1]
        output_df = df.iloc[:, [-1]]

        profile = RegisteredModelVersion._compute_training_data_profile(
            input_df,
            output_df,
        )
        attrs = RegisteredModelVersion._collect_feature_data_and_vis_attributes(
            profile,
        )

        feature_prefix = _deployable_entity._FEATURE_DATA_ATTR_PREFIX
        for attr_key, attr_val in attrs.items():
            if not attr_key.startswith(feature_prefix):
                continue
            feature_data = _utils.json_to_proto(
                attr_val, FeatureDataInModelVersion)
            self.assert_feature_data_correctness(
                feature_data, input_df, output_df)

            # Each feature-data attr has a paired visualization-sample attr
            # keyed by feature name plus a profiler-dependent suffix.
            if feature_data.profiler_name == "MissingValuesProfiler":
                suffix = "MissingValues"
            else:
                suffix = "Distribution"
            vis_key = _deployable_entity._TRAINING_DATA_ATTR_PREFIX + (
                RegisteredModelVersion._normalize_attribute_key(
                    feature_data.feature_name + suffix))
            assert attrs[vis_key] == json.loads(feature_data.content)
Example #2
0
    def test_create_summaries(self, df, labels):
        """Unit test for the exact expected output of discrete & continuous columns."""
        pytest.importorskip("numpy")

        def check_feature_data(feature_data, col, profiler_name,
                               summary_type, params, histogram):
            # Shared assertions for every summary flavor below.
            assert feature_data.feature_name == col
            assert feature_data.profiler_name == profiler_name
            assert json.loads(feature_data.profiler_parameters) == params
            assert feature_data.summary_type_name == summary_type
            assert feature_data.labels == labels
            assert json.loads(feature_data.content) == histogram._as_dict()

        # missing
        for col in ["continuous", "discrete"]:
            feature_data = RegisteredModelVersion._create_missing_value_summary(
                df,
                col,
                labels,
            )
            sample = profiler.MissingValuesProfiler([col]).profile(df)
            histogram = next(iter(sample.values()))
            check_feature_data(
                feature_data,
                col,
                "MissingValuesProfiler",
                "verta.discreteHistogram.v1",
                {"columns": [col]},
                histogram,
            )

        # continuous distribution
        feature_data = RegisteredModelVersion._create_continuous_histogram_summary(
            df,
            "continuous",
            labels,
        )
        sample = profiler.ContinuousHistogramProfiler(["continuous"]).profile(df)
        histogram = next(iter(sample.values()))
        check_feature_data(
            feature_data,
            "continuous",
            "ContinuousHistogramProfiler",
            "verta.floatHistogram.v1",
            {"columns": ["continuous"], "bins": histogram._bucket_limits},
            histogram,
        )

        # discrete distribution
        feature_data = RegisteredModelVersion._create_discrete_histogram_summary(
            df,
            "discrete",
            labels,
        )
        sample = profiler.BinaryHistogramProfiler(["discrete"]).profile(df)
        histogram = next(iter(sample.values()))
        check_feature_data(
            feature_data,
            "discrete",
            "BinaryHistogramProfiler",
            "verta.discreteHistogram.v1",
            {"columns": ["discrete"]},
            histogram,
        )
Example #3
0
    def test_compute_training_data_profile(self, df):
        """Unit test for helper functions handling DFs of various sizes."""
        input_df = df.iloc[:, :-1]
        output_df = df.iloc[:, [-1]]

        profile = RegisteredModelVersion._compute_training_data_profile(
            input_df,
            output_df,
        )
        for feature_data in profile:
            self.assert_feature_data_correctness(
                feature_data, input_df, output_df)
Example #4
0
    def _update_from_dict(self, update_dict, wait=False):
        if update_dict["strategy"] == "direct":
            strategy = DirectUpdateStrategy()
        elif update_dict["strategy"] == "canary":
            strategy = CanaryUpdateStrategy(
                interval=int(update_dict["canary_strategy"]
                             ["progress_interval_seconds"]),
                step=float(update_dict["canary_strategy"]["progress_step"]),
            )

            for rule in update_dict["canary_strategy"]["rules"]:
                strategy.add_rule(_UpdateRule._from_dict(rule))
        else:
            raise ValueError('update strategy must be "direct" or "canary"')

        if "autoscaling" in update_dict:
            autoscaling_obj = Autoscaling._from_dict(
                update_dict["autoscaling"]["quantities"])

            for metric in update_dict["autoscaling"]["metrics"]:
                autoscaling_obj.add_metric(
                    _AutoscalingMetric._from_dict(metric))
        else:
            autoscaling_obj = None

        if "resources" in update_dict:
            resources_list = Resources._from_dict(update_dict["resources"])
        else:
            resources_list = None

        if "run_id" in update_dict and "model_version_id" in update_dict:
            raise ValueError("cannot provide both run_id and model_version_id")
        elif "run_id" in update_dict:
            model_reference = ExperimentRun._get_by_id(
                self._conn, self._conf, id=update_dict["run_id"])
        elif "model_version_id" in update_dict:
            model_reference = RegisteredModelVersion._get_by_id(
                self._conn, self._conf, id=update_dict["model_version_id"])
        else:
            raise RuntimeError(
                "must provide either model_version_id or run_id")

        return self.update(
            model_reference,
            strategy,
            wait=wait,
            resources=resources_list,
            autoscaling=autoscaling_obj,
            env_vars=update_dict.get("env_vars"),
        )