Example #1
0
    def test_custom_transform_server(
        self,
        resources,
        framework,
        problem,
        language,
        docker,
        tmp_path,
        use_arrow,
    ):
        """Start a DRUM server and POST the dataset to /transform/.

        Verifies that the response reports the expected payload format
        (arrow / csv / sparse depending on the framework and ``use_arrow``)
        and that the transformed output has one row per input row.
        """
        custom_model_dir = _create_custom_model_dir(
            resources,
            tmp_path,
            framework,
            problem,
            language,
        )

        with DrumServerRun(
                resources.target_types(problem),
                resources.class_labels(framework, problem),
                custom_model_dir,
                docker,
        ) as run:
            input_dataset = resources.datasets(framework, problem)
            # do predictions; close the uploaded file handle deterministically
            # (it was previously leaked via a bare open()).
            with open(input_dataset) as feature_file:
                files = {"X": feature_file}
                if use_arrow:
                    files["arrow_version"] = ".2"

                response = requests.post(run.url_server_address + "/transform/",
                                         files=files)
            print(response.text)
            assert response.ok

            in_data = pd.read_csv(input_dataset)

            # Parse the payload once instead of re-eval()ing response.text for
            # every access. NOTE(review): eval() on a server response is unsafe
            # in general; kept because this endpoint's legacy payload is a
            # Python-literal dict — confirm before reuse elsewhere.
            parsed_response = eval(response.text)

            if framework == SKLEARN_TRANSFORM_DENSE:
                if use_arrow:
                    transformed_out = read_arrow_payload(parsed_response)
                    assert parsed_response["out.format"] == "arrow"
                else:
                    transformed_out = read_csv_payload(parsed_response)
                    assert parsed_response["out.format"] == "csv"
                actual_num_predictions = transformed_out.shape[0]
            else:
                transformed_out = read_mtx_payload(parsed_response)
                actual_num_predictions = transformed_out.shape[0]
                assert parsed_response["out.format"] == "sparse"
            validate_transformed_output(
                transformed_out,
                should_be_sparse=framework == SKLEARN_TRANSFORM)
            assert in_data.shape[0] == actual_num_predictions
    def test_custom_transforms_with_drum_nginx_prediction_server(
        self,
        resources,
        framework,
        problem,
        language,
        docker,
        tmp_path,
    ):
        """Same /transform/ round-trip as the plain server test, but with the
        DRUM server fronted by nginx (``nginx=True``).

        Checks only that the call succeeds and the sparse payload has one row
        per input row.
        """
        custom_model_dir = _create_custom_model_dir(
            resources,
            tmp_path,
            framework,
            problem,
            language,
        )

        with DrumServerRun(
                resources.target_types(problem),
                resources.class_labels(framework, problem),
                custom_model_dir,
                docker,
                nginx=True,
        ) as run:
            input_dataset = resources.datasets(framework, problem)
            # do predictions; close the uploaded file handle deterministically
            # (it was previously leaked via a bare open()).
            with open(input_dataset) as feature_file:
                response = requests.post(run.url_server_address + "/transform/",
                                         files={"X": feature_file})

            assert response.ok

            in_data = pd.read_csv(input_dataset)

            parsed_response = parse_multi_part_response(response)

            transformed_mat = read_mtx_payload(parsed_response,
                                               X_TRANSFORM_KEY)
            actual_num_predictions = transformed_mat.shape[0]
            assert in_data.shape[0] == actual_num_predictions
    def check_prediction_side_effects(self):
        """Detect nondeterministic predictions ("side effects").

        Samples a subset of the input, runs the model server on both the full
        input and the subset, and verifies (via np.isclose with loose rtol)
        that the subset's predictions match the corresponding rows of the full
        run. Raises ValueError on mismatch, leaving the sample file on disk
        for debugging; removes it on success.
        """
        rtol = 2e-02
        atol = 1e-06
        input_extension = os.path.splitext(self.options.input)
        is_sparse = input_extension[1] == ".mtx"

        if is_sparse:
            df = pd.DataFrame(mmread(self.options.input).tocsr())
            samplesize = min(1000, max(int(len(df) * 0.1), 10))
            data_subset = df.sample(n=samplesize, random_state=42)
            # mkstemp returns an OPEN descriptor; close it so it isn't leaked
            # (we only need the path — mmwrite/to_csv reopen by name).
            fd, __tempfile_sample = mkstemp(suffix=".mtx")
            os.close(fd)
            sparse_mat = vstack(x[0] for x in data_subset.values)
            mmwrite(__tempfile_sample, sparse_mat)
        else:
            df = pd.read_csv(self.options.input)
            samplesize = min(1000, max(int(len(df) * 0.1), 10))
            data_subset = df.sample(n=samplesize, random_state=42)
            fd, __tempfile_sample = mkstemp(suffix=".csv")
            os.close(fd)
            data_subset.to_csv(__tempfile_sample, index=False)

        if self.target_type == TargetType.BINARY:
            labels = [self.options.negative_class_label, self.options.positive_class_label]
        elif self.target_type == TargetType.MULTICLASS:
            labels = self.options.class_labels
        else:
            labels = None

        with DrumServerRun(
            self.target_type.value,
            labels,
            self.options.code_dir,
        ) as run:
            response_key = (
                X_TRANSFORM_KEY
                if self.target_type == TargetType.TRANSFORM
                else RESPONSE_PREDICTIONS_KEY
            )
            endpoint = "/transform/" if self.target_type == TargetType.TRANSFORM else "/predict/"

            # Close the uploaded file handles deterministically (they were
            # previously leaked via bare open() calls).
            with open(self.options.input) as full_file:
                response_full = requests.post(
                    run.url_server_address + endpoint, files={"X": full_file}
                )

            with open(__tempfile_sample) as sample_file:
                response_sample = requests.post(
                    run.url_server_address + endpoint, files={"X": sample_file}
                )

            if self.target_type == TargetType.TRANSFORM:
                # NOTE(review): eval() on a server response is unsafe in
                # general; kept because this legacy payload is a
                # Python-literal dict — confirm before reuse elsewhere.
                if is_sparse:
                    preds_full = pd.DataFrame(read_mtx_payload(eval(response_full.text)))
                    preds_sample = pd.DataFrame(read_mtx_payload(eval(response_sample.text)))
                else:
                    preds_full = read_csv_payload(eval(response_full.text))
                    preds_sample = read_csv_payload(eval(response_sample.text))
            else:
                preds_full = pd.DataFrame(json.loads(response_full.text)[response_key])
                preds_sample = pd.DataFrame(json.loads(response_sample.text)[response_key])

            # Align the full-run predictions to the sampled rows by index.
            preds_full_subset = preds_full.iloc[data_subset.index]

            matches = np.isclose(preds_full_subset, preds_sample, rtol=rtol, atol=atol)
            if not np.all(matches):
                message = """
                            Error: Your predictions were different when we tried to predict twice.
                            No randomness is allowed.
                            The last 10 predictions from the main predict run were: {}
                            However when we reran predictions on the same data, we got: {}.
                            The sample used to calculate prediction reruns can be found in this file: {}""".format(
                    preds_full_subset[~matches][:10], preds_sample[~matches][:10], __tempfile_sample
                )
                raise ValueError(message)
            else:
                # Clean up the temp sample only on success; on failure it is
                # intentionally kept for inspection (path is in the message).
                os.remove(__tempfile_sample)
    def test_custom_transform_server(
        self,
        resources,
        framework,
        problem,
        language,
        docker,
        tmp_path,
        use_arrow,
        pass_target,
    ):
        """POST features (and optionally the target) to /transform/ and check
        the reported payload formats, column names (sparse case), target
        round-trip, and row count.
        """
        custom_model_dir = _create_custom_model_dir(
            resources,
            tmp_path,
            framework,
            problem,
            language,
        )

        with DrumServerRun(
                resources.target_types(problem),
                resources.class_labels(framework, problem),
                custom_model_dir,
                docker,
        ) as run:
            input_dataset = resources.datasets(framework, problem)
            in_data = pd.read_csv(input_dataset)

            files = {"X": open(input_dataset)}
            if pass_target:
                target_dataset = resources.targets(problem)
                files["y"] = open(target_dataset)

            if use_arrow:
                files["arrow_version"] = ".2"

            try:
                response = requests.post(run.url_server_address + "/transform/",
                                         files=files)
            finally:
                # Close the uploaded file handles (previously leaked). The
                # "arrow_version" entry is a plain string, hence the guard.
                for payload in files.values():
                    if hasattr(payload, "close"):
                        payload.close()
            assert response.ok

            parsed_response = parse_multi_part_response(response)

            if framework == SKLEARN_TRANSFORM_DENSE:
                if use_arrow:
                    transformed_out = read_arrow_payload(
                        parsed_response, X_TRANSFORM_KEY)
                    assert parsed_response["X.format"] == "arrow"
                    # Merged the duplicated pass_target checks into one branch.
                    if pass_target:
                        target_out = read_arrow_payload(
                            parsed_response, Y_TRANSFORM_KEY)
                        assert parsed_response["y.format"] == "arrow"
                else:
                    transformed_out = read_csv_payload(parsed_response,
                                                       X_TRANSFORM_KEY)
                    assert parsed_response["X.format"] == "csv"
                    if pass_target:
                        target_out = read_csv_payload(parsed_response,
                                                      Y_TRANSFORM_KEY)
                        assert parsed_response["y.format"] == "csv"
                actual_num_predictions = transformed_out.shape[0]
            else:
                transformed_out = read_mtx_payload(parsed_response,
                                                   X_TRANSFORM_KEY)
                colnames = parsed_response["X.colnames"].decode("utf-8").split(
                    "\n")
                assert len(colnames) == transformed_out.shape[1]
                if pass_target:
                    # this shouldn't be sparse even though features are.
                    # (The inner pass_target checks were redundant here —
                    # this branch already requires pass_target.)
                    if use_arrow:
                        target_out = read_arrow_payload(
                            parsed_response, Y_TRANSFORM_KEY)
                        assert parsed_response["y.format"] == "arrow"
                    else:
                        target_out = read_csv_payload(parsed_response,
                                                      Y_TRANSFORM_KEY)
                        assert parsed_response["y.format"] == "csv"
                actual_num_predictions = transformed_out.shape[0]
                assert parsed_response["X.format"] == "sparse"
            validate_transformed_output(
                transformed_out,
                should_be_sparse=framework == SKLEARN_TRANSFORM)
            if pass_target:
                assert all(pd.read_csv(target_dataset) == target_out)
            assert in_data.shape[0] == actual_num_predictions
Example #5
0
 def load_transform_output(response, is_sparse, request_key):
     """Parse a multi-part /transform/ response and return a DataFrame.

     The payload under ``request_key`` is read as a sparse matrix when
     ``is_sparse`` is true, otherwise as CSV.
     """
     parsed = parse_multi_part_response(response)
     reader = read_mtx_payload if is_sparse else read_csv_payload
     return pd.DataFrame(reader(parsed, request_key))