Example 1
def test_model_prediction_differentials(client):
    test_inputs_df = load_dataset(file_name="test.csv")
    secondary_test_inputs_df = test_inputs_df.rename(
        columns=SECONDARY_VARIABLES_TO_RENAME)

    primary_response = client.post(
        "v1/predictions/primary",
        json=test_inputs_df.to_dict(orient="records"))
    primary_predictions = json.loads(primary_response.data)["predictions"]

    secondary_response = client.post(
        "v1/predictions/secondary",
        json=secondary_test_inputs_df.to_dict(orient="records"),
    )
    secondary_predictions = json.loads(secondary_response.data)["predictions"]

    # We only pass in the first 10 rows because the two models validate
    # inputs differently, so they filter out slightly different numbers of
    # rows, which would otherwise cause the differential test to fail.
    compare_differences(
        expected_predictions=secondary_predictions[:10],
        actual_predictions=primary_predictions[:10],
        # Adjust the rel_tol parameter to suit your model;
        # this value is extremely permissive of variation.
        rel_tol=0.2,
    )
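
The compare_differences helper is not defined in this example. A minimal sketch of what such a helper might look like, assuming it compares the two prediction lists element-wise with math.isclose; the name and keyword arguments match the call above, but the body is an assumption:

import math
from typing import List


def compare_differences(
    *,
    expected_predictions: List[float],
    actual_predictions: List[float],
    rel_tol: float,
) -> None:
    # The two prediction lists must line up row for row.
    assert len(expected_predictions) == len(actual_predictions)

    for expected, actual in zip(expected_predictions, actual_predictions):
        # math.isclose compares |expected - actual| against
        # rel_tol * max(|expected|, |actual|).
        assert math.isclose(expected, actual, rel_tol=rel_tol), (
            f"prediction {actual} differs from expected {expected} "
            f"by more than rel_tol={rel_tol}"
        )

Keyword-only arguments keep the call site in the differential test explicit about which model is "expected" and which is "actual".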
def pipeline_inputs():
    # For larger datasets, we would use a testing sub-sample here.
    data = load_dataset(file_name=config.app_config.training_data_file)

    # Divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.model_config.features],  # predictors
        data[config.model_config.target],
        test_size=config.model_config.test_size,
        # we are setting the random seed here
        # for reproducibility
        random_state=config.model_config.random_state,
    )

    return X_train, X_test, y_train, y_test
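
pipeline_inputs reads like a pytest fixture. A minimal sketch of how it might be registered and consumed, assuming the standard pytest fixture mechanism; the consuming test below is illustrative only:

import pytest


@pytest.fixture()
def pipeline_inputs():
    # Body as above: load the training data and return the train/test split.
    ...


def test_split_sizes(pipeline_inputs):
    # Unpack the split produced by the fixture.
    X_train, X_test, y_train, y_test = pipeline_inputs

    # Illustrative assertions: both splits are non-empty.
    assert len(X_train) > 0
    assert len(X_test) > 0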
def populate_database(n_predictions: int = 500, anomaly: bool = False) -> None:
    """
    Manipulate the test data to generate random
    predictions and save them to the database.
    Before running this script, ensure that the
    API and Database docker containers are running.
    """

    print(f"Preparing to generate: {n_predictions} predictions.")

    # Load the gradient boosting test dataset which
    # is included in the model package
    test_inputs_df = load_dataset(file_name="test.csv")
    clean_inputs_df = _prepare_inputs(dataframe=test_inputs_df)
    if len(clean_inputs_df) < n_predictions:
        print(
            f"If you want {n_predictions} predictions, you need to"
            "extend the script to handle more predictions."
        )

    if anomaly:
        # set extremely low values to generate an outlier
        n_predictions = 1
        clean_inputs_df.loc[:, "FirstFlrSF"] = 1
        clean_inputs_df.loc[:, "LotArea"] = 1
        clean_inputs_df.loc[:, "OverallQual"] = 1
        clean_inputs_df.loc[:, "GrLivArea"] = 1

    for index, data in clean_inputs_df.iterrows():
        if index > n_predictions:
            if anomaly:
                print("Created 1 anomaly")
            break

        response = requests.post(
            f"{LOCAL_URL}/v1/predictions/regression",
            headers=HEADERS,
            json=[data.to_dict()],
        )
        response.raise_for_status()

        if index % 50 == 0:
            print(f"{index} predictions complete")

            # prevent overloading the server
            time.sleep(0.5)

    print("Prediction generation complete.")
Example 4
def test_prediction_endpoint(api_endpoint, expected_no_predictions, client):
    # Given
    # Load the test dataset which is included in the model package
    test_inputs_df = load_dataset(file_name="test.csv")  # dataframe
    if api_endpoint == "v1/predictions/secondary":
        # adjust column names to those expected by the secondary model
        test_inputs_df.rename(columns=SECONDARY_VARIABLES_TO_RENAME,
                              inplace=True)

    # When
    response = client.post(api_endpoint,
                           json=test_inputs_df.to_dict(orient="records"))

    # Then
    assert response.status_code == 200
    data = json.loads(response.data)
    assert data["errors"] is None
    assert len(data["predictions"]) == expected_no_predictions
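
The api_endpoint and expected_no_predictions arguments suggest this test is parametrized. A minimal sketch of how the parameters might be supplied with pytest.mark.parametrize; the endpoint names come from the test body, but the expected prediction counts are illustrative assumptions:

import pytest


@pytest.mark.parametrize(
    "api_endpoint, expected_no_predictions",
    [
        # The two models validate inputs differently and so keep a slightly
        # different number of rows (counts below are illustrative).
        ("v1/predictions/primary", 100),
        ("v1/predictions/secondary", 95),
    ],
)
def test_prediction_endpoint(api_endpoint, expected_no_predictions, client):
    ...  # body as above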
Example 5
def test_prediction_validation(field, field_value, index, expected_error,
                               client):
    # Given
    # Load the test dataset which is included in the model package
    test_inputs_df = load_dataset(file_name="test.csv")  # dataframe

    # See HouseDataInputSchema in gradient_boosting_model.processing.validation
    # for the expected values of the inputs to the house price prediction model.
    # In this test, an input is changed to an invalid value to exercise the
    # validation.
    test_inputs_df.loc[index, field] = field_value

    # When
    response = client.post("/v1/predictions/primary",
                           json=test_inputs_df.to_dict(orient="records"))

    # Then
    assert response.status_code == 400
    data = json.loads(response.data)
    assert data == expected_error
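
HouseDataInputSchema is referenced in the comment above but not shown. A minimal sketch of what such a schema might look like, assuming pydantic-style validation; the fields listed are an illustrative subset, not the full schema:

from typing import Optional

from pydantic import BaseModel


class HouseDataInputSchema(BaseModel):
    # Illustrative subset of fields. A POSTed record whose value cannot be
    # coerced to the declared type is reported back as a validation error,
    # which is what the 400 response above asserts.
    FirstFlrSF: Optional[int]
    GrLivArea: Optional[int]
    LotArea: Optional[int]
    OverallQual: Optional[int]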
Example 6
def run_training() -> None:
    """Train the model."""

    # read training data
    data = load_dataset(file_name=config.app_config.training_data_file)

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.model_config.features],  # predictors
        data[config.model_config.target],
        test_size=config.model_config.test_size,
        # we are setting the random seed here
        # for reproducibility
        random_state=config.model_config.random_state,
    )

    pipeline.price_pipe.fit(X_train, y_train)

    _logger.warning(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.price_pipe)
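
save_pipeline persists the fitted pipeline but is not defined here. A minimal sketch, assuming the common joblib-based persistence pattern; the trained-models directory and file naming are assumptions:

from pathlib import Path

import joblib
from sklearn.pipeline import Pipeline

# Hypothetical location for persisted pipelines.
TRAINED_MODEL_DIR = Path(__file__).resolve().parent / "trained_models"


def save_pipeline(*, pipeline_to_persist: Pipeline) -> None:
    """Persist the fitted pipeline, versioned by the package version."""
    TRAINED_MODEL_DIR.mkdir(parents=True, exist_ok=True)
    save_path = TRAINED_MODEL_DIR / f"regression_model_{_version}.pkl"
    joblib.dump(pipeline_to_persist, save_path)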
def sample_input_data():
    """easy access to the test data, which is referenced in yaml"""
    return load_dataset(file_name=config.app_config.test_data_file)
def raw_training_data():
    # For larger datasets, we would use a testing sub-sample here.
    return load_dataset(file_name=config.app_config.training_data_file)
Example 9
def sample_input_data():
    return load_dataset(file_name=config.app_config.test_data_file)
Example 10
def test_inputs_df():
    # Load the gradient boosting test dataset which
    # is included in the model package
    test_inputs_df = load_dataset(file_name="test.csv")
    return test_inputs_df.copy(deep=True)
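
load_dataset appears throughout these examples but is never defined. A minimal sketch, assuming it reads a CSV bundled in a datasets directory inside the model package; the directory layout is an assumption:

from pathlib import Path

import pandas as pd

# Hypothetical location of the CSV files bundled with the model package.
DATASET_DIR = Path(__file__).resolve().parent / "datasets"


def load_dataset(*, file_name: str) -> pd.DataFrame:
    """Load a bundled CSV file into a DataFrame."""
    return pd.read_csv(DATASET_DIR / file_name)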