def test_model_prediction_differentials(client):
    """Differential test: primary and secondary models should roughly agree."""
    inputs_df = load_dataset(file_name="test.csv")
    secondary_inputs_df = inputs_df.rename(columns=SECONDARY_VARIABLES_TO_RENAME)

    primary_response = client.post(
        "v1/predictions/primary", json=inputs_df.to_dict(orient="records")
    )
    primary_predictions = json.loads(primary_response.data)["predictions"]

    secondary_response = client.post(
        "v1/predictions/secondary",
        json=secondary_inputs_df.to_dict(orient="records"),
    )
    secondary_predictions = json.loads(secondary_response.data)["predictions"]

    # Only the first 10 rows are compared: the two models validate inputs
    # differently and therefore filter out slightly different numbers of
    # rows, which would mis-align predictions and fail the differential
    # test spuriously.
    compare_differences(
        expected_predictions=secondary_predictions[:10],
        actual_predictions=primary_predictions[:10],
        # you would adjust the rel_tol level parameter on your model.
        # right now this is extremely permissive of variation.
        rel_tol=0.2,
    )
def pipeline_inputs():
    """Return (X_train, X_test, y_train, y_test) from the training dataset."""
    # For larger datasets, here we would use a testing sub-sample.
    dataset = load_dataset(file_name=config.app_config.training_data_file)

    predictors = dataset[config.model_config.features]
    target = dataset[config.model_config.target]

    # Fixed random seed keeps the split reproducible across test runs.
    return train_test_split(
        predictors,
        target,
        test_size=config.model_config.test_size,
        random_state=config.model_config.random_state,
    )
def populate_database(n_predictions: int = 500, anomaly: bool = False) -> None:
    """
    Manipulate the test data to generate random predictions and save them
    to the database.

    Posts one prediction request per row of the cleaned test dataset, up to
    ``n_predictions`` rows. When ``anomaly`` is True, a single row with
    extremely low feature values is posted to create an outlier.

    Before running this script, ensure that the API and Database docker
    containers are running.
    """
    print(f"Preparing to generate: {n_predictions} predictions.")

    # Load the gradient boosting test dataset which
    # is included in the model package
    test_inputs_df = load_dataset(file_name="test.csv")
    clean_inputs_df = _prepare_inputs(dataframe=test_inputs_df)
    if len(clean_inputs_df) < n_predictions:
        # Bug fix: trailing space added so the two implicitly-concatenated
        # string literals no longer render as "need toextend".
        print(
            f"If you want {n_predictions} predictions, you need to "
            "extend the script to handle more predictions."
        )

    if anomaly:
        # set extremely low values to generate an outlier
        n_predictions = 1
        clean_inputs_df.loc[:, "FirstFlrSF"] = 1
        clean_inputs_df.loc[:, "LotArea"] = 1
        clean_inputs_df.loc[:, "OverallQual"] = 1
        clean_inputs_df.loc[:, "GrLivArea"] = 1

    for index, data in clean_inputs_df.iterrows():
        # Bug fix: was `index > n_predictions`, which posted one extra row
        # (n_predictions + 1 requests total) and, in anomaly mode, posted
        # 2 rows while claiming "Created 1 anomaly". `>=` stops after
        # exactly n_predictions rows (assuming a 0-based RangeIndex).
        if index >= n_predictions:
            if anomaly:
                print('Created 1 anomaly')
            break
        response = requests.post(
            f"{LOCAL_URL}/v1/predictions/regression",
            headers=HEADERS,
            json=[data.to_dict()],
        )
        response.raise_for_status()

        if index % 50 == 0:
            print(f"{index} predictions complete")

        # prevent overloading the server
        time.sleep(0.5)

    print("Prediction generation complete.")
def test_prediction_endpoint(api_endpoint, expected_no_predictions, client):
    """Posting the packaged test set returns the expected number of predictions."""
    # Given
    # Load the test dataset which is included in the model package
    inputs_df = load_dataset(file_name="test.csv")  # dataframe
    if api_endpoint == "v1/predictions/secondary":
        # the secondary model expects different column names
        inputs_df = inputs_df.rename(columns=SECONDARY_VARIABLES_TO_RENAME)

    # When
    payload = inputs_df.to_dict(orient="records")
    response = client.post(api_endpoint, json=payload)

    # Then
    assert response.status_code == 200
    body = json.loads(response.data)
    assert body["errors"] is None
    assert len(body["predictions"]) == expected_no_predictions
def test_prediction_validation(field, field_value, index, expected_error, client):
    """A deliberately invalid field value yields a 400 with the expected error."""
    # Given
    # Load the test dataset which is included in the model package.
    # See gradient_boosting_model.processing.validation HouseDataInputSchema
    # for the expected values for the inputs to the house price prediction
    # model; here one input is corrupted to exercise the validation.
    inputs_df = load_dataset(file_name="test.csv")  # dataframe
    inputs_df.loc[index, field] = field_value

    # When
    response = client.post(
        "/v1/predictions/primary", json=inputs_df.to_dict(orient="records")
    )

    # Then
    assert response.status_code == 400
    assert json.loads(response.data) == expected_error
def run_training() -> None:
    """Train the model."""
    # read training data
    training_data = load_dataset(file_name=config.app_config.training_data_file)

    # divide train and test; the fixed random seed makes the split reproducible
    X_train, X_test, y_train, y_test = train_test_split(
        training_data[config.model_config.features],  # predictors
        training_data[config.model_config.target],
        test_size=config.model_config.test_size,
        random_state=config.model_config.random_state,
    )

    pipeline.price_pipe.fit(X_train, y_train)
    _logger.warning(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.price_pipe)
def sample_input_data():
    """easy access to the test data, which is referenced in yaml"""
    test_file = config.app_config.test_data_file
    return load_dataset(file_name=test_file)
def raw_training_data():
    """Full training dataset.

    For larger datasets, here we would use a testing sub-sample.
    """
    training_file = config.app_config.training_data_file
    return load_dataset(file_name=training_file)
def sample_input_data():
    """Load the configured test dataset for use as sample model input."""
    test_file = config.app_config.test_data_file
    return load_dataset(file_name=test_file)
def test_inputs_df():
    """Deep copy of the gradient boosting test dataset shipped with the package."""
    # A deep copy isolates each test from mutations made by other tests.
    raw_df = load_dataset(file_name="test.csv")
    return raw_df.copy(deep=True)