def run_training() -> None:
    """Fit the marathon pipeline on the training data and persist it."""
    _logger.info(f"training the pipeline with version: {_version}")

    # Read the training data.
    data = load_dataset(file_name=config.DATA_FILE)

    # Hold out 10% for testing; random_state pins the seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0,
    )

    pipeline.marathon_pipeline.fit(X_train[config.FEATURES], y_train)

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.marathon_pipeline)

    # Smoke-test the persisted model with a single-row prediction.
    print("########################################")
    print("Test prediction: ")
    test_data = load_dataset(file_name='test.csv')
    single_test_json = test_data[0:1].to_json(orient='records')
    subject = make_prediction(input_data=single_test_json)
    print(subject)
def test_prediction_endpoint_returns_prediction(flask_test_client):
    # Given: test data loaded from the regression_model package itself, so
    # the test data version cannot drift across packages.
    test_data = load_dataset(file_name=model_config.DATASET_FILE)
    X_train, X_test, y_train, y_test = train_test_split(
        test_data.drop(model_config.TARGET, axis=1),
        test_data[model_config.TARGET],
        test_size=0.2,
        random_state=0,  # fixed seed for reproducibility
    )
    post_json = X_test.to_json(orient='records')

    # When: POST the held-out rows to the prediction endpoint.
    response = flask_test_client.post('/v1/predict/regression',
                                      json=json.loads(post_json))

    # Then: status OK and sane performance.
    assert response.status_code == 200
    response_json = json.loads(response.data)
    prediction = response_json['predictions']
    assert prediction is not None
    assert accuracy_score(y_test, prediction) < 1

    # Versions reported by the API must match the model package version.
    response_version = response_json['version']
    assert response_version == _version

    # Rows rejected by validation appear in 'errors'; together with the
    # predictions they must account for every posted row.
    assert len(prediction) + len(response_json.get('errors')) == len(X_test)
def test_prediction_endpoint_returns_prediction(flask_test_client):
    # Given: a single row from the packaged testing data, as JSON records.
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    post_json = test_data[0:1].to_json(orient="records")

    # When
    response = flask_test_client.post("/v1/predict/regression",
                                      json=post_json)

    # Then
    assert response.status_code == 200
    payload = json.loads(response.data)
    prediction = payload["predictions"]
    response_version = payload["version"]
    # Ceil pins the expected float prediction to an exact integer.
    assert math.ceil(prediction[0]) == 12
    # Earlier releases returned version=None due to a serialization bug;
    # this assertion guards against a regression.
    assert response_version == _version
def make_prediction(input_data=None):
    """Make a prediction using a saved model pipeline.

    Args:
        input_data: Array of model prediction inputs. If omitted, the
            packaged 'test.csv' dataset is loaded instead (the original
            behavior, kept for backward compatibility).

    Returns:
        dict with 'predictions' (np.exp of the pipeline output) and
        'version' (the model package version).
    """
    # BUG FIX: the docstring documented an `input_data` argument, and every
    # caller in this module passes `input_data=...`, but the signature
    # accepted no parameters. It is now an optional parameter.
    if input_data is None:
        input_data = load_dataset(file_name='test.csv')
    data = pd.DataFrame(input_data)
    validated_data = validate_inputs(input_data=data)

    prediction = _price_pipe.predict(validated_data[config.FEATURES])
    # The model was trained on log-prices, so exponentiate back.
    # NOTE(review): inferred from np.exp here — confirm against training code.
    output = np.exp(prediction)

    results = {'predictions': output, 'version': _version}
    _logger.info(
        f'Making predictions with model version: {_version} '
        f'Inputs: {validated_data} '
        f'Predictions: {results}')
    return results
def test_model_prediction_differential(
        *, save_file: str = 'test_data_predictions.csv'):
    """
    This test compares the prediction result similarity of
    the current model with the previous model's results.
    """
    import math  # local import keeps this fix self-contained

    # Given: predictions saved by the previous model version.
    previous_model_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}')
    previous_model_predictions = previous_model_df.predictions.values

    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    multiple_test_input = test_data[99:600]

    # When
    current_result = make_prediction(input_data=multiple_test_input)
    current_model_predictions = current_result.get('predictions')

    # Then: diff the current model vs. the old model.
    # BUG FIX: the original loop converted values but asserted nothing, so
    # the test could never fail; the length check was also commented out.
    assert len(previous_model_predictions) == len(current_model_predictions)

    # Perform the differential test
    for previous_value, current_value in zip(previous_model_predictions,
                                             current_model_predictions):
        # convert numpy float64 to Python float.
        previous_value = previous_value.item()
        current_value = current_value.item()
        # rel_tol is the maximum allowed relative difference.
        # NOTE(review): tolerance mirrors the sibling differential test;
        # confirm ACCEPTABLE_MODEL_DIFFERENCE lives on this config object.
        assert math.isclose(previous_value, current_value,
                            rel_tol=config.ACCEPTABLE_MODEL_DIFFERENCE)
def run_training() -> None:
    """Train the end-to-end pipeline, report test metrics, and persist it."""
    # Read the training data.
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # Split off 10% as a held-out test set; fixed seed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0,
    )

    pipeline.end_to_end_pipeline.fit(X_train[config.FEATURES], y_train)
    pred = pipeline.end_to_end_pipeline.predict(X_test)

    # Report held-out metrics: MSE, RMSE and R^2 (MSE computed once).
    mse = mean_squared_error(y_test, pred)
    print("test mse: {}".format(int(mse)))
    print("test rmse: {}".format(int(np.sqrt(mse))))
    print("test r2: {}".format(r2_score(y_test, pred)))
    print(pipeline.end_to_end_pipeline.named_steps["Linear_model"].coef_)

    _version = "0.0.1"
    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.end_to_end_pipeline)
def test_model_prediction_differential(*, save_file="test_data_predictions.csv"):
    """Differential test: the current model's predictions must stay within
    tolerance of the previous model's saved predictions.

    Args:
        save_file: CSV of the previous model's predictions, resolved
            relative to the model package root.
    """
    # BUG FIX: the original immediately re-assigned save_file to a
    # hard-coded name, silently ignoring the caller's argument. The
    # keyword parameter is now honored (its default is unchanged).

    # Given
    previous_model_df = pd.read_csv(f'{model_config.PACKAGE_ROOT}/{save_file}')
    previous_model_predictions = previous_model_df.predictions.values

    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    multiple_test_json = test_data[99:600]

    # When
    response = make_prediction(input_data=multiple_test_json)
    current_model_predictions = response.get("predictions")

    # Then: same number of predictions, each within tolerance.
    assert len(previous_model_predictions) == len(current_model_predictions)

    # Perform the differential test
    for previous_value, current_value in zip(previous_model_predictions,
                                             current_model_predictions):
        # convert numpy float64 to python float
        previous_value = previous_value.item()
        current_value = current_value.item()
        # rel_tol is the relative tolerance - it is the maximum allowed
        # difference between the two model versions.
        assert math.isclose(previous_value, current_value,
                            rel_tol=config.ACCEPTABLE_MODEL_DIFFERENCE)
def test_make_single_prediction():
    # Given: the first row of the packaged test set.
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then: a float prediction with the known expected (ceiled) value.
    assert subject is not None
    predictions = subject.get('predictions')
    assert isinstance(predictions[0], float)
    assert math.ceil(predictions[0]) == 112476
def test_make_single_prediction():
    """Test a prediction is correct"""
    # Load the test file and slice off a single row.
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    # Make one prediction and inspect the result dict.
    subject = make_prediction(input_data=single_test_input)

    assert subject is not None
    first = subject.get('predictions')[0]
    assert isinstance(first, float)
    assert math.ceil(first) == 112476
def test_prediction_endpoint_validation_200(flask_test_client):
    # Given: the full training dataset serialized as JSON records.
    test_data = load_dataset(file_name=config.TRAINING_DATA_FILE)
    post_json = test_data.to_json(orient='records')

    # When
    response = flask_test_client.post('/v1/predict/regression',
                                      json=post_json)

    # Then: the endpoint accepts the payload.
    assert response.status_code == 200
    response_json = json.loads(response.data)
    # NOTE(review): the predictions+errors == len(test_data) check was left
    # commented out in the original and is deliberately not re-enabled here.
def test_make_single_prediction():
    # Given: one row sliced from the packaged test file.
    test_data = load_dataset(file_name='test.csv')
    sample = test_data[0:1]

    # When
    subject = make_prediction(input_data=sample)

    # Then: a non-empty result whose first prediction is a float.
    assert subject is not None
    assert isinstance(subject.get('predictions')[0], float)
def run_training():
    """Fit the price pipeline on log-transformed targets and persist it."""
    data = load_dataset(file_name=TRAINING_DATA_FILE)

    # 80/20 split with a fixed seed for reproducibility.
    X_train, X_test, Y_train, Y_test = train_test_split(
        data[FEATURES],
        data[TARGET],
        test_size=0.2,
        random_state=0,
    )

    # Log-transform the target before fitting.
    Y_train = np.log(Y_train)
    Y_test = np.log(Y_test)

    price_pipe.fit(X_train[FEATURES], Y_train)

    _logger.info(f"saving model version:{_version}")
    save_pipeline(pipeline_to_save=price_pipe)
def test_make_multiple_predictions():
    # Given: the entire packaged test set.
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)

    # When
    subject = make_prediction(input_data=test_data)

    # Then: 1451 predictions are expected — fewer than the raw input rows
    # (presumably some rows are dropped by validation; see make_prediction).
    assert subject is not None
    predictions = subject.get('predictions')
    assert len(predictions) == 1451
    assert len(predictions) != original_data_length
def pipeline_inputs():
    """Return a train/test split of the packaged testing dataset."""
    data = load_dataset(filename=config.TESTING_DATA_FILE)

    # Split size and seed come from config so callers stay reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
    )

    return X_train, X_test, y_train, y_test
def test_make_single_prediction():
    # Given
    test_data = load_dataset(file_name="test.csv")
    first_row = test_data[0:1]

    # When
    result = make_prediction(input_data=first_row)

    # Then: a float prediction with the known ceiled value.
    assert result is not None
    prediction = result.get("predictions")[0]
    assert isinstance(prediction, float)
    assert math.ceil(prediction) == 112476
def test_make_multiple_predictions():
    # Given: the full packaged test workbook.
    test_data = load_dataset(file_name='test.xlsx')
    original_data_length = len(test_data)

    # When
    subject = make_prediction(input_data=test_data)

    # Then: exactly one prediction per input row.
    assert subject is not None
    assert len(subject.get('predictions')) == original_data_length
def test_make_multiple_predictions():
    """Test a prediction is correct"""
    # Load the whole test file.
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)

    # Predict over the entire test set.
    subject = make_prediction(input_data=test_data)

    assert subject is not None
    predictions = subject.get('predictions')
    # 1451 rows expected: some input rows are filtered out, so the count
    # differs from the raw input length.
    assert len(predictions) == 1451
    assert len(predictions) != original_data_length
def test_prediction_endpoint_returns_prediction(test_client):
    """POSTing one row to the scoring endpoint returns a known prediction."""
    # Given
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    post_json = test_data[0:1].to_json(orient="records")

    # When (plain string literal: the original f-string had no placeholders)
    response = test_client.post("/api/v1/score", json=post_json)

    # Then
    assert response.status_code == 200
    response_json = json.loads(response.data)
    prediction = response_json["predictions"]
    response_version = response_json["version"]
    # BUG FIX: 'predictions' is a list (cf. the sibling endpoint test that
    # ceils prediction[0]); math.ceil(prediction) on a list raises TypeError.
    assert math.ceil(prediction[0]) == 112476
    assert response_version == _version
def test_make_single_prediction():
    # Given: the first row of the test set, serialized to JSON records.
    test_data = load_dataset(file_name="test.csv")
    single_test_json = test_data[0:1].to_json(orient='records')

    # When
    subject = make_prediction(input_data=single_test_json)

    # Then
    assert subject is not None
    first_prediction = subject.get('predictions')[0]
    assert isinstance(first_prediction, float)
    assert math.ceil(first_prediction) == 112476
def run_training() -> None:
    """Train the model"""
    # Read the full training dataset; no hold-out split is taken here.
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    X, y = data[config.FEATURES], data[config.TARGET]
    pipeline.fit(X, y)

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline)
def test_make_single_prediction():
    """A single-row prediction is a float within the expected band."""
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_json = test_data[0:1].to_json(orient='records')
    # FIX: removed a stray debug print("hola") left over from development.

    # When
    subject = make_prediction(input_data=single_test_json)

    # Then
    assert subject is not None
    prediction = subject.get('predictions')[0]
    assert isinstance(prediction, float)
    # Expected prediction lies in the [3.6, 3.7] band.
    assert prediction >= 3.6 and prediction <= 3.7
def test_prediction_endpoint_returns_prediction(flask_test_client):
    # Given
    # Load the test data from the regression_model package.
    # This is important as it makes it harder for the test
    # data versions to get confused by not spreading it
    # across packages.
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    post_json = test_data[0:1].to_json(orient='records')

    # When
    # BUG FIX: removed an unresolved '<<<<<<< HEAD' merge-conflict marker
    # that made this function a syntax error.
    response = flask_test_client.post('/v1/predict/regression',
                                      json=json.loads(post_json))

    # Then
    # NOTE(review): the conflict appears to have truncated the original
    # assertions; a minimal status check is restored here.
    assert response.status_code == 200
def test_make_multiple_prediction():
    # Given: the whole test set serialized as JSON records.
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)
    multiple_test_json = test_data.to_json(orient='records')

    # When
    subject = make_prediction(input_data=multiple_test_json)

    # Then: 1451 predictions — fewer than the raw input row count.
    assert subject is not None
    predictions = subject.get('predictions')
    assert len(predictions) == 1451
    assert len(predictions) != original_data_length
def test_make_single_prediction():
    # Given: a single-row slice of the test file (echoed for debugging).
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]
    print(single_test_input)

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then: this model's predictions are integer-typed (np.int64).
    assert subject is not None
    assert isinstance(subject.get('predictions')[0], np.int64)
def test_health_prediction_endpoint_returns_200(flask_test_client):
    """GET on the prediction endpoint returns 200 with a valid payload."""
    # Given
    # NOTE(review): this payload is prepared but never posted (the request
    # below is a GET) — kept as-is pending confirmation of intent.
    test_data = load_dataset(filename=model_config.TEST_DATA_FILE)
    post_json = test_data.to_json(orient='records')

    # When
    # BUG FIXES: variable consistently named 'response' (was 'resonse'),
    # and json.loads() used on the raw response bytes — json.load()
    # expects a file-like object and would raise on `.data`.
    response = flask_test_client.get('/v1/predict/regression')

    # Then
    assert response.status_code == 200
    response_data = json.loads(response.data)
    prediction = response_data['predictions']
    response_version = response_data['version']
    assert math.ceil(prediction) == 9
    assert response_version == _version
def test_prediction_endpoint_validation_200(flask_test_client):
    """Every POSTed row is accounted for as a prediction or an error."""
    # Given
    data = load_dataset(TESTING_DATA_FILE)
    data_json = data.to_json(orient='records')

    # When
    # BUG FIX: the test client must be invoked via .post(); calling the
    # client object directly with a URL is not a request.
    response = flask_test_client.post('/v1/prediction/regression',
                                      json=json.loads(data_json))

    # Then
    assert response.status_code == 200
    response_json = json.loads(response.data)
    # BUG FIX: compare against the number of input rows (len(data)), not
    # the character count of the serialized JSON string (len(data_json)).
    assert len(response_json.get('predictions')) + len(
        response_json.get('errors')) == len(data)
def make_single_prediction():
    """Smoke-test: one-row prediction from 'test_one.csv' ceils to 9."""
    # Given: the first row of the single-row test file.
    test_data_file = 'test_one.csv'
    df = load_dataset(filename=test_data_file)
    df = df[0:1]

    # When
    output = make_prediction(input_data=df)

    # Then
    # CLEANUP: removed leftover debug prints (row dump, booking_date
    # column, raw output) and the commented-out isinstance assertion.
    assert output is not None
    assert math.ceil(output['predictions'][0]) == 9
def test_make_multiple_predictions():
    # Given: every row of the packaged test file.
    test_data = load_dataset(file_name='test.csv')
    original_data_length = test_data.shape[0]

    # When
    subject = make_prediction(input_data=test_data)

    # Then: 1451 predictions, not one per raw input row.
    assert subject is not None
    predictions = subject.get('predictions')
    assert len(predictions) == 1451
    assert len(predictions) != original_data_length
def test_make_single_prediction():
    # Given: a single row selected from the csv.
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then: the result dict holds float predictions with a known value.
    assert subject is not None
    first = subject.get('predictions')[0]
    assert isinstance(first, float)
    assert math.ceil(first) == 112476
def test_make_multiple_predictions():
    """Predicting over the full test set returns the expected row count."""
    # Given
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)

    # When
    subject = make_prediction(input_data=test_data)

    # Then — some rows are expected to be filtered out, hence 1451.
    assert subject is not None
    result = subject.get('predictions')
    assert len(result) == 1451
    assert len(result) != original_data_length