def run_training() -> None:
    """Train the model."""
    _logger.info(f"training the pipeline with version: {_version}")
    # read training data
    data = load_dataset(file_name=config.DATA_FILE)

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES], data[config.TARGET], test_size=0.1, random_state=0
    )  # we are setting the seed here

    pipeline.marathon_pipeline.fit(X_train[config.FEATURES], y_train)

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.marathon_pipeline)

    print("########################################")
    print("Test prediction: ")

    test_data = load_dataset(file_name='test.csv')
    single_test_json = test_data[0:1].to_json(orient='records')

    # When
    subject = make_prediction(input_data=single_test_json)
    print(subject)
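The save_pipeline implementation is not shown here; below is a minimal sketch of what such a persistence helper typically looks like, assuming joblib and a trained_models directory (both assumptions, not confirmed by the snippet above).

import joblib

def save_pipeline_sketch(*, pipeline_to_persist) -> None:
    # Hypothetical sketch: persist the fitted sklearn pipeline with joblib.
    # The trained_models/ directory and the versioned file name are assumed
    # conventions, not taken from the snippet above.
    save_path = f"trained_models/marathon_pipeline_v{_version}.pkl"
    joblib.dump(pipeline_to_persist, save_path)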
def test_prediction_endpoint_returns_prediction(flask_test_client):
    # Load the test data from the regression_model package.
    # This is important: keeping the test data inside the package makes it
    # harder for test data versions to drift apart across packages.
    test_data = load_dataset(file_name=model_config.DATASET_FILE)

    X_train, X_test, y_train, y_test = train_test_split(
        test_data.drop(model_config.TARGET, axis=1),
        test_data[model_config.TARGET],
        test_size=0.2,
        random_state=0)  # we are setting the seed here
    post_json = X_test.to_json(orient='records')

    response = flask_test_client.post('/v1/predict/regression',
                                      json=json.loads(post_json))

    # Check status code is OK
    assert response.status_code == 200

    # Check performance, accuracy.
    response_json = json.loads(response.data)
    prediction = response_json['predictions']
    assert prediction is not None
    assert accuracy_score(y_test, prediction) < 1

    # Check versions match
    response_version = response_json['version']
    assert response_version == _version

    # Check that rows filtered out by validation are reported as errors
    assert len(prediction) + len(response_json.get('errors')) == len(X_test)
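The final assertion depends on the API filtering out invalid rows and reporting them under 'errors'. A hedged sketch of that contract (validate_inputs_sketch and required_fields are illustrative names, not the package's real API):

import pandas as pd

def validate_inputs_sketch(input_data: pd.DataFrame, required_fields) -> tuple:
    # Drop rows with missing required values and record why each was dropped,
    # so that len(kept rows) + len(errors) == len(input rows).
    validated = input_data.dropna(subset=list(required_fields))
    errors = {int(idx): 'missing required value'
              for idx in input_data.index.difference(validated.index)}
    return validated, errors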
Example #3
def test_prediction_endpoint_returns_prediction(flask_test_client):
    # Given
    # Load the test data from the regression_model package.
    # This is important: keeping the test data inside the package makes it
    # harder for test data versions to drift apart across packages.

    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    post_json = test_data[0:1].to_json(orient="records")

    # When
    response = flask_test_client.post("/v1/predict/regression", json=post_json)

    # Then
    assert response.status_code == 200
    response_json = json.loads(response.data)
    prediction = response_json["predictions"]
    response_version = response_json["version"]
    assert math.ceil(prediction[0]) == 12
    assert response_version == _version
    # Note: an earlier version of the API returned
    # {"predictions": 11.84092..., "version": None} because null objects were
    # created while converting between data types; the version field has been
    # fixed in later releases.

Example #4
def make_prediction(*, input_data) -> dict:
    """Make a prediction using a saved model pipeline.

    Args:
        input_data: Array of model prediction inputs.

    Returns:
        Predictions for each input row, as well as the model version.
    """
    data = pd.DataFrame(input_data)
    validated_data = validate_inputs(input_data=data)

    prediction = _price_pipe.predict(validated_data[config.FEATURES])

    output = np.exp(prediction)

    results = {'predictions': output, 'version': _version}

    _logger.info(
        f'Making predictions with model version: {_version} '
        f'Inputs: {validated_data} '
        f'Predictions: {results}')

    return results
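A minimal usage sketch, reusing the load_dataset helper and the 'test.csv' file that the tests below rely on (both assumed to be available in this package):

# Hypothetical usage, mirroring the single-prediction tests further down.
test_data = load_dataset(file_name='test.csv')
single_row = test_data[0:1]  # one-row DataFrame

result = make_prediction(input_data=single_row)
print(result['version'], result['predictions'][0])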
def test_model_prediction_differential(
        *, save_file: str = 'test_data_predictions.csv'):
    """
    This test compares the prediction result similarity of
    the current model with the previous model's results.
    """

    # Given
    # Load the saved previous model predictions
    previous_model_df = pd.read_csv(f'{config.PACKAGE_ROOT}/{save_file}')
    previous_model_predictions = previous_model_df.predictions.values

    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    multiple_test_input = test_data[99:600]

    # When
    current_result = make_prediction(input_data=multiple_test_input)
    current_model_predictions = current_result.get('predictions')

    # Then
    # diff the current model vs. the old model
    #assert len(previous_model_predictions) == len(current_model_predictions)

    # Perform the differential test
    for previous_value, current_value in zip(previous_model_predictions,
                                             current_model_predictions):

        # convert numpy float64 to Python float.
        previous_value = previous_value.item()
        current_value = current_value.item()

        # rel_tol is the maximum allowed relative difference between the values
        assert math.isclose(previous_value,
                            current_value,
                            rel_tol=config.ACCEPTABLE_MODEL_DIFFERENCE)
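This differential test assumes a baseline file of previous-model predictions already exists. A minimal capture sketch for producing it (capture_predictions is a hypothetical helper; the single-column CSV layout is inferred from previous_model_df.predictions above):

import pandas as pd

def capture_predictions(*, save_file: str = 'test_data_predictions.csv') -> None:
    # Hypothetical helper: save the current model's predictions so a later
    # run of the differential test can compare against them.
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    result = make_prediction(input_data=test_data[99:600])
    df = pd.DataFrame({'predictions': result.get('predictions')})
    df.to_csv(f'{config.PACKAGE_ROOT}/{save_file}', index=False)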
def run_training() -> None:
    """Train the model."""

    # read training data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # divide train and test
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=0)  # we are setting the seed here

    pipeline.end_to_end_pipeline.fit(X_train[config.FEATURES], y_train)

    pred = pipeline.end_to_end_pipeline.predict(X_test)

    # determine mse, rmse and r2 on the held-out test set
    print("test mse: {}".format(int(mean_squared_error(y_test, pred))))
    print("test rmse: {}".format(
        int(np.sqrt(mean_squared_error(y_test, pred)))))
    print("test r2: {}".format(r2_score(y_test, pred)))
    print(pipeline.end_to_end_pipeline.named_steps["Linear_model"].coef_)

    _version = "0.0.1"
    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline.end_to_end_pipeline)
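As a sanity check on the metrics printed above, rmse is just the square root of mse; a tiny self-contained example with made-up numbers:

import numpy as np

# Worked example: mse and rmse for four illustrative predictions.
y_true = np.array([3.0, -0.5, 2.0, 7.0])
y_pred = np.array([2.5, 0.0, 2.0, 8.0])
mse = np.mean((y_true - y_pred) ** 2)  # (0.25 + 0.25 + 0 + 1) / 4 = 0.375
rmse = np.sqrt(mse)                    # sqrt(0.375) ~= 0.612
print(mse, rmse)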
def test_model_prediction_differential(*, save_file="test_data_predictions.csv"):
    """
    This test compares the prediction result similarity of the
    current model with the previous model's results.
    """
    # Given
    # Load the saved previous model predictions
    previous_model_df = pd.read_csv(f'{model_config.PACKAGE_ROOT}/{save_file}')
    previous_model_predictions = previous_model_df.predictions.values
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    multiple_test_json = test_data[99:600]

    # When 
    response = make_prediction(input_data=multiple_test_json)
    current_model_predictions = response.get("predictions")

    assert len(previous_model_predictions) == len(current_model_predictions)

    # Perform the differential test
    for previous_value, current_value in zip(previous_model_predictions,
                                             current_model_predictions):

        # convert numpy float64 to python float
        previous_value = previous_value.item()
        current_value = current_value.item()

        # rel_tol is the relative tolerance: the maximum allowed difference
        # between the two values, relative to the larger of them
        assert math.isclose(previous_value,
                            current_value,
                            rel_tol=config.ACCEPTABLE_MODEL_DIFFERENCE)
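For reference, math.isclose(a, b, rel_tol=t) passes when abs(a - b) <= t * max(abs(a), abs(b)) (with the default abs_tol of 0); a small self-contained check, where 0.05 is an illustrative tolerance rather than the package's actual ACCEPTABLE_MODEL_DIFFERENCE:

import math

# Illustrative only: 0.05 stands in for config.ACCEPTABLE_MODEL_DIFFERENCE.
assert math.isclose(100.0, 104.0, rel_tol=0.05)      # 4% apart: passes
assert not math.isclose(100.0, 106.0, rel_tol=0.05)  # 6% apart: fails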
def test_make_single_prediction():
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    subject = make_prediction(input_data=single_test_input)

    assert subject is not None
    assert isinstance(subject.get('predictions')[0], float)
    assert math.ceil(subject.get('predictions')[0]) == 112476
def test_make_single_prediction():
    """Test a prediction is correct"""
    # load test file
    test_data = load_dataset(file_name='test.csv')
    # make a single prediction
    single_test_input = test_data[0:1]
    subject = make_prediction(input_data=single_test_input)
    assert subject is not None
    assert isinstance(subject.get('predictions')[0], float)
    assert math.ceil(subject.get('predictions')[0]) == 112476
def test_prediction_endpoint_validation_200(flask_test_client):

    test_data = load_dataset(file_name=config.TRAINING_DATA_FILE)
    post_json = test_data.to_json(orient='records')

    response = flask_test_client.post('/v1/predict/regression', json=post_json)

    assert response.status_code == 200
    response_json = json.loads(response.data)

    # assert len(response_json.get('predictions')) + len(response_json.get('errors')) == len(test_data)
def test_make_single_prediction():
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then
    assert subject is not None
    assert isinstance(subject.get('predictions')[0], float)
Example #12
def run_training():

    data = load_dataset(file_name=TRAINING_DATA_FILE)

    X_train, X_test, Y_train, Y_test = train_test_split(
        data[FEATURES], data[TARGET], test_size=0.2, random_state=0)

    # log-transform the target before fitting
    Y_train = np.log(Y_train)
    Y_test = np.log(Y_test)

    price_pipe.fit(X_train[FEATURES], Y_train)

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_save=price_pipe)
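Because price_pipe is fitted against the log of the target, its raw predictions live in log-space; the make_prediction example earlier inverts this with np.exp. A one-line round-trip check of that relationship:

import numpy as np

# log/exp round-trip: exp undoes the log transform applied to the target.
price = 112476.0
assert np.isclose(np.exp(np.log(price)), price)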
def test_make_multiple_predictions():
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)
    multiple_test_input = test_data

    subject = make_prediction(input_data=multiple_test_input)

    assert subject is not None
    assert len(subject.get('predictions')) == 1451

    assert len(subject.get('predictions')) != original_data_length
def pipeline_inputs():
    data = load_dataset(filename=config.TESTING_DATA_FILE)

    # train/test split
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.FEATURES],
        data[config.TARGET],
        test_size=config.TEST_SIZE,
        random_state=config.RANDOM_STATE,
    )
    return X_train, X_test, y_train, y_test
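pipeline_inputs reads like a pytest fixture body. A hedged sketch of how it would typically be registered and consumed, assuming the function above is a plain helper (the decorator and the consuming test are not part of the original):

import pytest

@pytest.fixture(name="pipeline_inputs")
def pipeline_inputs_fixture():
    # Hypothetical registration: expose the helper above as a pytest fixture.
    return pipeline_inputs()

def test_split_shapes(pipeline_inputs):
    # pytest injects the fixture's return value by parameter name.
    X_train, X_test, y_train, y_test = pipeline_inputs
    assert len(X_train) > 0 and len(X_test) > 0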
Example #15
def test_make_single_prediction():
    # Given
    test_data = load_dataset(file_name="test.csv")
    single_test_input = test_data[0:1]

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then
    assert subject is not None
    assert isinstance(subject.get("predictions")[0], float)
    assert math.ceil(subject.get("predictions")[0]) == 112476
def test_make_multiple_predictions():
    # Given
    test_data = load_dataset(file_name='test.xlsx')
    original_data_length = len(test_data)
    multiple_test_input = test_data

    # When
    subject = make_prediction(input_data=multiple_test_input)

    # Then
    assert subject is not None
    assert len(subject.get('predictions')) == original_data_length
def test_make_multiple_predictions():
    """Test a prediction is correct"""
    # load test file
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)
    # make a prediction for the whole test set
    multiple_test_input = test_data
    subject = make_prediction(input_data=multiple_test_input)
    assert subject is not None
    assert len(subject.get('predictions')) == 1451
    # Some rows are expected to be filtered out
    assert len(subject.get('predictions')) != original_data_length
Example #18
def test_prediction_endpoint_returns_prediction(test_client):
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    post_json = test_data[0:1].to_json(orient="records")

    response = test_client.post(f"/api/v1/score", json=post_json)

    assert response.status_code == 200
    response_json = json.loads(response.data)
    prediction = response_json["predictions"]
    response_version = response_json["version"]
    assert math.ceil(prediction) == 112476
    assert response_version == _version
Example #19
def test_make_single_prediction():
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_json = test_data[0:1].to_json(orient='records')

    # When
    subject = make_prediction(input_data=single_test_json)

    # Then
    assert subject is not None
    assert isinstance(subject.get('predictions')[0], float)
    assert math.ceil(subject.get('predictions')[0]) == 112476
def run_training() -> None:
    """Train the model"""

    data = load_dataset(file_name=config.TRAINING_DATA_FILE)

    # divide train and test
    X, y = data[config.FEATURES], data[config.TARGET]

    pipeline.fit(X, y)

    _logger.info(f"saving model version: {_version}")
    save_pipeline(pipeline_to_persist=pipeline)
def test_make_single_prediction():
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_json = test_data[0:1].to_json(orient='records')

    # When
    subject = make_prediction(input_data=single_test_json)

    # Then
    assert subject is not None
    assert isinstance(subject.get('predictions')[0], float)
    assert 3.6 <= subject.get('predictions')[0] <= 3.7
def test_prediction_endpoint_returns_prediction(flask_test_client):
    # Given
    # Load the test data from the regression_model package.
    # This is important: keeping the test data inside the package makes it
    # harder for test data versions to drift apart across packages.
    test_data = load_dataset(file_name=model_config.TESTING_DATA_FILE)
    post_json = test_data[0:1].to_json(orient='records')

    # When
    response = flask_test_client.post('/v1/predict/regression',
                                      json=json.loads(post_json))

    # Then
    assert response.status_code == 200
Example #23
def test_make_multiple_prediction():
    # Given
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)
    multiple_test_json = test_data.to_json(orient='records')

    # When
    subject = make_prediction(input_data=multiple_test_json)

    # Then
    assert subject is not None
    assert len(subject.get('predictions')) == 1451
    assert len(subject.get('predictions')) != original_data_length
def test_make_single_prediction():
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]

    print(single_test_input)
    # When
    subject = make_prediction(input_data=single_test_input)

    # Then
    assert subject is not None

    assert isinstance(subject.get('predictions')[0], np.int64)
def test_health_prediction_endpoint_returns_200(flask_test_client):
    test_data = load_dataset(filename=model_config.TEST_DATA_FILE)
    post_json = test_data.to_json(orient='records')

    # When
    response = flask_test_client.get('/v1/predict/regression')

    # Then
    assert response.status_code == 200
    response_data = json.loads(response.data)
    prediction = response_data['predictions']
    response_version = response_data['version']
    assert math.ceil(prediction) == 9
    assert response_version == _version
def test_prediction_endpoint_validation_200(flask_test_client):

    data = load_dataset(TESTING_DATA_FILE)
    data_json = data.to_json(orient='records')
    response = flask_test_client.post('/v1/prediction/regression',
                                      json=json.loads(data_json))

    assert response.status_code == 200

    response_json = json.loads(response.data)

    assert len(response_json.get('predictions')) + len(
        response_json.get('errors')) == len(data)
Example #27
def make_single_prediction():
    test_data_file = 'test_one.csv'
    df = load_dataset(filename=test_data_file)
    print('test data rows:', len(df))
    df = df[0:1]
    print(df['booking_date'])
    output = make_prediction(input_data=df)

    print(output)
    assert output is not None
    # assert isinstance(output['predictions'][0],float)
    print(math.ceil(output['predictions'][0]))
    assert math.ceil(output['predictions'][0]) == 9
Example #28
def test_make_multiple_predictions():
    # Given
    test_data = load_dataset(file_name='test.csv')
    original_data_length = test_data.shape[0]
    multiple_test_json = test_data

    # When
    subject = make_prediction(input_data=multiple_test_json)

    # Then
    assert subject is not None
    assert len(subject.get('predictions')) == 1451

    assert len(subject.get("predictions")) != original_data_length
Example #29
def test_make_single_prediction():
    # Given
    test_data = load_dataset(file_name='test.csv')
    single_test_input = test_data[0:1]  # select a single row from the CSV

    # When
    subject = make_prediction(input_data=single_test_input)

    # Then
    assert subject is not None
    assert isinstance(
        subject.get('predictions')[0], float
    )  # predictions come back in a dictionary; the first entry should be a float
    assert math.ceil(subject.get('predictions')[0]) == 112476
def test_make_multiple_predictions():
    # Given
    test_data = load_dataset(file_name='test.csv')
    original_data_length = len(test_data)
    multiple_test_input = test_data
    # When
    subject = make_prediction(input_data=multiple_test_input)

    # Then
    assert subject is not None
    assert len(subject.get('predictions')) == 1451

    # We expect some rows to be filtered out
    assert len(subject.get('predictions')) != original_data_length