def test_make_single_prediction():
    """Smoke-test ``make_prediction`` on a single held-out record.

    Reads ``config.TRAINING_DATA_FILE``, applies ``pp.DropDuplicateValues``,
    ``pp.DropMissingValues`` and ``pp.CreatePolarityFeature`` in that order,
    splits off 30% of the rows with a fixed seed, and checks that predicting
    on the ``[0:1]`` slice of the held-out features (serialised as JSON
    records) returns something other than ``None``.
    """
    frame = pd.read_csv(config.TRAINING_DATA_FILE)

    # Each transformer is instantiated and applied to the output of the
    # previous one, in the same order as the training code.
    for transformer_cls in (
            pp.DropDuplicateValues,
            pp.DropMissingValues,
            pp.CreatePolarityFeature):
        frame = transformer_cls().fit_transform(frame)

    features = frame[config.INPUT_FEATURES]
    target = frame[config.TARGET]
    # Only the held-out features are used below; the other splits are
    # discarded.
    _, held_out, _, _ = train_test_split(
        features, target, test_size=0.3, random_state=42)

    # Given
    payload = held_out[0:1].to_json(orient='records')

    # When
    subject = make_prediction(input_data=payload)

    # Then
    assert subject is not None
def run_training(sample_size=100000, random_state=42):
    """Train the model and persist the fitted pipeline.

    Reads ``config.TRAINING_DATA_FILE``, draws a random sample of rows,
    applies ``pp.DropDuplicateValues``, ``pp.DropMissingValues`` and
    ``pp.CreatePolarityFeature`` in that order, fits
    ``pipeline.review_pipe`` on the training part of a 90/10 split, and
    dumps the fitted pipeline to ``config.PIPELINE_NAME`` with joblib.

    Args:
        sample_size: Maximum number of rows to sample from the training
            file before preprocessing. It is capped at the number of rows
            actually available, so a small dataset no longer raises.
        random_state: Seed used for both the row sampling and the
            train/test split, so repeated runs train on the same rows.
    """
    # read training data
    data = pd.read_csv(config.TRAINING_DATA_FILE)

    # DataFrame.sample (without replacement) raises ValueError when asked
    # for more rows than exist, so cap the request at the dataset size.
    # Seeding the sample keeps the whole run reproducible, not just the
    # split below.
    n_rows = min(sample_size, len(data))
    data = data.sample(n=n_rows, random_state=random_state)

    drop_duplicate = pp.DropDuplicateValues()
    data = drop_duplicate.fit_transform(data)

    drop_missing = pp.DropMissingValues()
    data = drop_missing.fit_transform(data)

    create_polarity = pp.CreatePolarityFeature()
    data = create_polarity.fit_transform(data)

    # divide train and test; the held-out 10% is not used here
    X_train, X_test, y_train, y_test = train_test_split(
        data[config.INPUT_FEATURES],
        data[config.TARGET],
        test_size=0.1,
        random_state=random_state)  # we are setting the seed here

    pipeline.review_pipe.fit(X_train[config.INPUT_FEATURES], y_train)
    joblib.dump(pipeline.review_pipe, config.PIPELINE_NAME)
# Example no. 3
# 0
    results = _pipe_price.predict(input_data)

    return results


if __name__ == '__main__':

    # Manual end-to-end check of the prediction pipeline.

    data = pd.read_csv(config.TRAINING_DATA_FILE)

    # Apply the preprocessing transformers one after another, each on the
    # output of the previous one.
    for step in (
            pp.DropDuplicateValues,
            pp.DropMissingValues,
            pp.CreatePolarityFeature):
        data = step().fit_transform(data)

    features = data[config.INPUT_FEATURES]
    target = data[config.TARGET]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.1, random_state=42)

    pred = make_prediction(X_test)

    # Report the precision score of the predictions on the held-out split.
    score = precision_score(y_test, pred)
    print('test precision_score: {}'.format(score))