def test_single_prediction(trained_model):
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)
    # get the first row of the DataFrame
    single_test_input = test_data.iloc[0:1]

    pred = make_prediction(input_data=single_test_input)

    assert pred is not None
    # assert isinstance(pred.get('predictions')[0], float)
    assert pred.get('predictions')[0] == pytest.approx(0.0285, abs=1e-3)
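# `make_prediction` is imported from the package's predict module and is not
# shown here. A minimal sketch of the shape the test above assumes -- a dict
# keyed by 'predictions' -- built from the helpers used elsewhere in this
# section (config, _version, load_pipeline). The real implementation may
# validate inputs and return more metadata.
def make_prediction(*, input_data):
    pipeline_file_name = f'{config.PIPELINE_SAVE_FILE}{_version}.pkl'
    pipeline = load_pipeline(file_name=pipeline_file_name)
    predictions = pipeline.predict(input_data)
    return {'predictions': predictions, 'version': _version}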
def run_testing(file_name=config.TESTING_DATA_FILE) -> Tuple[float, float]:
    """Run testing using held-out data."""
    test_data = load_dataset(file_name=file_name)

    pipeline_file_name = f'{config.PIPELINE_SAVE_FILE}{_version}.pkl'
    curr_model = load_pipeline(file_name=pipeline_file_name)

    test_mape, test_99per = get_accuracy(curr_model, test_data, test_data['wall_time'])
    logger.info(f'Testing Mean absolute % error: {test_mape}')
    logger.info(f'Testing 99th Percentile % error: {test_99per}')
    return test_mape, test_99per
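# `get_accuracy` is defined elsewhere. A sketch consistent with how it is
# called in this section -- (model, features, targets) in, (MAPE, 99th
# percentile of the relative error) out, both in percent. The units and any
# handling of zero-valued targets are assumptions.
import numpy as np

def get_accuracy(model, X, y_true):
    y_pred = np.ravel(model.predict(X))
    y_true = np.asarray(y_true, dtype=float)
    rel_err = np.abs((y_true - y_pred) / y_true) * 100  # percent relative error
    return rel_err.mean(), np.percentile(rel_err, 99)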
def test_multiple_predictions(trained_model):
    test_data = load_dataset(file_name=config.TESTING_DATA_FILE)

    pred = make_prediction(input_data=test_data)
    assert pred is not None
    assert len(pred.get('predictions')) == test_data.shape[0]

    pipeline_file_name = f'{config.PIPELINE_SAVE_FILE}{_version}.pkl'
    curr_model = load_pipeline(file_name=pipeline_file_name)
    test_mape, percentile_99 = get_accuracy(curr_model, test_data, test_data['wall_time'])
    print(f'Test MAPE score: {test_mape}, 99th Percentile: {percentile_99}')

    # the current model's expected MAPE accuracy is ~18.0
    assert test_mape < 30.0
    assert percentile_99 < 150.0
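# Both tests above take a `trained_model` fixture, presumably defined in
# conftest.py. A hypothetical sketch: train once per session, reusing an
# already-saved model if one exists (overwrite=False), so the tests can load
# the persisted pipeline. The import path is a guess.
import pytest
from model.train import run_training  # hypothetical module path

@pytest.fixture(scope='session')
def trained_model():
    return run_training(with_accuracy=False, overwrite=False)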
parameters = {
    'nn_model__input_dim': [22],
    'nn_model__nodes_per_layer': [(10, 10, 5), (10, 10, 7, 5)],
    'nn_model__dropout': [0, 0.05, 0.1, 0.15, 0.2],
    'nn_model__batch_size': [64, 128, 256, 512],
    'nn_model__epochs': [100, 200, 300, 400],
    'nn_model__optimizer': ['adam'],  # 'rmsprop' was also tried; adam is better
    'nn_model__learning_rate': [0.0001, 0.0005, 0.001, 0.005, 0.01],
}

if __name__ == "__main__":
    # multiprocessing requires the fork to happen in a __main__ protected block
    # nrows=None means use all the data
    data = load_dataset(file_name=config.TRAINING_DATA_FILE, nrows=None)
    X_train, X_test, y_train, y_test = get_train_test_split(data, test_size=0.2)

    grid_search = GridSearchCV(
        qc_time_nn,
        parameters,
        scoring={
            'percentile99': make_scorer(percentile_rel_90, greater_is_better=False),
            'MAPE': make_scorer(mape, greater_is_better=False),
        },
        refit='percentile99',
        n_jobs=-1,  # use all CPUs (-2 would leave one free)
        return_train_score=True,
    )
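    # After building the search, fitting and inspecting it might look like
    # this sketch. Both scorers use greater_is_better=False, so sklearn
    # stores them negated; flip the sign when reporting.
    grid_search.fit(X_train, y_train)
    print(f'best params: {grid_search.best_params_}')
    print(f'best 99th-percentile error: {-grid_search.best_score_:.2f}')
    best_mape = -grid_search.cv_results_['mean_test_MAPE'][grid_search.best_index_]
    print(f'MAPE for that configuration: {best_mape:.2f}')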
def test_load_dataset():
    data = load_dataset(file_name=config.TRAINING_DATA_FILE, nrows=5)
    assert data.shape[0] == 5
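# `load_dataset` itself lives in the data-management module. A hypothetical
# sketch matching the keyword-only call style and the nrows pass-through
# exercised above; config.DATASET_DIR is an assumed setting.
import pandas as pd

def load_dataset(*, file_name, nrows=None):
    return pd.read_csv(f'{config.DATASET_DIR}/{file_name}', nrows=nrows)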
def run_training(with_accuracy=True, overwrite=True, use_all_data=False) -> Union[Tuple[float, float], None]:
    """Run training using the data and params in the config file.

    Saves the model (using the name and location in the config) and
    optionally calculates the train and test accuracy
    (Mean Absolute Percent Error).

    Parameters
    ----------
    with_accuracy: bool, default True
        If True, calculate and return the training and test accuracy
    overwrite: bool
        Overwrite the model file if it exists
    use_all_data: bool
        Use all available data for training (used ONLY for out-of-sample
        prediction in production)
    """
    if not overwrite and current_model_exists():
        logger.info('Model is already saved. Skipping training.')
        return

    logger.info('Reading training data.')
    data = load_dataset(file_name=config.TRAINING_DATA_FILE)
    logger.debug(f'Training data columns: \n{data.columns}')

    test_size, train_size = config.TEST_SIZE, config.TRAIN_SIZE
    if use_all_data:
        test_size, train_size = None, 0.99
    X_train, X_test, y_train, y_test = get_train_test_split(
        data, test_size=test_size, train_size=train_size)

    logger.info('Start fitting model...')
    # save some formatted test data for the prediction tests
    X_test_ = input_features_pipeline.fit_transform(X_test, y_test)
    save_data(X=X_test_, y=y_test, file_name=config.TESTING_DATA_FILE)  # , max_rows=5000)

    # train and save the model
    model.set_params(**config.BEST_MODEL_PARAMS)
    model.fit(X_train, y_train)
    logger.info(f'Saving model version: {_version}')
    save_pipeline(pipeline_to_persist=model)

    if with_accuracy:
        train_mape, train_99per = get_accuracy(model, X_train, y_train)
        test_mape, test_99per = get_accuracy(model, X_test, y_test)
        logger.info(f'Training Mean absolute % error: {train_mape}')
        logger.info(f'Testing Mean absolute % error: {test_mape}')
        logger.info(f'Training 99th Percentile % error: {train_99per}')
        logger.info(f'Testing 99th Percentile % error: {test_99per}')
        return train_mape, test_mape
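# `save_pipeline` / `load_pipeline` are persistence helpers defined
# elsewhere. A plausible joblib-based sketch using the versioned file name
# seen throughout this section; config.TRAINED_MODEL_DIR is an assumed
# setting, and the real helpers may also prune old model versions.
import joblib

def save_pipeline(*, pipeline_to_persist):
    save_file_name = f'{config.PIPELINE_SAVE_FILE}{_version}.pkl'
    joblib.dump(pipeline_to_persist, f'{config.TRAINED_MODEL_DIR}/{save_file_name}')

def load_pipeline(*, file_name):
    return joblib.load(f'{config.TRAINED_MODEL_DIR}/{file_name}')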