def test_02_train(self):
    """
    ensure that content can be retrieved from log file
    """
    log_file = os.path.join(LOG_DIR, "train-test.log")

    ## update the log
    country = 'all'
    timespan = ('2021-04-19', '2021-04-19')
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(country, timespan, eval_test, runtime,
                     model_version, model_version_note, test=True)

    df = pd.read_csv(log_file)
    logged_eval_test = [literal_eval(i) for i in df['eval_test'].copy()][-1]
    self.assertEqual(eval_test, logged_eval_test)
def test_01_train(self):
    """
    ensure log file is created
    """
    log_file = os.path.join(LOG_DIR, "train-test.log")
    if os.path.exists(log_file):
        os.remove(log_file)

    ## update the log
    country = 'all'
    timespan = ('2021-04-19', '2021-04-19')
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(country, timespan, eval_test, runtime,
                     model_version, model_version_note, test=True)
    self.assertTrue(os.path.exists(log_file))
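## The two tests above assume an update_train_log() with this signature and a
## CSV-style log under LOG_DIR. A minimal sketch of a compatible implementation
## (the column names, the unique_id/timestamp fields, and the non-test filename
## are assumptions, not the project's actual logger):
import csv
import os
import time
import uuid

LOG_DIR = "logs"

def update_train_log(country, timespan, eval_test, runtime,
                     model_version, model_version_note, test=False):
    """append a row to the train log, writing the header if the file is new"""
    os.makedirs(LOG_DIR, exist_ok=True)
    log_file = os.path.join(LOG_DIR, "train-test.log" if test else "train.log")
    header = ['unique_id', 'timestamp', 'country', 'timespan', 'eval_test',
              'runtime', 'model_version', 'model_version_note']
    write_header = not os.path.exists(log_file)
    with open(log_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(header)
        # str(dict)/str(tuple) round-trips through literal_eval in the tests
        writer.writerow([uuid.uuid4(), time.time(), country, timespan, eval_test,
                         runtime, model_version, model_version_note])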
def test_02_train(self):
    """
    ensure that content can be retrieved from log file
    """
    log_file = os.path.join("logs", "model_train", "train-test.log")

    ## update the log
    data_shape = (100, 10)
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(data_shape, eval_test, runtime,
                     model_version, model_version_note, test=True)

    df = pd.read_csv(log_file)
    logged_eval_test = [literal_eval(i) for i in df['eval_test'].copy()][-1]
    self.assertEqual(eval_test, logged_eval_test)
def test_02_create_train_log(self):
    """
    test that train log is created
    """
    logfile = the_testlogname("train")
    if os.path.exists(logfile):
        os.remove(logfile)

    # update_train_log(data_shape, eval_summary, model_version, runtime)
    update_train_log((150, 2), {'rmse': 0.8}, "9.9", 0., True)
    self.assertTrue(os.path.exists(logfile))
def test_03_archive_train_data(self):
    """
    test that train data is logged
    """
    logfile = the_testlogname("train")
    if os.path.exists(logfile):
        os.remove(logfile)

    # update_train_log(data_shape, eval_summary, model_version, runtime)
    data_shape = (150, 2)
    eval_summary = {'rmse': 0.8}
    model_version = MODEL_VERSION
    runtime = 0.05
    update_train_log(data_shape, eval_summary, model_version, runtime, True)

    # get the last row of the log
    df = pd.read_csv(logfile, delimiter=',', quotechar='|')
    last = df.tail(1).iloc[0].to_dict()
    self.assertEqual(last['eval_summary'], str(eval_summary))
    self.assertEqual(last['data_shape'], str(data_shape))
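## test_02_create_train_log/test_03_archive_train_data above assume a
## the_testlogname() helper plus an update_train_log(data_shape, eval_summary,
## model_version, runtime, test) variant whose CSV fields are quoted with '|'
## (matching the quotechar passed to pd.read_csv). A minimal sketch under those
## assumptions; the LOG_PREFIX directory and column names are illustrative:
import csv
import os
import time

LOG_PREFIX = "logs"

def the_testlogname(logtype):
    """return the path of the unit-test log file for a given log type"""
    return os.path.join(LOG_PREFIX, "{}-test.log".format(logtype))

def update_train_log(data_shape, eval_summary, model_version, runtime, test=False):
    """append one training run to the train log"""
    os.makedirs(LOG_PREFIX, exist_ok=True)
    logfile = the_testlogname("train") if test else os.path.join(LOG_PREFIX, "train.log")
    header = ['timestamp', 'data_shape', 'eval_summary', 'model_version', 'runtime']
    write_header = not os.path.exists(logfile)
    with open(logfile, 'a', newline='') as csvfile:
        # '|' quoting keeps the commas inside str(dict)/str(tuple) fields intact
        writer = csv.writer(csvfile, delimiter=',', quotechar='|')
        if write_header:
            writer.writerow(header)
        writer.writerow([time.time(), data_shape, eval_summary, model_version, runtime])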
def test_01_train(self):
    """
    ensure log file is created
    """
    log_file = os.path.join("logs", "model_train", "train-test.log")
    if os.path.exists(log_file):
        os.remove(log_file)

    ## update the log
    data_shape = (100, 10)
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(data_shape, eval_test, runtime,
                     model_version, model_version_note, test=True)
    self.assertTrue(os.path.exists(log_file))
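## The logs/model_train variant of test_01_train/test_02_train assumes a third
## update_train_log() signature. It differs from the first sketch only in
## taking a data_shape instead of country/timespan and in its log directory;
## the column names are again assumptions:
import csv
import os
import time

def update_train_log(data_shape, eval_test, runtime,
                     model_version, model_version_note, test=False):
    """append a row to the model_train log, writing the header if the file is new"""
    log_dir = os.path.join("logs", "model_train")
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, "train-test.log" if test else "train.log")
    header = ['timestamp', 'data_shape', 'eval_test', 'runtime',
              'model_version', 'model_version_note']
    write_header = not os.path.exists(log_file)
    with open(log_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(header)
        writer.writerow([time.time(), data_shape, eval_test, runtime,
                         model_version, model_version_note])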
def _model_train(df, tag, test=False):
    """
    example function to train a model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file
    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples, replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True, random_state=42)

    ## train a random forest model
    ## (newer scikit-learn versions name these criteria 'squared_error'/'absolute_error')
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])

    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update the log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
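## Downstream counterpart to the serialization in _model_train(): reloading the
## fitted grid for prediction. The 'all' tag, the query's feature count, and the
## MODEL_DIR/MODEL_VERSION values are illustrative assumptions.
import os
import re

import joblib
import numpy as np

MODEL_DIR = "models"    # assumed to match the constant used above
MODEL_VERSION = 0.1     # assumed version constant

model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
saved_model = os.path.join(MODEL_DIR, "test-all-{}.joblib".format(model_name))
model = joblib.load(saved_model)    # the persisted GridSearchCV pipeline
query = np.zeros((1, 7))            # placeholder row; the real width comes from engineer_features()
y_pred = model.predict(query)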
def model_train(test=False):
    """
    example function to train a model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    The iris dataset is already small, so the subset is shown only as an example.

    Note that the latest training data is always saved so it can be used by
    performance monitoring tools.
    """

    ## load model parameters from conf/base/parameters.yml
    model_config = load_config("parameters.yml")
    test_size = model_config["model"]["test_size"]
    n_est = model_config["classifier"]["param_grid"]["clf__n_estimators"]
    criterion = model_config["classifier"]["param_grid"]["clf__criterion"]
    max_depth = model_config["classifier"]["param_grid"]["clf__max_depth"]

    ## start timer for runtime
    time_start = time.time()

    ## data ingestion from build_features
    X, y = load_data()
    preprocessor = get_preprocessor()

    ## subset the data to enable faster unittests
    if test:
        n_samples = int(np.round(0.9 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples, replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]

    ## perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=42)

    ## specify parameters and model
    param_grid = {
        'clf__n_estimators': n_est,
        'clf__criterion': criterion,
        'clf__max_depth': max_depth
    }

    print("... grid searching")
    clf = ensemble.RandomForestClassifier()
    pipe = Pipeline(steps=[('pre', preprocessor), ('clf', clf)])
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    params = grid.best_params_
    params = {re.sub("clf__", "", key): value for key, value in params.items()}

    ## fit model on training data with the best parameters
    clf = ensemble.RandomForestClassifier(**params)
    pipe = Pipeline(steps=[('pre', preprocessor), ('clf', clf)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    eval_test = classification_report(y_test, y_pred, output_dict=True)

    ## retrain using all data
    pipe.fit(X, y)

    if test:
        print("... saving test version of model")
        joblib.dump(pipe, os.path.join("models", "test.joblib"))
    else:
        print("... saving model: {}".format(SAVED_MODEL))
        joblib.dump(pipe, SAVED_MODEL)

    print("... saving latest data")
    data_file = os.path.join("models", "latest-train.pickle")
    with open(data_file, 'wb') as tmp:
        pickle.dump({'y': y, 'X': X}, tmp)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update the log file
    update_train_log(X.shape, eval_test, runtime,
                     MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
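## model_train() above relies on a load_config() helper; a minimal sketch that
## matches the "conf/base/parameters.yml" comment (the conf directory layout is
## taken from that comment, the YAML keys from the lookups above, and the values
## are illustrative):
##
##   # conf/base/parameters.yml
##   model:
##     test_size: 0.25
##   classifier:
##     param_grid:
##       clf__n_estimators: [25, 50, 75]
##       clf__criterion: ['gini', 'entropy']
##       clf__max_depth: [4, 6, null]
import os

import yaml

def load_config(filename, conf_dir=os.path.join("conf", "base")):
    """read a YAML file from the conf directory and return it as a dict"""
    with open(os.path.join(conf_dir, filename)) as f:
        return yaml.safe_load(f)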