def test_01_train(self):
    """
    ensure log file is created

    Deletes any existing test log, writes one entry through
    update_train_log(..., test=True) and asserts that the expected
    file exists afterwards.
    """
    log_file = os.path.join("logs", "train-test.log")
    if os.path.exists(log_file):
        os.remove(log_file)

    ## update the log
    ## the first positional value is arbitrary test data; a plain string is
    ## used rather than the TestCase instance so the logged value is stable
    tag = "test"
    data_shape = (100, 10)
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(tag, data_shape, eval_test, runtime,
                     model_version, model_version_note, test=True)

    self.assertTrue(os.path.exists(log_file))
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry and checks that the newest value of the 'period'
    column parses back to the tuple that was logged.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## update the log
    expected_period = ('2017-12-01', '2019-05-31')
    update_train_log('netherlands', expected_period, {'rmse': 0.5},
                     "00:00:01", 0.1, "test model", test=True)

    ## parse every stored period and compare the most recent one
    frame = pd.read_csv(log_path)
    parsed_periods = list(map(literal_eval, frame['period'].copy()))
    self.assertEqual(expected_period, parsed_periods[-1])
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry through update_train_log(..., test=True), reads the
    log back with pandas and asserts that the newest 'eval_test' value
    equals the dictionary that was logged.
    """
    log_file = os.path.join("logs", "train-test.log")

    ## update the log
    ## the first positional value is arbitrary test data; a plain string is
    ## used rather than the TestCase instance so the logged value is stable
    tag = "test"
    data_shape = (100, 10)
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(tag, data_shape, eval_test, runtime,
                     model_version, model_version_note, test=True)

    ## the column holds the repr of the dict, so parse it back before comparing
    df = pd.read_csv(log_file)
    logged_eval_test = [literal_eval(i) for i in df['eval_test'].copy()][-1]
    self.assertEqual(eval_test, logged_eval_test)
def test_01_train(self):
    """
    ensure log file is created

    Starts without a test log, writes a single entry and checks that
    the file is present afterwards.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    entry = ("United Kingdom", (100, 10), {'rmse': 0.5},
             "00:00:01", 0.1, "test model")
    update_train_log(*entry, test=True)

    self.assertTrue(os.path.exists(log_path))
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry and compares the newest parsed value of the 'rmse'
    column with the dictionary that was logged.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## update the log
    expected = {'rmse': 0.5}
    update_train_log("Another Country", "('2017-12-01', '2019-05-29')",
                     expected, "00:00:01", 0.2, "test model", test=True)

    ## parse every value of the column and keep the newest one
    frame = pd.read_csv(log_path)
    parsed = list(map(literal_eval, frame['rmse'].copy()))
    self.assertEqual(expected, parsed[-1])
def test_01_train(self):
    """
    ensure log file is created

    The expected file name is built from LOG_PREFIX and the current
    year and month.
    """
    now = date.today()
    log_name = "{}-train-{}-{}.log".format(LOG_PREFIX, now.year, now.month)
    log_path = os.path.join(LOG_DIR, log_name)

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    entry = ('india', ('2017-11-29', '2019-05-24'), {'rmse': 0.5},
             "00:00:01", 0.1, "test model")
    update_train_log(*entry, test=True, prefix=LOG_PREFIX)

    self.assertTrue(os.path.exists(log_path))
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry and compares the newest parsed value of the
    'eval_test' column with the dictionary that was logged.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## update the log
    expected = {'rmse': 0.5}
    update_train_log((100, 10), expected, "00:00:01", 0.1, "test model",
                     test=True)

    ## parse every value of the column and keep the newest one
    frame = pd.read_csv(log_path)
    parsed = list(map(literal_eval, frame['eval_test'].copy()))
    self.assertEqual(expected, parsed[-1])
def test_01_train(self):
    """
    ensure log file is created

    The expected file name is built from the current year and month.
    """
    ## NOTE(review): the entry is written with test=False and the dated log
    ## is deleted first - confirm this is not a log that must be kept
    now = date.today()
    log_name = "train-{}-{}.log".format(now.year, now.month)
    log_path = os.path.join("logs", log_name)

    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    entry = ('all', ('2018-01-15', '2019-01-15'), {'rmse': 0.5},
             "00:00:01", 0.1)
    update_train_log(*entry, test=False)

    self.assertTrue(os.path.exists(log_path))
def test_01_train(self):
    """
    ensure log file is created

    Starts without a test log, writes a single entry and checks that
    the file is present afterwards.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    update_train_log("Test Country", ("2018-01-01", "2019-01-01"),
                     {'rmse': 0.5}, "00:00:01", 0.1, "test model",
                     test=True)

    self.assertTrue(os.path.exists(log_path))
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Deletes the dated log, writes one entry and compares the newest
    parsed 'eval_test' value with the dictionary that was logged.
    """
    ## NOTE(review): the entry is written with test=False and the dated log
    ## is deleted first - confirm this is not a log that must be kept
    now = date.today()
    log_name = "train-{}-{}.log".format(now.year, now.month)
    log_path = os.path.join("logs", log_name)

    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    expected = {'rmse': 0.5}
    update_train_log('all', ('2018-01-15', '2019-01-15'), expected,
                     "00:00:01", 0.1, test=False)

    ## parse every value of the column and keep the newest one
    frame = pd.read_csv(log_path)
    parsed = list(map(literal_eval, frame['eval_test'].copy()))
    self.assertEqual(expected, parsed[-1])
def model_train():
    """
    Fit one Prophet model per country, save each forecast and log the run.

    For every (country, dataframe) pair returned by fetch_ts the 'date' and
    'revenue' columns are renamed to 'ds' and 'y', a Prophet model is fitted,
    a forecast extended by 120 periods is produced and written to
    data/forecasts/forecast_<country>.  After the loop one training-log entry
    is written using the shape of the last forecast.

    Returns
    -------
    bool
        Always True.
    """
    ## start timer for runtime
    time_start = time.time()

    data_dir = os.path.join("data", "cs_train", "data")
    ts_data = fetch_ts(data_dir)

    for country, df in ts_data.items():
        model = Prophet()
        train_df = df[["date", "revenue"]]
        train_df.columns = ['ds', 'y']
        model.fit(train_df)

        future = model.make_future_dataframe(periods=120)
        forecast = model.predict(future)

        filename = os.path.join("data", "forecasts", "forecast_" + country)
        forecast.to_csv(filename)

    ## update the log file
    ## NOTE(review): the log is written once, after all countries, with the
    ## shape of the last forecast - confirm this matches the intended layout
    minutes, seconds = divmod(time.time() - time_start, 60)
    hours, minutes = divmod(minutes, 60)
    runtime = "%03d:%02d:%02d" % (hours, minutes, seconds)

    test = False
    update_train_log(forecast.shape, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test)

    return True
def _model_train(df, tag, pipe, param_grid, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies that the use of the 'test' log file

    Parameters
    ----------
    df : input passed to engineer_features
    tag : label used in the saved model file name and in the log entry
    pipe : estimator/pipeline handed to GridSearchCV
    param_grid : parameter grid handed to GridSearchCV
    test : bool, see above

    The fitted GridSearchCV object is dumped with joblib into MODEL_DIR and
    one entry is written with update_train_log.  Nothing is returned.
    """
    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        ## keep a random 30% of the rows
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.isin(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)

    ## dots in the version would read as file extensions, so use underscores
    model_name = str(MODEL_VERSION).replace(".", "_")
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime, MODEL_VERSION, test=test)
def test_01_train(self):
    """
    ensure log file is created

    Starts without a test log, writes a single entry and checks that
    the file is present afterwards.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    entry = ((100, 10), "xxx", {'rmse': 0.5}, "00:00:01", 0.1, "test model")
    update_train_log(*entry, test=True)

    self.assertTrue(os.path.exists(log_path))
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry and compares the newest parsed value of the 'metric'
    column with the dictionary that was logged.
    """
    now = date.today()
    log_name = "{}-train-{}-{}.log".format(LOG_PREFIX, now.year, now.month)
    log_path = os.path.join(LOG_DIR, log_name)

    ## update the log
    expected = {'rmse': 0.5}
    update_train_log('india', ('2017-11-29', '2019-05-24'), expected,
                     "00:00:01", 0.1, "test model",
                     test=True, prefix=LOG_PREFIX)

    ## parse every value of the column and keep the newest one
    frame = pd.read_csv(log_path)
    parsed = list(map(literal_eval, frame['metric'].copy()))
    self.assertEqual(expected, parsed[-1])
def test_01_train(self):
    """
    ensure log file is created

    Starts without a test log, writes a single entry and checks that
    the file is present afterwards.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    update_train_log("Some Country", "('2017-12-01', '2019-05-29')",
                     {'rmse': 0.5}, "00:00:01", 0.1, "test model",
                     test=True)

    self.assertTrue(os.path.exists(log_path))
def model_train(clf=None):
    """
    example function to train model

    Fits a pipeline of preprocessPipe() followed by 'clf' on a training
    split, evaluates it on the held-out split, dumps the fitted pipeline to
    SAVED_MODEL and writes one training-log entry.  When 'clf' is None a
    linear support vector classifier is used.
    """
    ## start timer for runtime
    started = time.time()

    ## data ingestion
    X, y = fetch_data()

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=42)

    ## fall back to the default classifier
    if clf is None:
        print("Support Vector Machine will be used per default")
        clf = svm.SVC(C=1.0, kernel='linear', gamma=0.5, probability=True)

    ## fit model on training data
    fitted = Pipeline(steps=[('pre', preprocessPipe()), ('clf', clf)])
    fitted = fitted.fit(X_train, y_train)

    predictions = fitted.predict(X_test)
    eval_test = classification_report(y_test, predictions, output_dict=True)

    print("... saving model: {}".format(SAVED_MODEL))
    joblib.dump(fitted, SAVED_MODEL)

    print(pd.DataFrame(eval_test))

    elapsed = time.time() - started
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    runtime = "%03d:%02d:%02d" % (hours, minutes, seconds)

    update_train_log(X.shape, eval_test, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=False)
def model_train(df=None, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) serializes the model to models/test.joblib instead of SAVED_MODEL
        (2) is forwarded to update_train_log

    Note that the latest training data is always saved to
    models/latest-train.pickle, whatever the value of 'test'.
    """
    ## start timer for runtime
    started = time.time()

    if df is None:
        df = load_data()

    ## total price per invoice date
    ts = df.sort_values(by="invoice_date")
    ts = ts.groupby("invoice_date")["price"].sum()

    fitted = ARIMA(ts, order=(8, 0, 8)).fit(disp=-1)

    ## pick message and destination, then serialize once
    if test:
        message = "... saving test version of model"
        destination = os.path.join("models", "test.joblib")
    else:
        message = "... saving model: {}".format(SAVED_MODEL)
        destination = SAVED_MODEL
    print(message)
    joblib.dump(fitted, destination)

    print("... saving latest data")
    latest_path = os.path.join("models", 'latest-train.pickle')
    with open(latest_path, 'wb') as handle:
        pickle.dump({'df': df}, handle)

    elapsed = time.time() - started
    minutes, seconds = divmod(elapsed, 60)
    hours, minutes = divmod(minutes, 60)
    runtime = "%03d:%02d:%02d" % (hours, minutes, seconds)

    ## update the log file
    update_train_log(len(ts), 'eval_test TBD', runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=test)
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry and compares the newest parsed value of the 'x_shape'
    column with the tuple that was logged.
    """
    now = date.today()
    log_name = "train-{}-{}.log".format(now.year, now.month)
    log_path = os.path.join("logs", log_name)

    ## update the log
    expected_shape = (2000, )
    update_train_log(expected_shape, "00:00:02", 0.1, "test model")

    ## parse every value of the column and keep the newest one
    frame = pd.read_csv(log_path)
    parsed = list(map(literal_eval, frame['x_shape'].copy()))
    self.assertEqual(expected_shape, parsed[-1])
def test_01_train(self):
    """
    ensure log file is created

    The expected file name is built from the current year and month.
    """
    now = date.today()
    log_name = "train-{}-{}.log".format(now.year, now.month)
    log_path = os.path.join("logs", log_name)

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    update_train_log((1000, ), "00:00:01", 0.1, "test model")

    self.assertTrue(os.path.exists(log_path))
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry and compares the newest parsed value of the
    'eval_test' column with the dictionary that was logged.
    """
    log_path = os.path.join("logs", "all-train-test.log")

    ## update the log
    expected = {'rmse': 0.5}
    update_train_log('all', ('2018-01-01', '2018-02-01'), expected,
                     "00:00:01", 0.1, "test model", test=True)

    ## parse every value of the column and keep the newest one
    frame = pd.read_csv(log_path)
    parsed = list(map(literal_eval, frame['eval_test'].copy()))
    self.assertEqual(expected, parsed[-1])
def test_01_train(self):
    """
    ensure log file is created

    Starts without a test log, writes a single entry and checks that
    the file is present afterwards.
    """
    log_path = os.path.join("logs", "all-train-test.log")

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    entry = ('all', ('2018-01-01', '2018-02-01'), {'rmse': 0.5},
             "00:00:01", 0.1, "test model")
    update_train_log(*entry, test=True)

    self.assertTrue(os.path.exists(log_path))
def test_log_training(self):
    """
    Ensure that content can be retrieved from log file

    Writes one entry through update_train_log(..., test=True), checks that
    the expected file exists and that reading it back yields at least one row.
    """
    log_file = os.path.join("logs", "train-test.log")

    ## update the log
    data_shape = (100, 10)
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(data_shape, runtime, model_version, model_version_note,
                     test=True)

    ## check existence first so a missing file fails as an assertion
    ## rather than as a read error
    self.assertTrue(os.path.exists(log_file))

    ## the entry that was just logged must be retrievable
    df = pd.read_csv(log_file)
    self.assertFalse(df.empty)
def test_01_train(self):
    """
    ensure log file is created

    Starts without a test log, writes a single entry and checks that
    the file is present afterwards.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    entry = ("all", "('2019-06-01', '2020-09-30')", {'rmse': 0.5},
             "000:00:01", 0.1, "test model")
    update_train_log(*entry, test=True)

    self.assertTrue(os.path.exists(log_path))
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry and compares the newest parsed value of the
    'eval_test' column with the dictionary that was logged.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## update the log
    expected = {'rmse': 0.5}
    update_train_log("all", "('2019-06-01', '2020-09-30')", expected,
                     "000:00:01", 0.1, "test model", test=True)

    ## parse every value of the column and keep the newest one
    frame = pd.read_csv(log_path)
    parsed = list(map(literal_eval, frame['eval_test'].copy()))
    self.assertEqual(expected, parsed[-1])
def model_train(mode=None, test=False):
    """
    example function to train model

    Fits a linear support vector classifier on a training split, prints a
    classification report for the held-out split, dumps the fitted model to
    SAVED_MODEL and writes one training-log entry.

    'mode' - can be used to subset data essentially simulating a train
             (accepted but not read anywhere in this function)
    'test' - forwarded to update_train_log
    """
    ## start timer for runtime
    time_start = time.time()

    ## data ingestion
    X, y = fetch_data()

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=42)

    ## Specify parameters and model
    params = {'C': 1.0, 'kernel': 'linear', 'gamma': 0.5}
    clf = svm.SVC(**params, probability=True)

    ## fit model on training data
    clf = clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(classification_report(y_test, y_pred))

    print("... saving model: {}".format(SAVED_MODEL))
    joblib.dump(clf, SAVED_MODEL)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## take the root of the MSE directly; the 'squared' keyword of
    ## mean_squared_error is not available in every scikit-learn release
    rmse = mean_squared_error(y_test, y_pred) ** 0.5
    eval_summary = {'rmse': rmse}

    ## NOTE(review): MODEL_VERSION is passed before runtime and 'test' is
    ## positional - verify this order against update_train_log's signature
    update_train_log(X.shape, eval_summary, MODEL_VERSION, runtime, test)
def test_log_creation(self):
    """
    Ensure log file is created

    Starts without a test log, writes a single entry and checks that
    the file is present afterwards.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    update_train_log((100, 10), "00:00:01", 0.1, "test model", test=True)

    self.assertTrue(os.path.exists(log_path))
def test_02_train(self):
    """
    Check that a logged period can be read back from the test log.

    Logs one entry and compares the newest parsed value of the 'period'
    column with the tuple that was logged.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## update the log
    expected_period = ('2017-12-01', '2018-12-01')
    update_train_log('united_kingdom', expected_period, {'rmse': 0.5},
                     "00:00:01", 0.1, "test model", test=True)

    ## parse every stored period and compare the most recent one
    frame = pd.read_csv(log_path)
    parsed_periods = list(map(literal_eval, frame['period'].copy()))
    self.assertEqual(expected_period, parsed_periods[-1])
def test_01_train(self):
    """
    Check that logging an entry creates the test log file.

    Starts without a test log, writes a single entry and checks that
    the file is present afterwards.
    """
    log_path = os.path.join("logs", "train-test.log")

    ## remove a log left over from an earlier run
    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    entry = ('united_kingdom', ('2017-12-01', '2018-12-01'), {'rmse': 0.5},
             "00:00:01", 0.1, "test model")
    update_train_log(*entry, test=True)

    self.assertTrue(os.path.exists(log_path))
def test_01_train(self):
    """
    ensure log file is created

    The expected file name is built from the current year and month.
    """
    ## NOTE(review): the entry is written with test=False and the dated log
    ## is deleted first - confirm this is not a log that must be kept
    now = date.today()
    log_name = "train-{}-{}.log".format(now.year, now.month)
    log_path = os.path.join(LOG_DIR, log_name)

    if os.path.exists(log_path):
        os.remove(log_path)

    ## update the log
    update_train_log(('2017-11-29', '2019-05-24'), "00:00:01", 0.1,
                     "test model", test=False)

    self.assertTrue(os.path.exists(log_path))
def test_02_train(self):
    """
    ensure that content can be retrieved from log file

    Logs one entry and compares the last value of the 'model_version'
    column with the version that was logged.
    """
    now = date.today()
    log_name = "train-{}-{}.log".format(now.year, now.month)
    log_path = os.path.join(LOG_DIR, log_name)

    ## update the log
    expected_version = 0.1
    update_train_log(('2017-11-29', '2019-05-24'), "00:00:01",
                     expected_version, "test model", test=False)

    ## the newest entry is the last row of the file
    frame = pd.read_csv(log_path)
    newest_version = frame["model_version"].iloc[-1]
    self.assertEqual(expected_version, newest_version)