def model_load(prefix='sl', data_dir=None, training=True):
    """
    example function to load model

    The prefix allows the loading of different models
    """

    if not data_dir:
        localpath = os.path.abspath('')
        data_dir = os.path.join(localpath, "data_dir")

    models = [f for f in os.listdir(os.path.join(".", "models")) if re.search(prefix, f)]

    if len(models) == 0:
        raise Exception("Models with prefix '{}' cannot be found; did you train?".format(prefix))

    all_models = {}
    for model in models:
        all_models[re.split("-", model)[1]] = joblib.load(os.path.join(".", "models", model))

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}

    return(all_data, all_models)
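
## A minimal usage sketch for model_load() (illustrative only, not part of the original
## pipeline). It assumes models have already been trained and saved under the 'sl' prefix
## and that the given country tag (e.g. 'all') is one of the keys produced by fetch_ts().
def example_predict_from_saved(country='all', data_dir=None):
    """
    illustrative helper: load the saved models and engineered data, then
    predict with the model that matches the given country tag
    """
    all_data, all_models = model_load(prefix='sl', data_dir=data_dir, training=False)
    model = all_models[country]
    X = all_data[country]['X']
    return model.predict(X)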

def model1_RandomForest(data_dir):
    """
    train a random forest baseline on the 'all' time series and return the holdout RMSE
    """

    ts_data = fetch_ts(data_dir)
    df = ts_data['all']
    X, y, dates = engineer_features(df)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])

    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, iid=False, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    return(eval_rmse)
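
## Illustrative sketch (an assumption, not part of the original code): the function above only
## reports RMSE, but a fitted GridSearchCV like the one it builds (or the grids returned by
## model_load() once models are serialized) also exposes the winning hyperparameters, which is
## useful when comparing baselines.
def example_inspect_grid(grid):
    """
    illustrative helper: print the best parameters and CV score of a fitted GridSearchCV
    """
    print("best params: {}".format(grid.best_params_))
    print("best CV score: {}".format(round(grid.best_score_, 3)))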

def get_monitoring_tools(df):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances

    NOTE: for classification the outlier detection on y is not needed
    """

    X, y, dates = engineer_features(df)
    X1 = X.to_numpy()

    xpipe = Pipeline(steps=[('pca', PCA(2)),
                            ('clf', EllipticEnvelope(random_state=0, contamination=0.01))])
    xpipe.fit(X1)

    bs_samples = 549
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):

        ## bootstrap sample the engineered features and targets
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X1.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X1[subset_indices, :]

        test1 = xpipe.predict(X_bs)

        wasserstein_X[b] = wasserstein_distance(X1.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## derive thresholds from the tails of the bootstrap distributions
    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {"outlier_X": np.round(outlier_X_threshold, 1),
                 "wasserstein_X": np.round(wasserstein_X_threshold, 2),
                 "wasserstein_y": np.round(wasserstein_y_threshold, 2),
                 "clf_X": xpipe,
                 "X_source": X1,
                 "y_source": y,
                 "latest_X": X,
                 "latest_y": y}

    return(to_return)
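
## Illustrative sketch of how the monitoring tools above might be applied to a new batch of
## data (an assumption; the original code only builds the thresholds). X_new is expected as a
## NumPy array with the same engineered-feature columns as X_source, and y_new as an array of
## targets. A batch is flagged when its outlier percentage or Wasserstein distance to the
## source distribution exceeds the bootstrapped threshold.
def example_check_drift(monitoring, X_new, y_new):
    """
    illustrative helper: return simple drift flags for a new batch of data
    """
    test_outliers = monitoring['clf_X'].predict(X_new)
    outlier_pct = 100 * (1.0 - (test_outliers[test_outliers == 1].size / test_outliers.size))
    dist_X = wasserstein_distance(monitoring['X_source'].flatten(), X_new.flatten())
    dist_y = wasserstein_distance(monitoring['y_source'], y_new.flatten())

    return {"outlier_X": outlier_pct > monitoring['outlier_X'],
            "wasserstein_X": dist_X > monitoring['wasserstein_X'],
            "wasserstein_y": dist_y > monitoring['wasserstein_y']}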

def model2_Linearregression(data_dir):
    """
    train a linear regression baseline on the 'all' time series and return the holdout RMSE
    """

    ts_data = fetch_ts(data_dir)
    df = ts_data['all']
    X, y, dates = engineer_features(df)

    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    reg = linear_model.LinearRegression()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    return(eval_rmse)
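
## Illustrative comparison of the two baseline functions above (an assumption about how they
## are meant to be used together; the original code only returns the individual RMSE values).
def example_compare_baselines(data_dir):
    """
    illustrative helper: report the holdout RMSE of both baselines and the better of the two
    """
    rmse_rf = model1_RandomForest(data_dir)
    rmse_lr = model2_Linearregression(data_dir)
    best = "random forest" if rmse_rf <= rmse_lr else "linear regression"
    print("random forest RMSE: {}, linear regression RMSE: {} -> {}".format(rmse_rf, rmse_lr, best))
    return {"rf": rmse_rf, "lr": rmse_lr}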

def _model_train(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file
    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]), n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)
    ## train a random forest model
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])

    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, iid=False, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), eval_rmse, runtime,
                     MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
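
## Illustrative driver for _model_train() (a sketch under the assumption that one model is
## trained per country tag returned by fetch_ts(), and that MODEL_DIR is the module-level
## path already referenced above; the leading underscore suggests _model_train is meant to
## be called through a wrapper like this).
def example_train_all(data_dir, test=False):
    """
    illustrative helper: train and serialize one model per country in the data directory
    """
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    ts_data = fetch_ts(data_dir)
    for country, df in ts_data.items():
        _model_train(df, country, test=test)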