def model_monitor(country="total", dev=DEV, training=True):
    """
    Monitor model performance and input-data drift for one country.

    Loads the engineered features for `country`, optionally retrains the
    model, then (1) reports RMSE of `model_predict` against simulated
    samples of several sizes and (2) summarizes drift via Wasserstein
    distance and EllipticEnvelope outlier rates.

    Parameters
    ----------
    country : str
        Key into the dict returned by engineer_features (default "total").
    dev : bool
        Dev-mode flag passed through to the data/model helpers.
    training : bool
        When True, retrain the model before monitoring.

    Returns
    -------
    pd.DataFrame
        One row per sample size with wasserstein_X/y and outlier percents.
    """
    print("Monitor Model")

    ## import data
    datasets = engineer_features(training=training, dev=dev)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)

    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)

    ## monitor RMSE across several simulated sample sizes
    samples = [10, 20, 30, 50, 60]
    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        queries = [(str(d.year), str(d.month), str(d.day), country)
                   for d in dates_new]
        y_pred = [model_predict(year=query[0], month=query[1], day=query[2],
                                country=query[3], verbose=False,
                                dev=dev)["y_pred"][0].round(2)
                  for query in queries]
        rmse = np.sqrt(mean_squared_error(y_new.tolist(), y_pred))
        ## fix: the metric label was misspelled "RSME" in the original
        print("sample size: {}, RMSE: {}".format(n, rmse.round(2)))

    ## monitor performance / drift
    ## scaling so the envelope and distances are comparable across features
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    samples = [25, 50, 75, 90]

    ## fit outlier detectors on the reference (training) distribution
    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size, 1))

    results = defaultdict(list)
    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        results["sample_size"].append(n)
        results['wasserstein_X'].append(
            np.round(wasserstein_distance(X.flatten(), X_new.flatten()), 2))
        results['wasserstein_y'].append(
            np.round(wasserstein_distance(y, y_new), 2))
        ## predict() returns +1 for inliers, -1 for outliers
        test1 = clf_X.predict(X_new)
        test2 = clf_y.predict(y_new.reshape(y_new.size, 1))
        results["outlier_percent_X"].append(
            np.round(1.0 - (test1[test1 == 1].size / test1.size), 2))
        results["outlier_percent_y"].append(
            np.round(1.0 - (test2[test2 == 1].size / test2.size), 2))

    return pd.DataFrame(results)
def model_train(save_img=False, dev=DEV, verbose=True):
    """
    Train and persist one model per country.

    Loads the engineered training features and delegates the actual
    fitting/saving of each country's model to _model_train.

    Parameters
    ----------
    save_img : bool
        Forwarded to _model_train; save diagnostic images when True.
    dev : bool
        Dev-mode flag passed through to the data/model helpers.
    verbose : bool
        Print progress messages when True.
    """
    ## load engineered features
    datasets = engineer_features(dev=dev, training=True, verbose=verbose)

    if verbose:
        print("Training Models")

    ## build, train and save a model for every country in the dataset
    for tag, (X, y, dates, feature_names) in datasets.items():
        if verbose:
            print("...training model for {}".format(tag.upper()))
        _model_train(X, y, feature_names, tag=tag, dev=dev,
                     save_img=save_img, verbose=verbose)
def model_predict(year, month, day, country, dev=DEV, verbose=True):
    """
    Predict the target value for a single (year, month, day, country).

    Parameters
    ----------
    year, month, day : str or int
        Date components; must contain digits only. month/day are
        zero-padded to two characters when building the lookup date.
    country : str
        Key identifying both the model and the dataset to use.
    dev : bool
        Dev-mode flag passed through to the data/model helpers.
    verbose : bool
        Print progress messages when True.

    Returns
    -------
    dict
        {"y_pred": <model prediction array>}

    Raises
    ------
    Exception
        If the country's model or dataset is missing, a date component
        contains non-digits, or the date is outside the dataset's range.
    """
    ## start timer for runtime
    time_start = time.time()

    ## load data and models
    datasets = engineer_features(training=False, dev=dev, verbose=verbose)
    models = model_load(dev=dev, verbose=verbose)

    if verbose:
        print("Make Prediction")

    ## check if the model is available
    if country not in models.keys():
        raise Exception(
            "ERROR (model_predict) - model for country '{}' could not be found"
            .format(country))

    ## check if the data is available
    if country not in datasets.keys():
        raise Exception(
            "ERROR (model_predict) - dataset for country '{}' could not be found"
            .format(country))

    ## ensure the year, month, day contain digits only
    ## fix: raw string for the regex (a plain "\D" is an invalid escape
    ## sequence) and str() coercion so integer inputs don't raise TypeError
    for d in [year, month, day]:
        if re.search(r"\D", str(d)):
            raise Exception(
                "ERROR (model_predict) - invalid year, month or day")

    ## get the dataset and model for the given country
    X, y, dates, labels = datasets[country]
    df = pd.DataFrame(X, columns=labels, index=dates)
    model = models[country]

    ## check that the requested date falls inside the dataset's range
    ## (hoisted: the original recomputed df.index.strftime three times)
    index_dates = df.index.strftime('%Y-%m-%d')
    target_date = "{}-{}-{}".format(year, str(month).zfill(2),
                                    str(day).zfill(2))
    if verbose:
        print(target_date)

    if target_date not in index_dates:
        raise Exception(
            "ERROR (model_predict) - {} not in range {} and {}".format(
                target_date, index_dates[0], index_dates[-1]))

    ## query the data (fix: convert once; the original applied
    ## pd.to_datetime twice to the same value)
    query = pd.to_datetime(target_date)
    X_pred = df.loc[query, :].values.reshape(1, -1)

    ## make prediction
    y_pred = model.predict(X_pred)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update predict log
    _update_predict_log(country.upper(), y_pred, target_date, runtime,
                        MODEL_VERSION, MODEL_VERSION_NOTE,
                        dev=dev, verbose=verbose)

    return ({"y_pred": y_pred})