def model_monitor(country="total", dev=DEV, training=True):
    """
    performance monitoring

    Optionally (re)trains the model for ``country``, prints the RMSE of
    ``model_predict`` against simulated samples, and then reports how far
    simulated samples drift from the full data set.

    Parameters
    ----------
    country : str
        Key used to select the dataset returned by ``engineer_features``
        (and the model tag passed to ``_model_train`` / ``model_predict``).
    dev : bool
        Forwarded unchanged to ``engineer_features``, ``_model_train`` and
        ``model_predict``.
    training : bool
        Forwarded to ``engineer_features``; when true the model is trained
        via ``_model_train`` before monitoring.

    Returns
    -------
    pandas.DataFrame
        One row per simulated sample with the columns ``sample_size``,
        ``wasserstein_X``, ``wasserstein_y``, ``outlier_percent_X`` and
        ``outlier_percent_y``. Despite the name, the outlier columns hold
        fractions in [0, 1] (rounded to two decimals), not percentages.

    Raises
    ------
    KeyError
        If ``country`` is not a key of the engineered datasets.
    """
    print("Monitor Model")

    ## import data
    datasets = engineer_features(training=training, dev=dev)
    X, y, dates, labels = datasets[country]
    dates = pd.to_datetime(dates)
    print(X.shape)

    ## train the model
    if training:
        _model_train(X, y, labels, tag=country, dev=dev)

    ## monitor RMSE
    for n in (10, 20, 30, 50, 60):
        # only the targets and dates of the simulated sample are needed here
        _, y_new, dates_new = simulate_samples(n, X, y, dates)

        # NOTE(review): model_predict reloads data and models on every call,
        # so this loop is expensive; it cannot be avoided without changing
        # model_predict's interface.
        y_pred = []
        for d in dates_new:
            prediction = model_predict(year=str(d.year),
                                       month=str(d.month),
                                       day=str(d.day),
                                       country=country,
                                       verbose=False,
                                       dev=dev)
            y_pred.append(prediction["y_pred"][0].round(2))

        rmse = np.sqrt(mean_squared_error(y_new.tolist(), y_pred))
        print("sample size: {}, RMSE: {}".format(n, rmse.round(2)))

    ## monitor performance
    ## scaling
    # from here on X refers to the standardized feature matrix
    scaler = StandardScaler()
    X = scaler.fit_transform(X)

    # NOTE(review): how simulate_samples interprets n (count vs. percentage)
    # is not visible from this function -- confirm against its definition.
    samples = (25, 50, 75, 90)

    ## outlier detectors fitted on the full (reference) data
    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)

    clf_X.fit(X)
    clf_y.fit(y.reshape(y.size, 1))

    results = defaultdict(list)
    for n in samples:
        X_new, y_new, dates_new = simulate_samples(n, X, y, dates)
        results["sample_size"].append(n)

        ## distribution drift between the reference data and the sample
        results['wasserstein_X'].append(
            np.round(wasserstein_distance(X.flatten(), X_new.flatten()), 2))
        results['wasserstein_y'].append(
            np.round(wasserstein_distance(y, y_new), 2))

        ## share of sample points not labelled 1 (inlier) by the detectors
        test1 = clf_X.predict(X_new)
        test2 = clf_y.predict(y_new.reshape(y_new.size, 1))
        results["outlier_percent_X"].append(
            np.round(1.0 - (test1[test1 == 1].size / test1.size), 2))
        results["outlier_percent_y"].append(
            np.round(1.0 - (test2[test2 == 1].size / test2.size), 2))

    return pd.DataFrame(results)
Example #2
0
def model_train(save_img=False, dev=DEV, verbose=True):
    """
    train models

    Loads the engineered features in training mode and hands every
    country's data set to ``_model_train``, using the country key as the
    model tag. ``save_img``, ``dev`` and ``verbose`` are forwarded
    unchanged. Returns ``None``.
    """

    ## load engineered features
    feature_sets = engineer_features(dev=dev, training=True, verbose=verbose)

    if verbose:
        print("Training Models")

    ## options shared by every per-country training call
    shared_options = {"dev": dev, "save_img": save_img, "verbose": verbose}

    ## build, train and save models
    for tag, bundle in feature_sets.items():
        if verbose:
            print("...training model for {}".format(tag.upper()))
        # the dates element of the bundle is not needed for training
        X, y, _dates, feature_names = bundle
        _model_train(X, y, feature_names, tag=tag, **shared_options)
Example #3
0
def model_predict(year, month, day, country, dev=DEV, verbose=True):
    """
    make predictions

    Looks up the feature row for the requested date in the engineered
    (non-training) dataset of ``country``, feeds it to that country's loaded
    model, and records the prediction via ``_update_predict_log``.

    Parameters
    ----------
    year, month, day : str or int
        Date components; each must consist solely of digits. ``month`` and
        ``day`` are zero-padded to two characters, ``year`` is used as given.
    country : str
        Key selecting both the model and the dataset.
    dev : bool
        Forwarded to ``engineer_features``, ``model_load`` and
        ``_update_predict_log``.
    verbose : bool
        When true, prints progress messages; also forwarded to the helpers.

    Returns
    -------
    dict
        ``{"y_pred": y_pred}`` where ``y_pred`` is the output of
        ``model.predict`` for the single queried row.

    Raises
    ------
    Exception
        If no model or dataset exists for ``country``, if any date component
        is not purely numeric, or if the date is not present in the dataset.
    """

    ## start timer for runtime
    time_start = time.time()

    ## load data
    datasets = engineer_features(training=False, dev=dev, verbose=verbose)

    ## load models
    models = model_load(dev=dev, verbose=verbose)

    if verbose:
        print("Make Prediction")

    ## check if the model is available
    if country not in models:
        raise Exception(
            "ERROR (model_predict) - model for country '{}' could not be found"
            .format(country))

    ## check if the data is available
    if country not in datasets:
        raise Exception(
            "ERROR (model_predict) - dataset for country '{}' could not be found"
            .format(country))

    ## ensure the year, month day are numbers
    ## (str() lets integer arguments through; fullmatch also rejects "")
    for d in (year, month, day):
        if re.fullmatch(r"\d+", str(d)) is None:
            raise Exception(
                "ERROR (model_predict) - invalid year, month or day")

    ## get the dataset and model for the given country
    X, y, dates, labels = datasets[country]
    df = pd.DataFrame(X, columns=labels, index=dates)
    model = models[country]

    ## check date
    target_date = "{}-{}-{}".format(year,
                                    str(month).zfill(2),
                                    str(day).zfill(2))

    if verbose:
        print(target_date)

    ## format the index once and reuse it for the lookup and the message
    known_dates = df.index.strftime('%Y-%m-%d')
    if target_date not in known_dates:
        raise Exception(
            "ERROR (model_predict) - {} not in range {} and {}".format(
                target_date,
                known_dates[0],
                known_dates[-1]))

    ## query the data
    query = pd.to_datetime(target_date)
    X_pred = df.loc[query, :].values.reshape(1, -1)

    ## make prediction
    y_pred = model.predict(X_pred)

    ## elapsed wall-clock time formatted as HHH:MM:SS
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update predict log
    _update_predict_log(country.upper(),
                        y_pred,
                        target_date,
                        runtime,
                        MODEL_VERSION,
                        MODEL_VERSION_NOTE,
                        dev=dev,
                        verbose=verbose)

    return {"y_pred": y_pred}