Example #1
def model_load(prefix='sl', data_dir=None, training=True):
    """
    example function to load model

    The prefix allows the loading of different models
    """

    if not data_dir:
        localpath = os.path.abspath('')
        data_dir = os.path.join(localpath,"data_dir")
            
    ## find all saved models in ./models that match the prefix
    models = [f for f in os.listdir(os.path.join(".", "models")) if re.search(prefix, f)]

    if len(models) == 0:
        raise Exception("Models with prefix '{}' cannot be found. Did you train them?".format(prefix))

    all_models = {}
    for model in models:
        all_models[re.split("-",model)[1]] = joblib.load(os.path.join(".","models",model))

    ## load data
    ts_data = fetch_ts(data_dir)
    all_data = {}
    for country, df in ts_data.items():
        X,y,dates = engineer_features(df,training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X":X,"y":y,"dates": dates}
        
    return all_data, all_models
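A minimal usage sketch for this loader (a hypothetical call, not part of the source; it assumes models were trained per country as in Example #5, so the model tags line up with the country keys returned by fetch_ts):

all_data, all_models = model_load(prefix='sl', training=False)
for country, model in all_models.items():
    X = all_data[country]["X"]             # engineered features for that country
    print(country, model.predict(X)[:5])   # each saved model is a fitted scikit-learn pipeline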
Example #2
def model1_RandomForest(data_dir):
    ts_data = fetch_ts(data_dir)
    df = ts_data['all']
    X, y, dates = engineer_features(df)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(
        steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

    grid = GridSearchCV(pipe_rf,
                        param_grid=param_grid_rf,
                        cv=5,
                        iid=False,
                        n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))
    return eval_rmse
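The iid argument used here was removed from GridSearchCV in scikit-learn 0.24, and the 'mse'/'mae' criterion names were later replaced by 'squared_error'/'absolute_error'. A minimal sketch of the same search written against a current scikit-learn (same pipeline, updated spellings only):

# sketch for scikit-learn >= 1.2: drop the removed iid argument, use the new criterion names
param_grid_rf = {
    'rf__criterion': ['squared_error', 'absolute_error'],
    'rf__n_estimators': [10, 15, 20, 25]
}
grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)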
Example #3
def get_monitoring_tools(df):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed

    """
    X, y, dates = engineer_features(df)
    X1 = X.to_numpy()
    xpipe = Pipeline(steps=[('pca', PCA(2)),
                            ('clf', EllipticEnvelope(random_state=0, contamination=0.01))])
    xpipe.fit(X1)
    bs_samples = 549
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)
    for b in range(bs_samples):
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X1.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X1[subset_indices, :]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X1.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "clf_X": xpipe,
        "X_source": X1,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return to_return
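A sketch of how the returned dictionary might be consumed when new data arrive; check_for_drift, new_X, and new_y are illustrative names, not part of the source:

def check_for_drift(pm_tools, new_X, new_y):
    # flag drift when the outlier rate or a Wasserstein distance exceeds the bootstrapped threshold
    test = pm_tools["clf_X"].predict(new_X)
    outlier_pct = 100 * (1.0 - (test[test == 1].size / test.size))
    return {
        "outlier_X": outlier_pct >= pm_tools["outlier_X"],
        "wasserstein_X": wasserstein_distance(pm_tools["X_source"].flatten(),
                                              new_X.flatten()) >= pm_tools["wasserstein_X"],
        "wasserstein_y": wasserstein_distance(pm_tools["y_source"],
                                              new_y) >= pm_tools["wasserstein_y"]
    }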
Example #4
def model2_Linearregression(data_dir):
    ts_data = fetch_ts(data_dir)
    df = ts_data['all']
    X, y, dates = engineer_features(df)
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    reg = linear_model.LinearRegression()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))
    return eval_rmse
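A possible way the two baselines might be compared (purely illustrative; data_dir is whatever directory fetch_ts expects):

rmse_rf = model1_RandomForest("data_dir")
rmse_lr = model2_Linearregression("data_dir")
print("random forest RMSE: {}, linear regression RMSE: {}".format(rmse_rf, rmse_lr))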
Example #5
def _model_train(df, tag, test=False):
    """
    example function to train model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file
    """

    ## start timer for runtime
    time_start = time.time()
    
    X,y,dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size),subset_indices)
        y=y[mask]
        X=X[mask]
        dates=dates[mask]
        
    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True, random_state=42)
    ## train a random forest model
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])
    
    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, iid=False, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse =  round(np.sqrt(mean_squared_error(y_test,y_pred)))
    
    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag,model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag,model_name))
        print("... saving model: {}".format(saved_model))
        
    joblib.dump(grid,saved_model)

    m, s = divmod(time.time()-time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d"%(h, m, s)

    ## update log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), eval_rmse, runtime,
                     MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
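This helper is presumably invoked once per country by a wrapper. A minimal sketch of such a wrapper, assuming fetch_ts returns a dict of country DataFrames as in Example #1 (model_train is a hypothetical name, not part of the source):

def model_train(data_dir, test=False):
    # hypothetical wrapper: train and serialize one model per country time series
    if not os.path.isdir(MODEL_DIR):
        os.mkdir(MODEL_DIR)
    ts_data = fetch_ts(data_dir)
    for country, df in ts_data.items():
        _model_train(df, country, test=test)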