def test_02_train(self):
        """
        ensure that the logged evaluation can be read back from the log file

        Calls update_train_log with test=True, loads the test train log
        with pandas and compares the last 'eval_test' entry, parsed with
        literal_eval, against the dictionary that was passed in.
        """

        train_log_path = os.path.join(LOG_DIR, "train-test.log")

        ## arguments for one log entry
        metrics = {'rmse': 0.5}
        entry = (
            'all',                         # country
            ('2021-04-19', '2021-04-19'),  # timespan
            metrics,                       # eval_test
            "00:00:01",                    # runtime
            0.1,                           # model_version
            "test model",                  # model_version_note
        )
        update_train_log(*entry, test=True)

        ## every 'eval_test' cell is parsed; only the last one is compared
        log_frame = pd.read_csv(train_log_path)
        parsed = [literal_eval(cell) for cell in log_frame['eval_test'].copy()]
        self.assertEqual(metrics, parsed[-1])
    def test_01_train(self):
        """
        ensure log file is created

        Removes any existing test train log, calls update_train_log with
        test=True and checks that the file exists afterwards.
        """

        train_log_path = os.path.join(LOG_DIR, "train-test.log")

        ## start from a clean slate so the existence check is meaningful
        if os.path.exists(train_log_path):
            os.remove(train_log_path)

        ## arguments for one log entry
        entry = (
            'all',                         # country
            ('2021-04-19', '2021-04-19'),  # timespan
            {'rmse': 0.5},                 # eval_test
            "00:00:01",                    # runtime
            0.1,                           # model_version
            "test model",                  # model_version_note
        )
        update_train_log(*entry, test=True)

        self.assertTrue(os.path.exists(train_log_path))
    def test_02_train(self):
        """
        ensure that the logged evaluation can be read back from the log file

        Calls update_train_log with test=True, loads
        logs/model_train/train-test.log with pandas and compares the last
        'eval_test' entry, parsed with literal_eval, against the dictionary
        that was passed in.
        """

        train_log_path = os.path.join("logs", "model_train", "train-test.log")

        ## arguments for one log entry
        metrics = {'rmse': 0.5}
        entry = (
            (100, 10),     # data_shape
            metrics,       # eval_test
            "00:00:01",    # runtime
            0.1,           # model_version
            "test model",  # model_version_note
        )
        update_train_log(*entry, test=True)

        ## every 'eval_test' cell is parsed; only the last one is compared
        log_frame = pd.read_csv(train_log_path)
        parsed = [literal_eval(cell) for cell in log_frame['eval_test'].copy()]
        self.assertEqual(metrics, parsed[-1])
Example #4
0
    def test_02_create_train_log(self):
        """
        test that train log is created

        Deletes any existing test train log, writes one entry through
        update_train_log and checks that the file exists afterwards.
        """

        train_log = the_testlogname("train")

        # remove a stale log so the existence check is meaningful
        if os.path.exists(train_log):
            os.remove(train_log)

        # positional order used here: data_shape, eval_summary,
        # model_version, runtime, followed by the test flag
        shape = (150, 2)
        summary = {'rmse': 0.8}
        version = "9.9"
        elapsed = 0.0
        update_train_log(shape, summary, version, elapsed, True)

        self.assertTrue(os.path.exists(train_log))
Example #5
0
    def test_03_archive_train_data(self):
        """
        test that train data is logged

        Writes one entry through update_train_log into a fresh test train
        log, reads the log back and checks that the last row holds the
        string forms of the evaluation summary and the data shape.
        """

        train_log = the_testlogname("train")

        # remove a stale log so the last row is the one written below
        if os.path.exists(train_log):
            os.remove(train_log)

        # positional order used here: data_shape, eval_summary,
        # model_version, runtime, followed by the test flag
        shape = (150, 2)
        summary = {'rmse': 0.8}
        elapsed = 0.05
        update_train_log(shape, summary, MODEL_VERSION, elapsed, True)

        # last row of the log as a plain dict
        log_frame = pd.read_csv(train_log, delimiter=',', quotechar='|')
        final_row = log_frame.iloc[-1].to_dict()

        self.assertEqual(final_row['eval_summary'], str(summary))
        self.assertEqual(final_row['data_shape'], str(shape))
    def test_01_train(self):
        """
        ensure log file is created

        Removes any existing logs/model_train/train-test.log, calls
        update_train_log with test=True and checks that the file exists
        afterwards.
        """

        train_log_path = os.path.join("logs", "model_train", "train-test.log")

        ## start from a clean slate so the existence check is meaningful
        if os.path.exists(train_log_path):
            os.remove(train_log_path)

        ## arguments for one log entry
        entry = (
            (100, 10),       # data_shape
            {'rmse': 0.5},   # eval_test
            "00:00:01",      # runtime
            0.1,             # model_version
            "test model",    # model_version_note
        )
        update_train_log(*entry, test=True)

        self.assertTrue(os.path.exists(train_log_path))
Example #7
0
def _model_train(df, tag, test=False):
    """
    example function to train a random forest regressor and log the run

    The 'test' flag when set to 'True':
        (1) subsets the data (random 30% of the rows) and serializes a
            test version of the model ("test-<tag>-<version>.joblib")
        (2) is forwarded to update_train_log so that the logger can use
            the 'test' log file

    Parameters
    ----------
    df : passed unchanged to engineer_features, which must return
        (X, y, dates)
    tag : str
        identifier used in the saved model file name and as the first
        field of the train log entry
    test : bool
        see above

    Returns
    -------
    None. The fitted grid search is dumped with joblib into MODEL_DIR and
    one entry is written through update_train_log.
    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        ## draw a random subset without replacement and keep the
        ## original row order by applying it as a boolean mask
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)
    ## train a random forest model
    ## NOTE(review): scikit-learn 1.0 renamed 'mse'/'mae' to
    ## 'squared_error'/'absolute_error' and later releases dropped the old
    ## names -- update these values when the project upgrades scikit-learn
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(
        steps=[('scaler', StandardScaler()), ('rf', RandomForestRegressor())])

    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    ## RMSE on the held-out split, rounded to the nearest integer
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    ## dots in the version would read like file extensions, e.g. 0.1 -> 0_1
    model_name = str(MODEL_VERSION).replace(".", "_")
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log; forward the caller's flag so that only test runs are
    ## marked as such (it was previously hard-coded to True)
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime,
                     MODEL_VERSION,
                     MODEL_VERSION_NOTE,
                     test=test)
Example #8
0
def model_train(test=False):
    """
    example function to grid-search, fit and persist a classifier pipeline

    When the 'test' flag is 'True':
        (1) a random 90% subset of the data is used and the model is
            serialized to models/test.joblib
        (2) the flag is forwarded to update_train_log so the 'test' log
            file can be used

    When the 'test' flag is 'False' the model is written to SAVED_MODEL and
    the training data (X and y) is pickled to models/latest-train.pickle;
    the original author notes this file is meant for performance
    monitoring tools.

    The original author also notes that the iris dataset is already small,
    so the subsetting is shown as an example.
    """

    ## model parameters come from conf/base/parameters.yml
    config = load_config("parameters.yml")
    holdout_size = config["model"]["test_size"]
    grid_config = config["classifier"]["param_grid"]
    search_space = {
        'clf__n_estimators': grid_config["clf__n_estimators"],
        'clf__criterion': grid_config["clf__criterion"],
        'clf__max_depth': grid_config["clf__max_depth"]
    }

    ## runtime is measured from here on
    started_at = time.time()

    ## data ingestion from build_features
    X, y = load_data()

    preprocessor = get_preprocessor()

    ## smaller data set for faster unittests
    if test:
        row_ids = np.arange(X.shape[0])
        keep_count = int(np.round(0.9 * X.shape[0]))
        chosen = np.random.choice(row_ids, keep_count,
                                  replace=False).astype(int)
        keep = np.in1d(np.arange(y.size), chosen)
        y = y[keep]
        X = X[keep]

    ## hold out part of the data for evaluation
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=holdout_size, random_state=42)

    ## search the parameter grid on the training split
    print("... grid searching")
    search_pipe = Pipeline(steps=[('pre', preprocessor),
                                  ('clf', ensemble.RandomForestClassifier())])
    search = GridSearchCV(search_pipe, param_grid=search_space, cv=5,
                          n_jobs=-1)
    search.fit(X_train, y_train)

    ## drop the pipeline step prefix so the best parameters can be handed
    ## to the classifier constructor directly
    best = {}
    for name, value in search.best_params_.items():
        best[re.sub("clf__", "", name)] = value

    ## fit a fresh pipeline with the best parameters and evaluate it
    final_pipe = Pipeline(steps=[('pre', preprocessor),
                                 ('clf',
                                  ensemble.RandomForestClassifier(**best))])
    final_pipe.fit(X_train, y_train)
    predictions = final_pipe.predict(X_test)
    report = classification_report(y_test, predictions, output_dict=True)

    ## retrain using all data
    final_pipe.fit(X, y)

    if not test:
        print("... saving model: {}".format(SAVED_MODEL))
        joblib.dump(final_pipe, SAVED_MODEL)

        print("... saving latest data")
        latest_path = os.path.join("models", 'latest-train.pickle')
        with open(latest_path, 'wb') as handle:
            pickle.dump({'y': y, 'X': X}, handle)
    else:
        print("... saving test version of model")
        joblib.dump(final_pipe, os.path.join("models", "test.joblib"))

    ## elapsed time as HHH:MM:SS
    minutes, seconds = divmod(time.time() - started_at, 60)
    hours, minutes = divmod(minutes, 60)
    elapsed = "%03d:%02d:%02d" % (hours, minutes, seconds)

    ## update the log file
    update_train_log(X.shape,
                     report,
                     elapsed,
                     MODEL_VERSION,
                     MODEL_VERSION_NOTE,
                     test=test)