def test_02_train(self):
    """
    ensure that content can be retrieved from log file
    """
    log_file = os.path.join(LOG_DIR, "train-test.log")

    ## update the log
    country = 'all'
    timespan = ('2021-04-19', '2021-04-19')
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(country, timespan, eval_test, runtime,
                     model_version, model_version_note, test=True)

    df = pd.read_csv(log_file)
    logged_eval_test = [literal_eval(i) for i in df['eval_test'].copy()][-1]
    self.assertEqual(eval_test, logged_eval_test)
def test_01_train(self):
    """
    ensure log file is created
    """
    log_file = os.path.join(LOG_DIR, "train-test.log")
    if os.path.exists(log_file):
        os.remove(log_file)

    ## update the log
    country = 'all'
    timespan = ('2021-04-19', '2021-04-19')
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(country, timespan, eval_test, runtime,
                     model_version, model_version_note, test=True)
    self.assertTrue(os.path.exists(log_file))
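## The two tests above assume an update_train_log() with this signature and a
## CSV-style log under LOG_DIR. A minimal sketch of a compatible implementation
## (the column names, the unique_id/timestamp fields, and the non-test filename
## are assumptions, not the project's actual logger):
import csv
import os
import time
import uuid

LOG_DIR = "logs"

def update_train_log(country, timespan, eval_test, runtime,
                     model_version, model_version_note, test=False):
    """append a row to the train log, writing the header if the file is new"""
    os.makedirs(LOG_DIR, exist_ok=True)
    log_file = os.path.join(LOG_DIR, "train-test.log" if test else "train.log")
    header = ['unique_id', 'timestamp', 'country', 'timespan', 'eval_test',
              'runtime', 'model_version', 'model_version_note']
    write_header = not os.path.exists(log_file)
    with open(log_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(header)
        # str(dict)/str(tuple) round-trips through literal_eval in the tests
        writer.writerow([uuid.uuid4(), time.time(), country, timespan, eval_test,
                         runtime, model_version, model_version_note])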
def test_02_train(self):
    """
    ensure that content can be retrieved from log file
    """
    log_file = os.path.join("logs", "model_train", "train-test.log")

    ## update the log
    data_shape = (100, 10)
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(data_shape, eval_test, runtime,
                     model_version, model_version_note, test=True)

    df = pd.read_csv(log_file)
    logged_eval_test = [literal_eval(i) for i in df['eval_test'].copy()][-1]
    self.assertEqual(eval_test, logged_eval_test)
def test_02_create_train_log(self):
    """
    test that train log is created
    """
    logfile = the_testlogname("train")
    if os.path.exists(logfile):
        os.remove(logfile)

    # update_train_log(data_shape, eval_summary, model_version, runtime)
    update_train_log((150, 2), {'rmse': 0.8}, "9.9", 0., True)
    self.assertTrue(os.path.exists(logfile))
def test_03_archive_train_data(self):
    """
    test that train data is logged
    """
    logfile = the_testlogname("train")
    if os.path.exists(logfile):
        os.remove(logfile)

    # update_train_log(data_shape, eval_summary, model_version, runtime)
    data_shape = (150, 2)
    eval_summary = {'rmse': 0.8}
    model_version = MODEL_VERSION
    runtime = 0.05
    update_train_log(data_shape, eval_summary, model_version, runtime, True)

    # get the last row of the log
    df = pd.read_csv(logfile, delimiter=',', quotechar='|')
    last = df.tail(1).iloc[0].to_dict()
    self.assertEqual(last['eval_summary'], str(eval_summary))
    self.assertEqual(last['data_shape'], str(data_shape))
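## test_02_create_train_log/test_03_archive_train_data above assume a
## the_testlogname() helper plus an update_train_log(data_shape, eval_summary,
## model_version, runtime, test) variant whose CSV fields are quoted with '|'
## (matching the quotechar passed to pd.read_csv). A minimal sketch under those
## assumptions; the LOG_PREFIX directory and column names are illustrative:
import csv
import os
import time

LOG_PREFIX = "logs"

def the_testlogname(logtype):
    """return the path of the unit-test log file for a given log type"""
    return os.path.join(LOG_PREFIX, "{}-test.log".format(logtype))

def update_train_log(data_shape, eval_summary, model_version, runtime, test=False):
    """append one training run to the train log"""
    os.makedirs(LOG_PREFIX, exist_ok=True)
    logfile = the_testlogname("train") if test else os.path.join(LOG_PREFIX, "train.log")
    header = ['timestamp', 'data_shape', 'eval_summary', 'model_version', 'runtime']
    write_header = not os.path.exists(logfile)
    with open(logfile, 'a', newline='') as csvfile:
        # '|' quoting keeps the commas inside str(dict)/str(tuple) fields intact
        writer = csv.writer(csvfile, delimiter=',', quotechar='|')
        if write_header:
            writer.writerow(header)
        writer.writerow([time.time(), data_shape, eval_summary, model_version, runtime])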
def test_01_train(self):
    """
    ensure log file is created
    """
    log_file = os.path.join("logs", "model_train", "train-test.log")
    if os.path.exists(log_file):
        os.remove(log_file)

    ## update the log
    data_shape = (100, 10)
    eval_test = {'rmse': 0.5}
    runtime = "00:00:01"
    model_version = 0.1
    model_version_note = "test model"

    update_train_log(data_shape, eval_test, runtime,
                     model_version, model_version_note, test=True)
    self.assertTrue(os.path.exists(log_file))
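## The logs/model_train variant of test_01_train/test_02_train assumes a third
## update_train_log() signature. It differs from the first sketch only in
## taking a data_shape instead of country/timespan and in its log directory;
## the column names are again assumptions:
import csv
import os
import time

def update_train_log(data_shape, eval_test, runtime,
                     model_version, model_version_note, test=False):
    """append a row to the model_train log, writing the header if the file is new"""
    log_dir = os.path.join("logs", "model_train")
    os.makedirs(log_dir, exist_ok=True)
    log_file = os.path.join(log_dir, "train-test.log" if test else "train.log")
    header = ['timestamp', 'data_shape', 'eval_test', 'runtime',
              'model_version', 'model_version_note']
    write_header = not os.path.exists(log_file)
    with open(log_file, 'a', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if write_header:
            writer.writerow(header)
        writer.writerow([time.time(), data_shape, eval_test, runtime,
                         model_version, model_version_note])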
def _model_train(df, tag, test=False):
    """
    example function to train a model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file
    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples, replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True, random_state=42)

    ## train a random forest model
    ## (newer scikit-learn versions name these criteria 'squared_error'/'absolute_error')
    param_grid_rf = {
        'rf__criterion': ['mse', 'mae'],
        'rf__n_estimators': [10, 15, 20, 25]
    }

    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('rf', RandomForestRegressor())])

    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update the log
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime, MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
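## Downstream counterpart to the serialization in _model_train(): reloading the
## fitted grid for prediction. The 'all' tag, the query's feature count, and the
## MODEL_DIR/MODEL_VERSION values are illustrative assumptions.
import os
import re

import joblib
import numpy as np

MODEL_DIR = "models"    # assumed to match the constant used above
MODEL_VERSION = 0.1     # assumed version constant

model_name = re.sub(r"\.", "_", str(MODEL_VERSION))
saved_model = os.path.join(MODEL_DIR, "test-all-{}.joblib".format(model_name))
model = joblib.load(saved_model)    # the persisted GridSearchCV pipeline
query = np.zeros((1, 7))            # placeholder row; the real width comes from engineer_features()
y_pred = model.predict(query)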
def model_train(test=False):
    """
    example function to train a model

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) specifies the use of the 'test' log file

    The iris dataset is already small, so the subset is shown only as an example.

    Note that the latest training data is always saved so it can be used by
    performance monitoring tools.
    """

    ## load model parameters from conf/base/parameters.yml
    model_config = load_config("parameters.yml")
    test_size = model_config["model"]["test_size"]
    n_est = model_config["classifier"]["param_grid"]["clf__n_estimators"]
    criterion = model_config["classifier"]["param_grid"]["clf__criterion"]
    max_depth = model_config["classifier"]["param_grid"]["clf__max_depth"]

    ## start timer for runtime
    time_start = time.time()

    ## data ingestion from build_features
    X, y = load_data()
    preprocessor = get_preprocessor()

    ## subset the data to enable faster unittests
    if test:
        n_samples = int(np.round(0.9 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples, replace=False).astype(int)
        mask = np.in1d(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]

    ## perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size,
                                                        random_state=42)

    ## specify parameters and model
    param_grid = {
        'clf__n_estimators': n_est,
        'clf__criterion': criterion,
        'clf__max_depth': max_depth
    }

    print("... grid searching")
    clf = ensemble.RandomForestClassifier()
    pipe = Pipeline(steps=[('pre', preprocessor), ('clf', clf)])
    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    params = grid.best_params_
    params = {re.sub("clf__", "", key): value for key, value in params.items()}

    ## fit model on training data with the best parameters
    clf = ensemble.RandomForestClassifier(**params)
    pipe = Pipeline(steps=[('pre', preprocessor), ('clf', clf)])
    pipe.fit(X_train, y_train)

    y_pred = pipe.predict(X_test)
    eval_test = classification_report(y_test, y_pred, output_dict=True)

    ## retrain using all data
    pipe.fit(X, y)

    if test:
        print("... saving test version of model")
        joblib.dump(pipe, os.path.join("models", "test.joblib"))
    else:
        print("... saving model: {}".format(SAVED_MODEL))
        joblib.dump(pipe, SAVED_MODEL)

    print("... saving latest data")
    data_file = os.path.join("models", "latest-train.pickle")
    with open(data_file, 'wb') as tmp:
        pickle.dump({'y': y, 'X': X}, tmp)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update the log file
    update_train_log(X.shape, eval_test, runtime,
                     MODEL_VERSION, MODEL_VERSION_NOTE, test=test)
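## model_train() above relies on a load_config() helper; a minimal sketch that
## matches the "conf/base/parameters.yml" comment (the conf directory layout is
## taken from that comment, the YAML keys from the lookups above, and the values
## are illustrative):
##
##   # conf/base/parameters.yml
##   model:
##     test_size: 0.25
##   classifier:
##     param_grid:
##       clf__n_estimators: [25, 50, 75]
##       clf__criterion: ['gini', 'entropy']
##       clf__max_depth: [4, 6, null]
import os

import yaml

def load_config(filename, conf_dir=os.path.join("conf", "base")):
    """read a YAML file from the conf directory and return it as a dict"""
    with open(os.path.join(conf_dir, filename)) as f:
        return yaml.safe_load(f)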