def test_01_train(self):
        """
        Check that update_train_log() produces the test training log.

        A stale "logs/train-test.log" is deleted first, so the final
        existence check looks at a file written after the cleanup.
        """

        log_path = os.path.join("logs", "train-test.log")
        if os.path.exists(log_path):
            os.remove(log_path)

        ## arbitrary values for a single log entry
        shape = (100, 10)
        metrics = {'rmse': 0.5}
        elapsed = "00:00:01"
        version = 0.1
        version_note = "test model"

        ## NOTE(review): the TestCase instance itself is passed as the first
        ## positional argument -- confirm this matches the signature of
        ## update_train_log() in logger.py.
        update_train_log(self, shape, metrics, elapsed, version, version_note,
                         test=True)

        self.assertTrue(os.path.exists(log_path))
Example #2
0
    def test_02_train(self):
        """
        Check that a logged entry can be read back from the log file.

        One entry is written, the log is loaded as CSV and the last value
        of its 'period' column is compared with the tuple that was logged.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## entry to log; only the period is inspected afterwards
        period = ('2017-12-01', '2019-05-31')
        entry = ('netherlands', period, {'rmse': 0.5}, "00:00:01", 0.1,
                 "test model")
        update_train_log(*entry, test=True)

        ## literal_eval turns every stored cell back into a Python object
        logged = pd.read_csv(log_path)
        periods = list(map(literal_eval, logged['period']))
        self.assertEqual(period, periods[-1])
    def test_02_train(self):
        """
        Check that logged evaluation metrics can be read back.

        Logs arbitrary values through update_train_log(), loads the log as
        CSV and asserts that the last 'eval_test' cell equals the dict
        that was logged.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## arbitrary values for a single log entry
        shape = (100, 10)
        metrics = {'rmse': 0.5}
        elapsed = "00:00:01"
        version = 0.1
        version_note = "test model"

        ## NOTE(review): the TestCase instance itself is passed as the first
        ## positional argument -- confirm this matches the signature of
        ## update_train_log() in logger.py.
        update_train_log(self, shape, metrics, elapsed, version, version_note,
                         test=True)

        ## parse every stored cell, then compare the most recent one
        logged = pd.read_csv(log_path)
        parsed_metrics = list(map(literal_eval, logged['eval_test']))
        self.assertEqual(metrics, parsed_metrics[-1])
Example #4
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the log file.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## start from a clean slate so the check below is meaningful
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry with arbitrary values
        update_train_log("United Kingdom",   # country
                         (100, 10),          # data shape
                         {'rmse': 0.5},      # evaluation metrics
                         "00:00:01",         # runtime
                         0.1,                # model version
                         "test model",       # model version note
                         test=True)

        self.assertTrue(os.path.exists(log_path))
Example #5
0
    def test_02_train(self):
        """
        Check that logged metrics can be read back from the log file.

        The last cell of the log's 'rmse' column is parsed and compared
        with the metrics dict that was logged.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## entry to log; only the metrics are inspected afterwards
        metrics = {'rmse': 0.5}
        entry = ("Another Country", "('2017-12-01', '2019-05-29')", metrics,
                 "00:00:01", 0.2, "test model")
        update_train_log(*entry, test=True)

        logged = pd.read_csv(log_path)
        parsed_metrics = list(map(literal_eval, logged['rmse']))
        self.assertEqual(metrics, parsed_metrics[-1])
Example #6
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the dated log file.

        The expected file name is built from LOG_PREFIX and the current
        year and month; any existing file of that name is removed first.
        """
        now = date.today()
        file_name = "{}-train-{}-{}.log".format(LOG_PREFIX, now.year,
                                                now.month)
        log_path = os.path.join(LOG_DIR, file_name)
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry with arbitrary values
        entry = ('india', ('2017-11-29', '2019-05-24'), {'rmse': 0.5},
                 "00:00:01", 0.1, "test model")
        update_train_log(*entry, test=True, prefix=LOG_PREFIX)

        self.assertTrue(os.path.exists(log_path))
Example #7
0
    def test_02_train(self):
        """
        Check that logged evaluation metrics can be read back.

        Writes one entry, loads the log as CSV and compares the last
        'eval_test' cell with the dict that was logged.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## write one entry; only the metrics are inspected afterwards
        metrics = {'rmse': 0.5}
        update_train_log((100, 10),      # data shape
                         metrics,
                         "00:00:01",     # runtime
                         0.1,            # model version
                         "test model",   # model version note
                         test=True)

        logged = pd.read_csv(log_path)
        parsed_metrics = list(map(literal_eval, logged['eval_test']))
        self.assertEqual(metrics, parsed_metrics[-1])
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the dated log file.

        NOTE(review): the entry is written with test=False and the dated
        "train-<year>-<month>.log" file is deleted beforehand -- confirm
        that removing this file from a test run is intended.
        """
        now = date.today()
        file_name = "train-{}-{}.log".format(now.year, now.month)
        log_path = os.path.join("logs", file_name)
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry with arbitrary values
        update_train_log('all',                          # tag
                         ('2018-01-15', '2019-01-15'),   # date range
                         {'rmse': 0.5},                  # evaluation metrics
                         "00:00:01",                     # runtime
                         0.1,                            # model version
                         test=False)

        self.assertTrue(os.path.exists(log_path))
Example #9
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the test log file.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## remove leftovers so the final check looks at a fresh file
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry with arbitrary values
        entry = ("Test Country", ("2018-01-01", "2019-01-01"), {'rmse': 0.5},
                 "00:00:01", 0.1, "test model")
        update_train_log(*entry, test=True)

        self.assertTrue(os.path.exists(log_path))
    def test_02_train(self):
        """
        Check that logged evaluation metrics can be read back.

        NOTE(review): the entry is written with test=False and the dated
        "train-<year>-<month>.log" file is deleted beforehand -- confirm
        that removing this file from a test run is intended.
        """

        now = date.today()
        file_name = "train-{}-{}.log".format(now.year, now.month)
        log_path = os.path.join("logs", file_name)
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry; only the metrics are inspected afterwards
        metrics = {'rmse': 0.5}
        update_train_log('all',                          # tag
                         ('2018-01-15', '2019-01-15'),   # date range
                         metrics,
                         "00:00:01",                     # runtime
                         0.1,                            # model version
                         test=False)

        logged = pd.read_csv(log_path)
        parsed_metrics = list(map(literal_eval, logged['eval_test']))
        self.assertEqual(metrics, parsed_metrics[-1])
Example #11
0
def model_train():
    """
    Fit one Prophet model per country and write a forecast file for each.

    The time series are loaded with fetch_ts() from data/cs_train/data.
    For every country a model is fitted on the 'date'/'revenue' columns,
    a forecast extending 120 periods past the data is computed and written
    to "data/forecasts/forecast_<country>". Finally the training log is
    updated.

    Returns:
        True after all forecasts are written and the log is updated.

    NOTE(review): only the shape of the last forecast is logged, and
    'forecast' is undefined (NameError) when fetch_ts() returns no series --
    confirm whether an empty result needs dedicated handling.
    """
    ## start timer for runtime
    time_start = time.time()
    data_dir = os.path.join("data", "cs_train", "data")
    ts_data = fetch_ts(data_dir)

    for country, df in ts_data.items():
        m = Prophet()
        ## the model is fitted on a two-column frame named 'ds' / 'y'
        df2 = df[["date", "revenue"]]
        df2.columns = ['ds', 'y']
        m.fit(df2)
        future = m.make_future_dataframe(periods=120)
        forecast = m.predict(future)
        ## (a discarded `forecast[[...]].tail()` preview expression was
        ## removed here; its result was never used)
        filename = "data/forecasts/forecast_" + country
        forecast.to_csv(filename)

    ## update the log file
    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)
    test = False
    update_train_log(forecast.shape, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test)

    return True
def _model_train(df, tag, pipe, param_grid, test=False):
    """
    Grid-search, evaluate and persist a model for one tag, then log the run.

    The 'test' flag when set to 'True':
        (1) subsets the data and serializes a test version
        (2) is forwarded to update_train_log() to select the 'test' log

    Args:
        df: input data handed to engineer_features().
        tag: label used in the saved model's file name and in the log entry.
        pipe: estimator or pipeline searched by GridSearchCV.
        param_grid: parameter grid passed to GridSearchCV.
        test: see above.
    """

    ## start timer for runtime
    time_start = time.time()

    X, y, dates = engineer_features(df)

    if test:
        ## keep a random 30% of the rows; the boolean mask preserves the
        ## original row order
        n_samples = int(np.round(0.3 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=False).astype(int)
        ## np.isin replaces the deprecated np.in1d (same result for 1-D input)
        mask = np.isin(np.arange(y.size), subset_indices)
        y = y[mask]
        X = X[mask]
        dates = dates[mask]

    ## Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.25,
                                                        shuffle=True,
                                                        random_state=42)

    grid = GridSearchCV(pipe, param_grid=param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_pred = grid.predict(X_test)
    ## RMSE on the held-out split, rounded to the nearest integer
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))

    ## retrain using all data
    grid.fit(X, y)
    ## "0.1" -> "0_1"; plain str.replace avoids the invalid "\." escape that
    ## the former non-raw regex pattern relied on
    model_name = str(MODEL_VERSION).replace(".", "_")
    if test:
        saved_model = os.path.join(MODEL_DIR,
                                   "test-{}-{}.joblib".format(tag, model_name))
        print("... saving test version of model: {}".format(saved_model))
    else:
        saved_model = os.path.join(MODEL_DIR,
                                   "sl-{}-{}.joblib".format(tag, model_name))
        print("... saving model: {}".format(saved_model))

    joblib.dump(grid, saved_model)

    m, s = divmod(time.time() - time_start, 60)
    h, m = divmod(m, 60)
    runtime = "%03d:%02d:%02d" % (h, m, s)

    ## update log with the first and last date of the data that was used
    update_train_log(tag, (str(dates[0]), str(dates[-1])), {'rmse': eval_rmse},
                     runtime,
                     MODEL_VERSION,
                     test=test)
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the test log file.
        """

        log_path = os.path.join("logs", "train-test.log")

        # remove leftovers so the final check looks at a fresh file
        if os.path.exists(log_path):
            os.remove(log_path)

        # write one entry with arbitrary placeholder values
        update_train_log((100, 10),      # tag
                         "xxx",          # date placeholder
                         {'rmse': 0.5},  # evaluation metrics
                         "00:00:01",     # runtime
                         0.1,            # model version
                         "test model",   # model version note
                         test=True)

        self.assertTrue(os.path.exists(log_path))
Example #14
0
    def test_02_train(self):
        """
        Check that a logged metric can be read back from the dated log file.

        The expected file name is built from LOG_PREFIX and the current
        year and month; the last 'metric' cell is compared with the dict
        that was logged.
        """
        now = date.today()
        file_name = "{}-train-{}-{}.log".format(LOG_PREFIX, now.year,
                                                now.month)
        log_path = os.path.join(LOG_DIR, file_name)

        ## write one entry; only the metric is inspected afterwards
        metric = {'rmse': 0.5}
        entry = ('india', ('2017-11-29', '2019-05-24'), metric, "00:00:01",
                 0.1, "test model")
        update_train_log(*entry, test=True, prefix=LOG_PREFIX)

        logged = pd.read_csv(log_path)
        parsed_metrics = list(map(literal_eval, logged['metric']))
        self.assertEqual(metric, parsed_metrics[-1])
Example #15
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the test log file.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## remove leftovers so the final check looks at a fresh file
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry with arbitrary values
        entry = ("Some Country", "('2017-12-01', '2019-05-29')",
                 {'rmse': 0.5}, "00:00:01", 0.1, "test model")
        update_train_log(*entry, test=True)

        self.assertTrue(os.path.exists(log_path))
Example #16
0
def model_train(clf=None):
    """
    Train a classification pipeline, save it and log the run.

    Args:
        clf: estimator used as the final pipeline step. When None, a
            linear-kernel SVC with probability estimates enabled is used.
    """
    ## start timer for runtime
    started = time.time()

    ## data ingestion followed by a fixed-seed train-test split
    X, y = fetch_data()
    split = train_test_split(X, y, test_size=0.33, random_state=42)
    X_train, X_test, y_train, y_test = split

    if clf is None:
        print("Support Vector Machine will be used per default")
        clf = svm.SVC(C=1.0, kernel='linear', gamma=0.5, probability=True)

    ## preprocessing step followed by the classifier
    model = Pipeline(steps=[('pre', preprocessPipe()), ('clf', clf)])

    ## fit on the training part, evaluate on the held-out part
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    report = classification_report(y_test, predictions, output_dict=True)

    print("... saving model: {}".format(SAVED_MODEL))
    joblib.dump(model, SAVED_MODEL)

    print(pd.DataFrame(report))

    ## elapsed wall-clock time formatted as HHH:MM:SS
    minutes, seconds = divmod(time.time() - started, 60)
    hours, minutes = divmod(minutes, 60)
    runtime = "%03d:%02d:%02d" % (hours, minutes, seconds)

    update_train_log(X.shape, report, runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=False)
def model_train(df=None, test=False):
    """
    Fit an ARIMA(8, 0, 8) model on summed prices per invoice date.

    Args:
        df: input frame with 'invoice_date' and 'price' columns; loaded
            with load_data() when None.
        test: when True the fitted model is saved as "models/test.joblib";
            otherwise it is saved to SAVED_MODEL and the training data is
            pickled to "models/latest-train.pickle". The flag is also
            forwarded to update_train_log().
    """

    ## start timer for runtime
    started = time.time()

    if df is None:
        df = load_data()

    ## total price per invoice date, in date order
    series = df.sort_values(by="invoice_date").groupby(
        "invoice_date")["price"].sum()
    fitted = ARIMA(series, order=(8, 0, 8)).fit(disp=-1)

    if not test:
        print("... saving model: {}".format(SAVED_MODEL))
        joblib.dump(fitted, SAVED_MODEL)

        ## keep the data the saved model was trained on
        print("... saving latest data")
        latest_path = os.path.join("models", 'latest-train.pickle')
        with open(latest_path, 'wb') as handle:
            pickle.dump({'df': df}, handle)
    else:
        print("... saving test version of model")
        joblib.dump(fitted, os.path.join("models", "test.joblib"))

    ## elapsed wall-clock time formatted as HHH:MM:SS
    minutes, seconds = divmod(time.time() - started, 60)
    hours, minutes = divmod(minutes, 60)
    runtime = "%03d:%02d:%02d" % (hours, minutes, seconds)

    ## update the log file; no evaluation metric is computed yet
    update_train_log(len(series), 'eval_test TBD', runtime, MODEL_VERSION,
                     MODEL_VERSION_NOTE, test=test)
Example #18
0
    def test_02_train(self):
        """
        Check that a logged shape can be read back from the dated log file.

        The last 'x_shape' cell of the log is parsed and compared with the
        tuple that was logged.
        """
        now = date.today()
        file_name = "train-{}-{}.log".format(now.year, now.month)
        log_path = os.path.join("logs", file_name)

        ## write one entry; only the shape is inspected afterwards
        shape = (2000, )
        update_train_log(shape, "00:00:02", 0.1, "test model")

        logged = pd.read_csv(log_path)
        parsed_shapes = list(map(literal_eval, logged['x_shape']))
        self.assertEqual(shape, parsed_shapes[-1])
Example #19
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the dated log file.
        """
        now = date.today()
        file_name = "train-{}-{}.log".format(now.year, now.month)
        log_path = os.path.join("logs", file_name)

        ## remove leftovers so the final check looks at a fresh file
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry: shape, runtime, model version, version note
        update_train_log((1000, ), "00:00:01", 0.1, "test model")

        self.assertTrue(os.path.exists(log_path))
Example #20
0
    def test_02_train(self):
        """
        Check that logged evaluation metrics can be read back.

        Writes one entry, loads "logs/all-train-test.log" as CSV and
        compares the last 'eval_test' cell with the dict that was logged.
        """

        log_path = os.path.join("logs", "all-train-test.log")

        ## write one entry; only the metrics are inspected afterwards
        metrics = {'rmse': 0.5}
        update_train_log('all',                          # tag
                         ('2018-01-01', '2018-02-01'),   # date range
                         metrics,
                         "00:00:01",                     # runtime
                         0.1,                            # model version
                         "test model",                   # model version note
                         test=True)

        logged = pd.read_csv(log_path)
        parsed_metrics = list(map(literal_eval, logged['eval_test']))
        self.assertEqual(metrics, parsed_metrics[-1])
Example #21
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the log file.
        """

        log_path = os.path.join("logs", "all-train-test.log")

        ## remove leftovers so the final check looks at a fresh file
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry with arbitrary values
        entry = ('all', ('2018-01-01', '2018-02-01'), {'rmse': 0.5},
                 "00:00:01", 0.1, "test model")
        update_train_log(*entry, test=True)

        self.assertTrue(os.path.exists(log_path))
Example #22
0
    def test_log_training(self):
        """
        Ensure that content can be retrieved from log file

        The entry is written first; the log must then exist and be
        loadable as CSV.
        """

        log_file = os.path.join("logs", "train-test.log")

        ## update the log
        data_shape = (100, 10)
        runtime = "00:00:01"
        model_version = 0.1
        model_version_note = "test model"

        update_train_log(data_shape,
                         runtime,
                         model_version,
                         model_version_note,
                         test=True)

        ## check existence before reading: read_csv raises on a missing file,
        ## so an assertion placed after it could never fail
        self.assertTrue(os.path.exists(log_file))

        ## retrieval succeeds when the file loads without raising
        pd.read_csv(log_file)
Example #23
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the test log file.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## remove leftovers so the final check looks at a fresh file
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry with arbitrary values
        update_train_log("all",                            # tag
                         "('2019-06-01', '2020-09-30')",   # time window
                         {'rmse': 0.5},                    # evaluation metrics
                         "000:00:01",                      # runtime
                         0.1,                              # model version
                         "test model",                     # model version note
                         test=True)

        self.assertTrue(os.path.exists(log_path))
Example #24
0
    def test_02_train(self):
        """
        Check that logged evaluation metrics can be read back.

        Writes one entry, loads the log as CSV and compares the last
        'eval_test' cell with the dict that was logged.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## write one entry; only the metrics are inspected afterwards
        metrics = {'rmse': 0.5}
        entry = ("all", "('2019-06-01', '2020-09-30')", metrics, "000:00:01",
                 0.1, "test model")
        update_train_log(*entry, test=True)

        logged = pd.read_csv(log_path)
        parsed_metrics = list(map(literal_eval, logged['eval_test']))
        self.assertEqual(metrics, parsed_metrics[-1])
Example #25
0
def model_train(mode=None, test=False):
    """
    Train a linear-kernel SVC, save it and log the run.

    Args:
        mode: accepted but not read anywhere in this function body.
        test: forwarded to update_train_log() as its last positional
            argument.
    """

    ## start timer for runtime
    started = time.time()

    ## data ingestion followed by a fixed-seed train-test split
    X, y = fetch_data()
    split = train_test_split(X, y, test_size=0.33, random_state=42)
    X_train, X_test, y_train, y_test = split

    ## model with probability estimates enabled
    model = svm.SVC(C=1.0, kernel='linear', gamma=0.5, probability=True)

    ## fit on the training part, report on the held-out part
    model = model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    print(classification_report(y_test, predictions))

    ## the saved model is the one fitted on the training split only
    ## (retraining on all data is not performed)
    print("... saving model: {}".format(SAVED_MODEL))
    joblib.dump(model, SAVED_MODEL)

    ## elapsed wall-clock time formatted as HHH:MM:SS
    minutes, seconds = divmod(time.time() - started, 60)
    hours, minutes = divmod(minutes, 60)
    runtime = "%03d:%02d:%02d" % (hours, minutes, seconds)

    ## NOTE(review): recent scikit-learn releases deprecate the `squared`
    ## argument -- confirm against the pinned version.
    rmse = mean_squared_error(y_test, predictions, squared=False)
    summary = {'rmse': rmse}

    ## NOTE(review): MODEL_VERSION is passed before runtime here -- confirm
    ## the order against update_train_log()'s signature.
    update_train_log(X.shape, summary, MODEL_VERSION, runtime, test)
Example #26
0
    def test_log_creation(self):
        """
        Check that writing a train-log entry creates the test log file.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## remove leftovers so the final check looks at a fresh file
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry: shape, runtime, model version, version note
        entry = ((100, 10), "00:00:01", 0.1, "test model")
        update_train_log(*entry, test=True)

        self.assertTrue(os.path.exists(log_path))
Example #27
0
    def test_02_train(self):
        """
        Check that a logged period can be read back from the log file.

        Writes one entry, loads the log as CSV and compares the last
        'period' cell with the tuple that was logged.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## write one entry; only the period is inspected afterwards
        period = ('2017-12-01', '2018-12-01')
        update_train_log('united_kingdom',   # tag
                         period,
                         {'rmse': 0.5},      # evaluation metrics
                         "00:00:01",         # runtime
                         0.1,                # model version
                         "test model",       # model version note
                         test=True)

        logged = pd.read_csv(log_path)
        parsed_periods = list(map(literal_eval, logged['period']))
        self.assertEqual(period, parsed_periods[-1])
Example #28
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the test log file.
        """

        log_path = os.path.join("logs", "train-test.log")

        ## remove leftovers so the final check looks at a fresh file
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry with arbitrary values
        entry = ('united_kingdom', ('2017-12-01', '2018-12-01'),
                 {'rmse': 0.5}, "00:00:01", 0.1, "test model")
        update_train_log(*entry, test=True)

        self.assertTrue(os.path.exists(log_path))
Example #29
0
    def test_01_train(self):
        """
        Check that writing a train-log entry creates the dated log file.

        NOTE(review): the entry is written with test=False and the dated
        "train-<year>-<month>.log" file in LOG_DIR is deleted beforehand --
        confirm that removing this file from a test run is intended.
        """
        now = date.today()
        file_name = "train-{}-{}.log".format(now.year, now.month)
        log_path = os.path.join(LOG_DIR, file_name)
        if os.path.exists(log_path):
            os.remove(log_path)

        ## write one entry: date range, runtime, model version, version note
        entry = (('2017-11-29', '2019-05-24'), "00:00:01", 0.1, "test model")
        update_train_log(*entry, test=False)

        self.assertTrue(os.path.exists(log_path))
Example #30
0
    def test_02_train(self):
        """
        Check that a logged model version can be read back.

        Writes one entry with test=False, loads the dated log from LOG_DIR
        as CSV and compares the last 'model_version' value with the one
        that was logged.
        """
        now = date.today()
        file_name = "train-{}-{}.log".format(now.year, now.month)
        log_path = os.path.join(LOG_DIR, file_name)

        ## write one entry; only the model version is inspected afterwards
        version = 0.1
        update_train_log(('2017-11-29', '2019-05-24'),   # date range
                         "00:00:01",                     # runtime
                         version,
                         "test model",                   # model version note
                         test=False)

        logged = pd.read_csv(log_path)
        last_version = logged["model_version"].iloc[-1]
        self.assertEqual(version, last_version)