Example #1
def trainClf(clf, param_grid_rf):

    data_dir = os.path.join('.', 'data', 'cs-train')
    work_dir = os.path.join('.', 'data', 'work-data')

    aDf = ingestTrainData(data_dir)

    data = getAllTS(aDf, work_dir)

    X, y, dates = engineer_features(data['united_kingdom'])

    # Perform a train-test split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25,
                                                        shuffle=True, random_state=42)
    # train the supplied estimator inside a scaling pipeline
    pipe_rf = Pipeline(steps=[('scaler', StandardScaler()),
                              ('clf', clf)])

    grid = GridSearchCV(pipe_rf, param_grid=param_grid_rf,
                        cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    y_train_pred = grid.predict(X_train)
    y_pred = grid.predict(X_test)
    eval_train_rmse = round(np.sqrt(mean_squared_error(y_train, y_train_pred)))
    eval_rmse = round(np.sqrt(mean_squared_error(y_test, y_pred)))
    print('eval_rmse_test', eval_rmse)
    print('eval_rmse_train', eval_train_rmse)
    print(grid.best_estimator_)
    return grid
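
A minimal call sketch for trainClf, assuming the function above and scikit-learn are importable; the estimator and parameter values are illustrative assumptions, and the grid keys are prefixed with 'clf__' so GridSearchCV reaches the estimator step inside the Pipeline.

from sklearn.ensemble import RandomForestRegressor

# illustrative grid; 'clf__' targets the 'clf' step of the Pipeline
param_grid_rf = {
    'clf__n_estimators': [50, 100],
    'clf__max_depth': [None, 5],
}
grid = trainClf(RandomForestRegressor(random_state=42), param_grid_rf)
print(grid.best_params_)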
Example #2
def load_data(inp_dir=None, work_dir=None, training=None):
    # load data
    idf = ingestTrainData(inp_dir)
    ts_data = getAllTS(idf, work_dir)
    all_data = {}
    for country, df in ts_data.items():
        X, y, dates = engineer_features(df, training=training)
        dates = np.array([str(d) for d in dates])
        all_data[country] = {"X": X, "y": y, "dates": dates}
    return all_data
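
A possible call pattern for load_data; the 'cs-train' / 'work-data' directory layout and the 'united_kingdom' key are assumptions carried over from the other examples.

import os

inp_dir = os.path.join('.', 'data', 'cs-train')
work_dir = os.path.join('.', 'data', 'work-data')
all_data = load_data(inp_dir=inp_dir, work_dir=work_dir, training=False)
# each entry holds the engineered matrix, target and date strings for one country
print(all_data['united_kingdom']['X'].shape, all_data['united_kingdom']['dates'][:3])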
Example #3
def getFeatures():
    # load time series
    work_dir = join(data_dir, 'work-data')

    ts_file_path = join(work_dir, 'ts-data-all.csv')
    df = pd.DataFrame()
    if not exists(ts_file_path):
        # create time series
        idf = ingestTrainData(join('.', 'data'))
        df = getTimeSeries(idf)
    else:
        df = pd.read_csv(ts_file_path)

    X, y, dates = engineer_features(df)
    return X, y, dates
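
Note that getFeatures reads data_dir from module scope rather than taking it as an argument; the sketch below sets it explicitly for illustration (an assumption, not part of the original module) before inspecting the returned features.

import os

data_dir = os.path.join('.', 'data')  # assumed module-level path that getFeatures reads
X, y, dates = getFeatures()
print(X.shape, y.shape, dates[:3])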
Example #4
def model_train(data_dir, test=False, model_dir=None, force_data_load=True):
    """
    funtion to train model given a df

    'mode' -  can be used to subset data essentially simulating a train
    """

    work_dir = os.path.join(data_dir, TSDIR)
    inp_dir = os.path.join(data_dir, 'cs-train')

    if not model_dir:
        model_dir = MODEL_DIR

    if not os.path.isdir(model_dir):
        os.mkdir(model_dir)

    if test:
        print("running training in test mode only uk will be trained")

    idf = pd.DataFrame()
    if force_data_load:
        # print('loading data from ', inp_dir)
        idf = ingestTrainData(inp_dir)
    else:
        train_path = os.path.join(work_dir, 'train-data-cleaned.csv')
        idf = pd.read_csv(train_path)

    # fetch time-series formatted data
    ts_data = getAllTS(idf, work_dir)

    # train a separate model for each country's data set
    for country, df in ts_data.items():

        if test and country not in ['all', 'united_kingdom']:
            continue

        _model_train(df, country, model_dir=model_dir, test=test)
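
An example invocation of model_train; the './data' root is an assumption consistent with the other snippets. With test=True only the 'all' and 'united_kingdom' series are fitted, and models are written to MODEL_DIR unless model_dir is given.

import os

# quick smoke-test run: re-ingests the raw files and trains only the UK / 'all' models
model_train(os.path.join('.', 'data'), test=True, force_data_load=True)

# full run, loading the cleaned training CSV from the work directory instead
model_train(os.path.join('.', 'data'), test=False, force_data_load=False)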
Example #5
def monitoring():

    # load time series
    data_dir = join('.', 'data')
    model_dir = join('.', 'models')
    monitor_dir = join('.', 'monitor')
    if not exists(monitor_dir):
        os.mkdir(monitor_dir)
    work_dir = join(data_dir, 'work-data')

    ts_file_path = join(work_dir, 'ts-data-all.csv')
    df = pd.DataFrame()
    if not exists(ts_file_path):
        # create time series
        idf = ingestTrainData(join('.', 'data'))
        df = getTimeSeries(idf)
    else:
        df = pd.read_csv(ts_file_path)

    X, y, dates = engineer_features(df)

    all_data, all_models = model_load(training=False,
                                      data_dir=data_dir,
                                      model_dir=model_dir,
                                      test=False)

    results = pd.DataFrame(columns=['date', 'y_pred', 'y', 'diff'])
    for idx, d in enumerate(dates):
        date = pd.to_datetime(d)
        error = False
        answ = None
        try:
            answ = model_predict('all',
                                 str(date.year),
                                 str(date.month),
                                 str(date.day),
                                 test=False,
                                 all_data=all_data,
                                 all_models=all_models)
        except Exception:
            print('system error: ' + str(sys.exc_info()[1]))
            error = True
        y_pred = None
        diff = None
        yt = y[idx]
        if not error:
            y_pred = answ['y_pred'][0]

            diff = abs(y_pred - yt)
        results = pd.concat([results, pd.DataFrame([{
            'date': date,
            'y_pred': y_pred,
            'y': yt,
            'diff': diff
        }])], ignore_index=True)
        # take only the last dates
    today = dt.datetime.today()

    monname = "model-monitoring-{}-{}-{}".format(today.year, today.month,
                                                 today.day)
    results.to_csv(join(monitor_dir, monname + ".csv"))

    fig, ax = plt.subplots(1, 1)
    fig.set_size_inches(15, 8)
    ax.set_title('prediction error distribution')
    sns.histplot(results['diff'], bins=50, color='#008899', ax=ax)
    fig.savefig(join(monitor_dir, monname) + '.png', dpi=200)

    statistics_path = join(monitor_dir, 'monitor_statistics.csv')
    today_iso = today.strftime('%Y-%m-%d')

    # score only the rows where a prediction was actually produced
    valid = results.dropna(subset=['y_pred'])
    mse = mean_squared_error(valid['y'].values, valid['y_pred'].values)

    if exists(statistics_path):
        statDF = pd.read_csv(statistics_path)
    else:
        statDF = pd.DataFrame(columns=['date', 'mse'])

    if (statDF['date'] == today_iso).any():
        # update today's entry if the monitoring run is repeated
        statDF.loc[statDF['date'] == today_iso, 'mse'] = mse
    else:
        statDF = pd.concat([statDF, pd.DataFrame([{'date': today_iso, 'mse': mse}])],
                           ignore_index=True)
    statDF.to_csv(statistics_path, index=False)
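
The monitoring pass can be run as a small script; it writes a dated results CSV and an error-distribution plot into ./monitor and updates monitor_statistics.csv. A sketch, assuming the ingestion, feature and model modules used in the examples above are importable.

if __name__ == '__main__':
    monitoring()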