def main(scoring, features, response, output_file):
    """Cross-validate every candidate estimator and append a score summary.

    Args:
        scoring: Scoring metric forwarded to ``cross_val_score``.
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
        output_file: Path of the report file. It is opened in append mode,
            so any earlier content is kept.
    """
    estimators = get_estimators()
    # Features are used as loaded (no rescaling is applied here).
    X = read_processed_data(features)
    y = read_processed_data(response)
    # random_state only has an effect together with shuffle=True, and recent
    # scikit-learn releases raise ValueError when it is set without shuffling.
    # Omitting it keeps the same deterministic, unshuffled folds.
    # Use cv = LeaveOneOut() instead for leave-one-out validation.
    cv = KFold(n_splits=10)

    with open(output_file, 'a') as fh:
        fh.write("Scoring metrics: {}\n\n".format(scoring))

    for name, estimator in estimators:
        cv_results = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)
        msg = "{}:  Mean: {:.3f}  Std: {:.3f}\n".format(
            name, cv_results.mean(), cv_results.std())
        # Re-open per estimator so each result is written out as soon as it
        # is available.
        with open(output_file, 'a') as fh:
            fh.write(msg)
Beispiel #2
0
def main(output_file, features, response, scaler='standardized'):
    """Fit an (optionally scaled) LDA pipeline, cross-validate it and save it.

    Args:
        output_file: Destination passed to ``joblib.dump`` for the fitted
            pipeline.
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
        scaler: ``'standardized'`` prepends a ``StandardScaler`` step,
            ``'minmax'`` prepends a ``MinMaxScaler`` step; any other value
            leaves the data unscaled.
    """
    X = read_processed_data(features)
    y = read_processed_data(response)

    steps = []
    if scaler == 'standardized':
        steps.append(('standardized', StandardScaler()))
    elif scaler == 'minmax':
        steps.append(('minmax', MinMaxScaler()))
    # Any other value deliberately adds no scaling step.
    steps.append(('lda', LinearDiscriminantAnalysis()))
    model = Pipeline(steps)
    model.fit(X, y)

    # Evaluate pipeline.
    # random_state only has an effect together with shuffle=True, and recent
    # scikit-learn releases raise ValueError when it is set without shuffling.
    # Omitting it keeps the same deterministic, unshuffled folds.
    cv = KFold(n_splits=10)
    results = cross_val_score(model, X, y, cv=cv)
    print(results.mean())

    joblib.dump(model, output_file)

    # Sanity-check predictions on the first five rows of the features that
    # were actually loaded, rather than re-reading a hard-coded file that may
    # not match the ``features`` argument.
    print(model.predict(X[0:5,]))
Beispiel #3
0
def main(scoring, features, response, output_file):
    """Cross-validate every candidate estimator and append a detailed report.

    For each estimator the report contains the mean and standard deviation
    of the cross-validation scores, followed by the confusion matrix and the
    classification report computed from ``cross_val_predict`` predictions.

    Args:
        scoring: Scoring metric forwarded to ``cross_val_score``.
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
        output_file: Path of the report file. It is opened in append mode,
            so any earlier content is kept.
    """
    estimators = get_estimators()

    X = read_processed_data(features)
    y = read_processed_data(response)
    # random_state only has an effect together with shuffle=True, and recent
    # scikit-learn releases raise ValueError when it is set without shuffling.
    # Omitting it keeps the same deterministic, unshuffled folds.
    cv = KFold(n_splits=10)  # or cv = LeaveOneOut()

    with open(output_file, 'a') as fh:
        fh.write("Scoring metrics: {}\n\n".format(scoring))

    for name, estimator in estimators:
        cv_results = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)

        y_pred = cross_val_predict(estimator, X, y, cv=cv)
        cls_rpt = classification_report(y, y_pred)
        conf_mat = np.array2string(confusion_matrix(y, y_pred))

        msg = "{}:  Mean: {:.3f}  Std: {:.3f}\n".format(
            name, cv_results.mean(), cv_results.std())
        # Re-open per estimator so each section is written out as soon as it
        # is available.
        with open(output_file, 'a') as fh:
            fh.write(msg)
            fh.write('\n')
            fh.write(conf_mat)
            fh.write('\n')
            fh.write(cls_rpt)
            fh.write('\n')
Beispiel #4
0
def main(features, response):
    """Print the mean 10-fold cross-validation score of a scaled LDA pipeline.

    Args:
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
    """
    X = read_processed_data(features)
    y = read_processed_data(response)
    model = Pipeline([
        ('standardize', StandardScaler()),
        ('LR', LinearDiscriminantAnalysis()),
    ])
    # random_state only has an effect together with shuffle=True, and recent
    # scikit-learn releases raise ValueError when it is set without shuffling.
    # Omitting it keeps the same deterministic, unshuffled folds.
    cv = KFold(n_splits=10)
    results = cross_val_score(model, X, y, cv=cv)
    print(results.mean())
Beispiel #5
0
def tune_knn(features, response, output_tuning_results, output_tuned_model):
    """Grid-search a ``KNeighborsClassifier`` and persist the outcome.

    The grid covers ``n_neighbors`` from 1 to 30 and both ``'uniform'`` and
    ``'distance'`` weights, scored by accuracy with ``cv=10``.

    Args:
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
        output_tuning_results: Path of the text file that receives the best
            score, estimator and parameters (overwritten).
        output_tuned_model: Destination passed to ``joblib.dump`` for the
            best estimator.
    """
    estimator = KNeighborsClassifier()
    param_grid = dict(n_neighbors=list(range(1, 31)),
                      weights=['uniform', 'distance'])
    grid = GridSearchCV(estimator, param_grid, cv=10, scoring='accuracy')
    X = read_processed_data(features)
    y = read_processed_data(response)
    grid.fit(X, y)
    # Argument order must follow the labels in the template: score,
    # estimator, params.
    tuning_results = 'Best score: {}\nBest estimator: {}\nBest params: {}\n'.format(
        grid.best_score_, grid.best_estimator_, grid.best_params_)
    with open(output_tuning_results, 'w') as fh:
        fh.write(tuning_results)
    joblib.dump(grid.best_estimator_, output_tuned_model)
def main(input_file, output_file, frequency, scenario, regr_vars, multiplier,
         baseline, look_back, look_ahead, test_year):
    """Train a ``CatBoostModel``, save it and write test-set predictions.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination passed to ``model.save_model``.
        frequency: Forwarded to ``generate_data`` as ``freq``.
        scenario, regr_vars, multiplier, baseline, look_back, look_ahead:
            Forwarded unchanged to ``generate_data``.
        test_year: Forwarded to ``split_train_test``.
    """
    frame = read_processed_data(input_file)

    generation_options = dict(
        freq=frequency,
        scenario=scenario,
        regr_vars=regr_vars,
        multiplier=multiplier,
        baseline=baseline,
        look_back=look_back,
        look_ahead=look_ahead,
    )
    inputs, targets = generate_data(frame, **generation_options)

    # The test targets are returned by the split but not used here.
    train_inputs, train_targets, test_inputs, _ = split_train_test(
        inputs,
        targets,
        test_year=test_year,
        save_data=True,
        pname='data/processed/CatBoost/')
    print("Generated data for CatBoost")
    print('Using as training features: ', train_inputs.columns)
    print('Training model...')

    booster = CatBoostModel()
    booster.train(train_inputs, train_targets)
    booster.save_model(output_file)
    booster.make_predictions(test_inputs,
                             save_to='data/output/CatBoost/preds.pkl')
def main(input_file, output_file):
    """Train a ``RandomForestModel`` on the processed data and save it.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination passed to the model's ``save`` method.
    """
    print("Training model")

    training_frame = read_processed_data(input_file)
    forest = RandomForestModel()
    forest.train(training_frame)
    forest.save(output_file)
def main(input_file, output_file, iterations, scenario, regr_vars, multiplier,
         baseline, look_back, look_ahead, test_year):
    """Train a ``GPClassificationModel`` on the GPU, save it and predict.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination passed to ``model.save``.
        iterations: Number of optimizer steps to run.
        scenario, regr_vars, multiplier, baseline, look_back, look_ahead:
            Forwarded unchanged to ``generate_data`` (with ``freq='D'``).
        test_year: Forwarded to ``split_train_test``.
    """
    df = read_processed_data(input_file)

    X, y = generate_data(df,
                         freq='D',
                         scenario=scenario,
                         regr_vars=regr_vars,
                         multiplier=multiplier,
                         baseline=baseline,
                         look_back=look_back,
                         look_ahead=look_ahead)

    trainX, trainY, testX, testY = split_train_test(
        X, y, test_year=test_year, save_data=True, pname='data/processed/GPC/')
    print("Generated data for GPC")
    print('Training model...')

    x_data_dim = trainX.size(-1)
    # Build the model and likelihood once and move them to the GPU; a second,
    # immediately discarded model instance is not needed.
    model = GPClassificationModel(data_dim=x_data_dim).cuda()
    likelihood = gpytorch.likelihoods.BernoulliLikelihood().cuda()
    # Find optimal model hyperparameters
    model.train()
    likelihood.train()
    # Use the Adam optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)
    # "Loss" for GPs - the variational ELBO.
    # num_data is the number of training targets.
    mll = gpytorch.mlls.VariationalELBO(likelihood,
                                        model,
                                        num_data=trainY.numel())

    # Training loop: minimise the negative ELBO.
    for i in range(iterations):
        optimizer.zero_grad()
        output = model(trainX)
        loss = -mll(output, trainY)
        loss.backward()
        print('Iter %d/%d - Loss: %.3f' % (i + 1, iterations, loss.item()))
        optimizer.step()

    model.save(output_file)
    print("Saved model to: ", output_file)
    model.make_predictions(testX,
                           testY,
                           likelihood,
                           save_to='data/output/GPC/preds.pkl')
def main(input_file, output_file, hist_keys, regr_vars, test_year, n_epochs):
    """Train the LSTM ``Model`` on the GPU and save it.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination passed to the model's ``save`` method.
        hist_keys, regr_vars: Forwarded to ``generate_data`` (with
            ``freq='D'`` and ``hist_steps=2``).
        test_year: Forwarded to ``split_train_test``.
        n_epochs: Forwarded to ``batch_train``.
    """
    frame = read_processed_data(input_file)
    inputs, targets = generate_data(
        frame,
        freq='D',
        regr_vars=regr_vars,
        hist_keys=hist_keys,
        hist_steps=2,
    )
    # Only the training portion of the split is used here.
    train_inputs, train_targets, _test_inputs, _test_targets = split_train_test(
        inputs, targets, test_year=test_year)
    print("Generated data for LSTM")
    print('Training model...')

    model_config = {'features': 5, 'forecast_horizon': 1}
    network = Model(model_config).cuda()
    network.batch_train(train_inputs, train_targets,
                        n_epochs=n_epochs, lr=0.0005)
    network.save(output_file)
Beispiel #10
0
def main(input_file, output_file, scenario, regr_vars, multiplier, baseline,
         look_back, look_ahead, test_year):
    """Train a ``GPRegressionModel``, save it and write test-set predictions.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination passed to the model's ``save`` method.
        scenario, regr_vars, multiplier, baseline, look_back, look_ahead:
            Forwarded unchanged to ``generate_data`` (with ``freq='D'``).
        test_year: Forwarded to ``split_train_test``.
    """
    frame = read_processed_data(input_file)

    generation_options = dict(
        freq='D',
        scenario=scenario,
        regr_vars=regr_vars,
        multiplier=multiplier,
        baseline=baseline,
        look_back=look_back,
        look_ahead=look_ahead,
    )
    inputs, targets = generate_data(frame, **generation_options)

    # The test targets are returned by the split but not used here.
    train_inputs, train_targets, test_inputs, _ = split_train_test(
        inputs,
        targets,
        test_year=test_year,
        save_data=True,
        pname='data/processed/GPR/')
    print("Generated data for GPR")
    print('Training model...')

    regressor = GPRegressionModel()
    regressor.train(train_inputs, train_targets)
    regressor.save(output_file)
    regressor.make_predictions(test_inputs,
                               save_to='data/output/GPR/preds.pkl')
Beispiel #11
0
def test_feature_and_response_shape():
    """Check that the processed feature and response data have the expected shapes."""
    n_samples, n_features = 768, 8

    feature_matrix = read_processed_data('data/processed/features.npy')
    response_vector = read_processed_data('data/processed/response.npy')

    assert feature_matrix.shape == (n_samples, n_features)
    assert response_vector.shape == (n_samples,)