def main(scoring, features, response, output_file):
    """Cross-validate every candidate estimator and append its score to a report.

    Args:
        scoring: Scoring metric forwarded to ``cross_val_score``; it is also
            echoed in the report header.
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
        output_file: Path of the text report. It is opened in append mode,
            so earlier content is kept.
    """
    # get_estimators() is iterated as (name, estimator) pairs below.
    estimators = get_estimators()

    # Features are used as loaded; no rescaling is applied here.
    X = read_processed_data(features)
    y = read_processed_data(response)

    # Plain (unshuffled) 10-fold split. ``random_state`` is intentionally not
    # passed: without ``shuffle=True`` it never influenced the folds, and
    # scikit-learn >= 0.24 raises ValueError for that combination.
    # Swap in ``LeaveOneOut()`` here for leave-one-out validation.
    cv = KFold(n_splits=10)

    with open(output_file, 'a+') as fh:
        fh.write("Scoring metrics: {}\n\n".format(scoring))

    for name, estimator in estimators:
        cv_results = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)
        msg = "{}: Mean: {:.3f} Std: {:.3f}\n".format(
            name, cv_results.mean(), cv_results.std())
        # The file is re-opened per estimator so each finished result is
        # written out before the next estimator starts.
        with open(output_file, 'a+') as fh:
            fh.write(msg)
def main(output_file, features, response, scaler='standardized'):
    """Fit an optionally scaled LDA pipeline, print its CV score and persist it.

    Args:
        output_file: Destination handed to ``joblib.dump`` for the fitted
            pipeline.
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
        scaler: ``'standardized'`` prepends a ``StandardScaler`` step,
            ``'minmax'`` prepends a ``MinMaxScaler`` step; any other value
            leaves the pipeline without a scaling step.
    """
    X = read_processed_data(features)
    y = read_processed_data(response)

    steps = []
    if scaler == 'standardized':
        steps.append(('standardized', StandardScaler()))
    elif scaler == 'minmax':
        steps.append(('minmax', MinMaxScaler()))
    steps.append(('lda', LinearDiscriminantAnalysis()))

    model = Pipeline(steps)
    model.fit(X, y)

    # Evaluate the pipeline with a plain (unshuffled) 10-fold split.
    # ``random_state`` is intentionally not passed: without ``shuffle=True``
    # it never influenced the folds, and scikit-learn >= 0.24 raises
    # ValueError for that combination.
    cv = KFold(n_splits=10)
    results = cross_val_score(model, X, y, cv=cv)
    print(results.mean())

    joblib.dump(model, output_file)

    # Quick sanity check: predict the first five rows.
    # NOTE(review): the sample is read from a fixed path rather than from
    # ``features`` -- confirm this is intended.
    test_data = read_processed_data('data/processed/features.npy')[0:5,]
    print(model.predict(test_data))
def main(scoring, features, response, output_file):
    """Cross-validate every candidate estimator and append a detailed report.

    For each estimator the report receives the mean and standard deviation of
    the cross-validation scores, followed by the confusion matrix and the
    classification report computed from cross-validated predictions.

    Args:
        scoring: Scoring metric forwarded to ``cross_val_score``; it is also
            echoed in the report header.
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
        output_file: Path of the text report. It is opened in append mode,
            so earlier content is kept.
    """
    # get_estimators() is iterated as (name, estimator) pairs below.
    estimators = get_estimators()
    title = "Scoring metrics: {}\n\n".format(scoring)

    X = read_processed_data(features)
    y = read_processed_data(response)

    # Plain (unshuffled) 10-fold split. ``random_state`` is intentionally not
    # passed: without ``shuffle=True`` it never influenced the folds, and
    # scikit-learn >= 0.24 raises ValueError for that combination.
    # Swap in ``LeaveOneOut()`` here for leave-one-out validation.
    cv = KFold(n_splits=10)

    with open(output_file, 'a+') as fh:
        fh.write(title)

    for name, estimator in estimators:
        cv_results = cross_val_score(estimator, X, y, cv=cv, scoring=scoring)

        # Out-of-fold predictions feed the per-class diagnostics.
        y_pred = cross_val_predict(estimator, X, y, cv=cv)
        cls_rpt = classification_report(y, y_pred)
        conf_mat = np.array2string(confusion_matrix(y, y_pred))

        msg = "{}: Mean: {:.3f} Std: {:.3f}\n".format(
            name, cv_results.mean(), cv_results.std())

        # The file is re-opened per estimator so each finished result is
        # written out before the next estimator starts.
        with open(output_file, 'a+') as fh:
            fh.write(msg)
            fh.write('\n')
            fh.write(conf_mat)
            fh.write('\n')
            fh.write(cls_rpt)
            fh.write('\n')
def main(features, response):
    """Print the mean 10-fold CV score of a standardize + LDA pipeline.

    Args:
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
    """
    X = read_processed_data(features)
    y = read_processed_data(response)

    # The second step is named 'LR' although it holds a
    # LinearDiscriminantAnalysis; the name is kept unchanged.
    model = Pipeline([
        ('standardize', StandardScaler()),
        ('LR', LinearDiscriminantAnalysis()),
    ])

    # Plain (unshuffled) 10-fold split. ``random_state`` is intentionally not
    # passed: without ``shuffle=True`` it never influenced the folds, and
    # scikit-learn >= 0.24 raises ValueError for that combination.
    cv = KFold(n_splits=10)
    results = cross_val_score(model, X, y, cv=cv)
    print(results.mean())
def tune_knn(features, response, output_tuning_results, output_tuned_model):
    """Grid-search a k-nearest-neighbours classifier and persist the outcome.

    The search covers ``n_neighbors`` from 1 to 30 and both ``'uniform'`` and
    ``'distance'`` weights, using 10-fold cross-validation scored by accuracy.

    Args:
        features: Location of the processed features, handed to
            ``read_processed_data``.
        response: Location of the processed response, handed to
            ``read_processed_data``.
        output_tuning_results: Path of the text summary; it is overwritten.
        output_tuned_model: Destination handed to ``joblib.dump`` for the
            best estimator found.
    """
    estimator = KNeighborsClassifier()
    param_grid = {
        'n_neighbors': list(range(1, 31)),
        'weights': ['uniform', 'distance'],
    }
    grid = GridSearchCV(estimator, param_grid, cv=10, scoring='accuracy')

    X = read_processed_data(features)
    y = read_processed_data(response)
    grid.fit(X, y)

    # Arguments follow the label order of the template. Previously the
    # estimator and the params were swapped, so each appeared under the
    # other's label.
    tuning_results = 'Best score: {}\nBest estimator: {}\nBest params: {}\n'.format(
        grid.best_score_, grid.best_estimator_, grid.best_params_)

    with open(output_tuning_results, 'w') as fh:
        fh.write(tuning_results)

    joblib.dump(grid.best_estimator_, output_tuned_model)
def main(input_file, output_file, frequency, scenario, regr_vars, multiplier,
         baseline, look_back, look_ahead, test_year):
    """Build the CatBoost data set, train the model, save it and predict.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination handed to ``model.save_model``.
        frequency: Forwarded to ``generate_data`` as ``freq``.
        scenario, regr_vars, multiplier, baseline, look_back, look_ahead:
            Forwarded unchanged to ``generate_data``.
        test_year: Forwarded to ``split_train_test``.
    """
    frame = read_processed_data(input_file)

    generation_options = {
        'freq': frequency,
        'scenario': scenario,
        'regr_vars': regr_vars,
        'multiplier': multiplier,
        'baseline': baseline,
        'look_back': look_back,
        'look_ahead': look_ahead,
    }
    inputs, targets = generate_data(frame, **generation_options)

    # The test targets are returned by the split but not used here.
    train_x, train_y, test_x, _ = split_train_test(
        inputs,
        targets,
        test_year=test_year,
        save_data=True,
        pname='data/processed/CatBoost/',
    )

    print("Generated data for CatBoost")
    print('Using as training features: ', train_x.columns)
    print('Training model...')

    booster = CatBoostModel()
    booster.train(train_x, train_y)
    booster.save_model(output_file)
    booster.make_predictions(test_x, save_to='data/output/CatBoost/preds.pkl')
def main(input_file, output_file):
    """Train a ``RandomForestModel`` on the processed data and save it.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination handed to the model's ``save`` method.
    """
    print("Training model")
    training_frame = read_processed_data(input_file)

    forest = RandomForestModel()
    forest.train(training_frame)
    forest.save(output_file)
def main(input_file, output_file, iterations, scenario, regr_vars, multiplier,
         baseline, look_back, look_ahead, test_year):
    """Build the GPC data set, train the GP classifier on the GPU and predict.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination handed to ``model.save``.
        iterations: Number of optimisation steps to run.
        scenario, regr_vars, multiplier, baseline, look_back, look_ahead:
            Forwarded unchanged to ``generate_data`` (with ``freq='D'``).
        test_year: Forwarded to ``split_train_test``.
    """
    df = read_processed_data(input_file)
    X, y = generate_data(df,
                         freq='D',
                         scenario=scenario,
                         regr_vars=regr_vars,
                         multiplier=multiplier,
                         baseline=baseline,
                         look_back=look_back,
                         look_ahead=look_ahead)
    trainX, trainY, testX, testY = split_train_test(
        X, y, test_year=test_year, save_data=True,
        pname='data/processed/GPC/')

    print("Generated data for GPC")
    print('Training model...')

    # The size of the last dimension of the training inputs sets the model's
    # data_dim.
    x_data_dim = trainX.size(-1)

    # Build the model and likelihood once and move them straight to the GPU.
    # A separate instance that was created and immediately discarded has
    # been removed.
    model = GPClassificationModel(data_dim=x_data_dim).cuda()
    likelihood = gpytorch.likelihoods.BernoulliLikelihood().cuda()

    # Switch both modules to training mode before optimising.
    model.train()
    likelihood.train()

    optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

    # "Loss" for GPs: the variational ELBO, with num_data set to the number
    # of training targets.
    mll = gpytorch.mlls.VariationalELBO(likelihood, model,
                                        num_data=trainY.numel())

    for i in range(iterations):
        optimizer.zero_grad()
        output = model(trainX)
        # Negated because the optimiser minimises while the ELBO is maximised.
        loss = -mll(output, trainY)
        loss.backward()
        print('Iter %d/%d - Loss: %.3f' % (i + 1, iterations, loss.item()))
        optimizer.step()

    model.save(output_file)
    print("Saved model to: ", output_file)
    model.make_predictions(testX, testY, likelihood,
                           save_to='data/output/GPC/preds.pkl')
def main(input_file, output_file, hist_keys, regr_vars, test_year, n_epochs):
    """Build the LSTM data set, train the model on the GPU and save it.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination handed to ``model.save``.
        hist_keys: Forwarded to ``generate_data``.
        regr_vars: Forwarded to ``generate_data``.
        test_year: Forwarded to ``split_train_test``.
        n_epochs: Number of epochs forwarded to ``batch_train``.
    """
    frame = read_processed_data(input_file)
    inputs, targets = generate_data(
        frame,
        freq='D',
        regr_vars=regr_vars,
        hist_keys=hist_keys,
        hist_steps=2,
    )

    # Only the training half of the split is used here.
    train_x, train_y, _, _ = split_train_test(
        inputs, targets, test_year=test_year)

    print("Generated data for LSTM")
    print('Training model...')

    config = {'features': 5, 'forecast_horizon': 1}
    network = Model(config).cuda()
    network.batch_train(train_x, train_y, n_epochs=n_epochs, lr=0.0005)
    network.save(output_file)
def main(input_file, output_file, scenario, regr_vars, multiplier, baseline,
         look_back, look_ahead, test_year):
    """Build the GPR data set, train the regression model, save it and predict.

    Args:
        input_file: Location of the processed data, handed to
            ``read_processed_data``.
        output_file: Destination handed to ``model.save``.
        scenario, regr_vars, multiplier, baseline, look_back, look_ahead:
            Forwarded unchanged to ``generate_data`` (with ``freq='D'``).
        test_year: Forwarded to ``split_train_test``.
    """
    frame = read_processed_data(input_file)

    generation_options = {
        'freq': 'D',
        'scenario': scenario,
        'regr_vars': regr_vars,
        'multiplier': multiplier,
        'baseline': baseline,
        'look_back': look_back,
        'look_ahead': look_ahead,
    }
    inputs, targets = generate_data(frame, **generation_options)

    # The test targets are returned by the split but not used here.
    train_x, train_y, test_x, _ = split_train_test(
        inputs,
        targets,
        test_year=test_year,
        save_data=True,
        pname='data/processed/GPR/',
    )

    print("Generated data for GPR")
    print('Training model...')

    regressor = GPRegressionModel()
    regressor.train(train_x, train_y)
    regressor.save(output_file)
    regressor.make_predictions(test_x, save_to='data/output/GPR/preds.pkl')
def test_feature_and_response_shape():
    """Check that the processed feature and response arrays have the expected shapes."""
    feature_matrix = read_processed_data('data/processed/features.npy')
    response_vector = read_processed_data('data/processed/response.npy')

    n_samples, n_features = 768, 8
    assert feature_matrix.shape == (n_samples, n_features)
    assert response_vector.shape == (n_samples,)