def main():
    """Train a single logistic classifier and write a test submission.

    The dataloader has two modes, 'train' and 'test', depending on which
    dataset should be loaded.
    """
    x, y = dataloader(mode='train', reduced=False)
    x_test = dataloader(mode='test', reduced=False)
    x = standardize(x)
    x_test = standardize(x_test)
    config = Config(batch_size=120,
                    num_epochs=400,
                    learning_rate=5 * 10 ** -4,
                    lambda_=2.15443469003e-05,
                    mode='train')
    log_class = LogisticClassifier(config, (build_polynomial(x), y))
    log_class.train(show_every=10)
    predictions_test = log_class.predict_submission(
        log_class(build_polynomial(x_test)))
    create_csv_submission(np.arange(350000, 350000 + x_test.shape[0]),
                          predictions_test, 'dataset/submission_0x.csv')
def main_ensemble():
    """Train an ensemble of 50 logistic classifiers and write a test submission."""
    x, y = dataloader(mode='train', reduced=False)
    x_test = dataloader(mode='test', reduced=False)
    x = standardize(x)
    x_test = standardize(x_test)
    config = Config(batch_size=120,
                    num_epochs=300,
                    learning_rate=5 * 10 ** -4,
                    lambda_=2.15443469003e-05,
                    mode='train')
    ensemble = EnsembleClassifiers(config, build_polynomial(x), y, 50,
                                   LogisticClassifier,
                                   label='ensemble_50_log')
    ensemble.train()
    predictions_test = ensemble.predict(ensemble(build_polynomial(x_test)))
    create_csv_submission(np.arange(350000, 350000 + x_test.shape[0]),
                          predictions_test, 'dataset/submission_0x.csv')
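# EnsembleClassifiers' internals are not shown in this file. As a generic
# illustration only (a sketch, not the project's implementation), ensembles
# of this kind typically average the members' predicted probabilities and
# threshold the mean; the helper name below is hypothetical.
def ensemble_average_sketch(member_probs):
    """member_probs: list of 1-D numpy arrays of per-sample probabilities."""
    mean_probs = np.mean(np.stack(member_probs), axis=0)  # average the members
    return (mean_probs > 0.5).astype(int)  # threshold at 0.5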
def find_best_batch(batch_sizes):
    """Hyperparameter search for the mini-batch size."""
    x, y = dataloader(mode='train', reduced=False)
    x = standardize(x)
    best_size = 0
    best_accuracy = 0
    for idx, batch_size in enumerate(batch_sizes):
        print('Ensemble nr ' + str(idx) + 30 * '=')
        config = Config(batch_size=batch_size,
                        num_epochs=300,
                        learning_rate=5 * 10 ** -4,
                        lambda_=2.16e-05)
        ensemble = EnsembleClassifiers(config, build_polynomial(x), y, 2,
                                       LogisticClassifier,
                                       label='ensemble_' + str(idx))
        ensemble.train()
        print("ensemble accuracy " + str(ensemble.accuracy) + 30 * "=")
        if ensemble.accuracy > best_accuracy:
            best_accuracy = ensemble.accuracy
            best_size = batch_size
    print("best_batch_size :", best_size)
def find_best_regularizer(lambdas):
    """Hyperparameter search for the regularization constant."""
    x, y = dataloader(mode='train', reduced=False)
    x = standardize(x)
    best_lambda = 0
    best_accuracy = 0
    for idx, lambda_ in enumerate(lambdas):
        print('Ensemble nr ' + str(idx) + 30 * '=')
        config = Config(batch_size=200,
                        num_epochs=200,
                        learning_rate=5 * 10 ** -4,
                        lambda_=lambda_)
        ensemble = EnsembleClassifiers(config, build_polynomial(x), y, 10,
                                       LogisticClassifier,
                                       label='ensemble_' + str(idx))
        ensemble.train()
        print("ensemble accuracy " + str(ensemble.accuracy) + 30 * "=")
        if ensemble.accuracy > best_accuracy:
            best_accuracy = ensemble.accuracy
            best_lambda = lambda_
    print("best_lambda :", best_lambda)
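# Example usage (a sketch mirroring the commented-out calls in the __main__
# block below): a log-spaced grid is the usual choice for the regularizer,
# and np.logspace(-5, -2, 10) brackets the lambda_ (2.15443469003e-05) that
# main() hard-codes. The grids are illustrative, not tuned values.
# find_best_batch([20, 40, 60, 80, 100, 120, 140, 160, 180, 200])
# find_best_regularizer(np.logspace(-5, -2, 10))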
from torch import optim
import torch
import torch.utils.data
import numpy as np
from torch.autograd import Variable
from torch import nn
import torch.nn.functional as f

from src.utils import dataloader, standardize, split_data, build_polynomial

# Load and standardize the training data, split it 90/10, and expand both
# splits with polynomial features.
x, y = dataloader(mode='train', reduced=False)
x = standardize(x)
train_dataset, test_dataset = split_data(x, y, ratio=0.9)
test_data, test_target = test_dataset
train_data, train_target = train_dataset
test_data = build_polynomial(test_data)
train_data = build_polynomial(train_data)
num_features = np.shape(train_data)[1]

# Wrap the numpy arrays in PyTorch datasets and loaders.
train = torch.utils.data.TensorDataset(
    torch.from_numpy(train_data).type(torch.FloatTensor),
    torch.from_numpy(train_target).type(torch.LongTensor))
train_loader = torch.utils.data.DataLoader(train, batch_size=128,
                                           shuffle=True)
test = torch.utils.data.TensorDataset(
    torch.from_numpy(test_data).type(torch.FloatTensor),
    torch.from_numpy(test_target).type(torch.LongTensor))
test_loader = torch.utils.data.DataLoader(test, batch_size=128, shuffle=True)


class SimpleNN(torch.nn.Module):
    def __init__(self, batch_size=128,
if __name__ == '__main__':
    # Hyperparameter searches (disabled by default):
    # find_best_batch([20, 40, 60, 80, 100, 120, 140, 160, 180, 200])
    # find_best_regularizer(np.logspace(-5, -2, 10))
    x, y = dataloader(mode='train', reduced=False)
    x_test = dataloader(mode='test', reduced=False)
    # print(x.shape)
    # print(x_test.shape)
    x = standardize(x)
    x_test = standardize(x_test)
    # train_dataset, test_dataset = split_data(x, y, ratio=0.9)
    # train_set = (build_polynomial(train_dataset[0]), train_dataset[1])
    # test_set = (build_polynomial(test_dataset[0]), test_dataset[1])
    config = Config(batch_size=120,
                    num_epochs=400,
                    learning_rate=5 * 10 ** -4,
                    lambda_=2.15443469003e-05,
                    mode='train')
    log_class = LogisticClassifier(config, (build_polynomial(x), y))
    log_class.train(show_every=10)
def run(args):
    train_diagnosis, test_diagnosis = data(args)

    # Fix all random seeds so runs are reproducible.
    SEED = 2021
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logging.basicConfig(filename='train.log', filemode='w',
                        level=logging.DEBUG)
    logging.info("Model Name: %s", args.model_name.upper())
    logging.info("Device: %s", device)
    logging.info("Batch Size: %d", args.batch_size)
    logging.info("Learning Rate: %f", args.learning_rate)

    if args.model_name == "bert":
        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam
        bert_train_dataset = BERTdataset(train_diagnosis)
        bert_test_dataset = BERTdataset(test_diagnosis)
        bert_train_loader, bert_val_loader, bert_test_loader = dataloader(
            bert_train_dataset, bert_test_dataset, args.batch_size,
            args.val_split)
        model = BERTclassifier().to(device)
        bert_fit(args.epochs, model, bert_train_loader, bert_val_loader,
                 args.icd_type, opt_fn, loss_fn, learning_rate, device)
        bert_test_results(model, bert_test_loader, args.icd_type, device)
    elif args.model_name == 'gru':
        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam
        counts, vocab2index = count_vocab_index(train_diagnosis,
                                                test_diagnosis)
        rnn_train_dataset = rnndataset(train_diagnosis, vocab2index)
        rnn_test_dataset = rnndataset(test_diagnosis, vocab2index)
        rnn_train_loader, rnn_val_loader, rnn_test_loader = dataloader(
            rnn_train_dataset, rnn_test_dataset, args.batch_size,
            args.val_split)
        w2vmodel = Word2Vec.load(args.w2vmodel)
        weights = get_emb_matrix(w2vmodel, counts)
        gruw2vmodel = GRUw2vmodel(weights_matrix=weights, hidden_size=256,
                                  num_layers=2, device=device).to(device)
        fit(args.epochs, gruw2vmodel, rnn_train_loader, rnn_val_loader,
            args.icd_type, opt_fn, loss_fn, learning_rate, device)
        test_results(gruw2vmodel, rnn_test_loader, args.icd_type, device)
    elif args.model_name == 'lstm':
        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam
        counts, vocab2index = count_vocab_index(train_diagnosis,
                                                test_diagnosis)
        rnn_train_dataset = rnndataset(train_diagnosis, vocab2index)
        rnn_test_dataset = rnndataset(test_diagnosis, vocab2index)
        rnn_train_loader, rnn_val_loader, rnn_test_loader = dataloader(
            rnn_train_dataset, rnn_test_dataset, args.batch_size,
            args.val_split)
        w2vmodel = Word2Vec.load(args.w2vmodel)
        weights = get_emb_matrix(w2vmodel, counts)
        lstmw2vmodel = LSTMw2vmodel(weights_matrix=weights, hidden_size=256,
                                    num_layers=2, device=device).to(device)
        fit(args.epochs, lstmw2vmodel, rnn_train_loader, rnn_val_loader,
            args.icd_type, opt_fn, loss_fn, learning_rate, device)
        test_results(lstmw2vmodel, rnn_test_loader, args.icd_type, device)
    elif args.model_name == "cnn":
        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam
        cnn_train_dataset = cnndataset(train_diagnosis)
        cnn_test_dataset = cnndataset(test_diagnosis)
        cnn_train_loader, cnn_val_loader, cnn_test_loader = dataloader(
            cnn_train_dataset, cnn_test_dataset, args.batch_size,
            args.val_split)
        model = character_cnn(cnn_train_dataset.vocabulary,
                              cnn_train_dataset.sequence_length).to(device)
        fit(args.epochs, model, cnn_train_loader, cnn_val_loader,
            args.icd_type, opt_fn, loss_fn, learning_rate, device)
        test_results(model, cnn_test_loader, args.icd_type, device)
    elif args.model_name == 'hybrid':
        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam
        counts, vocab2index = count_vocab_index(train_diagnosis,
                                                test_diagnosis)
        hybrid_train_dataset = hybriddataset(train_diagnosis, vocab2index)
        hybrid_test_dataset = hybriddataset(test_diagnosis, vocab2index)
        hybrid_train_loader, hybrid_val_loader, hybrid_test_loader = dataloader(
            hybrid_train_dataset, hybrid_test_dataset, args.batch_size,
            args.val_split)
        w2vmodel = Word2Vec.load(args.w2vmodel)
        weights = get_emb_matrix(w2vmodel, counts)
        model = hybrid(hybrid_train_dataset.vocabulary,
                       hybrid_train_dataset.sequence_length,
                       weights_matrix=weights, hidden_size=256,
                       num_layers=2).to(device)
        hybrid_fit(args.epochs, model, hybrid_train_loader, hybrid_val_loader,
                   args.icd_type, opt_fn, loss_fn, learning_rate, device)
        hybrid_test_results(model, hybrid_test_loader, args.icd_type, device)
    elif args.model_name == 'ovr':
        # Classical baseline: TF-IDF features with a one-vs-rest classifier.
        X_train, y_train = mlmodel_data(train_diagnosis, args.icd_type)
        X_test, y_test = mlmodel_data(test_diagnosis, args.icd_type)
        tfidf_vectorizer = TfidfVectorizer(max_df=0.8)
        X_train = tfidf_vectorizer.fit_transform(X_train)
        X_test = tfidf_vectorizer.transform(X_test)
        ml_model = train_classifier(X_train, y_train)
        y_predict = ml_model.predict(X_test)
        print('-' * 20 + args.icd_type + '-' * 20)
        mlmodel_result(y_test, y_predict)
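# A minimal sketch of a command-line entry point for run(), assuming the
# argparse interface implied by the attribute accesses above; the flag
# names, defaults, and choices below are illustrative assumptions, not the
# project's actual CLI.
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser(description='Train an ICD-coding model.')
    parser.add_argument('--model_name', default='gru',
                        choices=['bert', 'gru', 'lstm', 'cnn', 'hybrid',
                                 'ovr'])
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--learning_rate', type=float, default=1e-3)
    parser.add_argument('--val_split', type=float, default=0.1)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--icd_type', default='icd9')
    parser.add_argument('--w2vmodel', default='w2v.model',
                        help='path to a trained gensim Word2Vec model')
    run(parser.parse_args())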