Example #1
def main():
    # The dataloader has two modes, 'train' and 'test', depending on which dataset is loaded.
    x, y = dataloader(mode='train', reduced=False)
    x_test = dataloader(mode='test', reduced=False)
    x = standardize(x)
    x_test = standardize(x_test)
    config = Config(batch_size=120,
                    num_epochs=400,
                    learning_rate=5 * 10**-4,
                    lambda_=2.15443469003e-05,
                    mode='train')
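    # Train a single regularized logistic classifier on the polynomial
    # feature expansion, then write the test predictions as a submission CSV.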
    log_class = LogisticClassifier(config, (build_polynomial(x), y))
    log_class.train(show_every=10)
    predictions_test = log_class.predict_submission(
        log_class(build_polynomial(x_test)))

    create_csv_submission(np.arange(350000, 350000 + x_test.shape[0]),
                          predictions_test, 'dataset/submission_0x.csv')
Example #2
def main():
    x, y = dataloader(mode='train', reduced=False)
    x_test = dataloader(mode='test', reduced=False)
    x = standardize(x)
    x_test = standardize(x_test)
    config = Config(batch_size=120,
                    num_epochs=300,
                    learning_rate=5 * 10**-4,
                    lambda_=2.15443469003e-05,
                    mode='train')
    ensemble = EnsembleClassifiers(config,
                                   build_polynomial(x),
                                   y,
                                   50,
                                   LogisticClassifier,
                                   label='ensemble_50_log')
    ensemble.train()
    predictions_test = ensemble.predict(ensemble(build_polynomial(x_test)))
    create_csv_submission(np.arange(350000, 350000 + x_test.shape[0]),
                          predictions_test, 'dataset/submission_0x.csv')
def find_best_batch(batch_sizes):
    x, y = dataloader(mode='train', reduced=False)
    x = standardize(x)
    best_size = 0
    best_accuracy = 0
    for idx, batch_size in enumerate(batch_sizes):
        print('Ensemble nr ' + str(idx) + 30 * '=')
        config = Config(batch_size=batch_size, num_epochs=300, learning_rate=5 * 10 ** -4,
                        lambda_=2.16e-05)
        ensemble = EnsembleClassifiers(config, build_polynomial(x), y, 2, LogisticClassifier,
                                       label='ensemble_' + str(idx))
        ensemble.train()
        print("ensemble accuracy " + str(ensemble.accuracy) + 30 * "=")
        if ensemble.accuracy > best_accuracy:
            best_accuracy = ensemble.accuracy
            best_size = batch_size
        print("best_lambda :", best_size)
def find_best_regularizer(lambdas):
    """Hyperparamenter search for regularization constant"""
    x, y = dataloader(mode='train', reduced=False)
    x = standardize(x)
    best_lambda = 0
    best_accuracy = 0
    for idx, lambda_ in enumerate(lambdas):
        print('Ensemble nr ' + str(idx) + 30 * '=')
        config = Config(batch_size=200, num_epochs=200, learning_rate=5 * 10 ** -4, lambda_=lambda_)
        ensemble = EnsembleClassifiers(config, build_polynomial(x), y, 10, LogisticClassifier,
                                       label='ensemble_' + str(idx))
        ensemble.train()
        print("ensemble accuracy " + str(ensemble.accuracy) + 30 * "=")
        if ensemble.accuracy > best_accuracy:
            best_accuracy = ensemble.accuracy
            best_lambda = lambda_
        print("best_lambda :", best_lambda)
Example #3
from torch import optim
import torch
import torch.utils.data
import numpy as np
from torch.autograd import Variable
from torch import nn
import torch.nn.functional as f
from src.utils import dataloader, standardize, split_data, build_polynomial

x, y = dataloader(mode='train', reduced=False)
x = standardize(x)
train_dataset, test_dataset = split_data(x, y, ratio=0.9)
test_data, test_target = test_dataset
train_data, train_target = train_dataset
test_data = build_polynomial(test_data)
train_data = build_polynomial(train_data)
num_features = np.shape(train_data)[1]

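# Wrap the numpy feature/target splits as PyTorch TensorDatasets so they can
# be served in shuffled mini-batches by DataLoader.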
train = torch.utils.data.TensorDataset(
    torch.from_numpy(train_data).type(torch.FloatTensor),
    torch.from_numpy(train_target).type(torch.LongTensor))
train_loader = torch.utils.data.DataLoader(train, batch_size=128, shuffle=True)
test = torch.utils.data.TensorDataset(
    torch.from_numpy(test_data).type(torch.FloatTensor),
    torch.from_numpy(test_target).type(torch.LongTensor))
test_loader = torch.utils.data.DataLoader(test, batch_size=128, shuffle=True)


class SimpleNN(torch.nn.Module):
    # NOTE: the body of this class was truncated in the source listing (the
    # lines that followed were a stray duplicate of find_best_batch). The
    # layers below are a minimal plausible completion, a two-layer
    # feed-forward classifier over the polynomial features built above,
    # not the author's original code; hidden_size is an assumption.
    def __init__(self, batch_size=128, hidden_size=64):
        super(SimpleNN, self).__init__()
        self.batch_size = batch_size
        self.fc1 = nn.Linear(num_features, hidden_size)
        self.fc2 = nn.Linear(hidden_size, 2)

    def forward(self, x):
        x = f.relu(self.fc1(x))
        return self.fc2(x)



if __name__ == '__main__':
    # find_best_batch([20, 40, 60, 80, 100, 120, 140, 160, 180, 200])
    # find_best_regularizer(np.logspace(-5, -2, 10))
    x, y = dataloader(mode='train', reduced=False)
    x_test = dataloader(mode='test', reduced=False)
    # print(x.shape)
    # print(x_test.shape)
    x = standardize(x)
    x_test = standardize(x_test)
    # train_dataset, test_dataset = split_data(x, y, ratio=0.9)
    # train_set = (build_polynomial(train_dataset[0]), train_dataset[1])
    # test_set = (build_polynomial(test_dataset[0]), test_dataset[1])
    # # # # x = dataloader(mode='test', reduced=False)
    # # # # x = standardize(x)
    # # # # x = build_polynomial(x)
    config = Config(batch_size=120, num_epochs=400, learning_rate=5 * 10 ** -4,
                    lambda_=2.15443469003e-05, mode='train')
    log_class = LogisticClassifier(config, (build_polynomial(x), y))
    log_class.train(show_every=10)
Example #4
def run(args):

    train_diagnosis, test_diagnosis = data(args)

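    # Fix every RNG seed (CPU + CUDA) and force deterministic cuDNN kernels
    # so runs are reproducible.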
    SEED = 2021
    torch.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    logging.basicConfig(filename='train.log',
                        filemode='w',
                        level=logging.DEBUG)
    logging.info("Model Name: %s", args.model_name.upper())
    logging.info("Device: %s", device)
    logging.info("Batch Size: %d", args.batch_size)
    logging.info("Learning Rate: %f", args.learning_rate)

    if args.model_name == "bert":

        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam

        bert_train_dataset = BERTdataset(train_diagnosis)
        bert_test_dataset = BERTdataset(test_diagnosis)

        bert_train_loader, bert_val_loader, bert_test_loader = dataloader(
            bert_train_dataset, bert_test_dataset, args.batch_size,
            args.val_split)

        model = BERTclassifier().to(device)

        bert_fit(args.epochs, model, bert_train_loader, bert_val_loader,
                 args.icd_type, opt_fn, loss_fn, learning_rate, device)
        bert_test_results(model, bert_test_loader, args.icd_type, device)

    elif args.model_name == 'gru':
        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam

        counts, vocab2index = count_vocab_index(train_diagnosis,
                                                test_diagnosis)
        rnn_train_dataset = rnndataset(train_diagnosis, vocab2index)
        rnn_test_dataset = rnndataset(test_diagnosis, vocab2index)

        rnn_train_loader, rnn_val_loader, rnn_test_loader = dataloader(
            rnn_train_dataset, rnn_test_dataset, args.batch_size,
            args.val_split)

        w2vmodel = Word2Vec.load(args.w2vmodel)
        weights = get_emb_matrix(w2vmodel, counts)

        gruw2vmodel = GRUw2vmodel(weights_matrix=weights,
                                  hidden_size=256,
                                  num_layers=2,
                                  device=device).to(device)

        fit(args.epochs, gruw2vmodel, rnn_train_loader, rnn_val_loader,
            args.icd_type, opt_fn, loss_fn, learning_rate, device)
        test_results(gruw2vmodel, rnn_test_loader, args.icd_type, device)

    elif args.model_name == 'lstm':
        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam

        counts, vocab2index = count_vocab_index(train_diagnosis,
                                                test_diagnosis)
        rnn_train_dataset = rnndataset(train_diagnosis, vocab2index)
        rnn_test_dataset = rnndataset(test_diagnosis, vocab2index)

        rnn_train_loader, rnn_val_loader, rnn_test_loader = dataloader(
            rnn_train_dataset, rnn_test_dataset, args.batch_size,
            args.val_split)

        w2vmodel = Word2Vec.load(args.w2vmodel)
        weights = get_emb_matrix(w2vmodel, counts)

        lstmw2vmodel = LSTMw2vmodel(weights_matrix=weights,
                                    hidden_size=256,
                                    num_layers=2,
                                    device=device).to(device)

        fit(args.epochs, lstmw2vmodel, rnn_train_loader, rnn_val_loader,
            args.icd_type, opt_fn, loss_fn, learning_rate, device)
        test_results(lstmw2vmodel, rnn_test_loader, args.icd_type, device)

    elif args.model_name == "cnn":

        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam

        cnn_train_dataset = cnndataset(train_diagnosis)
        cnn_test_dataset = cnndataset(test_diagnosis)

        cnn_train_loader, cnn_val_loader, cnn_test_loader = dataloader(
            cnn_train_dataset, cnn_test_dataset, args.batch_size,
            args.val_split)

        model = character_cnn(cnn_train_dataset.vocabulary,
                              cnn_train_dataset.sequence_length).to(device)

        fit(args.epochs, model, cnn_train_loader, cnn_val_loader,
            args.icd_type, opt_fn, loss_fn, learning_rate, device)
        test_results(model, cnn_test_loader, args.icd_type, device)

    elif args.model_name == 'hybrid':

        learning_rate = args.learning_rate
        loss_fn = nn.BCELoss()
        opt_fn = torch.optim.Adam

        counts, vocab2index = count_vocab_index(train_diagnosis,
                                                test_diagnosis)

        hybrid_train_dataset = hybriddataset(train_diagnosis, vocab2index)
        hybrid_test_dataset = hybriddataset(test_diagnosis, vocab2index)

        hybrid_train_loader, hybrid_val_loader, hybrid_test_loader = dataloader(
            hybrid_train_dataset, hybrid_test_dataset, args.batch_size,
            args.val_split)

        w2vmodel = Word2Vec.load(args.w2vmodel)
        weights = get_emb_matrix(w2vmodel, counts)

        model = hybrid(hybrid_train_dataset.vocabulary,
                       hybrid_train_dataset.sequence_length,
                       weights_matrix=weights,
                       hidden_size=256,
                       num_layers=2).to(device)

        hybrid_fit(args.epochs, model, hybrid_train_loader, hybrid_val_loader,
                   args.icd_type, opt_fn, loss_fn, learning_rate, device)
        hybrid_test_results(model, hybrid_test_loader, args.icd_type, device)

    elif args.model_name == 'ovr':

        X_train, y_train = mlmodel_data(train_diagnosis, args.icd_type)
        X_test, y_test = mlmodel_data(test_diagnosis, args.icd_type)

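        # TF-IDF features for the one-vs-rest baseline; the vectorizer is fit
        # on the training split only and reused to transform the test split.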
        tfidf_vectorizer = TfidfVectorizer(max_df=0.8)
        X_train = tfidf_vectorizer.fit_transform(X_train)
        X_test = tfidf_vectorizer.transform(X_test)

        ml_model = train_classifier(X_train, y_train)
        y_predict = ml_model.predict(X_test)

        print('-' * 20 + args.icd_type + '-' * 20)
        mlmodel_result(y_test, y_predict)
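

# run(args) only reads attributes off an argparse-style namespace. A minimal
# sketch of a possible entry point; the flag names mirror the attribute
# accesses above, while every default value here is an assumption, not a
# value from the original source:
if __name__ == '__main__':
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--model_name', default='bert',
                        choices=['bert', 'gru', 'lstm', 'cnn', 'hybrid', 'ovr'])
    parser.add_argument('--batch_size', type=int, default=32)
    parser.add_argument('--learning_rate', type=float, default=1e-4)
    parser.add_argument('--epochs', type=int, default=10)
    parser.add_argument('--val_split', type=float, default=0.1)
    parser.add_argument('--icd_type', default='icd9')
    parser.add_argument('--w2vmodel', default='w2v.model')
    run(parser.parse_args())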