Example #1
def test(dim, args):
    import os
    from os.path import join
    import torch
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle
    from sklearn.metrics import roc_auc_score, recall_score, accuracy_score

    # hyperparameters
    is_cuda = True
    batch_size = 60
    embedding_dim = 300
    hidden_dim = args.hidden_dim
    weight_dir = 'weights/LSTM/%s' % dim
    weight_file = join(weight_dir, 'best-weights.pth')
    assert os.path.exists(
        weight_file), "The file directory for the saved model doesn't exist"

    # load datasets
    X_t, y_t = loadDatasetForLSTM(dim, 'test')

    # load model and settings for training
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)

    state_dict = torch.load(weight_file)
    model.load_state_dict(state_dict)
    if is_cuda:
        model.cuda()

    em = ExtractWordEmbeddings(emb_type='glove')

    # validate
    y_scores = []
    X_t, y_t = shuffle(X_t, y_t)
    val_batches = batchify(X_t, y_t, batch_size)
    model.eval()
    with torch.no_grad():
        for X_b, y_b in val_batches:
            inputs = torch.tensor(
                padBatch([
                    em.obtain_vectors_from_sentence(sent, True) for sent in X_b
                ])).float()
            targets = torch.tensor(y_b, dtype=torch.float32)
            if is_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs).tolist()
            y_scores.extend(outputs)
    y_preds = (np.array(y_scores) >= 0.5).astype(int)
    auc = roc_auc_score(y_true=y_t, y_score=y_scores)
    rec = recall_score(y_true=y_t, y_pred=y_preds)
    acc = accuracy_score(y_true=y_t, y_pred=y_preds)
    print('AUC: ', round(auc, 2))
    print('REC: ', round(rec, 2))
    print('ACC: ', round(acc, 2))
    with open(join(weight_dir, 'scores.txt'), 'w') as f:
        f.write('AUC: %1.2f\n' % auc)
        f.write('REC: %1.2f\n' % rec)
        f.write('ACC: %1.2f\n' % acc)
    return
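
The function above only reads hidden_dim from the args namespace; a minimal driver might look like the sketch below. The flag names and defaults are assumptions inferred from the attributes accessed above, not the project's actual CLI.

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser()
    # hypothetical flags; only hidden_dim is actually read by test() above
    parser.add_argument('--dim', default='support')
    parser.add_argument('--hidden_dim', type=int, default=300)
    args = parser.parse_args()
    test(args.dim, args)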
Example #2
class LSTMTrainer(Trainer):
    def __init__(self, opt, emb_matrix=None):
        self.opt = opt
        self.emb_matrix = emb_matrix
        self.model = LSTMClassifier(opt, emb_matrix=emb_matrix)
        self.criterion = nn.CrossEntropyLoss()
        self.parameters = [p for p in self.model.parameters() if p.requires_grad]
        if opt['cuda']:
            self.model.cuda()
            self.criterion.cuda()
        self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
    
    def update(self, batch):
        inputs, labels = unpack_batch(batch)

        # Step 1 init and forward
        self.model.train()
        self.optimizer.zero_grad()

        logits = self.model(inputs)
        loss = self.criterion(logits, labels)
        loss_val = loss.item()

        # Step 2 backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.opt['max_grad_norm'])

        # Step 3 update
        self.optimizer.step()
        return loss_val 
    
    def predict(self, batch, unsort=True):
        inputs, labels = unpack_batch(batch)
        
        self.model.eval()

        logits = self.model(inputs)
        loss = self.criterion(logits, labels)
        loss_val = loss.item()
        
        probs = F.softmax(logits, dim=1).detach().cpu().numpy().tolist()
        predictions = np.argmax(logits.detach().cpu().numpy(), axis=1).tolist()
        labels = labels.detach().cpu().numpy().tolist()
        return predictions, probs, labels, loss_val
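
__init__ above delegates optimizer construction to torch_utils.get_optimizer, whose source is not shown here. A minimal sketch of what such a helper might do, assuming it simply maps an optimizer name to the corresponding torch.optim class, is:

import torch

def get_optimizer(name, parameters, lr):
    # map an optimizer name string to a torch.optim optimizer (sketch, not the project's code)
    if name == 'sgd':
        return torch.optim.SGD(parameters, lr=lr)
    elif name == 'adagrad':
        return torch.optim.Adagrad(parameters, lr=lr)
    elif name == 'adam':
        return torch.optim.Adam(parameters, lr=lr)
    elif name == 'adamax':
        return torch.optim.Adamax(parameters, lr=lr)
    else:
        raise ValueError('Unsupported optimizer: ' + name)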
Example #3
def train(dim, args):
    import os
    from os.path import join
    import torch
    from torch import nn, optim
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle

    # hyperparameters
    embedding_dim = 300  # changes only with different word embeddings
    hidden_dim = args.hidden_dim
    max_epochs = args.max_epochs
    is_cuda = True
    batch_size = 60
    lr = args.lr
    n_decreases = 10
    save_dir = 'weights/LSTM/%s' % dim
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    """
    Loading train / validation datasets
    X_tr: a list of tokenized sentences
    y_tr: a list of 0 and 1
    """
    X_tr, y_tr = loadDatasetForLSTM(dim,
                                    'train')  # a list of tokenized sentences
    X_d, y_d = loadDatasetForLSTM(dim, 'dev')

    # load model and settings for training
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)
    if is_cuda:
        model.cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    flag = True
    old_val = np.inf  # previous validation error
    em = ExtractWordEmbeddings(emb_type='glove')
    loss_fn = nn.BCELoss()

    # train model
    epoch = 0
    cnt_decrease = 0
    while flag:
        tr_loss = 0.0
        epoch += 1
        if (epoch > max_epochs) or (cnt_decrease > n_decreases):
            break
        # train
        model.train()
        # for each iteration, shuffles X_tr and y_tr and puts them into batches
        X_tr, y_tr = shuffle(X_tr, y_tr)
        tr_batches = batchify(X_tr, y_tr, batch_size)
        for X_b, y_b in tr_batches:
            # X_b is still a list of tokenized sentences (list of list of words)
            optimizer.zero_grad()
            """
            obtain_vectors_from_sentence(sent=list of words, include_unk=True)
            : changes each word into an embedding, and returns a list of embeddings
            padBatch(list of embedding lists, max_seq=None)
            : for each batch, returns a tensor fixed to the max size, applies zero padding
            """
            inputs = torch.tensor(
                padBatch([
                    em.obtain_vectors_from_sentence(sent, True) for sent in X_b
                ])).float()
            # here, inputs become a tensor of shape (B * seq_len * dim)
            targets = torch.tensor(y_b, dtype=torch.float32)
            if is_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)  # BCELoss expects outputs and targets of the same shape
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()

        print("[Epoch %d] train loss: %1.3f" % (epoch, tr_loss))

        # validate
        model.eval()
        current_loss = 0.0
        X_d, y_d = shuffle(X_d, y_d)
        val_batches = batchify(X_d, y_d, batch_size)
        with torch.no_grad():
            for X_b, y_b in val_batches:
                inputs = torch.tensor(
                    padBatch([
                        em.obtain_vectors_from_sentence(sent, True)
                        for sent in X_b
                    ])).float()
                targets = torch.tensor(y_b, dtype=torch.float32)
                if is_cuda:
                    inputs, targets = inputs.cuda(), targets.cuda()
                outputs = model(inputs)
                loss = loss_fn(outputs, targets)
                current_loss += loss.item()

        print("[Epoch %d] validation loss: %1.3f" % (epoch, current_loss))
        if current_loss < old_val:
            # if current round is better than the previous round
            best_state = model.state_dict()  # save this model
            torch.save(best_state, join(save_dir, 'best-weights.pth'))
            print("Updated model")
            old_val = current_loss
            cnt_decrease = 0
        else:
            # if the current round is doing worse
            cnt_decrease += 1

        if cnt_decrease >= n_decreases:
            flag = False
    return
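
The docstring inside the training loop describes padBatch as zero-padding each batch up to its longest sequence. A self-contained sketch of that padding step, assuming each sequence is a list of fixed-size word vectors (an illustration, not the actual preprocess_data.padBatch), could look like this:

import numpy as np

def pad_batch_sketch(batch, max_seq=None):
    """Pad a batch of variable-length sequences of word vectors with zeros.

    batch: list of sequences, each of shape (seq_len_i, dim)
    returns: array of shape (B, max_len, dim), ready for torch.tensor(...).float()
    """
    batch = [np.asarray(seq, dtype=np.float32) for seq in batch]
    dim = batch[0].shape[1]
    max_len = max_seq or max(seq.shape[0] for seq in batch)
    padded = np.zeros((len(batch), max_len, dim), dtype=np.float32)
    for i, seq in enumerate(batch):
        length = min(seq.shape[0], max_len)
        padded[i, :length] = seq[:length]
    return padded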
Example #4
    def __init__(self,
                 models_dir='./models/lstm_trained_models',
                 embeddings_dir='./embeddings',
                 is_cuda=False):
        """
		@param models_dir: the directory where the LSTM models are stored
		@param embeddings_dir: the directory where the embeddings are stored. The directory must contain the following subdirectories:
		                       word2vec/GoogleNews-vectors-negative300.wv
		                       fasttext/wiki-news-300d-1M-subword.wv
		                       glove/glove.42B.300d.wv
		@param is_cuda: to enable cuda
		"""
        self.is_cuda = is_cuda
        self.models_dir = models_dir
        self.embeddings_dir = embeddings_dir

        #load embeddings
        self.em_glove = ExtractWordEmbeddings('glove',
                                              emb_dir=self.embeddings_dir)
        self.em_word2vec = ExtractWordEmbeddings('word2vec',
                                                 emb_dir=self.embeddings_dir)
        self.em_fasttext = ExtractWordEmbeddings('fasttext',
                                                 emb_dir=self.embeddings_dir)
        self.dimensions_list = [
            'support', 'knowledge', 'conflict', 'power', 'similarity', 'fun',
            'status', 'trust', 'identity', 'romance'
        ]

        #load models
        self.dim2model = {}
        self.dim2embedding = {}

        for dim in self.dimensions_list:
            model = LSTMClassifier(embedding_dim=300, hidden_dim=300)
            if self.is_cuda:
                print(f'Torch version: {torch.__version__}')
                print(f'Torch CUDA available : {torch.cuda.is_available()}')
                if torch.cuda.is_available():
                    print(
                        f'Torch current device : {torch.cuda.current_device()}'
                    )
                    print(f'Torch device count : {torch.cuda.device_count()}')
                    print(
                        f'Torch device name : {torch.cuda.get_device_name(0)}')
                    model.cuda()
                else:
                    print(
                        'Cuda not available. Instantiated the TenDimensionsClassifier with CUDA=False'
                    )
                    self.is_cuda = False
            model.eval()
            for modelname in os.listdir(self.models_dir):
                if ('-best.lstm' in modelname) and (dim in modelname):
                    best_state = torch.load(join(self.models_dir, modelname),
                                            map_location='cpu')
                    model.load_state_dict(best_state)
                    if 'glove' in modelname:
                        em = self.em_glove
                    elif 'word2vec' in modelname:
                        em = self.em_word2vec
                    elif 'fasttext' in modelname:
                        em = self.em_fasttext
                    self.dim2model[dim] = model
                    self.dim2embedding[dim] = em
                    break
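
Based on the default paths in the signature and the class name mentioned in the CUDA warning above, instantiating this classifier presumably looks like the following; this is a hedged sketch, since the rest of the class is not shown here.

# hypothetical usage; requires the trained model weights and embedding files on disk
clf = TenDimensionsClassifier(models_dir='./models/lstm_trained_models',
                              embeddings_dir='./embeddings',
                              is_cuda=False)
print(clf.dimensions_list)    # the ten relational dimensions covered by the models
print(list(clf.dim2model))    # one loaded LSTMClassifier per dimension found on disk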