Example #1
    def __init__(self, opt, emb_matrix=None):
        self.opt = opt
        self.emb_matrix = emb_matrix
        self.model = LSTMClassifier(opt, emb_matrix=emb_matrix)
        self.criterion = nn.CrossEntropyLoss()
        self.parameters = [p for p in self.model.parameters() if p.requires_grad]
        if opt['cuda']:
            self.model.cuda()
            self.criterion.cuda()
        self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
Example #2
def test(dim, args):
    import os
    from os.path import join
    import torch
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle
    from sklearn.metrics import roc_auc_score, recall_score, accuracy_score

    # hyperparameters
    is_cuda = True
    batch_size = 60
    embedding_dim = 300
    hidden_dim = args.hidden_dim
    weight_dir = 'weights/LSTM/%s' % dim
    weight_file = join(weight_dir, 'best-weights.pth')
    assert os.path.exists(
        weight_file), "The file directory for the saved model doesn't exist"

    # load datasets
    X_t, y_t = loadDatasetForLSTM(dim, 'test')

    # load model and settings for training
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)

    state_dict = torch.load(weight_file)
    model.load_state_dict(state_dict)
    if is_cuda:
        model.cuda()

    em = ExtractWordEmbeddings(emb_type='glove')

    # validate
    y_scores = []
    X_t, y_t = shuffle(X_t, y_t)
    val_batches = batchify(X_t, y_t, batch_size)
    model.eval()
    with torch.no_grad():
        for X_b, y_b in val_batches:
            inputs = torch.tensor(
                padBatch([
                    em.obtain_vectors_from_sentence(sent, True) for sent in X_b
                ])).float()
            targets = torch.tensor(y_b, dtype=torch.float32)
            if is_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs).tolist()
            y_scores.extend(outputs)
    y_preds = np.array(np.array(y_scores) >= 0.5, dtype=int)
    auc = roc_auc_score(y_true=y_t, y_score=y_scores)
    rec = recall_score(y_true=y_t, y_pred=y_preds)
    acc = accuracy_score(y_true=y_t, y_pred=y_preds)
    print('AUC: ', round(auc, 2))
    print('REC: ', round(rec, 2))
    print('ACC: ', round(acc, 2))
    with open(join(weight_dir, 'scores.txt'), 'w') as f:
        f.write('AUC: %1.2f\n' % auc)
        f.write('REC: %1.2f\n' % rec)
        f.write('ACC: %1.2f\n' % acc)
    return
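
The test() function above (and the train() function in Example #6) calls batchify and padBatch from preprocess_data, which are not shown here. A minimal sketch of helpers matching those call sites, assuming obtain_vectors_from_sentence returns a list of per-word vectors of equal dimension:

import numpy as np

def batchify(X, y, batch_size):
    # Yield (X_batch, y_batch) chunks of at most batch_size samples each.
    for i in range(0, len(X), batch_size):
        yield X[i:i + batch_size], y[i:i + batch_size]

def padBatch(vector_lists, max_seq=None):
    # Zero-pad a list of (seq_len, dim) embedding lists to a common length so
    # they can be stacked into a single (batch, max_len, dim) array.
    max_len = max_seq or max(len(v) for v in vector_lists)
    dim = len(vector_lists[0][0])
    out = np.zeros((len(vector_lists), max_len, dim), dtype=np.float32)
    for i, vecs in enumerate(vector_lists):
        vecs = np.asarray(vecs, dtype=np.float32)[:max_len]
        out[i, :len(vecs)] = vecs
    return out
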
Example #3
class LSTMTrainer(Trainer):
    def __init__(self, opt, emb_matrix=None):
        self.opt = opt
        self.emb_matrix = emb_matrix
        self.model = LSTMClassifier(opt, emb_matrix=emb_matrix)
        self.criterion = nn.CrossEntropyLoss()
        self.parameters = [p for p in self.model.parameters() if p.requires_grad]
        if opt['cuda']:
            self.model.cuda()
            self.criterion.cuda()
        self.optimizer = torch_utils.get_optimizer(opt['optim'], self.parameters, opt['lr'])
    
    def update(self, batch):
        inputs, labels = unpack_batch(batch)

        # Step 1 init and forward
        self.model.train()
        self.optimizer.zero_grad()

        logits = self.model(inputs)
        loss = self.criterion(logits, labels)
        loss_val = loss.item()

        # Step 2 backward
        loss.backward()
        torch.nn.utils.clip_grad_norm_(self.model.parameters(), self.opt['max_grad_norm'])

        # Step 3 update
        self.optimizer.step()
        return loss_val 
    
    def predict(self, batch, unsort=True):
        inputs, labels = unpack_batch(batch)
        
        self.model.eval()

        logits = self.model(inputs)
        loss = self.criterion(logits, labels)
        loss_val = loss.item()
        
        probs = F.softmax(logits, 1).data.cpu().numpy().tolist()
        predictions = np.argmax(logits.data.cpu().numpy(), axis=1).tolist()
        labels = labels.data.cpu().numpy().tolist()
        return predictions, probs, labels, loss_val
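
LSTMTrainer relies on torch_utils.get_optimizer and unpack_batch, which are defined elsewhere in that project. A plausible minimal version of both (an assumption, not the project's actual code), treating a batch as a simple (inputs, labels) pair:

import torch.optim as optim

def get_optimizer(name, parameters, lr):
    # Map the string in opt['optim'] to a torch.optim optimizer instance.
    name = name.lower()
    if name == 'sgd':
        return optim.SGD(parameters, lr=lr)
    if name == 'adagrad':
        return optim.Adagrad(parameters, lr=lr)
    if name == 'adam':
        return optim.Adam(parameters, lr=lr)
    raise ValueError('Unsupported optimizer: %s' % name)

def unpack_batch(batch, cuda=False):
    # Split a batch into input and label tensors, moving them to the GPU if requested.
    inputs, labels = batch
    if cuda:
        inputs, labels = inputs.cuda(), labels.cuda()
    return inputs, labels
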
Example #4
import time
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from utils.dataloader import TIMITDataset
from utils.dataloader import pad_seqs_to_batch
from models.lstm import LSTMClassifier
import torch.backends.cudnn as cudnn
from utils import train
device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    cudnn.benchmark = True

if __name__ == "__main__":
    net = LSTMClassifier(39, 48, n_hidden=78, num_layers=3)
    traindata = TIMITDataset(root="./data", split="train")
    trainloader = DataLoader(dataset=traindata,
                             batch_size=100,
                             shuffle=True,
                             collate_fn=pad_seqs_to_batch)
    # ==============================
    optimizer = optim.Adam(net.parameters(), lr=0.05)
    net.to(device)
    # net.train()
    for epoch in range(30):
        train(trainloader, net, optimizer, device=device)
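
The train function imported from utils is not shown. A sketch of one epoch of frame-level cross-entropy training, under the assumption that each batch produced by pad_seqs_to_batch is an (inputs, targets) pair of padded tensors and that the network returns per-frame logits:

import torch.nn as nn

def train(loader, net, optimizer, device='cpu'):
    # One training epoch: forward, cross-entropy loss, backward, update.
    criterion = nn.CrossEntropyLoss()
    net.train()
    total_loss = 0.0
    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        logits = net(inputs)
        # Flatten (batch, seq_len, n_class) logits and (batch, seq_len) targets
        # so the loss sees one prediction per frame.
        loss = criterion(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print('train loss: %.3f' % (total_loss / max(len(loader), 1)))
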
Example #5
                         batch_size=Config.batch_size,
                         collate_fn=pad_seqs_to_batch)
# ========================================================================
# teacher
teacher = BiLSTMClassifier(n_feature=Config.n_features,
                           n_class=Config.n_classes,
                           n_hidden=Config.n_hidden_nodes,
                           num_layers=3)
teacher_save = torch.load(
    Config.teacher_tar_fmt.format(plbl=Config.part_labeled))
teacher.load_state_dict(teacher_save['state_dict'])
teacher.to(device)
# ========================================================================
# student
student = LSTMClassifier(n_feature=Config.n_features,
                         n_class=Config.n_classes,
                         n_hidden=Config.n_hidden_nodes,
                         num_layers=3)
student.to(device)
# ========================================================================
# Pre-compute the teacher's logits if the loader is not shuffled
if not Config.shuffle:
    teacher.eval()
    target_logits_list = []
    with torch.no_grad():
        for pack_inputs, _ in tqdm(trainloader, desc="TeacherTagging"):
            pack_inputs = pack_inputs.to(device)
            target_logit = teacher(pack_inputs)
            target_logits_list.append(target_logit)
else:
    target_logits_list = None
# =======================================================================
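
Example #5 caches the teacher's logits so a student can later be trained against them; the snippet ends before the loss is defined. A common way to consume such logits is a temperature-scaled distillation loss. This is a generic sketch, not taken from the project above; it assumes logits flattened to shape (N, n_class), and T and alpha are illustrative hyperparameters:

import torch.nn.functional as F

def distillation_loss(student_logits, teacher_logits, labels, T=2.0, alpha=0.5):
    # Soft-target term: KL divergence between temperature-scaled distributions,
    # rescaled by T*T; hard-target term: the usual cross-entropy on true labels.
    soft = F.kl_div(F.log_softmax(student_logits / T, dim=-1),
                    F.softmax(teacher_logits / T, dim=-1),
                    reduction='batchmean') * (T * T)
    hard = F.cross_entropy(student_logits, labels)
    return alpha * soft + (1.0 - alpha) * hard
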
Example #6
def train(dim, args):
    import os
    from os.path import join
    import torch
    from torch import nn, optim
    import numpy as np
    from features import ExtractWordEmbeddings
    from preprocess_data import batchify, padBatch
    from models.lstm import LSTMClassifier
    from sklearn.utils import shuffle

    # hyperparameters
    embedding_dim = 300  # changes only with different word embeddings
    hidden_dim = args.hidden_dim
    max_epochs = args.max_epochs
    is_cuda = True
    batch_size = 60
    lr = args.lr
    n_decreases = 10
    save_dir = 'weights/LSTM/%s' % dim
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    """
    Loading train / validation datasets
    X_tr: a list of tokenized sentences
    y_tr: a list of 0 and 1
    """
    X_tr, y_tr = loadDatasetForLSTM(dim,
                                    'train')  # a list of tokenized sentences
    X_d, y_d = loadDatasetForLSTM(dim, 'dev')

    # load model and settings for training
    model = LSTMClassifier(embedding_dim=embedding_dim, hidden_dim=hidden_dim)
    if is_cuda:
        model.cuda()
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    flag = True
    old_val = np.inf  # previous validation error
    em = ExtractWordEmbeddings(emb_type='glove')
    loss_fn = nn.BCELoss()

    # train model
    epoch = 0
    cnt_decrease = 0
    while flag:
        tr_loss = 0.0
        epoch += 1
        if (epoch > max_epochs) or (cnt_decrease > n_decreases):
            break
        # train
        model.train()
        # for each iteration, shuffles X_tr and y_tr and puts them into batches
        X_tr, y_tr = shuffle(X_tr, y_tr)
        tr_batches = batchify(X_tr, y_tr, batch_size)
        for X_b, y_b in tr_batches:
            # X_b is still a list of tokenized sentences (list of list of words)
            optimizer.zero_grad()
            """
            obtain_vectors_from_sentence(sent=list of words, include_unk=True)
            : changes each word into an embedding, and returns a list of embeddings
            padBatch(list of embedding lists, max_seq=None)
            : for each batch, returns a tensor fixed to the max size, applies zero padding
            """
            inputs = torch.tensor(
                padBatch([
                    em.obtain_vectors_from_sentence(sent, True) for sent in X_b
                ])).float()
            # here, inputs become a tensor of shape (B * seq_len * dim)
            targets = torch.tensor(y_b, dtype=torch.float32)
            if is_cuda:
                inputs, targets = inputs.cuda(), targets.cuda()
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)
            loss.backward()
            tr_loss += loss.item()
            optimizer.step()

        print("[Epoch %d] train loss: %1.3f" % (epoch, tr_loss))

        # validate
        model.eval()
        current_loss = 0.0
        X_d, y_d = shuffle(X_d, y_d)
        val_batches = batchify(X_d, y_d, batch_size)
        with torch.no_grad():
            for X_b, y_b in val_batches:
                inputs = torch.tensor(
                    padBatch([
                        em.obtain_vectors_from_sentence(sent, True)
                        for sent in X_b
                    ])).float()
                targets = torch.tensor(y_b, dtype=torch.float32)
                if is_cuda:
                    inputs, targets = inputs.cuda(), targets.cuda()
                outputs = model(inputs)
                loss = loss_fn(outputs, targets)
                current_loss += loss.item()

        print("[Epoch %d] validation loss: %1.3f" % (epoch, current_loss))
        if current_loss < old_val:
            # if current round is better than the previous round
            best_state = model.state_dict()  # save this model
            torch.save(best_state, join(save_dir, 'best-weights.pth'))
            print("Updated model")
            old_val = current_loss
            cnt_decrease = 0
        else:
            # if the current round is doing worse
            cnt_decrease += 1

        if cnt_decrease >= n_decreases:
            flag = False
    return
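
The LSTMClassifier used in Examples #2 and #6 lives in models/lstm.py and is not shown. A hypothetical reconstruction consistent with how it is called there: inputs are (batch, seq_len, embedding_dim) tensors of pre-looked-up word vectors, and the output is a per-sample probability suitable for nn.BCELoss.

import torch
import torch.nn as nn

class LSTMClassifier(nn.Module):
    def __init__(self, embedding_dim=300, hidden_dim=300):
        super().__init__()
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, 1)

    def forward(self, x):
        # x: (batch, seq_len, embedding_dim) of zero-padded word vectors
        _, (h_n, _) = self.lstm(x)            # h_n: (1, batch, hidden_dim)
        logit = self.fc(h_n[-1]).squeeze(-1)  # (batch,)
        return torch.sigmoid(logit)           # probabilities for nn.BCELoss
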
Example #7
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--phase', type=str, help='Train or test.')
    parser.add_argument('--embedding_file',
                        type=str,
                        help='Filename to save the trained word embeddings.')
    parser.add_argument('--model_path',
                        type=str,
                        help='The file of the lstm model.')
    parser.add_argument('--test_file',
                        type=str,
                        help='The file of the testing data.')
    parser.add_argument('--epochs',
                        type=int,
                        default=100,
                        help='The number of training epochs.')
    parser.add_argument('--batch_size',
                        type=int,
                        default=50,
                        help='The batch size of the training phase.')
    args = parser.parse_args()
    phase = args.phase
    embedding_file = args.embedding_file
    model_path = args.model_path

    embeddings, word2id, id2word = load_embedding(embedding_file)
    id2label = dict({
        0: u'游戏',      # game
        1: u'角色扮演',  # role-playing
        2: u'moba',
        3: u'运动',      # sports
        4: u'三国',      # Three Kingdoms
        5: u'战争',      # war
        6: u'服饰',      # clothing
        7: u'T恤',       # T-shirt
        8: u'婚姻'       # marriage
    })

    EMBEDDING_DIM = 100
    HIDDEN_DIM = 200
    LINEAR_HIDDEN_DIM = 100
    N_CLASSES = len(id2label)

    # Create the lstm model
    model = LSTMClassifier(EMBEDDING_DIM, HIDDEN_DIM, LINEAR_HIDDEN_DIM,
                           len(word2id.keys()), N_CLASSES, embeddings)
    optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                  model.parameters()),
                           lr=5e-4)
    print(model)

    if phase == 'train':
        print('Load the training data and prepare labels...')
        game_roleplay = 'data/train/1k_std_rollplay.word'
        game_moba = 'data/train/1k_std_moba.word'
        game_sport = 'data/train/1k_std_sport_game.word'
        sanguo_battle = 'data/train/1k_std_sanguo.word'
        cloth_shirt = 'data/train/1k_std_cloth.word'
        marriage = 'data/train/1k_std_marriage.word'
        sport = 'data/train/1k_std_sport.word'

        corpus2label = dict({
            'game_roleplay': (game_roleplay, [1, 1, 0, 0, 0, 0, 0, 0, 0]),
            'game_moba': (game_moba, [1, 0, 1, 0, 0, 0, 0, 0, 0]),
            'game_sport': (game_sport, [1, 0, 0, 1, 0, 0, 0, 0, 0]),
            'sanguo_battle': (sanguo_battle, [0, 0, 0, 0, 1, 1, 0, 0, 0]),
            'cloth_shirt': (cloth_shirt, [0, 0, 0, 0, 0, 0, 1, 1, 0]),
            'marriage': (marriage, [0, 0, 0, 0, 0, 0, 0, 0, 1]),
            'sport': (sport, [0, 0, 0, 1, 0, 0, 0, 0, 0])
        })

        corpus_data = []
        labels = []
        for file_name, label in corpus2label.values():
            print(file_name, label)
            tmp_codes, tmp_labels = encode_setence(file_name, word2id, label)
            corpus_data.extend(tmp_codes)
            labels.extend(tmp_labels)

        corpus_data, lengths = get_padding_codes(corpus_data)
        corpus_data = torch.tensor(np.array(corpus_data), dtype=torch.long)
        lengths = torch.tensor(np.array(lengths), dtype=torch.long)
        labels = torch.tensor(np.array(labels), dtype=torch.float)

        # Train and validate
        # labels = np.array(labels)
        train_size = int(corpus_data.shape[0] * 0.8)
        indices = list(range(corpus_data.shape[0]))
        random.shuffle(indices)
        train_indices = indices[0:train_size]
        validate_indices = indices[train_size:]

        train_data = corpus_data[train_indices, :]
        train_labels = labels[train_indices, :]
        train_lengths = lengths[train_indices]
        validate_data = corpus_data[validate_indices, :]
        validate_labels = labels[validate_indices, :]
        validate_lengths = lengths[validate_indices]

        # bind variables to cuda
        if torch.cuda.is_available():
            train_data = train_data.cuda()
            train_lengths = train_lengths.cuda()
            train_labels = train_labels.cuda()
            validate_data = validate_data.cuda()
            validate_labels = validate_labels.cuda()
            validate_lengths = validate_lengths.cuda()
            model.cuda()

        text_data = TextDataset(train_data, train_labels, train_lengths)
        train_dataloader = data.DataLoader(text_data,
                                           batch_size=args.batch_size,
                                           shuffle=True)

        print('Train the LSTM text classifier model...')
        train_lstm(model, model_path, optimizer, train_dataloader,
                   validate_data, validate_labels, validate_lengths,
                   args.epochs)

    if phase == 'test':
        test_file = args.test_file
        model.load_state_dict(torch.load(model_path))
        optimizer.zero_grad()
        test_data, labels = encode_setence(test_file, word2id, 1)
        padding_test_data, lengths = get_padding_codes(test_data)
        padding_test_data = torch.tensor(np.array(padding_test_data),
                                         dtype=torch.long)
        lengths = torch.tensor(np.array(lengths), dtype=torch.long)
        scores = evaluate_lstm(model, padding_test_data, lengths)
        scores = scores.data.cpu().numpy()

        # for print the result
        for idx, score in enumerate(scores):
            sentence = [id2word[int(code)] for code in test_data[idx]]
            tmp_labels = [id2label[i] for i in np.where(score > 0.5)[0]]
            tmp_score = np.array(
                [float(score[i]) for i in np.where(score > 0.5)[0]])
            tmp_score = tmp_score.prod()
            print(idx)
            print(' '.join(sentence))
            print(' '.join(tmp_labels))
            print(tmp_score)
Example #8
class Config:
    n_epochs = 30
    init_lr = 0.01  # has no effect here since a cyclic lr schedule is used
    momentum = 0.9
    weight_decay = 5e-4
    eta_min = 1e-5
    eta_max = 1e-2
    shuffle = True
    num_hidden_nodes = 78


if __name__ == "__main__":
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    if device == 'cuda':
        cudnn.benchmark = True
    net = LSTMClassifier(n_feature=39,
                         n_class=48,
                         n_hidden=Config.num_hidden_nodes,
                         num_layers=3)

    traindata = TIMITDataset(root="./data", split="train")
    trainloader = DataLoader(dataset=traindata,
                             batch_size=Config.batch_size,
                             shuffle=Config.shuffle,
                             collate_fn=pad_seqs_to_batch)
    validdata = TIMITDataset(root="./data", split="valid")
    validloader = DataLoader(dataset=validdata,
                             batch_size=Config.batch_size,
                             collate_fn=pad_seqs_to_batch)
    testdata = TIMITDataset(root="./data", split="test")
    testloader = DataLoader(dataset=testdata,
                            batch_size=Config.batch_size,
                            collate_fn=pad_seqs_to_batch)
Example #9
x_train, y_train, x_val, y_val = [], [], [], []
train_pairs = preprocess.read_pairs(mode='train', config=conf)
for pair in train_pairs:
    x_train.append(pair[0])
    y_train.append(pair[1])

val_pairs = preprocess.read_pairs(mode='test', config=conf)
for pair in val_pairs:
    x_val.append(pair[0])
    y_val.append(pair[1])

print('Train and Test Label distribution respectively:')
print(Counter(y_train), Counter(y_val))

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = LSTMClassifier(conf, embedding_wts, n_lables=len(labels_dict))
model = model.to(device)
criterion = nn.CrossEntropyLoss(reduction='sum')
optimizer = optim.Adadelta(model.parameters(),
                           lr=conf['lr'],
                           weight_decay=1e-5)

best_f1 = 0
for e in range(conf['n_epochs']):
    losses = []
    all_train_predictions = np.array([])
    all_train_targets = np.array(y_train)
    for start in range(0, len(x_train), conf['batch_size']):
        input_seq, input_lengths = preprocess.btmcd(
            vocab, x_train[start:start + conf['batch_size']])
        targets = torch.tensor(y_train[start:start + conf['batch_size']])
Example #10
    def __init__(self,
                 models_dir='./models/lstm_trained_models',
                 embeddings_dir='./embeddings',
                 is_cuda=False):
        """
		@param models_dir: the directory where the LSTM models are stored
		@param embeddings_dir: the directory where the embeddings are stored. The directory must contain the following subdirectories:
		                       word2vec/GoogleNews-vectors-negative300.wv
		                       fasttext/wiki-news-300d-1M-subword.wv
		                       glove/glove.42B.300d.wv
		@param is_cuda: to enable cuda
		"""
        self.is_cuda = is_cuda
        self.models_dir = models_dir
        self.embeddings_dir = embeddings_dir

        #load embeddings
        self.em_glove = ExtractWordEmbeddings('glove',
                                              emb_dir=self.embeddings_dir)
        self.em_word2vec = ExtractWordEmbeddings('word2vec',
                                                 emb_dir=self.embeddings_dir)
        self.em_fasttext = ExtractWordEmbeddings('fasttext',
                                                 emb_dir=self.embeddings_dir)
        self.dimensions_list = [
            'support', 'knowledge', 'conflict', 'power', 'similarity', 'fun',
            'status', 'trust', 'identity', 'romance'
        ]

        #load models
        self.dim2model = {}
        self.dim2embedding = {}

        for dim in self.dimensions_list:
            model = LSTMClassifier(embedding_dim=300, hidden_dim=300)
            if self.is_cuda:
                print(f'Torch version: {torch.__version__}')
                print(f'Torch CUDA available : {torch.cuda.is_available()}')
                if torch.cuda.is_available():
                    print(
                        f'Torch current device : {torch.cuda.current_device()}'
                    )
                    print(f'Torch device count : {torch.cuda.device_count()}')
                    print(
                        f'Torch device name : {torch.cuda.get_device_name(0)}')
                    model.cuda()
                else:
                    print(
                        'Cuda not available. Instantiated the TenDimensionsClassifier with CUDA=False'
                    )
                    self.is_cuda = False
            model.eval()
            for modelname in os.listdir(self.models_dir):
                if ('-best.lstm' in modelname) and (dim in modelname):
                    best_state = torch.load(join(self.models_dir, modelname),
                                            map_location='cpu')
                    model.load_state_dict(best_state)
                    if 'glove' in modelname:
                        em = self.em_glove
                    elif 'word2vec' in modelname:
                        em = self.em_word2vec
                    elif 'fasttext' in modelname:
                        em = self.em_fasttext
                    self.dim2model[dim] = model
                    self.dim2embedding[dim] = em
                    break
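
A hypothetical usage of the loaded models (the helper below is illustrative, not part of the original class): score a tokenized sentence on each of the ten dimensions, using the embedding extractor matched to that dimension's weights.

import numpy as np
import torch

def score_sentence(clf, tokens):
    # clf is an instance of the class above; tokens is a list of words.
    scores = {}
    with torch.no_grad():
        for dim, model in clf.dim2model.items():
            em = clf.dim2embedding[dim]
            vecs = em.obtain_vectors_from_sentence(tokens, True)
            inputs = torch.tensor(np.asarray(vecs, dtype=np.float32)).unsqueeze(0)
            if clf.is_cuda:
                inputs = inputs.cuda()
            scores[dim] = float(model(inputs))  # (1,) probability -> float
    return scores
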