Example #1
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertModel
import pandas as pd
from model import SentimentClassifier
from dataset import SSTDataset

#Create validation set
val_set = SSTDataset(filename='data/dev.tsv', maxlen=30)
#Create validation dataloader
val_loader = DataLoader(val_set, batch_size=64, num_workers=5)
#Create the network
net = SentimentClassifier()
#CPU or GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#Put the network to the GPU if available
net = net.to(device)
#Load the state dictionary of the network
net.load_state_dict(torch.load('./models/model', map_location=device))
#Takes as input the logits of the positive class and computes the binary cross-entropy
criterion = nn.BCEWithLogitsLoss()


def get_accuracy_from_logits(logits, labels):
    #Get a tensor of shape [B, 1, 1] with probabilities that the sentiment is positive
    probs = torch.sigmoid(logits.unsqueeze(-1))
    #Convert probabilities to predictions, 1 being positive and 0 being negative
    soft_probs = (probs > 0.5).long()
    #Check which predictions are the same as the ground truth and calculate the accuracy
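    # The example is cut off at this point. A plausible completion (an
    # assumption, not taken from the original source) would compare the
    # thresholded predictions against the ground-truth labels:
    acc = (soft_probs.squeeze() == labels).float().mean()
    return acc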
Example #2
def main(write_to):

    startTime = time.time()

    global args
    args = parse_args(type=1)
    args.input_dim = 300
    if args.model_name == 'dependency':
        args.mem_dim = 168
    elif args.model_name == 'constituency':
        args.mem_dim = 150
    if args.fine_grain:
        args.num_classes = 5  # 0 1 2 3 4
    else:
        args.num_classes = 3  # 0 1 2 (1 neutral)
    args.cuda = args.cuda and torch.cuda.is_available()
    # args.cuda = False
    print(args)
    # torch.manual_seed(args.seed)
    # if args.cuda:
    # torch.cuda.manual_seed(args.seed)

    #    train_dir = os.path.join(args.data,'train/')
    train_dir = os.path.join(
        args.data, 'dev/')  # Fei: wants to train on a smaller data set
    #    dev_dir = os.path.join(args.data,'dev/')
    #    test_dir = os.path.join(args.data,'test/')

    # write unique words from all token files
    token_files = [os.path.join(split, 'sents.toks') for split in [train_dir]]
    vocab_file = os.path.join(args.data, 'vocab-cased.txt')  # use vocab-cased
    # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB,  USE OLD VOCAB

    #    vocab_file = os.path.join(args.data, 'vocab-cased-dev.txt')
    #    build_vocab(token_files, vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file)
    print('==> SST vocabulary size : %d ' % vocab.size())

    # Load SST dataset splits

    is_preprocessing_data = False  # let program turn off after preprocess data

    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain, args.model_name)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True

    # dev


#    dev_file = os.path.join(args.data,'sst_dev.pth')
#    if os.path.isfile(dev_file):
#        dev_dataset = torch.load(dev_file)
#    else:
#        dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes, args.fine_grain, args.model_name)
#        torch.save(dev_dataset, dev_file)
#        is_preprocessing_data = True

# test
#    test_file = os.path.join(args.data,'sst_test.pth')
#    if os.path.isfile(test_file):
#        test_dataset = torch.load(test_file)
#    else:
#        test_dataset = SSTDataset(test_dir, vocab, args.num_classes, args.fine_grain, args.model_name)
#        torch.save(test_dataset, test_file)
#        is_preprocessing_data = True

    criterion = nn.NLLLoss()
    # initialize model, criterion/loss_function, optimizer
    model = TreeLSTMSentiment(args.cuda, vocab.size(), args.input_dim,
                              args.mem_dim, args.num_classes, args.model_name,
                              criterion)

    embedding_model = nn.Embedding(vocab.size(), args.input_dim)
    # Fei: don't optimize embedding
    embedding_model.weight.requires_grad = False

    if args.cuda:
        embedding_model = embedding_model.cuda()

    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(filter(lambda p: p.requires_grad,
                                      model.parameters()),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        # optimizer   = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
        optimizer = optim.Adagrad(
            [{
                'params': filter(lambda p: p.requires_grad,
                                 model.parameters()),
                'lr': args.lr
            }  # Fei: filter non_trainable
             ],
            lr=args.lr,
            weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    utils.count_param(model)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sst_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:

        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())

        emb = torch.zeros(vocab.size(), glove_emb.size(1))

        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
            else:
                emb[vocab.getIndex(word)] = torch.Tensor(
                    emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')

    if is_preprocessing_data:
        print('done preprocessing data, quit program to prevent memory leak')
        print('please run again')
        quit()

    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()

    # model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)
    embedding_model.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = SentimentTrainer(args, model, embedding_model, criterion,
                               optimizer)

    loopStart = time.time()
    #print('prepare time is %s ' % (loopStart - startTime))
    loss_save = []

    mode = 'EXPERIMENT'
    if mode == 'DEBUG':
        for epoch in range(args.epochs):
            dev_loss = trainer.train(dev_dataset)
            dev_loss, dev_pred = trainer.test(dev_dataset)
            test_loss, test_pred = trainer.test(test_dataset)

            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Dev loss   : %f \t' % dev_loss, end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
    elif mode == "PRINT_TREE":
        for i in range(0, 10):
            ttree, tsent, tlabel = dev_dataset[i]
            utils.print_tree(ttree, 0)
            print('_______________')
        print('break')
        quit()
    elif mode == "EXPERIMENT":
        max_dev = 0
        max_dev_epoch = 0
        filename = args.name + '.pth'
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            #dev_loss, dev_pred = trainer.test(dev_dataset)
            #dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels)
            print('==> Train loss   : %f \t' % train_loss, end="")
            loss_save.append(train_loss)
            #print('Epoch ', epoch, 'dev percentage ', dev_acc)
            #torch.save(model, args.saved + str(epoch) + '_model_' + filename)
            #torch.save(embedding_model, args.saved + str(epoch) + '_embedding_' + filename)
            #if dev_acc > max_dev:
            #    max_dev = dev_acc
            #    max_dev_epoch = epoch
            #gc.collect()

        print("done")
        #print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev))
        #print('eva on test set ')
        #model = torch.load(args.saved + str(max_dev_epoch) + '_model_' + filename)
        #embedding_model = torch.load(args.saved + str(max_dev_epoch) + '_embedding_' + filename)
        #trainer = SentimentTrainer(args, model, embedding_model, criterion, optimizer)
        #test_loss, test_pred = trainer.test(test_dataset)
        #test_acc = metrics.sentiment_accuracy_score(test_pred, test_dataset.labels)
        #print('Epoch with max dev:' + str(max_dev_epoch) + ' |test percentage ' + str(test_acc))
        #print('____________________' + str(args.name) + '___________________')
    else:
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            train_loss, train_pred = trainer.test(train_dataset)
            dev_loss, dev_pred = trainer.test(dev_dataset)
            test_loss, test_pred = trainer.test(test_dataset)

            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Train loss   : %f \t' % train_loss, end="")
            print('Epoch ', epoch, 'train percentage ', train_acc)
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch ', epoch, 'test percentage ', test_acc)

    loopEnd = time.time()
    print('looptime is %s ' % (loopEnd - loopStart))

    prepareTime = loopStart - startTime
    loopTime = loopEnd - loopStart
    timePerEpoch = loopTime / args.epochs

    with open(write_to, "w") as f:
        f.write("unit: " + "1 epoch\n")
        for loss in loss_save:
            f.write(str(loss) + "\n")
        f.write("run time: " + str(prepareTime) + " " + str(timePerEpoch) +
                "\n")
Example #3
def main():

    # Device configuration
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    parser = argparse.ArgumentParser()
    parser.add_argument('--lr', type=float, default=0.001)
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--epoch', type=int, default=1)
    parser.add_argument('--kernel_num', type=int, default=100)
    parser.add_argument('--label_num', type=int, default=2)
    parser.add_argument('--log_interval', type=int, default=100)
    parser.add_argument('--wordvec_dim', type=int, default=50)
    parser.add_argument('--model_name', type=str, default='rcnn')
    parser.add_argument(
        '--early-stop',
        type=int,
        default=1000,
        help='iteration numbers to stop without performance increasing')
    parser.add_argument(
        '--test-interval',
        type=int,
        default=200,
        help='how many steps to wait before testing [default: 100]')
    parser.add_argument('--kernel_sizes', type=str, default='3,4,5')
    parser.add_argument('--dataset_path', type=str, default='data/dataset/')

    args = parser.parse_args()
    # torch.manual_seed(args.seed)[]

    start = time.time()
    wordvec = loadGloveModel('data/glove/glove.6B.' + str(args.wordvec_dim) +
                             'd.txt')
    args.device = device
    args.weight = torch.tensor(wordvec.values, dtype=torch.float)
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

    # Datasets
    testing_set = SSTDataset(args.dataset_path, 'test', args.label_num,
                             args.wordvec_dim, wordvec)
    testing_iter = DataLoader(dataset=testing_set,
                              batch_size=args.batch_size,
                              num_workers=0,
                              collate_fn=collate_fn,
                              pin_memory=True)

    print(time.time() - start)

    model_name = args.model_name.lower()

    # training_set = SSTDataset(args.dataset_path, 'train', args.label_num, args.wordvec_dim, wordvec)
    models = [
        TextCNN(args).to(device),
        LSTMClassifier(args).to(device),
        RCNN(args).to(device),
        myRNN(args).to(device)
    ]
    models[0].load_state_dict(
        torch.load('model_cnn_{}_{}.ckpt'.format(args.wordvec_dim,
                                                 args.label_num)))
    models[1].load_state_dict(
        torch.load('model_lstm_{}_{}.ckpt'.format(args.wordvec_dim,
                                                  args.label_num)))
    models[2].load_state_dict(
        torch.load('model_rcnn_{}_{}.ckpt'.format(args.wordvec_dim,
                                                  args.label_num)))
    models[3].load_state_dict(
        torch.load('model_rnn_{}_{}.ckpt'.format(args.wordvec_dim,
                                                 args.label_num)))

    del wordvec  # Save some memory

    print(evaluation(testing_iter, models, args))
    print("Parameters:")
    delattr(args, 'weight')
    for attr, value in sorted(args.__dict__.items()):
        print("\t{}={}".format(attr.upper(), value))
Example #4
if __name__ == "__main__":
    #Get the parameters from arguments if used
    parser = ArgumentParser()
    parser.add_argument('-freeze_bert', action='store_true')
    parser.add_argument('-maxlen', type=int, default=25)
    parser.add_argument('-batch_size', type=int, default=32)
    parser.add_argument('-lr', type=float, default=2e-5)
    parser.add_argument('-print_every', type=int, default=100)
    parser.add_argument('-num_eps', type=int, default=5)
    args = parser.parse_args()
    #Instantiate the classifier model
    net = SentimentClassifier(args.freeze_bert)
    #CPU or GPU
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    #Put the network to the GPU if available
    net = net.to(device)
    #Takes as input the logits of the positive class and computes the binary cross-entropy
    criterion = nn.BCEWithLogitsLoss()
    #Adam optimizer
    optimizer = optim.Adam(net.parameters(), lr=args.lr)
    #Create instances of training and validation set
    train_set = SSTDataset(filename='data/train.tsv', maxlen=args.maxlen)
    val_set = SSTDataset(filename='data/dev.tsv', maxlen=args.maxlen)
    #Create instances of training and validation dataloaders
    train_loader = DataLoader(train_set,
                              batch_size=args.batch_size,
                              num_workers=5)
    val_loader = DataLoader(val_set, batch_size=args.batch_size, num_workers=5)
    #Train the network
    train(net, criterion, optimizer, train_loader, val_loader, args)
Example #5
    elif config.model_type == 'distilbert':
        model = DistilBertForSentimentClassification.from_pretrained(
            args.model_name_or_path, config=config)
    else:
        raise ValueError('This transformer model is not supported yet.')

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    #Takes as input the logits of the positive class and computes the binary cross-entropy
    criterion = nn.BCEWithLogitsLoss()

    optimizer = optim.Adam(params=model.parameters(), lr=args.lr)

    train_set = SSTDataset(filename='data/train.tsv',
                           maxlen=args.maxlen_train,
                           tokenizer=tokenizer)
    val_set = SSTDataset(filename='data/dev.tsv',
                         maxlen=args.maxlen_val,
                         tokenizer=tokenizer)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=args.batch_size,
                              num_workers=args.num_threads)
    val_loader = DataLoader(dataset=val_set,
                            batch_size=args.batch_size,
                            num_workers=args.num_threads)

    train(model=model,
          criterion=criterion,
          optimizer=optimizer,
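          # The example is cut off mid-call. A plausible completion (an
          # assumption, not taken from the original source) would pass the
          # loaders and parsed arguments created above:
          train_loader=train_loader,
          val_loader=val_loader,
          args=args)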
Example #6
def main():
    global args
    args = parse_args(type=1)
    args.input_dim, args.mem_dim = 300, 168
    if args.fine_grain:
        args.num_classes = 5  # 0 1 2 3 4
    else:
        args.num_classes = 3  # 0 1 2 (1 neutral)
    args.cuda = args.cuda and torch.cuda.is_available()
    print(args)
    torch.manual_seed(args.seed)
    if args.cuda:
        torch.cuda.manual_seed(args.seed)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    token_files = [
        os.path.join(split, 'sents.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    vocab_file = os.path.join(args.data, 'vocab.txt')
    build_vocab(token_files, vocab_file)

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file,
                  data=[
                      Constants.PAD_WORD, Constants.UNK_WORD,
                      Constants.BOS_WORD, Constants.EOS_WORD
                  ])
    print('==> SST vocabulary size : %d ' % vocab.size())

    # Load SST dataset splits

    is_preprocessing_data = False  # let program turn off after preprocess data

    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True

    # dev
    dev_file = os.path.join(args.data, 'sst_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes,
                                 args.fine_grain)
        torch.save(dev_dataset, dev_file)
        is_preprocessing_data = True

    # test
    test_file = os.path.join(args.data, 'sst_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SSTDataset(test_dir, vocab, args.num_classes,
                                  args.fine_grain)
        torch.save(test_dataset, test_file)
        is_preprocessing_data = True

    # initialize model, criterion/loss_function, optimizer
    model = TreeLSTMSentiment(args.cuda, vocab.size(), args.input_dim,
                              args.mem_dim, args.num_classes)
    criterion = nn.CrossEntropyLoss()
    if args.cuda:
        model.cuda(), criterion.cuda()
    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    metrics = Metrics(args.num_classes)

    utils.count_param(model)

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    emb_file = os.path.join(args.data, 'sst_embed.pth')
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:
        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(
            os.path.join(args.glove, 'glove.840B.300d'))
        print('==> GLOVE vocabulary size: %d ' % glove_vocab.size())
        emb = torch.Tensor(vocab.size(),
                           glove_emb.size(1)).normal_(-0.05, 0.05)
        # zero out the embeddings for padding and other special words if they are absent in vocab
        for idx, item in enumerate([
                Constants.PAD_WORD, Constants.UNK_WORD, Constants.BOS_WORD,
                Constants.EOS_WORD
        ]):
            emb[idx].zero_()
        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')

    if is_preprocessing_data:
        print(
            'quit program due to memory leak during preprocess data, please rerun sentiment.py'
        )
        quit()

    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    model.childsumtreelstm.emb.state_dict()['weight'].copy_(emb)

    # create trainer object for training and testing
    trainer = SentimentTrainer(args, model, criterion, optimizer)

    for epoch in range(args.epochs):
        train_loss = trainer.train(train_dataset)
        # train_loss, train_pred = trainer.test(dev_dataset)
        dev_loss, dev_pred = trainer.test(dev_dataset)
        test_loss, test_pred = trainer.test(test_dataset)
        # TODO: torch.Tensor(dev_dataset.labels) turn label into tensor # done
        dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                   dev_dataset.labels)
        test_acc = metrics.sentiment_accuracy_score(test_pred,
                                                    test_dataset.labels)
        print('==> Train loss   : %f \t' % train_loss, end="")
        print('Epoch ', epoch, 'dev percentage ', dev_acc)
        print('Epoch ', epoch, 'test percentage ', test_acc)
Example #7
def main():
    global args
    args = parse_args(type=1)
    print(args.name)
    print(args.model_name)

    args.input_dim = 300

    if args.mem_dim == 0:
        if args.model_name == 'dependency':
            args.mem_dim = 168
        elif args.model_name == 'constituency':
            args.mem_dim = 150
        elif args.model_name == 'lstm':
            args.mem_dim = 168
        elif args.model_name == 'bilstm':
            args.mem_dim = 168

    if args.num_classes == 0:
        if args.fine_grain:
            args.num_classes = 5  # 0 1 2 3 4
        else:
            args.num_classes = 3  # 0 1 2 (1 neutral)
    elif args.num_classes == 2:
        # assert False # this will not work
        assert not args.fine_grain

    args.cuda = args.cuda and torch.cuda.is_available()
    # args.cuda = False
    print(args)
    # torch.manual_seed(args.seed)
    # if args.cuda:
    # torch.cuda.manual_seed(args.seed)

    train_dir = os.path.join(args.data, 'train/')
    dev_dir = os.path.join(args.data, 'dev/')
    test_dir = os.path.join(args.data, 'test/')

    # write unique words from all token files
    token_files = [
        os.path.join(split, 'sents.toks')
        for split in [train_dir, dev_dir, test_dir]
    ]
    vocab_file = os.path.join(args.data, 'vocab-cased.txt')  # use vocab-cased
    # build_vocab(token_files, vocab_file) NO, DO NOT BUILD VOCAB,  USE OLD VOCAB

    # get vocab object from vocab file previously written
    vocab = Vocab(filename=vocab_file)
    print('==> SST vocabulary size : %d ' % vocab.size())

    # Load SST dataset splits

    is_preprocessing_data = False  # let program turn off after preprocess data

    # train
    train_file = os.path.join(args.data, 'sst_train.pth')
    if os.path.isfile(train_file):
        train_dataset = torch.load(train_file)
    else:
        train_dataset = SSTDataset(train_dir, vocab, args.num_classes,
                                   args.fine_grain, args.model_name)
        torch.save(train_dataset, train_file)
        is_preprocessing_data = True

    # dev
    dev_file = os.path.join(args.data, 'sst_dev.pth')
    if os.path.isfile(dev_file):
        dev_dataset = torch.load(dev_file)
    else:
        dev_dataset = SSTDataset(dev_dir, vocab, args.num_classes,
                                 args.fine_grain, args.model_name)
        torch.save(dev_dataset, dev_file)
        is_preprocessing_data = True

    # test
    test_file = os.path.join(args.data, 'sst_test.pth')
    if os.path.isfile(test_file):
        test_dataset = torch.load(test_file)
    else:
        test_dataset = SSTDataset(test_dir, vocab, args.num_classes,
                                  args.fine_grain, args.model_name)
        torch.save(test_dataset, test_file)
        is_preprocessing_data = True

    criterion = nn.NLLLoss()
    # initialize model, criterion/loss_function, optimizer

    model = DMNWraper(args.cuda, args.input_dim, args.mem_dim, criterion,
                      args.train_subtrees, args.num_classes, args.embdrop)

    embedding_model = nn.Embedding(vocab.size(), args.input_dim)

    if args.cuda:
        embedding_model = embedding_model.cuda()

    if args.cuda:
        model.cuda(), criterion.cuda()

    # for words common to dataset vocab and GLOVE, use GLOVE vectors
    # for other words in dataset vocab, use random normal vectors
    if args.embedding == 'glove':
        emb_torch = 'sst_embed.pth'
        emb_vector = 'glove.840B.300d'
        emb_vector_path = os.path.join(args.glove, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'paragram':
        emb_torch = 'sst_embed_paragram.pth'
        emb_vector = 'paragram_300_sl999'
        emb_vector_path = os.path.join(args.paragram, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    elif args.embedding == 'paragram_xxl':
        emb_torch = 'sst_embed_paragram_xxl.pth'
        emb_vector = 'paragram-phrase-XXL'
        emb_vector_path = os.path.join(args.paragram, emb_vector)
        assert os.path.isfile(emb_vector_path + '.txt')
    else:
        assert False

    emb_file = os.path.join(args.data, emb_torch)
    if os.path.isfile(emb_file):
        emb = torch.load(emb_file)
    else:

        # load glove embeddings and vocab
        glove_vocab, glove_emb = load_word_vectors(emb_vector_path)
        print('==> Embedding vocabulary size: %d ' % glove_vocab.size())

        emb = torch.zeros(vocab.size(), glove_emb.size(1))

        for word in vocab.labelToIdx.keys():
            if glove_vocab.getIndex(word):
                emb[vocab.getIndex(word)] = glove_emb[glove_vocab.getIndex(
                    word)]
            else:
                emb[vocab.getIndex(word)] = torch.Tensor(
                    emb[vocab.getIndex(word)].size()).normal_(-0.05, 0.05)
        torch.save(emb, emb_file)
        is_preprocessing_data = True  # flag to quit
        print('done creating emb, quit')

    if is_preprocessing_data:
        print('quit program')
        quit()

    # plug these into embedding matrix inside model
    if args.cuda:
        emb = emb.cuda()
    embedding_model.state_dict()['weight'].copy_(emb)

    if args.optim == 'adam':
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
    elif args.optim == 'adagrad':
        # optimizer   = optim.Adagrad(filter(lambda p: p.requires_grad, model.parameters()), lr=args.lr, weight_decay=args.wd)
        optimizer = optim.Adagrad(model.parameters(),
                                  lr=args.lr,
                                  weight_decay=args.wd)
    elif args.optim == 'adam_combine':
        optimizer = optim.Adam([{
            'params': model.parameters(),
            'lr': args.lr,
            'weight_decay': args.wd
        }, {
            'params': embedding_model.parameters(),
            'lr': args.emblr,
            'weight_decay': args.embwd
        }])
        args.manually_emb = 0
    elif args.optim == 'adagrad_combine':
        optimizer = optim.Adagrad([{
            'params': model.parameters(),
            'lr': args.lr,
            'weight_decay': args.wd
        }, {
            'params': embedding_model.parameters(),
            'lr': args.emblr,
            'weight_decay': args.embwd
        }])
        args.manually_emb = 0
    elif args.optim == 'adam_combine_v2':
        model.embedding_model = embedding_model
        optimizer = optim.Adam(model.parameters(),
                               lr=args.lr,
                               weight_decay=args.wd)
        args.manually_emb = 0
    metrics = Metrics(args.num_classes)
    utils.count_param(model)

    trainer = SentimentTrainer(args, model, embedding_model, criterion,
                               optimizer)

    trainer.set_initial_emb(emb)
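    # The word 'sentiment' acts as the question token: look up its vocabulary
    # index and register it with the trainer for the DMN-style model.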
    question_idx = vocab.labelToIdx['sentiment']
    question_idx = torch.Tensor([question_idx])
    trainer.set_question(question_idx)

    # trainer = SentimentTrainer(args, model, embedding_model ,criterion, optimizer)

    mode = args.mode
    if mode == 'DEBUG':
        for epoch in range(args.epochs):
            # print a tree
            tree, sent, label = dev_dataset[3]
            utils.print_span(tree, sent, vocab)
            quit()

            dev_loss = trainer.train(dev_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            test_loss, test_pred, _ = trainer.test(test_dataset)

            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Dev loss   : %f \t' % dev_loss, end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
    elif mode == "PRINT_TREE":
        for i in range(0, 10):
            ttree, tsent, tlabel = dev_dataset[i]
            utils.print_tree(ttree, 0)
            print('_______________')
        print('break')
        quit()
    elif mode == 'EVALUATE':
        filename = args.name + '.pth'
        epoch = args.epochs
        model_name = str(epoch) + '_model_' + filename
        embedding_name = str(epoch) + '_embedding_' + filename
        model = torch.load(os.path.join(args.saved, model_name))
        embedding_model = torch.load(os.path.join(args.saved, embedding_name))

        trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                   optimizer)
        trainer.set_question(question_idx)
        test_loss, test_pred, subtree_metrics = trainer.test(test_dataset)
        test_acc = metrics.sentiment_accuracy_score(
            test_pred, test_dataset.labels, num_classes=args.num_classes)
        print('Epoch with max dev:' + str(epoch) + ' |test percentage ' +
              str(test_acc))
        print('____________________' + str(args.name) + '___________________')
        print_list = subtree_metrics.print_list
        torch.save(print_list,
                   os.path.join(args.saved, args.name + 'printlist.pth'))
        utils.print_trees_file(args,
                               vocab,
                               test_dataset,
                               print_list,
                               name='tree')
    elif mode == "EXPERIMENT":
        # dev_loss, dev_pred = trainer.test(dev_dataset)
        # dev_acc = metrics.sentiment_accuracy_score(dev_pred, dev_dataset.labels, num_classes=args.num_classes)
        max_dev = 0
        max_dev_epoch = 0
        filename = args.name + '.pth'
        for epoch in range(args.epochs):
            # train_loss, train_pred, _ = trainer.test(train_dataset)
            train_loss_while_training = trainer.train(train_dataset)
            train_loss, train_pred, _ = trainer.test(train_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            dev_acc = metrics.sentiment_accuracy_score(
                dev_pred, dev_dataset.labels, num_classes=args.num_classes)
            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels, num_classes=args.num_classes)
            print('==> Train loss   : %f \t' % train_loss_while_training,
                  end="")
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch %d dev percentage %f ' % (epoch, dev_acc))
            print('Train acc %f ' % (train_acc))
            if dev_acc > max_dev:
                print('update best dev acc %f ' % (dev_acc))
                max_dev = dev_acc
                max_dev_epoch = epoch
                utils.mkdir_p(args.saved)
                torch.save(
                    model,
                    os.path.join(args.saved,
                                 str(epoch) + '_model_' + filename))
                torch.save(
                    embedding_model,
                    os.path.join(args.saved,
                                 str(epoch) + '_embedding_' + filename))
            gc.collect()
        print('epoch ' + str(max_dev_epoch) + ' dev score of ' + str(max_dev))
        print('eva on test set ')
        model = torch.load(
            os.path.join(args.saved,
                         str(max_dev_epoch) + '_model_' + filename))
        embedding_model = torch.load(
            os.path.join(args.saved,
                         str(max_dev_epoch) + '_embedding_' + filename))
        trainer = SentimentTrainer(args, model, embedding_model, criterion,
                                   optimizer)
        trainer.set_question(question_idx)
        test_loss, test_pred, _ = trainer.test(test_dataset)
        test_acc = metrics.sentiment_accuracy_score(
            test_pred, test_dataset.labels, num_classes=args.num_classes)
        print('Epoch with max dev:' + str(max_dev_epoch) +
              ' |test percentage ' + str(test_acc))
        print('____________________' + str(args.name) + '___________________')
    else:
        for epoch in range(args.epochs):
            train_loss = trainer.train(train_dataset)
            train_loss, train_pred, _ = trainer.test(train_dataset)
            dev_loss, dev_pred, _ = trainer.test(dev_dataset)
            test_loss, test_pred, _ = trainer.test(test_dataset)

            train_acc = metrics.sentiment_accuracy_score(
                train_pred, train_dataset.labels)
            dev_acc = metrics.sentiment_accuracy_score(dev_pred,
                                                       dev_dataset.labels)
            test_acc = metrics.sentiment_accuracy_score(
                test_pred, test_dataset.labels)
            print('==> Train loss   : %f \t' % train_loss, end="")
            print('Epoch ', epoch, 'train percentage ', train_acc)
            print('Epoch ', epoch, 'dev percentage ', dev_acc)
            print('Epoch ', epoch, 'test percentage ', test_acc)
Example #8
from tqdm import tqdm
import argparse

if __name__ == "__main__":

    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default='',  help='model path')
    parser.add_argument('--train_data', type=str)
    parser.add_argument('--test_data', type=str)
    parser.add_argument('--normalizer', type=str)
    parser.add_argument('--tstep', type=float)

    opt = parser.parse_args()
    print(opt)

    dset = SSTDataset(opt.train_data,
                      dim_control=1, dim_state=4)
    dset_eval = SSTDataset(opt.test_data,
                           dim_control=1, dim_state=4)
    use_cuda = True

    dl = DataLoader(dset, batch_size=200, num_workers=0, drop_last=True)
    dl_eval = DataLoader(dset_eval, batch_size=200, num_workers=0,
                         drop_last=True)

    G1 = nx.path_graph(2).to_directed()
    G_target = nx.path_graph(2).to_directed()
    # nx.draw(G1)
    # plt.show()
    node_feat_size = 2
    edge_feat_size = 3
    graph_feat_size = 10
Example #9
def main():
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    parser = argparse.ArgumentParser()
    parser.add_argument('--lr',
                        type=float,
                        default=0.001,
                        help='learning rate')
    parser.add_argument('--batch_size', type=int, default=16)
    parser.add_argument('--epoch', type=int, default=10)
    parser.add_argument('--kernel_num',
                        type=int,
                        default=100,
                        help='Number of each size of kernels used in CNN')
    parser.add_argument('--label_num',
                        type=int,
                        default=2,
                        help='Target label numbers')
    parser.add_argument('--log_interval', type=int, default=100)
    parser.add_argument('--wordvec_dim',
                        type=int,
                        default=50,
                        help='Dimension of GloVe vectors')
    parser.add_argument('--model_name',
                        type=str,
                        default='rcnn',
                        help='Which model to use')
    parser.add_argument('--kernel_sizes',
                        type=str,
                        default='3,4,5',
                        help='Sizes of kernels used in CNN')
    parser.add_argument('--dataset_path',
                        type=str,
                        default='data/dataset/',
                        help='PATH to dataset')

    args = parser.parse_args()
    # torch.manual_seed(args.seed)[]

    start = time.time()
    wordvec = loadGloveModel('data/glove/glove.6B.' + str(args.wordvec_dim) +
                             'd.txt')
    args.device = device
    args.weight = torch.tensor(
        wordvec.values,
        dtype=torch.float)  # word embedding for the embedding layer
    args.kernel_sizes = [int(k) for k in args.kernel_sizes.split(',')]

    # Datasets
    training_set = SSTDataset(args.dataset_path, 'train', args.label_num,
                              args.wordvec_dim, wordvec)
    testing_set = SSTDataset(args.dataset_path, 'test', args.label_num,
                             args.wordvec_dim, wordvec)
    validation_set = SSTDataset(args.dataset_path, 'dev', args.label_num,
                                args.wordvec_dim, wordvec)

    training_iter = DataLoader(dataset=training_set,
                               batch_size=args.batch_size,
                               num_workers=0,
                               shuffle=True,
                               collate_fn=collate_fn,
                               pin_memory=True)
    testing_iter = DataLoader(dataset=testing_set,
                              batch_size=args.batch_size,
                              num_workers=0,
                              collate_fn=collate_fn,
                              pin_memory=True)
    validation_iter = DataLoader(dataset=validation_set,
                                 batch_size=args.batch_size,
                                 num_workers=0,
                                 collate_fn=collate_fn,
                                 pin_memory=True)
    print(time.time() - start)

    model_name = args.model_name.lower()
    print(model_name)

    # Select model
    if model_name == 'cnn':
        model = TextCNN(args).to(device)
    elif model_name == 'lstm':
        model = LSTMClassifier(args).to(device)
    elif model_name == 'rcnn':
        model = RCNN(args).to(device)
    elif model_name == 'rnn':
        model = myRNN(args).to(device)
    else:
        print('Unrecognized model name!')
        exit(1)
    del wordvec  # Save some memory

    criterion = nn.CrossEntropyLoss()
    # optimizer = optim.SGD(model.parameters(), lr=config.lr)
    optimizer = torch.optim.Adam(model.parameters(), lr=args.lr)  # Adam optimizer

    step = 0
    loss_sum = 0
    # Train
    test_acc = []
    best_acc = 0
    for epoch in range(1, args.epoch + 1):
        for data, label in training_iter:
            sentences = data.to(device,
                                non_blocking=True)  # Asynchronous loading
            # sentences = data.flip(dims=(-1,)).to(device, dtype=torch.long)
            labels = label.to(device, non_blocking=True)
            optimizer.zero_grad()
            logits = model(sentences)  # forward pass

            loss = criterion(logits, labels)  # compute the loss

            loss_sum += loss.data  # accumulate the loss
            step += 1

            if step % args.log_interval == 0:
                print("epoch", epoch, end='  ')
                print("avg loss: %.5f" % (loss_sum / args.log_interval))
                loss_sum = 0
                step = 0

            loss.backward()
            optimizer.step()

        # test
        acc = evaluation(testing_iter, model, args)
        if acc > best_acc:
            best_acc = acc
            # torch.save(model.state_dict(), 'model_{}_{}_{}.ckpt'.format(args.model_name, args.wordvec_dim, args.label_num))
        test_acc.append(acc)
        print('test acc {:.4f}'.format(acc))
        print('train acc {:.4f}'.format(evaluation(training_iter, model,
                                                   args)))
    best = 0
    best_acc = 0
    for i, a in enumerate(test_acc):
        if a > best_acc:
            best_acc = a
            best = i + 1

    print('best: epoch {}, acc {:.4f}'.format(best, best_acc))

    print("Parameters:")
    delattr(args, 'weight')
    for attr, value in sorted(args.__dict__.items()):
        print("\t{}={}".format(attr.upper(), value))
Example #10
import rntn
from sentiment_tensor import SentimentTree
import pickle
import torch
from dataset import SSTDataset
from torch.utils.data import DataLoader

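# stoi maps token strings to vocabulary indices; its size defines the lexicon size for the network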
stoi = pickle.load(open('./assets/stoi.pkl', 'rb'))

lexis_size = len(stoi)

BATCH_SIZE = 128
PARAMETERS = "./assets/batch_parameters/net_parameters_6.pth"

test = SSTDataset("./sst/test.txt", stoi)
testloader = DataLoader(test, batch_size=BATCH_SIZE)
N = test.__len__()

# Since SentimentTree objects have no support for GPU allocation,
# they can't be fed to the model on a CUDA device. Training is done
# on the CPU with a subset of the training set.

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

net = rntn.RNTensorN(lexis_size)
net.load_state_dict(torch.load(PARAMETERS))

test_loss = acc = 0

with torch.no_grad():