def main(args):
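    """Train a sentence classifier on a pre-tokenized corpus.

    Reads the train/test split for the selected dataset (mr, imdb, or another
    tokenized CSV), builds the model and an Adam optimizer, trains for
    args.max_epoch epochs while tracking the best test score, and prints it at
    the end. Assumes the surrounding script provides the usual module-level
    imports (os, sys, dataloader, optim, Model, train_model).
    """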
    if args.dataset == 'mr':
        #     data, label = dataloader.read_MR(args.path)
        #     train_x, train_y, test_x, test_y = dataloader.cv_split2(
        #         data, label,
        #         nfold=10,
        #         valid_id=args.cv
        #     )
        #
        #     if args.save_data_split:
        #         save_data(train_x, train_y, args.path, 'train')
        #         save_data(test_x, test_y, args.path, 'test')
        train_x, train_y = dataloader.read_corpus(
            '/data/medg/misc/jindi/nlp/datasets/mr/train.txt')
        test_x, test_y = dataloader.read_corpus(
            '/data/medg/misc/jindi/nlp/datasets/mr/test.txt')
    elif args.dataset == 'imdb':
        train_x, train_y = dataloader.read_corpus(os.path.join(
            '/data/medg/misc/jindi/nlp/datasets/imdb', 'train_tok.csv'),
                                                  clean=False,
                                                  MR=True,
                                                  shuffle=True)
        test_x, test_y = dataloader.read_corpus(os.path.join(
            '/data/medg/misc/jindi/nlp/datasets/imdb', 'test_tok.csv'),
                                                clean=False,
                                                MR=True,
                                                shuffle=True)
    else:
        train_x, train_y = dataloader.read_corpus(
            '/afs/csail.mit.edu/u/z/zhijing/proj/to_di/data/{}/'
            'train_tok.csv'.format(args.dataset),
            clean=False,
            MR=False,
            shuffle=True)
        test_x, test_y = dataloader.read_corpus(
            '/afs/csail.mit.edu/u/z/zhijing/proj/to_di/data/{}/'
            'test_tok.csv'.format(args.dataset),
            clean=False,
            MR=False,
            shuffle=True)

    nclasses = max(train_y) + 1
    # elif args.dataset == 'subj':
    #     data, label = dataloader.read_SUBJ(args.path)
    # elif args.dataset == 'cr':
    #     data, label = dataloader.read_CR(args.path)
    # elif args.dataset == 'mpqa':
    #     data, label = dataloader.read_MPQA(args.path)
    # elif args.dataset == 'trec':
    #     train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
    #     data = train_x + test_x
    #     label = None
    # elif args.dataset == 'sst':
    #     train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.read_SST(args.path)
    #     data = train_x + valid_x + test_x
    #     label = None
    # else:
    #     raise Exception("unknown dataset: {}".format(args.dataset))

    # if args.dataset == 'trec':

    # elif args.dataset != 'sst':
    #     train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
    #         data, label,
    #         nfold = 10,
    #         test_id = args.cv
    #     )

    model = Model(args.embedding, args.d, args.depth, args.dropout, args.cnn,
                  nclasses).cuda()
    need_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=args.lr)

    train_x, train_y = dataloader.create_batches(
        train_x,
        train_y,
        args.batch_size,
        model.word2id,
    )
    # valid_x, valid_y = dataloader.create_batches(
    #     valid_x, valid_y,
    #     args.batch_size,
    #     emb_layer.word2id,
    # )
    test_x, test_y = dataloader.create_batches(
        test_x,
        test_y,
        args.batch_size,
        model.word2id,
    )

    best_test = 0
    # test_err = 1e+8
    for epoch in range(args.max_epoch):
        best_test = train_model(
            epoch,
            model,
            optimizer,
            train_x,
            train_y,
            # valid_x, valid_y,
            test_x,
            test_y,
            best_test,
            args.save_path)
        if args.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= args.lr_decay

    # sys.stdout.write("best_valid: {:.6f}\n".format(
    #     best_valid
    # ))
    sys.stdout.write("test_err: {:.6f}\n".format(best_test))

# Example #2
def main(args):
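    """Train a classifier on one of the classic sentence-classification
    datasets (mr, subj, cr, mpqa, trec, sst). Datasets without a standard
    split are divided with 10-fold cross-validation; the best validation
    error and the test error tracked alongside it are printed at the end.
    """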
    if args.dataset == 'mr':
        data, label = dataloader.read_MR(args.path)
    elif args.dataset == 'subj':
        data, label = dataloader.read_SUBJ(args.path)
    elif args.dataset == 'cr':
        data, label = dataloader.read_CR(args.path)
    elif args.dataset == 'mpqa':
        data, label = dataloader.read_MPQA(args.path)
    elif args.dataset == 'trec':
        train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
        data = train_x + test_x
        label = None
    elif args.dataset == 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.read_SST(args.path)
        data = train_x + valid_x + test_x
        label = None
    else:
        raise Exception("unknown dataset: {}".format(args.dataset))

    emb_layer = modules.EmbeddingLayer(
        args.d, data,
        embs = dataloader.load_embedding(args.embedding)
    )

    if args.dataset == 'trec':
        train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
            train_x, train_y,
            nfold = 10,
            valid_id = args.cv
        )
    elif args.dataset != 'sst':
        train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
            data, label,
            nfold = 10,
            test_id = args.cv
        )

    nclasses = max(train_y)+1

    train_x, train_y = dataloader.create_batches(
        train_x, train_y,
        args.batch_size,
        emb_layer.word2id,
        sort = args.dataset == 'sst'
    )
    valid_x, valid_y = dataloader.create_batches(
        valid_x, valid_y,
        args.batch_size,
        emb_layer.word2id,
        sort = args.dataset == 'sst'
    )
    test_x, test_y = dataloader.create_batches(
        test_x, test_y,
        args.batch_size,
        emb_layer.word2id,
        sort = args.dataset == 'sst'
    )

    model = Model(args, emb_layer, nclasses).cuda()
    need_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(
        filter(need_grad, model.parameters()),
        lr = args.lr
    )

    best_valid = 1e+8
    test_err = 1e+8
    for epoch in range(args.max_epoch):
        best_valid, test_err = train_model(epoch, model, optimizer,
            train_x, train_y,
            valid_x, valid_y,
            test_x, test_y,
            best_valid, test_err
        )
        if args.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= args.lr_decay

    sys.stdout.write("best_valid: {:.6f}\n".format(
        best_valid
    ))
    sys.stdout.write("test_err: {:.6f}\n".format(
        test_err
    ))
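
# Example #3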
def main(args):
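    """Variant of the training entry point above that truncates inputs to
    args.max_length, reads the TextFooler adversary-training corpora, and
    logs the run command, sequence-length statistics, training progress, and
    the final test accuracy to <save_path>.log.
    """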
    max_length = args.max_length
    if args.dataset == 'mr':
        #     data, label = dataloader.read_MR(args.path)
        #     train_x, train_y, test_x, test_y = dataloader.cv_split2(
        #         data, label,
        #         nfold=10,
        #         valid_id=args.cv
        #     )
        #
        #     if args.save_data_split:
        #         save_data(train_x, train_y, args.path, 'train')
        #         save_data(test_x, test_y, args.path, 'test')
        #     train_x, train_y = dataloader.read_corpus('/data/medg/misc/jindi/nlp/datasets/mr/train.txt', max_length=max_length)
        #     test_x, test_y = dataloader.read_corpus('/data/medg/misc/jindi/nlp/datasets/mr/test.txt', max_length=max_length)
        train_x, train_y = dataloader.read_corpus(
            '/home/mahmoudm/pb90_scratch/mahmoud/TextFooler-master/data/adversary_training_corpora/mr/train.txt',
            max_length=max_length)
        test_x, test_y = dataloader.read_corpus(
            '/home/mahmoudm/pb90_scratch/mahmoud/TextFooler-master/data/adversary_training_corpora/mr/test.txt',
            max_length=max_length)
    elif args.dataset == 'imdb':
        train_x, train_y = dataloader.read_corpus(os.path.join(
            '/home/mahmoudm/pb90_scratch/mahmoud/TextFooler-master/data/adversary_training_corpora/imdb',
            'train_tok.csv'),
                                                  clean=False,
                                                  MR=True,
                                                  shuffle=False,
                                                  max_length=max_length)
        test_x, test_y = dataloader.read_corpus(os.path.join(
            '/home/mahmoudm/pb90_scratch/mahmoud/TextFooler-master/data/adversary_training_corpora/imdb',
            'test_tok.csv'),
                                                clean=False,
                                                MR=True,
                                                shuffle=False,
                                                max_length=max_length)
    else:
        fix_labels = False
        if args.dataset == "yelp" or args.dataset == "fake" or args.dataset == "ag":
            fix_labels = True

        train_x, train_y = dataloader.read_corpus(
            '/home/mahmoudm/pb90_scratch/mahmoud/TextFooler-master/data/adversary_training_corpora/{}/'
            'train_tok.csv'.format(args.dataset),
            clean=False,
            MR=True,
            shuffle=False,
            fix_labels=fix_labels,
            max_length=max_length)
        test_x, test_y = dataloader.read_corpus(
            '/home/mahmoudm/pb90_scratch/mahmoud/TextFooler-master/data/adversary_training_corpora/{}/'
            'test_tok.csv'.format(args.dataset),
            clean=False,
            MR=True,
            shuffle=False,
            fix_labels=fix_labels,
            max_length=max_length)

    nclasses = max(train_y) + 1
    # elif args.dataset == 'subj':
    #     data, label = dataloader.read_SUBJ(args.path)
    # elif args.dataset == 'cr':
    #     data, label = dataloader.read_CR(args.path)
    # elif args.dataset == 'mpqa':
    #     data, label = dataloader.read_MPQA(args.path)
    # elif args.dataset == 'trec':
    #     train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
    #     data = train_x + test_x
    #     label = None
    # elif args.dataset == 'sst':
    #     train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.read_SST(args.path)
    #     data = train_x + valid_x + test_x
    #     label = None
    # else:
    #     raise Exception("unknown dataset: {}".format(args.dataset))

    # if args.dataset == 'trec':

    # elif args.dataset != 'sst':
    #     train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
    #         data, label,
    #         nfold = 10,
    #         test_id = args.cv
    #     )

    log_file = open(
        os.path.join(os.path.dirname(args.save_path),
                     f'{os.path.basename(args.save_path)}.log'), 'a')

    model = Model(args.embedding, args.d, args.depth, args.dropout, args.cnn,
                  nclasses).cuda()
    need_grad = lambda x: x.requires_grad
    optimizer = optim.Adam(filter(need_grad, model.parameters()), lr=args.lr)

    train_x, train_y = dataloader.create_batches(train_x,
                                                 train_y,
                                                 args.batch_size,
                                                 model.word2id,
                                                 max_len=max_length)
    # valid_x, valid_y = dataloader.create_batches(
    #     valid_x, valid_y,
    #     args.batch_size,
    #     emb_layer.word2id, max_len=max_length)
    test_x, test_y = dataloader.create_batches(test_x,
                                               test_y,
                                               args.batch_size,
                                               model.word2id,
                                               max_len=max_length)

    lengths = np.array(
        [len(seq) for batch in train_x for seq in batch.t().contiguous()])
    log_file.write("Run with command:\n" +
                   " ".join([arg for arg in sys.argv[1:]]) + "\n")
    log_file.write("\n")
    log_file.write(f"Max seq length found = {np.max(lengths)}\n")
    log_file.flush()

    best_test = 0
    # test_err = 1e+8
    progress = tqdm(total=args.max_epoch)
    for epoch in range(args.max_epoch):
        best_test = train_model(
            epoch,
            model,
            optimizer,
            train_x,
            train_y,
            # valid_x, valid_y,
            test_x,
            test_y,
            best_test,
            args.save_path,
            log_file)
        if args.lr_decay > 0:
            optimizer.param_groups[0]['lr'] *= args.lr_decay
        if epoch % 20 == 0:
            progress.update(20)
            log_file.write(str(progress) + '\n')
            log_file.flush()

    # sys.stdout.write("best_valid: {:.6f}\n".format(
    #     best_valid
    # ))
    sys.stdout.write("test_acc: {:.6f}\n".format(best_test))
    log_file.write("test_acc: {:.6f}\n".format(best_test))
    log_file.flush()
    log_file.close()

# Example #4
def main(config, progress):
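    """Train a text classifier driven entirely by a config dictionary.

    Builds torchtext fields and a vocabulary, optionally loads pretrained
    Twitter/DeepMoji/InferSent/ELMo/BERT representations, trains with a
    class-weighted cross-entropy loss and a MultiStepLR schedule, checkpoints
    the model whenever micro-F1 exceeds f1_criteria and improves on the best
    so far, and returns the collected per-epoch metrics.
    """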
    # save config file
    with open("./log/config_history.txt", "a+") as f:
        f.write(json.dumps(config) + "\n")

    logging.info("*" * 80)
    logging.info("Experiment progress: {0:.2f}%".format(progress * 100))
    logging.info("*" * 80)

    train_all = bool(config["train_all"])

    # data dir
    data_dir = config["data_dir"]  # data dir
    train_csv = data_dir + config["train_csv"]  # train.csv or train_val.csv
    val_csv = data_dir + config["val_csv"]  # val.csv or test.csv

    # path to save model
    model_dir = config["save_dir"]  # dir to save model
    f1_criteria = config["f1_criteria"]  # f1 criteria to save model

    # data preprocessing settings
    min_freq = config["min_freq"]  # min frequency in vocabulary
    pretrained_embedding = config[
        "embedding_name"]  # embedding name provided in torchtext
    batch_size = config["batch_size"]

    # model settings
    twitter_embedding = config[
        "twitter_embedding"]  # 0: default to word2vec or glove; 1: from datastories; 2: from trained sentiment classifier
    twitter_embedding_file = config[
        "twitter_embedding_file"]  # the saved sentiment classifier
    use_deepmoji = bool(config["use_deepmoji"])
    use_infersent = bool(config["infersent_file"])
    infersent_file = config[
        "infersent_file"]  # the infersent embedding in numpy
    use_elmo = bool(config["use_elmo"])
    use_bert_word = bool(config["use_bert_word"])
    embedding_size = config["embedding_size"]
    # the configured value is overridden: the size is parsed from the
    # embedding name (e.g. "glove.twitter.27B.100d" -> 100)
    embedding_size = int(pretrained_embedding[-4:-1])
    if twitter_embedding > 0:
        embedding_size = 100

    freeze_epochs = config[
        "freeze_epochs"]  # freeze embedding for a few epochs
    kmaxpooling = config["kmaxpooling"]  # top k max pooling
    hidden_size = config["hidden_size"]
    additional_hidden_size = config[
        "additional_hidden_size"]  # an additional hidden layer before softmax
    output_size = config["output_size"]  # 4-class classification
    n_layers = config["n_layers"]
    bidirectional = bool(config["bidirectional"])
    dropout = config["dropout"]
    weight_decay = config["weight_decay"]
    recurrent_dropout = config["recurrent_dropout"]
    gradient_clip = config["gradient_clip"]

    # training settings
    num_epochs = config["epochs"]
    learning_rate = config["lr"]
    epoch_to_lower_lr = config["epoch_to_lower_lr"]  # scheduled lr decay
    lr_gamma = config["lr_gamma"]  # scheduled lr decay rate
    device = torch.device(config["device"])  # gpu id or "cpu"
    exp = config["exp"]  # experiment number or code
    seed = config["seed"]
    config_id = config["config_id"]

    # set seed
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)

    ######################
    #### Process data ####
    ######################

    # tokenization
    logging.info("Tokenizing data {0}, {1}...".format(train_csv, val_csv))
    TEXT = Field(sequential=True, tokenize=tokenizer, batch_first=True)
    LABEL = Field(sequential=False, use_vocab=False, batch_first=True)
    train_set = TabularDataset(path=train_csv,
                               format="csv",
                               fields=[("text", TEXT), ("label", LABEL)],
                               skip_header=False)
    val_set = TabularDataset(path=val_csv,
                             format="csv",
                             fields=[("text", TEXT), ("label", LABEL)],
                             skip_header=False)

    ########################
    #### Load embedding ####
    ########################
    deepmoji_train = [None]
    deepmoji_val = [None]
    if use_deepmoji:
        # load deepmoji representation
        deepmoji_file = data_dir + "deepmoji/train.npy"
        logging.info(
            "Loading deepmoji representation from {0}".format(deepmoji_file))
        with open(deepmoji_file, "rb") as f:
            deepmoji_train = np.load(f)
        if config["val_csv"].startswith("val"):
            with open(data_dir + "deepmoji/val.npy", "rb") as f:
                deepmoji_val = np.load(f)
        elif config["val_csv"].startswith("test"):
            with open(data_dir + "deepmoji/test.npy", "rb") as f:
                deepmoji_val = np.load(f)
        if train_all:
            deepmoji_train = np.concatenate((deepmoji_train, deepmoji_val),
                                            axis=0)

    infersent_train = [None]
    infersent_val = [None]
    if use_infersent:
        infersent_file = data_dir + "infersent/" + infersent_file
        logging.info(
            "Loading infersent representation from {0}".format(infersent_file))
        with open(infersent_file + "_train.npy", "rb") as f:
            infersent_train = np.load(f)
        if config["val_csv"].startswith("val"):
            with open(infersent_file + "_val.npy", "rb") as f:
                infersent_val = np.load(f)
        elif config["val_csv"].startswith("test"):
            with open(infersent_file + "_test.npy", "rb") as f:
                infersent_val = np.load(f)

    elmo_train = [None]
    elmo_val = [None]
    if use_elmo:
        elmo_file = data_dir + "elmo/"
        logging.info("Loading elmo representation from {0}".format(elmo_file))
        with open(elmo_file + "elmo_train.pkl", "rb") as f:
            elmo_train = np.load(f)
        if config["val_csv"].startswith("val"):
            with open(elmo_file + "elmo_val.pkl", "rb") as f:
                elmo_val = np.load(f)
        elif config["val_csv"].startswith("test"):
            with open(elmo_file + "elmo_test.pkl", "rb") as f:
                elmo_val = np.load(f)

    bert_word_train = [None]
    bert_word_val = [None]
    if use_bert_word:
        bert_file = data_dir + "bert/"
        logging.info("Loading bert representation from {0}".format(bert_file))
        with open(bert_file + "bert_train.pkl", "rb") as f:
            bert_word_train = np.load(f)
        if config["val_csv"].startswith("val"):
            with open(bert_file + "bert_val.pkl", "rb") as f:
                bert_word_val = np.load(f)
        elif config["val_csv"].startswith("test"):
            with open(bert_file + "bert_test.pkl", "rb") as f:
                bert_word_val = np.load(f)

    # build vocab
    logging.info("Building vocabulary...")
    if twitter_embedding == 0:
        TEXT.build_vocab(train_set,
                         min_freq=min_freq,
                         vectors=pretrained_embedding)
    else:
        TEXT.build_vocab(train_set, min_freq=min_freq)
    vocab_size = len(TEXT.vocab.itos)

    # use pretrained twitter embedding
    if twitter_embedding > 0:
        if twitter_embedding == 1:
            with open(data_dir + "datastories.twitter.100d.pkl", "rb") as f:
                tweet_embedding_raw = pickle.load(f)
        elif twitter_embedding == 2:
            checkpoint = torch.load("./saved_model/" + twitter_embedding_file)
            embedding = checkpoint["embedding"]
            vocab = checkpoint["vocab"]
        tweet_vectors = torch.zeros(vocab_size, embedding_size)

        if twitter_embedding != 2:
            for w, idx in TEXT.vocab.stoi.items():
                if w in tweet_embedding_raw:
                    tweet_vectors[idx] = torch.Tensor(tweet_embedding_raw[w])
                else:
                    tweet_vectors[idx] = torch.Tensor(
                        tweet_embedding_raw["<unk>"])
        if twitter_embedding == 2:
            for w, idx in TEXT.vocab.stoi.items():
                if w in vocab.stoi:
                    tweet_vectors[idx] = torch.Tensor(embedding[vocab.stoi[w]])
                else:
                    tweet_vectors[idx] = torch.Tensor(
                        embedding[vocab.stoi["<unk>"]])
        TEXT.vocab.vectors = tweet_vectors
    logging.info("Vocab size: {0}".format(vocab_size))

    #######################
    ### Model Training ####
    #######################
    metrics = {
        "accuracy": [],
        "microPrecision": [],
        "microRecall": [],
        "microF1": []
    }

    # create model
    logging.info("Building model...")
    model_kwargs = {
        "embed_size": embedding_size,
        "hidden_size": hidden_size,
        "output_size": output_size,
        "vocab_size": vocab_size,
        "n_layers": n_layers,
        "dropout": dropout,
        "bidirection": bidirectional,
        "use_deepmoji": use_deepmoji,
        "use_infersent": use_infersent,
        "use_elmo": use_elmo,
        "use_bert_word": use_bert_word,
        "additional_hidden_size": additional_hidden_size,
        "recurrent_dropout": recurrent_dropout,
        "kmaxpooling": kmaxpooling,
    }
    model = globals()[config["model"]](**model_kwargs)
    logging.info("Initializing model weight...")
    for name, param in model.named_parameters():
        if "weight" in name and len(param.shape) >= 2:
            xavier_uniform_(param)

    if not use_elmo:
        model.init_embedding(TEXT.vocab.vectors,
                             config)  # load the pretrained word vectors
    logging.info(model)
    logging.info("Number of model params: {0}".format(count_parameters(model)))
    model.to(device)

    # weighted crossentropy loss
    label_weights = torch.tensor(label_weight[config["train_csv"]]).to(device)
    criterion = nn.CrossEntropyLoss(weight=label_weights)
    optimizer = optim.Adam(model.parameters(),
                           learning_rate,
                           weight_decay=weight_decay)
    scheduler = MultiStepLR(optimizer,
                            milestones=epoch_to_lower_lr,
                            gamma=lr_gamma)

    train_losses = []
    train_epoch_losses = []
    val_losses = []
    val_epoch_losses = []

    # train
    logging.info("Start training...")

    # freeze embedding
    model.embedding.weight.requires_grad = False

    for epoch in range(1, num_epochs + 1):

        # load data
        train_batches = create_batches(
            train_set,
            TEXT.vocab,
            batch_size,
            [deepmoji_train, infersent_train, elmo_train, bert_word_train],
            shuffle=True,
            use_elmo=use_elmo)
        val_batches = create_batches(
            val_set,
            TEXT.vocab,
            1, [deepmoji_val, infersent_val, elmo_val, bert_word_val],
            shuffle=False,
            use_elmo=use_elmo)

        logging.info("-" * 80)
        logging.critical("config_id: {0}".format(config_id))
        logging.info("Epoch {0}/{1}".format(epoch, num_epochs))

        train_epoch_loss = []
        val_epoch_loss = []

        # unfreeze embedding
        if epoch >= freeze_epochs:
            model.embedding.weight.requires_grad = True

        # lr scheduler
        scheduler.step()

        model.train()
        for batch_idx, ((batch_x, batch_y), [
                batch_deepmoji, batch_infersent, batch_elmo, batch_bert
        ]) in enumerate(train_batches):
            batch_x = torch.from_numpy(batch_x).to(device)
            batch_y = torch.from_numpy(batch_y).to(device)
            if use_deepmoji:
                batch_deepmoji = torch.from_numpy(batch_deepmoji).float().to(
                    device)
            if use_infersent:
                batch_infersent = torch.from_numpy(batch_infersent).float().to(
                    device)
            if use_elmo:
                batch_elmo = torch.from_numpy(batch_elmo).float().to(device)
            if use_bert_word:
                batch_bert = torch.from_numpy(batch_bert).float().to(device)

            optimizer.zero_grad()

            additional_sent_representations = {
                "deepmoji": None,
                "infersent": None,
                "elmo": None,
                "bert_word": None
            }
            if use_deepmoji:
                additional_sent_representations["deepmoji"] = batch_deepmoji
            if use_infersent:
                additional_sent_representations["infersent"] = batch_infersent
            if use_elmo:
                additional_sent_representations["elmo"] = batch_elmo
            if use_bert_word:
                additional_sent_representations["bert_word"] = batch_bert
            output = model(batch_x, config, **additional_sent_representations)
            loss = criterion(output, batch_y)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), gradient_clip)
            optimizer.step()

            # log
            train_epoch_loss.append(loss.item())
            train_losses.append(loss.item())

        logging.info("Training loss: {0:.4f}".format(
            np.mean(train_epoch_loss)))
        train_epoch_losses.append(np.mean(train_epoch_loss))

        # val
        if not train_all:
            model.eval()
            eval_epoch_outputs = np.zeros((len(val_batches), output_size))
            eval_epoch_labels = np.zeros((len(val_batches), ))

            with torch.no_grad():
                for batch_idx, ((batch_x, batch_y), [
                        batch_deepmoji, batch_infersent, batch_elmo, batch_bert
                ]) in enumerate(val_batches):
                    batch_x = torch.from_numpy(batch_x).to(device)
                    batch_y = torch.from_numpy(batch_y).to(device)
                    if use_deepmoji:
                        batch_deepmoji = torch.from_numpy(
                            batch_deepmoji).float().to(device)
                    if use_infersent:
                        batch_infersent = torch.from_numpy(
                            batch_infersent).float().to(device)
                    if use_elmo:
                        batch_elmo = torch.from_numpy(batch_elmo).float().to(
                            device)
                    if use_bert_word:
                        batch_bert = torch.from_numpy(batch_bert).float().to(
                            device)

                    additional_sent_representations = {
                        "deepmoji": None,
                        "infersent": None,
                        "elmo": None,
                        "bert_word": None
                    }
                    if use_deepmoji:
                        additional_sent_representations[
                            "deepmoji"] = batch_deepmoji
                    if use_infersent:
                        additional_sent_representations[
                            "infersent"] = batch_infersent
                    if use_elmo:
                        additional_sent_representations["elmo"] = batch_elmo
                    if use_bert_word:
                        additional_sent_representations[
                            "bert_word"] = batch_bert

                    output = model(batch_x, config,
                                   **additional_sent_representations)
                    loss = criterion(output, batch_y)

                    # log
                    val_epoch_loss.append(loss.item())
                    val_losses.append(loss.item())

                    # save predictions and labels for metrics computation
                    eval_epoch_outputs[batch_idx:batch_idx +
                                       1, :] = output.cpu().detach().numpy()
                    eval_epoch_labels[batch_idx:batch_idx +
                                      1] = batch_y.cpu().detach().numpy()

            logging.info("Validation loss: {0:.4f}".format(
                np.mean(val_epoch_loss)))
            val_epoch_losses.append(np.mean(val_epoch_loss))

            # get metrics
            logging.critical("config_id: {0}".format(config_id))
            accuracy, microPrecision, microRecall, microF1 = getMetrics(
                eval_epoch_outputs, eval_epoch_labels, output_size)

            # scheduler.step(microF1)

            # save model upon improvement and F1 beyond f1_criteria
            if microF1 > f1_criteria and (metrics["microF1"] == [] or
                                          microF1 > max(metrics["microF1"])):
                model_path = "{0}{1}_id_{4}_e{2}_F1_{3:.4f}.pt".format(
                    model_dir, exp, epoch, microF1, config_id)
                torch.save(
                    {
                        'epoch': epoch,
                        'model_state_dict': model.state_dict(),
                        'optimizer_state_dict': optimizer.state_dict(),
                        'config': config,
                        'model_kwargs': model_kwargs
                    }, model_path)
            metrics["accuracy"].append(accuracy)
            metrics["microPrecision"].append(microPrecision)
            metrics["microRecall"].append(microRecall)
            metrics["microF1"].append(microF1)

    if train_all:
        # save model
        model_path = "{0}{1}_id_{2}_e{3}.pt".format(model_dir, exp, config_id,
                                                    epoch)
        torch.save(
            {
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'config': config,
                'model_kwargs': model_kwargs
            }, model_path)
    config.pop("seed")
    config.pop("device")
    config.pop("config_id")
    metrics["config"] = config
    return metrics

# Example #5
def main(args):
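    """Benchmark SRU, CNN, and LSTM variants across the classic datasets
    (mr, subj, cr, mpqa, trec, sst), repeating each configuration
    numberOfTest times and writing the per-epoch results of every run to a
    results_<dataset>_<model>_<run>.csv file.
    """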
    datasetList = ['mr', 'subj', 'cr', 'mpqa', 'trec', 'sst']
    numberOfTest = 5
    args.max_epoch = 100
    for dset in datasetList:
        if dset == 'mr':
            data, label = dataloader.read_MR(args.path)
        elif dset == 'subj':
            data, label = dataloader.read_SUBJ(args.path)
        elif dset == 'cr':
            data, label = dataloader.read_CR(args.path)
        elif dset == 'mpqa':
            data, label = dataloader.read_MPQA(args.path)
        elif dset == 'trec':
            train_x, train_y, test_x, test_y = dataloader.read_TREC(args.path)
            data = train_x + test_x
            label = None
        elif dset == 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.read_SST(args.path)
            data = train_x + valid_x + test_x
            label = None
        else:
            raise Exception("unknown dataset: {}".format(dset))

        emb_layer = modules.EmbeddingLayer(
            args.d, data,
            embs = dataloader.load_embedding(args.embedding)
        )

        if dset == 'trec':
            train_x, train_y, valid_x, valid_y = dataloader.cv_split2(
                train_x, train_y,
                nfold = 10,
                valid_id = args.cv
            )
        elif dset != 'sst':
            train_x, train_y, valid_x, valid_y, test_x, test_y = dataloader.cv_split(
                data, label,
                nfold = 10,
                test_id = args.cv
            )
        nclasses = max(train_y)+1

        train_x, train_y = dataloader.create_batches(train_x, train_y, args.batch_size, emb_layer.word2id, sort = dset == 'sst')
        valid_x, valid_y = dataloader.create_batches(valid_x, valid_y, args.batch_size, emb_layer.word2id, sort = dset == 'sst')
        test_x, test_y = dataloader.create_batches(test_x, test_y, args.batch_size, emb_layer.word2id, sort = dset == 'sst')

        for models in range(3):
            if models == 1:
                args.cnn = True
                modelName = 'CNN'
            elif models == 2:
                args.cnn = False
                args.lstm = True
                modelName = 'LSTM'
            else:
                args.cnn = False  # ensure neither the CNN nor the LSTM branch is active
                args.lstm = False
                modelName = 'SRU'

            sys.stdout.write("Training {} with {} architecture: \n".format(dset,modelName))
            args.dropout = 0.5


            for testNo in range(numberOfTest):
                model = Model(args, emb_layer, nclasses).cuda()
                need_grad = lambda x: x.requires_grad
                optimizer = optim.Adam(filter(need_grad, model.parameters()), lr = args.lr)

                best_valid = 1e+8
                test_err = 1e+8
                results = []
                for epoch in range(args.max_epoch):
                    results.append(train_model(epoch, model, optimizer, train_x, train_y, valid_x, valid_y, test_x, test_y, best_valid, test_err))
                
                # csv.writer expects a text-mode file in Python 3
                with open('results_{d}_{m}_{i}.csv'.format(d=dset, m=modelName, i=(testNo + 1)), 'w', newline='') as dump:
                    wr = csv.writer(dump, delimiter=',')
                    wr.writerow(['Epoch', 'Training Loss', 'Validation Error', 'Test Error', 'Duration'])
                    for row in results:
                        wr.writerow(row)