Example #1
    def __checkpoints__(self):
        # Resume from the most recent checkpoint, if one exists.
        try:
            ckpt = training.load_checkpoint('%s/latest.ckpt' % self.args.checkpoint_dir)
            self.start_epoch = ckpt['epoch']
            self.Gab.load_state_dict(ckpt['Gab'])
            self.Gba.load_state_dict(ckpt['Gba'])
        except Exception:
            print('Model is still untrained!')
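The examples on this page rely on a small training utility whose load_checkpoint returns the dict that was saved earlier. That helper is not shown here; a minimal sketch, assuming it simply wraps torch.save / torch.load (the real implementation may differ), could look like this:

import os
import torch


def save_checkpoint(state, path):
    # Persist a plain dict of state (epoch, model/optimizer state_dicts, ...).
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    torch.save(state, path)


def load_checkpoint(path, device="cpu"):
    # Return the dict that save_checkpoint wrote; raises if the file is missing.
    return torch.load(path, map_location=device)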
Example #2
    def load_checkpoint(self, name, path):
        state = load_checkpoint(name, path=path)
        self.config = state["config"]
        self.epoch = state["epoch"]
        self.step = state["step"]
        self.model.load_state_dict(state["model"])
        self.model.__class__.__name__ = state["model_class"]
        # Note: the optimizer states saved under state["optimizers"] are not restored here.
        _vocab = state["vocab"]
        self.best_f1 = state["f1:"]
        self.best_acc = state["acc"]
        return _vocab
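The comment above suggests that optimizer states are stored under state["optimizers"] as a list of state dicts but never restored. If restoring them were desired, a hedged sketch (assuming self.optimizers is a list of PyTorch optimizers saved as [opt.state_dict() for opt in self.optimizers]) could be added to the method:

        # Hypothetical restore of the optimizer states; the saved format is an
        # assumption based on the comment above.
        for opt, opt_state in zip(self.optimizers, state["optimizers"]):
            opt.load_state_dict(opt_state)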
Example #3
    def __checkpoints__(self):
        # Create the checkpoint directory if it does not exist yet.
        if not os.path.isdir(self.args.checkpoint_dir):
            os.makedirs(self.args.checkpoint_dir)

        # Try to resume from a checkpoint; if there is none, start from scratch.
        try:
            ckpt = training.load_checkpoint('%s/latest.ckpt' %
                                            self.args.checkpoint_dir)
            self.start_epoch = ckpt['epoch']
            self.Da.load_state_dict(ckpt['Da'])
            self.Db.load_state_dict(ckpt['Db'])
            self.Gab.load_state_dict(ckpt['Gab'])
            self.Gba.load_state_dict(ckpt['Gba'])
            self.d_optimizer.load_state_dict(ckpt['d_optimizer'])
            self.g_optimizer.load_state_dict(ckpt['g_optimizer'])
        except Exception:
            print(' [*] No checkpoint!')
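For this resume logic to find anything, the training loop has to write latest.ckpt with matching keys. The save side is not part of these examples; a possible counterpart is sketched below, where training.save_checkpoint is assumed to mirror load_checkpoint:

        # Hypothetical save call inside the training loop, mirroring the keys
        # that __checkpoints__ reads back; the helper name is an assumption.
        training.save_checkpoint({
            'epoch': epoch + 1,
            'Da': self.Da.state_dict(),
            'Db': self.Db.state_dict(),
            'Gab': self.Gab.state_dict(),
            'Gba': self.Gba.state_dict(),
            'd_optimizer': self.d_optimizer.state_dict(),
            'g_optimizer': self.g_optimizer.state_dict(),
        }, '%s/latest.ckpt' % self.args.checkpoint_dir)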
Example #4
def bcn(config, data_file, embeddings, device, checkpoint, dataset, embeddings_type):
    # extensions: add 2 languages, use a combination of CoVe embeddings (like ELMo)

    inputs = data.Field(lower=True, include_lengths=True, batch_first=True)
    labels = data.Field(sequential=False, unk_token=None)

    print('Generating train, dev, test splits')
    if dataset == 'IWSLT':
        # using the IWSLT 2016 TED talk translation task
        train, dev, test = datasets.IWSLT.splits(root=data_file, exts=['.en', '.de'], fields=[inputs, inputs])
    elif dataset == 'SST-2':
        train, dev, test = datasets.SST.splits(text_field=inputs, label_field=labels, root=data_file,
                                               fine_grained=False, train_subtrees=True,
                                               filter_pred=lambda ex: ex.label != 'neutral')
    elif dataset == 'SST-5':
        train, dev, test = datasets.SST.splits(text_field=inputs, label_field=labels, root=data_file,
                                               fine_grained=True, train_subtrees=True)
    elif dataset == 'IMDB':
        train, test = datasets.IMDB.splits(text_field=inputs, label_field=labels, root=data_file)
        train, dev = train.split(split_ratio=0.9, stratified=True)  # 0.9 in order to be close to the paper
    elif dataset == 'TREC-6':
        train, test = datasets.TREC.splits(text_field=inputs, label_field=labels, root=data_file,
                                           fine_grained=False)
        train, dev = train.split(split_ratio=0.9, stratified=True)
    elif dataset == 'TREC-50':
        train, test = datasets.TREC.splits(text_field=inputs, label_field=labels, root=data_file,
                                           fine_grained=True)
        train, dev = train.split()
    elif dataset == 'SNLI':
        train, dev, test = datasets.SNLI.splits(text_field=inputs, label_field=labels, root=data_file)
    else:
        print('Invalid dataset name detected...')
        return

    print('Building vocabulary')
    inputs.build_vocab(train, dev, test)
    inputs.vocab.load_vectors(vectors=GloVe(name='840B', dim=300, cache=embeddings))

    labels.build_vocab(train, dev, test)

    train_iter, dev_iter, test_iter = data.BucketIterator.splits(
        (train, dev, test), batch_size=config["train_batch_size"], device=torch.device(device) if device >= 0 else None,
        sort_within_batch=True)

    model = BCN(config=config, n_vocab=len(inputs.vocab), vocabulary=inputs.vocab.vectors, embeddings=embeddings,
                num_labels=len(labels.vocab.freqs), embeddings_type=embeddings_type)

    bcn_params = [p for n, p in model.named_parameters() if "mtlstm" not in n and p.requires_grad]

    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(bcn_params, lr=0.001)

    if device != -1:
        model.to(device)
    print(model)

    total_params = sum(p.numel() for p in model.parameters())
    total_trainable_params = sum(p.numel() for p in bcn_params
                                 if p.requires_grad)

    print("Total Params:", number_h(total_params))
    print("Total Trainable Params:", number_h(total_trainable_params))
    #####################################
    # Training Pipeline
    #####################################
    trainer = BCNTrainer(model=model, train_loader=None, valid_loader=test_iter, criterion=criterion,
                         device="cpu" if device == -1 else 'cuda',
                         config=config, optimizers=[optimizer])

    state = load_checkpoint(checkpoint)
    model.load_state_dict(state["model"])
    print('Generating CoVe')

    test_loss, y_test, y_pred_test = trainer.test_step()

    print("Test cls loss is {}".format(test_loss))
    print("\n")
    print("F1 on test set is {}".format(f1_macro(y_test, y_pred_test)))
    print("\n")
    print("Accuracy on test set is {}".format(acc(y_test, y_pred_test)))
    print("\n")

    return test_loss, f1_macro(y_test, y_pred_test)
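The f1_macro and acc helpers used for reporting are not defined anywhere in these examples. A minimal sketch, assuming they are thin wrappers around scikit-learn metrics (the original helpers may differ), would be:

from sklearn.metrics import accuracy_score, f1_score


def f1_macro(y_true, y_pred):
    # Macro-averaged F1 over all classes.
    return f1_score(y_true, y_pred, average="macro")


def acc(y_true, y_pred):
    # Plain classification accuracy.
    return accuracy_score(y_true, y_pred)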
Example #5
X_train, y_train, X_test, y_test = load_dataset(config["data"]["dataset"],
                                                test=True)

# load word embeddings
if config["data"]["embeddings"] == "wiki.en.vec":
    word2idx, idx2word, weights = load_word_vectors_from_fasttext(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])
else:
    word2idx, idx2word, weights = load_word_vectors(
        os.path.join(EMB_DIR, config["data"]["embeddings"]),
        config["data"]["embeddings_dim"])

checkpoint_name = "Psych_exp_baseline"

state = load_checkpoint(checkpoint_name)

# features, feat_length = load_features(config["data"]["features"])

test_set = ClfDataset(X_test, y_test, word2idx, name="psych_test")
test_lengths = [len(x) for x in test_set.data]
test_sampler = SortedSampler(test_lengths)
test_loader = DataLoader(test_set,
                         sampler=test_sampler,
                         batch_size=config["batch_size"],
                         num_workers=opts.cores,
                         collate_fn=ClfCollate())

model = Classifier(ntokens=weights.shape[0], nclasses=7, **config["model"])
model.load_state_dict(state["model"])
Example #6
def sum_clf_test(dataset,
                 config,
                 opts,
                 transfer=False,
                 output_dir=None,
                 checkpoint_name='scv2_aux_ft_gu_last'):
    opts.name = config["name"]
    X_test, y_test, posts_test, pids, human_summaries = dataset
    vocab = None
    if transfer:
        opts.transfer = config["pretrained_lm"]
        checkpoint = load_checkpoint(opts.transfer)
        config["vocab"].update(checkpoint["config"]["vocab"])
        dict_pattern_rename(checkpoint["config"]["model"],
                            {"rnn_": "bottom_rnn_"})
        config["model"].update(checkpoint["config"]["model"])
        vocab = checkpoint["vocab"]

    ####################################################################
    # Load Preprocessed Datasets
    ####################################################################
    if config["preprocessor"] == "twitter":
        preprocessor = twitter_preprocessor()
    else:
        preprocessor = None

    ####################################################################
    # Model
    ####################################################################
    ntokens = 70004
    model = SummarizationClassifier(ntokens, len(set([0, 1])),
                                    **config["model"])
    model.to(opts.device)

    clf_criterion = nn.CrossEntropyLoss()
    lm_criterion = nn.CrossEntropyLoss(ignore_index=0)

    embed_parameters = filter(lambda p: p.requires_grad,
                              model.embed.parameters())
    bottom_parameters = filter(
        lambda p: p.requires_grad,
        chain(model.bottom_rnn.parameters(), model.vocab.parameters()))
    if config["model"]["has_att"]:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.attention.parameters(),
                  model.classes.parameters()))
    else:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.classes.parameters()))

    embed_optimizer = optim.ASGD(embed_parameters, lr=0.0001)
    rnn_optimizer = optim.ASGD(bottom_parameters)
    top_optimizer = Adam(top_parameters, lr=config["top_lr"])
    ####################################################################
    # Training Pipeline
    ####################################################################

    # Trainer: responsible for managing the training process
    trainer = SumClfTrainer(model,
                            None,
                            None, (lm_criterion, clf_criterion),
                            [embed_optimizer, rnn_optimizer, top_optimizer],
                            config,
                            opts.device,
                            valid_loader_train_set=None,
                            unfreeze_embed=config["unfreeze_embed"],
                            unfreeze_rnn=config["unfreeze_rnn"],
                            test_loader=None)

    ####################################################################
    # Resume Training from a previous checkpoint
    ####################################################################
    if transfer:
        print("Transferring Encoder weights ...")
        dict_pattern_rename(checkpoint["model"], {
            "encoder": "bottom_rnn",
            "decoder": "vocab"
        })
        load_state_dict_subset(model, checkpoint["model"])
    print(model)

    _vocab = trainer.load_checkpoint(name=checkpoint_name, path=None)
    test_set = SUMDataset(X_test,
                          posts_test,
                          y_test,
                          seq_len=config['data']['seq_len'],
                          post_len=config['data']['post_len'],
                          preprocess=preprocessor,
                          vocab=_vocab)
    test_lengths = [len(x) for x in test_set.data]
    test_sampler = SortedSampler(test_lengths)

    # test_loader = DataLoader(test_set, sampler=test_sampler,
    #                          batch_size=config["batch_size"],
    #                          num_workers=opts.cores, collate_fn=SumCollate())

    test_loader = DataLoader(test_set,
                             sampler=test_sampler,
                             batch_size=config["batch_size"],
                             num_workers=0,
                             collate_fn=SumCollate())

    trainer.test_loader = test_loader

    _, labels_array, predicted = trainer.test_epoch()

    pids_dic = {}
    if human_summaries is None:
        for x, y, sent, z in zip(y_test, predicted, X_test, pids):
            if z in pids_dic:
                pids_dic[z].append([x, y, sent])
            else:
                pids_dic[z] = [[x, y, sent]]
    else:
        for x, y, sent, z, h_summary in zip(y_test, predicted, X_test, pids,
                                            human_summaries):
            if z in pids_dic:
                pids_dic[z].append([x, y, sent, h_summary])
            else:
                pids_dic[z] = [[x, y, sent, h_summary]]

    # import os
    # if not os.path.exists('{}/ref_abs'.format(output_dir)):
    #     os.mkdir('{}/ref_abs'.format(output_dir))
    # if not os.path.exists('{}/dec'.format(output_dir)):
    #     os.mkdir('{}/dec'.format(output_dir))

    file_index = 0
    all_summaries = []
    for elem_key in pids_dic:
        current_summary = ''
        for pair in pids_dic[elem_key]:
            if pair[1] == 1:
                current_summary += pair[2] + '\n'

        all_summaries.append(current_summary)

    return all_summaries
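Two helpers appear throughout these transfer blocks without being shown: dict_pattern_rename, which renames keys of a (state) dict by substring pattern, and load_state_dict_subset, which loads only the parameters a model actually has. A minimal sketch of what they might do follows; the real implementations may differ.

def dict_pattern_rename(d, mapping):
    # Rename keys in place: every occurrence of `old` in a key becomes `new`.
    for old, new in mapping.items():
        for key in list(d.keys()):
            if old in key:
                d[key.replace(old, new)] = d.pop(key)


def load_state_dict_subset(model, pretrained_state):
    # Copy only the parameters whose names (and shapes) the model already has.
    own_state = model.state_dict()
    subset = {k: v for k, v in pretrained_state.items()
              if k in own_state and v.shape == own_state[k].shape}
    own_state.update(subset)
    model.load_state_dict(own_state)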
Example #7
def sent_clf(dataset, config, opts, transfer=False):
    from logger.experiment import Experiment

    opts.name = config["name"]
    X_train, y_train, _, X_val, y_val, _ = dataset
    vocab = None
    if transfer:
        opts.transfer = config["pretrained_lm"]
        checkpoint = load_checkpoint(opts.transfer)
        config["vocab"].update(checkpoint["config"]["vocab"])
        dict_pattern_rename(checkpoint["config"]["model"],
                            {"rnn_": "bottom_rnn_"})
        config["model"].update(checkpoint["config"]["model"])
        vocab = checkpoint["vocab"]

    ####################################################################
    # Load Preprocessed Datasets
    ####################################################################
    if config["preprocessor"] == "twitter":
        preprocessor = twitter_preprocessor()
    else:
        preprocessor = None

    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           vocab=vocab,
                           preprocess=preprocessor,
                           vocab_size=config["vocab"]["size"],
                           seq_len=config["data"]["seq_len"])

    print("Building validation dataset...")
    val_set = ClfDataset(X_val,
                         y_val,
                         seq_len=train_set.seq_len,
                         preprocess=preprocessor,
                         vocab=train_set.vocab)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate())
    ####################################################################
    # Model
    ####################################################################
    ntokens = len(train_set.vocab)
    model = Classifier(ntokens, len(set(train_set.labels)), **config["model"])
    model.to(opts.device)

    clf_criterion = nn.CrossEntropyLoss()
    lm_criterion = nn.CrossEntropyLoss(ignore_index=0)

    embed_parameters = filter(lambda p: p.requires_grad,
                              model.embed.parameters())
    bottom_parameters = filter(
        lambda p: p.requires_grad,
        chain(model.bottom_rnn.parameters(), model.vocab.parameters()))
    if config["model"]["has_att"]:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.attention.parameters(),
                  model.classes.parameters()))
    else:
        top_parameters = filter(
            lambda p: p.requires_grad,
            chain(model.top_rnn.parameters(), model.classes.parameters()))

    embed_optimizer = optim.ASGD(embed_parameters, lr=0.0001)
    rnn_optimizer = optim.ASGD(bottom_parameters)
    top_optimizer = Adam(top_parameters, lr=config["top_lr"])
    ####################################################################
    # Training Pipeline
    ####################################################################

    # Trainer: responsible for managing the training process
    trainer = SentClfTrainer(model,
                             train_loader,
                             val_loader, (lm_criterion, clf_criterion),
                             [embed_optimizer, rnn_optimizer, top_optimizer],
                             config,
                             opts.device,
                             valid_loader_train_set=val_loader_train_dataset,
                             unfreeze_embed=config["unfreeze_embed"],
                             unfreeze_rnn=config["unfreeze_rnn"])

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################

    # exp = Experiment(opts.name, config, src_dirs=opts.source,
    #                  output_dir=EXP_DIR)
    # exp.add_metric("ep_loss_lm", "line", "epoch loss lm",
    #                ["TRAIN", "VAL"])
    # exp.add_metric("ep_loss_cls", "line", "epoch loss class",
    #                ["TRAIN", "VAL"])
    # exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    # exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])
    #
    # exp.add_value("epoch", title="epoch summary")
    # exp.add_value("progress", title="training progress")

    ep_loss_lm = [10000, 10000]
    ep_loss_cls = [10000, 10000]
    ep_f1 = [0, 0]
    ep_acc = [0, 0]
    e_log = 0
    progress = 0
    ####################################################################
    # Resume Training from a previous checkpoint
    ####################################################################
    if transfer:
        print("Transferring Encoder weights ...")
        dict_pattern_rename(checkpoint["model"], {
            "encoder": "bottom_rnn",
            "decoder": "vocab"
        })
        load_state_dict_subset(model, checkpoint["model"])
    print(model)

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(0, config["epochs"]):

        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)
        # exp.update_metric("ep_loss_lm", train_loss[0], "TRAIN")
        ep_loss_lm[0] = train_loss[0]
        # exp.update_metric("ep_loss_lm", val_loss[0], "VAL")
        ep_loss_lm[1] = val_loss[0]
        # exp.update_metric("ep_loss_cls", train_loss[1], "TRAIN")
        # exp.update_metric("ep_loss_cls", val_loss[1], "VAL")
        ep_loss_cls[0] = train_loss[1]
        ep_loss_cls[1] = val_loss[1]

        # exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train),
        #                   "TRAIN")
        ep_f1[0] = f1_macro(y_train, y_pred_train)
        # exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")
        ep_f1[1] = f1_macro(y, y_pred)

        # exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
        # exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        ep_acc[0] = acc(y_train, y_pred_train)
        ep_acc[1] = acc(y, y_pred)

        # print('Train lm Loss : {}\nVal lm Loss : {}\nTrain cls Loss : {}\nVal cls Loss : {}\n Train f1 : {}\nVal f1 : {}\nTrain acc : {}\n Val acc : {}'.format(
        #     ep_loss_lm[0], ep_loss_lm[1], ep_loss_cls[0], ep_loss_cls[1], ep_f1[0], ep_f1[1], ep_acc[0], ep_acc[1]
        # ))
        # epoch_log = exp.log_metrics(["ep_loss_lm", "ep_loss_cls","ep_f1", "ep_acc"])
        epoch_log = 'Train lm Loss : {}\nVal lm Loss : {}\nTrain cls Loss : {}\nVal cls Loss : {}\n Train f1 : {}\nVal f1 : {}\nTrain acc : {}\n Val acc : {}'.format(
            ep_loss_lm[0], ep_loss_lm[1], ep_loss_cls[0], ep_loss_cls[1],
            ep_f1[0], ep_f1[1], ep_acc[0], ep_acc[1])
        print(epoch_log)
        # exp.update_value("epoch", epoch_log)
        e_log = epoch_log
        # print('')
        # Save the model if the val loss is the best we've seen so far.
        # if not best_loss or val_loss[1] < best_loss:
        #     best_loss = val_loss[1]
        #     trainer.best_acc = acc(y, y_pred)
        #     trainer.best_f1 = f1_macro(y, y_pred)
        #     trainer.checkpoint(name=opts.name, timestamp=True)
        best_loss = val_loss[1]
        trainer.best_acc = acc(y, y_pred)
        trainer.best_f1 = f1_macro(y, y_pred)
        trainer.checkpoint(name=opts.name, tags=str(epoch))

        # if early_stopping.stop(val_loss[1]):
        #     print("Early Stopping (according to classification loss)....")
        #     break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
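EarlyStopping("min", config["patience"]) is instantiated here (and actively used in Example #11) but never defined in these snippets. A minimal sketch of the interface its usage implies, with stop(value) returning True once the monitored loss has not improved for `patience` checks, could be:

class EarlyStopping:
    # Minimal sketch of the interface used above; not the original implementation.
    def __init__(self, mode="min", patience=5, min_delta=0.0):
        self.mode = mode
        self.patience = patience
        self.min_delta = min_delta
        self.best = None
        self.bad_steps = 0

    def _improved(self, value):
        if self.best is None:
            return True
        if self.mode == "min":
            return value < self.best - self.min_delta
        return value > self.best + self.min_delta

    def stop(self, value):
        if self._improved(value):
            self.best = value
            self.bad_steps = 0
        else:
            self.bad_steps += 1
        return self.bad_steps >= self.patience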
Example #8
def compress_seq3(checkpoint,
                  src_file,
                  out_file,
                  device,
                  verbose=False,
                  mode="attention"):
    checkpoint = load_checkpoint(checkpoint)
    config = checkpoint["config"]
    vocab = checkpoint["vocab"]

    def giga_tokenizer(x):
        return x.strip().lower().split()

    dataset = AEDataset(src_file,
                        preprocess=giga_tokenizer,
                        vocab=checkpoint["vocab"],
                        seq_len=config["data"]["seq_len"],
                        return_oov=True,
                        oovs=config["data"]["oovs"])

    data_loader = DataLoader(dataset,
                             batch_size=config["batch_size"],
                             num_workers=0,
                             collate_fn=Seq2SeqOOVCollate())
    n_tokens = len(dataset.vocab)
    model = Seq2Seq2Seq(n_tokens, **config["model"]).to(device)
    model.load_state_dict(checkpoint["model"])
    model.eval()

    ##############################################

    n_batches = math.ceil(len(data_loader.dataset) / data_loader.batch_size)

    if verbose:
        iterator = tqdm(enumerate(data_loader, 1), total=n_batches)
    else:
        iterator = enumerate(data_loader, 1)

    def devect(ids, oov, strip_eos, pp):
        return devectorize(ids.tolist(),
                           vocab.id2tok,
                           vocab.tok2id[vocab.EOS],
                           strip_eos=strip_eos,
                           oov_map=oov,
                           pp=pp)

    def id2txt(ids, oov=None, lengths=None, strip_eos=True):
        if lengths:
            return [
                " ".join(x[:l])
                for l, x in zip(lengths, devect(ids, oov, strip_eos, pp=True))
            ]
        else:
            return [" ".join(x) for x in devect(ids, oov, strip_eos, pp=True)]

    results = []
    with open(out_file, "w") as f:
        with torch.no_grad():
            for i, batch in iterator:
                batch_oov_map = batch[-1]
                batch = batch[:-1]

                batch = list(map(lambda x: x.to(device), batch))
                (inp_src, out_src, inp_trg, out_trg, src_lengths,
                 trg_lengths) = batch

                trg_lengths = torch.clamp(src_lengths / 2, min=5, max=30) + 1

                #############################################################
                # Debug
                #############################################################
                if mode in ["attention", "debug"]:

                    outputs = model(inp_src,
                                    inp_trg,
                                    src_lengths,
                                    trg_lengths,
                                    sampling=0)
                    enc1, dec1, enc2, dec2 = outputs

                    if mode == "debug":

                        src = id2txt(inp_src)
                        latent = id2txt(dec1[3].max(-1)[1])
                        rec = id2txt(dec2[0].max(-1)[1])

                        _results = list(zip(src, latent, rec))

                        for sample in _results:
                            f.write("\n".join(sample) + "\n\n")

                    elif mode == "attention":
                        src = devect(inp_src, None, strip_eos=False, pp=False)
                        latent = devect(dec1[3].max(-1)[1],
                                        None,
                                        strip_eos=False,
                                        pp=False)
                        rec = devect(dec2[0].max(-1)[1],
                                     None,
                                     strip_eos=False,
                                     pp=False)

                        _results = [src, latent, dec1[4], rec, dec2[4]]

                        results += list(zip(*_results))

                        break

                    else:
                        raise ValueError
                else:
                    enc1, dec1 = model.generate(inp_src, src_lengths,
                                                trg_lengths)

                    preds = id2txt(dec1[0].max(-1)[1], batch_oov_map,
                                   trg_lengths.tolist())

                    for sample in preds:
                        f.write(sample + "\n")
    return results
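A hypothetical invocation, with placeholder checkpoint and file names (none of these values come from the original project):

# "debug" mode writes source / latent summary / reconstruction triplets to
# out_file and returns an empty results list; "attention" mode instead returns
# the tokens and attention data for the first batch only.
results = compress_seq3(checkpoint="my_seq3_checkpoint",   # placeholder
                        src_file="sentences.txt",          # placeholder
                        out_file="compressed.txt",         # placeholder
                        device="cuda",
                        verbose=True,
                        mode="debug")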
Example #9
train_loader = DataLoader(train_set,
                          config["batch_train"],
                          shuffle=True,
                          drop_last=True)
test_loader = DataLoader(test_set, config["batch_eval"])

classes = label_encoder.classes_.size
model = Classifier(embeddings=weights, out_size=classes, **config).to(DEVICE)

weights = class_weigths(train_set.labels, to_pytorch=True)
weights = weights.to(DEVICE)
criterion = CrossEntropyLoss(weight=weights)

if pretrained_classifier:
    pretr_model, pretr_optimizer, pretr_vocab, loss, acc = \
        load_checkpoint("sentiment_baseline")
    pretr_model.to(DEVICE)
    pretr_model.output = model.output
    model = pretr_model

    name = "wassa_pretr_clf_with_baseline"
else:
    name = "wassa_rnn_600_bidir_batch32_adam_0.05"

parameters = filter(lambda p: p.requires_grad, model.parameters())
optimizer = Adam(parameters, amsgrad=True)

print(model)

#############################################################################
# Training Pipeline
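The listing stops at the training-pipeline header. A minimal epoch loop over the objects set up above might look like the sketch below; config["epochs"], the (inputs, labels, lengths) batch layout and the model's forward signature are assumptions, not part of the original code.

# Hypothetical continuation of the pipeline header above.
for epoch in range(1, config["epochs"] + 1):
    model.train()
    running_loss = 0.0
    for inputs, labels, lengths in train_loader:        # batch layout assumed
        optimizer.zero_grad()
        logits = model(inputs.to(DEVICE), lengths.to(DEVICE))
        loss = criterion(logits, labels.to(DEVICE))
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    print("epoch {}: train loss {:.4f}".format(epoch,
                                               running_loss / len(train_loader)))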
Example #10
from utils.training import epoch_summary, save_checkpoint, load_checkpoint

# load dataset

config = ConfLangModelFT
name = 'LM_FT_GU_3_6'
dataset = 'wassa'

unfreeze = True
freeze = {"embed": True, "hidden": True}

unfreeze_epoque = {"embed": 6, "hidden": 3}

# Load Pretrained LM
pretr_model, pretr_optimizer, pretr_vocab, loss, acc = \
    load_checkpoint("emotion2M/emotion_with_2M_18-06-28_18:04:54")
pretr_model.to(DEVICE)

# Load wassa
train_data, val_data, _, _ = load_wassa()
#####################################################################
# Define Dataloaders
#####################################################################

preprocessor = twitter_preprocessor()
if preprocessor is None:
    train_name = "train_simple_split_{}".format(dataset)
    val_name = "valid_simple_split_{}".format(dataset)
else:
    train_name = "train_ekphrasis_{}".format(dataset)
    val_name = "valid_ekphrasis_{}".format(dataset)
Example #11
def sent_clf_no_aux(dataset, config, opts, transfer=False):
    from logger.experiment import Experiment

    opts.name = config["name"]
    X_train, y_train, X_val, y_val = dataset
    vocab = None
    if transfer:
        opts.transfer = config["pretrained_lm"]
        checkpoint = load_checkpoint(opts.transfer)
        config["vocab"].update(checkpoint["config"]["vocab"])
        dict_pattern_rename(checkpoint["config"]["model"],
                            {"rnn_": "bottom_rnn_"})
        config["model"].update(checkpoint["config"]["model"])
        vocab = checkpoint["vocab"]

    ####################################################################
    # Data Loading and Preprocessing
    ####################################################################
    if config["preprocessor"] == "twitter":
        preprocessor = twitter_preprocessor()
    else:
        preprocessor = None

    print("Building training dataset...")
    train_set = ClfDataset(X_train,
                           y_train,
                           vocab=vocab,
                           preprocess=preprocessor,
                           vocab_size=config["vocab"]["size"],
                           seq_len=config["data"]["seq_len"])

    print("Building validation dataset...")
    val_set = ClfDataset(X_val,
                         y_val,
                         seq_len=train_set.seq_len,
                         preprocess=preprocessor,
                         vocab=train_set.vocab)

    src_lengths = [len(x) for x in train_set.data]
    val_lengths = [len(x) for x in val_set.data]

    # select sampler & dataloader
    train_sampler = BucketBatchSampler(src_lengths, config["batch_size"], True)
    val_sampler = SortedSampler(val_lengths)
    val_sampler_train = SortedSampler(src_lengths)

    train_loader = DataLoader(train_set,
                              batch_sampler=train_sampler,
                              num_workers=opts.cores,
                              collate_fn=ClfCollate())
    val_loader = DataLoader(val_set,
                            sampler=val_sampler,
                            batch_size=config["batch_size"],
                            num_workers=opts.cores,
                            collate_fn=ClfCollate())
    val_loader_train_dataset = DataLoader(train_set,
                                          sampler=val_sampler_train,
                                          batch_size=config["batch_size"],
                                          num_workers=opts.cores,
                                          collate_fn=ClfCollate())
    ####################################################################
    # Model
    ####################################################################
    ntokens = len(train_set.vocab)
    model = NaiveClassifier(ntokens,
                            len(set(train_set.labels)),
                            attention=config["model"]["has_att"],
                            **config["model"])
    model.to(opts.device)

    criterion = nn.CrossEntropyLoss()

    if config["gu"]:

        embed_parameters = filter(lambda p: p.requires_grad,
                                  model.embed.parameters())
        bottom_parameters = filter(lambda p: p.requires_grad,
                                   chain(model.bottom_rnn.parameters()))
        if config["model"]["has_att"]:
            top_parameters = filter(
                lambda p: p.requires_grad,
                chain(model.attention.parameters(),
                      model.classes.parameters()))
        else:
            top_parameters = filter(lambda p: p.requires_grad,
                                    model.classes.parameters())

        embed_optimizer = Adam(embed_parameters)
        rnn_optimizer = Adam(bottom_parameters)
        top_optimizer = Adam(top_parameters)

        # Trainer: responsible for managing the training process
        trainer = SentClfNoAuxTrainer(
            model,
            train_loader,
            val_loader,
            criterion, [embed_optimizer, rnn_optimizer, top_optimizer],
            config,
            opts.device,
            valid_loader_train_set=val_loader_train_dataset,
            unfreeze_embed=config["unfreeze_embed"],
            unfreeze_rnn=config["unfreeze_rnn"])
    else:
        parameters = filter(lambda p: p.requires_grad, model.parameters())

        optimizer = optim.Adam(parameters, lr=config["top_lr"])
        # Trainer: responsible for managing the training process
        trainer = SentClfNoAuxTrainer(
            model,
            train_loader,
            val_loader,
            criterion, [optimizer],
            config,
            opts.device,
            valid_loader_train_set=val_loader_train_dataset)

    ####################################################################
    # Experiment: logging and visualizing the training process
    ####################################################################
    exp = Experiment(opts.name,
                     config,
                     src_dirs=opts.source,
                     output_dir=EXP_DIR)
    exp.add_metric("ep_loss", "line", "epoch loss class", ["TRAIN", "VAL"])
    exp.add_metric("ep_f1", "line", "epoch f1", ["TRAIN", "VAL"])
    exp.add_metric("ep_acc", "line", "epoch accuracy", ["TRAIN", "VAL"])

    exp.add_value("epoch", title="epoch summary")
    exp.add_value("progress", title="training progress")

    ####################################################################
    # Resume Training from a previous checkpoint
    ####################################################################
    if transfer:
        print("Transferring Encoder weights ...")
        dict_pattern_rename(checkpoint["model"], {"encoder": "bottom_rnn"})
        load_state_dict_subset(model, checkpoint["model"])

    print(model)

    ####################################################################
    # Training Loop
    ####################################################################
    best_loss = None
    early_stopping = EarlyStopping("min", config["patience"])

    for epoch in range(1, config["epochs"] + 1):
        train_loss = trainer.train_epoch()
        val_loss, y, y_pred = trainer.eval_epoch(val_set=True)
        _, y_train, y_pred_train = trainer.eval_epoch(train_set=True)
        # Calculate accuracy and f1-macro on the evaluation set
        exp.update_metric("ep_loss", train_loss.item(), "TRAIN")
        exp.update_metric("ep_loss", val_loss.item(), "VAL")
        exp.update_metric("ep_f1", f1_macro(y_train, y_pred_train), "TRAIN")
        exp.update_metric("ep_f1", f1_macro(y, y_pred), "VAL")
        exp.update_metric("ep_acc", acc(y_train, y_pred_train), "TRAIN")
        exp.update_metric("ep_acc", acc(y, y_pred), "VAL")

        print()
        epoch_log = exp.log_metrics(["ep_loss", "ep_f1", "ep_acc"])
        print(epoch_log)
        exp.update_value("epoch", epoch_log)

        ###############################################################
        # Unfreezing the model after X epochs
        ###############################################################
        # Save the model if the val loss is the best we've seen so far.
        if not best_loss or val_loss < best_loss:
            best_loss = val_loss
            trainer.best_acc = acc(y, y_pred)
            trainer.best_f1 = f1_macro(y, y_pred)
            trainer.checkpoint(name=opts.name)

        if early_stopping.stop(val_loss):
            print("Early Stopping (according to cls loss)....")
            break

        print("\n" * 2)

    return best_loss, trainer.best_acc, trainer.best_f1
Example #12
####################################################################
# Data Loading and Preprocessing
####################################################################

vocab = None

if config["vocab"]["vocab_path"] is not None:
    vocab_path = config["vocab"]["vocab_path"]
    print(f"Loading vocab from '{vocab_path}'...")
    vocab = Vocab()
    vocab.from_file(vocab_path)

if opts.cp_vocab is not None:
    print(f"Loading vocab from checkpoint '{opts.cp_vocab}'...")
    vcp = load_checkpoint(opts.cp_vocab)
    vocab = vcp["vocab"]

if opts.resume:
    checkpoint = load_checkpoint(opts.resume)
    config["vocab"].update(checkpoint["config"]["vocab"])
    if not config["vocab"]["subword"]:
        vocab = checkpoint["vocab"]


def giga_tokenizer(x):
    return x.strip().lower().split()


print("Building training dataset...")
train_set = SentenceLMDataset(config["data"]["train_path"],
Example #13
####################################################################
# Settings
####################################################################
opts, config = seq2seq2seq_options()

####################################################################
#
# Weight Transfer
#
####################################################################
vocab = None

if config["model"]["prior_loss"] and config["prior"] is not None:
    print("Loading Oracle LM ...")
    oracle_cp = load_checkpoint(config["prior"])
    vocab = oracle_cp["vocab"]

    oracle = SeqReader(len(vocab), **oracle_cp["config"]["model"])
    oracle.load_state_dict(oracle_cp["model"])
    oracle.to(opts.device)
    freeze_module(oracle)
else:
    oracle = None


####################################################################
#
# Data Loading and Preprocessing
#
####################################################################
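freeze_module(oracle) is used above to keep the oracle LM fixed while the main model trains. A minimal sketch, assuming the helper only disables gradients and switches the module to eval mode:

def freeze_module(module):
    # Disable gradient updates and dropout/batch-norm statistics updates.
    for p in module.parameters():
        p.requires_grad = False
    module.eval()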