Example #1
    def __init__(self,
                 hparams=DotDict({
                     'model_type': 'transformer',
                     'ninp': 128,
                     'nhead': 2,
                     'nhid': 512,
                     'nlayers': 2,
                     'tie_layers': True,
                     'tie_encoder_decoder': True,
                     'dropout': 0.1,
                 })):
        super(LanguageModelTrainer, self).__init__()

        self.hparams = hparams if isinstance(hparams, DotDict) \
                        else DotDict(hparams)

        from utils import get_default_tokenizer
        self.vocab_size = get_default_tokenizer()._tokenizer.get_vocab_size()

        self.model_type = hparams.get('model_type', 'transformer')
        assert self.model_type in ['transformer', 'lstm']

        if self.model_type == 'transformer':
            self.model = TransformerModel(ntoken=self.vocab_size, **hparams)
        else:
            self.model = LSTMModel(ntoken=self.vocab_size, **hparams)

        self.batch_size = hparams.get('batch_size', 64)
        self.bptt = hparams.get('bptt', 128)
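
The snippet above relies on a DotDict helper that the page does not show. A minimal sketch, assuming it is just a dict with attribute-style access (a common convenience wrapper, not necessarily this repo's exact implementation):

class DotDict(dict):
    """dict whose keys can also be read and written as attributes."""
    __getattr__ = dict.get
    __setattr__ = dict.__setitem__
    __delattr__ = dict.__delitem__
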
Example #2
def run(stock: str, model_type: str, stationary=True):
    df = Analysis.get_data(stock)
    df["Company stock name"] = stock.split('/')[-1].split('.')[0]
    dataset = GetDataset(df)
    dataset.get_dataset(scale=False, stationary=stationary)
    train_data, test_data, train_data_len = dataset.split(train_split_ratio=0.8, time_period=30)
    train_data, test_data = dataset.get_torchdata()
    x_train, y_train = train_data
    x_test, y_test = test_data

    if model_type == 'lstm':
        params = rnn_params
        model = TorchRNN(rnn_type=params.rnn_type, input_dim=params.input_dim,
                         hidden_dim=params.hidden_dim, output_dim=params.output_dim,
                         num_layers=params.num_layers)
    elif model_type == 'transformer':
        params = transf_params
        model = TransformerModel(params)
    else:
        raise ValueError('Wrong model type selection, select either "lstm" or "transformer"!')

    clf = Classifier(model)
    clf.train([x_train, y_train], params=params)
    y_scaler = dataset.y_scaler
    predictions = clf.predict([x_test, y_test], y_scaler, data_scaled=False)
    predictions = pd.DataFrame(predictions)
    predictions.reset_index(drop=True, inplace=True)
    predictions.index = df.index[-len(x_test):]
    predictions['Actual'] = y_test[:-1]
    predictions.rename(columns={0: 'Predictions'}, inplace=True)
    if stationary:
        predictions = Analysis.inverse_stationary_data(old_df=df, new_df=predictions,
                                                       orig_feature='Actual', new_feature='Predictions',
                                                       diff=12, do_orig=False)
    plot_predictions(df, train_data_len, predictions["Predictions"].values, model_type)
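
A hypothetical invocation, assuming the surrounding modules (Analysis, GetDataset, Classifier, rnn_params, transf_params, plot_predictions) are importable and that stock is a file path whose stem is the ticker, since the function derives the company name from it:

run('data/AAPL.csv', model_type='transformer', stationary=True)
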
Example #3
    def __init__(self, hparams: dict, **kwargs) -> 'LightningTemplateModel':
        # init superclass
        super().__init__(**kwargs)

        self.save_hyperparameters()

        self.hparams = hparams
        if self.hparams.model == 'awd':
            self.model = WDLSTM(
                self.hparams.num_tokens,
                num_layers=self.hparams.num_layers,
                num_hidden=self.hparams.num_hidden,
                num_embedding=self.hparams.num_embedding,
                tie_weights=self.hparams.tie_weights,
                embedding_dropout=self.hparams.embedding_dropout,
                input_dropout=self.hparams.input_dropout,
                hidden_dropout=self.hparams.hidden_dropout,
                output_dropout=self.hparams.output_dropout,
                weight_dropout=self.hparams.weight_dropout)

            self.model(
                torch.zeros(self.hparams.bptt, self.hparams.batch_size).long(),
                self.model.init_hidden(self.hparams.batch_size))
        elif self.hparams.model == 'rnn':
            self.model = RNNModel(self.hparams.rnn_type,
                                  self.hparams.num_tokens,
                                  num_embedding=self.hparams.num_embedding,
                                  num_hidden=self.hparams.num_hidden,
                                  num_layers=self.hparams.num_layers,
                                  dropout=self.hparams.dropout,
                                  tie_weights=self.hparams.tie_weights)
        elif self.hparams.model == 'transformer':
            self.model = TransformerModel(
                self.hparams.num_tokens,
                num_embedding=self.hparams.num_embedding,
                num_hidden=self.hparams.num_hidden,
                num_layers=self.hparams.num_layers,
                dropout=self.hparams.dropout,
                num_heads=self.hparams.num_heads)
        else:
            raise ValueError(f'Model {self.hparams.model} not recognized.')

        self.hiddens = None
        self.criterion = torch.nn.NLLLoss()
        self.avg_loss = 0
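
A minimal instantiation sketch for this LightningTemplateModel, assuming an argparse-style Namespace so the attribute lookups in __init__ (hparams.model, hparams.num_layers, ...) resolve; the keys follow the 'rnn' branch above and the values are placeholders:

from argparse import Namespace

hparams = Namespace(model='rnn', rnn_type='LSTM', num_tokens=10000,
                    num_embedding=128, num_hidden=256, num_layers=2,
                    dropout=0.5, tie_weights=False)
model = LightningTemplateModel(hparams)
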
Example #4
 def create_model(self):
     """
             根据config文件选择对应的模型,并初始化
             :return:
             """
     if self.config["model_name"] == "textcnn":
         self.model = TextCnnModel(config=self.config,
                                   vocab_size=self.vocab_size,
                                   word_vectors=self.word_vectors)
     elif self.config["model_name"] == "bilstm":
         self.model = BiLstmModel(config=self.config,
                                  vocab_size=self.vocab_size,
                                  word_vectors=self.word_vectors)
     elif self.config["model_name"] == "bilstm_atten":
         self.model = BiLstmAttenModel(config=self.config,
                                       vocab_size=self.vocab_size,
                                       word_vectors=self.word_vectors)
     elif self.config["model_name"] == "rcnn":
         self.model = RcnnModel(config=self.config,
                                vocab_size=self.vocab_size,
                                word_vectors=self.word_vectors)
     elif self.config["model_name"] == "transformer":
         self.model = TransformerModel(config=self.config,
                                       vocab_size=self.vocab_size,
                                       word_vectors=self.word_vectors)
Example #5
def initialize_model(type_model, args):
    if type_model.lower() == 'lstm':
        model = Stacked_LSTM(args)
    elif type_model.lower() == 'attention_lstm':
        model = LSTM_and_Attention(args)
    elif type_model.lower() == 'transformer':
        model = TransformerModel(d_input=3)
    elif type_model.lower() == 'cnn':
        model = CNN(n_out=36, dropout=0.01)
    else:
        raise ValueError("Invalid model type: choose 'lstm', 'attention_lstm', 'transformer' or 'cnn'!")

    loss_fn = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
    return model, optimizer, loss_fn
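
A hypothetical call, assuming args is an argparse namespace providing at least learning_rate (plus whatever the selected model class reads):

model, optimizer, loss_fn = initialize_model('transformer', args)
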
Example #6
 def create_new_model():
     if args.model_type == "seq2seq":
         return Seq2Seq(word_vectors=word_vectors,
                        hidden_size=args.hidden_size,
                        output_size=vocab_size,
                        device=device)
     elif args.model_type == "seq2seq_attn":
         return Seq2SeqAttn(word_vectors=word_vectors,
                            hidden_size=args.hidden_size,
                            output_size=vocab_size,
                            device=device)
     elif args.model_type == "transformer":
         return TransformerModel(vocab_size,
                                 device,
                                 num_encoder_layers=2,
                                 num_decoder_layers=2,
                                 dropout=0.1)
     else:
         raise ValueError(f"Unknown model type: {args.model_type}")
Example #8
cuda = config['train']['cuda']

# Main Loop

while True:
    
    min_test_loss = 1.e6
    
    loss = 0.0
    train_loss_seq = []
    test_loss_seq = []

    if model_type == 'Transformer':
        model = TransformerModel(config)
    elif model_type == 'LSTM':
        model = LSTMModel(config)
    else:
        raise ValueError(f"Unknown model type: {model_type}")
    if cuda:
        model = model.cuda()

    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=config['train']['learning_rate'],
                                 weight_decay=config['train']['weight_decay'])
    criterion = torch.nn.MSELoss()
    
    optimizer.zero_grad()
        
    for it in range(n_iter):
        model.train()
        country = random.choice(train_countries)
Example #9
def launch(model_params, checkpoint_path, device='cuda'):
    print('model_params:\t', model_params)

    max_length = model_params['bptt']

    tokenizer = get_default_tokenizer()

    eos_token = tokenizer.token_to_id('[SEP]')
    eod_token = tokenizer.token_to_id('[DOC_SEP]')
    vocab_size = tokenizer._tokenizer.get_vocab_size()

    assert eos_token is not None, 'Invalid tokenizer files - EOS token cannot be null'

    # Model

    from models import TransformerModel, LSTMModel

    model_type = model_params.get('model_type', 'transformer')
    assert model_type in ['transformer', 'lstm']

    if model_type == 'transformer':
        model = TransformerModel(ntoken=vocab_size, **model_params)
    else:
        model = LSTMModel(ntoken=vocab_size, **model_params)

    model = model.to(device)

    if checkpoint_path and path.exists(checkpoint_path):
        print(f'Loading checkpoint from {checkpoint_path}')
        checkpoint_state = torch.load(checkpoint_path)
        model.load_state_dict(checkpoint_state)

    @torch.no_grad()
    def _generate(input_ids=None,
                  max_length=max_length,
                  do_sample=True,
                  num_beams=5,
                  temperature=1.3,
                  top_k=50,
                  top_p=1.0,
                  repetition_penalty=1.2,
                  eos_token_ids=[eos_token, eod_token],
                  length_penalty=1.0,
                  num_return_sequences=1,
                  vocab_size=vocab_size):
        pad_token_id = 0
        model.eval()

        batch_size = 1
        cur_len = input_ids.shape[1]

        # Expand input to num beams
        input_ids = input_ids.unsqueeze(1).expand(batch_size, num_beams,
                                                  cur_len)
        input_ids = input_ids.contiguous().view(batch_size * num_beams,
                                                cur_len)

        # generated hypotheses
        generated_hyps = [
            BeamHypotheses(num_beams,
                           max_length,
                           length_penalty,
                           early_stopping=False) for _ in range(batch_size)
        ]

        # scores for each sentence in the beam
        beam_scores = torch.zeros((batch_size, num_beams),
                                  dtype=torch.float,
                                  device=input_ids.device)
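        # All beams start from the same prefix, so only the first beam keeps a
        # zero score; otherwise the first top-k step would select the same
        # token once per beam and yield duplicate hypotheses.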
        beam_scores[:, 1:] = -1e9
        beam_scores = beam_scores.view(-1)  # shape (batch_size * num_beams,)

        # cache compute states
        past = None

        # done sentences
        done = [False for _ in range(batch_size)]

        while cur_len < max_length:

            outputs = model(input_ids.t())
            outputs = outputs.permute(1, 0, 2)
            # print(input_ids)
            # print(torch.argmax(outputs))

            scores = outputs[:, -1, :]

            # repetition penalty (from CTRL paper https://arxiv.org/abs/1909.05858)
            if repetition_penalty != 1.0:
                for i in range(batch_size * num_beams):
                    for previous_token in set(input_ids[i].tolist()):
                        # if score < 0 then repetition penalty has to be multiplied to reduce the previous token probability
                        if scores[i, previous_token] < 0:
                            scores[i, previous_token] *= repetition_penalty
                        else:
                            scores[i, previous_token] /= repetition_penalty

            if do_sample:
                # Temperature (higher temperature => more likely to sample low probability tokens)
                if temperature != 1.0:
                    scores = scores / temperature
                # Top-p/top-k filtering
                scores = top_k_top_p_filtering(
                    scores, top_k=top_k, top_p=top_p, min_tokens_to_keep=2
                )  # (batch_size * num_beams, vocab_size)
                # Sample 2 next words for each beam (so we have some spare tokens and match output of greedy beam search)

                try:
                    next_words = torch.multinomial(
                        torch.softmax(scores, dim=-1),
                        num_samples=2,
                        replacement=True)  # (batch_size * num_beams, 2)
                except RuntimeError:
                    # torch.multinomial raises when too few tokens have
                    # non-zero probability after filtering
                    print((torch.softmax(scores, dim=-1) > 0).sum())
                    raise
                # Compute next scores
                _scores = F.log_softmax(
                    scores, dim=-1)  # (batch_size * num_beams, vocab_size)
                _scores = torch.gather(
                    _scores, -1, next_words)  # (batch_size * num_beams, 2)
                next_scores = _scores + beam_scores[:, None].expand_as(
                    _scores)  # (batch_size * num_beams, 2)
                # Match shape of greedy beam search
                next_words = next_words.view(
                    batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
                next_scores = next_scores.view(
                    batch_size, 2 * num_beams)  # (batch_size, 2 * num_beams)
            else:
                # do greedy beam search
                scores = F.log_softmax(
                    scores, dim=-1)  # (batch_size * num_beams, vocab_size)
                assert scores.size() == (batch_size * num_beams, vocab_size)
                # Add the log prob of the new beams to the log prob of the beginning of the sequence (sum of logs == log of the product)
                _scores = scores + beam_scores[:, None].expand_as(
                    scores)  # (batch_size * num_beams, vocab_size)
                # re-organize to group the beam together (we are keeping top hypothesis across beams)
                _scores = _scores.view(
                    batch_size, num_beams *
                    vocab_size)  # (batch_size, num_beams * vocab_size)
                next_scores, next_words = torch.topk(_scores,
                                                     2 * num_beams,
                                                     dim=1,
                                                     largest=True,
                                                     sorted=True)

            assert next_scores.size() == next_words.size() == (batch_size,
                                                               2 * num_beams)

            # next batch beam content
            # list of (batch_size * num_beams) tuple(next hypothesis score, next word, current position in the batch)
            next_batch_beam = []

            # for each sentence
            for batch_ex in range(batch_size):

                # if we are done with this sentence
                done[batch_ex] = done[batch_ex] or generated_hyps[
                    batch_ex].is_done(next_scores[batch_ex].max().item())
                if done[batch_ex]:
                    next_batch_beam.extend([(0, pad_token_id, 0)] *
                                           num_beams)  # pad the batch
                    continue

                # next sentence beam content
                next_sent_beam = []

                # next words for this sentence
                for idx, score in zip(next_words[batch_ex],
                                      next_scores[batch_ex]):

                    # get beam and word IDs
                    beam_id = idx // vocab_size
                    word_id = idx % vocab_size

                    # end of sentence, or next word
                    if word_id.item(
                    ) in eos_token_ids or cur_len + 1 == max_length:
                        generated_hyps[batch_ex].add(
                            input_ids[batch_ex * num_beams +
                                      beam_id, :cur_len].clone(), score.item())
                    else:
                        next_sent_beam.append(
                            (score, word_id, batch_ex * num_beams + beam_id))

                    # the beam for next step is full
                    if len(next_sent_beam) == num_beams:
                        break

                # update next beam content
                assert len(next_sent_beam) == (
                    0 if cur_len + 1 == max_length else num_beams)
                if len(next_sent_beam) == 0:
                    next_sent_beam = [(0, pad_token_id, 0)
                                      ] * num_beams  # pad the batch
                next_batch_beam.extend(next_sent_beam)
                assert len(next_batch_beam) == num_beams * (batch_ex + 1)

            # sanity check / prepare next batch
            assert len(next_batch_beam) == batch_size * num_beams
            beam_scores = beam_scores.new([x[0] for x in next_batch_beam])
            beam_words = input_ids.new([x[1] for x in next_batch_beam])
            beam_idx = input_ids.new([x[2] for x in next_batch_beam])

            # re-order batch
            input_ids = input_ids[beam_idx, :]
            input_ids = torch.cat([input_ids, beam_words.unsqueeze(1)], dim=-1)

            # re-order internal states
            if past:
                reordered_past = []
                for layer_past in past:
                    # get the correct batch idx from layer past batch dim
                    # batch dim of `past` and `mems` is at 2nd position
                    reordered_layer_past = [
                        layer_past[:, i].unsqueeze(1).clone().detach()
                        for i in beam_idx
                    ]
                    reordered_layer_past = torch.cat(reordered_layer_past,
                                                     dim=1)
                    # check that shape matches
                    assert reordered_layer_past.shape == layer_past.shape
                    reordered_past.append(reordered_layer_past)
                past = tuple(reordered_past)

            # update current length
            cur_len = cur_len + 1

            # stop when we are done with each sentence
            if all(done):
                break

        # select the best hypotheses
        tgt_len = input_ids.new(batch_size)
        best = []

        for i, hypotheses in enumerate(generated_hyps):
            if len(hypotheses.hyp) == 0:
                continue

            best_hyp = max(hypotheses.hyp, key=lambda x: x[0])[1]
            tgt_len[i] = len(best_hyp) + 1  # +1 for the <EOS> symbol
            best.append(best_hyp)

        # generate target batch
        decoded = input_ids.new(batch_size,
                                tgt_len.max().item()).fill_(pad_token_id)
        for i, hypo in enumerate(best):
            decoded[i, :tgt_len[i] - 1] = hypo
            decoded[i, tgt_len[i] - 1] = eos_token_ids[0]

        return decoded

    model_input = LEADING_TEXT

    while True:
        user_prompt = input(' >>> ')

        if user_prompt == 'exit':
            exit()

        else:
            num_return_sequences = 1

            model_input += ' [P0] ' + user_prompt + ' [SEP] [P1] '

            input_ids = tokenizer.encode(model_input).ids
            input_ids = torch.LongTensor(input_ids).unsqueeze(0)
            input_ids = input_ids.to(device)

            output = _generate(input_ids=input_ids,
                               max_length=min(max_length,
                                              input_ids.size(1) + 40))

            if num_return_sequences != 1:
                output = output.view(batch_size, num_return_sequences, -1)

            response = tokenizer.decode(output[0].cpu().tolist(),
                                        skip_special_tokens=False)

            eod_token = '[DOC_SEP]'

            if eod_token in response:
                response = response[response.index(eod_token):]

            start_token = '[P1]'
            sep_token = '[SEP]'

            if start_token in response:
                start_idx = response.index(start_token) + len(start_token) + 1
                response = response[start_idx:]

            if sep_token in response:
                sep_idx = response.index(sep_token)
                response = response[:sep_idx]

            model_input += response + f' {sep_token} '

            print('Bot: ' + response)
Example #10
class TransformerBot(BaseBot):
    def __init__(self,
                 train_dataset,
                 test_dataset,
                 *,
                 val_dataset,
                 n_layers=6,
                 n_head=8,
                 d_model=512,
                 d_inner_hid=1024,
                 d_k=64,
                 d_v=64,
                 edrop=0.25,
                 odrop=0.25,
                 hdrop=0.1,
                 propagate=False,
                 steps=15,
                 avg_window=AVERAGING_WINDOW,
                 clip_grad=5,
                 min_length=TRAIN_PERIODS,
                 tf_decay=0.7**(1 / 6),
                 tf_min=0.02,
                 tf_warmup=12000,
                 tf_steps=2000):
        self.name = "transformer"
        if propagate:
            self.name += "_tf"
        super(TransformerBot, self).__init__(train_dataset,
                                             test_dataset,
                                             clip_grad=clip_grad,
                                             val_dataset=val_dataset,
                                             avg_window=avg_window)
        self.model = TransformerModel(n_max_seq=TRAIN_PERIODS,
                                      n_layers=n_layers,
                                      n_head=n_head,
                                      d_word_vec=d_model,
                                      d_model=d_model,
                                      d_inner_hid=d_inner_hid,
                                      d_k=d_k,
                                      d_v=d_v,
                                      propagate=propagate,
                                      hdrop=hdrop,
                                      edrop=edrop,
                                      odrop=odrop,
                                      min_length=min_length,
                                      y_scale_by=1 / self.global_stds[0],
                                      steps=steps)
        self.model.cuda()
        self.current_tf_ratio = 1
        self.best_tf_ratio = 1
        self.tf_min = tf_min
        self.tf_decay = tf_decay
        self.tf_steps = tf_steps
        self.tf_warmup = tf_warmup
        self.logger.info(str(self.model))
        if propagate:
            self.logger.info(
                "TF min: {:.2f} TF decay: {:.4f} TF steps: {:d} TF warmup: {:d}"
                .format(tf_min, tf_decay, tf_steps, tf_warmup))
        self.tbwriter.add_text("model_structure", str(self.model))
        self.tbwriter.add_text(
            "TF_setting",
            "TF min: {:.2f} TF decay: {:.4f} TF steps: {:d} TF warmup: {:d}".
            format(tf_min, tf_decay, tf_steps, tf_warmup))

    def get_model_params(self, steps=0, is_train=True):
        if is_train:
            if steps < self.tf_warmup:
                return {"tf_ratio": 1}
            if (steps - self.tf_warmup) % self.tf_steps == 0:
                self.current_tf_ratio = max(
                    self.current_tf_ratio * self.tf_decay, self.tf_min)
            return {"tf_ratio": self.current_tf_ratio}
        return {"tf_ratio": 0}
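
    # Schedule implemented by get_model_params above: tf_ratio stays at 1.0
    # for the first tf_warmup steps, then is multiplied by tf_decay every
    # tf_steps steps and floored at tf_min; evaluation runs with tf_ratio = 0.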

    def reset_params(self):
        self.current_tf_ratio = 1
        self.best_tf_ratio = 1

    def additional_logging(self, step):
        if self.model.propagate:
            self.logger.info("Current tf_ratio: {:.4f}".format(
                self.current_tf_ratio))
            self.tbwriter.add_scalar("tf_ratio", self.current_tf_ratio, step)

    def save_state(self):
        self.best_tf_ratio = self.current_tf_ratio
Example #11
                                    sort_within_batch=False)
    valid_iterator = BucketIterator(valid,
                                    batch_size=batch_size,
                                    device=device,
                                    sort=False,
                                    sort_within_batch=False)

    # sumeval evaluator
    evaluator = SumEvaluator(metrics=metrics, stopwords=False, lang="en")

    # Transformer model
    model = TransformerModel(
        len(IN_TEXT.vocab),
        t_conf["model"]["params"]["d_model"],  # emb_size
        len(OUT_TEXT.vocab),
        pretrained_vectors=None,
        nhead=t_conf["model"]["params"]["nhead"],
        num_encoder_layers=t_conf["model"]["params"]["num_encoder_layer"],
        num_decoder_layers=t_conf["model"]["params"]["num_decoder_layer"],
        dim_feedforward=t_conf["model"]["params"]["dim_feedforward"],
        dropout=t_conf["model"]["params"]["dropout"]).to(device)

    # Optimizer
    # General template to make an optimizer instance
    # e.g.,
    #        optimizer = optim.SGD(model.parameters(),
    #                              lr=0.1,
    #                              momentum=0.9,
    #                              nesterov=True)
    optimizer = eval("{}(model.parameters(), **{})".format(
        t_conf["training"]["optimizer"]["cls"],
        str(t_conf["training"]["optimizer"]["params"])))
Example #12

def get_batch(source, i):
    seq_len = min(bptt, len(source) - 1 - i)
    data = source[i:i + seq_len]
    target = source[i + 1:i + 1 + seq_len].view(-1)
    return data, target
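
# Example: with bptt = 2 and source rows [w0, w1, w2, w3], get_batch(source, 0)
# returns data = [w0, w1] and target = [w1, w2] flattened; the target is the
# input shifted one position forward (next-token prediction).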


ntokens = len(TEXT.vocab.stoi)  # the size of vocabulary
emsize = 200  # embedding dimension
nhid = 200  # the dimension of the feedforward network model in nn.TransformerEncoder
nlayers = 2  # the number of nn.TransformerEncoderLayer in nn.TransformerEncoder
nhead = 2  # the number of heads in the multiheadattention models
dropout = 0.2  # the dropout value
model = TransformerModel(ntokens, emsize, nhead, nhid, nlayers,
                         dropout).to(device)

criterion = nn.CrossEntropyLoss()
lr = 5.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.95)


def train():
    model.train()  # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    ntokens = len(TEXT.vocab.stoi)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        optimizer.zero_grad()
Example #13
def main(args):
    # Set up logging and devices
    startime = datetime.now()
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)

    time_log = args.log_time
    if time_log > 0:
        log.info(f'Start training at: {startime.strftime("%H:%M:%S")}')

    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))
    model_type = args.model

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # check this
    #useCharEmbeddings = args.model == 'BiDAFplus'

    # Get embeddings
    log.info('Loading embeddings...')
    print(f'{args.word_emb_file}')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)
    if time_log > 0:
        log.info(f'Loaded embeddings: {(datetime.now()-startime).seconds}')
    # load_char_vectors
    # Get model
    log.info('Building model...')
    if model_type == 'BiDAFplus':
        model = BiDAFplus(word_vectors=word_vectors,
                          char_vectors=char_vectors,
                          hidden_size=args.hidden_size,
                          params=get_params(model_type, args.params))

    elif model_type == 'BiDAFbase':
        model = BiDAFbase(word_vectors=word_vectors,
                          hidden_size=args.hidden_size,
                          drop_prob=args.drop_prob)

    elif model_type == "Transformer":
        model = TransformerModel(word_vectors=word_vectors,
                                 char_vectors=char_vectors,
                                 params=get_params(model_type, args.params))

    elif model_type == 'BiDAF':
        model = BiDAF(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      hidden_size=args.hidden_size,
                      params=get_params(model_type, args.params))

    model = nn.DataParallel(model, args.gpu_ids)
    if time_log > 0:
        log.info(f'Built model: {(datetime.now()-startime).seconds}')
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    if args.mode != 'quick_eval':
        train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
        train_loader = data.DataLoader(train_dataset,
                                       batch_size=args.batch_size,
                                       shuffle=True,
                                       num_workers=args.num_workers,
                                       collate_fn=collate_fn)

        dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
        dev_loader = data.DataLoader(dev_dataset,
                                     batch_size=args.batch_size,
                                     shuffle=False,
                                     num_workers=args.num_workers,
                                     collate_fn=collate_fn)

    else:
        loaded_data = quick_eval_data_loader()
        train_loader = [loaded_data for _ in range(5)]
        dev_loader = [quick_eval_data_loader(dev=True)]
        train_dataset = train_loader
        dev_dataset = dev_loader

    log.info('Built dataset: {}:{}'.format(*divmod((datetime.now() -
                                                    startime).seconds, 60)))

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    if time_log > 0:
        traintime = datetime.now()
    total_iterations = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        if time_log > 0:
            epochtime = datetime.now()
        if args.mode != 'quick_eval':
            progress_len = len(train_loader.dataset)
        else:
            progress_len = len(train_loader)
        with torch.enable_grad(), \
                tqdm(total=progress_len) as progress_bar:

            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:

                #quick_eval_data_saver(cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids)

                #########
                if time_log > 0:
                    itertime = datetime.now()
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                if model_type == 'BiDAF' or model_type == "Transformer":
                    cc_idxs = cc_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)

                    log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs)

                # Forward
                elif model_type == 'BiDAFbase':
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)

                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                if time_log > 2:
                    forwardtime = datetime.now()
                    log.info('Forward time {}:{}'.format(
                        *divmod((forwardtime - itertime).seconds, 60)))
                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                if time_log > 2:
                    backwardtime = datetime.now()
                    log.info('Backward time {}:{}'.format(
                        *divmod((backwardtime - forwardtime).seconds, 60)))
                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                if time_log > 0:
                    enditertime = datetime.now()
                    #log.info('Iteration {} {}:{}'.format(total_iterations,
                    #    *divmod((enditertime-itertime).seconds, 60)))

                steps_till_eval -= batch_size
                if steps_till_eval <= 0 or args.mode == 'quick_eval':
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(
                        model,
                        dev_loader,
                        device,
                        args.dev_eval_file,
                        args.max_ans_len,
                        args.use_squad_v2,
                        model_type,
                        quick_eval=args.mode == 'quick_eval')
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    if time_log > 1:
                        log.info('Eval time {}:{}'.format(
                            *divmod((datetime.now() -
                                     enditertime).seconds, 60)))

                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
                total_iterations += 1
                if ((time_log == 2) and (total_iterations % 10 == 0)) or (
                    (time_log == 1) and (total_iterations % 100 == 0)):
                    log.info('Mean iteration time {}:{}'.format(
                        *divmod((enditertime - traintime).seconds /
                                total_iterations, 60)))

        if time_log > 0:
            endepochtime = datetime.now()
            log.info('Epoch time {}:{}'.format(
                *divmod((endepochtime - epochtime).seconds, 60)))
Example #14
    torch.manual_seed(args.seed)

    scan_all = ge.load_scan_file('all', 'train')
    scan_all_var = ge.load_scan_var('all', 'train')

    input_symbols_scan = get_unique_words([c[0] for c in scan_all])
    output_symbols_scan = get_unique_words([c[1] for c in scan_all])

    all_symbols_scan = input_symbols_scan + output_symbols_scan
    all_lang = Lang(all_symbols_scan)
    ntoken = all_lang.n_symbols

    # set up transformer encoder-decoder model, loss, optimizer
    model = TransformerModel(ntoken=ntoken,
                             emsize=args.emsize,
                             nhead=args.nhead,
                             nhid=args.nhid,
                             nlayers=args.nlayers,
                             dropout=args.dropout)
    model = nn.DataParallel(model).cuda()
    criterion = nn.NLLLoss().cuda()
    optimizer = torch.optim.Adam(model.parameters(), args.lr)

    if args.model_path:
        if os.path.isfile(args.model_path):
            print('Loading model at:', args.model_path)
            checkpoint = torch.load(args.model_path)
            model.load_state_dict(checkpoint['model_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        else:
            print("=> no checkpoint found at '{}'".format(args.model_path))
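
The matching save-side call, inferred from the state-dict keys the snippet loads (an assumption; the original save code is not shown):

torch.save({'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict()}, args.model_path)
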
Example #15
def main():
    args = parse_args()
    if args.deterministic:
        random.seed(0)
        torch.manual_seed(0)
        np.random.seed(0)
        torch.backends.cudnn.deterministic = True

    logging.basicConfig(
        format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
        datefmt='%m/%d/%Y %H:%M:%S',
        level=logging.INFO)
    logger = logging.getLogger(__name__)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    args.gpu = 0

    TEXT = torchtext.data.Field(tokenize=get_tokenizer("basic_english"),
                                init_token='<sos>',
                                eos_token='<eos>',
                                lower=False)
    train_txt, val_txt, test_txt = torchtext.datasets.WikiText2.splits(
        TEXT, root=args.data_dir)
    TEXT.build_vocab(train_txt)

    model = TransformerModel(len(TEXT.vocab.stoi), args.em_size,
                             args.num_heads, args.hid_size,
                             args.num_layers).to(device)
    # model = torch.nn.DataParallel(model, dim=1)
    # optimiser = optim.Adam(model.parameters())
    # optimiser = Ranger(model.parameters())
    optimiser = RAdam(model.parameters())

    if args.eval:
        dataloaders = {
            "test":
            DataLoader(TextEvalDataset(test_txt, args.ngram, TEXT),
                       batch_size=args.eval_batch_size,
                       shuffle=False)
        }
        if args.resume:
            resume(model, args)

        test_loss, test_acc = eval_pll(device, model, dataloaders["test"],
                                       args)
        logger.info(f"Eval: Test Loss = {test_loss}, Test Acc = {test_acc}")
    else:
        dataloaders = {
            "train":
            DataLoader(TextTrainDataset(train_txt, args.ngram, TEXT,
                                        args.poisson_rate),
                       batch_size=args.train_batch_size,
                       shuffle=True),
            "val":
            DataLoader(TextEvalDataset(val_txt, args.ngram, TEXT),
                       batch_size=args.eval_batch_size,
                       shuffle=False),
            "test":
            DataLoader(TextEvalDataset(test_txt, args.ngram, TEXT),
                       batch_size=args.eval_batch_size,
                       shuffle=False)
        }
        args.start_epoch = 0
        args.best_acc = 1 / args.ngram
        if args.resume:
            resume(model, args, optimiser)

        # Create folder for the current model and save args
        model_dir = time.ctime().replace(" ", "_").replace(":", "_")
        args.model_dir = os.path.join("models", model_dir)
        os.makedirs(args.model_dir, exist_ok=True)
        with open(os.path.join(args.model_dir, "args.json"), "w") as f:
            json.dump(args.__dict__, f, indent=2)
        args.logger = logger
        train_pll(device, model, optimiser, dataloaders, args)
Example #16
                                   device=device,
                                   sort=False,
                                   sort_within_batch=False)
    agg_test_iterator = BucketIterator(agg_test,
                                       batch_size=batch_size,
                                       device=device,
                                       sort=False,
                                       sort_within_batch=False)
    # ================================================================

    # Load model
    model = TransformerModel(
        len(IN_TEXT.vocab),
        t_conf["model"]["params"]["d_model"],  # emb_size
        len(OUT_TEXT.vocab),
        pretrained_vectors=None,
        nhead=t_conf["model"]["params"]["nhead"],
        num_encoder_layers=t_conf["model"]["params"]["num_encoder_layer"],
        num_decoder_layers=t_conf["model"]["params"]["num_decoder_layer"],
        dim_feedforward=t_conf["model"]["params"]["dim_feedforward"],
        dropout=t_conf["model"]["params"]["dropout"]).to(device)
    model.load_state_dict(torch.load(model_filepath, map_location=device))

    # sumeval evaluator
    evaluator = SumEvaluator(metrics=t_conf["metrics"],
                             stopwords=False,
                             lang="en")

    # Old script used t_conf["training"]["gen_maxlen"]
    gen_maxlen = g_conf["gen_maxtoken"]

    ## 1. Generation for each entity in "aggregated" test_{}.csv
Example #17
train_data, val_data, test_data, vocab = dh.get_data()

#Hyper params
n_tokens = len(vocab.stoi)  # the size of vocabulary
emb_size = 512  # embedding size
n_hidden = 200  # dimension of the FF network inside the transformer
n_layers = 2  # number of transformer layers
n_heads = 2  # the number of heads in the multiheadattention models
dropout = 0.2  # dropout percentage
criterion = nn.CrossEntropyLoss()  #loss function
lr = 5.0  # learning rate

# Creating the model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
model = TransformerModel(n_tokens, emb_size, n_heads, n_hidden, n_layers,
                         dropout).to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=lr)  #Optimizer
scheduler = torch.optim.lr_scheduler.StepLR(
    optimizer, 1, gamma=0.95)  #Scheduler for the optimizer


def train(model):
    model.train()  # Turn on the train mode
    total_loss = 0.
    start_time = time.time()
    src_mask = model.generate_square_subsequent_mask(dh.bptt).to(device)
    for batch, i in enumerate(range(0, train_data.size(0) - 1, dh.bptt)):
        data, targets = dh.get_batch(train_data, i)
        optimizer.zero_grad()
        if data.size(0) != dh.bptt:
Example #18
                eos_token='<eos>',
                lower=True)

    train_data, val_data, test_data = Multi30k.splits(exts=('.de', '.en'),
                                                      fields=(SRC, TRG))
    train_iter, val_iter, test_iter = BucketIterator.splits(
        (train_data, val_data, test_data), batch_size=args.batch_size)

    print(len(train_iter))

    SRC.build_vocab(train_data, min_freq=2)
    TRG.build_vocab(train_data, min_freq=2)

    # Create model
    model = TransformerModel(len(SRC.vocab), len(TRG.vocab), args.d_model,
                             args.n_head, args.num_enc_layers,
                             args.num_dec_layers, args.dim_feedforword,
                             args.dropout, args.activation).to(device)
    if args.resume_model is not None:
        start_epoch, best_wer = resume_model(model, args.resume_model)
    # Run the model parallelly
    if torch.cuda.device_count() > 1:
        logger.info("Using {} GPUs".format(torch.cuda.device_count()))
        model = nn.DataParallel(model)
    # Create loss criterion & optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=args.learning_rate)

    # Start training
    logger.info("Training Started".center(60, '#'))
    for epoch in range(start_epoch, args.epochs):
        # Train the model
Example #19
                    .with_ascii_quotes_replacement()\
                    .with_possessive_elimination()\
                    .with_punct_removal()\
                    .with_stopwords_removal()\
                    .with_digit_removal()\
                    .build()

    clean_train_df, edited_col_name_train = preprocessor.preprocess(train_df)
    clean_test_df, edited_col_name_test = preprocessor.preprocess(test_df)

    # We set our training data and test data
    training_data = clean_train_df[edited_col_name_train]
    test_data = clean_test_df[edited_col_name_test]

    # Create tokenizer, model
    model = TransformerModel.TransformerModelBuilder().build()
    print(model)
    print("Model initialised.")

    # Prepare the dataset
    train_X = model.tokenize(training_data.to_list())
    train_dataset = Task1Dataset(train_X, train_df['meanGrade'])

    model.to(device)

    train_proportion = 0.8

    train_examples = round(len(train_dataset) * train_proportion)
    dev_examples = len(train_dataset) - train_examples

    train_dataset, dev_dataset = random_split(train_dataset,
Example #20
class Predictor(PredictorBase):
    def __init__(self, config):
        super(Predictor, self).__init__(config)
        self.model = None
        self.config = config

        self.word_to_index, self.label_to_index = self.load_vocab()
        self.index_to_label = {
            value: key
            for key, value in self.label_to_index.items()
        }
        self.vocab_size = len(self.word_to_index)
        self.word_vectors = None
        self.sequence_length = self.config["sequence_length"]

        # Build the model
        self.create_model()
        # Load the computation graph
        self.load_graph()

    def load_vocab(self):
        # Load the vocabulary-to-index and label-to-index mappings
        with open(os.path.join(self.output_path, "word_to_index.pkl"),
                  "rb") as f:
            word_to_index = pickle.load(f)

        with open(os.path.join(self.output_path, "label_to_index.pkl"),
                  "rb") as f:
            label_to_index = pickle.load(f)

        return word_to_index, label_to_index

    def sentence_to_idx(self, sentence):
        """
        将分词后的句子转换成idx表示
        :param sentence:
        :return:
        """
        sentence_ids = [
            self.word_to_index.get(token, self.word_to_index["<UNK>"])
            for token in sentence
        ]
        sentence_pad = sentence_ids[: self.sequence_length] if len(sentence_ids) > self.sequence_length \
            else sentence_ids + [0] * (self.sequence_length - len(sentence_ids))
        return sentence_pad

    def load_graph(self):
        """
        加载计算图
        :return:
        """
        self.sess = tf.Session()
        ckpt = tf.train.get_checkpoint_state(
            os.path.join(os.path.abspath(os.path.dirname(os.getcwd())),
                         self.config["ckpt_model_path"]))
        if ckpt and tf.train.checkpoint_exists(ckpt.model_checkpoint_path):
            print('Reloading model parameters..')
            self.model.saver.restore(self.sess, ckpt.model_checkpoint_path)
        else:
            raise ValueError('No such file:[{}]'.format(
                self.config["ckpt_model_path"]))

    def create_model(self):
        """
                根据config文件选择对应的模型,并初始化
                :return:
                """
        if self.config["model_name"] == "textcnn":
            self.model = TextCnnModel(config=self.config,
                                      vocab_size=self.vocab_size,
                                      word_vectors=self.word_vectors)
        elif self.config["model_name"] == "bilstm":
            self.model = BiLstmModel(config=self.config,
                                     vocab_size=self.vocab_size,
                                     word_vectors=self.word_vectors)
        elif self.config["model_name"] == "bilstm_atten":
            self.model = BiLstmAttenModel(config=self.config,
                                          vocab_size=self.vocab_size,
                                          word_vectors=self.word_vectors)
        elif self.config["model_name"] == "rcnn":
            self.model = RcnnModel(config=self.config,
                                   vocab_size=self.vocab_size,
                                   word_vectors=self.word_vectors)
        elif self.config["model_name"] == "transformer":
            self.model = TransformerModel(config=self.config,
                                          vocab_size=self.vocab_size,
                                          word_vectors=self.word_vectors)

    def predict(self, sentence):
        """
        给定分词后的句子,预测其分类结果
        :param sentence:
        :return:
        """
        sentence_ids = self.sentence_to_idx(sentence)

        prediction = self.model.infer(self.sess, [sentence_ids]).tolist()[0]
        label = self.index_to_label[prediction[0]]
        return label
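
A hypothetical end-to-end use of the Predictor above. The config keys are inferred from the lookups in the class ("model_name", "sequence_length", "ckpt_model_path", plus "output_path" consumed by the base class), not from a documented schema:

config = {
    "model_name": "transformer",
    "sequence_length": 100,
    "ckpt_model_path": "ckpt/textclf",
    "output_path": "outputs/",
    # ... plus whatever the selected model class reads
}
predictor = Predictor(config)
print(predictor.predict(["this", "movie", "was", "great"]))
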
Example #21
class Trainer(TrainerBase):
    def __init__(self, args):
        super(Trainer, self).__init__()
        self.args = args
        with open(
                os.path.join(os.path.abspath(os.path.dirname(os.getcwd())),
                             args.config_path), "r") as fr:
            self.config = json.load(fr)

        self.train_data_obj = None
        self.eval_data_obj = None
        self.model = None
        # save_path: directory where model checkpoints are saved
        self.save_path = os.path.join(
            os.path.abspath(os.path.dirname(os.getcwd())),
            self.config["ckpt_model_path"])
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        # self.builder = tf.saved_model.builder.SavedModelBuilder("../pb_model/weibo/bilstm/savedModel")

        # Load the datasets
        self.load_data()
        self.train_inputs, self.train_labels, label_to_idx = self.train_data_obj.gen_data(
        )
        print("train data size: {}".format(len(self.train_labels)))
        self.vocab_size = self.train_data_obj.vocab_size
        print("vocab size: {}".format(self.vocab_size))
        self.word_vectors = self.train_data_obj.word_vectors
        self.label_list = [value for key, value in label_to_idx.items()]

        self.eval_inputs, self.eval_labels = self.eval_data_obj.gen_data()
        print("eval data size: {}".format(len(self.eval_labels)))
        print("label numbers: ", len(self.label_list))
        # Initialize the model object
        self.create_model()

    def load_data(self):
        """
        创建数据对象
        :return:
        """
        # Create the training-set object and generate the training data
        self.train_data_obj = TrainData(self.config)

        # Create the validation-set object and the validation data
        self.eval_data_obj = EvalData(self.config)

    def create_model(self):
        """
        根据config文件选择对应的模型,并初始化
        :return:
        """
        if self.config["model_name"] == "textcnn":
            self.model = TextCnnModel(config=self.config,
                                      vocab_size=self.vocab_size,
                                      word_vectors=self.word_vectors)
        elif self.config["model_name"] == "bilstm":
            self.model = BiLstmModel(config=self.config,
                                     vocab_size=self.vocab_size,
                                     word_vectors=self.word_vectors)
        elif self.config["model_name"] == "bilstm_atten":
            self.model = BiLstmAttenModel(config=self.config,
                                          vocab_size=self.vocab_size,
                                          word_vectors=self.word_vectors)
        elif self.config["model_name"] == "rcnn":
            self.model = RcnnModel(config=self.config,
                                   vocab_size=self.vocab_size,
                                   word_vectors=self.word_vectors)
        elif self.config["model_name"] == "transformer":
            self.model = TransformerModel(config=self.config,
                                          vocab_size=self.vocab_size,
                                          word_vectors=self.word_vectors)

    def train(self):
        """
        训练模型
        :return:
        """
        gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.9,
                                    allow_growth=True)
        sess_config = tf.ConfigProto(log_device_placement=False,
                                     allow_soft_placement=True,
                                     gpu_options=gpu_options)
        with tf.Session(config=sess_config) as sess:
            # Initialize variable values
            sess.run(tf.global_variables_initializer())
            current_step = 0
            eval_loss_lis = [0]

            # Create summary paths and writer objects for train and eval
            train_summary_path = os.path.join(
                os.path.abspath(os.path.dirname(os.getcwd())),
                self.config["output_path"] + "/summary/train")
            if not os.path.exists(train_summary_path):
                os.makedirs(train_summary_path)
            train_summary_writer = tf.summary.FileWriter(
                train_summary_path, sess.graph)

            eval_summary_path = os.path.join(
                os.path.abspath(os.path.dirname(os.getcwd())),
                self.config["output_path"] + "/summary/eval")
            if not os.path.exists(eval_summary_path):
                os.makedirs(eval_summary_path)
            eval_summary_writer = tf.summary.FileWriter(
                eval_summary_path, sess.graph)

            for epoch in range(self.config["epochs"]):

                print("----- Epoch {}/{} -----".format(epoch + 1,
                                                       self.config["epochs"]))

                for batch in self.train_data_obj.next_batch(
                        self.train_inputs, self.train_labels,
                        self.config["batch_size"]):
                    summary, loss, predictions = self.model.train(
                        sess, batch, self.config["keep_prob"],
                        self.config['learning_rate'])
                    train_summary_writer.add_summary(summary, current_step)
                    current_step += 1

                    if current_step % self.config["print_every"] == 0:
                        if self.config["num_classes"] == 1:
                            acc, auc, recall, prec, f_beta = get_binary_metrics(
                                pred_y=predictions, true_y=batch["y"])
                            print(
                                "train: step: {}, loss: {}, acc: {}, auc: {}, "
                                "recall: {}, precision: {}, f_beta: {}".format(
                                    current_step, loss, acc, auc, recall,
                                    prec, f_beta))
                        elif self.config["num_classes"] > 1:
                            acc, recall, prec, f_beta = get_multi_metrics(
                                pred_y=predictions,
                                true_y=batch["y"],
                                labels=self.label_list)
                            print(
                                "train: step: {}, loss: {}, acc: {}, "
                                "recall: {}, precision: {}, f_beta: {}".format(
                                    current_step, loss, acc, recall, prec,
                                    f_beta))

                # After each training epoch, report metrics on the eval set
                if self.eval_data_obj:

                    eval_losses = []
                    eval_accs = []
                    eval_aucs = []
                    eval_recalls = []
                    eval_precs = []
                    eval_f_betas = []
                    for eval_batch in self.eval_data_obj.next_batch(
                            self.eval_inputs, self.eval_labels,
                            self.config["batch_size"]):
                        eval_summary, eval_loss, eval_predictions = self.model.eval(
                            sess, eval_batch)
                        eval_summary_writer.add_summary(eval_summary,
                                                        current_step)

                        eval_losses.append(eval_loss)
                        if self.config["num_classes"] == 1:
                            acc, auc, recall, prec, f_beta = get_binary_metrics(
                                pred_y=eval_predictions,
                                true_y=eval_batch["y"])
                            eval_accs.append(acc)
                            eval_aucs.append(auc)
                            eval_recalls.append(recall)
                            eval_precs.append(prec)
                            eval_f_betas.append(f_beta)
                        elif self.config["num_classes"] > 1:
                            acc, recall, prec, f_beta = get_multi_metrics(
                                pred_y=eval_predictions,
                                true_y=eval_batch["y"],
                                labels=self.label_list)
                            eval_accs.append(acc)
                            eval_recalls.append(recall)
                            eval_precs.append(prec)
                            eval_f_betas.append(f_beta)
                    eval_loss_lis.append(mean(eval_losses))
                    print("\n")
                    # auc is only tracked in the binary case; report 0.0
                    # otherwise instead of taking the mean of an empty list.
                    print(
                        "eval:  loss: {}, acc: {}, auc: {}, recall: {}, precision: {}, f_beta: {}"
                        .format(mean(eval_losses), mean(eval_accs),
                                mean(eval_aucs) if eval_aucs else 0.0,
                                mean(eval_recalls), mean(eval_precs),
                                mean(eval_f_betas)))
                    print("\n")

                    if self.config["ckpt_model_path"] and eval_loss_lis[
                            -1] >= max(eval_loss_lis):
                        #self.model_save_path是模型保存具体的名字
                        self.model_save_path = os.path.join(
                            self.save_path, self.config["model_name"])
                        self.model.saver.save(sess,
                                              self.model_save_path,
                                              global_step=epoch + 1)
                    elif self.config["ckpt_model_path"] and eval_loss_lis[
                            -1] < max(eval_loss_lis):
                        if self.config['batch_size'] <= 256:
                            self.config['batch_size'] *= 2
                        if self.config['learning_rate'] <= 0.00001:
                            self.config['learning_rate'] *= 0.95
                            print(
                                "epoch: {} lr: {} self.batch_size: {}".format(
                                    epoch, self.lr, self.batch_size))
                            self.save_path = tf.train.latest_checkpoint(
                                self.save_path)
                            print('最新加载的模型路径{}'.format(self.save_path))
                        else:
                            print('learn_rate 小于0.00001,训练结束')
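Distilled from the epoch loop above, the checkpoint policy is: save when the eval loss reaches a new best, otherwise grow the batch and decay the learning rate until a floor is hit. A minimal standalone sketch (save_fn is a hypothetical checkpoint hook, not part of this trainer):

def on_epoch_end(eval_loss, state, save_fn):
    """Return False to stop training; state carries best/lr/batch_size."""
    if eval_loss <= state['best']:                  # new best: checkpoint
        state['best'] = eval_loss
        save_fn()
    else:                                           # plateau
        state['batch_size'] = min(state['batch_size'] * 2, 512)
        state['lr'] *= 0.95
        if state['lr'] < 1e-5:                      # learning-rate floor
            return False
    return True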
Beispiel #22
0
class LanguageModel(LightningModule):
    def __init__(self, hparams: dict, **kwargs) -> None:
        # init superclass
        super().__init__(**kwargs)

        self.save_hyperparameters()

        self.hparams = hparams
        if self.hparams.model == 'awd':
            self.model = WDLSTM(
                self.hparams.num_tokens,
                num_layers=self.hparams.num_layers,
                num_hidden=self.hparams.num_hidden,
                num_embedding=self.hparams.num_embedding,
                tie_weights=self.hparams.tie_weights,
                embedding_dropout=self.hparams.embedding_dropout,
                input_dropout=self.hparams.input_dropout,
                hidden_dropout=self.hparams.hidden_dropout,
                output_dropout=self.hparams.output_dropout,
                weight_dropout=self.hparams.weight_dropout)

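            # Presumably a dry-run forward pass so the weight-dropped
            # parameters are materialized before training starts
            # (assumption; the call is otherwise undocumented).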
            self.model(
                torch.zeros(self.hparams.bptt, self.hparams.batch_size).long(),
                self.model.init_hidden(self.hparams.batch_size))
        elif self.hparams.model == 'rnn':
            self.model = RNNModel(self.hparams.rnn_type,
                                  self.hparams.num_tokens,
                                  num_embedding=self.hparams.num_embedding,
                                  num_hidden=self.hparams.num_hidden,
                                  num_layers=self.hparams.num_layers,
                                  dropout=self.hparams.dropout,
                                  tie_weights=self.hparams.tie_weights)
        elif self.hparams.model == 'transformer':
            self.model = TransformerModel(
                self.hparams.num_tokens,
                num_embedding=self.hparams.num_embedding,
                num_hidden=self.hparams.num_hidden,
                num_layers=self.hparams.num_layers,
                dropout=self.hparams.dropout,
                num_heads=self.hparams.num_heads)
        else:
            raise ValueError(f'Model {self.hparams.model} not recognized.')

        self.hiddens = None
        self.criterion = torch.nn.NLLLoss()
        self.avg_loss = 0

    def forward(self, x, hiddens=None):
        if self.hparams.model != 'transformer':
            return self.model(x, hiddens)
        return self.model(x)

    def on_train_epoch_start(self):
        self.train_len = len(
            self.train_dataloader().batch_sampler) * self.hparams.bptt
        if self.hparams.model != 'transformer':
            self.hiddens = self.model.init_hidden(self.hparams.batch_size)

    def training_step(self, batch, batch_idx):
        x, y = batch

        if self.hparams.model == 'awd':
            self.hiddens = repackage_hidden(self.hiddens)
            out, self.hiddens, (hs, dropped_hs) = self(x, self.hiddens)
        elif self.hparams.model == 'rnn':
            self.hiddens = repackage_hidden(
                self.hiddens) if self.hiddens else self.hiddens
            out, self.hiddens = self(x, self.hiddens)
        elif self.hparams.model == 'transformer':
            out = self(x)

        raw_loss = self.criterion(out, y)
        loss = raw_loss

        # The AR and TAR losses are only applied to the output of the final
        # RNN layer, not to all layers.

        if self.hparams.model == 'awd':
            # NOTE: this implements the squared l2 norm (mean of squares)
            # rather than the plain l2 norm.
            # Activation Regularization
            if self.hparams.alpha > 0:
                loss += self.hparams.alpha * dropped_hs[-1].pow(2).mean()

            # Temporal Activation Regularization (slowness)
            if self.hparams.beta > 0:
                loss += self.hparams.beta * \
                    (hs[-1][1:] - hs[-1][:-1]).pow(2).mean()
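            # For reference (Merity et al. 2017), with dropped output d_t and
            # raw output h_t of the last layer:
            #   AR  = alpha * mean(d_t ** 2)             (activation norm)
            #   TAR = beta  * mean((h_t - h_{t+1}) ** 2) (temporal slowness)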

        ppl = torch.exp(raw_loss)
        bpc = raw_loss / math.log(2)

        self.log('train_loss', loss)
        self.log('train_ppl', ppl, prog_bar=True)
        self.log('train_bpc', bpc, prog_bar=True)

        return loss

    def on_validation_epoch_start(self):
        self.val_len = len(
            self.val_dataloader().batch_sampler) * self.hparams.bptt
        if self.hparams.model != 'transformer':
            self.hiddens = self.model.init_hidden(self.hparams.batch_size)

    def validation_step(self, batch, batch_idx):
        x, y = batch

        if self.hparams.model == 'awd':
            self.hiddens = repackage_hidden(self.hiddens)
            out, self.hiddens, (hs, dropped_hs) = self(x, self.hiddens)
        elif self.hparams.model == 'rnn':
            self.hiddens = repackage_hidden(
                self.hiddens) if self.hiddens else self.hiddens
            out, self.hiddens = self(x, self.hiddens)
        elif self.hparams.model == 'transformer':
            out = self(x)

        loss = self.criterion(out, y)

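        # Each batch contributes len(x) * loss, so the reduce_fx below can
        # divide the running sum by val_len (total number of target tokens)
        # to recover a length-weighted mean loss; bpc and ppl are derived
        # from that mean.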
        self.log('val_loss',
                 len(x) * loss,
                 prog_bar=True,
                 reduce_fx=lambda x: torch.sum(x) / self.val_len)
        self.log('val_bpc',
                 len(x) * loss,
                 prog_bar=True,
                 reduce_fx=lambda x:
                 (torch.sum(x) / self.val_len) / math.log(2))
        self.log('val_ppl',
                 len(x) * loss,
                 prog_bar=True,
                 reduce_fx=lambda x: torch.exp(torch.sum(x) / self.val_len))
        return loss

    def on_test_epoch_start(self):
        self.test_len = len(
            self.test_dataloader().batch_sampler) * self.hparams.bptt
        if self.hparams.model != 'transformer':
            self.hiddens = self.model.init_hidden(self.hparams.batch_size)

    def test_step(self, batch, batch_idx):
        x, y = batch

        if self.hparams.model == 'awd':
            self.hiddens = repackage_hidden(self.hiddens)
            out, self.hiddens, (hs, dropped_hs) = self(x, self.hiddens)
        elif self.hparams.model == 'rnn':
            self.hiddens = repackage_hidden(
                self.hiddens) if self.hiddens else self.hiddens
            out, self.hiddens = self(x, self.hiddens)
        elif self.hparams.model == 'transformer':
            out = self(x)

        loss = self.criterion(out, y)

        self.log('test_loss',
                 len(x) * loss,
                 prog_bar=True,
                 reduce_fx=lambda x: torch.sum(x) / self.test_len)
        self.log('test_bpc',
                 len(x) * loss,
                 prog_bar=True,
                 reduce_fx=lambda x:
                 (torch.sum(x) / self.test_len) / math.log(2))
        self.log('test_ppl',
                 len(x) * loss,
                 prog_bar=True,
                 reduce_fx=lambda x: torch.exp(torch.sum(x) / self.test_len))
        return loss

    def configure_optimizers(self):
        """
        Return whatever optimizers and learning rate schedulers you want here.
        At least one optimizer is required.


        WARNING: The paper uses a variant of ASGD called non-monotonically
        triggered ASGD (Algorithm 1), which is not implemented yet. The
        authors set L to the number of iterations in an epoch (i.e., the
        check runs when a training epoch ends) and n = 5.
        """
        if self.hparams.optimizer == 'sgd':
            optimizer = torch.optim.SGD(self.parameters(),
                                        lr=self.hparams.learning_rate,
                                        weight_decay=self.hparams.weight_decay)
        elif self.hparams.optimizer == 'adam':
            optimizer = torch.optim.Adam(
                self.parameters(),
                lr=self.hparams.learning_rate,
                weight_decay=self.hparams.weight_decay)
        else:
            raise ValueError(
                f'Optimizer {self.hparams.optimizer} not recognized.')

        # scheduler = torch.optim.lr_scheduler.MultiStepLR(
        #     optimizer, self.hparams.multi_step_lr_milestones, gamma=0.1)
        scheduler = torch.optim.lr_scheduler.OneCycleLR(
            optimizer,
            max_lr=self.hparams.learning_rate,
            epochs=self.hparams.max_epochs,
            steps_per_epoch=len(self.train_dataloader()))
        return [optimizer], [{'scheduler': scheduler, 'interval': 'step'}]

    @staticmethod
    def add_model_specific_args(parent_parser):
        parser = ArgumentParser(parents=[parent_parser], add_help=False)
        parser.add_argument('--num-embedding',
                            type=int,
                            default=400,
                            help='size of word embeddings')
        parser.add_argument('--num-hidden',
                            type=int,
                            default=1150,
                            help='number of hidden units per layer')
        parser.add_argument('--num-layers',
                            type=int,
                            default=3,
                            help='number of layers')
        parser.add_argument('--learning_rate',
                            '--learning-rate',
                            type=float,
                            default=30.0,
                            help='initial learning rate')
        parser.add_argument('--batch-size',
                            type=int,
                            default=80,
                            metavar='N',
                            help='batch size')
        parser.add_argument('--bptt',
                            type=int,
                            default=70,
                            help='sequence length')
        parser.add_argument('--output-dropout',
                            type=float,
                            default=0.4,
                            help='dropout applied to layers (0 = no dropout)')
        parser.add_argument('--hidden-dropout',
                            type=float,
                            default=0.3,
                            help='dropout for rnn layers (0 = no dropout)')
        parser.add_argument(
            '--input-dropout',
            type=float,
            default=0.65,
            help='dropout for input embedding layers (0 = no dropout)')
        parser.add_argument(
            '--embedding-dropout',
            type=float,
            default=0.1,
            help='dropout to remove words from embedding layer '
            '(0 = no dropout)')
        parser.add_argument(
            '--weight-dropout',
            type=float,
            default=0.5,
            help='amount of weight dropout to apply to the RNN hidden to '
            'hidden matrix')
        parser.add_argument(
            '--alpha',
            type=float,
            default=0,
            help='alpha L2 regularization on RNN activation (alpha = 0 means'
            ' no regularization)')
        parser.add_argument(
            '--beta',
            type=float,
            default=0,
            help='beta slowness regularization applied on RNN activiation '
            '(beta = 0 means no regularization)')
        parser.add_argument('--weight-decay',
                            type=float,
                            default=1.2e-6,
                            help='weight decay applied to all weights')
        parser.add_argument('--optimizer',
                            type=str,
                            default='sgd',
                            help='optimizer to use (sgd, adam)')
        parser.add_argument(
            '--no-tie-weights',
            dest='tie_weights',
            default=True,
            action='store_false',
            help='if set, does not tie the input/output embedding weights')
        parser.add_argument('--rnn-type',
                            choices=['LSTM', 'GRU', 'RNN_TANH', 'RNN_RELU'],
                            default='LSTM')
        parser.add_argument('--dropout', type=float, default=0.2)
        parser.add_argument(
            '--num-heads',
            type=int,
            default=2,
            help='the number of heads in the encoder/decoder of the '
            ' transformer model')
        return parser
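The configure_optimizers docstring above notes that NT-ASGD (Merity et al., Algorithm 1) is not implemented. A minimal sketch of the trigger, modeled on the public awd-lstm-lm reference code (the helper name and its integration point are illustrative, not part of this class):

import torch

def maybe_switch_to_asgd(model, optimizer, val_losses, lr, n=5):
    """Switch from SGD to ASGD once the newest validation loss is worse
    than the best loss recorded more than n checks ago."""
    if (not isinstance(optimizer, torch.optim.ASGD)
            and len(val_losses) > n
            and val_losses[-1] > min(val_losses[:-n])):
        # t0=0 starts averaging immediately; lambd=0 keeps a plain average,
        # as in the reference implementation.
        optimizer = torch.optim.ASGD(model.parameters(), lr=lr, t0=0,
                                     lambd=0.0)
    return optimizer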
Beispiel #23
0
def main(args):

    since = time.time()
    output_dir = os.path.join(os.getcwd(), 'outputs')
    os.makedirs(output_dir, exist_ok=True)

    data_loaders = get_dataloader(
        input_dir=args.input_dir,
        which_challenge='3rd_challenge',
        phases=['test'],
        max_frame_length=args.max_frame_length,
        max_vid_label_length=args.max_vid_label_length,
        max_seg_label_length=args.max_seg_label_length,
        rgb_feature_size=args.rgb_feature_size,
        audio_feature_size=args.audio_feature_size,
        batch_size=args.batch_size,
        num_workers=args.num_workers)

    model = TransformerModel(
        n_layers=args.n_layers,
        n_heads=args.n_heads,
        rgb_feature_size=args.rgb_feature_size,
        audio_feature_size=args.audio_feature_size,
        d_rgb=args.d_rgb,
        d_audio=args.d_audio,
        d_model=args.d_model,
        d_ff=args.d_ff,
        d_proj=args.d_proj,
        n_attns=args.n_attns,
        num_classes=args.num_classes,
        dropout=args.dropout)
    model = model.to(device)

    checkpoint = torch.load(os.path.join(os.getcwd(), 'models/model-epoch-04.ckpt'))
    model.load_state_dict(checkpoint['state_dict'])
    model.eval()

    df_outputs = {i: pd.DataFrame(columns=['vid_id', 'vid_label_pred', 'vid_prob', 'seg_label_pred', 'seg_prob']) \
                      for i in range(1, args.num_classes+1)}

    for idx, (vid_ids, frame_lengths, frame_rgbs, frame_audios, vid_labels, seg_labels, seg_times) \
        in enumerate(data_loaders['test']):           

        if idx % 10 == 0:
            print('idx:', idx)

        # frame_rgbs: [batch_size, frame_length, rgb_feature_size]
        # frame_audios: [batch_size, frame_length, audio_feature_size]
        frame_rgbs = frame_rgbs.to(device)
        frame_audios = frame_audios.to(device)
        batch_size = frame_audios.size(0)

        # vid_probs: [batch_size, num_classes]
        # attn_idc: [batch_size, num_classes]
        # scores: [batch_size, max_seg_length, n_attns]
        # attn_weights: [batch_size, max_seg_length, n_attns]
        vid_probs, attn_idc, scores, attn_weights, conv_loss = model(frame_rgbs, frame_audios, device)

        # vid_probs: [batch_size, vid_pred_length]
        # vid_label_preds: [batch_size, vid_pred_length]
        vid_probs, vid_label_preds = torch.topk(vid_probs, args.vid_pred_length)
        vid_label_preds = vid_label_preds + 1

        # attn_idc: [batch_size, num_classes+1]
        zeros = torch.zeros(batch_size, 1).long().to(device)
        attn_idc = torch.cat((zeros, attn_idc), dim=1)

        # selected_attn_idc: [batch_size, vid_pred_length]
        selected_attn_idc = torch.gather(attn_idc, 1, vid_label_preds)

        # attn_weights: [batch_size, n_attns, max_seg_length]
        attn_weights = attn_weights.transpose(1, 2)

        # selected_attn_weights: [batch_size, vid_pred_length, max_seg_length]
        selected_attn_weights = batched_index_select(attn_weights, 1, selected_attn_idc)

        # seg_probs: [batch_size, vid_pred_length, seg_pred_length] 
        # seg_label_preds: [batch_size, vid_pred_length, seg_pred_length] 
        seg_probs, seg_label_preds = torch.topk(selected_attn_weights, args.seg_pred_length)
        seg_label_preds = seg_label_preds + 1

        # seg_prob_min, seg_prob_max: [batch_size, vid_pred_length]
        seg_prob_min, _ = seg_probs.min(dim=2)
        seg_prob_max, _ = seg_probs.max(dim=2)

        # seg_prob_min, seg_prob_max: [batch_size, vid_pred_length, seg_pred_length]
        seg_prob_min = seg_prob_min.unsqueeze(2).expand(batch_size, args.vid_pred_length, args.seg_pred_length)
        seg_prob_max = seg_prob_max.unsqueeze(2).expand(batch_size, args.vid_pred_length, args.seg_pred_length)

        # seg_probs: [batch_size, vid_pred_length, seg_pred_length]
        seg_probs = (seg_probs - seg_prob_min) / (seg_prob_max - seg_prob_min + 1e-6)

        # To save predictions, converted to numpy data.
        vid_probs = vid_probs.cpu().detach().numpy()
        vid_label_preds = vid_label_preds.cpu().numpy()
        seg_probs = seg_probs.cpu().detach().numpy()
        seg_label_preds = seg_label_preds.cpu().numpy()

        for i in range(batch_size):
            for j in range(args.vid_pred_length):
                vid_label_pred = vid_label_preds[i][j]
                df_outputs[vid_label_pred] = df_outputs[vid_label_pred].append(
                    {'vid_id': vid_ids[i],
                     'vid_label_pred': vid_label_pred,
                     'vid_prob': vid_probs[i][j],
                     'seg_label_pred': list(seg_label_preds[i][j]),
                     'seg_prob': list(seg_probs[i][j])}, ignore_index=True)

    for i in range(1, args.num_classes+1):
        df_outputs[i].to_csv(os.path.join(output_dir, '%04d.csv' % i), index=False)

    time_elapsed = time.time() - since
    print('=> Running time for an epoch: {:.0f}h {:.0f}m {:.0f}s'
          .format(time_elapsed // 3600, (time_elapsed % 3600) // 60, time_elapsed % 60))
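A note on the row accumulation above: DataFrame.append was removed in pandas 2.0, so on current pandas the same pattern is written with pd.concat. A minimal sketch with illustrative values:

import pandas as pd

columns = ['vid_id', 'vid_label_pred', 'vid_prob', 'seg_label_pred', 'seg_prob']
df = pd.DataFrame(columns=columns)
row = {'vid_id': 'vid_0001', 'vid_label_pred': 3, 'vid_prob': 0.92,
       'seg_label_pred': [1, 4], 'seg_prob': [0.8, 0.2]}
df = pd.concat([df, pd.DataFrame([row])], ignore_index=True)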
Beispiel #24
0
def generate_summary(model, tokenizer, document, decoder):
    """ Generates a summary for a single document

    Parameters
    ----------
    model: ``BartForConditionalGeneration`` A BART model that has been
        fine-tuned for summarization
    tokenizer: ``BartTokenizer`` A corresponding BART tokenizer
    document: ``str`` A single document to be summarized
    decoder: ``str`` The decoder to use for decoding

    Returns
    -------
    summary: ``str`` A generated summary of the input document
    summary_score: ``float`` The log-probability score of the summary
    """
    input_ids = tokenizer(document, truncation=True,
                          return_tensors='pt')['input_ids']
    metadata = {'input_ids': input_ids}
    model_wrapper = TransformerModel(model)

    if decoder == 'greedy':
        top_candidate = decoders.greedy_decoding(
            model=model_wrapper,
            max_length=50,
            eos_id=tokenizer.eos_token_id,
            decoded_ids=[tokenizer.bos_token_id],
            metadata=metadata)
    elif decoder == 'beam_search':
        top_candidate = decoders.beam_search_decoding(
            model=model_wrapper,
            beam_size=3,
            max_length=50,
            eos_id=tokenizer.eos_token_id,
            decoded_ids=[tokenizer.bos_token_id],
            metadata=metadata)[0]
    elif decoder == 'random':
        # Random sampling
        top_candidate = decoders.top_k_sampling(
            model=model_wrapper,
            top_k=int(1e9),  # random sampling is top-K with large K
            temperature=1,
            max_length=50,
            eos_id=tokenizer.eos_token_id,
            decoded_ids=[tokenizer.bos_token_id],
            metadata=metadata)
    elif decoder == 'top_k':
        top_candidate = decoders.top_k_sampling(
            model=model_wrapper,
            top_k=3,
            temperature=0.5,
            max_length=50,
            eos_id=tokenizer.eos_token_id,
            decoded_ids=[tokenizer.bos_token_id],
            metadata=metadata)
    elif decoder == 'nucleus':
        top_candidate = decoders.nucleus_sampling(
            model=model_wrapper,
            top_p=0.2,
            max_length=50,
            eos_id=tokenizer.eos_token_id,
            decoded_ids=[tokenizer.bos_token_id],
            metadata=metadata)
    else:
        raise ValueError(f'Unknown decoder: {decoder}')

    summary_ids = top_candidate.decoded_ids
    summary = tokenizer.decode(summary_ids, skip_special_tokens=True)
    summary_score = top_candidate.score
    return summary, summary_score
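A possible invocation of generate_summary, assuming a public BART summarization checkpoint from Hugging Face (the input document here is illustrative):

from transformers import BartForConditionalGeneration, BartTokenizer

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')

document = "Scientists reported on Monday that ..."
summary, score = generate_summary(model, tokenizer, document,
                                  decoder='beam_search')
print(summary, score)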
Beispiel #25
0
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))
    model_type = args.model

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)
    # char_vectors = load_char_vectors
    # Get model
    log.info('Building model...')
    if model_type == 'BiDAFplus':
        model = BiDAFplus(word_vectors=word_vectors,
                          char_vectors=char_vectors,
                          hidden_size=args.hidden_size,
                          params=get_params(model_type, args.params))

    elif model_type == 'BiDAFbase':
        model = BiDAFbase(word_vectors=word_vectors,
                          hidden_size=args.hidden_size,
                          drop_prob=args.drop_prob)

    elif model_type == "Transformer":
        model = TransformerModel(word_vectors=word_vectors,
                                 char_vectors=char_vectors,
                                 input_size=len(word_vectors),
                                 hidden_size=args.hidden_size)

    elif model_type == 'BiDAF':
        model = BiDAF(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      hidden_size=args.hidden_size,
                      params=get_params(model_type, args.params))

    model = nn.DataParallel(model, gpu_ids)
    log.info(f'Loading checkpoint from {args.load_path}...')
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)[f'{args.split}_record_file']
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info(f'Evaluating on {args.split} split...')
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f'{args.split}_eval_file']
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)
            # Forward
            if model_type == 'BiDAF' or model_type == 'BiDAFplus':
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)

                log_p1, log_p2 = model(cc_idxs, qc_idxs, cw_idxs, qw_idxs)
            else:
                log_p1, log_p2 = model(cw_idxs, qw_idxs)

            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
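            # util.discretize presumably selects the (start, end) pair that
            # maximizes p1[start] * p2[end] subject to
            # start <= end < start + max_ans_len (assumption; helper not shown).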
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
        log.info(f'{args.split.title()} {results_str}')

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info(f'Writing submission file to {sub_path}...')
    with open(sub_path, 'w', newline='', encoding='utf-8') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])