Example #1
def main(_):
    """ Typical usage

    For <model_name>, use the name of your model's folder in ../checkpoints.

    Training
    ``` sh
    $ python main.py --mode train --model <model> (if restoring or naming a model: --model_name <model_name>)
    ```
    
    Evaluation
    ``` sh
    $ python main.py --mode eval --model <model> --model_name <model_name>
    ```

    Shell
    ``` sh
    $ python main.py --mode shell --model <model> --model_name <model_name>
    ```
    """
    # Load data
    train = SquadDataset(*get_data_paths(FLAGS.data_dir, name='train'),
                         max_question_length=FLAGS.max_question_length,
                         max_paragraph_length=FLAGS.max_paragraph_length)
    dev = SquadDataset(*get_data_paths(FLAGS.data_dir, name='val'),
                       max_question_length=FLAGS.max_question_length,
                       max_paragraph_length=FLAGS.max_paragraph_length
                       )  # set the eval size to zero if this takes too long

    logging.info(f'Train/Dev size {train.length}/{dev.length}')

    # Load embeddings
    embed_path = FLAGS.embed_path or pjoin(
        FLAGS.data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # vocab size: 115373

    # Build model
    if FLAGS.model in ('baseline', 'mixed', 'dcnplus', 'dcn'):
        model = DCN(embeddings, FLAGS.__flags)
    elif FLAGS.model == 'cat':
        from networks.cat import Graph
        model = Graph(embeddings)
    else:
        raise ValueError(f'{FLAGS.model} is not a supported model')

    # Run mode
    if FLAGS.mode == 'train':
        save_flags()
        do_train(model, train, dev)
    elif FLAGS.mode == 'eval':
        do_eval(model, train, dev)
    elif FLAGS.mode == 'overfit':
        test_overfit(model, train)
    elif FLAGS.mode == 'shell':
        do_shell(model, dev)
    else:
        raise ValueError(f'Unsupported mode: {FLAGS.mode}')
Example #2
def train():
    """ Training function for Squad QA BERT model
    Implement the Squad QA trainer which trains the model you have made.

    Note: There are useful tools for your implementation below.

    Memory tip 1: If you explicitly delete the output tensors after every loss calculation, e.g. "del out, loss",
                  they are garbage-collected before the next loss calculation, which cuts memory usage.

    Memory tip 2: If you want to keep batch_size while reducing memory usage,
                  creating a virtual batch is a good solution.
    Explanation: https://medium.com/@davidlmorton/increasing-mini-batch-size-without-increasing-memory-6794e10db672

    Useful readings: https://blog.paperspace.com/pytorch-memory-multi-gpu-debugging/ 
    """
    # The options below are just recommendations; choose your own if you prefer.
    epochs = 3
    learning_rate = 5e-5
    batch_size = 8
    bert_type = 'bert-base-uncased' 

    # Change the lazy option if you want fast debugging.
    dataset = SquadFeatureDataset(SquadDataset(), bert_type=bert_type, lazy=False) 

    model = BertForSquad.from_pretrained(bert_type)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    ### YOUR CODE HERE

    ### END YOUR CODE

    # Save the model in the checkpoint folder
    model.save_pretrained('./checkpoint')
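
The "virtual batch" in memory tip 2 is gradient accumulation: run several small forward/backward passes and step the optimizer once. A minimal sketch of the idea, reusing `dataset`, `model`, and `optimizer` from the example above; the `device`, the five-tensor collate output, and the `(start_logits, end_logits)` model interface follow the later examples, and the micro-batch size and `accumulation_steps` are illustrative:

``` python
# Hypothetical sketch: a "virtual batch" of 8 built from micro-batches of 2.
micro_batch_size = 2
accumulation_steps = 4  # effective batch size = 2 * 4 = 8

loader = torch.utils.data.DataLoader(
    dataset, batch_size=micro_batch_size, collate_fn=squad_feature_collate_fn)

optimizer.zero_grad()
for step, (input_ids, attention_mask, token_type_ids, start_pos, end_pos) in enumerate(loader):
    start_logits, end_logits = model(input_ids.to(device),
                                     attention_mask=attention_mask.to(device),
                                     token_type_ids=token_type_ids.to(device))
    loss = (torch.nn.functional.cross_entropy(start_logits, start_pos.to(device))
            + torch.nn.functional.cross_entropy(end_logits, end_pos.to(device)))
    (loss / accumulation_steps).backward()  # scale so gradients average over the virtual batch
    if (step + 1) % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()
    del start_logits, end_logits, loss  # memory tip 1: free tensors early
```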
Example #3
def quantative_analysis(tokenizer, model):
    print("======Quantitative Analysis======")
    dataset = SquadDataset('data/dev-v1.1-TA.json')
    dataset = SquadFeatureDataset(dataset,
                                  bert_type=bert_type,
                                  lazy=True,
                                  return_sample=True,
                                  eval=True)

    answers = dict()

    for index in trange(len(dataset), desc="Answering"):
        (input_ids, token_type_ids, _, _), sample = dataset[index]
        answers[sample['id']] = \
                inference_model(model, tokenizer, sample['context'], sample['question'], input_ids, token_type_ids)

    with open('dev-v1.1-TA-answers.json', mode='w') as f:
        json.dump(answers, f)

    with open('data/dev-v1.1-TA.json', mode='r') as f:
        dataset = json.load(f)['data']

    results = evaluate(dataset, answers)
    print(
        f"Exact Match: {results['exact_match']}. This should be above 60.0. TA score: 75.2"
    )
    print(
        f"F1 score: {results['f1']}. This should be above 70.0. TA score: 83.9"
    )
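
The `evaluate` call here is presumably the official SQuAD v1.1 script. For reference, a self-contained sketch of the two metrics it reports; the official script additionally takes the max over all gold answers per question:

``` python
import re
import string
from collections import Counter

def normalize_answer(s):
    """SQuAD convention: lowercase, strip punctuation, drop articles, collapse whitespace."""
    s = s.lower()
    s = ''.join(ch for ch in s if ch not in set(string.punctuation))
    s = re.sub(r'\b(a|an|the)\b', ' ', s)
    return ' '.join(s.split())

def exact_match(prediction, ground_truth):
    return float(normalize_answer(prediction) == normalize_answer(ground_truth))

def f1_score(prediction, ground_truth):
    pred_tokens = normalize_answer(prediction).split()
    gt_tokens = normalize_answer(ground_truth).split()
    common = Counter(pred_tokens) & Counter(gt_tokens)  # per-token overlap counts
    num_same = sum(common.values())
    if num_same == 0:
        return 0.0
    precision = num_same / len(pred_tokens)
    recall = num_same / len(gt_tokens)
    return 2 * precision * recall / (precision + recall)
```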
Example #4
def test_overfit():
    """
    Tests that model can overfit on small datasets.
    """
    data_hparams = {'max_paragraph_length': 300, 'max_question_length': 25}
    train = SquadDataset(*get_data_paths(FLAGS.data_dir, name='train'),
                         **data_hparams)
    dev = SquadDataset(*get_data_paths(FLAGS.data_dir, name='val'),
                       **data_hparams)  # dev examples are likely not truncated at these limits

    embed_path = FLAGS.embed_path or pjoin(
        FLAGS.data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # vocab size: 115373

    test_hparams = {
        'learning_rate': 0.01,
        'keep_prob': 1.0,
        'trainable_embeddings': False,
        'clip_gradients': True,
        'max_gradient_norm': 5.0
    }
    model = Baseline(embeddings, test_hparams)

    epochs = 100
    test_size = 32
    steps_per_epoch = 10
    (train.question, train.paragraph, train.question_length,
     train.paragraph_length, train.answer) = train[:test_size]
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(epochs):
            epoch_start = timer()
            for step in range(steps_per_epoch):
                loss, _ = model.training_step(sess, *train[:test_size])
                if (step == 0 and epoch == 0):
                    # An untrained model is roughly uniform over start and end positions,
                    # so the initial cross entropy should be about 2 * ln(max_paragraph_length).
                    print(
                        f'Entropy - Result: {loss:.2f}, Expected (approx.): {2*np.log(FLAGS.max_paragraph_length):.2f}'
                    )
                if step == steps_per_epoch - 1:
                    print(f'Cross entropy: {loss}')
                    train.length = test_size  # evaluate only on the overfitted subset
                    print(evaluate(sess, model, train, size=test_size))
            global_step = tf.train.get_global_step().eval()
            print(
                f'Epoch took {timer() - epoch_start:.2f} s (step: {global_step})'
            )
Example #5
def main(_):
    # Load data
    train = SquadDataset(*get_data_paths(FLAGS.data_dir, name='train'),
                         max_question_length=FLAGS.max_question_length,
                         max_paragraph_length=FLAGS.max_paragraph_length)
    dev = SquadDataset(
        *get_data_paths(FLAGS.data_dir, name='val'),
        max_question_length=FLAGS.max_question_length,
        max_paragraph_length=FLAGS.max_paragraph_length)  # dev examples are likely not truncated at these limits
    # TODO convert to TF Dataset API
    # train = tf.convert_to_tensor(train)
    # dev = tf.convert_to_tensor(dev)
    # tf.contrib.data.Dataset()

    # logging.info(f'Train/Dev size {train.length}/{dev.length}')

    # Load embeddings
    embed_path = FLAGS.embed_path or pjoin(
        FLAGS.data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # vocab size: 115373

    is_training = (FLAGS.mode == 'train' or FLAGS.mode == 'overfit')

    # Build model
    if FLAGS.model == 'dcnplus':
        model = DCNPlus(embeddings, FLAGS.__flags, is_training=is_training)
    elif FLAGS.model == 'baseline':
        model = Baseline(embeddings, FLAGS.__flags)
    elif FLAGS.model == 'cat':
        model = Graph(embeddings, is_training=is_training)
    else:
        raise ValueError(f'{FLAGS.model} is not a supported model')

    # Run mode
    if FLAGS.mode == 'train':
        save_flags()
        do_train(model, train)
    elif FLAGS.mode == 'eval':
        do_eval(model, train, dev, evaluate)
    elif FLAGS.mode == 'overfit':
        test_overfit(model, train, evaluate)
    elif FLAGS.mode == 'shell':
        do_shell(model, dev)
    else:
        raise ValueError(f'Unsupported mode: {FLAGS.mode}')
Example #6
def main(_):
    # Load data
    train = SquadDataset(*get_data_paths(FLAGS.data_dir, name='train'),
                         max_question_length=FLAGS.max_question_length,
                         max_paragraph_length=FLAGS.max_paragraph_length)
    dev = SquadDataset(
        *get_data_paths(FLAGS.data_dir, name='val'),
        max_question_length=FLAGS.max_question_length,
        max_paragraph_length=FLAGS.max_paragraph_length)  # dev examples are likely not truncated at these limits
    # TODO convert to TF Dataset API
    # train = tf.convert_to_tensor(train)
    # dev = tf.convert_to_tensor(dev)
    # tf.contrib.data.Dataset()

    logging.info(f'Train/Dev size {train.length}/{dev.length}')

    # Load embeddings
    embed_path = FLAGS.embed_path or pjoin(
        FLAGS.data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # vocab size: 115373
    # vocab_path = FLAGS.vocab_path or pjoin(FLAGS.data_dir, "vocab.dat")
    # vocab, rev_vocab = initialize_vocab(vocab_path) # dict, list

    is_training = FLAGS.mode == 'train'

    # Build model
    if FLAGS.model == 'dcnplus':
        model = DCNPlus(embeddings, FLAGS.__flags, is_training=is_training)
    elif FLAGS.model == 'baseline':
        model = Baseline(embeddings, FLAGS.__flags)
    elif FLAGS.model == 'cat':
        model = Graph(embeddings, is_training=is_training)
    else:
        raise ValueError(f'{FLAGS.model} is not a supported model')

    # Run mode
    if FLAGS.mode == 'train':
        with open(os.path.join(FLAGS.train_dir, "flags.json"), 'w') as f:
            json.dump(FLAGS.__flags, f, indent=4)
        do_train(model, train, dev, evaluate)
    elif FLAGS.mode == 'eval':
        do_eval(model, train, dev, evaluate)
    else:
        raise ValueError(f'Unsupported mode: {FLAGS.mode}')
Example #7
def main(_):
    """ Typical usage

    For <model_name>, use the name of your model's folder in ../checkpoints.

    Training
    ``` sh
    $ python main.py --mode train --model <model> (if restoring or naming a model: --model_name <model_name>)
    ```
    
    Evaluation
    ``` sh
    $ python main.py --mode eval --model <model> --model_name <model_name>
    ```

    Shell
    ``` sh
    $ python main.py --mode shell --model <model> --model_name <model_name>
    ```
    """
    # Load data
    train = SquadDataset(*get_data_paths(FLAGS.data_dir, name='train'),
                         max_question_length=FLAGS.max_question_length,
                         max_paragraph_length=FLAGS.max_paragraph_length)
    dev = SquadDataset(*get_data_paths(FLAGS.data_dir, name='val'),
                       max_question_length=FLAGS.max_question_length,
                       max_paragraph_length=FLAGS.max_paragraph_length
                       )  # set the eval size to zero if this takes too long

    logging.info(f'Train/Dev size {train.length}/{dev.length}')

    # Load embeddings
    embed_path = FLAGS.embed_path or pjoin(
        FLAGS.data_dir, "glove.trimmed.{}.npz".format(FLAGS.embedding_size))
    embeddings = np.load(embed_path)['glove']  # vocab size: 115373

    siamese_config = None
    siamese_graph = None  # stays None unless --use_siamese is set
    if FLAGS.use_siamese:
        # get config file for siamese model
        siamese_config = '../../paraphrase-id-tensorflow-master/logs/baseline_siamese/{}/trainparams.json'.format(
            FLAGS.siamese_model_num)
        with open(siamese_config, 'r') as f:
            siamese_config = json.load(f)
            siamese_config['mode'] = 'test'

        checkpoint_dir = '../../paraphrase-id-tensorflow-master/models/baseline_siamese/{}/'.format(
            FLAGS.siamese_model_num)
        # siamese_graph = ImportGraph(checkpoint_dir, embeddings)
        siamese_graph = ImportModel(checkpoint_dir, siamese_config, embeddings)

    # Build model
    if FLAGS.model in ('baseline', 'mixed', 'dcnplus', 'dcn'):
        # with tf.variable_scope('dcn'):
        model = DCN(embeddings,
                    FLAGS.__flags,
                    # guard for runs without --use_siamese (assumes DCN accepts None here)
                    siamese_output_dim=(siamese_config['rnn_hidden_size']
                                        if siamese_config else None))
    elif FLAGS.model == 'cat':
        from networks.cat import Graph
        model = Graph(embeddings)
    else:
        raise ValueError(f'{FLAGS.model} is not a supported model')

    # Run mode
    if FLAGS.mode == 'train':
        save_flags()
        do_train(model, train, dev, input_model=siamese_graph)
    elif FLAGS.mode == 'eval':
        do_eval(model, train, dev, input_model=siamese_graph)
    elif FLAGS.mode == 'overfit':
        test_overfit(model, train, input_model=siamese_graph)
    elif FLAGS.mode == 'shell':
        do_shell(model, dev, input_model=siamese_graph)
    else:
        raise ValueError(f'Unsupported mode: {FLAGS.mode}')
Example #8
def train():
    """ Training function for Squad QA BERT model
    Implement the Squad QA trainer which trains the model you have made.

    Note: There are useful tools for your implementation below.

    Memory tip 1: If you explicitly delete the output tensors after every loss calculation, e.g. "del out, loss",
                  they are garbage-collected before the next loss calculation, which cuts memory usage.

    Memory tip 2: If you want to keep batch_size while reducing memory usage,
                  creating a virtual batch is a good solution.
    Explanation: https://medium.com/@davidlmorton/increasing-mini-batch-size-without-increasing-memory-6794e10db672

    Useful readings: https://blog.paperspace.com/pytorch-memory-multi-gpu-debugging/ 
    """
    # The options below are just recommendations; choose your own if you prefer.
    epochs = 3
    learning_rate = 5e-5
    batch_size = 8
    bert_type = 'bert-base-uncased'

    # Change the lazy option if you want fast debugging.
    dataset = SquadFeatureDataset(SquadDataset(),
                                  bert_type=bert_type,
                                  lazy=True)

    model = BertForSquad.from_pretrained(bert_type).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    train_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, collate_fn=squad_feature_collate_fn)
    train_iterator = tqdm(train_loader, leave=False)
    start_loss_obj = torch.nn.CrossEntropyLoss()
    end_loss_obj = torch.nn.CrossEntropyLoss()
    for epoch in range(epochs):
        print('epoch:{}'.format(epoch))
        for data in train_iterator:
            optimizer.zero_grad()
            input_ids, attention_mask, token_type_ids, start_token_pos, end_token_pos = data
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            start_token_pos = start_token_pos.to(device)
            end_token_pos = end_token_pos.to(device)
            # start_predict, end_predict = model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
            #
            # start_loss = start_loss_obj(start_predict, start_token_pos)
            # end_loss = end_loss_obj(end_predict, end_token_pos)
            # loss = start_loss + end_loss
            # optimizer.zero_grad()
            # loss.backward()
            # optimizer.step()
            # print(loss.item())

            start_logits, end_logits = model(input_ids,
                                             attention_mask=attention_mask,
                                             token_type_ids=token_type_ids)
            # log_softmax (rather than a bare .log() on raw logits) yields valid log-probabilities for NLLLoss
            start_logits = torch.log_softmax(start_logits, dim=1)
            end_logits = torch.log_softmax(end_logits, dim=1)
            ignored_index = start_logits.size(1)
            start_token_pos.clamp_(0, ignored_index)
            end_token_pos.clamp_(0, ignored_index)
            loss_fct = torch.nn.NLLLoss(ignore_index=ignored_index)
            start_loss = loss_fct(start_logits, start_token_pos)
            end_loss = loss_fct(end_logits, end_token_pos)
            loss = (start_loss + end_loss) / 2
            loss.backward()
            optimizer.step()
            print(loss.item())

    # Save the model in the checkpoint folder
    model.save_pretrained('./checkpoint')
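
The clamp/ignore_index pattern above mirrors HuggingFace's span loss; the more common formulation applies `CrossEntropyLoss` directly to the raw logits, which fuses the log-softmax and NLL steps. A self-contained sketch of that equivalent version:

``` python
import torch

def span_loss(start_logits, end_logits, start_positions, end_positions):
    """Equivalent HF-style span loss on raw logits: CrossEntropyLoss = log_softmax + NLLLoss."""
    ignored_index = start_logits.size(1)
    start_positions = start_positions.clamp(0, ignored_index)  # out-of-span targets map to ignore_index
    end_positions = end_positions.clamp(0, ignored_index)
    loss_fct = torch.nn.CrossEntropyLoss(ignore_index=ignored_index)
    return (loss_fct(start_logits, start_positions)
            + loss_fct(end_logits, end_positions)) / 2
```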
Example #9
# Hypothetical reconstruction of the call truncated by the listing:
val_encodings = tokenizer(val_contexts,  # val_contexts is an assumed name for the validation passages
                          val_questions,
                          truncation=True,
                          padding=True)
'''
Last step: preparing model inputs
'''
add_token_positions(train_encodings, train_answers)
add_token_positions(val_encodings, val_answers)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
if device == 'cuda':
    torch.cuda.set_device(DEVICE_ID)  # use an unoccupied GPU
'''
Torch dataset object
'''
train_dataset = SquadDataset(train_encodings, device)
val_dataset = SquadDataset(val_encodings, device)

model.to(device)
model.train()

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)

optim = AdamW(model.parameters(), lr=5e-5)

for epoch in range(NUM_EPOCH):
    for batch in tqdm(train_loader):
        optim.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        start_positions = batch['start_positions'].to(device)
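        # Hypothetical continuation -- the listing truncates the snippet here.
        # Assuming a HuggingFace *ForQuestionAnswering model that returns the loss
        # when start/end positions are supplied:
        end_positions = batch['end_positions'].to(device)
        outputs = model(input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs[0]  # or outputs.loss on newer transformers versions
        loss.backward()
        optim.step()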
Example #10
  jieba_sgns_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
                       os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))
  jieba_flag_v = Vocab(os.path.join(data_root_folder, 'vocab', 'useless.vocab.pkl'),
                       os.path.join(data_root_folder, 'vocab', 'useless.emb.pkl'))

  trainset_roots = [
    os.path.join(data_root_folder, 'val.txt')
  ]

  embed_lists = {
    'jieba': [jieba_base_v.embeddings, jieba_sgns_v.embeddings, jieba_flag_v.embeddings],
    'pyltp': []
  }

  transform = SquadTransform(jieba_base_v, jieba_sgns_v, jieba_flag_v)
  train_dataset = SquadDataset(train_file, transform, use_rouge=True, max_size=None)
  dev_dataset = SquadDataset(val_file, transform, use_rouge=True, max_size=None)

  num_workers = 0

  train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    num_workers=num_workers,
    collate_fn=transform.batchify,
  )

  dev_loader = DataLoader(
    dataset=dev_dataset,
    batch_size=BATCH_SIZE,
    num_workers=num_workers,
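    collate_fn=transform.batchify,  # hypothetical completion: the listing cuts the call off; it presumably mirrors train_loader
  )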
Example #11
def train():
    """ Training function for Squad QA BERT model
    Implement the Squad QA trainer which trains the model you have made.

    Note: There are useful tools for your implementation below.

    Memory tip 1: If you explicitly delete the output tensors after every loss calculation, e.g. "del out, loss",
                  they are garbage-collected before the next loss calculation, which cuts memory usage.

    Memory tip 2: If you want to keep batch_size while reducing memory usage,
                  creating a virtual batch is a good solution.
    Explanation: https://medium.com/@davidlmorton/increasing-mini-batch-size-without-increasing-memory-6794e10db672

    Useful readings: https://blog.paperspace.com/pytorch-memory-multi-gpu-debugging/ 
    """
    # The options below are just recommendations; choose your own if you prefer.
    epochs = 3
    learning_rate = 5e-5
    batch_size = 6
    bert_type = 'bert-base-uncased'
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # Change the lazy option if you want fast debugging.
    dataset = SquadFeatureDataset(SquadDataset(),
                                  bert_type=bert_type,
                                  lazy=False)

    model = BertForSquad.from_pretrained(bert_type)
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    ### YOUR CODE HERE
    batch_sampler = SquadBucketSampler(dataset, batch_size, shuffle=True)
    data_loader = DataLoader(dataset,
                             batch_sampler=batch_sampler,
                             collate_fn=squad_feature_collate_fn)
    loss_fct = torch.nn.CrossEntropyLoss()

    model.train()
    losses = []
    for epoch in range(epochs):
        batch_loss = 0.0
        for input_ids, attention_mask, token_type_ids, start_pos, end_pos in tqdm(
                data_loader):
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            token_type_ids = token_type_ids.to(device)
            start_pos = start_pos.to(device)
            end_pos = end_pos.to(device)
            optimizer.zero_grad()
            start_logits, end_logits = model(input_ids, attention_mask,
                                             token_type_ids)
            start_loss = loss_fct(start_logits, start_pos)
            end_loss = loss_fct(end_logits, end_pos)
            loss = start_loss + end_loss
            loss.backward()
            optimizer.step()
            batch_loss += loss.item()
            del start_logits, end_logits, loss  # free tensors early (memory tip 1) to avoid MemoryError
        losses.append(batch_loss)

    ### END YOUR CODE

    # Save the model in the checkpoint folder
    model.save_pretrained('./checkpoint')
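
`SquadBucketSampler` is project-specific; a generic sketch of the length-bucketing idea it presumably implements (group examples of similar length so each batch needs less padding). `lengths` is an assumed per-example token-count list:

``` python
import random
from torch.utils.data import Sampler

class BucketSampler(Sampler):
    """Yields batches of indices with similar sequence lengths to minimize padding."""

    def __init__(self, lengths, batch_size, shuffle=True):
        self.lengths = lengths
        self.batch_size = batch_size
        self.shuffle = shuffle

    def __iter__(self):
        # Sort indices by length, slice into batches, then shuffle the batch order
        # so within-batch length grouping is kept while epochs still vary.
        order = sorted(range(len(self.lengths)), key=lambda i: self.lengths[i])
        batches = [order[i:i + self.batch_size]
                   for i in range(0, len(order), self.batch_size)]
        if self.shuffle:
            random.shuffle(batches)
        return iter(batches)

    def __len__(self):
        return (len(self.lengths) + self.batch_size - 1) // self.batch_size

# Usage: DataLoader(dataset, batch_sampler=BucketSampler(lengths, batch_size))
```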
Example #12
    print(f"Using {device}.")

    print(f"Reading {sys.argv[1]}...")
    df = extract_data(sys.argv[1], contain_answers=False).set_index(['id'])
    print(f"DataFrame created.")

    print("Tokenizing the DataFrame...")
    model = DistilBertKnowledge(alpha=0.5)
    DistilBertTokenizer.from_pretrained(
        model.info.pretrained_model).save_pretrained('slow_tokenizer/')
    tokenizer = BertWordPieceTokenizer('slow_tokenizer/vocab.txt',
                                       lowercase=True)
    df = process_dataframe(df, tokenizer, contain_answers=False)
    print("Tokenization complete.")

    dataset = SquadDataset(df, model.info, contain_answers=False)
    loader = DataLoader(dataset, batch_size=16, num_workers=4, pin_memory=True)

    print("Loading model weights...")
    model.load_state_dict(torch.load('model.pt'))
    model = model.to(device)
    print("Model loaded.")

    model.eval()
    print("Starting evaluation...")
    starts, ends = [], []
    num_batches = len(loader)
    for idx, batch in enumerate(loader):
        if (idx + 1) % 100 == 0:
            print(f'Batch {idx + 1:{len(str(num_batches))}}/{num_batches}')
        with torch.no_grad():
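            # Hypothetical continuation -- the listing truncates the snippet here.
            # Assuming DistilBertKnowledge returns start/end logits for each batch
            # and that the dataset exposes the usual encoding keys:
            start_logits, end_logits = model(batch['input_ids'].to(device),
                                             attention_mask=batch['attention_mask'].to(device))
            starts.extend(start_logits.argmax(dim=1).tolist())
            ends.extend(end_logits.argmax(dim=1).tolist())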
Example #13
def train_model(preprocessor, base_model, frac_train_data, frac_val_data, batch_size=8, n_epoch=10, log_every=1,
                eval_every=10,
                save_every=300, checkpoint_fn=None, force_cpu=False, save_model_prefix=""
                ) -> None:
    """
    Fine-tunes transformer model with custom head on custom data.

    Parameters
    ----------
    preprocessor (SquadPreprocessor,  SquadPlausibleAnswersPreprocessor) - pre-processor class.
    base_model (nn.Module)- model class, sub-class of nn.Module.
    frac_train_data (float) - fraction of training data to sample randomly. Useful with limited memory.
    frac_val_data (float) - fraction of validation data to sample randomly.
    batch_size (int) - batch size for training.
    n_epoch (int) - number of epochs for training.
    log_every (int) - steps frequency to print training loss.
    eval_every (int) - steps frequency to print eval loss.
    save_every (int) - steps frequency to save checkpoint.
    checkpoint_fn (None or str) - if str, uses as filename to load a checkpoint model, to continue training.
    force_cpu (bool) - forces CPU, even on systems with detectable CUDA. Useful for old CUDA
                architectures, which are no longer supported.
    save_model_prefix (str) - prefix for the saved model checkpoint.
    """

    sp = preprocessor()
    train_enc, val_enc = sp.get_encodings(random_sample_train=frac_train_data, random_sample_val=frac_val_data,
                                          return_tensors="pt")

    train_ds = SquadDataset(train_enc)
    val_ds = SquadDataset(val_enc)

    train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    eval_dl = DataLoader(val_ds, batch_size=64, shuffle=True)

    dbm = DistilBertModel.from_pretrained('distilbert-base-uncased', return_dict=True)

    # Freeze all parameters of the DistilBert
    # for name, param in dbm.named_parameters():
    #     if name.startswith('embeddings'):
    #         param.requires_grad = False
    if force_cpu:
        device = torch.device("cpu")
    else:
        device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')  # torch.device("cpu")

    epoch = 0
    train_iter = 0
    loss_eval = 1000  # sentinel until the first evaluation runs

    if checkpoint_fn is not None:
        checkpoint = torch.load(checkpoint_fn, map_location=device)
        epoch = checkpoint['epoch'] - 1.0
        train_iter = checkpoint['train_iter']
    else:
        checkpoint = None

    model = base_model(transformer_model=dbm, device=device)

    if checkpoint:
        model.load_state_dict(checkpoint['model_state_dict'])

    # optimizer = torch.optim.Adam(model.parameters(), lr = 0.0002)
    logging.info(f"Using device: {device}")

    model.to(device)
    model.train()

    optimizer = AdamW(model.parameters(), lr=5e-5)  # torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    if checkpoint:
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

    while epoch < n_epoch:
        epoch += 1

        for train_data in train_dl:
            train_iter += 1
            optimizer.zero_grad()
            model_out = model(train_data)
            loss = model.compute_loss(*model_out)
            loss.backward()
            optimizer.step()

            if train_iter % log_every == 0:
                print('Train: Epoch: %d, iter: %d, avg. loss: %.2f' % (epoch, train_iter, loss))

            if train_iter % eval_every == 0:
                with torch.no_grad():  # Disable gradient tracking for evaluation
                    model.eval()
                    eval_data = next(iter(eval_dl))
                    model_out = model(eval_data)
                    loss_eval = model.compute_loss(*model_out)
                    print('\nEval: Epoch: %d, iter: %d, avg. loss: %.2f\n' % (epoch, train_iter, loss_eval))
                    model.train()

            if train_iter % save_every == 0:
                model.save(f"model_checkpoint/{save_model_prefix}_model_{train_iter}.pt", train_iter=train_iter,
                           epoch=epoch,
                           optimizer=optimizer,
                           train_loss=loss, eval_loss=loss_eval)
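
A hypothetical invocation, with `QAHead` standing in for your `nn.Module` subclass; the fractions, checkpoint path, and prefix are purely illustrative:

``` python
# Fine-tune on 10% of train, validate on 10% of dev, resuming from a saved checkpoint.
train_model(SquadPreprocessor, QAHead,
            frac_train_data=0.1, frac_val_data=0.1,
            batch_size=8, n_epoch=2,
            checkpoint_fn="model_checkpoint/qa_model_300.pt",
            save_model_prefix="qa")
```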
Example #14
    def fit(self, config, device):
        logging.info(json.dumps(config, indent=4, sort_keys=True))

        if config["char_embeddings"]:
            fields = SquadDataset.prepare_fields_char()
        else:
            fields = SquadDataset.prepare_fields()

        train, val = SquadDataset.splits(fields)
        fields = dict(fields)

        fields["question"].build_vocab(train, val, vectors=GloVe(name='6B', dim=config["embedding_size"]))

        if not type(fields["question_char"]) == torchtext.data.field.RawField:
            fields["question_char"].build_vocab(train, val, max_size=config["char_maxsize_vocab"])

        # Shuffle training batches within length-sorted buckets
        train_iter = BucketIterator(train, sort_key=lambda x: -(len(x.question) + len(x.document)),
                                    shuffle=True, sort=False, sort_within_batch=True,
                                    batch_size=config["batch_size"], train=True,
                                    repeat=False,
                                    device=device)

        val_iter = BucketIterator(val, sort_key=lambda x: -(len(x.question) + len(x.document)), sort=True,
                                  batch_size=config["batch_size"],
                                  repeat=False,
                                  device=device)
        #
        # model = torch.load(
        #     "saved/65F1_checkpoint_<class 'trainer.ModelFramework'>_L_2.1954014434733815_2019-06-28_10:06_pcknot2.pt").to(
        #     device)
        if config["modelname"] == "baseline":
            model = Baseline(config, fields["question"].vocab).to(device)
        elif config["modelname"] == "bidaf_simplified":
            model = BidafSimplified(config, fields["question"].vocab).to(device)
        elif config["modelname"] == "bidaf":
            model = BidAF(config, fields['question'].vocab, fields["question_char"].vocab).to(device)
        # glorot_param_init(model)
        logging.info(f"Models has {count_parameters(model)} parameters")
        param_sizes, param_shapes = report_parameters(model)
        param_sizes = "\n'".join(str(param_sizes).split(", '"))
        param_shapes = "\n'".join(str(param_shapes).split(", '"))
        logging.debug(f"Model structure:\n{param_sizes}\n{param_shapes}\n")

        if config["optimizer"] == "adam":
            optimizer = Adam(filter(lambda p: p.requires_grad, model.parameters()),
                             lr=config["learning_rate"])
        else:
            raise NotImplementedError(f"Option {config['optimizer']} for \"optimizer\" setting is undefined.")

        start_time = time.time()
        try:
            best_val_loss = math.inf
            best_val_f1 = 0
            best_em = 0
            ema_active = False
            for it in range(config["max_iterations"]):
                logging.info(f"Iteration {it}")
                if "ema" in config and config["ema"]:
                    ema = EMA.ema_register(config, model)
                    ema_active = True

                self.train_epoch(model, CrossEntropyLoss(), optimizer, train_iter)

                if ema_active:
                    EMA.ema_update(ema, model)

                validation_loss, em, f1 = self.validate(model, CrossEntropyLoss(reduction='none'), val_iter,
                                                        ema=ema if "ema" in config and config[
                                                            "ema"] and ema_active else None)
                if validation_loss < best_val_loss: best_val_loss = validation_loss
                if f1 > best_val_f1: best_val_f1 = f1
                if em > best_em: best_em = em
                logging.info(f"BEST L/F1/EM = {best_val_loss:.2f}/{best_val_f1:.2f}/{best_em:.2f}")
                if em > 65:
                    # Do all of this on CPU; it is memory intensive!
                    model.to(torch.device("cpu"))

                    if ema_active:
                        # backup current params and load ema params
                        backup_params = EMA.ema_backup_and_loadavg(ema, model)

                        torch.save(model,
                                   f"saved/checkpoint"
                                   f"_{str(self.__class__)}"
                                   f"_EM_{em:.2f}_F1_{f1:.2f}_L_{validation_loss:.2f}_{get_timestamp()}"
                                   f"_{socket.gethostname()}.pt")

                        # load back backed up params
                        EMA.ema_restore_backed_params(backup_params, model)

                    else:
                        torch.save(model,
                                   f"saved/checkpoint"
                                   f"_{str(self.__class__)}"
                                   f"_EM_{em:.2}_F1_{f1:.2}_L_{validation_loss:.2}_{get_timestamp()}"
                                   f"_{socket.gethostname()}.pt")

                    model.to(device)
                logging.info(f"Validation loss: {validation_loss}")

        except KeyboardInterrupt:
            logging.info('-' * 120)
            logging.info('Exit from training early.')
        finally:
            logging.info(f'Finished after {(time.time() - start_time) / 60} minutes.')
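
The `EMA` helpers used above (`ema_register`, `ema_update`, `ema_backup_and_loadavg`) are project-specific; a generic sketch of the parameter exponential moving average they presumably maintain (the decay value is illustrative):

``` python
import torch

def ema_register(model, decay=0.999):
    """Create a shadow copy of every trainable parameter."""
    return {name: p.detach().clone()
            for name, p in model.named_parameters() if p.requires_grad}

@torch.no_grad()
def ema_update(shadow, model, decay=0.999):
    """After each optimizer step: shadow <- decay * shadow + (1 - decay) * param."""
    for name, p in model.named_parameters():
        if p.requires_grad:
            shadow[name].mul_(decay).add_(p.detach(), alpha=1 - decay)
```

Evaluating with the shadow weights (as `ema_backup_and_loadavg` does before saving) typically smooths out the last few noisy updates and improves validation EM/F1 slightly.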