def experiment(self, seeds, epoch_num, learning_rate):
        for seed in seeds:
            torch.manual_seed(seed)
            random.seed(seed)

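            # Pool the novel tokens and training data from every experiment so one shared model is trained on all of them.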
            novel_tokens = []
            for experiment in self.experiments:
                novel_tokens += experiment.novel_tokens

            train_data = []
            for experiment in self.experiments:
                train_data += experiment.train_data
            model = BERT(novel_tokens, learning_rate)

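            # Full-batch training: one forward/backward pass and a single optimizer step per epoch over the pooled data.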
            print('Training')
            for epoch in range(epoch_num):
                model.model.train()
                model.optimizer.zero_grad()
                loss = model.get_loss(train_data)
                loss.backward()
                model.optimizer.step()
                print('loss:', loss.item())

            for experiment in self.experiments:
                experiment.run(model)

        with open(self.save_name + '.pkl', 'wb') as f:
            pickle.dump(self.experiments, f)
Example #2
        vocab_size, train_loader, fine_tuning_loader, validation_loader = preprocess_GPT(gpt_hyperparams)
    else:
        print("loading data for BERT model")
        vocab_size, train_loader, fine_tuning_loader, validation_loader = preprocess_BERT(bert_hyperparams)

    #create model
    if run_GPT:
        model = GPT(device=device, seq_len=gpt_hyperparams["seq_len"], num_words=vocab_size,
                    d_model=gpt_hyperparams["d_model"], h=gpt_hyperparams["num_heads"],
                    n=gpt_hyperparams["num_layers"]).to(device)

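        # Optionally resume from a previously saved GPT checkpoint.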
        if args.load:
            model.load_state_dict(torch.load('./gpt_model.pt'))
    else:
        model = BERT(device=device, seq_len=bert_hyperparams["seq_len"], num_words=vocab_size,
                    d_model=bert_hyperparams["d_model"], h=bert_hyperparams["num_heads"],
                    n=bert_hyperparams["num_layers"]).to(device)
        if args.load:
            model.load_state_dict(torch.load('./bert_model.pt'))

    #train model
    if args.train:
        print("training model ...")
        if run_GPT:
            train_GPT(model, train_loader, gpt_hyperparams)
            if args.save:
                torch.save(model.state_dict(), './gpt_model.pt')
        else:
            train_BERT(model, train_loader, bert_hyperparams)
            if args.save:
                torch.save(model.state_dict(), './bert_model.pt')
Example #3
def main(args):
    # Set up logging
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=False)
    log = util.get_logger(args.save_dir, args.name)
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    model = None
    log.info('Building model...')
    max_context_len, max_question_len = args.para_limit, args.ques_limit
    if (args.model_type == "bidaf" or args.model_type == "bert-bidaf"):
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=0)
    elif (args.model_type == "dcn" or args.model_type == "dcn-bidaf"):
        model = DCN(word_vectors=word_vectors,
                    hidden_size=args.hidden_size,
                    max_context_len=max_context_len,
                    max_question_len=max_question_len,
                    drop_prob=0)
    elif (args.model_type == "bert-basic"):
        model = BERT(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=0)
    model = nn.DataParallel(model, gpu_ids)
    log.info('Loading checkpoint from {}...'.format(args.load_path))
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info('Building dataset...')
    record_file = vars(args)['{}_record_file'.format(args.split)]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(dataset,
                                  batch_size=args.batch_size,
                                  shuffle=False,
                                  num_workers=args.num_workers,
                                  collate_fn=collate_fn)

    # Evaluate
    log.info('Evaluating on {} split...'.format(args.split))
    nll_meter = util.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)['{}_eval_file'.format(args.split)]
    with open(eval_file, 'r') as fh:
        gold_dict = json_load(fh)
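    # gold_dict holds the reference answers; predictions are mapped back to it for F1/EM scoring.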
    with torch.no_grad(), \
            tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            ## Additions for BERT ##
            max_context_len, max_question_len = args.para_limit, args.ques_limit

            if "bert" in args.model_type:
                bert_embeddings = get_embeddings(args.split, ids,
                                                 args.para_limit,
                                                 args.ques_limit)
            else:
                bert_embeddings = None

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_embeddings,
                                   max_context_len, max_question_len, device)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != 'test':
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(gold_dict, ids.tolist(),
                                                      starts.tolist(),
                                                      ends.tolist(),
                                                      args.use_squad_v2)
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != 'test':
        results = util.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [('NLL', nll_meter.avg), ('F1', results['F1']),
                        ('EM', results['EM'])]
        if args.use_squad_v2:
            results_list.append(('AvNA', results['AvNA']))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                for k, v in results.items())
        log.info('{} {}'.format(args.split.title(), results_str))

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(tbx,
                       pred_dict=pred_dict,
                       eval_path=eval_file,
                       step=0,
                       split=args.split,
                       num_visuals=args.num_visuals)

    # Write submission file
    sub_path = join(args.save_dir, args.split + '_' + args.sub_file)
    log.info('Writing submission file to {}...'.format(sub_path))
    with open(sub_path, 'w') as csv_fh:
        csv_writer = csv.writer(csv_fh, delimiter=',')
        csv_writer.writerow(['Id', 'Predicted'])
        for uuid in sorted(sub_dict):
            csv_writer.writerow([uuid, sub_dict[uuid]])
Example #4
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    # Comment out to only use 1 GPU on nv12
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings

    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = None
    max_context_len, max_question_len = args.para_limit, args.ques_limit
    if (args.model_type == "bidaf" or args.model_type == "bert-bidaf"):
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif (args.model_type == "dcn" or args.model_type == "bert-dcn"):
        model = DCN(word_vectors=word_vectors,
                    hidden_size=args.hidden_size,
                    max_context_len=max_context_len,
                    max_question_len=max_question_len,
                    drop_prob=args.drop_prob)
    elif (args.model_type == "bert-basic"):
        model = BERT(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=args.drop_prob)

    if model is None:
        raise ValueError('Model is unassigned. Please ensure --model_type '
                         'chooses between {bidaf, bert-bidaf, dcn, bert-dcn, bert-basic}')

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
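    # Maintain an exponential moving average of the weights; it is swapped in for each evaluation below.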
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    count_skip = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                batch_size = cw_idxs.size(0)
                count_skip += 1
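                # With --skip_examples, only one batch in five is actually trained on (quick smoke-test mode).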
                if args.skip_examples and count_skip % 5 != 0:
                    step += batch_size
                    progress_bar.update(batch_size)
                    steps_till_eval -= batch_size
                    continue
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                ## Additions for BERT ##
                max_context_len, max_question_len = args.para_limit, args.ques_limit

                if "bert" in args.model_type:
                    bert_train_embeddings = get_embeddings(
                        "train", ids, args.para_limit, args.ques_limit)
                else:
                    bert_train_embeddings = None

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_train_embeddings,
                                       max_context_len, max_question_len, device)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2, args)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Example #5
def train(**kwargs):

    # The received parameters will be used to update configuration dict
    opt.parse(kwargs)

    # Step 0: data and device
    inputs = get_inputs(data_dir=opt.data_dir,
                        corpus_file=opt.corpus_file,
                        vocab_file=opt.vocab_path)
    train_dataloader = DataLoader(dataset=BERTDataset(
        inputs, max_sen_len=opt.max_sen_len),
                                  shuffle=True,
                                  batch_size=opt.batch_size,
                                  collate_fn=BERTCollate_fn)
    use_cuda = opt.use_cuda and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
    device = torch.device('cuda' if use_cuda else 'cpu')
    writer = SummaryWriter()

    # Step 1: model
    bert = BERT(n_layers=opt.n_layers,
                d_model=opt.d_model,
                vocab_size=opt.max_vocab_size,
                max_len=opt.max_sen_len,
                n_heads=opt.n_heads,
                n_seg=opt.n_seg,
                ff_hidden=opt.n_ff_hidden,
                device=device).to(device)
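    # Two pretraining heads share the BERT encoder: MaskedLM predicts the masked tokens, NextPred scores next-sentence order.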
    masked_lm = MaskedLM(d_model=opt.d_model,
                         vocab_size=opt.max_vocab_size,
                         bert=bert).to(device)
    next_pred = NextPred(d_model=opt.d_model).to(device)

    # Write model
    dummy_input_ids = torch.zeros(
        (opt.batch_size, opt.max_sen_len)).long().to(device)
    dummy_seg_ids = torch.zeros(
        (opt.batch_size, opt.max_sen_len)).long().to(device)

    writer.add_graph(bert, (dummy_input_ids, dummy_seg_ids), False)

    # dummy_bertout = torch.zeros((opt.batch_size, opt.max_sen_len, opt.d_model)).long().to(device)
    # dummy_masked_pos = torch.zeros((opt.batch_size, opt.max_mask_len)).long().to(device)
    #
    # writer.add_graph(masked_lm, (dummy_bertout, dummy_masked_pos), True)
    # writer.add_graph(next_pred, (dummy_bertout), True)

    # Step 2: criterion and optimizer
    criterion = nn.CrossEntropyLoss()

    num_paras = sum(p.numel() for model in (bert, masked_lm, next_pred)
                    for p in model.parameters() if p.requires_grad)
    paras = list(bert.parameters()) + list(masked_lm.parameters()) + list(
        next_pred.parameters())
    print("Total number of parameters is {}".format(num_paras))
    optimizer = torch.optim.Adam(paras,
                                 lr=0.0001,
                                 betas=(0.9, 0.999),
                                 weight_decay=0.01)

    # Step 3: train
    print("Start training ...")
    for epoch in range(opt.epochs):
        epoch_loss = 0
        for i, batch_data in enumerate(train_dataloader, 1):

            input_ids, seg_ids, masked_pos, masked_token, isnext = map(
                lambda x: x.to(device), batch_data)
            # Reset gradients and forward
            optimizer.zero_grad()
            bertout = bert(input_ids, seg_ids)
            logits_lm = masked_lm(bertout, masked_pos)
            logits_clsf = next_pred(bertout)

            # Compute loss
            logits_lm = logits_lm.view(
                -1, logits_lm.size(-1))  # (bz * len_mask, vocab)
            masked_token = masked_token.view(-1)  # (bz * len_mask, )
            logits_clsf = logits_clsf.view(-1, logits_clsf.size(-1))  # (bz, 2)
            isnext = isnext.view(-1)  # (bz, )

            loss_lm = criterion(logits_lm, masked_token)
            loss_clsf = criterion(logits_clsf, isnext)
            loss = loss_lm + loss_clsf

            _, mask_preds = torch.max(logits_lm, dim=-1)
            _, next_preds = torch.max(logits_clsf, dim=-1)
            mask_pred_acc = mask_preds.eq(
                masked_token).sum().item() / masked_token.size(0)
            next_pred_acc = next_preds.eq(isnext).sum().item() / isnext.size(0)

            if i % 20 == 0:
                writer.add_scalar('loss_lm', loss_lm.item(),
                                  i + epoch * len(train_dataloader))
                writer.add_scalar('loss_clsf', loss_clsf.item(),
                                  i + epoch * len(train_dataloader))
                writer.add_scalar('lm_acc', mask_pred_acc,
                                  i + epoch * len(train_dataloader))
                writer.add_scalar('next_acc', next_pred_acc,
                                  i + epoch * len(train_dataloader))
                print(
                    'Epoch {}, Batch {}/{}, loss_lm={}, loss_next={}, lm_acc={}, next_acc={}'
                    .format(epoch + 1, i,
                            len(train_dataloader), loss_lm.item(),
                            loss_clsf.item(), mask_pred_acc, next_pred_acc))

            epoch_loss += loss.item()

            # Backward and update
            loss.backward()
            optimizer.step()

        if (1 + epoch) % 1 == 0:
            print('Epoch:', '%04d' % (epoch + 1), 'cost =',
                  '{:.6f}'.format(epoch_loss))

    print('finished train')

    # Step 4: Save model
    ckpt_file_name = dt.strftime(dt.now(), '%Y-%m-%d_%H-%M-%S.ckpt')
    save_path = os.path.join(opt.ckpt_path, ckpt_file_name)
    torch.save(bert.state_dict(), save_path)
Example #6
def finetune(ckpt_file=None):

    if ckpt_file is None:
        files = sorted(os.listdir('checkpoints'))
        if not files:
            raise FileNotFoundError('No checkpoint found in ./checkpoints')
        ckpt_file = files[-1]  # latest checkpoint by timestamped file name

    ckpt_path = os.path.join('checkpoints', ckpt_file)

    # Step 0: data and device
    inputs = get_inputs(data_dir=opt.data_dir,
                        corpus_file=opt.corpus_file,
                        vocab_file=opt.vocab_path)
    train_dataloader = DataLoader(dataset=BERTDataset(
        inputs, max_sen_len=opt.max_sen_len),
                                  shuffle=True,
                                  batch_size=opt.batch_size,
                                  collate_fn=BERTCollate_fn)
    use_cuda = opt.use_cuda and torch.cuda.is_available()
    if use_cuda:
        torch.cuda.empty_cache()
    device = torch.device('cuda' if use_cuda else 'cpu')

    writer = SummaryWriter(comment='finetune')

    # Step 1: model
    bert = BERT(n_layers=opt.n_layers,
                d_model=opt.d_model,
                vocab_size=opt.max_vocab_size,
                max_len=opt.max_sen_len,
                n_heads=opt.n_heads,
                n_seg=opt.n_seg,
                ff_hidden=opt.n_ff_hidden,
                device=device).to(device)
    bert.load_state_dict(torch.load(ckpt_path))

    clf = NextSenCLF(d_model=opt.d_model, bert=bert).to(device)

    # Step 2: criterion and optimizers
    criterion = nn.CrossEntropyLoss()
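    # Two optimizers: a very small learning rate for the pretrained encoder, a larger one for the fresh classifier head.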
    optimizer1 = torch.optim.Adam(bert.parameters(), lr=1e-8)
    optimizer2 = torch.optim.Adam(clf.clf.parameters(), lr=0.001)

    # Step 3: training
    for epoch in range(max_epochs):
        epoch_loss = 0
        for i, batch_data in enumerate(train_dataloader, 1):

            input_ids, seg_ids, _, _, isnext = map(lambda x: x.to(device),
                                                   batch_data)
            # Reset gradients and forward
            optimizer1.zero_grad()
            optimizer2.zero_grad()
            clfout = clf(input_ids, seg_ids)

            # Compute loss
            clfout = clfout.view(-1, clfout.size(-1))
            isnext = isnext.view(-1, )
            loss = criterion(clfout, isnext)

            _, next_preds = torch.max(clfout, dim=-1)
            next_pred_acc = next_preds.eq(isnext).sum().item() / isnext.size(0)

            if i % 5 == 0:
                writer.add_scalar('clf_loss', loss.item(),
                                  i + epoch * len(train_dataloader))
                writer.add_scalar('clf_acc', next_pred_acc,
                                  i + epoch * len(train_dataloader))

                print('Epoch {}, Batch {}/{}, clf_loss={}, clf_acc={}'.format(
                    epoch + 1, i, len(train_dataloader), loss.item(),
                    next_pred_acc))

            epoch_loss += loss.item()

            # Backward and update
            loss.backward()
            optimizer1.step()
            optimizer2.step()
Example #7
                                batch_size=args.batch_size,
                                sort_key=lambda x: len(x.text),
                                device=args.device,
                                train=True,
                                sort=True,
                                sort_within_batch=True)
    test_iter = Iterator(test_dataset,
                         batch_size=args.batch_size,
                         device=args.device,
                         train=False,
                         shuffle=False,
                         sort=False)

    logger.info("Initializations done")

    model = BERT().to(args.device)
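    # Optimize only the parameters that require gradients, with a small fine-tuning learning rate.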
    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                           lr=2e-6)

    logger.info("Begin Training")
    train(model=model,
          optimizer=optimizer,
          device=args.device,
          train_loader=train_iter,
          valid_loader=valid_iter,
          num_epochs=args.epochs,
          eval_every=args.eval_every,
          ckpt_dir=args.ckpt_dir)
    print("Training over")

    best_model = BERT().to(args.device)
Example #8
    model = vanillaLSTM(size_vocab, args.dim_emb, args.dim_hid, args.nlayers, 
                           n_labels, args.dropout, pad_id, corpus, 
                           no_glove=args.no_glove, freeze=args.freeze,
                           bidirectional=True).to(my_device)
elif args.model == 'lstm_crf':
    model = LSTMCRF(size_vocab, args.dim_emb, args.dim_hid, args.nlayers, 
                           n_labels, args.dropout, pad_id, corpus, 
                           no_glove=args.no_glove, freeze=args.freeze,
                           bidirectional=False).to(my_device)
elif args.model == 'bilstm_crf':
    model = LSTMCRF(size_vocab, args.dim_emb, args.dim_hid, args.nlayers, 
                           n_labels, args.dropout, pad_id, corpus, 
                           no_glove=args.no_glove, freeze=args.freeze,
                           bidirectional=True).to(my_device)  
elif 'bert' in args.model:
    model = BERT(n_labels, corpus, seq2seq, args.model, args.dropout).to(my_device)
    # freeze bert's pretrained parameters
    #for param in model.encoder.bert.parameters():
    #    param.requires_grad=False
else:
    raise ValueError('Choose a model among the five options.')

# choose minimization method
#if 'bert' in args.model:
    # update only classifier model and use fixed pretrained models
    # https://www.depends-on-the-definition.com/named-entity-recognition-with-bert/
#    param_optimizer = list(model.encoder.classifier.named_parameters()) 
#    optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
#    optimizer = optim.Adam(optimizer_grouped_parameters, lr=args.lr)
#else:
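# By default every model parameter (pretrained encoder included) is updated with a single Adam optimizer.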
optimizer = optim.Adam(model.parameters(), lr=args.lr)