Esempio n. 1
0
def train_entry(config):
    from models import QANet

    with open(config.word_emb_file, "rb") as fh:
        word_mat = np.array(pickle.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "rb") as fh:
        char_mat = np.array(pickle.load(fh), dtype=np.float32)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")

    train_dataset = SQuADDataset(config.train_record_file, config.num_steps, config.batch_size)
    dev_dataset = SQuADDataset(config.dev_record_file, config.val_num_batches, config.batch_size)

    lr = config.learning_rate
    base_lr = 1
    lr_warm_up_num = config.lr_warm_up_num

    model = QANet(word_mat, char_mat).to(device)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    #optimizer = optim.Adam(lr=base_lr, betas=(0.8, 0.999), eps=1e-7, weight_decay=3e-7, params=parameters)
    #optimizer = optim.SparseAdam(lr=lr, betas=(0.8, 0.999), eps=1e-7, params=parameters)
    optimizer = optim.Adam(lr=lr, params=parameters)
    #cr = lr / math.log2(lr_warm_up_num)
    #scheduler = optim.lr_scheduler.LambdaLR(
    #    optimizer,
    #    lr_lambda=lambda ee: cr * math.log2(ee + 1) if ee < lr_warm_up_num else lr)
    scheduler=''
    L = config.checkpoint
    N = config.num_steps
    best_f1 = 0
    best_em = 0
    patience = 0
    unused = False
    for iter in range(0, N, L):
        train(model, optimizer, scheduler, train_dataset, iter, L)
        metrics = test(model, dev_dataset, dev_eval_file)
        if iter + L >= lr_warm_up_num - 1 and unused:
            optimizer.param_groups[0]['initial_lr'] = lr
            scheduler = optim.lr_scheduler.ExponentialLR(optimizer, 0.99997)
            unused = False
        if config.print_weight:
            print_weight(model, 5, iter + L)
        #print("Learning rate: {}".format(scheduler.get_lr()))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > config.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)

        fn = os.path.join(config.save_dir, "model.pt")
        torch.save(model, fn)
Esempio n. 2
0
def train_entry(config):
    from models import QANet

    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.train_eval_file, "r") as fh:
        train_eval_file = json.load(fh)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")

    train_dataset = SQuADDataset(config.train_record_file, config.num_steps,
                                 config.batch_size)
    dev_dataset = SQuADDataset(config.dev_record_file, config.test_num_batches,
                               config.batch_size)

    lr = config.learning_rate
    base_lr = 1.0
    warm_up = config.lr_warm_up_num

    model = QANet(word_mat, char_mat).to(device)
    ema = EMA(config.ema_decay)
    for name, p in model.named_parameters():
        if p.requires_grad: ema.set(name, p)
    params = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr,
                           betas=(config.beta1, config.beta2),
                           eps=1e-7,
                           weight_decay=3e-7,
                           params=params)
    cr = lr / log2(warm_up)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * log2(ee + 1) if ee < warm_up else lr)
    L = config.checkpoint
    N = config.num_steps
    best_f1 = best_em = patience = 0
    for iter in range(0, N, L):
        train(model, optimizer, scheduler, ema, train_dataset, iter, L)
        valid(model, train_dataset, train_eval_file)
        metrics = test(model, dev_dataset, dev_eval_file)
        print("Learning rate: {}".format(scheduler.get_lr()))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > config.early_stop: break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)

        fn = os.path.join(config.save_dir,
                          f"model_iter={iter}_best_f1={best_f1}.pt")
        torch.save(model, fn)
Esempio n. 3
0
def train_entry(config):
    from models import QANet

    with open(config.word_emb_file, "rb") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "rb") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")

    train_dataset = get_loader(config.train_record_file, config.batch_size)
    dev_dataset = get_loader(config.dev_record_file, config.batch_size)

    lr = config.learning_rate
    base_lr = 1
    lr_warm_up_num = config.lr_warm_up_num

    model = QANet(word_mat, char_mat).to(device)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr,
                           betas=(0.8, 0.999),
                           eps=1e-6,
                           weight_decay=3e-7,
                           params=parameters)
    cr = lr / math.log2(lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1)
        if ee < lr_warm_up_num else lr)
    best_f1 = 0
    best_em = 0
    patience = 0
    unused = False
    for iter in range(config.num_epoch):
        train(model, optimizer, scheduler, train_dataset, dev_dataset,
              dev_eval_file, iter)
        metrics = test(model, dev_dataset, dev_eval_file,
                       (iter + 1) * len(train_dataset))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > config.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)

        fn = os.path.join(config.save_dir, "model.pt")
        torch.save(model, fn)
Esempio n. 4
0
def train(epoch, data, model=None):
    if model == None:
        model = QANet(data).to(device)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(betas=(0.8, 0.999),
                           eps=1e-7,
                           weight_decay=3e-7,
                           params=parameters)
    crit = lr / math.log2(1000)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: crit * math.log2(ee + 1)
        if ee + 1 <= 1000 else lr)
    packs = trunk(data.train.packs, batch_size)
    f_log = open("log/model.log", "w")
    # test(model, data, ep=0, iter=0, test_num=10, test_size=50, f_log=f_log)
    try:
        for ep in range(epoch):
            print("EPOCH {:02d}: ".format(ep))
            l = len(packs)
            for i in tqdm(range(l)):
                pack = packs[i]
                Cw, Cc, Qw, Qc, a = to_batch(pack, data, data.train)
                optimizer.zero_grad()
                out1, out2 = model(Cw, Cc, Qw, Qc)
                loss1 = F.cross_entropy(out1, a[:, 0])
                loss2 = F.cross_entropy(out2, a[:, 1])
                loss = loss1 + loss2
                writer.add_scalar("data/loss", float(loss), ep * l + i)
                loss.backward()
                scheduler.step()
                if (i + 1) % checkpoint == 0:
                    torch.save(
                        model,
                        os.path.join(
                            model_dir,
                            "model-tmp-{:02d}-{}.pt".format(ep, i + 1)))
                    # test(model, data, ep, i, 10, 50, f_log)
            # test(model, data, ep, i, 1, -1, f_log)
            random.shuffle(packs)
        torch.save(model, os.path.join(model_dir, model_fn))
        writer.close()
    except Exception as e:
        torch.save(
            model,
            os.path.join(model_dir, "model-{:02d}-{}.pt".format(ep, i + 1)))
        raise e
    except KeyboardInterrupt as k:
        torch.save(
            model,
            os.path.join(model_dir, "model-{:02d}-{}.pt".format(ep, i + 1)))
    return model
Esempio n. 5
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    # Args:  word_vectors: word vector tensor of dimension [vocab_size * wemb_dim]
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get Model
    log.info('Building Model...')
    model = QANet(word_vectors,
                  char_vectors,
                  args.para_limit,
                  args.ques_limit,
                  args.f_model,
                  num_head=args.num_head,
                  train_cemb = (not args.pretrained_char))
    model = nn.DataParallel(model, args.gpu_ids)
    
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    parameters = filter(lambda p: p.requires_grad, model.parameters())
    optimizer = optim.Adam(
        params=parameters,
        lr=args.lr,
        betas=(args.beta1, args.beta2),
        eps=1e-8,
        weight_decay=3e-7)
    cr = 1.0 / math.log(args.lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log(ee + 1)
        if ee < args.lr_warm_up_num else 1)
    loss_f = torch.nn.CrossEntropyLoss()

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)

                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = torch.mean(loss_f(log_p1, y1) + loss_f(log_p2, y2))
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 6
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    """
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    """
    '''
    model = charBiDAF(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      emb_size=char_vectors.size(1),
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    '''
    model = QANet(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  emb_size=char_vectors.size(1),
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    print(
        "YOU ARE ENTERING THE QA NET MODEL TRAINING_____________________________________________________________________"
    )
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    #print("Model status - ", cuda_or_cpu(model))
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    #torch.utils.data._utils.MP_STATUS_CHECK_INTERVAL = 300
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)

    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=0,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:

                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()

                #print(torch.cuda.memory_allocated(device=None))
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)
                #torch.cuda.empty_cache()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    # print('Memory 1: ', torch.cuda.memory_allocated())
                    ema.assign(model)
                    # print('Memory 2: ', torch.cuda.memory_allocated())
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 7
0
def train(config):
    from models import QANet

    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.train_eval_file, "r") as fh:
        train_eval_file = json.load(fh)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)
    with open(config.dev_meta, "r") as fh:
        meta = json.load(fh)
    train_log = open(config.train_log, "w")

    dev_total = meta["total"]
    print("Building model...")

    train_dataset = SQuADDataset(config.train_record_file, config.num_steps,
                                 config.batch_size)
    dev_dataset = SQuADDataset(config.dev_record_file, config.val_num_batches,
                               config.batch_size)

    lr = config.learning_rate

    model = QANet(word_mat, char_mat).to(device)
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(betas=(0.8, 0.999),
                           eps=1e-7,
                           weight_decay=3e-7,
                           params=parameters)
    crit = lr / math.log2(1000)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: crit * math.log2(ee + 1)
        if ee + 1 <= 1000 else lr)

    for ep in tqdm(range(config.num_steps), total=config.num_steps):
        (Cwid, Ccid, Qwid, Qcid, y1, y2, ids) = train_dataset[ep]
        Cwid, Ccid, Qwid, Qcid = Cwid.to(device), Ccid.to(device), Qwid.to(
            device), Qcid.to(device)
        p1, p2 = model(Cwid, Ccid, Qwid, Qcid)
        y1, y2 = y1.to(device), y2.to(device)
        loss1 = F.cross_entropy(p1, y1)
        loss2 = F.cross_entropy(p2, y1)
        loss = loss1 + loss2
        loss.backward(retain_graph=True)
        scheduler.step()
        model.zero_grad()
        if (ep + 1) % config.checkpoint == 0:
            del Cwid, Ccid, Qwid, Qcid, y1, y2, p1, p2, loss
            torch.cuda.empty_cache()
            metric = evaluate_batch(model, dev_eval_file, dev_dataset)
            log_ = "EPOCH {:8d} loss {:8f} F1 {:8f} EM {:8f}\n".format(
                ep, metric["loss"], metric["f1"], metric["exact_match"])
            train_log.write(log_)
            train_log.flush()
            dev_f1 = metric["f1"]
            dev_em = metric["exact_match"]
            if dev_f1 < best_f1 and dev_em < best_em:
                patience += 1
                if patience > config.early_stop:
                    break
            else:
                patience = 0
                best_em = max(best_em, dev_em)
                best_f1 = max(best_f1, dev_f1)

            fn = os.path.join(config.save_dir, "model_{}.ckpt".format(ep))
            torch.save(model, fn, pickle_protocol=False)
Esempio n. 8
0
def train_entry(config):
    from models import QANet

    with open(config.word_emb_file, "rb") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "rb") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")

    train_dataset = get_loader(config.train_record_file, config.batch_size)
    dev_dataset = get_loader(config.dev_record_file, config.batch_size)

    lr = config.learning_rate
    base_lr = 1
    lr_warm_up_num = config.lr_warm_up_num

    model = QANet(word_mat, char_mat).to(device)
    if torch.cuda.device_count() > 1:
        print('i can use gpu')
        model = torch.nn.DataParallel(model, device_ids=[0, 1])
    model.load_state_dict(
        torch.load('/home/cn/AI/QANet-pytorch-/model_state_dict.pt'))
    ema = EMA(config.decay)
    for name, param in model.named_parameters():
        if param.requires_grad:
            ema.register(name, param.data)

    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr,
                           betas=(0.9, 0.999),
                           eps=1e-7,
                           weight_decay=5e-8,
                           params=parameters)
    cr = lr / math.log2(lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1)
        if ee < lr_warm_up_num else lr)
    best_f1 = 0
    best_em = 0
    patience = 0
    unused = False
    for iter in range(config.num_epoch):
        train(model, optimizer, scheduler, train_dataset, dev_dataset,
              dev_eval_file, iter, ema)
        print(iter)
        ema.assign(model)
        metrics = test(model, dev_dataset, dev_eval_file,
                       (iter + 1) * len(train_dataset))
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        if dev_f1 < best_f1 and dev_em < best_em:
            patience += 1
            if patience > config.early_stop:
                break
        else:
            patience = 0
            best_f1 = max(best_f1, dev_f1)
            best_em = max(best_em, dev_em)

        fn = os.path.join(config.save_dir, "model.pt")
        torch.save(model, fn)
        torch.save(model.state_dict(), 'model_state_dict.pt')
        ema.resume(model)
Esempio n. 9
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # setup_args = get_setup_args()
    with open(args.char2idx_file, "r") as f:
        char2idx = json_load(f)
    # Get model
    log.info('Building model...')
    model = QANet(word_vectors=word_vectors, char2idx=char2idx)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    # optimizer = optim.Adam(model.parameters(), lr=0.001, betas=(0.8, 0.999))
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')

    dataset1 = np.load(args.train_record_file)
    outfile = '/content/dataset/train_light.npz'
    n_row = 75000
    np.savez(outfile,
             context_idxs=dataset1['context_idxs'][:n_row],
             context_char_idxs=dataset1['context_char_idxs'][:n_row],
             ques_idxs=dataset1['ques_idxs'][:n_row],
             ques_char_idxs=dataset1['ques_char_idxs'][:n_row],
             y1s=dataset1['y1s'][:n_row],
             y2s=dataset1['y2s'][:n_row],
             ids=dataset1['ids'][:n_row])

    # train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_dataset = SQuAD(outfile, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()
                if step % 10000 == 0:
                    print('loss val: {}'.format(loss_val))

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 10
0
def train(config):
    from models import QANet

    with open(config.word_emb_file, "r") as fh:
        word_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.char_emb_file, "r") as fh:
        char_mat = np.array(json.load(fh), dtype=np.float32)
    with open(config.train_eval_file, "r") as fh:
        train_eval_file = json.load(fh)
    with open(config.dev_eval_file, "r") as fh:
        dev_eval_file = json.load(fh)

    print("Building model...")

    model = QANet(word_mat, char_mat).to(device)
    train_dataset = SQuADDataset(config.train_record_file, config.num_steps,
                                 config.batch_size)
    dev_dataset = SQuADDataset(config.dev_record_file,
                               -1,
                               config.batch_size,
                               name='dev')

    lr = config.learning_rate
    base_lr = 1.0
    lr_warm_up_num = config.lr_warm_up_num

    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr,
                           betas=(config.beta1, config.beta2),
                           eps=1e-7,
                           weight_decay=3e-7,
                           params=parameters)
    cr = lr / math.log2(lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1)
        if ee < lr_warm_up_num else lr)

    L = config.checkpoint
    N = config.num_steps
    best_f1 = 0
    best_em = 0
    patience = 0
    unused = True
    for iter in tqdm(range(0, N)):
        if iter % L == 0:
            valid_train_loss, valid_train_metrics = valid(model,
                                                          train_dataset,
                                                          train_eval_file,
                                                          num_ex=1000)
            valid_dev_loss, valid_dev_metrics = valid(model,
                                                      dev_dataset,
                                                      dev_eval_file,
                                                      num_ex=1000)
            if config.use_tensorboard:
                writer.add_scalar('data/valid_train_loss', valid_train_loss,
                                  iter / L)
                writer.add_scalar('data/valid_dev_loss', valid_dev_loss,
                                  iter / L)
                writer.add_scalar('data/valid_train_em',
                                  valid_train_metrics['exact_match'], iter / L)
                writer.add_scalar('data/valid_dev_em',
                                  valid_dev_metrics['exact_match'], iter / L)
                writer.add_scalar('data/valid_train_f1',
                                  valid_train_metrics['f1'], iter / L)
                writer.add_scalar('data/valid_dev_f1', valid_dev_metrics['f1'],
                                  iter / L)
        train_loss = update(model, optimizer, scheduler, train_dataset[iter])
        if config.use_tensorboard:
            writer.add_scalar('data/train_loss', train_loss, iter)
        if iter + L >= lr_warm_up_num - 1 and unused:
            optimizer.param_groups[0]['initial_lr'] = lr
            scheduler = optim.lr_scheduler.ExponentialLR(
                optimizer, config.decay)
            unused = False
        #print("Learning rate: {}".format(scheduler.get_lr()))
        '''
        dev_f1 = metrics["f1"]
        dev_em = metrics["exact_match"]
        print('after {} steps , f1={} em={}'.format(iter, dev_f1, dev_em))
        if dev_em < best_em:
            patience += 1
            print('doesnot beat best model, patience={}'.format(patience))
            if patience > config.early_stop:
                break
        else:
            fn = os.path.join(config.save_dir, "model.pt")
            result = os.path.join(config.save_dir, "best_result.txt")
            print('beat best model, now best em={}, f1={}'.format(dev_em, dev_f1))
            with open(result, "w") as f:
                json.dump(metrics, f)
            best_em = dev_em
            best_f1 = max(best_f1, dev_f1)
            torch.save(model, fn)
            patience = 0
        '''
    writer.close()
Esempio n. 11
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get word embeddings
    log.info('Loading word embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    ### start our code:
    # Get char-embeddings
    log.info('Loading char-embeddings...')
    char_vectors = util.torch_from_json(args.char_emb_file)
    ### end our code

    # Get model
    log.info('Building model...')
    # model = BiDAF(word_vectors=word_vectors,
    #               hidden_size=args.hidden_size,
    #               char_vectors = char_vectors,
    #               drop_prob=args.drop_prob)

    ### start our code:   (QANet)
    model = QANet(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  kernel_size=7,
                  filters=128,
                  drop_prob=args.drop_prob)
    ### end our code

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # https://pytorch.org/docs/stable/optim.html
    # Original:
    # optimizer = optim.Adadelta(model.parameters(), args.lr,
    #                           weight_decay=args.l2_wd)
    #                   # default: lr=0.5, rho=0.9, eps=1e-06, weight_decay=0
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    # A function which computes a multiplicative factor given an integer parameter epoch,
    # or a list of such functions, one for each group in optimizer.param_groups

    ### start our code:
    optimizer = optim.Adam(model.parameters(),
                           lr=1e-3,
                           betas=(0.8, 0.999),
                           eps=1e-7,
                           weight_decay=3e-7,
                           amsgrad=False)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Adabound
    # optimizer = adabound.AdaBound(model.parameters(), lr=1e-3, final_lr=0.1)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)
    ### end our code

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:

                # print("cc_idxs: ", cc_idxs)
                # print("qc_idxs: ", qc_idxs)

                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)

                ### start our code:
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)

                ### end our code

                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(cw_idxs, qw_idxs)   # original

                ### start our code:
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)

                ### end our code

                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)

                ### start our code:
                # optimizer = LR_decay(optimizer, epoch, lr = 0.8)   # initial learning rate 0.8
                if step < 1000:
                    optimizer = LR_warmup(optimizer, step, lr=0.)
                ### end our code

                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)