Esempio n. 1
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir) # Writes entries directly to event files in the logdir to be consumed by TensorBoard.
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))  # 一个gpu: batch_size=64 看实际情况

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path: # default=None
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay) # ema_decay = 0.999
    # ema core =>  new_average = (1.0 - decay) * param.data + decay * self.shadow[name]

    # Get saver
    # metric_name: NLL or EM F1
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints, # max_checkpoints = 5
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)  # lr : default=0.5    l2_wd : default=0
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)  # train_record_file = './data/train.npz'

    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size, # 64
                                   shuffle=True, # sampler = RandomSampler(dataset) batch_sampler = BatchSampler(sampler, batch_size, drop_last)
                                   num_workers=args.num_workers, # 4
                                   collate_fn=collate_fn) # merges a list of samples to form a mini-batch.
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2) #  dev_record_file =  './data/dev.npz'
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)# Merge examples of different length by padding all examples to the maximum length in the batch.

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps # 50000
    epoch = step // len(train_dataset) # len(train_dataset)= 7   epoch=0
    while epoch != args.num_epochs: # 30
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)  # 64
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)   # max_grad_norm : default=5.0
                optimizer.step()  # 进行1次optimize
                scheduler.step(step // batch_size)# train : step=0
                ema(model, step // batch_size)   # def __call__(self, model, num_updates):

                # Log info
                step += batch_size #step: 0   batch_size=64
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step) # Add scalar data to summary.
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps  # 50000

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model) ##
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file, # './data/dev_eval.json'
                                                  args.max_ans_len,  # 15
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 2
0
def main(args):
  print("in main")
  print("args: ", args)

  if True: 
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')

    # CHECK IF WE NEED TO USE ALL OF THESE???? 
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR


    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    print("train dataset!: ", train_dataset)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)


# Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 3
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # ######################################
    # tokenizer = BertTokenizer.from_pretrained('bert-large-uncased', do_lower_case=True)
    # train_examples = None
    # train_examples = read_squad_examples(
    #     input_file=args.train_file, is_training=True, version_2_with_negative=args.version_2_with_negative)
    # train_features = convert_examples_to_features(
    #     examples=train_examples,
    #     tokenizer=tokenizer,
    #     max_seq_length=args.max_seq_length,
    #     doc_stride=args.doc_stride,
    #     max_query_length=args.max_query_length,
    #     is_training=True)
    # if args.local_rank == -1 or torch.distributed.get_rank() == 0:
    #     logger.info("  Saving train features into cached file %s", cached_train_features_file)
    #     with open(cached_train_features_file, "wb") as writer:
    #         pickle.dump(train_features, writer)
    # all_input_ids = torch.tensor([f.input_ids for f in train_features], dtype=torch.long)
    # x = all_input_ids
    ###########################################

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)

                # added_flag
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)

                optimizer.zero_grad()

                # Forward
                # log_p1, log_p2 = model(cw_idxs, qw_idxs)
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 4
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)
            optimizer.zero_grad()

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            loss_val = loss.item()

            # Backward
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step(step // batch_size)
            ema(model, step // batch_size)

            # Log info
            step += batch_size
            if step % 1000 == 0 and step > 0:
                log.info(f'Step {step}: training loss {loss_val}...')

            steps_till_eval -= batch_size
            if steps_till_eval <= 0:
                steps_till_eval = args.eval_steps

                # Evaluate and save checkpoint
                log.info(f'Evaluating at step {step}...')
                ema.assign(model)
                results, pred_dict = evaluate(model, dev_loader, device,
                                              args.dev_eval_file,
                                              args.max_ans_len,
                                              args.use_squad_v2)
                saver.save(step, model, results[args.metric_name], device)
                ema.resume(model)

                # Log to console
                results_str = ', '.join(f'{k}: {v:05.2f}'
                                        for k, v in results.items())
                log.info(f'Dev {results_str}')

                # Log to TensorBoard
                log.info('Visualizing in TensorBoard...')
                for k, v in results.items():
                    tbx.add_scalar(f'dev/{k}', v, step)
                util.visualize(tbx,
                               pred_dict=pred_dict,
                               eval_path=args.dev_eval_file,
                               step=step,
                               split='dev',
                               num_visuals=args.num_visuals)
Esempio n. 5
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    # Comment out to only use 1 GPU on nv12
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings

    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    model = None
    max_context_len, max_question_len = args.para_limit, args.ques_limit
    if (args.model_type == "bidaf" or args.model_type == "bert-bidaf"):
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif (args.model_type == "dcn" or args.model_type == "bert-dcn"):
        model = DCN(word_vectors=word_vectors,
                    hidden_size=args.hidden_size,
                    max_context_len=max_context_len,
                    max_question_len=max_question_len,
                    drop_prob=args.drop_prob)
    elif (args.model_type == "bert-basic"):
        model = BERT(word_vectors=word_vectors,
                     hidden_size=args.hidden_size,
                     drop_prob=args.drop_prob)

    if model is None:
        raise ValueError('Model is unassigned. Please ensure --model_type \
        chooses between {bidaf, bert-bidaf, dcn, bert-dcn, bert-basic} ')

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    count_skip = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                batch_size = cw_idxs.size(0)
                count_skip += 1
                if (args.skip_examples == True
                        and (count_skip % 5 == 1 or count_skip % 5 == 2
                             or count_skip % 5 == 3 or count_skip % 5 == 4)):
                    step += batch_size
                    progress_bar.update(batch_size)
                    steps_till_eval -= batch_size
                    continue
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                ## Additions for BERT ##
                max_context_len, max_question_len = args.para_limit, args.ques_limit

                if "bert" in args.model_type:
                    bert_train_embeddings = get_embeddings(
                        "train", ids, args.para_limit, args.ques_limit)
                else:
                    bert_train_embeddings = None

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, bert_train_embeddings, \
                max_context_len, max_question_len, device)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2, args)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 6
0
def main(args):

    # Set up logging and devices
    name = "train_exp2"
    args.save_dir = util.get_save_dir(args.logging_dir, name, training=True)
    log = get_logger(args.save_dir, name)
    tbx = SummaryWriter(args.save_dir)
    device, gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.random_seed}...")
    random.seed(args.random_seed)
    np.random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # Get embeddings
    log.info(f"Loading embeddings from {args.word_emb_file}...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.learning_rate,
                               weight_decay=args.learning_rate_decay)
    # scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    scheduler = sched.ReduceLROnPlateau(optimizer=optimizer,
                                        mode="min", factor=0.1,
                                        patience=2, verbose=True, cooldown=0 
                                        min_lr=0.0005)


    for epoch in range(args.num_epochs):
        log.info(f"Starting epoch {epoch}...")
        for i in range(args.num_train_chunks):
        # Get data loader
            train_rec_file = f"{args.train_record_file_exp2}_{i}.npz"
            log.info(f'Building dataset from {train_rec_file} ...')
            train_dataset = SQuAD(train_rec_file, args.exp2_train_topic_contexts, use_v2=True)
            train_loader = data.DataLoader(train_dataset,
                                           batch_size=args.batch_size,
                                           shuffle=True,
                                           num_workers=args.num_workers,
                                           collate_fn=collate_fn)

            # Train
            log.info('Training...')
            steps_till_eval = args.eval_steps
            epoch = 0
        # torch.set_num_threads(7)
            with torch.enable_grad(), tqdm(total=len(train_loader.dataset)) as progress_bar:
                for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                    # Setup for forward
                    cw_idxs = cw_idxs.to(device)
                    qw_idxs = qw_idxs.to(device)
                    batch_size = qw_idxs.size(0)
                    optimizer.zero_grad()

                    # Forward
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                    # Backward
                    loss.backward()
                    nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                    optimizer.step()
                    scheduler.step(step // batch_size)
                    ema(model, step // batch_size)

                    # Log info
                    step += batch_size
                    progress_bar.update(batch_size)
                    progress_bar.set_postfix(epoch=epoch,
                                             NLL=loss_val)
                    tbx.add_scalar('train/NLL', loss_val, step)
                    tbx.add_scalar('train/LR',
                                   optimizer.param_groups[0]['lr'],
                                   step)

                    steps_till_eval -= batch_size
                    if steps_till_eval <= 0:
                        steps_till_eval = args.eval_steps

                        # Evaluate and save checkpoint
                        log.info(f"Evaluating at step {step}...")
                        ema.assign(model)

                        for i in range(args.num_dev_chunks):
                        # Get data loader
                            all_pred_dicts = {}
                            all_results = OrderedDict() 
                            dev_rec_file = f"{args.dev_record_file_exp2}_{i}.npz"
                            log.info(f'Building evaluating dataset from {dev_rec_file} ...')
                            dev_dataset = SQuAD(dev_rec_file, 
                                                args.exp2_dev_topic_contexts, 
                                                use_v2=True)
                            dev_loader = data.DataLoader(dev_dataset,
                                                           batch_size=args.batch_size,
                                                           shuffle=True,
                                                           num_workers=args.num_workers,
                                                           collate_fn=collate_fn)
                            results, pred_dict = evaluate(model, dev_loader, device,
                                                          args.dev_eval_file,
                                                          args.max_ans_len,
                                                          use_squad_v2=True)
                            all_results.update(results)
                            all_pred_dicts.update(pred_dict)

                            del dev_dataset
                            del dev_loader
                            del results
                            del pred_dict
                            torch.cuda.empty_cache()

                        saver.save(step, model, all_results[args.metric_name], device)
                        ema.resume(model)

                        # Log to console
                        results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in all_results.items())
                        log.info(f"Dev {results_str}")

                        # Log to TensorBoard
                        log.info('Visualizing in TensorBoard...')
                        for k, v in all_results.items():
                            tbx.add_scalar(f"dev/{k}", v, step)
                        util.visualize(tbx,
                                       pred_dict=all_pred_dicts,
                                       eval_path=args.dev_eval_file,
                                       step=step,
                                       split='dev',
                                       num_visuals=args.num_visuals)
                    torch.cuda.empty_cache()
            del train_dataset
            del train_loader
            torch.cuda.empty_cache()
Esempio n. 7
0
def main(args):
    save_dir = os.path.join("./save", time.strftime("%m%d%H%M%S"))
    if not os.path.exists(save_dir):
        os.makedirs(save_dir)
    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    if args.all_data:
        data_loader = get_ext_data_loader(tokenizer,
                                          "./data/train/",
                                          shuffle=True,
                                          args=args)
    else:
        data_loader, _, _ = get_data_loader(tokenizer,
                                            "./data/train-v1.1.json",
                                            shuffle=True,
                                            args=args)
    vocab_size = len(tokenizer.vocab)
    if args.bidaf:
        print("train bidaf")
        model = BiDAF(embedding_size=args.embedding_size,
                      vocab_size=vocab_size,
                      hidden_size=args.hidden_size,
                      drop_prob=args.dropout)
    else:
        ntokens = len(tokenizer.vocab)
        model = QANet(ntokens,
                      embedding=args.embedding,
                      embedding_size=args.embedding_size,
                      hidden_size=args.hidden_size,
                      num_head=args.num_head)
    if args.load_model:
        state_dict = torch.load(args.model_path, map_location="cpu")
        model.load_state_dict(state_dict)
        print("load pre-trained model")
    device = torch.device("cuda")
    model = model.to(device)
    model.train()
    ema = EMA(model, args.decay)

    base_lr = 1
    parameters = filter(lambda param: param.requires_grad, model.parameters())
    optimizer = optim.Adam(lr=base_lr,
                           betas=(0.9, 0.999),
                           eps=1e-7,
                           weight_decay=5e-8,
                           params=parameters)
    cr = args.lr / math.log2(args.lr_warm_up_num)
    scheduler = optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda ee: cr * math.log2(ee + 1)
        if ee < args.lr_warm_up_num else args.lr)
    step = 0
    num_batches = len(data_loader)
    avg_loss = 0
    best_f1 = 0
    for epoch in range(1, args.num_epochs + 1):
        step += 1
        start = time.time()
        model.train()
        for i, batch in enumerate(data_loader, start=1):
            c_ids, q_ids, start_positions, end_positions = batch
            c_len = torch.sum(torch.sign(c_ids), 1)
            max_c_len = torch.max(c_len)
            c_ids = c_ids[:, :max_c_len].to(device)
            q_len = torch.sum(torch.sign(q_ids), 1)
            max_q_len = torch.max(q_len)
            q_ids = q_ids[:, :max_q_len].to(device)

            start_positions = start_positions.to(device)
            end_positions = end_positions.to(device)

            optimizer.zero_grad()
            loss = model(c_ids,
                         q_ids,
                         start_positions=start_positions,
                         end_positions=end_positions)
            loss.backward()
            avg_loss = cal_running_avg_loss(loss.item(), avg_loss)
            nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
            optimizer.step()
            scheduler.step(step)
            ema(model, step // args.batch_size)

            batch_size = c_ids.size(0)
            step += batch_size

            msg = "{}/{} {} - ETA : {} - qa_loss: {:.4f}" \
                .format(i, num_batches, progress_bar(i, num_batches),
                        eta(start, i, num_batches),
                        avg_loss)
            print(msg, end="\r")
        if not args.debug:
            metric_dict = eval_qa(args, model)
            f1 = metric_dict["f1"]
            em = metric_dict["exact_match"]
            print("epoch: {}, final loss: {:.4f}, F1:{:.2f}, EM:{:.2f}".format(
                epoch, avg_loss, f1, em))

            if args.bidaf:
                model_name = "bidaf"
            else:
                model_name = "qanet"
            if f1 > best_f1:
                best_f1 = f1
                state_dict = model.state_dict()
                save_file = "{}_{:.2f}_{:.2f}".format(model_name, f1, em)
                path = os.path.join(save_dir, save_file)
                torch.save(state_dict, path)
def main(data, flags):
    # Set up logging and devices
    log_dir = data.logging_dir
    log = util.get_logger(log_dir, "toy")
    tbx = SummaryWriter(data.logging_dir)
    device, data.gpu_ids = util.get_available_devices()
    log.info('Config: {}'.format(dumps(vars(data), indent=4, sort_keys=True)))
    data.batch_size *= max(1, len(data.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(data.random_seed))
    random.seed(data.random_seed)
    np.random.seed(data.random_seed)
    torch.manual_seed(data.random_seed)
    torch.cuda.manual_seed_all(data.random_seed)

    if flags[1] == "toy":
        word_emb_file = data.toy_word_emb_file
        training_data = data.toy_record_file_exp3
        test_data = data.dev_record_file_exp3
        eval_file = data.toy_eval_exp3
    elif flags[1] == "train":
        word_emb_file = data.word_emb_file
        training_data = data.train_record_file_exp3
        test_data = data.dev_record_file_exp3
        eval_file = data.train_eval_exp3
    elif flags[1] == "dev":
        word_emb_file = data.word_emb_file
        training_data = data.dev_record_file_exp3
        test_data = data.toy_record_file_exp3
        eval_file = data.dev_eval_exp3

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(word_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  hidden_size=data.hidden_size,
                  drop_prob=data.drop_prob)
    model = nn.DataParallel(model, data.gpu_ids)
    if data.load_path:
        log.info('Loading checkpoint from {}...'.format(data.load_path))
        model, step = util.load_model(model, data.load_path, data.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, data.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(data.logging_dir,
                                 max_checkpoints=10,
                                 metric_name=data.metric_name,
                                 maximize_metric=data.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               data.learning_rate,
                               weight_decay=data.learning_weight_decay)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    # np.load(data.toy_record_file_exp3)
    train_dataset = SQuAD3(training_data, use_v2=True)
    train_loader = torchdata.DataLoader(train_dataset,
                                        batch_size=data.batch_size,
                                        shuffle=True,
                                        num_workers=data.num_workers,
                                        collate_fn=collate_fn)

    test_dataset = SQuAD3(test_data, use_v2=True)
    test_loader = torchdata.DataLoader(test_dataset,
                                       batch_size=data.batch_size,
                                       shuffle=False,
                                       num_workers=data.num_workers,
                                       collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = data.eval_steps
    epoch = step // len(test_dataset)
    while epoch != data.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log.info("cw_idxs length: {}".format(str(len(cw_idxs))))
                log.info("qw_idxs length: {}".format(str(len(qw_idxs))))
                log.info("cw_idxs size: {}".format(str(
                    sys.getsizeof(cw_idxs))))
                log.info("qw_idxs size: {}".format(str(
                    sys.getsizeof(qw_idxs))))
                log.info("cw_idxs shape: {}".format(str(cw_idxs.shape)))
                log.info("qw_idxs shape: {}".format(str(qw_idxs.shape)))

                log_p1, log_p2 = model(cw_idxs, qw_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         data.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('toy/NLL', loss_val, step)
                tbx.add_scalar('toy/LR', optimizer.param_groups[0]['lr'], step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = data.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model,
                                                  test_loader,
                                                  device,
                                                  eval_path=eval_file,
                                                  max_len=sys.maxsize,
                                                  use_squad_v2=True)
                    saver.save(step, model, results[data.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=data.num_visuals)
Esempio n. 9
0
def train(args):
    # Set up logging and devices
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.seed}...")
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(
        word_vectors=word_vectors,
        hidden_size=args.hidden_size,
        drop_prob=args.drop_prob,
        use_glove=args.use_glove,
    )
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = stats.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(
        args.save_dir,
        max_checkpoints=args.max_checkpoints,
        metric_name=args.metric_name,
        maximize_metric=args.maximize_metric,
        log=log,
    )

    # Get optimizer and scheduler
    optimizer = optim.Adam(model.parameters(),
                           args.lr,
                           weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.0)  # Constant LR

    # Get data loader
    log.info("Building dataset...")
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(
        dev_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Train
    log.info("Training...")
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    scaler = amp.GradScaler()

    while epoch != args.num_epochs:
        epoch += 1
        log.info(f"Starting epoch {epoch}...")
        with torch.enable_grad(), tqdm(
                total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                with amp.autocast():
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                # Backward
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar("train/NLL", loss_val, step)
                tbx.add_scalar("train/LR", optimizer.param_groups[0]["lr"],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f"Evaluating at step {step}...")
                    ema.assign(model)
                    results, pred_dict = evaluate(
                        model,
                        dev_loader,
                        device,
                        args.dev_eval_file,
                        args.max_ans_len,
                        args.use_squad_v2,
                    )
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ", ".join(f"{k}: {v:05.2f}"
                                            for k, v in results.items())
                    log.info(f"Dev {results_str}")

                    # Log to TensorBoard
                    log.info("Visualizing in TensorBoard...")
                    for k, v in results.items():
                        tbx.add_scalar(f"dev/{k}", v, step)
                    util.visualize(
                        tbx,
                        pred_dict=pred_dict,
                        eval_path=args.dev_eval_file,
                        step=step,
                        split="dev",
                        num_visuals=args.num_visuals,
                    )
Esempio n. 10
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')

    model = BiDAF(vectors=(word_vectors, char_vectors),
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob,
                  p_sdd=args.p_sdd,
                  char_limit=args.char_limit,
                  use_transformer=args.use_transformer,
                  inter_size=args.inter_size,
                  heads=args.heads,
                  c2w_size=args.c2w_size,
                  enc_blocks=args.enc_blocks,
                  enc_convs=args.enc_convs,
                  mod_blocks=args.mod_blocks,
                  mod_convs=args.mod_convs,
                  use_GRU=args.use_GRU)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path,
                                      args.gpu_ids)  # uses the saved step num
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # optimizer = optim.Adadelta(model.parameters(), args.lr,
    #                            weight_decay=args.l2_wd)

    # The scheduler MULTIPLIES the base LR, NOT replaces
    optimizer = optim.Adam(model.parameters(),
                           1.,
                           betas=(.9, .98),
                           eps=1e-9,
                           weight_decay=args.l2_wd)

    scheduler = sched.LambdaLR(
        optimizer, lambda s: 0.001 * math.log(s + 1) / math.log(1000 - 1)
        if s < 1000 else 0.001)  # Chute (must use math.log, else TypeError)
    # scheduler = sched.LambdaLR(optimizer, lambda s: (args.hidden_size**(-.5)) *
    #                            min((s+1e-9)**(-.5), s*(4000**(-1.5)))
    #                            )  # From Vaswani et. al 2017

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:

                # Setup for forward
                optimizer.zero_grad()
                batch_size = cw_idxs.size(0)

                cc_idxs = cc_idxs.to(device)  # (batch, c_limit, char_limit)
                qc_idxs = qc_idxs.to(device)
                cw_idxs = cw_idxs.to(device)  # (batch, c_limit)
                qw_idxs = qw_idxs.to(device)

                c_idxs, q_idxs = (cw_idxs, cc_idxs), (qw_idxs, qc_idxs)

                # Forward
                log_p1, log_p2 = model(c_idxs, q_idxs)

                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(
                    step // batch_size
                )  # By default, schedules per epoch; pass in step # as "epoch"
                ema(model, step // batch_size)

                # Log info
                step += batch_size  # Number of examples. Step is usually the number of (mini)-batches
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 11
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vec = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    if args.name == 'baseline':
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)
    elif args.name == 'charembeddings':
        model = BiDAFChar(word_vectors=word_vectors,
                          char_vec=char_vec,
                          word_len=16,
                          hidden_size=args.hidden_size,
                          drop_prob=args.drop_prob)
    elif args.name == 'charembeddings2':
        model = BiDAFChar2(word_vectors=word_vectors,
                           char_vec=char_vec,
                           word_len=16,
                           hidden_size=args.hidden_size,
                           drop_prob=args.drop_prob)
    elif args.name == 'qanet':
        model = QANet(word_vectors=word_vectors,
                      char_vec=char_vec,
                      word_len=16,
                      emb_size=args.hidden_size,
                      drop_prob=args.drop_prob,
                      enc_size=args.enc_size,
                      n_head=args.n_head,
                      LN_train=args.ln_train,
                      DP_residual=args.dp_res,
                      mask_pos=args.mask_pos,
                      two_pos=args.two_pos,
                      total_prob=args.total_drop,
                      final_prob=args.final_prob)
    elif args.name == 'qanet2':
        model = QANet2(word_vectors=word_vectors,
                       char_vec=char_vec,
                       word_len=16,
                       emb_size=args.hidden_size,
                       drop_prob=args.drop_prob,
                       enc_size=args.enc_size,
                       n_head=args.n_head,
                       LN_train=args.ln_train,
                       DP_residual=args.dp_res,
                       mask_pos=args.mask_pos,
                       two_pos=args.two_pos,
                       rel=args.rel_att,
                       total_prob=args.total_drop,
                       final_prob=args.final_prob,
                       freeze=args.freeze_emb)
    elif args.name == 'qanet3':
        model = QANet3(word_vectors=word_vectors,
                       char_vec=char_vec,
                       word_len=16,
                       emb_size=args.hidden_size,
                       drop_prob=args.drop_prob,
                       enc_size=args.enc_size,
                       n_head=args.n_head,
                       LN_train=args.ln_train,
                       DP_residual=args.dp_res,
                       mask_pos=args.mask_pos,
                       two_pos=args.two_pos,
                       rel=args.rel_att,
                       total_prob=args.total_drop,
                       final_prob=args.final_prob,
                       freeze=args.freeze_emb)
    elif args.name == 'qanet4':
        model = QANet4(word_vectors=word_vectors,
                       char_vec=char_vec,
                       word_len=16,
                       emb_size=args.hidden_size,
                       drop_prob=args.drop_prob,
                       enc_size=args.enc_size,
                       n_head=args.n_head,
                       LN_train=args.ln_train,
                       DP_residual=args.dp_res,
                       mask_pos=args.mask_pos,
                       two_pos=args.two_pos,
                       rel=args.rel_att,
                       total_prob=args.total_drop,
                       final_prob=args.final_prob,
                       freeze=args.freeze_emb)
    else:
        raise ValueError('Wrong model name')

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler

    if args.name == 'qanet':
        optimizer = optim.Adam(model.parameters(),
                               args.lr,
                               betas=(0.8, 0.999),
                               weight_decay=3 * 1e-7,
                               eps=1e-7)
        scheduler = warmup(optimizer, 1, 2000)
    elif args.opt == 'adam':
        if args.grad_cent:
            optimizer = AdamWGC(model.parameters(),
                                args.lr,
                                betas=(0.9, 0.999),
                                weight_decay=3 * 1e-7,
                                eps=1e-7,
                                use_gc=True)
        else:
            optimizer = AdamW(model.parameters(),
                              args.lr,
                              betas=(0.8, 0.999),
                              weight_decay=3 * 1e-7,
                              eps=1e-7)
        scheduler = warmup(optimizer, 1, 2000)
    elif args.opt == 'adadelta':
        optimizer = optim.Adadelta(model.parameters(),
                                   args.lr,
                                   weight_decay=3 * 1e-7)
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR
    elif args.opt == 'sgd':
        optimizer = optim.SGD(model.parameters(),
                              args.lr,
                              weight_decay=3 * 1e-7)
        scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    i = 0
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()
                i += 1
                loss /= args.acc_step

                # Backward
                loss.backward()
                if i % args.acc_step == 0:
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    optimizer.step()
                    scheduler.step(i // (args.acc_step))
                    ema(model, i // (args.acc_step))
                    optimizer.zero_grad()

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0 and i % args.acc_step == 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 12
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))
    max_len = 10

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    ch_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  ch_vectors=ch_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                #print("ckpt 1")
                ans_lens = y2 - y1
                loss = 0
                for i in range(max_len):
                    mask = ((torch.ones_like(y1) * i) == ans_lens).type(
                        torch.cuda.LongTensor)
                    y = y1 * mask
                    loss += F.nll_loss(log_p[:, :, i], y)
                #print("ckpt 2")
                loss_val = loss.item()
                #print("ckpt 3")
                # Backward
                loss.backward()
                #print("ckpt 4")
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                #print("ckpt 5")
                optimizer.step()
                #print("ckpt 6")
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)
                #print("ckpt 7")

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                if step % (50 * batch_size) == 0:
                    print(loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                #print("ckpt 8")
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 13
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    # To make the data generation of every experiment same
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)
    print(word_vectors.size())
    print(char_vectors.size())

    # Get model
    log.info('Building model...')
    model = BiDAF(word_vectors=word_vectors,
                  char_vectors=char_vectors,
                  hidden_size=args.hidden_size,
                  drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    # (context_idxs(context_len,): Indices of the words in the context.,
    # context_char_idx(context_len, max_word_len): Indices of the characters in the context,
    # question_idxs(question_len,): Indices of the words in the question,
    # question_char_idx(question_len, max_word_len): Indices of the characters in the question,
    # y1:start, -1 if no answer: answer start index,
    # y2:start, -1 if no answer: answer end index,
    # id ID of the example)
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                # cw_idxs: Indices of the words in the context
                # cc_idxs: Indices of the characters in the context
                # qw_idxs: Indices of the words in the query
                # qc_idxs: Indices of the characters in teh query
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                # cw_idx with shape(context_len, )
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                # L(theta) = - 1/N * sum(log(P1_yi_1) + log(P2_yi_2))
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 14
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info('Building model...')
    '''
    model = QANet(word_vectors, args.hidden_size, args.char_embed_size, args.word_from_char_size,
                  args.dropout_main,
                  args.embed_encoder_num_convs, args.embed_encoder_conv_kernel_size,
                  args.embed_encoder_num_heads, args.embed_encoder_num_blocks,
                  args.model_encoder_num_convs, args.model_encoder_conv_kernel_size,
                  args.model_encoder_num_heads, args.model_encoder_num_blocks)
    '''
    char_vectors = util.torch_from_json(args.char_emb_file)
    model = BiDAF(word_vectors=word_vectors,
                          char_vectors=char_vectors,
                          hidden_size=args.hidden_size,
                          drop_prob=args.drop_prob)
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    # Increase LR from 0 to args.lr in num_warmup_steps steps, then keep constant LR
    optimizer = optim.Adam(model.parameters(), args.lr, betas=(0.8, 0.999), eps=1e-07, weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer,
                               lambda s: math.log(1+s)/math.log(args.num_warmup_steps) if s < args.num_warmup_steps else 1) 

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)
                y1, y2 = y1.to(device), y2.to(device)
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}' for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
Esempio n. 15
0
def main(args):
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info('Args: {}'.format(dumps(vars(args), indent=4, sort_keys=True)))
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info('Using random seed {}...'.format(args.seed))
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # NEW : load the tag embeddings
    pos_vectors = util.torch_from_json(args.pos_emb_file)
    ner_vectors = util.torch_from_json(args.ner_emb_file)

    # Add loss
    if 'loss' in args.name:
        distance_criterion = DistanceFromAnswerLoss(coefficient=.5, device=device,
                                                    normalization=True, penalization_type='quadratic',
                                                    reduction='mean')

    # Choose model
    log.info('Building model {}...'.format(args.name))

    if 'baseline' in args.name:
        model = BiDAF(word_vectors=word_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)

    elif args.name == 'BiDAF_char':
        model = BiDAF_char(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)

    elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_loss'):
        model = BiDAF_tag(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      pos_vectors=pos_vectors,
                      ner_vectors=ner_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)

    elif (args.name == 'BiDAF_tag_unfrozen') or (args.name == 'BiDAF_tag_unfrozen_loss'):
        model = BiDAF_tag(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      pos_vectors=pos_vectors,
                      ner_vectors=ner_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob,
                      freeze_tag=False)

    elif args.name == 'BiDAF_tag_ext':
        model = BiDAF_tag_ext(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      pos_vectors=pos_vectors,
                      ner_vectors=ner_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob)

    elif args.name == 'BiDAF_tag_ext_unfrozen':
        model = BiDAF_tag_ext(word_vectors=word_vectors,
                      char_vectors=char_vectors,
                      pos_vectors=pos_vectors,
                      ner_vectors=ner_vectors,
                      hidden_size=args.hidden_size,
                      drop_prob=args.drop_prob,
                      freeze_tag=False)

    elif args.name == 'coattn':
        model = CoattentionModel(hidden_dim=args.hidden_size,
                                embedding_matrix=word_vectors,
                                train_word_embeddings=False,
                                dropout=0.35,
                                pooling_size=16,
                                number_of_iters=4,
                                number_of_layers=2,
                                device=device)

    else:
        raise NameError('No model named ' + args.name)

    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info('Loading checkpoint from {}...'.format(args.load_path))
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0

    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn,
                                   drop_last=True)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn,
                                 drop_last=True)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    while epoch != args.num_epochs:
        epoch += 1
        log.info('Starting epoch {}...'.format(epoch))
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, cpos_idxs, cner_idxs, cw_ems, cw_tfs, qw_idxs, qc_idxs, qpos_idxs, qner_idxs, qw_ems, qw_tfs, y1, y2, ids in train_loader: # NEW
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)

                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                if 'baseline' in args.name:
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)

                elif args.name == 'BiDAF_char':
                    # Additional setup for forward
                    cc_idxs = cc_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs)

                elif (args.name == 'BiDAF_tag') or (args.name == 'BiDAF_tag_unfrozen') or (args.name == 'BiDAF_tag_loss') or (args.name == 'BiDAF_tag_unfrozen_loss'):
                    # Additional setup for forward
                    cc_idxs = cc_idxs.to(device)
                    cpos_idxs = cpos_idxs.to(device)
                    cner_idxs = cner_idxs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    qpos_idxs = qpos_idxs.to(device)
                    qner_idxs = qner_idxs.to(device)

                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs)

                elif (args.name == 'BiDAF_tag_ext') or (args.name == 'BiDAF_tag_ext_unfrozen'):
                    # Additional setup for forward
                    cc_idxs = cc_idxs.to(device)
                    cpos_idxs = cpos_idxs.to(device)
                    cner_idxs = cner_idxs.to(device)
                    cw_ems = cw_ems.to(device)
                    cw_tfs = cw_tfs.to(device)
                    qc_idxs = qc_idxs.to(device)
                    qpos_idxs = qpos_idxs.to(device)
                    qner_idxs = qner_idxs.to(device)
                    qw_ems = qw_ems.to(device)
                    qw_tfs = qw_tfs.to(device)

                    log_p1, log_p2 = model(cw_idxs, qw_idxs, cc_idxs, qc_idxs, cpos_idxs, qpos_idxs, cner_idxs, qner_idxs, cw_ems, qw_ems, cw_tfs, qw_tfs)

                elif args.name == 'coattn':
                    max_c_len = cw_idxs.size(1)
                    max_q_len = qw_idxs.size(1)

                    c_len = []
                    q_len = []

                    for i in range(cw_idxs.size(0)):
                        if len((cw_idxs[i] == 0).nonzero()) != 0:
                            c_len_i = (cw_idxs[i] == 0).nonzero()[0].item()
                        else:
                            c_len_i = cw_idxs.size(1)

                        if len((qw_idxs[i] == 0).nonzero()) != 0:
                            q_len_i = (qw_idxs[i] == 0).nonzero()[0].item()
                        else:
                            q_len_i = qw_idxs.size(1)

                        c_len.append(int(c_len_i))
                        q_len.append(int(q_len_i))

                    c_len = torch.Tensor(c_len).int()
                    q_len = torch.Tensor(q_len).int()

                    num_examples = int(cw_idxs.size(0) / len(args.gpu_ids))

                    log_p1, log_p2 = model(max_c_len, max_q_len, cw_idxs, qw_idxs, c_len, q_len, num_examples, True, True)

                else:
                    raise NameError('No model named ' + args.name)

                y1, y2 = y1.to(device), y2.to(device)

                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)

                # Add distance penalization
                if 'loss' in args.name:
                    loss += distance_criterion(log_p1, y1) + distance_criterion(log_p2, y2)

                loss_val = loss.item()

                # Backward
                loss.backward()
                nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch,
                                         NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info('Evaluating at step {}...'.format(step))
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2,
                                                  args.name,
                                                  args.gpu_ids)
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join('{}: {:05.2f}'.format(k, v)
                                            for k, v in results.items())
                    log.info('Dev {}'.format(results_str))

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar('dev/{}'.format(k), v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)