Beispiel #1
0
def get_args(args):
    # Compute derived args values
    device, args.gpu_ids = util.get_available_devices()

    args.batch_size_per_gpu = args.batch_size
    args.batch_size *= max(1, len(args.gpu_ids))
    return args, device
Beispiel #2
0
def get_args(args):
    # Compute derived args values
    device, args.gpu_ids = util.get_available_devices()

    args.batch_size_per_gpu = args.batch_size
    args.batch_size *= max(1, len(args.gpu_ids))
    args.num_steps = args.epoch_size // args.batch_size // args.gradient_accumulation
    if args.num_epochs >= 0:
        args.num_steps *= args.num_epochs

    if args.decay_forever:
        args.num_steps = float("inf")

    return args, device
Beispiel #3
0
def train(args):
    # Set up logging and devices
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.seed}...")
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(
        word_vectors=word_vectors,
        hidden_size=args.hidden_size,
        drop_prob=args.drop_prob,
        use_glove=args.use_glove,
    )
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = stats.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(
        args.save_dir,
        max_checkpoints=args.max_checkpoints,
        metric_name=args.metric_name,
        maximize_metric=args.maximize_metric,
        log=log,
    )

    # Get optimizer and scheduler
    optimizer = optim.Adam(model.parameters(),
                           args.lr,
                           weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.0)  # Constant LR

    # Get data loader
    log.info("Building dataset...")
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(
        dev_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Train
    log.info("Training...")
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    scaler = amp.GradScaler()

    while epoch != args.num_epochs:
        epoch += 1
        log.info(f"Starting epoch {epoch}...")
        with torch.enable_grad(), tqdm(
                total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()

                # Forward
                with amp.autocast():
                    log_p1, log_p2 = model(cw_idxs, qw_idxs)
                    y1, y2 = y1.to(device), y2.to(device)
                    loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                    loss_val = loss.item()

                # Backward
                scaler.scale(loss).backward()
                scaler.unscale_(optimizer)
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step(step // batch_size)
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar("train/NLL", loss_val, step)
                tbx.add_scalar("train/LR", optimizer.param_groups[0]["lr"],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f"Evaluating at step {step}...")
                    ema.assign(model)
                    results, pred_dict = evaluate(
                        model,
                        dev_loader,
                        device,
                        args.dev_eval_file,
                        args.max_ans_len,
                        args.use_squad_v2,
                    )
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ", ".join(f"{k}: {v:05.2f}"
                                            for k, v in results.items())
                    log.info(f"Dev {results_str}")

                    # Log to TensorBoard
                    log.info("Visualizing in TensorBoard...")
                    for k, v in results.items():
                        tbx.add_scalar(f"dev/{k}", v, step)
                    util.visualize(
                        tbx,
                        pred_dict=pred_dict,
                        eval_path=args.dev_eval_file,
                        step=step,
                        split="dev",
                        num_visuals=args.num_visuals,
                    )
Beispiel #4
0
def test(args):
    # Set up logging
    log = util.get_logger(args.save_dir, args.name)
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # Get model
    log.info("Building model...")
    model = BiDAF(
        word_vectors=word_vectors,
        hidden_size=args.hidden_size,
        use_glove=args.use_glove,
    )
    model = nn.DataParallel(model, gpu_ids)
    log.info(f"Loading checkpoint from {args.load_path}...")
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info("Building dataset...")
    record_file = vars(args)[f"{args.split}_record_file"]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Evaluate
    log.info(f"Evaluating on {args.split} split...")
    nll_meter = stats.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f"{args.split}_eval_file"]
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            # Setup for forward
            cw_idxs = cw_idxs.to(device)
            qw_idxs = qw_idxs.to(device)
            batch_size = cw_idxs.size(0)

            # Forward
            log_p1, log_p2 = model(cw_idxs, qw_idxs)
            y1, y2 = y1.to(device), y2.to(device)
            loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
            nll_meter.update(loss.item(), batch_size)

            # Get F1 and EM scores
            p1, p2 = log_p1.exp(), log_p2.exp()
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != "test":
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
            )
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != "test":
        results = {"NLL": nll_meter.avg}
        results.update(eval.eval_dicts(gold_dict, pred_dict,
                                       args.use_squad_v2))

        # Log to console
        results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
        log.info(f"{args.split.title()} {results_str}")

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(
            tbx,
            pred_dict=pred_dict,
            eval_path=eval_file,
            step=0,
            split=args.split,
            num_visuals=args.num_visuals,
        )

    # Write submission file
    if args.split == "dev":
        sub_path = join(args.save_dir, "val" + "_" + args.sub_file)
    else:
        sub_path = join(args.save_dir, args.split + "_" + args.sub_file)
    log.info(f"Writing submission file to {sub_path}...")
    eval.write_submission(sub_path, sub_dict)
def test(args):
    # Set up logging
    log = util.get_logger(args.save_dir, args.name)
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    device, gpu_ids = util.get_available_devices()
    args.batch_size *= max(1, len(gpu_ids))

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # TODO: Hardcode padding_idx
    padding_idx = 0

    # Get model
    log.info("Building model...")
    model = WordTransformerQA(
        dim=args.dim,
        n_heads=args.n_heads,
        ff_dim=args.ff_dim,
        activation=args.activation,
        dropout=args.dropout,
        attn_dropout=args.attn_dropout,
        act_dropout=args.act_dropout,
        n_layers=args.n_layers,
        max_positions=args.max_positions,
        word_vectors=word_vectors,
    )
    model = nn.DataParallel(model, gpu_ids)
    log.info(f"Loading checkpoint from {args.load_path}...")
    model = util.load_model(model, args.load_path, gpu_ids, return_step=False)
    model = model.to(device)
    model.eval()

    # Get data loader
    log.info("Building dataset...")
    record_file = vars(args)[f"{args.split}_record_file"]
    dataset = SQuAD(record_file, args.use_squad_v2)
    data_loader = data.DataLoader(
        dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Evaluate
    log.info(f"Evaluating on {args.split} split...")
    nll_meter = stats.AverageMeter()
    pred_dict = {}  # Predictions for TensorBoard
    sub_dict = {}  # Predictions for submission
    eval_file = vars(args)[f"{args.split}_eval_file"]
    with open(eval_file, "r") as fh:
        gold_dict = json_load(fh)
    with torch.no_grad(), tqdm(total=len(dataset)) as progress_bar:
        for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in data_loader:
            batch_size = cw_idxs.size(0)
            _, loss_val, scores = forward(cw_idxs, qw_idxs, y1, y2,
                                          padding_idx, args, device, model)
            nll_meter.update(loss_val, batch_size)

            # Get F1 and EM scores
            p1, p2 = model.module.get_prob(scores).split(1, dim=-1)
            p1, p2 = p1.squeeze(-1), p2.squeeze(-1)
            starts, ends = util.discretize(p1, p2, args.max_ans_len,
                                           args.use_squad_v2)

            # Log info
            progress_bar.update(batch_size)
            if args.split != "test":
                # No labels for the test set, so NLL would be invalid
                progress_bar.set_postfix(NLL=nll_meter.avg)

            idx2pred, uuid2pred = util.convert_tokens(
                gold_dict,
                ids.tolist(),
                starts.tolist(),
                ends.tolist(),
                args.use_squad_v2,
            )
            pred_dict.update(idx2pred)
            sub_dict.update(uuid2pred)

    # Log results (except for test set, since it does not come with labels)
    if args.split != "test":
        results = eval.eval_dicts(gold_dict, pred_dict, args.use_squad_v2)
        results_list = [
            ("NLL", nll_meter.avg),
            ("F1", results["F1"]),
            ("EM", results["EM"]),
        ]
        if args.use_squad_v2:
            results_list.append(("AvNA", results["AvNA"]))
        results = OrderedDict(results_list)

        # Log to console
        results_str = ", ".join(f"{k}: {v:05.2f}" for k, v in results.items())
        log.info(f"{args.split.title()} {results_str}")

        # Log to TensorBoard
        tbx = SummaryWriter(args.save_dir)
        util.visualize(
            tbx,
            pred_dict=pred_dict,
            eval_path=eval_file,
            step=0,
            split=args.split,
            num_visuals=args.num_visuals,
        )

    # Write submission file
    if args.split == "dev":
        sub_path = join(args.save_dir, "val" + "_" + args.sub_file)
    else:
        sub_path = join(args.save_dir, args.split + "_" + args.sub_file)
    log.info(f"Writing submission file to {sub_path}...")
    eval.write_submission(sub_path, sub_dict)
def train(args):
    # Set up logging and devices
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f"Args: {dumps(vars(args), indent=4, sort_keys=True)}")
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f"Using random seed {args.seed}...")
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get data loader
    log.info("Building dataset...")
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(
        train_dataset,
        batch_size=args.batch_size,
        shuffle=True,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(
        dev_dataset,
        batch_size=args.batch_size,
        shuffle=False,
        num_workers=args.num_workers,
        collate_fn=collate_fn,
    )

    # Get embeddings
    log.info("Loading embeddings...")
    word_vectors = util.torch_from_json(args.word_emb_file)

    # TODO: Hardcode padding idx
    padding_idx = 0

    # Get model
    log.info("Building model...")
    model = WordTransformerQA(
        dim=args.dim,
        n_heads=args.n_heads,
        ff_dim=args.ff_dim,
        activation=args.activation,
        dropout=args.dropout,
        attn_dropout=args.attn_dropout,
        act_dropout=args.act_dropout,
        n_layers=args.n_layers,
        max_positions=args.max_positions,
        word_vectors=word_vectors,
    )
    model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f"Loading checkpoint from {args.load_path}...")
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)

    log.info(model)
    log.info(
        summary(
            model,
            input_size=(args.max_positions, args.batch_size),
            dtypes=[torch.long],
            device=device,
            depth=5,
            verbose=0,
        ))
    model.train()

    # Get saver
    saver = util.CheckpointSaver(
        args.save_dir,
        max_checkpoints=args.max_checkpoints,
        metric_name=args.metric_name,
        maximize_metric=args.maximize_metric,
        log=log,
    )

    # Get optimizer and scheduler
    optimizer = optim.AdamW(
        model.parameters(),
        args.lr,
        betas=(args.beta_1, args.beta_2),
        eps=args.eps,
        weight_decay=args.l2_wd,
    )
    if args.num_epochs < 0:
        max_num_steps = (len(train_loader) +
                         args.num_workers) // args.gradient_accumulation
    else:
        max_num_steps = (args.num_epochs *
                         (len(train_loader) + args.num_workers) //
                         args.gradient_accumulation)

    warmup_steps = int(max_num_steps * args.warmup_ratio)
    if args.decay_forever:
        max_num_steps = -1

    scheduler = sched.LinearWarmupPowerDecay(
        optimizer,
        args.start_lr,
        args.lr,
        args.end_lr,
        warmup_steps,
        max_num_steps,
        power=args.power_decay,
    )

    # Train
    log.info("Training...")
    steps_till_eval = args.eval_steps
    epoch = step // len(train_dataset)
    scaler = amp.GradScaler()
    fwd_step = 0

    while epoch != args.num_epochs:
        epoch += 1
        log.info(f"Starting epoch {epoch}...")
        with torch.enable_grad(), tqdm(
                total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                batch_size = cw_idxs.size(0)
                loss, loss_val, _ = forward(cw_idxs, qw_idxs, y1, y2,
                                            padding_idx, args, device, model)
                loss = loss / args.gradient_accumulation

                # Backward
                scaler.scale(loss).backward()
                if (fwd_step + 1) % args.gradient_accumulation == 0:
                    scaler.unscale_(optimizer)
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)
                    scaler.step(optimizer)
                    scaler.update()
                    scheduler.step()
                    optimizer.zero_grad()

                # Log info
                fwd_step += 1
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar("train/NLL", loss_val, step)
                tbx.add_scalar("train/LR", optimizer.param_groups[0]["lr"],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f"Evaluating at step {step}...")
                    results, pred_dict = evaluate(
                        model,
                        dev_loader,
                        device,
                        args.dev_eval_file,
                        args.max_ans_len,
                        args.use_squad_v2,
                        args,
                        padding_idx,
                    )
                    saver.save(step, model, results[args.metric_name], device)

                    # Log to console
                    results_str = ", ".join(f"{k}: {v:05.2f}"
                                            for k, v in results.items())
                    log.info(f"Dev {results_str}")

                    # Log to TensorBoard
                    log.info("Visualizing in TensorBoard...")
                    for k, v in results.items():
                        tbx.add_scalar(f"dev/{k}", v, step)
                    util.visualize(
                        tbx,
                        pred_dict=pred_dict,
                        eval_path=args.dev_eval_file,
                        step=step,
                        split="dev",
                        num_visuals=args.num_visuals,
                    )