def train(args, train_dataset, dev_dataset, model, tokenizer):
    """ Train the model """

    tb_writer = SummaryWriter(os.path.join(args.output_dir, 'TB_writer'))

    if args.dynamic_batching:
        train_sampler = CustomBatchSampler(train_dataset,
                                           args.train_batch_size)
        train_dataloader = DataLoader(train_dataset,
                                      batch_sampler=train_sampler,
                                      num_workers=1,
                                      collate_fn=dynamic_padding_collate_fn)
    else:
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      num_workers=1)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args.fp16_opt_level)
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Batch size = %d", args.train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    model.train()
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch")

    # Added here for reproducibility
    set_seed(args)

    loss_cum = None
    # torch.autograd.set_detect_anomaly(True)
    for _ in train_iterator:

        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              smoothing=0.05)
        for step, batch_cpu in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            batch = tuple(t.to(args.device) for t in batch_cpu)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[2].squeeze(-1),
                "end_positions": batch[3].squeeze(-1),
                "max_ans_length": args.max_ans_length,
            }

            outputs = model(**inputs)
            loss = outputs[0]

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                    if loss_cum is None:
                        loss_cum = loss.detach()
                    else:
                        loss_cum += loss.detach()

            else:
                loss.backward()
                if loss_cum is None:
                    loss_cum = loss.detach()
                else:
                    loss_cum += loss.detach()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log train metrics
                if args.train_logging_steps > 0 and global_step % args.train_logging_steps == 0:
                    tb_writer.add_scalar(
                        'train_loss',
                        loss_cum.item() / args.train_logging_steps,
                        global_step)

                    loss_cum = None
                # Log dev metrics
                if args.dev_logging_steps > 0 and global_step % args.dev_logging_steps == 0 and args.evaluate_during_training:
                    dev_loss = evaluate(args, dev_dataset, model)
                    tb_writer.add_scalar("dev_loss", dev_loss, global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)

                # Save model checkpoint
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    tb_writer.close()
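The first example relies on two helpers that are not part of this listing, CustomBatchSampler and dynamic_padding_collate_fn. Below is a minimal sketch of what the collate function might look like, assuming each dataset item is a tuple of 1-D tensors (input_ids, attention_mask, start_position, end_position) and a pad token id of 0; the actual implementation may differ.

import torch
from torch.nn.utils.rnn import pad_sequence


def dynamic_padding_collate_fn(batch):
    # Pad every field only up to the longest sequence in this batch,
    # rather than to a fixed max_seq_length.
    input_ids, attention_mask, start_positions, end_positions = zip(*batch)
    return (
        pad_sequence(input_ids, batch_first=True, padding_value=0),
        pad_sequence(attention_mask, batch_first=True, padding_value=0),
        torch.stack(start_positions),
        torch.stack(end_positions),
    )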
Example #2
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Train!
    print("***** Running training *****")
    print("  Num examples = %d", len(train_dataset))
    print("  Num Epochs = %d", args.num_train_epochs)
    print("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    print(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    print("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    print("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            print("  Continuing training from checkpoint, will skip to saved global_step")
            print("  Continuing training from epoch %d", epochs_trained)
            print("  Continuing training from global step %d", global_step)
            print("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            print("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        training_pbar = tqdm(total=len(train_dataset),
                         position=0, leave=True,
                         file=sys.stdout, bar_format="{l_bar}%s{bar}%s{r_bar}" % (Fore.GREEN, Fore.RESET))
        for step, batch in enumerate(train_dataloader):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            training_pbar.update(batch[0].size(0)) # hiepnh
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    print("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    print("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                training_pbar.close() # hiepnh
                break
        training_pbar.close() # hiepnh
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
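Most of these examples call set_seed(args) before the training loop without showing it. A minimal sketch of such a helper, assuming args exposes seed and n_gpu, would be:

import random

import numpy as np
import torch


def set_seed(args):
    # Seed Python, NumPy and PyTorch (CPU and all visible GPUs) for reproducibility.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)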
Example #3
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if (args.model_name_or_path and os.path.isfile(
            os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt"))):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
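Example #3 calls mask_tokens(batch, tokenizer, args) when args.mlm is set, but the helper is not included in the listing. The sketch below roughly follows the usual masked-language-modeling recipe (mask a fraction of tokens, of which 80% become the mask token, 10% a random token, and 10% stay unchanged); args.mlm_probability is assumed to exist, and the exact reference implementation may differ.

import torch


def mask_tokens(inputs, tokenizer, args):
    labels = inputs.clone()
    # Sample tokens to mask, never masking special tokens.
    probability_matrix = torch.full(labels.shape, args.mlm_probability)
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(
        torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # only compute loss on masked tokens

    # 80% of the time, replace the masked input token with the mask token.
    indices_replaced = torch.bernoulli(
        torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    # 10% of the time, replace with a random token; the remaining 10% stay unchanged.
    indices_random = (torch.bernoulli(torch.full(labels.shape, 0.5)).bool()
                      & masked_indices & ~indices_replaced)
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels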
Example #4
def train(args, train_dataset, model, tokenizer, teacher=None):
    """Train the model"""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            if teacher is not None:
                teacher.eval()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }
            if args.model_type != "distilbert":
                inputs[
                    "token_type_ids"] = None if args.model_type == "xlm" else batch[
                        2]
            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
            outputs = model(**inputs)
            loss, start_logits_stu, end_logits_stu = outputs

            # Distillation loss
            if teacher is not None:
                if "token_type_ids" not in inputs:
                    inputs[
                        "token_type_ids"] = None if args.teacher_type == "xlm" else batch[
                            2]
                with torch.no_grad():
                    start_logits_tea, end_logits_tea = teacher(
                        input_ids=inputs["input_ids"],
                        token_type_ids=inputs["token_type_ids"],
                        attention_mask=inputs["attention_mask"],
                    )
                assert start_logits_tea.size() == start_logits_stu.size()
                assert end_logits_tea.size() == end_logits_stu.size()

                loss_fct = nn.KLDivLoss(reduction="batchmean")
                loss_start = loss_fct(
                    nn.functional.log_softmax(
                        start_logits_stu / args.temperature, dim=-1),
                    nn.functional.softmax(start_logits_tea / args.temperature,
                                          dim=-1),
                ) * (args.temperature**2)
                loss_end = loss_fct(
                    nn.functional.log_softmax(
                        end_logits_stu / args.temperature, dim=-1),
                    nn.functional.softmax(end_logits_tea / args.temperature,
                                          dim=-1),
                ) * (args.temperature**2)
                loss_ce = (loss_start + loss_end) / 2.0

                loss = args.alpha_ce * loss_ce + args.alpha_squad * loss

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                             args.max_grad_norm)
                else:
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
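Example #4 adds knowledge distillation to the question-answering loop: when a teacher model is supplied, the student's start/end logits are pulled toward the teacher's temperature-softened logits via a KL term weighted by args.alpha_ce, with args.alpha_squad weighting the original loss. A hypothetical call site might look like the following sketch; the teacher_name_or_path argument is an assumption and not part of the listing.

from transformers import AutoModelForQuestionAnswering

teacher = None
if args.teacher_name_or_path is not None:
    # Load a (typically larger) fine-tuned QA model to act as the teacher.
    teacher = AutoModelForQuestionAnswering.from_pretrained(args.teacher_name_or_path)
    teacher.to(args.device)

global_step, tr_loss = train(args, train_dataset, model, tokenizer, teacher=teacher)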
Example #5
    def train(
        self,
        model,
        train_dataloader,
        dev_dataloader,
        dev_dataset,
        device,
        n_gpu,
        eval_fn,
        output_dir,
        save_optimizer,
        eval_params,
        bert_model,
    ):
        results = {}
        best_score = 0.0
        t_total = (len(train_dataloader) // self.gradient_accumulation_steps *
                   self.num_train_epochs)

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                self.weight_decay,
            },
            {
                "params": [
                    p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay":
                0.0,
            },
        ]

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.learning_rate,
            eps=self.adam_epsilon,
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=t_total,
        )

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0

        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(
            epochs_trained,
            int(self.num_train_epochs),
            desc="Epoch",
        )
        for epoch in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                model.train()
                batch = tuple(t.to(device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "position_ids": batch[1],
                    "token_type_ids": batch[2],
                    "bbox": batch[3],
                    "labels": batch[4],
                }

                outputs = model(**inputs)
                loss = outputs[
                    0]  # model outputs are always tuple in transformers (see doc)

                if n_gpu > 1:
                    loss = (
                        loss.mean()
                    )  # mean() to average on multi-gpu parallel training
                if self.gradient_accumulation_steps > 1:
                    loss = loss / self.gradient_accumulation_steps

                loss.backward()

                tr_loss += loss.item()
                if (step + 1) % self.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   self.max_grad_norm)

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if self.logging_steps > 0 and global_step % self.logging_steps == 0:
                        loss_scalar = (tr_loss -
                                       logging_loss) / self.logging_steps
                        learning_rate_scalar = scheduler.get_lr()[0]
                        epoch_iterator.set_description(
                            f"Loss :{loss_scalar} LR: {learning_rate_scalar}")
                        logging_loss = tr_loss
            score = self.eval(
                model,
                dev_dataloader,
                dev_dataset,
                device,
                n_gpu,
                eval_fn,
                eval_params,
                mode="dev",
                bert_model=bert_model,
            )
            results[epoch] = score
            with torch.no_grad():
                if score >= best_score:
                    logger.info(f"Storing the new model with score: {score}")
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)

                    torch.save(self.args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info(f"Saving model checkpoint to {output_dir}")
                    if save_optimizer:
                        torch.save(
                            optimizer.state_dict(),
                            os.path.join(output_dir, "optimizer.pt"),
                        )
                        torch.save(
                            scheduler.state_dict(),
                            os.path.join(output_dir, "scheduler.pt"),
                        )
                        logger.info(
                            "Saving optimizer and scheduler states to %s",
                            output_dir)
                    best_score = score

        return results
Example #6
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(t_total * args.warmup_proportion),
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Train batch size per GPU = %d", args.train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    mb = master_bar(range(int(args.num_train_epochs)))
    # Added here for reproducibility
    set_seed(args)

    for epoch in mb:
        epoch_iterator = progress_bar(train_dataloader, parent=mb)
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "distilkobert",
                    "xlm-roberta"
            ]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.evaluate_during_training:
                        results = evaluate(args,
                                           model,
                                           tokenizer,
                                           global_step=global_step)
                        for key in sorted(results.keys()):
                            logger.info("  %s = %s", key, str(results[key]))

                    logging_loss = tr_loss

                # Save model checkpoint
                if args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    if args.save_optimizer:
                        torch.save(optimizer.state_dict(),
                                   os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(),
                                   os.path.join(output_dir, "scheduler.pt"))
                        logger.info(
                            "Saving optimizer and scheduler states to %s",
                            output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                break

        mb.write("Epoch {} done".format(epoch + 1))

        if args.max_steps > 0 and global_step > args.max_steps:
            break

    return global_step, tr_loss / global_step
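Example #6 uses fastprogress instead of tqdm for its progress display. Assuming the standard fastprogress package, the corresponding imports would be:

from fastprogress.fastprogress import master_bar, progress_bar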
Example #7
class NeuralRstParserCoref(object):
    def __init__(self, clf, coref_trainer, data_helper, config):

        self.config = config
        self.data_helper = data_helper

        self.clf = clf
        self.coref_trainer = coref_trainer
        if self.config[MODEL_TYPE] in [2, 3]:
            self.clf.bert = self.coref_trainer.model.encoder.bert
        self.loss = CrossEntropyLoss(reduction='mean').to(config[DEVICE])
        self.optim = None

    def get_optim_scheduler(self, train_loader):
        no_decay = ['bias', 'LayerNorm.weight']
        self.optim = AdamW(
            params=[
                {
                    'params': [
                        p for n, p in self.clf.bert.named_parameters()
                        if not any(nd in n for nd in no_decay)
                    ],
                    'lr':
                    1e-05
                },  # Bert params outside no_decay
                {
                    'params': [
                        p for n, p in self.clf.bert.named_parameters()
                        if any(nd in n for nd in no_decay)
                    ],
                    'weight_decay':
                    0.0,
                    'lr':
                    1e-05
                },  # Bert params in no_decay
                {
                    'params': [
                        p for n, p in
                        self.clf.named_parameters()  # Clf outside no_decay
                        if ("bert" not in n and not any(nd in n
                                                        for nd in no_decay))
                    ]
                },
                {
                    'params': [
                        p for n, p in self.clf.named_parameters()
                        if ("bert" not in n and any(nd in n
                                                    for nd in no_decay))
                    ],
                    'weight_decay':
                    0.0
                },  # Clf params in no_decay
            ],
            lr=0.0002,
            weight_decay=0.01)

        self.num_batches = 34685 / self.clf.config[BATCH_SIZE]
        train_steps = int(20 * self.num_batches)
        if self.config[MODEL_TYPE] > 1:
            self.task_p = self.num_batches / (
                self.num_batches + len(self.coref_trainer.train_corpus))

        self.scheduler = get_linear_schedule_with_warmup(
            self.optim,
            num_warmup_steps=int(train_steps * 0.1),
            num_training_steps=train_steps)

    def train_classifier(self, train_loader):

        # Initialize optimizer and scheduler
        self.get_optim_scheduler(train_loader)

        if os.path.isfile("../data/model/" + self.config[MODEL_NAME]):
            epoch_start = self.load("../data/model/" + self.config[MODEL_NAME])
        else:
            epoch_start = 0

        for epoch in range(epoch_start + 1, 21):
            cost_acc = 0
            self.clf.train()
            if self.config[MODEL_TYPE] > 1:
                self.coref_trainer.model.train()
            print("============ epoch: ", epoch, " ============")
            for i, data in tqdm(enumerate(train_loader)):
                # While the Bernoulli draw is 0, train on a random batch from the coref corpus first
                while self.config[MODEL_TYPE] > 1 and binomial(
                        1, self.task_p) == 0:
                    cost_acc += self.coref_trainer.train_epoch(i, 1)
                cost_acc += self.train_sample_rst(data)
            print("Total cost for epoch %d is %f" % (epoch, cost_acc))
            print("============ Evaluating on the dev set ============")
            self.save(self.config[MODEL_NAME], epoch)
            self.evaluate()

    def train_sample_rst(self, sample):
        docs, batched_clusters, action_feats, neural_feats, all_actions, all_relations, rel_mask = sample

        self.optim.zero_grad()

        # Forward pass
        if self.clf.config[MODEL_TYPE] in [0, 3]:
            span_embeds = self.clf.get_edus_bert_coref(docs,
                                                       [None] * len(docs),
                                                       neural_feats)
        elif self.clf.config[MODEL_TYPE] in [1, 2]:
            span_embeds = self.clf.get_edus_bert_coref(docs, batched_clusters,
                                                       neural_feats)

        # Compute action loss
        action_probs, rel_probs = self.clf.decode_action_coref(
            span_embeds, action_feats)
        cost = self.loss(action_probs.to(self.config[DEVICE]),
                         all_actions.to(self.config[DEVICE]))

        # Compute relation loss
        rel_probs, rel_labels = rel_probs[rel_mask], all_relations[rel_mask]
        if rel_labels.shape[0] > 0:
            cost += self.loss(rel_probs.to(self.config[DEVICE]),
                              rel_labels.to(self.config[DEVICE]))

        # Update the model
        cost.backward()
        nn.utils.clip_grad_norm_(self.clf.parameters(), 1.0)
        self.optim.step()
        self.scheduler.step()

        return cost.item()

    def sr_parse(self, doc, gold_actions, gold_rels):

        # Generate coref clusters for the document
        if self.clf.config[MODEL_TYPE] in [1, 2]:
            with torch.no_grad():
                clusters, _ = self.coref_trainer.predict_clusters(doc)
        else:
            clusters = None

        # Stack/Queue state
        conf = ParsingState([], [], self.clf.config)
        conf.init(doc)
        all_action_probs, all_rel_probs = [], []
        # Until the tree is built
        while not conf.end_parsing():

            # Get features for the current stack/queue state, and span boundaries
            stack, queue = conf.get_status()
            fg = ActionFeatureGenerator(stack, queue, [], doc,
                                        self.data_helper, self.config)
            action_feat, span_boundary = fg.gen_features()
            span_embeds = self.clf.get_edus_bert_coref([doc], [clusters],
                                                       [span_boundary])
            action_probs, rel_probs = self.clf.decode_action_coref(
                span_embeds, [action_feat])
            all_action_probs.append(action_probs.squeeze())
            sorted_action_idx = torch.argsort(action_probs, descending=True)
            sorted_rel_idx = torch.argsort(rel_probs, descending=True)

            # Select Shift/Reduce action (shift/reduce-nn/...)
            action_idx = 0
            pred_action, pred_nuc = xidx_action_map[int(
                sorted_action_idx[0, action_idx])]
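            # Fall back to the next-highest-scoring action until a legal one is found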
            while not conf.is_action_allowed(
                (pred_action, pred_nuc, None), doc):
                action_idx += 1
                pred_action, pred_nuc = xidx_action_map[int(
                    sorted_action_idx[0, action_idx])]

            # Select Relation annotation
            pred_rel = None
            if pred_action != "Shift":
                all_rel_probs.append(rel_probs.squeeze())
                pred_rel_idx = int(sorted_rel_idx[0, 0])
                pred_rel = xidx_relation_map[pred_rel_idx]
            #assert not (pred_action == "Reduce" and pred_rel is None)
            if (pred_action == "Reduce" and pred_rel is None):
                print(
                    "Warning: got a Reduce with a None relation. Replacing with Elaboration"
                )
                pred_rel = "Elaboration"

            predictions = (pred_action, pred_nuc, pred_rel)
            conf.operate(predictions)

        # Shift/Reduce loss
        cost = self.loss(torch.stack(all_action_probs), gold_actions)

        # Relation annotation loss
        if all_rel_probs != []:
            cost_relation = self.loss(torch.stack(all_rel_probs), gold_rels)
            cost += cost_relation

        tree = conf.get_parse_tree()
        rst_tree = RstTree()
        rst_tree.assign_tree(tree)
        rst_tree.assign_doc(doc)
        rst_tree.back_prop(tree, doc)

        return rst_tree, cost.item()

    def evaluate(self):
        self.clf.eval()
        if self.config[MODEL_TYPE] > 1:
            self.coref_trainer.model.eval()
        with torch.no_grad():
            evaluator = Evaluator(self, self.data_helper, self.config)
            evaluator.eval_parser(self.data_helper.val_trees)

    def save(self, model_name, epoch):
        """Save models
        """
        save_dict = {
            'epoch': epoch,
            'model_state_dict': self.clf.state_dict(),
            'optimizer_state_dict': self.optim.state_dict(),
            'scheduler_state_dict': self.scheduler.state_dict()
        }
        if self.clf.config[MODEL_TYPE] in [2, 3]:
            save_dict.update({
                'coref_state_dict':
                self.coref_trainer.model.state_dict(),
                'coref_optimizer_state_dict':
                self.coref_trainer.optimizer.state_dict(),
                'coref_scheduler_state_dict':
                self.coref_trainer.scheduler.state_dict(),
            })

        torch.save(save_dict, os.path.join("../data/model/", model_name))

    def load(self, model_dir):
        """ Load models
        """
        model_save = torch.load(model_dir)
        cleanup_load_dict(model_save)
        self.clf.load_state_dict(model_save['model_state_dict'])
        self.clf.eval()
        if self.optim is not None:
            self.optim.load_state_dict(model_save['optimizer_state_dict'])
            self.scheduler.load_state_dict(model_save['scheduler_state_dict'])

        if self.config[MODEL_TYPE] in [2, 3]:
            self.coref_trainer.model.load_state_dict(
                model_save['coref_state_dict'])
            self.coref_trainer.model.eval()
            self.coref_trainer.optimizer.load_state_dict(
                model_save['coref_optimizer_state_dict'])
            self.coref_trainer.scheduler.load_state_dict(
                model_save['coref_scheduler_state_dict'])
            self.clf.bert = self.coref_trainer.model.encoder.bert
        return model_save['epoch']
Example #8
def _train(task, logger, tb_writer, model, tokenizer, dataset, max_steps,
           num_train_epochs, gradient_accumulation_steps, weight_decay,
           learning_rate, adam_epsilon, max_grad_norm, warmup_steps, fp16,
           fp16_opt_level, n_gpu, local_rank, evaluate_during_training,
           evaluate_func, per_gpu_train_batch_size, device, output_dir,
           model_type, model_name_or_path, configs, seed, logging_steps,
           save_steps, **kwargs):
    """
      The basic training process function
    """
    train_batch_size = per_gpu_train_batch_size * max(1, n_gpu)
    train_sampler = RandomSampler(
        dataset) if local_rank == -1 else DistributedSampler(dataset)
    train_dataloader = DataLoader(
        dataset, sampler=train_sampler,
        batch_size=train_batch_size)  # could simply pass shuffle=True, which uses a RandomSampler by default

    if max_steps > 0:  # determine the total number of training steps
        t_total = max_steps
        num_train_epochs = max_steps // (len(train_dataloader) //
                                         gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader) // gradient_accumulation_steps * num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            float(weight_decay),  # the default weight decay is 0
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=float(learning_rate),
                      eps=float(adam_epsilon))
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=t_total)

    if os.path.isfile(os.path.join(
            model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(model_name_or_path, "scheduler.pt")))

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[local_rank],
            output_device=local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(dataset))
    logger.info("  Num Epochs = %d", num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        train_batch_size * gradient_accumulation_steps *
        (torch.distributed.get_world_size() if local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        if re.match(r"checkpoint-\d+", model_name_or_path):
            global_step = int(model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(num_train_epochs),
                            desc="Epoch",
                            disable=local_rank not in [-1, 0])

    set_seed(seed, n_gpu)  # Added here for reproducibility
    for _ in train_iterator:  # epoch
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):  # iterate over batches

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(device) for t in batch)
            # decide inputs based on the task type
            inputs = _decide_inputs(task, batch, model_type)
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in pytorch-transformers (see doc)
            if n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if gradient_accumulation_steps > 1:
                loss = loss / gradient_accumulation_steps

            if fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
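            # Only update the weights once every gradient_accumulation_steps mini-batches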
            if (step + 1) % gradient_accumulation_steps == 0:
                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if local_rank in [
                        -1, 0
                ] and logging_steps > 0 and global_step % logging_steps == 0:
                    # Log metrics
                    if (
                            local_rank == -1 and evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate_func(mode="dev",
                                                   model=model,
                                                   tokenizer=tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         logging_steps, global_step)
                    logging_loss = tr_loss

                if local_rank in [
                        -1, 0
                ] and save_steps > 0 and global_step % save_steps == 0:
                    # Save model checkpoint
                    outputdir = os.path.join(
                        output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(outputdir):
                        os.makedirs(outputdir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(
                        outputdir)  # TODO: check save_pretrained method
                    tokenizer.save_pretrained(outputdir)

                    torch.save(configs,
                               os.path.join(outputdir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", outputdir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(outputdir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(outputdir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                outputdir)

            if max_steps > 0 and global_step > max_steps:
                epoch_iterator.close()
                break
        if max_steps > 0 and global_step > max_steps:
            train_iterator.close()
            break

    if local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #9
def main():
    # argument parsing
    parser = argparse.ArgumentParser()

    parser.add_argument('--max-epochs', type=int, default=2)
    parser.add_argument('--batch-size', type=int, default=4)
    parser.add_argument('--max-sequence-length', type=int, default=128)

    parser.add_argument('--seed', type=int, default=None)
    parser.add_argument('--data-dir', type=str, default='data')
    parser.add_argument('--real-dataset', type=str, default='webtext')
    parser.add_argument('--fake-dataset', type=str, default='xl-1542M-nucleus')
    parser.add_argument('--save-dir', type=str, default='bert_logs')

    parser.add_argument('--learning-rate', type=float, default=2e-5)
    parser.add_argument('--weight-decay', type=float, default=0)
    parser.add_argument('--model-name', type=str, default='bert-base-cased')
    parser.add_argument('--wandb', type=bool, default=True)

    args = parser.parse_args()
    if args.wandb:
        wandb.init(project=args.model_name)

    device = "cuda" if torch.cuda.is_available() else "cpu"

    # config, tokenizer, model
    config = AutoConfig.from_pretrained(
        args.model_name,
        num_labels=2
    )

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    tokenization_utils.logger.setLevel('DEBUG')

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name,
        config=config
    )
    model.to(device)

    # load data
    train_loader, validation_loader, test_loader = load_datasets(args, tokenizer)

    # my model
    optimizer = AdamW(model.parameters(), lr=args.learning_rate, weight_decay=args.weight_decay)

    best_val = 0.
    for epoch in range(args.max_epochs):
        train(model, optimizer, train_loader, args, device)
        val_acc = validation(model, validation_loader, args, device)
        test_acc = test(model, test_loader, args, device)

        print(f"Epoch {epoch + 1} | val_acc: {val_acc} test_acc: {test_acc}")

        if val_acc > best_val:
            os.makedirs(args.save_dir, exist_ok=True)
            model_name = 'baseline_' + args.model_name + '.pt'
            model_to_save = model.module if hasattr(model, 'module') else model
            torch.save(dict(
                epoch=epoch+1,
                model_state_dict=model_to_save.state_dict(),
                optimizer_state_dict=optimizer.state_dict(),
                args=args
            ),
                os.path.join(args.save_dir, model_name)
            )
            print("Model saved to", args.save_dir)
            best_val = val_acc
Example #10
        ins_accum += BATCH_SIZE

        if niter % 1000 == 0:
            print("experiment on val...")
            model.eval()
            val_loss, val_acc = evaluate(model, val_dataset, val_labels)
            model.train()
            train_log['Val/Loss'].append((epoch, niter, val_loss))
            train_log['Val/Acc'].append((epoch, niter, val_acc))
            print(train_log['Val/Loss'][-1])
            print(train_log['Val/Acc'][-1])
            if val_acc > best_val_acc:
                best_val_acc = val_acc
                state = {
                    'net': model.state_dict(),
                    'optimizer': optim.state_dict(),
                    'epoch': epoch
                }
                train_log['Save'].append((epoch, niter, best_val_acc))
                print('saving best model at epoch {} iter {}'.format(
                    epoch, niter))
                torch.save(state, 'state_bert_base_best.pth')

        if niter % 100 == 0:
            # print(niter, input_ids, outputs, labels)
            train_log['Train/Loss'].append(
                (epoch, niter, loss_accum / ins_accum))
            train_log['Train/Acc'].append(
                (epoch, niter, correct_accum / ins_accum))
            print(train_log['Train/Loss'][-1])
            print(train_log['Train/Acc'][-1])
Example #11
def main():

    print('Start')
    parser = argparse.ArgumentParser()

    # Add the arguments to the parser
    parser.add_argument("--model_name", required=True)
    parser.add_argument("--checkpoint_input_path", required=False)
    parser.add_argument("--checkpoint_output_path", required=True)
    parser.add_argument("--bioasq_path", required=True)
    parser.add_argument("--seed", default=1995)
    parser.add_argument("--learning_rate", default=5e-5, type=float)
    parser.add_argument("--batch_size", default=16, type=int)
    parser.add_argument("--epochs", default=3, type=int)

    args = vars(parser.parse_args())

    random.seed(args['seed'])

    with open(args['bioasq_path'], 'rb') as f:
        bio_list_raw = json.load(f)['questions']

    bio_list_raw = [
        question for question in bio_list_raw if question['type'] == 'list'
    ]
    bio_list_questions = [question['body'] for question in bio_list_raw]
    bio_list_ids = [question['id'] for question in bio_list_raw]
    bio_list_answers = [question['exact_answer'] for question in bio_list_raw]
    bio_snippets = {
        question['id']: [snippet['text'] for snippet in question['snippets']]
        for question in bio_list_raw
    }
    print(f'Number of questions: {len(bio_list_questions)}')

    ids = []
    snippets = []
    for key, value in bio_snippets.items():
        for snippet in value:
            ids.append(key)
            snippets.append(snippet)

    snippets_df = pd.DataFrame({'id': ids, 'snippet': snippets})
    questions_df = pd.DataFrame({
        'id': bio_list_ids,
        'question': bio_list_questions,
        'label': bio_list_answers
    })
    val_df = pd.merge(snippets_df, questions_df, how='left', on='id')

    ids = []
    labels = []
    snippets = []
    questions = []
    for index, row in val_df.iterrows():
        ids += [row['id'] + f'_{i}' for i in range(len(row['label']))]
        labels += [row['label'][i][0] for i in range(len(row['label']))]
        snippets += [row['snippet'] for i in range(len(row['label']))]
        questions += [row['question'] for i in range(len(row['label']))]
    list_df = pd.DataFrame({
        'id': ids,
        'question': questions,
        'snippet': snippets,
        'label': labels
    })

    list_df = list_df.sample(16)

    def get_start_answer(row):
        label = row['label'].lower()
        context = row['snippet'].lower()
        if label in context:
            return context.index(label)
        return None

    list_df['answer_start'] = list_df.apply(get_start_answer, axis=1)
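    # Rows whose gold answer string does not occur in the snippet get answer_start=None
    # and are filtered out below.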

    clean_df = list_df[~list_df.answer_start.isnull()]

    bio_list_questions = list(clean_df.question)
    bio_list_contexts = list(clean_df.snippet)
    bio_list_answers = [{
        'text': row['label'],
        'answer_start': int(row['answer_start'])
    } for index, row in clean_df.iterrows()]

    from transformers import BertTokenizerFast
    tokenizer_fast = BertTokenizerFast.from_pretrained(
        args['model_name'],
        do_lower_case=True,
        padding=True,
        truncation=True,
        add_special_tokens=True,
        model_max_length=1000000000)

    # In[26]:

    from squad_processing import add_end_idx, add_token_positions

    add_end_idx(bio_list_answers, bio_list_contexts)

    # In[27]:

    list_encodings = tokenizer_fast(bio_list_contexts,
                                    bio_list_questions,
                                    add_special_tokens=True,
                                    truncation=True,
                                    padding=True,
                                    max_length=500)

    # In[29]:

    add_token_positions(list_encodings, bio_list_answers, tokenizer_fast)

    # In[30]:

    from torch.utils.data import Dataset

    class SquadDataset(Dataset):
        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            #print(self.encodings['start_positions'][idx])
            #{key: torch.tensor(val[idx], dtype = torch.long) for key, val in self.encodings.items()}
            return {
                'input_ids':
                torch.tensor(self.encodings['input_ids'][idx],
                             dtype=torch.long),
                'attention_mask':
                torch.tensor(self.encodings['attention_mask'][idx],
                             dtype=torch.long),
                'start_positions':
                torch.tensor(self.encodings['start_positions'][idx],
                             dtype=torch.long),
                'end_positions':
                torch.tensor(self.encodings['end_positions'][idx],
                             dtype=torch.long)
            }

        def __len__(self):
            return len(self.encodings.input_ids)

    # In[32]:

    train_bio_list = SquadDataset(list_encodings)

    # In[46]:

    from transformers import BertPreTrainedModel, BertModel
    from torch import nn
    from torch.utils.data import DataLoader
    from transformers import AdamW
    from transformers.modeling_outputs import QuestionAnsweringModelOutput
    import torch
    from torch.nn import CrossEntropyLoss

    # In[47]:

    class BertForQuestionAnswering(BertPreTrainedModel):

        _keys_to_ignore_on_load_unexpected = [r"pooler"]

        def __init__(self, config):
            super().__init__(config)
            self.num_labels = config.num_labels

            self.bert = BertModel(config, add_pooling_layer=False)
            self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)

            self.init_weights()

        def forward(
            self,
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            position_ids=None,
            head_mask=None,
            inputs_embeds=None,
            start_positions=None,
            end_positions=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None,
        ):
            r"""
	        start_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
	            Labels for position (index) of the start of the labelled span for computing the token classification loss.
	            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
	            sequence are not taken into account for computing the loss.
	        end_positions (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`):
	            Labels for position (index) of the end of the labelled span for computing the token classification loss.
	            Positions are clamped to the length of the sequence (:obj:`sequence_length`). Position outside of the
	            sequence are not taken into account for computing the loss.
	        """
            return_dict = return_dict if return_dict is not None else self.config.use_return_dict

            outputs = self.bert(
                input_ids,
                attention_mask=attention_mask,
                token_type_ids=token_type_ids,
                position_ids=position_ids,
                head_mask=head_mask,
                inputs_embeds=inputs_embeds,
                output_attentions=output_attentions,
                output_hidden_states=output_hidden_states,
                return_dict=return_dict,
            )

            sequence_output = outputs[0]

            logits = self.qa_outputs(sequence_output)
            start_logits, end_logits = logits.split(1, dim=-1)
            start_logits = start_logits.squeeze(-1)
            end_logits = end_logits.squeeze(-1)

            total_loss = None
            if start_positions is not None and end_positions is not None:
                # If we are on multi-GPU, split add a dimension
                if len(start_positions.size()) > 1:
                    start_positions = start_positions.squeeze(-1)
                if len(end_positions.size()) > 1:
                    end_positions = end_positions.squeeze(-1)
                # sometimes the start/end positions are outside our model inputs, we ignore these terms
                ignored_index = start_logits.size(1)
                start_positions.clamp_(0, ignored_index)
                end_positions.clamp_(0, ignored_index)

                loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
                start_loss = loss_fct(start_logits, start_positions)
                end_loss = loss_fct(end_logits, end_positions)
                total_loss = (start_loss + end_loss) / 2

            if not return_dict:
                output = (start_logits, end_logits) + outputs[2:]
                return ((total_loss, ) +
                        output) if total_loss is not None else output

            return QuestionAnsweringModelOutput(
                loss=total_loss,
                start_logits=start_logits,
                end_logits=end_logits,
                hidden_states=outputs.hidden_states,
                attentions=outputs.attentions,
            )

    # In[48]:

    model = BertForQuestionAnswering.from_pretrained(args['model_name'])

    # In[49]:

    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    checkpoint = torch.load(args['checkpoint_input_path'], map_location=device)
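    # Strip the 'module.' prefix that torch.nn.DataParallel adds to parameter names
    # when a checkpoint is saved from a wrapped model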
    model.load_state_dict({
        key.replace('module.', ''): value
        for key, value in checkpoint.items()
    })

    # In[50]:

    model.to(device)
    model.train()
    from torch.nn import DataParallel

    model = DataParallel(model)

    train_loader = DataLoader(train_bio_list,
                              batch_size=args['batch_size'],
                              shuffle=True)

    optim = AdamW(model.parameters(), lr=args['learning_rate'])

    # In[51]:

    # Train on BioAsq
    from barbar import Bar

    for epoch in range(args['epochs']):
        for i, batch in enumerate(Bar(train_loader)):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device,
                                                        dtype=torch.long)
            start_positions = batch['start_positions'].to(device,
                                                          dtype=torch.long)
            end_positions = batch['end_positions'].to(device, dtype=torch.long)
            outputs = model(input_ids,
                            attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            loss = outputs[0]
            loss.sum().backward()
            optim.step()
    model.eval()

    # In[ ]:

    torch.save(
        {
            'epoch': 3,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'loss': loss,
        }, args['checkpoint_output_path'] + '/checkpoint_list.pt')
Example #12
def train(args, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

        # Initial train dataloader
        if args.use_random_candidates:
            train_dataset, _, _= load_and_cache_examples(args, tokenizer)
        elif args.use_hard_negatives or args.use_hard_and_random_negatives:
            train_dataset, _, _ = load_and_cache_examples(args, tokenizer, model)
        else:
            train_dataset, _, _ = load_and_cache_examples(args, tokenizer)

        args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
        train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if args.resume_path is not None and os.path.isfile(os.path.join(args.resume_path, "optimizer.pt")) \
            and os.path.isfile(os.path.join(args.resume_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.resume_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.resume_path, "scheduler.pt")))
        logger.info("INFO: Optimizer and scheduler state loaded successfully.")

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # For debugging: Register backward hooks to check gradient

    # def hook(self, grad_in, grad_out):
    #     print(self)
    #     print('grad_in')
    #     print([_grad_in for _grad_in in grad_in if _grad_in is not None])
    #     print('grad_out')
    #     print([_grad_out for _grad_out in grad_out if _grad_out is not None])
    #
    # for module in model.modules():
    #     module.register_backward_hook(hook)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.resume_path is not None:
        # set global_step to global_step of last saved checkpoint from model path
        # global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        global_step = int(args.resume_path.split("/")[-2].split("-")[-1])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for epoch_num in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)


            ner_inputs = {"args": args,
                          "mention_token_ids": batch[0],
                          "mention_token_masks": batch[1],
                          "mention_start_indices": batch[7],
                          "mention_end_indices": batch[8],
                          "mode": 'ner',
                          }

            if args.use_hard_and_random_negatives:
                ned_inputs = {"args": args,
                              "last_hidden_states": None,
                              "mention_start_indices": batch[7],
                              "mention_end_indices": batch[8],
                              "candidate_token_ids_1": batch[2],
                              "candidate_token_masks_1": batch[3],
                              "candidate_token_ids_2": batch[4],
                              "candidate_token_masks_2": batch[5],
                              "labels": batch[6],
                              "mode": 'ned',
                              }
            else:
                ned_inputs = {"args": args,
                              "mention_token_ids": batch[0],
                              "mention_token_masks": batch[1],
                              "mention_start_indices": batch[7],
                              "mention_end_indices": batch[8],
                              "candidate_token_ids_1": batch[2],
                              "candidate_token_masks_1": batch[3],
                              "labels": batch[6],
                              "mode": 'ned',
                              }
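            # Three training protocols: NER only, alternating NER/NED batches, or joint
            # NER+NED where the NER hidden states are fed to the NED head.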
            if args.ner:
                loss, _ = model.forward(**ner_inputs)
            elif args.alternate_batch:
                # Randomly choose whether to do tagging or NED for the current batch
                if random.random() <= 0.5:
                    loss, _ = model.forward(**ner_inputs)
                else:
                    loss, _ = model.forward(**ned_inputs)
            elif args.ner_and_ned:
                ner_loss, last_hidden_states = model.forward(**ner_inputs)
                ned_inputs["last_hidden_states"] = last_hidden_states
                ned_loss, _ = model.forward(**ned_inputs)
                loss = ner_loss + ned_loss
            else:
                logger.info(" Specify a training protocol from (ner, alternate_batch, ner_and_ned)")

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        # New data loader for the next epoch
        if args.use_random_candidates:
            # New data loader at every epoch for random sampler if we use random negative samples
            train_dataset, _, _= load_and_cache_examples(args, tokenizer)
            args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
            train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(
                train_dataset)
            train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                          batch_size=args.train_batch_size)
        elif args.use_hard_negatives or args.use_hard_and_random_negatives:
            # New data loader at every epoch for hard negative sampler if we use hard negative mining
            train_dataset, _, _= load_and_cache_examples(args, tokenizer, model)
            args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
            train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(
                train_dataset)
            train_dataloader = DataLoader(train_dataset, sampler=train_sampler,
                                          batch_size=args.train_batch_size)

            # Anneal the lambda_1 and lambda_2 weights
            args.lambda_1 = args.lambda_1 - 1 / (epoch_num + 1)
            args.lambda_2 = args.lambda_2 + 1 / (epoch_num + 1)

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #13
                predicted = torch.max(logits, 1)[1]
                # print("labels:")
                num_total += labels.size(0)
                num_correct += (predicted == labels).sum().item()
                running_loss += loss.item()
                print("predicted, labels:",
                      predicted.cpu().detach().numpy(),
                      labels.cpu().detach().numpy())

        print('Validation Accuracy: {}'.format(num_correct / num_total),
              'Average Loss: {}'.format(running_loss / len(valid_loader)))


model.to(device)
lr = 3e-5
# optimizer = optim.SGD(model.parameters(), lr=lr)
optimizer = AdamW(model.parameters(), lr=lr)
num_epochs = 25

train(model, train_dataloader, valid_dataloader, optimizer, num_epochs)

torch.save(model.state_dict(), 'model.npy')
torch.save(optimizer.state_dict(), 'optimizer.npy')

#
# num_epochs = 15
# train(model, train_dataloader, valid_dataloader, optimizer, criterion, num_epochs)
#
# torch.save(model.state_dict(), 'model.npy')
# torch.save(optimizer.state_dict(), 'optimizer.npy')
Example #14
def main():

    print('Start')
    parser = argparse.ArgumentParser()

    # Add the arguments to the parser
    parser.add_argument("--model_name", required=True)
    parser.add_argument("--checkpoint_input_path", required=False)
    parser.add_argument("--checkpoint_output_path", required=True)
    parser.add_argument("--bioasq_path", required=True)
    parser.add_argument("--seed", default=1995)
    parser.add_argument("--learning_rate", default=5e-5, type=float)
    parser.add_argument("--batch_size", default=16, type=int)
    parser.add_argument("--epochs", default=3, type=int)

    parser.add_argument('--mid_layer', dest='mid_layer', action='store_true')
    parser.add_argument('--no-mid_layer',
                        dest='mid_layer',
                        action='store_false')
    parser.set_defaults(mid_layer=True)

    parser.add_argument('--balance', dest='balance', action='store_true')
    parser.add_argument('--no-balance', dest='balance', action='store_false')
    parser.set_defaults(balance=True)

    parser.add_argument("--mid_layer_size", default=256, type=int)

    args = vars(parser.parse_args())

    print(args['mid_layer'])

    random.seed(args['seed'])
    device = torch.device(
        'cuda') if torch.cuda.is_available() else torch.device('cpu')

    with open(args['bioasq_path'], 'rb') as f:
        bio_yn_raw = json.load(f)['questions']
    bio_yn = [
        question for question in bio_yn_raw if question['type'] == 'yesno'
    ]
    bio_yn_questions = [question['body'] for question in bio_yn]
    bio_yn_ids = [question['id'] for question in bio_yn]
    bio_yn_answers = [question['exact_answer'] for question in bio_yn]
    bio_snippets = {
        question['id']: [snippet['text'] for snippet in question['snippets']]
        for question in bio_yn
    }

    ids = []
    snippets = []
    for key, value in bio_snippets.items():
        for snippet in value:
            ids.append(key)
            snippets.append(snippet)

    snippets_df = pd.DataFrame({'id': ids, 'snippet': snippets})
    questions_df = pd.DataFrame({
        'id': bio_yn_ids,
        'question': bio_yn_questions,
        'label': bio_yn_answers
    })
    bio_yn_df = pd.merge(snippets_df, questions_df, how='left', on='id')

    bio_yn_df = bio_yn_df.sample(32)
    no_size = bio_yn_df[bio_yn_df.label == 'no'].shape[0]
    yes_index = bio_yn_df[bio_yn_df.label == 'yes'].index
    random_index = np.random.choice(yes_index, no_size, replace=False)
    yes_sample = bio_yn_df.loc[random_index]
    bio_yn_balanced = pd.concat(
        [yes_sample, bio_yn_df[bio_yn_df.label == 'no']])

    bio_yn_balanced = bio_yn_balanced.sample(frac=1)

    if args['balance']:
        train_a = list(bio_yn_balanced.question)
        train_b = list(bio_yn_balanced.snippet)
        train_labels = [
            int(answer == 'yes') for answer in bio_yn_balanced.label
        ]
    else:
        train_a = list(bio_yn_df.question)
        train_b = list(bio_yn_df.snippet)
        train_labels = [int(answer == 'yes') for answer in bio_yn_df.label]

    from transformers import BertTokenizer
    # Load the BERT tokenizer.
    tokenizer = BertTokenizer.from_pretrained(args['model_name'],
                                              do_lower_case=True)

    # In[39]:

    train_tokens = tokenizer(train_a,
                             train_b,
                             add_special_tokens=True,
                             max_length=500,
                             truncation=True,
                             padding=True)
    train_tokens['labels'] = train_labels

    # In[40]:

    from torch.utils.data import Dataset, DataLoader

    class MnliDataset(Dataset):
        def __init__(self, encodings):
            self.encodings = encodings

        def __getitem__(self, idx):
            #print(self.encodings['start_positions'][idx])
            #{key: torch.tensor(val[idx], dtype = torch.long) for key, val in self.encodings.items()}
            return {
                'input_ids':
                torch.tensor(self.encodings['input_ids'][idx],
                             dtype=torch.long),
                'attention_mask':
                torch.tensor(self.encodings['attention_mask'][idx],
                             dtype=torch.long),
                'token_type_ids':
                torch.tensor(self.encodings['token_type_ids'][idx],
                             dtype=torch.long),
                'labels':
                torch.tensor(self.encodings['labels'][idx], dtype=torch.long)
            }

        def __len__(self):
            return len(self.encodings.input_ids)

    train_dataset = MnliDataset(train_tokens)

    # In[5]:

    # In[4]:

    from transformers import BertForSequenceClassification

    model = BertForSequenceClassification.from_pretrained(args['model_name'],
                                                          num_labels=3)
    checkpoint = torch.load(args['checkpoint_input_path'], map_location=device)
    model.load_state_dict({
        key.replace('module.', ''): value
        for key, value in checkpoint.items()
    })

    # freeze all the parameters
    #for param in model.parameters():
    #    param.requires_grad = False
    # In[73]:
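    # BERT_Arch wraps the pretrained 3-way classifier and maps its 3 logits down to
    # 2 yes/no classes, optionally through a hidden layer of size args['mid_layer_size'].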

    class BERT_Arch(nn.Module):
        def __init__(self, model):

            super(BERT_Arch, self).__init__()

            self.model = model

            # dropout layer
            self.dropout = nn.Dropout(0.1)

            # relu activation function
            self.relu = nn.ReLU()
            # dense layer 1
            if args['mid_layer']:
                self.fc1 = nn.Linear(3, args['mid_layer_size'])

                self.fc2 = nn.Linear(args['mid_layer_size'], 2)
            else:
                self.fc1 = nn.Linear(3, 2)

            #softmax activation function
            self.softmax = nn.LogSoftmax(dim=1)

        #define the forward pass
        def forward(self, input_ids, attention_mask, token_type_ids, labels):

            #pass the inputs to the model
            outputs = self.model(input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=labels)

            cls_hs = outputs.logits

            if args['mid_layer']:
                x = self.fc1(cls_hs)

                x = self.relu(x)

                x = self.dropout(x)

                # output layer
                x = self.fc2(x)

                # apply softmax activation
                x = self.softmax(x)
            else:
                x = self.dropout(cls_hs)

                # output layer
                x = self.fc1(x)

                # apply softmax activation
                x = self.softmax(x)

            return x

    # In[74]:

    model_full = BERT_Arch(model)

    # In[81]:

    from torch.utils.data import DataLoader
    from transformers import AdamW
    from torch.nn import DataParallel

    model_full.to(device)
    model_full.train()

    model_full = DataParallel(model_full)

    train_loader = DataLoader(train_dataset,
                              batch_size=args['batch_size'],
                              shuffle=True)

    # Optimize the full wrapper so that the newly added fc layers are trained too
    optim = AdamW(model_full.parameters(), lr=args['learning_rate'])

    # In[83]:

    cross_entropy = nn.NLLLoss()
    for epoch in range(args['epochs']):
        for i, batch in enumerate(Bar(train_loader)):
            optim.zero_grad()
            input_ids = batch['input_ids'].to(device, dtype=torch.long)
            attention_mask = batch['attention_mask'].to(device,
                                                        dtype=torch.long)
            token_type_ids = batch['token_type_ids'].to(device,
                                                        dtype=torch.long)
            labels = batch['labels'].to(device, dtype=torch.long)
            outputs = model_full(input_ids,
                                 attention_mask=attention_mask,
                                 token_type_ids=token_type_ids,
                                 labels=labels)
            #loss = outputs.loss
            loss = cross_entropy(outputs, labels)
            loss.backward()
            optim.step()
    model_full.eval()

    # In[ ]:
    if args['mid_layer']:
        checkpoint_output = args[
            'checkpoint_output_path'] + '/checkpoint_yn_' + str(
                args['mid_layer_size']) + '.pt'
    else:
        checkpoint_output = args[
            'checkpoint_output_path'] + '/checkpoint_yn_direct.pt'

    torch.save(
        {
            'epoch': args['epochs'],
            'model_state_dict': model_full.state_dict(),
            'optimizer_state_dict': optim.state_dict(),
            'loss': loss,
        }, checkpoint_output)
Example #15
class BertForSeqFinetune():
    def __init__(self,
                 model_name,
                 config,
                 num_labels,
                 model_desc,
                 save_ckpt_path,
                 save_flag,
                 hf_model_class,
                 hf_token_class,
                 model_class_type,
                 vocab_file=None,
                 model_weights=None,
                 from_tf=False):
        """

        Params
        ------

        model_name: model_name e.g. "bert-base-uncased" or path
        config: BertConfig object that is initialised with the same model_name
        num_labels: number of classes for finetuning categorical data
        model_desc: description of model used to name checkpoints that are saved with training
        save_ckpt_path: base path for saving checkpoints.
        save_flag: Whether to save checkpoints; not used for model evaluation, i.e. when using validation data.
        hf_model_class: HuggingFace model class
        hf_token_class: HuggingFace tokenizer class
        vocab_file: The vocabulary from a pretrained model
        model_weights: The Pytorch binary file from a pretrained model
        from_tf: Whether the model_name is a path pointing to a model pre-trained in Tensorflow. (Not tested)

        """

        # super(BertForSeqFinetune, self).__init__(config)

        ###   MODEL VARIABLES   ###

        # self.args_loaded = False
        self.device = None

        self.model_name = model_name
        self.config = config  # initialised outside of class
        self.model = None
        self.tokenizer = None

        self.hf_model_class = hf_model_class
        self.hf_token_class = hf_token_class
        self.model_class_type = model_class_type

        ###   DATA VARIABLES   ###

        self.training_data_loader = None
        self.testing_data_loader = None
        self.validating_data_loader = None

        ###   TRAINING VARIABLES   ###

        self.num_labels = num_labels
        self.max_token_len = 128
        self.lr = 2e-5
        # self.TEST_SIZE = 0.2
        # self.EPOCHS = 3
        # self.BATCH_SIZE = 8

        # Save file names
        self.optimizer_pt = "optimizer.pt"
        self.scheduler_pt = "scheduler.pt"

        self.save_steps = 10
        self.warmup_steps = None
        self.total_steps = None
        self.gradient_accumulation_steps = 1
        self.logging_steps = 50
        self.max_grad_norm = 1.0
        self.loss_over_time = []
        self.random_state = 2018

        ###   EVALUATION VARIABLES   ###

        self.validation_accuracy = None
        # Precision-recall by topic
        # self.pr_dict = defaultdict(lambda: defaultdict(int))

        self.preds_arr = None
        self.labels_arr = None
        self.topics_eval_arr = None
        self.doc_id_eval_arr = None

        ###   SAVE PATH VARIABLES   ###

        self.cache_dir = None
        self.save_flag = save_flag
        if self.save_flag:
            self.output_dir = f"./{save_ckpt_path}/{model_desc}"
            if not os.path.exists(self.output_dir):
                os.makedirs(self.output_dir)

        self._specify_model(self.model_name,
                            self.config,
                            self.num_labels,
                            vocab_file=vocab_file,
                            model_weights=model_weights)

    def load_dataloader_train_and_test(self, training_data_loader,
                                       testing_data_loader):
        self.training_data_loader = training_data_loader
        self.testing_data_loader = testing_data_loader

    def load_dataloader_validate(self, validating_data_loader):
        self.validating_data_loader = validating_data_loader

    def _specify_model(self,
                       model_name,
                       config,
                       num_labels,
                       vocab_file=None,
                       model_weights=None,
                       from_tf=False):
        """
        The naming convention for loading a pretrained model is:

        "config.json"
        "vocab.txt"
        "pytorch_model.bin"

        To be explicit, we force the user to specify their files. The `config.json`
        file is specified outside of the class, so we account for the remaining two here.

        If we are loading the files from TensorFlow, then we need to pass in a
        boolean (in this case from_tf).
        """

        if (model_weights is not None) and (from_tf == False):
            self.model = self.hf_model_class.from_pretrained(
                f"{model_name}", config=self.config)
        elif from_tf:
            self.model = self.hf_model_class.from_pretrained(
                f"{model_name}", from_tf=from_tf, config=self.config)
        else:
            self.model = self.hf_model_class.from_pretrained(
                f"{model_name}", config=self.config)
        if vocab_file is not None:
            self.tokenizer = self.hf_token_class.from_pretrained(
                f"{model_name}/{vocab_file}", do_lower_case=True)
        else:
            self.tokenizer = self.hf_token_class.from_pretrained(
                f"{model_name}", do_lower_case=True)

    def train(self, epochs, batch_size, use_gpu):
        """
        use_gpu: int
        """

        if self.model is None:
            raise ValueError("Model has not been specified!")

        if torch.cuda.is_available() and use_gpu:
            print("Using GPU")
            self.device = torch.device("cuda")
            torch.cuda.empty_cache()
            if use_gpu > 1:
                self.model = nn.DataParallel(self.model)
        else:
            print("CUDA not available. Using CPU")
            self.device = torch.device("cpu")

        self.model.to(self.device)

        self.total_steps = (len(self.training_data_loader) //
                            self.gradient_accumulation_steps) * epochs
        self.warmup_steps = int(self.total_steps / 10)

        self.optimizer = AdamW(self.model.parameters(),
                               lr=self.lr,
                               correct_bias=False)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.warmup_steps,
            num_training_steps=self.total_steps)

        print(f"{self.output_dir}/optimizer.pt")
        if os.path.isfile(
                f"{self.output_dir}/optimizer.pt") and os.path.isfile(
                    f"{self.output_dir}/scheduler.pt"):
            print("loading saved optimiser and scheduler")
            self.optimizer.load_state_dict(
                torch.load(f"{self.output_dir}/{self.optimizer_pt}"))
            self.scheduler.load_state_dict(
                torch.load(f"{self.output_dir}/{self.scheduler_pt}"))

        global_steps = 0
        tr_loss, tr_loss_prev = 0.0, 0.0
        nb_tr_examples = 0

        for epoch in trange(epochs, desc="EPOCHS"):
            epoch_iterator = tqdm(self.training_data_loader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                self.model.zero_grad()

                batch = tuple(t.to(self.device) for t in batch)
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                inputs['token_type_ids'] = (batch[2] if self.model_class_type
                                            in ["bert", "xlnet", "albert"
                                                ] else None)
                # Rewrite this code to check for model_type more easily.
                # if args.model_type != 'distilbert':
                #     inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None

                outputs = self.model(**inputs)
                loss = outputs[0]
                print(f"loss: {loss}")
                loss.backward()

                torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                               self.max_grad_norm)
                self.optimizer.step()
                self.scheduler.step()

                tr_loss += loss.item()
                self.loss_over_time.append(tr_loss)
                nb_tr_examples += inputs["input_ids"].size(0)
                global_steps += 1

                # @TODO: Find suitable way to record this information
                if global_steps % self.logging_steps == 0:
                    avg_loss = (tr_loss - tr_loss_prev) / self.logging_steps
                    tr_loss_prev = tr_loss
                    print(
                        f"Statistics over the last {self.logging_steps} steps:"
                    )
                    print(f"\t global_steps: {global_steps}")
                    print(f"\t average loss: {avg_loss}")
                    print(f"\t loss.item(): {loss.item()}")
                    print(f"\t tr_loss: {tr_loss}")
                    print(f"\t nb_tr_examples: {nb_tr_examples}")

            if self.save_flag:
                output_dir = os.path.join(self.output_dir,
                                          'checkpoint-{}'.format(global_steps))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)

                if (epoch % 1) == 0:  # evaluate after every epoch
                    tmp_eval = self.evaluate(
                        {"precision_recall_by_topic": eval_pr_per_topics},
                        use_ids=True,
                        validate=False,
                        use_gpu=True)

                    # Convert the defaultdict to dict, for JSON
                    # print(tmp_eval[4])
                    for key in tmp_eval[4].keys():
                        tmp_eval[4][key] = dict(tmp_eval[4][key])
                    # print(tmp_eval[4])
                    pr_dict_tmp = dict(tmp_eval[4])
                    # print(pr_dict_tmp)

                    output_dict = {
                        "y_truth": tmp_eval[0].tolist(),
                        "y_pred": tmp_eval[1].tolist(),
                        "topics_arr": tmp_eval[2].tolist(),
                        "doc_ids_arr": tmp_eval[3].tolist(),
                        "pr_dict": pr_dict_tmp,
                    }
                    # print(output_dict["pr_dict"])
                    # print(type(output_dict["pr_dict"]))

                    pd.to_pickle(output_dict, f"{output_dir}/ckpt_eval.pickle")
                    # with open(f"{output_dir}/ckpt_eval.json", "w") as f:
                    #     pickle.dump(pr_dict_tmp, f)

                    self.save_model(output_dir)

            # Take care of distributed/parallel training
            # model_to_save = self.model.module if hasattr(self.model, 'module') else self.model
            # model_to_save.save_pretrained(output_dir)

            # @TODO: Do we want to implement a way to save the arguments?
            # torch.save(args, os.path.join(output_dir, 'training_args.bin'))

        return global_steps, tr_loss / global_steps

    def evaluate(self, eval_metrics_dict, use_ids, validate, use_gpu):
        """

        Format of the batch is different depending on use_ids:

        IF use_ids IS True:
        (
          [
            tensor([
              [a1], ..., [an]
            ])
            tensor([
              [b1], ..., [bn]
            ])
            ...
            tensor([
              [j1], ..., [jn]
            ])
          ],
          [id1, ..., idn]
        )
        ELSE:
          (
            tensor([
              [a1], ..., [an]
            ])
            ...
            tensor([
              [j1], ..., [jn]
            ])
          )

        Params
        ------
        eval_metrics_dict: {
                "accuracy": num_correctly_classified,
                "precision_recall_by_topic": eval_pr_per_topics,
                "roc_curve": calc_roc
            }
        use_ids: If true, unique IDs are used to track the individual data points.
        validate: If true, this is an unlabelled validation set.
        use_gpu: int; 0 (or False) runs on CPU, values > 1 wrap the model in DataParallel.

        """
        eval_loss = 0.0
        nb_eval_steps = 0

        acc_test_loss = 0.0
        self.pr_dict = defaultdict(lambda: defaultdict(int))

        if torch.cuda.is_available() and use_gpu:
            print("Using GPU")
            self.device = torch.device("cuda")
            torch.cuda.empty_cache()
            if use_gpu > 1:
                self.model = nn.DataParallel(self.model)
        else:
            print("CUDA not available. Using CPU")
            self.device = torch.device("cpu")

        self.model.to(self.device)

        self.model.eval()

        if validate:
            test_data = self.validating_data_loader
        else:
            test_data = self.testing_data_loader

        for batch in tqdm(test_data, desc="EVALUATING"):
            with torch.no_grad():
                # print(batch)
                if use_ids:
                    doc_ids_batch = batch[1]
                    batch = tuple(t.to(self.device) for t in batch[0])

                    topics_batch = batch[4].detach().cpu().numpy()
                else:
                    batch = tuple(t.to(self.device) for t in batch)
                    topics_batch = batch[4].detach().cpu().numpy()
                    doc_ids_batch = batch[5].detach().cpu().numpy()
                inputs = {
                    'input_ids': batch[0],
                    'attention_mask': batch[1],
                    'labels': batch[3]
                }
                inputs['token_type_ids'] = (batch[2] if self.model_class_type
                                            in ["bert", "xlnet", "albert"
                                                ] else None)
                # if args.model_type != 'distilbert':
                #     inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
                # Models such as XLM, DistilBERT and RoBERTa don't use segment ids, hence None.

                if validate:
                    inputs.pop("labels")
                    outputs = self.model(**inputs)
                    logits = outputs[0]
                    # print(logits)
                else:
                    outputs = self.model(**inputs)
                    tmp_test_loss, logits = outputs[:2]

                    # Accumulate the mean batch loss over the evaluation set
                    eval_loss += tmp_test_loss.mean().item()
            nb_eval_steps += 1

            ################     UPDATE TOTAL LOSS     ################

            logits_batch = logits.detach().cpu().numpy()
            if not validate:
                labels_batch = inputs["labels"].cpu().numpy()

            if "accuracy" in eval_metrics_dict.keys():
                batch_test_loss = eval_metrics_dict["accuracy"](logits_batch,
                                                                labels_batch)
                acc_test_loss += batch_test_loss

            if "precision_recall_by_topic" in eval_metrics_dict.keys():
                eval_metrics_dict["precision_recall_by_topic"](logits_batch,
                                                               labels_batch,
                                                               topics_batch,
                                                               self.pr_dict)

            # We're going to save this and return it later
            if self.preds_arr is None:
                self.preds_arr = logits_batch

                if not validate:
                    self.labels_arr = labels_batch

                self.topics_eval_arr = topics_batch
                self.doc_id_eval_arr = doc_ids_batch
            else:
                self.preds_arr = np.append(self.preds_arr,
                                           logits_batch,
                                           axis=0)

                if not validate:
                    # print(inputs["labels"])
                    self.labels_arr = np.append(self.labels_arr,
                                                labels_batch,
                                                axis=0)
                self.topics_eval_arr = np.append(self.topics_eval_arr,
                                                 topics_batch,
                                                 axis=0)
                self.doc_id_eval_arr = np.append(self.doc_id_eval_arr,
                                                 doc_ids_batch,
                                                 axis=0)

        ################     DISPLAY RESULTS     ################

        # previous metric_function function accuracy percentage for each batch
        # self.validation_accuracy = acc_test_loss/nb_eval_steps

        if not validate:
            eval_loss = eval_loss / nb_eval_steps
            print(f"eval_loss: {eval_loss}")

        if validate:
            num_test_points = len(self.validating_data_loader.dataset)
        else:
            num_test_points = len(self.testing_data_loader.dataset)

        print(f"acc_test_loss: {acc_test_loss}")
        print(f"num_test_points: {num_test_points}")

        if "accuracy" in eval_metrics_dict.keys():
            self.validation_accuracy = acc_test_loss / num_test_points
            print("Validation Accuracy: {}".format(self.validation_accuracy))

        if "precision_recall_by_topic" in eval_metrics_dict.keys():
            for topic in self.pr_dict.keys():

                if (self.pr_dict[topic]["false_positive"] +
                        self.pr_dict[topic]["true_positive"]) == 0:
                    print(f"FP + TP = 0")
                    precision = 0
                else:
                    precision = self.pr_dict[topic]["true_positive"] / (
                        self.pr_dict[topic]["false_positive"] +
                        self.pr_dict[topic]["true_positive"])

                if (self.pr_dict[topic]["false_negative"] +
                        self.pr_dict[topic]["true_positive"]) == 0:
                    print(f"FN + TP = 0")
                    recall = 0
                else:
                    recall = self.pr_dict[topic]["true_positive"] / (
                        self.pr_dict[topic]["false_negative"] +
                        self.pr_dict[topic]["true_positive"])

                self.pr_dict[topic]["precision"] = precision
                self.pr_dict[topic]["recall"] = recall

        if "roc_curve" in eval_metrics_dict.keys():
            eval_metrics_dict["roc_curve"](self.preds_arr,
                                           self.labels_arr,
                                           num_classes=self.num_labels)

        # self.labels_arr is None if we are evaluating with validation data.

        return self.labels_arr, self.preds_arr, self.topics_eval_arr, self.doc_id_eval_arr, self.pr_dict, self.validation_accuracy

    def save_model(self, output_dir):
        model_to_save = self.model.module if hasattr(
            self.model, 'module'
        ) else self.model  # Take care of distributed/parallel training
        model_to_save.save_pretrained(output_dir)
        self.tokenizer.save_pretrained(output_dir)

        torch.save(self.optimizer.state_dict(),
                   f"{output_dir}/{self.optimizer_pt}")
        torch.save(self.scheduler.state_dict(),
                   f"{output_dir}/{self.scheduler_pt}")

        # @TODO: Implement dict of args
        #  torch.save(args, os.path.join(args.output_dir, 'training_args.bin'))

        # Load a trained model and vocabulary that you have fine-tuned
        self.model = self.hf_model_class.from_pretrained(output_dir)
        self.tokenizer = self.hf_token_class.from_pretrained(output_dir)
        self.model.to(self.device)
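The class above leaves the concrete HuggingFace classes and the dataloaders to the caller. A minimal usage sketch follows, assuming the standard `transformers` BERT classes; the dataloader names and hyperparameters are illustrative and not part of the original example.
# Illustrative wiring only (not from the original source). Assumes transformers'
# BertConfig / BertForSequenceClassification / BertTokenizer and hypothetical
# train/test DataLoaders that yield batches in the order the class expects:
# (input_ids, attention_mask, token_type_ids, labels, topics, doc_ids).
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer

config = BertConfig.from_pretrained("bert-base-uncased", num_labels=4)
finetuner = BertForSeqFinetune(
    model_name="bert-base-uncased",
    config=config,
    num_labels=4,
    model_desc="bert_topic_clf",              # names the checkpoint folder
    save_ckpt_path="checkpoints",
    save_flag=True,
    hf_model_class=BertForSequenceClassification,
    hf_token_class=BertTokenizer,
    model_class_type="bert",                  # so token_type_ids are passed through
)
# finetuner.load_dataloader_train_and_test(train_loader, test_loader)  # hypothetical loaders
# finetuner.train(epochs=3, batch_size=8, use_gpu=1)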
Example #16
def train(model, tokenizer, checkpoint, round):
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
    else:
        amp = None

    train_data = Multi_task_dataset_eng(data_file=args.train_file,
                                        max_length=args.max_length,
                                        tokenizer=tokenizer,
                                        model_type=args.model_type)

    train_dataloader = DataLoader(dataset=train_data,
                                  batch_size=args.batch_size,
                                  shuffle=True)

    t_total = len(train_dataloader) * args.epochs
    warmup_steps = int(args.warmup_steps * t_total)
    optimizer = AdamW(model.parameters(),
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=t_total)

    if args.fp16:
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fptype)

    # Load the optimizer and scheduler states from the checkpoint
    checkpoint_dir = args.save_dir + "/checkpoint-" + str(
        checkpoint) + '-' + str(round)
    if os.path.isfile(os.path.join(checkpoint_dir, "optimizer.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(checkpoint_dir, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(checkpoint_dir, "scheduler.pt")))
        if args.fp16:
            amp.load_state_dict(
                torch.load(os.path.join(checkpoint_dir, "amp.pt")))

    # Start training
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_data))
    logger.info("  Num Epochs = %d", args.epochs)
    logger.info("  Batch size = %d", args.batch_size)
    logger.info("  learning_rate = %s", str(args.learning_rate))
    logger.info("  Total steps = %d", t_total)
    logger.info("  warmup steps = %d", warmup_steps)
    logger.info("  Model_type = %s", args.model_type)
    logger.info("  Decoder_type = %s", args.decoder_type)
    logger.info("  vice_loss_weight = %s", str(args.vice_weight))

    # If there is no previous checkpoint, start from 0
    if checkpoint < 0:
        checkpoint = 0
        round = 0
    else:
        checkpoint += 1
        round += 1

    max_test_acc = 0
    max_test_f1 = 0

    logger.debug("  Start Batch = %d", checkpoint)
    for epoch in range(checkpoint, args.epochs):
        model.train()
        epoch_loss = []

        step = 0
        for batch in tqdm(train_dataloader, desc="Iteration", ncols=50):
            model.zero_grad()
            # Move tensors to the GPU/CPU device
            batch = tuple(t.to(args.device) for t in batch)

            if 'roberta' in args.model_type:
                input_ids, attention_mask, labels_main, labels_vice1, labels_vice2 = batch
                outputs = model(input_ids=input_ids.long(),
                                attention_mask=attention_mask.long(),
                                labels_main=labels_main,
                                labels_vice1=labels_vice1,
                                labels_vice2=labels_vice2,
                                model_type='roberta')
            else:
                input_ids, token_type_ids, attention_mask, labels_main, labels_vice1, labels_vice2 = batch
                outputs = model(input_ids=input_ids.long(),
                                token_type_ids=token_type_ids.long(),
                                attention_mask=attention_mask.long(),
                                labels_main=labels_main,
                                labels_vice1=labels_vice1,
                                labels_vice2=labels_vice2)

            loss = outputs[0]

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()  # compute gradients

            epoch_loss.append(loss.item())

            optimizer.step()
            scheduler.step()
            step += 1
            if step % 500 == 0:
                logger.debug("loss:" + str(np.array(epoch_loss).mean()))
                logger.debug(
                    'learning_rate:' +
                    str(optimizer.state_dict()['param_groups'][0]['lr']))
            if step % args.saving_steps == 0:
                round += 1
                dev_loss, dev_acc, dev_f1 = test(model=model,
                                                 tokenizer=tokenizer,
                                                 test_file=args.dev_file,
                                                 checkpoint=epoch,
                                                 round=round)
                logger.info(
                    '【DEV】Train Epoch %d, round %d: train_loss=%.4f, acc=%.4f, f1=%.4f'
                    % (epoch, round, dev_loss, dev_acc, dev_f1))

                test_loss, test_acc, test_f1 = test(model=model,
                                                    tokenizer=tokenizer,
                                                    test_file=args.test_file,
                                                    checkpoint=epoch,
                                                    round=round)
                logger.info(
                    '【TEST】Train Epoch %d, round %d: train_loss=%.4f, acc=%.4f, f1=%.4f'
                    % (epoch, round, test_loss, test_acc, test_f1))
                output_dir = args.save_dir + "/checkpoint-" + str(
                    epoch) + '-' + str(round)
                if test_acc > max_test_acc or test_f1 > max_test_f1:
                    max_test_acc = max(test_acc, max_test_acc)
                    max_test_f1 = max(test_f1, max_test_f1)

                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (model.module
                                     if hasattr(model, "module") else model)
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.debug("Saving model checkpoint to %s", output_dir)
                    if args.fp16:
                        torch.save(amp.state_dict(),
                                   os.path.join(output_dir, "amp.pt"))
                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.debug("Saving optimizer and scheduler states to %s",
                                 output_dir)
                model.train()

        # Save the model at the end of the epoch
        output_dir = args.save_dir + "/checkpoint-" + str(epoch) + '-' + str(
            round)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = (model.module if hasattr(model, "module") else model)
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.debug("Saving model checkpoint to %s", output_dir)
        if args.fp16:
            torch.save(amp.state_dict(), os.path.join(output_dir, "amp.pt"))
        torch.save(optimizer.state_dict(),
                   os.path.join(output_dir, "optimizer.pt"))
        torch.save(scheduler.state_dict(),
                   os.path.join(output_dir, "scheduler.pt"))
        logger.debug("Saving optimizer and scheduler states to %s", output_dir)

        dev_loss, dev_acc, dev_f1 = test(model=model,
                                         tokenizer=tokenizer,
                                         test_file=args.dev_file,
                                         checkpoint=epoch,
                                         round=round)
        test_loss, test_acc, test_f1 = test(model=model,
                                            tokenizer=tokenizer,
                                            test_file=args.test_file,
                                            checkpoint=epoch,
                                            round=round)
        #print(test_loss, test_acc)
        logger.info(
            '【DEV】Train Epoch %d, round %d: train_loss=%.4f, acc=%.4f, f1=%.4f'
            % (epoch, round, dev_loss, dev_acc, dev_f1))
        logger.info(
            '【TEST】Train Epoch %d, round %d: train_loss=%.4f, acc=%.4f, f1=%.4f'
            % (epoch, round, test_loss, test_acc, test_f1))
        if test_acc > max_test_acc or test_f1 > max_test_f1:
            max_test_acc = max(test_acc, max_test_acc)
            max_test_f1 = max(test_f1, max_test_f1)
    logger.info('【BEST TEST ACC】: %.4f,   【BEST TEST F1】: %.4f' %
                (max_test_acc, max_test_f1))
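In the example above, `args.warmup_steps` is treated as a fraction of `t_total` rather than an absolute step count. The short sketch below (toy numbers and a stand-in parameter; not from the original source) shows how the resulting linear warmup/decay schedule moves the learning rate.
# Minimal sketch with assumed toy values: 100 total steps, 10% warmup, lr 2e-5.
import torch
from transformers import get_linear_schedule_with_warmup

params = [torch.nn.Parameter(torch.zeros(1))]
optimizer = torch.optim.AdamW(params, lr=2e-5)
t_total = 100                       # len(train_dataloader) * epochs in the example
warmup_steps = int(0.1 * t_total)   # warmup given as a fraction of the total steps
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=t_total)
for step in range(t_total):
    optimizer.step()
    scheduler.step()
    if step in (0, warmup_steps - 1, t_total - 1):
        # lr ramps up to 2e-5 by the end of warmup, then decays linearly towards 0
        print(step, scheduler.get_last_lr()[0])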
Example #17
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler_total = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    subset_quantity = args.div_subset

    # notice: difficulty partitioning
    curriculum_sets_temp = []

    # done: ensure the curriculum subsets actually get sampled
    diff_eval_result = Difficulty_Evaluation(args, train_dataset)
    for i,subset in enumerate(diff_eval_result):
        gate = int((len(train_dataset)/args.train_batch_size)/(subset_quantity))
        print("Subset", i, "num:", len(subset), "threshold:", gate)
        random.shuffle(subset)
        # If the subset is too small, do not subsample it
        if len(subset) > gate:
            # subset = list(subset)
            # Decide the sample length for each subset
            curriculum_sets_temp.append(subset[0:int(gate / subset_quantity)])
        # elif(len(subset) <= int(gate/subset_quantity)):
        #     for i in range(subset_quantity):
        #         curriculum_sets_temp.append(subset)
        else:
            curriculum_sets_temp.append(subset)
        # curriculum_sets_temp.append(subset)

    # Without subsampling
    # diff_eval_result = Difficulty_Evaluation(args, train_dataset)
    # for _ in range(int(args.num_train_epochs)):
    #     for i, subset in enumerate(diff_eval_result):
    #         random.shuffle(subset)
    #         curriculum_sets_temp.append(subset)


    # Random partitioning
    # curriculum_sets_temp = Difficulty_Evaluation_Randomly(args,train_dataset)

    # First add the full-data task for every epoch
    curriculum_sets = []
    total_train_dataloader = DataLoader(train_dataset, sampler=train_sampler_total, batch_size=args.train_batch_size)
    for i in range(int(args.num_train_epochs)):
        curriculum_sets.append(total_train_dataloader)

    # Then add the curriculum tasks
    # notice: order of the curriculum tasks
    curriculum_sets += curriculum_sets_temp


    # Curriculum-learning (CL) stage training

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(curriculum_sets[0]) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(curriculum_sets[0]) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    # notice: add L2 regularization (weight decay) on the optimizer
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon,weight_decay=0.01)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(curriculum_sets[0]))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(curriculum_sets[0]) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(curriculum_sets[0]) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        # epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
        epochs_trained, int(len(curriculum_sets)), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    # Added here for reproducibility
    set_seed(args)

    current_stage = 0
    for _ in train_iterator:
        epoch_iterator = tqdm(curriculum_sets[current_stage], desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            # print("batch_size",batch[0].shape)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in ["xlm", "roberta", "distilbert", "camembert", "bart", "longformer"]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                    inputs.update(
                        {"langs": (torch.ones(batch[0].shape, dtype=torch.int64) * args.lang_id).to(args.device)}
                    )

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            # notice: optionally add a KL loss or the WGAN-style Wasserstein distance term
            # pa = 100
            # loss += (pa * (cal_diff(outputs.hidden_states[0], outputs.hidden_states[-1],norm="line",criterion="wd")))

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        current_stage += 1

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
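The inner loop above combines loss scaling, gradient accumulation, clipping, and per-step scheduler updates. A standalone sketch of just the accumulation pattern (toy linear model and random data, not the original model or task) is:
# Minimal sketch (hypothetical toy model/data) of the gradient-accumulation pattern:
# scale the loss, backprop every micro-batch, but only clip/step/zero the gradients
# once every `accum_steps` micro-batches.
import torch

model = torch.nn.Linear(4, 1)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)
accum_steps, max_grad_norm = 4, 1.0
batches = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(8)]

model.zero_grad()
for step, (x, y) in enumerate(batches):
    loss = torch.nn.functional.mse_loss(model(x), y) / accum_steps
    loss.backward()                       # gradients accumulate across micro-batches
    if (step + 1) % accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()                  # one optimizer update per effective batch
        model.zero_grad()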
Example #18
def train(args, data_generator, model,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir=args.run_name)

    train_dataset = data_generator.instance_a_train_dataset()
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    criterion = nn.BCEWithLogitsLoss()

    def collate(batch):
        # if tokenizer._pad_token is None:
        #     return pad_sequence(examples, batch_first=True)
        # return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

        tokens = [b[0] for b in batch]
        features = [b[1] for b in batch]
        targets = [b[2] for b in batch]
        inputs = [b[3] for b in batch]

        lens = [len(x) for x in inputs]

        inputs = pad_sequence(inputs,
                              batch_first=True,
                              padding_value=tokenizer.pad_token_id)
        attention_mask = (inputs != tokenizer.pad_token_id).int()

        tokens, features, targets = [
            torch.tensor(x) for x in [tokens, features, targets]
        ]

        return tokens, features, targets, inputs, attention_mask, torch.tensor(
            lens).unsqueeze(1)

    if args.use_bucket_iterator:
        print("\n\n\n\n USING THE BUCKET ITERATOR \n\n\n\n")
        bucket_boundaries = [0, 20, 40, 60, 80, 101]
        train_sampler = BySequenceLengthSampler(
            train_dataset,
            bucket_boundaries,
            batch_size=args.train_batch_size,
            drop_last=False)
        train_dataloader = DataLoader(train_dataset,
                                      batch_size=1,
                                      batch_sampler=train_sampler,
                                      collate_fn=collate)
    else:
        train_sampler = RandomSampler(
            train_dataset) if args.local_rank == -1 else DistributedSampler(
                train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [{
        "params": [
            p for n, p in model.bert.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        args.weight_decay,
        "lr":
        args.learning_rate,
    }, {
        "params": [
            p for n, p in model.bert.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        0.0,
        "lr":
        args.learning_rate,
    }, {
        "params": [
            p for n, p in model.mlp_net.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        args.weight_decay,
        "lr":
        args.mlp_learning_rate,
    }, {
        "params": [
            p for n, p in model.mlp_net.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        "weight_decay":
        0.0,
        "lr":
        args.mlp_learning_rate,
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    #    # Check if saved optimizer or scheduler states exist
    # TODO    if (
    #        args.model_name_or_path
    #        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
    #        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    #    ):
    #        # Load in optimizer and scheduler states
    #        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
    #        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))
    #

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    # if args.model_name_or_path and os.path.exists(args.model_name_or_path):
    #     try:
    #         # set global_step to gobal_step of last saved checkpoint from model path
    #         checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
    #         global_step = int(checkpoint_suffix)
    #         epochs_trained = global_step // ((len(train_dataset)//args.train_batch_size) // args.gradient_accumulation_steps)
    #         steps_trained_in_current_epoch = global_step % ((len(train_dataset)//args.train_batch_size) // args.gradient_accumulation_steps)

    #         logger.info("  Continuing training from checkpoint, will skip to saved global_step")
    #         logger.info("  Continuing training from epoch %d", epochs_trained)
    #         logger.info("  Continuing training from global step %d", global_step)
    #         logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
    #     except ValueError:
    #         logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    # model_to_resize = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    # model_to_resize.resize_token_embeddings(len(tokenizer))

    if args.continue_training:
        model.load_state_dict(
            torch.load(args.continue_training_path + "model.bin"))
        optimizer.load_state_dict(
            torch.load(args.continue_training_path + "optimizer.pt"))
        scheduler.load_state_dict(
            torch.load(args.continue_training_path + "scheduler.pt"))
        print("\n loaded model/optimizer/scheduler")

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility

    evaluate(args, data_generator, tb_writer, model, tokenizer, global_step)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            # training loop
            tokens, features, targets, inputs, attention_mask, lens = batch

            tokens, features, targets, inputs, attention_mask, lens = [
                x.to(args.device) for x in
                [tokens, features, targets, inputs, attention_mask, lens]
            ]

            tokens, features, targets = [
                x.float() for x in [tokens, features, targets]
            ]

            model.train()
            logit = model(tokens, features, inputs, attention_mask, lens)
            loss = criterion(logit, targets)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if step % 100 == 0:
                tb_writer.add_scalar("training_loss", loss.item(), global_step)
                print("{}".format(loss.item()))
                with open("./train_log.txt", "a") as f:
                    f.write("{} \n".format(loss.item()))

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        # results = evaluate(args, tb_writer, model, tokenizer)
                        evaluate(args, data_generator, tb_writer, model,
                                 tokenizer, global_step)
                        # for key, value in results.items():
                        #     tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    # model_to_save = (
                    #     model.module if hasattr(model, "module") else model
                    # )  # Take care of distributed/parallel training
                    # model_to_save.save_pretrained(output_dir)
                    torch.save(model.state_dict(),
                               os.path.join(output_dir, "model.bin"))
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
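The `collate` function above pads variable-length `inputs` with the tokenizer's pad token and derives the attention mask from the padding. A self-contained sketch of just that padding step (toy token IDs, assumed pad_token_id of 0; not from the original source) is:
# Standalone sketch of dynamic padding inside a collate function.
import torch
from torch.nn.utils.rnn import pad_sequence

pad_token_id = 0
inputs = [torch.tensor([101, 7592, 102]), torch.tensor([101, 2088, 999, 102])]
padded = pad_sequence(inputs, batch_first=True, padding_value=pad_token_id)
attention_mask = (padded != pad_token_id).int()          # 1 for real tokens, 0 for padding
lens = torch.tensor([len(x) for x in inputs]).unsqueeze(1)
print(padded.shape, attention_mask, lens)                # (2, 4) padded batch, mask, lengths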
Example #19
def train(train_dataset,
          model,
          tokenizer,
          n_epochs=2,
          eval_every=2500,
          save_every=5000,
          output_folder='SQUAD_data',
          checkpoint='-1',
          bs=2,
          w_checkpoint=True,
          tensordir='runs',
          acc_steps=1):
    """ Train the model """
    tb_writer = SummaryWriter(tensordir)
    train_batch_size = bs
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=train_batch_size)

    t_total = len(train_dataloader) // acc_steps * n_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5, eps=1e-8)
    #     scheduler = get_linear_schedule_with_warmup(
    #         optimizer, num_warmup_steps=0, num_training_steps=t_total
    #     )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(output_folder, checkpoint,
                                   "optimizer.pt")):  # and os.path.isfile(
        #         os.path.join(output_folder, checkpoint, "scheduler.pt")
        #     ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(output_folder, checkpoint,
                                    "optimizer.pt")))
        #         scheduler.load_state_dict(torch.load(os.path.join(output_folder, checkpoint, "scheduler.pt")))
        print('Optimizer and scheduler found!\n')

    # Train!
    print("***** Running training *****")
    print("  Num examples = %s" % len(train_dataset))
    print("  Num Epochs = %s" % n_epochs)
    print("  Instantaneous batch size per GPU = %s" % bs)
    print("  Total optimization steps = %s" % t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(os.path.join(output_folder, checkpoint)):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            if checkpoint == '':
                t = glob.glob(os.path.join(output_folder, '*.txt'))[0]
                global_step = int(t[len(output_folder) + 1:-4])
            else:
                checkpoint_suffix = checkpoint.split("-")[-1].split("/")[0]
                global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             acc_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // acc_steps)

            print(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            print("  Continuing training from epoch %d" % epochs_trained)
            print("  Continuing training from global step %d" % global_step)
            print("  Will skip the first %d steps in the first epoch" %
                  steps_trained_in_current_epoch)
        except ValueError:
            print("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    optimizer.zero_grad()
    train_iterator = trange(epochs_trained,
                            n_epochs,
                            desc="Epoch",
                            disable=False)
    # Added here for reproducibility
    set_seed()

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=False)
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
            inputs.update({"is_impossible": batch[7]})
            if hasattr(model, "config") and hasattr(model.config, "lang2id"):
                inputs.update({
                    "langs": (torch.ones(batch[0].shape, dtype=torch.int64) *
                              args.lang_id).to(device)
                })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]
            if acc_steps > 1:
                loss = loss / acc_steps
            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % acc_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                #                 scheduler.step()  # Update learning rate schedule
                optimizer.zero_grad()
                global_step += 1

                #if global_step % 5000 == 0: drive.mount("/content/gdrive", force_remount=True)

                # Log metrics
                if (global_step % eval_every == 0) and (eval_every != -1):
                    # Only evaluate when single GPU otherwise metrics may not average well
                    output_dir = os.path.join(
                        output_folder, "checkpoint-{}".format(
                            global_step)) if w_checkpoint else output_folder
                    results = evaluate(model,
                                       tokenizer,
                                       output_dir,
                                       bs=train_batch_size)
                    for key, value in results.items():
                        tb_writer.add_scalar("eval_{}".format(key), value,
                                             global_step)
#                     tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
                    tb_writer.add_scalar("loss",
                                         (tr_loss - logging_loss) / eval_every,
                                         global_step)
                    logging_loss = tr_loss
                    with open(
                            os.path.join(
                                output_dir,
                                'results_{}.json'.format(global_step)),
                            'w') as f:
                        json.dump(results, f)

                # Save model checkpoint
                if (global_step % save_every == 0) and (save_every != -1):
                    output_dir = os.path.join(
                        output_folder, "checkpoint-{}".format(
                            global_step)) if w_checkpoint else output_folder
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    #torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    print("Saving model checkpoint to %s" % output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    #                     torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    print("Saving optimizer and scheduler states to %s" %
                          output_dir)
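                    # Track the most recent checkpoint with a <global_step>.txt marker: remove the previous marker (unless this is the first save) and write a fresh one.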
                    if checkpoint == '':
                        if global_step == save_every: pass
                        else:
                            t = glob.glob(os.path.join(output_dir, '*.txt'))[0]
                            os.remove(t)
                        with open(
                                os.path.join(output_dir,
                                             '{}.txt'.format(global_step)),
                                'w') as f:
                            f.write(' ')
        if save_every == -1:
            output_dir = os.path.join(output_folder, "checkpoint-{}".format(
                global_step)) if w_checkpoint else output_folder
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            # Take care of distributed/parallel training
            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            #torch.save(args, os.path.join(output_dir, "training_args.bin"))
            print("Saving model checkpoint to %s" % output_dir)

            torch.save(optimizer.state_dict(),
                       os.path.join(output_dir, "optimizer.pt"))
            #             torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
            print("Saving optimizer and scheduler states to %s" % output_dir)
            if checkpoint == '':
                if global_step == save_every: pass
                else:
                    t = glob.glob(os.path.join(output_dir, '*.txt'))[0]
                    os.remove(t)
                with open(
                        os.path.join(output_dir, '{}.txt'.format(global_step)),
                        'w') as f:
                    f.write('')
        if eval_every == -1:
            # Only evaluate when single GPU otherwise metrics may not average well
            output_dir = os.path.join(output_folder, "checkpoint-{}".format(
                global_step)) if w_checkpoint else output_folder
            results = evaluate(model,
                               tokenizer,
                               output_dir,
                               bs=train_batch_size)
            for key, value in results.items():
                tb_writer.add_scalar("eval_{}".format(key), value, global_step)


#             tb_writer.add_scalar("lr", scheduler.get_last_lr()[0], global_step)
            tb_writer.add_scalar("loss", (tr_loss - logging_loss) / 500,
                                 global_step)
            logging_loss = tr_loss
            with open(
                    os.path.join(output_dir,
                                 'results_{}.json'.format(global_step)),
                    'w') as f:
                json.dump(results, f)

    tb_writer.close()

    return global_step, tr_loss / global_step
Example #20
def train(args, input_qnlidata_dir, train_dataset, model, tokenizer, qnlimodel_output_path):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
         "weight_decay": args.weight_decay,
         },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0
         },
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_this_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_this_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(f"  Continuing training from checkpoint, will skip to saved global_step")
        logger.info(f"  Continuing training from epoch {epochs_trained}")
        logger.info(f"  Continuing training from global step {global_step}")
        logger.info(f"  Will skip the first {steps_trained_in_this_epoch} steps in the first epoch")

    tr_loss, logging_loss, epoch_loss = 0.0, 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0],
                            mininterval=10,
                            ncols=100)
    set_seed(args)  # Added here for reproducibility
    best_dev_performance = 0
    best_epoch = epochs_trained

    train_acc = 0.0
    for epoch, _ in enumerate(train_iterator):
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0],
                              mininterval=10,
                              ncols=100)

        train_iterator.set_description(f"train_epoch: {epoch} train_acc: {train_acc:.4f}")
        train_ids = None
        train_golds = None
        train_logits = None
        train_losses = None
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_this_epoch > 0:
                steps_trained_in_this_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
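            # Release cached GPU memory before the forward pass (guarded for torch builds without empty_cache).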
            if hasattr(torch.cuda, 'empty_cache'):
                torch.cuda.empty_cache()
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if train_logits is None:  # Keep track of training dynamics.
                train_ids = batch[4].detach().cpu().numpy()
                train_logits = outputs[1].detach().cpu().numpy()
                train_golds = inputs["labels"].detach().cpu().numpy()
                train_losses = loss.detach().cpu().numpy()
            else:
                train_ids = np.append(train_ids, batch[4].detach().cpu().numpy())
                train_logits = np.append(train_logits, outputs[1].detach().cpu().numpy(), axis=0)
                train_golds = np.append(train_golds, inputs["labels"].detach().cpu().numpy())
                train_losses = np.append(train_losses, loss.detach().cpu().numpy())

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (
                    args.local_rank in [-1, 0] and
                    args.logging_steps > 0 and
                    global_step % args.logging_steps == 0
                ):
                    epoch_log = {}
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training_epoch:
                        logger.info(f"From within the epoch at step {step}")
                        results, _ = evaluate(args, input_qnlidata_dir, qnlimodel_output_path, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            epoch_log[eval_key] = value

                    epoch_log["learning_rate"] = scheduler.get_lr()[0]
                    epoch_log["loss"] = (tr_loss - logging_loss) / args.logging_steps
                    logging_loss = tr_loss

                    for key, value in epoch_log.items():
                        tb_writer.add_scalar(key, value, global_step)
                    logger.info(json.dumps({**epoch_log, **{"step": global_step}}))

                if (
                    args.local_rank in [-1, 0] and
                    args.save_steps > 0 and
                    global_step % args.save_steps == 0
                ):
                    # Save model checkpoint
                    output_dir = os.path.join(qnlimodel_output_path, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            epoch_iterator.set_description(f"lr = {scheduler.get_lr()[0]:.8f}, "
                                           f"loss = {(tr_loss-epoch_loss)/(step+1):.4f}")
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

        #### Post epoch eval ####
        # Only evaluate when single GPU otherwise metrics may not average well
        if args.local_rank == -1 and args.evaluate_during_training:
            best_dev_performance, best_epoch = save_model(
                args, input_qnlidata_dir, model, tokenizer, epoch, best_epoch, best_dev_performance, qnlimodel_output_path)

        # Keep track of training dynamics.
        log_training_dynamics(output_dir=qnlimodel_output_path,
                              epoch=epoch,
                              train_ids=list(train_ids),
                              train_logits=list(train_logits),
                              train_golds=list(train_golds))
        train_result = compute_metrics(args.task_name, np.argmax(train_logits, axis=1), train_golds)
        train_acc = train_result["acc"]

        epoch_log = {"epoch": epoch,
                     "train_acc": train_acc,
                     "best_dev_performance": best_dev_performance,
                     "avg_batch_loss": (tr_loss - epoch_loss) / args.per_gpu_train_batch_size,
                     "learning_rate": scheduler.get_lr()[0],}
        epoch_loss = tr_loss

        logger.info(f"  End of epoch : {epoch}")
        with open(os.path.join(qnlimodel_output_path, f"eval_metrics_train.json"), "a") as toutfile:
            toutfile.write(json.dumps(epoch_log) + "\n")
        for key, value in epoch_log.items():
            tb_writer.add_scalar(key, value, global_step)
            logger.info(f"  {key}: {value:.6f}")

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break
        elif args.evaluate_during_training and epoch - best_epoch >= args.patience:
            logger.info(f"Ran out of patience. Best epoch was {best_epoch}. "
                f"Stopping training at epoch {epoch} out of {args.num_train_epochs} epochs.")
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #21
def train(args, train_dataset, model, tokenizer, teacher=None):
    """Train the model"""
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter(log_dir=args.output_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if "mask_score" in n and p.requires_grad
            ],
            "lr":
            args.mask_scores_learning_rate,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if "mask_score" not in n and p.requires_grad and not any(
                    nd in n for nd in no_decay)
            ],
            "lr":
            args.learning_rate,
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if "mask_score" not in n and p.requires_grad and any(
                    nd in n for nd in no_decay)
            ],
            "lr":
            args.learning_rate,
            "weight_decay":
            0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    # Distillation
    if teacher is not None:
        logger.info("  Training with distillation")

    global_step = 0
    # Global TopK
    if args.global_topk:
        threshold_mem = None
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
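            # Compute the current pruning threshold and regularization weight from the warmup/annealing schedule.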
            threshold, regu_lambda = schedule_threshold(
                step=global_step,
                total_step=t_total,
                warmup_steps=args.warmup_steps,
                final_threshold=args.final_threshold,
                initial_threshold=args.initial_threshold,
                final_warmup=args.final_warmup,
                initial_warmup=args.initial_warmup,
                final_lambda=args.final_lambda,
            )
            # Global TopK
            if args.global_topk:
                if threshold == 1.0:
                    threshold = -1e2  # Or an indefinitely low quantity
                else:
                    if (threshold_mem is None) or (
                            global_step % args.global_topk_frequency_compute
                            == 0):
                        # Sort all the values to get the global topK
                        concat = torch.cat([
                            param.view(-1)
                            for name, param in model.named_parameters()
                            if "mask_scores" in name
                        ])
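                        # The kth-smallest mask score becomes the global cut-off, so roughly a fraction `threshold` of scores lies above it.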
                        n = concat.numel()
                        kth = max(n - (int(n * threshold) + 1), 1)
                        threshold_mem = concat.kthvalue(kth).values.item()
                        threshold = threshold_mem
                    else:
                        threshold = threshold_mem
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type
                    in ["bert", "masked_bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids

            if "masked" in args.model_type:
                inputs["threshold"] = threshold

            outputs = model(**inputs)
            loss, logits_stu = outputs  # model outputs are always tuple in transformers (see doc)

            # Distillation loss
            if teacher is not None:
                if "token_type_ids" not in inputs:
                    inputs[
                        "token_type_ids"] = None if args.teacher_type == "xlm" else batch[
                            2]
                with torch.no_grad():
                    (logits_tea, ) = teacher(
                        input_ids=inputs["input_ids"],
                        token_type_ids=inputs["token_type_ids"],
                        attention_mask=inputs["attention_mask"],
                    )
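                # KL divergence between the temperature-softened student and teacher distributions, scaled by T^2.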

                loss_logits = nn.functional.kl_div(
                    input=nn.functional.log_softmax(
                        logits_stu / args.temperature, dim=-1),
                    target=nn.functional.softmax(logits_tea / args.temperature,
                                                 dim=-1),
                    reduction="batchmean",
                ) * (args.temperature**2)

                loss = args.alpha_distil * loss_logits + args.alpha_ce * loss

            # Regularization
            if args.regularization is not None:
                regu_ = regularization(model=model, mode=args.regularization)
                loss = loss + regu_lambda * regu_

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
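            # Step the optimizer every gradient_accumulation_steps batches, or on the last batch of an epoch shorter than the accumulation window.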
            if (step + 1) % args.gradient_accumulation_steps == 0 or (
                    # last step in epoch but step is always smaller than gradient_accumulation_steps
                    len(epoch_iterator) <= args.gradient_accumulation_steps and
                (step + 1) == len(epoch_iterator)):
                if args.fp16:
                    nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                             args.max_grad_norm)
                else:
                    nn.utils.clip_grad_norm_(model.parameters(),
                                             args.max_grad_norm)

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
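                    # Log the pruning threshold plus per-parameter weight and gradient statistics before the weights are updated.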
                    tb_writer.add_scalar("threshold", threshold, global_step)
                    for name, param in model.named_parameters():
                        if not param.requires_grad:
                            continue
                        tb_writer.add_scalar("parameter_mean/" + name,
                                             param.data.mean(), global_step)
                        tb_writer.add_scalar("parameter_std/" + name,
                                             param.data.std(), global_step)
                        tb_writer.add_scalar("parameter_min/" + name,
                                             param.data.min(), global_step)
                        tb_writer.add_scalar("parameter_max/" + name,
                                             param.data.max(), global_step)
                        tb_writer.add_scalar("grad_mean/" + name,
                                             param.grad.data.mean(),
                                             global_step)
                        tb_writer.add_scalar("grad_std/" + name,
                                             param.grad.data.std(),
                                             global_step)
                        if args.regularization is not None and "mask_scores" in name:
                            if args.regularization == "l1":
                                perc = (torch.sigmoid(param) > threshold
                                        ).sum().item() / param.numel()
                            elif args.regularization == "l0":
                                perc = (torch.sigmoid(param - 2 / 3 *
                                                      np.log(0.1 / 1.1))
                                        ).sum().item() / param.numel()
                            tb_writer.add_scalar(
                                "retained_weights_perc/" + name, perc,
                                global_step)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()
                    logs["learning_rate"] = learning_rate_scalar[0]
                    if len(learning_rate_scalar) > 1:
                        for idx, lr in enumerate(learning_rate_scalar[1:]):
                            logs[f"learning_rate/{idx+1}"] = lr
                    logs["loss"] = loss_scalar
                    if teacher is not None:
                        logs["loss/distil"] = loss_logits.item()
                    if args.regularization is not None:
                        logs["loss/regularization"] = regu_.item()
                    if (teacher is not None) or (args.regularization
                                                 is not None):
                        if (teacher is not None) and (args.regularization
                                                      is not None):
                            logs["loss/instant_ce"] = (
                                loss.item() -
                                regu_lambda * logs["loss/regularization"] -
                                args.alpha_distil *
                                logs["loss/distil"]) / args.alpha_ce
                        elif teacher is not None:
                            logs["loss/instant_ce"] = (
                                loss.item() - args.alpha_distil *
                                logs["loss/distil"]) / args.alpha_ce
                        else:
                            logs["loss/instant_ce"] = loss.item(
                            ) - regu_lambda * logs["loss/regularization"]
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #22
            scaled_loss.backward()
        if (global_step + 1) % accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            model.zero_grad()

        global_step += 1

    if (time.time() - start_time) / 3600 > 7:
        break

del examples, train_dataset, train_loader
gc.collect()

torch.save(model.state_dict(), output_model_file)
torch.save(optimizer.state_dict(), output_optimizer_file)
torch.save(amp.state_dict(), output_amp_file)

# %% [code]
print(f'trained {global_step * batch_size} samples')
print(f'training time: {(time.time() - start_time) / 3600:.1f} hours')


# %% [code]
def eval_collate_fn(
        examples: List[Example]) -> Tuple[List[torch.Tensor], List[Example]]:
    # input tokens
    max_len = max([len(example.input_ids) for example in examples])
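    # Pre-allocate padded batch arrays: token ids are padded with 0, token type ids with 1.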
    tokens = np.zeros((len(examples), max_len), dtype=np.int64)
    token_type_ids = np.ones((len(examples), max_len), dtype=np.int64)
    for i, example in enumerate(examples):
Example #23
def train(model, tokenizer, checkpoint):
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    else:
        amp = None
    # Prepare the training data
    train_data = DataBert(data_file=args.train_file,
                          doc_file=doc_file,
                          s1_length=args.s1_length,
                          s2_length=args.s2_length,
                          max_length=args.max_length,
                          tokenizer=tokenizer
                          )
    train_dataLoader = DataLoader(dataset=train_data,
                                batch_size=args.batch_size,
                                shuffle=not args.pair)

    attacked_data = AttackedData(attacked_file=args.attacked_file)  # adversarial (attacked) examples

    attack_dataloader = DataLoader(dataset=attacked_data,
                                   batch_size=args.batch_size,
                                   shuffle=False)

    print('train_data:', len(train_data))
    print('attack_data:', len(attacked_data))
    # Initialize the optimizer and scheduler
    t_total = len(train_dataLoader) * args.epochs
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    # apex
    if args.fp16:
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fptype)

    # Load optimizer and scheduler states from the checkpoint, if present
    checkpoint_dir = args.save_dir + "/checkpoint-" + str(checkpoint)
    if os.path.isfile(os.path.join(checkpoint_dir, "optimizer.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(checkpoint_dir, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(checkpoint_dir, "scheduler.pt")))
        if args.fp16:
            amp.load_state_dict(torch.load(os.path.join(checkpoint_dir, "amp.pt")))

    # Start training
    logger.debug("***** Running training *****")
    logger.debug("  Num examples = %d", len(train_dataLoader))
    logger.debug("  Num Epochs = %d", args.epochs)
    logger.debug("  Set_Batch size = %d", args.batch_size)
    logger.debug("  Real_Batch_size = %d", args.batch_size * args.accumulate)

    # No saved checkpoint: start from epoch 0; otherwise resume from the next epoch
    if checkpoint < 0:
        checkpoint = 0
    else:
        checkpoint += 1
    logger.debug("  Start Batch = %d", checkpoint)

    for epoch in range(checkpoint, args.epochs):
        model.train()
        epoch_loss = []

        step = 0
        for batch, batch_attack in tqdm(zip(train_dataLoader, attack_dataloader), desc="Iteration", total=len(train_dataLoader)):
            # Move the batch tensors to the GPU
            batch = tuple(t.to('cuda') for t in batch[:4])
            input_ids, token_type_ids, attention_mask, labels = batch

            outputs = model(input_ids=input_ids.long(),
                            token_type_ids=token_type_ids.long(),
                            attention_mask=attention_mask,
                            labels=labels)

            loss_clean = outputs[0]

            # if args.fp16:
            #     with amp.scale_loss(loss, optimizer) as scaled_loss:
            #         scaled_loss.backward()
            # else:
            #     loss.backward()  # compute the gradients

            batch_attack = tuple(t.to('cuda') for t in batch_attack)

            input_ids2, token_type_ids2, attention_mask2, labels2 = batch_attack

            outputs_attack = model(input_ids=input_ids2.long(),
                                   token_type_ids=token_type_ids2.long(),
                                   attention_mask=attention_mask2,
                                   labels=labels2)


            loss_adv = outputs_attack[0]
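            # Weight the clean and adversarial losses equally in the combined training objective.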


            loss = (0.5 * loss_clean) + (0.5 * loss_adv)

            print(loss_clean.item(), loss_adv.item())
            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            epoch_loss.append(loss.item())
            

            if step % args.accumulate == 0:
                optimizer.step()
                scheduler.step()
                model.zero_grad()
            step += 1

        # Save a checkpoint at the end of each epoch

        output_dir = args.save_dir + "/checkpoint-" + str(epoch)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = (model.module if hasattr(model, "module") else model)
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
        torch.save(args, os.path.join(output_dir, "training_args.bin"))
        logger.debug("Saving model checkpoint to %s", output_dir)
        if args.fp16:
            torch.save(amp.state_dict(), os.path.join(output_dir, "amp.pt"))
        torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
        torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
        logger.debug("Saving optimizer and scheduler states to %s", output_dir)

        # eval dev
        eval_loss, eval_map, eval_mrr = evaluate(model, tokenizer, eval_file=args.dev_file,
                                                checkpoint=epoch,
                                                output_dir=output_dir)
        # eval test
        test_eval_loss, test_eval_map, test_eval_mrr = evaluate(model, tokenizer, eval_file=args.test_file,
                                                                checkpoint=epoch,
                                                                output_dir=output_dir)

        # Log and persist the DEV/TEST results for this epoch
        logger.info('[DEV ] Train Epoch %d: train_loss=%.4f, map=%.4f, mrr=%.4f' % (
            epoch, np.array(epoch_loss).mean(), eval_map, eval_mrr))
        logger.info('[TEST] Train Epoch %d: train_loss=%.4f, map=%.4f, mrr=%.4f' % (
            epoch, np.array(epoch_loss).mean(), test_eval_map, test_eval_mrr))
Example #24
File: train.py Project: mutiann/ccc
def train(args):
    if args.model_path is None:
        msg = 'Prepare for new run ...'
        output_dir = os.path.join(
            args.log_dir, args.run_name + '_' +
            datetime.datetime.now().strftime('%m%d_%H%M'))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        ckpt_dir = os.path.join(
            args.ckpt_dir, args.run_name + '_' +
            datetime.datetime.now().strftime('%m%d_%H%M'))
        if not os.path.exists(ckpt_dir):
            os.makedirs(ckpt_dir)
    else:
        msg = 'Restart previous run ...\nlogs to save to %s, ckpt to save to %s, model to load from %s' % \
                     (args.log_dir, args.ckpt_dir, args.model_path)
        output_dir = args.log_dir
        ckpt_dir = args.ckpt_dir
        if not os.path.isdir(output_dir):
            print('Invalid log dir: %s' % output_dir)
            return
        if not os.path.isdir(ckpt_dir):
            print('Invalid ckpt dir: %s' % ckpt_dir)
            return

    set_logger(os.path.join(output_dir, 'outputs.log'))
    logging.info(msg)

    global device
    if args.device is not None:
        logging.info('Setting device to ' + args.device)
        device = torch.device(args.device)
    else:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    logging.info('Setting up...')
    hparams.parse(args.hparams)
    logging.info(hparams_debug_string())

    model = EdgeClassification

    if hparams.use_roberta:
        logging.info('Using Roberta...')
        model = RobertaEdgeClassification

    global_step = 0

    if args.model_path is None:
        if hparams.load_pretrained:
            logging.info('Load online pretrained model...' + (
                ('cached at ' +
                 args.cache_path) if args.cache_path is not None else ''))
            if hparams.use_roberta:
                model = model.from_pretrained('roberta-base',
                                              cache_dir=args.cache_path,
                                              hparams=hparams)
            else:
                model = model.from_pretrained('bert-base-uncased',
                                              cache_dir=args.cache_path,
                                              hparams=hparams)
        else:
            logging.info('Build model from scratch...')
            if hparams.use_roberta:
                config = RobertaConfig.from_pretrained('roberta-base')
            else:
                config = BertConfig.from_pretrained('bert-base-uncased')
            model = model(config=config, hparams=hparams)
    else:
        if not os.path.isdir(args.model_path):
            raise OSError(str(args.model_path) + ' not found')
        logging.info('Load saved model from %s ...' % (args.model_path))
        model = model.from_pretrained(args.model_path, hparams=hparams)
        step = args.model_path.split('_')[-1]
        if step.isnumeric():
            global_step = int(step)
            logging.info('Initial step=%d' % global_step)

    if hparams.use_roberta:
        tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
    else:
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    hparams.parse(args.hparams)
    logging.info(hparams_debug_string())

    if hparams.text_sample_eval:
        if args.eval_text_path is None:
            raise ValueError('eval_text_path not given')
        if ':' not in args.eval_text_path:
            eval_data_paths = [args.eval_text_path]
        else:
            eval_data_paths = args.eval_text_path.split(':')
        eval_feeder = []
        for p in eval_data_paths:
            name = os.path.split(p)[-1]
            if name.endswith('.tsv'):
                name = name[:-4]
            eval_feeder.append(
                (name, ExternalTextFeeder(p, hparams, tokenizer, 'dev')))
    else:
        eval_feeder = [('', DataFeeder(args.data_dir, hparams, tokenizer,
                                       'dev'))]

    tb_writer = SummaryWriter(output_dir)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        hparams.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=hparams.learning_rate,
                      eps=hparams.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=hparams.warmup_steps,
        lr_decay_step=hparams.lr_decay_step,
        max_lr_decay_rate=hparams.max_lr_decay_rate)

    acc_step = global_step * hparams.gradient_accumulation_steps
    time_window = ValueWindow()
    loss_window = ValueWindow()
    acc_window = ValueWindow()
    model.to(device)
    model.zero_grad()
    tr_loss = tr_acc = 0.0
    start_time = time.time()

    if args.model_path is not None:
        logging.info('Load saved model from %s ...' % (args.model_path))
        if os.path.exists(os.path.join(args.model_path, 'optimizer.pt')) \
                and os.path.exists(os.path.join(args.model_path, 'scheduler.pt')):
            optimizer.load_state_dict(
                torch.load(os.path.join(args.model_path, 'optimizer.pt')))
            scheduler.load_state_dict(
                torch.load(os.path.join(args.model_path, 'scheduler.pt')))
        else:
            logging.warning('Could not find saved optimizer/scheduler')

    if global_step > 0:
        logs = run_eval(args, model, eval_feeder)
        for key, value in logs.items():
            tb_writer.add_scalar(key, value, global_step)

    logging.info('Start training...')
    if hparams.text_sample_train:
        train_feeder = PrebuiltTrainFeeder(args.train_text_path, hparams,
                                           tokenizer, 'train')
    else:
        train_feeder = DataFeeder(args.data_dir, hparams, tokenizer, 'train')

    while True:
        batch = train_feeder.next_batch()
        model.train()

        outputs = model(input_ids=batch.input_ids.to(device),
                        attention_mask=batch.input_mask.to(device),
                        token_type_ids=None if batch.token_type_ids is None
                        else batch.token_type_ids.to(device),
                        labels=batch.labels.to(device))
        loss = outputs['loss']
        preds = outputs['preds']

        acc = torch.mean((preds.cpu() == batch.labels).float())
        preds = preds.cpu().detach().numpy()
        labels = batch.labels.detach().numpy()
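        # Per-class recall: t_acc over positive-labelled examples, f_acc over negative-labelled examples.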
        t_acc = np.sum(np.logical_and(preds == 1, labels
                                      == 1)) / np.sum(labels == 1)
        f_acc = np.sum(np.logical_and(preds == 0, labels
                                      == 0)) / np.sum(labels == 0)

        if hparams.gradient_accumulation_steps > 1:
            loss = loss / hparams.gradient_accumulation_steps
            acc = acc / hparams.gradient_accumulation_steps

        tr_loss += loss.item()
        tr_acc += acc.item()
        loss.backward()
        acc_step += 1

        if acc_step % hparams.gradient_accumulation_steps != 0:
            continue

        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                       hparams.max_grad_norm)
        optimizer.step()
        scheduler.step(None)
        model.zero_grad()
        global_step += 1

        step_time = time.time() - start_time
        time_window.append(step_time)
        loss_window.append(tr_loss)
        acc_window.append(tr_acc)

        if global_step % args.save_steps == 0:
            # Save model checkpoint
            model_to_save = model.module if hasattr(model, 'module') else model
            cur_ckpt_dir = os.path.join(ckpt_dir,
                                        'checkpoint_%d' % (global_step))
            if not os.path.exists(cur_ckpt_dir):
                os.makedirs(cur_ckpt_dir)
            model_to_save.save_pretrained(cur_ckpt_dir)
            torch.save(args, os.path.join(cur_ckpt_dir, 'training_args.bin'))
            torch.save(optimizer.state_dict(),
                       os.path.join(cur_ckpt_dir, 'optimizer.pt'))
            torch.save(scheduler.state_dict(),
                       os.path.join(cur_ckpt_dir, 'scheduler.pt'))
            logging.info("Saving model checkpoint to %s", cur_ckpt_dir)

        if global_step % args.logging_steps == 0:
            logs = run_eval(args, model, eval_feeder)

            learning_rate_scalar = scheduler.get_lr()[0]
            logs['learning_rate'] = learning_rate_scalar
            logs['loss'] = loss_window.average
            logs['acc'] = acc_window.average

            for key, value in logs.items():
                tb_writer.add_scalar(key, value, global_step)

        message = 'Step %-7d [%.03f sec/step, loss=%.05f, avg_loss=%.05f, acc=%.05f, avg_acc=%.05f, t_acc=%.05f, f_acc=%.05f]' % (
            global_step, step_time, tr_loss, loss_window.average, tr_acc,
            acc_window.average, t_acc, f_acc)
        logging.info(message)
        tr_loss = tr_acc = 0.0
        start_time = time.time()
Example #25
def train(args, train_dataset, model, tokenizer):
    """ Train the model """

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    tr_loss, logging_loss = 0.0, 0.0
    tr_acc, logging_acc = 0.0, 0.0
    best_eval_acc = 0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for epoch in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              position=0,
                              leave=True,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[7],
            }

            outputs = model(**inputs)
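            # Compute batch accuracy from the argmax of the logits for the running training metrics.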
            loss, logits = outputs[:2]
            logits = logits.detach().cpu().numpy()
            preds = np.argmax(logits, axis=1)
            gt = inputs["labels"].detach().cpu().numpy()
            acc = (preds == gt).mean()

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            tr_acc += acc

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.evaluate_during_training:
                        logger.info(
                            "Validation start for epoch {}".format(epoch))
                        eval_loss, eval_acc = evaluate(args,
                                                       model,
                                                       tokenizer,
                                                       prefix=epoch)
                        is_best = eval_acc > best_eval_acc
                        best_eval_acc = max(eval_acc, best_eval_acc)

                        current_loss = (tr_loss -
                                        logging_loss) / args.logging_steps
                        logging_loss = tr_loss
                        current_acc = (tr_acc -
                                       logging_acc) / args.logging_steps
                        logging_acc = tr_acc

                        logger.info(
                            "best_eval_acc = {}, eval_acc = {}, eval_loss = {}, acc = {}, loss = {}, global_step = {}, " \
                            .format(best_eval_acc, eval_acc, eval_loss, current_acc, current_loss, global_step))
                        if IS_ON_NSML:
                            nsml.report(summary=True,
                                        step=global_step,
                                        eval_acc=eval_acc,
                                        eval_loss=eval_loss,
                                        acc=current_acc,
                                        loss=current_loss)
                            if is_best:
                                nsml.save(args.model_type + "_best")

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    if IS_ON_NSML:
                        nsml.save(args.model_type +
                                  "_gs{}_e{}".format(global_step, epoch))
                    else:
                        output_dir = os.path.join(
                            args.output_dir,
                            "checkpoint-{}".format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Take care of distributed/parallel training
                        model_to_save = model.module if hasattr(
                            model, "module") else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)

                        torch.save(
                            args, os.path.join(output_dir,
                                               "training_args.bin"))
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)

                        torch.save(optimizer.state_dict(),
                                   os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(),
                                   os.path.join(output_dir, "scheduler.pt"))
                        logger.info(
                            "Saving optimizer and scheduler states to %s",
                            output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break

        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if IS_ON_NSML:
        nsml.save(args.model_type + "_last")

    return global_step, tr_loss / global_step
Example #26
def train(args, train_dataset, model, tokenizer, fh, pool):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        args.tensorboard_dir = os.path.join(args.output_dir, 'tensorboard')
        if not os.path.exists(args.tensorboard_dir):
            os.makedirs(args.tensorboard_dir)
        tb_writer = SummaryWriter(args.tensorboard_dir)

    args.batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.batch_size,
                                  drop_last=True)
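    # Effective number of training examples and the global batch size across
    # all distributed workers (world size is 1 for single-process runs).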
    total_examples = len(train_dataset) * (torch.distributed.get_world_size()
                                           if args.local_rank != -1 else 1)
    batch_size = args.batch_size * args.gradient_accumulation_steps * (
        torch.distributed.get_world_size() if args.local_rank != -1 else 1)
    # if args.max_steps > 0:
    #     t_total = args.max_steps
    #     args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    if args.num_train_epochs > 0:
        t_total = total_examples // batch_size * args.num_train_epochs
    args.max_steps = t_total
    model.to(args.device)
    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier()
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last')
    scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt')
    optimizer_last = os.path.join(checkpoint_last, 'optimizer.pt')
    if os.path.exists(scheduler_last):
        scheduler.load_state_dict(
            torch.load(scheduler_last, map_location="cpu"))
    if os.path.exists(optimizer_last):
        optimizer.load_state_dict(
            torch.load(optimizer_last, map_location="cpu"))
    if args.local_rank == 0:
        torch.distributed.barrier()
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank % args.gpu_per_node],
            output_device=args.local_rank % args.gpu_per_node,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", total_examples)
    logger.info("  Num epoch = %d", t_total * batch_size // total_examples)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = args.start_step
    tr_loss, logging_loss, avg_loss, tr_nb = 0.0, 0.0, 0.0, 0
    # model.resize_token_embeddings(len(tokenizer))
    model.zero_grad()
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)

    best_bleu = 0.0

    for idx in range(args.start_epoch, int(args.num_train_epochs)):
        for step, (batch, token_labels) in enumerate(train_dataloader):
            inputs = batch.to(args.device)
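            # token_labels encodes per-token roles: non-zero positions are
            # attended to, and positions equal to 2 are the targets that
            # contribute to the loss below.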
            attn_mask = (token_labels != 0).to(dtype=torch.uint8,
                                               device=args.device)
            loss_mask = (token_labels == 2).to(dtype=torch.uint8,
                                               device=args.device)
            model.train()
            # outputs = model(inputs, attention_mask=attn_mask, labels=inputs, loss_mask=loss_mask)
            # loss = outputs[0]
            outputs = model(inputs, attention_mask=attn_mask)
            logits = outputs[0]
            labels = inputs
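            # Shift so that tokens < n predict token n; the loss is then
            # restricted to the positions selected by loss_mask.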
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            # Flatten the tokens
            loss_fct = CrossEntropyLoss()
            flatten_shift_loss_mask = loss_mask[..., :-1].contiguous().view(-1)
            ids = torch.nonzero(flatten_shift_loss_mask).view(-1)
            loss = loss_fct(
                shift_logits.view(-1, shift_logits.size(-1))[ids],
                shift_labels.view(-1)[ids])

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1
                output_flag = True
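                # Perplexity of the loss accumulated since the last logging
                # point (tr_nb holds the global step recorded at that point).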
                avg_loss = round(
                    np.exp((tr_loss - logging_loss) / (global_step - tr_nb)),
                    4)
                if global_step % args.logging_steps == 0:
                    logger.info("  steps: %s  ppl: %s", global_step,
                                round(avg_loss, 5))
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    tb_writer.add_scalar('lr',
                                         scheduler.get_last_lr()[0],
                                         global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss
                    tr_nb = global_step

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    if args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        # results = evaluate(args, model, tokenizer, eval_when_training=True)
                        # for key, value in results.items():
                        #     tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                        #     logger.info("  %s = %s", key, round(value,4))
                        # output_dir = os.path.join(args.output_dir, '{}-{}-{}'.format(checkpoint_prefix, global_step, round(results['perplexity'],4)))
                        dev_bleu, dev_EM = eval_bleu(args,
                                                     model,
                                                     tokenizer,
                                                     file_type='dev',
                                                     num=100)
                        logger.info(f"dev bleu: {dev_bleu}, dev EM: {dev_EM}")
                        output_dir = os.path.join(
                            args.output_dir,
                            '{}-{}-{}'.format(checkpoint_prefix, global_step,
                                              round(dev_bleu, 2)))
                        if dev_bleu > best_bleu:
                            best_bleu = dev_bleu
                            logger.info(
                                f"best bleu updated. saved in {output_dir}")
                            logger.info(f"best bleu: {best_bleu}")
                    else:
                        output_dir = os.path.join(
                            args.output_dir,
                            "{}-{}".format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    # _rotate_checkpoints(args, checkpoint_prefix)
                    last_output_dir = os.path.join(args.output_dir,
                                                   'checkpoint-last')
                    if not os.path.exists(last_output_dir):
                        os.makedirs(last_output_dir)
                    model_to_save.save_pretrained(last_output_dir)
                    tokenizer.save_pretrained(last_output_dir)
                    idx_file = os.path.join(last_output_dir, 'idx_file.txt')
                    with open(idx_file, 'w', encoding='utf-8') as idxf:
                        idxf.write(str(0) + '\n')

                    torch.save(optimizer.state_dict(),
                               os.path.join(last_output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(last_output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                last_output_dir)

                    step_file = os.path.join(last_output_dir, 'step_file.txt')
                    with open(step_file, 'w', encoding='utf-8') as stepf:
                        stepf.write(str(global_step) + '\n')

                    # torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    # torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    # logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
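Example #27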
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    #scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)  #choose schedule here
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    if args.load_states:
        # os.walk yields (dirpath, dirnames, filenames); entry [1] is the
        # first subdirectory of output_dir, assumed to hold checkp.pth.
        folder = [x[0] for x in os.walk(args.output_dir)][1]
        print(folder)
        states = torch.load(os.path.join(folder, "checkp.pth"))
        optimizer.load_state_dict(states["optimizer"])
        scheduler.load_state_dict(states["scheduler"])
        global_step = states["step"]
        print("optimizer and scheduler were loaded successfully")
        del states

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    if not args.load_states:
        global_step = 0

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    counterL = 1
    lossStack = 0
    lossHistory = []
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
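            # For MLM, mask_tokens corrupts a subset of tokens and returns the
            # corresponding labels; for causal LM the inputs serve as their
            # own labels.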
            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            counterL += 1
            if counterL % 100 == 0:
                lossHistory.append(lossStack)
                lossStack = 0
                counterL = 1
            else:
                lossStack += loss.item()

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        '{}-{}'.format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)

                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)
                    _rotate_checkpoints(args, checkpoint_prefix)

                    checkpoint = {
                        "lossHistory": lossHistory,
                        'step': global_step,
                        'scheduler': scheduler.state_dict(),
                        'optimizer': optimizer.state_dict()
                    }
                    torch.save(checkpoint,
                               os.path.join(output_dir, 'checkp.pth'))

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #28
        #         checkpoints instead.
        if save_flag:
            torch.save({"epoch": epoch,
                        "models": model.state_dict(),
                        "best_score": best_score,
                        "epochs_count": epochs_count,
                        "train_losses": train_losses,
                        "valid_losses": valid_losses},
                       os.path.join(target_dir, "best.pth.tar"))

    # Save the models at each epoch.
    if save_flag:
        torch.save({"epoch": epoch,
                    "models": model.state_dict(),
                    "best_score": best_score,
                    "optimizer": optimizer.state_dict(),
                    "epochs_count": epochs_count,
                    "train_losses": train_losses,
                    "valid_losses": valid_losses},
                   os.path.join(target_dir, "esim_{}.pth.tar".format(epoch)))

    if patience_counter >= patience:
        print("-> Early stopping: patience limit reached, stopping...")
        break


plt.figure()
plt.plot(epochs_count, train_losses, "-r")
plt.plot(epochs_count, valid_losses, "-b")
plt.xlabel("epoch")
plt.ylabel("loss")
plt.legend(["Training loss", "Validation loss"])
plt.show()
Example #29
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
        os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {"input_ids": batch[0], "attention_mask": batch[1], "labels": batch[3]}
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #30
def train(args):
    # Init device
    n_gpu = torch.cuda.device_count()
    if n_gpu == 0:
        warnings.warn('No GPU detected. Training on CPU will be very slow')
    elif n_gpu > 1:
        warnings.warn('This codebase is not optimized for multi GPU usage')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Lambda for filenames
    example_tag_to_fp = lambda tag: os.path.join(args.examples_dir, '{}.pkl'.
                                                 format(tag))
    out_fn_to_fp = lambda fn: os.path.join(args.train_dir, fn)

    # Create training dir
    os.makedirs(args.train_dir, exist_ok=True)
    resuming = os.path.exists(out_fn_to_fp('step.pkl'))

    # Create tokenizer
    tokenizer = ilm.tokenize_util.Tokenizer[args.tokenizer_name.upper()]
    if tokenizer == ilm.tokenize_util.Tokenizer.CUSTOM:
        ilm.tokenize_util.set_custom_vocab_fp(args.tokenizer_custom_vocab_fp)

    # Update tokenizer
    base_vocab_size = ilm.tokenize_util.vocab_size(tokenizer)
    start_infill_id = base_vocab_size + 0
    end_infill_id = base_vocab_size + 1
    additional_ids_to_tokens = {
        start_infill_id: '<|startofinfill|>',
        end_infill_id: '<|endofinfill|>'
    }
    mask_cls = ilm.mask.util.mask_cls_str_to_type(args.mask_cls)
    mask_types = mask_cls.mask_types()
    mask_type_to_id = {}
    for i, t in enumerate(mask_types):
        t_id = base_vocab_size + 2 + i
        t_tok = '<|infill_{}|>'.format(mask_cls.mask_type_serialize(t))
        additional_ids_to_tokens[t_id] = t_tok
        mask_type_to_id[t] = t_id
    print(additional_ids_to_tokens)
    vocab_size = ilm.tokenize_util.update_tokenizer(additional_ids_to_tokens,
                                                    tokenizer)
    with open(out_fn_to_fp('additional_ids_to_tokens.pkl'), 'wb') as f:
        pickle.dump(additional_ids_to_tokens, f)

    # Load training data
    if not args.eval_only:
        print('Loading training data')
        loaded_from_cache = False
        if args.data_cache:
            try:
                train_inputs = np.load(out_fn_to_fp('train_inp.npy'))
                train_tts = np.load(out_fn_to_fp('train_tts.npy'))
                with open(out_fn_to_fp('train_num_docs.pkl'), 'rb') as f:
                    train_num_docs = pickle.load(f)
                loaded_from_cache = True
            except Exception:
                # Cache files missing or unreadable; rebuild the arrays below.
                pass
        if not loaded_from_cache:
            train_inputs, train_tts, train_num_docs = masked_dataset_to_inputs_and_tts(
                'train', tokenizer, start_infill_id, end_infill_id,
                mask_type_to_id, args)
            if args.data_cache:
                np.save(out_fn_to_fp('train_inp.npy'), train_inputs)
                np.save(out_fn_to_fp('train_tts.npy'), train_tts)
                with open(out_fn_to_fp('train_num_docs.pkl'), 'wb') as f:
                    pickle.dump(train_num_docs, f)
        train_tt_to_count = {
            TargetType(k): v
            for k, v in zip(*np.unique(train_tts, return_counts=True))
        }
        print(train_tt_to_count)
        num_unmasked = train_tt_to_count.get(TargetType.CONTEXT, 0)
        num_masked = train_tt_to_count.get(TargetType.INFILL, 0)
        print('Mask rate (tokens): {:.4f}'.format(num_masked /
                                                  (num_unmasked + num_masked)))
        print('{} documents, {} examples'.format(train_num_docs,
                                                 train_inputs.shape[0]))
        print(train_inputs.shape, train_inputs.dtype, train_tts.shape,
              train_tts.dtype)
        train_data = TensorDataset(
            torch.from_numpy(train_inputs.astype(np.int64)),
            torch.from_numpy(train_tts))
        del train_inputs
        del train_tts

    # Load eval data
    print('Loading eval data')
    loaded_from_cache = False
    if args.data_cache:
        try:
            eval_inputs = np.load(out_fn_to_fp('eval_inp.npy'))
            eval_tts = np.load(out_fn_to_fp('eval_tts.npy'))
            with open(out_fn_to_fp('eval_num_docs.pkl'), 'rb') as f:
                eval_num_docs = pickle.load(f)
            loaded_from_cache = True
        except Exception:
            # Cache files missing or unreadable; rebuild the arrays below.
            pass
    if not loaded_from_cache:
        eval_inputs, eval_tts, eval_num_docs = masked_dataset_to_inputs_and_tts(
            'eval', tokenizer, start_infill_id, end_infill_id, mask_type_to_id,
            args)
        if args.data_cache:
            np.save(out_fn_to_fp('eval_inp.npy'), eval_inputs)
            np.save(out_fn_to_fp('eval_tts.npy'), eval_tts)
            with open(out_fn_to_fp('eval_num_docs.pkl'), 'wb') as f:
                pickle.dump(eval_num_docs, f)
    eval_tt_to_count = {
        TargetType(k): v
        for k, v in zip(*np.unique(eval_tts, return_counts=True))
    }
    print(eval_tt_to_count)
    num_unmasked = eval_tt_to_count.get(TargetType.CONTEXT, 0)
    num_masked = eval_tt_to_count.get(TargetType.INFILL, 0)
    print('Mask rate (tokens): {:.4f}'.format(num_masked /
                                              (num_unmasked + num_masked)))
    print('{} documents, {} examples'.format(eval_num_docs,
                                             eval_inputs.shape[0]))
    print(eval_inputs.shape, eval_inputs.dtype, eval_tts.shape, eval_tts.dtype)
    eval_data = TensorDataset(torch.from_numpy(eval_inputs.astype(np.int64)),
                              torch.from_numpy(eval_tts))
    del eval_inputs
    del eval_tts

    # Calculate number of steps to train for (return if we're just pre-cacheing data)
    if args.train_num_epochs is not None:
        train_num_batches = int(
            float(train_num_docs * args.train_num_epochs) /
            args.train_batch_size)
        if train_num_batches == 0:
            return
        print('Maximum number of training steps: {}'.format(
            train_num_batches / args.train_batch_accumulation))

    # Create data iterators
    print('Creating datasets')
    if not args.eval_only:
        train_sampler = RandomSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size,
                                      drop_last=True)
    eval_sampler = SequentialSampler(eval_data)
    eval_dataloader = DataLoader(eval_data,
                                 sampler=eval_sampler,
                                 batch_size=args.eval_batch_size,
                                 drop_last=True)

    # Load model
    print('Initializing model...')
    set_random_seed(args.seed)
    if args.model_name in ilm.constants.GPT2_MODEL_NAMES:
        model_type = GPT2LMHeadModel
        cfg_type = GPT2Config
    if resuming:
        print('from saved checkpoint (resuming)')
        model = model_type.from_pretrained(args.train_dir)
    else:
        if args.train_from_scratch:
            print('from scratch')
            cfg = cfg_type.from_pretrained(args.model_name)
            model = model_type(cfg)
        else:
            print('from pretrained checkpoint')
            model = model_type.from_pretrained('data/gpt-2-pytorch')
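    # Grow the embedding matrix so the newly added infill special tokens get
    # trainable embeddings.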
    model.resize_token_embeddings(vocab_size)
    model.to(device)
    model.train()

    # Reset random seed in case model init triggered RNG
    set_random_seed(args.seed)

    # Initialize optimizers
    if not args.eval_only:
        params = list(model.named_parameters())
        no_decay = ['bias', 'ln']
        optimizer_grouped_parameters = [{
            'params':
            [p for n, p in params if not any(nd in n for nd in no_decay)],
            'weight_decay':
            args.train_weight_decay
        }, {
            'params':
            [p for n, p in params if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        }]
        optimizer = AdamW(optimizer_grouped_parameters,
                          lr=args.train_learning_rate,
                          eps=args.train_adam_epsilon)
        if resuming:
            optimizer.load_state_dict(torch.load(out_fn_to_fp('optimizer.pt')))

    # Create global step
    if resuming:
        try:
            with open(out_fn_to_fp('step.pkl'), 'rb') as f:
                step = pickle.load(f)
        except Exception as e:
            if args.eval_only:
                step = None
            else:
                raise e
    else:
        step = 0

    if args.eval_only:
        print('Evaluating')
        model.eval()

        eval_start = time.time()
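        # Accumulate token counts and summed losses per target type so the
        # loss can be averaged per token over the whole eval set.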
        eval_token_counts = defaultdict(int)
        eval_token_loss_sums = defaultdict(float)
        for i, eval_batch in enumerate(eval_dataloader):
            with torch.no_grad():
                eval_inputs, eval_tts = tuple(t.to(device) for t in eval_batch)
                eval_logits, _ = model(eval_inputs)
                eval_logits_relevant = eval_logits[:, :-1].contiguous().view(
                    -1, eval_logits.shape[-1])

                for tag, tts in [
                    ('context', [TargetType.CONTEXT]),
                    ('infill', [TargetType.INFILL, TargetType.INFILL_SPECIAL]),
                    ('infill_textonly', [TargetType.INFILL])
                ]:
                    eval_labels = tts_to_labels(eval_inputs, eval_tts, tts)
                    eval_labels_relevant = eval_labels[:, 1:]
                    eval_labels_relevant_count = (eval_labels_relevant !=
                                                  -1).long().sum().item()
                    eval_labels_loss = F.cross_entropy(
                        eval_logits_relevant,
                        eval_labels_relevant.contiguous().view(-1),
                        ignore_index=-1).item()
                    eval_token_counts[tag] += eval_labels_relevant_count
                    eval_token_loss_sums[
                        tag] += eval_labels_loss * eval_labels_relevant_count

        eval_dict = {}
        for tag, count in eval_token_counts.items():
            loss = eval_token_loss_sums[tag]
            if count > 0:
                loss /= count
            eval_dict['eval_{}_count'.format(tag)] = count
            eval_dict['eval_{}_loss'.format(tag)] = loss
            eval_dict['eval_{}_ppl'.format(tag)] = np.exp(loss)
        eval_dict['eval_time'] = time.time() - eval_start

        print('-' * 80)
        if step is not None:
            print('(Step {}) Eval'.format(step))
        for k, v in eval_dict.items():
            print('{}: {}'.format(k, v))
        if args.wandb:
            wandb.log(eval_dict, step=step)

    else:
        print('Training')
        set_random_seed(args.seed)
        best_eval_loss = None
        num_save = -1
        num_summary = -1
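        # When resuming, recover how many raw batches correspond to the saved
        # optimizer step count.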
        num_batches_complete = step * args.train_batch_accumulation
        start = time.time()
        while True:
            if args.train_num_epochs is not None and num_batches_complete >= train_num_batches:
                break

            for batch in train_dataloader:
                if args.train_num_epochs is not None and num_batches_complete >= train_num_batches:
                    break

                elapsed = time.time() - start

                # Evaluate
                if int(elapsed / args.train_eval_secs) > num_save:
                    num_save = int(elapsed / args.train_eval_secs)

                    model.eval()

                    eval_start = time.time()
                    eval_token_counts = defaultdict(int)
                    eval_token_loss_sums = defaultdict(float)
                    for i, eval_batch in enumerate(eval_dataloader):
                        with torch.no_grad():
                            eval_inputs, eval_tts = tuple(
                                t.to(device) for t in eval_batch)
                            eval_logits, _ = model(eval_inputs)
                            eval_logits_relevant = eval_logits[:, :-1].contiguous().view(
                                -1, eval_logits.shape[-1])

                            for tag, tts in [
                                ('context', [TargetType.CONTEXT]),
                                ('infill', [TargetType.INFILL, TargetType.INFILL_SPECIAL]),
                                ('infill_textonly', [TargetType.INFILL])
                            ]:
                                eval_labels = tts_to_labels(
                                    eval_inputs, eval_tts, tts)
                                eval_labels_relevant = eval_labels[:, 1:]
                                eval_labels_relevant_count = (
                                    eval_labels_relevant != -1).long().sum().item()
                                eval_labels_loss = F.cross_entropy(
                                    eval_logits_relevant,
                                    eval_labels_relevant.contiguous().view(-1),
                                    ignore_index=-1).item()
                                eval_token_counts[tag] += eval_labels_relevant_count
                                eval_token_loss_sums[
                                    tag] += eval_labels_loss * eval_labels_relevant_count

                    eval_dict = {}
                    for tag, count in eval_token_counts.items():
                        loss = eval_token_loss_sums[tag]
                        if count > 0:
                            loss /= count
                        eval_dict['eval_{}_count'.format(tag)] = count
                        eval_dict['eval_{}_loss'.format(tag)] = loss
                    eval_dict['eval_time'] = time.time() - eval_start

                    print('-' * 80)
                    print('(Step {}) Eval'.format(step))
                    for k, v in eval_dict.items():
                        print('{}: {}'.format(k, v))
                    if args.wandb:
                        wandb.log(eval_dict, step=step)

                    if best_eval_loss is None or eval_dict[
                            'eval_infill_loss'] < best_eval_loss:
                        print('Saving')
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        model_to_save.config.to_json_file(
                            out_fn_to_fp(CONFIG_NAME))
                        torch.save(model_to_save.state_dict(),
                                   out_fn_to_fp(WEIGHTS_NAME))
                        torch.save(optimizer.state_dict(),
                                   out_fn_to_fp('optimizer.pt'))
                        with open(out_fn_to_fp('step.pkl'), 'wb') as f:
                            pickle.dump(step, f)
                        best_eval_loss = eval_dict['eval_infill_loss']

                    model.train()

                # Train
                inputs, tts = tuple(t.to(device) for t in batch)
                # TODO: Option to train on CONTEXT_SPECIAL?
                labels_context = tts_to_labels(inputs, tts,
                                               [TargetType.CONTEXT])
                # TODO: Option to skip training on INFILL_REDUNDANT?
                # NOTE: This would give Task.NAIVE/Task.LM less supervision overall but put them more in line with the supervision that Task.ILM and Task.NO_CONTEXT_ILM receive
                labels_infill = tts_to_labels(inputs, tts, [
                    TargetType.INFILL, TargetType.INFILL_SPECIAL,
                    TargetType.INFILL_REDUNDANT
                ])
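                # Two cross-entropy losses over the same logits: one for
                # context tokens and one for infill tokens; ignore_index=-1
                # skips positions outside the respective label set.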
                logits, _ = model(inputs)
                logits_relevant = logits[:, :-1].contiguous().view(
                    -1, logits.shape[-1])
                loss_context = F.cross_entropy(
                    logits_relevant,
                    labels_context[:, 1:].contiguous().view(-1),
                    ignore_index=-1)
                loss_infill = F.cross_entropy(
                    logits_relevant,
                    labels_infill[:, 1:].contiguous().view(-1),
                    ignore_index=-1)

                loss_context_item = loss_context.item()
                loss_infill_item = loss_infill.item()

                loss = loss_infill
                if args.train_context:
                    loss += loss_context

                if args.train_batch_accumulation != 1:
                    loss /= float(args.train_batch_accumulation)
                loss.backward()

                # Summarize
                if int(elapsed / args.train_summary_secs) > num_summary:
                    num_summary = int(elapsed / args.train_summary_secs)

                    print('-' * 80)
                    print('(Step {}) Summary'.format(step))
                    print(loss_context_item)
                    print(loss_infill_item)
                    with torch.no_grad():
                        for t in inputs, labels_context, labels_infill:
                            t0 = list(t[0].cpu().numpy())
                            print('-' * 40)
                            print(t0)
                        for t in inputs, labels_context, labels_infill:
                            t0 = list(t[0].cpu().numpy())
                            print('-' * 40)
                            print(
                                ilm.tokenize_util.decode(
                                    [0 if t == -1 else t for t in t0],
                                    tokenizer))

                    if args.wandb:
                        wandb.log(
                            {
                                'loss_context': loss_context_item,
                                'loss_infill': loss_infill_item,
                            },
                            step=step)

                if ((num_batches_complete + 1) %
                        args.train_batch_accumulation) == 0:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.train_max_grad_norm)
                    optimizer.step()
                    optimizer.zero_grad()
                    step += 1

                num_batches_complete += 1