Example #1
def train(args, train_dataset, model, tokenizer, optimizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
                   args.num_train_epochs)

    scheduler = get_linear_schedule_with_warmup(optimizer, args.warmup_steps,
                                                t_total)

    checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last')
    scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt')
    if os.path.exists(scheduler_last):
        scheduler.load_state_dict(torch.load(scheduler_last))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = args.start_step
    tr_loss, logging_loss = 0.0, 0.0
    best_acc = 0.0
    model.zero_grad()
    train_iterator = trange(args.start_epoch,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    model.train()
    for idx, _ in enumerate(train_iterator):
        tr_loss = 0.0
        for step, batch in enumerate(train_dataloader):

            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                # XLM doesn't use segment_ids
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,
                'labels': batch[3]
            }
            outputs = model(**inputs)
            # model outputs are always tuple in pytorch-transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                try:
                    # Re-importing inside the loop is cheap: Python caches the module.
                    from apex import amp
                except ImportError:
                    raise ImportError(
                        "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                    )
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args,
                                           model,
                                           tokenizer,
                                           checkpoint=str(global_step))
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                        logger.info('loss %s', str(tr_loss - logging_loss))
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss
            if args.max_steps > 0 and global_step > args.max_steps:
                # epoch_iterator.close()
                break

        if args.do_eval and (args.local_rank == -1
                             or torch.distributed.get_rank() == 0):
            results = evaluate(args,
                               model,
                               tokenizer,
                               checkpoint=str(args.start_epoch + idx))

            last_output_dir = os.path.join(args.output_dir, 'checkpoint-last')
            if not os.path.exists(last_output_dir):
                os.makedirs(last_output_dir)
            # Take care of distributed/parallel training
            model_to_save = model.module if hasattr(model, 'module') else model
            model_to_save.save_pretrained(last_output_dir)
            logger.info("Saving model checkpoint to %s", last_output_dir)
            idx_file = os.path.join(last_output_dir, 'idx_file.txt')
            with open(idx_file, 'w', encoding='utf-8') as idxf:
                idxf.write(str(args.start_epoch + idx) + '\n')

            torch.save(optimizer.state_dict(),
                       os.path.join(last_output_dir, "optimizer.pt"))
            torch.save(scheduler.state_dict(),
                       os.path.join(last_output_dir, "scheduler.pt"))
            logger.info("Saving optimizer and scheduler states to %s",
                        last_output_dir)

            step_file = os.path.join(last_output_dir, 'step_file.txt')
            with open(step_file, 'w', encoding='utf-8') as stepf:
                stepf.write(str(global_step) + '\n')

            if results['acc'] > best_acc:
                best_acc = results['acc']
                output_dir = os.path.join(args.output_dir, 'checkpoint-best')
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                # Take care of distributed/parallel training
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(output_dir)
                torch.save(
                    args,
                    os.path.join(output_dir, 'training_{}.bin'.format(idx)))
                logger.info("Saving model checkpoint to %s", output_dir)

                torch.save(optimizer.state_dict(),
                           os.path.join(output_dir, "optimizer.pt"))
                torch.save(scheduler.state_dict(),
                           os.path.join(output_dir, "scheduler.pt"))
                logger.info("Saving optimizer and scheduler states to %s",
                            output_dir)

        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
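
Most of these examples call a set_seed helper that is never shown. Below is a minimal sketch of what such a helper typically looks like in the Hugging Face example scripts; the exact attributes on args (seed, n_gpu) are an assumption:

import random
import numpy as np
import torch

def set_seed(args):
    # Seed every RNG involved in training so runs are reproducible.
    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    if args.n_gpu > 0:
        torch.cuda.manual_seed_all(args.seed)
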
Example #2
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
                   args.num_train_epochs)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in ["xlm", "roberta", "distilbert", "camembert"]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            if isinstance(model, torch.nn.DataParallel):
                inputs["return_tuple"] = True

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #3
    def train(self,
              train_dataset,
              output_dir,
              show_running_loss=True,
              eval_df=None,
              verbose=True):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        device = self.device
        model = self.model
        args = self.args

        tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(train_dataset,
                                      sampler=train_sampler,
                                      batch_size=args["train_batch_size"])

        t_total = (len(train_dataloader) // args["gradient_accumulation_steps"]
                   * args["num_train_epochs"])

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters()
                           if not any(nd in n for nd in no_decay)],
                "weight_decay": args["weight_decay"],
            },
            {
                "params": [p for n, p in model.named_parameters()
                           if any(nd in n for nd in no_decay)],
                "weight_decay": 0.0,
            },
        ]

        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        # Use the ratio-derived warmup only when warmup_steps wasn't set explicitly.
        if args["warmup_steps"] == 0:
            args["warmup_steps"] = warmup_steps

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=args["learning_rate"],
            eps=args["adam_epsilon"],
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=args["warmup_steps"],
            num_training_steps=t_total)

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )

            model, optimizer = amp.initialize(model,
                                              optimizer,
                                              opt_level=args["fp16_opt_level"])

        if args["n_gpu"] > 1:
            model = torch.nn.DataParallel(model)

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]),
                                desc="Epoch",
                                disable=args["silent"],
                                mininterval=0)
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0
        steps_trained_in_current_epoch = 0
        epochs_trained = 0

        if args["model_name"] and os.path.exists(args["model_name"]):
            try:
                # set global_step to global_step of last saved checkpoint from model path
                checkpoint_suffix = args["model_name"].split("/")[-1].split("-")
                if len(checkpoint_suffix) > 2:
                    checkpoint_suffix = checkpoint_suffix[1]
                else:
                    checkpoint_suffix = checkpoint_suffix[-1]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (
                    len(train_dataloader) //
                    args["gradient_accumulation_steps"])
                steps_trained_in_current_epoch = global_step % (
                    len(train_dataloader) //
                    args["gradient_accumulation_steps"])

                logger.info(
                    "   Continuing training from checkpoint, will skip to saved global_step"
                )
                logger.info("   Continuing training from epoch %d",
                            epochs_trained)
                logger.info("   Continuing training from global step %d",
                            global_step)
                logger.info(
                    "   Will skip the first %d steps in the current epoch",
                    steps_trained_in_current_epoch)
            except ValueError:
                logger.info("   Starting fine-tuning.")

        if args["evaluate_during_training"]:
            training_progress_scores = self._create_training_progress_scores()
        if args["wandb_project"]:
            wandb.init(project=args["wandb_project"],
                       config={**args},
                       **args["wandb_kwargs"])
            wandb.watch(self.model)

        model.train()
        for _ in train_iterator:
            if epochs_trained > 0:
                epochs_trained -= 1
                continue
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(
                    tqdm(train_dataloader,
                         desc="Current iteration",
                         disable=args["silent"])):
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue
                batch = tuple(t.to(device) for t in batch)

                inputs = self._get_inputs_dict(batch)

                outputs = model(**inputs)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]

                if args["n_gpu"] > 1:
                    loss = loss.mean(
                    )  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % current_loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     amp.master_params(optimizer), args["max_grad_norm"]
                    # )
                else:
                    loss.backward()
                    # torch.nn.utils.clip_grad_norm_(
                    #     model.parameters(), args["max_grad_norm"]
                    # )

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
                    if args["fp16"]:
                        torch.nn.utils.clip_grad_norm_(
                            amp.master_params(optimizer),
                            args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                       args["max_grad_norm"])
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args[
                            "logging_steps"] == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar(
                            "loss",
                            (tr_loss - logging_loss) / args["logging_steps"],
                            global_step,
                        )
                        logging_loss = tr_loss
                        if args["wandb_project"]:
                            wandb.log({
                                "Training loss": current_loss,
                                "lr": scheduler.get_lr()[0],
                                "global_step": global_step,
                            })

                    if args["save_steps"] > 0 and global_step % args[
                            "save_steps"] == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        self._save_model(output_dir_current,
                                         optimizer,
                                         scheduler,
                                         model=model)

                    if args["evaluate_during_training"] and (
                            args["evaluate_during_training_steps"] > 0
                            and global_step %
                            args["evaluate_during_training_steps"] == 0):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results, _, _ = self.eval_model(eval_df, verbose=True)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)

                        output_dir_current = os.path.join(
                            output_dir, "checkpoint-{}".format(global_step))

                        os.makedirs(output_dir_current, exist_ok=True)

                        if args["save_eval_checkpoints"]:
                            self._save_model(output_dir_current,
                                             optimizer,
                                             scheduler,
                                             model=model,
                                             results=results)

                        training_progress_scores["global_step"].append(
                            global_step)
                        training_progress_scores["train_loss"].append(
                            current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args["output_dir"],
                                         "training_progress_scores.csv"),
                            index=False,
                        )

                        if args["wandb_project"]:
                            wandb.log(
                                self._get_last_metrics(
                                    training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[
                                args["early_stopping_metric"]]
                            self._save_model(args["best_model_dir"],
                                             optimizer,
                                             scheduler,
                                             model=model,
                                             results=results)
                        if best_eval_metric and args["early_stopping_metric_minimize"]:
                            if (results[args["early_stopping_metric"]] - best_eval_metric
                                    < args["early_stopping_delta"]):
                                best_eval_metric = results[args["early_stopping_metric"]]
                                self._save_model(args["best_model_dir"],
                                                 optimizer,
                                                 scheduler,
                                                 model=model,
                                                 results=results)
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args[
                                            "early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args['early_stopping_metric']}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args['early_stopping_patience']}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step
                        else:
                            if (results[args["early_stopping_metric"]] - best_eval_metric
                                    > args["early_stopping_delta"]):
                                best_eval_metric = results[args["early_stopping_metric"]]
                                self._save_model(args["best_model_dir"],
                                                 optimizer,
                                                 scheduler,
                                                 model=model,
                                                 results=results)
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args[
                                            "early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(
                                                f" No improvement in {args['early_stopping_metric']}"
                                            )
                                            logger.info(
                                                f" Current step: {early_stopping_counter}"
                                            )
                                            logger.info(
                                                f" Early stopping patience: {args['early_stopping_patience']}"
                                            )
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached"
                                            )
                                            logger.info(
                                                " Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step

            epoch_number += 1
            output_dir_current = os.path.join(
                output_dir,
                "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args["save_model_every_epoch"] or args[
                    "evaluate_during_training"]:
                os.makedirs(output_dir_current, exist_ok=True)

            if args["save_model_every_epoch"]:
                self._save_model(output_dir_current,
                                 optimizer,
                                 scheduler,
                                 model=model)

            if args["evaluate_during_training"]:
                results, _, _ = self.eval_model(eval_df, verbose=True)

                self._save_model(output_dir_current,
                                 optimizer,
                                 scheduler,
                                 results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args["output_dir"],
                                           "training_progress_scores.csv"),
                              index=False)

                if args["wandb_project"]:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"],
                                     optimizer,
                                     scheduler,
                                     model=model,
                                     results=results)
                if best_eval_metric and args["early_stopping_metric_minimize"]:
                    if (results[args["early_stopping_metric"]] - best_eval_metric
                            < args["early_stopping_delta"]):
                        best_eval_metric = results[args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"],
                                         optimizer,
                                         scheduler,
                                         model=model,
                                         results=results)
                        early_stopping_counter = 0
                else:
                    if (results[args["early_stopping_metric"]] - best_eval_metric
                            > args["early_stopping_delta"]):
                        best_eval_metric = results[args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"],
                                         optimizer,
                                         scheduler,
                                         model=model,
                                         results=results)
                        early_stopping_counter = 0

        return global_step, tr_loss / global_step
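
Example #3 derives warmup_steps from a fraction of the total schedule rather than an absolute count. A quick worked illustration of that computation, with hypothetical values:

import math

t_total = 1000       # total optimization steps
warmup_ratio = 0.06  # hypothetical ratio
warmup_steps = math.ceil(t_total * warmup_ratio)  # -> 60
# An explicitly configured warmup_steps overrides the ratio-derived value:
args = {"warmup_steps": 0}
args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]
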
Example #4
def train(args, train_dataset, model, tokenizer) -> Tuple[int, float]:
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer.pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples,
                            batch_first=True,
                            padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
                   args.num_train_epochs)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": args.weight_decay,
        },
        {
            "params": [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay) and p.requires_grad],
            "weight_decay": 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(model, "module") else model
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch")
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    perplexity = evaluate(args, model, tokenizer)['perplexity']
                    logging_loss = tr_loss / global_step
                    logger.info(
                        f'Step={global_step}, train loss={logging_loss:.4f}, eval perplexity={perplexity:.4f}'
                    )

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break

        if args.save_steps > 0:
            checkpoint_prefix = "checkpoint"
            # Save model checkpoint
            output_dir = os.path.join(
                args.output_dir, "{}-{}".format(checkpoint_prefix,
                                                global_step))
            os.makedirs(output_dir, exist_ok=True)
            model_to_save = (
                model.module if hasattr(model, "module") else model
            )  # Take care of distributed/parallel training
            model_to_save.save_pretrained(output_dir)
            tokenizer.save_pretrained(output_dir)

            torch.save(args, os.path.join(output_dir, "training_args.bin"))
            logger.info("Saving model checkpoint to %s", output_dir)

            _rotate_checkpoints(args, checkpoint_prefix)

        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
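
Examples #2, #3, and #4 all recover the resume point by parsing the step count out of the checkpoint directory name. A minimal illustration of that parsing, using a hypothetical path and hypothetical step counts:

model_name_or_path = "output/checkpoint-500"  # hypothetical checkpoint path
checkpoint_suffix = model_name_or_path.split("-")[-1].split("/")[0]
global_step = int(checkpoint_suffix)  # -> 500

# With, say, 250 optimizer updates per epoch, resuming at step 500 means:
updates_per_epoch = 250
epochs_trained = global_step // updates_per_epoch                 # -> 2
steps_trained_in_current_epoch = global_step % updates_per_epoch  # -> 0
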
Example #5
    def train(self):
        if self.args.method == "clean":
            print("clean data!")
            concatdataset = ConcatDataset([self.train_dataset, self.unlabeled])
            train_sampler = RandomSampler(concatdataset)
            train_dataloader = DataLoader(
                concatdataset, sampler=train_sampler, batch_size=self.args.batch_size
            )
        else:
            train_sampler = RandomSampler(self.train_dataset)
            train_dataloader = DataLoader(
                self.train_dataset,
                sampler=train_sampler,
                batch_size=self.args.batch_size,
            )
        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            self.args.num_train_epochs = (
                self.args.max_steps
                // (len(train_dataloader) // self.args.gradient_accumulation_steps)
                + 1
            )
        else:
            t_total = (
                len(train_dataloader)
                // self.args.gradient_accumulation_steps
                * self.args.num_train_epochs
            )

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p
                    for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [
                    p
                    for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.args.learning_rate,
            eps=self.args.adam_epsilon,
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=t_total,
        )

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(self.train_dataset))
        logger.info("  Num Epochs = %d", self.args.num_train_epochs)
        logger.info("  Total train batch size = %d", self.args.batch_size)
        logger.info(
            "  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps
        )
        logger.info("  Total optimization steps = %d", t_total)
        global_step = 0
        tr_loss = 0.0
        self.model.zero_grad()

        train_iterator = trange(int(self.args.num_train_epochs), desc="Epoch")
        set_seed(self.args)
        criterion = nn.KLDivLoss(reduction="batchmean")

        for epoch in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(epoch_iterator):
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                    "labels": batch[3],
                }
                if self.args.task_type == "wic":
                    inputs["keys"] = batch[6]
                elif self.args.task_type == "re":
                    inputs["e1_mask"] = batch[4]
                    inputs["e2_mask"] = batch[5]
                outputs = self.model(**inputs)
                loss1 = outputs[0]  # standard hard-label loss; unused, replaced by the KL loss below
                logits = outputs[1]
                loss = criterion(
                    input=F.log_softmax(logits, dim=-1),
                    target=self.label_matrix[batch[3]].to(self.device),
                )
                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps
                if torch.cuda.device_count() > 1:
                    # print(loss.size(), torch.cuda.device_count())
                    loss = loss.mean()
                loss.backward()
                tr_loss += loss.item()
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.args.max_grad_norm
                    )
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    global_step += 1
                    epoch_iterator.set_description(
                        "iteration:%d, w=%.1f, Loss:%.3f"
                        % (epoch, self.args.soft_label_weight, tr_loss / global_step)
                    )
                    if (
                        self.args.logging_steps > 0
                        and global_step % self.args.logging_steps == 0
                    ):
                        # self.evaluate("dev", global_step)
                        self.evaluate("test", global_step)
                    if (
                        self.args.save_steps > 0
                        and global_step % self.args.save_steps == 0
                    ):
                        self.save_model()

                if 0 < self.args.max_steps < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.args.max_steps < global_step:
                train_iterator.close()
                break
        return global_step, tr_loss / global_step
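
Example #5 trains against soft labels: self.label_matrix maps each gold label id to a target distribution, and the KL divergence between the model's log-probabilities and that distribution replaces plain cross-entropy. A self-contained sketch of the same idea, built around a hypothetical label-smoothing matrix:

import torch
import torch.nn as nn
import torch.nn.functional as F

num_labels, smoothing = 4, 0.1  # hypothetical values
# Each row is a smoothed one-hot distribution over the label set.
label_matrix = torch.full((num_labels, num_labels), smoothing / (num_labels - 1))
label_matrix.fill_diagonal_(1.0 - smoothing)

criterion = nn.KLDivLoss(reduction="batchmean")
logits = torch.randn(8, num_labels)          # stand-in for model outputs
labels = torch.randint(0, num_labels, (8,))  # stand-in for gold labels
loss = criterion(F.log_softmax(logits, dim=-1), label_matrix[labels])
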
Example #6
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1
                     else DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
                   args.num_train_epochs)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {
            'params': [p for n, p in model.named_parameters()
                       if not any(nd in n for nd in no_decay)],
            'weight_decay': args.weight_decay,
        },
        {
            'params': [p for n, p in model.named_parameters()
                       if any(nd in n for nd in no_decay)],
            'weight_decay': 0.0,
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                # XLM, DistilBERT and RoBERTa don't use segment_ids
                inputs['token_type_ids'] = (batch[2] if args.model_type
                                            in ['bert', 'xlnet'] else None)
            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
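# A minimal, self-contained sketch of the gradient-accumulation pattern used in
# the loop above: the loss is scaled down by the number of accumulation steps,
# gradients pile up across micro-batches, and the optimizer steps once per
# window. Every name below (_model, _optimizer, ...) is an illustrative
# assumption, not part of the original snippet.
import torch

_model = torch.nn.Linear(4, 2)
_optimizer = torch.optim.SGD(_model.parameters(), lr=0.1)
_accum_steps = 4
for _step in range(8):
    _x = torch.randn(3, 4)
    _loss = _model(_x).pow(2).mean() / _accum_steps  # scale so gradients average out
    _loss.backward()                                 # gradients accumulate in .grad
    if (_step + 1) % _accum_steps == 0:
        torch.nn.utils.clip_grad_norm_(_model.parameters(), 1.0)
        _optimizer.step()                            # one update per accumulation window
        _model.zero_grad()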
Example #7
        inside_validation_dataset),  # Pull out batches sequentially.
    batch_size=batch_size  # Evaluate with this batch size.
)
#----------------------------------------------------------------------------------------

# Note: AdamW is a class from the huggingface library (as opposed to pytorch)
optimizer = AdamW(model.parameters(), lr=learning_rate, eps=epsilon)

# Total number of training steps is [number of batches] x [number of epochs].
# (Note that this is not the same as the number of training samples).
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
# This changes the learning rate as the training loop progresses
scheduler = get_linear_schedule_with_warmup(optimizer,
                                            num_warmup_steps=warmup_steps,
                                            num_training_steps=total_steps)
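# A side probe of the schedule just created, with made-up numbers (base lr 1.0,
# 2 warmup steps, 10 total steps): the learning rate climbs linearly from 0 to
# the base lr during warmup, then decays linearly toward 0 at
# num_training_steps. _probe_opt and _probe_sched exist only for this sketch
# and leave the real optimizer and scheduler untouched.
import torch

_probe_opt = torch.optim.SGD([torch.nn.Parameter(torch.zeros(1))], lr=1.0)
_probe_sched = get_linear_schedule_with_warmup(_probe_opt,
                                               num_warmup_steps=2,
                                               num_training_steps=10)
for _ in range(4):
    _probe_opt.step()
    _probe_sched.step()
    print(_probe_opt.param_groups[0]["lr"])  # 0.5, 1.0, 0.875, 0.75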


def format_time(elapsed):
    return str(datetime.timedelta(seconds=int(round(elapsed))))
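# For example, format_time(3661.7) rounds to 3662 seconds and returns the
# string '1:01:02' (h:mm:ss).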


total_t0 = time.time()

model = model.to(device)

for epoch_i in range(0, epochs):

    # ========================================
    #               Training
Example #8
def main(argv):
    parser = argparse.ArgumentParser(description='')
    required = parser.add_argument_group('required arguments')
    required.add_argument(
        '-r',
        '--retrieval',
        choices=['IR', 'NSP', 'NN'],
        help='retrieval solver for the contexts. Options: IR, NSP or NN',
        required=True)
    parser.add_argument(
        '-d',
        '--device',
        default='gpu',
        choices=['gpu', 'cpu'],
        help='device to train the model with. Options: cpu or gpu. Default: gpu'
    )
    parser.add_argument(
        '-p',
        '--pretrainings',
        default="checkpoints/pretrainings_e4.pth",
        help=
        'path to the pretrainings model. If empty, the model will be RobertaForSequenceClassification with roberta-large weights. Default: checkpoints/pretrainings_e4.pth'
    )
    parser.add_argument('-b',
                        '--batchsize',
                        default=8,
                        type=int,
                        help='size of the batches. Default: 8')
    parser.add_argument('-x',
                        '--maxlen',
                        default=64,
                        type=int,
                        help='max sequence length. Default: 64')
    parser.add_argument('-l',
                        '--lr',
                        default=1e-5,
                        type=float,
                        help='learning rate. Default: 1e-5')
    parser.add_argument('-e',
                        '--epochs',
                        default=2,
                        type=int,
                        help='number of epochs. Default: 2')
    parser.add_argument('-s',
                        '--save',
                        default=False,
                        help='save model at the end of the training',
                        action='store_true')
    args = parser.parse_args()
    print(args)

    model = RobertaForSequenceClassification.from_pretrained("roberta-large",
                                                             num_labels=2)
    if args.pretrainings:  # load pretrained roberta weights from any given path
        model.roberta = torch.load(args.pretrainings).roberta
    tokenizer = RobertaTokenizer.from_pretrained('roberta-large')

    if args.device == "gpu":
        device = torch.device("cuda")
        model.cuda()
    if args.device == "cpu":
        device = torch.device("cpu")
        model.cpu()

    model.zero_grad()

    batch_size = args.batchsize
    max_len = args.maxlen
    lr = args.lr
    epochs = args.epochs
    retrieval_solver = args.retrieval
    save_model = args.save

    raw_data_train = get_data_tf("train", retrieval_solver, tokenizer, max_len)
    raw_data_val = get_data_tf("val", retrieval_solver, tokenizer, max_len)

    train_dataloader = process_data_ndq(raw_data_train, batch_size, "train")
    val_dataloader = process_data_ndq(raw_data_val, batch_size, "val")

    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
    total_steps = len(train_dataloader) * epochs
    print(total_steps)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0,
                                                num_training_steps=total_steps)

    training_tf(model, train_dataloader, val_dataloader, optimizer, scheduler,
                epochs, retrieval_solver, device, save_model)
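# A hypothetical command line for this script (the filename and flag values are
# assumptions for illustration; only --retrieval is required):
#
#   python train_tf.py -r IR -d gpu -b 8 -x 64 -l 1e-5 -e 2 -s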
Example #9
def train_f1_f2(args, model_f1, model_f2, train_dataset):
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.mini_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    args.num_train_epochs = 1
    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs

    if args.warmup_proportion > 0:
        args.warmup_steps = int(t_total * args.warmup_proportion)

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in list(model_f1.named_parameters()) +
                list(model_f2.named_parameters())
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in list(model_f1.named_parameters()) +
                list(model_f2.named_parameters())
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        [model_f1,
         model_f2], optimizer = amp.initialize([model_f1, model_f2],
                                               optimizer,
                                               opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model_f1 = torch.nn.DataParallel(model_f1)
        model_f2 = torch.nn.DataParallel(model_f2)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model_f1 = torch.nn.parallel.DistributedDataParallel(
            model_f1,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

        model_f2 = torch.nn.parallel.DistributedDataParallel(
            model_f2,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    tr_loss, logging_loss = 0.0, 0.0
    model_f1.zero_grad()
    model_f2.zero_grad()

    set_seed(args)
    logger.info("***** train f1 f2 ******")
    logger.info("***** Num examples: {} ********".format(len(train_dataset)))

    for _ in range(1):
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iter(loss=X.XXX, lr=X.XXXXXXXX)",
                              disable=args.local_rank not in [-1, 0])

        for step, batch in enumerate(epoch_iterator):
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model_f1.train()
            model_f2.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3],
                "label_mask": batch[4]
            }

            outputs1 = model_f1(**inputs)
            loss1 = outputs1

            outputs2 = model_f2(**inputs)
            loss2 = outputs2

            w1 = model_f1.classifier.weight  # [num_labels, hidden_size] for a Linear head
            w2 = model_f2.classifier.weight.transpose(
                -1, -2)  # [hidden_size, num_labels]

            norm_term = torch.norm(torch.matmul(w1, w2))

            loss = loss1 + loss2 + args.alpha * norm_term

            if args.n_gpu > 1:
                loss = loss.mean()
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                epoch_iterator.set_description(
                    'Iter (loss=%5.3f) lr=%9.7f' %
                    (loss.item(), scheduler.get_lr()[0]))
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model_f1.parameters(),
                                                   args.max_grad_norm)
                    torch.nn.utils.clip_grad_norm_(model_f2.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model_f1.zero_grad()
                model_f2.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics

                    tb_writer.add_scalar("f1_f2_lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("f1_f2_loss",
                                         (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return model_f1, model_f2
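# A toy check of the regularizer built in the loop above, assuming `classifier`
# is a torch.nn.Linear head whose weight has shape [num_labels, hidden_size];
# the tensors here are made up for illustration. The norm of the w1 @ w2
# product penalizes the two classifier heads for being correlated.
import torch

_w1 = torch.randn(3, 8)                    # f1 head weight: [num_labels, hidden]
_w2 = torch.randn(3, 8).transpose(-1, -2)  # f2 head weight, transposed: [hidden, num_labels]
print(torch.norm(torch.matmul(_w1, _w2)).item())  # Frobenius norm of a [3, 3] product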
Example #10
def train():
    writer = SummaryWriter(comment="Relation")
    modelDir = writer.log_dir.replace("runs", "models")
    epochs = 20
    device = "cuda"
    dataset = RelationDataset("albert-base-v2", device="cpu")
    dataloader = DataLoader(dataset, batch_size=32, collate_fn=collate_fn_padd)
    model = AlbertForRelation.from_pretrained(
        "albert-base-v2",
        num_rel_labels=len(relationTypes),
    )
    model.resize_token_embeddings(len(dataset.dataset.tokenizer))
    model.to(device)
    optim = AdamW(
        [
            {"params": model.albert.parameters(), "lr": 1e-4},
            {
                "params": model.classifier.parameters(),
                "lr": 1e-3,
            },
        ]
    )
    scheduler = get_linear_schedule_with_warmup(optim, 100, epochs * 10000 / 32)

    iTot = 0
    for epoch in range(epochs):
        i = 0
        lossesTrain = []
        lossesVal = []
        for (
            input_ids,
            token_type_ids,
            attention_mask,
            rel_label,
            e1_index,
            e2_index,
        ) in dataloader:
            if i % 5 != 0:
                model.train()
                loss, acc = model(
                    input_ids.to(device),
                    token_type_ids.to(device),
                    attention_mask.to(device),
                    rel_label.to(device),
                    e1_index.to(device),
                    e2_index.to(device),
                )
                loss.backward()
                optim.step()
                scheduler.step()
                optim.zero_grad()
                lossesTrain.append(loss.item())
                writer.add_scalar("lossRel/Train", lossesTrain[-1], iTot)
                writer.add_scalar("accRel/Train", acc.item(), iTot)
            else:
                with torch.no_grad():
                    model.eval()
                    loss, acc = model(
                        input_ids.to(device),
                        token_type_ids.to(device),
                        attention_mask.to(device),
                        rel_label.to(device),
                        e1_index.to(device),
                        e2_index.to(device),
                    )
                    lossesVal.append(loss.item())
                    writer.add_scalar("accRel/Eval", acc.item(), iTot)
                    writer.add_scalar("lossRel/Eval", lossesVal[-1], iTot)
            if iTot % 20 == 0:
                for (i2, lr) in enumerate(scheduler.get_lr()):
                    writer.add_scalar("lr/" + str(i2), lr, iTot)
            print(epoch, i)
            i += 1
            iTot += 1
        model.save_pretrained(modelDir + "/" + str(epoch))
        dataset.dataset.tokenizer.save_pretrained(modelDir + "/" + str(epoch))
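# Note on the loop above: every fifth batch (i % 5 == 0, including the very
# first) runs under torch.no_grad() as a held-out check instead of a gradient
# step, so roughly 80% of each epoch trains and 20% validates.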
Example #11
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = (RandomSampler(train_dataset) if args.local_rank == -1 else
                     DistributedSampler(train_dataset))
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = (
            args.max_steps //
            (len(train_dataloader) // args.gradient_accumulation_steps) + 1)
    else:
        t_total = (len(train_dataloader) // args.gradient_accumulation_steps *
                   args.num_train_epochs)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0,
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info(
            "  Will skip the first %d steps in the first epoch",
            steps_trained_in_current_epoch,
        )

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3],
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2]
                    if args.model_type in ["bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0 or (
                    # last step in epoch but step is always smaller than gradient_accumulation_steps
                    len(epoch_iterator) <= args.gradient_accumulation_steps and
                (step + 1) == len(epoch_iterator)):
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if (args.local_rank in [-1, 0] and args.logging_steps > 0
                        and global_step % args.logging_steps == 0):
                    logs = {}
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if (args.local_rank in [-1, 0] and args.save_steps > 0
                        and global_step % args.save_steps == 0):
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
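# A worked example of the resume arithmetic above, with made-up numbers: for a
# checkpoint directory named "checkpoint-1500", len(train_dataloader) == 400
# and gradient_accumulation_steps == 2 give 200 optimizer steps per epoch.
_global_step = 1500
_steps_per_epoch = 400 // 2                # 200
print(_global_step // _steps_per_epoch)    # epochs_trained      -> 7
print(_global_step % _steps_per_epoch)     # steps to skip first -> 100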
Example #12
    MODEL_NAME,
    hidden_dropout_prob=DROPOUT,
    attention_probs_dropout_prob=DROPOUT,
    num_labels=len(labels2ind),
    id2label={str(v): k
              for k, v in labels2ind.items()})

# Prepare optimizer and schedule (linear warmup and decay)
optimizer = get_optimizer_with_weight_decay(model=nerbert,
                                            optimizer=OPTIMIZER,
                                            learning_rate=LEARNING_RATE,
                                            weight_decay=WEIGHT_DECAY)

training_steps = (len(dataloader_tr) // ACUMULATE_GRAD_EVERY) * N_EPOCHS
scheduler = get_linear_schedule_with_warmup(optimizer=optimizer,
                                            num_warmup_steps=training_steps *
                                            RATIO_WARMUP_STEPS,
                                            num_training_steps=training_steps)

# Trainer
trainer = BertTrainer(model=nerbert,
                      tokenizer=tokenizer,
                      optimizer=optimizer,
                      scheduler=scheduler,
                      labels2ind=labels2ind,
                      device=DEVICE,
                      n_epochs=N_EPOCHS,
                      accumulate_grad_every=ACUMULATE_GRAD_EVERY,
                      output_dir='./trained_models')

# Train and validate model
trainer.train(dataloader_train=dataloader_tr, dataloader_val=dataloader_val)
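# Worked numbers for the warmup computation above (values are illustrative):
# with 1000 training batches, ACUMULATE_GRAD_EVERY = 4 and N_EPOCHS = 8,
# training_steps = (1000 // 4) * 8 = 2000, so RATIO_WARMUP_STEPS = 0.1 would
# yield 200 warmup steps.
_training_steps = (1000 // 4) * 8
print(_training_steps, _training_steps * 0.1)  # 2000 200.0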
Example #13
def train(model, tokenizer, train_dataloader, validation_dataloader,
          index_to_label, pad_token_dict, doc_start_ind_dict, device):
    def calculate_loss(lm_logits, b_labels, b_input_mask, cls_labels,
                       index_to_label, doc_start_ind_dict, loss_fct):
        batch_size = lm_logits.shape[0]
        logits_collected = []
        labels_collected = []
        for b in range(batch_size):
            logits_ind = lm_logits[b, :, :]  # seq_len x |V|
            labels_ind = b_labels[b, :]  # seq_len
            mask = b_input_mask[b, :] > 0
            maski = mask.unsqueeze(-1).expand_as(logits_ind)
            # unpad_seq_len x |V|
            logits_pad_removed = torch.masked_select(logits_ind, maski).view(
                -1, logits_ind.size(-1))
            labels_pad_removed = torch.masked_select(labels_ind,
                                                     mask)  # unpad_seq_len

            doc_start_ind = doc_start_ind_dict[index_to_label[
                cls_labels[b].item()]]
            shift_logits = logits_pad_removed[doc_start_ind -
                                              1:-1, :].contiguous()
            shift_labels = labels_pad_removed[doc_start_ind:].contiguous()
            # Flatten the tokens
            logits_collected.append(
                shift_logits.view(-1, shift_logits.size(-1)))
            labels_collected.append(shift_labels.view(-1))

        logits_collected = torch.cat(logits_collected, dim=0)
        labels_collected = torch.cat(labels_collected, dim=0)
        loss = loss_fct(logits_collected, labels_collected)
        return loss

    optimizer = AdamW(
        model.parameters(),
        lr=5e-4,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )

    loss_fct = CrossEntropyLoss()
    sample_every = 100
    warmup_steps = 1e2
    epochs = 5
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_stats = []
    total_t0 = time.time()

    for epoch_i in range(0, epochs):
        print("", flush=True)
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs),
              flush=True)
        print('Training...', flush=True)
        t0 = time.time()
        total_train_loss = 0
        model.train()

        for step, batch in enumerate(train_dataloader):
            if step % sample_every == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed),
                      flush=True)
                model.eval()
                lbl = random.choice(list(index_to_label.values()))
                temp_list = ["<|labelpad|>"] * pad_token_dict[lbl]
                if len(temp_list) > 0:
                    label_str = " ".join(
                        lbl.split("_")) + " " + " ".join(temp_list)
                else:
                    label_str = " ".join(lbl.split("_"))
                text = tokenizer.bos_token + " " + label_str + " <|labelsep|> "
                sample_outputs = model.generate(input_ids=tokenizer.encode(
                    text, return_tensors='pt').to(device),
                                                do_sample=True,
                                                top_k=50,
                                                max_length=200,
                                                top_p=0.95,
                                                num_return_sequences=1)
                for i, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(i, tokenizer.decode(sample_output)),
                          flush=True)
                model.train()

            b_input_ids = batch[0].to(device)
            b_labels = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            cls_labels = batch[2].to(device)

            model.zero_grad()

            outputs = model(b_input_ids,
                            token_type_ids=None,
                            attention_mask=b_input_mask,
                            labels=b_labels)

            loss = calculate_loss(outputs[1], b_labels, b_input_mask,
                                  cls_labels, index_to_label,
                                  doc_start_ind_dict, loss_fct)
            # loss = outputs[0]
            total_train_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("", flush=True)
        print("  Average training loss: {0:.2f}".format(avg_train_loss),
              flush=True)
        print("  Training epcoh took: {:}".format(training_time), flush=True)

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("", flush=True)
        print("Running Validation...", flush=True)

        t0 = time.time()

        model.eval()

        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            b_input_ids = batch[0].to(device)
            b_input_mask = batch[1].to(device)
            b_labels = batch[0].to(device)
            cls_labels = batch[2].to(device)

            with torch.no_grad():
                outputs = model(b_input_ids,
                                token_type_ids=None,
                                attention_mask=b_input_mask,
                                labels=b_labels)

            # Accumulate the validation loss.
            loss = calculate_loss(outputs[1], b_labels, b_input_mask,
                                  cls_labels, index_to_label,
                                  doc_start_ind_dict, loss_fct)
            # loss = outputs[0]
            total_eval_loss += loss.item()

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss), flush=True)
        print("  Validation took: {:}".format(validation_time), flush=True)

        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

    print("", flush=True)
    print("Training complete!", flush=True)

    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)),
          flush=True)
    return model
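# A toy illustration of the shift-by-one alignment inside calculate_loss above
# (tensors are made up): position t's logits are scored against token t + 1,
# starting from the document body at doc_start_ind.
import torch
from torch.nn import CrossEntropyLoss

_logits = torch.randn(6, 10)                       # unpadded seq_len x |V|
_labels = torch.randint(0, 10, (6,))
_doc_start_ind = 2
_shift_logits = _logits[_doc_start_ind - 1:-1, :]  # positions 1..4 ...
_shift_labels = _labels[_doc_start_ind:]           # ... predict tokens 2..5
print(CrossEntropyLoss()(_shift_logits, _shift_labels))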
Example #14
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "all"
    logger = get_logger()
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/train_merged.pickle")
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/split10/train_0.pickle").sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    df["answered_correctly"] = df["answered_correctly"].replace(-1, np.nan)
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        },
        "task_container_id_bin300": {
            "type": "category"
        },
        "previous_answer_index_question_id": {
            "type": "category"
        },
        "previous_answer_question_id": {
            "type": "category"
        },
        "timediff-elapsedtime_bin500": {
            "type": "category"
        },
        "timedelta_log10": {
            "type": "category"
        }
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_dict["user_id"]["PreviousAnswer2"] = PreviousAnswer2(
            groupby="user_id",
            column="question_id",
            is_debug=is_debug,
            model_id=model_id,
            n=300)
        feature_factory_dict["user_id"][
            "StudyTermEncoder2"] = StudyTermEncoder2(is_partial_fit=True)
        feature_factory_dict["user_id"][
            f"MeanAggregatorStudyTimebyUserId"] = MeanAggregator(
                column="user_id", agg_column="study_time", remove_now=False)

        feature_factory_dict["user_id"][
            "ElapsedTimeMeanByContentIdEncoder"] = ElapsedTimeMeanByContentIdEncoder(
            )
        feature_factory_dict["post"] = {
            "DurationFeaturePostProcess": DurationFeaturePostProcess()
        }

        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id=model_id,
            load_feature=not is_debug,
            save_feature=not is_debug)
        print("all_predict")
        df = feature_factory_manager.all_predict(df)

        def f(x):
            x = x // 1000
            if x < -100:
                return -100
            if x > 400:
                return 400
            return x

        df["task_container_id_bin300"] = [
            x if x < 300 else 300 for x in df["task_container_id"]
        ]
        df["timediff-elapsedtime_bin500"] = [
            f(x) for x in df["timediff-elapsedtime"].values
        ]
        df["timedelta_log10"] = np.log10(
            df["duration_previous_content"].values)
        df["timedelta_log10"] = df["timedelta_log10"].replace(
            -np.inf, -1).replace(np.inf, -1).fillna(-1).astype("int8")
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id",
            "task_container_id_bin300", "previous_answer_index_question_id",
            "previous_answer_question_id", "row_id",
            "timediff-elapsedtime_bin500", "timedelta_log10"
        ]]
        print(df.head(10))

        print("data preprocess")

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])

    if not load_pickle or is_debug:
        df_val_row = pd.read_feather(
            "../input/riiid-test-answer-prediction/train_transformer_last2500k_only_row_id.feather"
        )
        if is_debug:
            df_val_row = df_val_row.head(3000)
        df_val_row["is_val"] = 1

        df = pd.merge(df, df_val_row, how="left", on="row_id")
        df["is_val"] = df["is_val"].fillna(0)

        print(df["is_val"].value_counts())

        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model275_all", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model275_all/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model275_all/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model275_all/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model275_all/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout,
                      cont_emb=params["cont_emb"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * 20)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, epoch,
                                              output_dir, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))
        torch.save(
            model.state_dict(),
            f"{output_dir}/transformers_epoch{epoch}_auc{round(auc_val, 4)}.pth"
        )

    # df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_val)  # validation AUC from the last epoch
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
Example #15
def train(coarse_model, fine_model, coarse_tokenizer, fine_tokenizer,
          train_dataloader, validation_dataloader,
          label_to_exclusive_dataloader, doc_start_ind, index_to_label, device,
          secondary_device):
    def calculate_kl_div_loss(batch_fine_probs, batch_coarse_probs,
                              batch_fine_input_masks, batch_coarse_input_masks,
                              batch_fine_input_ids, batch_coarse_input_ids,
                              coarse_tokenizer, fine_tokenizer, doc_start_ind):
        # Remove pad tokens
        # consider from doc_start_ind - 1
        loss_fct = torch.nn.KLDivLoss(reduction="batchmean")
        batch_size = batch_fine_probs.shape[0]
        losses = []
        for b in range(batch_size):
            fine_logits_ind = batch_fine_probs[b, :, :]  # seq_len x |V|
            coarse_logits_ind = batch_coarse_probs[b, :, :]  # seq_len x |V|
            fine_mask = batch_fine_input_masks[b, :] > 0
            coarse_mask = batch_coarse_input_masks[b, :] > 0
            if not torch.all(fine_mask.eq(coarse_mask)):
                print("Fine sentence",
                      fine_tokenizer.decode(batch_fine_input_ids[b, :]))
                print("Coarse sentence",
                      coarse_tokenizer.decode(batch_coarse_input_ids[b, :]))
                raise Exception("Fine and Coarse mask is not same")

            fine_dec_sent = fine_tokenizer.decode(
                batch_fine_input_ids[b, :][doc_start_ind:])
            coarse_dec_sent = coarse_tokenizer.decode(
                batch_coarse_input_ids[b, :][doc_start_ind:])

            if fine_dec_sent != coarse_dec_sent:
                print(
                    "Fine sentence ",
                    fine_tokenizer.decode(
                        batch_fine_input_ids[b, :][doc_start_ind:]))
                print(
                    "Coarse sentence ",
                    coarse_tokenizer.decode(
                        batch_coarse_input_ids[b, :][doc_start_ind:]))
                raise Exception("Fine and Coarse decoded sentence is not same")

            fine_maski = fine_mask.unsqueeze(-1).expand_as(fine_logits_ind)
            coarse_maski = coarse_mask.unsqueeze(-1).expand_as(
                coarse_logits_ind)
            # unpad_seq_len x |V|
            fine_logits_pad_removed = torch.masked_select(
                fine_logits_ind, fine_maski).view(-1, fine_logits_ind.size(-1))
            coarse_logits_pad_removed = torch.masked_select(
                coarse_logits_ind,
                coarse_maski).view(-1, coarse_logits_ind.size(-1))
            shift_fine_logits = fine_logits_pad_removed[doc_start_ind -
                                                        1:-1, :].contiguous()
            shift_coarse_logits = coarse_logits_pad_removed[
                doc_start_ind - 1:-1, :].contiguous()
            # Compute loss here of shift_fine_logits and shift_coarse_logits append to losses
            loss = loss_fct(shift_fine_logits,
                            shift_coarse_logits).unsqueeze(0)
            losses.append(loss)

        # Return mean of losses here
        losses = torch.cat(losses, dim=0)
        return losses.mean()

    def calculate_cross_entropy_loss(fine_model, label_to_exclusive_dataloader,
                                     doc_start_ind, device):
        loss_function = CrossEntropyLoss()

        b_labels_list = []
        b_input_ids_list = []
        b_input_mask_list = []
        scores_list = []

        selected_labs = random.sample(
            list(label_to_exclusive_dataloader.keys()), 6)
        for l in selected_labs:
            # print("Label", l)
            dataloader = label_to_exclusive_dataloader[l]
            it = 0
            for step, batch in enumerate(dataloader):
                # print("Step for exc", step, it)
                b_input_ids = batch[0].to(device)
                b_labels = batch[0].to(device)
                b_input_mask = batch[1].to(device)

                outputs = fine_model(b_input_ids,
                                     token_type_ids=None,
                                     attention_mask=b_input_mask,
                                     labels=b_labels)
                b_labels_list.append(b_labels)
                b_input_ids_list.append(b_input_ids)
                b_input_mask_list.append(b_input_mask)
                scores_list.append(outputs[1])
                # reporter = MemReporter()
                # reporter.report()
                it += 1
                if it == 1:
                    break

        b_labels_tensor = torch.cat(b_labels_list, dim=0)
        b_input_ids_tensor = torch.cat(b_input_ids_list, dim=0)
        b_input_mask_tensor = torch.cat(b_input_mask_list, dim=0)
        scores_tensor = torch.cat(scores_list, dim=0)

        assert b_labels_tensor.shape[0] == b_input_ids_tensor.shape[0] == b_input_mask_tensor.shape[0] == \
               scores_tensor.shape[0]
        batch_size = scores_tensor.shape[0]
        logits_collected = []
        labels_collected = []
        for b in range(batch_size):
            logits_ind = scores_tensor[b, :, :]  # seq_len x |V|
            labels_ind = b_labels_tensor[b, :]  # seq_len
            mask = b_input_mask_tensor[b, :] > 0
            maski = mask.unsqueeze(-1).expand_as(logits_ind)
            # unpad_seq_len x |V|
            logits_pad_removed = torch.masked_select(logits_ind, maski).view(
                -1, logits_ind.size(-1))
            labels_pad_removed = torch.masked_select(labels_ind,
                                                     mask)  # unpad_seq_len

            shift_logits = logits_pad_removed[doc_start_ind -
                                              1:-1, :].contiguous()
            shift_labels = labels_pad_removed[doc_start_ind:].contiguous()
            # Flatten the tokens
            logits_collected.append(
                shift_logits.view(-1, shift_logits.size(-1)))
            labels_collected.append(shift_labels.view(-1))

        logits_collected = torch.cat(logits_collected, dim=0)
        labels_collected = torch.cat(labels_collected, dim=0)
        loss = loss_function(logits_collected, labels_collected).to(device)
        return loss

    def calculate_loss(batch_fine_probs,
                       batch_coarse_probs,
                       batch_fine_input_masks,
                       batch_coarse_input_masks,
                       batch_fine_input_ids,
                       batch_coarse_input_ids,
                       coarse_tokenizer,
                       fine_tokenizer,
                       fine_model,
                       label_to_exclusive_dataloader,
                       doc_start_ind,
                       device,
                       lambda_1=5,
                       is_val=False):
        kl_div_loss = calculate_kl_div_loss(
            batch_fine_probs, batch_coarse_probs, batch_fine_input_masks,
            batch_coarse_input_masks, batch_fine_input_ids,
            batch_coarse_input_ids, coarse_tokenizer, fine_tokenizer,
            doc_start_ind)
        # del batch_fine_probs
        # del batch_coarse_probs
        # del batch_fine_input_masks
        # del batch_coarse_input_masks
        # del batch_fine_input_ids
        # del batch_coarse_input_ids
        # torch.cuda.empty_cache()
        if not is_val:
            cross_ent_loss = calculate_cross_entropy_loss(
                fine_model, label_to_exclusive_dataloader, doc_start_ind,
                device)
            print("KL-loss", kl_div_loss.item(), "CE-loss",
                  cross_ent_loss.item())
        else:
            cross_ent_loss = 0
            print("KL-loss", kl_div_loss.item(), "CE-loss", cross_ent_loss)
        return (1 - lambda_1) * kl_div_loss + lambda_1 * cross_ent_loss

    def compute_lambda(step, max_steps):
        temp = 1 - step / max_steps
        if temp < 0:
            return 0
        else:
            return temp
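    # For example, with max_steps = 1000: step 0 -> 1.0, step 250 -> 0.75, and
    # step 1000 or beyond -> 0.0, so the mixing weight decays linearly to zero.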

    # epsilon = 1e-20  # Defined to avoid log probability getting undefined.
    fine_posterior = torch.nn.Parameter(
        torch.ones(len(index_to_label)).to(device))
    optimizer = AdamW(
        list(fine_model.parameters()) + [fine_posterior],
        lr=5e-4,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )
    sample_every = 100
    warmup_steps = 1e2
    epochs = 5
    total_steps = len(train_dataloader) * epochs
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=warmup_steps,
                                                num_training_steps=total_steps)
    seed_val = 42
    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    training_stats = []
    total_t0 = time.time()

    coarse_model.eval()
    global_step = 0

    for epoch_i in range(0, epochs):
        print("", flush=True)
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs),
              flush=True)
        print('Training...', flush=True)
        t0 = time.time()
        total_train_loss = 0
        fine_model.train()

        for step, batch in enumerate(train_dataloader):
            # batch contains -> coarse_input_ids, coarse_attention_masks, fine_input_ids, fine_attention_masks
            if step % sample_every == 0 and not step == 0:
                elapsed = format_time(time.time() - t0)
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed),
                      flush=True)
                fine_model.eval()
                lbl = random.choice(list(index_to_label.values()))
                temp_list = ["<|labelpad|>"] * pad_token_dict[lbl]
                if len(temp_list) > 0:
                    label_str = " ".join(
                        lbl.split("_")) + " " + " ".join(temp_list)
                else:
                    label_str = " ".join(lbl.split("_"))
                text = fine_tokenizer.bos_token + " " + label_str + " <|labelsep|> "
                sample_outputs = fine_model.generate(
                    input_ids=fine_tokenizer.encode(
                        text, return_tensors='pt').to(device),
                    do_sample=True,
                    top_k=50,
                    max_length=200,
                    top_p=0.95,
                    num_return_sequences=1)
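                # Sampling with both top_k=50 and top_p=0.95: candidates are
                # first truncated to the 50 most likely tokens, then further
                # restricted to the smallest set covering 95% probability mass.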
                for i, sample_output in enumerate(sample_outputs):
                    print("{}: {}".format(
                        i, fine_tokenizer.decode(sample_output)),
                          flush=True)
                fine_model.train()

            fine_posterior_log_probs = torch.log_softmax(fine_posterior, dim=0)
            print(torch.softmax(fine_posterior, dim=0), flush=True)

            b_coarse_input_ids = batch[0].to(secondary_device)
            b_coarse_labels = batch[0].to(secondary_device)
            b_coarse_input_mask = batch[1].to(secondary_device)

            b_size = b_coarse_input_ids.shape[0]

            b_fine_input_ids_minibatch = batch[2].to(device)
            b_fine_input_mask_minibatch = batch[3].to(device)

            coarse_model.zero_grad()
            # fine_model.zero_grad()
            optimizer.zero_grad()

            outputs = coarse_model(b_coarse_input_ids,
                                   token_type_ids=None,
                                   attention_mask=b_coarse_input_mask,
                                   labels=b_coarse_labels)

            batch_coarse_probs = torch.softmax(outputs[1], dim=-1).to(
                device)  # (b_size, seq_len, |V|)
            b_coarse_input_ids = b_coarse_input_ids.to(device)
            b_coarse_input_mask = b_coarse_input_mask.to(device)

            batch_fine_probs = []
            batch_fine_input_masks = []
            batch_fine_input_ids = []
            for b_ind in range(b_size):
                fine_label_sum_log_probs = []
                for l_ind in index_to_label:
                    b_fine_input_ids = b_fine_input_ids_minibatch[
                        b_ind, l_ind, :].unsqueeze(0).to(device)
                    b_fine_labels = b_fine_input_ids_minibatch[
                        b_ind, l_ind, :].unsqueeze(0).to(device)
                    b_fine_input_mask = b_fine_input_mask_minibatch[
                        b_ind, l_ind, :].unsqueeze(0).to(device)

                    outputs = fine_model(b_fine_input_ids,
                                         token_type_ids=None,
                                         attention_mask=b_fine_input_mask,
                                         labels=b_fine_labels)

                    b_fine_labels = b_fine_labels.to(secondary_device)

                    fine_log_probs = torch.log_softmax(outputs[1], dim=-1)
                    fine_label_sum_log_probs.append(
                        (fine_log_probs + fine_posterior_log_probs[l_ind]))

                fine_label_sum_log_probs = torch.cat(
                    fine_label_sum_log_probs, dim=0)  # (|F|, seq_len, |V|)
                batch_fine_probs.append(fine_label_sum_log_probs.unsqueeze(0))
                batch_fine_input_ids.append(b_fine_input_ids)
                batch_fine_input_masks.append(b_fine_input_mask)

            batch_fine_probs = torch.cat(batch_fine_probs,
                                         dim=0)  # (b_size, |F|, seq_len, |V|)
            batch_fine_input_masks = torch.cat(batch_fine_input_masks,
                                               dim=0)  # (b_size, seq_len)
            batch_fine_input_ids = torch.cat(batch_fine_input_ids,
                                             dim=0)  # (b_size, seq_len)
            batch_fine_log_probs = torch.logsumexp(
                batch_fine_probs,
                dim=1)  # This computes logsum_i P(f_i|c) P(D|f_i)
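            # Since batch_fine_probs[:, i] = log p(f_i|c) + log p(D|f_i),
            # logsumexp over dim=1 gives log sum_i p(f_i|c) * p(D|f_i),
            # i.e. the log-likelihood of the document under the label mixture.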

            loss = calculate_loss(
                batch_fine_log_probs,
                batch_coarse_probs,
                batch_fine_input_masks,
                b_coarse_input_mask,
                batch_fine_input_ids,
                b_coarse_input_ids,
                coarse_tokenizer,
                fine_tokenizer,
                fine_model,
                label_to_exclusive_dataloader,
                doc_start_ind,
                device,
                lambda_1=compute_lambda(global_step,
                                        max_steps=len(train_dataloader) *
                                        epochs))
            # loss = criterion(batch_fine_probs.log(), batch_coarse_probs.detach()).sum(dim=-1).mean(dim=-1).mean(dim=-1)
            total_train_loss += loss.item()
            print("Loss:", loss.item(), flush=True)

            loss.backward()
            optimizer.step()
            scheduler.step()
            global_step += 1

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("", flush=True)
        print("  Average training loss: {0:.2f}".format(avg_train_loss),
              flush=True)
        print("  Training epoch took: {:}".format(training_time), flush=True)

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("", flush=True)
        print("Running Validation...", flush=True)

        t0 = time.time()

        fine_model.eval()

        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:
            # batch contains -> coarse_input_ids, coarse_attention_masks, fine_input_ids, fine_attention_masks
            b_coarse_input_ids = batch[0].to(secondary_device)
            b_coarse_labels = batch[0].to(secondary_device)
            b_coarse_input_mask = batch[1].to(secondary_device)

            b_size = b_coarse_input_ids.shape[0]

            b_fine_input_ids_minibatch = batch[2].to(device)
            b_fine_input_mask_minibatch = batch[3].to(device)

            with torch.no_grad():
                fine_posterior_log_probs = torch.log_softmax(fine_posterior,
                                                             dim=0)
                outputs = coarse_model(b_coarse_input_ids,
                                       token_type_ids=None,
                                       attention_mask=b_coarse_input_mask,
                                       labels=b_coarse_labels)

                batch_coarse_probs = torch.softmax(outputs[1], dim=-1).to(
                    device)  # (b_size, seq_len, |V|)

                b_coarse_input_ids = b_coarse_input_ids.to(device)
                b_coarse_input_mask = b_coarse_input_mask.to(device)

                batch_fine_probs = []
                batch_fine_input_masks = []
                batch_fine_input_ids = []
                for b_ind in range(b_size):
                    fine_label_sum_log_probs = []
                    for l_ind in index_to_label:
                        b_fine_input_ids = b_fine_input_ids_minibatch[
                            b_ind, l_ind, :].unsqueeze(0).to(device)
                        b_fine_labels = b_fine_input_ids_minibatch[
                            b_ind, l_ind, :].unsqueeze(0).to(device)
                        b_fine_input_mask = b_fine_input_mask_minibatch[
                            b_ind, l_ind, :].unsqueeze(0).to(device)

                        outputs = fine_model(b_fine_input_ids,
                                             token_type_ids=None,
                                             attention_mask=b_fine_input_mask,
                                             labels=b_fine_labels)
                        fine_log_probs = torch.log_softmax(outputs[1], dim=-1)
                        fine_label_sum_log_probs.append(
                            (fine_log_probs + fine_posterior_log_probs[l_ind]))

                    fine_label_sum_log_probs = torch.cat(
                        fine_label_sum_log_probs, dim=0)  # (|F|, seq_len, |V|)
                    batch_fine_probs.append(
                        fine_label_sum_log_probs.unsqueeze(0))
                    batch_fine_input_ids.append(b_fine_input_ids)
                    batch_fine_input_masks.append(b_fine_input_mask)

                batch_fine_probs = torch.cat(
                    batch_fine_probs, dim=0)  # (b_size, |F|, seq_len, |V|)
                batch_fine_input_masks = torch.cat(batch_fine_input_masks,
                                                   dim=0)  # (b_size, seq_len)
                batch_fine_input_ids = torch.cat(batch_fine_input_ids,
                                                 dim=0)  # (b_size, seq_len)
                batch_fine_log_probs = torch.logsumexp(
                    batch_fine_probs,
                    dim=1)  # This computes logsum_i P(f_i|c) P(D|f_i)

            # Accumulate the validation loss.
            loss = calculate_loss(
                batch_fine_log_probs,
                batch_coarse_probs,
                batch_fine_input_masks,
                b_coarse_input_mask,
                batch_fine_input_ids,
                b_coarse_input_ids,
                coarse_tokenizer,
                fine_tokenizer,
                fine_model,
                label_to_exclusive_dataloader,
                doc_start_ind,
                device,
                is_val=True,
                lambda_1=compute_lambda(global_step,
                                        max_steps=len(train_dataloader) *
                                        epochs))
            total_eval_loss += loss.item()

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss), flush=True)
        print("  Validation took: {:}".format(validation_time), flush=True)

        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

        # todo make temp_df, fine_input_ids, fine_attention_masks class variables.
        # true, preds, _ = test(fine_model, fine_posterior, fine_input_ids, fine_attention_masks, doc_start_ind,
        #                       index_to_label, label_to_index, list(temp_df.label.values), device)

    print("", flush=True)
    print("Training complete!", flush=True)

    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)),
          flush=True)
    return fine_posterior, fine_model
Example #16
0
def train_ft(args, model_ft, train_dataset):
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.mini_batch_size * max(1, args.n_gpu)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    args.num_train_epochs = 1
    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs

    if args.warmup_proportion > 0:
        args.warmup_steps = int(t_total * args.warmup_proportion)
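    # Illustrative arithmetic (numbers not from the source): with 1000 batches,
    # gradient_accumulation_steps=4 and one epoch, t_total = 250 optimizer
    # steps; warmup_proportion=0.1 then gives warmup_steps = 25.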

    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in list(model_ft.named_parameters())
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in list(model_ft.named_parameters())
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
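    # Standard transformer fine-tuning convention: bias and LayerNorm weights
    # are excluded from weight decay, since decaying them tends to hurt rather
    # than regularize.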

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model_ft, optimizer = amp.initialize(model_ft,
                                             optimizer,
                                             opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model_ft = torch.nn.DataParallel(model_ft)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model_ft = torch.nn.parallel.DistributedDataParallel(
            model_ft,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0

    model_ft.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])

    set_seed(args)
    logger.info("******* train ft *************")
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iter(loss=X.XXX, lr=X.XXXXXXXX)",
                              disable=args.local_rank not in [-1, 0])

        for step, batch in enumerate(epoch_iterator):
            model_ft.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "labels": batch[3],
                "label_mask": batch[4],
            }

            outputs = model_ft(**inputs)
            loss = outputs  # this model is assumed to return the loss tensor directly

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                epoch_iterator.set_description(
                    'Iter (loss=%5.3f) lr=%9.7f' %
                    (loss.item(), scheduler.get_lr()[0]))
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model_ft.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model_ft.zero_grad()
                global_step += 1

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return model_ft
Example #17
0
def train(args, train_dataset, valid_dataset, model, tokenizer, labels):

    # Prepare train data
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)
    train_batch_size = args.train_batch_size

    # Prepare optimizer
    t_total = len(train_dataloader) * args.num_train_epochs
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=t_total // 10,
                                                num_training_steps=t_total)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", train_batch_size)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch")
    set_seed(args)
    best_f1_score = 0
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration")
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[2]
            }
            outputs = model(**inputs, return_dict=False)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           args.max_grad_norm)

            tr_loss += loss.item()
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            global_step += 1

        # Evaluate on the validation set after each epoch and keep the checkpoint with the best F1
        results = evaluate(args, model, tokenizer, labels, 'validation')
        if results.get('f1') > best_f1_score and args.save_steps > 0:
            best_f1_score = results.get('f1')
            model_to_save = model.module if hasattr(model, "module") else model
            model_to_save.save_pretrained(args.output_dir)
            tokenizer.save_pretrained(args.output_dir)
            torch.save(args, os.path.join(args.output_dir,
                                          "training_args.bin"))

    return global_step, tr_loss / global_step
Example #18
0
def train(args, train_dataset, model, tokenizer, criterion):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=args.train_batch_size,
        collate_fn=collate_fn,
        num_workers=args.num_workers,
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1, n_no_improve = 0, 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproductibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            labels = batch[5]
            inputs = {
                "input_ids": batch[0],
                "input_modal": batch[2],
                "attention_mask": batch[1],
                "modal_start_tokens": batch[3],
                "modal_end_tokens": batch[4],
            }
            outputs = model(**inputs)
            logits = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            loss = criterion(logits, labels)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
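            # Dividing by the accumulation count keeps the effective gradient
            # an average over the virtual batch, e.g. 4 micro-batches of loss
            # 2.0 contribute 4 * (2.0 / 4) = 2.0 before the optimizer step.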

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer, criterion)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    torch.save(model_to_save.state_dict(),
                               os.path.join(output_dir, WEIGHTS_NAME))
                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

        if args.local_rank == -1:
            results = evaluate(args, model, tokenizer, criterion)
            if results["micro_f1"] > best_f1:
                best_f1 = results["micro_f1"]
                n_no_improve = 0
            else:
                n_no_improve += 1

            if n_no_improve > args.patience:
                train_iterator.close()
                break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #19
0
def main(params: dict, output_dir: str):
    import mlflow
    print("start params={}".format(params))
    model_id = "train_0"
    logger = get_logger()
    # df = pd.read_pickle("../input/riiid-test-answer-prediction/train_merged.pickle")
    df = pd.read_pickle(
        "../input/riiid-test-answer-prediction/split10/train_0.pickle"
    ).sort_values(["user_id", "timestamp"]).reset_index(drop=True)
    if is_debug:
        df = df.head(30000)
    df["prior_question_had_explanation"] = df[
        "prior_question_had_explanation"].fillna(-1)
    column_config = {
        ("content_id", "content_type_id"): {
            "type": "category"
        },
        "user_answer": {
            "type": "leakage_feature"
        },
        "answered_correctly": {
            "type": "leakage_feature"
        },
        "part": {
            "type": "category"
        },
        "prior_question_elapsed_time_bin300": {
            "type": "category"
        },
        "duration_previous_content_bin300": {
            "type": "category"
        },
        "prior_question_had_explanation": {
            "type": "category"
        },
        "rating_diff_content_user_id": {
            "type": "numeric"
        },
        "task_container_id_bin300": {
            "type": "category"
        },
    }

    if not load_pickle or is_debug:
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent()
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_dict["user_id"][
            "UserContentRateEncoder"] = UserContentRateEncoder(
                rate_func="elo", column="user_id")
        feature_factory_dict["user_id"][
            "StudyTermEncoder"] = StudyTermEncoder2()

        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="train_0",
            load_feature=not is_debug,
            save_feature=not is_debug)

        print("all_predict")
        df = feature_factory_manager.all_predict(df)
        df["task_container_id_bin300"] = [
            x if x < 300 else 300 for x in df["task_container_id"].values
        ]

        def f(x):
            x = x // 1000
            if x > 150:
                return 150
            if x < -150:
                return -150
            return x

        df["study_time_bin300"] = [f(x) for x in df["study_time"].values]
        df = df[[
            "user_id", "content_id", "content_type_id", "part", "user_answer",
            "answered_correctly", "prior_question_elapsed_time_bin300",
            "duration_previous_content_bin300", "study_time_bin300",
            "prior_question_had_explanation", "rating_diff_content_user_id",
            "task_container_id_bin300"
        ]]
        print(df.head(10))

        print("data preprocess")

        train_idx = []
        val_idx = []
        np.random.seed(0)
        for _, w_df in df[df["content_type_id"] == 0].groupby("user_id"):
            if np.random.random() < 0.01:
                # all val
                val_idx.extend(w_df.index.tolist())
            else:
                train_num = int(len(w_df) * 0.95)
                train_idx.extend(w_df[:train_num].index.tolist())
                val_idx.extend(w_df[train_num:].index.tolist())
    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    ff_for_transformer.make_dict(df=df)
    n_skill = len(ff_for_transformer.embbed_dict[("content_id",
                                                  "content_type_id")])
    if not load_pickle or is_debug:
        df["is_val"] = 0
        df["is_val"].loc[val_idx] = 1
        w_df = df[df["is_val"] == 0]
        w_df["group"] = (
            w_df.groupby("user_id")["user_id"].transform("count") -
            w_df.groupby("user_id").cumcount()) // params["max_seq"]
        w_df["user_id"] = w_df["user_id"].astype(
            str) + "_" + w_df["group"].astype(str)

        group = ff_for_transformer.all_predict(w_df)

        dataset_train = SAKTDataset(group,
                                    n_skill=n_skill,
                                    max_seq=params["max_seq"])

        del w_df
        gc.collect()

    ff_for_transformer = FeatureFactoryForTransformer(
        column_config=column_config,
        dict_path="../feature_engineering/",
        sequence_length=params["max_seq"],
        logger=logger)
    if not load_pickle or is_debug:
        group = ff_for_transformer.all_predict(df[df["content_type_id"] == 0])
        dataset_val = SAKTDataset(group,
                                  is_test=True,
                                  n_skill=n_skill,
                                  max_seq=params["max_seq"])

    os.makedirs("../input/feature_engineering/model137", exist_ok=True)
    if not is_debug and not load_pickle:
        with open(f"../input/feature_engineering/model137/train.pickle",
                  "wb") as f:
            pickle.dump(dataset_train, f)
        with open(f"../input/feature_engineering/model137/val.pickle",
                  "wb") as f:
            pickle.dump(dataset_val, f)

    if not is_debug and load_pickle:
        with open(f"../input/feature_engineering/model137/train.pickle",
                  "rb") as f:
            dataset_train = pickle.load(f)
        with open(f"../input/feature_engineering/model137/val.pickle",
                  "rb") as f:
            dataset_val = pickle.load(f)
        print("loaded!")
    dataloader_train = DataLoader(dataset_train,
                                  batch_size=params["batch_size"],
                                  shuffle=True,
                                  num_workers=1)
    dataloader_val = DataLoader(dataset_val,
                                batch_size=params["batch_size"],
                                shuffle=False,
                                num_workers=1)

    model = SAKTModel(n_skill,
                      embed_dim=params["embed_dim"],
                      max_seq=params["max_seq"],
                      dropout=dropout,
                      n_encoder_layer=params["n_encoder_layer"],
                      n_decoder_layer=params["n_decoder_layer"],
                      emb1=params["emb1"],
                      emb2=params["emb2"])

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        0.01
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=params["lr"],
        weight_decay=0.01,
    )
    num_train_optimization_steps = int(len(dataloader_train) * epochs)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=params["num_warmup_steps"],
        num_training_steps=num_train_optimization_steps)
    criterion = nn.BCEWithLogitsLoss()

    model.to(device)
    criterion.to(device)

    for epoch in range(epochs):
        loss, acc, auc, auc_val = train_epoch(model, dataloader_train,
                                              dataloader_val, optimizer,
                                              criterion, scheduler, device)
        print("epoch - {} train_loss - {:.3f} auc - {:.4f} auc-val: {:.4f}".
              format(epoch, loss, auc, auc_val))

    preds = []
    labels = []
    with torch.no_grad():
        for item in tqdm(dataloader_val):
            label = item["label"].to(device).float()

            output = model(item, device)
            preds.extend(torch.nn.Sigmoid()(
                output[:, -1]).view(-1).data.cpu().numpy().tolist())
            labels.extend(label[:, -1].view(-1).data.cpu().numpy().tolist())

    auc_transformer = roc_auc_score(labels, preds)
    print("single transformer: {:.4f}".format(auc_transformer))
    df_oof = pd.DataFrame()
    # df_oof["row_id"] = df.loc[val_idx].index
    print(len(dataloader_val))
    print(len(preds))
    df_oof["predict"] = preds
    df_oof["target"] = labels

    df_oof.to_csv(f"{output_dir}/transformers1.csv", index=False)
    """
    df_oof2 = pd.read_csv("../output/ex_237/20201213110353/oof_train_0_lgbm.csv")
    df_oof2.columns = ["row_id", "predict_lgbm", "target"]
    df_oof2 = pd.merge(df_oof, df_oof2, how="inner")

    auc_lgbm = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values)
    print("lgbm: {:.4f}".format(auc_lgbm))

    print("ensemble")
    max_auc = 0
    max_nn_ratio = 0
    for r in np.arange(0, 1.05, 0.05):
        auc = roc_auc_score(df_oof2["target"].values, df_oof2["predict_lgbm"].values*(1-r) + df_oof2["predict"].values*r)
        print("[nn_ratio: {:.2f}] AUC: {:.4f}".format(r, auc))

        if max_auc < auc:
            max_auc = auc
            max_nn_ratio = r
    print(len(df_oof2))
    """
    if not is_debug:
        mlflow.start_run(experiment_id=10, run_name=os.path.basename(__file__))

        for key, value in params.items():
            mlflow.log_param(key, value)
        mlflow.log_metric("auc_val", auc_transformer)
        mlflow.end_run()
    torch.save(model.state_dict(), f"{output_dir}/transformers.pth")
    del model
    torch.cuda.empty_cache()
    with open(f"{output_dir}/transformer_param.json", "w") as f:
        json.dump(params, f)
    if is_make_feature_factory:
        # feature factory
        feature_factory_dict = {"user_id": {}}
        feature_factory_dict["user_id"][
            "DurationPreviousContent"] = DurationPreviousContent(
                is_partial_fit=True)
        feature_factory_dict["user_id"][
            "ElapsedTimeBinningEncoder"] = ElapsedTimeBinningEncoder()
        feature_factory_manager = FeatureFactoryManager(
            feature_factory_dict=feature_factory_dict,
            logger=logger,
            split_num=1,
            model_id="all",
            load_feature=not is_debug,
            save_feature=not is_debug)

        ff_for_transformer = FeatureFactoryForTransformer(
            column_config=column_config,
            dict_path="../feature_engineering/",
            sequence_length=params["max_seq"],
            logger=logger)
        df = pd.read_pickle(
            "../input/riiid-test-answer-prediction/train_merged.pickle")
        if is_debug:
            df = df.head(10000)
        df = df.sort_values(["user_id", "timestamp"]).reset_index(drop=True)
        feature_factory_manager.fit(df)
        df = feature_factory_manager.all_predict(df)
        for dicts in feature_factory_manager.feature_factory_dict.values():
            for factory in dicts.values():
                factory.logger = None
        feature_factory_manager.logger = None
        with open(f"{output_dir}/feature_factory_manager.pickle", "wb") as f:
            pickle.dump(feature_factory_manager, f)

        ff_for_transformer.fit(df)
        ff_for_transformer.logger = None
        with open(
                f"{output_dir}/feature_factory_manager_for_transformer.pickle",
                "wb") as f:
            pickle.dump(ff_for_transformer, f)
Example #20
0
    def __setup_model_data(self, dataset, lower_case):
        """ set up data/language model """
        if self.model is not None:
            return
        if self.args.is_trained:
            self.model = transformers.AutoModelForTokenClassification.from_pretrained(
                self.args.transformers_model)
            self.transforms = Transforms(self.args.transformers_model,
                                         cache_dir=self.cache_dir)
            self.label_to_id = self.model.config.label2id
            self.dataset_split, self.label_to_id, self.language, self.unseen_entity_set = get_dataset_ner(
                dataset,
                label_to_id=self.label_to_id,
                fix_label_dict=True,
                lower_case=lower_case)
            self.id_to_label = {v: str(k) for k, v in self.label_to_id.items()}
        else:
            self.dataset_split, self.label_to_id, self.language, self.unseen_entity_set = get_dataset_ner(
                dataset, lower_case=lower_case)
            self.id_to_label = {v: str(k) for k, v in self.label_to_id.items()}
            config = transformers.AutoConfig.from_pretrained(
                self.args.transformers_model,
                num_labels=len(self.label_to_id),
                id2label=self.id_to_label,
                label2id=self.label_to_id,
                cache_dir=self.cache_dir)
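            # Persisting id2label/label2id inside the config means the mapping
            # travels with the checkpoint, which is what the is_trained branch
            # above relies on when it reads model.config.label2id.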

            self.model = transformers.AutoModelForTokenClassification.from_pretrained(
                self.args.transformers_model, config=config)
            self.transforms = Transforms(self.args.transformers_model,
                                         cache_dir=self.cache_dir)

        # optimizer
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [{
            "params": [
                p for n, p in self.model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            self.args.weight_decay
        }, {
            "params": [
                p for n, p in self.model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        }]
        self.optimizer = transformers.AdamW(optimizer_grouped_parameters,
                                            lr=self.args.lr,
                                            eps=1e-8)

        # scheduler
        self.scheduler = transformers.get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=self.args.warmup_step,
            num_training_steps=self.args.total_step)

        # GPU allocation
        self.model.to(self.device)

        # GPU mixture precision
        if self.args.fp16:
            try:
                from apex import amp  # noqa: F401
                self.model, self.optimizer = amp.initialize(
                    self.model,
                    self.optimizer,
                    opt_level='O1',
                    max_loss_scale=2**13,
                    min_loss_scale=1e-5)
                self.master_params = amp.master_params
                self.scale_loss = amp.scale_loss
                logging.info('using `apex.amp`')
            except ImportError:
                logging.exception(
                    "Skip apex: please install apex from https://www.github.com/nvidia/apex to use fp16"
                )

        # multi-gpus
        if self.n_gpu > 1:
            # multi-gpu training (should be after apex fp16 initialization)
            self.model = torch.nn.DataParallel(self.model.cuda())
            logging.info('using `torch.nn.DataParallel`')
        logging.info('running on %i GPUs' % self.n_gpu)
Example #21
0
def run(n_epochs, lr, train_batch_size, val_batch_size, base_model,
        clustering_loss_weight, embedding_extractor, annealing_alphas, dataset,
        train_idx_file, val_idx_file, result_dir, early_stopping,
        early_stopping_tol, device, random_state):
    # Set random states
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)

    # load data
    df = pd.read_csv(dataset)

    with open(train_idx_file, 'r') as f:
        train_idx = np.array(list(map(int, f.readlines())))

    with open(val_idx_file, 'r') as f:
        val_idx = np.array(list(map(int, f.readlines())))

    all_idx = np.concatenate((train_idx, val_idx))

    df_train = df.iloc[all_idx].copy()

    train_texts = df_train['texts'].to_numpy()
    train_labels = df_train['labels'].to_numpy()

    train_data = TextDataset(train_texts, train_labels)
    train_data_loader = DataLoader(dataset=train_data,
                                   batch_size=train_batch_size,
                                   shuffle=False)

    df_val = df.iloc[val_idx].copy()

    val_texts = df_val['texts'].to_numpy()
    val_labels = df_val['labels'].to_numpy()

    val_data = TextDataset(val_texts, val_labels)
    val_data_loader = DataLoader(dataset=val_data,
                                 batch_size=val_batch_size,
                                 shuffle=False)

    # init lm model & tokenizer
    lm_model = AutoModelForMaskedLM.from_pretrained(base_model,
                                                    return_dict=True,
                                                    output_hidden_states=True)
    tokenizer = AutoTokenizer.from_pretrained(base_model,
                                              return_dict=True,
                                              output_hidden_states=True)

    lm_model.to(device)

    # init clustering model
    model, initial_centroids, initial_embeddings = init_model(
        lm_model=lm_model,
        tokenizer=tokenizer,
        data_loader=train_data_loader,
        embedding_extractor=embedding_extractor,
        n_clusters=np.unique(train_labels).shape[0],
        device=device)

    # init optimizer & scheduler
    opt = torch.optim.RMSprop(
        params=model.parameters(),
        lr=lr,  # 2e-5, 5e-7,
        eps=1e-8)

    total_steps = len(train_data_loader) * n_epochs

    scheduler = get_linear_schedule_with_warmup(
        optimizer=opt,
        num_warmup_steps=int(len(train_data_loader) * 0.5),
        num_training_steps=total_steps)

    # train the model
    hist = train(n_epochs=n_epochs,
                 model=model,
                 optimizer=opt,
                 scheduler=scheduler,
                 annealing_alphas=annealing_alphas,
                 train_data_loader=train_data_loader,
                 clustering_loss_weight=clustering_loss_weight,
                 early_stopping=early_stopping,
                 early_stopping_tol=early_stopping_tol,
                 verbose=True)

    # do eval
    run_results = {}

    predicted_labels, true_labels = evaluate(model=model,
                                             eval_data_loader=val_data_loader,
                                             verbose=True)

    best_matching, accuracy = cluster_accuracy(true_labels, predicted_labels)
    ari = adjusted_rand_score(true_labels, predicted_labels)
    nmi = normalized_mutual_info_score(true_labels, predicted_labels)
    purity = purity_score(y_true=true_labels, y_pred=predicted_labels)

    run_results['best_matching'] = best_matching
    run_results['accuracy'] = accuracy
    run_results['ari'] = ari
    run_results['nmi'] = nmi
    run_results[
        'purity'] = purity  # use purity to compare with microsoft paper

    # save results & model
    os.makedirs(result_dir)
    with open(os.path.join(result_dir, 'train_hist.h'), 'wb') as f:
        pickle.dump(hist, file=f)

    result_df = pd.DataFrame.from_records([run_results])
    result_df.to_csv(os.path.join(result_dir,
                                  f'ag_news_subset5-distilbert.csv'),
                     index=False)

    torch.save(model, os.path.join(result_dir, 'model.bin'))
Example #22
0
def evaluate_model(original_data, model_config, eval_config, seed=None):
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)

    skf = StratifiedKFold(n_splits=eval_config['n_folds'])
    accuracies = []
    fold = 1

    data = original_data.copy()
    basic_cols = ['text']
    manual_cols = [
        'constructive', 'toxic', 'sarcasm_irony', 'mockery_ridicule',
        'insults', 'argument_discussion', 'negative_toxic_lang',
        'aggressiveness', 'intolerance'
    ]
    manual_transformation = {'sí': 1, 'si': 1, 'no': 0, 'd': 0.5}
    label_col = 'toxicity_degree'
    implemented_models = ['Random Forest', 'SVC', 'Logistic Regression']

    if eval_config['basic_manual_both'] == 2:
        for col in manual_cols:
            data[col] = data[col].str.lower().map(manual_transformation)
        data = data[basic_cols + manual_cols + [label_col]]
    elif eval_config['basic_manual_both'] == 1:
        for col in manual_cols:
            data[col] = data[col].str.lower().map(manual_transformation)
        data = data[manual_cols + [label_col]]
    else:
        data = data[basic_cols + [label_col]]

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

    for train_index, test_index in skf.split(data, data[label_col].values):
        train = data.iloc[train_index, :]
        test = data.iloc[test_index, :]
        if model_config['name'].startswith('bert'):
            # Create data loaders based on the data split
            train_data_loader = create_data_loader(train,
                                                   model_config['tokenizer'],
                                                   model_config['max_len'],
                                                   model_config['batch_size'])
            test_data_loader = create_data_loader(test,
                                                  model_config['tokenizer'],
                                                  model_config['max_len'],
                                                  model_config['batch_size'])

            # Create the model and load it into the device
            model = HateSpeechClassifier(model_config['name'],
                                         data[label_col].nunique())
            model = model.to(device)

            # Add the Adam optimizer
            optimizer = torch.optim.Adam(params=model.parameters(),
                                         lr=model_config['learning_rate'])
            total_steps = len(train_data_loader) * model_config['epochs']

            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=0, num_training_steps=total_steps)

            # Use the cross entropy loss
            weights = torch.Tensor(
                1 /
                train.groupby('toxicity_degree').size().sort_index().values)
            loss_fn = nn.CrossEntropyLoss(weight=weights).to(device)
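            # Inverse-frequency class weights (illustrative numbers): with
            # class counts [100, 50, 10], the weights become [0.01, 0.02, 0.1],
            # so rare classes contribute proportionally more to the loss.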

            # Evaluate model on test
            history = defaultdict(list)

            for epoch in range(model_config['epochs']):

                print('Epoch {}/{}'.format(epoch + 1, model_config['epochs']))
                print('-' * 10)

                train_acc, train_loss = train_epoch(model, train_data_loader,
                                                    loss_fn, optimizer,
                                                    device, scheduler,
                                                    len(train_index))

                print(f'Train loss {train_loss} accuracy {train_acc}')

                history['train_acc'].append(train_acc)
                history['train_loss'].append(train_loss)

                y_pred, test_acc = eval_nn(model, test_data_loader, device,
                                           len(test))

                print(f'Test  accuracy {test_acc}')
                print()

            # y_pred = eval_nn(model, test_data_loader, device, len(val))

        elif model_config['name'] in implemented_models:
            train_x = train[[c for c in train.columns if c != label_col]]
            train_y = train[label_col].values

            test_x = test[[c for c in test.columns if c != label_col]]

            if eval_config['basic_manual_both'] != 1:
                bow = TfidfVectorizer(
                    strip_accents=model_config['strip_accents'],
                    stop_words=model_config['stop_words'])
                train_bow_feats = bow.fit_transform(
                    train_x.text.values).todense()

                # Perform dimensionality reduction
                pca = PCA(n_components=model_config['PCA_components'],
                          svd_solver=model_config['svd_solver'])
                train_bow_feats = pca.fit_transform(train_bow_feats)
                test_bow_feats = bow.transform(test_x.text.values).todense()
                test_bow_feats = pca.transform(test_bow_feats)

                train_x = train_x.drop('text', axis=1)
                test_x = test_x.drop('text', axis=1)

                train_x = np.hstack((train_x.values, train_bow_feats))
                test_x = np.hstack((test_x.values, test_bow_feats))
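                # The TF-IDF vocabulary and PCA components are fit on the
                # train fold only and merely applied to the test fold, which
                # avoids leaking test-set statistics into the features.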

            if model_config['name'] == 'Random Forest':
                rfc = RandomForestClassifier(
                    n_estimators=model_config['n_trees'],
                    criterion=model_config['criterion'],
                    max_features=model_config['n_feats'],
                    bootstrap=model_config['bootstrap'])
                rfc.fit(train_x, train_y)

                if eval_config['log']:
                    print(rfc.feature_importances_)

                y_pred = rfc.predict(test_x)

            elif model_config['name'] == 'SVC':
                svm = SVC(
                    kernel=model_config['kernel'],
                    decision_function_shape=model_config['decision_func'],
                    gamma=model_config['gamma'],
                    C=model_config['penalty'])
                svm.fit(train_x, train_y)
                y_pred = svm.predict(test_x)

            elif model_config['name'] == 'Logistic Regression':
                lr = LogisticRegression(
                    penalty=model_config['penalty'],
                    solver=model_config['solver'],
                    multi_class=model_config['multi_class'])
                lr.fit(train_x, train_y)
                y_pred = lr.predict(test_x)
        else:
            print('No valid model has been selected')
            return

        accuracies.append(
            f1_score(test[label_col].values,
                     y_pred,
                     labels=data[label_col].unique(),
                     average='macro'))  # note: despite the name, this stores macro-F1
        # print('Accuracy for Fold', fold, 'is:', np.round(accuracies[-1], 4))

        fold += 1

    mean_accuracy = np.mean(accuracies)
    std_accuracy = np.std(accuracies)

    # print('Total Prediction Accuracy is:', np.round(mean_accuracy, 4), '\u00B1', np.round(std_accuracy, 4))

    return mean_accuracy, std_accuracy
Example #23
0
    def selftrain(self, soft=True):
        selftrain_dataset = ConcatDataset([self.train_dataset, self.unlabeled])
        ## generating pseudo_labels
        pseudo_labels = []
        train_sampler = RandomSampler(selftrain_dataset)
        train_dataloader = DataLoader(
            selftrain_dataset, sampler=train_sampler, batch_size=self.args.batch_size
        )
        if self.args.self_training_max_step > 0:
            t_total = self.args.self_training_max_step
            self.args.num_train_epochs = (
                self.args.self_training_max_step
                // (len(train_dataloader) // self.args.gradient_accumulation_steps)
                + 1
            )
        else:
            t_total = (
                len(train_dataloader)
                // self.args.gradient_accumulation_steps
                * self.args.num_train_epochs
            )

        # Prepare optimizer and schedule (linear warmup and decay)
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p
                    for n, p in self.model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "weight_decay": self.args.weight_decay,
            },
            {
                "params": [
                    p
                    for n, p in self.model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=self.args.learning_rate,
            eps=self.args.adam_epsilon,
        )
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=self.args.warmup_steps,
            num_training_steps=t_total,
        )
        self_training_loss = (
            nn.KLDivLoss(reduction="none")
            if soft
            else nn.CrossEntropyLoss(reduction="none")
        )
        softmax = nn.Softmax(dim=1)
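        # nn.KLDivLoss expects log-probabilities as input and probabilities as
        # target, which is why the student logits are passed through
        # torch.log(softmax(...)) further down before computing the loss.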
        update_step = 0
        self_training_steps = self.args.self_training_max_step
        global_step = 0
        selftrain_loss = 0
        set_seed(self.args)
        # self.model.zero_grad()
        for t3 in range(int(self_training_steps / len(train_dataloader)) + 1):
            epoch_iterator = tqdm(train_dataloader, desc="SelfTrain, Iteration")
            for step, batch in enumerate(epoch_iterator):
                if global_step % self.args.self_training_update_period == 0:
                    teacher_model = copy.deepcopy(self.model)  # .to("cuda")
                    teacher_model.eval()
                    for p in teacher_model.parameters():
                        p.requires_grad = False

                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)  # GPU or CPU
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "token_type_ids": batch[2],
                }
                # self.model.eval()
                if self.args.task_type == "wic":
                    inputs["keys"] = batch[6]
                elif self.args.task_type == "re":
                    inputs["e1_mask"] = batch[4]
                    inputs["e2_mask"] = batch[5]
                outputs = self.model(**inputs)
                outputs_pseudo = teacher_model(**inputs)

                logits = outputs[0]
                true_labels = batch[-1]

                loss = self.calc_loss(
                    input=torch.log(softmax(logits)),
                    target=outputs_pseudo[0],
                    loss=self_training_loss,
                    thresh=self.args.self_training_eps,
                    soft=soft,
                    conf="entropy",
                    confreg=self.args.self_training_confreg,
                )

                if self.args.self_training_contrastive_weight > 0:
                    contrastive_loss = self.contrastive_loss(
                        input=torch.log(softmax(logits)),
                        feat=outputs_pseudo[-1],
                        target=outputs_pseudo[0],
                        conf="entropy",
                        thresh=self.args.self_training_eps,
                        distmetric=self.args.distmetric,
                    )
                    loss = (
                        loss
                        + self.args.self_training_contrastive_weight * contrastive_loss
                    )

                if self.args.gradient_accumulation_steps > 1:
                    loss = loss / self.args.gradient_accumulation_steps

                if torch.cuda.device_count() > 1:
                    loss = loss.mean()

                selftrain_loss += loss.item()
                loss.backward()
                if (step + 1) % self.args.gradient_accumulation_steps == 0:
                    torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(), self.args.max_grad_norm
                    )

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    self.model.zero_grad()
                    teacher_model.zero_grad()
                    global_step += 1
                    epoch_iterator.set_description(
                        "SelfTrain iter:%d Loss:%.3f m:%.3f"
                        % (step, selftrain_loss / global_step, 0)
                    )
                    if (
                        self.args.self_train_logging_steps > 0
                        and global_step % self.args.self_train_logging_steps == 0
                    ):
                        # self.evaluate("dev", global_step)
                        self.evaluate("test", global_step)

                    if (
                        self.args.save_steps > 0
                        and global_step % self.args.save_steps == 0
                    ):
                        self.save_model()

                if 0 < self.args.self_training_max_step < global_step:
                    epoch_iterator.close()
                    break

            if 0 < self.args.self_training_max_step < global_step:
                break
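
# Hedged sketch of the soft self-training objective that calc_loss applies
# above: the student is pulled toward the frozen teacher's distribution via a
# per-example KL divergence, keeping only teacher-confident examples. The
# threshold and reduction choices here are illustrative assumptions.
import torch
import torch.nn as nn

def soft_selftrain_loss(student_logits, teacher_logits, thresh=0.6):
    softmax = nn.Softmax(dim=1)
    teacher_probs = softmax(teacher_logits).detach()  # teacher is frozen
    conf_mask = (teacher_probs.max(dim=1).values > thresh).float()
    per_example_kl = nn.KLDivLoss(reduction="none")(
        torch.log(softmax(student_logits)), teacher_probs
    ).sum(dim=1)
    return (per_example_kl * conf_mask).sum() / conf_mask.sum().clamp(min=1)

# e.g. inside the loop above: loss = soft_selftrain_loss(outputs[0], outputs_pseudo[0])
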
def train(args, train_dataset, bert_model, model, tokenizer, labels,
          pad_token_label_id):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    loss_fct = torch.nn.CrossEntropyLoss()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs
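    # Worked example (values assumed): with 1000 batches per epoch,
    # gradient_accumulation_steps = 2 and num_train_epochs = 3, each epoch
    # performs 1000 // 2 = 500 optimizer updates, so t_total = 1500; the
    # warmup/decay schedule below is defined over these update steps, not
    # over raw batches.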

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        # single-process run here (RandomSampler, no distributed world size)
        args.train_batch_size * args.gradient_accumulation_steps * 1,
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        try:
            global_step = int(
                args.model_name_or_path.split("-")[-1].split("/")[0])
        except ValueError:
            global_step = 0
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2] if args.model_type in ["bert", "xlnet"] else None
                )  # XLM and RoBERTa don't use segment_ids

            hs = bert_model(inputs["input_ids"],
                            attention_mask=inputs["attention_mask"],
                            output_hidden_states=True)

            avg_emb = 0
            for layer in range(1, len(hs.hidden_states)):
                avg_emb += hs.hidden_states[layer]

            avg_emb = torch.div(avg_emb, len(hs.hidden_states) - 1)
            #print(avg_emb.shape, len(hs.hidden_states))
            #cls_hs = hs[0]

            logits = model(avg_emb)
            active_loss = inputs["attention_mask"].view(-1) == 1
            active_logits = logits.view(-1, args.num_labels)
            active_labels = torch.where(
                active_loss, inputs["labels"].view(-1),
                torch.tensor(loss_fct.ignore_index).type_as(inputs["labels"]))
            loss = loss_fct(active_logits, active_labels)

            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule (after optimizer.step(), per PyTorch >= 1.1)
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results, _ = evaluate(args,
                                              model,
                                              tokenizer,
                                              labels,
                                              pad_token_label_id,
                                              mode="dev")
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)

                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training

                    bert_model_to_save = (bert_model.module if hasattr(
                        bert_model, "module") else bert_model)

                    bert_model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    # Good practice: save your training arguments together with the trained model
                    torch.save(
                        args, os.path.join(output_dir, "training_args.bin"))
                    torch.save(
                        model_to_save.state_dict(),
                        os.path.join(output_dir, "bert_lstm.model"))

                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
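
# Standalone sketch of the layer-averaged features computed inside the loop
# above: the hidden states of every transformer layer (skipping index 0, the
# embedding output) are averaged and would feed the external token classifier.
# The model and tokenizer names here are generic assumptions.
import torch
from transformers import AutoModel, AutoTokenizer

tok = AutoTokenizer.from_pretrained("bert-base-uncased")
bert = AutoModel.from_pretrained("bert-base-uncased")
enc = tok("a tiny example", return_tensors="pt")
with torch.no_grad():
    hs = bert(**enc, output_hidden_states=True).hidden_states
avg_emb = torch.stack(hs[1:], dim=0).mean(dim=0)  # (batch, seq_len, hidden)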
Example #25
def run():
    seed_everything(config.SEED)
    dfx = pd.read_csv(config.TRAINING_FILE).dropna().reset_index(drop=True)

    df_train, df_valid = model_selection.train_test_split(
        dfx, test_size=0.1, random_state=42, stratify=dfx.sentiment.values)

    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    train_dataset = TweetDataset(tweet=df_train.text.values,
                                 sentiment=df_train.sentiment.values,
                                 selected_text=df_train.selected_text.values)

    train_data_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=config.TRAIN_BATCH_SIZE, num_workers=4)

    valid_dataset = TweetDataset(tweet=df_valid.text.values,
                                 sentiment=df_valid.sentiment.values,
                                 selected_text=df_valid.selected_text.values)

    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=config.VALID_BATCH_SIZE, num_workers=2)

    device = torch.device("cuda")
    model_config = transformers.BertConfig.from_pretrained(config.BERT_PATH)
    model_config.output_hidden_states = True
    model = TweetModel(conf=model_config)
    model.to(device)

    num_train_steps = int(
        len(df_train) / config.TRAIN_BATCH_SIZE * config.EPOCHS)
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {
            'params': [
                p for n, p in param_optimizer
                if not any(nd in n for nd in no_decay)
            ],
            'weight_decay':
            0.001
        },
        {
            'params':
            [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
            'weight_decay':
            0.0
        },
    ]
    optimizer = AdamW(optimizer_parameters, lr=3e-5)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)

    es = utils.EarlyStopping(patience=2, mode="max")

    for epoch in range(config.EPOCHS):
        engine.train_fn(train_data_loader,
                        model,
                        optimizer,
                        device,
                        scheduler=scheduler)
        jaccard = engine.eval_fn(valid_data_loader, model, device)
        #print(f"Jaccard Score = {jaccard}")
        es(jaccard, model, model_path="model.bin")
        if es.early_stop:
            print("Early stopping")
            break
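
# A minimal EarlyStopping sketch consistent with how utils.EarlyStopping is
# called above (mode="max": stop after `patience` epochs without improvement,
# checkpointing the best model). The real utils implementation may differ.
import torch

class EarlyStopping:
    def __init__(self, patience=2, mode="max"):
        self.patience, self.mode = patience, mode
        self.best, self.counter, self.early_stop = None, 0, False

    def __call__(self, score, model, model_path):
        improved = self.best is None or (
            score > self.best if self.mode == "max" else score < self.best)
        if improved:
            self.best, self.counter = score, 0
            torch.save(model.state_dict(), model_path)  # keep the best weights
        else:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True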
    def __init__(
        self, params: dict, dataset: LmSeqsDataset, token_probs: torch.Tensor, student: nn.Module, teacher: nn.Module
    ):
        logger.info("Initializing Distiller")
        self.params = params
        self.dump_path = params.dump_path
        self.multi_gpu = params.multi_gpu
        self.fp16 = params.fp16

        self.student = student
        self.teacher = teacher

        self.student_config = student.config
        self.vocab_size = student.config.vocab_size

        if params.n_gpu <= 1:
            sampler = RandomSampler(dataset)
        else:
            sampler = DistributedSampler(dataset)

        if params.group_by_size:
            groups = create_lengths_groups(lengths=dataset.lengths, k=params.max_model_input_size)
            sampler = GroupedBatchSampler(sampler=sampler, group_ids=groups, batch_size=params.batch_size)
        else:
            sampler = BatchSampler(sampler=sampler, batch_size=params.batch_size, drop_last=False)

        self.dataloader = DataLoader(dataset=dataset, batch_sampler=sampler, collate_fn=dataset.batch_sequences)

        self.temperature = params.temperature
        assert self.temperature > 0.0

        self.alpha_ce = params.alpha_ce
        self.alpha_mlm = params.alpha_mlm
        self.alpha_clm = params.alpha_clm
        self.alpha_mse = params.alpha_mse
        self.alpha_cos = params.alpha_cos

        self.mlm = params.mlm
        if self.mlm:
            logger.info("Using MLM loss for LM step.")
            self.mlm_mask_prop = params.mlm_mask_prop
            assert 0.0 <= self.mlm_mask_prop <= 1.0
            assert params.word_mask + params.word_keep + params.word_rand == 1.0
            self.pred_probs = torch.FloatTensor([params.word_mask, params.word_keep, params.word_rand])
            self.pred_probs = self.pred_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else self.pred_probs
            self.token_probs = token_probs.to(f"cuda:{params.local_rank}") if params.n_gpu > 0 else token_probs
            if self.fp16:
                self.pred_probs = self.pred_probs.half()
                self.token_probs = self.token_probs.half()
        else:
            logger.info("Using CLM loss for LM step.")

        self.epoch = 0
        self.n_iter = 0
        self.n_total_iter = 0
        self.n_sequences_epoch = 0
        self.total_loss_epoch = 0
        self.last_loss = 0
        self.last_loss_ce = 0
        self.last_loss_mlm = 0
        self.last_loss_clm = 0
        if self.alpha_mse > 0.0:
            self.last_loss_mse = 0
        if self.alpha_cos > 0.0:
            self.last_loss_cos = 0
        self.last_log = 0

        self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
        if self.alpha_mse > 0.0:
            self.mse_loss_fct = nn.MSELoss(reduction="sum")
        if self.alpha_cos > 0.0:
            self.cosine_loss_fct = nn.CosineEmbeddingLoss(reduction="mean")

        logger.info("--- Initializing model optimizer")
        assert params.gradient_accumulation_steps >= 1
        self.num_steps_epoch = len(self.dataloader)
        num_train_optimization_steps = (
            int(self.num_steps_epoch / params.gradient_accumulation_steps * params.n_epoch) + 1
        )

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in student.named_parameters() if not any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay": params.weight_decay,
            },
            {
                "params": [
                    p for n, p in student.named_parameters() if any(nd in n for nd in no_decay) and p.requires_grad
                ],
                "weight_decay": 0.0,
            },
        ]
        logger.info(
            "------ Number of trainable parameters (student): %i"
            % sum([p.numel() for p in self.student.parameters() if p.requires_grad])
        )
        logger.info("------ Number of parameters (student): %i" % sum([p.numel() for p in self.student.parameters()]))
        self.optimizer = AdamW(
            optimizer_grouped_parameters, lr=params.learning_rate, eps=params.adam_epsilon, betas=(0.9, 0.98)
        )

        warmup_steps = math.ceil(num_train_optimization_steps * params.warmup_prop)
        self.scheduler = get_linear_schedule_with_warmup(
            self.optimizer, num_warmup_steps=warmup_steps, num_training_steps=num_train_optimization_steps
        )

        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            logger.info(f"Using fp16 training: {self.params.fp16_opt_level} level")
            self.student, self.optimizer = amp.initialize(
                self.student, self.optimizer, opt_level=self.params.fp16_opt_level
            )
            self.teacher = self.teacher.half()

        if self.multi_gpu:
            if self.fp16:
                from apex.parallel import DistributedDataParallel

                logger.info("Using apex.parallel.DistributedDataParallel for distributed training.")
                self.student = DistributedDataParallel(self.student)
            else:
                from torch.nn.parallel import DistributedDataParallel

                logger.info("Using nn.parallel.DistributedDataParallel for distributed training.")
                self.student = DistributedDataParallel(
                    self.student,
                    device_ids=[params.local_rank],
                    output_device=params.local_rank,
                    find_unused_parameters=True,
                )

        self.is_master = params.is_master
        if self.is_master:
            logger.info("--- Initializing Tensorboard")
            self.tensorboard = SummaryWriter(log_dir=os.path.join(self.dump_path, "log", "train"))
            self.tensorboard.add_text(tag="config/training", text_string=str(self.params), global_step=0)
            self.tensorboard.add_text(tag="config/student", text_string=str(self.student_config), global_step=0)
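
# Hedged sketch of the distillation CE term implied by ce_loss_fct above: KL
# between temperature-softened student and teacher distributions, scaled by
# T^2 (the standard Hinton et al. formulation; the Distiller's exact token
# masking and loss weighting are not reproduced here).
import torch
import torch.nn.functional as F

def distill_ce(student_logits, teacher_logits, temperature=2.0):
    t = temperature
    return F.kl_div(
        F.log_softmax(student_logits / t, dim=-1),
        F.softmax(teacher_logits / t, dim=-1),
        reduction="batchmean",
    ) * (t ** 2)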
Example #27
def training(tokenizer, content, n_splits, fold, train_data_loader,
             val_data_loader, model_type, model_name, hidden_layers,
             optimizer_name, lr_scheduler_name, lr, warmup_proportions,
             batch_size, valid_batch_size, num_epoch, start_epoch,
             accumlation_steps, checkpoint_folder, load_pretrain, seed, loss,
             extra_token, augment, early_stopping):
    torch.cuda.empty_cache()
    strng = "@%s: \n" % os.path.basename(__file__)
    strng += "\tset random seed = %d \n" % seed
    strng += "\t cuda environment: \n"
    strng += "\t torch version is %s \t torch.version.cuda is %s \t torch.backends.cudnn.version() = %s \n" % (
        torch.__version__, torch.version.cuda, torch.backends.cudnn.version())
    strng += "\t torch.cuda.device_count() is %s  \n" % (
        torch.cuda.device_count())

    if augment:
        if extra_token:
            checkpoint_folder = os.path.join(
                checkpoint_folder, model_type + '/' + model_name + '-' +
                content + '-' + loss + '-' + optimizer_name + '-' +
                lr_scheduler_name + '-' + str(n_splits) + '-' + str(seed) +
                '-' + 'aug_differential_extra_token/')
        else:
            checkpoint_folder = os.path.join(
                checkpoint_folder,
                model_type + '/' + model_name + '-' + content + '-' + loss +
                '-' + optimizer_name + '-' + lr_scheduler_name + '-' +
                str(n_splits) + '-' + str(seed) + '-' + 'aug_differential/')
    else:
        if extra_token:
            checkpoint_folder = os.path.join(
                checkpoint_folder,
                model_type + '/' + model_name + '-' + content + '-' + loss +
                '-' + optimizer_name + '-' + lr_scheduler_name + '-' +
                str(n_splits) + '-' + str(seed) + '-' + 'extra_token/')
        else:
            checkpoint_folder = os.path.join(
                checkpoint_folder,
                model_type + '/' + model_name + '-' + content + '-' + loss +
                '-' + optimizer_name + '-' + lr_scheduler_name + '-' +
                str(n_splits) + '-' + str(seed) + '-' + '/')

    checkpoint_filename = 'fold_' + str(fold) + "_checkpoint.pth"
    checkpoint_filepath = os.path.join(checkpoint_folder, checkpoint_filename)

    os.makedirs(checkpoint_folder, exist_ok=True)

    log = Logger()
    log.open(os.path.join(checkpoint_folder,
                          'fold_' + str(fold) + '_train_log.txt'),
             mode='a+')
    log.write('\t%s\n' % strng)
    log.write("\t seed = %s, fold = %s, __file__ = %s, out_dir = %s" %
              (seed, fold, __file__, checkpoint_folder))

    def load(model, pretrain_file, skip=[]):
        pretrain_dict = torch.load(pretrain_file)
        state_dict = model.state_dict()
        for key in state_dict.keys():
            if any(s in key for s in skip):
                continue
            else:
                state_dict[key] = pretrain_dict[key]
        model.load_state_dict(state_dict, strict=False)
        return model

    if content == "Question_Answer":
        num_class = 30
    elif content == "Question":
        num_class = 21
    elif content == "Answer":
        num_class = 9

    if model_type in ("bert", "xlnet"):
        model = QuestNet(model_type=model_name,
                         tokenizer=tokenizer,
                         n_classes=num_class,
                         n_category_classes=num_category_class,
                         n_host_classes=num_host_class,
                         hidden_layers=hidden_layers,
                         extra_token=extra_token)
    else:
        raise NotImplementedError

    model = model.cuda()
    if load_pretrain:
        if content == "Answer":
            model = load(model,
                         checkpoint_filepath,
                         skip=['fc.weight', 'fc.bias'])
        else:
            model = load(model, checkpoint_filepath)

    if model_name == "t5-base":
        weight_decay = 0.9
    else:
        weight_decay = 0.01

    if (model_type == 'bert') or (model_type == 'xlnet'):

        optimizer_grouped_parameters = []
        list_lr = []

        if model_name in ('bert-base-uncased', 'bert-base-cased',
                          'bert-large-uncased'):
            list_layers = ([model.bert_model.embeddings] +
                           list(model.bert_model.encoder.layer) +
                           [model.fc_1, model.fc])
        elif model_name == "xlnet-base-cased":
            list_layers = ([model.xlnet_model.word_embedding] +
                           list(model.xlnet_model.layer) +
                           [model.fc_1, model.fc])
        elif model_name == "roberta-base":
            list_layers = ([model.roberta_model.embeddings] +
                           list(model.roberta_model.encoder.layer) +
                           [model.fc_1, model.fc])
        elif model_name == "gpt2":
            # wte/wpe embeddings were commented out in the original and stay
            # excluded here
            list_layers = (list(model.gpt2_model.h) +
                           [model.fc_1, model.fc])
        else:
            raise NotImplementedError

        ######## Differential LR and optimizer group ############################################################

        # Geometric learning-rate ladder: per-layer LRs spaced from min_lr
        # (embeddings) up to lr (classification head).
        mult = lr / min_lr
        step = mult**(1 / (len(list_layers) - 1))
        list_lr = [min_lr * (step**i) for i in range(len(list_layers))]
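        # Worked example (values assumed): with lr = 1e-5, min_lr = 1e-7 and
        # 15 entries in list_layers, step = (1e-5 / 1e-7) ** (1 / 14) ≈ 1.39,
        # so the per-layer learning rates are geometrically spaced between
        # min_lr (embeddings) and lr (classification head).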
        no_decay = ['bias', 'LayerNorm.weight', 'LayerNorm.bias']

        def add_param_groups(module, lr_value):
            # Two groups per layer: weight-decayed parameters, and the
            # bias/LayerNorm parameters that are excluded from weight decay.
            layer_parameters = list(module.named_parameters())
            optimizer_grouped_parameters.append({
                'params': [p for n, p in layer_parameters
                           if not any(nd in n for nd in no_decay)],
                'lr': lr_value,
                'weight_decay': weight_decay
            })
            optimizer_grouped_parameters.append({
                'params': [p for n, p in layer_parameters
                           if any(nd in n for nd in no_decay)],
                'lr': lr_value,
                'weight_decay': 0.0
            })

        for layer, layer_lr in zip(list_layers, list_lr):
            add_param_groups(layer, layer_lr)

        if extra_token:
            # The auxiliary category/host heads train with a small fixed LR.
            for extra_head in (model.fc_1_category, model.fc_1_host,
                               model.fc_category, model.fc_host):
                add_param_groups(extra_head, 1e-6)
        else:
            print("no extra token")
    else:
        raise NotImplementedError

    if optimizer_name == 'Adam':
        optimizer = torch.optim.Adam(optimizer_grouped_parameters)
    elif optimizer_name == 'Ranger':
        optimizer = Ranger(optimizer_grouped_parameters)
    elif optimizer_name == 'BertAdam':
        num_optimization_steps = num_epoch * len(
            train_data_loader) // accumlation_steps
        optimizer = BertAdam(optimizer_grouped_parameters,
                             warmup=warmup_proportions,
                             t_total=num_optimization_steps)
    elif optimizer_name == 'AdamW':
        optimizer = AdamW(optimizer_grouped_parameters, eps=4e-5)
    elif optimizer_name == 'FusedAdam':
        optimizer = FusedAdam(optimizer_grouped_parameters,
                              bias_correction=False)
    else:
        raise NotImplementedError

    ######## LR scheduler ############################################################
    if lr_scheduler_name == 'CosineAnealing':
        num_train_optimization_steps = num_epoch * len(
            train_data_loader) // accumlation_steps
        scheduler = get_cosine_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(warmup_proportions *
                                 num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)
        lr_scheduler_each_iter = False
    elif lr_scheduler_name == "WarmRestart":
        scheduler = WarmRestart(optimizer, T_max=5, T_mult=1, eta_min=1e-6)
        lr_scheduler_each_iter = False
    elif lr_scheduler_name == "WarmupLinearSchedule":
        num_train_optimization_steps = num_epoch * len(
            train_data_loader) // accumlation_steps
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=int(warmup_proportions *
                                 num_train_optimization_steps),
            num_training_steps=num_train_optimization_steps)
        lr_scheduler_each_iter = True
    else:
        raise NotImplementedError

    log.write("\t model name: %s \n" % model_name)
    log.write("\t optimizer name: %s \n" % optimizer_name)
    log.write("\t scheduler name: %s \n" % lr_scheduler_name)

    # AMP -automatic mixed precision training for faster training
    # https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html

    model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
    eval_step = len(train_data_loader)
    log_step = 50
    eval_count = 0
    count = 0

    log.write('\t training starts here!!\n')
    log.write('\t batch size = %d, accumulation steps = %d \n' %
              (batch_size, accumlation_steps))
    log.write('\t experiment : %s' % str(__file__.split('/')[-2:]))

    valid_loss = np.zeros(1, np.float32)
    train_loss = np.zeros(1, np.float32)
    valid_metric_optimal = -np.inf

    writer = SummaryWriter()

    # Define loss
    if loss == 'mse':
        criterion = MSELoss()
    elif loss == 'mse-bce':
        criterion = MSBCELoss()
    elif loss == 'focal':
        criterion = FocalLoss()
    elif loss == 'bce':
        if content == 'Question_Answer':
            weights = torch.tensor(np.array(unbalance_weight),
                                   dtype=torch.float64).cuda()
        elif content == 'Answer':
            weights = torch.tensor(np.array(a_unbalance_weight),
                                   dtype=torch.float64).cuda()
        elif content == 'Question':
            weights = torch.tensor(np.array(q_unbalance_weight),
                                   dtype=torch.float64).cuda()
        else:
            raise NotImplementedError
        criterion = nn.BCEWithLogitsLoss(weight=weights)
        criterion_extra = nn.BCEWithLogitsLoss()
    else:
        raise NotImplementedError
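    # Note (an assumption about intent): with nn.BCEWithLogitsLoss, `weight`
    # is broadcast against the targets, so the per-class unbalance weights
    # above rescale each of the num_class output columns of this multi-label
    # objective.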

    for epoch in range(1, num_epoch + 1):
        labels_train = None
        pred_train = None
        labels_val = None
        pred_val = None

        checkpoint_filename_last_epoch = "fold_" + str(
            fold) + "_checkpoint_last_epoch.pth"
        checkpoint_filepath_last_epoch = os.path.join(
            checkpoint_folder, checkpoint_filename_last_epoch)
        torch.save(model.state_dict(), checkpoint_filepath_last_epoch)

        if (epoch > 1) and (not lr_scheduler_each_iter):
            scheduler.step()
        if epoch < start_epoch:
            continue
        log.write("\t epoch is %d and time is %s \n" %
                  (epoch, time.strftime("%H:%M:%S", time.gmtime(time.time()))))
        prev_time = time.time()

        sum_train_loss = np.zeros_like(train_loss)
        sum_train = np.zeros_like(train_loss)

        torch.cuda.empty_cache()
        model.zero_grad()

        if extra_token:
            for tr_batch_i, (token_ids, seg_ids, labels, labels_category,
                             labels_host) in enumerate(train_data_loader):
                rate = 0
                for param_group in optimizer.param_groups:
                    rate += param_group['lr'] / len(optimizer.param_groups)

                model.train()
                token_ids = token_ids.cuda()
                seg_ids = seg_ids.cuda()
                labels = labels.cuda().float()
                labels_category = labels_category.cuda().float()
                labels_host = labels_host.cuda().float()

                prediction, prediction_category, prediction_host = model(
                    token_ids, seg_ids)
                loss = auxiliary_weights[0] * criterion(
                    prediction,
                    labels) + auxiliary_weights[1] * criterion_extra(
                        prediction_category, labels_category
                    ) + auxiliary_weights[2] * criterion_extra(
                        prediction_host, labels_host)
                with amp.scale_loss(loss / accumlation_steps,
                                    optimizer) as scaled_loss:
                    scaled_loss.backward()

                if ((tr_batch_i + 1) % accumlation_steps == 0):
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_norm=5.0,
                                                   norm_type=2)
                    optimizer.step()
                    model.zero_grad()
                    if lr_scheduler_each_iter:
                        scheduler.step()
                    # Write to tensorboard summary writer
                    writer.add_scalar(
                        "train_loss_" + str(fold), loss.item(),
                        (epoch - 1) * len(train_data_loader) * batch_size +
                        tr_batch_i * batch_size)
                prediction = torch.sigmoid(prediction)
                if tr_batch_i == 0:
                    pred_train = prediction.cpu().detach().numpy()
                    labels_train = labels.cpu().detach().numpy()
                else:
                    pred_train = np.concatenate(
                        (pred_train, prediction.cpu().detach().numpy()),
                        axis=0)
                    labels_train = np.concatenate(
                        (labels_train, labels.cpu().detach().numpy()), axis=0)
                l = np.array([loss.item() * batch_size])
                n = np.array([batch_size])
                sum_train_loss += l
                sum_train += n

                #log for training
                if (tr_batch_i + 1) % log_step == 0:
                    train_loss = sum_train_loss / (sum_train + 1e-12)
                    pred_train = np.nan_to_num(pred_train)
                    sp = Spearman(labels_train, pred_train)
                    elapsed_time = time.time() - prev_time
                    prev_time = time.time()
                    log.write(
                        "\t Batch # %d \t perc processed in epoch: %f \t  train_loss is %f \t lr is %f \t spearman is %f \t elapsed time: %d\n"
                        % ((tr_batch_i + 1),
                           ((tr_batch_i + 1) / len(train_data_loader)),
                           train_loss[0], rate, sp, elapsed_time))

                if (tr_batch_i + 1) % eval_step == 0:
                    eval_count += 1
                    valid_loss = np.zeros(1, np.float32)
                    valid_num = np.zeros_like(valid_loss)

                    with torch.no_grad():
                        torch.cuda.empty_cache()
                        for val_batch_i, (
                                token_ids, seg_ids, labels, labels_category,
                                labels_host) in enumerate(val_data_loader):
                            model.eval()
                            token_ids = token_ids.cuda()
                            seg_ids = seg_ids.cuda()
                            labels = labels.cuda().float()
                            labels_category = labels_category.cuda().float()
                            labels_host = labels_host.cuda().float()

                            prediction, prediction_category, prediction_host = model(
                                token_ids, seg_ids)
                            val_loss = auxiliary_weights[0] * criterion(
                                prediction, labels
                            ) + auxiliary_weights[1] * criterion_extra(
                                prediction_category, labels_category
                            ) + auxiliary_weights[2] * criterion_extra(
                                prediction_host, labels_host)
                            writer.add_scalar(
                                "val_loss_" + str(fold), val_loss.item(),
                                (eval_count - 1) * len(val_data_loader) *
                                valid_batch_size +
                                val_batch_i * valid_batch_size)

                            prediction = torch.sigmoid(prediction)
                            if val_batch_i == 0:
                                pred_val = prediction.cpu().detach().numpy()
                                labels_val = labels.cpu().detach().numpy()
                            else:
                                pred_val = np.concatenate(
                                    (pred_val,
                                     prediction.cpu().detach().numpy()),
                                    axis=0)
                                labels_val = np.concatenate(
                                    (labels_val,
                                     labels.cpu().detach().numpy()),
                                    axis=0)
                            l = np.array([val_loss.item() * valid_batch_size])
                            n = np.array([valid_batch_size])
                            valid_loss += l
                            valid_num += n

                            # average without overwriting the running sums
                            avg_valid_loss = valid_loss / (valid_num + 1e-12)
                            pred_val = np.nan_to_num(pred_val)
                            sp = Spearman(labels_val, pred_val)
                            log.write(
                                "\t Batch # %d perc processed in epoch: %f Validation loss is %f \t spearman is %f \n"
                                % (val_batch_i,
                                   (val_batch_i / len(val_data_loader)),
                                   avg_valid_loss[0], sp))
        else:
            for tr_batch_i, (token_ids, seg_ids,
                             labels) in enumerate(train_data_loader):
                rate = 0
                for param_group in optimizer.param_groups:
                    rate += param_group['lr'] / len(optimizer.param_groups)

                model.train()
                token_ids = token_ids.cuda()
                seg_ids = seg_ids.cuda()
                labels = labels.cuda().float()

                prediction = model(token_ids, seg_ids)
                loss = criterion(prediction, labels)
                with amp.scale_loss(loss / accumlation_steps,
                                    optimizer) as scaled_loss:
                    scaled_loss.backward()

                if ((tr_batch_i + 1) % accumlation_steps == 0):
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   max_norm=5.0,
                                                   norm_type=2)
                    optimizer.step()
                    model.zero_grad()
                    if lr_scheduler_each_iter:
                        scheduler.step()
                    # Write to tensorboard summary writer
                    writer.add_scalar(
                        "train_loss_" + str(fold), loss.item(),
                        (epoch - 1) * len(train_data_loader) * batch_size +
                        tr_batch_i * batch_size)
                prediction = torch.sigmoid(prediction)
                if tr_batch_i == 0:
                    pred_train = prediction.cpu().detach().numpy()
                    labels_train = labels.cpu().detach().numpy()
                else:
                    pred_train = np.concatenate(
                        (pred_train, prediction.cpu().detach().numpy()),
                        axis=0)
                    labels_train = np.concatenate(
                        (labels_train, labels.cpu().detach().numpy()), axis=0)
                l = np.array([loss.item() * batch_size])
                n = np.array([batch_size])
                sum_train_loss += l
                sum_train += n

                #log for training
                if (tr_batch_i + 1) % log_step == 0:
                    train_loss = sum_train_loss / (sum_train + 1e-12)
                    pred_train = np.nan_to_num(pred_train)
                    sp = Spearman(labels_train, pred_train)
                    elapsed_time = time.time() - prev_time
                    prev_time = time.time()
                    log.write(
                        "\t Batch # %d \t perc processed in epoch: %f \t  train_loss is %f \t lr is %f \t spearman is %f \t elapsed time: %d\n"
                        % ((tr_batch_i + 1),
                           ((tr_batch_i + 1) / len(train_data_loader)),
                           train_loss[0], rate, sp, elapsed_time))

                if (tr_batch_i + 1) % eval_step == 0:
                    eval_count += 1
                    valid_loss = np.zeros(1, np.float32)
                    valid_num = np.zeros_like(valid_loss)

                    with torch.no_grad():
                        torch.cuda.empty_cache()
                        for val_batch_i, (
                                token_ids, seg_ids,
                                labels) in enumerate(val_data_loader):
                            model.eval()
                            token_ids = token_ids.cuda()
                            seg_ids = seg_ids.cuda()
                            labels = labels.cuda().float()
                            prediction = model(token_ids, seg_ids)
                            val_loss = criterion(prediction, labels)
                            writer.add_scalar(
                                "val_loss_" + str(fold), val_loss.item(),
                                (eval_count - 1) * len(val_data_loader) *
                                valid_batch_size +
                                val_batch_i * valid_batch_size)

                            prediction = torch.sigmoid(prediction)
                            if val_batch_i == 0:
                                pred_val = prediction.cpu().detach().numpy()
                                labels_val = labels.cpu().detach().numpy()
                            else:
                                pred_val = np.concatenate(
                                    (pred_val,
                                     prediction.cpu().detach().numpy()),
                                    axis=0)
                                labels_val = np.concatenate(
                                    (labels_val,
                                     labels.cpu().detach().numpy()),
                                    axis=0)
                            l = np.array([val_loss.item() * valid_batch_size])
                            n = np.array([valid_batch_size])
                            valid_loss += l
                            valid_num += n

                            # average without overwriting the running sums
                            avg_valid_loss = valid_loss / (valid_num + 1e-12)
                            pred_val = np.nan_to_num(pred_val)
                            sp = Spearman(labels_val, pred_val)
                            log.write(
                                "\t Batch # %d perc processed in epoch: %f \t Validation loss is %f \t spearman is %f \n"
                                % (val_batch_i,
                                   (val_batch_i / len(val_data_loader)),
                                   avg_valid_loss[0], sp))
        val_metric_epoch = sp
        log.write('Validation metric {:.6f}.  Saving model ...'.format(
            val_metric_epoch))
        checkpoint_filename_swa = "fold_" + str(fold) + "_checkpoint_swa.pth"
        checkpoint_filepath_swa = os.path.join(checkpoint_folder,
                                               checkpoint_filename_swa)
        state_dict_last_epoch = torch.load(checkpoint_filepath_last_epoch)
        state_dict = model.state_dict()
        for name, val in state_dict.items():
            state_dict[name].data.copy_(
                (val.data + epoch * state_dict_last_epoch[name].data) /
                (epoch + 1))
        model.load_state_dict(state_dict)
        torch.save(model.state_dict(), checkpoint_filepath_swa)
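
# Sketch of the epoch-end weight averaging above: each parameter saved in the
# "swa" checkpoint is the running combination
#     w_avg = (w_current + epoch * w_last_epoch) / (epoch + 1),
# mixing the freshly trained weights with the snapshot taken at the start of
# the epoch (assumes floating-point parameters).
import torch

def average_state_dicts(current, snapshot, epoch):
    return {name: (val + epoch * snapshot[name]) / (epoch + 1)
            for name, val in current.items()}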
Example #28
# Create the optimizer (assumed here to be the transformers AdamW, matching
# the scheduler setup below).
from transformers import AdamW

optimizer = AdamW(
    model.parameters(),
    lr=5e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
    eps=1e-8  # args.adam_epsilon  - default is 1e-8.
)

from transformers import get_linear_schedule_with_warmup

# Number of training epochs (authors recommend between 2 and 4)
epochs = 3

# Total number of training steps is number of batches * number of epochs.
total_steps = len(train_dataloader) * epochs

# Create the learning rate scheduler.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,  # Default value in run_glue.py
    num_training_steps=total_steps)
"""## 4.3. Training Loop"""


import numpy as np
from sklearn.metrics import f1_score, matthews_corrcoef


# Function to calculate the accuracy of our predictions vs labels
def flat_accuracy(preds, labels):
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
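
# Quick illustrative check of flat_accuracy with the imported metrics on tiny
# made-up logits (not real model output).
_logits = np.array([[0.2, 0.8], [0.6, 0.4], [0.1, 0.9]])
_labels = np.array([1, 0, 1])
assert flat_accuracy(_logits, _labels) == 1.0
assert matthews_corrcoef(_labels, np.argmax(_logits, axis=1)) == 1.0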
Example #29
def main():

    # load the CoLA dataset for our language model fine-tuning
    sentences2 = []
    labels2 = []
    with open("./cola_public/raw/in_domain_train.tsv") as tsvfile2:
        tsvreader2 = csv.reader(tsvfile2, delimiter="\t")
        for line in tsvreader2:
            sentences2 += [line[3]]
            labels2 += [int(line[1])]

    #load encoder
    tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

    # Just one pass here to find the max sentence length; we'll aim a bit higher to be safe.

    max_len2 = 0

    # For every sentence...
    for sent in sentences2:

        # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
        input_ids2 = tokenizer.encode(sent, add_special_tokens=True)

        # Update the maximum sentence length.
        max_len2 = max(max_len2, len(input_ids2))

    print('Max sentence length: ', max_len2)

    input_ids2 = []

    for sent in sentences2:
        encoded_dict2 = tokenizer.encode(
            sent,  # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=64,  # Pad & truncate all sentences (max_len2 could be used instead).
            truncation=True,
            pad_to_max_length=True,
            return_tensors='pt',  # Return pytorch tensors.
        )
        # Add the encoded sentence to the list.
        input_ids2.append(encoded_dict2)

    # Convert the lists into tensors.
    input_ids2 = torch.cat(input_ids2, dim=0)
    labels2 = torch.tensor(labels2)

    # Print sentence 0, now as a list of IDs.
    print('Original: ', sentences2[0])
    print('Token IDs:', input_ids2[0])

    # Combine the training inputs into a TensorDataset.
    dataset2 = TensorDataset(input_ids2, labels2)

    # Create a 90-10 train-validation split.

    # Calculate the number of samples to include in each set.
    train_size2 = int(0.9 * len(dataset2))
    val_size2 = len(dataset2) - train_size2

    # Divide the dataset by randomly selecting samples.
    train_dataset2, val_dataset2 = random_split(dataset2,
                                                [train_size2, val_size2])
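
    # For CoLA's in_domain_train split (roughly 8.5k sentences) this comes to
    # about 7.7k training and 0.9k validation examples.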

    # The DataLoader needs to know our batch size for training, so we specify it
    # here. For fine-tuning BERT on a specific task, the authors recommend a batch
    # size of 16 or 32.
    batch_size = 32

    # Create the DataLoaders for our training and validation sets.
    # We'll take training samples in random order.
    train_dataloader2 = DataLoader(
        train_dataset2,  # The training samples.
        sampler=RandomSampler(train_dataset2),  # Select batches randomly
        batch_size=batch_size  # Trains with this batch size.
    )

    # For validation the order doesn't matter, so we'll just read them sequentially.
    validation_dataloader2 = DataLoader(
        val_dataset2,  # The validation samples.
        sampler=SequentialSampler(
            val_dataset2),  # Pull out batches sequentially.
        batch_size=batch_size  # Evaluate with this batch size.
    )

    # Load DistilBertForSequenceClassification, the pretrained DistilBERT model
    # with a single linear classification layer on top.
    modd = DistilBertForSequenceClassification.from_pretrained(
        "./my_pretrained_distil",  # Our locally pretrained DistilBERT checkpoint.
        num_labels=2,  # The number of output labels--2 for binary classification.
        # You can increase this for multi-class tasks.
        output_attentions=False,  # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
    )
    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell pytorch to run this model on the GPU.
        modd.cuda()

    # see https://mccormickml.com/2019/07/22/BERT-fine-tuning/

    from transformers import AdamW

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch).
    # I believe the 'W' stands for 'Weight Decay fix'.
    optimizer2 = AdamW(
        modd.parameters(),
        lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )

    from transformers import get_linear_schedule_with_warmup

    # Number of training epochs. The BERT authors recommend between 2 and 4.
    # We chose to run for 4, but we'll see later that this may be over-fitting the
    # training data.
    epochs = 4

    # Total number of training steps is [number of batches] x [number of epochs].
    # (Note that this is not the same as the number of training samples).
    total_steps = len(train_dataloader2) * epochs

    # Create the learning rate scheduler.
    scheduler2 = get_linear_schedule_with_warmup(
        optimizer2,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    # If there's a GPU available...
    if torch.cuda.is_available():

        # Tell PyTorch to use the GPU.
        device = torch.device("cuda")

        print('There are %d GPU(s) available.' % torch.cuda.device_count())

        print('We will use the GPU:', torch.cuda.get_device_name(0))

    # If not...
    else:
        print('No GPU available, using the CPU instead.')
        device = torch.device("cpu")

    # Second dataset: Stanford Sentiment Treebank (SST-2) for our sentiment classification model (which is to be attacked).

    #extract data
    sentences = []
    labels = []
    with open("./glue_data/SST-2/dev.tsv") as tsvfile:
        tsvreader = csv.reader(tsvfile, delimiter="\t")
        for i, line in enumerate(tsvreader):
            if i > 0:
                sentences += [line[0]]
                labels += [int(line[1])]

    # What is the maximum sentence length?
    max_len = 0
    for sent in sentences:

        # Tokenize the text and add `[CLS]` and `[SEP]` tokens.
        input_ids = tokenizer.encode(sent, add_special_tokens=True)

        # Update the maximum sentence length.
        max_len = max(max_len, len(input_ids))

    print('Max sentence length: ', max_len)

    #encode sentences:
    input_ids = []
    for sent in sentences:
        encoded_dict = tokenizer.encode(
            sent,  # Sentence to encode.
            add_special_tokens=True,  # Add '[CLS]' and '[SEP]'
            max_length=64,  # Pad & truncate all sentences.
            truncation=True,
            pad_to_max_length=True,
            return_tensors='pt',  # Return pytorch tensors.
        )
        # Add the encoded sentence to the list.
        input_ids.append(encoded_dict)

    # Convert the lists into tensors.
    input_ids = torch.cat(input_ids, dim=0)
    labels = torch.tensor(labels)

    # Print sentence 0, now as a list of IDs.
    print('Original: ', sentences[0])
    print('Token IDs:', input_ids[0])

    # Combine the training inputs into a TensorDataset.
    dataset = TensorDataset(input_ids, labels)

    # Create a 90-10 train-validation split:

    # Calculate the number of samples to include in each set.
    train_size = int(0.9 * len(dataset))
    val_size = len(dataset) - train_size

    # Divide the dataset by randomly selecting samples.
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

    # The DataLoader needs to know our batch size for training, so we specify it
    # here. For fine-tuning BERT on a specific task, the authors recommend a batch
    # size of 16 or 32.
    batch_size = 32

    # Create the DataLoaders for our training and validation sets.
    # We'll take training samples in random order.
    train_dataloader = DataLoader(
        train_dataset,  # The training samples.
        sampler=RandomSampler(train_dataset),  # Select batches randomly
        batch_size=batch_size  # Trains with this batch size.
    )

    # For validation the order doesn't matter, so we'll just read them sequentially.
    validation_dataloader = DataLoader(
        val_dataset,  # The validation samples.
        sampler=SequentialSampler(
            val_dataset),  # Pull out batches sequentially.
        batch_size=batch_size  # Evaluate with this batch size.
    )

    # Create the model to be fine-tuned:

    # Load DistilBertForSequenceClassification, the pretrained DistilBERT model
    # with a single linear classification layer on top.
    model = DistilBertForSequenceClassification.from_pretrained(
        "./my_pretrained_distil",  # Our locally pretrained DistilBERT checkpoint.
        num_labels=2,  # The number of output labels--2 for binary classification.
        # You can increase this for multi-class tasks.
        output_attentions=False,  # Whether the model returns attention weights.
        output_hidden_states=False,  # Whether the model returns all hidden-states.
    )
    # If there's a GPU available...
    if torch.cuda.is_available():
        # Tell pytorch to run this model on the GPU.
        model.cuda()

    # Note: AdamW is a class from the huggingface library (as opposed to pytorch).
    # I believe the 'W' stands for 'Weight Decay fix'.
    optimizer = AdamW(
        model.parameters(),
        lr=2e-5,  # args.learning_rate - default is 5e-5, our notebook had 2e-5
        eps=1e-8  # args.adam_epsilon  - default is 1e-8.
    )

    # Number of training epochs. The BERT authors recommend between 2 and 4.
    # We chose to run for 4, but we'll see later that this may be over-fitting the
    # training data.
    epochs = 4

    # Total number of training steps is [number of batches] x [number of epochs].
    # (Note that this is not the same as the number of training samples).
    total_steps = len(train_dataloader) * epochs

    # Create the learning rate scheduler.
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,  # Default value in run_glue.py
        num_training_steps=total_steps)

    # The two training runs:

    import random
    import numpy as np

    # This training code is based on the `run_glue.py` script here:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # We'll store a number of quantities such as training and validation loss,
    # validation accuracy, and timings.
    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    # For each epoch...
    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Put the model into training mode. Don't be misled--the call to
        # `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        modd.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader2):

            # Progress update every 40 batches.
            if step % 40 == 0 and step != 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader2), elapsed))

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the
            # `to` method.
            #
            # `batch` contains two pytorch tensors:
            #   [0]: input ids
            #   [1]: labels
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because
            # accumulating the gradients is "convenient while training RNNs".
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            modd.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # It returns different numbers of parameters depending on what arguments
            # are given and what flags are set. For our usage here, it returns
            # the loss (because we provided labels) and the "logits"--the model
            # outputs prior to activation.
            loss, logits = modd(b_input_ids, labels=b_labels)

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_train_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(modd.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer2.step()

            # Update the learning rate.
            scheduler2.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader2)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        modd.eval()

        # Tracking variables
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader2:

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using
            # the `to` method.
            #
            # `batch` contains two pytorch tensors:
            #   [0]: input ids
            #   [1]: labels
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():

                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
                (loss, logits) = modd(b_input_ids, labels=b_labels)

            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader2)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader2)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))

    #save model
    torch.save(modd.state_dict(), "distil_languagemodel_finetuned.pt")

    # This training code is based on the `run_glue.py` script here:
    # https://github.com/huggingface/transformers/blob/5bfcd0485ece086ebcbed2d008813037968a9e58/examples/run_glue.py#L128

    # Set the seed value all over the place to make this reproducible.
    seed_val = 42

    random.seed(seed_val)
    np.random.seed(seed_val)
    torch.manual_seed(seed_val)
    torch.cuda.manual_seed_all(seed_val)

    # We'll store a number of quantities such as training and validation loss,
    # validation accuracy, and timings.
    training_stats = []

    # Measure the total training time for the whole run.
    total_t0 = time.time()

    # For each epoch...
    for epoch_i in range(0, epochs):

        # ========================================
        #               Training
        # ========================================

        # Perform one full pass over the training set.

        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
        print('Training...')

        # Measure how long the training epoch takes.
        t0 = time.time()

        # Reset the total loss for this epoch.
        total_train_loss = 0

        # Put the model into training mode. Don't be misled--the call to
        # `train` just changes the *mode*, it doesn't *perform* the training.
        # `dropout` and `batchnorm` layers behave differently during training
        # vs. test (source: https://stackoverflow.com/questions/51433378/what-does-model-train-do-in-pytorch)
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):

            # Progress update every 40 batches.
            if step % 40 == 0 and step != 0:
                # Calculate elapsed time in minutes.
                elapsed = format_time(time.time() - t0)

                # Report progress.
                print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(
                    step, len(train_dataloader), elapsed))

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using the
            # `to` method.
            #
            # `batch` contains two pytorch tensors:
            #   [0]: input ids
            #   [1]: labels
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)

            # Always clear any previously calculated gradients before performing a
            # backward pass. PyTorch doesn't do this automatically because
            # accumulating the gradients is "convenient while training RNNs".
            # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
            model.zero_grad()

            # Perform a forward pass (evaluate the model on this training batch).
            # The documentation for this `model` function is here:
            # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
            # It returns different numbers of parameters depending on what arguments
            # are given and what flags are set. For our usage here, it returns
            # the loss (because we provided labels) and the "logits"--the model
            # outputs prior to activation.
            loss, logits = model(b_input_ids, labels=b_labels)

            # Accumulate the training loss over all of the batches so that we can
            # calculate the average loss at the end. `loss` is a Tensor containing a
            # single value; the `.item()` function just returns the Python value
            # from the tensor.
            total_train_loss += loss.item()

            # Perform a backward pass to calculate the gradients.
            loss.backward()

            # Clip the norm of the gradients to 1.0.
            # This is to help prevent the "exploding gradients" problem.
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and take a step using the computed gradient.
            # The optimizer dictates the "update rule"--how the parameters are
            # modified based on their gradients, the learning rate, etc.
            optimizer.step()

            # Update the learning rate.
            scheduler.step()

        # Calculate the average loss over all of the batches.
        avg_train_loss = total_train_loss / len(train_dataloader)

        # Measure how long this epoch took.
        training_time = format_time(time.time() - t0)

        print("")
        print("  Average training loss: {0:.2f}".format(avg_train_loss))
        print("  Training epcoh took: {:}".format(training_time))

        # ========================================
        #               Validation
        # ========================================
        # After the completion of each training epoch, measure our performance on
        # our validation set.

        print("")
        print("Running Validation...")

        t0 = time.time()

        # Put the model in evaluation mode--the dropout layers behave differently
        # during evaluation.
        model.eval()

        # Tracking variables
        total_eval_accuracy = 0
        total_eval_loss = 0
        nb_eval_steps = 0

        # Evaluate data for one epoch
        for batch in validation_dataloader:

            # Unpack this training batch from our dataloader.
            #
            # As we unpack the batch, we'll also copy each tensor to the GPU using
            # the `to` method.
            #
            # `batch` contains two pytorch tensors:
            #   [0]: input ids
            #   [1]: labels
            b_input_ids = batch[0].to(device)
            b_labels = batch[1].to(device)

            # Tell pytorch not to bother with constructing the compute graph during
            # the forward pass, since this is only needed for backprop (training).
            with torch.no_grad():

                # Forward pass, calculate logit predictions.
                # token_type_ids is the same as the "segment ids", which
                # differentiates sentence 1 and 2 in 2-sentence tasks.
                # The documentation for this `model` function is here:
                # https://huggingface.co/transformers/v2.2.0/model_doc/bert.html#transformers.BertForSequenceClassification
                # Get the "logits" output by the model. The "logits" are the output
                # values prior to applying an activation function like the softmax.
                (loss, logits) = model(b_input_ids, labels=b_labels)

            # Accumulate the validation loss.
            total_eval_loss += loss.item()

            # Move logits and labels to CPU
            logits = logits.detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()

            # Calculate the accuracy for this batch of test sentences, and
            # accumulate it over all batches.
            total_eval_accuracy += flat_accuracy(logits, label_ids)

        # Report the final accuracy for this validation run.
        avg_val_accuracy = total_eval_accuracy / len(validation_dataloader)
        print("  Accuracy: {0:.2f}".format(avg_val_accuracy))

        # Calculate the average loss over all of the batches.
        avg_val_loss = total_eval_loss / len(validation_dataloader)

        # Measure how long the validation run took.
        validation_time = format_time(time.time() - t0)

        print("  Validation Loss: {0:.2f}".format(avg_val_loss))
        print("  Validation took: {:}".format(validation_time))

        # Record all statistics from this epoch.
        training_stats.append({
            'epoch': epoch_i + 1,
            'Training Loss': avg_train_loss,
            'Valid. Loss': avg_val_loss,
            'Valid. Accur.': avg_val_accuracy,
            'Training Time': training_time,
            'Validation Time': validation_time
        })

    print("")
    print("Training complete!")

    print("Total training took {:} (h:mm:ss)".format(
        format_time(time.time() - total_t0)))

    #save model
    torch.save(model.state_dict(), "distil_finetuned.pt")
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)
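
    # For illustration: pad_sequence([tensor([1, 2, 3]), tensor([4, 5])],
    # batch_first=True, padding_value=0) returns tensor([[1, 2, 3], [4, 5, 0]]).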

    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs
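
    # e.g. 4000 batches with gradient_accumulation_steps = 2 over 3 epochs give
    # t_total = 4000 // 2 * 3 = 6000 optimizer steps.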

    model = model.module if hasattr(model, "module") else model  # Take care of distributed/parallel training
    model.resize_token_embeddings(len(tokenizer))

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
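    # Parameters whose names contain "bias" or "LayerNorm.weight" (e.g. a name
    # like "encoder.layer.0.output.dense.bias") fall into the second group and
    # receive no weight decay; all other parameters decay by args.weight_decay.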
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
            args.model_name_or_path
            and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
            and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)
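
            # For example, with global_step = 1000 and 400 optimizer steps per
            # epoch: epochs_trained = 1000 // 400 = 2 and
            # steps_trained_in_current_epoch = 1000 % 400 = 200.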

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = mask_tokens(batch, tokenizer, args) if args.mlm else (batch, batch)
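            # When --mlm is set, mask_tokens (defined elsewhere in this script)
            # typically follows the BERT recipe: about 15% of tokens become
            # prediction targets (80% of those replaced by the mask token, 10%
            # by random tokens, 10% left as-is), with the remaining positions
            # given an ignore label so the loss skips them.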
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, masked_lm_labels=labels) if args.mlm else model(inputs, labels=labels)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                            args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if 0 < args.max_steps < global_step:
                epoch_iterator.close()
                break
        if 0 < args.max_steps < global_step:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step