Example #1
def train(epoch, model, optimizer, scheduler):
    global global_step

    epoch_loss = 0.0
    running_num = 0
    running_loss = np.zeros(3)

    train_sampler.set_epoch(epoch)
    model.train()

    bar = tqdm(train_loader) if args.local_rank == 0 else train_loader

    for batch_idx, (x, c) in enumerate(bar):

        # Per-batch LR schedule; note that newer PyTorch versions expect
        # scheduler.step() to be called after optimizer.step().
        scheduler.step()
        global_step += 1

        x, c = x.to(device, non_blocking=True), c.to(device, non_blocking=True)

        optimizer.zero_grad()

        # The model returns per-sample log-probability and log-determinant terms;
        # the loss is their negated mean (a flow-style negative log-likelihood).
        log_p, logdet = model(x, c)
        log_p, logdet = torch.mean(log_p), torch.mean(logdet)

        loss = -(log_p + logdet)

        # Backward through Apex AMP's loss scaling, then clip the fp32 master params.
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()

        nn.utils.clip_grad_norm_(amp.master_params(optimizer), 1.)

        optimizer.step()

        running_num += 1
        running_loss[0] += loss.item()
        running_loss[1] += log_p.item()
        running_loss[2] += logdet.item()

        epoch_loss += loss.item()

        if args.local_rank == 0:
            bar.set_description('{}/{}, [Log pdf, Log p(z), Log Det] : {}'
                                .format(epoch, global_step, running_loss / running_num))
            if (batch_idx + 1) % 100 == 0:
                running_num = 0
                running_loss = np.zeros(3)

        del x, c, log_p, logdet, loss
    del running_loss
    gc.collect()
    print('{}/{}/{} Training Loss : {:.4f}'.format(epoch, global_step, args.local_rank, epoch_loss / (len(train_loader))))
    return epoch_loss / len(train_loader)
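Example #1 calls amp.scale_loss() and amp.master_params() but does not show the Apex AMP setup it relies on. A minimal sketch of that initialization (the model, learning rate, schedule, and opt_level "O1" below are placeholders, not the project's actual values) could look like:

import torch
import torch.nn as nn
from apex import amp  # assumes NVIDIA Apex is installed

device = torch.device("cuda")
model = nn.Linear(80, 256).to(device)                      # stand-in for the real model
optimizer = torch.optim.Adam(model.parameters(), lr=4e-4)
# After this call, amp.scale_loss(loss, optimizer) and amp.master_params(optimizer)
# inside train() work as shown above.
model, optimizer = amp.initialize(model, optimizer, opt_level="O1")
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=200_000, gamma=0.5)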
Example #2
def train(args, train_dataset, model, tokenizer):
    """ Train the model """

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to global_step of last saved checkpoint from model path
        global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
        epochs_trained = global_step // (len(train_dataloader) //
                                         args.gradient_accumulation_steps)
        steps_trained_in_current_epoch = global_step % (
            len(train_dataloader) // args.gradient_accumulation_steps)

        logger.info(
            "  Continuing training from checkpoint, will skip to saved global_step"
        )
        logger.info("  Continuing training from epoch %d", epochs_trained)
        logger.info("  Continuing training from global step %d", global_step)
        logger.info("  Will skip the first %d steps in the first epoch",
                    steps_trained_in_current_epoch)

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs),
        desc="Epoch",
        disable=args.local_rank not in [-1, 0],
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "labels": batch[3]
            }
            if args.model_type != "distilbert":
                inputs["token_type_ids"] = (
                    batch[2]
                    if args.model_type in ["bert", "xlnet", "albert"] else None
                )  # XLM, DistilBERT, RoBERTa, and XLM-RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logs = {}
                    if (
                            args.local_rank == -1
                            and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            eval_key = "eval_{}".format(key)
                            logs[eval_key] = value

                    loss_scalar = (tr_loss - logging_loss) / args.logging_steps
                    learning_rate_scalar = scheduler.get_lr()[0]
                    logs["learning_rate"] = learning_rate_scalar
                    logs["loss"] = loss_scalar
                    logging_loss = tr_loss

                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    print(json.dumps({**logs, **{"step": global_step}}))

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #3
def train(args) -> Tuple[int, float]:
    """ Train the model """

    tb_writer = SummaryWriter()
        
    state_path = f"{args.output_dir}/domain_pre-training_state.json"
    state = {
        "v": 0.1,
        "finished": False,
        "ts": {"t0": time.time()},
        "saved": {"idx": 0, "data": {}},
        "curr_global_step_idx": 0,
        "cumulative_runtime": 0,
        "metrics": {
            "validation": [],
        },
        "timed_out": False
    }
    start_step_idx = 0

    if args.output_dir[:5].lower() != "s3://":
        if not os.path.exists(f"{args.output_dir}/models"):
            os.makedirs(f"{args.output_dir}/models")
        if not os.path.exists(f"{args.output_dir}/metrics"):
            os.makedirs(f"{args.output_dir}/metrics")

    model = None
    if args.should_continue:
        saved_path = None
        print(f"Resuming state from: {state_path}")
        try:
            tmp_state = json.loads(open(state_path, 'r').read())
            if tmp_state["finished"]:
                saved_path = tmp_state["saved"]["data"][str(tmp_state["saved"]["idx"])]["model"]
                print("State shows that training has finished.")
                print(f"Final model was saved at: {saved_path}")
                print("Quitting.")
                exit(0)

            saved_path = tmp_state["saved"]["data"][str(tmp_state["saved"]["idx"])]["model"]
            print("SP:", saved_path)
            tokenizer, model, optimizer, scheduler = get_model_training_objects(args, saved_path)

            # TODO: Do a deeper validation of state
            # By now the state should be valid. Replace proper variables
            tmp_state['curr_global_step_idx'] = tmp_state["saved"]["data"][str(tmp_state["saved"]["idx"])]["global_step_idx"]
            start_step_idx = tmp_state['curr_global_step_idx'] + 1
            tmp_state['saved']['idx'] += 1
            state = tmp_state
        except Exception:
            if saved_path:
                print(f"Could not load model from {saved_path}")
            else:
                print(f"Could not get model information from {state_path}")
            print(sys.exc_info()[0])
            import traceback
            print(traceback.format_exc())
            pass

        #     if saved_path[:5].lower() == "s3://":
        #         dirname = tempfile.mkdtemp()
        #         if saved_path[-4:].lower() == ".tgz":
        #             cmd = f"aws s3 cp {saved_path} - | tar C {dirname} -zxf - . "
        #         elif saved_path[-4:].lower() == ".tar":
        #             cmd = f"aws s3 cp {saved_path} - | tar C {dirname} -xf - . "
        #         else:
        #             cmd = f"aws s3 cp --recursive {saved_path} {dirname}"

        #         print(cmd)
        #         os.system(cmd)
        #         #exit(4)
        #     else:
        #         dirname = saved_path

        #     if dirname[-1] != "/":
        #         dirname += "/"

        #     tmp_tokenizer = AutoTokenizer.from_pretrained(dirname)

        #     tmp_model = BertForMaskedLM.from_pretrained(
        #         dirname, config=AutoConfig.from_pretrained(dirname))

        #     # Prepare optimizer and schedule (linear warmup and decay)
        #     no_decay = ["bias", "LayerNorm.weight"]
        #     optimizer_grouped_parameters = [
        #         {
        #             "params": [p for n, p in tmp_model.named_parameters() if not any(nd in n for nd in no_decay)],
        #             "weight_decay": args.weight_decay,
        #         },
        #         {"params": [p for n, p in tmp_model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
        #     ]
        #     tmp_optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
        #     tmp_scheduler = get_linear_schedule_with_warmup(
        #         tmp_optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps
        #     )

        #     # Check if saved optimizer or scheduler states exist
        #     if os.path.isfile(os.path.join(dirname, "optimizer.pt")):
        #         tmp_optimizer.load_state_dict(torch.load(os.path.join(dirname, "optimizer.pt")))
        #     if os.path.isfile(os.path.join(dirname, "scheduler.pt")):
        #         tmp_scheduler.load_state_dict(torch.load(os.path.join(dirname, "scheduler.pt")))

        #     if saved_path[:5].lower() == "s3://":
        #         shutil.rmtree(dirname)

        #     # TODO: Do a deeper validation of state
        #     # By now the state should be valid. Replace proper variables
        #     tokenizer = tmp_tokenizer
        #     model = tmp_model
        #     optimizer = tmp_optimizer
        #     scheduler = tmp_scheduler
        #     state = tmp_state
        # except Exception:
        #     print(sys.exc_info()[0])
        #     import traceback
        #     print(traceback.format_exc())
        #     exit(6)
        #     # TODO: We should probably return and error (or raise custom exception) here and
        #     # let the caller initialize the model. At this point we can't be sure what the
        #     # configuration of the model should be.
        #     # Above, we don't set used variables until the tokenizer and model can be loaded.
        #     # If the caller has used the matter.set_* functions this should be enough, but the
        #     # flow should be dictated by the caller, not by exceptions catching here. Otherwise
        #     # we are forcing a valid model be loaded every time, even when resuming.
        #     print("Couldn't read/continue from previous state file. Discarding saved state.")

    if model is None:
        # Need all training objects
        print("Training model from scratch")
        tokenizer, model, optimizer, scheduler = get_model_training_objects(args)

    #args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    #inputfname = "/disco/data/matters/xmatter/gp_priv1/dbdata/df-document_info-with_uri_clean.csv"
    #inputfname = "/disco/data/matters/xmatter/gp_priv1/dbdata/df-document_info-in_golden_set.csv"
    train_df = pd.concat((pd.read_csv(f, keep_default_na=False,
                                      header=0,
                                      usecols=[args.csv_size_column, args.csv_uri_column],
                                      dtype={args.csv_size_column: 'int64', args.csv_uri_column: str})
                          for f in args.train_data_file))
    train_df.rename(columns={args.csv_uri_column: "uri", args.csv_size_column: "size"}, inplace=True)

    # Setup the dataset and data loader
    ds = DFMLMDataset(train_df, tokenizer, args)
    data_loader = DataLoader(ds,
                             batch_size=args.batch_size, shuffle=True,
                             #batch_size=args.batch_size, shuffle=False,
                             num_workers=args.data_loader_num_workers,
                             collate_fn=ds.dataloader_batch_collate)

    t_total = args.max_steps
    args.num_train_epochs = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    tr_loss, logging_loss = 0.0, 0.0

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num docs = %d", len(train_df))
    logger.info(f"  Total bytes = {train_df['size'].sum():,}")
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Batch size = %d", args.batch_size)
    #logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    #print("device=", args.device)
    #model = torch.nn.DataParallel(model)
    #model.to("cuda")
    model.to(args.device)
    model.zero_grad()

    set_seed(args)  # Added here for reproducibility

    data_loader_iterator = iter(data_loader)
    t_b_pre = time.time()
    # TODO: Note that I removed the reproducibility of batches
    for step in range(start_step_idx, args.max_steps):
        state["curr_global_step_idx"] = step

        try:
            batch = next(data_loader_iterator)
        except StopIteration:
            data_loader_iterator = iter(data_loader)
            batch = next(data_loader_iterator)

        inputs, labels = batch
        inputs = inputs.to(args.device)
        labels = labels.to(args.device)

        model.train()
        outputs = model(inputs, masked_lm_labels=labels)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

        if args.n_gpu > 1:
            loss = loss.mean()  # mean() to average on multi-gpu parallel training

        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        tr_loss += loss.item()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            optimizer.step()
            scheduler.step()
            model.zero_grad()

            #epoch_iterator.set_description(desc=f"Loss {(tr_loss/global_step):7.3f}")
            print(f"Step {step+1}/{args.max_steps} Loss {tr_loss/(step+1):7.3f}")

            if args.logging_steps > 0 and (step + 1) % args.logging_steps == 0:
                # Log metrics
                tb_writer.add_scalar("lr", scheduler.get_lr()[0], step+1)
                tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, step+1)
                logging_loss = tr_loss

            if args.save_steps > 0 and (step + 1) % args.save_steps == 0:
                model_dst = f"{args.output_dir}/models/global_step_idx_{step}"
                save_model_training_objects(args, model_dst, tokenizer, model, optimizer, scheduler)

                state["saved"]["data"][state['saved']['idx']] = {
                    "global_step_idx": step,
                    "model": model_dst,
                    "metrics": None,
                    "validation": False
                }

                print("saving state to ", state_path)
                save_state(state_path, state)

                state['saved']['idx'] += 1

        state["cumulative_runtime"] += time.time() - t_b_pre
        t_b_pre = time.time()

        if args.max_run_time_in_minutes:
            if (state["cumulative_runtime"]) / 60 > args.max_run_time_in_minutes:
                print(f"Terminating because we are over {args.max_run_time_in_minutes} minutes")
                state["timed_out"] = True
                break

    print("Finished training")

    tb_writer.close()

    # TODO: Keep track of last saved model and avoid saving it twice
    model_dst = f"{args.output_dir}/models/global_step_idx_{step}"
    print(f"Saving final trained model to {model_dst}")
    save_model_training_objects(args, model_dst, tokenizer, model, optimizer, scheduler)

    if args.should_continue:
        state["curr_global_step_idx"] = step
        state["finished"] = True
        state["ts"]["tz"] = time.time()

        state["saved"]["data"][state['saved']['idx']] = {
            "global_step_idx": step,
            "model": model_dst,
            "metrics": None,
            "validation": False,
            "final": True
        }

        print("saving state to ", state_path)
        save_state(state_path, state)

    while len(s3_upload_futures) > 0:
        print(f"Waiting on {len(s3_upload_futures)} upload set(s)")
        time.sleep(2)

    return step, tr_loss / step
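Example #3 depends on project-specific helpers (get_model_training_objects, save_model_training_objects, save_state, DFMLMDataset) that are not included here. As an illustration only, a minimal save_state that persists the resumable-state dict as JSON on a local filesystem path (the real helper may also handle s3:// destinations) might be:

import json
import os
import tempfile

def save_state(state_path, state):
    # Write to a temporary file in the same directory, then atomically replace,
    # so an interrupted run cannot leave a half-written state file behind.
    dir_name = os.path.dirname(state_path) or "."
    fd, tmp_path = tempfile.mkstemp(dir=dir_name, suffix=".json")
    with os.fdopen(fd, "w") as f:
        json.dump(state, f, indent=2)
    os.replace(tmp_path, state_path)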
Example #4
    def train(
        self, train_dataset, output_dir, show_running_loss=True, eval_file=None, verbose=True, **kwargs,
    ):
        """
        Trains the model on train_dataset.

        Utility function to be used by the train_model() method. Not intended to be used directly.
        """

        model = self.model
        args = self.args
        tokenizer = self.tokenizer

        def collate(examples: List[torch.Tensor]):
            if tokenizer._pad_token is None:
                return pad_sequence(examples, batch_first=True)
            return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

        tb_writer = SummaryWriter(logdir=args["tensorboard_dir"])
        train_sampler = RandomSampler(train_dataset)
        train_dataloader = DataLoader(
            train_dataset, sampler=train_sampler, batch_size=args["train_batch_size"], collate_fn=collate
        )

        if args["max_steps"] > 0:
            t_total = args["max_steps"]
            args["num_train_epochs"] = (
                args["max_steps"] // (len(train_dataloader) // args["gradient_accumulation_steps"]) + 1
            )
        else:
            t_total = len(train_dataloader) // args["gradient_accumulation_steps"] * args["num_train_epochs"]

        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
                "weight_decay": args["weight_decay"],
            },
            {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)]},
        ]

        warmup_steps = math.ceil(t_total * args["warmup_ratio"])
        args["warmup_steps"] = warmup_steps if args["warmup_steps"] == 0 else args["warmup_steps"]

        optimizer = AdamW(optimizer_grouped_parameters, lr=args["learning_rate"], eps=args["adam_epsilon"])
        scheduler = get_linear_schedule_with_warmup(
            optimizer, num_warmup_steps=args["warmup_steps"], num_training_steps=t_total
        )

        if (
            args["model_name"]
            and os.path.isfile(os.path.join(args["model_name"], "optimizer.pt"))
            and os.path.isfile(os.path.join(args["model_name"], "scheduler.pt"))
        ):
            # Load in optimizer and scheduler states
            optimizer.load_state_dict(torch.load(os.path.join(args["model_name"], "optimizer.pt")))
            scheduler.load_state_dict(torch.load(os.path.join(args["model_name"], "scheduler.pt")))

        if args["fp16"]:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")

            model, optimizer = amp.initialize(model, optimizer, opt_level=args["fp16_opt_level"])

        if args["n_gpu"] > 1:
            model = torch.nn.DataParallel(model)

        logger.info(" Training started")

        global_step = 0
        tr_loss, logging_loss = 0.0, 0.0
        model.zero_grad()
        train_iterator = trange(int(args["num_train_epochs"]), desc="Epoch", disable=args["silent"], mininterval=0)
        epoch_number = 0
        best_eval_metric = None
        early_stopping_counter = 0
        steps_trained_in_current_epoch = 0
        epochs_trained = 0

        if args["model_name"] and os.path.exists(args["model_name"]):
            try:
                # set global_step to global_step of last saved checkpoint from model path
                checkpoint_suffix = args["model_name"].split("/")[-1].split("-")
                if len(checkpoint_suffix) > 2:
                    checkpoint_suffix = checkpoint_suffix[1]
                else:
                    checkpoint_suffix = checkpoint_suffix[-1]
                global_step = int(checkpoint_suffix)
                epochs_trained = global_step // (len(train_dataloader) // args["gradient_accumulation_steps"])
                steps_trained_in_current_epoch = global_step % (
                    len(train_dataloader) // args["gradient_accumulation_steps"]
                )

                logger.info("   Continuing training from checkpoint, will skip to saved global_step")
                logger.info("   Continuing training from epoch %d", epochs_trained)
                logger.info("   Continuing training from global step %d", global_step)
                logger.info("   Will skip the first %d steps in the current epoch", steps_trained_in_current_epoch)
            except ValueError:
                logger.info("   Starting fine-tuning.")

        if args["evaluate_during_training"]:
            training_progress_scores = self._create_training_progress_scores(**kwargs)

        if args["wandb_project"]:
            wandb.init(project=args["wandb_project"], config={**args}, **args["wandb_kwargs"])
            wandb.watch(self.model)

        model.train()
        for current_epoch in train_iterator:
            if epochs_trained > 0:
                epochs_trained -= 1
                continue
            # epoch_iterator = tqdm(train_dataloader, desc="Iteration")
            for step, batch in enumerate(tqdm(train_dataloader, desc="Current iteration", disable=args["silent"])):
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                inputs, labels = mask_tokens(batch, tokenizer, args) if args["mlm"] else (batch, batch)
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)

                outputs = model(inputs, masked_lm_labels=labels) if args["mlm"] else model(inputs, labels=labels)
                # model outputs are always tuple in pytorch-transformers (see doc)
                loss = outputs[0]
                # if loss.item() < 1:
                #     masked = (labels[0] != -100).nonzero()
                #     print(labels[0][masked])
                #     preds = outputs[1][0, masked, :].clone().detach().cpu().numpy()
                #     print(np.argmax(preds, axis=2))

                if args["n_gpu"] > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training

                current_loss = loss.item()

                if show_running_loss:
                    print("\rRunning loss: %f" % loss, end="")

                if args["gradient_accumulation_steps"] > 1:
                    loss = loss / args["gradient_accumulation_steps"]

                if args["fp16"]:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()
                if (step + 1) % args["gradient_accumulation_steps"] == 0:
                    if args["fp16"]:
                        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args["max_grad_norm"])
                    else:
                        torch.nn.utils.clip_grad_norm_(model.parameters(), args["max_grad_norm"])

                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

                    if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                        # Log metrics
                        tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                        tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args["logging_steps"], global_step)
                        logging_loss = tr_loss
                        if args["wandb_project"]:
                            wandb.log(
                                {
                                    "Training loss": current_loss,
                                    "lr": scheduler.get_lr()[0],
                                    "global_step": global_step,
                                }
                            )

                    if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                        # Save model checkpoint
                        output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                        self._save_model(output_dir_current, optimizer, scheduler, model=model)

                    if args["evaluate_during_training"] and (
                        args["evaluate_during_training_steps"] > 0
                        and global_step % args["evaluate_during_training_steps"] == 0
                    ):
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = self.eval_model(
                            eval_file,
                            verbose=verbose and args["evaluate_during_training_verbose"],
                            silent=True,
                            **kwargs,
                        )
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)

                        output_dir_current = os.path.join(output_dir, "checkpoint-{}".format(global_step))

                        if args["save_eval_checkpoints"]:
                            self._save_model(output_dir_current, optimizer, scheduler, model=model, results=results)

                        training_progress_scores["global_step"].append(global_step)
                        training_progress_scores["train_loss"].append(current_loss)
                        for key in results:
                            training_progress_scores[key].append(results[key])
                        report = pd.DataFrame(training_progress_scores)
                        report.to_csv(
                            os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False,
                        )

                        if args["wandb_project"]:
                            wandb.log(self._get_last_metrics(training_progress_scores))

                        if not best_eval_metric:
                            best_eval_metric = results[args["early_stopping_metric"]]
                            self._save_model(
                                args["best_model_dir"], optimizer, scheduler, model=model, results=results
                            )
                        if best_eval_metric and args["early_stopping_metric_minimize"]:
                            if (
                                results[args["early_stopping_metric"]] - best_eval_metric
                                < args["early_stopping_delta"]
                            ):
                                best_eval_metric = results[args["early_stopping_metric"]]
                                self._save_model(
                                    args["best_model_dir"], optimizer, scheduler, model=model, results=results
                                )
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args["early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(f" No improvement in {args['early_stopping_metric']}")
                                            logger.info(f" Current step: {early_stopping_counter}")
                                            logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached."
                                            )
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step
                        else:
                            if (
                                results[args["early_stopping_metric"]] - best_eval_metric
                                > args["early_stopping_delta"]
                            ):
                                best_eval_metric = results[args["early_stopping_metric"]]
                                self._save_model(
                                    args["best_model_dir"], optimizer, scheduler, model=model, results=results
                                )
                                early_stopping_counter = 0
                            else:
                                if args["use_early_stopping"]:
                                    if early_stopping_counter < args["early_stopping_patience"]:
                                        early_stopping_counter += 1
                                        if verbose:
                                            logger.info(f" No improvement in {args['early_stopping_metric']}")
                                            logger.info(f" Current step: {early_stopping_counter}")
                                            logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                                    else:
                                        if verbose:
                                            logger.info(
                                                f" Patience of {args['early_stopping_patience']} steps reached."
                                            )
                                            logger.info(" Training terminated.")
                                            train_iterator.close()
                                        return global_step, tr_loss / global_step

                if args["max_steps"] > 0 and global_step > args["max_steps"]:
                    return global_step, tr_loss / global_step

            epoch_number += 1
            output_dir_current = os.path.join(output_dir, "checkpoint-{}-epoch-{}".format(global_step, epoch_number))

            if args["save_model_every_epoch"] or args["evaluate_during_training"]:
                os.makedirs(output_dir_current, exist_ok=True)

            if args["save_model_every_epoch"]:
                self._save_model(output_dir_current, optimizer, scheduler, model=model)

            if args["evaluate_during_training"]:
                results = self.eval_model(
                    eval_file, verbose=verbose and args["evaluate_during_training_verbose"], silent=True, **kwargs
                )

                self._save_model(output_dir_current, optimizer, scheduler, results=results)

                training_progress_scores["global_step"].append(global_step)
                training_progress_scores["train_loss"].append(current_loss)
                for key in results:
                    training_progress_scores[key].append(results[key])
                report = pd.DataFrame(training_progress_scores)
                report.to_csv(os.path.join(args["output_dir"], "training_progress_scores.csv"), index=False)

                if args["wandb_project"]:
                    wandb.log(self._get_last_metrics(training_progress_scores))

                if not best_eval_metric:
                    best_eval_metric = results[args["early_stopping_metric"]]
                    self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                if best_eval_metric and args["early_stopping_metric_minimize"]:
                    if results[args["early_stopping_metric"]] - best_eval_metric < args["early_stopping_delta"]:
                        best_eval_metric = results[args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                        early_stopping_counter = 0
                    else:
                        if args["use_early_stopping"] and args["early_stopping_consider_epochs"]:
                            if early_stopping_counter < args["early_stopping_patience"]:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(f" No improvement in {args['early_stopping_metric']}")
                                    logger.info(f" Current step: {early_stopping_counter}")
                                    logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                            else:
                                if verbose:
                                    logger.info(f" Patience of {args['early_stopping_patience']} steps reached")
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return global_step, tr_loss / global_step
                else:
                    if results[args["early_stopping_metric"]] - best_eval_metric > args["early_stopping_delta"]:
                        best_eval_metric = results[args["early_stopping_metric"]]
                        self._save_model(args["best_model_dir"], optimizer, scheduler, model=model, results=results)
                        early_stopping_counter = 0
                    else:
                        if args["use_early_stopping"] and args["early_stopping_consider_epochs"]:
                            if early_stopping_counter < args["early_stopping_patience"]:
                                early_stopping_counter += 1
                                if verbose:
                                    logger.info(f" No improvement in {args['early_stopping_metric']}")
                                    logger.info(f" Current step: {early_stopping_counter}")
                                    logger.info(f" Early stopping patience: {args['early_stopping_patience']}")
                            else:
                                if verbose:
                                    logger.info(f" Patience of {args['early_stopping_patience']} steps reached")
                                    logger.info(" Training terminated.")
                                    train_iterator.close()
                                return global_step, tr_loss / global_step

            if args["max_steps"] > 0 and global_step > args["max_steps"]:
                return global_step, tr_loss / global_step

        return global_step, tr_loss / global_step
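Example #4 delegates masking to a mask_tokens(batch, tokenizer, args) helper that is not shown. A sketch of the standard BERT-style MLM masking such a helper typically performs (the project's version may differ; args["mlm_probability"] is an assumed key) is:

import torch

def mask_tokens(inputs, tokenizer, args):
    # ~15% of tokens become prediction targets; of those, 80% -> [MASK],
    # 10% -> a random token, 10% -> left unchanged.
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, args.get("mlm_probability", 0.15))
    special_tokens_mask = [
        tokenizer.get_special_tokens_mask(val, already_has_special_tokens=True)
        for val in labels.tolist()
    ]
    probability_matrix.masked_fill_(torch.tensor(special_tokens_mask, dtype=torch.bool), value=0.0)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -100  # positions the loss should ignore

    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = tokenizer.convert_tokens_to_ids(tokenizer.mask_token)

    indices_random = (
        torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    )
    random_words = torch.randint(len(tokenizer), labels.shape, dtype=torch.long)
    inputs[indices_random] = random_words[indices_random]
    return inputs, labels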
Example #5
def main():
    parser = argparse.ArgumentParser()

    ## Required parameters
    parser.add_argument(
        "--data_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The input data dir. Should contain the .tsv files (or other data files) for the task."
    )
    parser.add_argument(
        "--bert_model",
        default=None,
        type=str,
        required=True,
        help="Bert pre-trained model selected in the list: bert-base-uncased, "
        "bert-large-uncased, bert-base-cased, bert-large-cased, bert-base-multilingual-uncased, "
        "bert-base-multilingual-cased, bert-base-chinese.")
    parser.add_argument("--task_name",
                        default=None,
                        type=str,
                        required=True,
                        help="The name of the task to train.")
    parser.add_argument(
        "--output_dir",
        default=None,
        type=str,
        required=True,
        help=
        "The output directory where the model predictions and checkpoints will be written."
    )

    ## Other parameters
    parser.add_argument(
        "--cache_dir",
        default="",
        type=str,
        help=
        "Where do you want to store the pre-trained models downloaded from s3")
    parser.add_argument(
        "--max_seq_length",
        default=128,
        type=int,
        help=
        "The maximum total input sequence length after WordPiece tokenization. \n"
        "Sequences longer than this will be truncated, and sequences shorter \n"
        "than this will be padded.")
    parser.add_argument("--do_train",
                        action='store_true',
                        help="Whether to run training.")
    parser.add_argument("--do_eval",
                        action='store_true',
                        help="Whether to run eval on the dev set.")
    parser.add_argument(
        "--do_lower_case",
        action='store_true',
        help="Set this flag if you are using an uncased model.")
    parser.add_argument("--train_batch_size",
                        default=32,
                        type=int,
                        help="Total batch size for training.")
    parser.add_argument("--eval_batch_size",
                        default=8,
                        type=int,
                        help="Total batch size for eval.")
    parser.add_argument("--learning_rate",
                        default=5e-5,
                        type=float,
                        help="The initial learning rate for Adam.")
    parser.add_argument("--num_train_epochs",
                        default=3.0,
                        type=float,
                        help="Total number of training epochs to perform.")
    parser.add_argument(
        "--warmup_proportion",
        default=0.1,
        type=float,
        help=
        "Proportion of training to perform linear learning rate warmup for. "
        "E.g., 0.1 = 10%% of training.")
    parser.add_argument("--weight_decay",
                        default=0.01,
                        type=float,
                        help="Weight deay if we apply some.")
    parser.add_argument("--adam_epsilon",
                        default=1e-8,
                        type=float,
                        help="Epsilon for Adam optimizer.")
    parser.add_argument("--max_grad_norm",
                        default=1.0,
                        type=float,
                        help="Max gradient norm.")
    parser.add_argument("--no_cuda",
                        action='store_true',
                        help="Whether not to use CUDA when available")
    parser.add_argument("--local_rank",
                        type=int,
                        default=-1,
                        help="local_rank for distributed training on gpus")
    parser.add_argument('--seed',
                        type=int,
                        default=42,
                        help="random seed for initialization")
    parser.add_argument(
        '--gradient_accumulation_steps',
        type=int,
        default=1,
        help=
        "Number of updates steps to accumulate before performing a backward/update pass."
    )
    parser.add_argument(
        '--fp16',
        action='store_true',
        help="Whether to use 16-bit float precision instead of 32-bit")
    parser.add_argument(
        '--fp16_opt_level',
        type=str,
        default='O1',
        help=
        "For fp16: Apex AMP optimization level selected in ['O0', 'O1', 'O2', and 'O3']."
        "See details at https://nvidia.github.io/apex/amp.html")
    parser.add_argument(
        '--loss_scale',
        type=float,
        default=0,
        help=
        "Loss scaling to improve fp16 numeric stability. Only used when fp16 set to True.\n"
        "0 (default value): dynamic loss scaling.\n"
        "Positive power of 2: static loss scaling value.\n")
    parser.add_argument('--server_ip',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    parser.add_argument('--server_port',
                        type=str,
                        default='',
                        help="Can be used for distant debugging.")
    args = parser.parse_args()

    if args.server_ip and args.server_port:
        # Distant debugging - see https://code.visualstudio.com/docs/python/debugging#_attach-to-a-local-script
        import ptvsd
        print("Waiting for debugger attach")
        ptvsd.enable_attach(address=(args.server_ip, args.server_port),
                            redirect_output=True)
        ptvsd.wait_for_attach()

    processors = {"ner": NerProcessor}

    if args.local_rank == -1 or args.no_cuda:
        device = torch.device("cuda" if torch.cuda.is_available()
                              and not args.no_cuda else "cpu")
        n_gpu = torch.cuda.device_count()
    else:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        n_gpu = 1
        # Initializes the distributed backend which will take care of synchronizing nodes/GPUs
        torch.distributed.init_process_group(backend='nccl')
    logger.info(
        "device: {} n_gpu: {}, distributed training: {}, 16-bits training: {}".
        format(device, n_gpu, bool(args.local_rank != -1), args.fp16))

    if args.gradient_accumulation_steps < 1:
        raise ValueError(
            "Invalid gradient_accumulation_steps parameter: {}, should be >= 1"
            .format(args.gradient_accumulation_steps))

    args.train_batch_size = args.train_batch_size // args.gradient_accumulation_steps

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    if not args.do_train and not args.do_eval:
        raise ValueError(
            "At least one of `do_train` or `do_eval` must be True.")

    if os.path.exists(args.output_dir) and os.listdir(
            args.output_dir) and args.do_train:
        raise ValueError(
            "Output directory ({}) already exists and is not empty.".format(
                args.output_dir))
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)

    task_name = args.task_name.lower()

    if task_name not in processors:
        raise ValueError("Task not found: %s" % (task_name))

    processor = processors[task_name]()
    label_list = processor.get_labels()
    num_labels = len(label_list) + 1

    tokenizer = BertTokenizer.from_pretrained(args.bert_model,
                                              do_lower_case=args.do_lower_case)

    train_examples = None
    num_train_optimization_steps = 0
    if args.do_train:
        train_examples = processor.get_train_examples(args.data_dir)
        num_train_optimization_steps = int(
            len(train_examples) / args.train_batch_size /
            args.gradient_accumulation_steps) * args.num_train_epochs
        if args.local_rank != -1:
            num_train_optimization_steps = num_train_optimization_steps // torch.distributed.get_world_size(
            )

    if args.local_rank not in [-1, 0]:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    # Prepare model
    config = BertConfig.from_pretrained(args.bert_model,
                                        num_labels=num_labels,
                                        finetuning_task=args.task_name)
    model = Ner.from_pretrained(args.bert_model, from_tf=False, config=config)

    if args.local_rank == 0:
        torch.distributed.barrier(
        )  # Make sure only the first process in distributed training will download model & vocab

    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params':
        [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
        'weight_decay':
        args.weight_decay
    }, {
        'params':
        [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
        'weight_decay':
        0.0
    }]
    warmup_steps = int(args.warmup_proportion * num_train_optimization_steps)
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=warmup_steps,
                                     t_total=num_train_optimization_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if n_gpu > 1:
        model = torch.nn.DataParallel(model)

    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    global_step = 0
    nb_tr_steps = 0
    tr_loss = 0
    i = 0
    label_map = {i: label for i, label in enumerate(label_list, 0)}
    if args.do_train:
        train_features = convert_examples_to_features(train_examples,
                                                      label_list,
                                                      args.max_seq_length,
                                                      tokenizer)
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_examples))
        logger.info("  Batch size = %d", args.train_batch_size)
        logger.info("  Num steps = %d", num_train_optimization_steps)
        all_input_ids = torch.tensor([f.input_ids for f in train_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in train_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in train_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in train_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in train_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in train_features],
                                     dtype=torch.long)
        train_data = TensorDataset(all_input_ids, all_input_mask,
                                   all_segment_ids, all_label_ids,
                                   all_valid_ids, all_lmask_ids)
        if args.local_rank == -1:
            train_sampler = RandomSampler(train_data)
        else:
            train_sampler = DistributedSampler(train_data)
        train_dataloader = DataLoader(train_data,
                                      sampler=train_sampler,
                                      batch_size=args.train_batch_size)

        model.train()
        for _ in trange(int(args.num_train_epochs), desc="Epoch"):
            tr_loss = 0
            nb_tr_examples, nb_tr_steps = 0, 0
            for step, batch in enumerate(
                    tqdm(train_dataloader, desc="Iteration")):
                batch = tuple(t.to(device) for t in batch)
                input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask = batch
                loss = model(input_ids, segment_ids, input_mask, label_ids,
                             valid_ids, l_mask)
                if n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu.
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                tr_loss += loss.item()
                nb_tr_examples += input_ids.size(0)
                nb_tr_steps += 1
                if (step + 1) % args.gradient_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()  # Update learning rate schedule
                    model.zero_grad()
                    global_step += 1

        # Save a trained model and the associated configuration
        model_to_save = model.module if hasattr(
            model, 'module') else model  # Only save the model itself
        model_to_save.save_pretrained(args.output_dir)
        tokenizer.save_pretrained(args.output_dir)
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        model_config = {
            "bert_model": args.bert_model,
            "do_lower": args.do_lower_case,
            "max_seq_length": args.max_seq_length,
            "num_labels": len(label_list) + 1,
            "label_map": label_map
        }
        json.dump(
            model_config,
            open(os.path.join(args.output_dir, "model_config.json"), "w"))
        # Load a trained model and config that you have fine-tuned
    else:
        # Load a trained model and vocabulary that you have fine-tuned
        model = Ner.from_pretrained(args.output_dir)
        tokenizer = BertTokenizer.from_pretrained(
            args.output_dir, do_lower_case=args.do_lower_case)

    model.to(device)

    if args.do_eval and (args.local_rank == -1
                         or torch.distributed.get_rank() == 0):
        eval_examples = processor.get_test_examples(args.data_dir)
        eval_features = convert_examples_to_features(eval_examples, label_list,
                                                     args.max_seq_length,
                                                     tokenizer)
        logger.info("***** Running evaluation *****")
        logger.info("  Num examples = %d", len(eval_examples))
        logger.info("  Batch size = %d", args.eval_batch_size)
        all_input_ids = torch.tensor([f.input_ids for f in eval_features],
                                     dtype=torch.long)
        all_input_mask = torch.tensor([f.input_mask for f in eval_features],
                                      dtype=torch.long)
        all_segment_ids = torch.tensor([f.segment_ids for f in eval_features],
                                       dtype=torch.long)
        all_label_ids = torch.tensor([f.label_id for f in eval_features],
                                     dtype=torch.long)
        all_valid_ids = torch.tensor([f.valid_ids for f in eval_features],
                                     dtype=torch.long)
        all_lmask_ids = torch.tensor([f.label_mask for f in eval_features],
                                     dtype=torch.long)
        eval_data = TensorDataset(all_input_ids, all_input_mask,
                                  all_segment_ids, all_label_ids,
                                  all_valid_ids, all_lmask_ids)
        # Run prediction for full data
        eval_sampler = SequentialSampler(eval_data)
        eval_dataloader = DataLoader(eval_data,
                                     sampler=eval_sampler,
                                     batch_size=args.eval_batch_size)
        model.eval()
        eval_loss, eval_accuracy = 0, 0
        nb_eval_steps, nb_eval_examples = 0, 0
        y_true = []
        y_pred = []
        label_map = {i: label for i, label in enumerate(label_list, 1)}
        for input_ids, input_mask, segment_ids, label_ids, valid_ids, l_mask in tqdm(
                eval_dataloader, desc="Evaluating"):
            input_ids = input_ids.to(device)
            input_mask = input_mask.to(device)
            segment_ids = segment_ids.to(device)
            valid_ids = valid_ids.to(device)
            label_ids = label_ids.to(device)
            l_mask = l_mask.to(device)

            with torch.no_grad():
                logits = model(input_ids,
                               segment_ids,
                               input_mask,
                               valid_ids=valid_ids,
                               attention_mask_label=l_mask)

            logits = torch.argmax(F.log_softmax(logits, dim=2), dim=2)
            logits = logits.detach().cpu().numpy()
            label_ids = label_ids.to('cpu').numpy()
            input_mask = input_mask.to('cpu').numpy()
            print("---------------------------------------------->")
            print(label_map)
            print("---------------------------------------------->")
            for i, label in enumerate(label_ids):
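                # rebuild the gold/predicted label sequences: skip the first position and
                # stop once label id len(label_map) (the sequence-end marker) is reached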
                temp_1 = []
                temp_2 = []

                for j, m in enumerate(label):
                    if j == 0:
                        continue
                    elif label_ids[i][j] == len(label_map):
                        y_true.append(temp_1)
                        y_pred.append(temp_2)
                        break
                    else:
                        temp_1.append(label_map[label_ids[i][j]])
                        temp_2.append(label_map[logits[i][j]])

        report = classification_report(y_true, y_pred, digits=4)
        logger.info("\n%s", report)
        output_eval_file = os.path.join(args.output_dir, "eval_results.txt")
        with open(output_eval_file, "w") as writer:
            logger.info("***** Eval results *****")
            logger.info("\n%s", report)
            writer.write(report)
Example #6
0
def train(args, model, tokenizer, query_cache, passage_cache):
    """ Train the model """
    logger.info("Training/evaluation parameters %s", args)
    tb_writer = None
    if is_first_worker():
        tb_writer = SummaryWriter(log_dir=args.log_dir)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu) #nll loss for query
    real_batch_size = args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1)
    
    optimizer = get_optimizer(args, model, weight_decay=args.weight_decay,)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps= args.max_steps
    )
    global_step = 0

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    if args.model_name_or_path != "bert-base-uncased":
        if args.fp16 and args.resume_fp16_checkpoint :
            from collections import OrderedDict
            checkpoint = torch.load(args.model_name_or_path, map_location=lambda s, l: default_restore_location(s, 'cpu'))
            new_state_dict = OrderedDict()
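            # drop the leading 'module.' (7 chars) that DataParallel/DistributedDataParallel
            # prepends to parameter names so the keys match the unwrapped model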
            for k, v in checkpoint['model'].items():
                name = k[7:]
                new_state_dict[name] = v
            model.load_state_dict(new_state_dict)
            # model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            amp.load_state_dict(checkpoint['amp'])
            global_step = checkpoint['offset']
        else:
            saved_state = load_states_from_checkpoint(args.model_name_or_path)
            global_step, model, optimizer, scheduler = _load_saved_state(model, optimizer, scheduler, saved_state, load_optimizer_scheduler=args.load_optimizer_scheduler)

        logger.info("  Continuing training from checkpoint, will skip to saved global_step")
        logger.info("  Continuing training from global step %d", global_step)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True,
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Max steps = %d", args.max_steps)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)

    tr_loss = {}
    model.zero_grad()
    model.train()
    set_seed(args)  # Added here for reproducibility

    last_ann_no = -1
    train_dataloader = None
    train_dataloader_iter = None
    dev_ndcg = 0
    step = 0
    iter_count = 0

    if not args.dual_training:
        args.dual_loss_weight = 0.0


    if args.model_name_or_path != "bert-base-uncased":
        nq_dev_nll_loss, nq_correct_ratio = evaluate_dev(args, model, passage_cache)
        # dev_nll_loss_trivia, correct_ratio_trivia = evaluate_dev(args, model, passage_cache, "-trivia")
        if is_first_worker():
            tb_writer.add_scalar("dev_nll_loss/dev_nll_loss", nq_dev_nll_loss, global_step)
            tb_writer.add_scalar("dev_nll_loss/correct_ratio", nq_correct_ratio, global_step)
            # tb_writer.add_scalar("dev_nll_loss/dev_nll_loss_trivia", dev_nll_loss_trivia, global_step)
            # tb_writer.add_scalar("dev_nll_loss/correct_ratio_trivia", correct_ratio_trivia, global_step)

    while global_step < args.max_steps:
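        # step-driven training loop: newly generated ANN training data is picked up on
        # logging boundaries and the dataloader is rebuilt from it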

        if step % args.gradient_accumulation_steps == 0 and global_step % args.logging_steps == 0:

            if args.num_epoch == 0:
                # check if new ann training data is available
                ann_no, ann_path, ndcg_json = get_latest_ann_data(args.ann_dir)
                if ann_path is not None and ann_no != last_ann_no:
                    logger.info("Training on new add data at %s", ann_path)
                    with open(ann_path, 'r') as f:
                        ann_training_data = f.readlines()
                    logger.info("Training data line count: %d", len(ann_training_data))
                    ann_training_data = [l for l in ann_training_data if len(l.split('\t')[2].split(',')) > 1]
                    logger.info("Filtered training data line count: %d", len(ann_training_data))
                    if args.dual_training:
                        ann_training_data = [l for l in ann_training_data if len(l.split('\t')[3].split(',')) > 1]
                        logger.info("Filtered training data line count in dual training: %d", len(ann_training_data))
                    
                    ann_checkpoint_path = ndcg_json['checkpoint']
                    ann_checkpoint_no = get_checkpoint_no(ann_checkpoint_path)

                    aligned_size = (len(ann_training_data) // args.world_size) * args.world_size
                    ann_training_data = ann_training_data[:aligned_size]

                    logger.info("Total ann queries: %d", len(ann_training_data))
                    if args.triplet:
                        if args.dual_training:                  
                            train_dataset = StreamingDataset(ann_training_data, GetQuadrapuletTrainingDataProcessingFn(args, query_cache, passage_cache))
                        else:
                            train_dataset = StreamingDataset(ann_training_data, GetTripletTrainingDataProcessingFn(args, query_cache, passage_cache))
                        train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size)
                    else:
                        train_dataset = StreamingDataset(ann_training_data, GetTrainingDataProcessingFn(args, query_cache, passage_cache))
                        train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size*2)
                    train_dataloader_iter = iter(train_dataloader)

                    # re-warmup
                    if not args.single_warmup:
                        scheduler = get_linear_schedule_with_warmup(
                            optimizer, num_warmup_steps=args.warmup_steps, num_training_steps= len(ann_training_data)
                        )

                    if args.local_rank != -1:
                        dist.barrier()
            
                    if is_first_worker():
                        # add ndcg at checkpoint step used instead of current step
                        tb_writer.add_scalar("retrieval_accuracy/top20_nq", ndcg_json['top20'], ann_checkpoint_no)
                        tb_writer.add_scalar("retrieval_accuracy/top100_nq", ndcg_json['top100'], ann_checkpoint_no)
                        if 'dev_top20' in ndcg_json:
                            tb_writer.add_scalar("retrieval_accuracy/top20_nq_dev", ndcg_json['dev_top20'], ann_checkpoint_no)
                            tb_writer.add_scalar("retrieval_accuracy/top100_nq_dev", ndcg_json['dev_top100'], ann_checkpoint_no)                 
                        if 'top20_trivia' in ndcg_json:
                            tb_writer.add_scalar("retrieval_accuracy/top20_trivia", ndcg_json['top20_trivia'], ann_checkpoint_no)
                            tb_writer.add_scalar("retrieval_accuracy/top100_trivia", ndcg_json['top100_trivia'], ann_checkpoint_no)
                        if last_ann_no != -1:
                            tb_writer.add_scalar("epoch", last_ann_no, global_step-1)
                        tb_writer.add_scalar("epoch", ann_no, global_step)
                    last_ann_no = ann_no
            elif step == 0:
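                # epoch-based training (num_epoch != 0): load the fixed training file once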
                train_data_path = os.path.join(args.data_dir, "train-data")
                with open(train_data_path, 'r') as f:
                    training_data = f.readlines()
                if args.triplet:
                    train_dataset = StreamingDataset(training_data, GetTripletTrainingDataProcessingFn(args, query_cache, passage_cache))
                    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size)
                else:
                    train_dataset = StreamingDataset(training_data, GetTrainingDataProcessingFn(args, query_cache, passage_cache))
                    train_dataloader = DataLoader(train_dataset, batch_size=args.train_batch_size*2)
                all_batch = [b for b in train_dataloader]
                logger.info("Total batch count: %d", len(all_batch))
                train_dataloader_iter = iter(train_dataloader)

        try:
            batch = next(train_dataloader_iter)
        except StopIteration:
            logger.info("Finished iterating current dataset, begin reiterate")
            if args.num_epoch != 0:
                iter_count += 1
                if is_first_worker():
                    tb_writer.add_scalar("epoch", iter_count-1, global_step-1)
                    tb_writer.add_scalar("epoch", iter_count, global_step)
            nq_dev_nll_loss, nq_correct_ratio = evaluate_dev(args, model, passage_cache)
            # dev_nll_loss_trivia, correct_ratio_trivia = evaluate_dev(args, model, passage_cache, "-trivia")
            if is_first_worker():
                tb_writer.add_scalar("dev_nll_loss/dev_nll_loss", nq_dev_nll_loss, global_step)
                tb_writer.add_scalar("dev_nll_loss/correct_ratio", nq_correct_ratio, global_step)
                # tb_writer.add_scalar("dev_nll_loss/dev_nll_loss_trivia", dev_nll_loss_trivia, global_step)
                # tb_writer.add_scalar("dev_nll_loss/correct_ratio_trivia", correct_ratio_trivia, global_step)
            train_dataloader_iter = iter(train_dataloader)
            batch = next(train_dataloader_iter)
            dist.barrier()

        if args.num_epoch != 0 and iter_count > args.num_epoch:
            break
        
        step += 1
        if args.triplet:
            loss, loss_dict = triplet_fwd_pass(args, model, batch)
        else:
            loss, correct_cnt = do_biencoder_fwd_pass(args, model, batch)

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            if step % args.gradient_accumulation_steps == 0:
                loss.backward()
            else:
                with model.no_sync():
                    loss.backward()          

        for k in loss_dict:
            tr_loss[k] = tr_loss.get(k, 0.0) + loss_dict[k].item()

        if step % args.gradient_accumulation_steps == 0:
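            # optimizer update only on accumulation boundaries: clip, step, schedule, reset grads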
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args.logging_steps > 0 and global_step % args.logging_steps == 0:
                logs = {"loss_total":0.0}
                for k in tr_loss:
                    tr_loss[k] = tr_loss[k] / args.logging_steps
                    logs[k] = tr_loss[k]
                    logs["loss_total"] = logs["loss_total"] + logs[k]
                # loss_scalar = tr_loss / args.logging_steps
                learning_rate_scalar = scheduler.get_lr()[0]
                # logs["loss"] = loss_scalar
                logs["learning_rate"] = learning_rate_scalar
                tr_loss = {}

                if is_first_worker():
                    for key, value in logs.items():
                        tb_writer.add_scalar(key, value, global_step)
                    logger.info(json.dumps({**logs, **{"step": global_step}}))

            if is_first_worker() and args.save_steps > 0 and global_step % args.save_steps == 0:
                if args.fp16:
                    cp = os.path.join(args.output_dir, 'checkpoint-' + str(global_step))
                    # Save checkpoint
                    checkpoint = {
                        'model': model.state_dict(),
                        'optimizer': optimizer.state_dict(),
                        'scheduler' : scheduler.state_dict(),
                        'amp': amp.state_dict(),
                        'epoch':0,
                        'offset': global_step,
                    }
                    torch.save(checkpoint, cp)
                    logger.info('Saved checkpoint at %s', cp)
                else:
                    _save_checkpoint(args, model, optimizer, scheduler, global_step)

    if args.local_rank == -1 or torch.distributed.get_rank() == 0:
        tb_writer.close()

    return global_step
Example #7
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout

    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string): directory to save tensorboard logs
    checkpoint_path (string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """

    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(
            model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    if hparams.use_vae:
        criterion = Tacotron2Loss_VAE(hparams)
    else:
        criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(
        output_directory, log_directory, rank, hparams.use_vae)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)
    valset_csv = os.path.join(output_directory, log_directory, 'valset.csv')
    # list2csv(flatten_list(valset.audiopaths_and_text), valset_csv, delimiter='|')
    list2csv(valset.audiopaths_and_text, valset_csv, delimiter='|')

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(
                checkpoint_path, model, hparams.ignore_layers)
        else:
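            # resume training: restore optimizer state, work out which epoch/iteration to
            # continue from, and rebuild the dataloaders starting at that epoch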
            model, optimizer, _learning_rate, iteration, epoch, step = \
                load_checkpoint(checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            if epoch == 0:
                iteration += 1  # next iteration is iteration + 1
                epoch_offset = max(0, int(iteration / len(train_loader)))
            else:
                epoch_offset = epoch
            print('epoch offset: {}'.format(epoch_offset))
            train_loader = prepare_dataloaders(hparams, epoch_offset, valset,
                collate_fn['train'])[0]
        print('finished loading model ...')

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    track_csv = os.path.join(output_directory, log_directory, 'track.csv')
    track_header = ['padding-rate-txt', 'max-len-txt', 'top-len-txt',
        'padding-rate-mel', 'max-len-mel', 'top-len-mel', 'batch-size',
        'batch-length', 'batch-area', 'mem-use', 'mem-all', 'mem-cached',
        'duration', 'iteration', 'epoch', 'step']
    if os.path.isfile(track_csv) and checkpoint_path is not None:
        print('loading existing {} ...'.format(track_csv))
        track = csv2dict(track_csv, header=track_header)
    else:
        track = {k:[] for k in track_header}

    print('start training in epoch {} ~ {} ...'.format(epoch_offset, hparams.epochs))
    nbatches = len(train_loader)
    for epoch in range(epoch_offset, hparams.epochs):
        #if epoch >= 10: break
        print("Epoch: {}, #batches: {}".format(epoch, nbatches))
        batch_sizes, batch_lengths = [0] * nbatches, [0] * nbatches
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            if hparams.use_vae:
                loss, recon_loss, kl, kl_weight = criterion(y_pred, y, iteration)
            else:
                loss = criterion(y_pred, y)

            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()

            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
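                # clip on amp's fp32 master params; a NaN grad norm indicates fp16 overflow,
                # and such steps are skipped for logging/checkpointing below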
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                batch_sizes[i], batch_lengths[i] = batch[0].size(0), batch[2].size(2)
                batch_capacity = batch_sizes[i] * batch_lengths[i]
                mem_all = torch.cuda.memory_allocated() / (1024**2)
                mem_cached = torch.cuda.memory_cached() / (1024**2)
                mem_use = mem_all + mem_cached
                print("{} ({}:{}/{}): ".format(iteration, epoch, i, nbatches), end='')
                print("Batch {} ({}X{}) ".format(batch_capacity, batch_sizes[i],
                    batch_lengths[i]), end='')
                print("Mem {:.1f} ({:.1f}+{:.1f}) ".format(mem_use, mem_all,
                    mem_cached), end='')
                print("Train loss {:.3f} Grad Norm {:.3f} {:.2f}s/it".format(
                    reduced_loss, grad_norm, duration))
                input_lengths, gate_padded = batch[1], batch[4]
                metadata = (duration, iteration, epoch, i)
                track_seq(track, input_lengths, gate_padded, metadata)
                padding_rate_txt = track['padding-rate-txt'][-1]
                max_len_txt = track['max-len-txt'][-1]
                padding_rate_mel = track['padding-rate-mel'][-1]
                max_len_mel = track['max-len-mel'][-1]
                if hparams.use_vae:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration,
                        padding_rate_txt, max_len_txt, padding_rate_mel,
                        max_len_mel, iteration, recon_loss, kl, kl_weight)
                else:
                    logger.log_training(
                        reduced_loss, grad_norm, learning_rate, duration,
                        padding_rate_txt, max_len_txt, padding_rate_mel,
                        max_len_mel, iteration)

            check_by_iter = (hparams.check_by == 'iter') and \
                            (iteration % hparams.iters_per_checkpoint == 0)
            check_by_epoch = (hparams.check_by == 'epoch') and i == 0 and \
                             (epoch % hparams.epochs_per_checkpoint == 0)
            if not is_overflow and (check_by_iter or check_by_epoch):
                dict2col(track, track_csv, verbose=True)
                val_loss, (mus, emotions) = validate(model, criterion, valset,
                     iteration, hparams.batch_size, n_gpus, collate_fn['val'], logger,
                     hparams.distributed_run, rank, hparams.use_vae, pre_batching=False)
                if rank == 0:
                    checkpoint_path = os.path.join(output_directory,
                        "checkpoint_{}-{}-{}_{:.3f}".format(iteration, epoch, i, val_loss))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                         epoch, i, checkpoint_path)
                    if hparams.use_vae:
                        image_scatter_path = os.path.join(output_directory,
                             "checkpoint_{0}_scatter_val.png".format(iteration))
                        image_tsne_path = os.path.join(output_directory,
                             "checkpoint_{0}_tsne_val.png".format(iteration))
                        imageio.imwrite(image_scatter_path, plot_scatter(mus, emotions))
                        imageio.imwrite(image_tsne_path, plot_tsne(mus, emotions))

            iteration += 1

        if hparams.prep_trainset_per_epoch:
            train_loader = prepare_dataloaders(hparams, epoch+1, valset,
                collate_fn['train'])[0]
            nbatches = len(train_loader)
Example #8
0
    def fit(
        self,
        epochs,
        lr,
        validate=True,
        schedule_type="warmup_cosine",
        optimizer_type="lamb",
    ):

        tensorboard_dir = self.output_dir / "tensorboard"
        tensorboard_dir.mkdir(exist_ok=True)

        # Train the model
        tb_writer = SummaryWriter(tensorboard_dir)

        train_dataloader = self.data.train_dl
        if self.max_steps > 0:
            t_total = self.max_steps
            self.epochs = (self.max_steps //
                           (len(train_dataloader) // self.grad_accumulation_steps) + 1)
        else:
            t_total = len(
                train_dataloader) // self.grad_accumulation_steps * epochs

        # Prepare optimiser
        optimizer = self.get_optimizer(lr, optimizer_type=optimizer_type)

        # get the base model if it's already wrapped in DataParallel
        if hasattr(self.model, "module"):
            self.model = self.model.module

        if self.is_fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex to use fp16 training")
            self.model, optimizer = amp.initialize(
                self.model, optimizer, opt_level=self.fp16_opt_level)

        # Get scheduler
        scheduler = self.get_scheduler(optimizer,
                                       t_total=t_total,
                                       schedule_type=schedule_type)

        # Parallelize the model architecture
        if self.multi_gpu is True:
            self.model = torch.nn.DataParallel(self.model)

        # Start Training
        self.logger.info("***** Running training *****")
        self.logger.info("  Num examples = %d", len(train_dataloader.dataset))
        self.logger.info("  Num Epochs = %d", epochs)
        self.logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.data.train_batch_size * self.grad_accumulation_steps,
        )
        self.logger.info("  Gradient Accumulation steps = %d",
                         self.grad_accumulation_steps)
        self.logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        epoch_step = 0
        tr_loss, logging_loss, epoch_loss = 0.0, 0.0, 0.0
        self.model.zero_grad()
        pbar = master_bar(range(epochs))

        for epoch in pbar:
            epoch_step = 0
            epoch_loss = 0.0
            for step, batch in enumerate(
                    progress_bar(train_dataloader, parent=pbar)):
                self.model.train()
                batch = tuple(t.to(self.device) for t in batch)
                inputs = {
                    "input_ids": batch[0],
                    "attention_mask": batch[1],
                    "labels": batch[3],
                }

                if self.model_type in ["bert", "xlnet"]:
                    inputs["token_type_ids"] = batch[2]

                outputs = self.model(**inputs)
                loss = outputs[
                    0]  # model outputs are always tuple in pytorch-transformers (see doc)

                if self.n_gpu > 1:
                    loss = (
                        loss.mean()
                    )  # mean() to average on multi-gpu parallel training
                if self.grad_accumulation_steps > 1:
                    loss = loss / self.grad_accumulation_steps

                if self.is_fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), self.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(self.model.parameters(),
                                                   self.max_grad_norm)

                tr_loss += loss.item()
                epoch_loss += loss.item()
                if (step + 1) % self.grad_accumulation_steps == 0:
                    optimizer.step()
                    scheduler.step()

                    self.model.zero_grad()
                    global_step += 1
                    epoch_step += 1

                    if self.logging_steps > 0 and global_step % self.logging_steps == 0:
                        if validate:
                            # evaluate model
                            results = self.validate()
                            for key, value in results.items():
                                tb_writer.add_scalar("eval_{}".format(key),
                                                     value, global_step)
                                self.logger.info(
                                    "eval_{} after step {}: {}: ".format(
                                        key, global_step, value))

                        # Log metrics
                        self.logger.info("lr after step {}: {}".format(
                            global_step,
                            scheduler.get_lr()[0]))
                        self.logger.info("train_loss after step {}: {}".format(
                            global_step,
                            (tr_loss - logging_loss) / self.logging_steps,
                        ))
                        tb_writer.add_scalar("lr",
                                             scheduler.get_lr()[0],
                                             global_step)
                        tb_writer.add_scalar(
                            "loss",
                            (tr_loss - logging_loss) / self.logging_steps,
                            global_step,
                        )

                        logging_loss = tr_loss

            # Evaluate the model after every epoch
            if validate:
                results = self.validate()
                for key, value in results.items():
                    self.logger.info("eval_{} after epoch {}: {}: ".format(
                        key, (epoch + 1), value))

            # Log metrics
            self.logger.info("lr after epoch {}: {}".format(
                (epoch + 1),
                scheduler.get_lr()[0]))
            self.logger.info("train_loss after epoch {}: {}".format(
                (epoch + 1), epoch_loss / epoch_step))
            self.logger.info("\n")

        tb_writer.close()
        return global_step, tr_loss / global_step
Example #9
0
def train(args, train_dataset, model: PreTrainedModel, tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples: List[torch.Tensor]):
        if tokenizer._pad_token is None:
            return pad_sequence(examples, batch_first=True)
        return pad_sequence(examples, batch_first=True, padding_value=tokenizer.pad_token_id)

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset, sampler=train_sampler, batch_size=args.train_batch_size, collate_fn=collate, drop_last=True
    )

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Take care of distributed/parallel training
    model = model.module if hasattr(model, "module") else model
    model.resize_token_embeddings(len(tokenizer))
    # add_special_tokens_(model, tokenizer)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )

    # Check if saved optimizer or scheduler states exist
    if (
        args.model_name_or_path
        and os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt"))
        and os.path.isfile(os.path.join(args.model_name_or_path, "scheduler.pt"))
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(
            os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(
            os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(
            model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[
                args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if args.model_name_or_path and os.path.exists(args.model_name_or_path):
        try:
            # set global_step to global_step of last saved checkpoint from model path
            checkpoint_suffix = args.model_name_or_path.split(
                "-")[-1].split("/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info(
                "  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0

    model.zero_grad()
    train_iterator = trange(
        epochs_trained, int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0]
    )
    set_seed(args)  # Added here for reproducibility
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            inputs, labels = (batch, batch)
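            # for language-model fine-tuning the labels mirror the inputs (the model is
            # expected to compute the shifted LM loss); overlong sequences are skipped below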
            if inputs.shape[1] > 1024:
                continue
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs, labels=labels)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(
                        model.parameters(), args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar(
                                "eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar(
                        "lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar(
                        "loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    # torch.save(args, os.path.join(
                    #    output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(), os.path.join(
                        output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(
                        output_dir, "scheduler.pt"))
                    logger.info(
                        "Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #10
0
    def train(self):
        if not self.pretrained_model:
            model = GPT2KWModel(config=self.model_config)
        else:
            self.print_and_log('Loading pretrained model')
            model = GPT2KWModel.from_pretrained(self.pretrained_model)
        model.train()
        model.to(self.device)
        # count the number of model parameters
        num_parameters = 0
        parameters = model.parameters()
        for parameter in parameters:
            num_parameters += parameter.numel()
        self.print_and_log('Number of model parameters: {}'.format(num_parameters))

        self.print_and_log("开始加载训练集")
        train_loader, valid_loader = self.create_dataloader()
        self.print_and_log("训练集加载完毕")

        epoch_steps = int(train_loader.sampler.num_samples / self.batch_size / self.accumulation_steps)
        total_steps = epoch_steps * self.epochs
        self.print_and_log('Total samples = {}'.format(train_loader.sampler.num_samples))
        self.print_and_log('Steps per epoch = {}'.format(epoch_steps))
        self.print_and_log('Total steps = {}'.format(total_steps))

        optimizer = pytorch_transformers.AdamW(model.parameters(), lr=self.lr, correct_bias=True)
        # scheduler = pytorch_transformers.WarmupLinearSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)
        scheduler = pytorch_transformers.WarmupCosineSchedule(optimizer, warmup_steps=self.warmup_steps, t_total=total_steps)
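        # a cosine schedule with warmup is used; the linear schedule above is kept commented out for reference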
        if self.fp16:
            try:
                from apex import amp
            except ImportError:
                raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
            model, optimizer = amp.initialize(model, optimizer, opt_level=self.fp16_opt_level)

        if torch.cuda.device_count() > 1:
            model = DataParallel(model)
            multi_gpu = True
        else:
            multi_gpu = False

        overall_step = 0
        running_loss = 0
        model.train()
        for epoch in range(self.epochs):
            self.print_and_log('epoch {}'.format(epoch + 1))
            now = datetime.now()
            self.print_and_log('time: {}'.format(now))
            optimizer.zero_grad()
            for i, batch_data in enumerate(train_loader):
                if torch.cuda.is_available():
                    keyword_ids = batch_data[0].to(self.device, non_blocking=True)
                    passage_ids = batch_data[1].to(self.device, non_blocking=True)
                    label_ids = passage_ids.clone().to(self.device, non_blocking=True)
                else:
                    keyword_ids = batch_data[0]
                    passage_ids = batch_data[1]
                    label_ids = passage_ids.clone()
                outputs = model(input_ids=passage_ids, keyword_ids=keyword_ids, labels=label_ids)
                loss, logits = outputs[:2]
                # multi-GPU training
                if multi_gpu:
                    loss = loss.mean()
                # gradient accumulation
                if self.gradient_accumulation > 1:
                    loss = loss / self.gradient_accumulation
                # mixed-precision (fp16) or standard training
                if self.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                    # clip after the scale_loss context exits so the master gradients are unscaled
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), self.max_grad_norm)
                else:
                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), self.max_grad_norm)
                # update weights on accumulation boundaries
                if (i + 1) % self.gradient_accumulation == 0:
                    running_loss += loss.item()
                    optimizer.step()
                    scheduler.step()
                    optimizer.zero_grad()
                    overall_step += 1
                # report the running train loss
                if (overall_step + 1) % self.log_step == 0 and running_loss != 0:
                    self.print_and_log('now time: {}:{}. Step {} of epoch {}, loss {}'.format(
                        datetime.now().hour,
                        datetime.now().minute,
                        overall_step + 1,
                        epoch + 1,
                        running_loss * self.gradient_accumulation / self.log_step))
                    running_loss = 0

            # run validation
            with torch.no_grad():
                valid_start_time = datetime.now()
                model.eval()
                valid_loss = 0
                valid_step = 0
                for i, valid_batch_data in enumerate(valid_loader):
                    if torch.cuda.is_available():
                        keyword_ids = valid_batch_data[0].to(self.device, non_blocking=True)
                        passage_ids = valid_batch_data[1].to(self.device, non_blocking=True)
                        label_ids = passage_ids.clone().to(self.device, non_blocking=True)
                    else:
                        keyword_ids = valid_batch_data[0]
                        passage_ids = valid_batch_data[1]
                        label_ids = passage_ids.clone()
                    outputs = model(input_ids=passage_ids, keyword_ids=keyword_ids, labels=label_ids)
                    loss, logits = outputs[:2]
                    valid_loss += loss.mean().item()  # .mean() covers the multi-GPU case
                    valid_step += 1
                valid_loss = valid_loss / valid_step
                self.print_and_log('valid duration: {}, valid loss: {}'.format(datetime.now() - valid_start_time, valid_loss))

            # save the model checkpoint
            if (epoch + 1) % 1 == 0:
                if not os.path.exists(self.output_dir + 'model_epoch{}'.format(epoch + 1)):
                    os.makedirs(self.output_dir + 'model_epoch{}'.format(epoch + 1))
                model_to_save = model.module if hasattr(model, 'module') else model
                model_to_save.save_pretrained(self.output_dir + 'model_epoch{}'.format(epoch + 1))
                # torch.save(scheduler.state_dict(), output_dir + 'model_epoch{}/scheduler.pt'.format(epoch + 1))
                # torch.save(optimizer.state_dict(), output_dir + 'model_epoch{}/optimizer.pt'.format(epoch + 1))

            then = datetime.now()
            self.print_and_log('time: {}'.format(then))
            self.print_and_log('time for one epoch: {}'.format(then - now))
            model.train()

        self.print_and_log('training finished')
        self.f_log.close()
        if not os.path.exists(self.output_dir + 'final_model'):
            os.makedirs(self.output_dir + 'final_model')
        model_to_save = model.module if hasattr(model, 'module') else model
        model_to_save.save_pretrained(self.output_dir + 'final_model')
Example #11
0
def train(args):
    fh = logging.FileHandler(f"./output/{args.model}/{args.dataset}/logs.txt")
    # create file handler which logs even debug messages
    logger.addHandler(fh)  # add the handlers to the logger
    timestamp = datetime.now().strftime('%Y%m%d%H%M')

    random.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    device = torch.device(
        f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")

    def save_model(model, epoch):
        torch.save(
            model.state_dict(),
            f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5')

    def load_model(model, epoch, to_device):
        assert os.path.exists(
            f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5'
        ), f'Weights at epoch {epoch} not found'
        model.load_state_dict(
            torch.load(
                f'./output/{args.model}/{args.dataset}/models/epo{epoch}.h5',
                map_location=to_device))

    config = getattr(configs, 'config_' + args.model)()
    print(config)

    ###############################################################################
    # Load data
    ###############################################################################
    data_path = args.data_path + args.dataset + '/'
    train_set = eval(config['dataset_name'])(config, data_path,
                                             config['train_cfg'],
                                             config['n_node'],
                                             config['train_desc'],
                                             config['desc_len'])
    '''
    valid_set = eval(config['dataset_name'])(data_path,
                                  config['valid_tokens'], config['tokens_len'],
                                  config['valid_desc'], config['desc_len'])
    '''
    data_loader = torch.utils.data.DataLoader(dataset=train_set,
                                              batch_size=config['batch_size'],
                                              shuffle=True,
                                              drop_last=False,
                                              num_workers=1)

    ###############################################################################
    # Define the models
    ###############################################################################
    logger.info('Constructing Model..')
    model = getattr(models, args.model)(config)  #initialize the model
    if args.reload_from > 0:
        load_model(model, args.reload_from, device)
    logger.info('done')
    model.to(device)

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.01
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    # origin: AdamW
    optimizer = torch.optim.AdamW(optimizer_grouped_parameters,
                                  lr=config['learning_rate'],
                                  eps=config['adam_epsilon'])
    # no scheduler in paper's original code
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=config['warmup_steps'],
        num_training_steps=len(data_loader) * config['nb_epoch']
    )  # do not forget to modify the number when the dataset is changed
    if config['fp16']:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=config['fp16_opt_level'])

    print('---model parameters---')
    num_params = 0
    for param in model.parameters():
        num_params += param.numel()
    print(num_params / 1e6)  # parameter count in millions

    n_iters = len(data_loader)
    itr_global = args.reload_from + 1
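    # iteration/epoch numbering continues from the reloaded checkpoint (0 when training from scratch)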
    for epoch in range(int(args.reload_from) + 1, config['nb_epoch'] + 1):
        itr_start_time = time.time()
        losses = []
        for batch in data_loader:

            model.train()
            batch_gpu = [tensor.to(device) for tensor in batch]
            loss = model(*batch_gpu)

            if config['fp16']:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               5.0)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 5.0)

            optimizer.step()
            scheduler.step()
            model.zero_grad()

            losses.append(loss.item())

            if itr_global % args.log_every == 0:
                elapsed = time.time() - itr_start_time
                logger.info('epo:[%d/%d] itr:[%d/%d] step_time:%ds Loss=%.5f' %
                            (epoch, config['nb_epoch'], itr_global % n_iters,
                             n_iters, elapsed, np.mean(losses)))

                losses = []
                itr_start_time = time.time()
            itr_global = itr_global + 1

        # save every epoch
        if epoch >= 80:
            if epoch % 2 == 0:
                save_model(model, epoch)
Example #12
0
    def _train_epoch(
        self,
        scheduler,
        epoch: int,
        eval_step: int,
        train_data_iterator: ShardedDataIterator,
    ):

        args = self.args
        rolling_train_loss = 0.0
        epoch_loss = 0
        epoch_correct_predictions = 0

        log_result_step = args.log_batch_step
        rolling_loss_step = args.train_rolling_loss_step
        num_hard_negatives = args.hard_negatives
        num_other_negatives = args.other_negatives
        seed = args.seed
        self.biencoder.train()
        epoch_batches = train_data_iterator.max_iterations
        data_iteration = 0
        t1 = time.time()

        for i, samples_batch in enumerate(
                train_data_iterator.iterate_data(epoch=epoch)):

            # to be able to resume shuffled ctx pools
            data_iteration = train_data_iterator.get_iteration()
            random.seed(seed + epoch + data_iteration)
            biencoder_batch = BiEncoder.create_biencoder_input(
                samples_batch,
                self.tensorizer,
                True,
                num_hard_negatives,
                num_other_negatives,
                shuffle=True,
                shuffle_positives=args.shuffle_positive_ctx)

            loss, correct_cnt = _do_biencoder_fwd_pass(self.biencoder,
                                                       biencoder_batch,
                                                       self.tensorizer, args)

            epoch_correct_predictions += correct_cnt
            epoch_loss += loss.item()
            rolling_train_loss += loss.item()

            if args.fp16:
                from apex import amp
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(self.optimizer), args.max_grad_norm)
            else:
                loss.backward()
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(self.biencoder.parameters(),
                                                   args.max_grad_norm)

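            # step the optimizer only after accumulating gradients over gradient_accumulation_steps batches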
            if (i + 1) % args.gradient_accumulation_steps == 0:
                self.optimizer.step()
                scheduler.step()
                self.biencoder.zero_grad()

            if i % log_result_step == 0:
                lr = self.optimizer.param_groups[0]['lr']
                logger.info('Epoch: %d: Step: %d/%d, loss=%f, lr=%f', epoch,
                            data_iteration, epoch_batches, loss.item(), lr)

            if (i + 1) % rolling_loss_step == 0:
                t2 = time.time()
                elapsed = t2 - t1
                logger.info('Train batch %d', data_iteration)
                latest_rolling_train_av_loss = rolling_train_loss / rolling_loss_step
                logger.info('Avg. loss per last %d batches: %f',
                            rolling_loss_step, latest_rolling_train_av_loss)
                logger.info('Avg. single GPU speed per last %d batches: %f',
                            rolling_loss_step, elapsed / rolling_loss_step)
                rolling_train_loss = 0.0
                t1 = t2

            if data_iteration % eval_step == 0:
                logger.info('Validation: Epoch: %d Step: %d/%d', epoch,
                            data_iteration, epoch_batches)
                self.validate_and_save(epoch,
                                       train_data_iterator.get_iteration(),
                                       scheduler)
                self.biencoder.train()

        self.validate_and_save(epoch, data_iteration, scheduler)

        epoch_loss = (epoch_loss / epoch_batches) if epoch_batches > 0 else 0
        logger.info('Av Loss per epoch=%f', epoch_loss)
        logger.info('epoch total correct predictions=%d',
                    epoch_correct_predictions)
Example #13
0
def train(args, trn_loader, dev_loader, model, tokenizer, cand_uttr_sys_dict,
          others):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter("runs/" + args.output_dir.replace("/", "-"))

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(trn_loader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            trn_loader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Num batches = %d", len(trn_loader))
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0

    tr_loss, logging_loss = 0.0, 0.0
    loss_mlm, loss_rs = 0, 0
    patience, best_loss = 0, 1e10
    xeloss = torch.nn.CrossEntropyLoss()

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility

    for _ in train_iterator:

        ## Calculate kmeans results in the beginning of each epoch
        if args.negative_sampling_by_kmeans:
            ToD_BERT_SYS_UTTR_KMEANS, KMEANS_to_SENTS = get_candidate_kmeans(
                args, cand_uttr_sys_dict, tokenizer, model)
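            # rebuild the train loader so this epoch's batches are re-drawn (presumably paired with the fresh k-means clusters)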
            trn_loader = get_loader(vars(args), "train", tokenizer,
                                    others["datasets"], others["unified_meta"],
                                    "train")

        loss_arr, loss_mlm_arr, loss_rs_arr = [], [], []
        epoch_iterator = tqdm(trn_loader,
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            ## add response selection into pretraining
            if args.add_rs_loss:
                kmeans_others = {
                    "ToD_BERT_SYS_UTTR_KMEANS": ToD_BERT_SYS_UTTR_KMEANS,
                    "KMEANS_to_SENTS": KMEANS_to_SENTS
                } if args.negative_sampling_by_kmeans else {}

                ## Split dialogue into (context, response) pairs
                input_cont, input_resp, resp_label = mask_for_response_selection(
                    batch, tokenizer, args, cand_uttr_sys_dict, kmeans_others)
                ## Mask context part for MLM loss
                input_cont, labels = mask_tokens(
                    input_cont, tokenizer, args) if args.mlm else (input_cont,
                                                                   input_cont)

                ## Allocate tensors to (gpu) devices
                input_cont = input_cont.to(args.device)
                input_resp = input_resp.to(args.device)
                resp_label = resp_label.to(args.device)
                labels = labels.to(args.device)

                ## Encode the context part with BERT
                outputs = model.bert(
                    input_cont,
                    attention_mask=input_cont > 0,
                )
                sequence_output = outputs[0]
                hid_cont = sequence_output[:, 0, :]  ## CLS token

                ## Calculate MLM loss for the context
                prediction_scores = model.cls(sequence_output)
                loss = xeloss(
                    prediction_scores.view(-1, model.config.vocab_size),
                    labels.view(-1))
                loss_mlm = loss.item()

                ## Encode the response part with BERT
                outputs = model.bert(
                    input_resp,
                    attention_mask=input_resp > 0,
                )
                sequence_output = outputs[0]
                hid_resp = sequence_output[:, 0, :]

                ## Calculate RCL loss
                scores = torch.matmul(hid_cont, hid_resp.transpose(1, 0))
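                ## scores[i, j] = dot-product similarity between context i and candidate response j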
                loss_rs = xeloss(scores, resp_label)
                loss += loss_rs
                loss_rs = loss_rs.item()

            ## with only MLM loss
            else:
                inputs = batch["context"].clone()
                if args.mlm:
                    inputs, labels = mask_tokens(inputs, tokenizer, args)
                    inputs = inputs.to(args.device)
                    labels = labels.to(args.device)
                    outputs = model(inputs,
                                    masked_lm_labels=labels,
                                    attention_mask=inputs > 0)
                else:
                    labels = inputs.clone()
                    masked_indices = (labels == 0)
                    labels[masked_indices] = -100
                    outputs = model(inputs, labels=labels)
                loss = outputs[
                    0]  # model outputs are always tuple in transformers (see doc)
                loss_mlm = loss.item()

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps
            loss_arr.append(loss.item())
            loss_mlm_arr.append(loss_mlm)
            loss_rs_arr.append(loss_rs)

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            ## Print loss
            epoch_iterator.set_description(
                "Loss:{:.4f} MLM:{:.4f} RS:{:.4f}".format(
                    np.mean(loss_arr), np.mean(loss_mlm_arr),
                    np.mean(loss_rs_arr)))

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:

                    if args.evaluate_during_training and args.n_gpu == 1:
                        results = evaluate(args, model, dev_loader, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    else:
                        results = {}
                        results["loss"] = best_loss - 0.1  # always saving

                    if results["loss"] < best_loss:
                        patience = 0
                        best_loss = results["loss"]

                        checkpoint_prefix = "checkpoint"
                        # Save model checkpoint
                        output_dir = os.path.join(
                            args.output_dir,
                            "{}-{}".format(checkpoint_prefix, global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = (
                            model.module
                            if hasattr(model, "module") else model
                        )  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)

                        torch.save(
                            args, os.path.join(output_dir,
                                               "training_args.bin"))
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)

                        _rotate_checkpoints(args, checkpoint_prefix)

                        torch.save(optimizer.state_dict(),
                                   os.path.join(output_dir, "optimizer.pt"))
                        torch.save(scheduler.state_dict(),
                                   os.path.join(output_dir, "scheduler.pt"))
                        logger.info(
                            "Saving optimizer and scheduler states to %s",
                            output_dir)
                    else:
                        patience += 1
                        logger.info(
                            "Current patience: patience {}".format(patience))

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break

            if patience > args.patience:
                logger.info("Ran out of patience...")
                break

        if (args.max_steps > 0
                and global_step > args.max_steps) or patience > args.patience:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #14
0
def train(args, train_dataset, model):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    val_len = int(len(train_dataset)*args.val_fraction)
    train_len = len(train_dataset) - val_len
    train_ds, val_ds = random_split(train_dataset,[train_len, val_len])

    train_sampler = RandomSampler(train_ds) if args.local_rank == -1 else DistributedSampler(train_ds)
    train_dataloader = DataLoader(train_ds, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    args.warmup_steps = int(t_total*args.warmup_proportion)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [
        {'params': [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)], 'weight_decay': args.weight_decay},
        {'params': [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], 'weight_decay': 0.0}
        ]

    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
            os.path.join(args.model_name_or_path, "scheduler.pt")
    ):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.local_rank],
                                                          output_device=args.local_rank,
                                                          find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_ds))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                   args.train_batch_size * args.gradient_accumulation_steps * (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        # set global_step to the global_step of the last saved checkpoint from the model path
        try:
            global_step = int(args.model_name_or_path.split("-")[-1].split("/")[0])
            epochs_trained = global_step // (len(train_dataloader) // args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info("  Continuing training from checkpoint, will skip to saved global_step")
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d", global_step)
            logger.info("  Will skip the first %d steps in the first epoch", steps_trained_in_current_epoch)
        except ValueError:
            pass

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(
        epochs_trained,
        int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])

    set_seed(seed=args.seed, n_gpu=args.n_gpu)  # Added here for reproducibility (even between python 2 and 3)

    global_max_seq_len = -1

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            # using adaptive sequence length
            if args.do_freelb:
                max_seq_len = torch.max(torch.sum(batch[1], 1)).item()
                batch = [t[:, :max_seq_len] for t in batch[:3]] + [batch[3]]
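                # batch[1] is the attention mask, so every tensor is truncated to the longest real sequence in the batch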
                if max_seq_len > global_max_seq_len:
                    global_max_seq_len = max_seq_len


                inputs = {'attention_mask': batch[1],
                          'labels':         batch[3]}
            else:
                inputs = {'input_ids': batch[0],
                          'attention_mask': batch[1],
                          'labels': batch[3]}

            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in ['bert', 'xlnet'] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids

            if args.do_freelb and args.model_type in ['bert','albert']:
                # ============================ Code for adversarial training=============
                # initialize delta
                if isinstance(model, torch.nn.DataParallel):
                    embeds_init = model.module.bert.embeddings.word_embeddings(batch[0])
                else:
                    embeds_init = model.bert.embeddings.word_embeddings(batch[0])

                if args.adv_init_mag > 0:
                    input_mask = inputs['attention_mask'].to(embeds_init)
                    input_lengths = torch.sum(input_mask, 1)

                    if args.norm_type == "l2":
                        delta = torch.zeros_like(embeds_init).uniform_(-1, 1) * input_mask.unsqueeze(2)
                        dims = input_lengths * embeds_init.size(-1)
                        mag = args.adv_init_mag / torch.sqrt(dims)
                        delta = (delta * mag.view(-1, 1, 1)).detach()
                    elif args.norm_type == "linf":
                        delta = torch.zeros_like(embeds_init).uniform_(-args.adv_init_mag,
                                                                       args.adv_init_mag) * input_mask.unsqueeze(2)

                else:
                    delta = torch.zeros_like(embeds_init)

                # the main loop
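                # FreeLB: take args.adv_steps ascent steps on the embedding perturbation delta, accumulating parameter gradients at each step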
                for astep in range(args.adv_steps):
                    # (0) forward
                    delta.requires_grad_()
                    inputs['inputs_embeds'] = delta + embeds_init
                    outputs = model(**inputs)
                    loss = outputs[0]  # model outputs are always tuple in transformers (see doc)
                    # (1) backward
                    if args.n_gpu > 1:
                        loss = loss.mean()  # mean() to average on multi-gpu parallel training
                    if args.gradient_accumulation_steps > 1:
                        loss = loss / args.gradient_accumulation_steps

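                    # average over the adversarial steps so the accumulated gradient keeps a single-step scale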
                    loss = loss / args.adv_steps

                    tr_loss += loss.item()

                    if args.fp16:
                        with amp.scale_loss(loss, optimizer) as scaled_loss:
                            scaled_loss.backward()
                    else:
                        loss.backward()

                    if astep == args.adv_steps - 1:
                        # further updates on delta
                        break

                    # (2) get gradient on delta
                    delta_grad = delta.grad.clone().detach()

                    # (3) update and clip
                    if args.norm_type == "l2":
                        denorm = torch.norm(delta_grad.view(delta_grad.size(0), -1), dim=1).view(-1, 1, 1)
                        denorm = torch.clamp(denorm, min=1e-8)
                        delta = (delta + args.adv_lr * delta_grad / denorm).detach()
                        if args.adv_max_norm > 0:
                            delta_norm = torch.norm(delta.view(delta.size(0), -1).float(), p=2, dim=1).detach()
                            exceed_mask = (delta_norm > args.adv_max_norm).to(embeds_init)
                            reweights = (args.adv_max_norm / delta_norm * exceed_mask \
                                         + (1 - exceed_mask)).view(-1, 1, 1)
                            delta = (delta * reweights).detach()
                    elif args.norm_type == "linf":
                        denorm = torch.norm(delta_grad.view(delta_grad.size(0), -1), dim=1, p=float("inf")).view(-1, 1,
                                                                                                                 1)
                        denorm = torch.clamp(denorm, min=1e-8)
                        delta = (delta + args.adv_lr * delta_grad / denorm).detach()
                        if args.adv_max_norm > 0:
                            delta = torch.clamp(delta, -args.adv_max_norm, args.adv_max_norm).detach()
                    else:
                        logger.info("Norm type {} not specified.".format(args.norm_type))
                        exit()

                    if isinstance(model, torch.nn.DataParallel):
                        embeds_init = model.module.bert.embeddings.word_embeddings(batch[0])
                    else:
                        embeds_init = model.bert.embeddings.word_embeddings(batch[0])
            else:
                outputs = model(**inputs)
                loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

                if args.n_gpu > 1:
                    loss = loss.mean()  # mean() to average on multi-gpu parallel training
                if args.gradient_accumulation_steps > 1:
                    loss = loss / args.gradient_accumulation_steps

                if args.fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                tr_loss += loss.item()

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, val_ds, model, prefix=global_step)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss)/args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(), os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(), os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #15
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    record_result = []
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if not any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if any(nd in n for nd in no_decay)
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    # Check if saved optimizer or scheduler states exist
    if os.path.isfile(os.path.join(
            args.model_name_or_path, "optimizer.pt")) and os.path.isfile(
                os.path.join(args.model_name_or_path, "scheduler.pt")):
        # Load in optimizer and scheduler states
        optimizer.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "optimizer.pt")))
        scheduler.load_state_dict(
            torch.load(os.path.join(args.model_name_or_path, "scheduler.pt")))

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    epochs_trained = 0
    steps_trained_in_current_epoch = 0
    # Check if continuing training from a checkpoint
    if os.path.exists(args.model_name_or_path):
        try:
            # set global_step to the global_step of the last saved checkpoint from the model path
            checkpoint_suffix = args.model_name_or_path.split("-")[-1].split(
                "/")[0]
            global_step = int(checkpoint_suffix)
            epochs_trained = global_step // (len(train_dataloader) //
                                             args.gradient_accumulation_steps)
            steps_trained_in_current_epoch = global_step % (
                len(train_dataloader) // args.gradient_accumulation_steps)

            logger.info(
                "  Continuing training from checkpoint, will skip to saved global_step"
            )
            logger.info("  Continuing training from epoch %d", epochs_trained)
            logger.info("  Continuing training from global step %d",
                        global_step)
            logger.info("  Will skip the first %d steps in the first epoch",
                        steps_trained_in_current_epoch)
        except ValueError:
            logger.info("  Starting fine-tuning.")

    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(epochs_trained,
                            int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    # Added here for reproducibility
    set_seed(args)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            # Skip past any already trained steps if resuming training
            if steps_trained_in_current_epoch > 0:
                steps_trained_in_current_epoch -= 1
                continue

            model.train()
            batch = tuple(t.to(args.device) for t in batch)

            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                "token_type_ids": batch[2],
                "start_positions": batch[3],
                "end_positions": batch[4],
            }

            if args.model_type in [
                    "xlm", "roberta", "distilbert", "camembert"
            ]:
                del inputs["token_type_ids"]

            if args.model_type in ["xlnet", "xlm"]:
                inputs.update({"cls_index": batch[5], "p_mask": batch[6]})
                if args.version_2_with_negative:
                    inputs.update({"is_impossible": batch[7]})
                if hasattr(model, "config") and hasattr(
                        model.config, "lang2id"):
                    inputs.update({
                        "langs":
                        (torch.ones(batch[0].shape, dtype=torch.int64) *
                         args.lang_id).to(args.device)
                    })

            outputs = model(**inputs)
            # model outputs are always tuple in transformers (see doc)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                # Log metrics
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Only evaluate when single GPU otherwise metrics may not average well
                    if args.local_rank == -1 and args.evaluate_during_training:
                        results = evaluate(args, model, tokenizer)
                        record_result.append(results)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value,
                                                 global_step)
                    tb_writer.add_scalar("lr",
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                # Save model checkpoint
                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    output_dir = os.path.join(
                        args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Take care of distributed/parallel training
                    model_to_save = model.module if hasattr(
                        model, "module") else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    torch.save(model, os.path.join(output_dir, "model.pt"))

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    results = evaluate(args, model, tokenizer)
    record_result.append(results)

    torch.save(record_result, os.path.join(args.output_dir, 'result.pt'))

    return global_step, tr_loss / global_step
Example #16
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=None,
                                  batch_size=args.train_batch_size)
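    # NOTE: train_sampler defined above is not passed in, so the first epoch iterates in dataset order (presumably so the cached batches align across epochs)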

    # Lists to store data after first epoch
    cached_all_input_ids = []
    cached_all_input_mask = []
    cached_all_segment_ids = []
    cached_all_start_positions = []
    cached_all_end_positions = []
    cached_all_cls_index = []
    cached_all_p_mask = []
    cached_all_outputs = []

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 1
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)

    # First epoch
    epoch_iterator = tqdm(train_dataloader,
                          desc="Iteration",
                          disable=args.local_rank not in [-1, 0])
    for step, batch in enumerate(epoch_iterator):
        model.train()
        batch = tuple(t.to(args.device) for t in batch)
        inputs = {
            'input_ids': batch[0],
            'attention_mask': batch[1],
            'start_positions': batch[3],
            'end_positions': batch[4]
        }
        if args.model_type != 'distilbert':
            inputs[
                'token_type_ids'] = None if args.model_type == 'xlm' else batch[
                    2]
        if args.model_type in ['xlnet', 'xlm']:
            inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
        outputs = model(**inputs)

        # cache data
        #cache_data = cache_data.append((batch[0], batch[1], batch[2], batch[3], batch[4], batch[5], batch[6], batch[6], outputs))
        cached_all_input_ids.append(batch[0])
        cached_all_input_mask.append(batch[1])
        cached_all_segment_ids.append(batch[2])
        cached_all_start_positions.append(batch[3])
        cached_all_end_positions.append(batch[4])
        cached_all_cls_index.append(batch[5])
        cached_all_p_mask.append(batch[6])
        cached_all_outputs.append(outputs[0])
        loss = outputs[
            0]  # model outputs are always tuple in transformers (see doc)
        print(loss.shape)
        if args.n_gpu > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel (not distributed) training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        tr_loss += loss.item()
        if (step + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            optimizer.step()
            scheduler.step()  # Update learning rate schedule
            model.zero_grad()
            global_step += 1

            if args.local_rank in [
                    -1, 0
            ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                # Log metrics
                if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                    results = evaluate(args, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value,
                                             global_step)
                tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                     args.logging_steps, global_step)
                logging_loss = tr_loss

            if args.local_rank in [
                    -1, 0
            ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                # Save model checkpoint
                output_dir = os.path.join(args.output_dir,
                                          'checkpoint-{}'.format(global_step))
                if not os.path.exists(output_dir):
                    os.makedirs(output_dir)
                model_to_save = model.module if hasattr(
                    model, 'module'
                ) else model  # Take care of distributed/parallel training
                model_to_save.save_pretrained(output_dir)
                torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                logger.info("Saving model checkpoint to %s", output_dir)

        if args.max_steps > 0 and global_step > args.max_steps:
            epoch_iterator.close()
            break
    '''
    if args.max_steps > 0 and global_step > args.max_steps:
        train_iterator.close()
        break
    '''
    # For epochs > 1 : use the create train_dataset
    print("****************************",
          torch.stack(cached_all_input_ids).shape,
          torch.stack(cached_all_input_mask).shape,
          torch.stack(cached_all_segment_ids).shape,
          torch.stack(cached_all_start_positions).shape,
          torch.stack(cached_all_end_positions).shape,
          torch.stack(cached_all_cls_index).shape,
          torch.stack(cached_all_p_mask).shape,
          torch.stack(cached_all_outputs).shape)
    train_dataset = TensorDataset(torch.stack(cached_all_input_ids),
                                  torch.stack(cached_all_input_mask),
                                  torch.stack(cached_all_segment_ids),
                                  torch.stack(cached_all_start_positions),
                                  torch.stack(cached_all_end_positions),
                                  torch.stack(cached_all_cls_index),
                                  torch.stack(cached_all_p_mask),
                                  torch.stack(cached_all_outputs))
    # Initialise train_data loader
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    train_iterator = trange(int(args.num_train_epochs - 1),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    for i in train_iterator:
        start_time = timeit.default_timer()
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'start_positions': batch[3],
                'end_positions': batch[4]
            }
            if args.model_type != 'distilbert':
                inputs[
                    'token_type_ids'] = None if args.model_type == 'xlm' else batch[
                        2]
            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
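            # batch[7] is the output cached during the first epoch; no new forward pass is run here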
            outputs = batch[7]
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)
            print(loss)
            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir, 'checkpoint-{}'.format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        epoch_time = timeit.default_timer() - start_time
        logger.info("Train Epoch %s time: %f secs", str(i), epoch_time)
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
Example #17
0
    def _train_epoch(self, scheduler, epoch: int, eval_step: int,
                     train_data_iterator: ShardedDataIterator, global_step: int):
        args = self.args
        rolling_train_loss = 0.0
        epoch_loss = 0
        log_result_step = args.log_batch_step
        rolling_loss_step = args.train_rolling_loss_step

        self.reader.train()
        epoch_batches = train_data_iterator.max_iterations

        for i, samples_batch in enumerate(train_data_iterator.iterate_data(epoch=epoch)):

            data_iteration = train_data_iterator.get_iteration()

            # enables resuming from exactly the same training state
            if args.fully_resumable:
                np.random.seed(args.seed + global_step)
                torch.manual_seed(args.seed + global_step)
                if args.n_gpu > 0:
                    torch.cuda.manual_seed_all(args.seed + global_step)

            input = create_reader_input(self.tensorizer.get_pad_id(),
                                        samples_batch,
                                        args.passages_per_question,
                                        args.sequence_length,
                                        args.max_n_answers,
                                        is_train=True, shuffle=True)

            loss = self._calc_loss(input)

            epoch_loss += loss.item()
            rolling_train_loss += loss.item()

            if args.fp16:
                from apex import amp
                with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                    scaled_loss.backward()
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(amp.master_params(self.optimizer), args.max_grad_norm)
            else:
                loss.backward()
                if args.max_grad_norm > 0:
                    torch.nn.utils.clip_grad_norm_(self.reader.parameters(), args.max_grad_norm)

            global_step += 1

            if (i + 1) % args.gradient_accumulation_steps == 0:
                self.optimizer.step()
                scheduler.step()
                self.reader.zero_grad()

            if global_step % log_result_step == 0:
                lr = self.optimizer.param_groups[0]['lr']
                logger.info(
                    'Epoch: %d: Step: %d/%d, global_step=%d, lr=%f', epoch, data_iteration, epoch_batches, global_step,
                    lr)

            if (i + 1) % rolling_loss_step == 0:
                logger.info('Train batch %d', data_iteration)
                latest_rolling_train_av_loss = rolling_train_loss / rolling_loss_step
                logger.info('Avg. loss per last %d batches: %f', rolling_loss_step, latest_rolling_train_av_loss)
                rolling_train_loss = 0.0

            if global_step % eval_step == 0:
                logger.info('Validation: Epoch: %d Step: %d/%d', epoch, data_iteration, epoch_batches)
                self.validate_and_save(epoch, train_data_iterator.get_iteration(), scheduler)
                self.reader.train()

        epoch_loss = (epoch_loss / epoch_batches) if epoch_batches > 0 else 0
        logger.info('Av Loss per epoch=%f', epoch_loss)
        return global_step
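The args.fully_resumable branch above re-seeds every step so that a restarted run reproduces exactly the same randomness (dropout, shuffling). A minimal sketch of that idea follows; the function name and signature are illustrative only, not from the snippet:
import numpy as np
import torch


def reseed_for_step(seed: int, global_step: int, n_gpu: int = 0) -> None:
    # Seeding with seed + global_step ties all RNG state to the step counter,
    # so resuming at the same global_step restores the same random sequence.
    np.random.seed(seed + global_step)
    torch.manual_seed(seed + global_step)
    if n_gpu > 0:
        torch.cuda.manual_seed_all(seed + global_step)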
Example #18
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(train_dataset) if args.local_rank == -1 else DistributedSampler(train_dataset)
    train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(train_dataloader) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {
            "params": [p for n, p in model.named_parameters() if not any(nd in n for nd in no_decay)],
            "weight_decay": args.weight_decay,
        },
        {"params": [p for n, p in model.named_parameters() if any(nd in n for nd in no_decay)], "weight_decay": 0.0},
    ]
    optimizer = AdamW(optimizer_grouped_parameters, lr=args.learning_rate, eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=args.warmup_steps, num_training_steps=t_total
    )
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer, opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[args.local_rank], output_device=args.local_rank, find_unused_parameters=True
        )

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size
        * args.gradient_accumulation_steps
        * (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch", disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                "input_ids": batch[0],
                "attention_mask": batch[1],
                # 'token_type_ids':  None if args.model_type == 'xlm' else batch[2],
                "token_type_ids": batch[2],
                "labels": batch[3],
            }
            # if args.model_type in ['xlnet', 'xlm']:
            #     inputs.update({'cls_index': batch[5],
            #                    'p_mask':       batch[6]})
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if (
                        args.local_rank == -1 and args.evaluate_during_training
                    ):  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar("eval_{}".format(key), value, global_step)
                    tb_writer.add_scalar("lr", scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar("loss", (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    output_dir = os.path.join(args.output_dir, "checkpoint-{}".format(global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_vocabulary(output_dir)
                    torch.save(args, os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
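All of the snippets above share the same apex AMP step: scale the loss before backward, then clip the fp32 master gradients before stepping. A minimal, hedged sketch of that pattern (placeholder names; assumes NVIDIA apex is installed and amp.initialize(model, optimizer, ...) has already been called):
import torch
from apex import amp


def fp16_training_step(model, optimizer, batch, max_grad_norm=1.0):
    model.train()
    outputs = model(**batch)
    loss = outputs[0]  # first element is the loss, as in the examples above
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()  # backward on the dynamically scaled loss
    # clip the fp32 master gradients, not the (possibly fp16) model gradients
    torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), max_grad_norm)
    optimizer.step()
    optimizer.zero_grad()
    return loss.item()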
Example #19
0
def train(args, train_dataset, model: PreTrainedModel,
          tokenizer: PreTrainedTokenizer) -> Tuple[int, float]:
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    def collate(examples):
        model.eval()
        examples_src, examples_tgt, examples_srctgt, langid_srctgt, psi_examples_srctgt, psi_labels = [], [], [], [], [], []
        src_len = tgt_len = 0
        bpe2word_map_src, bpe2word_map_tgt = [], []
        for example in examples:
            end_id = example[0][0][-1].view(-1)

            src_id = example[0][0][:args.block_size]
            src_id = torch.cat([src_id[:-1], end_id])
            tgt_id = example[1][0][:args.block_size]
            tgt_id = torch.cat([tgt_id[:-1], end_id])

            half_block_size = int(args.block_size / 2)
            half_src_id = example[0][0][:half_block_size]
            half_src_id = torch.cat([half_src_id[:-1], end_id])
            half_tgt_id = example[1][0][:half_block_size]
            half_tgt_id = torch.cat([half_tgt_id[:-1], end_id])

            examples_src.append(src_id)
            examples_tgt.append(tgt_id)
            src_len = max(src_len, len(src_id))
            tgt_len = max(tgt_len, len(tgt_id))

            if random.random() > 0.5:
                srctgt = torch.cat([half_src_id, half_tgt_id])
                langid = torch.cat([
                    torch.ones_like(half_src_id),
                    torch.ones_like(half_tgt_id) * 2
                ])
            else:
                srctgt = torch.cat([half_tgt_id, half_src_id])
                langid = torch.cat([
                    torch.ones_like(half_tgt_id),
                    torch.ones_like(half_src_id) * 2
                ])
            examples_srctgt.append(srctgt)
            langid_srctgt.append(langid)

            # [neg, neg] pair
            neg_half_src_id = example[-2][0][:half_block_size]
            neg_half_src_id = torch.cat([neg_half_src_id[:-1], end_id])
            neg_half_tgt_id = example[-1][0][:half_block_size]
            neg_half_tgt_id = torch.cat([neg_half_tgt_id[:-1], end_id])
            if random.random() > 0.5:
                neg_srctgt = torch.cat([neg_half_src_id, neg_half_tgt_id])
            else:
                neg_srctgt = torch.cat([neg_half_tgt_id, neg_half_src_id])
            psi_examples_srctgt.append(neg_srctgt)
            psi_labels.append(1)

            # [pos, neg] pair
            rd = random.random()
            if rd > 0.75:
                neg_srctgt = torch.cat([half_src_id, neg_half_tgt_id])
            elif rd > 0.5:
                neg_srctgt = torch.cat([neg_half_src_id, half_tgt_id])
            elif rd > 0.25:
                neg_srctgt = torch.cat([half_tgt_id, neg_half_src_id])
            else:
                neg_srctgt = torch.cat([neg_half_tgt_id, half_src_id])
            psi_examples_srctgt.append(neg_srctgt)
            psi_labels.append(0)

            bpe2word_map_src.append(example[2])
            bpe2word_map_tgt.append(example[3])

        examples_src = pad_sequence(examples_src,
                                    batch_first=True,
                                    padding_value=tokenizer.pad_token_id)
        examples_tgt = pad_sequence(examples_tgt,
                                    batch_first=True,
                                    padding_value=tokenizer.pad_token_id)
        examples_srctgt = pad_sequence(examples_srctgt,
                                       batch_first=True,
                                       padding_value=tokenizer.pad_token_id)
        langid_srctgt = pad_sequence(langid_srctgt,
                                     batch_first=True,
                                     padding_value=tokenizer.pad_token_id)
        psi_examples_srctgt = pad_sequence(
            psi_examples_srctgt,
            batch_first=True,
            padding_value=tokenizer.pad_token_id)
        psi_labels = torch.tensor(psi_labels)
        guides = model.get_aligned_word(
            examples_src,
            examples_tgt,
            bpe2word_map_src,
            bpe2word_map_tgt,
            args.device,
            src_len,
            tgt_len,
            align_layer=args.align_layer,
            extraction=args.extraction,
            softmax_threshold=args.softmax_threshold)
        return examples_src, examples_tgt, guides, examples_srctgt, langid_srctgt, psi_examples_srctgt, psi_labels

    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  collate_fn=collate)

    t_total = len(train_dataloader
                  ) // args.gradient_accumulation_steps * args.num_train_epochs
    if args.max_steps > 0 and args.max_steps < t_total:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ["bias", "LayerNorm.weight"]

    optimizer_grouped_parameters = [
        {
            "params": [
                p for n, p in model.named_parameters()
                if (not (any(nd in n for nd in no_decay)))
            ],
            "weight_decay":
            args.weight_decay,
        },
        {
            "params": [
                p for n, p in model.named_parameters()
                if ((any(nd in n for nd in no_decay)))
            ],
            "weight_decay":
            0.0
        },
    ]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1),
    )
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    # Check if continuing training from a checkpoint
    tr_loss, logging_loss = 0.0, 0.0

    model_to_resize = model.module if hasattr(
        model,
        "module") else model  # Take care of distributed/parallel training
    model_to_resize.resize_token_embeddings(len(tokenizer))

    model.zero_grad()
    set_seed(args)  # Added here for reproducibility

    def backward_loss(loss, tot_loss):
        if args.n_gpu > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        tot_loss += loss.item()
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()
        return tot_loss

    tqdm_iterator = trange(int(t_total),
                           desc="Iteration",
                           disable=args.local_rank not in [-1, 0])
    for _ in range(int(args.num_train_epochs)):
        for step, batch in enumerate(train_dataloader):

            model.train()

            if args.train_so:
                inputs_src, inputs_tgt = batch[0].clone(), batch[1].clone()
                inputs_src, inputs_tgt = inputs_src.to(
                    args.device), inputs_tgt.to(args.device)
                attention_mask_src, attention_mask_tgt = (inputs_src !=
                                                          0), (inputs_tgt != 0)
                guide = batch[2].to(args.device)
                loss = model(inputs_src=inputs_src,
                             inputs_tgt=inputs_tgt,
                             attention_mask_src=attention_mask_src,
                             attention_mask_tgt=attention_mask_tgt,
                             guide=guide,
                             align_layer=args.align_layer,
                             extraction=args.extraction,
                             softmax_threshold=args.softmax_threshold)
                tr_loss = backward_loss(loss, tr_loss)

            if args.train_mlm:
                inputs_src, labels_src = mask_tokens(batch[0], tokenizer, args)
                inputs_tgt, labels_tgt = mask_tokens(batch[1], tokenizer, args)
                inputs_src, inputs_tgt = inputs_src.to(
                    args.device), inputs_tgt.to(args.device)
                labels_src, labels_tgt = labels_src.to(
                    args.device), labels_tgt.to(args.device)
                loss = model(inputs_src=inputs_src, labels_src=labels_src)
                tr_loss = backward_loss(loss, tr_loss)

                loss = model(inputs_src=inputs_tgt, labels_src=labels_tgt)
                tr_loss = backward_loss(loss, tr_loss)

            if args.train_tlm:
                inputs_srctgt, labels_srctgt = mask_tokens(
                    batch[3], tokenizer, args, batch[4], 1)
                inputs_srctgt, labels_srctgt = inputs_srctgt.to(
                    args.device), labels_srctgt.to(args.device)
                loss = model(inputs_src=inputs_srctgt,
                             labels_src=labels_srctgt)
                tr_loss = backward_loss(loss, tr_loss)

                inputs_srctgt, labels_srctgt = mask_tokens(
                    batch[3], tokenizer, args, batch[4], 2)
                inputs_srctgt, labels_srctgt = inputs_srctgt.to(
                    args.device), labels_srctgt.to(args.device)
                loss = model(inputs_src=inputs_srctgt,
                             labels_src=labels_srctgt)
                tr_loss = backward_loss(loss, tr_loss)

            if args.train_psi:
                loss = model(inputs_src=batch[5].to(args.device),
                             labels_psi=batch[6].to(args.device),
                             align_layer=args.align_layer + 1)
                tr_loss = backward_loss(loss, tr_loss)

            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1
                tqdm_iterator.update()

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logger.info(
                        "  Step %s. Training loss = %s", str(global_step),
                        str((tr_loss - logging_loss) / args.logging_steps))
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = "checkpoint"
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        "{}-{}".format(checkpoint_prefix, global_step))
                    os.makedirs(output_dir, exist_ok=True)
                    model_to_save = (
                        model.module if hasattr(model, "module") else model
                    )  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)

                    torch.save(args,
                               os.path.join(output_dir, "training_args.bin"))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

                    torch.save(optimizer.state_dict(),
                               os.path.join(output_dir, "optimizer.pt"))
                    torch.save(scheduler.state_dict(),
                               os.path.join(output_dir, "scheduler.pt"))
                    logger.info("Saving optimizer and scheduler states to %s",
                                output_dir)

            if global_step > t_total:
                break
        if global_step > t_total:
            break

    return global_step, tr_loss / global_step
def train(
    model: TEDD1104,
    optimizer_name: str,
    optimizer: torch.optim,
    train_dir: str,
    dev_dir: str,
    test_dir: str,
    output_dir: str,
    batch_size: int,
    initial_epoch: int,
    num_epoch: int,
    max_acc: float,
    hide_map_prob: float,
    fp16: bool = True,
    amp_opt_level=None,
    save_checkpoints: bool = True,
    save_best: bool = True,
):
    """
    Train a model

    Input:
    - model: TEDD1104 model to train
    - optimizer_name: Name of the optimizer to use [SGD, Adam]
    - optimizer: Optimizer (torch.optim)
    - train_dir: Directory where the train files are stored
    - dev_dir: Directory where the development files are stored
    - test_dir: Directory where the test files are stored
    - output_dir: Directory where the model and the checkpoints are going to be saved
    - batch_size: Batch size (Around 10 for 8GB GPU)
    - initial_epoch: Number of previous epochs used to train the model (0 unless the model has been
      restored from checkpoint)
    - num_epoch: Number of epochs to train for
    - max_acc: Best accuracy achieved so far on the development set (0 unless the model has been
      restored from checkpoint)
    - hide_map_prob: Probability of removing the minimap (black square) from the image (0<=hide_map_prob<=1)
    - fp16: Use FP16 for training
    - amp_opt_level: Nvidia apex opt level to use when fp16 training is enabled
    - save_checkpoints: save a checkpoint each epoch (each checkpoint overwrites the previous one)
    - save_best: save the model that achieves the highest accuracy on the development set

    Output:
     - float: Accuracy on the development set of the best model
    """

    if fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )

    criterion: CrossEntropyLoss = torch.nn.CrossEntropyLoss()
    X_dev, y_dev = load_dataset(dev_dir, fp=16 if fp16 else 32)
    X_dev = torch.from_numpy(X_dev)

    X_test, y_test = load_dataset(test_dir, fp=16 if fp16 else 32)
    X_test = torch.from_numpy(X_test)

    acc_dev: float = 0.0

    printTrace("Training...")
    for epoch in range(num_epoch):

        for file_t in glob.glob(os.path.join(train_dir, "*.npz")):
            model.train()
            start_time: float = time.time()

            X, y = load_file(path=file_t,
                             fp=16 if fp16 else 32,
                             hide_map_prob=hide_map_prob)
            running_loss = 0.0
            num_batches = 0

            for X_batch, y_batch in nn_batchs(X, y, batch_size):
                X_batch, y_batch = (
                    torch.from_numpy(X_batch).to(device),
                    torch.from_numpy(y_batch).long().to(device),
                )
                optimizer.zero_grad()
                outputs = model.forward(X_batch)
                loss = criterion(outputs, y_batch)
                if fp16:
                    with amp.scale_loss(loss, optimizer) as scaled_loss:
                        scaled_loss.backward()
                else:
                    loss.backward()

                if fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), 1.0)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

                optimizer.step()
                running_loss += loss.item()
                num_batches += 1

            # Print Statistics
            acc_train = evaluate(
                model=model,
                X=torch.from_numpy(X),
                golds=y,
                device=device,
                batch_size=batch_size,
            )

            acc_dev = evaluate(
                model=model,
                X=X_dev,
                golds=y_dev,
                device=device,
                batch_size=batch_size,
            )

            acc_test = evaluate(
                model=model,
                X=X_test,
                golds=y_test,
                device=device,
                batch_size=batch_size,
            )

            printTrace(
                f"EPOCH: {initial_epoch+epoch}. Current File {file_t}. Training time: {time.time() - start_time} secs"
            )

            printTrace(
                f"Loss: {running_loss / num_batchs}. Acc training set: {acc_train}. "
                f"Acc dev set: {acc_dev}. Acc test set: {acc_test}")

            if acc_dev > max_acc and save_best:
                max_acc = acc_dev
                printTrace(
                    f"New max acc in dev set {max_acc}. Saving model...")
                save_model(
                    model=model,
                    save_dir=output_dir,
                    fp16=fp16,
                    amp_opt_level=amp_opt_level,
                )

        if save_checkpoints:
            printTrace("Saving checkpoint...")
            save_checkpoint(
                path=os.path.join(output_dir, "checkpoint.pt"),
                model=model,
                optimizer_name=optimizer_name,
                optimizer=optimizer,
                acc_dev=acc_dev,
                epoch=initial_epoch + epoch,
                fp16=fp16,
                opt_level=amp_opt_level,
            )

    return max_acc
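A hedged usage sketch for the train() above; load_model is a hypothetical helper, the paths are placeholders, and the module-level device used inside train() is assumed to be set already:
import torch

model = load_model()  # hypothetical helper, not defined in the snippet above
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)
best_dev_acc = train(
    model=model,
    optimizer_name="Adam",
    optimizer=optimizer,
    train_dir="data/train",       # placeholder directories
    dev_dir="data/dev",
    test_dir="data/test",
    output_dir="models/tedd1104",
    batch_size=10,                # around 10 fits an 8GB GPU per the docstring
    initial_epoch=0,
    num_epoch=5,
    max_acc=0.0,
    hide_map_prob=0.0,
    fp16=False,                   # fp16=True additionally requires amp.initialize beforehand
    amp_opt_level=None,
)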
Example #21
0
    def test_2models2losses2optimizers(self):
        model0 = MyModel(1)
        model1 = MyModel(2)

        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                      momentum=0.125)
        optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                      momentum=0.25)

        # Don't do it like this:  reference_grads = [[]]*5
        # because then it creates a list of 5 references to the same "[]" and appending
        # to any of them effectively makes you append to all of them, which multiplies
        # the resulting size of reference_grads by 5x and needless to say makes the test fail.
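        # Illustration (not part of the original test): [[]] * 5 creates five
        # references to the same list, while a comprehension creates five lists:
        #     aliased = [[]] * 5
        #     aliased[0].append(1)              # -> [[1], [1], [1], [1], [1]]
        #     separate = [[] for _ in range(5)]
        #     separate[0].append(1)             # -> [[1], [], [], [], []]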
        reference_grads = [[], [], [], [], []]
        final_params = [None, None, None, None, None]
        for i in range(2):
            optimizer0.zero_grad()
            optimizer1.zero_grad()
            loss0 = model0(self.x)
            loss1 = model1(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] +
                                   [param.grad.data.clone() for param in model1.parameters()])

            optimizer0.step()
            optimizer1.step()

        final_params[0] = [param.data.clone() for param in model0.parameters()] + \
                          [param.data.clone() for param in model1.parameters()]

        def what_got_skipped(which_iter, which_backward):
            if which_iter == 0 and which_backward == 0:
                return 1
            if which_iter == 0 and which_backward == 1:
                return 2
            if which_iter == 1 and which_backward == 0:
                return 3
            if which_iter == 1 and which_backward == 1:
                return 4
            return 0

        for which_iter in (0,1):
            for which_backward in (0,1):
                model0 = MyModel(1)
                model1 = MyModel(2)

                optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                              momentum=0.125)
                optimizer1 = torch.optim.SGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                              momentum=0.25)

                for i in range(3):
                    optimizer0.zero_grad()
                    optimizer1.zero_grad()
                    loss0 = model0(self.x)
                    loss1 = model1(self.x)
                    loss0.backward()
                    loss1.backward()

                    if i != which_iter:
                        reference_grads[what_got_skipped(which_iter, which_backward)].append(
                            [param.grad.data.clone() for param in model0.parameters()] +
                            [param.grad.data.clone() for param in model1.parameters()])

                    if i == which_iter:
                        if which_backward == 0:
                            optimizer1.step()
                        else:
                            optimizer0.step()
                    else:
                        optimizer0.step()
                        optimizer1.step()

                final_params[what_got_skipped(which_iter, which_backward)] = \
                    [param.data.clone() for param in model0.parameters()] + \
                    [param.data.clone() for param in model1.parameters()]

        for materialize_master_grads in (False, True):
          for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
              for use_multiple_loss_scalers in (False, True):
                if opt_level == "O1" or opt_level == "O2":
                    inject_inf_iters = (-1, 0, 1)
                else:
                    inject_inf_iters = (-1,)

                for inject_inf in inject_inf_iters:
                  if inject_inf >= 0:
                     inject_inf_locs = ("fp16", "fp32")
                     which_backwards = (0, 1)
                  else:
                     inject_inf_locs = ("fdsa",)
                     which_backwards = (None,)

                  for inject_inf_loc in inject_inf_locs:
                    for which_backward in which_backwards:
                        if use_multiple_loss_scalers:
                            num_losses = 2
                            loss_ids = [0, 1]
                        else:
                            num_losses = 1
                            loss_ids = [0, 0]

                        if inject_inf >= 0:
                            iters = 3
                        else:
                            iters = 2

                        model0 = MyModel(1)
                        model1 = MyModel(2)

                        models = [model0, model1]

                        optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25}],
                                              momentum=0.125, materialize_master_grads=materialize_master_grads)
                        optimizer1 = FusedSGD([{'params' : model1.parameters(), 'lr' : 0.5}],
                                              momentum=0.25, materialize_master_grads=materialize_master_grads)

                        _amp_state.allow_incoming_model_not_fp32 = True
                        [model0, model1], [optimizer0, optimizer1] = amp.initialize(
                            [model0, model1],
                            [optimizer0, optimizer1],
                            opt_level=opt_level,
                            verbosity=0,
                            cast_model_type=False,
                            num_losses=num_losses)
                        _amp_state.allow_incoming_model_not_fp32 = False

                        _amp_state.loss_scalers[0]._loss_scale = 4.0
                        if use_multiple_loss_scalers:
                            _amp_state.loss_scalers[1]._loss_scale = 16.0

                        unskipped = 0
                        for i in range(iters):
                            if how_to_zero == "none":
                                for model in models:
                                    for param in model.parameters():
                                        param.grad = None
                            elif how_to_zero == "model":
                                for model in models:
                                    model.zero_grad()
                            else:
                                optimizer0.zero_grad()
                                optimizer1.zero_grad()

                            loss0 = model0(self.x)
                            loss1 = model1(self.x)

                            with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss:
                                scaled_loss.backward()
                                if i == inject_inf and which_backward == 0:
                                    if inject_inf_loc == "fp32":
                                        model0.weight0.grad[0] = float('inf')
                                    elif inject_inf_loc == "fp16":
                                        model0.weight1.grad[0] = float('inf')
                            with amp.scale_loss(loss1, optimizer1, loss_id=loss_ids[1]) as scaled_loss:
                                scaled_loss.backward()
                                if i == inject_inf and which_backward == 1:
                                    if inject_inf_loc == "fp32":
                                        model1.weight0.grad[0] = float('inf')
                                    elif inject_inf_loc == "fp16":
                                        model1.weight1.grad[0] = float('inf')

                            # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers))

                            if i != inject_inf:
                                master_params = list(amp.master_params(optimizer0)) + \
                                                list(amp.master_params(optimizer1))
                                for param, reference_grad in zip(master_params,
                                        reference_grads[what_got_skipped(inject_inf, which_backward)][unskipped]):
                                    if opt_level == "O2" and not materialize_master_grads:
                                        continue
                                    else:
                                        self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
                                unskipped += 1

                            optimizer0.step()
                            optimizer1.step()

                        model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()]
                        master_params = [p for p in amp.master_params(optimizer0)] + \
                                        [p for p in amp.master_params(optimizer1)]
                        for model, master, reference in zip(
                                model_params,
                                master_params,
                                final_params[what_got_skipped(inject_inf, which_backward)]):
                            self.assertTrue(torch.allclose(model, reference))
                            self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                        if opt_level == "O1":
                            _amp_state.handle._deactivate()
Example #22
0
    def train_step(
        self,
        batch,
        idx,
        scheduler,
    ):
        """
        Training for a single batch.
        
        --------------------
        Returns:
            loss - average of the start- and end-position cross entropy losses
            (see https://huggingface.co/transformers/_modules/transformers/modeling_bert.html#BertForQuestionAnswering)
        """
        if self.fp16:
            from apex import amp

        # unpack batch data
        batch = tuple(t.to(self.device) for t in batch)

        inputs = {
            "input_ids": batch[0],
            "attention_mask": batch[1],
            "token_type_ids": batch[2],
            "start_positions": batch[3],
            "end_positions": batch[4]
        }

        # zero gradients
        self.model.zero_grad()

        # send data through model forward
        out = self.model(**inputs)
        # model outputs are always tuple in transformers (see doc)
        loss = out[0]

        # for multi-gpu
        if isinstance(self.model, nn.DataParallel):
            loss = loss.mean()  # average on multi-gpu parallel training

        # calculate gradients through back prop
        if self.fp16:
            with amp.scale_loss(loss, self.optimizer) as scaled_loss:
                scaled_loss.backward()
        else:
            loss.backward()

        # clip gradients
        if self.fp16:
            nn.utils.clip_grad_norm_(amp.master_params(self.optimizer),
                                     self.max_grad_norm)
        else:
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.max_grad_norm)

        # take a step in gradient descent
        self.optimizer.step()
        scheduler.step()

        # check which weights are changing
        if self.debug and idx > self.warmup_steps:
            self.check_changing(idx)

        # zero gradients
        self.model.zero_grad()

        return loss.detach()
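A hedged usage sketch for the train_step above; trainer stands in for the surrounding class instance, and train_loader/scheduler are assumed to exist, none of these names come from the snippet:
for idx, batch in enumerate(train_loader):
    loss = trainer.train_step(batch, idx, scheduler)
    if idx % 100 == 0:
        print(f"step {idx}: loss {loss.item():.4f}")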
Example #23
0
    def test_2models2losses1optimizer(self):
        model0 = MyModel(1)
        model1 = MyModel(2)

        optimizer = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                     {'params' : model1.parameters(), 'lr' : 0.5}],
                                    momentum=0.125)

        reference_grads = []
        for i in range(2):
            optimizer.zero_grad()
            loss0 = model0(self.x)
            loss1 = model1(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads.append([param.grad.data.clone() for param in model0.parameters()] +
                                   [param.grad.data.clone() for param in model1.parameters()])

            optimizer.step()

        final_params = [param.data.clone() for param in model0.parameters()] + \
                       [param.data.clone() for param in model1.parameters()]

        for materialize_master_grads in (False, True):
          for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
              for use_multiple_loss_scalers in (False, True):
                if opt_level == "O1" or opt_level == "O2":
                    inject_inf_iters = (-1, 0, 1)
                else:
                    inject_inf_iters = (-1,)

                for inject_inf in inject_inf_iters:
                  if inject_inf >= 0:
                     inject_inf_locs = ("fp16", "fp32")
                     which_backwards = (0, 1)
                  else:
                     inject_inf_locs = ("fdsa",)
                     which_backwards = (None,)

                  for inject_inf_loc in inject_inf_locs:
                    for which_backward in which_backwards:
                        if use_multiple_loss_scalers:
                            num_losses = 2
                            loss_ids = [0, 1]
                        else:
                            num_losses = 1
                            loss_ids = [0, 0]

                        if inject_inf >= 0:
                            iters = 3
                        else:
                            iters = 2

                        model0 = MyModel(1)
                        model1 = MyModel(2)

                        models = [model0, model1]

                        optimizer = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                              {'params' : model1.parameters(), 'lr' : 0.5}],
                                             momentum=0.125,
                                             materialize_master_grads=materialize_master_grads)

                        _amp_state.allow_incoming_model_not_fp32 = True
                        [model0, model1], optimizer = amp.initialize(
                            [model0, model1],
                            optimizer,
                            opt_level=opt_level,
                            verbosity=0,
                            cast_model_type=False,
                            num_losses=num_losses)
                        _amp_state.allow_incoming_model_not_fp32 = False

                        _amp_state.loss_scalers[0]._loss_scale = 4.0
                        if use_multiple_loss_scalers:
                            _amp_state.loss_scalers[1]._loss_scale = 16.0

                        unskipped = 0
                        for i in range(iters):
                            if how_to_zero == "none":
                                for model in models:
                                    for param in model.parameters():
                                        param.grad = None
                            elif how_to_zero == "model":
                                for model in models:
                                    model.zero_grad()
                            else:
                                optimizer.zero_grad()

                            loss0 = model0(self.x)
                            loss1 = model1(self.x)

                            with amp.scale_loss(loss0, optimizer, loss_id=loss_ids[0]) as scaled_loss:
                                scaled_loss.backward()
                                if i == inject_inf and which_backward == 0:
                                    if inject_inf_loc == "fp32":
                                        model0.weight0.grad[0] = float('inf')
                                    elif inject_inf_loc == "fp16":
                                        model0.weight1.grad[0] = float('inf')
                            with amp.scale_loss(loss1, optimizer, loss_id=loss_ids[1]) as scaled_loss:
                                scaled_loss.backward()
                                if i == inject_inf and which_backward == 1:
                                    if inject_inf_loc == "fp32":
                                        model1.weight0.grad[0] = float('inf')
                                    elif inject_inf_loc == "fp16":
                                        model1.weight1.grad[0] = float('inf')

                            if i != inject_inf:
                                master_params = amp.master_params(optimizer)
                                for param, reference_grad in zip(master_params, reference_grads[unskipped]):
                                    if opt_level == "O2" and not materialize_master_grads:
                                        continue
                                    else:
                                        self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()),
                                                        "opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers))
                                unskipped += 1
                            optimizer.step()

                        model_params = [p for p in model0.parameters()] + [p for p in model1.parameters()]
                        for model, master, reference in zip(
                                model_params,
                                amp.master_params(optimizer),
                                final_params):
                            self.assertTrue(torch.allclose(model, reference))
                            self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                        if opt_level == "O1":
                            _amp_state.handle._deactivate()
Example #24
0
def train_single_iteration(args, model, train_examples: Examples,
                           valid_examples: Examples, optimizer, scheduler,
                           tb_writer, step_bar, skip_n_steps):
    tr_loss, tr_ac = 0, 0
    batch_size = args.per_gpu_train_batch_size
    cache_file = "cached_single_random_neg_sample_epoch_{}.dat".format(
        args.epochs_trained)
    # save the examples for epoch
    if args.neg_sampling == "random":
        if args.overwrite or not os.path.isfile(cache_file):
            train_dataloader = train_examples.random_neg_sampling_dataloader(
                batch_size=batch_size)
            torch.save(train_dataloader, cache_file)
        else:
            train_dataloader = torch.load(cache_file)
    elif args.neg_sampling == "online":
        # we provide only positive cases and create negatives during batch processing
        train_dataloader = train_examples.online_neg_sampling_dataloader(
            batch_size=int(batch_size / 2))
    else:
        raise Exception("{} neg_sampling is not recoginized...".format(
            args.neg_sampling))

    for step, batch in enumerate(train_dataloader):
        if skip_n_steps > 0:
            skip_n_steps -= 1
            continue
        if args.neg_sampling == "online":
            batch = train_examples.make_online_neg_sampling_batch(
                batch, model, args.hard_ratio)

        model.train()
        labels = batch[2].to(model.device)
        inputs = format_batch_input_for_single_bert(batch, train_examples,
                                                    model)
        inputs['relation_label'] = labels
        outputs = model(**inputs)
        loss = outputs['loss']
        logit = outputs['logits']
        y_pred = logit.data.max(1)[1]
        tr_ac += y_pred.eq(labels).long().sum().item()

        if args.n_gpu > 1:
            loss = loss.mean(
            )  # mean() to average on multi-gpu parallel (not distributed) training
        if args.gradient_accumulation_steps > 1:
            loss = loss / args.gradient_accumulation_steps

        if args.fp16:
            try:
                from apex import amp
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            except ImportError:
                raise ImportError(
                    "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
                )
        else:
            loss.backward()
        tr_loss += loss.item()

        if (step + 1) % args.gradient_accumulation_steps == 0:
            if args.fp16:
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()
            args.global_step += 1
            step_bar.update()

            if args.local_rank in [
                    -1, 0
            ] and args.logging_steps > 0 and args.global_step % args.logging_steps == 0:
                tb_data = {
                    'lr':
                    scheduler.get_last_lr()[0],
                    'acc':
                    tr_ac / args.logging_steps /
                    (args.train_batch_size * args.gradient_accumulation_steps),
                    'loss':
                    tr_loss / args.logging_steps
                }
                write_tensor_board(tb_writer, tb_data, args.global_step)
                tr_loss = 0.0
                tr_ac = 0.0

            # Save model checkpoint
            if args.local_rank in [
                    -1, 0
            ] and args.save_steps > 0 and args.global_step % args.save_steps == 1:
                # this step triggers checkpoint writing
                ckpt_output_dir = os.path.join(
                    args.output_dir, "checkpoint-{}".format(args.global_step))
                save_check_point(model, ckpt_output_dir, args, optimizer,
                                 scheduler)

            if args.valid_step > 0 and args.global_step % args.valid_step == 1:
                # this step triggers validation
                # valid_examples.update_embd(model)
                valid_accuracy, valid_loss = evaluate_classification(
                    valid_examples, model, args.per_gpu_eval_batch_size,
                    "evaluation/runtime_eval")
                pk, best_f1, map = evalute_retrivial_for_single_bert(
                    model, valid_examples, args.per_gpu_eval_batch_size,
                    "evaluation/runtime_eval")
                tb_data = {
                    "valid_accuracy": valid_accuracy,
                    "valid_loss": valid_loss,
                    "precision@3": pk,
                    "best_f1": best_f1,
                    "MAP": map
                }
                write_tensor_board(tb_writer, tb_data, args.global_step)
            args.steps_trained_in_current_epoch += 1
            if args.max_steps > 0 and args.global_step > args.max_steps:
                break
Example #25
0
    def test_3models2losses2optimizers(self):
        model0 = MyModel(1)
        model1 = MyModel(2)
        model2 = MyModel(3)

        optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                      {'params' : model1.parameters(), 'lr' : 1.0}],
                                     momentum=0.5)
        optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                     momentum=0.25)

        # Again, can't do this:  reference_grads = [[]]*9
        reference_grads = [[], [], [], [], [], [], [], [], []]
        final_params = [None, None, None, None, None, None, None, None, None]
        for i in range(2):
            optimizer0.zero_grad()
            optimizer1.zero_grad()
            loss0 = model0(self.x) + model1(self.x)
            loss1 = model2(self.x) + model1(self.x)
            loss0.backward()
            loss1.backward()

            reference_grads[0].append([param.grad.data.clone() for param in model0.parameters()] +
                                   [param.grad.data.clone() for param in model1.parameters()])

            optimizer0.step()
            optimizer1.step()

        final_params[0] = \
            [param.data.clone() for param in model0.parameters()] + \
            [param.data.clone() for param in model1.parameters()] + \
            [param.data.clone() for param in model2.parameters()]

        def what_got_skipped(which_iter, which_backward, which_model):
            if which_iter == 0:
                if which_backward == 0:
                    if which_model == 0:
                        return 1
                    if which_model == 1:
                        return 2
                if which_backward == 1:
                    if which_model == 2:
                        return 3
                    if which_model == 1:
                        return 4
            if which_iter == 1:
                if which_backward == 0:
                    if which_model == 0:
                        return 5
                    if which_model == 1:
                        return 6
                if which_backward == 1:
                    if which_model == 2:
                        return 7
                    if which_model == 1:
                        return 8
            return 0

        for which_iter in (0,1):
            for which_backward in (0,1):
                if which_backward == 0:
                    which_models = (0,1)
                if which_backward == 1:
                    which_models = (2,1)
                for which_model in which_models:

                    model0 = MyModel(1)
                    model1 = MyModel(2)
                    model2 = MyModel(3)

                    optimizer0 = torch.optim.SGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                                  {'params' : model1.parameters(), 'lr' : 1.0}],
                                                 momentum=0.5)
                    optimizer1 = torch.optim.SGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                                 momentum=0.25)

                    for i in range(3):
                        optimizer0.zero_grad()
                        optimizer1.zero_grad()
                        loss0 = model0(self.x) + model1(self.x)
                        loss1 = model2(self.x) + model1(self.x)
                        loss0.backward()
                        loss1.backward()

                        if i != which_iter:
                            reference_grads[what_got_skipped(which_iter,
                                    which_backward, which_model)].append(
                                [param.grad.data.clone() for param in model0.parameters()] +
                                [param.grad.data.clone() for param in model1.parameters()])

                        if i == which_iter:
                            if which_backward == 0:
                                # if which_model == 0:
                                    optimizer1.step()
                                # if which_model == 1:
                                #     optimizer1.step()
                            if which_backward == 1:
                                # if which_model == 2:
                                #     optimizer0.step()
                                # if which_model == 1:
                                    continue
                        else:
                            optimizer0.step()
                            optimizer1.step()

                    final_params[what_got_skipped(which_iter, which_backward, which_model)] = \
                        [param.data.clone() for param in model0.parameters()] + \
                        [param.data.clone() for param in model1.parameters()] + \
                        [param.data.clone() for param in model2.parameters()]

        for materialize_master_grads in (False, True):
          for opt_level in ("O0", "O1", "O2", "O3"):
            for how_to_zero in ("none", "model", "optimizer"):
              for use_multiple_loss_scalers in (False, True):
                if opt_level == "O1" or opt_level == "O2":
                    inject_inf_iters = (-1, 0, 1)
                else:
                    inject_inf_iters = (-1,)

                for inject_inf in inject_inf_iters:
                  if inject_inf >= 0:
                     inject_inf_locs = ("fp16", "fp32")
                     which_backwards = (0, 1)
                  else:
                     inject_inf_locs = ("fdsa",)
                     which_backwards = (None,)

                  for inject_inf_loc in inject_inf_locs:
                    for which_backward in which_backwards:
                      if use_multiple_loss_scalers:
                          num_losses = 2
                          loss_ids = [0, 1]
                      else:
                          num_losses = 1
                          loss_ids = [0, 0]

                      if inject_inf >= 0:
                          iters = 3
                          if which_backward == 0:
                              which_models = (0, 1)
                          elif which_backward == 1:
                              which_models = (2, 1)
                      else:
                          iters = 2
                          which_models = (None,)

                      for which_model in which_models:
                          model0 = MyModel(1)
                          model1 = MyModel(2)
                          model2 = MyModel(3)

                          models = [model0, model1, model2]

                          optimizer0 = FusedSGD([{'params' : model0.parameters(), 'lr' : 0.25},
                                            {'params' : model1.parameters(), 'lr' : 1.0}],
                                            momentum=0.5, materialize_master_grads=materialize_master_grads)
                          optimizer1 = FusedSGD([{'params' : model2.parameters(), 'lr' : 0.5}],
                                                momentum=0.25, materialize_master_grads=materialize_master_grads)

                          _amp_state.allow_incoming_model_not_fp32 = True
                          [model0, model1, model2], [optimizer0, optimizer1] = amp.initialize(
                              [model0, model1, model2],
                              [optimizer0, optimizer1],
                              opt_level=opt_level,
                              verbosity=0,
                              cast_model_type=False,
                              num_losses=num_losses)
                          _amp_state.allow_incoming_model_not_fp32 = False

                          _amp_state.loss_scalers[0]._loss_scale = 4.0
                          if use_multiple_loss_scalers:
                              _amp_state.loss_scalers[1]._loss_scale = 16.0

                          unskipped = 0
                          for i in range(iters):
                              if how_to_zero == "none":
                                  for model in models:
                                      for param in model.parameters():
                                          param.grad = None
                              elif how_to_zero == "model":
                                  for model in models:
                                      model.zero_grad()
                              else:
                                  optimizer0.zero_grad()
                                  optimizer1.zero_grad()

                              loss0 = model0(self.x) + model1(self.x)
                              loss1 = model2(self.x) + model1(self.x)

                              with amp.scale_loss(loss0, optimizer0, loss_id=loss_ids[0]) as scaled_loss:
                                  scaled_loss.backward()
                                  if i == inject_inf and which_backward == 0:
                                      if which_model == 0:
                                          inj_model = model0
                                      elif which_model == 1:
                                          inj_model = model1
                                      else:
                                          raise RuntimeError("{} invalid for loss 0".format(which_model))
                                      if inject_inf_loc == "fp32":
                                          inj_model.weight0.grad[0] = float('inf')
                                      elif inject_inf_loc == "fp16":
                                          inj_model.weight1.grad[0] = float('inf')
                              with amp.scale_loss(loss1, [optimizer0, optimizer1], loss_id=loss_ids[1]) as scaled_loss:
                                  scaled_loss.backward()
                                  if i == inject_inf and which_backward == 1:
                                      if which_model == 2:
                                          inj_model = model2
                                      elif which_model == 1:
                                          inj_model = model1
                                      else:
                                          raise RuntimeError("{} invalid for loss 1".format(which_model))
                                      if inject_inf_loc == "fp32":
                                          inj_model.weight0.grad[0] = float('inf')
                                      elif inject_inf_loc == "fp16":
                                          inj_model.weight1.grad[0] = float('inf')

                              if i != inject_inf:
                                  master_params = list(amp.master_params(optimizer0)) + \
                                                  list(amp.master_params(optimizer1))
                                  for param, reference_grad in zip(master_params,
                                        reference_grads[what_got_skipped(inject_inf,
                                            which_backward, which_model)][unskipped]):
                                      if opt_level == "O2" and not materialize_master_grads:
                                          continue
                                      else:
                                          self.assertTrue(torch.allclose(param.grad.float(), reference_grad.float()))
                                  unskipped += 1

                              optimizer0.step()
                              optimizer1.step()

                          model_params = [p for p in model0.parameters()] + \
                                         [p for p in model1.parameters()] + \
                                         [p for p in model2.parameters()]
                          master_params = [p for p in amp.master_params(optimizer0)] + \
                                          [p for p in amp.master_params(optimizer1)]

                          # print("opt_level {} i {} inject_inf {} which_backward {} inject_inf_loc {} use_multiple_loss_scalers {} which_model {}".format(opt_level, i, inject_inf, which_backward, inject_inf_loc, use_multiple_loss_scalers, which_model))

                          for model, master, reference in zip(
                                  model_params,
                                  master_params,
                                  final_params[what_got_skipped(inject_inf, which_backward, which_model)]):
                              self.assertTrue(torch.allclose(model, reference))
                              self.assertTrue(torch.allclose(model, master.to(model.dtype)))

                          if opt_level == "O1":
                              _amp_state.handle._deactivate()
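The test above drives apex amp's multi-model, multi-optimizer, multi-loss API. A minimal sketch of that pattern, assuming apex and a CUDA device are available; the two Linear modules and the SGD settings below are illustrative, not taken from the test:

import torch
import torch.nn as nn
from apex import amp

model_a = nn.Linear(4, 4).cuda()
model_b = nn.Linear(4, 4).cuda()
opt_a = torch.optim.SGD(model_a.parameters(), lr=0.1)
opt_b = torch.optim.SGD(model_b.parameters(), lr=0.1)

# Passing lists returns lists; num_losses requests one loss scaler per loss,
# selected later via loss_id.
[model_a, model_b], [opt_a, opt_b] = amp.initialize(
    [model_a, model_b], [opt_a, opt_b],
    opt_level="O1", num_losses=2, verbosity=0)

x = torch.randn(8, 4, device="cuda")
loss0 = model_a(x).sum()
loss1 = (model_a(x) + model_b(x)).sum()

with amp.scale_loss(loss0, opt_a, loss_id=0) as scaled_loss:
    scaled_loss.backward()
# A loss that touches parameters owned by both optimizers gets the list of optimizers.
with amp.scale_loss(loss1, [opt_a, opt_b], loss_id=1) as scaled_loss:
    scaled_loss.backward()

opt_a.step()
opt_b.step()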
Example #26
0
def train(output_directory, log_directory, checkpoint_path, warm_start, n_gpus,
          rank, group_name, hparams):
    """Training and validation logging results to tensorboard and stdout
    Params
    ------
    output_directory (string): directory to save checkpoints
    log_directory (string) directory to save tensorboard logs
    checkpoint_path(string): checkpoint path
    n_gpus (int): number of gpus
    rank (int): rank of current gpu
    hparams (object): comma separated list of "name=value" pairs.
    """
    if hparams.distributed_run:
        init_distributed(hparams, n_gpus, rank, group_name)

    torch.manual_seed(hparams.seed)
    torch.cuda.manual_seed(hparams.seed)

    model = load_model(hparams)
    learning_rate = hparams.learning_rate
    optimizer = torch.optim.Adam(model.parameters(),
                                 lr=learning_rate,
                                 weight_decay=hparams.weight_decay)

    if hparams.fp16_run:
        from apex import amp
        model, optimizer = amp.initialize(model, optimizer, opt_level='O2')

    if hparams.distributed_run:
        model = apply_gradient_allreduce(model)

    criterion = Tacotron2Loss()

    logger = prepare_directories_and_logger(output_directory, log_directory,
                                            rank)

    train_loader, valset, collate_fn = prepare_dataloaders(hparams)

    # Load checkpoint if one exists
    iteration = 0
    epoch_offset = 0
    if checkpoint_path is not None:
        if warm_start:
            model = warm_start_model(checkpoint_path, model,
                                     hparams.ignore_layers)
        else:
            model, optimizer, _learning_rate, iteration = load_checkpoint(
                checkpoint_path, model, optimizer)
            if hparams.use_saved_learning_rate:
                learning_rate = _learning_rate
            iteration += 1  # next iteration is iteration + 1
            epoch_offset = max(0, int(iteration / len(train_loader)))

    model.train()
    is_overflow = False
    # ================ MAIN TRAINING LOOP! ===================
    for epoch in range(epoch_offset, hparams.epochs):
        print("Epoch: {}".format(epoch))
        for i, batch in enumerate(train_loader):
            start = time.perf_counter()
            for param_group in optimizer.param_groups:
                param_group['lr'] = learning_rate

            model.zero_grad()
            x, y = model.parse_batch(batch)
            y_pred = model(x)

            loss = criterion(y_pred, y)
            if hparams.distributed_run:
                reduced_loss = reduce_tensor(loss.data, n_gpus).item()
            else:
                reduced_loss = loss.item()
            if hparams.fp16_run:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            if hparams.fp16_run:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    amp.master_params(optimizer), hparams.grad_clip_thresh)
                is_overflow = math.isnan(grad_norm)
            else:
                grad_norm = torch.nn.utils.clip_grad_norm_(
                    model.parameters(), hparams.grad_clip_thresh)

            optimizer.step()

            if not is_overflow and rank == 0:
                duration = time.perf_counter() - start
                print(
                    "Train loss {} {:.6f} Grad Norm {:.6f} {:.2f}s/it".format(
                        iteration, reduced_loss, grad_norm, duration))
                logger.log_training(reduced_loss, grad_norm, learning_rate,
                                    duration, iteration)

            if not is_overflow and (iteration % hparams.iters_per_checkpoint
                                    == 0):
                validate(model, criterion, valset, iteration,
                         hparams.batch_size, n_gpus, collate_fn, logger,
                         hparams.distributed_run, rank)
                if rank == 0:
                    checkpoint_path = os.path.join(
                        output_directory, "checkpoint_{}".format(iteration))
                    save_checkpoint(model, optimizer, learning_rate, iteration,
                                    checkpoint_path)

            iteration += 1
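The fp16 branch above follows the usual amp recipe: scale the loss for the backward pass, clip on amp's fp32 master params, and treat a non-finite gradient norm only as an overflow flag for logging and checkpointing (amp's patched optimizer.step() skips the underlying update for that iteration on its own). A minimal sketch of that step, assuming amp.initialize has already wrapped model and optimizer; the grad_clip_thresh default is illustrative:

import math

import torch
from apex import amp


def fp16_train_step(model, optimizer, loss, grad_clip_thresh=1.0):
    model.zero_grad()
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
    # Clip the fp32 master gradients, not the fp16 model gradients.
    grad_norm = torch.nn.utils.clip_grad_norm_(
        amp.master_params(optimizer), grad_clip_thresh)
    is_overflow = math.isnan(grad_norm) or math.isinf(grad_norm)
    optimizer.step()  # amp skips the actual update if its scaler saw an overflow
    return grad_norm, is_overflow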
Example #27
0
def train(cfg):
    # Set seeds for determinism
    torch.manual_seed(cfg.training.seed)
    torch.cuda.manual_seed_all(cfg.training.seed)
    np.random.seed(cfg.training.seed)
    random.seed(cfg.training.seed)

    main_proc = True
    device = torch.device("cpu" if cfg.training.no_cuda else "cuda")

    is_distributed = os.environ.get(
        "LOCAL_RANK")  # If local rank exists, distributed env

    if is_distributed:
        # When using NCCL, surviving nodes can deadlock on NCCL ops after a failure
        # because NCCL uses a spin-lock on the device. Set this env var to enable a
        # watchdog thread that will destroy stale NCCL communicators.
        os.environ["NCCL_BLOCKING_WAIT"] = "1"

        device_id = int(os.environ["LOCAL_RANK"])
        torch.cuda.set_device(device_id)
        print(f"Setting CUDA Device to {device_id}")

        dist.init_process_group(backend=cfg.training.dist_backend.value)
        main_proc = device_id == 0  # Main process handles saving of models and reporting

    if OmegaConf.get_type(cfg.checkpointing) == FileCheckpointConfig:
        checkpoint_handler = FileCheckpointHandler(cfg=cfg.checkpointing)
    elif OmegaConf.get_type(cfg.checkpointing) == GCSCheckpointConfig:
        checkpoint_handler = GCSCheckpointHandler(cfg=cfg.checkpointing)
    else:
        raise ValueError("Checkpoint Config has not been specified correctly.")

    if main_proc and cfg.visualization.visdom:
        visdom_logger = VisdomLogger(id=cfg.visualization.id,
                                     num_epochs=cfg.training.epochs)
    if main_proc and cfg.visualization.tensorboard:
        tensorboard_logger = TensorBoardLogger(
            id=cfg.visualization.id,
            log_dir=to_absolute_path(cfg.visualization.log_dir),
            log_params=cfg.visualization.log_params)

    if cfg.checkpointing.load_auto_checkpoint:
        latest_checkpoint = checkpoint_handler.find_latest_checkpoint()
        if latest_checkpoint:
            cfg.checkpointing.continue_from = latest_checkpoint

    if cfg.checkpointing.continue_from:  # Starting from previous model
        state = TrainingState.load_state(
            state_path=to_absolute_path(cfg.checkpointing.continue_from))
        model = state.model
        if cfg.training.finetune:
            state.init_finetune_states(cfg.training.epochs)

        if main_proc and cfg.visualization.visdom:  # Add previous scores to visdom graph
            visdom_logger.load_previous_values(state.epoch, state.results)
        if main_proc and cfg.visualization.tensorboard:  # Previous scores to tensorboard logs
            tensorboard_logger.load_previous_values(state.epoch, state.results)
    else:
        # Initialise new model training
        with open(to_absolute_path(cfg.data.labels_path)) as label_file:
            labels = json.load(label_file)

        if OmegaConf.get_type(cfg.model) is BiDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=True)
        elif OmegaConf.get_type(cfg.model) is UniDirectionalConfig:
            model = DeepSpeech(
                rnn_hidden_size=cfg.model.hidden_size,
                nb_layers=cfg.model.hidden_layers,
                labels=labels,
                rnn_type=supported_rnns[cfg.model.rnn_type.value],
                audio_conf=cfg.data.spect,
                bidirectional=False,
                context=cfg.model.lookahead_context)
        else:
            raise ValueError("Model Config has not been specified correctly.")

        state = TrainingState(model=model)
        state.init_results_tracking(epochs=cfg.training.epochs)

    # Data setup
    evaluation_decoder = GreedyDecoder(
        model.labels)  # Decoder used for validation
    train_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                       manifest_filepath=to_absolute_path(
                                           cfg.data.train_manifest),
                                       labels=model.labels,
                                       normalize=True,
                                       augmentation_conf=cfg.data.augmentation)
    test_dataset = SpectrogramDataset(audio_conf=model.audio_conf,
                                      manifest_filepath=to_absolute_path(
                                          cfg.data.val_manifest),
                                      labels=model.labels,
                                      normalize=True)
    if not is_distributed:
        train_sampler = DSRandomSampler(dataset=train_dataset,
                                        batch_size=cfg.data.batch_size,
                                        start_index=state.training_step)
    else:
        train_sampler = DSElasticDistributedSampler(
            dataset=train_dataset,
            batch_size=cfg.data.batch_size,
            start_index=state.training_step)
    train_loader = AudioDataLoader(dataset=train_dataset,
                                   num_workers=cfg.data.num_workers,
                                   batch_sampler=train_sampler)
    test_loader = AudioDataLoader(dataset=test_dataset,
                                  num_workers=cfg.data.num_workers,
                                  batch_size=cfg.data.batch_size)

    model = model.to(device)
    parameters = model.parameters()
    if OmegaConf.get_type(cfg.optim) is SGDConfig:
        optimizer = torch.optim.SGD(parameters,
                                    lr=cfg.optim.learning_rate,
                                    momentum=cfg.optim.momentum,
                                    nesterov=True,
                                    weight_decay=cfg.optim.weight_decay)
    elif OmegaConf.get_type(cfg.optim) is AdamConfig:
        optimizer = torch.optim.AdamW(parameters,
                                      lr=cfg.optim.learning_rate,
                                      betas=cfg.optim.betas,
                                      eps=cfg.optim.eps,
                                      weight_decay=cfg.optim.weight_decay)
    else:
        raise ValueError("Optimizer has not been specified correctly.")

    model, optimizer = amp.initialize(model,
                                      optimizer,
                                      enabled=not cfg.training.no_cuda,
                                      opt_level=cfg.apex.opt_level,
                                      loss_scale=cfg.apex.loss_scale)
    if state.optim_state is not None:
        optimizer.load_state_dict(state.optim_state)
    if state.amp_state is not None:
        amp.load_state_dict(state.amp_state)

    # Track states for optimizer/amp
    state.track_optim_state(optimizer)
    if not cfg.training.no_cuda:
        state.track_amp_state(amp)

    if is_distributed:
        model = DistributedDataParallel(model, device_ids=[device_id])
    print(model)
    print("Number of parameters: %d" % DeepSpeech.get_param_size(model))

    criterion = CTCLoss()
    batch_time = AverageMeter()
    data_time = AverageMeter()
    losses = AverageMeter()

    for epoch in range(state.epoch, cfg.training.epochs):
        model.train()
        end = time.time()
        start_epoch_time = time.time()
        state.set_epoch(epoch=epoch)
        train_sampler.set_epoch(epoch=epoch)
        train_sampler.reset_training_step(training_step=state.training_step)
        for i, (data) in enumerate(train_loader, start=state.training_step):
            state.set_training_step(training_step=i)
            inputs, targets, input_percentages, target_sizes = data
            input_sizes = input_percentages.mul_(int(inputs.size(3))).int()
            # measure data loading time
            data_time.update(time.time() - end)
            inputs = inputs.to(device)

            out, output_sizes = model(inputs, input_sizes)
            out = out.transpose(0, 1)  # TxNxH

            float_out = out.float()  # ensure float32 for loss
            loss = criterion(float_out, targets, output_sizes,
                             target_sizes).to(device)
            # loss = loss / inputs.size(0)  # average the loss by minibatch
            loss_value = loss.item()

            # Check to ensure valid loss was calculated
            valid_loss, error = check_loss(loss, loss_value)
            if valid_loss:
                optimizer.zero_grad()

                # compute gradient
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               cfg.optim.max_norm)
                optimizer.step()
            else:
                print(error)
                print('Skipping grad update')
                loss_value = 0

            state.avg_loss += loss_value
            losses.update(loss_value, inputs.size(0))

            # measure elapsed time
            batch_time.update(time.time() - end)
            end = time.time()
            print('Epoch: [{0}][{1}/{2}]\t'
                  'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t'
                  'Data {data_time.val:.3f} ({data_time.avg:.3f})\t'
                  'Loss {loss.val:.4f} ({loss.avg:.4f})\t'.format(
                      (epoch + 1), (i + 1),
                      len(train_loader),
                      batch_time=batch_time,
                      data_time=data_time,
                      loss=losses))

            if main_proc and cfg.checkpointing.checkpoint_per_iteration:
                checkpoint_handler.save_iter_checkpoint_model(epoch=epoch,
                                                              i=i,
                                                              state=state)

            del loss, out, float_out

        state.avg_loss = state.avg_loss / len(
            train_dataset) * cfg.data.batch_size

        epoch_time = time.time() - start_epoch_time
        print('Training Summary Epoch: [{0}]\t'
              'Time taken (s): {epoch_time:.0f}\t'
              'Average Loss {loss:.3f}\t'.format(epoch + 1,
                                                 epoch_time=epoch_time,
                                                 loss=state.avg_loss))

        time.sleep(15 * 60)

        with torch.no_grad():
            wer, cer, output_data = run_evaluation(
                test_loader=test_loader,
                device=device,
                model=model,
                decoder=evaluation_decoder,
                target_decoder=evaluation_decoder)

            print('Validation Summary Epoch: [{0}]\t'
                  'Average WER {wer:.3f}\t'
                  'Average CER {cer:.3f}\t'.format(epoch + 1, wer=wer,
                                                   cer=cer))

        state.add_results(epoch=epoch,
                          loss_result=state.avg_loss,
                          wer_result=wer,
                          cer_result=cer)

        if main_proc and cfg.visualization.visdom:
            visdom_logger.update(epoch, state.result_state)
        if main_proc and cfg.visualization.tensorboard:
            tensorboard_logger.update(epoch, state.result_state,
                                      model.named_parameters())

        if main_proc and cfg.checkpointing.checkpoint:  # Save epoch checkpoint
            checkpoint_handler.save_checkpoint_model(epoch=epoch, state=state)
        # anneal lr
        for g in optimizer.param_groups:
            g['lr'] = g['lr'] / cfg.optim.learning_anneal
        print('Learning rate annealed to: {lr:.6f}'.format(lr=g['lr']))

        if main_proc and (state.best_wer is None or state.best_wer > wer):
            checkpoint_handler.save_best_model(epoch=epoch, state=state)
            state.set_best_wer(wer)
            state.reset_avg_loss()
        state.reset_training_step()  # Reset training step for next epoch
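Besides the optimizer state, the training state above also tracks amp's loss-scaler state (state.track_amp_state(amp) / amp.load_state_dict(state.amp_state)). A minimal sketch of the same idea without the TrainingState wrapper, assuming amp.initialize has already been called; the file name and dictionary keys are illustrative:

import torch
from apex import amp


def save_training_state(model, optimizer, path="checkpoint.pt"):
    torch.save({
        "model": model.state_dict(),
        "optimizer": optimizer.state_dict(),
        "amp": amp.state_dict(),  # dynamic loss-scaler state
    }, path)


def load_training_state(model, optimizer, path="checkpoint.pt"):
    ckpt = torch.load(path, map_location="cpu")
    model.load_state_dict(ckpt["model"])
    optimizer.load_state_dict(ckpt["optimizer"])
    amp.load_state_dict(ckpt["amp"])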
Example #28
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            inputs, labels = mask_tokens(batch, tokenizer,
                                         args) if args.mlm else (batch, batch)
            inputs = inputs.to(args.device)
            labels = labels.to(args.device)
            model.train()
            outputs = model(inputs,
                            masked_lm_labels=labels) if args.mlm else model(
                                inputs, labels=labels)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value,
                                                 global_step)
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    checkpoint_prefix = 'checkpoint'
                    # Save model checkpoint
                    output_dir = os.path.join(
                        args.output_dir,
                        '{}-{}'.format(checkpoint_prefix, global_step))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model.module if hasattr(
                        model, 'module'
                    ) else model  # Take care of distributed/parallel training
                    model_to_save.save_pretrained(output_dir)
                    torch.save(args,
                               os.path.join(output_dir, 'training_args.bin'))
                    logger.info("Saving model checkpoint to %s", output_dir)

                    _rotate_checkpoints(args, checkpoint_prefix)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
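The loop above combines fp16 with gradient accumulation: the loss is divided by gradient_accumulation_steps, every backward goes through amp.scale_loss, and clipping, stepping, and zeroing happen only once per accumulation window. A condensed sketch of just that pattern, assuming amp.initialize has been applied; loss_fn and the default step counts are illustrative:

import torch
from apex import amp


def train_with_accumulation(model, optimizer, scheduler, batches, loss_fn,
                            accumulation_steps=4, max_grad_norm=1.0):
    model.zero_grad()
    for step, batch in enumerate(batches):
        loss = loss_fn(model, batch) / accumulation_steps
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        if (step + 1) % accumulation_steps == 0:
            torch.nn.utils.clip_grad_norm_(
                amp.master_params(optimizer), max_grad_norm)
            optimizer.step()
            scheduler.step()
            model.zero_grad()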
Example #29
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)

    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=4,
                                  pin_memory=True)
    args.max_steps = args.epoch * len(train_dataloader)
    args.save_steps = len(train_dataloader) // 10
    args.warmup_steps = len(train_dataloader)
    args.logging_steps = len(train_dataloader)
    args.num_train_epochs = args.epoch
    model.to(args.device)
    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.max_steps * 0.1,
        num_training_steps=args.max_steps)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    checkpoint_last = os.path.join(args.output_dir, 'checkpoint-last')
    scheduler_last = os.path.join(checkpoint_last, 'scheduler.pt')
    optimizer_last = os.path.join(checkpoint_last, 'optimizer.pt')
    if os.path.exists(scheduler_last):
        scheduler.load_state_dict(torch.load(scheduler_last))
    if os.path.exists(optimizer_last):
        optimizer.load_state_dict(torch.load(optimizer_last))
    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", args.max_steps)

    global_step = args.start_step
    tr_loss, logging_loss, avg_loss, tr_nb, tr_num, train_loss = 0.0, 0.0, 0.0, 0, 0, 0
    best_mrr = 0.0
    best_acc = 0.0
    # model.resize_token_embeddings(len(tokenizer))
    model.zero_grad()

    for idx in range(args.start_epoch, int(args.num_train_epochs)):
        bar = train_dataloader
        tr_num = 0
        train_loss = 0
        for step, batch in enumerate(bar):
            code_inputs = batch[0].to(args.device)
            nl_inputs = batch[1].to(args.device)

            model.train()
            loss, code_vec, nl_vec = model(code_inputs, nl_inputs)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            tr_num += 1
            train_loss += loss.item()
            if avg_loss == 0:
                avg_loss = tr_loss
            avg_loss = round(train_loss / tr_num, 5)
            if (step + 1) % 100 == 0:
                logger.info("epoch {} step {} loss {}".format(
                    idx, step + 1, avg_loss))
            #bar.set_description("epoch {} loss {}".format(idx,avg_loss))

            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                optimizer.zero_grad()
                scheduler.step()
                global_step += 1
                output_flag = True
                avg_loss = round(
                    np.exp((tr_loss - logging_loss) / (global_step - tr_nb)),
                    4)
                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    logging_loss = tr_loss
                    tr_nb = global_step

                if args.local_rank in [
                        -1, 0
                ] and args.save_steps > 0 and global_step % args.save_steps == 0:

                    if args.local_rank == -1 and args.evaluate_during_training:  # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args,
                                           model,
                                           tokenizer,
                                           eval_when_training=True)
                        for key, value in results.items():
                            logger.info("  %s = %s", key, round(value, 4))
                        # Save model checkpoint
                        tr_num = 0
                        train_loss = 0

                    if results['eval_mrr'] > best_acc:
                        best_acc = results['eval_mrr']
                        logger.info("  " + "*" * 20)
                        logger.info("  Best mrr:%s", round(best_acc, 4))
                        logger.info("  " + "*" * 20)

                        checkpoint_prefix = 'checkpoint-best-mrr'
                        output_dir = os.path.join(
                            args.output_dir, '{}'.format(checkpoint_prefix))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model.module if hasattr(
                            model, 'module') else model
                        output_dir = os.path.join(output_dir,
                                                  '{}'.format('model.bin'))
                        torch.save(model_to_save.state_dict(), output_dir)
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)
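The checkpoint block above uses the standard unwrap-before-save idiom so the saved state_dict keys carry no "module." prefix from DataParallel or DistributedDataParallel. A minimal sketch, with an illustrative directory and file name:

import os

import torch


def save_model_checkpoint(model, output_dir, file_name="model.bin"):
    os.makedirs(output_dir, exist_ok=True)
    # Unwrap DataParallel / DistributedDataParallel if present.
    model_to_save = model.module if hasattr(model, "module") else model
    torch.save(model_to_save.state_dict(), os.path.join(output_dir, file_name))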
Example #30
0
def fine_tune_task(args, model, train_dataset, tokenizer):
    """ Train the model """
    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)

    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'labels': batch[3]
            }
            if args.model_type != 'distilbert':
                inputs['token_type_ids'] = batch[2] if args.model_type in [
                    'bert', 'xlnet'
                ] else None  # XLM, DistilBERT and RoBERTa don't use segment_ids
            outputs = model(**inputs)
            loss = outputs[
                0]  # model outputs are always tuple in transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                loss.backward()

            tr_loss += loss.item()
            if (step + 1
                ) % args.gradient_accumulation_steps == 0 and not args.tpu:
                if args.fp16:
                    torch.nn.utils.clip_grad_norm_(
                        amp.master_params(optimizer), args.max_grad_norm)
                else:
                    torch.nn.utils.clip_grad_norm_(model.parameters(),
                                                   args.max_grad_norm)

                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

            if args.tpu:
                args.xla_model.optimizer_step(optimizer, barrier=True)
                model.zero_grad()
                global_step += 1

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    return global_step, tr_loss / global_step
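The optimizer setup above (and in the neighbouring examples) groups parameters so that biases and LayerNorm weights receive no weight decay. A minimal sketch of that grouping, using torch.optim.AdamW as a stand-in for the AdamW class imported elsewhere in these examples:

import torch


def build_optimizer(model, lr, weight_decay, adam_epsilon):
    no_decay = ["bias", "LayerNorm.weight"]
    grouped_parameters = [
        {"params": [p for n, p in model.named_parameters()
                    if not any(nd in n for nd in no_decay)],
         "weight_decay": weight_decay},
        {"params": [p for n, p in model.named_parameters()
                    if any(nd in n for nd in no_decay)],
         "weight_decay": 0.0},
    ]
    return torch.optim.AdamW(grouped_parameters, lr=lr, eps=adam_epsilon)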
Example #31
0
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter("log/" +
                                  args.output_dir[args.output_dir.find('/') +
                                                  1:])

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    train_sampler = RandomSampler(
        train_dataset) if args.local_rank == -1 else DistributedSampler(
            train_dataset)
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size,
                                  num_workers=4,
                                  pin_memory=True)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
    else:
        t_total = len(
            train_dataloader
        ) // args.gradient_accumulation_steps * args.num_train_epochs

    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [
            p for n, p in model.named_parameters()
            if not any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        args.weight_decay
    }, {
        'params': [
            p for n, p in model.named_parameters()
            if any(nd in n for nd in no_decay)
        ],
        'weight_decay':
        0.0
    }]

    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate,
                      eps=args.adam_epsilon)
    if args.warmup_steps == -1:
        args.warmup_steps = int(0.1 * t_total)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=args.warmup_steps,
        num_training_steps=t_total)
    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training."
            )
        model, optimizer = amp.initialize(model,
                                          optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    # Train!
    logger.info("***** Running training *****")
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d",
                args.per_gpu_train_batch_size)
    logger.info(
        "  Total train batch size (w. parallel, distributed & accumulation) = %d",
        args.train_batch_size * args.gradient_accumulation_steps *
        (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d",
                args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    best_f1 = 0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs),
                            desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(
        args)  # Added here for reproducibility (even between python 2 and 3)

    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader,
                              desc="Iter",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):

            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids':
                None if args.model_type == 'xlm' else batch[2],
                'start_positions': batch[3],
                'end_positions': batch[4]
            }
            if args.model_type.startswith('roberta'):
                del inputs['token_type_ids']

            if args.model_type in ['xlnet', 'xlm']:
                inputs.update({'cls_index': batch[5], 'p_mask': batch[6]})
            if args.model_type.endswith('multi'):
                inputs.update({'answer_masks': batch[7]})
                inputs.update({'answer_nums': batch[8]})

            outputs = model(**inputs)
            loss = outputs[0]

            if args.n_gpu > 1:
                loss = loss.mean(
                )  # mean() to average on multi-gpu parallel (not distributed) training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()  # Update learning rate schedule
                model.zero_grad()
                global_step += 1

                if args.local_rank in [
                        -1, 0
                ] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    tb_writer.add_scalar('lr',
                                         scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) /
                                         args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [
                        -1, 0
                ] and args.evaluate_during_training and args.save_steps > 0 and global_step % args.save_steps == 0:
                    results = evaluate(args, model, tokenizer)
                    for key, value in results.items():
                        tb_writer.add_scalar('eval_{}'.format(key), value,
                                             global_step)
                    if results['f1'] > best_f1:
                        best_f1 = results['f1']
                        # Save model checkpoint
                        output_dir = os.path.join(
                            args.output_dir,
                            'checkpoint-{}'.format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model.module if hasattr(
                            model, 'module'
                        ) else model  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        torch.save(
                            args, os.path.join(output_dir,
                                               'training_args.bin'))
                        logger.info("Saving model checkpoint to %s",
                                    output_dir)
                        logger.info("Best F1 %f", best_f1)

            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
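Several examples above note that DataParallel / DistributedDataParallel wrapping "should be after apex fp16 initialization". A minimal sketch of that ordering, assuming the process group and CUDA device have already been set up; opt_level and the DDP flags mirror the examples but are otherwise illustrative:

import torch
from apex import amp


def wrap_for_distributed_fp16(model, optimizer, local_rank, opt_level="O1"):
    # amp must patch the model and optimizer before DDP wraps the model.
    model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[local_rank],
        output_device=local_rank,
        find_unused_parameters=True)
    return model, optimizer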