Example #1
def _initialize_optimizer(self, args):
    self.lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                                  args.warmup_steps)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in self.model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    self.optimizer = AdamW(
        learning_rate=self.lr_scheduler,
        parameters=self.model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
Example #2
def _create_optimizer(self, model):
    scheduler = self._create_scheduler()
    clip = paddle.nn.ClipGradByNorm(clip_norm=1.0)
    return AdamW(
        parameters=model.parameters(),
        grad_clip=clip,
        learning_rate=scheduler,
        apply_decay_param_fun=lambda x: x in self.wd_params,
        weight_decay=self.args.weight_decay), scheduler
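
Most of the snippets on this page repeat the same two ideas: exclude every bias and LayerNorm parameter from weight decay by name, and drive AdamW with a Noam schedule whose d_model argument is set to 1 / (warmup_steps * lr**2), which makes the schedule peak at roughly lr after warmup_steps steps. The sketch below shows the pattern in isolation; the tiny model and all hyperparameter values are placeholders, not taken from any snippet above.

import paddle
from paddle.nn import ClipGradByGlobalNorm
from paddle.optimizer import AdamW
from paddle.optimizer.lr import NoamDecay


class TinyModel(paddle.nn.Layer):
    def __init__(self):
        super().__init__()
        self.linear = paddle.nn.Linear(16, 16)
        self.norm = paddle.nn.LayerNorm(16)

    def forward(self, x):
        return self.norm(self.linear(x))


model = TinyModel()
lr, warmup_steps = 1e-3, 4000

# Keep weight decay only for linear.weight; every bias and the LayerNorm
# parameters are filtered out by their attribute names.
decay_params = [
    p.name for n, p in model.named_parameters()
    if not any(nd in n for nd in ["bias", "norm"])
]

# With d_model = 1 / (warmup_steps * lr**2), the Noam schedule warms up to
# roughly `lr` at step `warmup_steps` and decays afterwards.
lr_scheduler = NoamDecay(1 / (warmup_steps * lr**2), warmup_steps)
optimizer = AdamW(learning_rate=lr_scheduler,
                  parameters=model.parameters(),
                  weight_decay=0.01,
                  apply_decay_param_fun=lambda name: name in decay_params,
                  grad_clip=ClipGradByGlobalNorm(1.0))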
Example #3
def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
    max_train_steps = args.epochs * len(train_data_loader)
    if rank == 0:
        print("Num train examples: %d" % num_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Warmup proportion: %d" % args.warmup_proportion)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, max_train_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
    loss_fn = DGULossFunction(args.task_name)

    load_ckpt(args, model, optimizer)

    step = 0
    best_metric = 0.0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for batch in train_data_loader:
            step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    print_logs(args, step, logits, labels, loss, total_time,
                               metric)
                    total_time = 0.0
                if step % args.save_steps == 0 or step == max_train_steps:
                    save_ckpt(model, optimizer, args.output_dir, step)
                    if args.do_eval:
                        print('\nEval begin...')
                        metric_out = evaluation(args, model, dev_data_loader,
                                                metric)
                        if metric_out > best_metric:
                            best_metric = metric_out
                            save_ckpt(model, optimizer, args.output_dir,
                                      'best')
                            print('Best model, step: %d\n' % step)
            batch_start_time = time.time()
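
LinearDecayWithWarmup (assumed here to be paddlenlp.transformers.LinearDecayWithWarmup, which these snippets appear to use) ramps the learning rate linearly from 0 to the peak value over the first warmup_proportion * max_train_steps steps and then decays it linearly back to 0. A quick sketch of how the schedule evolves; the numbers are placeholders:

from paddlenlp.transformers import LinearDecayWithWarmup

max_train_steps, warmup_proportion = 1000, 0.1
lr_scheduler = LinearDecayWithWarmup(5e-5, max_train_steps, warmup_proportion)

lrs = []
for _ in range(max_train_steps):
    lrs.append(lr_scheduler.get_lr())
    lr_scheduler.step()

# Ramps up over the first 100 steps, peaks at 5e-5, then decays linearly.
print(lrs[0], lrs[100], lrs[500], lrs[999])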
Example #4
class ModelOperation(object):
    """ModelTrain"""
    def __init__(self):
        # Value of PADDLE_TRAINERS_NUM; defaults to 1.
        self.cur_process_num = paddle.distributed.get_world_size()
        # Value of PADDLE_TRAINER_ID; defaults to 0.
        self.cur_process_rank = paddle.distributed.get_rank()
        self.model_class = {
            "uniLM":
            (UnifiedTransformerLMHeadModel, UnifiedTransformerTokenizer),
        }
        self.data_helper = None

    def _initialize_run_env(self, device, seed):
        assert device in ("cpu", "gpu", "xpu"), \
            f"param device({device}) must be in ('cpu', 'gpu', 'xpu')!!!"
        paddle.set_device(device)
        if self.cur_process_num > 1:
            paddle.distributed.init_parallel_env()
        if seed:
            self.set_seed(seed)

    def _initialize_model(self, model_type, pretrained_model_path):
        assert os.path.exists(pretrained_model_path), \
            f"model path {pretrained_model_path} must exist!"
        logging.info(f"initialize model from {pretrained_model_path}")

        model_class, tokenizer_class = self.model_class[model_type]
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_model_path)
        self.model = model_class.from_pretrained(pretrained_model_path)

        if self.cur_process_num > 1:
            self.model = paddle.DataParallel(self.model)

    def _initialize_optimizer(self, args):
        self.lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                                      args.warmup_steps)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.
        decay_params = [
            p.name for n, p in self.model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]
        self.optimizer = AdamW(
            learning_rate=self.lr_scheduler,
            parameters=self.model.parameters(),
            weight_decay=args.weight_decay,
            apply_decay_param_fun=lambda x: x in decay_params,
            grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    def _start_train(self, args):
        # load train data loader
        train_dataset = DialogueDataset(args.train_data_path,
                                        args.batch_size,
                                        self.tokenizer.pad_token_id,
                                        self.tokenizer.cls_token_id,
                                        args.sort_pool_size,
                                        args.seed,
                                        mode='train')
        train_data_loader = DataLoader(train_dataset,
                                       return_list=True,
                                       batch_size=None)
        # initialize optimizer
        self._initialize_optimizer(args)
        global_step = 0
        tic_train = time.time()
        for epoch in range(args.train_epochs):
            for batch in train_data_loader:
                # logging.info(f"Epoch: {epoch+1}/{args.train_epochs}, step is {step}")
                global_step += 1
                token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = batch

                logits = self.model(token_ids, type_ids, pos_ids,
                                    generation_mask, tgt_pos)
                loss = F.cross_entropy(logits, tgt_label)

                if global_step % args.logging_steps == 0:
                    logging.info(
                        f"global step {global_step}, epoch: {epoch+1}/{args.train_epochs},"
                        f" loss: {loss}, speed: {args.logging_steps / (time.time() - tic_train):.2f} step/s"
                    )
                    tic_train = time.time()
                loss.backward()
                self.optimizer.step()
                self.lr_scheduler.step()
                self.optimizer.clear_grad()

        if self.cur_process_rank == 0:
            output_dir = \
                os.path.join(args.output_dir, "model_{}".format(global_step))
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            # need better way to get inner model of DataParallel
            model_to_save = \
                self.model._layers if isinstance(self.model, paddle.DataParallel) else self.model
            model_to_save.save_pretrained(output_dir)
            self.tokenizer.save_pretrained(output_dir)
            print('Saving checkpoint to:', output_dir)

    @paddle.no_grad()
    def evaluation(self, args):
        self.model.eval()
        valid_dataset = DialogueDataset(args.valid_data_path,
                                        args.batch_size,
                                        self.tokenizer.pad_token_id,
                                        self.tokenizer.cls_token_id,
                                        args.sort_pool_size,
                                        args.seed,
                                        mode='valid')
        valid_data_loader = DataLoader(valid_dataset,
                                       return_list=True,
                                       batch_size=None)
        total_tokens = 0
        total_loss = 0.0
        start_time = time.time()
        step = 0
        for inputs in valid_data_loader:
            step += 1
            token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = inputs

            logits = self.model(token_ids, type_ids, pos_ids, generation_mask,
                                tgt_pos)
            loss = F.cross_entropy(logits, tgt_label, reduction='sum')

            total_loss += loss.numpy()[0]
            total_tokens += tgt_label.shape[0]

        avg_loss = total_loss / total_tokens
        ppl = math.exp(avg_loss)
        avg_speed = (time.time() - start_time) / step
        logging.info('loss: %.4f - ppl: %.4f - %.3fs/step\n' %
                     (avg_loss, ppl, avg_speed))
        self.model.train()

    @paddle.no_grad()
    def _infer(self, args, data_loader):
        self.model.eval()
        total_time = 0.0
        start_time = time.time()
        responses = []
        for step, inputs in enumerate(data_loader, 1):
            logging.info(f"step is {step}")
            token_ids, type_ids, pos_ids, generation_mask = inputs
            ids, scores = self.model.generate(
                input_ids=token_ids,
                token_type_ids=type_ids,
                position_ids=pos_ids,
                attention_mask=generation_mask,
                max_length=args.max_dec_len,
                min_length=args.min_dec_len,
                decode_strategy=args.decode_strategy,
                temperature=args.temperature,
                top_k=args.top_k,
                top_p=args.top_p,
                num_beams=args.num_beams,
                length_penalty=args.length_penalty,
                early_stopping=args.early_stopping,
                num_return_sequences=args.num_samples)

            total_time += (time.time() - start_time)
            if step % args.logging_steps == 0:
                logging.info(
                    f'step {step} - {total_time / args.logging_steps:.3f}s/step'
                )
                total_time = 0.0
            results = select_response(ids, scores, self.tokenizer,
                                      args.max_dec_len, args.num_samples)
            responses.extend(results)
            start_time = time.time()
        self.model.train()
        return responses

    def predict(self, args):
        # [1]. initialize dataset loader
        test_dataset = DialogueDataset(args.test_data_path,
                                       args.batch_size,
                                       self.tokenizer.pad_token_id,
                                       self.tokenizer.cls_token_id,
                                       args.sort_pool_size,
                                       args.seed,
                                       mode='test')
        valid_data_loader = DataLoader(test_dataset,
                                       return_list=True,
                                       batch_size=None)
        # [2]. do inference
        responses = self._infer(args, valid_data_loader)
        # [3]. save result
        output_path = os.path.join(args.output_dir, "predict.txt")
        with open(output_path, 'w', encoding='utf-8') as f:
            for response in responses:
                f.write(response + '\n')

    def train_and_eval(self, args):
        self._initialize_run_env(args.device, args.seed)
        self._initialize_model(args.model_type, args.pretrained_model_path)

        # start training
        if args.do_train:
            logging.info("start training...")
            self._start_train(args)
            logging.info("train success.")
        # start evaluation
        if args.do_eval:
            logging.info("start evaluating...")
            self.evaluation(args)
            logging.info("evaluate success.")
        # start predicting
        if args.do_predict:
            logging.info("start predicting...")
            self.predict(args)
            logging.info("predict success.")

    @staticmethod
    def set_seed(random_seed):
        random.seed(random_seed)
        np.random.seed(random_seed)
        paddle.seed(random_seed)
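
evaluation() above accumulates a sum-reduced token cross entropy and divides it by the number of target tokens, so the reported perplexity is exp(mean negative log-likelihood per token). A self-contained sketch of that computation with made-up shapes:

import math

import paddle
import paddle.nn.functional as F

logits = paddle.randn([10, 8000])          # 10 target tokens, vocab size 8000
tgt_label = paddle.randint(0, 8000, shape=[10])

total_loss = F.cross_entropy(logits, tgt_label, reduction='sum').item()
total_tokens = tgt_label.shape[0]
avg_loss = total_loss / total_tokens       # mean NLL per token
ppl = math.exp(avg_loss)                   # corpus perplexity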
Example #5
def run(args):
    paddle.set_device(args.device)
    world_size = dist.get_world_size()

    if world_size > 1:
        dist.init_parallel_env()
    set_seed(args.seed)

    model = UNIMOLMHeadModel.from_pretrained(args.model_name_or_path)
    tokenizer = UNIMOTokenizer.from_pretrained(args.model_name_or_path)

    if world_size > 1:
        model = paddle.DataParallel(model)

    train_ds = load_dataset(args.dataset_name,
                            splits='train',
                            data_files=args.train_file)
    dev_ds = load_dataset(args.dataset_name,
                          splits='dev',
                          data_files=args.predict_file)

    train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args,
                                                     'train')
    dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args,
                                                 'test')

    if args.do_train:
        num_training_steps = args.epochs * len(train_data_loader)

        lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                             num_training_steps,
                                             args.warmup_propotion)
        # Generate parameter names needed to perform weight decay.
        # All bias and LayerNorm parameters are excluded.

        decay_params = [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ]

        optimizer = AdamW(learning_rate=lr_scheduler,
                          parameters=model.parameters(),
                          weight_decay=args.weight_decay,
                          beta1=args.beta1,
                          beta2=args.beta2,
                          epsilon=args.epsilon,
                          apply_decay_param_fun=lambda x: x in decay_params,
                          grad_clip=paddle.nn.ClipGradByGlobalNorm(
                              args.max_grad_norm))

        step = 0
        total_time = 0.0
        for epoch in range(args.epochs):
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
            batch_start_time = time.time()
            for inputs in train_data_loader:
                step += 1
                labels = inputs[-1]
                logits = model(*inputs[:-1])
                labels = paddle.nn.functional.one_hot(
                    labels, num_classes=logits.shape[-1])
                labels = paddle.nn.functional.label_smooth(labels)
                loss = F.cross_entropy(logits, labels, soft_label=True)

                loss.backward()
                optimizer.step()
                lr_scheduler.step()
                optimizer.clear_grad()

                total_time += (time.time() - batch_start_time)
                if step % args.logging_steps == 0:
                    ppl = paddle.exp(loss)
                    print(
                        'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                        % (step, loss, ppl, optimizer.get_lr(),
                           total_time / args.logging_steps))
                    total_time = 0.0

                if step % args.save_steps == 0 or step >= num_training_steps:
                    if dist.get_rank() == 0:
                        save_ckpt(model, tokenizer, args.save_dir, step)
                        print('Saved step {} model.\n'.format(step))
                        if args.do_predict:
                            model_eval = model._layers if isinstance(
                                model, paddle.DataParallel) else model
                            evaluation(model_eval, dev_data_loader, args,
                                       tokenizer)

                batch_start_time = time.time()

        print('\nTraining completed.')
    elif args.do_predict:
        model_eval = model._layers if isinstance(
            model, paddle.DataParallel) else model
        evaluation(model_eval, dev_data_loader, args, tokenizer)
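
The training loss above turns hard token ids into smoothed one-hot targets before the cross entropy. The same steps in isolation; batch size and vocabulary size are illustrative:

import paddle
import paddle.nn.functional as F

logits = paddle.randn([4, 8000])                # [batch, vocab]
labels = paddle.randint(0, 8000, shape=[4])     # hard token ids

one_hot = F.one_hot(labels, num_classes=logits.shape[-1])
soft_labels = F.label_smooth(one_hot)           # epsilon defaults to 0.1
loss = F.cross_entropy(logits, soft_labels, soft_label=True)
ppl = paddle.exp(loss)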
Example #6
def train(args, model, train_data_loader, dev_data_loader, metric, rank):
    num_examples = len(train_data_loader) * args.batch_size * args.n_gpu
    max_train_steps = args.epochs * len(train_data_loader)
    warmup_steps = int(max_train_steps * args.warmup_proportion)
    if rank == 0:
        print("Num train examples: %d" % num_examples)
        print("Max train steps: %d" % max_train_steps)
        print("Num warmup steps: %d" % warmup_steps)
    factor_fn = partial(compute_lr_factor,
                        warmup_steps=warmup_steps,
                        max_train_steps=max_train_steps)
    lr_scheduler = LambdaDecay(args.learning_rate, factor_fn)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))
    loss_fn = DGULossFunction(args.task_name)

    load_ckpt(args, model, optimizer)

    step = 0
    best_metric = 0.0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for batch in train_data_loader:
            step += 1
            input_ids, segment_ids, labels = batch
            logits = model(input_ids, segment_ids)
            loss = loss_fn(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    print_logs(args, step, logits, labels, loss, total_time,
                               metric)
                    total_time = 0.0
                if step % args.save_steps == 0 or step == max_train_steps:
                    save_ckpt(model, optimizer, args.output_dir, step)
                    if args.do_eval:
                        print('\nEval begin...')
                        metric_out = evaluation(args, model, dev_data_loader,
                                                metric)
                        if metric_out > best_metric:
                            best_metric = metric_out
                            save_ckpt(model, optimizer, args.output_dir,
                                      'best')
                            print('Best model, step: %d\n' % step)
            batch_start_time = time.time()
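
compute_lr_factor is not shown in this snippet. A plausible implementation (an assumption, not the original code) is a linear warmup followed by a linear decay, plugged into LambdaDecay exactly as above:

from functools import partial

from paddle.optimizer.lr import LambdaDecay


def compute_lr_factor(step, warmup_steps, max_train_steps):
    # Multiplier applied to the base learning rate at the given step.
    if step < warmup_steps:
        return float(step) / max(1.0, warmup_steps)
    return max(0.0, float(max_train_steps - step) /
               max(1.0, max_train_steps - warmup_steps))


factor_fn = partial(compute_lr_factor, warmup_steps=100, max_train_steps=1000)
lr_scheduler = LambdaDecay(5e-5, factor_fn)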
Example #7
def main(args):
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO,
        handlers=[
            logging.FileHandler(
                os.path.join(args.output_dir, "run.log"),
                mode="w",
                encoding="utf-8",
            )
        ],
    )
    logger.info("**********  Configuration Arguments **********")
    for arg, value in sorted(vars(args).items()):
        logger.info(f"{arg}: {value}")
    logger.info("**************************************************")
    set_seed(args)

    # metric and label
    label_name = GLUE_PROCESSED[args.task_name][1]
    if label_name:
        label2id = dict(zip(label_name, range(len(label_name))))
    else:
        label2id = None
    metric_list = GLUE_METRICS[args.task_name]
    generate_max_length = label_length_map[args.task_name]

    writer = get_writer(args)

    # get model and tokenizer
    model = T5ForConditionalGeneration.from_pretrained(args.model_name_or_path)
    tokenizer = T5Tokenizer.from_pretrained(args.model_name_or_path)

    # get dataloader
    train_dataloader = get_train_dataloader(tokenizer, args)
    if args.task_name == "mnli":
        dev_dataloader_match = get_mnli_dev_dataloader(tokenizer,
                                                       args,
                                                       matched=True)
        dev_dataloader_mismatch = get_mnli_dev_dataloader(tokenizer,
                                                          args,
                                                          matched=False)
    else:
        dev_dataloader = get_dev_dataloader(tokenizer, args)

    num_update_steps_per_epoch = math.ceil(
        len(train_dataloader) / args.gradient_accumulation_steps)
    if args.max_train_steps > 0:
        args.num_train_epochs = math.ceil(args.max_train_steps /
                                          num_update_steps_per_epoch)
    else:
        args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch

    # get lr_scheduler
    lr_scheduler = get_scheduler(
        learning_rate=args.learning_rate,
        scheduler_type=args.scheduler_type,
        num_warmup_steps=args.warmup_steps
        if args.warmup_steps > 0 else args.warmup_radio,
        num_training_steps=args.max_train_steps,
    )

    total_batch_size = args.train_batch_size * args.gradient_accumulation_steps

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]

    optimizer = AdamW(
        learning_rate=lr_scheduler,
        beta1=0.9,
        beta2=0.999,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
    )

    if args.use_amp:
        scaler = GradScaler(init_loss_scaling=args.scale_loss)

    logger.info("********** Running training **********")
    logger.info(f"  Num examples = {len(train_dataloader.dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(f"  Instantaneous train batch size = {args.train_batch_size}")
    logger.info(f"  Instantaneous eval batch size = {args.eval_batch_size}")
    logger.info(
        f"  Total train batch size (w. accumulation) = {total_batch_size}")
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_train_steps}")

    progress_bar = tqdm(range(args.max_train_steps))

    global_steps = 0
    tr_loss, logging_loss = 0.0, 0.0

    for _ in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            model.train()
            with auto_cast(args.use_amp,
                           custom_white_list=["layer_norm", "softmax"]):
                source_ids, source_mask, labels, target_mask = batch
                outputs = model(
                    input_ids=source_ids,
                    attention_mask=source_mask,
                    labels=labels,
                    decoder_attention_mask=target_mask,
                )
                loss = outputs[0] / args.gradient_accumulation_steps
                tr_loss += loss.item()

            if args.use_amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if (step % args.gradient_accumulation_steps == 0
                    or step == len(train_dataloader) - 1):
                if args.use_amp:
                    scaler.minimize(optimizer, loss)
                else:
                    optimizer.step()

                lr_scheduler.step()
                optimizer.clear_grad()
                progress_bar.update(1)
                global_steps += 1

                if args.logging_steps > 0 and global_steps % args.logging_steps == 0:
                    writer.add_scalar("lr", lr_scheduler.get_lr(),
                                      global_steps)
                    writer.add_scalar(
                        "loss",
                        (tr_loss - logging_loss) / args.logging_steps,
                        global_steps,
                    )
                    logger.info(
                        "global_steps {} - lr: {:.10f}  loss: {:.10f}".format(
                            global_steps,
                            lr_scheduler.get_lr(),
                            (tr_loss - logging_loss) / args.logging_steps,
                        ))
                    logging_loss = tr_loss

                if args.save_steps > 0 and global_steps % args.save_steps == 0:
                    logger.info("********** Running evaluating **********")
                    logger.info(f"********** Step {global_steps} **********")
                    output_dir = os.path.join(args.output_dir,
                                              f"step-{global_steps}")
                    os.makedirs(output_dir, exist_ok=True)

                    if args.task_name == "mnli":
                        matched_results = evaluate(
                            model,
                            dev_dataloader_match,
                            tokenizer,
                            label2id,
                            metric_list,
                            generate_max_length,
                        )
                        for k, v in matched_results.items():
                            writer.add_scalar(f"eval/matched_{k}", v,
                                              global_steps)
                            logger.info(f"  {k} = {v}")
                        mismatched_results = evaluate(
                            model,
                            dev_dataloader_mismatch,
                            tokenizer,
                            label2id,
                            metric_list,
                            generate_max_length,
                        )
                        for k, v in mismatched_results.items():
                            writer.add_scalar(f"eval/mismatched_{k}", v,
                                              global_steps)
                            logger.info(f"  {k} = {v}")
                    else:
                        eval_results = evaluate(
                            model,
                            dev_dataloader,
                            tokenizer,
                            label2id,
                            metric_list,
                            generate_max_length,
                        )
                        for k, v in eval_results.items():
                            writer.add_scalar(f"eval/{k}", v, global_steps)
                            logger.info(f"  {k} = {v}")
                    model.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    logger.info("********** Evaluating Done **********")

            if global_steps >= args.max_train_steps:
                logger.info("********** Running evaluating **********")
                logger.info(f"********** Step {global_steps} **********")
                output_dir = os.path.join(args.output_dir,
                                          f"step-{global_steps}")
                os.makedirs(output_dir, exist_ok=True)

                if args.task_name == "mnli":
                    matched_results = evaluate(
                        model,
                        dev_dataloader_match,
                        tokenizer,
                        label2id,
                        metric_list,
                        generate_max_length,
                    )
                    for k, v in matched_results.items():
                        writer.add_scalar(f"eval/matched_{k}", v, global_steps)
                        logger.info(f"  {k} = {v}")
                    mismatched_results = evaluate(
                        model,
                        dev_dataloader_mismatch,
                        tokenizer,
                        label2id,
                        metric_list,
                        generate_max_length,
                    )
                    for k, v in mismatched_results.items():
                        writer.add_scalar(f"eval/mismatched_{k}", v,
                                          global_steps)
                        logger.info(f"  {k} = {v}")
                else:
                    eval_results = evaluate(
                        model,
                        dev_dataloader,
                        tokenizer,
                        label2id,
                        metric_list,
                        generate_max_length,
                    )
                    for k, v in eval_results.items():
                        writer.add_scalar(f"eval/{k}", v, global_steps)
                        logger.info(f"  {k} = {v}")
                model.save_pretrained(output_dir)
                tokenizer.save_pretrained(output_dir)
                logger.info("********** Evaluating Done **********")
                logger.info("********** Training Done **********")
                return
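
The accumulation logic above scales every micro-batch loss by 1 / gradient_accumulation_steps, calls backward() on each micro-batch, and only steps the optimizer and scheduler once per accumulation window. A toy version of the same pattern with a placeholder model (it steps on (step + 1) % accum_steps == 0; the example above uses step % ... == 0 with enumerate starting at 0, which also fires on the very first micro-batch):

import paddle
import paddle.nn.functional as F

model = paddle.nn.Linear(4, 1)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-3,
                                   parameters=model.parameters())
accum_steps = 4

for step in range(16):
    x, y = paddle.randn([8, 4]), paddle.randn([8, 1])
    loss = F.mse_loss(model(x), y) / accum_steps
    loss.backward()                        # gradients accumulate across calls
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.clear_grad()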
Example #8
def train(args):
    paddle.set_device(args.device)
    world_size = dist.get_world_size()
    if world_size > 1:
        dist.init_parallel_env()

    set_seed(args.seed)

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)

    if world_size > 1:
        model = paddle.DataParallel(model)

    train_ds, dev_ds = load_dataset('duconv', splits=('train', 'dev'))
    train_ds, train_data_loader = create_data_loader(train_ds, tokenizer, args,
                                                     'train')
    dev_ds, dev_data_loader = create_data_loader(dev_ds, tokenizer, args,
                                                 'dev')

    lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                             args.warmup_steps)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    step = 0
    total_time = 0.0
    best_ppl = 1e9
    for epoch in range(args.epochs):
        print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for inputs in train_data_loader:
            step += 1
            labels = inputs[-1]

            logits = model(*inputs[:-1])
            loss = F.cross_entropy(logits, labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            total_time += (time.time() - batch_start_time)
            if step % args.logging_steps == 0:
                ppl = paddle.exp(loss)
                print(
                    'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                    % (step, loss, ppl, optimizer.get_lr(),
                       total_time / args.logging_steps))
                total_time = 0.0
            if step % args.save_steps == 0:
                ppl = evaluation(model, dev_data_loader)
                if dist.get_rank() == 0:
                    save_ckpt(model, tokenizer, args.save_dir, step)
                    if ppl < best_ppl:
                        best_ppl = ppl
                        save_ckpt(model, tokenizer, args.save_dir, 'best')
                        print('Saved step {} as best model.\n'.format(step))
            batch_start_time = time.time()
    print('\nTraining completed.')
Example #9
def main(args):
    paddle.set_device('gpu' if args.n_gpus else 'cpu')
    paddle.seed(args.seed)
    world_size = dist.get_world_size()
    rank = dist.get_rank()
    if world_size > 1:
        dist.init_parallel_env()

    model = UnifiedTransformerLMHeadModel.from_pretrained(
        args.model_name_or_path)
    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        args.model_name_or_path)
    if world_size > 1:
        model = paddle.DataParallel(model)

    train_dataset = DialogueDataset(args.train_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    args.seed,
                                    mode='train')
    train_dataloader = DataLoader(train_dataset,
                                  return_list=True,
                                  batch_size=None)
    valid_dataset = DialogueDataset(args.valid_data_path,
                                    args.batch_size,
                                    tokenizer.pad_token_id,
                                    tokenizer.cls_token_id,
                                    args.sort_pool_size,
                                    mode='valid')
    valid_dataloader = DataLoader(valid_dataset,
                                  return_list=True,
                                  batch_size=None)

    lr_scheduler = NoamDecay(1 / (args.warmup_steps * (args.lr**2)),
                             args.warmup_steps)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = AdamW(learning_rate=lr_scheduler,
                      parameters=model.parameters(),
                      weight_decay=args.weight_decay,
                      apply_decay_param_fun=lambda x: x in decay_params,
                      grad_clip=nn.ClipGradByGlobalNorm(args.max_grad_norm))

    step = 0
    total_time = 0.0
    for epoch in range(args.epochs):
        if rank == 0:
            print('\nEpoch %d/%d' % (epoch + 1, args.epochs))
        batch_start_time = time.time()
        for inputs in train_dataloader:
            step += 1
            token_ids, type_ids, pos_ids, generation_mask, tgt_label, tgt_pos = inputs

            logits = model(token_ids, type_ids, pos_ids, generation_mask,
                           tgt_pos)
            loss = F.cross_entropy(logits, tgt_label)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            total_time += (time.time() - batch_start_time)
            if rank == 0:
                if step % args.logging_steps == 0:
                    ppl = paddle.exp(loss)
                    print(
                        'step %d - loss: %.4f - ppl: %.4f - lr: %.7f - %.3fs/step'
                        % (step, loss, ppl, optimizer.get_lr(),
                           total_time / args.logging_steps))
                    total_time = 0.0
                if step % args.save_steps == 0:
                    evaluation(model, valid_dataloader)
                    save_ckpt(model, tokenizer, args.save_dir, step)
            batch_start_time = time.time()
Example #10
def train(args, model, tokenizer):

    set_seed(args)

    generate_max_length = args.max_target_length

    writer = get_writer(args)

    # Distributed Setting
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()
        model = paddle.DataParallel(model)

    # get dataloader
    train_dataloader = get_train_dataloader(
        model=model,
        tokenizer=tokenizer,
        args=args,
    )
    eval_tasks = load_eval_tasks(model=model, tokenizer=tokenizer,
                                 args=args) if args.do_eval else None

    def math_ceil(x, y):
        return math.ceil(x / float(y))

    num_update_steps_per_epoch = math_ceil(len(train_dataloader),
                                           args.gradient_accumulation_steps)
    if args.logging_steps > num_update_steps_per_epoch:
        args.logging_steps = num_update_steps_per_epoch
    if args.max_steps > 0:
        args.num_train_epochs = math_ceil(args.max_steps,
                                          num_update_steps_per_epoch)
    else:
        args.max_steps = args.num_train_epochs * num_update_steps_per_epoch

    # get lr_scheduler
    lr_scheduler = get_scheduler(
        learning_rate=args.learning_rate,
        scheduler_type=args.lr_scheduler_type,
        num_warmup_steps=args.warmup_steps
        if args.warmup_steps > 0 else args.warmup_ratio,
        num_training_steps=args.max_steps,
    )

    total_batch_size = (args.per_device_train_batch_size *
                        args.gradient_accumulation_steps *
                        paddle.distributed.get_world_size())

    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    grad_clip = paddle.nn.ClipGradByGlobalNorm(args.max_grad_norm)
    optimizer = AdamW(
        learning_rate=lr_scheduler,
        beta1=args.adam_beta1,
        beta2=args.adam_beta2,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in decay_params,
        grad_clip=grad_clip,
    )

    if args.use_amp:
        scaler = GradScaler(init_loss_scaling=args.scale_loss)

    logger.info("********** Running training **********")
    logger.info(f"  Num examples = {len(train_dataloader.dataset)}")
    logger.info(f"  Num Epochs = {args.num_train_epochs}")
    logger.info(
        f"  Device train batch size = {args.per_device_train_batch_size}")
    logger.info(
        f"  Device eval  batch size = {args.per_device_eval_batch_size}")
    logger.info(
        f"  Total  train batch size (w. accumulation) = {total_batch_size}")
    logger.info(
        f"  Gradient Accumulation steps = {args.gradient_accumulation_steps}")
    logger.info(f"  Total optimization steps = {args.max_steps}")

    global_steps = 0
    tr_loss, logging_loss = 0.0, 0.0

    best_score = 0.0

    def logging_lr_loss():
        cur_lr = lr_scheduler.get_lr()
        cur_loss = (tr_loss - logging_loss) / args.logging_steps
        writer.add_scalar("lr", cur_lr, global_steps)
        writer.add_scalar("loss", cur_loss, global_steps)
        logger.info(f"global_steps {global_steps}/{args.max_steps}"
                    f" - lr: {cur_lr:.10f}  loss: {cur_loss:.10f}")

    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_dataloader):
            model.train()

            with auto_cast(args.use_amp,
                           custom_white_list=["layer_norm", "softmax"]):
                outputs = model(**batch)
                loss = outputs[0] / args.gradient_accumulation_steps
                tr_loss += loss.item()

            if args.use_amp:
                scaler.scale(loss).backward()
            else:
                loss.backward()

            if (step % args.gradient_accumulation_steps == 0
                    or step == len(train_dataloader) - 1):
                if args.use_amp:
                    scaler.minimize(optimizer, loss)
                else:
                    optimizer.step()

                lr_scheduler.step()
                optimizer.clear_grad()
                global_steps += 1

                if (args.logging_steps > 0
                        and global_steps % args.logging_steps == 0):
                    if paddle.distributed.get_rank() == 0:
                        logging_lr_loss()
                        logging_loss = tr_loss

        save_checkpoint(tokenizer, model,
                        os.path.join(args.output_dir, f"ckpt_epoch{epoch}"))
        if args.do_eval and paddle.distributed.get_rank() == 0:

            logger.info(f"********** Running evaluating **********")
            logger.info(f"************* Epoch {epoch} ************")

            eval_overall_results, eval_predictions = eval_all_tasks(
                eval_tasks=eval_tasks,
                model=model,
                tokenizer=tokenizer,
                generate_max_length=generate_max_length,
            )

            for line in better_print_multi(eval_overall_results).split('\n'):
                logger.info(line)

            if args.metric_for_best_model not in eval_overall_results:
                raise ValueError(f"Main metric {args.metric_for_best_model} "
                                 f"is not in {eval_overall_results.keys()}.")

            logger.info("********** Evaluating Done **********")
            current_score = eval_overall_results[args.metric_for_best_model]
            if current_score > best_score:
                logger.info("********** Saving Model **********")
                best_score = current_score
                save_checkpoint(tokenizer, model,
                                os.path.join(args.output_dir, f"best"))

    best_ckpt_file = os.path.join(args.output_dir, "best",
                                  "model_state.pdparams")
    if os.path.exists(best_ckpt_file):
        logger.info(f"Load best checkpoint from {best_ckpt_file}")
        model.load_dict(paddle.load(best_ckpt_file))

    save_checkpoint(tokenizer, model, args.output_dir)
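
For reference, the mixed-precision pieces used in Examples #7 and #10 (auto_cast is assumed to wrap paddle.amp.auto_cast, and GradScaler to be paddle.amp.GradScaler) boil down to the pattern below; the tiny model and data are placeholders:

import paddle
import paddle.nn.functional as F

model = paddle.nn.Linear(16, 2)
optimizer = paddle.optimizer.AdamW(learning_rate=1e-4,
                                   parameters=model.parameters())
scaler = paddle.amp.GradScaler(init_loss_scaling=2.**15)

x = paddle.randn([8, 16])
y = paddle.randint(0, 2, shape=[8])

with paddle.amp.auto_cast(custom_white_list=["layer_norm", "softmax"]):
    loss = F.cross_entropy(model(x), y)

scaled = scaler.scale(loss)          # scale the loss to avoid fp16 underflow
scaled.backward()
scaler.minimize(optimizer, scaled)   # unscales gradients, then steps the optimizer
optimizer.clear_grad()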