Example 1
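# Example 1 fine-tunes ErnieDocForQuestionAnswering on an extractive MRC task,
# streaming long documents through the model's recurrence memory.
# Assumed preamble (not part of the original snippet; module paths follow
# PaddleNLP conventions and may differ between versions):
#
#   import os
#   import time
#   from functools import partial
#
#   import paddle
#   from paddlenlp.datasets import load_dataset
#   from paddlenlp.transformers import ErnieDocForQuestionAnswering, LinearDecayWithWarmup
#   from paddlenlp.ops.optimizer import AdamWDL
#   from paddlenlp.utils.log import logger
#
# DATASET_INFO, MRCIterator, CrossEntropyLossForQA, EM_AND_F1, init_memory,
# set_seed and evaluate are helpers local to the example script.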
def do_train(args):
    set_seed(args)

    DEV, TEST, TOKENIZER_CLASS = DATASET_INFO[args.dataset]
    tokenizer = TOKENIZER_CLASS.from_pretrained(args.model_name_or_path)

    train_ds, eval_ds, test_ds = load_dataset(args.dataset,
                                              splits=['train', DEV, TEST])

    paddle.set_device(args.device)
    trainer_num = paddle.distributed.get_world_size()
    if trainer_num > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            logger.info("init checkpoint from %s" % args.model_name_or_path)

    model = ErnieDocForQuestionAnswering.from_pretrained(
        args.model_name_or_path, dropout=args.dropout)
    model_config = model.ernie_doc.config
    if trainer_num > 1:
        model = paddle.DataParallel(model)

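    # MRCIterator (a helper from the example) yields fixed-length segments of
    # each document so the model can carry `memory_len` hidden states from one
    # segment to the next; train/eval/test each get their own iterator.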
    train_ds_iter = MRCIterator(train_ds,
                                args.batch_size,
                                tokenizer,
                                trainer_num,
                                trainer_id=rank,
                                memory_len=model_config["memory_len"],
                                max_seq_length=args.max_seq_length,
                                random_seed=args.seed)

    eval_ds_iter = MRCIterator(eval_ds,
                               args.batch_size,
                               tokenizer,
                               trainer_num,
                               trainer_id=rank,
                               memory_len=model_config["memory_len"],
                               max_seq_length=args.max_seq_length,
                               mode="eval",
                               random_seed=args.seed)

    test_ds_iter = MRCIterator(test_ds,
                               args.batch_size,
                               tokenizer,
                               trainer_num,
                               trainer_id=rank,
                               memory_len=model_config["memory_len"],
                               max_seq_length=args.max_seq_length,
                               mode="test",
                               random_seed=args.seed)

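    # DataLoader.from_generator builds an asynchronous loader whose batches
    # come from the iterator bound via set_batch_generator on the current device.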
    train_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                           return_list=True)
    train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device())

    eval_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    eval_dataloader.set_batch_generator(eval_ds_iter, paddle.get_device())

    test_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    test_dataloader.set_batch_generator(test_ds_iter, paddle.get_device())

    num_training_examples = train_ds_iter.get_num_examples()
    num_training_steps = args.epochs * num_training_examples // args.batch_size // trainer_num
    logger.info("Device count: %d, trainer_id: %d" % (trainer_num, rank))
    logger.info("Num train examples: %d" % num_training_examples)
    logger.info("Max train steps: %d" % num_training_steps)
    logger.info("Num warmup steps: %d" %
                int(num_training_steps * args.warmup_proportion))

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Map each parameter's generated name back to its structured name so
    # AdamWDL can apply layer-wise learning-rate decay.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n

    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        n_layers=model_config["num_hidden_layers"],
                        layerwise_decay=args.layerwise_decay,
                        name_dict=name_dict)

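    # init_memory (from the example) is expected to build the initial
    # recurrence memory: one zero tensor of shape
    # [batch_size, memory_length, hidden_size] per transformer layer.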
    global_steps = 0
    create_memory = partial(init_memory, args.batch_size, args.memory_length,
                            model_config["hidden_size"],
                            model_config["num_hidden_layers"])

    criterion = CrossEntropyLossForQA()

    memories = create_memory()
    tic_train = time.time()
    best_avg_metric = -1
    for epoch in range(args.epochs):
        train_ds_iter.shuffle_sample()
        train_dataloader.set_batch_generator(train_ds_iter,
                                             paddle.get_device())
        for step, batch in enumerate(train_dataloader, start=1):
            global_steps += 1
            input_ids, position_ids, token_type_ids, attn_mask, start_position, \
                end_position, qids, gather_idx, need_cal_loss = batch
            start_logits, end_logits, memories = model(input_ids, memories,
                                                       token_type_ids,
                                                       position_ids, attn_mask)

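            # gather_idx selects the rows used for loss computation, and
            # need_cal_loss (0 or 1) disables the loss for batches that
            # should be skipped.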
            start_logits, end_logits, qids, start_position, end_position = list(
                map(lambda x: paddle.gather(x, gather_idx), [
                    start_logits, end_logits, qids, start_position,
                    end_position
                ]))
            loss = criterion([start_logits, end_logits],
                             [start_position, end_position]) * need_cal_loss

            mean_loss = loss.mean()
            mean_loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_steps % args.logging_steps == 0:
                logger.info(
                    "train: global step %d, epoch: %d, loss: %f, lr: %f, speed: %.2f step/s"
                    % (global_steps, epoch, mean_loss, lr_scheduler.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()

            if global_steps % args.save_steps == 0:
                # Evaluate
                logger.info("Eval:")
                EM, F1, AVG = evaluate(args, model, criterion,
                                       EM_AND_F1(), eval_dataloader,
                                       create_memory(), tokenizer)
                if rank == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % (global_steps))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    if best_avg_metric < AVG:
                        best_avg_metric = AVG
                        output_dir = os.path.join(args.output_dir,
                                                  "best_model")
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)

            if args.max_steps > 0 and global_steps >= args.max_steps:
                return
    logger.info("Test:")
    evaluate(args, model, criterion, EM_AND_F1(), test_dataloader,
             create_memory(), tokenizer)
    if rank == 0:
        output_dir = os.path.join(args.output_dir, "model_%d" % (global_steps))
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)
Example 2

def do_train(args):
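    # Context note (an assumption, not part of the original snippet): this is
    # the sequence-labeling counterpart of Example 1, fine-tuning
    # ErnieDocForTokenClassification. It relies on the same assumed preamble,
    # plus SequenceLabelingIterator from the example's data helpers and
    # ChunkEvaluator from paddlenlp.metrics.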
    set_seed(args)
    tokenizer_class, eval_name, test_name = DATASET_INFO[args.dataset]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    train_ds, eval_ds, test_ds = load_dataset(
        args.dataset, splits=["train", eval_name, test_name])
    num_classes = len(train_ds.label_list)
    no_entity_id = num_classes - 1

    paddle.set_device(args.device)
    trainer_num = paddle.distributed.get_world_size()
    if trainer_num > 1:
        paddle.distributed.init_parallel_env()
    rank = paddle.distributed.get_rank()
    if rank == 0:
        if os.path.exists(args.model_name_or_path):
            logger.info("init checkpoint from %s" % args.model_name_or_path)
    model = ErnieDocForTokenClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes)
    model_config = model.ernie_doc.config
    if trainer_num > 1:
        model = paddle.DataParallel(model)

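    # Like the MRC iterators above, these segment-wise iterators carry the
    # recurrence memory across batches; no_entity_id (the last class index)
    # is used to pad the label sequence with the non-entity class.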
    train_ds_iter = SequenceLabelingIterator(
        train_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        random_seed=args.seed,
        no_entity_id=no_entity_id)
    eval_ds_iter = SequenceLabelingIterator(
        eval_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        mode="eval",
        no_entity_id=no_entity_id)
    test_ds_iter = SequenceLabelingIterator(
        test_ds,
        args.batch_size,
        tokenizer,
        trainer_num,
        trainer_id=rank,
        memory_len=model_config["memory_len"],
        max_seq_length=args.max_seq_length,
        mode="test",
        no_entity_id=no_entity_id)

    train_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                           return_list=True)
    train_dataloader.set_batch_generator(train_ds_iter, paddle.get_device())
    eval_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    eval_dataloader.set_batch_generator(eval_ds_iter, paddle.get_device())
    test_dataloader = paddle.io.DataLoader.from_generator(capacity=70,
                                                          return_list=True)
    test_dataloader.set_batch_generator(test_ds_iter, paddle.get_device())

    num_training_examples = train_ds_iter.get_num_examples()
    num_training_steps = args.epochs * num_training_examples // args.batch_size // trainer_num
    logger.info("Device count: %d, trainer_id: %d" % (trainer_num, rank))
    logger.info("Num train examples: %d" % num_training_examples)
    logger.info("Max train steps: %d" % num_training_steps)
    logger.info("Num warmup steps: %d" %
                int(num_training_steps * args.warmup_proportion))

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_proportion)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Map each parameter's generated name back to its structured name so
    # AdamWDL can apply layer-wise learning-rate decay.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n

    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        n_layers=model_config["num_hidden_layers"],
                        layerwise_decay=args.layerwise_decay,
                        name_dict=name_dict)

    criterion = paddle.nn.loss.CrossEntropyLoss()
    metric = ChunkEvaluator(label_list=train_ds.label_list)

    global_steps = 0

    create_memory = partial(init_memory, args.batch_size, args.memory_length,
                            model_config["hidden_size"],
                            model_config["num_hidden_layers"])
    # Create the initial (zero-filled) recurrence memory
    memories = create_memory()
    tic_train = time.time()
    best_f1 = 0
    for epoch in range(args.epochs):
        train_ds_iter.shuffle_sample()
        train_dataloader.set_batch_generator(train_ds_iter,
                                             paddle.get_device())
        for step, batch in enumerate(train_dataloader, start=1):
            global_steps += 1
            input_ids, position_ids, token_type_ids, attn_mask, labels, lengths, qids, \
                gather_idx, need_cal_loss = batch
            logits, memories = model(input_ids, memories, token_type_ids,
                                     position_ids, attn_mask)
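            # Keep only the rows selected by gather_idx before computing the
            # token-classification loss; need_cal_loss masks out batches whose
            # loss should be skipped.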
            logits, labels = list(
                map(lambda x: paddle.gather(x, gather_idx), [logits, labels]))

            loss = criterion(logits, labels) * need_cal_loss
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()

            if global_steps % args.logging_steps == 0:
                logger.info(
                    "train: global step %d, epoch: %d, loss: %f, lr: %f, speed: %.2f step/s"
                    % (global_steps, epoch, loss, lr_scheduler.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_steps % args.save_steps == 0:
                # Evaluate
                logger.info("Eval:")
                precision, recall, f1_score = evaluate(model, metric,
                                                       eval_dataloader,
                                                       create_memory())
                # Save
                if rank == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d" % (global_steps))
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    model_to_save.save_pretrained(output_dir)
                    tokenizer.save_pretrained(output_dir)
                    if f1_score > best_f1:
                        logger.info("Save best model......")
                        best_f1 = f1_score
                        best_model_dir = os.path.join(args.output_dir,
                                                      "best_model")
                        if not os.path.exists(best_model_dir):
                            os.makedirs(best_model_dir)
                        model_to_save.save_pretrained(best_model_dir)
                        tokenizer.save_pretrained(best_model_dir)

            if args.max_steps > 0 and global_steps >= args.max_steps:
                return

    logger.info("Final test result:")
    precision, recall, f1_score = evaluate(model, metric, test_dataloader,
                                           create_memory())
Example 3
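# Example 3 fine-tunes an ERNIE-M sequence classifier on XNLI, either in the
# cross-lingual-transfer setting (train on English only) or translate-train-all
# (concatenate the translated training sets of all languages).
# Assumed preamble (not part of the original snippet; paths may vary by
# PaddleNLP version):
#
#   import math
#   import os
#   import time
#   from functools import partial
#
#   import paddle
#   import paddle.nn as nn
#   from paddle.io import DataLoader, DistributedBatchSampler
#   from paddle.metric import Accuracy
#   from paddlenlp.data import Pad, Stack, Tuple
#   from paddlenlp.datasets import load_dataset
#   from paddlenlp.transformers import (AutoModelForSequenceClassification,
#                                       AutoTokenizer, LinearDecayWithWarmup)
#   from paddlenlp.ops.optimizer import AdamWDL
#
# all_languages, convert_example, XnliDataset, get_test_dataloader, set_seed
# and evaluate are helpers local to the example script.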
def do_train(args):
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name_or_path)
    trans_func = partial(convert_example,
                         tokenizer=tokenizer,
                         max_seq_length=args.max_seq_length)
    if args.task_type == "cross-lingual-transfer":
        train_ds = load_dataset("xnli", "en", splits="train")
        train_ds = train_ds.map(trans_func, lazy=True)
    elif args.task_type == "translate-train-all":
        all_train_ds = []
        for language in all_languages:
            train_ds = load_dataset("xnli", language, splits="train")
            all_train_ds.append(train_ds.map(trans_func, lazy=True))
        train_ds = XnliDataset(all_train_ds)
    train_batch_sampler = DistributedBatchSampler(train_ds,
                                                  batch_size=args.batch_size,
                                                  shuffle=True)
    batchify_fn = lambda samples, fn=Tuple(
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # input_ids
        Pad(axis=0, pad_val=tokenizer.pad_token_id, dtype="int64"),  # position_ids
        Pad(axis=0, pad_val=0, dtype="int64"),  # attention_mask
        Stack(dtype="int64")  # labels
    ): fn(samples)
    train_data_loader = DataLoader(dataset=train_ds,
                                   batch_sampler=train_batch_sampler,
                                   collate_fn=batchify_fn,
                                   num_workers=0,
                                   return_list=True)

    num_classes = 3
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name_or_path, num_classes=num_classes, dropout=args.dropout)
    n_layers = model.ernie_m.config['num_hidden_layers']
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

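    # Total optimization steps: either capped explicitly by --max_steps or
    # derived as steps_per_epoch * num_train_epochs.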
    if args.max_steps > 0:
        num_training_steps = args.max_steps
        num_train_epochs = math.ceil(num_training_steps /
                                     len(train_data_loader))
    else:
        num_training_steps = len(train_data_loader) * args.num_train_epochs
        num_train_epochs = args.num_train_epochs

    warmup = args.warmup_steps if args.warmup_steps > 0 else args.warmup_proportion

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, warmup)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    # Map each parameter's generated name back to its structured name so
    # AdamWDL can apply layer-wise learning-rate decay.
    name_dict = dict()
    for n, p in model.named_parameters():
        name_dict[p.name] = n
    optimizer = AdamWDL(learning_rate=lr_scheduler,
                        beta1=0.9,
                        beta2=0.999,
                        epsilon=args.adam_epsilon,
                        parameters=model.parameters(),
                        weight_decay=args.weight_decay,
                        n_layers=n_layers,
                        layerwise_decay=args.layerwise_decay,
                        apply_decay_param_fun=lambda x: x in decay_params,
                        name_dict=name_dict)

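    # With --use_amp, GradScaler scales the loss to avoid fp16 gradient
    # underflow during the backward pass.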
    loss_fct = nn.CrossEntropyLoss()
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=args.scale_loss)
    metric = Accuracy()

    global_step = 0
    tic_train = time.time()
    for epoch in range(num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, position_ids, attention_mask, labels = batch
            with paddle.amp.auto_cast(
                    args.use_amp,
                    custom_white_list=["layer_norm", "softmax", "gelu"]):
                logits = model(input_ids, position_ids, attention_mask)
                loss = loss_fct(logits, labels)
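            # Under AMP, scaler.minimize() unscales the gradients and applies
            # the optimizer step, so optimizer.step() is only called on the
            # plain fp32 path.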
            if args.use_amp:
                scaled_loss = scaler.scale(loss)
                scaled_loss.backward()
                scaler.minimize(optimizer, scaled_loss)
            else:
                loss.backward()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0 or global_step == num_training_steps:
                for language in all_languages:
                    tic_eval = time.time()
                    test_data_loader = get_test_dataloader(
                        args, language, batchify_fn, trans_func)
                    evaluate(model, loss_fct, metric, test_data_loader,
                             language)
                    print("eval done total : %s s" % (time.time() - tic_eval))
                    if paddle.distributed.get_rank() == 0:
                        output_dir = os.path.join(
                            args.output_dir,
                            "ernie_m_ft_model_%d.pdparams" % (global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # Need better way to get inner model of DataParallel
                        model_to_save = model._layers if isinstance(
                            model, paddle.DataParallel) else model
                        model_to_save.save_pretrained(output_dir)
                        tokenizer.save_pretrained(output_dir)
            if global_step >= num_training_steps:
                break
        if global_step >= num_training_steps:
            break
    if paddle.distributed.get_rank() == 0:
        output_dir = os.path.join(
            args.output_dir, "ernie_m_final_model_%d.pdparams" % global_step)
        if not os.path.exists(output_dir):
            os.makedirs(output_dir)
        # Need better way to get inner model of DataParallel
        model_to_save = model._layers if isinstance(
            model, paddle.DataParallel) else model
        model_to_save.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)