Example #1
    def test_forward(self):
        config = copy.deepcopy(self.config)
        del config['batch_size']
        del config['seq_len']

        electra = ElectraModel(**config)
        model = self.TEST_MODEL_CLASS(electra)
        input_ids = paddle.to_tensor(self.input_ids, dtype="int64")
        self.output = model(input_ids)
        self.check_testcase()
Example #2
def do_train(args):
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args.seed)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    model_class, tokenizer_class = MODEL_CLASSES['ernie-health']

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())

    if args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-generator']))
        discriminator = ErnieHealthDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + '-discriminator']))
        model = model_class(generator, discriminator)
        args.init_from_ckpt = False
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            with open(os.path.join(args.model_name_or_path, 'run_states.json'),
                      'r') as f:
                config_dict = json.load(f)
                model_name = config_dict['model_name']
            if model_name in pretrained_models:
                generator = ElectraGenerator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-generator']))
                discriminator = ErnieHealthDiscriminator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + '-discriminator']))
                model = model_class(generator, discriminator)
                model.set_state_dict(
                    paddle.load(
                        os.path.join(args.model_name_or_path,
                                     'model_state.pdparams')))
            else:
                raise ValueError(
                    'Initializing a model from a checkpoint requires a valid '
                    'model_name in the model config file. The supported '
                    'model_name values are as follows: {}'.format(
                        tokenizer_class.pretrained_init_configuration.keys()))
        else:
            raise ValueError(
                'Initializing a model requires a model identifier or the '
                'directory where a model is stored. If an identifier is used, '
                'the supported identifiers are as follows: {}. If a directory '
                'is used, make sure init_from_ckpt is set to True.'.format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ErnieHealthPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config['vocab_size'],
        model.gen_weight)

    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    logger.info('start loading data : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))

    train_dataset = MedicalCorpus(data_path=args.input_dir,
                                  tokenizer=tokenizer)
    logger.info('data loading done, total : %s s' % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForErnieHealth(
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        mlm_prob=args.mlm_prob)

    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.batch_size,
        mode='train',
        use_gpu=args.device == 'gpu',
        data_collator=data_collator)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_epochs)
    args.num_epochs = (num_training_steps - 1) // len(train_data_loader) + 1

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)

    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ['bias', 'norm'])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)
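    # When AMP is enabled, create a GradScaler for dynamic loss scaling.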
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    logger.info('start training : %s' %
                (time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())))
    trained_global_step = global_step = 0
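    # Running loss accumulators for the total loss and the generator, RTD, MTS and CSP sub-losses.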
    t_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    log_loss = defaultdict(lambda: paddle.to_tensor([0.0]))
    loss_list = defaultdict(list)
    log_list = []
    tic_train = time.time()

    if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
        optimizer.set_state_dict(
            paddle.load(
                os.path.join(args.model_name_or_path, 'model_state.pdopt')))
        trained_global_step = global_step = config_dict['global_step']
        if trained_global_step < num_training_steps:
            logger.info(
                '[ resume from checkpoint ] %s steps already trained, resuming at step %s'
                % (trained_global_step, trained_global_step + 1))
        else:
            logger.info(
                '[ resume from checkpoint ] %s steps already trained, but the total number of training steps is %s; please check the configuration.'
                % (trained_global_step, num_training_steps))
            exit(0)

    if paddle.distributed.get_rank() == 0:
        writer = LogWriter(os.path.join(args.output_dir, 'loss_log'))

    for epoch in range(args.num_epochs):
        for step, batch in enumerate(train_data_loader):
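            # When resuming from a checkpoint, skip batches that were already consumed in previous runs.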
            if trained_global_step > 0:
                trained_global_step -= 1
                continue
            global_step += 1
            masked_input_ids, input_ids, gen_labels = batch

            if args.use_amp:
                with paddle.amp.auto_cast():
                    gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                        input_ids=masked_input_ids,
                        raw_input_ids=input_ids,
                        generator_labels=gen_labels)
                    loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                        gen_logits, gen_labels, logits_rtd, logits_mts,
                        logits_csp, disc_labels, masks)

                scaled = scaler.scale(loss)
                scaled.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                scaler.minimize(optimizer, scaled)
            else:
                gen_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = model(
                    input_ids=masked_input_ids,
                    raw_input_ids=input_ids,
                    generator_labels=gen_labels)
                loss, gen_loss, rtd_loss, mts_loss, csp_loss = criterion(
                    gen_logits, gen_labels, logits_rtd, logits_mts, logits_csp,
                    disc_labels, masks)
                loss.backward()
                t_loss['loss'] += loss.detach()
                t_loss['gen'] += gen_loss.detach()
                t_loss['rtd'] += rtd_loss.detach()
                t_loss['mts'] += mts_loss.detach()
                t_loss['csp'] += csp_loss.detach()
                optimizer.step()

            lr_scheduler.step()
            optimizer.clear_grad()
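            # Every logging_steps steps, average the losses accumulated since the last log and, in distributed runs, gather them across workers.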
            if global_step % args.logging_steps == 0:
                local_loss = dict([
                    (k, (t_loss[k] - log_loss[k]) / args.logging_steps)
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                ])
                if paddle.distributed.get_world_size() > 1:
                    for k in ['loss', 'gen', 'rtd', 'mts', 'csp']:
                        paddle.distributed.all_gather(loss_list[k],
                                                      local_loss[k])
                    if paddle.distributed.get_rank() == 0:
                        tmp_loss = dict([
                            (k,
                             float((paddle.stack(loss_list[k]).sum() /
                                    len(loss_list[k])).numpy()))
                            for k in ['loss', 'gen', 'rtd', 'mts', 'csp']
                        ])
                        log_str = (
                            'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                            'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                            'seq_contrastive: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                        ).format(
                            global_step, num_training_steps, epoch, step,
                            tmp_loss['loss'], tmp_loss['gen'],
                            tmp_loss['rtd'], tmp_loss['mts'], tmp_loss['csp'],
                            optimizer.get_lr(),
                            (time.time() - tic_train) / args.logging_steps)
                        logger.info(log_str)
                        log_list.append(log_str)
                        writer.add_scalar('generator_loss', tmp_loss['gen'],
                                          global_step)
                        writer.add_scalar('rtd_loss', tmp_loss['rtd'] * 50,
                                          global_step)
                        writer.add_scalar('mts_loss', tmp_loss['mts'] * 20,
                                          global_step)
                        writer.add_scalar('csp_loss', tmp_loss['csp'],
                                          global_step)
                        writer.add_scalar('total_loss', tmp_loss['loss'],
                                          global_step)
                        writer.add_scalar('lr', optimizer.get_lr(),
                                          global_step)
                    loss_list = defaultdict(list)
                else:
                    local_loss = dict([(k, v.numpy()[0])
                                       for k, v in local_loss.items()])
                    log_str = (
                        'global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, '
                        'avg_loss: {4:.15f}, generator: {5:.15f}, rtd: {6:.15f}, multi_choice: {7:.15f}, '
                        'seq_contrastive_loss: {8:.15f}, lr: {9:.10f}, speed: {10:.2f} s/it'
                    ).format(global_step, num_training_steps, epoch, step,
                             local_loss['loss'], local_loss['gen'],
                             local_loss['rtd'], local_loss['mts'],
                             local_loss['csp'], optimizer.get_lr(),
                             (time.time() - tic_train) / args.logging_steps)
                    logger.info(log_str)
                    log_list.append(log_str)
                    loss_dict = {
                        'generator_loss': local_loss['gen'],
                        'rtd_loss': local_loss['rtd'] * 50,
                        'mts_loss': local_loss['mts'] * 20,
                        'csp_loss': local_loss['csp']
                    }
                    for k, v in loss_dict.items():
                        writer.add_scalar('loss/%s' % k, v, global_step)
                    writer.add_scalar('total_loss', local_loss['loss'],
                                      global_step)
                    writer.add_scalar('lr', optimizer.get_lr(), global_step)
                log_loss = dict(t_loss)
                tic_train = time.time()

            if global_step % args.save_steps == 0:
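                # Save model config, run states, weights, tokenizer and optimizer state on rank 0.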
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, 'model_%d.pdparams' % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    config_to_save = copy.deepcopy(
                        model_to_save.discriminator.electra.config)
                    if 'self' in config_to_save:
                        del config_to_save['self']
                    run_states = {
                        'model_name': model_name
                        if args.init_from_ckpt else args.model_name_or_path,
                        'global_step': global_step,
                        'epoch': epoch,
                        'step': step,
                    }
                    with open(os.path.join(output_dir, 'model_config.json'),
                              'w') as f:
                        json.dump(config_to_save, f)
                    with open(os.path.join(output_dir, 'run_states.json'),
                              'w') as f:
                        json.dump(run_states, f)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(output_dir, 'model_state.pdparams'))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, 'model_state.pdopt'))
                    if len(log_list) > 0:
                        with open(os.path.join(output_dir, 'train.log'),
                                  'w') as f:
                            for log in log_list:
                                if len(log.strip()) > 0:
                                    f.write(log.strip() + '\n')
            if global_step >= num_training_steps:
                if paddle.distributed.get_rank() == 0:
                    writer.close()
                return
Example #3
def do_train(args):
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device("gpu" if args.n_gpu else "cpu")
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())
    if args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-generator"]))
        discriminator = ElectraDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-discriminator"]))
        model = model_class(generator, discriminator)
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # load checkpoint
            tokenizer = tokenizer_class.from_pretrained(
                args.model_name_or_path)
            for file_id, file_name in model_class.resource_files_names.items():
                full_file_name = os.path.join(args.model_name_or_path,
                                              file_name)
                # TODO: load the model checkpoint file here.
        else:
            raise ValueError(
                "Initializing a model requires a model identifier or the "
                "path to a model directory. The supported model "
                "identifiers are as follows: {}".format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ElectraPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config["vocab_size"],
        model.gen_weight, model.disc_weight)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    print("start loading data : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    train_dataset = BookCorpus(data_path=args.input_dir,
                               tokenizer=tokenizer,
                               max_seq_length=args.max_seq_length,
                               mode='train')
    print("data loading done, total : %s s" % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForElectra(tokenizer=tokenizer,
                                           max_seq_length=args.max_seq_length,
                                           mlm=True,
                                           mlm_probability=args.mask_prob)

    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.train_batch_size,
        mode='train',
        use_gpu=bool(args.n_gpu),
        data_collator=data_collator)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps, args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
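    # Exclude all bias and LayerNorm parameters from weight decay.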
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    print("start training : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    global_step = 0
    tic_train = time.time()
    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
            global_step += 1
            input_ids, raw_input_ids, gen_labels = batch
            gen_logits, disc_logits, disc_labels = model(
                input_ids=input_ids,
                raw_input_ids=raw_input_ids,
                gen_labels=gen_labels)
            loss = criterion(gen_logits, disc_logits, gen_labels, disc_labels)
            loss.backward()
            optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
            #print("backward done, total %s s" % (time.time() - tic_train))
            #tic_train = time.time()
            if global_step % args.logging_steps == 0:
                print(
                    "global step %d/%d, epoch: %d, batch: %d, rank_id: %s, loss: %f, lr: %.10f, speed: %.4f step/s"
                    % (global_step, num_training_steps, epoch, step,
                       paddle.distributed.get_rank(), loss, optimizer.get_lr(),
                       args.logging_steps / (time.time() - tic_train)))
                tic_train = time.time()
            if global_step % args.save_steps == 0:
                if (not args.n_gpu > 1) or paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(
                        args.output_dir, "model_%d.pdparams" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    # Need a better way to get the inner model of DataParallel.
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    #model_to_save.save_pretrained(output_dir)
                    paddle.save(
                        model.state_dict(),
                        os.path.join(output_dir, "model_state.pdparams"))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, "model_state.pdopt"))
Example #4
def do_train(args):
    if not args.eager_run:
        paddle.enable_static()
    paddle.set_device(args.device)
    if paddle.distributed.get_world_size() > 1:
        paddle.distributed.init_parallel_env()

    set_seed(args)
    worker_init = WorkerInitObj(args.seed + paddle.distributed.get_rank())

    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]

    # Loads or initializes a model.
    pretrained_models = list(tokenizer_class.pretrained_init_configuration.keys(
    ))
    if args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-generator"]))
        discriminator = ElectraDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                args.model_name_or_path + "-discriminator"]))
        model = model_class(generator, discriminator)
        args.init_from_ckpt = False
    else:
        if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
            # Load checkpoint
            tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
            with open(
                    os.path.join(args.model_name_or_path, "run_states.json"),
                    'r') as f:
                config_dict = json.load(f)
                model_name = config_dict["model_name"]
            if model_name in pretrained_models:
                generator = ElectraGenerator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + "-generator"]))
                discriminator = ElectraDiscriminator(
                    ElectraModel(**model_class.pretrained_init_configuration[
                        model_name + "-discriminator"]))
                model = model_class(generator, discriminator)
                model.set_state_dict(
                    paddle.load(
                        os.path.join(args.model_name_or_path,
                                     "model_state.pdparams")))
            else:
                raise ValueError(
                    "Initializing a model from a checkpoint requires a valid "
                    "model_name in the model config file. The supported "
                    "model_name values are as follows: {}".format(
                        tokenizer_class.pretrained_init_configuration.keys()))
        else:
            raise ValueError(
                "Initializing a model requires a model identifier or the "
                "directory where a model is stored. If an identifier is used, "
                "the supported identifiers are as follows: {}. If a directory "
                "is used, make sure init_from_ckpt is set to True.".format(
                    model_class.pretrained_init_configuration.keys()))

    criterion = ElectraPretrainingCriterion(
        getattr(model.generator,
                ElectraGenerator.base_model_prefix).config["vocab_size"],
        model.gen_weight, model.disc_weight)
    if paddle.distributed.get_world_size() > 1:
        model = paddle.DataParallel(model)

    # Loads dataset.
    tic_load_data = time.time()
    print("start loading data : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    train_dataset = BookCorpus(
        data_path=args.input_dir,
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        mode='train')
    print("data loading done, total : %s s" % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForElectra(
        tokenizer=tokenizer,
        max_seq_length=args.max_seq_length,
        mlm=True,
        mlm_probability=args.mask_prob)

    train_data_loader = create_dataloader(
        train_dataset,
        batch_size=args.train_batch_size,
        mode='train',
        use_gpu=args.device == "gpu",
        data_collator=data_collator)

    num_training_steps = args.max_steps if args.max_steps > 0 else (
        len(train_data_loader) * args.num_train_epochs)

    lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps,
                                         args.warmup_steps)

    clip = paddle.nn.ClipGradByGlobalNorm(clip_norm=1.0)
    # Generate parameter names needed to perform weight decay.
    # All bias and LayerNorm parameters are excluded.
    decay_params = [
        p.name for n, p in model.named_parameters()
        if not any(nd in n for nd in ["bias", "norm"])
    ]
    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        grad_clip=clip,
        apply_decay_param_fun=lambda x: x in decay_params)
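    # When AMP is enabled, create a GradScaler for dynamic loss scaling.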
    if args.use_amp:
        scaler = paddle.amp.GradScaler(init_loss_scaling=1024)

    print("start training : %s" %
          (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))
    trained_global_step = global_step = 0
    t_loss = paddle.to_tensor([0.0])
    log_loss = paddle.to_tensor([0.0])
    loss_list = []
    log_list = []
    tic_train = time.time()
    if os.path.isdir(args.model_name_or_path) and args.init_from_ckpt:
        optimizer.set_state_dict(
            paddle.load(
                os.path.join(args.model_name_or_path, "model_state.pdopt")))
        trained_global_step = global_step = config_dict["global_step"]
        if trained_global_step < num_training_steps:
            print(
                "[ resume from checkpoint ] %s steps already trained, resuming at step %s"
                % (trained_global_step, trained_global_step + 1))
        else:
            print(
                "[ resume from checkpoint ] %s steps already trained, but the total number of training steps is %s; please check the configuration."
                % (trained_global_step, num_training_steps))
            exit(0)

    for epoch in range(args.num_train_epochs):
        for step, batch in enumerate(train_data_loader):
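            # When resuming from a checkpoint, skip batches that were already consumed in previous runs.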
            if trained_global_step > 0:
                trained_global_step -= 1
                continue
            global_step += 1
            input_ids, raw_input_ids, gen_labels = batch
            if args.use_amp:
                with paddle.amp.auto_cast():
                    gen_logits, disc_logits, disc_labels, attention_mask = model(
                        input_ids=input_ids,
                        raw_input_ids=raw_input_ids,
                        gen_labels=gen_labels)
                    loss = criterion(gen_logits, disc_logits, gen_labels,
                                     disc_labels, attention_mask)
                scaled = scaler.scale(loss)
                scaled.backward()
                t_loss += loss.detach()
                scaler.minimize(optimizer, scaled)
            else:
                gen_logits, disc_logits, disc_labels, attention_mask = model(
                    input_ids=input_ids,
                    raw_input_ids=raw_input_ids,
                    gen_labels=gen_labels)
                loss = criterion(gen_logits, disc_logits, gen_labels,
                                 disc_labels, attention_mask)
                loss.backward()
                t_loss += loss.detach()
                optimizer.step()
            lr_scheduler.step()
            optimizer.clear_grad()
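            # Every logging_steps steps, log the averaged loss (gathered across workers when running distributed).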
            if global_step % args.logging_steps == 0:
                local_loss = (t_loss - log_loss) / args.logging_steps
                if (paddle.distributed.get_world_size() > 1):
                    paddle.distributed.all_gather(loss_list, local_loss)
                    if paddle.distributed.get_rank() == 0:
                        log_str = (
                            "global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, "
                            "avg_loss: {4:.15f}, lr: {5:.10f}, speed: {6:.2f} s/it"
                        ).format(global_step, num_training_steps, epoch, step,
                                 float((paddle.stack(loss_list).sum() / len(
                                     loss_list)).numpy()),
                                 optimizer.get_lr(),
                                 (time.time() - tic_train) / args.logging_steps)
                        print(log_str)
                        log_list.append(log_str)
                        loss_list = []
                else:
                    log_str = (
                        "global step {0:d}/{1:d}, epoch: {2:d}, batch: {3:d}, "
                        "loss: {4:.15f}, lr: {5:.10f}, speed: {6:.2f} s/it"
                    ).format(global_step, num_training_steps, epoch, step,
                             float(local_loss.numpy()),
                             optimizer.get_lr(),
                             (time.time() - tic_train) / args.logging_steps)
                    print(log_str)
                    log_list.append(log_str)
                log_loss = t_loss
                tic_train = time.time()
            if global_step % args.save_steps == 0:
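                # Save model config, run states, weights, tokenizer and optimizer state on rank 0.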
                if paddle.distributed.get_rank() == 0:
                    output_dir = os.path.join(args.output_dir,
                                              "model_%d.pdparams" % global_step)
                    if not os.path.exists(output_dir):
                        os.makedirs(output_dir)
                    model_to_save = model._layers if isinstance(
                        model, paddle.DataParallel) else model
                    config_to_save = copy.deepcopy(
                        model_to_save.discriminator.electra.config)
                    if 'self' in config_to_save:
                        del config_to_save['self']
                    run_states = {
                        "model_name": model_name
                        if args.init_from_ckpt else args.model_name_or_path,
                        "global_step": global_step,
                        "epoch": epoch,
                        "step": step,
                    }
                    with open(
                            os.path.join(output_dir, "model_config.json"),
                            'w') as f:
                        json.dump(config_to_save, f)
                    with open(
                            os.path.join(output_dir, "run_states.json"),
                            'w') as f:
                        json.dump(run_states, f)
                    paddle.save(model.state_dict(),
                                os.path.join(output_dir,
                                             "model_state.pdparams"))
                    tokenizer.save_pretrained(output_dir)
                    paddle.save(optimizer.state_dict(),
                                os.path.join(output_dir, "model_state.pdopt"))
                    if len(log_list) > 0:
                        with open(os.path.join(output_dir, "train.log"),
                                  'w') as f:
                            for log in log_list:
                                if len(log.strip()) > 0:
                                    f.write(log.strip() + '\n')
            if global_step >= num_training_steps:
                return
Example #5
def main():
    parser = PdArgumentParser(
        (ModelArguments, DataArguments, TrainingArguments))
    model_args, data_args, training_args = parser.parse_args_into_dataclasses()

    training_args.eval_iters = 10
    training_args.test_iters = training_args.eval_iters * 10

    # Log model and data config
    training_args.print_config(model_args, "Model")
    training_args.print_config(data_args, "Data")

    paddle.set_device(training_args.device)

    # Log a short summary on each process:
    logger.warning(
        f"Process rank: {training_args.local_rank}, device: {training_args.device}, world_size: {training_args.world_size}, "
        +
        f"distributed training: {bool(training_args.local_rank != -1)}, 16-bits training: {training_args.fp16}"
    )

    # Detecting last checkpoint.
    last_checkpoint = None
    if os.path.isdir(
            training_args.output_dir
    ) and training_args.do_train and not training_args.overwrite_output_dir:
        last_checkpoint = get_last_checkpoint(training_args.output_dir)
        if last_checkpoint is None and len(os.listdir(
                training_args.output_dir)) > 1:
            raise ValueError(
                f"Output directory ({training_args.output_dir}) already exists and is not empty. "
                "Use --overwrite_output_dir to overcome.")
        elif last_checkpoint is not None and training_args.resume_from_checkpoint is None:
            logger.info(
                f"Checkpoint detected, resuming training at {last_checkpoint}. To avoid this behavior, change "
                "the `--output_dir` or add `--overwrite_output_dir` to train from scratch."
            )

    model_class, tokenizer_class = MODEL_CLASSES['ernie-health']

    # Loads or initializes a model.
    pretrained_models = list(
        tokenizer_class.pretrained_init_configuration.keys())

    if model_args.model_name_or_path in pretrained_models:
        tokenizer = tokenizer_class.from_pretrained(
            model_args.model_name_or_path)
        generator = ElectraGenerator(
            ElectraModel(**model_class.pretrained_init_configuration[
                model_args.model_name_or_path + "-generator"]))
        discriminator = ErnieHealthDiscriminator(
            ElectraModel(**model_class.pretrained_init_configuration[
                model_args.model_name_or_path + "-discriminator"]))
        model = model_class(generator, discriminator)
    else:
        raise ValueError("Only the following models are supported: %s" %
                         (", ".join(pretrained_models)))

    # Loads dataset.
    tic_load_data = time.time()
    logger.info("start loading data : %s" %
                (time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())))

    train_dataset = MedicalCorpus(data_path=data_args.input_dir,
                                  tokenizer=tokenizer)
    logger.info("data loading done, total : %s s" % (time.time() - tic_load_data))

    # Reads data and generates mini-batches.
    data_collator = DataCollatorForErnieHealth(
        tokenizer=tokenizer,
        max_seq_length=data_args.max_seq_length,
        mlm_prob=data_args.masked_lm_prob,
        return_dict=True)

    class CriterionWrapper(paddle.nn.Layer):
        """Wraps ErnieHealthPretrainingCriterion so that it matches the
        (output, labels) interface expected by the Trainer.
        """
        def __init__(self):
            """CriterionWrapper
            """
            super(CriterionWrapper, self).__init__()
            self.criterion = ErnieHealthPretrainingCriterion(
                getattr(
                    model.generator,
                    ElectraGenerator.base_model_prefix).config["vocab_size"],
                model.gen_weight)

        def forward(self, output, labels):
            """forward function

            Args:
                output (tuple): generator_logits, logits_rtd, logits_mts, logits_csp, disc_labels, mask
                labels (tuple): generator_labels

            Returns:
                Tensor: final loss.
            """
            generator_logits, logits_rtd, logits_mts, logits_csp, disc_labels, masks = output
            generator_labels = labels

            loss, gen_loss, rtd_loss, mts_loss, csp_loss = self.criterion(
                generator_logits, generator_labels, logits_rtd, logits_mts,
                logits_csp, disc_labels, masks)

            return loss

    trainer = Trainer(
        model=model,
        criterion=CriterionWrapper(),
        args=training_args,
        data_collator=data_collator,
        train_dataset=train_dataset if training_args.do_train else None,
        eval_dataset=None,
        tokenizer=tokenizer,
    )

    checkpoint = None
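    # Resume from an explicit checkpoint if given, otherwise from the last checkpoint detected in output_dir.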
    if training_args.resume_from_checkpoint is not None:
        checkpoint = training_args.resume_from_checkpoint
    elif last_checkpoint is not None:
        checkpoint = last_checkpoint

    # Training
    if training_args.do_train:
        train_result = trainer.train(resume_from_checkpoint=checkpoint)
        metrics = train_result.metrics
        trainer.save_model()
        trainer.log_metrics("train", metrics)
        trainer.save_metrics("train", metrics)
        trainer.save_state()