Code Example #1
File: run_pretrain.py  Project: tianxin1860/PaddleNLP
def create_pretrained_dataset(
    args,
    data_file,
    tokenizer,
    data_world_size,
    data_world_rank,
    max_seq_len,
    places=None,
    data_holders=None,
    current_step=0,
):

    train_valid_test_num_samples = [
        args.global_batch_size * args.max_steps,
        args.micro_batch_size * (args.max_steps // args.eval_freq + 1) *
        args.eval_iters * data_world_size,
        args.micro_batch_size * args.test_iters * data_world_size
    ]
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
        data_prefix=data_file,
        args=args,
        tokenizer=tokenizer,
        splits_string=args.split,
        train_valid_test_num_samples=train_valid_test_num_samples,
        max_seq_length=args.max_seq_len,
        masked_lm_prob=args.masked_lm_prob,
        short_seq_prob=args.short_seq_prob,
        seed=args.seed,
        skip_warmup=True,
        binary_head=True,
        max_seq_length_dec=None,
        dataset_type='ernie')

    def _collate_data(data, stack_fn=Stack()):
        num_fields = len(data[0])
        out = [None] * num_fields
        # 0. input_ids,
        # 1. segment_ids,
        # 2. input_mask,
        # 3. masked_lm_positions,
        # 4. masked_lm_labels,
        # 5. next_sentence_labels
        for i in (0, 1, 2, 5):
            out[i] = stack_fn([x[i] for x in data])
        out[5] = out[5].reshape([-1, 1])
        batch_size, seq_length = out[0].shape
        size = num_mask = sum(len(x[3]) for x in data)
        # masked_lm_positions
        # Organized as a 1-D tensor to make gather convenient; alternatively gather_nd could be used
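        # Pad the flattened mask count up to a multiple of 8 before allocating out[3]/out[4].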
        if size % 8 != 0:
            size += 8 - (size % 8)
        out[3] = np.full(size, 0, dtype=np.int32)
        # masked_lm_labels
        out[4] = np.full([size, 1], -1, dtype=np.int64)
        mask_token_num = 0
        for i, x in enumerate(data):
            for j, pos in enumerate(x[3]):
                out[3][mask_token_num] = i * seq_length + pos
                out[4][mask_token_num] = x[4][j]
                mask_token_num += 1

        return out

    def loader(dataset, consumed_samples=0):
        batch_sampler = DistributedBatchSampler(
            dataset,
            batch_size=args.micro_batch_size,
            num_replicas=data_world_size,
            rank=data_world_rank,
            shuffle=False,
            drop_last=True,
            consumed_samples=consumed_samples)
        data_loader = paddle.io.DataLoader(dataset=dataset,
                                           batch_sampler=batch_sampler,
                                           num_workers=args.num_workers,
                                           worker_init_fn=None,
                                           collate_fn=_collate_data,
                                           return_list=False)
        return data_loader

    train_dl = loader(train_ds, args.global_batch_size * current_step)
    valid_dl = loader(
        valid_ds,
        args.micro_batch_size * ((current_step + 1) // args.eval_freq) *
        args.eval_iters * data_world_size)
    test_dl = loader(test_ds, 0)

    return train_dl, valid_dl, test_dl
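The _collate_data helper above flattens each sample's masked positions into a single 1-D index array (zero-padded to a multiple of 8) and collects the matching labels, so predictions can later be pulled out with one gather / gather_nd. Below is a minimal NumPy-only sketch of just that step; the toy positions, labels, seq_length, and variable names are illustrative assumptions, not part of the project code:

import numpy as np

# Toy batch: only fields 3 (masked_lm_positions) and 4 (masked_lm_labels)
# of the sample tuples consumed by _collate_data; values are made up.
data = [
    ([2, 5], [101, 102]),   # sample 0: tokens masked at positions 2 and 5
    ([1],    [103]),        # sample 1: one token masked at position 1
]
seq_length = 8              # assumed padded sequence length of the batch

size = sum(len(positions) for positions, _ in data)
if size % 8 != 0:           # pad the mask count up to a multiple of 8
    size += 8 - (size % 8)

masked_positions = np.full(size, 0, dtype=np.int32)
masked_labels = np.full([size, 1], -1, dtype=np.int64)

mask_token_num = 0
for i, (positions, labels) in enumerate(data):
    for pos, label in zip(positions, labels):
        # (sample index, position) -> single index into the flattened
        # [batch_size * seq_length] view, exactly as in _collate_data.
        masked_positions[mask_token_num] = i * seq_length + pos
        masked_labels[mask_token_num] = label
        mask_token_num += 1

print(masked_positions)        # [2 5 9 0 0 0 0 0]
print(masked_labels.ravel())   # [101 102 103  -1  -1  -1  -1  -1]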
Code Example #2
def create_pretrained_dataset(data_args, training_args, data_file, tokenizer):

    train_valid_test_num_samples = [
        training_args.per_device_train_batch_size * training_args.world_size *
        training_args.max_steps * training_args.gradient_accumulation_steps,
        training_args.per_device_eval_batch_size * training_args.world_size *
        training_args.eval_iters *
        (training_args.max_steps // training_args.eval_steps + 1),
        training_args.per_device_eval_batch_size * training_args.world_size *
        training_args.test_iters,
    ]
    train_ds, valid_ds, test_ds = build_train_valid_test_datasets(
        data_prefix=data_file,
        args=data_args,
        tokenizer=tokenizer,
        splits_string=data_args.split,
        train_valid_test_num_samples=train_valid_test_num_samples,
        max_seq_length=data_args.max_seq_length,
        masked_lm_prob=data_args.masked_lm_prob,
        short_seq_prob=data_args.short_seq_prob,
        seed=training_args.seed,
        skip_warmup=True,
        binary_head=True,
        max_seq_length_dec=None,
        dataset_type='ernie')

    def _collate_data(data, stack_fn=Stack()):
        num_fields = len(data[0])
        out = [None] * num_fields
        # 0. input_ids,
        # 1. segment_ids,
        # 2. input_mask,
        # 3. masked_lm_positions,
        # 4. masked_lm_labels,
        # 5. next_sentence_labels
        for i in (0, 1, 2, 5):
            out[i] = stack_fn([x[i] for x in data])
        out[5] = out[5].reshape([-1, 1])
        batch_size, seq_length = out[0].shape
        size = num_mask = sum(len(x[3]) for x in data)
        # masked_lm_positions
        # Organized as a 1-D tensor to make gather convenient; alternatively gather_nd could be used
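        # Pad the flattened mask count up to a multiple of 8 before allocating out[3]/out[4].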
        if size % 8 != 0:
            size += 8 - (size % 8)
        out[3] = np.full(size, 0, dtype=np.int32)
        # masked_lm_labels
        out[4] = np.full([size, 1], -1, dtype=np.int64)
        mask_token_num = 0
        for i, x in enumerate(data):
            for j, pos in enumerate(x[3]):
                out[3][mask_token_num] = i * seq_length + pos
                out[4][mask_token_num] = x[4][j]
                mask_token_num += 1

        return {
            "input_ids": out[0],
            "token_type_ids": out[1],
            "attention_mask": out[2],
            "masked_positions": out[3],
            "labels": (out[4], out[5]),
        }

    return train_ds, valid_ds, test_ds, _collate_data
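Unlike Example #1, this variant returns the raw datasets together with the collate function instead of ready-made DataLoaders, leaving loader construction to the caller (for example a Trainer). A rough usage sketch follows, assuming data_args, training_args, data_file, and tokenizer have already been prepared as in the example; the batch size, worker count, and other settings are placeholders, not values taken from the project:

import paddle

# Assumed to be prepared elsewhere: data_args, training_args, data_file, tokenizer.
train_ds, valid_ds, test_ds, collate_fn = create_pretrained_dataset(
    data_args, training_args, data_file, tokenizer)

# Wrap the returned dataset and collate function in a plain DataLoader.
train_dl = paddle.io.DataLoader(
    dataset=train_ds,
    batch_size=training_args.per_device_train_batch_size,
    shuffle=False,      # kept False, as in Example #1's DistributedBatchSampler
    drop_last=True,
    collate_fn=collate_fn,
    num_workers=2)      # placeholder worker count

for batch in train_dl:
    # Keys come from the dict built in _collate_data above.
    input_ids = batch["input_ids"]
    break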