Example No. 1
def _get_mindrecord_dataset(directory,
                            train_mode=True,
                            epochs=1,
                            batch_size=1000,
                            line_per_sample=1000,
                            rank_size=None,
                            rank_id=None):
    """
    Get dataset with mindrecord format.

    Args:
        directory (str): Dataset directory.
        train_mode (bool): Whether the dataset is used for train or eval (default=True).
        epochs (int): Dataset epoch size (default=1).
        batch_size (int): Dataset batch size (default=1000).
        line_per_sample (int): The number of samples per line (default=1000).
        rank_size (int): The number of devices; not needed for a single device (default=None).
        rank_id (int): ID of the device; not needed for a single device (default=None).

    Returns:
        Dataset.
    """
    file_prefix_name = 'train_input_part.mindrecord' if train_mode else 'test_input_part.mindrecord'
    file_suffix_name = '00' if train_mode else '0'
    shuffle = train_mode

    if rank_size is not None and rank_id is not None:
        ds = de.MindDataset(os.path.join(directory,
                                         file_prefix_name + file_suffix_name),
                            columns_list=['feat_ids', 'feat_vals', 'label'],
                            num_shards=rank_size,
                            shard_id=rank_id,
                            shuffle=shuffle,
                            num_parallel_workers=8)
    else:
        ds = de.MindDataset(os.path.join(directory,
                                         file_prefix_name + file_suffix_name),
                            columns_list=['feat_ids', 'feat_vals', 'label'],
                            shuffle=shuffle,
                            num_parallel_workers=8)
    ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
    ds = ds.map(operations=(lambda x, y, z:
                            (np.array(x).flatten().reshape(batch_size, 39),
                             np.array(y).flatten().reshape(batch_size, 39),
                             np.array(z).flatten().reshape(batch_size, 1))),
                input_columns=['feat_ids', 'feat_vals', 'label'],
                columns_order=['feat_ids', 'feat_vals', 'label'],
                num_parallel_workers=8)
    ds = ds.repeat(epochs)
    return ds
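A minimal usage sketch for the loader above, assuming its module-level imports (os, numpy as np, mindspore.dataset as de) are in place; the directory and the batch_size/line_per_sample values are placeholders, chosen so that batch_size is a multiple of line_per_sample as the reshape inside the function requires.

# Hypothetical usage: the directory and numeric values below are placeholders.
train_ds = _get_mindrecord_dataset("/path/to/mindrecord_dir",
                                   train_mode=True,
                                   epochs=1,
                                   batch_size=16000,
                                   line_per_sample=1000)
print("batches per epoch:", train_ds.get_dataset_size())

# Distributed variant: pass the device count and this device's id so that
# MindDataset shards the MindRecord file across ranks.
# dist_ds = _get_mindrecord_dataset("/path/to/mindrecord_dir", rank_size=8, rank_id=0)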
Example No. 2
def load_test_data(batch_size=1, data_file=None):
    """Load test dataset."""
    ds = de.MindDataset(data_file,
                        columns_list=["source_eos_ids", "source_eos_mask",
                                      "target_sos_ids", "target_sos_mask",
                                      "target_eos_ids", "target_eos_mask"],
                        shuffle=False)
    type_cast_op = deC.TypeCast(mstype.int32)
    ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
    ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
    ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
    ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
    ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
    ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
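A short usage sketch for load_test_data, assuming the module imports it relies on (mindspore.dataset as de, its c_transforms as deC, mindspore.common.dtype as mstype); the file path is a placeholder.

# Hypothetical call: iterate the test set one batch at a time.
test_ds = load_test_data(batch_size=32, data_file="/path/to/test.mindrecord")
for batch in test_ds.create_dict_iterator():
    source_ids = batch["source_eos_ids"]    # int32 after the TypeCast maps above
    source_mask = batch["source_eos_mask"]
    break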
Example No. 3
def create_dataset(data_path, is_train=True, batch_size=32):
    # imports (os is needed below for the RANK_SIZE/RANK_ID env vars)
    import os
    import mindspore.dataset.engine as de
    import mindspore.dataset.vision.c_transforms as C
    from mindspore.common import set_seed

    set_seed(1)

    # shard
    num_shards = shard_id = None
    rank_size = os.getenv("RANK_SIZE")
    rank_id = os.getenv("RANK_ID")
    if rank_size is not None and rank_id is not None:
        num_shards = int(rank_size)
        shard_id = int(rank_id)

    # define dataset
    ds = de.MindDataset(data_path,
                        columns_list=['data'],
                        shuffle=True,
                        num_shards=num_shards,
                        shard_id=shard_id,
                        num_parallel_workers=8,
                        num_samples=None)

    # map ops
    ds = ds.map(input_columns=["data"], operations=C.Decode())
    ds = ds.map(input_columns=["data"],
                operations=C.Normalize(
                    mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
                    std=[0.229 * 255, 0.224 * 255, 0.225 * 255]))
    ds = ds.map(input_columns=["data"], operations=C.Resize((224, 224)))
    ds = ds.map(input_columns=["data"], operations=C.HWC2CHW())

    # batch & repeat
    ds = ds.batch(batch_size=batch_size, drop_remainder=is_train)
    ds = ds.repeat(count=1)

    return ds
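A sketch showing how the RANK_SIZE/RANK_ID environment variables read inside create_dataset drive sharding; the values and path below are placeholders.

import os

# Hypothetical: pretend this process is rank 0 of a 2-device job, so MindDataset
# splits the file into 2 shards and this process reads shard 0.
os.environ["RANK_SIZE"] = "2"
os.environ["RANK_ID"] = "0"

ds = create_dataset("/path/to/data.mindrecord", is_train=True, batch_size=32)
print("per-device batches:", ds.get_dataset_size())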
Example No. 4
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    Create a train or eval dataset.

    Args:
        dataset_path (str): The path of the dataset.
        do_train (bool): Whether the dataset is used for train or eval.
        repeat_num (int): The repeat times of the dataset. Default: 1.
        batch_size (int): The batch size of the dataset. Default: 32.

    Returns:
        dataset
    """
    device_num = int(os.getenv("RANK_SIZE"))
    try:
        # global_rank_id = int(os.getenv('RANK_ID').split("-")[1].split("custom")[1])
        global_rank_id = int(os.getenv('RANK_ID').split("-")[-1])
    except (AttributeError, ValueError):
        global_rank_id = 0
    rank_id = int(os.getenv('DEVICE_ID')) + global_rank_id * 8

    columns_list = ["data", "label"]
    if do_train:
        ds = de.MindDataset(dataset_path + '/imagenet_train.mindrecord00',
                            columns_list,
                            num_parallel_workers=8,
                            shuffle=True,
                            num_shards=device_num,
                            shard_id=rank_id)
        print("train dataset size", ds.get_dataset_size())
    else:
        padded_sample = {}
        white_io = BytesIO()
        Image.new('RGB', (224, 224), (255, 255, 255)).save(white_io, 'JPEG')
        padded_sample['data'] = white_io.getvalue()
        padded_sample['label'] = -1
        batch_per_step = batch_size * device_num
        print("eval batch per step:", batch_per_step)
        if batch_per_step < 50000:
            if 50000 % batch_per_step == 0:
                num_padded = 0
            else:
                num_padded = batch_per_step - (50000 % batch_per_step)
        else:
            num_padded = batch_per_step - 50000
        print("Padded samples:", num_padded)
        ds = de.MindDataset(dataset_path + '/imagenet_eval.mindrecord0',
                            columns_list,
                            num_parallel_workers=8,
                            shuffle=False,
                            num_shards=device_num,
                            shard_id=rank_id,
                            padded_sample=padded_sample,
                            num_padded=num_padded)
        print("eval dataset size", ds.get_dataset_size())

    image_size = 224
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_size,
                                     scale=(0.08, 1.0),
                                     ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize((365, 365)),
            C.CenterCrop(320),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    # ds = ds.shuffle(buffer_size=100000)
    ds = ds.map(input_columns="data", num_parallel_workers=8, operations=trans)
    ds = ds.map(input_columns="label",
                num_parallel_workers=8,
                operations=type_cast_op)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
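A worked example of the num_padded arithmetic in the eval branch above, assuming the usual 50000-image ImageNet validation set; the batch_size and device_num values are placeholders.

# With batch_size=32 and device_num=8, batch_per_step = 256.
# 50000 % 256 = 80, so 256 - 80 = 176 white images (label -1) are padded on,
# so that every device sees the same number of full batches.
batch_size, device_num = 32, 8
batch_per_step = batch_size * device_num
num_padded = 0 if 50000 % batch_per_step == 0 else batch_per_step - (50000 % batch_per_step)
assert num_padded == 176 and (50000 + num_padded) % batch_per_step == 0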
Example No. 5
def create_dataset(dataset_path,
                   do_train,
                   config,
                   platform,
                   repeat_num=1,
                   batch_size=100):
    """
    Create a train or eval dataset.

    Args:
        dataset_path (str): The path of the dataset.
        do_train (bool): Whether the dataset is used for train or eval.
        config: Config object; image_height is read for the train-time crop size.
        platform (str): Target platform, either "Ascend" or "GPU".
        repeat_num (int): The repeat times of the dataset. Default: 1.
        batch_size (int): The batch size of the dataset. Default: 100.

    Returns:
        dataset
    """
    if platform == "Ascend":
        rank_size = int(os.getenv("RANK_SIZE"))
        rank_id = int(os.getenv("RANK_ID"))
        if rank_size == 1:
            ds = de.MindDataset(dataset_path,
                                num_parallel_workers=8,
                                shuffle=True)
        else:
            ds = de.MindDataset(dataset_path,
                                num_parallel_workers=8,
                                shuffle=True,
                                num_shards=rank_size,
                                shard_id=rank_id)
    elif platform == "GPU":
        if do_train:
            from mindspore.communication.management import get_rank, get_group_size
            ds = de.MindDataset(dataset_path,
                                num_parallel_workers=8,
                                shuffle=True,
                                num_shards=get_group_size(),
                                shard_id=get_rank())
        else:
            ds = de.MindDataset(dataset_path,
                                num_parallel_workers=8,
                                shuffle=False)
    else:
        raise ValueError("Unsupport platform.")

    resize_height = config.image_height
    buffer_size = 1000

    # define map operations
    resize_crop_op = C.RandomCropDecodeResize(resize_height,
                                              scale=(0.08, 1.0),
                                              ratio=(0.75, 1.333))
    horizontal_flip_op = C.RandomHorizontalFlip(prob=0.5)

    color_op = C.RandomColorAdjust(brightness=0.4,
                                   contrast=0.4,
                                   saturation=0.4)
    rescale_op = C.Rescale(1 / 255.0, 0)
    normalize_op = C.Normalize(mean=[0.485, 0.456, 0.406],
                               std=[0.229, 0.224, 0.225])
    change_swap_op = C.HWC2CHW()

    # define python operations
    decode_p = P.Decode()
    resize_p = P.Resize(256, interpolation=Inter.BILINEAR)
    center_crop_p = P.CenterCrop(224)
    totensor = P.ToTensor()
    normalize_p = P.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    composeop = P.ComposeOp(
        [decode_p, resize_p, center_crop_p, totensor, normalize_p])
    if do_train:
        trans = [
            resize_crop_op, horizontal_flip_op, color_op, rescale_op,
            normalize_op, change_swap_op
        ]
    else:
        trans = composeop()
    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image",
                operations=trans,
                num_parallel_workers=8)
    ds = ds.map(input_columns="label_list",
                operations=type_cast_op,
                num_parallel_workers=8)

    # apply shuffle operations
    ds = ds.shuffle(buffer_size=buffer_size)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
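A minimal usage sketch; the SimpleNamespace below stands in for the project's real config object (only image_height is read above), and the path and platform are placeholders.

from types import SimpleNamespace

config = SimpleNamespace(image_height=224)  # hypothetical config object
eval_ds = create_dataset("/path/to/eval.mindrecord",
                         do_train=False,
                         config=config,
                         platform="GPU",
                         repeat_num=1,
                         batch_size=100)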
Example No. 6
def _load_dataset(input_files,
                  schema_file,
                  batch_size,
                  sink_mode=False,
                  rank_size=1,
                  rank_id=0,
                  shuffle=True,
                  drop_remainder=True,
                  is_translate=False):
    """
    Load dataset according to the passed-in params.

    Args:
        input_files (list): Data files.
        schema_file (str): Schema file path.
        batch_size (int): Batch size.
        sink_mode (bool): Whether enable sink mode.
        rank_size (int): Rank size.
        rank_id (int): Rank id.
        shuffle (bool): Whether shuffle dataset.
        drop_remainder (bool): Whether drop the last possibly incomplete batch.
        is_translate (bool): Whether translate the text.

    Returns:
        Dataset, dataset instance.
    """
    if not input_files:
        raise FileNotFoundError("Require at least one dataset.")

    if not (schema_file and os.path.exists(schema_file)
            and os.path.isfile(schema_file)
            and os.path.basename(schema_file).endswith(".json")):
        raise FileNotFoundError(
            "`dataset_schema` must be a existed json file.")

    if not isinstance(sink_mode, bool):
        raise ValueError("`sink` must be type of bool.")

    for datafile in input_files:
        print(f" | Loading {datafile}.")

    if not is_translate:
        ds = de.MindDataset(input_files,
                            columns_list=[
                                "src", "src_padding", "prev_opt", "target",
                                "tgt_padding"
                            ],
                            shuffle=False,
                            num_shards=rank_size,
                            shard_id=rank_id,
                            num_parallel_workers=8)

        ori_dataset_size = ds.get_dataset_size()
        print(f" | Dataset size: {ori_dataset_size}.")
        if shuffle:
            ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
        type_cast_op = deC.TypeCast(mstype.int32)
        ds = ds.map(input_columns="src",
                    operations=type_cast_op,
                    num_parallel_workers=8)
        ds = ds.map(input_columns="src_padding",
                    operations=type_cast_op,
                    num_parallel_workers=8)
        ds = ds.map(input_columns="prev_opt",
                    operations=type_cast_op,
                    num_parallel_workers=8)
        ds = ds.map(input_columns="target",
                    operations=type_cast_op,
                    num_parallel_workers=8)
        ds = ds.map(input_columns="tgt_padding",
                    operations=type_cast_op,
                    num_parallel_workers=8)

        ds = ds.rename(input_columns=[
            "src", "src_padding", "prev_opt", "target", "tgt_padding"
        ],
                       output_columns=[
                           "source_eos_ids", "source_eos_mask",
                           "target_sos_ids", "target_eos_ids",
                           "target_eos_mask"
                       ])
        ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    else:
        ds = de.MindDataset(input_files,
                            columns_list=["src", "src_padding"],
                            shuffle=False,
                            num_shards=rank_size,
                            shard_id=rank_id,
                            num_parallel_workers=8)

        ori_dataset_size = ds.get_dataset_size()
        print(f" | Dataset size: {ori_dataset_size}.")
        if shuffle:
            ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
        type_cast_op = deC.TypeCast(mstype.int32)
        ds = ds.map(input_columns="src",
                    operations=type_cast_op,
                    num_parallel_workers=8)
        ds = ds.map(input_columns="src_padding",
                    operations=type_cast_op,
                    num_parallel_workers=8)

        ds = ds.rename(input_columns=["src", "src_padding"],
                       output_columns=["source_eos_ids", "source_eos_mask"])
        ds = ds.batch(batch_size, drop_remainder=drop_remainder)

    return ds
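A brief usage sketch for _load_dataset, assuming the module imports it needs (os, mindspore.dataset as de, its c_transforms as deC, mindspore.common.dtype as mstype); the MindRecord and schema paths are placeholders, and note that the schema check above requires the JSON file to actually exist on disk.

# Hypothetical call: single-device training pipeline over two MindRecord shards.
train_ds = _load_dataset(input_files=["/path/to/train_00.mindrecord",
                                      "/path/to/train_01.mindrecord"],
                         schema_file="/path/to/schema.json",
                         batch_size=96,
                         rank_size=1,
                         rank_id=0,
                         shuffle=True)
print("train batches:", train_ds.get_dataset_size())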