Example 1
def get_dataset(batch_size=1, repeat_count=1, distribute_file=''):
    """
    get dataset
    """
    _ = distribute_file

    ds = de.TFRecordDataset(
        [cfg.data_file],
        cfg.schema_file,
        columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"])
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    if cfg.task == "Regression":
        type_cast_op_float = C.TypeCast(mstype.float32)
        ds = ds.map(input_columns="label_ids", operations=type_cast_op_float)
    else:
        ds = ds.map(input_columns="label_ids", operations=type_cast_op)
    ds = ds.repeat(repeat_count)

    # apply shuffle operation
    buffer_size = 960
    ds = ds.shuffle(buffer_size=buffer_size)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
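
All of these helpers return a MindSpore Dataset object, so they can be smoke-tested the same way. A minimal sketch, assuming cfg.data_file and cfg.schema_file point at a valid TFRecord/schema pair and the imports used above are in scope:

# Pull one batch and confirm the TypeCast ops took effect.
# create_dict_iterator(output_numpy=True) yields one dict of NumPy arrays per batch.
eval_ds = get_dataset(batch_size=4, repeat_count=1)
for row in eval_ds.create_dict_iterator(output_numpy=True):
    print({name: (value.shape, value.dtype) for name, value in row.items()})
    break  # one batch is enough for a sanity check
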
Example 2
    def convert_dtype(self, ms_dataset):
        """Convert the dataset dtype if the dtype is invalid.

        :param ms_dataset: a dataset object of mindspore
        :return: a dataset object of mindspore after dtype convert
        """
        item = self.dataset[0]
        image, label = item[0], item[1]
        try:
            image_dtype = str(image.dtype)
        except AttributeError:
            image_dtype = None  # leave the image column unchanged if its dtype can't be read
        try:
            label_dtype = str(label.dtype)
        except AttributeError:
            label_dtype = "int64"
        if image_dtype in self.invalid_dtype:
            type_cast_op = C2.TypeCast(self.dtype_map[image_dtype])
            ms_dataset = ms_dataset.map(input_columns="image", operations=type_cast_op)

        if label_dtype in self.invalid_dtype:
            type_cast_op = C2.TypeCast(self.dtype_map[label_dtype])
            ms_dataset = ms_dataset.map(input_columns="label", operations=type_cast_op)

        return ms_dataset
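
The method above assumes the surrounding class defines invalid_dtype and dtype_map; neither is shown here. A purely illustrative sketch of what they might look like (my assumption, not taken from the original class):

# Hypothetical values: the dtypes these examples routinely cast away,
# mapped to the targets the downstream network expects.
invalid_dtype = ("int64", "float64")
dtype_map = {"int64": mstype.int32, "float64": mstype.float32}
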
Example 3
def create_ner_dataset(batch_size=1,
                       repeat_count=1,
                       assessment_method="accuracy",
                       data_file_path=None,
                       schema_file_path=None,
                       do_shuffle=True):
    """create finetune or evaluation dataset"""
    type_cast_op = C.TypeCast(mstype.int32)
    ds = de.TFRecordDataset(
        [data_file_path],
        schema_file_path if schema_file_path != "" else None,
        columns_list=["input_ids", "input_mask", "segment_ids", "label_ids"],
        shuffle=do_shuffle)
    if assessment_method == "Spearman_correlation":
        type_cast_op_float = C.TypeCast(mstype.float32)
        ds = ds.map(operations=type_cast_op_float, input_columns="label_ids")
    else:
        ds = ds.map(operations=type_cast_op, input_columns="label_ids")
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    ds = ds.repeat(repeat_count)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
Example 4
def create_gru_dataset(epoch_count=1,
                       batch_size=1,
                       rank_size=1,
                       rank_id=0,
                       do_shuffle=True,
                       dataset_path=None,
                       is_training=True):
    """create dataset"""
    ds = de.MindDataset(
        dataset_path,
        columns_list=["source_ids", "target_ids", "target_mask"],
        shuffle=do_shuffle,
        num_parallel_workers=10,
        num_shards=rank_size,
        shard_id=rank_id)
    operations = random_teacher_force
    ds = ds.map(operations=operations,
                input_columns=["source_ids", "target_ids", "target_mask"],
                output_columns=["source_ids", "target_ids", "teacher_force"],
                column_order=["source_ids", "target_ids", "teacher_force"])
    type_cast_op = deC.TypeCast(mstype.int32)
    type_cast_op_bool = deC.TypeCast(mstype.bool_)
    ds = ds.map(operations=type_cast_op, input_columns="source_ids")
    ds = ds.map(operations=type_cast_op, input_columns="target_ids")
    ds = ds.map(operations=type_cast_op_bool, input_columns="teacher_force")
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.repeat(1)
    return ds
Example 5
def test_random_apply():
    """Test RandomApply op with valid and invalid op lists and probabilities."""
    ds.config.set_seed(0)

    def test_config(arr, op_list, prob=0.5):
        try:
            data = ds.NumpySlicesDataset(arr,
                                         column_names="col",
                                         shuffle=False)
            data = data.map(input_columns=["col"],
                            operations=ops.RandomApply(op_list, prob))
            res = []
            for i in data.create_dict_iterator(output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    res1 = test_config([[0, 1]], [ops.Duplicate(), ops.Concatenate()])
    assert res1 in [[[0, 1]], [[0, 1, 0, 1]]]
    # test single nested compose
    assert test_config([[0, 1, 2]], [
        ops.Compose([ops.Duplicate(),
                     ops.Concatenate(),
                     ops.Slice([0, 1, 2])])
    ]) == [[0, 1, 2]]
    # test exception
    assert "is not of type (<class 'list'>" in test_config([1, 0],
                                                           ops.TypeCast(
                                                               mstype.int32))
    assert "Input prob is not within the required interval" in test_config(
        [0, 1], [ops.Slice([0, 1])], 1.1)
    assert "is not of type (<class 'float'>" in test_config(
        [1, 0], [ops.TypeCast(mstype.int32)], None)
    assert "op_list with value None is not of type (<class 'list'>" in test_config(
        [1, 0], None)
Example 6
def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_target='Ascend'):
    """
     create train or evaluation dataset for warpctc

     Args:
        dataset_path(str): dataset path
        batch_size(int): batch size of generated dataset, default is 1
        num_shards(int): number of devices
        shard_id(int): rank id
        device_target(str): platform of training, support Ascend and GPU
     """

    dataset = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target)
    ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True, num_shards=num_shards, shard_id=shard_id)
    image_trans = [
        vc.Rescale(1.0 / 255.0, 0.0),
        vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]),
        vc.Resize((m.ceil(cf.captcha_height / 16) * 16, cf.captcha_width)),
        c.TypeCast(mstype.float16)
    ]
    label_trans = [
        c.TypeCast(mstype.int32)
    ]
    ds = ds.map(operations=image_trans, input_columns=["image"], num_parallel_workers=8)
    if device_target == 'Ascend':
        ds = ds.map(operations=transpose_hwc2whc, input_columns=["image"], num_parallel_workers=8)
    else:
        ds = ds.map(operations=transpose_hwc2chw, input_columns=["image"], num_parallel_workers=8)
    ds = ds.map(operations=label_trans, input_columns=["label"], num_parallel_workers=8)

    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
Example 7
def create_ctpn_dataset(mindrecord_file,
                        batch_size=1,
                        repeat_num=1,
                        device_num=1,
                        rank_id=0,
                        is_training=True,
                        num_parallel_workers=4):
    """Creatr deeptext dataset with MindDataset."""
    ds = de.MindDataset(mindrecord_file, columns_list=["image", "annotation"], num_shards=device_num, shard_id=rank_id,\
        num_parallel_workers=8, shuffle=is_training)
    decode = C.Decode()
    ds = ds.map(operations=decode,
                input_columns=["image"],
                num_parallel_workers=1)
    compose_map_func = (lambda image, annotation: preprocess_fn(
        image, annotation, is_training))
    hwc_to_chw = C.HWC2CHW()
    normalize_op = C.Normalize((123.675, 116.28, 103.53),
                               (58.395, 57.12, 57.375))
    type_cast0 = CC.TypeCast(mstype.float32)
    type_cast1 = CC.TypeCast(mstype.float16)
    type_cast2 = CC.TypeCast(mstype.int32)
    type_cast3 = CC.TypeCast(mstype.bool_)
    if is_training:
        ds = ds.map(
            operations=compose_map_func,
            input_columns=["image", "annotation"],
            output_columns=[
                "image", "image_shape", "box", "label", "valid_num"
            ],
            column_order=["image", "image_shape", "box", "label", "valid_num"],
            num_parallel_workers=num_parallel_workers)
        ds = ds.map(operations=[normalize_op, type_cast0],
                    input_columns=["image"],
                    num_parallel_workers=12)
        ds = ds.map(operations=[hwc_to_chw, type_cast1],
                    input_columns=["image"],
                    num_parallel_workers=12)
    else:
        ds = ds.map(
            operations=compose_map_func,
            input_columns=["image", "annotation"],
            output_columns=[
                "image", "image_shape", "box", "label", "valid_num"
            ],
            column_order=["image", "image_shape", "box", "label", "valid_num"],
            num_parallel_workers=num_parallel_workers)

        ds = ds.map(operations=[normalize_op, hwc_to_chw, type_cast1],
                    input_columns=["image"],
                    num_parallel_workers=24)
    # transpose_column from python to c
    ds = ds.map(operations=[type_cast1], input_columns=["image_shape"])
    ds = ds.map(operations=[type_cast1], input_columns=["box"])
    ds = ds.map(operations=[type_cast2], input_columns=["label"])
    ds = ds.map(operations=[type_cast3], input_columns=["valid_num"])
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.repeat(repeat_num)
    return ds
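
The Normalize constants used above are the standard ImageNet per-channel mean and std scaled from the [0, 1] range up to raw pixel values, since normalization here runs before any rescaling. A quick check of that arithmetic:

# ImageNet statistics in [0, 1]; multiplying by 255 reproduces the constants above.
mean = [0.485, 0.456, 0.406]
std = [0.229, 0.224, 0.225]
print([round(m * 255, 3) for m in mean])  # [123.675, 116.28, 103.53]
print([round(s * 255, 3) for s in std])   # [58.395, 57.12, 57.375]
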
Example 8
def test_compose():
    """
    Test C++ and Python Compose Op
    """
    ds.config.set_seed(0)

    def test_config(arr, op_list):
        try:
            data = ds.NumpySlicesDataset(arr,
                                         column_names="col",
                                         shuffle=False)
            data = data.map(input_columns=["col"], operations=op_list)
            res = []
            for i in data.create_dict_iterator(output_numpy=True):
                res.append(i["col"].tolist())
            return res
        except (TypeError, ValueError) as e:
            return str(e)

    # Test simple compose with only 1 op, this would generate a warning
    assert test_config([[1, 0], [3, 4]],
                       ops.Compose([ops.Fill(2)])) == [[2, 2], [2, 2]]
    # Test 1 column -> 2 columns -> 1 -> 2 -> 1
    assert test_config([[1, 0]],
                       ops.Compose([ops.Duplicate(), ops.Concatenate(), ops.Duplicate(), ops.Concatenate()])) \
           == [[1, 0] * 4]
    # Test one Python transform followed by a C transform. Type after OneHot is a float (mixed use-case)
    assert test_config(
        [1, 0], ops.Compose([py_ops.OneHotOp(2),
                             ops.TypeCast(mstype.int32)])) == [[[0, 1]],
                                                               [[1, 0]]]
    # Test exceptions.
    with pytest.raises(TypeError) as error_info:
        ops.Compose([1, ops.TypeCast(mstype.int32)])
    assert "op_list[0] is not a c_transform op (TensorOp) nor a callable pyfunc." in str(
        error_info.value)
    # Test empty op list
    with pytest.raises(ValueError) as error_info:
        test_config([1, 0], ops.Compose([]))
    assert "op_list can not be empty." in str(error_info.value)

    # Test Python compose op
    assert test_config([1, 0],
                       py_ops.Compose([py_ops.OneHotOp(2)])) == [[[0, 1]],
                                                                 [[1, 0]]]
    assert test_config([1, 0],
                       py_ops.Compose([py_ops.OneHotOp(2),
                                       (lambda x: x + x)])) == [[[0, 2]],
                                                                [[2, 0]]]
    # Test nested Python compose op
    assert test_config([1, 0],
                       py_ops.Compose([py_ops.Compose([py_ops.OneHotOp(2)]), (lambda x: x + x)])) \
           == [[[0, 2]], [[2, 0]]]

    with pytest.raises(TypeError) as error_info:
        py_ops.Compose([(lambda x: x + x)])()
    assert "Compose was called without an image. Fix invocation (avoid it being invoked as Compose([...])())." in str(
        error_info.value)
Example 9
def create_poetry_dataset(batch_size, poetry, tokenizer):
    """create poetry dataset method"""
    dt = PoetryDataGenerator(batch_size, poetry, tokenizer)
    ds = de.GeneratorDataset(dt, ["input_ids", "token_type_id", "pad_mask"])
    #ds.set_dataset_size(dt.__len__())
    int_type_cast_op = C.TypeCast(mstype.int32)
    float_type_cast_op = C.TypeCast(mstype.float32)
    ds = ds.map(input_columns="input_ids", operations=int_type_cast_op)
    ds = ds.map(input_columns="token_type_id", operations=int_type_cast_op)
    ds = ds.map(input_columns="pad_mask", operations=float_type_cast_op)
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
Example 10
def create_dataset(dataset_path,
                   do_train,
                   batch_size=16,
                   device_num=1,
                   rank=0):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        batch_size(int): the batch size of dataset. Default: 16.
        device_num (int): Number of shards that the dataset should be divided into (default=1).
        rank (int): The shard ID within num_shards (default=0).

    Returns:
        dataset
    """
    if device_num == 1:
        ds = de.ImageFolderDataset(dataset_path,
                                   num_parallel_workers=8,
                                   shuffle=True)
    else:
        ds = de.ImageFolderDataset(dataset_path,
                                   num_parallel_workers=8,
                                   shuffle=True,
                                   num_shards=device_num,
                                   shard_id=rank)
    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(299),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
        ]
    else:
        trans = [C.Decode(), C.Resize(320), C.CenterCrop(299)]
    trans += [
        C.Normalize(mean=[127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
        C.HWC2CHW(),
        C2.TypeCast(mstype.float32)
    ]

    type_cast_op = C2.TypeCast(mstype.int32)
    ds = ds.map(input_columns="image",
                operations=trans,
                num_parallel_workers=8)
    ds = ds.map(input_columns="label",
                operations=type_cast_op,
                num_parallel_workers=8)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
Example 11
def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
    """
    create a train or eval dataset

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        rank (int): The shard ID within num_shards (default=None).
        group_size (int): Number of shards that the dataset should be divided into (default=None).
        repeat_num(int): the repeat times of dataset. Default: 1.

    Returns:
        dataset
    """
    if group_size == 1:
        ds = de.ImageFolderDataset(dataset_path,
                                   num_parallel_workers=cfg.work_nums,
                                   shuffle=True)
    else:
        ds = de.ImageFolderDataset(dataset_path,
                                   num_parallel_workers=cfg.work_nums,
                                   shuffle=True,
                                   num_shards=group_size,
                                   shard_id=rank)
    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(224),
            C.RandomHorizontalFlip(prob=0.5),
            C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
        ]
    else:
        trans = [C.Decode(), C.Resize(256), C.CenterCrop(224)]
    trans += [
        toBGR(),
        C.Rescale(1.0 / 255.0, 0.0),
        # C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
        C.HWC2CHW(),
        C2.TypeCast(mstype.float32)
    ]

    type_cast_op = C2.TypeCast(mstype.int32)
    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=cfg.work_nums)
    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=cfg.work_nums)
    # apply batch operations
    ds = ds.batch(cfg.batch_size, drop_remainder=True)

    return ds
Example 12
def create_tinybert_dataset(batch_size=32,
                            device_num=1,
                            rank=0,
                            do_shuffle="true",
                            data_dir=None,
                            data_type='tfrecord',
                            seq_length=128,
                            task_type=mstype.int32,
                            drop_remainder=True):
    """create tinybert dataset"""
    if isinstance(data_dir, list):
        data_files = data_dir
    else:
        data_files = [data_dir]

    columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]

    shuffle = (do_shuffle == "true")

    if data_type == 'mindrecord':
        ds = de.MindDataset(data_files,
                            columns_list=columns_list,
                            shuffle=shuffle,
                            num_shards=device_num,
                            shard_id=rank)
    else:
        ds = de.TFRecordDataset(data_files,
                                columns_list=columns_list,
                                shuffle=shuffle,
                                num_shards=device_num,
                                shard_id=rank,
                                shard_equal_rows=(device_num == 1))

    if device_num == 1 and shuffle is True:
        ds = ds.shuffle(10000)

    type_cast_op = C.TypeCast(mstype.int32)
    slice_op = C.Slice(slice(0, seq_length, 1))
    label_type = mstype.int32 if task_type == 'classification' else mstype.float32
    ds = ds.map(operations=[type_cast_op, slice_op],
                input_columns=["segment_ids"])
    ds = ds.map(operations=[type_cast_op, slice_op],
                input_columns=["input_mask"])
    ds = ds.map(operations=[type_cast_op, slice_op],
                input_columns=["input_ids"])
    ds = ds.map(operations=[C.TypeCast(label_type), slice_op],
                input_columns=["label_ids"])
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=drop_remainder)

    return ds
Example 13
def generate_mnist_dataset(data_path,
                           batch_size=32,
                           repeat_size=1,
                           samples=None,
                           num_parallel_workers=1,
                           sparse=True):
    """
    create dataset for training or testing
    """
    # define dataset
    ds1 = ds.MnistDataset(data_path, num_samples=samples)

    # define operation parameters
    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    resize_op = CV.Resize((resize_height, resize_width),
                          interpolation=Inter.LINEAR)
    rescale_op = CV.Rescale(rescale, shift)
    hwc2chw_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    # apply map operations on images
    if not sparse:
        one_hot_enco = C.OneHot(10)
        ds1 = ds1.map(input_columns="label",
                      operations=one_hot_enco,
                      num_parallel_workers=num_parallel_workers)
        type_cast_op = C.TypeCast(mstype.float32)
    ds1 = ds1.map(input_columns="label",
                  operations=type_cast_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image",
                  operations=resize_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image",
                  operations=rescale_op,
                  num_parallel_workers=num_parallel_workers)
    ds1 = ds1.map(input_columns="image",
                  operations=hwc2chw_op,
                  num_parallel_workers=num_parallel_workers)

    # apply DatasetOps
    buffer_size = 10000
    ds1 = ds1.shuffle(buffer_size=buffer_size)
    ds1 = ds1.batch(batch_size, drop_remainder=True)
    ds1 = ds1.repeat(repeat_size)

    return ds1
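
With sparse=False, the OneHot/TypeCast pair turns the label column into a float32 one-hot vector of length 10, which is what dense-target losses expect. A minimal check, assuming a local MNIST copy (the path below is illustrative only):

dense_ds = generate_mnist_dataset("./MNIST_Data/test", batch_size=2, sparse=False)
for row in dense_ds.create_dict_iterator(output_numpy=True):
    # expect image (2, 1, 32, 32) and label (2, 10) float32
    print(row["image"].shape, row["label"].shape, row["label"].dtype)
    break
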
Example 14
def create_dataset_dp(batch_size, data_path, device_num=1, rank=0, drop=True, data_start_index=0,
                   eod_id=9):
    """
    Create dataset using data parallel.

    Inputs:
        batch_size: batch size
        data_path: path of your MindRecord files
        device_num: total device number
        rank: current rank id
        drop: whether drop remainder
        eod_id: the id for <EOD>

    Returns:
        dataset: the dataset for training or evaluating
    """
    ds.config.set_seed(1)
    home_path = os.path.join(os.getcwd(), data_path)
    files = os.listdir(data_path)
    
    dis = int(batch_size / device_num)
    if dis < 1:
        raise ValueError("Batch size / device_num should be positive, but found {}".format(dis))

    data = [
        os.path.join(home_path, name) for name in files
        if not name.endswith(".db")
    ]
    data.sort(key=lambda x: int(x[x.find("mindrecord")+10:]))
    print(data)

    if data_start_index >= len(data):
        raise ValueError(f"data start index {data_start_index} is larger than dataset length {len(data)}")
    dataset = ds.MindDataset(data[data_start_index:], columns_list=["input_ids"], shuffle=False)
    type_cast_op = C.TypeCast(mstype.int32)
    type_cast_op_float = C.TypeCast(mstype.float16)

    map_func = (lambda input_ids: get_input_data_from_batch(input_ids, eod_id, rank, dis))
    dataset = dataset.batch(batch_size, drop_remainder=drop)
    dataset = dataset.map(operations=map_func, input_columns=["input_ids"],
                          output_columns=["input_ids", "position_id", "attention_mask"],
                          column_order=["input_ids", "position_id", "attention_mask"])
    dataset = dataset.map(input_columns="position_id", operations=type_cast_op)
    dataset = dataset.map(input_columns="attention_mask", operations=type_cast_op_float)

    dataset = dataset.map(input_columns="input_ids", operations=type_cast_op)
    dataset = dataset.repeat(1)
    return dataset
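
Note the ordering in this example: batch runs before the map, so get_input_data_from_batch receives an entire batch of input_ids and must return one array per declared output column. The real helper is not shown; a purely hypothetical stand-in with the same column contract might look like this:

import numpy as np

def get_input_data_from_batch_stub(input_ids, eod_id, rank, dis):
    """Illustrative only: mirrors the input_ids -> (input_ids, position_id, attention_mask) contract."""
    batch, seq_len = input_ids.shape
    position_id = np.tile(np.arange(seq_len, dtype=np.int32), (batch, 1))
    attention_mask = np.ones((batch, seq_len, seq_len), dtype=np.float16)
    return input_ids, position_id, attention_mask
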
Example 15
def create_squad_dataset(batch_size=1,
                         repeat_count=1,
                         data_file_path=None,
                         schema_file_path=None,
                         is_training=True,
                         do_shuffle=True):
    """create finetune or evaluation dataset"""
    type_cast_op = C.TypeCast(mstype.int32)
    if is_training:
        ds = de.TFRecordDataset(
            [data_file_path],
            schema_file_path if schema_file_path != "" else None,
            columns_list=[
                "input_ids", "input_mask", "segment_ids", "start_positions",
                "end_positions", "unique_ids", "is_impossible"
            ],
            shuffle=do_shuffle)
        ds = ds.map(operations=type_cast_op, input_columns="start_positions")
        ds = ds.map(operations=type_cast_op, input_columns="end_positions")
    else:
        ds = de.TFRecordDataset(
            [data_file_path],
            schema_file_path if schema_file_path != "" else None,
            columns_list=[
                "input_ids", "input_mask", "segment_ids", "unique_ids"
            ])
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    ds = ds.repeat(repeat_count)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
Example 16
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="CPU"):
    """create train or eval cifar10 dataset"""
    data_set = ds.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=True)

    # define map operations
    trans = []
    if do_train:
        trans += [
            C.RandomCrop((32, 32), (4, 4, 4, 4)),
            C.RandomHorizontalFlip(prob=0.5)
        ]

    trans += [
        C.Resize((48, 48)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    data_set = data_set.map(operations=type_cast_op, input_columns="label", num_parallel_workers=8)
    data_set = data_set.map(operations=trans, input_columns="image", num_parallel_workers=8)

    # apply batch operations
    data_set = data_set.shuffle(buffer_size=10)
    data_set = data_set.batch(batch_size, drop_remainder=False)
    # apply dataset repeat operation
    data_set = data_set.repeat(repeat_num)

    return data_set
Example 17
def create_bert_dataset(device_num=1, rank=0, do_shuffle="true", data_dir=None, schema_dir=None):
    """create train dataset"""
    # apply repeat operations
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
        if "tf_record" in file_name:
            data_files.append(os.path.join(data_dir, file_name))
    print(data_files)
    ds = de.TFRecordDataset(data_files, schema_dir if schema_dir != "" else None,
                            columns_list=["input_ids", "input_mask", "segment_ids", "next_sentence_labels",
                                          "masked_lm_positions", "masked_lm_ids", "masked_lm_weights"],
                            shuffle=de.Shuffle.FILES if do_shuffle == "true" else False,
                            num_shards=device_num, shard_id=rank, shard_equal_rows=True)
    ori_dataset_size = ds.get_dataset_size()
    print('origin dataset size: ', ori_dataset_size)
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(operations=type_cast_op, input_columns="masked_lm_ids")
    ds = ds.map(operations=type_cast_op, input_columns="masked_lm_positions")
    ds = ds.map(operations=type_cast_op, input_columns="next_sentence_labels")
    ds = ds.map(operations=type_cast_op, input_columns="segment_ids")
    ds = ds.map(operations=type_cast_op, input_columns="input_mask")
    ds = ds.map(operations=type_cast_op, input_columns="input_ids")
    # apply batch operations
    ds = ds.batch(cfg.batch_size, drop_remainder=True)
    logger.info("data size: {}".format(ds.get_dataset_size()))
    logger.info("repeat count: {}".format(ds.get_repeat_count()))
    return ds
Example 18
def create_dataset(data_path, batch_size=32, repeat_size=1,
                   num_parallel_workers=1):
    """
    create dataset for train or test
    """
    # define dataset
    mnist_ds = ds.MnistDataset(data_path)

    resize_height, resize_width = 32, 32
    rescale = 1.0 / 255.0
    shift = 0.0
    rescale_nml = 1 / 0.3081
    shift_nml = -1 * 0.1307 / 0.3081

    # define map operations
    resize_op = CV.Resize((resize_height, resize_width), interpolation=Inter.LINEAR)  # Bilinear mode
    rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
    rescale_op = CV.Rescale(rescale, shift)
    hwc2chw_op = CV.HWC2CHW()
    type_cast_op = C.TypeCast(mstype.int32)

    # apply map operations on images
    mnist_ds = mnist_ds.map(operations=type_cast_op, input_columns="label", num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=resize_op, input_columns="image", num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=rescale_op, input_columns="image", num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=rescale_nml_op, input_columns="image", num_parallel_workers=num_parallel_workers)
    mnist_ds = mnist_ds.map(operations=hwc2chw_op, input_columns="image", num_parallel_workers=num_parallel_workers)

    # apply DatasetOps
    buffer_size = 10000
    mnist_ds = mnist_ds.shuffle(buffer_size=buffer_size)  # 10000 as in LeNet train script
    mnist_ds = mnist_ds.batch(batch_size, drop_remainder=True)
    mnist_ds = mnist_ds.repeat(repeat_size)

    return mnist_ds
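
The two Rescale ops in this example compose into the familiar MNIST normalization (pixel / 255 - 0.1307) / 0.3081. A quick numeric sanity check of that composition:

pixel = 128.0
step1 = pixel * (1.0 / 255.0) + 0.0                    # rescale_op
step2 = step1 * (1 / 0.3081) + (-1 * 0.1307 / 0.3081)  # rescale_nml_op
assert abs(step2 - (pixel / 255.0 - 0.1307) / 0.3081) < 1e-9
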
Example 19
def create_dataset(data_path, batch_size=32, repeat_size=1, mode="train"):
    """
    create dataset for train or test
    """
    cifar_ds = ds.Cifar10Dataset(data_path)
    rescale = 1.0 / 255.0
    shift = 0.0

    resize_op = CV.Resize((cfg.image_height, cfg.image_width))
    rescale_op = CV.Rescale(rescale, shift)
    normalize_op = CV.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    if mode == "train":
        random_crop_op = CV.RandomCrop([32, 32], [4, 4, 4, 4])
        random_horizontal_op = CV.RandomHorizontalFlip()
    channel_swap_op = CV.HWC2CHW()
    typecast_op = C.TypeCast(mstype.int32)
    cifar_ds = cifar_ds.map(input_columns="label", operations=typecast_op)
    if mode == "train":
        cifar_ds = cifar_ds.map(input_columns="image", operations=random_crop_op)
        cifar_ds = cifar_ds.map(input_columns="image", operations=random_horizontal_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=resize_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=rescale_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=normalize_op)
    cifar_ds = cifar_ds.map(input_columns="image", operations=channel_swap_op)

    cifar_ds = cifar_ds.shuffle(buffer_size=cfg.buffer_size)
    cifar_ds = cifar_ds.batch(batch_size, drop_remainder=True)
    cifar_ds = cifar_ds.repeat(repeat_size)
    return cifar_ds
Example 20
def me_de_train_dataset(sink_mode=False):
    """test me de train dataset"""
    # apply repeat operations
    repeat_count = 1
    sink_size = -1
    batch_size = 16
    data_set = ds.TFRecordDataset(DATA_DIR, SCHEMA_DIR, columns_list=["input_ids", "input_mask", "segment_ids",
                                                                      "next_sentence_labels", "masked_lm_positions",
                                                                      "masked_lm_ids", "masked_lm_weights"],
                                  shuffle=False)
    type_cast_op = C.TypeCast(mstype.int32)
    new_repeat_count = repeat_count
    if sink_mode:
        sink_size = 100
        new_repeat_count = 3
    data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_ids")
    data_set = data_set.map(operations=type_cast_op, input_columns="masked_lm_positions")
    data_set = data_set.map(operations=type_cast_op, input_columns="next_sentence_labels")
    data_set = data_set.map(operations=type_cast_op, input_columns="segment_ids")
    data_set = data_set.map(operations=type_cast_op, input_columns="input_mask")
    data_set = data_set.map(operations=type_cast_op, input_columns="input_ids")
    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)
    logger.info("data size: {}".format(data_set.get_dataset_size()))
    logger.info("repeat_count: {}".format(data_set.get_repeat_count()))
    return data_set, new_repeat_count, sink_size
Example 21
def create_tinybert_dataset(task='td',
                            batch_size=32,
                            device_num=1,
                            rank=0,
                            do_shuffle="true",
                            data_dir=None,
                            schema_dir=None):
    """create tinybert dataset"""
    files = os.listdir(data_dir)
    data_files = []
    for file_name in files:
        if "record" in file_name:
            data_files.append(os.path.join(data_dir, file_name))
    if task == "td":
        columns_list = ["input_ids", "input_mask", "segment_ids", "label_ids"]
    else:
        columns_list = ["input_ids", "input_mask", "segment_ids"]

    ds = de.TFRecordDataset(data_files,
                            schema_dir,
                            columns_list=columns_list,
                            shuffle=(do_shuffle == "true"),
                            num_shards=device_num,
                            shard_id=rank,
                            shard_equal_rows=True)
    type_cast_op = C.TypeCast(mstype.int32)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    if task == "td":
        ds = ds.map(input_columns="label_ids", operations=type_cast_op)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    return ds
Example 22
def load_test_data(batch_size=1, data_file=None):
    """Load test dataset."""
    data_set = ds.MindDataset(data_file,
                              columns_list=[
                                  "source_eos_ids", "source_eos_mask",
                                  "target_sos_ids", "target_sos_mask",
                                  "target_eos_ids", "target_eos_mask"
                              ],
                              shuffle=False)
    type_cast_op = deC.TypeCast(mstype.int32)
    data_set = data_set.map(operations=type_cast_op,
                            input_columns="source_eos_ids")
    data_set = data_set.map(operations=type_cast_op,
                            input_columns="source_eos_mask")
    data_set = data_set.map(operations=type_cast_op,
                            input_columns="target_sos_ids")
    data_set = data_set.map(operations=type_cast_op,
                            input_columns="target_sos_mask")
    data_set = data_set.map(operations=type_cast_op,
                            input_columns="target_eos_ids")
    data_set = data_set.map(operations=type_cast_op,
                            input_columns="target_eos_mask")
    # apply batch operations
    data_set = data_set.batch(batch_size, drop_remainder=True)
    return data_set
Example 23
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32, target="Ascend"):
    """
    create a train or eval imagenet2012 dataset for resnet50

    Args:
        dataset_path(string): the path of dataset.
        do_train(bool): whether dataset is used for train or eval.
        repeat_num(int): the repeat times of dataset. Default: 1
        batch_size(int): the batch size of dataset. Default: 32
        target(str): the device target. Default: Ascend
    Returns:
        dataset
    """

    if target == "Ascend":
        device_num, rank_id = _get_rank_info()
    else:
        init("nccl")
        rank_id = get_rank()
        device_num = get_group_size()

    if device_num == 1:
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True)
    else:
        ds = de.ImageFolderDatasetV2(dataset_path, num_parallel_workers=8, shuffle=True,
                                     num_shards=device_num, shard_id=rank_id)

    image_size = 224
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize(256),
            C.CenterCrop(image_size),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)
    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds
Example 24
def test_case_project_between_maps():
    columns = ["col_3d", "col_sint64", "col_2d"]
    parameters = {"params": {'columns': columns}}

    data1 = ds.TFRecordDataset(DATA_DIR_TF, SCHEMA_DIR_TF, shuffle=False)

    type_cast_op = C.TypeCast(mstype.int64)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)

    data1 = data1.project(columns=columns)

    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)
    data1 = data1.map(input_columns=["col_3d"], operations=type_cast_op)

    filename = "project_between_maps_result.npz"
    ordered_save_and_check(data1,
                           parameters,
                           filename,
                           generate_golden=GENERATE_GOLDEN)
Example 25
    def transform(dataset: MnistDataset):
        """Transforming the MNIST dataset."""
        resize_height, resize_width = 32, 32
        rescale = 1.0 / 255.0
        shift = 0.0
        rescale_nml = 1 / 0.3081
        shift_nml = -1 * 0.1307 / 0.3081

        resize_op = CV.Resize((resize_height, resize_width),
                              interpolation=Inter.LINEAR)
        rescale_nml_op = CV.Rescale(rescale_nml, shift_nml)
        rescale_op = CV.Rescale(rescale, shift)
        hwc2chw_op = CV.HWC2CHW()
        type_cast_op = C.TypeCast(mstype.int32)

        dataset = dataset.map(operations=type_cast_op, input_columns="label")
        dataset = dataset.map(operations=resize_op, input_columns="image")
        dataset = dataset.map(operations=rescale_op, input_columns="image")
        dataset = dataset.map(operations=rescale_nml_op, input_columns="image")
        dataset = dataset.map(operations=hwc2chw_op, input_columns="image")

        dataset = dataset.batch(Config().trainer.batch_size,
                                drop_remainder=True)

        return dataset
Example 26
def create_dataset_cifar10(data_path,
                           batch_size=32,
                           repeat_size=1,
                           status="train",
                           target="Ascend"):
    """
    create dataset for train or test
    """

    if target == "Ascend":
        device_num, rank_id = _get_rank_info()

    if target != "Ascend" or device_num == 1:
        cifar_ds = ds.Cifar10Dataset(data_path)
    else:
        cifar_ds = ds.Cifar10Dataset(data_path,
                                     num_parallel_workers=8,
                                     shuffle=True,
                                     num_shards=device_num,
                                     shard_id=rank_id)
    rescale = 1.0 / 255.0
    shift = 0.0
    cfg = alexnet_cifar10_cfg

    resize_op = CV.Resize((cfg.image_height, cfg.image_width))
    rescale_op = CV.Rescale(rescale, shift)
    normalize_op = CV.Normalize((0.4914, 0.4822, 0.4465),
                                (0.2023, 0.1994, 0.2010))
    if status == "train":
        random_crop_op = CV.RandomCrop([32, 32], [4, 4, 4, 4])
        random_horizontal_op = CV.RandomHorizontalFlip()
    channel_swap_op = CV.HWC2CHW()
    typecast_op = C.TypeCast(mstype.int32)
    cifar_ds = cifar_ds.map(input_columns="label",
                            operations=typecast_op,
                            num_parallel_workers=8)
    if status == "train":
        cifar_ds = cifar_ds.map(input_columns="image",
                                operations=random_crop_op,
                                num_parallel_workers=8)
        cifar_ds = cifar_ds.map(input_columns="image",
                                operations=random_horizontal_op,
                                num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image",
                            operations=resize_op,
                            num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image",
                            operations=rescale_op,
                            num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image",
                            operations=normalize_op,
                            num_parallel_workers=8)
    cifar_ds = cifar_ds.map(input_columns="image",
                            operations=channel_swap_op,
                            num_parallel_workers=8)

    cifar_ds = cifar_ds.shuffle(buffer_size=cfg.buffer_size)
    cifar_ds = cifar_ds.batch(batch_size, drop_remainder=True)
    cifar_ds = cifar_ds.repeat(repeat_size)
    return cifar_ds
Example 27
def me_de_train_dataset(sink_mode=False):
    """test me de train dataset"""
    # apply repeat operations
    repeat_count = 1
    batch_size = 16
    ds = de.TFRecordDataset(DATA_DIR,
                            SCHEMA_DIR,
                            columns_list=[
                                "input_ids", "input_mask", "segment_ids",
                                "next_sentence_labels", "masked_lm_positions",
                                "masked_lm_ids", "masked_lm_weights"
                            ],
                            shuffle=False)
    type_cast_op = C.TypeCast(mstype.int32)
    new_repeat_count = repeat_count
    if sink_mode:
        repeat_count = 30
        sink_steps = 100
        ori_dataset_size = ds.get_dataset_size()
        new_size = sink_steps * batch_size
        ds.set_dataset_size(new_size)
        new_repeat_count = int(repeat_count * ori_dataset_size //
                               ds.get_dataset_size())
    ds = ds.map(input_columns="masked_lm_ids", operations=type_cast_op)
    ds = ds.map(input_columns="masked_lm_positions", operations=type_cast_op)
    ds = ds.map(input_columns="next_sentence_labels", operations=type_cast_op)
    ds = ds.map(input_columns="segment_ids", operations=type_cast_op)
    ds = ds.map(input_columns="input_mask", operations=type_cast_op)
    ds = ds.map(input_columns="input_ids", operations=type_cast_op)
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    ds = ds.repeat(repeat_count)
    logger.info("data size: {}".format(ds.get_dataset_size()))
    logger.info("repeat_count: {}".format(ds.get_repeat_count()))
    return ds, new_repeat_count
Example 28
def create_dataset(data_path, batch_size):
    ds = de.Cifar10Dataset(
        data_path,
        num_parallel_workers=8,
        shuffle=False,
    )

    # define map operations
    trans = []
    # if do_train:
    #     trans += [
    #         # C.RandomCrop((32, 32), (4, 4, 4, 4)),
    #         # C.RandomHorizontalFlip(prob=0.5)
    #     ]

    trans += [
        C.Resize((224, 224)),
        C.Rescale(1.0 / 255.0, 0.0),
        C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010]),
        C.HWC2CHW()
    ]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(operations=type_cast_op,
                input_columns="label",
                num_parallel_workers=8)
    ds = ds.map(operations=trans,
                input_columns="image",
                num_parallel_workers=8)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    return ds
Example 29
def create_dataset(data_dir,
                   training=True,
                   batch_size=32,
                   resize=(32, 32),
                   rescale=1 / (255 * 0.3081),
                   shift=-0.1307 / 0.3081,
                   buffer_size=64):
    data_train = os.path.join(data_dir, 'train')  # training set path
    data_test = os.path.join(data_dir, 'test')  # test set path
    print(data_train)
    print(data_test)
    ds = ms.dataset.MnistDataset(data_train if training else data_test)

    ds = ds.map(input_columns=["image"],
                operations=[
                    CV.Resize(resize),
                    CV.Rescale(rescale, shift),
                    CV.HWC2CHW()
                ])
    ds = ds.map(input_columns=["label"], operations=C.TypeCast(ms.int32))
    # When `dataset_sink_mode=True` on Ascend, append `ds = ds.repeat(num_epochs)` at the end
    ds = ds.shuffle(buffer_size=buffer_size).batch(batch_size,
                                                   drop_remainder=True)

    return ds
Example 30
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    Create a train or eval dataset.

    Args:
        dataset_path (str): The path of dataset.
        do_train (bool): Whether dataset is used for train or eval.
        repeat_num (int): The repeat times of dataset. Default: 1.
        batch_size (int): The batch size of dataset. Default: 32.

    Returns:
        Dataset.
    """
    if do_train:
        dataset_path = os.path.join(dataset_path, 'train')
        do_shuffle = True
    else:
        dataset_path = os.path.join(dataset_path, 'eval')
        do_shuffle = False

    if device_num == 1 or not do_train:
        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle)
    else:
        ds = de.Cifar10Dataset(dataset_path, num_parallel_workers=8, shuffle=do_shuffle,
                               num_shards=device_num, shard_id=device_id)

    resize_height = 224
    resize_width = 224
    buffer_size = 100
    rescale = 1.0 / 255.0
    shift = 0.0

    # define map operations
    random_crop_op = C.RandomCrop((32, 32), (4, 4, 4, 4))
    random_horizontal_flip_op = C.RandomHorizontalFlip(device_id / (device_id + 1))

    resize_op = C.Resize((resize_height, resize_width))
    rescale_op = C.Rescale(rescale, shift)
    normalize_op = C.Normalize([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])

    change_swap_op = C.HWC2CHW()

    trans = []
    if do_train:
        trans += [random_crop_op, random_horizontal_flip_op]

    trans += [resize_op, rescale_op, normalize_op, change_swap_op]

    type_cast_op = C2.TypeCast(mstype.int32)

    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)
    ds = ds.map(input_columns="image", num_parallel_workers=8, operations=trans)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)

    return ds