# Common imports assumed by the snippets in this file (module paths follow the
# MindSpore 1.x dataset API and may differ in other releases; deC and C2 are
# two aliases of the same transforms module, used by different snippets):
import os
from io import BytesIO

import numpy as np
from PIL import Image

import mindspore.common.dtype as mstype
import mindspore.dataset.engine as de
import mindspore.dataset.transforms.c_transforms as deC
import mindspore.dataset.transforms.c_transforms as C2
import mindspore.dataset.vision.c_transforms as C
import mindspore.dataset.vision.py_transforms as P
from mindspore.dataset.vision import Inter


def _get_mindrecord_dataset(directory, train_mode=True, epochs=1, batch_size=1000,
                            line_per_sample=1000, rank_size=None, rank_id=None):
    """
    Get dataset with mindrecord format.

    Args:
        directory (str): Dataset directory.
        train_mode (bool): Whether the dataset is used for training or evaluation (default=True).
        epochs (int): Dataset epoch size (default=1).
        batch_size (int): Dataset batch size (default=1000).
        line_per_sample (int): The number of samples packed into each line (default=1000).
        rank_size (int): The number of devices; not necessary for a single device (default=None).
        rank_id (int): Id of the device; not necessary for a single device (default=None).

    Returns:
        Dataset.
    """
    file_prefix_name = 'train_input_part.mindrecord' if train_mode else 'test_input_part.mindrecord'
    file_suffix_name = '00' if train_mode else '0'
    shuffle = train_mode
    if rank_size is not None and rank_id is not None:
        ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
                            columns_list=['feat_ids', 'feat_vals', 'label'],
                            num_shards=rank_size, shard_id=rank_id,
                            shuffle=shuffle, num_parallel_workers=8)
    else:
        ds = de.MindDataset(os.path.join(directory, file_prefix_name + file_suffix_name),
                            columns_list=['feat_ids', 'feat_vals', 'label'],
                            shuffle=shuffle, num_parallel_workers=8)
    # Each record packs line_per_sample samples, so batching
    # batch_size / line_per_sample records yields batch_size samples per batch.
    ds = ds.batch(int(batch_size / line_per_sample), drop_remainder=True)
    # Unpack the records into (batch_size, 39) feature tensors and a
    # (batch_size, 1) label tensor.
    ds = ds.map(operations=(lambda x, y, z: (np.array(x).flatten().reshape(batch_size, 39),
                                             np.array(y).flatten().reshape(batch_size, 39),
                                             np.array(z).flatten().reshape(batch_size, 1))),
                input_columns=['feat_ids', 'feat_vals', 'label'],
                column_order=['feat_ids', 'feat_vals', 'label'],
                num_parallel_workers=8)
    ds = ds.repeat(epochs)
    return ds
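
# A minimal usage sketch (the path below is hypothetical; note that batch_size
# must be a multiple of line_per_sample, or the reshape above will fail):
#
#   ds_train = _get_mindrecord_dataset("/data/criteo_mindrecord", train_mode=True,
#                                      epochs=5, batch_size=16000, line_per_sample=1000)
#   print("dataset size:", ds_train.get_dataset_size())
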
def load_test_data(batch_size=1, data_file=None):
    """Load test dataset."""
    ds = de.MindDataset(data_file,
                        columns_list=["source_eos_ids", "source_eos_mask",
                                      "target_sos_ids", "target_sos_mask",
                                      "target_eos_ids", "target_eos_mask"],
                        shuffle=False)
    # cast every column to int32
    type_cast_op = deC.TypeCast(mstype.int32)
    ds = ds.map(operations=type_cast_op, input_columns="source_eos_ids")
    ds = ds.map(operations=type_cast_op, input_columns="source_eos_mask")
    ds = ds.map(operations=type_cast_op, input_columns="target_sos_ids")
    ds = ds.map(operations=type_cast_op, input_columns="target_sos_mask")
    ds = ds.map(operations=type_cast_op, input_columns="target_eos_ids")
    ds = ds.map(operations=type_cast_op, input_columns="target_eos_mask")
    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)
    return ds
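
# Usage sketch (the file name is hypothetical; the mindrecord file must provide
# the six columns listed above):
#
#   test_ds = load_test_data(batch_size=32, data_file="/data/test.mindrecord")
#   for batch in test_ds.create_dict_iterator():
#       ...  # batch is a dict keyed by column name
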
def create_dataset(data_path, is_train=True, batch_size=32):
    # import
    import mindspore.dataset.engine as de
    import mindspore.dataset.vision.c_transforms as C
    from mindspore.common import set_seed
    set_seed(1)

    # shard: pick up distributed settings from the environment if present
    num_shards = shard_id = None
    rank_size = os.getenv("RANK_SIZE")
    rank_id = os.getenv("RANK_ID")
    if rank_size is not None and rank_id is not None:
        num_shards = int(rank_size)
        shard_id = int(rank_id)

    # define dataset
    ds = de.MindDataset(data_path, columns_list=['data'], shuffle=True,
                        num_shards=num_shards, shard_id=shard_id,
                        num_parallel_workers=8, num_samples=None)

    # map ops
    ds = ds.map(input_columns=["data"], operations=C.Decode())
    ds = ds.map(input_columns=["data"], operations=C.Normalize(
        mean=[0.485 * 255, 0.456 * 255, 0.406 * 255],
        std=[0.229 * 255, 0.224 * 255, 0.225 * 255]))
    ds = ds.map(input_columns=["data"], operations=C.Resize((224, 224)))
    ds = ds.map(input_columns=["data"], operations=C.HWC2CHW())

    # batch & repeat
    ds = ds.batch(batch_size=batch_size, drop_remainder=is_train)
    ds = ds.repeat(count=1)
    return ds
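
# Usage sketch (path is hypothetical). Under a distributed launcher that
# exports RANK_SIZE and RANK_ID the dataset is sharded per device; otherwise
# it is read whole:
#
#   ds = create_dataset("/data/images.mindrecord", is_train=True, batch_size=64)
#   print("batches:", ds.get_dataset_size())
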
def create_dataset(dataset_path, do_train, repeat_num=1, batch_size=32):
    """
    Create a train or eval dataset.

    Args:
        dataset_path (string): The path of the dataset.
        do_train (bool): Whether the dataset is used for train or eval.
        repeat_num (int): The repeat times of the dataset. Default: 1.
        batch_size (int): The batch size of the dataset. Default: 32.

    Returns:
        Dataset.
    """
    device_num = int(os.getenv("RANK_SIZE"))
    try:
        # global_rank_id = int(os.getenv('RANK_ID').split("-")[1].split("custom")[1])
        global_rank_id = int(os.getenv('RANK_ID').split("-")[-1])
    except (AttributeError, IndexError, ValueError):
        global_rank_id = 0
    rank_id = int(os.getenv('DEVICE_ID')) + global_rank_id * 8
    columns_list = ["data", "label"]
    if do_train:
        ds = de.MindDataset(dataset_path + '/imagenet_train.mindrecord00', columns_list,
                            num_parallel_workers=8, shuffle=True,
                            num_shards=device_num, shard_id=rank_id)
        print("train dataset size", ds.get_dataset_size())
    else:
        # Pad the eval set with white images (label -1) so that the 50000
        # ImageNet eval samples divide evenly across devices and batches.
        padded_sample = {}
        white_io = BytesIO()
        Image.new('RGB', (224, 224), (255, 255, 255)).save(white_io, 'JPEG')
        padded_sample['data'] = white_io.getvalue()
        padded_sample['label'] = -1
        batch_per_step = batch_size * device_num
        print("eval batch per step:", batch_per_step)
        if batch_per_step < 50000:
            if 50000 % batch_per_step == 0:
                num_padded = 0
            else:
                num_padded = batch_per_step - (50000 % batch_per_step)
        else:
            num_padded = batch_per_step - 50000
        print("Padded samples:", num_padded)
        ds = de.MindDataset(dataset_path + '/imagenet_eval.mindrecord0', columns_list,
                            num_parallel_workers=8, shuffle=False,
                            num_shards=device_num, shard_id=rank_id,
                            padded_sample=padded_sample, num_padded=num_padded)
        print("eval dataset size", ds.get_dataset_size())

    image_size = 224
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    # define map operations
    if do_train:
        trans = [
            C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            C.RandomHorizontalFlip(prob=0.5),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    else:
        trans = [
            C.Decode(),
            C.Resize((365, 365)),
            C.CenterCrop(320),
            C.Normalize(mean=mean, std=std),
            C.HWC2CHW()
        ]
    type_cast_op = C2.TypeCast(mstype.int32)

    # ds = ds.shuffle(buffer_size=100000)
    ds = ds.map(input_columns="data", num_parallel_workers=8, operations=trans)
    ds = ds.map(input_columns="label", num_parallel_workers=8, operations=type_cast_op)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)
    return ds
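
# The eval padding rule above, restated as a hypothetical standalone helper
# for clarity (not part of the original code):
def _eval_num_padded(total_samples, batch_per_step):
    """Samples to append so total_samples fills whole batch_per_step chunks."""
    if batch_per_step < total_samples:
        remainder = total_samples % batch_per_step
        return 0 if remainder == 0 else batch_per_step - remainder
    return batch_per_step - total_samples

# e.g. with batch_size=32 on 8 devices: _eval_num_padded(50000, 256) == 176,
# so 176 white images labeled -1 are appended (50176 == 196 * 256).
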
def create_dataset(dataset_path, do_train, config, platform, repeat_num=1, batch_size=100):
    """
    Create a train or eval dataset.

    Args:
        dataset_path (string): The path of the dataset.
        do_train (bool): Whether the dataset is used for train or eval.
        config: Model configuration; only config.image_height is used here.
        platform (string): Target platform, "Ascend" or "GPU".
        repeat_num (int): The repeat times of the dataset. Default: 1.
        batch_size (int): The batch size of the dataset. Default: 100.

    Returns:
        Dataset.
    """
    if platform == "Ascend":
        rank_size = int(os.getenv("RANK_SIZE"))
        rank_id = int(os.getenv("RANK_ID"))
        if rank_size == 1:
            ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True)
        else:
            ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
                                num_shards=rank_size, shard_id=rank_id)
    elif platform == "GPU":
        if do_train:
            from mindspore.communication.management import get_rank, get_group_size
            ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=True,
                                num_shards=get_group_size(), shard_id=get_rank())
        else:
            ds = de.MindDataset(dataset_path, num_parallel_workers=8, shuffle=False)
    else:
        raise ValueError("Unsupported platform.")

    resize_height = config.image_height
    buffer_size = 1000

    # define map operations
    resize_crop_op = C.RandomCropDecodeResize(resize_height, scale=(0.08, 1.0), ratio=(0.75, 1.333))
    horizontal_flip_op = C.RandomHorizontalFlip(prob=0.5)
    color_op = C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4)
    rescale_op = C.Rescale(1 / 255.0, 0)
    normalize_op = C.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    change_swap_op = C.HWC2CHW()

    # define python operations (ComposeOp is the pre-1.0 name; later releases
    # use mindspore.dataset.transforms.py_transforms.Compose instead)
    decode_p = P.Decode()
    resize_p = P.Resize(256, interpolation=Inter.BILINEAR)
    center_crop_p = P.CenterCrop(224)
    totensor = P.ToTensor()
    normalize_p = P.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))
    composeop = P.ComposeOp([decode_p, resize_p, center_crop_p, totensor, normalize_p])

    if do_train:
        trans = [resize_crop_op, horizontal_flip_op, color_op,
                 rescale_op, normalize_op, change_swap_op]
    else:
        trans = composeop()
    type_cast_op = C2.TypeCast(mstype.int32)
    ds = ds.map(input_columns="image", operations=trans, num_parallel_workers=8)
    ds = ds.map(input_columns="label_list", operations=type_cast_op, num_parallel_workers=8)

    # apply shuffle operations
    ds = ds.shuffle(buffer_size=buffer_size)

    # apply batch operations
    ds = ds.batch(batch_size, drop_remainder=True)

    # apply dataset repeat operation
    ds = ds.repeat(repeat_num)
    return ds
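
# Usage sketch (the config object and path are hypothetical; the Ascend branch
# additionally requires the RANK_SIZE and RANK_ID environment variables):
#
#   class _Config:
#       image_height = 224
#
#   eval_ds = create_dataset("/data/eval.mindrecord", do_train=False,
#                            config=_Config(), platform="GPU", batch_size=100)
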
def _load_dataset(input_files, schema_file, batch_size, sink_mode=False,
                  rank_size=1, rank_id=0, shuffle=True, drop_remainder=True,
                  is_translate=False):
    """
    Load dataset according to passed-in params.

    Args:
        input_files (list): Data files.
        schema_file (str): Schema file path.
        batch_size (int): Batch size.
        sink_mode (bool): Whether to enable sink mode.
        rank_size (int): Rank size.
        rank_id (int): Rank id.
        shuffle (bool): Whether to shuffle the dataset.
        drop_remainder (bool): Whether to drop the last, possibly incomplete, batch.
        is_translate (bool): Whether the dataset is used for translation.

    Returns:
        Dataset, dataset instance.
    """
    if not input_files:
        raise FileNotFoundError("Require at least one dataset file.")
    if not (schema_file and os.path.exists(schema_file)
            and os.path.isfile(schema_file)
            and os.path.basename(schema_file).endswith(".json")):
        raise FileNotFoundError("`dataset_schema` must be an existing JSON file.")
    if not isinstance(sink_mode, bool):
        raise ValueError("`sink` must be type of bool.")
    for datafile in input_files:
        print(f" | Loading {datafile}.")

    if not is_translate:
        ds = de.MindDataset(input_files,
                            columns_list=["src", "src_padding", "prev_opt",
                                          "target", "tgt_padding"],
                            shuffle=False, num_shards=rank_size, shard_id=rank_id,
                            num_parallel_workers=8)
        ori_dataset_size = ds.get_dataset_size()
        print(f" | Dataset size: {ori_dataset_size}.")
        if shuffle:
            ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
        type_cast_op = deC.TypeCast(mstype.int32)
        ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
        ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
        ds = ds.map(input_columns="prev_opt", operations=type_cast_op, num_parallel_workers=8)
        ds = ds.map(input_columns="target", operations=type_cast_op, num_parallel_workers=8)
        ds = ds.map(input_columns="tgt_padding", operations=type_cast_op, num_parallel_workers=8)
        ds = ds.rename(input_columns=["src", "src_padding", "prev_opt",
                                      "target", "tgt_padding"],
                       output_columns=["source_eos_ids", "source_eos_mask",
                                       "target_sos_ids", "target_eos_ids",
                                       "target_eos_mask"])
        ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    else:
        ds = de.MindDataset(input_files, columns_list=["src", "src_padding"],
                            shuffle=False, num_shards=rank_size, shard_id=rank_id,
                            num_parallel_workers=8)
        ori_dataset_size = ds.get_dataset_size()
        print(f" | Dataset size: {ori_dataset_size}.")
        if shuffle:
            ds = ds.shuffle(buffer_size=ori_dataset_size // 20)
        type_cast_op = deC.TypeCast(mstype.int32)
        ds = ds.map(input_columns="src", operations=type_cast_op, num_parallel_workers=8)
        ds = ds.map(input_columns="src_padding", operations=type_cast_op, num_parallel_workers=8)
        ds = ds.rename(input_columns=["src", "src_padding"],
                       output_columns=["source_eos_ids", "source_eos_mask"])
        ds = ds.batch(batch_size, drop_remainder=drop_remainder)
    return ds
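
# Usage sketch (file names are hypothetical; note that schema_file is only
# validated above, not actually parsed):
#
#   ds = _load_dataset(["/data/train.mindrecord"], "/data/schema.json",
#                      batch_size=96, rank_size=1, rank_id=0, shuffle=True)
#   print(ds.get_dataset_size())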