def create_dataset(audio_conf, manifest_filepath, labels, normalize, batch_size, train_mode=True, rank=None, group_size=None):
    """
    Build the ASR train/eval dataset.

    Args:
        audio_conf: Config containing the sample rate, window and the window
            length/stride in seconds.
        manifest_filepath (str): Manifest file path.
        labels (list): All possible characters the transcript can map to.
        normalize: Whether to apply mean/std normalization to the audio tensor.
        batch_size (int): Dataset batch size.
        train_mode (bool): True when the dataset is used for training. Default: True.
        rank (int): Shard id within group_size. Default: None.
        group_size (int): Number of shards the dataset is divided into. Default: None.

    Returns:
        Dataset.
    """
    asr_data = ASRDataset(audio_conf=audio_conf,
                          manifest_filepath=manifest_filepath,
                          labels=labels,
                          normalize=normalize,
                          batch_size=batch_size,
                          is_training=train_mode)
    dist_sampler = DistributedSampler(asr_data, rank, group_size, shuffle=True)
    columns = ["inputs", "input_length", "target_indices", "label_values"]
    ds = de.GeneratorDataset(asr_data, columns, sampler=dist_sampler)
    return ds.repeat(1)
def _get_h5_dataset(directory, train_mode=True, epochs=1, batch_size=1000):
    """
    Build a dataset from h5-format data.

    Args:
        directory (str): Dataset directory.
        train_mode (bool): True when the dataset is used for training. Default: True.
        epochs (int): Dataset epoch size. Default: 1.
        batch_size (int): Dataset batch size. Default: 1000.

    Returns:
        Dataset.
    """
    gen_kwargs = {'batch_size': batch_size}
    if train_mode:
        # Training shuffles at both the sample and block level.
        gen_kwargs['random_sample'] = True
        gen_kwargs['shuffle_block'] = True

    h5_data = H5Dataset(data_path=directory, train_mode=train_mode)
    n_batches = math.ceil(h5_data.data_size / batch_size)

    def _iter_h5_data():
        # A fresh generator is created on every call, one yield per batch.
        batches = h5_data.batch_generator(**gen_kwargs)
        for _ in range(n_batches):
            yield next(batches)

    ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
    ds.set_dataset_size(n_batches)
    return ds.repeat(epochs)
def create_dataset(dataset_path, batch_size=1, num_shards=1, shard_id=0, device_target='Ascend'):
    """
    Build the train/eval dataset for warpctc.

    Args:
        dataset_path (str): Captcha dataset path.
        batch_size (int): Batch size of the generated dataset. Default: 1.
        num_shards (int): Number of devices sharing the data. Default: 1.
        shard_id (int): Rank id of this shard. Default: 0.
        device_target (str): Training platform, supports Ascend and GPU. Default: 'Ascend'.
    """
    captcha_data = _CaptchaDataset(dataset_path, cf.max_captcha_digits, device_target)
    ds = de.GeneratorDataset(captcha_data, ["image", "label"], shuffle=True,
                             num_shards=num_shards, shard_id=shard_id)
    # Pad height up to a multiple of 16 so the network's stride chain divides evenly.
    target_height = m.ceil(cf.captcha_height / 16) * 16
    image_ops = [
        vc.Rescale(1.0 / 255.0, 0.0),
        vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]),
        vc.Resize((target_height, cf.captcha_width)),
        vc.HWC2CHW()
    ]
    label_ops = [c.TypeCast(mstype.int32)]
    ds = ds.map(operations=image_ops, input_columns=["image"], num_parallel_workers=8)
    ds = ds.map(operations=label_ops, input_columns=["label"], num_parallel_workers=8)
    return ds.batch(batch_size, drop_remainder=True)
def train_dataset_creator(rank, group_size, shuffle=True):
    """
    Build the sharded training dataset.

    Args:
        rank (int): Shard id within group_size.
        group_size (int): Number of shards.
        shuffle (bool): Whether the sampler shuffles. Default: True.

    Returns:
        Dataset.
    """
    # Keep OpenCV single-threaded so it does not fight the dataset workers.
    cv2.setNumThreads(0)
    train_data = TrainDataset()
    dist_sampler = DistributedSampler(train_data, rank, group_size, shuffle)
    columns = ['img', 'gt_text', 'gt_kernels', 'training_mask']
    ds = de.GeneratorDataset(train_data, columns,
                             num_parallel_workers=8, sampler=dist_sampler)
    ds = ds.repeat(1)
    return ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
def create_dataset(batch_size=32):
    """
    Build a tiny synthetic image/label dataset for exercising the pipeline.

    Args:
        batch_size (int): Batch size. Default: 32.

    Returns:
        Dataset with columns ['image', 'label'].
    """
    # Imports are kept local so the module can be loaded without MindSpore.
    import mindspore.dataset.engine as de
    import numpy as np
    from mindspore.common import set_seed

    set_seed(1)  # make the random samples reproducible

    # Shard only when both RANK_SIZE and RANK_ID are exported.
    num_shards = shard_id = None
    env_size = os.getenv("RANK_SIZE")
    env_id = os.getenv("RANK_ID")
    if env_size is not None and env_id is not None:
        num_shards, shard_id = int(env_size), int(env_id)

    class BaseDataset:
        """In-memory dataset holding a single random (image, label) pair."""

        def __init__(self):
            self.samples = []
            self._load_samples()

        def __getitem__(self, index):
            image, label = self.samples[index]
            return image, label

        def _load_samples(self):
            self.samples.append([
                np.random.rand(3, 4, 5).astype(np.float32),
                np.random.randint(10, size=()).astype(np.int32)
            ])

        def __len__(self):
            return len(self.samples)

    ds = de.GeneratorDataset(source=BaseDataset(),
                             column_names=['image', 'label'],
                             num_shards=num_shards,
                             shard_id=shard_id)
    # Identity map: placeholder hook for real image transforms.
    ds = ds.map(input_columns=["image"], operations=lambda img: img,
                num_parallel_workers=8)
    ds = ds.batch(batch_size=batch_size, drop_remainder=False)
    return ds.repeat(count=1)
def get_data_loaders(dump_root, speaker_id, hparams=None, rank_id=None, group_size=None):
    """
    Create the WaveNet train dataset.

    Args:
        dump_root (str): Root directory containing the preprocessed
            'train_no_dev' split.
        speaker_id: Speaker to select, or None for all speakers.
        hparams: Hyper-parameter object; must provide cin_channels, cin_pad,
            max_time_steps and batch_size. Required despite the None default.
        rank_id (int): Shard id within group_size. Default: None.
        group_size (int): Number of shards. Default: None.

    Returns:
        GeneratorDataset over ["x_batch", "y_batch", "c_batch", "g_batch",
        "input_lengths", "mask"].

    Raises:
        ValueError: If hparams is None.
    """
    # Bug fix: hparams was dereferenced unconditionally, so the None default
    # crashed with an opaque AttributeError. Fail fast with a clear message.
    if hparams is None:
        raise ValueError("get_data_loaders requires a valid 'hparams' object, got None")
    local_conditioning = hparams.cin_channels > 0
    if hparams.max_time_steps is not None:
        # Clip the window so it divides evenly by the hop size.
        max_steps = ensure_divisible(hparams.max_time_steps, audio.get_hop_size(), True)
    else:
        max_steps = None
    X = FileSourceDataset(
        RawAudioDataSource(os.path.join(dump_root, 'train_no_dev'),
                           speaker_id=speaker_id,
                           max_steps=max_steps, cin_pad=hparams.cin_pad,
                           hop_size=audio.get_hop_size()))
    if local_conditioning:
        Mel = FileSourceDataset(
            MelSpecDataSource(os.path.join(dump_root, 'train_no_dev'),
                              speaker_id=speaker_id,
                              max_steps=max_steps, cin_pad=hparams.cin_pad,
                              hop_size=audio.get_hop_size()))
        # Audio and conditioning features must pair up one-to-one.
        assert len(X) == len(Mel)
        print("Local conditioning enabled. Shape of a sample: {}.".format(
            Mel[0].shape))
    else:
        Mel = None
    print("length of the dataset is {}".format(len(X)))
    length_x = np.array(X.file_data_source.lengths)
    dataset = DualDataset(X, Mel, length_x, batch_size=hparams.batch_size,
                          hparams=hparams)
    sampler = DistributedSampler(dataset, rank_id, group_size, shuffle=True, seed=0)
    data_loaders = de.GeneratorDataset(
        dataset,
        ["x_batch", "y_batch", "c_batch", "g_batch", "input_lengths", "mask"],
        sampler=sampler)
    return data_loaders
def GetDataLoader(per_batch_size, max_epoch, rank, group_size, config, split='train'):
    """
    Build the CenterFace data loader for the given split.

    Args:
        per_batch_size (int): Batch size per device.
        max_epoch (int): Number of epochs folded into the pipeline via repeat.
        rank (int): Shard id within group_size.
        group_size (int): Number of shards.
        config: Dataset/preprocessing configuration.
        split (str): 'train' or eval split name. Default: 'train'.

    Returns:
        tuple: (dataset, dataset size).
    """
    dataset_gen = CenterfaceDataset(config=config, split=split)
    # User-defined sampling strategy; shuffle only during training.
    sampler = DistributedSampler(dataset_gen, rank, group_size,
                                 shuffle=(split == 'train'))
    de_dataset = de.GeneratorDataset(dataset_gen, ["image", "anns"],
                                     sampler=sampler, num_parallel_workers=16)

    num_parallel_workers = 24 if group_size > 1 else 64
    if split == 'train':
        columns = ['image', "hm", 'reg_mask', 'ind', 'wh', 'wight_mask',
                   'hm_offset', 'hps_mask', 'landmarks']
        de_dataset = de_dataset.map(
            input_columns=["image", "anns"],
            output_columns=columns,
            column_order=columns,
            operations=lambda image, anns: preprocess_train(image, anns, config=config),
            num_parallel_workers=num_parallel_workers,
            python_multiprocessing=True)

    de_dataset = de_dataset.batch(per_batch_size, drop_remainder=True,
                                  num_parallel_workers=8)
    if split == 'train':
        # repeat(max_epoch) lets a single dataset iteration drive all epochs.
        de_dataset = de_dataset.repeat(max_epoch)
    return de_dataset, de_dataset.get_dataset_size()
def create_dataset(data_dir, p=16, k=8):
    """
    Create a P*K-sampled train or eval dataset.

    Args:
        data_dir (str): The path of the dataset.
        p (int): Number of classes randomly chosen per batch. Default: 16.
        k (int): Number of images randomly chosen from each of the p classes;
            the batch size is p * k. Default: 8.

    Returns:
        Dataset.
    """
    mg_data = MGDataset(data_dir)
    pk_sampler = DistributedPKSampler(mg_data, p=p, k=k)
    ds = de.GeneratorDataset(mg_data, ["image", "label1", "label2"],
                             sampler=pk_sampler)

    image_ops = [
        CV.Resize((config.image_height, config.image_width)),
        CV.Rescale(1.0 / 255.0, 0.0),
        CV.Normalize([0.486, 0.459, 0.408], [0.229, 0.224, 0.225]),
        CV.HWC2CHW()
    ]
    ds = ds.map(input_columns="label1", operations=C.TypeCast(mstype.int32))
    ds = ds.map(input_columns="label2", operations=C.TypeCast(mstype.float32))
    ds = ds.map(input_columns="image", operations=image_ops)
    return ds.batch(p * k, drop_remainder=True)
def _get_h5_dataset(data_dir, train_mode=True, epochs=1, batch_size=1000):
    """
    Build a dataset from h5-format data.

    Args:
        data_dir (str): Dataset directory.
        train_mode (bool): True when the dataset is used for training. Default: True.
        epochs (int): Number of times the data is repeated. Default: 1.
        batch_size (int): Batch size. Default: 1000.

    Returns:
        Dataset with columns ("ids", "weights", "labels").
    """
    data_para = {'batch_size': batch_size}
    if train_mode:
        # Training shuffles at both the sample and block level.
        data_para['random_sample'] = True
        data_para['shuffle_block'] = True

    h5_dataset = H5Dataset(data_path=data_dir, train_mode=train_mode)
    numbers_of_batch = math.ceil(h5_dataset.data_size / batch_size)

    def _iter_h5_data():
        # A fresh generator is created on every call, so each repeat epoch
        # re-reads the data instead of hitting an exhausted generator.
        train_eval_gen = h5_dataset.batch_generator(**data_para)
        for _ in range(numbers_of_batch):
            yield next(train_eval_gen)

    # Bug fix: pass the callable itself, not _iter_h5_data(). A generator
    # object is exhausted after one pass, so ds.repeat(epochs) could not
    # produce data beyond the first epoch; a callable source is re-invoked
    # for each epoch (matching the sibling _get_h5_dataset in this file).
    ds = de.GeneratorDataset(_iter_h5_data, ["ids", "weights", "labels"])
    ds = ds.repeat(epochs)
    return ds
def create_dataset(dataset_path, repeat_num=1, batch_size=1):
    """
    Build the train/eval dataset for warpctc.

    Args:
        dataset_path (str): Captcha dataset path.
        repeat_num (int): Dataset repetition count. Default: 1.
        batch_size (int): Batch size of the generated dataset. Default: 1.
    """
    # Shard across devices when RANK_SIZE/RANK_ID are exported; empty or
    # unset variables fall back to single-device defaults.
    env_size = os.environ.get("RANK_SIZE")
    env_id = os.environ.get("RANK_ID")
    rank_size = int(env_size) if env_size else 1
    rank_id = int(env_id) if env_id else 0

    captcha_data = _CaptchaDataset(dataset_path, cf.max_captcha_digits)
    ds = de.GeneratorDataset(captcha_data, ["image", "label"], shuffle=True,
                             num_shards=rank_size, shard_id=rank_id)
    ds.set_dataset_size(m.ceil(len(captcha_data) / rank_size))
    # Pad height up to a multiple of 16 so the network's stride chain divides evenly.
    image_ops = [
        vc.Rescale(1.0 / 255.0, 0.0),
        vc.Normalize([0.9010, 0.9049, 0.9025], std=[0.1521, 0.1347, 0.1458]),
        vc.Resize((m.ceil(cf.captcha_height / 16) * 16, cf.captcha_width)),
        vc.HWC2CHW()
    ]
    label_ops = [c.TypeCast(mstype.int32)]
    ds = ds.map(input_columns=["image"], num_parallel_workers=8, operations=image_ops)
    ds = ds.map(input_columns=["label"], num_parallel_workers=8, operations=label_ops)
    ds = ds.batch(batch_size)
    return ds.repeat(repeat_num)
def create_dataset(name, dataset_path, batch_size=1, num_shards=1, shard_id=0, is_training=True, config=config1):
    """
    Build the train/eval dataset for CRNN.

    Args:
        name (str): One of 'synth', 'ic03', 'ic13', 'svt', 'iiit5k'.
        dataset_path (str): Dataset path.
        batch_size (int): Batch size of the generated dataset. Default: 1.
        num_shards (int): Number of devices. Default: 1.
        shard_id (int): Rank id. Default: 0.
        is_training (bool): Whether the dataset is used for training. Default: True.
        config: Dataset/model configuration. Default: config1.

    Raises:
        ValueError: If name is not a supported dataset.
    """
    # Dispatch table instead of an if/elif chain; each entry is lazy so only
    # the requested dataset is constructed.
    builders = {
        'synth': lambda: CaptchaDataset(dataset_path, is_training, config),
        'ic03': lambda: IC03Dataset(dataset_path, "annotation.txt", config, True, 3),
        'ic13': lambda: IC13Dataset(dataset_path, "Challenge2_Test_Task3_GT.txt", config),
        'svt': lambda: SVTDataset(dataset_path, config),
        'iiit5k': lambda: IIIT5KDataset(dataset_path, "annotation.txt", config),
    }
    if name not in builders:
        raise ValueError(f"unsupported dataset name: {name}")
    dataset = builders[name]()

    ds = de.GeneratorDataset(dataset, ["image", "label"], shuffle=True,
                             num_shards=num_shards, shard_id=shard_id)
    image_ops = [
        vc.Resize((config.image_height, config.image_width)),
        vc.Normalize([127.5, 127.5, 127.5], std=[127.5, 127.5, 127.5]),
        vc.HWC2CHW()
    ]
    label_ops = [C.TypeCast(mstype.int32)]
    ds = ds.map(operations=image_ops, input_columns=["image"], num_parallel_workers=8)
    ds = ds.map(operations=label_ops, input_columns=["label"], num_parallel_workers=8)
    return ds.batch(batch_size, drop_remainder=True)
def get_data_loader(hparam, data_dir):
    """
    Build the WaveNet test data loader.

    Args:
        hparam: Hyper-parameters; must provide cin_pad, cin_channels and batch_size.
        data_dir (str): Directory holding the preprocessed *-wave.npy audio
            and mel-spectrogram feature files.

    Returns:
        tuple: (GeneratorDataset over ["x_batch", "y_batch", "c_batch",
        "g_batch", "input_lengths", "mask"], underlying DualDataset).

    Raises:
        RuntimeError: If the mel feature dimension does not match hparam.cin_channels.
    """
    wav_paths = glob.glob(os.path.join(data_dir, "*-wave.npy"))
    if wav_paths:
        X = FileSourceDataset(RawAudioDataSource(data_dir,
                                                 hop_size=audio.get_hop_size(),
                                                 max_steps=None,
                                                 cin_pad=hparam.cin_pad))
    else:
        # No raw audio present; run from conditioning features only.
        X = None
    C = FileSourceDataset(MelSpecDataSource(data_dir,
                                            hop_size=audio.get_hop_size(),
                                            max_steps=None,
                                            cin_pad=hparam.cin_pad))
    length_x = np.array(C.file_data_source.lengths)
    if C[0].shape[-1] != hparam.cin_channels:
        # Bug fix: the original message swapped the actual and expected values
        # and misspelled "cin_channels"; report the data's dimension as the
        # invalid value and the configured cin_channels as the expectation.
        raise RuntimeError("Invalid cin_channels {}. Expected to be {}.".format(
            C[0].shape[-1], hparam.cin_channels))
    dataset = DualDataset(X, C, length_x, batch_size=hparam.batch_size,
                          hparams=hparam)
    data_loader = de.GeneratorDataset(
        dataset,
        ["x_batch", "y_batch", "c_batch", "g_batch", "input_lengths", "mask"])
    return data_loader, dataset
def test_dataset_creator():
    """Build the shuffled, batch-size-1 IC15 test dataset."""
    test_ds = de.GeneratorDataset(IC15_TEST_Generator,
                                  ['img', 'img_resized', 'img_name'])
    test_ds = test_ds.shuffle(config.TEST_BUFFER_SIZE)
    return test_ds.batch(1, drop_remainder=config.TEST_DROP_REMAINDER)