def __init__(self, name, dataset, training=True, batch_size=1, shuffle=False,
             sampler=None, batch_sampler=None, num_workers=0, epoch_interval=1,
             collate_fn=None, stack_dim=0, pin_memory=False, drop_last=False,
             timeout=0, worker_init_fn=None):
    # Several arguments mirror torch.utils.data.DataLoader and are accepted for
    # interface compatibility but not used by the dataflow pipeline below.
    super().__init__()

    # Repeat the dataset indefinitely, prefetch datapoints in worker processes
    # over ZMQ, and assemble fixed-size batches.
    ds = df.RepeatedData(dataset, -1)
    ds = df.MultiProcessRunnerZMQ(ds, num_proc=num_workers, hwm=300)
    # ds = df.MultiThreadRunner(lambda: ds, num_prefetch=1024, num_thread=num_workers)
    ds = df.BatchData(ds, batch_size)
    self.ds = ds

    self.name = name
    self.training = training
    self.epoch_interval = epoch_interval
    self.stack_dim = stack_dim
    self.batches_per_epoch = len(dataset) // batch_size
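# Usage sketch (hedged): the class owning the __init__ above is not shown, so
# `Loader` is a placeholder name and `my_dataset` stands for any DataFlow with a
# length. Because the wrapped flow repeats forever, iteration is bounded per
# epoch with `batches_per_epoch`.
loader = Loader('train', my_dataset, training=True, batch_size=32, num_workers=4)
loader.ds.reset_state()                      # start the ZMQ worker processes
it = iter(loader.ds)
for _ in range(loader.batches_per_epoch):
    batch = next(it)                         # a list of batched components
    # ... run one training step on `batch`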
def create_paired_parallel_dataflow_via_numpy(tf_dataset_1,
                                              tf_dataset_2,
                                              batch_size,
                                              augmentations,
                                              x_only=False,
                                              num_proc=cpu_count(),
                                              test_flow=True):
    X_1, y_1 = [], []
    X_2, y_2 = [], []
    # Materialize the datasets as numpy arrays: this is memory intensive for large datasets!
    for data in tf_dataset_1:
        X_1.append(data[0].numpy())
        y_1.append(data[1].numpy())
    for data in tf_dataset_2:
        X_2.append(data[0].numpy())
        y_2.append(data[1].numpy())
    numpy_dataset_1 = list(zip(np.array(X_1), np.array(y_1)))
    numpy_dataset_2 = list(zip(np.array(X_2), np.array(y_2)))

    # Create a dataflow per dataset
    dataflow_1 = D.DataFromList(numpy_dataset_1)
    dataflow_2 = D.DataFromList(numpy_dataset_2)

    # Select some indices in the data
    if x_only:
        dataflow_1 = D.SelectComponent(dataflow_1, [0])
        dataflow_2 = D.SelectComponent(dataflow_2, [0])

    # Zip them
    dataflow = D.JoinData([dataflow_1, dataflow_2])

    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)

    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)

    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1],
                          compose_augmentations(x[2], augmentations), x[3])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),
                          compose_augmentations(x[1], augmentations))

    # Map the function onto the data with parallelism
    dataflow = D.MultiProcessMapData(dataflow,
                                     num_proc=num_proc,
                                     map_func=daug,
                                     strict=True)

    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()

    return dataflow
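# Usage sketch (hedged): `train_ds_a`/`train_ds_b` are placeholder tf.data.Dataset
# objects yielding (image, label) pairs, and `augmentations` is whatever the
# module's compose_augmentations helper expects. With test_flow=False nothing
# resets the flow, so reset_state() is called here before iterating.
paired_flow = create_paired_parallel_dataflow_via_numpy(
    train_ds_a, train_ds_b, batch_size=64, augmentations=augmentations,
    test_flow=False)
paired_flow.reset_state()
for x1, y1, x2, y2 in paired_flow:
    pass  # one aligned, augmented batch from each dataset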
def read_data(files=None,
              batch_size=1,
              window=2,
              random_rotation=False,
              repeat=False,
              shuffle_buffer=None,
              num_workers=1,
              cache_data=False):
    print(files[0:20], '...' if len(files) > 20 else '')

    # Caching only makes sense if the data is finite
    if cache_data:
        if repeat:
            raise Exception("repeat must be False if cache_data==True")
        if random_rotation:
            raise Exception("random_rotation must be False if cache_data==True")
        if num_workers != 1:
            raise Exception("num_workers must be 1 if cache_data==True")

    df = PhysicsSimDataFlow(
        files=files,
        random_rotation=random_rotation,
        shuffle=bool(shuffle_buffer),
        window=window,
    )

    if repeat:
        df = dataflow.RepeatedData(df, -1)

    if shuffle_buffer:
        df = dataflow.LocallyShuffleData(df, shuffle_buffer)

    if num_workers > 1:
        df = dataflow.MultiProcessRunnerZMQ(df, num_proc=num_workers)

    df = dataflow.BatchData(df, batch_size=batch_size, use_list=True)

    if cache_data:
        df = dataflow.CacheData(df)

    df.reset_state()
    return df
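# Usage sketch (hedged): the file pattern below is hypothetical; `files` just needs
# to be a list that PhysicsSimDataFlow understands. read_data() already calls
# reset_state(), so the returned flow can be iterated directly.
import glob

train_files = sorted(glob.glob('train/*.msgpack'))   # placeholder path/extension
train_df = read_data(files=train_files, batch_size=16, window=2,
                     random_rotation=True, repeat=True, shuffle_buffer=512,
                     num_workers=4)
for step, batch in enumerate(train_df):
    # each component of `batch` is a python list of length batch_size (use_list=True)
    if step >= 1000:   # repeat=True makes the flow infinite, so bound the loop
        break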
def _wrap_flow(self, dataset: RNGDataFlow) -> RNGDataFlow:
    # Decode and augment samples in parallel worker processes
    dataset = D.MultiProcessMapData(
        dataset,
        num_proc=12,
        map_func=lambda x: self._read_and_aug(x, self.augmentor),
        buffer_size=self.config['batch_size'] * 3,
        strict=True,
    )
    if not self.debug:
        if self.train:
            # Repeat indefinitely for training; epoch length is handled by the caller
            dataset = D.RepeatedData(dataset, num=-1)
            # dataset = D.LocallyShuffleData(dataset, 2000)
        dataset = D.BatchData(dataset, self.config['batch_size'])
    dataset.reset_state()
    return dataset
def create_parallel_dataflow_via_numpy(tf_dataset,
                                       batch_size,
                                       augmentations=(),
                                       gpu_augmentations=(),
                                       x_only=False,
                                       num_proc=cpu_count(),
                                       test_flow=True):
    X, y = [], []
    # Materialize the dataset as a numpy array: this is memory intensive for large datasets!
    for data in tf_dataset:
        X.append(data[0].numpy())
        y.append(data[1].numpy())
    numpy_dataset = list(zip(np.array(X), np.array(y)))

    # Create a dataflow
    dataflow = D.DataFromList(numpy_dataset)

    # Select some indices in the data
    if x_only:
        dataflow = D.SelectComponent(dataflow, [0])

    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)

    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)

    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1])
    else:
        # One-element tuple keeps the datapoint indexable for MapDataComponent below
        daug = lambda x: (compose_augmentations(x[0], augmentations),)

    # Map the function onto the data with parallelism
    dataflow = D.MultiProcessMapData(dataflow,
                                     num_proc=num_proc,
                                     map_func=daug,
                                     strict=True)

    # Create a function for gpu data augmentations
    gpu_daug = lambda x: compose_augmentations(x, gpu_augmentations)

    # Map the function onto component 0 (the inputs)
    dataflow = D.MapDataComponent(dataflow, func=gpu_daug, index=0)

    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()

    return dataflow
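# Usage sketch (hedged): `train_ds` stands for a tf.data.Dataset of (image, label)
# pairs; `cpu_flip` and `gpu_normalize` are placeholder augmentation callables in
# whatever form compose_augmentations expects. With test_flow=False nothing resets
# the flow, so reset_state() is called here before iterating.
flow = create_parallel_dataflow_via_numpy(train_ds, batch_size=128,
                                          augmentations=(cpu_flip,),
                                          gpu_augmentations=(gpu_normalize,),
                                          test_flow=False)
flow.reset_state()
for x, y in flow:
    pass  # one augmented batch of inputs and labels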
def create_direct_dataflow(
    tf_dataset,
    batch_size,
    augmentations=(),
    gpu_augmentations=(),
    label_augmentations=(),
    num_proc=cpu_count(),
    test_flow=True,
):
    # Create a dataflow
    dataflow = D.DataFromGenerator(tf_dataset)
    # Map the tensors to numpy arrays
    dataflow = D.MapData(dataflow, func=lambda x: (x[0].numpy(), x[1].numpy()))
    # Batch the data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat the data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    daug = lambda x: compose_augmentations(
        (compose_augmentations(x[0], augmentations), x[1]), label_augmentations)
    # Map the function onto the data
    dataflow = D.MapData(dataflow, func=daug)
    # Create a function for gpu data augmentations
    gpu_daug = lambda x: compose_augmentations(x, gpu_augmentations)
    # Map the function onto the data
    dataflow = D.MapDataComponent(dataflow, func=gpu_daug, index=0)
    if test_flow:
        # A quick runthrough of the data
        D.TestDataSpeed(dataflow, size=128).start()
    else:
        # Reset state manually
        dataflow.reset_state()
    return dataflow
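# Usage sketch (hedged): DataFromGenerator accepts any iterable, so a
# tf.data.Dataset instance works directly; `train_ds`, `cpu_flip` and
# `label_smooth` are placeholders, and the unpacking below assumes
# compose_augmentations keeps the (inputs, labels) pair shape. With
# test_flow=False the function resets state itself, so the result can be
# iterated immediately.
flow = create_direct_dataflow(train_ds, batch_size=128,
                              augmentations=(cpu_flip,),
                              label_augmentations=(label_smooth,),
                              test_flow=False)
for x, y in flow:
    pass  # one batch of augmented inputs and (possibly augmented) labels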
def create_paired_direct_dataflow(tf_dataset_1,
                                  tf_dataset_2,
                                  batch_size,
                                  augmentations,
                                  x_only=False,
                                  num_proc=cpu_count(),
                                  test_flow=True,
                                  cache_dir1='',
                                  cache_dir2='',
                                  shuffle=True,
                                  shuffle_buffer=1000):
    # Cache the datasets first
    tf_dataset_1 = tf_dataset_1.cache(cache_dir1).prefetch(
        tf.data.experimental.AUTOTUNE)
    tf_dataset_2 = tf_dataset_2.cache(cache_dir2).prefetch(
        tf.data.experimental.AUTOTUNE)

    try:
        # Unbatch them
        tf_dataset_1 = tf_dataset_1.unbatch()
        tf_dataset_2 = tf_dataset_2.unbatch()
    except ValueError:
        pass

    if shuffle:
        # Shuffle the data
        tf_dataset_1 = tf_dataset_1.shuffle(shuffle_buffer, seed=1)
        tf_dataset_2 = tf_dataset_2.shuffle(shuffle_buffer, seed=2)

    # Run through both datasets once to populate the caches; this is required
    # before wrapping them below
    for _ in tf_dataset_1.batch(batch_size):
        print('.', end='')
    for _ in tf_dataset_2.batch(batch_size):
        print('.', end='')

    # Create a dataflow per dataset
    dataflow_1 = D.DataFromGenerator(tf_dataset_1)
    dataflow_2 = D.DataFromGenerator(tf_dataset_2)

    # Map the tensors to numpy arrays
    dataflow_1 = D.MapData(dataflow_1,
                           func=lambda x: (x[0].numpy(), x[1].numpy()))
    dataflow_2 = D.MapData(dataflow_2,
                           func=lambda x: (x[0].numpy(), x[1].numpy()))

    # Select some indices in the data
    if x_only:
        dataflow_1 = D.SelectComponent(dataflow_1, [0])
        dataflow_2 = D.SelectComponent(dataflow_2, [0])

    # Zip them
    dataflow = D.JoinData([dataflow_1, dataflow_2])

    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size, remainder=True)

    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)

    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1],
                          compose_augmentations(x[2], augmentations), x[3])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),
                          compose_augmentations(x[1], augmentations))

    # Map the function onto the data
    dataflow = D.MapData(dataflow, func=daug)

    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    else:
        # Reset state manually
        dataflow.reset_state()
    return dataflow
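# Usage sketch (hedged): `ds_a`/`ds_b` are placeholder tf.data.Dataset objects and
# the cache paths are hypothetical. The function caches, shuffles, pairs and
# batches the two datasets; with test_flow=False it calls reset_state() itself.
flow = create_paired_direct_dataflow(ds_a, ds_b, batch_size=64,
                                     augmentations=(cpu_flip,),
                                     cache_dir1='/tmp/cache_a',
                                     cache_dir2='/tmp/cache_b',
                                     test_flow=False)
for x1, y1, x2, y2 in flow:
    pass  # aligned augmented batches from the two datasets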