Example #1
    def __init__(self,
                 name,
                 dataset,
                 training=True,
                 batch_size=1,
                 shuffle=False,
                 sampler=None,
                 batch_sampler=None,
                 num_workers=0,
                 epoch_interval=1,
                 collate_fn=None,
                 stack_dim=0,
                 pin_memory=False,
                 drop_last=False,
                 timeout=0,
                 worker_init_fn=None):

        super().__init__()

        # Build the tensorpack dataflow pipeline: repeat the dataset forever,
        # run it in worker processes over ZMQ, then group samples into batches.
        ds = df.RepeatedData(dataset, -1)
        ds = df.MultiProcessRunnerZMQ(ds, num_proc=num_workers, hwm=300)
        # ds = df.MultiThreadRunner(lambda: ds, num_prefetch=1024, num_thread=num_workers)
        ds = df.BatchData(ds, batch_size)
        self.ds = ds

        self.name = name
        self.training = training
        self.epoch_interval = epoch_interval
        self.stack_dim = stack_dim
        self.batches_per_epoch = len(dataset) // batch_size
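A minimal usage sketch for the loader above, assuming the usual tensorpack DataFlow protocol (reset_state() once, then iterate). `Loader`, `my_dataset`, and `num_epochs` are hypothetical names, not taken from the original source:

loader = Loader('train', my_dataset, batch_size=32, num_workers=4)
loader.ds.reset_state()      # required once before iterating a tensorpack DataFlow
stream = iter(loader.ds)     # endless stream, because of RepeatedData(dataset, -1)
for epoch in range(num_epochs):
    for _ in range(loader.batches_per_epoch):
        batch = next(stream)
        ...                  # train on the batch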
Example #2
def create_paired_parallel_dataflow_via_numpy(tf_dataset_1,
                                              tf_dataset_2,
                                              batch_size,
                                              augmentations,
                                              x_only=False,
                                              num_proc=cpu_count(),
                                              test_flow=True):
    X_1, y_1 = [], []
    X_2, y_2 = [], []
    # Materialize the dataset as a numpy array: this is memory intensive for large datasets!
    for data in tf_dataset_1:
        X_1.append(data[0].numpy())
        y_1.append(data[1].numpy())

    for data in tf_dataset_2:
        X_2.append(data[0].numpy())
        y_2.append(data[1].numpy())

    numpy_dataset_1 = list(zip(np.array(X_1), np.array(y_1)))
    numpy_dataset_2 = list(zip(np.array(X_2), np.array(y_2)))
    # Create a dataflow
    dataflow_1 = D.DataFromList(numpy_dataset_1)
    dataflow_2 = D.DataFromList(numpy_dataset_2)
    # Select some indices in the data
    if x_only:
        dataflow_1 = D.SelectComponent(dataflow_1, [0])
        dataflow_2 = D.SelectComponent(dataflow_2, [0])
    # Zip them
    dataflow = D.JoinData([dataflow_1, dataflow_2])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1],
                          compose_augmentations(x[2], augmentations), x[3])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),
                          compose_augmentations(x[1], augmentations))
    # Map the function onto the data with parallelism
    dataflow = D.MultiProcessMapData(dataflow,
                                     num_proc=num_proc,
                                     map_func=daug,
                                     strict=True)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    return dataflow
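compose_augmentations is used throughout these examples but not shown on this page. A minimal sketch of what it presumably does, assuming `augmentations` is a sequence of callables applied left to right:

def compose_augmentations(x, augmentations):
    # Fold the augmentation callables over the input, in order.
    for aug in augmentations:
        x = aug(x)
    return x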
Example #3
def read_data(files=None,
              batch_size=1,
              window=2,
              random_rotation=False,
              repeat=False,
              shuffle_buffer=None,
              num_workers=1,
              cache_data=False):
    print(files[0:20], '...' if len(files) > 20 else '')

    # caching only makes sense if the data is finite
    if cache_data:
        if repeat:
            raise Exception("repeat must be False if cache_data==True")
        if random_rotation:
            raise Exception(
                "random_rotation must be False if cache_data==True")
        if num_workers != 1:
            raise Exception("num_workers must be 1 if cache_data==True")

    df = PhysicsSimDataFlow(
        files=files,
        random_rotation=random_rotation,
        shuffle=bool(shuffle_buffer),
        window=window,
    )

    if repeat:
        df = dataflow.RepeatedData(df, -1)

    if shuffle_buffer:
        df = dataflow.LocallyShuffleData(df, shuffle_buffer)

    if num_workers > 1:
        df = dataflow.MultiProcessRunnerZMQ(df, num_proc=num_workers)

    df = dataflow.BatchData(df, batch_size=batch_size, use_list=True)

    if cache_data:
        df = dataflow.CacheData(df)

    df.reset_state()
    return df
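A hedged usage sketch for read_data: the returned DataFlow already had reset_state() called, so it can be iterated directly, and because of use_list=True every batch component is a plain Python list of length batch_size. The file names below are hypothetical:

sim_files = ['sim_0000.msgpack', 'sim_0001.msgpack']   # hypothetical paths
df = read_data(files=sim_files, batch_size=4, window=2,
               shuffle_buffer=512, num_workers=2)
for batch in df:
    ...  # each component of `batch` is a list with batch_size entries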
Example #4
    def _wrap_flow(self, dataset: RNGDataFlow) -> RNGDataFlow:

        dataset = D.MultiProcessMapData(
            dataset,
            num_proc=12,
            map_func=lambda x: self._read_and_aug(x, self.augmentor),
            buffer_size=self.config['batch_size'] * 3,
            strict=True,
        )

        if not self.debug:
            if self.train:
                dataset = D.RepeatedData(dataset, num=-1)
                #dataset = D.LocallyShuffleData(dataset, 2000)
            dataset = D.BatchData(dataset, self.config['batch_size'])

        dataset.reset_state()

        return dataset
Example #5
def create_parallel_dataflow_via_numpy(tf_dataset,
                                       batch_size,
                                       augmentations=(),
                                       gpu_augmentations=(),
                                       x_only=False,
                                       num_proc=cpu_count(),
                                       test_flow=True):
    X, y = [], []
    # Materialize the dataset as a numpy array: this is memory intensive for large datasets!
    for data in tf_dataset:
        X.append(data[0].numpy())
        y.append(data[1].numpy())
    numpy_dataset = list(zip(np.array(X), np.array(y)))
    # Create a dataflow
    dataflow = D.DataFromList(numpy_dataset)
    # Select some indices in the data
    if x_only:
        dataflow = D.SelectComponent(dataflow, [0])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1])
    else:
        # Keep the mapped datapoint a one-element tuple so the downstream
        # MapDataComponent(index=0) still sees a sequence of components.
        daug = lambda x: (compose_augmentations(x[0], augmentations),)
    # Map the function onto the data with parallelism
    dataflow = D.MultiProcessMapData(dataflow,
                                     num_proc=num_proc,
                                     map_func=daug,
                                     strict=True)
    # Create a function for gpu data augmentations
    gpu_daug = lambda x: (compose_augmentations(x, gpu_augmentations))
    # Map the function onto the data
    dataflow = D.MapDataComponent(dataflow, func=gpu_daug, index=0)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    return dataflow
Example #6
def create_direct_dataflow(
        tf_dataset,
        batch_size,
        augmentations=(),
        gpu_augmentations=(),
        label_augmentations=(),
        num_proc=cpu_count(),
        test_flow=True,
):

    # Create a dataflow
    dataflow = D.DataFromGenerator(tf_dataset)
    # Map the tensors to numpy arrays
    dataflow = D.MapData(dataflow, func=lambda x: (x[0].numpy(), x[1].numpy()))
    # Batch the data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat the data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    daug = lambda x: compose_augmentations((compose_augmentations(
        x[0], augmentations), x[1]), label_augmentations)
    # Map the function onto the data
    dataflow = D.MapData(dataflow, func=daug)
    # Create a function for gpu data augmentations
    gpu_daug = lambda x: (compose_augmentations(x, gpu_augmentations))
    # Map the function onto the data
    dataflow = D.MapDataComponent(dataflow, func=gpu_daug, index=0)

    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow, size=128).start()
    else:
        # Reset state manually
        dataflow.reset_state()

    return dataflow
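A hedged usage sketch for create_direct_dataflow, with hypothetical names (my_tf_dataset, my_aug, num_epochs): since RepeatedData(dataflow, 1) yields exactly one pass over the data, an outer loop over epochs is expected, and with test_flow=False the state is already reset:

flow = create_direct_dataflow(my_tf_dataset, batch_size=64,
                              augmentations=(my_aug,),
                              test_flow=False)
for epoch in range(num_epochs):
    for x_batch, y_batch in flow:
        ...  # x_batch has passed through both the CPU and the "gpu" augmentation chains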
Example #7
def create_paired_direct_dataflow(tf_dataset_1,
                                  tf_dataset_2,
                                  batch_size,
                                  augmentations,
                                  x_only=False,
                                  num_proc=cpu_count(),
                                  test_flow=True,
                                  cache_dir1='',
                                  cache_dir2='',
                                  shuffle=True,
                                  shuffle_buffer=1000):
    # Cache the dataset first
    tf_dataset_1 = tf_dataset_1.cache(cache_dir1).prefetch(
        tf.data.experimental.AUTOTUNE)
    tf_dataset_2 = tf_dataset_2.cache(cache_dir2).prefetch(
        tf.data.experimental.AUTOTUNE)

    try:
        # Unbatch them
        tf_dataset_1 = tf_dataset_1.unbatch()
        tf_dataset_2 = tf_dataset_2.unbatch()
    except ValueError:
        pass

    if shuffle:
        # Shuffle the data
        tf_dataset_1 = tf_dataset_1.shuffle(shuffle_buffer, seed=1)
        tf_dataset_2 = tf_dataset_2.shuffle(shuffle_buffer, seed=2)

    # Run through the data once to populate the caches; this is necessary,
    # otherwise the cached datasets won't work.
    for _ in tf_dataset_1.batch(batch_size):
        print('.', end='')

    for _ in tf_dataset_2.batch(batch_size):
        print('.', end='')

    # Create a dataflow
    dataflow_1 = D.DataFromGenerator(tf_dataset_1)
    dataflow_2 = D.DataFromGenerator(tf_dataset_2)
    # Map the tensors to numpy arrays
    dataflow_1 = D.MapData(dataflow_1,
                           func=lambda x: (x[0].numpy(), x[1].numpy()))
    dataflow_2 = D.MapData(dataflow_2,
                           func=lambda x: (x[0].numpy(), x[1].numpy()))
    # Select some indices in the data
    if x_only:
        dataflow_1 = D.SelectComponent(dataflow_1, [0])
        dataflow_2 = D.SelectComponent(dataflow_2, [0])
    # Zip them
    dataflow = D.JoinData([dataflow_1, dataflow_2])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size, remainder=True)
    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1],
                          compose_augmentations(x[2], augmentations), x[3])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),
                          compose_augmentations(x[1], augmentations))
    # Map the function onto the data
    dataflow = D.MapData(dataflow, func=daug)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    else:
        # Reset state manually
        dataflow.reset_state()
    return dataflow
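A hedged usage sketch for the paired flow, again with hypothetical dataset, augmentation, and cache-path names. With x_only=False each datapoint carries four batched components (augmented inputs and labels from each dataset):

flow = create_paired_direct_dataflow(dataset_a, dataset_b, batch_size=32,
                                     augmentations=(my_aug,),
                                     test_flow=False,
                                     cache_dir1='/tmp/cache_a',
                                     cache_dir2='/tmp/cache_b')
for x1, y1, x2, y2 in flow:
    ...  # x1/x2 are augmented input batches, y1/y2 the matching label batches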