Example 1
    def __init__(self,
                 name,
                 dataset,
                 training=True,
                 batch_size=1,
                 shuffle=False,
                 sampler=None,
                 batch_sampler=None,
                 num_workers=0,
                 epoch_interval=1,
                 collate_fn=None,
                 stack_dim=0,
                 pin_memory=False,
                 drop_last=False,
                 timeout=0,
                 worker_init_fn=None):

        super().__init__()

        ds = df.RepeatedData(dataset, -1)
        ds = df.MultiProcessRunnerZMQ(ds, num_proc=num_workers, hwm=300)
        # ds = df.MultiThreadRunner(lambda: ds, num_prefetch=1024, num_thread=num_workers)
        ds = df.BatchData(ds, batch_size)
        self.ds = ds

        self.name = name
        self.training = training
        self.epoch_interval = epoch_interval
        self.stack_dim = stack_dim
        self.batches_per_epoch = len(dataset) // batch_size
Example 2

    def __init__(self, imagenet_dir, mode, transform, batch_size, shuffle=False,
                 num_workers=4, cache=50000, drop_last=False):
        if drop_last:
            raise NotImplementedError("drop_last not implemented")
        # enumerate standard imagenet augmentors
        assert mode in ['train', 'val'], mode

        # open the lmdb file
        lmdb_loc = os.path.join(imagenet_dir, 'ILSVRC-%s.lmdb'%mode)
        ds = td.LMDBData(lmdb_loc, shuffle=False)
        if shuffle:
            ds = td.LocallyShuffleData(ds, cache)
        def f(x):
            img, label = td.LMDBSerializer._deserialize_lmdb(x)
            # img, label = x
            img = Image.open(BytesIO(img.tobytes())).convert('RGB')
            img = transform(img)
            return img, label
        # ds = td.MultiProcessMapDataZMQ(ds, num_proc=num_workers, map_func=f)
        ds = td.MultiThreadMapData(ds, num_thread=num_workers, map_func=f)
        # ds = td.MapData(ds, f)
        self.ds = td.BatchData(ds, batch_size, use_list=True, remainder=False)
        # self.ds.reset_state()

        self.batch_size = batch_size
        self.num_workers = num_workers

        self.ds.reset_state()
        self.ds_iter = iter(self.ds)
        self.N = self.ds.size()
        self.i = 0
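    # Hedged sketch (not part of the original snippet): the iterator methods a
    # loader like this typically defines alongside the __init__ above. Once a
    # full pass of self.N batches has been consumed, the underlying dataflow
    # iterator is rebuilt for the next epoch.
    def __iter__(self):
        return self

    def __next__(self):
        if self.i == self.N:
            self.i = 0
            self.ds_iter = iter(self.ds)
            raise StopIteration
        self.i += 1
        return next(self.ds_iter)

    def __len__(self):
        return self.N
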
Example 3

def evaluate_on(model, datasource):
    images, gt = next(df.BatchData(datasource, datasource.size()).get_data())
    gt = np.squeeze(gt)
    out = model.predict(images, verbose=True)
    _, dices, verbose = compute_dice_metric(preds=out, labels=gt)

    return verbose, dices
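A minimal way to call the helper above, assuming `unet` is a trained Keras model and `val_source` is a tensorpack DataFlow yielding (image, label) pairs (both names are placeholders, not from the original code):

val_source.reset_state()
verbose, dices = evaluate_on(unet, val_source)
print('mean Dice:', np.mean(dices))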
Example 4
def load_BSDS500():
    data_path = "../dataset/BSDS500"
    BSDS500 = df.dataset.BSDS500("train", data_dir=data_path, shuffle=True)
    BSDS500.reset_state()

    batches = df.BatchData(BSDS500, 8).get_data()
    images, labels = next(batches)
    image_utils.plot_semantics_data(images, labels, save_name="BSDS500_1.png")
Example 5
def load_Cifar10():
    data_path = "../dataset/Cifar10"
    Cifar10 = df.dataset.Cifar10("train", dir=data_path)
    Cifar10.reset_state()

    class_names = Cifar10.get_label_names()
    Cifar10_m = df.MapData(Cifar10, lambda dp: rot90(dp))
    batches = df.BatchData(Cifar10_m, 9).get_data()
    images, labels = next(batches)
    image_utils.plot_classification_data(images, labels, class_names=class_names, save_name="cifar10.png")
Example 6
def create_paired_parallel_dataflow_via_numpy(tf_dataset_1,
                                              tf_dataset_2,
                                              batch_size,
                                              augmentations,
                                              x_only=False,
                                              num_proc=cpu_count(),
                                              test_flow=True):
    X_1, y_1 = [], []
    X_2, y_2 = [], []
    # Materialize the dataset as a numpy array: this is memory intensive for large datasets!
    for data in tf_dataset_1:
        X_1.append(data[0].numpy())
        y_1.append(data[1].numpy())

    for data in tf_dataset_2:
        X_2.append(data[0].numpy())
        y_2.append(data[1].numpy())

    numpy_dataset_1 = list(zip(np.array(X_1), np.array(y_1)))
    numpy_dataset_2 = list(zip(np.array(X_2), np.array(y_2)))
    # Create a dataflow
    dataflow_1 = D.DataFromList(numpy_dataset_1)
    dataflow_2 = D.DataFromList(numpy_dataset_2)
    # Select some indices in the data
    if x_only:
        dataflow_1 = D.SelectComponent(dataflow_1, [0])
        dataflow_2 = D.SelectComponent(dataflow_2, [0])
    # Zip them
    dataflow = D.JoinData([dataflow_1, dataflow_2])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1],
                          compose_augmentations(x[2], augmentations), x[3])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),
                          compose_augmentations(x[1], augmentations))
    # Map the function onto the data with parallelism
    dataflow = D.MultiProcessMapData(dataflow,
                                     num_proc=num_proc,
                                     map_func=daug,
                                     strict=True)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    return dataflow
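A hedged usage sketch for the paired flow above; `ds_a`, `ds_b`, `train_augs`, `num_epochs` and `train_step` are placeholders. Because the flow is wrapped in RepeatedData(dataflow, 1), each epoch simply re-iterates it once:

flow = create_paired_parallel_dataflow_via_numpy(ds_a, ds_b,
                                                 batch_size=32,
                                                 augmentations=train_augs,
                                                 test_flow=False)
flow.reset_state()
for epoch in range(num_epochs):
    for x1, y1, x2, y2 in flow:
        train_step(x1, y1, x2, y2)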
Example 7

def read_data(files=None,
              batch_size=1,
              window=2,
              random_rotation=False,
              repeat=False,
              shuffle_buffer=None,
              num_workers=1,
              cache_data=False):
    print(files[0:20], '...' if len(files) > 20 else '')

    # caching makes only sense if the data is finite
    if cache_data:
        if repeat:
            raise Exception("repeat must be False if cache_data==True")
        if random_rotation:
            raise Exception(
                "random_rotation must be False if cache_data==True")
        if num_workers != 1:
            raise Exception("num_workers must be 1 if cache_data==True")

    df = PhysicsSimDataFlow(
        files=files,
        random_rotation=random_rotation,
        shuffle=bool(shuffle_buffer),
        window=window,
    )

    if repeat:
        df = dataflow.RepeatedData(df, -1)

    if shuffle_buffer:
        df = dataflow.LocallyShuffleData(df, shuffle_buffer)

    if num_workers > 1:
        df = dataflow.MultiProcessRunnerZMQ(df, num_proc=num_workers)

    df = dataflow.BatchData(df, batch_size=batch_size, use_list=True)

    if cache_data:
        df = dataflow.CacheData(df)

    df.reset_state()
    return df
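A minimal consumption sketch; the file list below is a placeholder. read_data() already calls reset_state(), and with use_list=True each batch component is a plain Python list:

import glob

files = sorted(glob.glob('data/train/*'))  # hypothetical file list; format is whatever PhysicsSimDataFlow expects
flow = read_data(files=files, batch_size=4, window=2, shuffle_buffer=512)
for batch in flow:
    pass  # one finite pass; wrap in an epoch loop when repeat=True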
Example 8
    def _wrap_flow(self, dataset: RNGDataFlow) -> RNGDataFlow:

        dataset = D.MultiProcessMapData(
            dataset,
            num_proc=12,
            map_func=lambda x: self._read_and_aug(x, self.augmentor),
            buffer_size=self.config['batch_size'] * 3,
            strict=True,
        )

        if not self.debug:
            if self.train:
                dataset = D.RepeatedData(dataset, num=-1)
                #dataset = D.LocallyShuffleData(dataset, 2000)
            dataset = D.BatchData(dataset, self.config['batch_size'])

        dataset.reset_state()

        return dataset
Example 9
def create_parallel_dataflow_via_numpy(tf_dataset,
                                       batch_size,
                                       augmentations=(),
                                       gpu_augmentations=(),
                                       x_only=False,
                                       num_proc=cpu_count(),
                                       test_flow=True):
    X, y = [], []
    # Materialize the dataset as a numpy array: this is memory intensive for large datasets!
    for data in tf_dataset:
        X.append(data[0].numpy())
        y.append(data[1].numpy())
    numpy_dataset = list(zip(np.array(X), np.array(y)))
    # Create a dataflow
    dataflow = D.DataFromList(numpy_dataset)
    # Select some indices in the data
    if x_only:
        dataflow = D.SelectComponent(dataflow, [0])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),)  # one-element tuple keeps the datapoint format consistent
    # Map the function onto the data with parallelism
    dataflow = D.MultiProcessMapData(dataflow,
                                     num_proc=num_proc,
                                     map_func=daug,
                                     strict=True)
    # Create a function for gpu data augmentations
    gpu_daug = lambda x: (compose_augmentations(x, gpu_augmentations))
    # Map the function onto the data
    dataflow = D.MapDataComponent(dataflow, func=gpu_daug, index=0)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    return dataflow
Example 10
def create_direct_dataflow(
        tf_dataset,
        batch_size,
        augmentations=(),
        gpu_augmentations=(),
        label_augmentations=(),
        num_proc=cpu_count(),
        test_flow=True,
):

    # Create a dataflow
    dataflow = D.DataFromGenerator(tf_dataset)
    # Map the tensors to numpy arrays
    dataflow = D.MapData(dataflow, func=lambda x: (x[0].numpy(), x[1].numpy()))
    # Batch the data
    dataflow = D.BatchData(dataflow, batch_size=batch_size)
    # Repeat the data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    daug = lambda x: compose_augmentations((compose_augmentations(
        x[0], augmentations), x[1]), label_augmentations)
    # Map the function onto the data
    dataflow = D.MapData(dataflow, func=daug)
    # Create a function for gpu data augmentations
    gpu_daug = lambda x: (compose_augmentations(x, gpu_augmentations))
    # Map the function onto the data
    dataflow = D.MapDataComponent(dataflow, func=gpu_daug, index=0)

    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow, size=128).start()
    else:
        # Reset state manually
        dataflow.reset_state()

    return dataflow
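Since the flow above is wrapped in RepeatedData(dataflow, 1), the caller drives the epochs itself; a sketch with `train_ds` (a tf.data dataset of (image, label) tensors) and `num_epochs` as placeholders. With test_flow=False the function has already called reset_state():

flow = create_direct_dataflow(train_ds, batch_size=64, test_flow=False)
for epoch in range(num_epochs):
    for x_batch, y_batch in flow:
        ...  # one training step per batch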
Example 11
def create_paired_direct_dataflow(tf_dataset_1,
                                  tf_dataset_2,
                                  batch_size,
                                  augmentations,
                                  x_only=False,
                                  num_proc=cpu_count(),
                                  test_flow=True,
                                  cache_dir1='',
                                  cache_dir2='',
                                  shuffle=True,
                                  shuffle_buffer=1000):
    # Cache the dataset first
    tf_dataset_1 = tf_dataset_1.cache(cache_dir1).prefetch(
        tf.data.experimental.AUTOTUNE)
    tf_dataset_2 = tf_dataset_2.cache(cache_dir2).prefetch(
        tf.data.experimental.AUTOTUNE)

    try:
        # Unbatch them
        tf_dataset_1 = tf_dataset_1.unbatch()
        tf_dataset_2 = tf_dataset_2.unbatch()
    except ValueError:
        pass

    if shuffle:
        # Shuffle the data
        tf_dataset_1 = tf_dataset_1.shuffle(shuffle_buffer, seed=1)
        tf_dataset_2 = tf_dataset_2.shuffle(shuffle_buffer, seed=2)

    # Run through the data once to populate the tf.data caches; this warm-up
    # pass is required before wrapping the datasets as generators below
    for _ in tf_dataset_1.batch(batch_size):
        print('.', end='')

    for _ in tf_dataset_2.batch(batch_size):
        print('.', end='')

    # Create a dataflow
    dataflow_1 = D.DataFromGenerator(tf_dataset_1)
    dataflow_2 = D.DataFromGenerator(tf_dataset_2)
    # Map the tensors to numpy arrays
    dataflow_1 = D.MapData(dataflow_1,
                           func=lambda x: (x[0].numpy(), x[1].numpy()))
    dataflow_2 = D.MapData(dataflow_2,
                           func=lambda x: (x[0].numpy(), x[1].numpy()))
    # Select some indices in the data
    if x_only:
        dataflow_1 = D.SelectComponent(dataflow_1, [0])
        dataflow_2 = D.SelectComponent(dataflow_2, [0])
    # Zip them
    dataflow = D.JoinData([dataflow_1, dataflow_2])
    # Batch data
    dataflow = D.BatchData(dataflow, batch_size=batch_size, remainder=True)
    # Repeat data only once, we use a custom loop over epochs
    dataflow = D.RepeatedData(dataflow, 1)
    # Create a function for data augmentations
    if not x_only:
        daug = lambda x: (compose_augmentations(x[0], augmentations), x[1],
                          compose_augmentations(x[2], augmentations), x[3])
    else:
        daug = lambda x: (compose_augmentations(x[0], augmentations),
                          compose_augmentations(x[1], augmentations))
    # Map the function onto the data
    dataflow = D.MapData(dataflow, func=daug)
    if test_flow:
        # A quick runthrough of all the data
        D.TestDataSpeed(dataflow).start()
    else:
        # Reset state manually
        dataflow.reset_state()
    return dataflow
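A hedged construction sketch; the dataset names, augmentation list and cache paths are placeholders. The warm-up loops inside the function iterate both datasets once, so the first call is slow, but later epochs read from the tf.data caches:

flow = create_paired_direct_dataflow(ds_a, ds_b,
                                     batch_size=32,
                                     augmentations=train_augs,
                                     cache_dir1='/tmp/cache_a',
                                     cache_dir2='/tmp/cache_b',
                                     test_flow=False)
for x1, y1, x2, y2 in flow:
    ...  # one step on the paired batch (x1, y1) / (x2, y2)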
Example 12

def batch_predict(img_dir,
                  out_dir,
                  config,
                  segmodel=None,
                  image_size=None,
                  image_extension='.png'):
    """
    Generates segmentation results visualization for all  images in a given folder

    :param segmodel: segmentation model
    :param img_dir: directory where source images are locate
    :param out_dir:  path to save output segmentatiions, visualization will be saved on outdir/viz directory
    :param N_classes: number of classes that model predicts. see configuration
    :param image_size: image will be resized to image_size before prediction
    :return:
    """

    N_classes = config.NUM_CLASSES

    if segmodel is None:
        segmodel = load_tfkeras_model(config.MODEL_SAVE_DIR,
                                      file_name_prefix=config.NAME,
                                      model=None,
                                      custom_objects={})

    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    test_data = DirectoryImagesTest(img_dir, image_extension)
    test_ds = SegmentationData(data=test_data.data,
                               loadLabels=False,
                               shuffle=False,
                               isRGB=config.IS_RGB)

    resizer = ([df.imgaug.Resize(image_size, interp=cv2.INTER_NEAREST)]
               if image_size else [])
    test_ds = df.AugmentImageComponent(test_ds, augmentors=resizer)
    test_ds = df.MapDataComponent(test_ds, lambda x: x / 255.0, index=0)
    test_ds = df.MapDataComponent(test_ds,
                                  lambda x: np.expand_dims(x, -1),
                                  index=0)
    test_ds = df.BatchData(test_ds, batch_size=np.min([16, test_ds.size()]))

    batch_iter = test_ds.get_data()

    vizdir = os.path.join(out_dir, 'viz')
    if not os.path.exists(vizdir):
        os.mkdir(vizdir)

    image_name = lambda x: os.path.basename(x).split('.')[0]

    for batch in batch_iter:
        images, file_names = batch[0], batch[2]
        out = segmodel.predict(images, batch_size=4)
        images = images * 255

        out_labelmap = np.argmax(out, axis=3).astype(np.uint8)
        for l, f in zip(out_labelmap, file_names):
            cv2.imwrite(os.path.join(out_dir, image_name(f) + '.png'), l)

        viz, _ = visualize_labels_overlay_labelmap(np.argmax(out, axis=3),
                                                   images,
                                                   N_classes,
                                                   stack_images=False)
        for vim, f in zip(viz, file_names):
            cv2.imwrite(os.path.join(vizdir, 'v' + f), vim)

    print('Done')
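A hedged call sketch: `cfg` stands for the project's configuration object (it must expose NUM_CLASSES, MODEL_SAVE_DIR, NAME and IS_RGB), and the paths are placeholders:

batch_predict(img_dir='data/test_images',
              out_dir='results/segmentations',
              config=cfg,
              image_size=(256, 256),
              image_extension='.png')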