def setup_train_dataset(self):
    """
    Build the training set as parallel lists of image ids and labels.

    Examples are drawn so that every run of self.batch_size consecutive
    entries follows the same distribution.

    When self.sample_class is truthy, the examples come from
    Sampler.custom_distribution(); otherwise they come from
    self.n_train_batches blocks of self.batch_size examples broken off
    by a BlockDesigner built over self.train_examples.

    Returns a dict with the image ids under "X" and the labels under "y".
    """
    designer = BlockDesigner(self.train_examples)

    # Sampler path: hand the whole remainder over and let the sampler
    # decide which examples make up the custom distribution.
    if self.sample_class:
        sampler = Sampler(designer.remainder(), seed=self.random_seed)
        images, labels = sampler.custom_distribution(self.sample_class,
                                                     self.batch_size,
                                                     self.custom_distribution)
        return {"X": images, "y": labels}

    # Default path: flatten each block (iterated as label -> image ids)
    # into two parallel lists, keeping block order.
    images, labels = [], []
    batch_blocks = designer.break_off_multiple_blocks(self.n_train_batches,
                                                      self.batch_size)
    for batch_block in batch_blocks:
        for label, members in batch_block.items():
            members = list(members)
            images.extend(members)
            labels.extend([label] * len(members))
    return {"X": images, "y": labels}
def __init__(self,
             train_image_dir="data/train/centered_crop/",
             image_shape=(128, 128, 3),
             batch_size=128,
             cache_size_factor=8,
             center=0,
             normalize=0,
             amplify=1,
             train_flip='no_flip',
             shuffle=1,
             test_image_dir=None,
             random_seed=None,
             valid_dataset_size=4864,
             valid_flip='no_flip',
             test_flip='no_flip',
             sample_class=None,
             custom_distribution=None,
             train_color_cast='no_cast',
             valid_color_cast='no_cast',
             test_color_cast='no_cast',
             color_cast_range=20):
    """
    Configure the stream and split the labelled examples into datasets.

    train_image_dir / test_image_dir: stored as-is on the instance.
    image_shape: stored as-is; its last entry is handed to ColorCastOracle.
    batch_size: number of examples per batch.
    cache_size_factor: the cache holds batch_size * cache_size_factor images.
    center / normalize: when either equals 1, calc_mean_std_image() is
        called at the end of construction.
    amplify: stored as-is on the instance.
    train_flip / valid_flip / test_flip: flip modes passed to
        ImageFlipOracle.get_flip_lambda(); the valid and test lambdas are
        requested with deterministic=True.
    shuffle: when truthy, the training set is not pre-built and
        self.train_dataset is left as None; when falsy it is built once
        with setup_train_dataset().
    random_seed: seed handed to the BlockDesigner that splits the examples.
    valid_dataset_size: number of examples broken off for validation.
    sample_class / custom_distribution: stored for setup_train_dataset();
        when sample_class is truthy, n_train_batches is recomputed from
        the size of the sampled training set.
    *_color_cast / color_cast_range: passed to ColorCastOracle to obtain
        the per-dataset color cast lambdas.
    """
    self.train_image_dir = train_image_dir
    self.test_image_dir = test_image_dir
    self.image_shape = image_shape
    self.batch_size = batch_size
    self.cache_size = (self.batch_size * cache_size_factor)  # size in images
    self.center = center
    self.mean = None
    self.normalize = normalize
    self.std = None
    self.amplify = amplify

    # Flip lambdas.
    # NOTE(review): the validation lambda is obtained from the *train*
    # flipper instance - confirm that sharing is intended.
    self.train_set_flipper = ImageFlipOracle(train_flip)
    test_set_flipper = ImageFlipOracle(test_flip)
    self.train_flip_lambda = self.train_set_flipper.get_flip_lambda(train_flip)
    self.valid_flip_lambda = self.train_set_flipper.get_flip_lambda(valid_flip, deterministic=True)
    self.test_flip_lambda = test_set_flipper.get_flip_lambda(test_flip, deterministic=True)

    self.valid_dataset_size = valid_dataset_size
    self.random_seed = random_seed
    self.sample_class = sample_class
    self.custom_distribution = custom_distribution

    # Color cast lambdas: one oracle serves all three datasets.
    color_cast_oracle = ColorCastOracle(self.image_shape[-1], color_cast_range)
    self.train_color_cast_lambda = color_cast_oracle.get_color_cast_lambda(train_color_cast)
    self.valid_color_cast_lambda = color_cast_oracle.get_color_cast_lambda(valid_color_cast)
    self.test_color_cast_lambda = color_cast_oracle.get_color_cast_lambda(test_color_cast)

    # Break the validation block off first; everything left is for training.
    bd = BlockDesigner(TRAIN_LABELS_CSV_PATH, seed=self.random_seed)
    valid_examples = bd.break_off_block(self.valid_dataset_size)
    self.train_examples = bd.remainder()
    self.n_train_batches = int(bd.size() / self.batch_size)

    self.valid_dataset = self.setup_valid_dataset(valid_examples)
    self.train_dataset = None if shuffle else self.setup_train_dataset()
    self.test_dataset = self.setup_test_dataset()
    self.n_test_examples = len(self.test_dataset["X"])

    if self.sample_class:
        # override in case Sampler is used (TODO make this neater)
        # With shuffle enabled no training set has been pre-built, so
        # self.train_dataset is None and cannot be subscripted. Size the
        # batch count from a freshly sampled set instead, while leaving
        # self.train_dataset untouched.
        # NOTE(review): assumes the sampled set has the same length on
        # every call to setup_train_dataset() - confirm against Sampler.
        sampled = self.train_dataset
        if sampled is None:
            sampled = self.setup_train_dataset()
        self.n_train_batches = int(len(sampled["X"]) / self.batch_size)

    self.train_dataset_size = self.n_train_batches * self.batch_size

    if self.center == 1 or self.normalize == 1:
        self.calc_mean_std_image()