def __init__(self, x_train, y_train, x_test=None, y_test=None, train_val_ratio=0.85):
    # flatten samples to (N, D) and split the training data into train/validation
    x_train = np.reshape(x_train, (len(x_train), -1))
    y_train = np.reshape(y_train, (len(y_train), -1))
    train_set_size = int(round(train_val_ratio * len(x_train)))
    x_train_set = x_train[:train_set_size]
    y_train_set = y_train[:train_set_size]
    x_val_set = x_train[train_set_size:]
    y_val_set = y_train[train_set_size:]
    self.train = DataSet(x_train_set, y_train_set, dtype=dtypes.float32, reshape=False)
    self.validation = DataSet(x_val_set, y_val_set, dtype=dtypes.float32, reshape=False)
    if (x_test is not None) and (y_test is not None):
        x_test = np.reshape(x_test, (len(x_test), -1))
        y_test = np.reshape(y_test, (len(y_test), -1))
        self.test = DataSet(x_test, y_test, dtype=dtypes.float32, reshape=False)
def read_h5_data(h5_path, reshape=False):
    '''
    An alternative version of tensorflow.examples.tutorials.mnist.input_data.read_data_sets().

    h5_path: path of the noisy MNIST dataset h5 file
    Returns a tensorflow Datasets instance, usable as
        batch_X, batch_Y = noisy_mnist.train.next_batch(100)
    '''
    datasets = h5py.File(h5_path, "r")
    train_images = datasets['train_images'][:, :]
    test_images = datasets['test_images'][:, :]
    validation_images = datasets['val_images'][:, :]
    train_labels = datasets['train_labels'][:, :]
    test_labels = datasets['test_labels'][:, :]
    validation_labels = datasets['val_labels'][:, :]
    train = DataSet(train_images, train_labels, dtype=dtypes.float32, reshape=reshape, seed=None)
    validation = DataSet(validation_images, validation_labels, dtype=dtypes.float32, reshape=reshape, seed=None)
    test = DataSet(test_images, test_labels, dtype=dtypes.float32, reshape=reshape, seed=None)
    return base.Datasets(train=train, validation=validation, test=test)
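# A minimal usage sketch for read_h5_data, assuming a file 'noisy_mnist.h5'
# (hypothetical path) that exposes the six train/val/test arrays named above.
def _demo_read_h5_data():
    noisy_mnist = read_h5_data('noisy_mnist.h5')
    batch_X, batch_Y = noisy_mnist.train.next_batch(100)
    print(batch_X.shape, batch_Y.shape)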
def load_dataset(setName, name, dataDir, fraction_training):
    """
    Loads dataset with name setName = {extracted, extracted_remapped,
    remaining, remaining_remapped} from files in dataDir.

    name - name of the dataset files
    Returns fraction_training examples from the training dataset.
    """
    # files with data
    file_dataset_train = os.path.join(dataDir, setName + "_" + name + "_train.npz")
    file_dataset_validation = os.path.join(dataDir, setName + "_" + name + "_validation.npz")
    file_dataset_test = os.path.join(dataDir, setName + "_" + name + "_test.npz")
    # train data set
    d_train = np.load(file_dataset_train)
    dataset_train_labels = d_train[setName + '_train_labels']
    dataset_train_images = d_train[setName + '_train_images']
    # how many training examples to return
    num_train_to_extract = int(dataset_train_labels.shape[0] * fraction_training)
    dataset_train = DataSet(dataset_train_images[:num_train_to_extract, :] * MAX_INTENSITY,
                            dataset_train_labels[:num_train_to_extract, :],
                            dtype=dtypes.float32, reshape=False, one_hot=True)
    # validation data set
    d_validation = np.load(file_dataset_validation)
    dataset_validation_labels = d_validation[setName + '_validation_labels']
    dataset_validation_images = d_validation[setName + '_validation_images']
    dataset_validation = DataSet(dataset_validation_images * MAX_INTENSITY,
                                 dataset_validation_labels,
                                 dtype=dtypes.float32, reshape=False, one_hot=True)
    # test data set
    d_test = np.load(file_dataset_test)
    dataset_test_labels = d_test[setName + '_test_labels']
    dataset_test_images = d_test[setName + '_test_images']
    dataset_test = DataSet(dataset_test_images * MAX_INTENSITY,
                           dataset_test_labels,
                           dtype=dtypes.float32, reshape=False, one_hot=True)
    # combine all data sets
    dataset = base.Datasets(train=dataset_train, validation=dataset_validation, test=dataset_test)
    return dataset
def __init__(self, path, validation_size=5000):
    from tensorflow.contrib.learn.python.learn.datasets.mnist import DataSet
    from tensorflow.contrib.learn.python.learn.datasets import base
    data = np.load(path)
    # dtype=np.uint8 skips the /255 rescaling and is harmless even when the latents are int32
    train = DataSet(data['ks'][validation_size:], data['ys'][validation_size:],
                    reshape=False, dtype=np.uint8, one_hot=False)
    validation = DataSet(data['ks'][:validation_size], data['ys'][:validation_size],
                         reshape=False, dtype=np.uint8, one_hot=False)
    # test = DataSet(data['test_x'], np.argmax(data['test_y'], axis=1),
    #                reshape=False, dtype=np.float32, one_hot=False)
    self.size = data['ks'].shape[1]
    self.data = base.Datasets(train=train, validation=validation, test=None)
def read_data_sets(validation_size=5000, one_hot=True):
    # cifar_dir and batches are assumed module-level names: the extracted
    # batch directory and the list of batch file names.
    cifar_filename = "datasets/cifar-10-python.tar.gz"
    try:
        os.makedirs("datasets")
    except OSError:
        pass
    if not os.path.isfile(cifar_dir + batches[0]):
        # Download data
        print("Downloading cifar-10 dataset")
        urllib.urlretrieve("http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz", cifar_filename)
        tar = tarfile.open(cifar_filename)
        tar.extractall(path="datasets")
        tar.close()
        os.remove(cifar_filename)
    # Process batches
    all_batch_images = []
    all_batch_labels = []
    for batch_name in batches:
        batch = np.load(cifar_dir + batch_name)
        batch_images = batch['data']
        all_batch_images.append(batch_images)
        batch_labels = batch['labels']
        all_batch_labels.extend(batch_labels)
    all_batch_images = np.vstack(all_batch_images).reshape(-1, 3, 32, 32)
    all_batch_images = all_batch_images.transpose([0, 2, 3, 1])
    all_batch_labels = np.array(all_batch_labels)
    train_images, validation_images, train_labels, validation_labels = train_test_split(
        all_batch_images, all_batch_labels, test_size=validation_size, random_state=0)
    test_batch = np.load(cifar_dir + "test_batch")
    test_images = test_batch['data'].reshape(-1, 3, 32, 32)
    test_images = test_images.transpose([0, 2, 3, 1])
    test_labels = np.array(test_batch['labels'])
    if one_hot:
        train_labels = dense_to_one_hot(train_labels, NUM_CLASSES)
        validation_labels = dense_to_one_hot(validation_labels, NUM_CLASSES)
        test_labels = dense_to_one_hot(test_labels, NUM_CLASSES)
    train = DataSet(train_images, train_labels, reshape=False)
    validation = DataSet(validation_images, validation_labels, reshape=False)
    test = DataSet(test_images, test_labels, reshape=False)
    return Datasets(train=train, validation=validation, test=test)
def load_data_ssl(params, dirn='../data'):
    '''
    load CIFAR-10 data and split into 4 blocks
    '''
    # parameter set
    n_train_lab = params['n_train_lab']
    n_train_unlab = params['n_train_unlab']
    n_val = params['n_val']
    mode = params['mode']
    X_train, y_train0 = load(dirn, subset='train')
    X_test, y_test0 = load(dirn, subset='test')
    y_train = onehot_label(y_train0)
    y_test = onehot_label(y_test0)
    # step.1 - split validation set
    n_train = y_train.shape[0] - n_val
    X_train_tot, X_validation, y_train_tot, y_validation = \
        random_sampling(X_train, y_train, n_labeled=n_train)
    n_train_total = y_train_tot.shape[0]
    # step.2 - split train set into labeled / unlabeled
    if (n_train_lab + n_train_unlab) > n_train_total:
        print('data splitting condition :')
        print('\tn_train_lab = ', n_train_lab)
        print('\tn_train_unlab = ', n_train_unlab)
        print('\tn_train_total = ', n_train_total)
        raise ValueError('inconsistent parameters')
    # select bin_sampling or random_sampling (allowing imbalance)
    if mode == 'random':
        X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = \
            random_sampling(X_train_tot, y_train_tot, n_labeled=n_train_lab)
    else:
        X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = \
            bin_sampling(X_train_tot, y_train_tot, n_labeled=n_train_lab)
    # transpose images (channels first -> channels last)
    X_train_lab = np.transpose(X_train_lab, (0, 2, 3, 1))
    X_train_unlab = np.transpose(X_train_unlab, (0, 2, 3, 1))
    X_validation = np.transpose(X_validation, (0, 2, 3, 1))
    X_test = np.transpose(X_test, (0, 2, 3, 1))
    # DataSet class construction
    train_lab = DataSet(X_train_lab, y_train_lab, reshape=False)
    train_unlab = DataSet(X_train_unlab, y_train_unlab, reshape=False)
    validation_set = DataSet(X_validation, y_validation, reshape=False)
    test_set = DataSet(X_test, y_test, reshape=False)
    cifar10_ssl = Datasets4(train_lab=train_lab, train_unlab=train_unlab,
                            validation=validation_set, test=test_set)
    return cifar10_ssl
def train(self, x_vec, y_vec):
    # mnist, iterations, generator_steps_per_iteration and batch_size are
    # assumed module-level names
    train_writer = tf.summary.FileWriter(
        'tensorboard/' + time.strftime("%Y%m%d-%H%M%S"), self.sess.graph)
    tf.summary.scalar("discriminator_loss", self.discriminator_loss)
    tf.summary.scalar("discriminator_loss_true", self.discriminator_loss_true)
    tf.summary.scalar("discriminator_loss_false", self.discriminator_loss_false)
    tf.summary.scalar("generator_loss", self.generator_loss)
    # tf.summary.scalar("generator_learning_rate", self.gen_optimizer._lr_t)
    merged_summary_op = tf.summary.merge_all()
    # collect a single example of digit 0 to train on
    images = np.ndarray((1, 784))
    labels = np.ndarray((1, 10))
    new_i = 0
    for i in range(len(mnist.train.labels)):
        if mnist.train.labels[i][0] > 0:
            images[new_i] = mnist.train.images[i] * 255
            labels[new_i] = mnist.train.labels[i]
            new_i += 1
            break
    mnist_ones = DataSet(images, labels, reshape=False)
    # mnist.train.images = images
    # mnist.train.labels = labels
    for i in range(iterations * generator_steps_per_iteration):
        if i % generator_steps_per_iteration == 0:
            batch = mnist_ones.next_batch(batch_size)
            _, _, summary = self.sess.run(
                [self.train_step_gen, self.train_step_dis, merged_summary_op],
                feed_dict={
                    self.d_input: batch[0],
                    self.generator.x: self.generator.create_batch_input(x_vec, y_vec, batch_size)
                })
            # integer division keeps the summary step an int
            train_writer.add_summary(summary, i // generator_steps_per_iteration)
            print("Finished step " + str(i // generator_steps_per_iteration))
        else:
            _ = self.sess.run(
                [self.train_step_gen],
                feed_dict={
                    self.generator.x: self.generator.create_batch_input(x_vec, y_vec, batch_size)
                })
def read_data_sets(fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)
        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)
    # train_data_dir, train_labels_dir, eval_data_dir and eval_labels_dir are
    # assumed module-level paths to the image/label files
    with gfile.Open(train_data_dir, 'rb') as f:
        train_images = extract_images(f)
    with gfile.Open(train_labels_dir, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)
    with gfile.Open(eval_data_dir, 'rb') as f:
        test_images = extract_images(f)
    with gfile.Open(eval_labels_dir, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)
    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]
    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)
    return base.Datasets(train=train, validation=validation, test=test)
def load_all_digits(dataDir, fraction_training):
    """
    Loads dataset with all digits from files in dataDir.
    Returns fraction_training examples from the training dataset.
    """
    # filenames
    file_all_train = os.path.join(dataDir, "all_train.npz")
    file_all_validation = os.path.join(dataDir, "all_validation.npz")
    file_all_test = os.path.join(dataDir, "all_test.npz")
    # train data set
    all_train = np.load(file_all_train)
    all_train_labels = all_train['all_train_labels']
    all_train_images = all_train['all_train_images']
    # how many training examples to return
    num_train_to_extract = int(all_train_labels.shape[0] * fraction_training)
    all_train = DataSet(all_train_images[:num_train_to_extract, :] * MAX_INTENSITY,
                        all_train_labels[:num_train_to_extract, :],
                        dtype=dtypes.float32, reshape=False, one_hot=True)
    # validation data set
    all_validation = np.load(file_all_validation)
    all_validation_labels = all_validation['all_validation_labels']
    all_validation_images = all_validation['all_validation_images']
    all_validation = DataSet(all_validation_images * MAX_INTENSITY,
                             all_validation_labels,
                             dtype=dtypes.float32, reshape=False, one_hot=True)
    # test data set
    all_test = np.load(file_all_test)
    all_test_labels = all_test['all_test_labels']
    all_test_images = all_test['all_test_images']
    all_test = DataSet(all_test_images * MAX_INTENSITY,
                       all_test_labels,
                       dtype=dtypes.float32, reshape=False, one_hot=True)
    # combine all data sets
    all_digits = base.Datasets(train=all_train, validation=all_validation, test=all_test)
    return all_digits
def load_data_ssl(params, dirn='../data'):
    '''
    load MNIST data and split into 4 blocks
    '''
    MAX_TRIAL = 999  # retry-max to check label balance
    percent_limit = params['percent_limit']
    n_train_lab = params['n_train_lab']
    n_train_unlab = params['n_train_unlab']
    n_val = params['n_val']
    mnist = input_data.read_data_sets(dirn, validation_size=n_val, one_hot=True)
    n_train_total = mnist.train.num_examples
    if (n_train_lab + n_train_unlab) > n_train_total:
        raise ValueError('inconsistent parameters')
    X_train = mnist.train.images
    y_train = mnist.train.labels
    # prep. index list
    n_train = X_train.shape[0]
    num_trial = 0
    while num_trial < MAX_TRIAL:
        train_idx = np.random.permutation(n_train)
        # split train data into 2 (labeled / unlabeled)
        X_train_lab = X_train[train_idx[:n_train_lab]]
        y_train_lab = y_train[train_idx[:n_train_lab]]
        X_train_unlab = X_train[train_idx[n_train_lab:]]
        y_train_unlab = y_train[train_idx[n_train_lab:]]
        # check balance of labels
        if check_label_balance(y_train_lab, percent_limit=percent_limit):
            break
        num_trial += 1
    if num_trial == MAX_TRIAL:
        raise ValueError('percentage range looks too narrow.')
    # cancel the scaling applied by the DataSet class constructor
    X_train_lab = X_train[train_idx[:n_train_lab]] * 255.
    X_train_unlab = X_train[train_idx[n_train_lab:]] * 255.
    train_lab = DataSet(X_train_lab, y_train_lab, reshape=False)
    train_unlab = DataSet(X_train_unlab, y_train_unlab, reshape=False)
    mnist_ssl = Datasets4(train_lab=train_lab, train_unlab=train_unlab,
                          validation=mnist.validation, test=mnist.test)
    return mnist_ssl
def __init__(self):
    import cifar_10
    cifar_10.load_and_preprocess_input(dataset_dir='resource/CIFAR_data')
    self.test_dataset = DataSet(cifar_10.validate_all['data'],
                                cifar_10.validate_all['labels'], reshape=False)
    self.train_dataset = DataSet(cifar_10.train_all['data'],
                                 cifar_10.train_all['labels'], reshape=False)
    self.image_dimensions = cifar_10.image_width
    self.actual_class_labels = cifar_10.actual_class_labels
def read_data_sets(train_dir,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    local_file = base.maybe_download(TRAIN_IMAGES, train_dir, source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)
    local_file = base.maybe_download(TRAIN_LABELS, train_dir, source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)
    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    return base.Datasets(train=train, validation=None, test=None)
def expand_data_set(data):
    # generate a translated DataSet: one shifted copy per direction
    i_r, i_c = data.images.shape
    l_r, l_c = data.labels.shape
    gen_images = np.ndarray((4 * i_r, i_c), dtype=np.float32)
    gen_labels = np.ndarray((4 * l_r, l_c))
    # Shift right (zero the leftmost column of every row)
    gen_images[0:i_r, 1:] = data.images[:, :-1]
    gen_labels[0:l_r, :] = data.labels
    for i in range(28):
        gen_images[0:i_r, i * 28] = 0
    # Shift left (zero the rightmost column of every row)
    gen_images[i_r:2 * i_r, :-1] = data.images[:, 1:]
    gen_labels[i_r:2 * i_r, :] = data.labels
    for i in range(28):
        gen_images[i_r:2 * i_r, i * 28 + 27] = 0
    # Shift up (zero the bottom row; the original zeroed the top row,
    # which overwrote valid data and left the bottom row uninitialized)
    gen_images[2 * i_r:3 * i_r, :-28] = data.images[:, 28:]
    gen_labels[2 * i_r:3 * i_r, :] = data.labels
    gen_images[2 * i_r:3 * i_r, -28:] = 0
    # Shift down (zero the top row)
    gen_images[3 * i_r:4 * i_r, 28:] = data.images[:, :-28]
    gen_labels[3 * i_r:4 * i_r, :] = data.labels
    gen_images[3 * i_r:4 * i_r, :28] = 0
    # Convert back to pixel values (DataSet rescales by /255 again)
    gen_images = gen_images * 255
    return DataSet(gen_images, gen_labels, reshape=False, one_hot=True)
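# A minimal usage sketch for expand_data_set, assuming mnist comes from
# input_data.read_data_sets with one_hot=True (flattened 28x28 images, 784 wide);
# the returned DataSet is 4x larger, one shifted copy per direction.
def _demo_expand_data_set(mnist):
    expanded = expand_data_set(mnist.train)
    assert expanded.num_examples == 4 * mnist.train.num_examples
    return expanded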
def _preprocess_dataset(dataset, preprocess_fcn, dtype=tf.float32, reshape=True):
    from tensorflow.contrib.learn.python.learn.datasets.mnist import DataSet
    images, labels = preprocess_fcn(dataset.images, dataset.labels)
    # pass dtype/reshape as keywords: positionally they would bind to
    # DataSet's fake_data and one_hot parameters
    return DataSet(images, labels, dtype=dtype, reshape=reshape)
def make_dataset(sample_set, label_set):
    sample_train = np.array(sample_set, dtype=np.float32)
    label_train = np.zeros((len(label_set), n_class))
    for i in range(len(label_set)):
        label_train[i][int(label_set[i])] = 1
    options = dict(dtype=dtypes.float32, reshape=True, seed=None)
    train = DataSet(sample_train, label_train, **options)
    validation = DataSet(sample_train, label_train, **options)
    test = DataSet(sample_train, label_train, **options)
    dataset = base.Datasets(train=train, validation=validation, test=test)
    train = em.Dataset(dataset.train.images, dataset.train.labels, name="CLIMATE")
    validation = em.Dataset(dataset.validation.images, dataset.validation.labels, name="CLIMATE")
    test = em.Dataset(dataset.test.images, dataset.test.labels, name="CLIMATE")
    res = [train, validation, test]
    return em.Datasets.from_list(res)
def prepare(self):
    """
    operation that obtains data and creates the computation graph
    """
    cifar10_processing.maybe_download_and_extract()
    images, _, labels = cifar10_processing.load_training_data()
    # assign the test dataset that will be used by the workflow to test this and the quantized net
    test_images, _, test_labels = cifar10_processing.load_test_data()
    # create an instance of dataset class
    self._dataset = DataSet(images, labels, one_hot=True, reshape=False)
    self.test_data = (test_images, test_labels)
    self._input_placeholder, self._output_placeholder, self._label_placeholder = self._inference()
    self._loss_node = self._loss(self._output_placeholder, self._label_placeholder)
    self._accuracy_node = self.accuracy(self._output_placeholder, self._label_placeholder)
    self._train_step_node = self._train(self._loss_node)
def build_modified_mnist(dataset, digit, extra_negatives=None,
                         write_to_folder=None, include_negatives=True):
    addSize = 0 if extra_negatives is None else extra_negatives.shape[0]
    size = int(dataset.labels.shape[0] / 7) + addSize
    labels = np.ndarray(shape=(size, 1), dtype=dataset.labels.dtype)
    images = np.ndarray(shape=(size, dataset.images.shape[1]), dtype=dataset.images.dtype)
    positives = 0
    negatives = addSize
    c = addSize
    np.random.seed(1)
    # prepend the extra negatives
    for i in range(0, addSize):
        labels[i][0] = 0
        images[i] = extra_negatives[i]
    for i in range(0, dataset.labels.shape[0]):
        if dataset.labels[i][digit] == 0 and include_negatives:
            # keep a negative only while negatives trail positives, with probability 0.5
            if negatives < positives and np.random.uniform(0, 1, 1) <= 0.5:
                negatives += 1
                labels[c][0] = 0
                images[c] = dataset.images[i]
                if write_to_folder is not None:
                    writeImg(images[c], 28, 28,
                             write_to_folder + "//negative//img" + str(c) + ".png")
                c += 1
        else:
            positives += 2
            labels[c][0] = 1
            images[c] = dataset.images[i]
            if write_to_folder is not None:
                writeImg(images[c], 28, 28,
                         write_to_folder + "//positive//img" + str(c) + ".png")
            c += 1
            if c >= size:
                break
            # add a degraded copy of the positive with a soft label
            temp = np.copy(dataset.images[i])
            mask = np.random.choice([0, 1], size=dataset.images[i].shape[0], p=[0.5, 0.5])
            temp = np.multiply(temp, mask)
            labels[c][0] = 0.7
            images[c] = temp
            if write_to_folder is not None:
                writeImg(images[c], 28, 28,
                         write_to_folder + "//positive//img-degraded" + str(c) + ".png")
            c += 1
            if c >= size:
                break
    mnist_modified = DataSet(images, labels, one_hot=True, reshape=False)
    return mnist_modified
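# A minimal usage sketch for build_modified_mnist, assuming a one-hot MNIST
# split such as mnist.train; digit=3 is illustrative.
def _demo_build_modified_mnist(mnist):
    # binary "is it a 3?" dataset with balanced negatives and degraded positives
    return build_modified_mnist(mnist.train, digit=3, write_to_folder=None)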
def read_data_sets(fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None):
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)
        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)
    TRAIN_IMAGES = "train-images.idx3-ubyte"
    TRAIN_LABELS = "train-labels.idx1-ubyte"
    TEST_IMAGES = "t10k-images.idx3-ubyte"
    TEST_LABELS = "t10k-labels.idx1-ubyte"
    train_images = loadImageSet(TRAIN_IMAGES)
    train_labels = loadLabelSet(TRAIN_LABELS)
    test_images = loadImageSet(TEST_IMAGES)
    test_labels = loadLabelSet(TEST_LABELS)
    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]
    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)
    return base.Datasets(train=train, validation=validation, test=test)
def __init__(self, path, validation_size=1):
    # def __init__(self, path, validation_size=5000):
    from tensorflow.contrib.learn.python.learn.datasets.mnist import DataSet
    from tensorflow.contrib.learn.python.learn.datasets import base
    data = np.load(path)
    train = DataSet(data['hiddens'][validation_size:], data['ys'][validation_size:],
                    reshape=False, dtype=np.uint8, one_hot=False)
    validation = DataSet(data['hiddens'][:validation_size], data['ys'][:validation_size],
                         reshape=False, dtype=np.uint8, one_hot=False)
    self.size = data['hiddens'].shape[1]
    self.data = base.Datasets(train=train, validation=validation, test=None)
def cifar_datasets(dirname, one_hot=True, dtype=dtypes.float32, reshape=False, seed=None):
    maybe_download_and_extract(dirname)
    dirname = os.path.join(dirname, 'cifar-10-batches-py/')
    train_images = []
    train_labels = []
    for i in range(1, 6):
        fpath = os.path.join(dirname, 'data_batch_' + str(i))
        image, label = load_batch(fpath)
        if i == 1:
            train_images = np.array(image)
            train_labels = np.array(label)
        else:
            train_images = np.concatenate([train_images, image], axis=0)
            train_labels = np.concatenate([train_labels, label], axis=0)
    train_images = np.dstack((train_images[:, :1024],
                              train_images[:, 1024:2048],
                              train_images[:, 2048:]))
    train_images = np.reshape(train_images, [-1, 32, 32, 3])
    if one_hot:
        train_labels = dense_to_one_hot(train_labels, 10)
    print('Cifar train_images size:', train_images.shape)
    print('Cifar train_labels size:', train_labels.shape)
    train_images = train_images / 255.0 - 0.5
    fpath = os.path.join(dirname, "test_batch")
    image, label = load_batch(fpath)
    test_images = np.array(image)
    test_labels = np.array(label)
    test_images = np.dstack((test_images[:, :1024],
                             test_images[:, 1024:2048],
                             test_images[:, 2048:]))
    test_images = np.reshape(test_images, [-1, 32, 32, 3])
    if one_hot:
        test_labels = dense_to_one_hot(test_labels, 10)
    print('Cifar test_images size:', test_images.shape)
    print('Cifar test_labels size:', test_labels.shape)
    test_images = test_images / 255.0 - 0.5
    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    # expand options as keyword arguments; passing the dict positionally would
    # bind it to DataSet's fake_data parameter
    train = DataSet(train_images, train_labels, **options)
    test = DataSet(test_images, test_labels, **options)
    return Datasets(train=train, test=test)
def load_minst(src=None, path=None, one_hot=False):
    mnist = DataSets()
    if src:
        mnist = input_data.read_data_sets("MNIST_data/", one_hot=one_hot)
    if path:
        if path[-1] != '/':
            path += '/'
        train_images = extract_images(path + TRAIN_IMAGES)
        train_labels = extract_labels(path + TRAIN_LABELS, one_hot=one_hot)
        test_images = extract_images(path + TEST_IMAGES)
        test_labels = extract_labels(path + TEST_LABELS, one_hot=one_hot)
        validation_images = train_images[:VALIDATION_SIZE]
        validation_labels = train_labels[:VALIDATION_SIZE]
        train_images = train_images[VALIDATION_SIZE:]
        train_labels = train_labels[VALIDATION_SIZE:]
        mnist.train = DataSet(train_images, train_labels)
        mnist.validation = DataSet(validation_images, validation_labels)
        mnist.test = DataSet(test_images, test_labels)
    return mnist
def read_data_sets(param, one_hot=False, dtype=dtypes.float32, reshape=True, validation_size=5000):
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    TMP_DIR = '../../tmp/'
    local_file = os.path.join(TMP_DIR, TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)
    local_file = os.path.join(TMP_DIR, TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)
    local_file = os.path.join(TMP_DIR, TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)
    local_file = os.path.join(TMP_DIR, TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]
    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels, dtype=dtype, reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
    return base.Datasets(train=train, validation=validation, test=test)
def load_mnist_binary_dataset(directory, split):
    from tensorflow.contrib.learn.python.learn.datasets.mnist import DataSet
    np_filepath = os.path.join(directory, FILE_TEMPLATE_NP.format(split=split))

    def lines_to_np_array(lines):
        return np.array([[int(i) for i in line.split()] for line in lines])

    if os.path.isfile(np_filepath):
        np_data = np.load(np_filepath)
    else:
        with open(download(directory, FILE_TEMPLATE.format(split=split))) as f:
            lines = f.readlines()
        np_data = lines_to_np_array(lines).astype('float32')
        np.save(np_filepath, np_data)
    # multiply by 255 to cancel the /255 rescaling in the DataSet constructor
    return DataSet(np_data.reshape([-1, 28, 28, 1]) * 255, np.zeros(len(np_data)))
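# A minimal usage sketch for load_mnist_binary_dataset; FILE_TEMPLATE,
# FILE_TEMPLATE_NP and download() are assumed module-level helpers, and
# binarized-MNIST files conventionally come in 'train'/'valid'/'test' splits.
def _demo_load_mnist_binary():
    return load_mnist_binary_dataset('/tmp/binarized_mnist', 'train')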
def prepare_data(corruption_matrix, gold_fraction=0.05, merge_valset=True):
    np.random.seed(1)
    mnist_images = np.copy(mnist.train.images)
    mnist_labels = np.copy(mnist.train.labels)
    if merge_valset:
        mnist_images = np.concatenate([mnist_images, np.copy(mnist.validation.images)], axis=0)
        mnist_labels = np.concatenate([mnist_labels, np.copy(mnist.validation.labels)])
    indices = np.arange(len(mnist_labels))
    np.random.shuffle(indices)
    mnist_images = mnist_images[indices]
    mnist_labels = mnist_labels[indices].astype(np.long)
    mnist_labels_orig = np.copy(mnist_labels)
    num_gold = int(len(mnist_labels) * gold_fraction)
    num_silver = len(mnist_labels) - num_gold
    # corrupt the silver labels by sampling from the given corruption matrix
    for i in range(num_silver):
        mnist_labels[i] = np.random.choice(num_classes, p=corruption_matrix[mnist_labels[i]])
    # the dtype flag is important so the DataSet class doesn't renormalize the images by /255
    gold = DataSet(mnist_images[num_silver:], mnist_labels[num_silver:],
                   reshape=False, dtype=dtypes.uint8)
    silver = DataSet(mnist_images[:num_silver],
                     np.array(list(zip(mnist_labels[:num_silver], mnist_labels_orig[:num_silver]))),
                     reshape=False, dtype=dtypes.uint8)
    return gold, silver
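# A minimal sketch of a corruption matrix for prepare_data, assuming
# num_classes = 10 (MNIST): mix the identity with the uniform distribution so
# each silver label is resampled uniformly with probability flip_prob.
def uniform_corruption_matrix(num_classes=10, flip_prob=0.3):
    # each row sums to 1, as required by np.random.choice(p=...)
    return ((1.0 - flip_prob) * np.eye(num_classes)
            + flip_prob * np.full((num_classes, num_classes), 1.0 / num_classes))

# gold, silver = prepare_data(uniform_corruption_matrix(), gold_fraction=0.05)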
def load_data_ssl(params, dirn='../data'):
    '''
    load MNIST data and split into 4 blocks
    '''
    n_train_lab = params['n_train_lab']
    n_train_unlab = params['n_train_unlab']
    n_val = params['n_val']
    mode = params['mode']
    mnist = input_data.read_data_sets(dirn, validation_size=n_val, one_hot=True)
    n_train_total = mnist.train.num_examples
    if (n_train_lab + n_train_unlab) > n_train_total:
        raise ValueError('inconsistent parameters')
    X_train = mnist.train.images
    y_train = mnist.train.labels
    # select bin_sampling or random_sampling (allowing imbalance)
    if mode == 'random':
        X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = \
            random_sampling(X_train, y_train, n_labeled=n_train_lab)
    else:
        X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = \
            bin_sampling(X_train, y_train, n_labeled=n_train_lab)
    # cancel the scaling applied by the DataSet class constructor
    X_train_lab = X_train_lab * 255.
    X_train_unlab = X_train_unlab * 255.
    train_lab = DataSet(X_train_lab, y_train_lab, reshape=False)
    train_unlab = DataSet(X_train_unlab, y_train_unlab, reshape=False)
    mnist_ssl = Datasets4(train_lab=train_lab, train_unlab=train_unlab,
                          validation=mnist.validation, test=mnist.test)
    return mnist_ssl
def split_dataset(self, dtype=dtypes.float32, reshape=True, seed=None, validation_size=7000):
    # SPLIT FIRST GROUP (digits 0-4)
    # Find all training images/labels 0-4
    train_labels_idx = np.nonzero(self.dataset.train.labels)[1]
    train_labels_idx = np.nonzero(train_labels_idx < 5)[0]
    train_labels = self.dataset.train.labels[train_labels_idx]
    train_images = self.dataset.train.images[train_labels_idx]
    # Find all testing images/labels 0-4
    test_labels_idx = np.nonzero(self.dataset.test.labels)[1]
    test_labels_idx = np.nonzero(test_labels_idx < 5)[0]
    test_labels = self.dataset.test.labels[test_labels_idx]
    test_images = self.dataset.test.images[test_labels_idx]
    # Create validation/training groups
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]
    # note: reshape is forced to False here regardless of the parameter
    options = dict(dtype=dtype, reshape=False, seed=seed)
    # Define training, validation, and testing datasets
    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)
    first_dataset = base.Datasets(train=train, validation=validation, test=test)
    # SPLIT SECOND GROUP (digits 5-9)
    # Find all training images/labels 5-9
    train_labels_idx = np.nonzero(self.dataset.train.labels)[1]
    train_labels_idx = np.nonzero(train_labels_idx >= 5)[0]
    train_labels_2 = self.dataset.train.labels[train_labels_idx]
    train_images_2 = self.dataset.train.images[train_labels_idx]
    # Find all testing images/labels 5-9
    test_labels_idx = np.nonzero(self.dataset.test.labels)[1]
    test_labels_idx = np.nonzero(test_labels_idx >= 5)[0]
    test_labels_2 = self.dataset.test.labels[test_labels_idx]
    test_images_2 = self.dataset.test.images[test_labels_idx]
    # Create validation/training groups
    validation_images_2 = train_images_2[:validation_size]
    validation_labels_2 = train_labels_2[:validation_size]
    train_images_2 = train_images_2[validation_size:]
    train_labels_2 = train_labels_2[validation_size:]
    # Define training, validation, and testing datasets
    train_2 = DataSet(train_images_2, train_labels_2, **options)
    validation_2 = DataSet(validation_images_2, validation_labels_2, **options)
    test_2 = DataSet(test_images_2, test_labels_2, **options)
    second_dataset = base.Datasets(train=train_2, validation=validation_2, test=test_2)
    return first_dataset, second_dataset
def split_mnist(mnist, cond):
    sets = ["train", "validation", "test"]
    sets_list = []
    for set_name in sets:
        this_set = getattr(mnist, set_name)
        maxlabels = np.argmax(this_set.labels, 1)
        sets_list.append(DataSet(this_set.images[cond(maxlabels), :],
                                 this_set.labels[cond(maxlabels)],
                                 dtype=dtypes.uint8,
                                 reshape=False))
    return base.Datasets(train=sets_list[0], validation=sets_list[1], test=sets_list[2])
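# A minimal usage sketch for split_mnist: cond receives the argmax'd labels,
# so a boolean-mask lambda selects a digit range; mnist is assumed to come
# from input_data.read_data_sets with one_hot=True.
def _demo_split_mnist(mnist):
    mnist_0to4 = split_mnist(mnist, lambda lab: lab < 5)
    mnist_5to9 = split_mnist(mnist, lambda lab: lab >= 5)
    return mnist_0to4, mnist_5to9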
def load_data(dirn='../data'):
    '''
    load CIFAR-10 data and split into 3 blocks
    '''
    # parameter set
    n_train = 40000
    n_val = 10000
    X_train0, y_train0 = load(dirn, subset='train')
    X_test, y_test0 = load(dirn, subset='test')
    print('Files are loaded.')
    y_train1 = onehot_label(y_train0)
    y_test = onehot_label(y_test0)
    # split validation set
    X_train, y_train, X_validation, y_validation = \
        random_sampling(X_train0, y_train1, n_validation=n_val)
    '''
    print('X_train: ', X_train.shape, ', ', type(X_train))
    print('X_validation: ', X_validation.shape, ', ', type(X_validation))
    '''
    # transpose images (channels first -> channels last)
    X_train = np.transpose(X_train, (0, 2, 3, 1))
    X_validation = np.transpose(X_validation, (0, 2, 3, 1))
    X_test = np.transpose(X_test, (0, 2, 3, 1))
    # DataSet class construction
    train_lab = DataSet(X_train, y_train, reshape=False)
    validation_set = DataSet(X_validation, y_validation, reshape=False)
    test_set = DataSet(X_test, y_test, reshape=False)
    cifar10 = Datasets(train=train_lab, validation=validation_set, test=test_set)
    return cifar10
def gen_splitMNIST(bounds):
    dataset = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    sets = ["train", "validation", "test"]
    sets_list = []
    for set_name in sets:
        this_set = getattr(dataset, set_name)
        maxlabels = np.argmax(this_set.labels, 1)
        keep = (maxlabels >= bounds[0]) & (maxlabels <= bounds[1])
        sets_list.append(DataSet(this_set.images[keep, :],
                                 this_set.labels[keep],
                                 dtype=dtypes.uint8,
                                 reshape=False))
    return base.Datasets(train=sets_list[0], validation=sets_list[1], test=sets_list[2])
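# A minimal usage sketch for gen_splitMNIST: the bounds are inclusive, so
# (0, 4) and (5, 9) give the usual two-task split-MNIST benchmark.
def _demo_gen_splitMNIST():
    task_a = gen_splitMNIST((0, 4))
    task_b = gen_splitMNIST((5, 9))
    return task_a, task_b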
def prep_imbalanced_dataset(dirn='../data'):
    """
    prepare an imbalanced dataset
      label-1: dominant label
      label-3: fewer label (about 5% of label-1)
      label-5: fewer label (about 5% of label-1)
    """
    mnist = input_data.read_data_sets(dirn, one_hot=False)
    mnist3 = Datasets(train=None, test=None)
    for subset in [mnist.train, mnist.test]:
        mnist_lab = subset.labels
        idx1 = (mnist_lab == 1)  # 'Trouser' class in Fashion-MNIST
        idx3 = (mnist_lab == 3)  # 'Dress' class
        idx5 = (mnist_lab == 5)  # 'Sandal' class
        small = subset.num_examples // 200  # original ...total // 10
        idx1 = [i for i in range(len(idx1)) if idx1[i]]
        idx3 = [i for i in range(len(idx3)) if idx3[i]]
        idx5 = [i for i in range(len(idx5)) if idx5[i]]
        idx_merged = np.concatenate([idx1, idx3[:small], idx5[:small]])
        X_sub = subset.images[idx_merged]
        y_sub = subset.labels[idx_merged]
        # make one-hot labels
        y_oh = []
        for lab in y_sub:
            lab_i = np.zeros([10], dtype=np.float32)
            lab_i[lab] = 1.0
            y_oh.append(lab_i)
        y_sub = np.asarray(y_oh)
        # cancel the /255 rescaling before re-entering the DataSet constructor
        X_sub = X_sub * 255.
        mnist_sub = DataSet(X_sub, y_sub, reshape=False)
        if subset == mnist.train:
            mnist3 = mnist3._replace(train=mnist_sub)
        if subset == mnist.test:
            mnist3 = mnist3._replace(test=mnist_sub)
    return mnist3