def __init__(self,
                 x_train,
                 y_train,
                 x_test=None,
                 y_test=None,
                 train_val_ratio=0.85):
        x_train = np.reshape(x_train, (len(x_train), -1))
        y_train = np.reshape(y_train, (len(y_train), -1))

        train_set_size = int(round(train_val_ratio * len(x_train)))
        x_train_set = x_train[:train_set_size]
        y_train_set = y_train[:train_set_size]

        x_val_set = x_train[train_set_size:]
        y_val_set = y_train[train_set_size:]

        self.train = DataSet(x_train_set,
                             y_train_set,
                             dtype=dtypes.float32,
                             reshape=False)
        self.validation = DataSet(x_val_set,
                                  y_val_set,
                                  dtype=dtypes.float32,
                                  reshape=False)
        if (x_test is not None) and (y_test is not None):
            x_test = np.reshape(x_test, (len(x_test), -1))
            y_test = np.reshape(y_test, (len(y_test), -1))
            self.test = DataSet(x_test,
                                y_test,
                                dtype=dtypes.float32,
                                reshape=False)
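A short usage sketch (the wrapper class name NoisyDataset is hypothetical; only its constructor is shown above). With 1000 samples and train_val_ratio=0.85, train_set_size = round(0.85 * 1000) = 850, leaving 150 validation examples:

import numpy as np
x = np.random.randint(0, 256, (1000, 28, 28))     # images; flattened to (1000, 784) by the constructor
y = np.random.randint(0, 10, (1000, 1))           # labels
data = NoisyDataset(x, y, train_val_ratio=0.85)   # hypothetical class exposing the __init__ above
batch_x, batch_y = data.train.next_batch(32)      # 850 train / 150 validation examples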
Example #2
def read_h5_data(h5_path, reshape=False):
    '''
    An alternative to tensorflow.examples.tutorials.mnist.input_data.read_data_sets().
    h5_path: path to the noisy MNIST dataset h5 file.
    Returns a tensorflow Datasets instance, which can be used as
    batch_X, batch_Y = noisy_mnist.train.next_batch(100)
    '''

    datasets = h5py.File(h5_path, "r")
    train_images = datasets['train_images'][:, :]
    test_images = datasets['test_images'][:, :]
    validation_images = datasets['val_images'][:, :]
    train_labels = datasets['train_labels'][:, :]
    test_labels = datasets['test_labels'][:, :]
    validation_labels = datasets['val_labels'][:, :]

    train = DataSet(train_images,
                    train_labels,
                    dtype=dtypes.float32,
                    reshape=reshape,
                    seed=None)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtypes.float32,
                         reshape=reshape,
                         seed=None)
    test = DataSet(test_images,
                   test_labels,
                   dtype=dtypes.float32,
                   reshape=reshape,
                   seed=None)

    return base.Datasets(train=train, validation=validation, test=test)
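A minimal usage sketch following the docstring (the h5 path is hypothetical):

noisy_mnist = read_h5_data('data/noisy_mnist.h5')
batch_X, batch_Y = noisy_mnist.train.next_batch(100)
print(batch_X.shape, batch_Y.shape)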
Example #3
def load_dataset(setName, name, dataDir, fraction_training):
    """
    Loads dataset with name 
    setName = {extracted, extracted_remapped, remaining, remaining_remapped} 
    from files in dataDir
    name - name of the dataset files
    Return fraction_training examples from training datasets
    """
    # files with data
    file_dataset_train = os.path.join(dataDir,
                                      setName + "_" + name + "_train.npz")
    file_dataset_validation = os.path.join(
        dataDir, setName + "_" + name + "_validation.npz")
    file_dataset_test = os.path.join(dataDir,
                                     setName + "_" + name + "_test.npz")

    # remapped data set
    # train data set
    d_train = np.load(file_dataset_train)
    dataset_train_labels = d_train[setName + '_train_labels']
    dataset_train_images = d_train[setName + '_train_images']

    # how many training examples to return
    num_train_to_extract = int(dataset_train_labels.shape[0] *
                               fraction_training)

    dataset_train = DataSet(dataset_train_images[:num_train_to_extract, :] *
                            MAX_INTENSITY,
                            dataset_train_labels[:num_train_to_extract, :],
                            dtype=dtypes.float32,
                            reshape=False,
                            one_hot=True)
    # validation data set
    d_validation = np.load(file_dataset_validation)
    dataset_validation_labels = d_validation[setName + '_validation_labels']
    dataset_validation_images = d_validation[setName + '_validation_images']
    dataset_validation = DataSet(dataset_validation_images * MAX_INTENSITY,
                                 dataset_validation_labels,
                                 dtype=dtypes.float32,
                                 reshape=False,
                                 one_hot=True)

    # test data set
    d_test = np.load(file_dataset_test)
    dataset_test_labels = d_test[setName + '_test_labels']
    dataset_test_images = d_test[setName + '_test_images']
    dataset_test = DataSet(dataset_test_images * MAX_INTENSITY,
                           dataset_test_labels,
                           dtype=dtypes.float32,
                           reshape=False,
                           one_hot=True)
    # combine all data sets
    dataset = base.Datasets(train=dataset_train,
                            validation=dataset_validation,
                            test=dataset_test)

    return dataset
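An example call, with setName taken from the docstring; the dataset name and directory are hypothetical:

extracted = load_dataset('extracted', 'mnist_subset', './data', fraction_training=0.5)
print(extracted.train.num_examples, extracted.validation.num_examples)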
Example #4
        def __init__(self,path,validation_size=5000):
            from tensorflow.contrib.learn.python.learn.datasets.mnist import DataSet
            from tensorflow.contrib.learn.python.learn.datasets import base

            data = np.load(path)
            train = DataSet(data['ks'][validation_size:], data['ys'][validation_size:],
                            reshape=False, dtype=np.uint8, one_hot=False)  # np.uint8 is fine here even when the latents are int32
            validation = DataSet(data['ks'][:validation_size], data['ys'][:validation_size],
                                 reshape=False, dtype=np.uint8, one_hot=False)
            #test = DataSet(data['test_x'],np.argmax(data['test_y'],axis=1),reshape=False,dtype=np.float32,one_hot=False)
            self.size = data['ks'].shape[1]
            self.data = base.Datasets(train=train, validation=validation, test=None)
Example #5
def read_data_sets(validation_size=5000, one_hot=True):
    cifar_filename = "datasets/" + "cifar-10-python.tar.gz"

    try:
        os.makedirs("datasets")
    except OSError:
        pass

    if not os.path.isfile(cifar_dir + batches[0]):
        # Download data
        print("Downloading ckplus dataset")
        urllib.urlretrieve(
            "http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz",
            cifar_filename)
        tar = tarfile.open(cifar_filename)
        tar.extractall(path="datasets")
        tar.close()
        os.remove(cifar_filename)

    # Process batches
    all_batch_images = []
    all_batch_labels = []
    for batch_name in batches:
        batch = np.load(cifar_dir + batch_name)
        batch_images = batch['data']
        all_batch_images.append(batch_images)
        batch_labels = batch['labels']
        all_batch_labels.extend(batch_labels)

    all_batch_images = np.vstack(all_batch_images).reshape(-1, 3, 32, 32)
    all_batch_images = all_batch_images.transpose([0, 2, 3, 1])
    all_batch_labels = np.array(all_batch_labels)

    train_images, validation_images, train_labels, validation_labels = train_test_split(
        all_batch_images,
        all_batch_labels,
        test_size=validation_size,
        random_state=0)

    test_batch = np.load(cifar_dir + "test_batch")
    test_images = test_batch['data'].reshape(-1, 3, 32, 32)
    test_images = test_images.transpose([0, 2, 3, 1])

    test_labels = np.array(test_batch['labels'])

    if one_hot:
        train_labels = dense_to_one_hot(train_labels, NUM_CLASSES)
        validation_labels = dense_to_one_hot(validation_labels, NUM_CLASSES)
        test_labels = dense_to_one_hot(test_labels, NUM_CLASSES)

    train = DataSet(train_images, train_labels, reshape=False)
    validation = DataSet(validation_images, validation_labels, reshape=False)
    test = DataSet(test_images, test_labels, reshape=False)

    return Datasets(train=train, validation=validation, test=test)
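This snippet relies on module-level names that are not shown (cifar_dir, batches, NUM_CLASSES, plus dense_to_one_hot and sklearn's train_test_split). Plausible definitions for the globals, stated here as assumptions, would be:

# assumed module-level globals (not part of the snippet above)
cifar_dir = "datasets/cifar-10-batches-py/"               # directory produced by extracting the tarball
batches = ["data_batch_" + str(i) for i in range(1, 6)]   # the five CIFAR-10 training batches
NUM_CLASSES = 10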
Example #6
def load_data_ssl(params, dirn='../data'):
    '''
      load CIFAR-10 data and split into 4 blocks
    '''
    # parameter set
    n_train_lab = params['n_train_lab']
    n_train_unlab = params['n_train_unlab']
    n_val = params['n_val']
    mode = params['mode']

    X_train, y_train0 = load(dirn, subset='train')
    X_test, y_test0 = load(dirn, subset='test')
    y_train = onehot_label(y_train0)
    y_test = onehot_label(y_test0)

    # step.1 - split validation set
    n_train = y_train.shape[0] - n_val
    X_train_tot, X_validation, y_train_tot, y_validation = \
        random_sampling(X_train, y_train, n_labeled=n_train)
    n_train_total = y_train_tot.shape[0]

    # step.2 - split train set into labeled / unlabeled
    if (n_train_lab + n_train_unlab) > n_train_total:
        print('data splitting condition :')
        print('\tn_train_lab   = ', n_train_lab)
        print('\tn_train_unlab = ', n_train_unlab)
        print('\tn_train_total = ', n_train_total)
        raise ValueError('inconsistent parameters')

    # select bin_sampling or random_sampling (allowing class imbalance)
    if mode == 'random':
        X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = \
            random_sampling(X_train_tot, y_train_tot, n_labeled=n_train_lab)
    else:
        X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = \
            bin_sampling(X_train_tot, y_train_tot, n_labeled=n_train_lab)

    # matrix transpose (channel 1st -> channel last)
    X_train_lab = np.transpose(X_train_lab, (0, 2, 3, 1))
    X_train_unlab = np.transpose(X_train_unlab, (0, 2, 3, 1))
    X_validation = np.transpose(X_validation, (0, 2, 3, 1))
    X_test = np.transpose(X_test, (0, 2, 3, 1))

    # DataSet class construction
    train_lab = DataSet(X_train_lab, y_train_lab, reshape=False)
    train_unlab = DataSet(X_train_unlab, y_train_unlab, reshape=False)
    validation_set = DataSet(X_validation, y_validation, reshape=False)
    test_set = DataSet(X_test, y_test, reshape=False)

    cifar10_ssl = Datasets4(train_lab=train_lab,
                            train_unlab=train_unlab,
                            validation=validation_set,
                            test=test_set)
    return cifar10_ssl
Example #7
    def train(self, x_vec, y_vec):
        train_writer = tf.summary.FileWriter(
            'tensorboard/' + time.strftime("%Y%m%d-%H%M%S"), self.sess.graph)
        tf.summary.scalar("discriminator_loss", self.discriminator_loss)
        tf.summary.scalar("discriminator_loss_true",
                          self.discriminator_loss_true)
        tf.summary.scalar("discriminator_loss_false",
                          self.discriminator_loss_false)
        tf.summary.scalar("generator_loss", self.generator_loss)
        #        tf.summary.scalar("generator_learning_rate", self.gen_optimizer._lr_t)

        merged_summary_op = tf.summary.merge_all()

        images = np.ndarray((1, 784))
        labels = np.ndarray((1, 10))
        new_i = 0
        for i in range(len(mnist.train.labels)):
            if mnist.train.labels[i][0] > 0:
                images[new_i] = mnist.train.images[i] * 255
                labels[new_i] = mnist.train.labels[i]
                new_i += 1
                break

        mnist_ones = DataSet(images, labels, reshape=False)
        # mnist.train.images = images
        # mnist.train.labels = labels

        for i in range(iterations * generator_steps_per_iteration):
            if i % generator_steps_per_iteration == 0:
                batch = mnist_ones.next_batch(batch_size)
                _, _, summary = self.sess.run(
                    [
                        self.train_step_gen, self.train_step_dis,
                        merged_summary_op
                    ],
                    feed_dict={
                        self.d_input:
                        batch[0],
                        self.generator.x:
                        self.generator.create_batch_input(
                            x_vec, y_vec, batch_size)
                    })
                train_writer.add_summary(summary,
                                         i // generator_steps_per_iteration)
                print("Finished step " +
                      str(i // generator_steps_per_iteration))
            else:
                _ = self.sess.run(
                    [self.train_step_gen],
                    feed_dict={
                        self.generator.x:
                        self.generator.create_batch_input(
                            x_vec, y_vec, batch_size)
                    })
Example #8
def read_data_sets(
    fake_data=False,
    one_hot=False,
    dtype=dtypes.float32,
    reshape=True,
    validation_size=5000,
    seed=None,
):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    with gfile.Open(train_data_dir, 'rb') as f:
        train_images = extract_images(f)

    with gfile.Open(train_labels_dir, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    with gfile.Open(eval_data_dir, 'rb') as f:
        test_images = extract_images(f)

    with gfile.Open(eval_labels_dir, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
Example #9
def load_all_digits(dataDir, fraction_training):
    """
    Loads dataset with all digits from files in dataDir
    Return fraction_training examples from training datasets
    """
    # filenames
    file_all_train = os.path.join(dataDir, "all_train.npz")
    file_all_validation = os.path.join(dataDir, "all_validation.npz")
    file_all_test = os.path.join(dataDir, "all_test.npz")

    # all_digits data set
    # train data set
    all_train = np.load(file_all_train)
    all_train_labels = all_train['all_train_labels']
    all_train_images = all_train['all_train_images']

    # how many training examples to return
    num_train_to_extract = int(all_train_labels.shape[0] * fraction_training)

    all_train = DataSet(all_train_images[:num_train_to_extract, :] *
                        MAX_INTENSITY,
                        all_train_labels[:num_train_to_extract, :],
                        dtype=dtypes.float32,
                        reshape=False,
                        one_hot=True)
    # validation data set
    all_validation = np.load(file_all_validation)
    all_validation_labels = all_validation['all_validation_labels']
    all_validation_images = all_validation['all_validation_images']
    all_validation = DataSet(all_validation_images * MAX_INTENSITY,
                             all_validation_labels,
                             dtype=dtypes.float32,
                             reshape=False,
                             one_hot=True)
    # test data set
    all_test = np.load(file_all_test)
    all_test_labels = all_test['all_test_labels']
    all_test_images = all_test['all_test_images']
    all_test = DataSet(all_test_images * MAX_INTENSITY,
                       all_test_labels,
                       dtype=dtypes.float32,
                       reshape=False,
                       one_hot=True)

    # combine all data sets
    all_digits = base.Datasets(train=all_train,
                               validation=all_validation,
                               test=all_test)

    return all_digits
Example #10
def load_data_ssl(params, dirn='../data'):
    '''
      load MNIST data and split into 4 blocks
    '''
    MAX_TRIAL = 999  # retry-max to check label balance
    percent_limit = params['percent_limit']

    n_train_lab = params['n_train_lab']
    n_train_unlab = params['n_train_unlab']
    n_val = params['n_val']
    mnist = input_data.read_data_sets(dirn,
                                      validation_size=n_val,
                                      one_hot=True)
    n_train_total = mnist.train.num_examples
    if (n_train_lab + n_train_unlab) > n_train_total:
        raise ValueError('inconsistent parameters')

    X_train = mnist.train.images
    y_train = mnist.train.labels
    # prep. index list
    n_train = X_train.shape[0]

    num_trial = 0
    while num_trial < MAX_TRIAL:
        train_idx = np.random.permutation(n_train)
        # split train data into 2 (labeled / unlabeld)
        X_train_lab = X_train[train_idx[:n_train_lab]]
        y_train_lab = y_train[train_idx[:n_train_lab]]
        X_train_unlab = X_train[train_idx[n_train_lab:]]
        y_train_unlab = y_train[train_idx[n_train_lab:]]
        # check balance of label
        if check_label_balance(y_train_lab, percent_limit=percent_limit):
            break
        num_trial += 1
        if num_trial == MAX_TRIAL:
            raise ValueError('percentage range looks too narrow.')

    # cancel scaling of DataSet class constructor
    X_train_lab = X_train[train_idx[:n_train_lab]] * 255.
    X_train_unlab = X_train[train_idx[n_train_lab:]] * 255.

    train_lab = DataSet(X_train_lab, y_train_lab, reshape=False)
    train_unlab = DataSet(X_train_unlab, y_train_unlab, reshape=False)

    mnist_ssl = Datasets4(train_lab=train_lab,
                          train_unlab=train_unlab,
                          validation=mnist.validation,
                          test=mnist.test)
    return mnist_ssl
Example #11
    def __init__(self):
        import cifar_10
        cifar_10.load_and_preprocess_input(dataset_dir='resource/CIFAR_data')

        self.test_dataset = DataSet(cifar_10.validate_all['data'],
                                    cifar_10.validate_all['labels'],
                                    reshape=False)

        self.train_dataset = DataSet(cifar_10.train_all['data'],
                                     cifar_10.train_all['labels'],
                                     reshape=False)

        self.image_dimensions = cifar_10.image_width

        self.actual_class_labels = cifar_10.actual_class_labels
Example #12
def read_data_sets(train_dir,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)

    return base.Datasets(train=train, validation=None, test=None)
def expand_data_set(data):
    # generate translated Dataset
    i_r, i_c = data.images.shape
    l_r, l_c = data.labels.shape
    gen_images = np.ndarray((4 * i_r, i_c), dtype=np.float32)
    gen_labels = np.ndarray((4 * l_r, l_c))

    # Shift right
    gen_images[0:i_r, 1:] = data.images[:, :-1]
    gen_labels[0:l_r, :] = data.labels
    for i in range(28):
        gen_images[0:i_r, i * 28] = 0

    # Shift left
    gen_images[i_r:2 * i_r, :-1] = data.images[:, 1:]
    gen_labels[i_r:2 * i_r, :] = data.labels
    for i in range(28):
        gen_images[i_r:2 * i_r, i * 28 + 27] = 0

    # Shift up
    gen_images[2 * i_r:3 * i_r, :-28] = data.images[:, 28:]
    gen_labels[2 * i_r:3 * i_r, :] = data.labels
    gen_images[2 * i_r:3 * i_r, -28:] = 0  # zero the bottom row exposed by the upward shift

    # Shift down
    gen_images[3 * i_r:4 * i_r, 28:] = data.images[:, :-28]
    gen_labels[3 * i_r:4 * i_r, :] = data.labels
    gen_images[3 * i_r:4 * i_r, :28] = 0

    # Convert back to pixel values
    gen_images = gen_images * 255

    return DataSet(gen_images, gen_labels, reshape=False, one_hot=True)
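A brief usage sketch for expand_data_set, assuming MNIST is loaded with the standard loader so that images are flattened and labels are one-hot (loader import shown only for context):

from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data/', one_hot=True)
shifted_train = expand_data_set(mnist.train)   # four one-pixel shifts -> 4x the original examples
print(shifted_train.num_examples)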
Example #14
def _preprocess_dataset(dataset,
                        preprocess_fcn,
                        dtype=tf.float32,
                        reshape=True):
    from tensorflow.contrib.learn.python.learn.datasets.mnist import DataSet
    images, labels = preprocess_fcn(dataset.images, dataset.labels)
    return DataSet(images, labels, dtype=dtype, reshape=reshape)
Example #15
def make_dataset(sample_set, label_set):
    sample_train = np.array(sample_set, dtype=np.float32)
    label_train = np.zeros((len(label_set), n_class))
    for i in range(len(label_set)):
        label_train[i][int(label_set[i])] = 1
    options = dict(dtype=dtypes.float32, reshape=True, seed=None)
    train = DataSet(sample_train, label_train, **options)
    validation = DataSet(sample_train, label_train, **options)
    test = DataSet(sample_train, label_train, **options)

    dataset = base.Datasets(train=train, validation=validation, test=test)

    train = em.Dataset(dataset.train.images, dataset.train.labels, name="CLIMATE")
    validation = em.Dataset(dataset.validation.images, dataset.validation.labels, name="CLIMATE")
    test = em.Dataset(dataset.test.images, dataset.test.labels, name="CLIMATE")
    res = [train, validation, test]
    return em.Datasets.from_list(res)
    def prepare(self):
        """
        Obtain the data and create the computation graph.
        """
        cifar10_processing.maybe_download_and_extract()
        images, _, labels = cifar10_processing.load_training_data()
        # assign the test dataset that will be used by the workflow to test this and the quantized net
        test_images, _, test_labels = cifar10_processing.load_test_data()
        # create an instance of the DataSet class
        self._dataset = DataSet(images, labels, one_hot=True, reshape=False)
        self.test_data = (test_images, test_labels)
        self._input_placeholder, self._output_placeholder, self._label_placeholder = self._inference()
        self._loss_node = self._loss(self._output_placeholder,
                                     self._label_placeholder)
        self._accuracy_node = self.accuracy(self._output_placeholder,
                                            self._label_placeholder)
        self._train_step_node = self._train(self._loss_node)
Example #17
def build_modified_mnist(dataset,
                         digit,
                         extra_negatives=None,
                         write_to_folder=None,
                         include_negatives=True):
    addSize = 0 if extra_negatives is None else extra_negatives.shape[0]
    size = int(dataset.labels.shape[0] / 7) + addSize
    labels = np.ndarray(shape=(size, 1), dtype=dataset.labels.dtype)
    images = np.ndarray(shape=(size, dataset.images.shape[1]),
                        dtype=dataset.images.dtype)
    positives = 0
    negatives = addSize
    c = addSize
    np.random.seed(1)
    for i in range(0, addSize):
        labels[i][0] = 0
        images[i] = extra_negatives[i]
    for i in range(0, dataset.labels.shape[0]):
        if dataset.labels[i][digit] == 0 and include_negatives:
            if negatives < positives and np.random.uniform(0, 1, 1) <= 0.5:
                negatives += 1
                labels[c][0] = 0
                images[c] = dataset.images[i]
                if write_to_folder is not None:
                    writeImg(
                        images[c], 28, 28,
                        write_to_folder + "//negative//img" + str(c) + ".png")
                c += 1
        else:
            positives += 2
            labels[c][0] = 1
            images[c] = dataset.images[i]
            if write_to_folder is not None:
                writeImg(images[c], 28, 28,
                         write_to_folder + "//positive//img" + str(c) + ".png")
            c += 1
            if c >= size:
                break
            temp = np.copy(dataset.images[i])
            mask = np.random.choice([0, 1],
                                    size=dataset.images[i].shape[0],
                                    p=[0.5, 0.5])
            temp = np.multiply(temp, mask)
            labels[c][0] = 0.7
            images[c] = temp
            if write_to_folder is not None:
                writeImg(
                    images[c], 28, 28, write_to_folder +
                    "//positive//img-degraded" + str(c) + ".png")

            c += 1
        if c >= size:
            break

    mnist_modified = DataSet(images, labels, one_hot=True, reshape=False)
    return mnist_modified
Example #18
def read_data_sets(fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None):
	if fake_data:
		def fake():
			return DataSet(
					[], [], fake_data=True, one_hot=one_hot, dtype=dtype, seed=seed)

		train = fake()
		validation = fake()
		test = fake()
		return base.Datasets(train=train, validation=validation, test=test)

	TRAIN_IMAGES = "train-images.idx3-ubyte"
	TRAIN_LABELS = "train-labels.idx1-ubyte"
	TEST_IMAGES = "t10k-images.idx3-ubyte"
	TEST_LABELS = "t10k-labels.idx1-ubyte"

	train_images = loadImageSet(TRAIN_IMAGES)
	train_labels = loadLabelSet(TRAIN_LABELS)
	test_images = loadImageSet(TEST_IMAGES)
	test_labels = loadLabelSet(TEST_LABELS)

	if not 0 <= validation_size <= len(train_images):
		raise ValueError(
				'Validation size should be between 0 and {}. Received: {}.'
					.format(len(train_images), validation_size))

	validation_images = train_images[:validation_size]
	validation_labels = train_labels[:validation_size]
	train_images = train_images[validation_size:]
	train_labels = train_labels[validation_size:]

	options = dict(dtype=dtype, reshape=reshape, seed=seed)

	train = DataSet(train_images, train_labels, **options)
	validation = DataSet(validation_images, validation_labels, **options)
	test = DataSet(test_images, test_labels, **options)

	return base.Datasets(train=train, validation=validation, test=test)
Example #19
    def __init__(self, path, validation_size=1):
        #def __init__(self,path,validation_size=5000):
        from tensorflow.contrib.learn.python.learn.datasets.mnist import DataSet
        from tensorflow.contrib.learn.python.learn.datasets import base

        data = np.load(path)
        train = DataSet(data['hiddens'][validation_size:],
                        data['ys'][validation_size:],
                        reshape=False,
                        dtype=np.uint8,
                        one_hot=False)
        validation = DataSet(data['hiddens'][:validation_size],
                             data['ys'][:validation_size],
                             reshape=False,
                             dtype=np.uint8,
                             one_hot=False)
        self.size = data['hiddens'].shape[1]
        self.data = base.Datasets(train=train,
                                  validation=validation,
                                  test=None)
Example #20
def cifar_datasets(dirname, one_hot=True,
                   dtype=dtypes.float32,
                   reshape=False,
                   seed=None):
    maybe_download_and_extract(dirname)
    dirname = os.path.join(dirname, 'cifar-10-batches-py/')
    train_images = []
    train_labels = []
    for i in range(1, 6):
        fpath = os.path.join(dirname, 'data_batch_' + str(i))
        image, label = load_batch(fpath)
        if i == 1:
            train_images = np.array(image)
            train_labels = np.array(label)
        else:
            train_images = np.concatenate([train_images, image], axis=0)
            train_labels = np.concatenate([train_labels, label], axis=0)
    train_images = np.dstack((train_images[:, :1024], train_images[:, 1024:2048], train_images[:, 2048:]))
    train_images = np.reshape(train_images, [-1, 32, 32, 3])
    if one_hot:
        train_labels = dense_to_one_hot(train_labels, 10)
    print('Cifar train_images size:', train_images.shape)
    print('Cifar train_labels size:', train_labels.shape)
    train_images = train_images / 255.0 - 0.5

    fpath = os.path.join(dirname, "test_batch")
    image, label = load_batch(fpath)
    test_images = np.array(image)
    test_labels = np.array(label)
    test_images = np.dstack((test_images[:, :1024], test_images[:, 1024:2048], test_images[:, 2048:]))
    test_images = np.reshape(test_images, [-1, 32, 32, 3])
    if one_hot:
        test_labels = dense_to_one_hot(test_labels, 10)
    print "Cifar test_images size:", test_images.shape
    print "Cifar test_lables size:", test_labels.shape
    test_images = test_images / 255.0 - 0.5

    options = dict(dtype=dtype, reshape=reshape, seed=seed)
    train = DataSet(train_images, train_labels, **options)
    test = DataSet(test_images, test_labels, **options)
    return Datasets(train=train, test=test)
Example #21
def load_minst(src=None, path=None, one_hot=False):
    mnist = DataSets()
    if src:
        mnist = input_data.read_data_sets("MNIST_data/", one_hot=one_hot)
    if path:
        if path[-1] != '/':
            path += '/'
        train_images = extract_images(path + TRAIN_IMAGES)
        train_labels = extract_labels(path + TRAIN_LABELS, one_hot=one_hot)
        test_images = extract_images(path + TEST_IMAGES)
        test_labels = extract_labels(path + TEST_LABELS, one_hot=one_hot)

        validation_images = train_images[:VALIDATION_SIZE]
        validation_labels = train_labels[:VALIDATION_SIZE]
        train_images = train_images[VALIDATION_SIZE:]
        train_labels = train_labels[VALIDATION_SIZE:]

        mnist.train = DataSet(train_images, train_labels)
        mnist.validation = DataSet(validation_images, validation_labels)
        mnist.test = DataSet(test_images, test_labels)
    return mnist
Example #22
def read_data_sets(param,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000):
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    TMP_DIR = '../../tmp/'

    local_file = os.path.join(TMP_DIR, TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = os.path.join(TMP_DIR, TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = os.path.join(TMP_DIR, TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = os.path.join(TMP_DIR, TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
Example #23
def load_mnist_binary_dataset(directory, split):
    from tensorflow.contrib.learn.python.learn.datasets.mnist import DataSet
    np_filepath = os.path.join(directory, FILE_TEMPLATE_NP.format(split=split))
    def lines_to_np_array(lines):
        return np.array([[int(i) for i in line.split()] for line in lines])
    if os.path.isfile(np_filepath):
        np_data = np.load(np_filepath)
    else:
        with open(download(directory, FILE_TEMPLATE.format(split=split))) as f:
            lines = f.readlines()
        np_data = lines_to_np_array(lines).astype('float32')
        np.save(np_filepath, np_data)
    return DataSet(np_data.reshape([-1, 28, 28, 1]) * 255, np.zeros(len(np_data)))
def prepare_data(corruption_matrix, gold_fraction=0.05, merge_valset=True):
    np.random.seed(1)

    mnist_images = np.copy(mnist.train.images)
    mnist_labels = np.copy(mnist.train.labels)
    if merge_valset:
        mnist_images = np.concatenate(
            [mnist_images, np.copy(mnist.validation.images)], axis=0)
        mnist_labels = np.concatenate(
            [mnist_labels, np.copy(mnist.validation.labels)])

    indices = np.arange(len(mnist_labels))
    np.random.shuffle(indices)

    mnist_images = mnist_images[indices]
    mnist_labels = mnist_labels[indices].astype(np.int64)
    mnist_labels_orig = np.copy(mnist_labels)

    num_gold = int(len(mnist_labels) * gold_fraction)
    num_silver = len(mnist_labels) - num_gold

    for i in range(num_silver):
        mnist_labels[i] = np.random.choice(
            num_classes, p=corruption_matrix[mnist_labels[i]])

    # the dtype flag matters so that the DataSet class doesn't renormalize the images by /255
    gold = DataSet(mnist_images[num_silver:],
                   mnist_labels[num_silver:],
                   reshape=False,
                   dtype=dtypes.uint8)
    silver = DataSet(mnist_images[:num_silver],
                     np.array(
                         list(
                             zip(mnist_labels[:num_silver],
                                 mnist_labels_orig[:num_silver]))),
                     reshape=False,
                     dtype=dtypes.uint8)

    return gold, silver
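A usage sketch for prepare_data, assuming the global mnist Datasets was loaded with dense (non-one-hot) labels and that num_classes = 10; an identity corruption matrix leaves the silver labels unchanged:

corruption_matrix = np.eye(10)                  # identity matrix: no label noise
gold, silver = prepare_data(corruption_matrix, gold_fraction=0.05)
print(gold.num_examples, silver.num_examples)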
Example #25
def load_data_ssl(params, dirn='../data'):
    '''
      load MNIST data and split into 4 blocks
    '''
    n_train_lab = params['n_train_lab']
    n_train_unlab = params['n_train_unlab']
    n_val = params['n_val']
    mode = params['mode']

    mnist = input_data.read_data_sets(dirn,
                                      validation_size=n_val,
                                      one_hot=True)
    n_train_total = mnist.train.num_examples
    if (n_train_lab + n_train_unlab) > n_train_total:
        raise ValueError('inconsistent parameters')

    X_train = mnist.train.images
    y_train = mnist.train.labels

    # select bin_sampling or random_sampling (allowing class imbalance)
    if mode == 'random':
        X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = \
            random_sampling(X_train, y_train, n_labeled=n_train_lab)
    else:
        X_train_lab, X_train_unlab, y_train_lab, y_train_unlab = \
            bin_sampling(X_train, y_train, n_labeled=n_train_lab)

    # cancel scaling by DataSet class constructor
    X_train_lab = X_train_lab * 255.
    X_train_unlab = X_train_unlab * 255.

    train_lab = DataSet(X_train_lab, y_train_lab, reshape=False)
    train_unlab = DataSet(X_train_unlab, y_train_unlab, reshape=False)

    mnist_ssl = Datasets4(train_lab=train_lab,
                          train_unlab=train_unlab,
                          validation=mnist.validation,
                          test=mnist.test)
    return mnist_ssl
Example #26
    def split_dataset(self, dtype=dtypes.float32, reshape=True, seed=None, validation_size=7000): 
        labels = self.dataset.train.labels  

        # SPLIT FIRST GROUP (0-4)
        # Find all training images/labels 0-4
        train_labels_idx = np.nonzero(self.dataset.train.labels)[1]
        train_labels_idx = np.nonzero(train_labels_idx < 5)[0]
        train_labels = self.dataset.train.labels[train_labels_idx]
        train_images = self.dataset.train.images[train_labels_idx]

        # Find all testing images/labels 0-4
        test_labels_idx = np.nonzero(self.dataset.test.labels)[1]
        test_labels_idx = np.nonzero(test_labels_idx < 5)[0]
        test_labels = self.dataset.test.labels[test_labels_idx] 
        test_images = self.dataset.test.images[test_labels_idx] 

        # Create validation/training groups 
        validation_images = train_images[:validation_size]
        validation_labels = train_labels[:validation_size]
        train_images = train_images[validation_size:]
        train_labels = train_labels[validation_size:]

        options = dict(dtype=dtype, reshape=False, seed=seed)

        # Define training, validation, and testing datasets  
        train = DataSet(train_images, train_labels, **options)
        validation = DataSet(validation_images, validation_labels, **options)
        test = DataSet(test_images, test_labels, **options)

        first_dataset = base.Datasets(train=train, validation=validation, test=test)

        # SPLIT SECOND GROUP (5-9)
        # Find all training images/labels 5-9 
        train_labels_idx = np.nonzero(self.dataset.train.labels)[1]
        train_labels_idx = np.nonzero(train_labels_idx >= 5)[0]
        train_labels_2 = self.dataset.train.labels[train_labels_idx]
        train_images_2 = self.dataset.train.images[train_labels_idx]

        # Find all testing images/labels 5-9 
        test_labels_idx = np.nonzero(self.dataset.test.labels)[1]
        test_labels_idx = np.nonzero(test_labels_idx >= 5)[0]
        test_labels_2 = self.dataset.test.labels[test_labels_idx] 
        test_images_2 = self.dataset.test.images[test_labels_idx] 

        # Create validation/training groups 
        validation_images_2 = train_images_2[:validation_size]
        validation_labels_2 = train_labels_2[:validation_size]
        train_images_2 = train_images_2[validation_size:]
        train_labels_2 = train_labels_2[validation_size:]

        # Define training, validation, and testing datasets  
        train_2 = DataSet(train_images_2, train_labels_2, **options)
        validation_2 = DataSet(validation_images_2, validation_labels_2, **options)
        test_2 = DataSet(test_images_2, test_labels_2, **options)

        second_dataset = base.Datasets(train=train_2, validation=validation_2, test=test_2)

        return first_dataset, second_dataset 
Example #27
def split_mnist(mnist, cond):
    sets = ["train", "validation", "test"]
    sets_list = []
    for set_name in sets:
        this_set = getattr(mnist, set_name)
        maxlabels = np.argmax(this_set.labels, 1)
        sets_list.append(
            DataSet(this_set.images[cond(maxlabels), :],
                    this_set.labels[cond(maxlabels)],
                    dtype=dtypes.uint8,
                    reshape=False))
    return base.Datasets(train=sets_list[0],
                         validation=sets_list[1],
                         test=sets_list[2])
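A usage sketch: cond is a boolean predicate over the dense digit labels recovered by np.argmax, so splitting MNIST into digits 0-4 and 5-9 (assuming mnist was loaded with one_hot=True) would look like:

mnist_low = split_mnist(mnist, lambda digits: digits < 5)     # digits 0-4
mnist_high = split_mnist(mnist, lambda digits: digits >= 5)   # digits 5-9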
Example #28
def load_data(dirn='../data'):
    '''
      load CIFAR-10 data and split into 3 blocks
    '''
    # parameter set
    n_train = 40000
    n_val = 10000

    X_train0, y_train0 = load(dirn, subset='train')
    X_test, y_test0 = load(dirn, subset='test')
    print('Files are loaded.')
    y_train1 = onehot_label(y_train0)
    y_test = onehot_label(y_test0)

    # split validation set
    X_train, y_train, X_validation, y_validation = \
        random_sampling(X_train0, y_train1, n_validation=n_val)
    '''
    print('X_train: ', X_train.shape, ', ', type(X_train))
    print('X_validation: ', X_validation.shape, ', ', type(X_validation))
    '''

    # matrix transpose (channel 1st -> channel last)
    X_train = np.transpose(X_train, (0, 2, 3, 1))
    X_validation = np.transpose(X_validation, (0, 2, 3, 1))
    X_test = np.transpose(X_test, (0, 2, 3, 1))

    # DataSet class construction
    train_lab = DataSet(X_train, y_train, reshape=False)
    validation_set = DataSet(X_validation, y_validation, reshape=False)
    test_set = DataSet(X_test, y_test, reshape=False)

    cifar10 = Datasets(train=train_lab,
                       validation=validation_set,
                       test=test_set)
    return cifar10
Example #29
def gen_splitMNIST(bounds):
    dataset = input_data.read_data_sets(FLAGS.data_dir, one_hot=True)
    sets = ["train", "validation", "test"]
    sets_list = []
    for set_name in sets:
        this_set = getattr(dataset, set_name)
        maxlabels = np.argmax(this_set.labels, 1)
        sets_list.append(
            DataSet(this_set.images[((maxlabels >= bounds[0]) &
                                     (maxlabels <= bounds[1])), :],
                    this_set.labels[((maxlabels >= bounds[0]) &
                                     (maxlabels <= bounds[1]))],
                    dtype=dtypes.uint8,
                    reshape=False))
    return base.Datasets(train=sets_list[0],
                         validation=sets_list[1],
                         test=sets_list[2])
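The bounds passed to gen_splitMNIST are inclusive, so the same 0-4 / 5-9 split reads:

low_digits = gen_splitMNIST((0, 4))    # digits 0-4
high_digits = gen_splitMNIST((5, 9))   # digits 5-9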
Example #30
def prep_imbalanced_dataset(dirn='../data'):
    """
      prepare imbalanced dataset
        label-1: dominant label
        label-3: fewer label (about 5% of label-1)
        label-5: fewer label (about 5% of label-1)
    """
    mnist = input_data.read_data_sets(dirn, one_hot=False)
    mnist3 = Datasets(train=None, test=None)

    for subset in [mnist.train, mnist.test]:
        mnist_lab = subset.labels
        idx1 = (mnist_lab == 1)     # 'Trouser' class in Fashion-MNIST
        idx3 = (mnist_lab == 3)     # 'Dress'   class
        idx5 = (mnist_lab == 5)     # 'Sandal'  class

        small = subset.num_examples // 200     # original ...total // 10
        idx1 = [i for i in range(len(idx1)) if idx1[i]]
        idx3 = [i for i in range(len(idx3)) if idx3[i]]
        idx5 = [i for i in range(len(idx5)) if idx5[i]]

        idx_merged = np.concatenate([idx1, idx3[:small], idx5[:small]])

        X_sub = subset.images[idx_merged]
        y_sub = subset.labels[idx_merged]

        # make one-hot label
        y_oh = []
        for lab in y_sub:
            lab_i = np.zeros([10], dtype=np.float32)
            lab_i[lab] = 1.0
            y_oh.append(lab_i)
        y_sub = np.asarray(y_oh)

        # adjust before re-entering into DataSet object
        X_sub = X_sub * 255.

        mnist_sub = DataSet(X_sub, y_sub, reshape=False)
    
        if subset == mnist.train:
            mnist3 = mnist3._replace(train=mnist_sub)
        if subset == mnist.test:
            mnist3 = mnist3._replace(test=mnist_sub)

    return mnist3