Example #1
def maybe_download_mnist(train_dir, SOURCE_URL, train=True, one_hot=True):
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if train:
        return train_images, train_labels
    else:
        return test_images, test_labels
Example #2
def import_mnist():
    """
    Imports MNIST and saves the data as an object of our DataSet class
    :return:
    """
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    VALIDATION_SIZE = 0
    ONE_HOT = True
    TRAIN_DIR = 'MNIST_data'


    local_file = base.maybe_download(TRAIN_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=ONE_HOT)

    local_file = base.maybe_download(TEST_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=ONE_HOT)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    ## Process images
    train_images = process_mnist(train_images)
    validation_images = process_mnist(validation_images)
    test_images = process_mnist(test_images)

    ## Standardize data
    train_mean, train_std = get_data_info(train_images)
#    train_images = standardize_data(train_images, train_mean, train_std)
#    validation_images = standardize_data(validation_images, train_mean, train_std)
#    test_images = standardize_data(test_images, train_mean, train_std)

    # data = DataSet(train_images, train_labels)
    # test = DataSet(test_images, test_labels)
    # val = DataSet(validation_images, validation_labels)
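    # NOTE: the DataSet calls below pass the images as both inputs and
    # targets, presumably autoencoder-style reconstruction; the commented-out
    # lines above show the supervised, label-based variant.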

    data = DataSet(train_images, train_images)
    test = DataSet(test_images, test_images)
    val = DataSet(validation_images, validation_images)


    return data, test, val
Example #3
def import_mnist(validation_size=0):
    """
    Imports MNIST and saves the data as an object of our DataSet class
    :param validation_size: number of training examples to hold out for validation
    :return:
    """
    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    ONE_HOT = True
    TRAIN_DIR = 'experiments/data/MNIST_data'

    local_file = base.maybe_download(TRAIN_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_IMAGES)
    with open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TRAIN_LABELS)
    with open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=ONE_HOT)

    local_file = base.maybe_download(TEST_IMAGES, TRAIN_DIR,
                                     SOURCE_URL + TEST_IMAGES)
    with open(local_file, 'rb') as f:
        test_images = extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, TRAIN_DIR,
                                     SOURCE_URL + TEST_LABELS)
    with open(local_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=ONE_HOT)

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    # process images
    train_images = process_mnist(train_images)
    validation_images = process_mnist(validation_images)
    test_images = process_mnist(test_images)

    # standardize data
    train_mean, train_std = get_data_info(train_images)
    train_images = standardize_data(train_images, train_mean, train_std)
    validation_images = standardize_data(validation_images, train_mean,
                                         train_std)
    test_images = standardize_data(test_images, train_mean, train_std)

    data = DataSet(train_images, train_labels)
    test = DataSet(test_images, test_labels)
    val = DataSet(validation_images, validation_labels)

    return data, test, val
Example #4
def load_data():
    with open('../../data/mnist/train-images-idx3-ubyte.gz', 'rb') as f:
        train_images = np.squeeze(extract_images(f))
    with open('../../data/mnist/train-labels-idx1-ubyte.gz', 'rb') as f:
        train_labels = extract_labels(f)
    with open('../../data/mnist/t10k-images-idx3-ubyte.gz', 'rb') as f:
        test_images = np.squeeze(extract_images(f))
    with open('../../data/mnist/t10k-labels-idx1-ubyte.gz', 'rb') as f:
        test_labels = extract_labels(f)
    return train_images, train_labels, test_images, test_labels
Example #5
def read_data_sets(
    fake_data=False,
    one_hot=False,
    dtype=dtypes.float32,
    reshape=True,
    validation_size=5000,
    seed=None,
):
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype,
                           seed=seed)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)
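
    # NOTE: train_data_dir, train_labels_dir, eval_data_dir and eval_labels_dir
    # are assumed to be module-level path constants defined elsewhere in the
    # original file; they are not part of this snippet.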

    with gfile.Open(train_data_dir, 'rb') as f:
        train_images = extract_images(f)

    with gfile.Open(train_labels_dir, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    with gfile.Open(eval_data_dir, 'rb') as f:
        test_images = extract_images(f)

    with gfile.Open(eval_labels_dir, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)
    validation = DataSet(validation_images, validation_labels, **options)
    test = DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
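
# hypothetical usage (assumes the four *_dir path constants point at the
# MNIST .gz files); next_batch is part of the contrib DataSet API:
#   datasets = read_data_sets(one_hot=True, validation_size=5000)
#   batch_xs, batch_ys = datasets.train.next_batch(100)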
Example #6
def loadModelData():
    # load EMNIST data
    with open(TRAIN_IMAGES_PATH, 'rb') as f:
        train_images = extract_images(f)
    with open(TRAIN_LABELS_PATH, 'rb') as f:
        train_labels = extract_labels(f)

    with open(TEST_IMAGES_PATH, 'rb') as f:
        test_images = extract_images(f)
    with open(TEST_LABELS_PATH, 'rb') as f:
        test_labels = extract_labels(f)

    # "rename" to make it similar to the tutorial
    # https://github.com/tflearn/tflearn/blob/master/examples/images/convnet_mnist.py
    X, Y, testX, testY = train_images, train_labels, test_images, test_labels

    # data preprocessing
    X = X.reshape([-1, 28, 28, 1])
    testX = testX.reshape([-1, 28, 28, 1])
    Y = to_categorical(Y, nb_classes=62)
    testY = to_categorical(testY, nb_classes=62)

    # Building convolutional network
    # the input is a 28x28 image with 1 channel
    network = input_data(shape=[None, 28, 28, 1], name='input')

    # 3 x convolution + max pooling
    network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
    network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
    network = conv_2d(network, 32, 3, activation='relu', regularizer="L2")
    network = max_pool_2d(network, 2)

    # fully connected with 512 nodes + some dropout
    network = fully_connected(network, 512, activation='relu')
    network = dropout(network, 0.5)
    # fully connected with 62 nodes which are the outputs
    network = fully_connected(network, 62, activation='softmax')

    # train the network with regression
    network = regression(network,
                         optimizer='adam',
                         loss='categorical_crossentropy',
                         name='target')

    # Training
    model = tflearn.DNN(network,
                        tensorboard_verbose=0,
                        checkpoint_path='classifier.tfl.ckpt')

    return model, X, Y, testX, testY
Example #7
def read_data_sets(train_dir,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   seed=None,
                   source_url=None): # omit url since we are using our own dataset
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 'test-images-idx3-ubyte.gz'
    TEST_LABELS = 'test-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir, None) # omit url, local file will be a path

    # type: DataSets
    with gfile.Open(local_file, 'rb') as f:
        train_images = mnist_module.extract_images(f)

    print(train_images.shape)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir, None) # omit url
    with gfile.Open(local_file, 'rb') as f:
        train_labels = mnist_module.extract_labels(f, one_hot=one_hot)

    local_file = base.maybe_download(TEST_IMAGES, train_dir, None) # omit url
    with gfile.Open(local_file, 'rb') as f:
        test_images = mnist_module.extract_images(f)

    local_file = base.maybe_download(TEST_LABELS, train_dir, None) # omit url
    with gfile.Open(local_file, 'rb') as f:
        test_labels = mnist_module.extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = mnist_module.DataSet(train_images, train_labels, **options)
    validation = mnist_module.DataSet(validation_images, validation_labels, **options)
    test = mnist_module.DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
Example #8
def read_data_sets(train_dir,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   seed=None,
                   source_url=DEFAULT_SOURCE_URL):
    if not source_url:  # empty string check
        source_url = DEFAULT_SOURCE_URL
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'

    local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                     source_url + TRAIN_IMAGES)
    with gfile.Open(local_file, 'rb') as f:
        train_images = extract_images(f)

    local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                     source_url + TRAIN_LABELS)
    with gfile.Open(local_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot)

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = DataSet(train_images, train_labels, **options)

    return base.Datasets(train=train, validation=None, test=None)
Example #9
def load_data(task_name=None):
    """Loads data associated with given task

  Args:
    task_name: str, the name of the task to load.

  Returns:
    Tuple of numpy arrays (inputs, data_to_generate), each scaled to the range [-1, 1].
  """
    assert task_name is not None

    if task_name == 'mnist':
        with open(MNIST_IMGS_PATH, 'rb') as imgs_file:
            data = extract_images(imgs_file)
        inputs = None
    elif task_name == 'cmnist':
        with open(MNIST_IMGS_PATH, 'rb') as imgs_file:
            data = extract_images(imgs_file)
        with open(MNIST_LABELS_PATH, 'rb') as labels_file:
            inputs = extract_labels(labels_file)
        inputs = np.eye(10)[inputs]
    else:
        raise ValueError('Unknown task: {}'.format(task_name))

    data = (data - data.min()) / (data.max() - data.min())  # Normalize
    data = (data * 2) - 1  # Move to range [-1, 1]
    if inputs is not None:
        inputs = (inputs - inputs.min()) / (inputs.max() - inputs.min())  # Normalize
        inputs = (inputs * 2) - 1  # Move to range [-1, 1]

    return inputs, data
Example #10
def write_mnist_data(input_images, input_labels, output, partitions):
    with open(input_images, 'rb') as f:
        images = numpy.array(mnist.extract_images(f))

    with open(input_labels, 'rb') as f:
        labels = numpy.array(mnist.extract_labels(f, one_hot=True))

    shape = images.shape
    print("images.shape: {0}".format(shape))
    print("labels.shape: {0}".format(labels.shape))

    images = images.reshape(shape[0], shape[1], shape[2])
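    # write the examples across `partitions` TFRecord shards; each shard gets
    # num_per_part examples (the final shard may hold fewer)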
    num_per_part = int(math.ceil(float(shape[0]) / partitions))
    seq = 0
    filename = output + "/" + str(seq) + ".tfrecords"
    writer = tf.python_io.TFRecordWriter(filename)

    for i in range(shape[0]):
        if i != 0 and i % num_per_part == 0:
            writer.close()
            seq += 1
            filename = output + "/" + str(seq) + ".tfrecords"
            writer = tf.python_io.TFRecordWriter(filename)
        image_raw = images[i].tostring()
        example = tf.train.Example(features=tf.train.Features(
            feature={
                'image_raw': _bytes_feature(image_raw),
                'label': _int64_feature(labels[i].astype(int))
            }))
        writer.write(example.SerializeToString())
    writer.close()
Example #11
def load_data_fashion_mnist(data_dir, one_hot=False, num_classes=10):
    train_image_file = os.path.join(data_dir, 'train-images-idx3-ubyte.gz')
    train_labels_file = os.path.join(data_dir, 'train-labels-idx1-ubyte.gz')
    test_image_file = os.path.join(data_dir, 't10k-images-idx3-ubyte.gz')
    test_labels_file = os.path.join(data_dir, 't10k-labels-idx1-ubyte.gz')
    with gfile.Open(train_image_file, 'rb') as f:
        train_images = extract_images(f)

    with gfile.Open(train_labels_file, 'rb') as f:
        train_labels = extract_labels(f, one_hot=one_hot, num_classes=num_classes)

    with gfile.Open(test_image_file, 'rb') as f:
        test_images = extract_images(f)

    with gfile.Open(test_labels_file, 'rb') as f:
        test_labels = extract_labels(f, one_hot=one_hot, num_classes=num_classes)

    return train_images, train_labels, test_images, test_labels
Example #12
def writeMNIST(sc, input_images, input_labels, output, format, num_partitions):
    """Writes MNIST image/label vectors into parallelized files on HDFS"""
    # load MNIST gzip into memory
    with open(input_images, 'rb') as f:
        images = numpy.array(mnist.extract_images(f))

    with open(input_labels, 'rb') as f:
        if format == "csv2":
            labels = numpy.array(mnist.extract_labels(f, one_hot=False))
        else:
            labels = numpy.array(mnist.extract_labels(f, one_hot=True))

    shape = images.shape
    print("images.shape: {0}".format(shape))  # 60000 x 28 x 28
    print("labels.shape: {0}".format(labels.shape))  # 60000 x 10

    # create RDDs of vectors
    imageRDD = sc.parallelize(images.reshape(shape[0], shape[1] * shape[2]),
                              num_partitions)
    labelRDD = sc.parallelize(labels, num_partitions)

    output_images = output + "/images"
    output_labels = output + "/labels"

    # save RDDs as specific format
    if format == "pickle":
        imageRDD.saveAsPickleFile(output_images)
        labelRDD.saveAsPickleFile(output_labels)
    elif format == "csv":
        imageRDD.map(toCSV).saveAsTextFile(output_images)
        labelRDD.map(toCSV).saveAsTextFile(output_labels)
    elif format == "csv2":
        imageRDD.map(toCSV).zip(labelRDD).map(
            lambda x: str(x[1]) + "|" + x[0]).saveAsTextFile(output)
    else:  # format == "tfr":
        tfRDD = imageRDD.zip(labelRDD).map(
            lambda x: (bytearray(toTFExample(x[0], x[1])), None))
        # requires: --jars tensorflow-hadoop-1.0-SNAPSHOT.jar
        tfRDD.saveAsNewAPIHadoopFile(
            output,
            "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
            keyClass="org.apache.hadoop.io.BytesWritable",
            valueClass="org.apache.hadoop.io.NullWritable")
Example #13
def load_data():
    cwd = os.getcwd()
    # training images
    with open(os.path.join(cwd, "train-images-idx3-ubyte.gz"), "rb") as f:
        train_images = extract_images(f)

    # training labels
    with open(os.path.join(cwd, "train-labels-idx1-ubyte.gz"), "rb") as f:
        train_labels = extract_labels(f)

    # testing images
    with open(os.path.join(cwd, "t10k-images-idx3-ubyte.gz"), "rb") as f:
        test_images = extract_images(f)

    # testing labels
    with open(os.path.join(cwd, "t10k-labels-idx1-ubyte.gz"), "rb") as f:
        test_labels = extract_labels(f)

    return (train_images, train_labels), (test_images, test_labels)
Example #14
def import_mnist():
    """
    Imports MNIST and saves the data as an object of our DataSet class
    :return:
    """
    VALIDATION_SIZE = 0
    ONE_HOT = True
    TRAIN_DIR = 'INFMNIST_data/'

    with open(TRAIN_DIR + 'mnist8m-patterns-idx3-ubyte.gz', 'rb') as f:
        train_images = extract_images_2(f)

    with open(TRAIN_DIR + 'mnist8m-labels-idx1-ubyte.gz', 'rb') as f:
        train_labels = extract_labels(f, one_hot=ONE_HOT)

    with open(TRAIN_DIR + 'test10k-patterns.gz', 'rb') as f:
        test_images = extract_images(f)

    with open(TRAIN_DIR + 'test10k-labels.gz', 'rb') as f:
        test_labels = extract_labels(f, one_hot=ONE_HOT)

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]

    ## Process images
    train_images = process_mnist(train_images)
    validation_images = process_mnist(validation_images)
    test_images = process_mnist(test_images)

    ## Standardize data
    train_mean, train_std = get_data_info(train_images)
    #    train_images = standardize_data(train_images, train_mean, train_std)
    #    validation_images = standardize_data(validation_images, train_mean, train_std)
    #    test_images = standardize_data(test_images, train_mean, train_std)

    data = DataSet(train_images, train_labels)
    test = DataSet(test_images, test_labels)
    val = DataSet(validation_images, validation_labels)

    return data, test, val
Example #15
def import_mnist():
    if os.path.isdir(
            DATA_DIR) is False:  # directory does not exist, download the data
        get_mnist8m_data()

    with open(TRAIN_INPUTS, 'rb') as f:
        train_images = extract_images(f)
        train_images = process_mnist(train_images)

    with open(TRAIN_OUTPUTS, 'rb') as f:
        train_labels = extract_labels(f, one_hot=True)

    with open(TEST_INPUTS, 'rb') as f:
        test_images = extract_images(f)
        test_images = process_mnist(test_images)

    with open(TEST_OUTPUTS, 'rb') as f:
        test_labels = extract_labels(f, one_hot=True)

    return datasets.DataSet(train_images, train_labels), datasets.DataSet(
        test_images, test_labels)
Example #16
def load_mnist(src=None, path=None, one_hot=False):
    mnist = DataSets()
    if src:
        mnist = input_data.read_data_sets("MNIST_data/", one_hot=one_hot)
    if path:
        if path[-1] != '/':
            path += '/'
        with open(path + TRAIN_IMAGES, 'rb') as f:
            train_images = extract_images(f)
        with open(path + TRAIN_LABELS, 'rb') as f:
            train_labels = extract_labels(f, one_hot=one_hot)
        with open(path + TEST_IMAGES, 'rb') as f:
            test_images = extract_images(f)
        with open(path + TEST_LABELS, 'rb') as f:
            test_labels = extract_labels(f, one_hot=one_hot)

        validation_images = train_images[:VALIDATION_SIZE]
        validation_labels = train_labels[:VALIDATION_SIZE]
        train_images = train_images[VALIDATION_SIZE:]
        train_labels = train_labels[VALIDATION_SIZE:]

        mnist.train = DataSet(train_images, train_labels)
        mnist.validation = DataSet(validation_images, validation_labels)
        mnist.test = DataSet(test_images, test_labels)
    return mnist
Example #17
def mnist_data_loader(one_hot=False, reshape=True):
    """Load MNIST dataset."""
    # Download the dataset if not exist.
    # CVDF mirror of http://yann.lecun.com/exdb/mnist/
    DATA_URL = 'https://storage.googleapis.com/cvdf-datasets/mnist/'

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'

    file_list = [TRAIN_IMAGES, TRAIN_LABELS, TEST_IMAGES, TEST_LABELS]
    file_list = [
        maybe_download(MNIST_DIR, DATA_URL + fil) for fil in file_list
    ]

    with gfile.Open(file_list[0], 'rb') as f:
        train_data = mnist.extract_images(f) / 255

    with gfile.Open(file_list[1], 'rb') as f:
        train_labels = mnist.extract_labels(f, one_hot)

    with gfile.Open(file_list[2], 'rb') as f:
        test_data = mnist.extract_images(f) / 255

    with gfile.Open(file_list[3], 'rb') as f:
        test_labels = mnist.extract_labels(f, one_hot)

    # Convert the shape of image, if reshape
    # [n_samples, width, length, 1] ==> [n_samples, n_features]
    if reshape:
        assert train_data.shape[1:] == test_data.shape[1:]
        n_train, width, length, _ = train_data.shape
        n_test = test_data.shape[0]
        train_data = train_data.reshape(n_train, width * length)
        test_data = test_data.reshape(n_test, width * length)

    return train_data, train_labels, test_data, test_labels
Example #18
def convert_to_data_sets(data_gzs,
                         one_hot=False,
                         dtype=dtypes.float32,
                         reshape=True,
                         validation_size=5000,
                         seed=None):
    """ Modified version of tensorflow/tensorflow/contrib/learn/python/learn/datasets/mnist.py """

    with gfile.Open(data_gzs['train-images'][0], 'rb') as f:
        train_images = tf_mnist.extract_images(f)

    with gfile.Open(data_gzs['train-labels'][0], 'rb') as f:
        train_labels = tf_mnist.extract_labels(f, one_hot=one_hot)

    with gfile.Open(data_gzs['t10k-images'][0], 'rb') as f:
        test_images = tf_mnist.extract_images(f)

    with gfile.Open(data_gzs['t10k-labels'][0], 'rb') as f:
        test_labels = tf_mnist.extract_labels(f, one_hot=one_hot)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))

    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    options = dict(dtype=dtype, reshape=reshape, seed=seed)

    train = tf_mnist.DataSet(train_images, train_labels, **options)
    validation = tf_mnist.DataSet(validation_images, validation_labels,
                                  **options)
    test = tf_mnist.DataSet(test_images, test_labels, **options)

    return base.Datasets(train=train, validation=validation, test=test)
Example #19
def writeMNIST(sc, input_images, input_labels, output, format, num_partitions):
  """Writes MNIST image/label vectors into parallelized files on HDFS"""
  # load MNIST gzip into memory
  with open(input_images, 'rb') as f:
    images = numpy.array(mnist.extract_images(f))

  with open(input_labels, 'rb') as f:
    if format == "csv2":
      labels = numpy.array(mnist.extract_labels(f, one_hot=False))
    else:
      labels = numpy.array(mnist.extract_labels(f, one_hot=True))

  shape = images.shape
  print("images.shape: {0}".format(shape))          # 60000 x 28 x 28
  print("labels.shape: {0}".format(labels.shape))   # 60000 x 10

  # create RDDs of vectors
  imageRDD = sc.parallelize(images.reshape(shape[0], shape[1] * shape[2]), num_partitions)
  labelRDD = sc.parallelize(labels, num_partitions)

  output_images = output + "/images"
  output_labels = output + "/labels"

  # save RDDs as specific format
  if format == "pickle":
    imageRDD.saveAsPickleFile(output_images)
    labelRDD.saveAsPickleFile(output_labels)
  elif format == "csv":
    imageRDD.map(toCSV).saveAsTextFile(output_images)
    labelRDD.map(toCSV).saveAsTextFile(output_labels)
  elif format == "csv2":
    imageRDD.map(toCSV).zip(labelRDD).map(lambda x: str(x[1]) + "|" + x[0]).saveAsTextFile(output)
  else: # format == "tfr":
    tfRDD = imageRDD.zip(labelRDD).map(lambda x: (bytearray(toTFExample(x[0], x[1])), None))
    # requires: --jars tensorflow-hadoop-1.0-SNAPSHOT.jar
    tfRDD.saveAsNewAPIHadoopFile(output, "org.tensorflow.hadoop.io.TFRecordFileOutputFormat",
                                keyClass="org.apache.hadoop.io.BytesWritable",
                                valueClass="org.apache.hadoop.io.NullWritable")
Example #20
def load_mnist_data():
    path = '/content/gdrive/My Drive/542HW4/'
    with open((path + "train-images-idx3-ubyte.gz"), "rb") as f:
        train_allimages = extract_images(f)

    with open((path + "train-labels-idx1-ubyte.gz"), "rb") as f:
        train_alllabels = extract_labels(f)
    valid_set_size = 10000
    split = len(train_allimages) - valid_set_size
    valid_images = train_allimages[split:]
    valid_labels = train_alllabels[split:]
    train_images = train_allimages[:split]
    train_labels = train_alllabels[:split]

    with open((path + "t10k-images-idx3-ubyte.gz"), "rb") as f:
        test_images = extract_images(f)

    with open((path + "t10k-labels-idx1-ubyte.gz"), "rb") as f:
        test_labels = extract_labels(f)

    return ((train_images, train_labels), (valid_images, valid_labels),
            (test_images, test_labels))
Example #21
def load_mnist_data(train_dir, one_hot=True):
  """Returns all 'train' data --- images and labels."""
  TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
  TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
  SOURCE_URL = mnist.SOURCE_URL

  local_file = base.maybe_download(TRAIN_IMAGES, train_dir,
                                   SOURCE_URL + TRAIN_IMAGES)
  with open(local_file, 'rb') as f:
    train_images = mnist.extract_images(f)

  local_file = base.maybe_download(TRAIN_LABELS, train_dir,
                                   SOURCE_URL + TRAIN_LABELS)
  with open(local_file, 'rb') as f:
    train_labels = mnist.extract_labels(f, one_hot=one_hot)

  return train_images, train_labels
Example #22
    def load_gz(self):
        result = []
        data = {
            'train':
            ['train-images-idx3-ubyte.gz', 'train-labels-idx1-ubyte.gz'],
            'test': ['t10k-images-idx3-ubyte.gz', 't10k-labels-idx1-ubyte.gz']
        }
        image_path = os.path.join(self.data_dir, data[self.__mode][0])
        label_path = os.path.join(self.data_dir, data[self.__mode][1])

        with open(image_path, 'rb') as f:
            result.append(extract_images(f))

        with open(label_path, 'rb') as f:
            result.append(extract_labels(f, self.one_hot))

        return self.sc.parallelize(list(zip(*result)))  # list() so Spark can size the collection
Example #23
    def _get_data(self):
        from tensorflow.contrib.learn.python.learn.datasets.base \
          import maybe_download
        from tensorflow.contrib.learn.python.learn.datasets.mnist \
          import extract_images, extract_labels

        if self.is_train:
            IMAGES = 'train-images-idx3-ubyte.gz'
            LABELS = 'train-labels-idx1-ubyte.gz'
        else:
            print('using test dataset..')
            IMAGES = 't10k-images-idx3-ubyte.gz'
            LABELS = 't10k-labels-idx1-ubyte.gz'
        SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'

        local_file = maybe_download(IMAGES, self.path, SOURCE_URL + IMAGES)
        with open(local_file, 'rb') as f:
            images = extract_images(f)
        local_file = maybe_download(LABELS, self.path, SOURCE_URL + LABELS)
        with open(local_file, 'rb') as f:
            labels = extract_labels(f, one_hot=False)

        values, counts = np.unique(labels, return_counts=True)
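
        # bucket the images by digit: data[d] will hold every image whose
        # label is d, preallocated from the per-class counts above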

        data = []
        for i in range(10):
            label = values[i]
            count = counts[i]
            arr = np.empty([count, 1, 28, 28], dtype=np.float32)
            data.append(arr)

        l_iter = [0] * 10
        for i in range(labels.shape[0]):
            label = labels[i]
            data[label][l_iter[label]] = np.reshape(images[i],
                                                    [1, 28, 28]) / 255.
            l_iter[label] += 1

        self.data = data
        self.l_iter = l_iter

        return data
Example #24
def generate_mnist_jpg(subdatadir, source_image_path, source_label_path):
    create_folder(subdatadir)
    local_file = base.maybe_download(source_image_path, train_dir,
                                     SOURCE_URL + source_image_path)
    with open(local_file, 'rb') as f:
        images = mnist.extract_images(f)

    local_file = base.maybe_download(source_label_path, train_dir,
                                     SOURCE_URL + source_label_path)
    with open(local_file, 'rb') as f:
        labels = mnist.extract_labels(f, one_hot=False)

    for img in range(labels.size):
        subdirpath = subdatadir + str(labels[img])
        create_folder(subdirpath)

        filepath = subdirpath + '/' + str(labels[img]) + '_' + str(img) + '.jpg'
        im = Image.fromarray(images[img, :, :, 0])
        im.save(filepath)
Example #25
    def _get_data(self):
        from tensorflow.examples.tutorials.mnist import input_data
        from tensorflow.contrib.learn.python.learn.datasets.base \
          import maybe_download
        from tensorflow.contrib.learn.python.learn.datasets.mnist \
          import extract_images, extract_labels

        if self.is_train:
            IMAGES = 'train-images-idx3-ubyte.gz'
            LABELS = 'train-labels-idx1-ubyte.gz'
        else:
            IMAGES = 't10k-images-idx3-ubyte.gz'
            LABELS = 't10k-labels-idx1-ubyte.gz'
        SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'

        # local_file = maybe_download(IMAGES, self.path, SOURCE_URL)
        with open("MNIST_data/" + IMAGES, 'rb') as f:
            images = extract_images(f)
        # local_file = maybe_download(LABELS, self.path, SOURCE_URL)
        with open("MNIST_data/" + LABELS, 'rb') as f:
            labels = extract_labels(f, one_hot=False)

        values, counts = np.unique(labels, return_counts=True)

        data = []
        for i in range(10):
            label = values[i]
            count = counts[i]
            arr = np.empty([count, 28, 28, 1], dtype=np.float32)
            data.append(arr)

        l_iter = [0] * 10
        for i in range(labels.shape[0]):
            label = labels[i]
            data[label][l_iter[label]] = images[i] / 255.
            l_iter[label] += 1

        return data
Example #26
"""JYI, 11/13/2018 """
# load data set, data exploration
from tensorflow.contrib.learn.python.learn.datasets.mnist import extract_images, extract_labels
with open('train-images-idx3-ubyte.gz', 'rb') as f:
    train_x = extract_images(f)
with open('train-labels-idx1-ubyte.gz', 'rb') as f:
    train_y = extract_labels(f)
with open('t10k-images-idx3-ubyte.gz', 'rb') as f:
    test_x = extract_images(f)
with open('t10k-labels-idx1-ubyte.gz', 'rb') as f:
    test_y = extract_labels(f)

import matplotlib.pyplot as plt
fig1 = plt.figure(1, figsize=(9, 6))
plt.imshow(train_x[0].reshape((28, 28)))
fig1.suptitle('Training data sample', fontsize=10)
fig2 = plt.figure(2, figsize=(9, 6))
plt.imshow(test_x[0].reshape((28, 28)))
fig2.suptitle('Testing data sample', fontsize=10)
plt.show()

print('train_y[0]:{}'.format(train_y[0]))  # 5
print('train_x.shape:{}'.format(train_x.shape))  # (60000, 28, 28, 1)
print('train_y.shape:{}'.format(train_y.shape))  # (60000,)
print('test_x.shape:{}'.format(test_x.shape))  # (10000, 28, 28, 1)
print('test_y.shape:{}'.format(test_y.shape))  # (10000,)

# data set pre-processing
import numpy as np
num_class = 10
num_feature = 784
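
# a minimal sketch (not part of the original snippet) of how these constants
# are typically used: flatten each image into a num_feature-long vector,
# scale to [0, 1], and one-hot encode the labels with np.eye
train_x_flat = train_x.reshape(-1, num_feature).astype(np.float32) / 255.0
test_x_flat = test_x.reshape(-1, num_feature).astype(np.float32) / 255.0
train_y_onehot = np.eye(num_class)[train_y]
test_y_onehot = np.eye(num_class)[test_y]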
Example #27
            print("Completed processing 20000 images")
        if i == 30000:
            print("Completed processing 30000 images")
        if i == 40000:
            print("Completed processing 40000 images")
        if i == 50000:
            print("Completed processing 50000 images")
        if i == 59999:
            print("Completed processing 60000 images")
    return w, bias


with open('train-images-idx3-ubyte.gz', 'rb') as f:
    X_train = extract_images(f)
with open('train-labels-idx1-ubyte.gz', 'rb') as f:
    Y_train = extract_labels(f)
with open('t10k-images-idx3-ubyte.gz', 'rb') as f:
    x_test = extract_images(f)
with open('t10k-labels-idx1-ubyte.gz', 'rb') as f:
    y_test = extract_labels(f)

num_pixels = X_train.shape[1] * X_train.shape[2]
X_train = X_train.reshape((X_train.shape[0], num_pixels)).astype('float32')
x_test = x_test.reshape((x_test.shape[0], num_pixels)).astype('float32')
Y_train_new = convert(Y_train)
y_test_new = convert(y_test)

# normalize inputs from 0-255 to 0-1
X_train = X_train / 255
x_test = x_test / 255
Example #28
@author: ram
"""

import numpy as np
""" Reading MNIST data """
'''
==============================================================================================================
'''

from tensorflow.contrib.learn.python.learn.datasets.mnist import extract_images, extract_labels

with open('train-images-idx3-ubyte.gz', 'rb') as f:
    train_images = extract_images(f)

with open('train-labels-idx1-ubyte.gz', 'rb') as f:
    train_labels = extract_labels(f)

with open('t10k-images-idx3-ubyte.gz', 'rb') as f:
    test_images = extract_images(f)

with open('t10k-labels-idx1-ubyte.gz', 'rb') as f:
    test_labels = extract_labels(f)
'''
==============================================================================================================
'''

#printing shapes of all train and test data

print("train_images shape = ", train_images.shape)
print("train_label shape = ", train_labels.shape)
print("test_images shape = ", test_images.shape)
Example #29
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   shuffle=False,
                   validation_percentage=0.1):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    class DataSets(object):
        pass

    data_sets = DataSets()
    if fake_data:

        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot)

        data_sets.train = fake()
        data_sets.validation = fake()
        data_sets.test = fake()
        return data_sets

    SOURCE_URL = 'http://yann.lecun.com/exdb/mnist/'
    WORK_DIRECTORY = 'data'

    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
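    # rank 0 downloads and extracts each array, then broadcasts it; all other
    # ranks receive the shape first, allocate a buffer, and fill it via Bcast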
    if 0 == rank:
        local_file = maybe_download(TRAIN_IMAGES)
        with open(local_file, 'rb') as f:
            train_images = extract_images(f)
        if shuffle:
            # shuffle the data
            perm = np.arange(train_images.shape[0])
            np.random.shuffle(perm)
            train_images = train_images[perm]
        # bcast the data
        shape = train_images.shape
        shape = comm.bcast(shape, root=0)
        comm.Bcast(train_images, root=0)

        local_file = maybe_download(TRAIN_LABELS)
        with open(local_file, 'rb') as f:
            train_labels = extract_labels(f, one_hot=one_hot)
        if shuffle:
            # shuffle the data, using same indices as images above
            train_labels = train_labels[perm]
        # bcast the data
        shape = train_labels.shape
        shape = comm.bcast(shape, root=0)
        comm.Bcast(train_labels, root=0)

        local_file = maybe_download(TEST_IMAGES)
        with open(local_file, 'rb') as f:
            test_images = extract_images(f)
        shape = test_images.shape
        shape = comm.bcast(shape, root=0)
        comm.Bcast(test_images, root=0)

        local_file = maybe_download(TEST_LABELS)
        with open(local_file, 'rb') as f:
            test_labels = extract_labels(f, one_hot=one_hot)
        shape = test_labels.shape
        shape = comm.bcast(shape, root=0)
        comm.Bcast(test_labels, root=0)
    else:
        shape = None
        shape = comm.bcast(shape, root=0)
        train_images = np.ndarray(shape=shape, dtype=np.uint8)
        comm.Bcast(train_images, root=0)

        shape = None
        shape = comm.bcast(shape, root=0)
        train_labels = np.ndarray(shape=shape)
        comm.Bcast(train_labels, root=0)

        shape = None
        shape = comm.bcast(shape, root=0)
        test_images = np.ndarray(shape=shape, dtype=np.uint8)
        comm.Bcast(test_images, root=0)

        shape = None
        shape = comm.bcast(shape, root=0)
        test_labels = np.ndarray(shape=shape)
        comm.Bcast(test_labels, root=0)

    VALIDATION_SIZE = int(train_images.shape[0] * validation_percentage)
    total = train_images.shape[0] - VALIDATION_SIZE
    count = total // size
    remain = total % size
    if 0 == rank:
        print "total images", total
        print "image subset (%d,%d)=%d" % (total, size, count)
        print "image subset remainder", remain

    start = rank * count
    stop = rank * count + count
    if rank < remain:
        start += rank
        stop += rank + 1
    else:
        start += remain
        stop += remain

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]
    train_images = train_images[start:stop]
    train_labels = train_labels[start:stop]
    data_sets.train = DataSet(train_images, train_labels)
    data_sets.validation = DataSet(validation_images, validation_labels)
    data_sets.test = DataSet(test_images, test_labels)
    if 0 == rank:
        print("Rank Start Stop NumExamples")
        sys.stdout.flush()
    for i in range(size):
        if rank == i:
            print(i, start, stop, data_sets.train.num_examples)
            sys.stdout.flush()
        comm.Barrier()
    return data_sets
Example #30
tf.set_random_seed(777)  # for reproducibility

from tensorflow.contrib.learn.python.learn.datasets.mnist import extract_images, extract_labels

_TEST_DATA_FILENAME = r'D:\DataSet\EMNIST_MNISTFORMAT\gzip\emnist-balanced-test-images-idx3-ubyte.gz'
_TEST_LABELS_FILENAME = r'D:\DataSet\EMNIST_MNISTFORMAT\gzip\emnist-balanced-test-labels-idx1-ubyte.gz'
_TRAIN_DATA_FILENAME = r'D:\DataSet\EMNIST_MNISTFORMAT\gzip\emnist-balanced-train-images-idx3-ubyte.gz'
_TRAIN_LABELS_FILENAME = r'D:\DataSet\EMNIST_MNISTFORMAT\gzip\emnist-balanced-train-labels-idx1-ubyte.gz'
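
# NOTE: the EMNIST filename constants above are defined but never used below;
# the 'my/directory/...' paths are placeholders kept from the original snippet.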

with open('my/directory/train-images-idx3-ubyte.gz', 'rb') as f:
    train_images = extract_images(f)
with open('my/directory/train-labels-idx1-ubyte.gz', 'rb') as f:
    train_labels = extract_labels(f)

with open('my/directory/t10k-images-idx3-ubyte.gz', 'rb') as f:
    test_images = extract_images(f)
with open('my/directory/t10k-labels-idx1-ubyte.gz', 'rb') as f:
    test_labels = extract_labels(f)


mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)

nb_classes = 10

# MNIST data image of shape 28 * 28 = 784
X = tf.placeholder(tf.float32, [None, 784])
# 0 - 9 digits recognition = 10 classes
Y = tf.placeholder(tf.float32, [None, nb_classes])

W = tf.Variable(tf.random_normal([784, nb_classes]))
b = tf.Variable(tf.random_normal([nb_classes]))
Example #31
def load_data():
    with open('data/emnist-bymerge-train-images-idx3-ubyte.gz', 'rb') as f:
        train_images = extract_images(f)
    with open('data/emnist-bymerge-train-labels-idx1-ubyte.gz', 'rb') as f:
        train_labels = extract_labels(f)
    with open('data/emnist-bymerge-test-images-idx3-ubyte.gz', 'rb') as f:
        test_images = extract_images(f)
    with open('data/emnist-bymerge-test-labels-idx1-ubyte.gz', 'rb') as f:
        test_labels = extract_labels(f)
    return train_images, train_labels, test_images, test_labels
Example #32
def read_data_sets(train_dir, fake_data=False, one_hot=False,
        shuffle=False, validation_percentage=0.1):
    comm = MPI.COMM_WORLD
    rank = comm.Get_rank()
    size = comm.Get_size()

    class DataSets(object):
        pass
    data_sets = DataSets()
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot)
        data_sets.train = fake()
        data_sets.validation = fake()
        data_sets.test = fake()
        return data_sets
    TRAIN_IMAGES = 'train-images-idx3-ubyte.gz'
    TRAIN_LABELS = 'train-labels-idx1-ubyte.gz'
    TEST_IMAGES = 't10k-images-idx3-ubyte.gz'
    TEST_LABELS = 't10k-labels-idx1-ubyte.gz'
    if 0 == rank:
        local_file = maybe_download(TRAIN_IMAGES, train_dir, "http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz")
        with open(local_file, 'rb') as f:
            train_images = extract_images(f)
        if shuffle:
            # shuffle the data
            perm = np.arange(train_images.shape[0])
            np.random.shuffle(perm)
            train_images = train_images[perm]
        # bcast the data
        shape = train_images.shape
        shape = comm.bcast(shape, root=0)
        comm.Bcast(train_images, root=0)

        local_file = maybe_download(TRAIN_LABELS, train_dir, "http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz")
        with open(local_file, 'rb') as f:
            train_labels = extract_labels(f, one_hot=one_hot)
        if shuffle:
            # shuffle the data, using same indices as images above
            train_labels = train_labels[perm]
        # bcast the data
        shape = train_labels.shape
        shape = comm.bcast(shape, root=0)
        comm.Bcast(train_labels, root=0)

        local_file = maybe_download(TEST_IMAGES, train_dir, "http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz")
        with open(local_file, 'rb') as f:
            test_images = extract_images(f)
        shape = test_images.shape
        shape = comm.bcast(shape, root=0)
        comm.Bcast(test_images, root=0)

        local_file = maybe_download(TEST_LABELS, train_dir, "http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz")
        with open(local_file, 'rb') as f:
            test_labels = extract_labels(f, one_hot=one_hot)
        shape = test_labels.shape
        shape = comm.bcast(shape, root=0)
        comm.Bcast(test_labels, root=0)
    else:
        shape = None
        shape = comm.bcast(shape, root=0)
        train_images = np.ndarray(shape=shape, dtype=np.uint8)
        comm.Bcast(train_images, root=0)

        shape = None
        shape = comm.bcast(shape, root=0)
        train_labels = np.ndarray(shape=shape)
        comm.Bcast(train_labels, root=0)

        shape = None
        shape = comm.bcast(shape, root=0)
        test_images = np.ndarray(shape=shape, dtype=np.uint8)
        comm.Bcast(test_images, root=0)

        shape = None
        shape = comm.bcast(shape, root=0)
        test_labels = np.ndarray(shape=shape)
        comm.Bcast(test_labels, root=0)

    VALIDATION_SIZE = int(0)
    total = train_images.shape[0] - VALIDATION_SIZE
    count = total // size
    remain = total % size
    if 0 == rank:
        print("total images", total)
        print("image subset (%d,%d)=%d" % (total, size, count))
        print("image subset remainder", remain)

    start = rank * count
    stop = rank * count + count
    if rank < remain:
        start += rank
        stop += rank + 1
    else:
        start += remain
        stop += remain

    validation_images = train_images[:VALIDATION_SIZE]
    validation_labels = train_labels[:VALIDATION_SIZE]
    train_images = train_images[VALIDATION_SIZE:]
    train_labels = train_labels[VALIDATION_SIZE:]
    train_images = train_images[start:stop]
    train_labels = train_labels[start:stop]
    data_sets.train = DataSet(train_images, train_labels)
    data_sets.validation = DataSet(validation_images, validation_labels)
    data_sets.test = DataSet(test_images, test_labels)
    if 0 == rank:
        print("Rank Start Stop NumExamples")
        sys.stdout.flush()
    for i in range(size):
        if rank == i:
            print(i, start, stop, data_sets.train.num_examples)
            sys.stdout.flush()
        comm.Barrier()
    return data_sets