Example #1
def read_data_sets(train_labels_csv,
                   test_labels_csv,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000,
                   dataset_path='../'):
    """Read HASY data."""
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True,
                           one_hot=one_hot,
                           dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    symbol_id2index = generate_index(os.path.join(dataset_path, 'symbols.csv'))
    test_images, test_labels, _ = load_images(test_labels_csv, symbol_id2index)
    train_images, train_labels, _ = load_images(train_labels_csv,
                                                symbol_id2index)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'.format(
                len(train_images), validation_size))
    # Shuffle data
    perm = np.arange(len(train_labels))
    np.random.shuffle(perm)
    train_images = train_images[perm]
    train_labels = train_labels[perm]
    # Split the training set into training and validation sets
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
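
A minimal usage sketch for the loader above, assuming the DataSet class mirrors the TensorFlow MNIST DataSet (so it exposes next_batch); the CSV file names are illustrative:

# Hypothetical call: the file names and next_batch() are assumptions
# based on the TensorFlow MNIST DataSet interface this loader mirrors.
datasets = read_data_sets('hasy-train-labels.csv',
                          'hasy-test-labels.csv',
                          one_hot=True,
                          validation_size=5000,
                          dataset_path='./HASYv2')
batch_images, batch_labels = datasets.train.next_batch(64)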
Example #2
def read_data_sets(train_dir,
                   fake_data=False,
                   one_hot=False,
                   dtype=dtypes.float32,
                   reshape=True,
                   validation_size=5000):
    """Read MASYM data."""
    if fake_data:

        def fake():
            return DataSet([], [],
                           fake_data=True, one_hot=one_hot, dtype=dtype)

        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    symbol_id2index = generate_index('HASYv2/')
    test_images, test_labels = load_images(train_dir,
                                           'hasy-test-labels.csv',
                                           symbol_id2index)
    train_images, train_labels = load_images(train_dir,
                                             'hasy-train-labels.csv',
                                             symbol_id2index)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))
    # Shuffle data
    perm = numpy.arange(len(train_labels))
    numpy.random.shuffle(perm)
    train_images = train_images[perm]
    train_labels = train_labels[perm]
    # Split the training set into training and validation sets
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images,
                         validation_labels,
                         dtype=dtype,
                         reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)

    return base.Datasets(train=train, validation=validation, test=test)
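
Because of the fake_data branch, this loader can be exercised without the dataset on disk, e.g. as a smoke test; a minimal sketch:

# Smoke test: fake_data=True returns empty DataSet objects, so the
# Datasets wiring can be checked without downloading HASY.
datasets = read_data_sets('unused-dir', fake_data=True, one_hot=True)
for split in (datasets.train, datasets.validation, datasets.test):
    assert split is not None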
Example #3
#!/usr/bin/env python

"""Visualize with tsne as in https://indico.io/blog/visualizing-with-t-sne/."""

import numpy as np
import hasy_tools as ht
from matplotlib import pyplot as plt
from tsne import bh_sne

# load up data
dataset_path = './HASYv2'
ht._get_data(dataset_path)
data_complete = []

symbol_id2index = ht.generate_index("%s/symbols.csv" % dataset_path)
x_data, y_data, _ = ht.load_images('%s/hasy-data-labels.csv' % dataset_path,
                                   symbol_id2index,
                                   one_hot=False)

# Convert the image data to a float64 matrix; bh_sne requires float64.
x_data = np.asarray(x_data).astype('float64')
x_data = x_data.reshape((x_data.shape[0], -1))

# For speed of computation, only run on a subset
n = 20000
x_data = x_data[:n]
y_data = y_data[:n]

# perform t-SNE embedding
vis_data = bh_sne(x_data)
Example #4
def get_data(dataset='iris'):
    """
    Get data ready to learn with.

    Parameters
    ----------
    dataset : str
        One of 'iris', 'mnist_simple', 'mnist', 'hasy'.

    Returns
    -------
    dict
    """
    if dataset == 'iris':
        import sklearn.preprocessing
        from sklearn.datasets import load_iris
        from sklearn.utils import shuffle
        # fetch_mldata('iris') relied on mldata.org, which is offline;
        # load_iris ships with scikit-learn.
        iris = load_iris()

        x = iris.data
        y = iris.target

        le = sklearn.preprocessing.LabelEncoder()
        le.fit(y)
        y = le.transform(y)

        x, y = shuffle(x, y, random_state=0)

        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=42)
        data = {
            'train': {
                'X': x_train,
                'y': y_train
            },
            'test': {
                'X': x_test,
                'y': y_test
            },
            'n_classes': len(np.unique(y_train))
        }
        scaler = sklearn.preprocessing.StandardScaler().fit(data['train']['X'])
        data['train']['X'] = scaler.transform(data['train']['X'])
        data['test']['X'] = scaler.transform(data['test']['X'])
    elif dataset == 'mnist_simple':
        # Load the simple, but similar digits dataset
        from sklearn.datasets import load_digits
        from sklearn.utils import shuffle
        digits = load_digits()
        x = np.array([el.flatten() for el in digits.images])
        y = digits.target

        # Scale data to [-1, 1] - this matters a lot for many classifiers.
        # Note: load_digits pixel values range from 0 to 16, not 0 to 255.
        x = x / 16.0 * 2 - 1

        x, y = shuffle(x, y, random_state=0)

        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=42)
        data = {
            'train': {
                'X': x_train,
                'y': y_train
            },
            'test': {
                'X': x_test,
                'y': y_test
            }
        }
    elif dataset == 'mnist':  # Load the original dataset
        from sklearn.datasets import fetch_openml
        from sklearn.utils import shuffle
        # fetch_mldata('MNIST original') no longer works; fetch_openml
        # is the scikit-learn replacement.
        mnist = fetch_openml('mnist_784', version=1, as_frame=False)

        x = mnist.data
        y = mnist.target.astype(int)

        # Scale data to [-1, 1] - this matters a lot for many classifiers.
        x = x / 255.0 * 2 - 1

        x, y = shuffle(x, y, random_state=0)

        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=42)
        data = {
            'train': {
                'X': x_train,
                'y': y_train
            },
            'test': {
                'X': x_test,
                'y': y_test
            }
        }
    elif dataset == 'hasy':
        import hasy_tools as ht
        dataset_path = './HASYv2'
        data_complete = []

        symbol_id2index = ht.generate_index("%s/symbols.csv" % dataset_path)
        base_ = "%s/10-fold-cross-validation/fold" % dataset_path
        for fold in range(1, 11):
            x_train, y_train = ht.load_images('%s-%i/train.csv' %
                                              (base_, fold),
                                              symbol_id2index,
                                              one_hot=False)
            x_test, y_test = ht.load_images('%s-%i/test.csv' % (base_, fold),
                                            symbol_id2index,
                                            one_hot=False)
            data = {
                'train': {
                    'X':
                    x_train.reshape(x_train.shape[0],
                                    x_train.shape[1] * x_train.shape[2]),
                    'y':
                    y_train
                },
                'test': {
                    'X':
                    x_test.reshape(x_test.shape[0],
                                   x_test.shape[1] * x_test.shape[2]),
                    'y':
                    y_test
                },
                'n_classes': 369
            }
            data_complete.append(data)
        data = data_complete
    else:
        raise NotImplementedError(dataset)
    return data
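
A hedged usage sketch for get_data: fit a scikit-learn classifier on the returned dictionary (the choice of LogisticRegression is illustrative, not part of the original):

from sklearn.linear_model import LogisticRegression

data = get_data('iris')
clf = LogisticRegression(max_iter=1000)
clf.fit(data['train']['X'], data['train']['y'])
print("Test accuracy: %0.3f"
      % clf.score(data['test']['X'], data['test']['y']))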
Example #5
#!/usr/bin/env python
"""Visualize with tsne as in https://indico.io/blog/visualizing-with-t-sne/."""

import hasy_tools as ht
import numpy as np
from matplotlib import pyplot as plt
from tsne import bh_sne

# load up data
dataset_path = './HASYv2'
ht._get_data(dataset_path)
data_complete = []

symbol_id2index = ht.generate_index("%s/symbols.csv" % dataset_path)
x_data, y_data, _ = ht.load_images('%s/hasy-data-labels.csv' % dataset_path,
                                   symbol_id2index,
                                   one_hot=False)

# Convert the image data to a float64 matrix; bh_sne requires float64.
x_data = np.asarray(x_data).astype('float64')
x_data = x_data.reshape((x_data.shape[0], -1))

# For speed of computation, only run on a subset
n = 20000
x_data = x_data[:n]
y_data = y_data[:n]

# perform t-SNE embedding
vis_data = bh_sne(x_data)

# plot the result
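The snippet stops at the comment above; a plotting step in the spirit of the linked t-SNE post might look like this (marker size and colormap are choices, not part of the original):

# Scatter the 2-D embedding, coloured by class label.
vis_x = vis_data[:, 0]
vis_y = vis_data[:, 1]
plt.scatter(vis_x, vis_y, c=y_data, cmap='jet', s=1)
plt.colorbar()
plt.show()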
Example #6
def get_nonexisting_path(model_checkpoint_path):
    """Return a path that does not exist yet, appending a counter if needed."""
    # Reconstructed head: only the loop body and return survived in the
    # original snippet.
    folder = os.path.dirname(model_checkpoint_path)
    filename, ext = os.path.splitext(os.path.basename(model_checkpoint_path))
    i = 0
    gen_filename = model_checkpoint_path
    while os.path.isfile(gen_filename):
        i += 1
        gen_filename = os.path.join(folder, "%s-%i%s" % (filename, i, ext))
    return gen_filename

# MODEL_NAME and model_checkpoint_path come from earlier in the script.
classifier_data = {}
classifier_data[MODEL_NAME] = []
dataset_path = os.path.join(os.path.expanduser("~"), 'hasy')

for fold in range(1, 11):
    print("#" * 80)
    print("Fold %i" % fold)
    directory = os.path.join(dataset_path,
                             'classification-task/fold-%i/' % fold)
    train_labels_csv = os.path.join(directory, 'train.csv')
    test_labels_csv = os.path.join(directory, 'test.csv')
    symbol_id2index = generate_index(os.path.join(dataset_path, 'symbols.csv'))
    print("\tLoad images ....")
    test_images, test_labels, _ = load_images(test_labels_csv,
                                              symbol_id2index)
    train_images, train_labels, _ = load_images(train_labels_csv,
                                                symbol_id2index)
    print("\t... done loading images")
    results = {}

    model_checkpoint_path = get_nonexisting_path(model_checkpoint_path)
    validation_curve_path = get_nonexisting_path('validation'
                                                 '-curve-accuracy-%s.csv' %
                                                 MODEL_NAME)
    print("model_checkpoint_path: %s" % model_checkpoint_path)
    print("validation_curve_path: %s" % validation_curve_path)
    t0 = time.time()
Example #7
def get_data(dataset='iris'):
    """
    Get data ready to learn with.

    Parameters
    ----------
    dataset : str
        One of 'iris', 'mnist_simple', 'mnist', 'hasy'.

    Returns
    -------
    dict
    """
    if dataset == 'iris':
        import sklearn.preprocessing
        from sklearn.datasets import load_iris
        from sklearn.utils import shuffle
        # fetch_mldata('iris') relied on mldata.org, which is offline;
        # load_iris ships with scikit-learn.
        iris = load_iris()

        x = iris.data
        y = iris.target

        le = sklearn.preprocessing.LabelEncoder()
        le.fit(y)
        y = le.transform(y)

        x, y = shuffle(x, y, random_state=0)

        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.33,
                                                            random_state=42)
        data = {'train': {'X': x_train,
                          'y': y_train},
                'test': {'X': x_test,
                         'y': y_test},
                'n_classes': len(np.unique(y_train))}
        scaler = sklearn.preprocessing.StandardScaler().fit(data['train']['X'])
        data['train']['X'] = scaler.transform(data['train']['X'])
        data['test']['X'] = scaler.transform(data['test']['X'])
    elif dataset == 'mnist_simple':
        # Load the simple, but similar digits dataset
        from sklearn.datasets import load_digits
        from sklearn.utils import shuffle
        digits = load_digits()
        x = np.array([el.flatten() for el in digits.images])
        y = digits.target

        # Scale data to [-1, 1] - this matters a lot for many classifiers.
        # Note: load_digits pixel values range from 0 to 16, not 0 to 255.
        x = x / 16.0 * 2 - 1

        x, y = shuffle(x, y, random_state=0)

        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.33,
                                                            random_state=42)
        data = {'train': {'X': x_train,
                          'y': y_train},
                'test': {'X': x_test,
                         'y': y_test}}
    elif dataset == 'mnist':  # Load the original dataset
        from sklearn.datasets import fetch_openml
        from sklearn.utils import shuffle
        # fetch_mldata('MNIST original') no longer works; fetch_openml
        # is the scikit-learn replacement.
        mnist = fetch_openml('mnist_784', version=1, as_frame=False)

        x = mnist.data
        y = mnist.target.astype(int)

        # Scale data to [-1, 1] - this matters a lot for many classifiers.
        x = x / 255.0 * 2 - 1

        x, y = shuffle(x, y, random_state=0)

        from sklearn.model_selection import train_test_split
        x_train, x_test, y_train, y_test = train_test_split(x, y,
                                                            test_size=0.33,
                                                            random_state=42)
        data = {'train': {'X': x_train,
                          'y': y_train},
                'test': {'X': x_test,
                         'y': y_test}}
    elif dataset == 'hasy':
        import hasy_tools as ht
        dataset_path = './HASYv2'
        data_complete = []

        symbol_id2index = ht.generate_index("%s/symbols.csv" % dataset_path)
        base_ = "%s/10-fold-cross-validation/fold" % dataset_path
        for fold in range(1, 11):
            x_train, y_train = ht.load_images('%s-%i/train.csv' %
                                              (base_, fold),
                                              symbol_id2index,
                                              one_hot=False)
            x_test, y_test = ht.load_images('%s-%i/test.csv' %
                                            (base_, fold),
                                            symbol_id2index,
                                            one_hot=False)
            data = {'train': {'X': x_train.reshape(x_train.shape[0],
                                                   x_train.shape[1] *
                                                   x_train.shape[2]),
                              'y': y_train},
                    'test': {'X': x_test.reshape(x_test.shape[0],
                                                 x_test.shape[1] *
                                                 x_test.shape[2]),
                             'y': y_test},
                    'n_classes': 369}
            data_complete.append(data)
        data = data_complete
    else:
        raise NotImplementedError(dataset)
    return data