import os

import numpy as np

# Imports as in TensorFlow's MNIST input_data module (assumed context);
# DataSet, generate_index and load_images are helpers defined elsewhere
# in this module.
from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.python.framework import dtypes


def read_data_sets(train_labels_csv, test_labels_csv, fake_data=False,
                   one_hot=False, dtype=dtypes.float32, reshape=True,
                   validation_size=5000, dataset_path='../'):
    """Read HASY data."""
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot,
                           dtype=dtype)
        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    symbol_id2index = generate_index(os.path.join(dataset_path,
                                                  'symbols.csv'))
    test_images, test_labels, _ = load_images(test_labels_csv,
                                              symbol_id2index)
    train_images, train_labels, _ = load_images(train_labels_csv,
                                                symbol_id2index)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    # Shuffle data
    perm = np.arange(len(train_labels))
    np.random.shuffle(perm)
    train_images = train_images[perm]
    train_labels = train_labels[perm]

    # Split the training set into a training and a validation set
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels,
                         dtype=dtype, reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
    return base.Datasets(train=train, validation=validation, test=test)
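# --- Usage sketch (not part of the original module) ----------------------
# Assumes the local DataSet mirrors TensorFlow's MNIST DataSet API
# (images, labels, next_batch) and that the fold CSVs follow the
# classification-task layout used further below; the paths are hypothetical.
datasets = read_data_sets(
    './HASYv2/classification-task/fold-1/train.csv',
    './HASYv2/classification-task/fold-1/test.csv',
    one_hot=True,
    dataset_path='./HASYv2')
batch_xs, batch_ys = datasets.train.next_batch(50)  # assumed DataSet API
print(batch_xs.shape, batch_ys.shape)
print('validation examples: %i' % len(datasets.validation.images))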
import numpy

# TensorFlow imports assumed as above; DataSet, generate_index and
# load_images are helpers defined elsewhere in this module.
from tensorflow.contrib.learn.python.learn.datasets import base
from tensorflow.python.framework import dtypes


def read_data_sets(train_dir, fake_data=False, one_hot=False,
                   dtype=dtypes.float32, reshape=True,
                   validation_size=5000):
    """Read MASYM data."""
    if fake_data:
        def fake():
            return DataSet([], [], fake_data=True, one_hot=one_hot,
                           dtype=dtype)
        train = fake()
        validation = fake()
        test = fake()
        return base.Datasets(train=train, validation=validation, test=test)

    symbol_id2index = generate_index('HASYv2/')
    test_images, test_labels = load_images(train_dir,
                                           'hasy-test-labels.csv',
                                           symbol_id2index)
    train_images, train_labels = load_images(train_dir,
                                             'hasy-train-labels.csv',
                                             symbol_id2index)

    if not 0 <= validation_size <= len(train_images):
        raise ValueError(
            'Validation size should be between 0 and {}. Received: {}.'
            .format(len(train_images), validation_size))

    # Shuffle data
    perm = numpy.arange(len(train_labels))
    numpy.random.shuffle(perm)
    train_images = train_images[perm]
    train_labels = train_labels[perm]

    # Split the training set into a training and a validation set
    validation_images = train_images[:validation_size]
    validation_labels = train_labels[:validation_size]
    train_images = train_images[validation_size:]
    train_labels = train_labels[validation_size:]

    train = DataSet(train_images, train_labels, dtype=dtype, reshape=reshape)
    validation = DataSet(validation_images, validation_labels,
                         dtype=dtype, reshape=reshape)
    test = DataSet(test_images, test_labels, dtype=dtype, reshape=reshape)
    return base.Datasets(train=train, validation=validation, test=test)
#!/usr/bin/env python

"""Visualize with t-SNE as in https://indico.io/blog/visualizing-with-t-sne/."""

import numpy as np
from matplotlib import pyplot as plt
from tsne import bh_sne

import hasy_tools as ht

# Load up data
dataset_path = './HASYv2'
ht._get_data(dataset_path)
symbol_id2index = ht.generate_index("%s/symbols.csv" % dataset_path)
x_data, y_data, _ = ht.load_images('HASYv2/hasy-data-labels.csv',
                                   symbol_id2index,
                                   one_hot=False)

# Convert the image data to a float64 matrix; float64 is needed for bh_sne
x_data = np.asarray(x_data).astype('float64')
x_data = x_data.reshape((x_data.shape[0], -1))

# For speed of computation, only run on a subset
n = 20000
x_data = x_data[:n]
y_data = y_data[:n]

# Perform the t-SNE embedding
vis_data = bh_sne(x_data)
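# --- Plotting sketch (the original script stops before this step) --------
# Follows the linked blog post; colormap and marker size are arbitrary
# choices, vis_data and y_data come from the script above.
vis_x = vis_data[:, 0]
vis_y = vis_data[:, 1]
plt.scatter(vis_x, vis_y, c=y_data, cmap='jet', s=2)
plt.colorbar()
plt.title('t-SNE embedding of HASYv2 (first %i samples)' % n)
plt.show()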
import numpy as np


def get_data(dataset='iris'):
    """
    Get data ready to learn with.

    Parameters
    ----------
    dataset : str
        One of 'iris', 'mnist_simple', 'mnist', 'hasy'

    Returns
    -------
    dict
        For 'hasy', a list of such dicts is returned, one per fold.
    """
    if dataset == 'iris':
        import sklearn.preprocessing
        from sklearn.cross_validation import train_test_split
        from sklearn.datasets import fetch_mldata
        from sklearn.utils import shuffle
        iris = fetch_mldata('iris')
        x = iris.data
        y = iris.target
        le = sklearn.preprocessing.LabelEncoder()
        le.fit(y)
        y = le.transform(y)
        x, y = shuffle(x, y, random_state=0)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.33, random_state=42)
        data = {'train': {'X': x_train, 'y': y_train},
                'test': {'X': x_test, 'y': y_test},
                'n_classes': len(np.unique(y_train))}
        scaler = sklearn.preprocessing.StandardScaler().fit(data['train']['X'])
        data['train']['X'] = scaler.transform(data['train']['X'])
        data['test']['X'] = scaler.transform(data['test']['X'])
    elif dataset == 'mnist_simple':
        # Load the simple, but similar digits dataset
        from sklearn.cross_validation import train_test_split
        from sklearn.datasets import load_digits
        from sklearn.utils import shuffle
        digits = load_digits()
        x = np.array([el.flatten() for el in digits.images])
        y = digits.target
        # Scale data to [-1, 1] - this is of major importance!
        # (load_digits pixels are in [0, 16], not [0, 255])
        x = x / 16.0 * 2 - 1
        x, y = shuffle(x, y, random_state=0)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.33, random_state=42)
        data = {'train': {'X': x_train, 'y': y_train},
                'test': {'X': x_test, 'y': y_test}}
    elif dataset == 'mnist':
        # Load the original dataset
        from sklearn.cross_validation import train_test_split
        from sklearn.datasets import fetch_mldata
        from sklearn.utils import shuffle
        mnist = fetch_mldata('MNIST original')
        x = mnist.data
        y = mnist.target
        # Scale data to [-1, 1] - this is of major importance!
        x = x / 255.0 * 2 - 1
        x, y = shuffle(x, y, random_state=0)
        x_train, x_test, y_train, y_test = train_test_split(
            x, y, test_size=0.33, random_state=42)
        data = {'train': {'X': x_train, 'y': y_train},
                'test': {'X': x_test, 'y': y_test}}
    elif dataset == 'hasy':
        import hasy_tools as ht
        dataset_path = './HASYv2'
        data_complete = []
        symbol_id2index = ht.generate_index("%s/symbols.csv" % dataset_path)
        base_ = "%s/10-fold-cross-validation/fold" % dataset_path
        for fold in range(1, 11):
            x_train, y_train = ht.load_images('%s-%i/train.csv' %
                                              (base_, fold),
                                              symbol_id2index,
                                              one_hot=False)
            x_test, y_test = ht.load_images('%s-%i/test.csv' % (base_, fold),
                                            symbol_id2index,
                                            one_hot=False)
            data = {'train': {'X': x_train.reshape(x_train.shape[0], -1),
                              'y': y_train},
                    'test': {'X': x_test.reshape(x_test.shape[0], -1),
                             'y': y_test},
                    'n_classes': 369}
            data_complete.append(data)
        data = data_complete
    else:
        raise NotImplementedError()
    return data
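# --- Usage sketch (not from the original) --------------------------------
# Fit a simple baseline on the first HASY fold returned by get_data;
# the classifier choice is arbitrary.
from sklearn.svm import LinearSVC

folds = get_data('hasy')  # list with one dict per fold
fold = folds[0]
clf = LinearSVC()
clf.fit(fold['train']['X'], fold['train']['y'])
print("fold 1 test accuracy: %0.4f"
      % clf.score(fold['test']['X'], fold['test']['y']))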
# (tail of the get_nonexisting_path helper; this snippet starts mid-function)
        i += 1
        gen_filename = os.path.join(folder, "%s-%i%s" % (filename, i, ext))
    return gen_filename


classifier_data = {}
classifier_data[MODEL_NAME] = []

dataset_path = os.path.join(os.path.expanduser("~"), 'hasy')
for fold in range(1, 11):
    print("#" * 80)
    print("Fold %i" % fold)
    directory = os.path.join(dataset_path,
                             'classification-task/fold-%i/' % fold)
    train_labels_csv = os.path.join(directory, 'train.csv')
    test_labels_csv = os.path.join(directory, 'test.csv')
    symbol_id2index = generate_index(os.path.join(dataset_path,
                                                  'symbols.csv'))
    print("\tLoad images ....")
    test_images, test_labels, _ = load_images(test_labels_csv,
                                              symbol_id2index)
    train_images, train_labels, _ = load_images(train_labels_csv,
                                                symbol_id2index)
    print("\t... done loading images")

    results = {}
    model_checkpoint_path = get_nonexisting_path(model_checkpoint_path)
    validation_curve_path = get_nonexisting_path('validation'
                                                 '-curve-accuracy-%s.csv'
                                                 % MODEL_NAME)
    print("model_checkpoint_path: %s" % model_checkpoint_path)
    print("validation_curve_path: %s" % validation_curve_path)
    t0 = time.time()
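# --- Reconstruction (assumption) ------------------------------------------
# The snippet above begins mid-function. One plausible version of the
# complete get_nonexisting_path helper, inferred from its visible tail and
# the calls above; the exact path-splitting logic is a guess.
import os.path


def get_nonexisting_path(model_checkpoint_path):
    """Return a path that does not exist yet (reconstructed sketch)."""
    if not os.path.isfile(model_checkpoint_path):
        return model_checkpoint_path
    folder = os.path.dirname(model_checkpoint_path)
    filename, ext = os.path.splitext(os.path.basename(model_checkpoint_path))
    i = 1
    gen_filename = os.path.join(folder, "%s-%i%s" % (filename, i, ext))
    while os.path.isfile(gen_filename):
        i += 1
        gen_filename = os.path.join(folder, "%s-%i%s" % (filename, i, ext))
    return gen_filename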