def get_mnist_dataset(n_training_samples=None, n_test_samples=None, flat=False, join_train_and_val=False, binarize=False):
    """
    The MNIST DataSet - the Drosophila of machine learning.

    :param n_training_samples: Cap on the number of training samples
    :param n_test_samples: Cap on the number of test samples
    :param flat: Set to True if we just want flat 784-dimensional input data instead of 28x28 images.
    :param join_train_and_val: If True, merge the validation set into the training set (giving 60000
        training examples and 10000 test examples).  Otherwise you'll have 50000 training, 10000
        validation, and 10000 test examples.
    :param binarize: Binarize inputs by thresholding them at 0.5
    :return: A DataSet object containing the MNIST data
    """
    filename = get_file(
        relative_name='data/mnist.pkl',
        url='http://deeplearning.net/data/mnist/mnist.pkl.gz',
        data_transformation=unzip_gz)
    with open(filename, 'rb') as f:
        # The file is a Python-2 pickle, so decode strings as latin1 when loading under Python 3.
        data = pickle.load(f, encoding='latin1')
    # mnist.pkl stores the (training, validation, test) splits, in that order.
    x_tr, y_tr = data[0] if n_training_samples is None else (data[0][0][:n_training_samples], data[0][1][:n_training_samples])
    x_vd, y_vd = data[1]
    x_ts, y_ts = data[2] if n_test_samples is None else (data[2][0][:n_test_samples], data[2][1][:n_test_samples])
    if not flat:
        x_tr = x_tr.reshape(-1, 28, 28)
        x_ts = x_ts.reshape(-1, 28, 28)
        x_vd = x_vd.reshape(-1, 28, 28)
    if binarize:
        x_tr = x_tr > 0.5
        x_ts = x_ts > 0.5
        x_vd = x_vd > 0.5
    if join_train_and_val:
        return DataSet(
            training_set=DataCollection(np.concatenate([x_tr, x_vd], axis=0), np.concatenate([y_tr, y_vd], axis=0)),
            test_set=DataCollection(x_ts, y_ts))
    else:
        return DataSet(
            training_set=DataCollection(x_tr, y_tr),
            test_set=DataCollection(x_ts, y_ts),
            validation_set=DataCollection(x_vd, y_vd))
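
# Usage sketch (a hypothetical smoke test, not part of the library API beyond
# the `.xyxy` accessor, which get_temporal_mnist_dataset below also relies on):
def _demo_mnist_usage():
    x_tr, y_tr, x_ts, y_ts = get_mnist_dataset(n_training_samples=100, n_test_samples=50, flat=True).xyxy
    assert x_tr.shape == (100, 784) and y_tr.shape == (100, )
    assert x_ts.shape == (50, 784) and y_ts.shape == (50, )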
def get_temporal_mnist_dataset(smoothing_steps=1000, **mnist_kwargs):
    """
    "Temporal MNIST": the MNIST dataset with samples reordered (via `temporalize`) so that
    consecutive samples tend to be similar, giving a slowly-varying stream of digit images.
    """
    tr_x, tr_y, ts_x, ts_y = get_mnist_dataset(**mnist_kwargs).xyxy
    tr_ixs = temporalize(tr_x, smoothing_steps=smoothing_steps)
    ts_ixs = temporalize(ts_x, smoothing_steps=smoothing_steps)
    return DataSet.from_xyxy(tr_x[tr_ixs], tr_y[tr_ixs], ts_x[ts_ixs], ts_y[ts_ixs])
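
# A hedged sanity-check sketch: if temporalize does its job, consecutive frames
# should on average be closer together than randomly-paired frames.  (This
# helper is hypothetical and uses only numpy plus the `.xyxy` accessor.)
def _check_temporal_smoothness():
    tr_x, _, _, _ = get_temporal_mnist_dataset(smoothing_steps=100, n_training_samples=1000).xyxy
    feature_axes = tuple(range(1, tr_x.ndim))
    consecutive_dist = np.mean(np.sum((tr_x[1:] - tr_x[:-1]) ** 2, axis=feature_axes))
    random_pairing = tr_x[np.random.RandomState(0).permutation(len(tr_x))]
    random_dist = np.mean(np.sum((tr_x - random_pairing) ** 2, axis=feature_axes))
    assert consecutive_dist < random_dist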
def get_synthetic_deep_dataset(n_training, n_test, **kwargs):
    """
    :param n_training: Number of training samples
    :param n_test: Number of test samples
    :param kwargs: See get_synthetic_deep_data above
    :return: A DataSet object with the synthetic data split into training and test sets
    """
    x, y = get_synthetic_deep_data(**kwargs)
    return DataSet.from_xy(x, y, training_fraction=n_training / float(n_training + n_test))
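
# Usage sketch (hypothetical, and hedged: kwargs are forwarded to
# get_synthetic_deep_data, so they must be chosen so that it generates
# n_training + n_test samples in total):
#
#   dataset = get_synthetic_deep_dataset(n_training=800, n_test=200)
#   x_tr, y_tr, x_ts, y_ts = dataset.xyxy  # an 80/20 train/test split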
def get_cifar_10_dataset(n_training_samples=None, n_test_samples=None, normalize_inputs=False):
    """
    :param n_training_samples: Number of training samples, or None to leave it at 50000
    :param n_test_samples: Number of test samples, or None to leave it at 10000
    :param normalize_inputs: True to normalize inputs, turning them from uint8 to double
    :return: The CIFAR-10 dataset, which consists of 50000 training and 10000 test images.
        Images are 32x32 uint8 RGB images (n_samples, 3, 32, 32) of 10 categories of objects.
        Targets are integer labels in the range [0, 9].
    """
    # TODO: Make a method for downloading/unpacking data (from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz)
    # We have this for single files already, but in this case the gz contains a folder with the files in it.
    directory = get_archive(relative_path='data/cifar-10', url='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')
    n_batches_to_read = 5 if n_training_samples is None else int(np.ceil(n_training_samples / 10000.))
    file_paths = [get_file(os.path.join(directory, 'cifar-10-batches-py', 'data_batch_%s' % (i, ))) for i in range(1, n_batches_to_read + 1)] \
        + [get_file(os.path.join(directory, 'cifar-10-batches-py', 'test_batch'))]
    data = []
    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            # The batches are Python-2 pickles, so decode strings as latin1 when loading under Python 3.
            batch_data = pickle.load(f, encoding='latin1')
        data.append(batch_data)
    x_tr = np.concatenate([d['data'] for d in data[:-1]], axis=0).reshape(-1, 3, 32, 32)
    y_tr = np.concatenate([d['labels'] for d in data[:-1]], axis=0)
    x_ts = data[-1]['data'].reshape(-1, 3, 32, 32)
    y_ts = np.array(data[-1]['labels'])
    if normalize_inputs:
        mean = x_tr.mean(axis=0, keepdims=True)
        std = x_tr.std(axis=0, keepdims=True)
        x_tr = (x_tr - mean) / std
        x_ts = (x_ts - mean) / std
    if n_training_samples is not None:
        x_tr = x_tr[:n_training_samples]
        y_tr = y_tr[:n_training_samples]
    if n_test_samples is not None:
        x_ts = x_ts[:n_test_samples]
        y_ts = y_ts[:n_test_samples]
    return DataSet(training_set=DataCollection(x_tr, y_tr), test_set=DataCollection(x_ts, y_ts), name='CIFAR-10')
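
# Usage sketch (a hypothetical smoke test, relying only on the shapes given in
# the docstring above and the `.xyxy` accessor used elsewhere in this module):
def _demo_cifar_usage():
    x_tr, y_tr, x_ts, y_ts = get_cifar_10_dataset(n_training_samples=200, n_test_samples=100).xyxy
    assert x_tr.shape == (200, 3, 32, 32) and y_tr.shape == (200, )
    assert x_ts.shape == (100, 3, 32, 32) and y_ts.shape == (100, )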
def get_synthetic_clusters_dataset(n_clusters=4, n_dims=20, n_training=1000, n_test=200, sparsity=0.5, flip_noise=0.1, seed=3425, dtype='float32'):
    """
    A dataset consisting of clustered binary data with "bit-flip" noise, and the corresponding cluster labels.
    This should be trivially solvable by any classifier, and serves as a basic test of whether your classifier
    is completely broken or not.

    :param n_clusters: Number of clusters (classes)
    :param n_dims: Dimensionality of the binary input vectors
    :param n_training: Number of training samples
    :param n_test: Number of test samples
    :param sparsity: Probability that any given bit of a cluster center is on
    :param flip_noise: Probability of flipping each bit of a sample
    :param seed: Random seed
    :param dtype: Data type of the returned inputs
    :return: A DataSet object
    """
    rng = np.random.RandomState(seed)
    labels = rng.randint(n_clusters, size=n_training + n_test)  # (n_samples, )
    centers = rng.rand(n_clusters, n_dims) < sparsity  # (n_clusters, n_dims)
    input_data = centers[labels]  # (n_samples, n_dims)
    input_data = np.bitwise_xor(input_data, rng.rand(*input_data.shape) < flip_noise).astype(dtype)
    return DataSet(
        training_set=DataCollection(input_data[:n_training], labels[:n_training]),
        test_set=DataCollection(input_data[n_training:], labels[n_training:]),
        name='Synthetic Clusters Dataset')
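
# Since the docstring claims the clusters are trivially separable, a
# nearest-centroid rule should recover almost all labels at the default
# flip_noise of 0.1.  (Hypothetical check; the 0.9 threshold is a loose bound.)
def _demo_cluster_separability():
    x, y, _, _ = get_synthetic_clusters_dataset().xyxy
    centroids = np.array([x[y == k].mean(axis=0) for k in range(4)])  # (n_clusters, n_dims)
    predictions = np.argmin(((x[:, None, :] - centroids[None, :, :]) ** 2).sum(axis=2), axis=1)
    assert (predictions == y).mean() > 0.9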
def get_20_newsgroups_dataset(filter_most_common=2000, numeric=False, shuffling_seed=1234, bag_of_words=False, count_scaling=None):
    """
    The 20 newsgroups dataset.  In this dataset, you try to predict the topic of a forum from the words
    contained in posts in the forums.  Words have been preprocessed to the "stemmed" version, as explained
    on the website: http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not in the N most common words
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation:
        vector = log(1 + word_counts)
        to generate the input data (this scaling makes it more suitable for some types of classifiers).
    :return: A DataSet object
    """
    training_set_file = get_file(
        relative_name='data/20ng-train-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt')
    test_set_file = get_file(
        relative_name='data/20ng-test-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt')

    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle it up...
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels = _shuffle((train_words, train_labels), rng)
    test_words, test_labels = _shuffle((test_words, test_labels), rng)

    # Filter out the most-common-but-not-too-common words
    all_train_words = np.concatenate(train_words)
    filtered_vocab, counts = _find_most_common(all_train_words, filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if numeric or bag_of_words:
        train_ixs_list = _list_of_posts_to_list_of_ixs(train_words, filtered_vocab)
        test_ixs_list = _list_of_posts_to_list_of_ixs(test_words, filtered_vocab)
        label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
        train_labels = _words_to_ints(train_labels, label_vocab)
        test_labels = _words_to_ints(test_labels, label_vocab)
        if bag_of_words:
            train_counts = _list_of_ixs_to_count_matrix(train_ixs_list, n_words=len(filtered_vocab))
            test_counts = _list_of_ixs_to_count_matrix(test_ixs_list, n_words=len(filtered_vocab))
            if count_scaling == 'log':
                train_counts = np.log(1 + train_counts)
                test_counts = np.log(1 + test_counts)
            return DataSet.from_xyxy(training_inputs=train_counts, training_targets=train_labels, test_inputs=test_counts, test_targets=test_labels)
        else:
            return DataSet.from_xyxy(training_inputs=train_ixs_list, training_targets=train_labels, test_inputs=test_ixs_list, test_targets=test_labels)
    else:
        return DataSet.from_xyxy(training_inputs=train_words, training_targets=train_labels, test_inputs=test_words, test_targets=test_labels)
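
# Usage sketch (hypothetical; assumes the vocabulary contains at least
# filter_most_common words, so the count matrix has one column per retained
# word, and that all 20 topics appear in the training labels):
def _demo_newsgroups_usage():
    x_tr, y_tr, x_ts, y_ts = get_20_newsgroups_dataset(filter_most_common=2000, bag_of_words=True, count_scaling='log').xyxy
    assert x_tr.shape[1] == x_ts.shape[1] == 2000
    assert set(np.unique(y_tr)) == set(range(20))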
def __init__(self, **kwargs):
    """
    Build a DataSet from synthetic logistic-regression data, keeping the true weights
    around in self._w_true for later comparison against learned weights.
    """
    x_tr, y_tr, x_ts, y_ts, w_true = get_logistic_regression_data(**kwargs)
    DataSet.__init__(self, DataCollection(x_tr, y_tr), DataCollection(x_ts, y_ts))
    self._w_true = w_true
def get_synthethic_linear_dataset(noise_level=0.1, n_input_dims=20, n_output_dims=4, n_training_samples=1000,
                                  n_test_samples=200, nonlinearity=None, offset_mag=0, seed=8158):
    """
    A synthetic dataset that can be used for testing generalized linear models.

    :param noise_level: Standard deviation of the Gaussian noise added to the targets
    :param n_input_dims: Number of input dimensions (0 to produce 1-D input samples)
    :param n_output_dims: Number of output dimensions (0 to produce 1-D targets)
    :param n_training_samples: Number of training samples
    :param n_test_samples: Number of test samples
    :param nonlinearity: 'softmax', 'sigmoid', 'argmax', None, or a callable to apply to the linear targets
    :param offset_mag: Magnitude of the random offset added to the targets
    :param seed: Random seed
    :return: A DataSet object
    """
    input_singleton = n_input_dims == 0
    if input_singleton:
        n_input_dims = 1
    output_singleton = n_output_dims == 0
    if output_singleton:  # Unfortunately we have to deal with the inconsistencies in numpy's handling of singleton dimensions.
        n_output_dims = 1

    rng = np.random.RandomState(seed)
    w = rng.randn(n_input_dims, n_output_dims) / np.sqrt(n_input_dims)
    input_data = rng.randn(n_training_samples + n_test_samples, n_input_dims)
    target_data = np.dot(input_data, w) + offset_mag * rng.randn(n_output_dims) \
        + noise_level * rng.randn(n_training_samples + n_test_samples, n_output_dims)
    if nonlinearity == 'softmax':
        target_data = softmax(target_data, axis=1)
    elif nonlinearity == 'sigmoid':
        target_data = sigm(target_data)
    elif nonlinearity == 'argmax':
        target_data = np.argmax(target_data, axis=1)
    elif nonlinearity is None:
        pass
    else:
        assert callable(nonlinearity), 'Unknown nonlinearity: {}'.format(nonlinearity)
        target_data = nonlinearity(target_data)

    if input_singleton:
        input_data = input_data[:, 0]
    if output_singleton:
        target_data = target_data[:, 0]

    return DataSet(
        training_set=DataCollection(input_data[:n_training_samples], target_data[:n_training_samples]),
        test_set=DataCollection(input_data[n_training_samples:], target_data[n_training_samples:]),
    )
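
# Usage sketch (a hypothetical check: with nonlinearity='softmax' each target
# row is a probability vector, so rows should sum to 1, assuming the `softmax`
# used above normalizes over the given axis):
def _demo_linear_dataset_usage():
    x_tr, y_tr, _, _ = get_synthethic_linear_dataset(n_input_dims=20, n_output_dims=4, nonlinearity='softmax').xyxy
    assert x_tr.shape == (1000, 20) and y_tr.shape == (1000, 4)
    assert np.allclose(y_tr.sum(axis=1), 1)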