Example 1
def get_mnist_dataset(n_training_samples=None,
                      n_test_samples=None,
                      flat=False,
                      join_train_and_val=False,
                      binarize=False):
    """
    The MNIST DataSet - the Drosophila of machine learning.

    :param n_training_samples: Cap on the number of training samples
    :param n_test_samples: Cap on the number of test samples
    :param flat: Set to True if we just want flat 784-dimensional input data instead of 28x28 images.
    :param join_train_and_val: If true, merge the validation set into the training set.  (giving 60000 training examples,
        and 10000 test examples).  Otherwise you'll have 50000 training, 10000 validation, 10000 test.
    :param binarize: Binarize inputs by thresholding them at 0.5
    :return: A DataSet object containing the MNIST data
    """
    filename = get_file(relative_name='data/mnist.pkl',
                        url='http://deeplearning.net/data/mnist/mnist.pkl.gz',
                        data_transformation=unzip_gz)

    with open(filename, 'rb') as f:
        # mnist.pkl is a Python pickle of (train_set, validation_set, test_set), in that order.
        data = pickle.load(f, encoding='latin1')

    x_tr, y_tr = data[0] if n_training_samples is None else (
        data[0][0][:n_training_samples], data[0][1][:n_training_samples])
    x_vd, y_vd = data[1]
    x_ts, y_ts = data[2] if n_test_samples is None else (
        data[2][0][:n_test_samples], data[2][1][:n_test_samples])
    if not flat:
        x_tr = x_tr.reshape(-1, 28, 28)
        x_ts = x_ts.reshape(-1, 28, 28)
        x_vd = x_vd.reshape(-1, 28, 28)
    if binarize:
        x_tr = x_tr > 0.5
        x_ts = x_ts > 0.5
        x_vd = x_vd > 0.5

    if join_train_and_val:
        return DataSet(
            training_set=DataCollection(np.concatenate([x_tr, x_vd], axis=0), np.concatenate([y_tr, y_vd], axis=0)),
            test_set=DataCollection(x_ts, y_ts),
            )
    else:
        return DataSet(
            training_set=DataCollection(x_tr, y_tr),
            test_set=DataCollection(x_ts, y_ts),
            validation_set=DataCollection(x_vd, y_vd),
            )
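
A minimal numpy-only sketch (not part of the example above) of the reshape and binarize steps, using random data in place of the real MNIST arrays:

import numpy as np

# Stand-in for the flat MNIST inputs: 100 samples of 784 floats in [0, 1).
x_flat = np.random.rand(100, 784)

# flat=False branch: recover 28x28 images from the 784-dimensional vectors.
x_images = x_flat.reshape(-1, 28, 28)
assert x_images.shape == (100, 28, 28)

# binarize=True branch: threshold at 0.5, giving boolean arrays.
x_binary = x_images > 0.5
assert x_binary.dtype == bool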
Example 2
def get_temporal_mnist_dataset(smoothing_steps=1000, **mnist_kwargs):
    """
    Build a "temporal" variant of MNIST: the training and test sets are reordered
    with the index orderings returned by temporalize, and returned as a new DataSet.
    """
    tr_x, tr_y, ts_x, ts_y = get_mnist_dataset(**mnist_kwargs).xyxy
    tr_ixs = temporalize(tr_x, smoothing_steps=smoothing_steps)
    ts_ixs = temporalize(ts_x, smoothing_steps=smoothing_steps)
    return DataSet.from_xyxy(tr_x[tr_ixs], tr_y[tr_ixs], ts_x[ts_ixs],
                             ts_y[ts_ixs])
Example 3
def get_synthetic_deep_dataset(n_training, n_test, **kwargs):
    """
    :param n_training: Number of training samples
    :param n_test: Number of test samples
    :param kwargs: See get_synthetic_deep_data above
    :return: A DataSet with the synthetic data split into training and test portions according to n_training/(n_training+n_test)
    """
    x, y = get_synthetic_deep_data(**kwargs)
    return DataSet.from_xy(x, y, training_fraction=n_training/float(n_training+n_test))
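
DataSet.from_xy is not shown here; as an assumption, the fraction-based split it is asked to perform amounts to something like the following numpy-only sketch (the helper name split_by_fraction is hypothetical):

import numpy as np

def split_by_fraction(x, y, training_fraction):
    # Hypothetical stand-in for a from_xy-style split: the first
    # training_fraction of the samples become training data, the rest test data.
    n_train = int(round(len(x) * training_fraction))
    return (x[:n_train], y[:n_train]), (x[n_train:], y[n_train:])

# Example: 1000 training and 200 test samples out of 1200 synthetic points.
x = np.random.randn(1200, 10)
y = np.random.randn(1200, 4)
(train_x, train_y), (test_x, test_y) = split_by_fraction(x, y, 1000 / 1200.)
assert len(train_x) == 1000 and len(test_x) == 200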
Example 4
def get_cifar_10_dataset(n_training_samples=None,
                         n_test_samples=None,
                         normalize_inputs=False):
    """
    :param n_training_samples: Number of training samples, or None to leave it at 50000
    :param n_test_samples: Number of test samples, or None to leave it at 10000
    :param normalize_inputs: True to normalize inputs with training-set statistics, converting them from uint8 to float

    :return: The CIFAR-10 dataset, which consists of 50000 training and 10000 test images.
        Images are 32x32 uint8 RGB images (n_samples, 3, 32, 32) of 10 categories of objects.
        Targets are integer labels in the range [0, 9]
    """
    # TODO: Make method for downloading/unpacking data (from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz)
    # We have this for single files already, but in this case the gz contains a folder with the files in it.

    directory = get_archive(
        relative_path='data/cifar-10',
        url='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')

    n_batches_to_read = 5 if n_training_samples is None else int(
        np.ceil(n_training_samples / 10000.))

    file_paths = [get_file(os.path.join(directory, 'cifar-10-batches-py', 'data_batch_%s' % (i, ))) for i in range(1, n_batches_to_read+1)] \
        + [get_file(os.path.join(directory, 'cifar-10-batches-py', 'test_batch'))]

    data = []
    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            # The CIFAR-10 batches are Python-2 pickles; latin1 encoding keeps the 'data'/'labels' keys readable.
            batch_data = pickle.load(f, encoding='latin1')
            data.append(batch_data)

    x_tr = np.concatenate([d['data'] for d in data[:-1]],
                          axis=0).reshape(-1, 3, 32, 32)
    y_tr = np.concatenate([d['labels'] for d in data[:-1]], axis=0)
    x_ts = data[-1]['data'].reshape(-1, 3, 32, 32)
    y_ts = np.array(data[-1]['labels'])

    if normalize_inputs:
        mean = x_tr.mean(axis=0, keepdims=True)
        std = x_tr.std(axis=0, keepdims=True)
        x_tr = (x_tr - mean) / std
        x_ts = (x_ts - mean) / std

    if n_training_samples is not None:
        x_tr = x_tr[:n_training_samples]
        y_tr = y_tr[:n_training_samples]
    if n_test_samples is not None:
        x_ts = x_ts[:n_test_samples]
        y_ts = y_ts[:n_test_samples]

    return DataSet(training_set=DataCollection(x_tr, y_tr),
                   test_set=DataCollection(x_ts, y_ts),
                   name='CIFAR-10')
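
The normalize_inputs branch above follows the standard pattern of computing statistics on the training set only and reusing them for the test set. A minimal numpy-only sketch of that pattern (not part of the example), with random uint8 data standing in for CIFAR-10:

import numpy as np

# Random uint8 images standing in for CIFAR-10 batches (n_samples, 3, 32, 32).
x_train = np.random.randint(0, 256, size=(500, 3, 32, 32), dtype=np.uint8)
x_test = np.random.randint(0, 256, size=(100, 3, 32, 32), dtype=np.uint8)

# Compute per-pixel mean and std on the training set only...
mean = x_train.mean(axis=0, keepdims=True)
std = x_train.std(axis=0, keepdims=True)

# ...and apply the same transformation to both splits, so the test set
# never influences the normalization.
x_train_norm = (x_train - mean) / std
x_test_norm = (x_test - mean) / std
assert x_train_norm.dtype == np.float64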
Example 5
def get_synthetic_deep_dataset(n_training, n_test, **kwargs):
    """
    :param n_training: Number of training samples
    :param n_test: Number of test samples
    :param kwargs: See get_synthetic_deep_data above
    :return: A DataSet with the synthetic data split into training and test portions according to n_training/(n_training+n_test)
    """
    x, y = get_synthetic_deep_data(**kwargs)
    return DataSet.from_xy(x,
                           y,
                           training_fraction=n_training /
                           float(n_training + n_test))
Example 6
def get_synthetic_clusters_dataset(n_clusters=4,
                                   n_dims=20,
                                   n_training=1000,
                                   n_test=200,
                                   sparsity=0.5,
                                   flip_noise=0.1,
                                   seed=3425,
                                   dtype='float32'):
    """
    A dataset consisting of clustered binary data with "bit-flip" noise, and the corresponding cluster labels.
    This should be trivially solvable by any classifier, and serves as a basic test of whether your classifier is
    completely broken or not.

    :param n_clusters: Number of clusters (and therefore classes)
    :param n_dims: Dimensionality of the binary input vectors
    :param n_training: Number of training samples
    :param n_test: Number of test samples
    :param sparsity: Probability that any given bit of a cluster center is on
    :param flip_noise: Probability of flipping each bit of a sample
    :param seed: Random seed
    :param dtype: dtype of the returned input data
    :return: A DataSet containing the noisy cluster samples and their cluster labels
    """

    rng = np.random.RandomState(seed)
    labels = rng.randint(n_clusters, size=n_training + n_test)  # (n_samples, )
    centers = rng.rand(n_clusters, n_dims) < sparsity  # (n_clusters, n_dims)
    input_data = centers[labels]
    input_data = np.bitwise_xor(
        input_data,
        rng.rand(*input_data.shape) < flip_noise).astype(dtype)

    return DataSet(training_set=DataCollection(input_data[:n_training],
                                               labels[:n_training]),
                   test_set=DataCollection(input_data[n_training:],
                                           labels[n_training:]),
                   name='Synthetic Clusters Dataset')
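
As a quick, self-contained check (not part of the original code) of why this data is trivially separable, the sketch below regenerates samples with the same recipe and classifies them by Hamming distance to the true cluster centers:

import numpy as np

rng = np.random.RandomState(0)
n_clusters, n_dims, n_samples, sparsity, flip_noise = 4, 20, 1000, 0.5, 0.1

labels = rng.randint(n_clusters, size=n_samples)
centers = rng.rand(n_clusters, n_dims) < sparsity
data = np.bitwise_xor(centers[labels], rng.rand(n_samples, n_dims) < flip_noise)

# Nearest-centroid classification: pick the center with the smallest Hamming distance.
hamming = (data[:, None, :] != centers[None, :, :]).sum(axis=2)  # (n_samples, n_clusters)
predictions = np.argmin(hamming, axis=1)
print('accuracy:', np.mean(predictions == labels))  # close to 1.0 at flip_noise=0.1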
Example 7
def get_20_newsgroups_dataset(filter_most_common = 2000, numeric = False, shuffling_seed = 1234, bag_of_words = False, count_scaling = None):
    """
    The 20 newsgroups dataset.  In this dataset, you try to predict the topic of a forum from the words contained in
    posts in the forums.

    Words have been preprocessed to the "stemmed" version, as explained on the website:
    http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not among the N most common words
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation
        vector = log(1+word_counts)
        to generate the input data (this scaling makes it more suitable for some types of classifiers).
    :return: A DataSet object
    """

    training_set_file = get_file(
        relative_name = 'data/20ng-train-stemmed.txt',
        url = 'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt'
        )

    test_set_file = get_file(
        relative_name = 'data/20ng-test-stemmed.txt',
        url = 'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt'
        )

    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle it up...
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels = _shuffle((train_words, train_labels), rng)
    test_words, test_labels = _shuffle((test_words, test_labels), rng)

    # Filter out most-common-but-not-too-common-words
    all_train_words = np.concatenate(train_words)
    filtered_vocab, counts = _find_most_common(all_train_words, filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if numeric or bag_of_words:
        train_ixs_list = _list_of_posts_to_list_of_ixs(train_words, filtered_vocab)
        test_ixs_list = _list_of_posts_to_list_of_ixs(test_words, filtered_vocab)
        label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
        train_labels = _words_to_ints(train_labels, label_vocab)
        test_labels = _words_to_ints(test_labels, label_vocab)

        if bag_of_words:
            train_counts = _list_of_ixs_to_count_matrix(train_ixs_list, n_words=len(filtered_vocab))
            test_counts = _list_of_ixs_to_count_matrix(test_ixs_list, n_words=len(filtered_vocab))
            if count_scaling == 'log':
                train_counts = np.log(1+train_counts)
                test_counts = np.log(1+test_counts)
            return DataSet.from_xyxy(training_inputs = train_counts, training_targets = train_labels, test_inputs = test_counts, test_targets = test_labels)
        else:
            return DataSet.from_xyxy(training_inputs = train_ixs_list, training_targets = train_labels, test_inputs = test_ixs_list, test_targets = test_labels)
    else:
        return DataSet.from_xyxy(training_inputs = train_words, training_targets = train_labels, test_inputs = test_words, test_targets = test_labels)
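
_list_of_ixs_to_count_matrix is not shown here; under the assumption that it simply counts word-index occurrences per post, the bag-of-words construction and the optional log scaling amount to something like this numpy-only sketch (the helper name posts_to_count_matrix is hypothetical):

import numpy as np

def posts_to_count_matrix(posts_as_ixs, n_words):
    # Hypothetical stand-in: one row of word counts per post.
    counts = np.zeros((len(posts_as_ixs), n_words))
    for row, word_ixs in enumerate(posts_as_ixs):
        np.add.at(counts[row], word_ixs, 1)  # handles repeated indices correctly
    return counts

# Three tiny "posts", each a list of word indices into a 5-word vocabulary.
posts = [[0, 1, 1, 4], [2, 2, 2], [3]]
counts = posts_to_count_matrix(posts, n_words=5)
print(counts[0])        # [1. 2. 0. 0. 1.]

# count_scaling='log' branch: compress large counts.
scaled = np.log(1 + counts)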
Example 8
def get_20_newsgroups_dataset(filter_most_common=2000,
                              numeric=False,
                              shuffling_seed=1234,
                              bag_of_words=False,
                              count_scaling=None):
    """
    The 20 newsgroups dataset.  In this dataset, you try to predict the topic of a forum from the words contained in
    posts in the forums.

    Words have been preprocessed to the "stemmed" version, as explained on the website:
    http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not among the N most common words
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation
        vector = log(1+word_counts)
        to generate the input data (this scaling makes it more suitable for some types of classifiers).
    :return: A DataSet object
    """

    training_set_file = get_file(
        relative_name='data/20ng-train-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt'
    )

    test_set_file = get_file(
        relative_name='data/20ng-test-stemmed.txt',
        url='http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt'
    )

    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle it up...
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels = _shuffle((train_words, train_labels), rng)
    test_words, test_labels = _shuffle((test_words, test_labels), rng)

    # Filter out most-common-but-not-too-common-words
    all_train_words = np.concatenate(train_words)
    filtered_vocab, counts = _find_most_common(all_train_words,
                                               filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if numeric or bag_of_words:
        train_ixs_list = _list_of_posts_to_list_of_ixs(train_words,
                                                       filtered_vocab)
        test_ixs_list = _list_of_posts_to_list_of_ixs(test_words,
                                                      filtered_vocab)
        label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
        train_labels = _words_to_ints(train_labels, label_vocab)
        test_labels = _words_to_ints(test_labels, label_vocab)

        if bag_of_words:
            train_counts = _list_of_ixs_to_count_matrix(
                train_ixs_list, n_words=len(filtered_vocab))
            test_counts = _list_of_ixs_to_count_matrix(
                test_ixs_list, n_words=len(filtered_vocab))
            if count_scaling == 'log':
                train_counts = np.log(1 + train_counts)
                test_counts = np.log(1 + test_counts)
            return DataSet.from_xyxy(training_inputs=train_counts,
                                     training_targets=train_labels,
                                     test_inputs=test_counts,
                                     test_targets=test_labels)
        else:
            return DataSet.from_xyxy(training_inputs=train_ixs_list,
                                     training_targets=train_labels,
                                     test_inputs=test_ixs_list,
                                     test_targets=test_labels)
    else:
        return DataSet.from_xyxy(training_inputs=train_words,
                                 training_targets=train_labels,
                                 test_inputs=test_words,
                                 test_targets=test_labels)
Example 9
 def __init__(self, **kwargs):
     # Constructor of a DataSet subclass built around synthetic logistic-regression
     # data; the true weights are kept for later inspection.
     x_tr, y_tr, x_ts, y_ts, w_true = get_logistic_regression_data(**kwargs)
     DataSet.__init__(self, DataCollection(x_tr, y_tr), DataCollection(x_ts, y_ts))
     self._w_true = w_true
Example 10
def get_synthethic_linear_dataset(noise_level=0.1,
                                  n_input_dims=20,
                                  n_output_dims=4,
                                  n_training_samples=1000,
                                  n_test_samples=200,
                                  nonlinearity=None,
                                  offset_mag=0,
                                  seed=8158):
    """
    A synthetic dataset that can be used for testing generalized linear models.

    :param noise_level: Standard deviation of the Gaussian noise added to the targets
    :param n_input_dims: Number of input dimensions (0 to produce singleton/scalar inputs)
    :param n_output_dims: Number of output dimensions (0 to produce singleton/scalar targets)
    :param n_training_samples: Number of training samples
    :param n_test_samples: Number of test samples
    :param nonlinearity: 'softmax', 'sigmoid', 'argmax', a callable applied to the linear targets, or None to leave them linear
    :param offset_mag: Magnitude of the random constant offset added to the targets
    :param seed: Random seed
    :return: A DataSet containing the synthetic inputs and targets
    """

    input_singleton = n_input_dims == 0
    if input_singleton:
        n_input_dims = 1

    output_singleton = n_output_dims == 0
    if output_singleton:  # Unfortunately we have to deal with the inconsistencies in numpy's handling of singleton dimensions.
        n_output_dims = 1

    rng = np.random.RandomState(seed)
    w = rng.randn(n_input_dims, n_output_dims) * 1 / np.sqrt(n_input_dims)
    input_data = rng.randn(n_training_samples + n_test_samples, n_input_dims)
    target_data = np.dot(input_data, w) + offset_mag * rng.randn(n_output_dims) \
        + noise_level * rng.randn(n_training_samples + n_test_samples, n_output_dims)
    if nonlinearity == 'softmax':
        target_data = softmax(target_data, axis=1)
    elif nonlinearity == 'sigmoid':
        target_data = sigm(target_data)
    elif nonlinearity == 'argmax':
        target_data = np.argmax(target_data, axis=1)
    elif nonlinearity is None:
        target_data = target_data
    else:
        assert callable(nonlinearity), 'Unknown nonlinearity: {}'.format(
            nonlinearity)
        target_data = nonlinearity(target_data)

    if input_singleton:
        input_data = input_data[:, 0]

    if output_singleton:
        target_data = target_data[:, 0]

    return DataSet(
        training_set=DataCollection(input_data[:n_training_samples],
                                    target_data[:n_training_samples]),
        test_set=DataCollection(input_data[n_training_samples:],
                                target_data[n_training_samples:]),
    )
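
As a self-contained illustration (not part of the original) of the "testing generalized linear models" use case, the same linear-Gaussian recipe lets ordinary least squares approximately recover the true weights:

import numpy as np

rng = np.random.RandomState(0)
n_samples, n_in, n_out, noise_level = 1000, 20, 4, 0.1

w_true = rng.randn(n_in, n_out) / np.sqrt(n_in)
x = rng.randn(n_samples, n_in)
y = x.dot(w_true) + noise_level * rng.randn(n_samples, n_out)

# A least-squares fit should land close to w_true when the noise is small.
w_est, _, _, _ = np.linalg.lstsq(x, y, rcond=None)
print('max abs weight error:', np.max(np.abs(w_est - w_true)))  # small, e.g. ~0.01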
Example 11
def get_temporal_mnist_dataset(smoothing_steps=1000, **mnist_kwargs):

    tr_x, tr_y, ts_x, ts_y = get_mnist_dataset(**mnist_kwargs).xyxy
    tr_ixs = temporalize(tr_x, smoothing_steps=smoothing_steps)
    ts_ixs = temporalize(ts_x, smoothing_steps=smoothing_steps)
    return DataSet.from_xyxy(tr_x[tr_ixs], tr_y[tr_ixs], ts_x[ts_ixs], ts_y[ts_ixs])
Example 12
 def __init__(self, **kwargs):
     x_tr, y_tr, x_ts, y_ts, w_true = get_logistic_regression_data(**kwargs)
     DataSet.__init__(self, DataCollection(x_tr, y_tr),
                      DataCollection(x_ts, y_ts))
     self._w_true = w_true