Example #1
def get_synthetic_clusters_dataset(n_clusters = 4, n_dims = 20, n_training = 1000, n_test = 200,
        sparsity = 0.5, flip_noise = 0.1, seed = 3425, dtype = 'float32'):
    """
    A dataset consisting of clustered binary data with "bit-flip" noise, and the corresponding cluster labels.
    This should be trivially solvable by any classifier, and serves as a basic test of whether your classifier is
    completely broken or not.

    :param n_clusters: Number of clusters to generate
    :param n_dims: Dimensionality of the binary input vectors
    :param n_training: Number of training samples
    :param n_test: Number of test samples
    :param sparsity: Probability that any given bit of a cluster centre is on
    :param flip_noise: Probability of flipping each bit of a sample
    :param seed: Random seed
    :param dtype: Data type of the returned input data
    :return: A DataSet containing the noisy cluster samples and their integer cluster labels
    """

    rng = np.random.RandomState(seed)
    labels = rng.randint(n_clusters, size = n_training+n_test)  # (n_samples, )
    centers = rng.rand(n_clusters, n_dims) < sparsity  # (n_clusters, n_dims)
    input_data = centers[labels]
    input_data = np.bitwise_xor(input_data, rng.rand(*input_data.shape) < flip_noise).astype(dtype)

    return DataSet(
        training_set = DataCollection(input_data[:n_training], labels[:n_training]),
        test_set = DataCollection(input_data[n_training:], labels[n_training:]),
        name = 'Synthetic Clusters Dataset'
        )
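A minimal usage sketch follows; the attribute access pattern (training_set.input, training_set.target) matches how the DataSet object is used in the later examples, while the import path for the function is omitted and depends on your project layout.

dataset = get_synthetic_clusters_dataset(n_clusters=4, n_dims=20, n_training=1000, n_test=200)
x_tr, y_tr = dataset.training_set.input, dataset.training_set.target
x_ts, y_ts = dataset.test_set.input, dataset.test_set.target   # test_set is assumed to mirror training_set
assert x_tr.shape == (1000, 20) and y_tr.shape == (1000, )     # float32 bit-vectors and integer cluster labels
assert x_ts.shape == (200, 20) and y_ts.shape == (200, )
# Any non-broken classifier should reach near-perfect accuracy on this data.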
Example #2
def get_mnist_dataset(n_training_samples=None,
                      n_test_samples=None,
                      flat=False):
    """
    The MNIST DataSet - the Drosophila of machine learning.

    :param n_training_samples: Cap on the number of training samples
    :param n_test_samples: Cap on the number of test samples
    :param flat: Set to True if we just want flat 784-dimensional input data instead of 28x28 images.
    :return: A DataSet object containing the MNIST data
    """
    filename = get_file(relative_name='data/mnist.pkl',
                        url='http://deeplearning.net/data/mnist/mnist.pkl.gz',
                        data_transformation=unzip_gz)

    with open(filename, 'rb') as f:
        data = pickle.load(f)

    x_tr, y_tr = data[0] if n_training_samples is None else (
        data[0][0][:n_training_samples], data[0][1][:n_training_samples])
    x_ts, y_ts = data[1] if n_test_samples is None else (
        data[1][0][:n_test_samples], data[1][1][:n_test_samples])
    x_vd, y_vd = data[2]
    if not flat:
        x_tr = x_tr.reshape(-1, 28, 28)
        x_ts = x_ts.reshape(-1, 28, 28)
        x_vd = x_vd.reshape(-1, 28, 28)
    return DataSet(training_set=DataCollection(x_tr, y_tr),
                   test_set=DataCollection(x_ts, y_ts),
                   validation_set=DataCollection(x_vd, y_vd))
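A short usage sketch, assuming get_mnist_dataset is importable from the module where it is defined:

mnist = get_mnist_dataset(n_training_samples=5000, flat=True)
x_tr, y_tr = mnist.training_set.input, mnist.training_set.target
print(x_tr.shape)   # (5000, 784) with flat=True; (5000, 28, 28) otherwise
print(y_tr[:10])    # integer class labels in [0, 9]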
Example #3
def get_synthethic_linear_dataset(noise_level=0.1,
                                  n_input_dims=20,
                                  n_output_dims=4,
                                  n_training_samples=1000,
                                  n_test_samples=200,
                                  nonlinearity=None,
                                  offset_mag=0,
                                  seed=8158):
    """
    A synthetic dataset that can be used for testing generalized linear models.

    :param noise_level: Standard deviation of the Gaussian noise added to the targets
    :param n_input_dims: Number of input dimensions (0 to return inputs as a 1-D array)
    :param n_output_dims: Number of output dimensions (0 to return targets as a 1-D array)
    :param n_training_samples: Number of training samples
    :param n_test_samples: Number of test samples
    :param nonlinearity: Optional elementwise nonlinearity applied to the targets
    :param offset_mag: Magnitude of the random offset added to the targets
    :param seed: Random seed
    :return: A DataSet containing the generated inputs and targets
    """

    input_singleton = n_input_dims == 0
    if input_singleton:
        n_input_dims = 1

    output_singleton = n_output_dims == 0
    if output_singleton:  # Unfortunately we have to deal with the inconsistencies in numpy's handling of singleton dimensions.
        n_output_dims = 1

    rng = np.random.RandomState(seed)
    w = rng.randn(n_input_dims, n_output_dims)
    input_data = rng.randn(n_training_samples + n_test_samples, n_input_dims)
    target_data = input_data.dot(w) + offset_mag * rng.randn(n_output_dims) \
        + noise_level * rng.randn(n_training_samples + n_test_samples, n_output_dims)
    if nonlinearity is not None:
        target_data = nonlinearity(target_data)

    if input_singleton:
        input_data = input_data[:, 0]

    if output_singleton:
        target_data = target_data[:, 0]

    return DataSet(
        training_set=DataCollection(input_data[:n_training_samples],
                                    target_data[:n_training_samples]),
        test_set=DataCollection(input_data[n_training_samples:],
                                target_data[n_training_samples:]),
    )
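A sketch of using this helper to build a nonlinear regression problem; the sigmoid nonlinearity here is just an illustrative choice, not part of the function itself:

import numpy as np

sigmoid = lambda x: 1. / (1. + np.exp(-x))
dataset = get_synthethic_linear_dataset(noise_level=0.1, n_input_dims=20, n_output_dims=4, nonlinearity=sigmoid)
x_tr, y_tr = dataset.training_set.input, dataset.training_set.target
assert x_tr.shape == (1000, 20) and y_tr.shape == (1000, 4)   # defaults: 1000 training, 200 test samples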
Example #4
def get_cifar_10_dataset(n_training_samples = None, n_test_samples = None, normalize_inputs = False):
    """
    :param n_training_samples: Number of training samples, or None to leave it at 50000
    :param n_test_samples: Number of test samples, or None to leave it at 10000
    :param normalize_inputs: True to normalize inputs, and turn them from uint8 to double

    :return: The CIFAR-10 dataset, which consists of 50000 training and 10000 test images.
        Images are 32x32 uint8 RGB images (n_samples, 3, 32, 32) of 10 categories of objects.
        Targets are integer labels in the range [0, 9]
    """
    # TODO: Make method for downloading/unpacking data (from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz)
    # We have this for single files already, but in this case the gz contains a folder with the files in it.

    directory = 'data/cifar-10-batches-py'

    n_batches_to_read = 5 if n_training_samples is None else int(np.ceil(n_training_samples/10000.))

    file_paths = [get_file(os.path.join(directory, 'data_batch_%s' % (i, ))) for i in xrange(1, n_batches_to_read+1)] \
        + [get_file(os.path.join(directory, 'test_batch'))]

    data = []
    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            batch_data = pickle.load(f)
            data.append(batch_data)

    x_tr = np.concatenate([d['data'] for d in data[:-1]], axis = 0).reshape(-1, 3, 32, 32)
    y_tr = np.concatenate([d['labels'] for d in data[:-1]], axis = 0)
    x_ts = data[-1]['data'].reshape(-1, 3, 32, 32)
    y_ts = np.array(data[-1]['labels'])

    if normalize_inputs:
        mean = x_tr.mean(axis=0, keepdims=True)
        std = x_tr.std(axis=0, keepdims=True)
        x_tr = (x_tr - mean)/std
        x_ts = (x_ts - mean)/std

    if n_training_samples is not None:
        x_tr = x_tr[:n_training_samples]
        y_tr = y_tr[:n_training_samples]
    if n_test_samples is not None:
        x_ts = x_ts[:n_test_samples]
        y_ts = y_ts[:n_test_samples]

    return DataSet(training_set=DataCollection(x_tr, y_tr), test_set=DataCollection(x_ts, y_ts), name = 'CIFAR-10')
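A usage sketch, assuming the CIFAR-10 batch files are already unpacked under data/cifar-10-batches-py (as the TODO notes, this function does not download them itself):

cifar = get_cifar_10_dataset(n_training_samples=20000, normalize_inputs=True)
x_tr, y_tr = cifar.training_set.input, cifar.training_set.target
print(x_tr.shape)                 # (20000, 3, 32, 32)
print(x_tr.dtype)                 # float64 when normalize_inputs=True, uint8 otherwise
print((y_tr.min(), y_tr.max()))   # labels span 0..9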
Example #5
def test_compare_predictors_old():

    x_tr, y_tr, x_ts, y_ts, w_true = get_logistic_regression_data(noise_factor = 0.1)
    dataset = DataSet(DataCollection(x_tr, y_tr), DataCollection(x_ts, y_ts))\
        .process_with(targets_processor=lambda (x, ): (OneHotEncoding()(x[:, 0]), ))

    w_init = 0.1*np.random.randn(dataset.training_set.input.shape[1], dataset.training_set.target.shape[1])
    records = compare_predictors_old(
        dataset = dataset,
        offline_predictor_constructors={
            'Optimal': lambda: MockPredictor(lambda x: sigm(x.dot(w_true)))
            },
        online_predictor_constructors={
            'fast-perceptron': lambda: Perceptron(alpha = 0.1, w = w_init.copy()),
            'slow-perceptron': lambda: Perceptron(alpha = 0.001, w = w_init.copy())
            },
        minibatch_size = 10,
        test_points = sqrtspace(0, 10, 20),
        evaluation_function='mse'
        )
    plot_learning_curves(records, hang = False)
Example #6
def get_cifar_10_dataset(n_training_samples=None, n_test_samples=None):
    """
    :param n_training_samples: Number of training samples, or None to leave it at 50000
    :param n_test_samples: Number of test samples, or None to leave it at 10000
    :return: The CIFAR-10 dataset, which consists of 50000 training and 10000 test images.
        Images are 32x32 uint8 RGB images (n_samples, 3, 32, 32) of 10 categories of objects.
        Targets are integer labels in the range [0, 9]
    """
    # TODO: Make method for downloading/unpacking data (from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz)
    # We have this for single files already, but in this case the gz contains a folder with the files in it.

    directory = 'data/cifar-10-batches-py'

    n_batches_to_read = 5 if n_training_samples is None else int(
        np.ceil(n_training_samples / 10000.))

    file_paths = [get_file(os.path.join(directory, 'data_batch_%s' % (i, ))) for i in xrange(1, n_batches_to_read+1)] \
        + [get_file(os.path.join(directory, 'test_batch'))]

    data = []
    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            batch_data = pickle.load(f)
            data.append(batch_data)

    x_tr = np.concatenate([d['data'] for d in data[:-1]],
                          axis=0).reshape(-1, 3, 32, 32).swapaxes(2, 3)
    y_tr = np.concatenate([d['labels'] for d in data[:-1]], axis=0)
    x_ts = data[-1]['data'].reshape(-1, 3, 32, 32).swapaxes(2, 3)
    y_ts = np.array(data[-1]['labels'])

    if n_training_samples is not None:
        x_tr = x_tr[:n_training_samples]
        y_tr = y_tr[:n_training_samples]
    if n_test_samples is not None:
        x_ts = x_ts[:n_test_samples]
        y_ts = y_ts[:n_test_samples]

    return DataSet(training_set=DataCollection(x_tr, y_tr),
                   test_set=DataCollection(x_ts, y_ts),
                   name='CIFAR-10')
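The only difference from the loader in Example #4 is the swapaxes(2, 3) call, which exchanges the row and column axes of each image while leaving the shape unchanged. A small self-contained check of what that does:

import numpy as np

raw = np.arange(3 * 32 * 32).reshape(1, 3, 32, 32)
swapped = raw.swapaxes(2, 3)
assert swapped.shape == (1, 3, 32, 32)            # shape is unchanged because rows and columns are both 32
assert swapped[0, 0, 5, 7] == raw[0, 0, 7, 5]     # but row and column indices are exchanged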
Example #7
def __init__(self, **kwargs):
    x_tr, y_tr, x_ts, y_ts, w_true = get_logistic_regression_data(**kwargs)
    DataSet.__init__(self, DataCollection(x_tr, y_tr), DataCollection(x_ts, y_ts))
    self._w_true = w_true
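This __init__ wraps the logistic-regression data generator in a DataSet subclass and stashes the true weights for later evaluation. A hedged sketch of how the enclosing class might look and be used; the class name LogisticRegressionDataSet is an assumption, and DataSet, DataCollection and get_logistic_regression_data are assumed importable as in the earlier examples:

class LogisticRegressionDataSet(DataSet):   # hypothetical class name

    def __init__(self, **kwargs):
        x_tr, y_tr, x_ts, y_ts, w_true = get_logistic_regression_data(**kwargs)
        DataSet.__init__(self, DataCollection(x_tr, y_tr), DataCollection(x_ts, y_ts))
        self._w_true = w_true

dataset = LogisticRegressionDataSet(noise_factor=0.1)   # kwargs are forwarded to get_logistic_regression_data
print(dataset.training_set.input.shape)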