Example 1
def get_cifar_10_dataset(n_training_samples=None,
                         n_test_samples=None,
                         normalize_inputs=False):
    """
    :param n_training_samples: Number of training samples, or None to leave it at 50000
    :param n_test_samples: Number of test samples, or None to leave it at 10000
    :param normalize_inputs: True to normalize inputs, and turn them from uint8 to double

    :return: The CIFAR-10 dataset, which consists of 50000 training and 10000 test images.
        Images are 32x32 uint8 RGB images (n_samples, 3, 32, 32) of 10 categories of objects.
        Targets are integer labels in the range [0, 9]
    """
    # TODO: Make method for downloading/unpacking data (from http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz)
    # We have this for single files already, but in this case the gz contains a folder with the files in it.

    directory = get_archive(
        relative_path='data/cifar-10',
        url='http://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz')

    n_batches_to_read = 5 if n_training_samples is None else int(
        np.ceil(n_training_samples / 10000.))

    file_paths = [get_file(os.path.join(directory, 'cifar-10-batches-py', 'data_batch_%s' % (i, ))) for i in xrange(1, n_batches_to_read+1)] \
        + [get_file(os.path.join(directory, 'cifar-10-batches-py', 'test_batch'))]

    data = []
    for file_path in file_paths:
        with open(file_path, 'rb') as f:
            batch_data = pickle.load(f)
            data.append(batch_data)

    x_tr = np.concatenate([d['data'] for d in data[:-1]],
                          axis=0).reshape(-1, 3, 32, 32)
    y_tr = np.concatenate([d['labels'] for d in data[:-1]], axis=0)
    x_ts = data[-1]['data'].reshape(-1, 3, 32, 32)
    y_ts = np.array(data[-1]['labels'])

    if normalize_inputs:
        mean = x_tr.mean(axis=0, keepdims=True)
        std = x_tr.std(axis=0, keepdims=True)
        x_tr = (x_tr - mean) / std
        x_ts = (x_ts - mean) / std

    if n_training_samples is not None:
        x_tr = x_tr[:n_training_samples]
        y_tr = y_tr[:n_training_samples]
    if n_test_samples is not None:
        x_ts = x_ts[:n_test_samples]
        y_ts = y_ts[:n_test_samples]

    return DataSet(training_set=DataCollection(x_tr, y_tr),
                   test_set=DataCollection(x_ts, y_ts),
                   name='CIFAR-10')
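A minimal usage sketch, assuming the function above and its Artemis dependencies (get_archive, get_file, DataSet, DataCollection) are importable:

# Load a small slice of CIFAR-10; shapes follow from the code above.
dataset = get_cifar_10_dataset(n_training_samples=1000, n_test_samples=200, normalize_inputs=True)
# Training inputs: (1000, 3, 32, 32) floats after normalization; test inputs: (200, 3, 32, 32);
# targets: integer labels in the range [0, 9].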
Example 2
def get_normalized_vgg_net(up_to_layer=None, force_shared_parameters=True):
    """
    Load the normalized version of VGG19 discussed here: https://bethgelab.org/deepneuralart/

    """

    norm_vgg19_file = get_file(
        relative_name='data/norm-vgg-19.pkl',
        url = 'https://s3.amazonaws.com/lasagne/recipes/pretrained/imagenet/vgg19_normalized.pkl',
    )
    with open(norm_vgg19_file, 'rb') as f:
        vgg_struct = pickle.load(f)

    layer_names = ['conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
        'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4', 'pool3', 'conv4_1',
        'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4', 'pool4', 'conv5_1', 'relu5_1',
        'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4', 'pool5', 'fc6', 'relu6', 'fc7', 'relu7',
        'fc8', 'prob']

    if isinstance(up_to_layer, list):
        up_to_layer = up_to_layer[np.argmax([layer_names.index(layer_name) for layer_name in up_to_layer])]

    assert up_to_layer is not None and layer_names.index(up_to_layer) < layer_names.index('fc6'), "This can only be used to load the convolutional portion of vggnet.  Set up_to_layer to a layer before 'fc6'."

    net_spec = OrderedDict()
    param_iterator = (p for p in vgg_struct['param values'])
    for layer_name in layer_names:
        if layer_name.startswith('conv'):
            w = param_iterator.next()
            b = param_iterator.next()
            assert w.ndim==4 and b.ndim==1
            layer = ConvolverSpec(w=w, b=b, mode = 'same')
        elif layer_name.startswith('relu'):
            layer = NonlinearitySpec('relu')
        elif layer_name.startswith('pool'):
            layer = PoolerSpec(region=2, stride=2, mode='max')
        elif layer_name.startswith('fc'):
            w = param_iterator.next()
            b = param_iterator.next()
            # Here we'll express the "full" layers as convolutional.
            if layer_name == 'fc6':
                w = w.T.reshape(4096, 512, 7, 7)
            elif layer_name == 'fc7':
                w = w.T.reshape(4096, 4096, 1, 1)
            elif layer_name == 'fc8':
                w = w.T.reshape(1000, 4096, 1, 1)
            else:
                bad_value(layer_name)
            layer = ConvolverSpec(w=w, b=b, mode = 'valid')
        elif layer_name == 'prob':
            layer = NonlinearitySpec('softmax')
        else:
            raise Exception("Don't know how to handle layer: '%s'" % (layer_name, ))
        net_spec[layer_name] = layer
        if layer_name == up_to_layer:
            break

    if up_to_layer is None:
        assert_raises(StopIteration)
    return net_spec
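A usage sketch, assuming get_file and the specifier classes (ConvolverSpec, NonlinearitySpec, PoolerSpec) are importable; note that the assert above restricts up_to_layer to layers before 'fc6':

# Build the normalized VGG-19 spec up to a mid-level convolutional layer.
net_spec = get_normalized_vgg_net(up_to_layer='conv4_2')
for name, spec in net_spec.items():
    print('%s: %s' % (name, type(spec).__name__))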
Example 3
def smart_file(location, use_cache=False, make_dir=False):
    """
    :param location: Specifies where the file is.
        If it's formatted as a url, it's downloaded.
        If it begins with a "/", it's assumed to be a local path.
        Otherwise, it is assumed to be referenced relative to the data directory.
    :param use_cache: If True, and the location is a url, make a local cache of the file for future use (note: if the
        file at this url changes, the cached file will not).
    :param make_dir: Make the directory for this file, if it does not exist.
    :yield: The local path to the file.
    """
    its_a_url = is_url(location)
    if its_a_url:
        assert not make_dir, "We cannot 'make the directory' for a URL"
        local_path = get_file(url=location, use_cache=use_cache)

    else:
        local_path = get_artemis_data_path(location)
        if make_dir:
            make_file_dir(local_path)

    yield local_path

    if its_a_url and not use_cache:
        os.remove(local_path)
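Since smart_file yields rather than returns, it is presumably meant to be used as a context manager; a sketch assuming it is wrapped with contextlib.contextmanager (the decorator is not shown in this snippet) and using a placeholder URL:

from contextlib import contextmanager

smart_file_cm = contextmanager(smart_file)

# With use_cache=False, a downloaded file is removed again when the block exits.
with smart_file_cm('http://example.com/some_file.txt') as local_path:
    with open(local_path) as f:
        contents = f.read()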
Example 4
def get_vgg_layer_specifiers(up_to_layer=None):
    """
    Load the 19-layer VGGNet from the mat file and produce a list of layer specifications which can be used to create
    layers in your architecture of choice.
    Info: https://gist.github.com/ksimonyan/3785162f95cd2d5fee77#file-readme-md
    More Details: http://cs231n.github.io/convolutional-networks/#case

    :param up_to_layer: The layer to stop at.  Or a list of layers, in which case the network will go to the highest.
        Layers are identified by their string names:
        ['conv1_1', 'relu1_1', 'conv1_2', 'relu1_2', 'pool1', 'conv2_1', 'relu2_1', 'conv2_2', 'relu2_2', 'pool2',
        'conv3_1', 'relu3_1', 'conv3_2', 'relu3_2', 'conv3_3', 'relu3_3', 'conv3_4', 'relu3_4', 'pool3', 'conv4_1',
        'relu4_1', 'conv4_2', 'relu4_2', 'conv4_3', 'relu4_3', 'conv4_4', 'relu4_4', 'pool4', 'conv5_1', 'relu5_1',
        'conv5_2', 'relu5_2', 'conv5_3', 'relu5_3', 'conv5_4', 'relu5_4', 'pool5', 'fc6', 'relu6', 'fc7', 'relu7',
        'fc8', 'prob']
    :return: An OrderedDict<str,PrimativeSpecifier> where PrimativeSpecifier objects represent the layers of the network.
    """

    filename = get_file(
        relative_name='data/vgg-19.mat',
        url='http://www.vlfeat.org/matconvnet/models/imagenet-vgg-verydeep-19.mat',
    )
    network_params = loadmat(filename)

    def struct_to_layer(struct):
        layer_type = struct[1][0]
        layer_name = str(struct[0][0])
        assert isinstance(layer_type, basestring)
        if layer_type == 'conv':
            w_orig = struct[2][0, 0]  # (n_rows, n_cols, n_in_maps, n_out_maps)
            w = w_orig.T.swapaxes(2, 3)
            b = struct[2][0, 1][:, 0]
            layer = ConvolverSpec(w=w, b=b, mode = 'valid' if layer_name.startswith('fc') else 'same' if layer_name.startswith('conv') else bad_value(layer_name))
        elif layer_type in ('relu', 'softmax'):
            layer = NonlinearitySpec(layer_type)
        elif layer_type == 'pool':
            layer = PoolerSpec(
                region = tuple(struct[3][0].astype(int)),
                stride = tuple(struct[4][0].astype(int)),
                mode=struct[2][0])
        else:
            raise Exception(
                "Don't know about this '%s' layer type." % layer_type)
        return layer_name, layer

    print('Loading VGG Net...')
    network_layers = OrderedDict(struct_to_layer(network_params['layers'][0, i][
                                 0, 0]) for i in xrange(network_params['layers'].shape[1]))

    if up_to_layer is not None:
        if isinstance(up_to_layer, (list, tuple)):
            up_to_layer = network_layers.keys()[max(
                network_layers.keys().index(layer_name) for layer_name in up_to_layer)]
        layer_names = [network_params['layers'][0, i][0, 0][0][0]
                       for i in xrange(network_params['layers'].shape[1])]
        network_layers = OrderedDict((k, network_layers[k]) for k in layer_names[
                                     :layer_names.index(up_to_layer) + 1])
    print('Done.')
    return network_layers
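A usage sketch, assuming get_file, loadmat and the specifier classes are available; passing a list of layer names would stop at the deepest one named:

# Build layer specifiers up to relu4_2 only.
layers = get_vgg_layer_specifiers(up_to_layer='relu4_2')
print('Loaded layers: %s' % (list(layers.keys()), ))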
Example 5
def get_vggnet_labels():
    file_loc = get_file(
        relative_name='data/labels.txt',
        url = 'https://raw.githubusercontent.com/HoldenCaulfieldRye/caffe/master/data/ilsvrc12/synset_words.txt')
    with open(file_loc) as f:
        lines = f.readlines()
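    # Each line has the form 'n01440764 tench, Tinca tinca': drop the 9-character synset id
    # plus the following space (10 characters in total) and the trailing newline.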
    labels = [line[10:-1] for line in lines]
    return labels
Example 6
def get_imagenet_label_names():

    url = 'https://gist.githubusercontent.com/yrevar/942d3a0ac09ec9e5eb3a/raw/596b27d23537e5a1b5751d2b0481ef172f58b539/imagenet1000_clsid_to_human.txt'
    with open(get_file('data/imagenet/labels.json', url=url)) as f:
        label_items = f.read()

    labels = [line[line.index(':')+1:].lstrip(' \'').rstrip('}, \'') for line in label_items.split('\n')]
    return labels
Example 7
def _read_formatted_file(file_relative_path):

    with open(get_file(file_relative_path)) as f:
        text = f.read()
    pairs = [line.split('\t') for line in text.split('\n')[:-1]]
    labels = [group for group, _ in pairs]
    words = [sentence.split(' ') for _, sentence in pairs]
    return words, labels
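The expected file format is one 'label<TAB>sentence' pair per line; a small sketch of the same parsing logic on an inline string, under that assumption:

text = "sci.space\torbit burn complete\nrec.sport.hockey\tgreat goal tonight\n"
pairs = [line.split('\t') for line in text.split('\n')[:-1]]
labels = [group for group, _ in pairs]                   # ['sci.space', 'rec.sport.hockey']
words = [sentence.split(' ') for _, sentence in pairs]   # [['orbit', 'burn', 'complete'], ...]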
Example 8
def get_imagenet_images(indices):
    """
    Get imagenet images at the given indices
    :param indices: A sequence of integer indices into the fall11 URL list (see get_imagenet_fall11_urls).
    :return: A list with the corresponding images, loaded via smart_load.
    """
    highest_index = np.max(indices)
    code_url_pairs = get_imagenet_fall11_urls(highest_index+1)
    files = [get_file('data/imagenet/%s%s' % (code_url_pairs[index][0], os.path.splitext(code_url_pairs[index][1])[1]), code_url_pairs[index][1]) for index in indices]
    return [smart_load(f) for f in files]
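A usage sketch, assuming get_imagenet_fall11_urls (shown later in this section), get_file and smart_load are importable, and that smart_load returns image arrays:

# Fetch three images by their index in the fall11 URL list.
images = get_imagenet_images([0, 10, 20])
for im in images:
    print(im.shape)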
Example 9
def test_get_file():
    with hold_file_root(get_artemis_data_path('file_getter_tests'), delete_after=True, delete_before=True):
        print('Testing get_file on unnamed file')
        path = get_file(url='https://drive.google.com/uc?export=download&id=1uC9sJ04V7VjzMj32q4-OLEnRFPvQpYtp')
        with open(path) as f:
            assert f.read()=='a,b,c'

        # Should not download this time
        path = get_file(url='https://drive.google.com/uc?export=download&id=1uC9sJ04V7VjzMj32q4-OLEnRFPvQpYtp')
        with open(path) as f:
            assert f.read()=='a,b,c'

        print('Testing get_file on named file')
        path = get_file(relative_name='my-test.txt', url='https://drive.google.com/uc?export=download&id=1uC9sJ04V7VjzMj32q4-OLEnRFPvQpYtp')
        with open(path) as f:
            assert f.read()=='a,b,c'

        # Should not download this time
        path = get_file(relative_name='my-test.txt', url='https://drive.google.com/uc?export=download&id=1uC9sJ04V7VjzMj32q4-OLEnRFPvQpYtp')
        with open(path) as f:
            assert f.read()=='a,b,c'
Example 10
def get_mnist_dataset(n_training_samples=None,
                      n_test_samples=None,
                      flat=False,
                      join_train_and_val=False,
                      binarize=False):
    """
    The MNIST DataSet - the Drosophila of machine learning.

    :param n_training_samples: Cap on the number of training samples
    :param n_test_samples: Cap on the number of test samples
    :param flat: Set to True if we just want flat 784-dimensional input data instead of 28x28 images.
    :param join_train_and_val: If true, merge the validation set into the training set.  (giving 60000 training examples,
        and 10000 test examples).  Otherwise you'll have 50000 training, 10000 validation, 10000 test.
    :param binarize: Binarize inputs by thresholding them at 0.5
    :return: A DataSet object containing the MNIST data
    """
    filename = get_file(relative_name='data/mnist.pkl',
                        url='http://deeplearning.net/data/mnist/mnist.pkl.gz',
                        data_transformation=unzip_gz)

    with open(filename, 'rb') as f:
        # data = pickle.load(f, encoding='latin1')
        data = np.load(f, encoding='latin1')

    x_tr, y_tr = data[0] if n_training_samples is None else (
        data[0][0][:n_training_samples], data[0][1][:n_training_samples])
    x_ts, y_ts = data[1] if n_test_samples is None else (
        data[1][0][:n_test_samples], data[1][1][:n_test_samples])
    x_vd, y_vd = data[2]
    if not flat:
        x_tr = x_tr.reshape(-1, 28, 28)
        x_ts = x_ts.reshape(-1, 28, 28)
        x_vd = x_vd.reshape(-1, 28, 28)
    if binarize:
        x_tr = x_tr > 0.5
        x_ts = x_ts > 0.5
        x_vd = x_vd > 0.5

    return \
        DataSet(
            training_set=DataCollection(np.concatenate([x_tr, x_vd], axis=0), np.concatenate([y_tr, y_vd], axis=0)),
            test_set=DataCollection(x_ts, y_ts)
            ) \
        if join_train_and_val else \
        DataSet(
            training_set=DataCollection(x_tr, y_tr),
            test_set=DataCollection(x_ts, y_ts),
            validation_set=DataCollection(x_vd, y_vd)
            )
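A usage sketch, assuming the DataSet/DataCollection API above:

# Flat, binarized MNIST with the validation set merged into the training set.
mnist = get_mnist_dataset(flat=True, join_train_and_val=True, binarize=True)
# Per the docstring, this gives 60000 training and 10000 test examples; with flat=True the
# inputs are 784-dimensional, and with binarize=True they are boolean.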
Example 11
def get_imagenet_fall11_urls(n_images=None):

    if n_images is None:
        n_images = 14197121

    imagenet_urls = get_file(
        'data/imagnet_urls.txt',
        url='http://image-net.org/imagenet_data/urls/imagenet_fall11_urls.tgz',
        data_transformation=unzip_gz)

    print('Loading %s image URLs....' % (n_images, ))
    with open(imagenet_urls) as f:
        f.readline()
        lines = list(line for _, line in izip(xrange(n_images), f))
    indices = [s.index('\t') for s in lines]
    pairs = [(line[:s], line[s + 1:-1]) for line, s in zip(lines, indices)]
    print('Done.')
    return pairs
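A usage sketch; each returned pair is (image code, image url):

# Read only the first 100 (code, url) pairs instead of the full list.
pairs = get_imagenet_fall11_urls(n_images=100)
code, url = pairs[0]
print('%s -> %s' % (code, url))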
Example 12
def read_the_bible(max_characters=None):
    """
    Returns the King James Bible as a single string.
    Thanks to Janel (http://janelwashere.com/pages/bible_daily_reading.html) for compiling it.
    :param max_characters: You have the option to truncate it to a length of max_characters
        (If you're Jewish, for instance)
    :return: A string.
    """

    filename = get_file(
        relative_name='data/king_james_bible.txt',
        url='http://janelwashere.com/files/bible_daily.txt',
    )

    with open(filename) as f:
        text = f.read(-1 if max_characters is None else max_characters)

    return text
Example 13
def read_fifty_shades_of_grey(max_characters=None):
    """
    Returns Fifty Shades of Grey, by E.L. James.
    :param max_characters: You have the option to truncate the text to a length of max_characters
    :return: A string.
    """

    filename = get_file(relative_name='data/fifty_shades_of_grey.txt',
                        url=None)

    with open(filename) as f:
        text = f.read(-1 if max_characters is None else max_characters)

    # Need to remove some weird non-ascii stuff.
    # http://stackoverflow.com/questions/20078816/replace-non-ascii-characters-with-a-single-space
    text = re.sub(r'[^\x00-\x7F]+', ' ', text)

    return text
Example 14
def get_image(name, size = None):
    """
    Get an image by name.
    :param name: A string identifying an image from our dictionary.
    :param size: If given, resize the image (the target size is computed by get_new_size).
    :return: The image as an (n_rows, n_cols, 3) array.
    """
    assert name in IMAGE_COLLECTION, "We don't have the image '%s' in the gallery" % (name, )
    _, ext = os.path.splitext(IMAGE_COLLECTION[name])
    relative_path = os.path.join('images', name)+ext
    filename = get_file(
        relative_name = relative_path,
        url = IMAGE_COLLECTION[name],
        )
    im_array = imread(filename)
    if im_array.ndim==2:
        im_array = im_array[:, :, None] + [0, 0, 0]
    if size is not None:
        im_array = imresize(im_array, get_new_size(im_array.shape[:2], new_size=size))
    return im_array
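A usage sketch; valid names depend on the IMAGE_COLLECTION dictionary, so the name below is a placeholder:

im = get_image('some_image_name', size=(224, 224))  # placeholder name: must be a key of IMAGE_COLLECTION
print(im.shape)  # an (n_rows, n_cols, 3) array; greyscale sources are broadcast to 3 channels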
Example 15
def get_20_newsgroups_dataset(filter_most_common=2000,
                              numeric=False,
                              shuffling_seed=1234,
                              bag_of_words=False,
                              count_scaling=None):
    """
    The 20 newsgroups dataset.  The task is to predict which newsgroup (topic) a post belongs to, from the words
    contained in the post.

    Words have been preprocessed to the "stemmed" version, as explained on the website:
    http://ana.cachopo.org/datasets-for-single-label-text-categorization

    :param filter_most_common: Can be:
        None: Don't filter out words
        int N: Filter out words that are not in the N most common words
        (int N, int M): Filter out words that are not between the Nth and Mth most common words.
    :param numeric: Convert everything from words to numbers
    :param shuffling_seed: Random seed for shuffling (you want to shuffle, because everything's sorted by topic)
    :param bag_of_words: Return count vectors for each word
    :param count_scaling: If using bag_of_words, apply the transformation:
        vector = log(1+word_counts)
        To generate the input data (this scaling makes it more suitable for some types of classifiers).
    :return: A DataSet object
    """

    training_set_file = get_file(
        relative_name='data/20ng-train-stemmed.txt',
        url=
        'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-train-stemmed.txt'
    )

    test_set_file = get_file(
        relative_name='data/20ng-test-stemmed.txt',
        url=
        'http://ana.cachopo.org/datasets-for-single-label-text-categorization/20ng-test-stemmed.txt'
    )

    train_words, train_labels = _read_formatted_file(training_set_file)
    test_words, test_labels = _read_formatted_file(test_set_file)

    # Shuffle it up...
    rng = np.random.RandomState(shuffling_seed)
    train_words, train_labels = _shuffle((train_words, train_labels), rng)
    test_words, test_labels = _shuffle((test_words, test_labels), rng)

    # Filter out most-common-but-not-too-common-words
    all_train_words = np.concatenate(train_words)
    filtered_vocab, counts = _find_most_common(all_train_words,
                                               filter_most_common)
    train_words = _filter_lists_of_words(train_words, filtered_vocab)
    test_words = _filter_lists_of_words(test_words, filtered_vocab)

    if numeric or bag_of_words:
        train_ixs_list = _list_of_posts_to_list_of_ixs(train_words,
                                                       filtered_vocab)
        test_ixs_list = _list_of_posts_to_list_of_ixs(test_words,
                                                      filtered_vocab)
        label_vocab = {lab: i for i, lab in enumerate(np.unique(train_labels))}
        train_labels = _words_to_ints(train_labels, label_vocab)
        test_labels = _words_to_ints(test_labels, label_vocab)

        if bag_of_words:
            train_counts = _list_of_ixs_to_count_matrix(
                train_ixs_list, n_words=len(filtered_vocab))
            test_counts = _list_of_ixs_to_count_matrix(
                test_ixs_list, n_words=len(filtered_vocab))
            if count_scaling == 'log':
                train_counts = np.log(1 + train_counts)
                test_counts = np.log(1 + test_counts)
            return DataSet.from_xyxy(training_inputs=train_counts,
                                     training_targets=train_labels,
                                     test_inputs=test_counts,
                                     test_targets=test_labels)
        else:
            return DataSet.from_xyxy(training_inputs=train_ixs_list,
                                     training_targets=train_labels,
                                     test_inputs=test_ixs_list,
                                     test_targets=test_labels)
    else:
        return DataSet.from_xyxy(training_inputs=train_words,
                                 training_targets=train_labels,
                                 test_inputs=test_words,
                                 test_targets=test_labels)
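A usage sketch of the bag-of-words path, assuming the private helpers (_read_formatted_file, _shuffle, _find_most_common, etc.) are available alongside the function above:

# Log-scaled bag-of-words counts over the 2000 most common stemmed words.
dataset = get_20_newsgroups_dataset(filter_most_common=2000, bag_of_words=True, count_scaling='log')
# Inputs are (n_posts, 2000) arrays of log(1 + word_count); targets are integer topic ids.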