def test_extract_reassemble():
    """ Tests that ExtractGridPatches and ReassembleGridPatches are
    inverse of each other """

    rng = np.random.RandomState([1, 3, 7])

    topo = rng.randn(4, 3 * 5, 3 * 7, 2)

    dataset = DenseDesignMatrix(topo_view=topo)

    patch_shape = (3, 7)

    extractor = ExtractGridPatches(patch_shape, patch_shape)
    reassemblor = ReassembleGridPatches(patch_shape=patch_shape,
                                        orig_shape=topo.shape[1:3])

    dataset.apply_preprocessor(extractor)
    dataset.apply_preprocessor(reassemblor)

    new_topo = dataset.get_topological_view()

    assert new_topo.shape == topo.shape

    if not np.all(new_topo == topo):
        assert False
Beispiel #2
0
    elif stl10:
        print 'STL10 detected'
        dataset = serial.load(
            '${PYLEARN2_DATA_PATH}/stl10/stl10_32x32/train.pkl')
    X = dataset.get_design_matrix()[batch_start:batch_start + batch_size, :]

    size = np.sqrt(model.nvis / 3)

    if cifar10 or cifar100:
        pv1 = make_viewer((X - 127.5) / 127.5, is_color=True, rescale=False)
    elif stl10:
        pv1 = make_viewer(X / 127.5, is_color=True, rescale=False)

    dataset.set_design_matrix(X)

    patchifier = ExtractGridPatches(patch_shape=(size, size),
                                    patch_stride=(1, 1))

    if size == 8:
        if cifar10:
            pipeline = serial.load(
                '${GOODFELI_TMP}/cifar10_preprocessed_pipeline_2M.pkl')
        elif stl10:
            assert False
    elif size == 6:
        if cifar10:
            pipeline = serial.load(
                '${GOODFELI_TMP}/cifar10_preprocessed_pipeline_2M_6x6.pkl')
        elif cifar100:
            pipeline = serial.load(
                '/data/lisa/data/cifar100/cifar100_patches/preprocessor.pkl')
        elif stl10:
Beispiel #3
0
    def _execute(self):

        global pooling_matrix
        save_path = self.save_path
        batch_size = self.batch_size
        feature_type = self.feature_type
        dataset_family = self.dataset_family
        which_set = self.which_set
        model = self.model
        size = self.size

        nan = 0

        dataset_descriptor = dataset_family[which_set][size]

        dataset = dataset_descriptor.dataset_maker()
        expected_num_examples = dataset_descriptor.num_examples

        full_X = dataset.get_design_matrix()
        num_examples = full_X.shape[0]
        assert num_examples == expected_num_examples

        if self.restrict is not None:
            assert self.restrict[1] <= full_X.shape[0]

            print 'restricting to examples ', self.restrict[
                0], ' through ', self.restrict[1], ' exclusive'
            full_X = full_X[self.restrict[0]:self.restrict[1], :]

            assert self.restrict[1] > self.restrict[0]

        #update for after restriction
        num_examples = full_X.shape[0]

        assert num_examples > 0

        dataset.X = None
        dataset.design_loc = None
        dataset.compress = False

        patchifier = ExtractGridPatches(patch_shape=(size, size),
                                        patch_stride=(1, 1))

        pipeline = serial.load(dataset_descriptor.pipeline_path)

        assert isinstance(pipeline.items[0], ExtractPatches)
        pipeline.items[0] = patchifier

        print 'defining features'
        V = T.matrix('V')
        model.make_pseudoparams()
        d = model.e_step.variational_inference(V=V)

        H = d['H_hat']
        Mu1 = d['S_hat']

        assert H.dtype == 'float32'
        assert Mu1.dtype == 'float32'

        if self.feature_type == 'map_hs':
            feat = (H > 0.5) * Mu1
        elif self.feature_type == 'map_h':
            feat = T.cast(H > 0.5, dtype='float32')
        elif self.feature_type == 'exp_hs':
            feat = H * Mu1
        elif self.feature_type == 'exp_h':
            feat = H
        elif self.feature_type == 'exp_h_thresh':
            feat = H * (H > .01)
        else:
            raise NotImplementedError()

        assert feat.dtype == 'float32'
        print 'compiling theano function'
        f = function([V], feat)

        if config.device.startswith('gpu') and model.nhid >= 4000:
            f = halver(f, model.nhid)

        topo_feat_var = T.TensorType(broadcastable=(False, False, False,
                                                    False),
                                     dtype='float32')()
        region_features = function([topo_feat_var],
                                   topo_feat_var.mean(axis=(1, 2)))

        def average_pool(stride):
            def point(p):
                return p * ns / stride

            rval = np.zeros(
                (topo_feat.shape[0], stride, stride, topo_feat.shape[3]),
                dtype='float32')

            for i in xrange(stride):
                for j in xrange(stride):
                    rval[:, i, j, :] = region_features(
                        topo_feat[:,
                                  point(i):point(i + 1),
                                  point(j):point(j + 1), :])

            return rval

        num_superpixels = 7
        output = np.zeros((num_examples, pooling_matrix.shape[0]),
                          dtype='float32')

        fd = DenseDesignMatrix(X=np.zeros((1, 1), dtype='float32'),
                               view_converter=DefaultViewConverter(
                                   [1, 1, model.nhid]))

        ns = 32 - size + 1
        depatchifier = ReassembleGridPatches(orig_shape=(ns, ns),
                                             patch_shape=(1, 1))

        if len(range(0, num_examples - batch_size + 1, batch_size)) <= 0:
            print num_examples
            print batch_size

        for i in xrange(0, num_examples - batch_size + 1, batch_size):
            print i
            t1 = time.time()

            d = copy.copy(dataset)
            d.set_design_matrix(full_X[i:i + batch_size, :])

            t2 = time.time()

            #print '\tapplying preprocessor'
            d.apply_preprocessor(pipeline, can_fit=False)
            X2 = d.get_design_matrix()

            t3 = time.time()

            #print '\trunning theano function'
            feat = f(X2)

            t4 = time.time()

            assert feat.dtype == 'float32'

            feat_dataset = copy.copy(fd)

            if np.any(np.isnan(feat)):
                nan += np.isnan(feat).sum()
                feat[np.isnan(feat)] = 0

            feat_dataset.set_design_matrix(feat)

            #print '\treassembling features'
            feat_dataset.apply_preprocessor(depatchifier)

            #print '\tmaking topological view'
            topo_feat = feat_dataset.get_topological_view()
            assert topo_feat.shape[0] == batch_size

            t5 = time.time()

            #average pooling
            superpixels = average_pool(num_superpixels)

            pooled = pooling_matrix.dot(superpixels.T).T

            output[i:i + batch_size, :] = pooled

            t6 = time.time()

            print(t6 - t1, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5)

        if self.chunk_size is not None:
            assert save_path.endswith('.npy')
            save_path_pieces = save_path.split('.npy')
            assert len(save_path_pieces) == 2
            assert save_path_pieces[1] == ''
            save_path = save_path_pieces[0] + '_' + chr(
                ord('A') + self.chunk_id) + '.npy'
        np.save(save_path, output)

        if nan > 0:
            warnings.warn(str(nan) + ' features were nan')
    def _execute(self):

        global num_superpixels
        num_output_features = self.num_output_features
        idxs = self.idxs
        top = self.top
        bottom = self.bottom
        left = self.left
        right = self.right

        save_path = self.save_path
        batch_size = self.batch_size
        dataset_family = self.dataset_family
        which_set = self.which_set
        model = self.model
        size = self.size

        nan = 0

        dataset_descriptor = dataset_family[which_set][size]

        dataset = dataset_descriptor.dataset_maker()
        expected_num_examples = dataset_descriptor.num_examples

        full_X = dataset.get_design_matrix()
        num_examples = full_X.shape[0]
        assert num_examples == expected_num_examples

        if self.restrict is not None:
            assert self.restrict[1] <= full_X.shape[0]

            print 'restricting to examples ', self.restrict[
                0], ' through ', self.restrict[1], ' exclusive'
            full_X = full_X[self.restrict[0]:self.restrict[1], :]

            assert self.restrict[1] > self.restrict[0]

        #update for after restriction
        num_examples = full_X.shape[0]

        assert num_examples > 0

        dataset.X = None
        dataset.design_loc = None
        dataset.compress = False

        patchifier = ExtractGridPatches(patch_shape=(size, size),
                                        patch_stride=(1, 1))

        pipeline = serial.load(dataset_descriptor.pipeline_path)

        assert isinstance(pipeline.items[0], ExtractPatches)
        pipeline.items[0] = patchifier

        print 'defining features'
        V = T.matrix('V')

        mu = model.mu

        feat = triangle_code(V, mu)

        assert feat.dtype == 'float32'
        print 'compiling theano function'
        f = function([V], feat)

        nhid = model.mu.get_value().shape[0]

        if config.device.startswith('gpu') and nhid >= 4000:
            f = halver(f, model.nhid)

        topo_feat_var = T.TensorType(broadcastable=(False, False, False,
                                                    False),
                                     dtype='float32')()
        if self.pool_mode == 'mean':
            region_features = function([topo_feat_var],
                                       topo_feat_var.mean(axis=(1, 2)))
        elif self.pool_mode == 'max':
            region_features = function([topo_feat_var],
                                       topo_feat_var.max(axis=(1, 2)))
        else:
            assert False

        def average_pool(stride):
            def point(p):
                return p * ns / stride

            rval = np.zeros(
                (topo_feat.shape[0], stride, stride, topo_feat.shape[3]),
                dtype='float32')

            for i in xrange(stride):
                for j in xrange(stride):
                    rval[:, i, j, :] = region_features(
                        topo_feat[:,
                                  point(i):point(i + 1),
                                  point(j):point(j + 1), :])

            return rval

        output = np.zeros((num_examples, num_output_features), dtype='float32')

        fd = DenseDesignMatrix(X=np.zeros((1, 1), dtype='float32'),
                               view_converter=DefaultViewConverter(
                                   [1, 1, nhid]))

        ns = 32 - size + 1
        depatchifier = ReassembleGridPatches(orig_shape=(ns, ns),
                                             patch_shape=(1, 1))

        if len(range(0, num_examples - batch_size + 1, batch_size)) <= 0:
            print num_examples
            print batch_size

        for i in xrange(0, num_examples - batch_size + 1, batch_size):
            print i
            t1 = time.time()

            d = copy.copy(dataset)
            d.set_design_matrix(full_X[i:i + batch_size, :])

            t2 = time.time()

            #print '\tapplying preprocessor'
            d.apply_preprocessor(pipeline, can_fit=False)
            X2 = d.get_design_matrix()

            t3 = time.time()

            #print '\trunning theano function'
            feat = f(X2)

            t4 = time.time()

            assert feat.dtype == 'float32'

            feat_dataset = copy.copy(fd)

            if np.any(np.isnan(feat)):
                nan += np.isnan(feat).sum()
                feat[np.isnan(feat)] = 0

            feat_dataset.set_design_matrix(feat)

            #print '\treassembling features'
            feat_dataset.apply_preprocessor(depatchifier)

            #print '\tmaking topological view'
            topo_feat = feat_dataset.get_topological_view()
            assert topo_feat.shape[0] == batch_size

            t5 = time.time()

            #average pooling
            superpixels = average_pool(num_superpixels)

            assert batch_size == 1

            if self.pool_mode == 'mean':
                for j in xrange(num_output_features):
                    output[i:i + batch_size,
                           j] = superpixels[:, top[j]:bottom[j] + 1,
                                            left[j]:right[j] + 1,
                                            idxs[j]].mean()
            elif self.pool_mode == 'max':
                for j in xrange(num_output_features):
                    output[i:i + batch_size,
                           j] = superpixels[:, top[j]:bottom[j] + 1,
                                            left[j]:right[j] + 1,
                                            idxs[j]].max()
            else:
                assert False

            assert output[i:i + batch_size, :].max() < 1e20

            t6 = time.time()

            print(t6 - t1, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5)

        if self.chunk_size is not None:
            assert save_path.endswith('.npy')
            save_path_pieces = save_path.split('.npy')
            assert len(save_path_pieces) == 2
            assert save_path_pieces[1] == ''
            save_path = save_path_pieces[0] + '_' + chr(
                ord('A') + self.chunk_id) + '.npy'
        np.save(save_path, output)

        if nan > 0:
            warnings.warn(str(nan) + ' features were nan')
Beispiel #5
0
    def _execute(self):

        batch_size = self.batch_size
        feature_type = self.feature_type
        pooling_region_counts = self.pooling_region_counts
        dataset_family = self.dataset_family
        which_set = self.which_set
        model = self.model
        size = self.size

        nan = 0

        dataset_descriptor = dataset_family[which_set][size]

        dataset = dataset_descriptor.dataset_maker()
        expected_num_examples = dataset_descriptor.num_examples

        full_X = dataset.get_design_matrix()
        assert full_X.dtype == 'float32'
        num_examples = full_X.shape[0]
        assert num_examples == expected_num_examples

        print 'restricting to examples from classes 0 and 1'
        full_X = full_X[dataset.y_fine < 2, :]

        #update for after restriction
        num_examples = full_X.shape[0]

        assert num_examples > 0

        dataset.X = None
        dataset.design_loc = None
        dataset.compress = False

        patchifier = ExtractGridPatches(patch_shape=(size, size),
                                        patch_stride=(1, 1))

        pipeline = serial.load(dataset_descriptor.pipeline_path)

        assert isinstance(pipeline.items[0], ExtractPatches)
        pipeline.items[0] = patchifier

        print 'defining features'
        V = T.matrix('V')
        assert V.type.dtype == 'float32'
        model.make_pseudoparams()
        d = model.infer(V=V)

        H = d['H_hat']
        Mu1 = d['S_hat']
        G = d['G_hat']
        if len(G) != 1:
            raise NotImplementedError(
                "only supports two layer pd-dbms for now")
        G, = G

        assert H.dtype == 'float32'
        assert Mu1.dtype == 'float32'

        nfeat = model.s3c.nhid + model.dbm.rbms[0].nhid

        if self.feature_type == 'map_hs':
            feat = (H > 0.5) * Mu1
            raise NotImplementedError("doesn't support layer 2")
        elif self.feature_type == 'map_h':
            feat = T.cast(H > 0.5, dtype='float32')
            raise NotImplementedError("doesn't support layer 2")
        elif self.feature_type == 'exp_hs':
            feat = H * Mu1
            raise NotImplementedError("doesn't support layer 2")
        elif self.feature_type == 'exp_hs_split':
            Z = H * Mu1
            pos = T.clip(Z, 0., 1e32)
            neg = T.clip(-Z, 0, 1e32)
            feat = T.concatenate((pos, neg), axis=1)
            nfeat *= 2
            raise NotImplementedError("doesn't support layer 2")
        elif self.feature_type == 'exp_h,exp_g':
            feat = T.concatenate((H, G), axis=1)
        elif self.feature_type == 'exp_h_thresh':
            feat = H * (H > .01)
            raise NotImplementedError("doesn't support layer 2")
        else:
            raise NotImplementedError()

        assert feat.dtype == 'float32'
        print 'compiling theano function'
        f = function([V], feat)

        if config.device.startswith('gpu') and nfeat >= 4000:
            f = halver(f, nfeat)

        topo_feat_var = T.TensorType(broadcastable=(False, False, False,
                                                    False),
                                     dtype='float32')()
        if self.pool_mode == 'mean':
            region_feat_var = topo_feat_var.mean(axis=(1, 2))
        elif self.pool_mode == 'max':
            region_feat_var = topo_feat_var.max(axis=(1, 2))
        else:
            raise ValueError("Unknown pool mode: " + self.pool_mode)
        region_features = function([topo_feat_var], region_feat_var)

        def average_pool(stride):
            def point(p):
                return p * ns / stride

            rval = np.zeros(
                (topo_feat.shape[0], stride, stride, topo_feat.shape[3]),
                dtype='float32')

            for i in xrange(stride):
                for j in xrange(stride):
                    rval[:, i, j, :] = region_features(
                        topo_feat[:,
                                  point(i):point(i + 1),
                                  point(j):point(j + 1), :])

            return rval

        outputs = [
            np.zeros((num_examples, count, count, nfeat), dtype='float32')
            for count in pooling_region_counts
        ]

        assert len(outputs) > 0

        fd = DenseDesignMatrix(X=np.zeros((1, 1), dtype='float32'),
                               view_converter=DefaultViewConverter(
                                   [1, 1, nfeat]))

        ns = 32 - size + 1
        depatchifier = ReassembleGridPatches(orig_shape=(ns, ns),
                                             patch_shape=(1, 1))

        if len(range(0, num_examples - batch_size + 1, batch_size)) <= 0:
            print num_examples
            print batch_size

        for i in xrange(0, num_examples - batch_size + 1, batch_size):
            print i
            t1 = time.time()

            d = copy.copy(dataset)
            d.set_design_matrix(full_X[i:i + batch_size, :])

            t2 = time.time()

            #print '\tapplying preprocessor'
            d.apply_preprocessor(pipeline, can_fit=False)
            X2 = np.cast['float32'](d.get_design_matrix())

            t3 = time.time()

            #print '\trunning theano function'
            feat = f(X2)

            t4 = time.time()

            assert feat.dtype == 'float32'

            feat_dataset = copy.copy(fd)

            if np.any(np.isnan(feat)):
                nan += np.isnan(feat).sum()
                feat[np.isnan(feat)] = 0

            feat_dataset.set_design_matrix(feat)

            #print '\treassembling features'
            feat_dataset.apply_preprocessor(depatchifier)

            #print '\tmaking topological view'
            topo_feat = feat_dataset.get_topological_view()
            assert topo_feat.shape[0] == batch_size

            t5 = time.time()

            #average pooling
            for output, count in zip(outputs, pooling_region_counts):
                output[i:i + batch_size, ...] = average_pool(count)

            t6 = time.time()

            print(t6 - t1, t2 - t1, t3 - t2, t4 - t3, t5 - t4, t6 - t5)

        for output, save_path in zip(outputs, self.save_paths):
            np.save(save_path, output)

        if nan > 0:
            warnings.warn(str(nan) + ' features were nan')
Beispiel #6
0
    def __init__(self,
                 which_set,
                 center=False,
                 rescale=False,
                 gcn=None,
                 one_hot=False,
                 start=None,
                 stop=None,
                 axes=('b', 0, 1, 'c'),
                 toronto_prepro=False,
                 preprocessor=None,
                 patch=None):
        """
        Parameters
        ----------
        which_set : str
            One of 'train', 'test'
        gcn : float, optional
            Multiplicative constant to use for global contrast normalization.
            No global contrast normalization is applied, if None

        .. todo::

            WRITEME
        """

        # note: there is no such thing as the cifar10 validation set;
        # pylearn1 defined one but really it should be user-configurable
        # (as it is here)

        self.axes = axes

        # we define here:
        dtype = 'uint8'
        ntrain = 50000
        nvalid = 0  # artefact, we won't use it
        ntest = 10000

        # we also expose the following details:
        self.img_shape = (3, 32, 32)
        self.img_size = N.prod(self.img_shape)
        self.n_classes = 10
        self.label_names = [
            'airplane', 'automobile', 'bird', 'cat', 'deer', 'dog', 'frog',
            'horse', 'ship', 'truck'
        ]

        # prepare loading
        fnames = ['data_batch_%i' % i for i in range(1, 6)]
        lenx = N.ceil((ntrain + nvalid) / 10000.) * 10000
        x = N.zeros((lenx, self.img_size), dtype=dtype)
        y = N.zeros(lenx, dtype=dtype)

        # load train data
        nloaded = 0
        for i, fname in enumerate(fnames):
            data = GridPatchCIFAR10._unpickle(fname)
            x[i * 10000:(i + 1) * 10000, :] = data['data']
            y[i * 10000:(i + 1) * 10000] = data['labels']
            nloaded += 10000
            if nloaded >= ntrain + nvalid + ntest: break

        # load test data
        data = GridPatchCIFAR10._unpickle('test_batch')

        # process this data
        Xs = {'train': x[0:ntrain], 'test': data['data'][0:ntest]}

        Ys = {'train': y[0:ntrain], 'test': data['labels'][0:ntest]}

        X = N.cast['float32'](Xs[which_set])
        y = Ys[which_set]

        if isinstance(y, list):
            y = np.asarray(y)

        if which_set == 'test':
            assert y.shape[0] == 10000

        self.one_hot = one_hot
        if one_hot:
            one_hot = np.zeros((y.shape[0], 10), dtype='float32')
            for i in xrange(y.shape[0]):
                one_hot[i, y[i]] = 1.
            y = one_hot

        if center:
            X -= 127.5
        self.center = center

        if rescale:
            X /= 127.5
        self.rescale = rescale

        if toronto_prepro:
            assert not center
            assert not gcn
            X = X / 255.
            if which_set == 'test':
                other = GridPatchCIFAR10(which_set='train')
                oX = other.X
                oX /= 255.
                X = X - oX.mean(axis=0)
            else:
                X = X - X.mean(axis=0)
        self.toronto_prepro = toronto_prepro

        self.gcn = gcn
        if gcn is not None:
            gcn = float(gcn)
            X = global_contrast_normalize(X, scale=gcn)

        if start is not None:
            # This needs to come after the prepro so that it doesn't change the pixel
            # means computed above for toronto_prepro
            assert start >= 0
            assert stop > start
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            y = y[start:stop]
            assert X.shape[0] == y.shape[0]

        if which_set == 'test':
            assert X.shape[0] == 10000

        # Extracts 5 16,16 patches from each 32,32 image.
        #patch_extractor = ExtractGridPatches(patch_shape=(16,16), patch_stride=5)
        #patch_extractor.apply(self)

        # Converts the samples from 3 to 1 sample.
        view_converter = dense_design_matrix.DefaultViewConverter((32, 32, 3),
                                                                  axes)

        super(GridPatchCIFAR10, self).__init__(X=X,
                                               y=y,
                                               view_converter=view_converter)

        assert not np.any(np.isnan(self.X))

        patch_extractor = ExtractGridPatches(patch_shape=(16, 16),
                                             patch_stride=(16, 16))
        patch_extractor.apply(self)

        if preprocessor:
            preprocessor.apply(self)
Beispiel #7
0
    def _execute(self):

        batch_size = self.batch_size
        pooling_region_counts = self.pooling_region_counts
        dataset_family = self.dataset_family
        which_set = self.which_set
        size = self.size

        nan = 0


        dataset_descriptor = dataset_family[which_set][size]

        dataset = dataset_descriptor.dataset_maker()
        expected_num_examples = dataset_descriptor.num_examples

        full_X = dataset.get_design_matrix()
        num_examples = full_X.shape[0]
        assert num_examples == expected_num_examples

        if self.restrict is not None:
            assert self.restrict[1]  <= full_X.shape[0]

            print 'restricting to examples ',self.restrict[0],' through ',self.restrict[1],' exclusive'
            full_X = full_X[self.restrict[0]:self.restrict[1],:]

            assert self.restrict[1] > self.restrict[0]

        #update for after restriction
        num_examples = full_X.shape[0]

        assert num_examples > 0

        dataset.X = None
        dataset.design_loc = None
        dataset.compress = False

        patchifier = ExtractGridPatches( patch_shape = (size,size), patch_stride = (1,1) )

        pipeline = serial.load(dataset_descriptor.pipeline_path)

        assert isinstance(pipeline.items[0], ExtractPatches)
        pipeline.items[0] = patchifier


        print 'defining features'
        Z = T.matrix('Z')

        if self.one_sided:
            feat = abs(Z)
        else:
            pos = T.clip(Z,0.,1e30)
            neg = T.clip(-Z,0.,1e30)

            feat = T.concatenate((pos, neg), axis=1)

        print 'compiling theano function'
        f = function([Z],feat)

        nfeat = self.W.shape[1] * (2 - self.one_sided)
        if not (nfeat == 1600 or nfeat == 3200):
            print nfeat
            assert False

        if config.device.startswith('gpu') and nfeat >= 4000:
            f = halver(f, nfeat)

        topo_feat_var = T.TensorType(broadcastable = (False,False,False,False), dtype='float32')()
        region_features = function([topo_feat_var],
                topo_feat_var.mean(axis=(1,2)) )

        def average_pool( stride ):
            def point( p ):
                return p * ns / stride

            rval = np.zeros( (topo_feat.shape[0], stride, stride, topo_feat.shape[3] ) , dtype = 'float32')

            for i in xrange(stride):
                for j in xrange(stride):
                    rval[:,i,j,:] = region_features( topo_feat[:,point(i):point(i+1), point(j):point(j+1),:] )

            return rval

        outputs = [ np.zeros((num_examples,count,count,nfeat),dtype='float32') for count in pooling_region_counts ]

        assert len(outputs) > 0

        fd = DenseDesignMatrix(X = np.zeros((1,1),dtype='float32'), view_converter = DefaultViewConverter([1, 1, nfeat] ) )

        ns = 32 - size + 1
        depatchifier = ReassembleGridPatches( orig_shape  = (ns, ns), patch_shape=(1,1) )

        if len(range(0,num_examples-batch_size+1,batch_size)) <= 0:
            print num_examples
            print batch_size

        for i in xrange(0,num_examples-batch_size+1,batch_size):
            print i
            t1 = time.time()

            d = copy.copy(dataset)
            d.set_design_matrix(full_X[i:i+batch_size,:])

            t2 = time.time()

            #print '\tapplying preprocessor'
            d.apply_preprocessor(pipeline, can_fit = False)
            X2 = d.get_design_matrix()

            t3 = time.time()

            M.put(s,'batch',X2)

            M.eval(s, 'Z = sparse_codes(batch, dictionary, lambda)')
            Z = M.get(s, 'Z')

            feat = f(np.cast['float32'](Z))

            t4 = time.time()

            assert feat.dtype == 'float32'

            feat_dataset = copy.copy(fd)

            if np.any(np.isnan(feat)):
                nan += np.isnan(feat).sum()
                feat[np.isnan(feat)] = 0

            feat_dataset.set_design_matrix(feat)

            #print '\treassembling features'
            feat_dataset.apply_preprocessor(depatchifier)

            #print '\tmaking topological view'
            topo_feat = feat_dataset.get_topological_view()
            assert topo_feat.shape[0] == batch_size

            t5 = time.time()

            #average pooling
            for output, count in zip(outputs, pooling_region_counts):
                output[i:i+batch_size,...] = average_pool(count)

            t6 = time.time()

            print (t6-t1, t2-t1, t3-t2, t4-t3, t5-t4, t6-t5)

        for output, save_path in zip(outputs, self.save_paths):
            if self.chunk_size is not None:
                assert save_path.endswith('.npy')
                save_path_pieces = save_path.split('.npy')
                assert len(save_path_pieces) == 2
                assert save_path_pieces[1] == ''
                save_path = save_path_pieces[0] + '_' + chr(ord('A')+self.chunk_id)+'.npy'
            np.save(save_path,output)


        if nan > 0:
            warnings.warn(str(nan)+' features were nan')
Beispiel #8
0
    def __init__(self, which_set, center = False, rescale = False, gcn = None,
            one_hot = False, start = None, stop = None, axes=('b', 0, 1, 'c'),
            toronto_prepro = False, preprocessor = None, patch = None):
        """
        Parameters
        ----------
        which_set : str
            One of 'train', 'test'
        gcn : float, optional
            Multiplicative constant to use for global contrast normalization.
            No global contrast normalization is applied, if None

        .. todo::

            WRITEME
        """


        # note: there is no such thing as the cifar10 validation set;
        # pylearn1 defined one but really it should be user-configurable
        # (as it is here)

        self.axes = axes

        # we define here:
        dtype  = 'uint8'
        ntrain = 50000
        nvalid = 0  # artefact, we won't use it
        ntest  = 10000

        # we also expose the following details:
        self.img_shape = (3,32,32)
        self.img_size = N.prod(self.img_shape)
        self.n_classes = 10
        self.label_names = ['airplane', 'automobile', 'bird', 'cat', 'deer',
                            'dog', 'frog','horse','ship','truck']

        # prepare loading
        fnames = ['data_batch_%i' % i for i in range(1,6)]
        lenx = N.ceil((ntrain + nvalid) / 10000.)*10000
        x = N.zeros((lenx,self.img_size), dtype=dtype)
        y = N.zeros(lenx, dtype=dtype)

        # load train data
        nloaded = 0
        for i, fname in enumerate(fnames):
            data = GridPatchCIFAR10._unpickle(fname)
            x[i*10000:(i+1)*10000, :] = data['data']
            y[i*10000:(i+1)*10000] = data['labels']
            nloaded += 10000
            if nloaded >= ntrain + nvalid + ntest: break;

        # load test data
        data = GridPatchCIFAR10._unpickle('test_batch')

        # process this data
        Xs = {
                'train' : x[0:ntrain],
                'test'  : data['data'][0:ntest]
            }

        Ys = {
                'train' : y[0:ntrain],
                'test'  : data['labels'][0:ntest]
            }

        X = N.cast['float32'](Xs[which_set])
        y = Ys[which_set]

        if isinstance(y,list):
            y = np.asarray(y)

        if which_set == 'test':
            assert y.shape[0] == 10000


        self.one_hot = one_hot
        if one_hot:
            one_hot = np.zeros((y.shape[0],10),dtype='float32')
            for i in xrange(y.shape[0]):
                one_hot[i,y[i]] = 1.
            y = one_hot

        if center:
            X -= 127.5
        self.center = center

        if rescale:
            X /= 127.5
        self.rescale = rescale

        if toronto_prepro:
            assert not center
            assert not gcn
            X = X / 255.
            if which_set == 'test':
                other = GridPatchCIFAR10(which_set='train')
                oX = other.X
                oX /= 255.
                X = X - oX.mean(axis=0)
            else:
                X = X - X.mean(axis=0)
        self.toronto_prepro = toronto_prepro

        self.gcn = gcn
        if gcn is not None:
            gcn = float(gcn)
            X = global_contrast_normalize(X, scale=gcn)

        if start is not None:
            # This needs to come after the prepro so that it doesn't change the pixel
            # means computed above for toronto_prepro
            assert start >= 0
            assert stop > start
            assert stop <= X.shape[0]
            X = X[start:stop, :]
            y = y[start:stop]
            assert X.shape[0] == y.shape[0]

        if which_set == 'test':
            assert X.shape[0] == 10000

        
        # Extracts 5 16,16 patches from each 32,32 image.
        #patch_extractor = ExtractGridPatches(patch_shape=(16,16), patch_stride=5)
        #patch_extractor.apply(self)

        # Converts the samples from 3 to 1 sample. 
        view_converter = dense_design_matrix.DefaultViewConverter((32,32,3), axes)

        super(GridPatchCIFAR10, self).__init__(X=X, y=y, view_converter=view_converter)

        assert not np.any(np.isnan(self.X))

        patch_extractor = ExtractGridPatches(patch_shape=(16,16), patch_stride=(16,16))
        patch_extractor.apply(self)

        if preprocessor:
            preprocessor.apply(self)