Example #1
0
def test_lprint():
    paths = None
    rgb_imgs = larray.lmap(test_lprint, paths)
    rgb_imgs2 = larray.lmap(test_lprint, rgb_imgs)
    s = larray.lprint_str(rgb_imgs2)
    print s
    assert s == """lmap(test_lprint, ...)
Example #2
0
File: view.py Project: wqren/skdata
    def __init__(self, *args, **kwargs):
        """Set up whole-dataset arrays plus one train/test split per fold.

        After the parent initializer has run, each fold of ``self.view2``
        takes a turn as the test set while the remaining folds are
        concatenated to form the training set.  Inputs are produced by
        mapping ``self.load_pair`` with ``lmap``; labels are read from the
        ``'label'`` field.  The results are stored on ``self.x``,
        ``self.y`` and ``self.splits`` (one ``dotdict`` per fold).
        """
        FullProtocol.__init__(self, *args, **kwargs)
        folds = self.view2

        # -- everything in view2, regardless of fold
        all_x = lmap(self.load_pair, folds.flatten())
        all_y = self.view2.flatten()['label']

        splits = []
        for held_out, test_fold in enumerate(folds):

            # -- test: the held-out fold
            test_x = lmap(self.load_pair, test_fold)
            test_y = test_fold['label']

            # -- train: every fold except the held-out one
            other_folds = [
                fold for idx, fold in enumerate(folds) if idx != held_out]
            train_x = lmap(self.load_pair, np.concatenate(other_folds))
            train_y = np.concatenate([fold['label'] for fold in other_folds])

            train_part = dotdict(x=train_x, y=train_y)
            test_part = dotdict(x=test_x, y=test_y)
            splits.append(
                dotdict(x=all_x, y=all_y, train=train_part, test=test_part))

        self.x = all_x
        self.y = all_y
        self.splits = splits
Example #3
0
    def __init__(self, *args, **kwargs):
        """Initialise the protocol and precompute leave-one-fold-out splits.

        ``self.view2`` is iterated fold by fold.  For fold ``i`` the test
        set is that fold and the training set is the concatenation of all
        other folds.  ``self.x`` / ``self.y`` hold the data for the
        flattened ``view2``; ``self.splits`` holds one ``dotdict`` per fold
        with ``x``, ``y``, ``train`` and ``test`` entries.
        """
        FullProtocol.__init__(self, *args, **kwargs)
        view2 = self.view2

        all_x = lmap(self.load_pair, view2.flatten())
        all_y = self.view2.flatten()['label']

        def make_split(test_index, test_fold):
            # -- test
            test_x = lmap(self.load_pair, test_fold)
            test_y = test_fold['label']

            # -- train: all folds whose index differs from the test fold's
            rest = []
            for index, fold in enumerate(view2):
                if index != test_index:
                    rest.append(fold)
            train_x = lmap(self.load_pair, np.concatenate(rest))
            train_y = np.concatenate([fold['label'] for fold in rest])

            train = dotdict(x=train_x, y=train_y)
            test = dotdict(x=test_x, y=test_y)
            return dotdict(x=all_x, y=all_y, train=train, test=test)

        splits = []
        for fold_i, test_fold in enumerate(view2):
            splits.append(make_split(fold_i, test_fold))

        self.x = all_x
        self.y = all_y
        self.splits = splits
Example #4
0
def test_pprint():
    paths = None
    rgb_imgs = larray.lmap(test_pprint, paths)
    rgb_imgs2 = larray.lmap(test_pprint, rgb_imgs)
    s = larray.pprint_str(rgb_imgs2)
    print s
    assert s == """lmap(test_pprint, ...)
def getPixelFeatures(objects_oi, normalize_on=False, IMGPATH=IMGPATH_DEFAULT):
    """Compute flattened pixel features for images of the objects of interest.

    Parameters
    ----------
    objects_oi : container
        A metadata row is selected when its ``"obj"`` field is ``in`` this
        container.
    normalize_on : bool
        Forwarded as ``normalize`` to ``ImgLoaderResizer``.
    IMGPATH : str
        Prefix for ``metadata.pkl`` and ``images/<id>.png``.  It is joined
        by plain string concatenation, so it must end with a path separator.

    Returns
    -------
    (pixels_features, pixel_meta)
        ``pixels_features`` is the image stack reshaped to
        ``(n_images, height * width)``; ``pixel_meta`` is ``meta`` indexed
        by the positions of the selected rows.
    """
    # Open in binary mode and close the handle deterministically; the
    # original leaked the file object and read the pickle in text mode.
    # NOTE(review): unpickling runs arbitrary code -- only load trusted files.
    with open(IMGPATH + "metadata.pkl", "rb") as f:
        meta = pk.load(f)

    # -- fix obj field: unwrap length-1 containers so "obj" compares directly
    if len(meta[0]["obj"]) == 1:
        for i, m in enumerate(meta):
            meta[i]["obj"] = m["obj"][0]

    meta_ind = []
    image_paths = []
    for i, m in enumerate(meta):
        if m["obj"] in objects_oi:
            meta_ind.append(i)
            image_paths.append(IMGPATH + "images/" + m["id"] + ".png")

    imgs = larray.lmap(
        ImgLoaderResizer(inshape=(256, 256), shape=(256, 256), dtype="float32", normalize=normalize_on, mask=None),
        image_paths,
    )
    imgs = np.array(imgs)
    ts = imgs.shape
    print(ts)
    pixels_features = imgs.reshape(ts[0], ts[1] * ts[2])
    # presumably meta is a numpy (record) array, since it is indexed by a
    # list of positions -- TODO confirm
    pixel_meta = meta[meta_ind]
    return pixels_features, pixel_meta
Example #6
0
    def __init__(self, x_dtype='uint8', x_height=250, x_width=250,
            max_n_per_class=None,
            channel_major=False):
        """Build the dataset, index its image pairs and set up image loading.

        x_dtype - dtype passed to the image loader and checked on the
            first loaded image
        x_height, x_width - image shape passed to the image loader
        max_n_per_class - if not None, each pairs array is truncated to
            this many entries along its third axis
        channel_major - accepted but not used in this method

        Raises NotImplementedError when ``DATASET_CLASS`` is None.
        """
        if self.DATASET_CLASS is None:
            raise NotImplementedError("This is an abstract class")

        # -- build/fetch dataset
        self.dataset = self.DATASET_CLASS()
        dataset = self.dataset
        dataset.meta  # attribute access only; the value is discarded

        pairsDevTrain = dataset.pairsDevTrain
        pairsDevTest = dataset.pairsDevTest
        pairsView2 = dataset.pairsView2

        if max_n_per_class is not None:
            head = slice(None, max_n_per_class)
            pairsDevTrain = pairsDevTrain[:, :, head]
            pairsDevTest = pairsDevTest[:, :, head]
            pairsView2 = pairsView2[:, :, head]

        logging.info('pairsDevTrain shape %s' % str(pairsDevTrain.shape))
        logging.info('pairsDevTest shape %s' % str(pairsDevTest.shape))
        logging.info('pairsView2 shape %s' % str(pairsView2.shape))

        paths_labels_dev_train = paths_labels(pairsDevTrain)
        paths_labels_dev_test = paths_labels(pairsDevTest)
        paths_labels_view2 = paths_labels(pairsView2)

        # -- one sorted list of relative paths covering all three subsets
        flat_parts = [
            paths_labels_dev_train.flatten(),
            paths_labels_dev_test.flatten(),
            paths_labels_view2.flatten()]
        rel_paths = sorted_paths(np.concatenate(flat_parts))

        image_paths = []
        for pth in rel_paths:
            image_paths.append(
                dataset.home('images', dataset.IMAGE_SUBDIR, pth))
        self.image_paths = image_paths

        # -- re-express each pairs array in terms of rel_paths
        self.dev_train = paths_labels_lookup(
            paths_labels(pairsDevTrain), rel_paths)
        self.dev_test = paths_labels_lookup(
            paths_labels(pairsDevTest), rel_paths)
        self.view2 = paths_labels_lookup(
            paths_labels(pairsView2), rel_paths)

        # -- lazy array helper function
        if dataset.COLOR:
            mode, shape = 'RGB', (x_height, x_width, 3)
        else:
            mode, shape = 'L', (x_height, x_width, 1)
        loader = ImgLoader(ndim=3, dtype=x_dtype, mode=mode, shape=shape)

        self.image_pixels = lmap(loader, self.image_paths)
        self.paths_labels_dev_train = paths_labels_dev_train
        self.paths_labels_dev_test = paths_labels_dev_test
        self.paths_labels_view2 = paths_labels_view2

        # -- sanity-check the loader on the first image
        first_image = self.image_pixels[0]
        assert str(first_image.dtype) == x_dtype
        assert self.image_pixels[0].ndim == 3
Example #7
0
    def get_image_features_lmap(self, images, batched_lmap_speed_thresh=None):
        """Return the batched feature lmap for a 4-d stack of images.

        ``images`` must have a 4-element shape whose last entry is 1 or 3.
        Each image is transposed with axes (2, 0, 1) before being fed to
        the pipeline stored in ``self.pipeline['pipe']``.
        ``batched_lmap_speed_thresh`` is forwarded as ``speed_thresh``.
        """
        _n, H, W, C = images.shape
        assert C in (1, 3)
        # -- this loading must be simple, and match the unsup_images
        # function in lfw. Anything more elaborate must
        # be included in the pyll pipeline
        annotate = lmap_info(
            shape=(C, H, W),
            dtype=images.dtype
            )
        chmajor_fn = annotate(
            functools.partial(np.transpose, axes=(2, 0, 1)))

        def chmajor_fn_f_map(X):
            # -- same transposition, applied to a whole batch at once
            return np.transpose(X, axes=(0, 3, 1, 2))
        chmajor_fn.f_map = chmajor_fn_f_map

        pipe_fn = scope.partial(scope.callpipe1, self.pipeline['pipe'])
        chmajor_images = lmap(chmajor_fn, images)
        return pyll_theano_batched_lmap(
                pipe_fn,
                chmajor_images,
                batchsize=self.batchsize,
                print_progress_every=10,  # -- seconds
                abort_on_rows_larger_than=self.max_n_features,
                speed_thresh=batched_lmap_speed_thresh,
                x_dtype='uint8', # HAS TO MATCH ./slm.py
                )
Example #8
0
def test_usage():
    """Check that lmap, reindex and loop compose on a toy list of paths.

    With the random seed fixed, the first four elements of the looped,
    permuted rgb array are expected to start with 'd', 'a', 'b', 'c'.
    """
    np.random.seed(123)

    def load_rgb(path):
        return path + '_rgb'

    def load_grey(path):
        return path + '_grey'

    def to_64x64(image):
        return image + '_64x64'

    paths = ['a', 'b', 'c', 'd']  # imagine some huge list of image paths
    rgb_imgs = larray.lmap(load_rgb, paths)
    grey_imgs = larray.lmap(load_grey, paths)
    paths_64x64 = larray.lmap(to_64x64, grey_imgs)

    shuffled = larray.reindex(rgb_imgs, np.random.permutation(len(paths)))
    train_set = shuffled.loop()

    l10 = list(train_set[range(10)])
    print(l10)
    leading = [elem[0] for elem in l10[:4]]
    assert ['d', 'a', 'b', 'c'] == leading
Example #9
0
def slm_memmap(desc, X, name, basedir=None):
    """
    Return a cache_memmap object representing the features of the entire
    set of images.

    The features are produced by mapping ``SLMFunction(desc, X.shape[1:])``
    over ``X``.  ``basedir`` defaults to the current working directory.
    """
    cache_dir = os.getcwd() if basedir is None else basedir
    features = larray.lmap(SLMFunction(desc, X.shape[1:]), X)
    return larray.cache_memmap(features, name, basedir=cache_dir)
Example #10
0
 def get_images(self, preproc):
     """Return an lmap of ``self.filenames`` through an ImgDownloaderResizer.

     ``preproc`` must provide the keys "dtype", "mode", "size" and
     "normalize"; "size" is converted to a tuple before use.
     """
     dtype, mode = preproc["dtype"], preproc["mode"]
     size = tuple(preproc["size"])
     normalize = preproc["normalize"]
     loader = ImgDownloaderResizer(
         self.home("resources"),
         self.bucket,
         inshape=self.insize,
         shape=size,
         dtype=dtype,
         normalize=normalize,
         mode=mode,
     )
     return larray.lmap(loader, self.filenames)
Example #11
0
def test_using_precompute():
    """Check that larray.clone can swap a node of the graph for saved data.

    A looped, permuted view of ``paths_64x64`` is cloned with
    ``paths_64x64`` replaced by a plain list; the first ten elements of
    the clone must then come from that list.
    """
    np.random.seed(123)

    # example library code  starts here
    def load_rgb(path):
        return path + '_rgb'

    def load_grey(path):
        return path + '_grey'

    def to_64x64(image):
        return image + '_64x64'

    paths = ['a', 'b', 'c', 'd']  # imagine some huge list of image paths
    grey_imgs = larray.lmap(load_grey, paths)
    paths_64x64 = larray.lmap(to_64x64, grey_imgs)

    permuted = larray.reindex(paths_64x64, np.random.permutation(len(paths)))
    train_set = permuted.loop()

    # example user code starts here.
    # Memmapping the __array__ of paths_64x64 is the easy part; computing
    # things derived from that memmap is the harder part.

    # stand-in for a memmap of some precomputed quantity
    use_paths_64x64 = ['stuff', 'i', 'saved', 'from', 'disk']

    # train_set and the rest of the original graph know nothing about
    # the replacement (memmap, mongo-backed proxy, or anything else).
    substitutions = {paths_64x64: use_paths_64x64}
    new_train_set = larray.clone(train_set, given=substitutions)

    l10 = list(new_train_set[range(10)])
    print(l10)
    expected = [
            'from', 'stuff', 'i', 'saved',
            'from', 'stuff', 'i', 'saved',
            'from', 'stuff']
    assert l10 == expected
Example #12
0
def test_using_precompute():
    """Exercise larray.clone with a precomputed stand-in for one node.

    The graph is rgb/grey/64x64 lmaps over a toy path list, reindexed by
    a seeded permutation and looped.  Cloning with ``given`` mapping
    ``paths_64x64`` to a saved list must yield elements of that list.
    """
    np.random.seed(123)

    # example library code  starts here
    def load_rgb(path):
        return path + '_rgb'

    def load_grey(path):
        return path + '_grey'

    def to_64x64(image):
        return image + '_64x64'

    paths = ['a', 'b', 'c', 'd']  # imagine some huge list of image paths
    rgb_imgs = larray.lmap(load_rgb, paths)
    grey_imgs = larray.lmap(load_grey, paths)
    paths_64x64 = larray.lmap(to_64x64, grey_imgs)

    order = np.random.permutation(len(paths))
    train_set = larray.reindex(paths_64x64, order).loop()

    # example user code starts here.
    # Memmapping the __array__ of paths_64x64 is easy; computing derived
    # quantities from that memmap is more difficult.

    # pretend this list is a memmap of a precomputed quantity
    use_paths_64x64 = ['stuff', 'i', 'saved', 'from', 'disk']

    # the remainder of the original graph (e.g. train_set) is unaware of
    # the new memmap / mongo-backed proxy / whatever is substituted.
    new_train_set = larray.clone(
        train_set,
        given={paths_64x64: use_paths_64x64})

    l10 = list(new_train_set[range(10)])
    print(l10)
    one_cycle = ['from', 'stuff', 'i', 'saved']
    assert l10 == one_cycle + one_cycle + ['from', 'stuff']
Example #13
0
def pairs_memmap(pair_labels, X, comparison_name, name, basedir=None):
    """
    Return a memmap-cached lmap of pair features plus the match labels.

    pair_labels     - a (left indexes, right indexes, matches) triple,
                      something like comes out of verification_pairs
    X               - feature vectors to be combined
    comparison_name - passed to PairFeaturesFn together with X
    name            - name given to the memmap cache
    basedir         - cache directory (default: current working directory)
    """
    cache_dir = os.getcwd() if basedir is None else basedir
    left_idxs, right_idxs, matches = pair_labels
    pair_fn = PairFeaturesFn(X, comparison_name)
    pair_features = larray.lmap(pair_fn, left_idxs, right_idxs)
    cached = larray.cache_memmap(pair_features, name, basedir=cache_dir)
    return cached, np.asarray(matches)
Example #14
0
 def get_images(self, resize_to=(256, 256), mode='L', dtype='float32',
                crop=None, mask=None, normalize=True):
     """
     Create a lazily reevaluated array with preprocessing specified by the parameters
     resize_to: Image is resized to the tuple given here (note: not reshaped)
     dtype: The datatype of the image array
     mode: 'RGB' or 'L' specifies whether or not to store color images
     mask: Image object which is used to mask the image
     crop: array of [minx, maxx, miny, maxy] crop box applied after resize
     normalize: If true, then the image set to zero mean and unit standard deviation

     The array maps an ImgDownloaderResizer (built with ``self.img_cache``
     as its cache) over the entries of ``self.meta['filename']``.
     """
     file_names = list(self.meta['filename'])
     loader = ImgDownloaderResizer(
         resize_to=resize_to,
         dtype=dtype,
         normalize=normalize,
         crop=crop,
         mask=mask,
         mode=mode,
         cache=self.img_cache)
     return larray.lmap(loader, file_names)
Example #15
0
    def get_images(self, preproc, n_jobs=-1, cache=False):
        """
        Create a lazily reevaluated array with preprocessing specified by a
        preprocessing dictionary preproc. See the documentation in
        ImgDownloaderCacherPreprocesser

        The same ImgDownloaderPreprocessor instance is used both as the
        per-element function and as ``f_map``; it is mapped over the
        file names together with their positional ids.
        """
        names = self.meta["filename"]
        # positional ids are used here rather than self.meta['id']
        ids = np.arange(self.meta.shape[0])
        source = get_img_source()
        image_cache_dir = self.imagenet_home("images")
        processor = ImgDownloaderPreprocessor(
            source=source,
            preproc=preproc,
            n_jobs=n_jobs,
            cache=cache,
            cachedir=image_cache_dir,
        )
        return larray.lmap(processor, names, ids, f_map=processor)
 def __init__(self, coll, fs, query, preproc):
     """Query image metadata and build an lmap of RGB images.

     coll    - collection object; ``coll.find(query).sort('filename')``
               supplies the metadata records
     fs      - passed as the first argument to ImgLoader
     query   - query handed to ``coll.find``
     preproc - dict of preprocessing options; reads 'size' (default
               (200, 200)) and 'global_normalize' (default True)

     Raises AssertionError when the configured size does not have exactly
     two entries.
     """
     # Resolve and validate the size before anything else.  The original
     # asserted on ``size`` before assigning it, so every call failed
     # with UnboundLocalError.
     size = tuple(preproc.get('size', (200, 200)))
     assert len(size) == 2, 'size must have exactly two entries'
     self.coll = coll
     self.fs = fs
     self.query = query
     cursor = coll.find(query).sort('filename')
     self.meta = list(cursor)
     self.filenames = [m['filename'] for m in self.meta]
     self.preproc = preproc
     normalize = self.preproc.get('global_normalize', True)
     self.imgs = larray.lmap(ImgLoader(fs, ndim=3, shape=size + (3,),
                                       mode='RGB',
                                       normalize=normalize),
                             self.filenames)
Example #17
0
def get_stimarray(marray, mname, perm, perm_id, cache_type, base_dir, read_mode='r'):
    """Return a cached, reordered view of ``marray``.

    ``Reorder2(marray)`` is mapped over ``perm`` (and also used as
    ``f_map``); the result is wrapped in an hdf5 or memmap cache whose
    name is built from ``mname``, ``perm_id`` and the cache type.
    ``read_mode`` is only used for the hdf5 cache.  Any other
    ``cache_type`` falls through and returns None.
    """
    reorder = Reorder2(marray)
    reordered = larray.lmap(reorder, perm, f_map = reorder)

    if cache_type == 'hdf5':
        new_name = mname + '_' + perm_id + '_hdf5'
        print('Getting stimuli from cache hdf5 at %s/%s ' % (base_dir, new_name))
        return larray.cache_hdf5(
            reordered, name=new_name, basedir=base_dir, mode=read_mode)

    if cache_type == 'memmap':
        new_name = mname + '_' + perm_id + '_memmap'
        print('Getting stimuli from cache memmap at %s/%s ' % (base_dir, new_name))
        return larray.cache_memmap(
            reordered, name=new_name, basedir=base_dir)

    return None
Example #18
0
    def get_images(self, preproc, n_jobs=-1, cache=False):
        """
        Create a lazily reevaluated array with preprocessing specified by a
        preprocessing dictionary preproc. See the documentation in
        ImgDownloaderCacherPreprocesser

        File names come from ``self.meta['filename']``; ids are the row
        positions ``0 .. self.meta.shape[0] - 1``.  One
        ImgDownloaderPreprocessor serves as both the element function and
        ``f_map``.
        """
        file_names = self.meta['filename']
        # row positions stand in for self.meta['id']
        file_ids = np.arange(self.meta.shape[0])
        options = dict(
            source=get_img_source(),
            preproc=preproc,
            n_jobs=n_jobs,
            cache=cache,
            cachedir=self.imagenet_home('images'))
        processor = ImgDownloaderPreprocessor(**options)
        return larray.lmap(processor, file_names, file_ids, f_map=processor)
def getPixelFeatures_localized(objects_oi, IMGPATH=IMGPATH_DEFAULT):
    """Compute pixel features from a window localized by each image's metadata.

    For every metadata row whose ``"obj"`` is in ``objects_oi`` the image
    ``images/<obj>_<id>.png`` is loaded at 256x256, and a square window of
    side ``2 * win`` (``win`` = 5) centred on a position derived from the
    row's ``"tz"`` / ``"ty"`` fields is flattened into one feature vector.

    Parameters
    ----------
    objects_oi : container
        Object names to keep.
    IMGPATH : str
        Prefix for ``metadata.pkl`` and the ``images/`` files.  It is joined
        by plain string concatenation, so it must end with a path separator.

    Returns
    -------
    (pixels_features, pixel_meta)
        Array of flattened windows, and ``meta`` indexed by the positions
        of the selected rows.
    """
    # Open in binary mode and close the handle deterministically; the
    # original leaked the file object and read the pickle in text mode.
    # NOTE(review): unpickling runs arbitrary code -- only load trusted files.
    with open(IMGPATH + "metadata.pkl", "rb") as f:
        meta = pk.load(f)
    meta_ind, pixels_features = [], []
    win = 5
    img_size = 256

    for i, m in enumerate(meta):
        if m["obj"] not in objects_oi:
            continue

        # window centre in pixel coordinates
        # presumably tz / ty are normalized offsets in [-1, 1] -- TODO confirm
        ii = int(-m["tz"] * img_size / 2 + img_size / 2)
        jj = int(m["ty"] * img_size / 2 + img_size / 2)

        meta_ind.append(i)
        fn = [IMGPATH + "images/" + m["obj"] + "_" + m["id"] + ".png"]
        img = larray.lmap(
            ImgLoaderResizer(
                inshape=(1024, 1024), shape=(img_size, img_size), dtype="float32", normalize=False, mask=None
            ),
            fn,
        )
        img = np.squeeze(np.array(img))

        # if image section goes beyond border, add a zero padding
        # (a zero block the size of the image; when it is prepended the
        # window centre shifts by img_size along that axis)
        pad = np.zeros(img.shape)
        if ii - win < 0:
            img = np.concatenate((pad, img), axis=0)
            ii += img_size
        elif (ii + win) >= img_size:
            img = np.concatenate((img, pad), axis=0)
        # recompute the pad: img may have doubled along axis 0 above
        pad = np.zeros(img.shape)
        if jj - win < 0:
            img = np.concatenate((pad, img), axis=1)
            jj += img_size
        elif jj + win >= img_size:
            img = np.concatenate((img, pad), axis=1)

        tmp = img[ii - win : ii + win, jj - win : jj + win].flatten()
        pixels_features.append(tmp)

    pixels_features = np.array(pixels_features)
    # presumably meta is a numpy (record) array, since it is indexed by a
    # list of positions -- TODO confirm
    pixel_meta = meta[meta_ind]
    return pixels_features, pixel_meta
Example #20
0
def test_usage():
    """Check lmap + reindex + loop on a toy list of paths.

    With the seed fixed, the first four elements of the looped, permuted
    array must begin with 'd', 'a', 'b', 'c' respectively.
    """
    np.random.seed(123)

    def load_rgb(path):
        return path + '_rgb'

    def load_grey(path):
        return path + '_grey'

    def to_64x64(image):
        return image + '_64x64'

    paths = ['a', 'b', 'c', 'd']  # imagine some huge list of image paths
    rgb_imgs = larray.lmap(load_rgb, paths)

    order = np.random.permutation(len(paths))
    train_set = larray.reindex(rgb_imgs, order).loop()

    l10 = list(train_set[range(10)])
    print(l10)
    initials = [item[0] for item in l10[:4]]
    assert ['d', 'a', 'b', 'c'] == initials
Example #21
0
def get_images(dtype, preproc):
    """
    Return a lazy array whose elements are all the images in lfw.

    XXX: Should the images really be returned in greyscale?

    dtype : passed through to ImgLoaderResizer

    preproc : a dictionary with keys:
        global_normalize - True / False
        size - (height, width)
        crop - (l, t, r, b)

    """

    # -- first element of the classification task is the list of paths
    all_paths = skdata.lfw.Aligned().raw_classification_task()[0]
    loader = ImgLoaderResizer(
        dtype=dtype,
        shape=preproc['size'],
        crop=preproc['crop'],
        normalize=preproc['global_normalize'])
    return larray.lmap(loader, all_paths)
def getPixelFeatures(objects_oi, normalize_on=False):
    """Compute flattened pixel features for images of the objects of interest.

    Metadata is read from ``IMGPATH + 'metadata.pkl'`` and images from
    ``IMGPATH + 'obj64s100/<id>.png'`` (``IMGPATH`` is a module-level name).

    Parameters
    ----------
    objects_oi : container
        A metadata row is selected when its ``'obj'`` field is ``in`` this
        container.
    normalize_on : bool
        Forwarded as ``normalize`` to ``ImgLoaderResizer``.

    Returns
    -------
    (pixels_features, pixel_meta)
        ``pixels_features`` is the image stack reshaped to
        ``(n_images, height * width)``; ``pixel_meta`` is ``meta`` indexed
        by the positions of the selected rows.
    """
    # Open in binary mode and close the handle deterministically; the
    # original leaked the file object and read the pickle in text mode.
    # NOTE(review): unpickling runs arbitrary code -- only load trusted files.
    with open(IMGPATH + 'metadata.pkl', 'rb') as f:
        meta = pk.load(f)

    # -- fix obj field: unwrap length-1 containers so 'obj' compares directly
    if len(meta[0]['obj']) == 1:
        for i, m in enumerate(meta):
            meta[i]['obj'] = m['obj'][0]

    meta_ind = []
    image_paths = []
    for i, m in enumerate(meta):
        if m['obj'] in objects_oi:
            meta_ind.append(i)
            image_paths.append(IMGPATH + 'obj64s100/' + m['id'] + '.png')

    loader = ImgLoaderResizer(
        inshape=(256, 256), shape=(256, 256), dtype='float32',
        normalize=normalize_on, mask=None)
    imgs = np.array(larray.lmap(loader, image_paths))
    ts = imgs.shape
    print(ts)
    pixels_features = imgs.reshape(ts[0], ts[1] * ts[2])
    # presumably meta is a numpy (record) array, since it is indexed by a
    # list of positions -- TODO confirm
    pixel_meta = meta[meta_ind]
    return pixels_features, pixel_meta
Example #23
0
    def get_fg11_features(suffix, expected_shape):
        dataset = skdata.lfw.Aligned()
        paths, identities = dataset.raw_classification_task()
        def load_path(path):
            basename = os.path.basename(path)
            name = basename[:-9]  # cut off the digits and the .jpg
            # -- touch the jpg to make sure it's there
            new_path = os.path.join(
                feature_root,
                name,
                basename)
            feature_path = new_path + suffix
            print 'loading', feature_path
            data = scipy.io.loadmat(feature_path)['data']
            assert data.shape == expected_shape
            return np.asarray(data, dtype='float32')
        # -- apply decorator manually here in nested scope
        load_path = larray.lmap_info(
            shape=expected_shape,
            dtype='float32')(load_path)

        rval = larray.lmap(load_path, paths)
        rval = larray.cache_memmap(rval, 'fcache_' + suffix, basedir=os.getcwd())
        return rval
    def get_image_features_lmap(self, images, batched_lmap_speed_thresh=None):
        """Build the batched feature lmap for a 4-d stack of images.

        The last entry of ``images.shape`` must be 1 or 3.  Images are
        transposed with axes (2, 0, 1) element-wise, or (0, 3, 1, 2)
        batch-wise, and then run through ``self.pipeline['pipe']`` by
        ``pyll_theano_batched_lmap``.  ``batched_lmap_speed_thresh`` is
        forwarded as ``speed_thresh``.
        """
        n_images, height, width, channels = images.shape
        assert channels in (1, 3)
        # -- this loading must be simple, and match the unsup_images
        # function in lfw. Anything more elaborate must
        # be included in the pyll pipeline
        transpose_one = functools.partial(np.transpose, axes=(2, 0, 1))
        chmajor_fn = lmap_info(
            shape=(channels, height, width),
            dtype=images.dtype)(transpose_one)

        def chmajor_fn_f_map(X):
            # -- batch version: the leading axis stays in place
            return np.transpose(X, axes=(0, 3, 1, 2))

        chmajor_fn.f_map = chmajor_fn_f_map

        batched_kwargs = dict(
            batchsize=self.batchsize,
            print_progress_every=10,  # -- seconds
            abort_on_rows_larger_than=self.max_n_features,
            speed_thresh=batched_lmap_speed_thresh,
            x_dtype='uint8',  # HAS TO MATCH ./slm.py
        )
        return pyll_theano_batched_lmap(
            scope.partial(scope.callpipe1, self.pipeline['pipe']),
            lmap(chmajor_fn, images),
            **batched_kwargs)
Example #25
0
    def __init__(self,
                 x_dtype='uint8',
                 x_height=250,
                 x_width=250,
                 max_n_per_class=None,
                 channel_major=False):
        """Build the dataset, index its image pairs and set up image loading.

        x_dtype - dtype passed to the image loader
        x_height, x_width - image shape passed to the image loader
        max_n_per_class - if not None, each pairs array is truncated to
            this many entries along its third axis
        channel_major - accepted but not used in this method

        Raises NotImplementedError when ``DATASET_CLASS`` is None.
        """
        if self.DATASET_CLASS is None:
            raise NotImplementedError("This is an abstract class")

        # -- build/fetch dataset
        self.dataset = self.DATASET_CLASS()
        dataset = self.dataset
        dataset.meta  # attribute access only; the value is discarded

        pairsDevTrain = dataset.pairsDevTrain
        pairsDevTest = dataset.pairsDevTest
        pairsView2 = dataset.pairsView2

        if max_n_per_class is not None:
            head = slice(None, max_n_per_class)
            pairsDevTrain = pairsDevTrain[:, :, head]
            pairsDevTest = pairsDevTest[:, :, head]
            pairsView2 = pairsView2[:, :, head]

        logging.info('pairsDevTrain shape %s' % str(pairsDevTrain.shape))
        logging.info('pairsDevTest shape %s' % str(pairsDevTest.shape))
        logging.info('pairsView2 shape %s' % str(pairsView2.shape))

        paths_labels_dev_train = paths_labels(pairsDevTrain)
        paths_labels_dev_test = paths_labels(pairsDevTest)
        paths_labels_view2 = paths_labels(pairsView2)

        # -- one sorted list of relative paths covering all three subsets
        flat_parts = [
            paths_labels_dev_train.flatten(),
            paths_labels_dev_test.flatten(),
            paths_labels_view2.flatten(),
        ]
        rel_paths = sorted_paths(np.concatenate(flat_parts))

        image_paths = []
        for pth in rel_paths:
            image_paths.append(
                dataset.home('images', dataset.IMAGE_SUBDIR, pth))
        self.image_paths = image_paths

        # -- re-express each pairs array in terms of rel_paths
        self.dev_train = paths_labels_lookup(
            paths_labels(pairsDevTrain), rel_paths)
        self.dev_test = paths_labels_lookup(
            paths_labels(pairsDevTest), rel_paths)
        self.view2 = paths_labels_lookup(
            paths_labels(pairsView2), rel_paths)

        # -- lazy array helper function
        if dataset.COLOR:
            loader_kwargs = dict(
                ndim=3, mode='RGB', shape=(x_height, x_width, 3))
        else:
            loader_kwargs = dict(
                ndim=2, mode='L', shape=(x_height, x_width))
        loader = ImgLoader(dtype=x_dtype, **loader_kwargs)

        self.image_pixels = lmap(loader, self.image_paths)
        self.paths_labels_dev_train = paths_labels_dev_train
        self.paths_labels_dev_test = paths_labels_dev_test
        self.paths_labels_view2 = paths_labels_view2
Example #26
0
def pyll_theano_batched_lmap(pipeline, seq, batchsize,
        _debug_call_counts=None,
        print_progress_every=float('inf'),
        abort_on_rows_larger_than=None,
        speed_thresh=None,
        x_dtype='float32',
        ):
    """
    This function returns a skdata.larray.lmap object whose function
    is defined by a theano expression.

    The theano expression will be built and compiled specifically for the
    dimensions of the given `seq`. Therefore `pipeline` should build a
    *pyll* graph that evaluates to a theano graph.

    Parameters
    ----------
    pipeline : callable
        Called as ``pipeline((s_X, in_shp))``; the result is evaluated with
        ``pyll.rec_eval`` and must yield ``(symbolic output batch, output
        shape)`` where the first entry of the shape equals `batchsize`.
    seq : sequence with a ``shape`` attribute
        The elements to map over; ``seq.shape[1:]`` fixes the element shape.
    batchsize : int
        Number of rows of the fixed-size internal batch.
    _debug_call_counts : dict or None
        If truthy, its ``'fn_1'`` / ``'f_map'`` entries are incremented on
        each call of the corresponding function.
    print_progress_every : float
        Minimum number of seconds between progress log lines from `f_map`.
    abort_on_rows_larger_than : int or None
        If truthy, the maximum allowed number of values per output row.
    speed_thresh : dict or None
        If not None, a dict with ``'elements'`` and ``'seconds'`` giving
        the required throughput.
    x_dtype : str
        dtype of the internal input batch buffer.

    Raises
    ------
    ValueError
        At build time if an output row is larger than
        `abort_on_rows_larger_than`; from `f_map` if a chunk of features
        contains non-finite values.
    EvalTimeout
        From `f_map` when `speed_thresh` is given and the observed
        throughput falls below it.
    """

    # -- a full-size batch buffer lives in a shared variable; the symbolic
    #    input s_xi has the same type but may hold fewer rows (s_N of them),
    #    which are written into the leading rows of the buffer
    in_shp = (batchsize,) + seq.shape[1:]
    batch = np.zeros(in_shp, dtype=x_dtype)
    s_ibatch = theano.shared(batch)
    s_xi = theano.tensor.as_tensor_variable(s_ibatch).type()
    s_N = s_xi.shape[0]
    s_X = theano.tensor.set_subtensor(s_ibatch[:s_N], s_xi)
    #print 'PIPELINE', pipeline
    thing = pipeline((s_X, in_shp))
    #print 'THING'
    #print thing
    #print '==='
    s_obatch, oshp = pyll.rec_eval(thing)
    assert oshp[0] == batchsize
    logger.info('batched_lmap oshp %s' % str(oshp))
    if abort_on_rows_larger_than:
        # -- number of values in a single output row
        rowlen = np.prod(oshp[1:])
        if rowlen > abort_on_rows_larger_than:
            raise ValueError('rowlen %i exceeds limit %i' % (
                rowlen, abort_on_rows_larger_than))

    # Compile a function that takes a variable number of elements in,
    # returns the same number of processed elements out,
    # but does all internal computations using a fixed number of elements,
    # because convolutions are fastest when they're hard-coded to a certain
    # size.
    logger.debug('pyll_theano_batched_lmap compiling fn')
    _fn = theano.function([theano.Param(s_xi, strict=True)],
            s_obatch[:s_N],
            updates={
                s_ibatch: s_X, # this allows the inc_subtensor to be in-place
                })
    logger.debug('pyll_theano_batched_lmap compiling fn -> done')

    # -- running totals used for the throughput check; only updated when
    #    speed_thresh is given
    sums = {'elems': 0, 'times': 0.0}
    if speed_thresh is None:
        time_fn = _fn
    else:
        def time_fn(X):
            # -- call _fn and accumulate element count and elapsed time
            t0 = time.time()
            if str(X.dtype) != x_dtype:
                print 'time_fn dtype problem', X.dtype, x_dtype
            rval = _fn(X)
            dt = time.time() - t0
            #print 'DEBUG time_fn dt:', dt
            sums['elems'] += len(X)
            sums['times'] += dt
            return rval

        def raise_if_slow():
            # -- once enough elements have been processed, compare the
            #    observed rate with the required one; raise if slower,
            #    otherwise reset the totals and start a new window
            exc = EvalTimeout(
                'batched_lmap failed to compute %i elements in %f secs'
                % (speed_thresh['elements'], speed_thresh['seconds']))
            if sums['elems'] >= speed_thresh['elements']:
                observed_ratio = sums['elems'] / sums['times']
                required_ratio = (speed_thresh['elements'] /
                        speed_thresh['seconds'])
                if observed_ratio < required_ratio:
                    raise exc
                else:
                    sums['elems'] = 0
                    sums['times'] = 0.0

    def fn_1(x):
        # -- single-element path: add a leading batch axis (x must be 3-d),
        #    run the batch function, and take the one result row
        if _debug_call_counts:
            _debug_call_counts['fn_1'] += 1
        return time_fn(x[None, :, :, :])[0]

    # -- per-element attributes reported through fn_1.rval_getattr
    attrs = {
            'shape': oshp[1:],
            'ndim': len(oshp) -1,
            'dtype': s_obatch.dtype }
    def rval_getattr(attr, objs):
        # -- objs don't matter to the structure of the return value
        try:
            return attrs[attr]
        except KeyError:
            raise AttributeError(attr)

    fn_1.rval_getattr = rval_getattr

    # -- one-element list so check_for_print can update it from a closure
    last_print_time = [time.time()]

    def check_for_print(offset, X):
        # -- log progress at most once per print_progress_every seconds,
        #    then run the throughput check if one was requested
        curtime = time.time()
        if (curtime - last_print_time[0]) > print_progress_every:
            logger.info('pyll_theano_batched_lmap.f_map %i %i' % (
                offset, len(X)))
            last_print_time[0] = curtime

        if speed_thresh is not None:
            raise_if_slow()

    def f_map(X):
        # -- many-element path: process X in chunks of at most batchsize
        if _debug_call_counts:
            _debug_call_counts['f_map'] += 1

        # -- exactly one batch: return the result directly
        #    (note: this path does not run the finite-value check below)
        if len(X) == batchsize:
            check_for_print(offset=0, X=X)
            return time_fn(X)

        rval = np.empty((len(X),) + oshp[1:], dtype=s_obatch.dtype)
        offset = 0
        while offset < len(X):
            check_for_print(offset, X)
            xi = X[offset: offset + batchsize]
            fn_i = time_fn(xi)
            if not np.all(np.isfinite(fn_i)):
                raise ValueError('non-finite features')
            rval[offset:offset + len(xi)] = fn_i
            offset += len(xi)
        return rval

    return larray.lmap(fn_1, seq, f_map=f_map)
Example #27
0
    def normalized_image_features(self, images, xmean, xstd, avg_nrm,
        n_rows_to_estimate_stats=1000,
        flatten=True,
        batched_lmap_speed_thresh=None,
        ):
        """
        Return an lmap of normalized, flattened features for `images`.

        Normalization options are read from ``self.pipeline``:
            'remove_std0'
            'varthresh'
            'divrowl2'

        images - array-like
            passed to self.get_image_features_lmap
        xmean, xstd, avg_nrm
            normalization statistics.  If xmean is None all three are
            estimated from the first rows of the features and returned;
            otherwise the given values are used unchanged.
        n_rows_to_estimate_stats - int
            estimate xmean and xstd from the first N feature vectors
        flatten - bool
            return features flattened to vectors (only True is implemented)
        batched_lmap_speed_thresh - dict or None
            forwarded to self.get_image_features_lmap

        Returns (normed_features, xmean, xstd, avg_nrm).

        Raises NotImplementedError if `flatten` is false.
        """

        if not flatten:
            raise NotImplementedError('only flatten is implemented')

        pipeline = self.pipeline
        # Forward the speed threshold: the original accepted the argument
        # but dropped it here, so the throughput limit was never enforced.
        features_lmap = self.get_image_features_lmap(
            images,
            batched_lmap_speed_thresh=batched_lmap_speed_thresh)

        n_features = np.prod(features_lmap.shape[1:])

        if xmean is None:
            # -- load enough training data into memory to estimate stats
            cache_train = flatten_elems(
                features_lmap[:n_rows_to_estimate_stats])

            xmean, xstd = mean_and_std(
                cache_train,
                remove_std0=pipeline['remove_std0'])

            # -- add varthresh to the variance before taking the root
            xstd = np.sqrt(xstd ** 2 + pipeline['varthresh'])

            if pipeline['divrowl2']:
                # -- small constant keeps the divisor away from zero
                avg_nrm = 1e-7 + average_row_l2norm(
                    (cache_train - xmean) / xstd)
            else:
                avg_nrm = 1

        def normalize(x):
            # -- single element
            return (x.flatten() - xmean) / (xstd * avg_nrm)

        def normalize_many(x):
            # -- batch of elements, one flattened row each
            return (x.reshape((len(x), -1)) - xmean) / (xstd * avg_nrm)

        normed_features = lmap(
            lmap_info(
                shape=(n_features,),
                dtype=features_lmap.dtype)(normalize),
            features_lmap,
            ragged=False,
            f_map=normalize_many)

        return normed_features, xmean, xstd, avg_nrm
Example #28
0
    def __init__(self, data_dir, batch_range, init_epoch=1,
                 init_batchnum=None, dp_params=None, test=False,
                 read_mode='r', cache_type='memmap'):
        """
        Load the dataset, build the on-disk stimulus cache and make sure a
        batches meta file exists, then initialise LabeledDataProvider.

        data_dir - str
            directory holding the meta file; '' selects a default under
            dset.home('convnet_batches', ...) derived from the preproc spec
            and the batch size. Created when it does not exist.
        batch_range, init_epoch, init_batchnum, test
            forwarded unchanged to LabeledDataProvider.__init__
        dp_params - dict
            required despite the None default. Keys read here:
                'dataset_name'  (module name, attribute name) pair
                'batch_size'
                'num_batches_for_mean'
                'preproc'
                'dataset_data'  optional, passed as data= to the dataset
                'subslice'      optional (method name, kwargs) pair that is
                                looked up on the dataset object
        read_mode - str
            passed as mode= to larray.cache_hdf5 (not used for memmap)
        cache_type - str
            'hdf5' or 'memmap'; any other value leaves self.stimarray
            unassigned by this method

        Raises AssertionError when an existing meta file disagrees with
        dp_params, and ValueError when the meta file has to be computed but
        no meta batch is available to compute it from.
        """
        # -- load dataset and meta
        modulename, attrname = dp_params['dataset_name']
        module = importlib.import_module(modulename)
        self.dp_params = dp_params
        print('DP_PARAMS', dp_params)
        dataset_obj = getattr(module, attrname)
        print(module, attrname)
        dataset_data = dp_params.get('dataset_data', None)
        if dataset_data is not None:
            dset = dataset_obj(data=dataset_data)
        else:
            dset = dataset_obj()
        self.dset = dset

        perm, perm_id = self.get_perm()
        self.perm = perm
        self.perm_id = perm_id
        if 'subslice' in dp_params:
            # -- self.subslice first holds the raw spec, then is replaced
            #    below by the resolved index array
            subslice_method, subslice_kwargs = self.subslice = dp_params['subslice']
            subslice = getattr(self.dset, subslice_method)(**subslice_kwargs).nonzero()[0]
            if perm is not None:
                self.subslice = fast.isin(perm, subslice).nonzero()[0]
            else:
                self.subslice = subslice

        metacol = self.metacol = self.get_metacol()
        if hasattr(metacol, 'keys'):
            mlen = len(metacol.values()[0])
        else:
            mlen = len(metacol)

        # -- compute number of batches
        batch_size = self.batch_size = dp_params['batch_size']
        self.num_batches = int(math.ceil(mlen / float(batch_size)))
        num_batches_for_meta = self.num_batches_for_meta = dp_params['num_batches_for_mean']

        images = dset.get_images(preproc=dp_params['preproc'])
        if hasattr(images, 'dirname'):
            base_dir, orig_name = os.path.split(images.dirname)
        else:
            base_dir = dset.home('cache')
            orig_name = 'images_cache_' + get_id(dp_params['preproc'])

        reorder = Reorder(images)
        permuted_images = larray.lmap(reorder, self.perm, f_map=reorder)
        if cache_type == 'hdf5':
            new_name = orig_name + '_' + self.perm_id + '_hdf5'
            print('Getting stimuli from cache hdf5 at %s/%s ' % (base_dir, new_name))
            self.stimarray = larray.cache_hdf5(permuted_images,
                                               name=new_name,
                                               basedir=base_dir,
                                               mode=read_mode)
        elif cache_type == 'memmap':
            new_name = orig_name + '_' + self.perm_id + '_memmap'
            print('Getting stimuli from cache memmap at %s/%s ' % (base_dir, new_name))
            self.stimarray = larray.cache_memmap(permuted_images,
                                                 name=new_name,
                                                 basedir=base_dir)

        # -- default data location
        if data_dir == '':
            pstring = hashlib.sha1(repr(dp_params['preproc'])).hexdigest() + '_%d' % dp_params['batch_size']
            data_dir = dset.home('convnet_batches', pstring)
        if not os.path.exists(data_dir):
            print('data_dir %s does not exist, creating' % data_dir)
            os.makedirs(data_dir)

        # NOTE(review): the hash uses the local (un-permuted) `subslice`,
        # which only exists when 'subslice' is in dp_params -- confirm that
        # self.subslice is never provided any other way.
        if hasattr(self, 'subslice'):
            hashval = get_id(tuple(subslice.tolist()))
            metafile = os.path.join(data_dir, 'batches_%s.meta' % hashval)
        else:
            metafile = os.path.join(data_dir, 'batches.meta')
        self.metafile = metafile

        if os.path.exists(metafile):
            print('Meta file at %s exists, loading' % metafile)
            # -- the file is written in binary mode, so read it the same way
            #    and close the handle deterministically
            with open(metafile, 'rb') as _f:
                bmeta = cPickle.load(_f)
            # -- assertions checking that the things that need to be the same
            #    for these batches to make sense are in fact the same
            assert dp_params['batch_size'] == bmeta['num_cases_per_batch'], (dp_params['batch_size'], bmeta['num_cases_per_batch'])
            if 'subslice' in bmeta or 'subslice' in dp_params:
                # -- .get() so that a one-sided subslice fails the assertion
                #    instead of raising KeyError
                assert dp_params.get('subslice') == bmeta.get('subslice'), (dp_params.get('subslice'), bmeta.get('subslice'))
            if 'dataset_name' in bmeta:
                assert dp_params['dataset_name'] == bmeta['dataset_name'], (dp_params['dataset_name'], bmeta['dataset_name'])
            if 'preproc' in bmeta:
                assert dp_params['preproc'] == bmeta['preproc'], (dp_params['preproc'], bmeta['preproc'])
            if 'dataset_data' in bmeta:
                assert dataset_data == bmeta['dataset_data'], (dataset_data, bmeta['dataset_data'])
        else:
            print('Making batches.meta at %s ...' % metafile)
            imgs_mean = None
            num_vis = None
            isf = 0
            for bn in range(num_batches_for_meta):
                print('Meta batch %d' % bn)
                # -- get stimuli and put in the required format
                stims = self.get_stims(bn, batch_size)
                print('Got stims', stims.shape, stims.nbytes)
                if 'float' in repr(stims.dtype):
                    stims = n.uint8(n.round(255 * stims))
                print('Converted to uint8', stims.nbytes)
                d = dldata_to_convnet_reformatting(stims, None)
                # -- add to the running mean
                # NOTE(review): the weight below is d['data'].shape[0], the
                # same axis reported as 'num_vis', not the size of axis 1 --
                # confirm this weighting is intended.
                num_vis = d['data'].shape[0]
                if imgs_mean is None:
                    imgs_mean = n.zeros((num_vis,))
                fr = isf / (isf + float(num_vis))
                imgs_mean *= fr
                imgs_mean += (1 - fr) * d['data'].mean(axis=1)
                isf += num_vis

            if num_vis is None:
                # -- without at least one meta batch there is no data to
                #    derive 'num_vis' / 'data_mean' from
                raise ValueError(
                    'cannot build %s: num_batches_for_mean=%r yields no '
                    'batches' % (metafile, num_batches_for_meta))

            # -- write out batches.meta
            outdict = {'num_cases_per_batch': batch_size,
                       'label_names': self.labels_unique,
                       'num_vis': num_vis,
                       'data_mean': imgs_mean,
                       'dataset_name': dp_params['dataset_name'],
                       'dataset_data': dataset_data,
                       'preproc': dp_params['preproc']}
            if 'subslice' in dp_params:
                outdict['subslice'] = dp_params['subslice']
            with open(metafile, 'wb') as _f:
                cPickle.dump(outdict, _f)

        with open(metafile, 'rb') as _f:
            self.batch_meta = cPickle.load(_f)

        LabeledDataProvider.__init__(self, data_dir, batch_range,
                                     init_epoch, init_batchnum, dp_params, test)
Example #29
0
def pyll_theano_batched_lmap(
    pipeline,
    seq,
    batchsize,
    _debug_call_counts=None,
    print_progress_every=float('inf'),
    abort_on_rows_larger_than=None,
    speed_thresh=None,
    x_dtype='float32',
):
    """
    This function returns a skdata.larray.lmap object whose function
    is defined by a theano expression.

    The theano expression is built and compiled specifically for a batch of
    `batchsize` elements shaped like the elements of `seq`.

    pipeline - callable
        called with `(s_X, in_shp)` (symbolic input batch, its shape); the
        result is passed to pyll.rec_eval, which must produce a
        `(symbolic output batch, output shape)` pair
    seq - array-like with a `.shape`
        the sequence to map over
    batchsize - int
        number of elements processed by one call of the compiled function
    _debug_call_counts - dict or None
        when not None, entries 'fn_1' and 'f_map' are incremented on every
        call of the respective function (missing entries start at 0)
    print_progress_every - float
        minimum number of seconds between progress log messages
    abort_on_rows_larger_than - int or None
        raise ValueError if one output row has more elements than this
    speed_thresh - dict or None
        {'elements': N, 'seconds': S}; once at least N elements have been
        timed, EvalTimeout is raised if they were processed at a lower rate
        than N / S, otherwise the timing counters are reset
    x_dtype - str
        dtype of the shared input buffer

    Raises ValueError for over-long output rows and, from the batched map
    function, for non-finite outputs; raises EvalTimeout as described above.
    """

    in_shp = (batchsize, ) + seq.shape[1:]
    batch = np.zeros(in_shp, dtype=x_dtype)
    s_ibatch = theano.shared(batch)
    s_xi = theano.tensor.as_tensor_variable(s_ibatch).type()
    s_N = s_xi.shape[0]
    s_X = theano.tensor.set_subtensor(s_ibatch[:s_N], s_xi)
    thing = pipeline((s_X, in_shp))
    s_obatch, oshp = pyll.rec_eval(thing)
    assert oshp[0] == batchsize
    logger.info('batched_lmap oshp %s' % str(oshp))
    if abort_on_rows_larger_than:
        rowlen = np.prod(oshp[1:])
        if rowlen > abort_on_rows_larger_than:
            raise ValueError('rowlen %i exceeds limit %i' %
                             (rowlen, abort_on_rows_larger_than))

    # Compile a function that takes a variable number of elements in,
    # returns the same number of processed elements out,
    # but does all internal computations using a fixed number of elements,
    # because convolutions are fastest when they're hard-coded to a certain
    # size.
    logger.debug('pyll_theano_batched_lmap compiling fn')
    _fn = theano.function(
        [theano.Param(s_xi, strict=True)],
        s_obatch[:s_N],
        updates={
            s_ibatch: s_X,  # this allows the inc_subtensor to be in-place
        })
    logger.debug('pyll_theano_batched_lmap compiling fn -> done')

    def count_call(name):
        # -- `is not None` (rather than truthiness) so that a counter dict
        #    that starts out empty is still updated
        if _debug_call_counts is not None:
            _debug_call_counts[name] = _debug_call_counts.get(name, 0) + 1

    sums = {'elems': 0, 'times': 0.0}
    if speed_thresh is None:
        time_fn = _fn
    else:

        def time_fn(X):
            # -- run _fn and accumulate element count and wall-clock time
            t0 = time.time()
            if str(X.dtype) != x_dtype:
                logger.warning('time_fn dtype problem %s %s',
                               X.dtype, x_dtype)
            rval = _fn(X)
            dt = time.time() - t0
            sums['elems'] += len(X)
            sums['times'] += dt
            return rval

        def raise_if_slow():
            # -- only judge the speed once enough elements have been timed
            if sums['elems'] < speed_thresh['elements']:
                return
            # -- float() avoids integer floor division of two ints
            required_ratio = (float(speed_thresh['elements']) /
                              speed_thresh['seconds'])
            elapsed = sums['times']
            # -- zero elapsed time cannot be "too slow" (and must not divide)
            if elapsed > 0 and sums['elems'] / elapsed < required_ratio:
                raise EvalTimeout(
                    'batched_lmap failed to compute %i elements in %f secs' %
                    (speed_thresh['elements'], speed_thresh['seconds']))
            sums['elems'] = 0
            sums['times'] = 0.0

    def fn_1(x):
        # -- single-element path: add a leading batch axis, strip it again
        count_call('fn_1')
        return time_fn(x[None, :, :, :])[0]

    attrs = {'shape': oshp[1:], 'ndim': len(oshp) - 1, 'dtype': s_obatch.dtype}

    def rval_getattr(attr, objs):
        # -- objs don't matter to the structure of the return value
        try:
            return attrs[attr]
        except KeyError:
            raise AttributeError(attr)

    fn_1.rval_getattr = rval_getattr

    # -- one-element list so the nested function can update it in place
    last_print_time = [time.time()]

    def check_for_print(offset, X):
        curtime = time.time()
        if (curtime - last_print_time[0]) > print_progress_every:
            logger.info('pyll_theano_batched_lmap.f_map %i %i' %
                        (offset, len(X)))
            last_print_time[0] = curtime

        if speed_thresh is not None:
            raise_if_slow()

    def f_map(X):
        count_call('f_map')

        # NOTE(review): this exact-batch fast path skips the finiteness
        # check applied in the loop below -- confirm that is intended.
        if len(X) == batchsize:
            check_for_print(offset=0, X=X)
            return time_fn(X)

        rval = np.empty((len(X), ) + oshp[1:], dtype=s_obatch.dtype)
        offset = 0
        while offset < len(X):
            check_for_print(offset, X)
            xi = X[offset:offset + batchsize]
            fn_i = time_fn(xi)
            if not np.all(np.isfinite(fn_i)):
                raise ValueError('non-finite features')
            rval[offset:offset + len(xi)] = fn_i
            offset += len(xi)
        return rval

    return larray.lmap(fn_1, seq, f_map=f_map)
    def normalized_image_features(
        self,
        images,
        xmean,
        xstd,
        avg_nrm,
        n_rows_to_estimate_stats=1000,
        flatten=True,
        batched_lmap_speed_thresh=None,
    ):
        """
        Return a lazily normalized view of the features of `images`,
        together with the normalization statistics that were used.

        images
            passed to self.get_image_features_lmap
        xmean, xstd, avg_nrm
            statistics to normalize with; when xmean is None all three are
            estimated from the first `n_rows_to_estimate_stats` feature rows
            using the pipeline entries 'remove_std0', 'varthresh' and
            'divrowl2'
        n_rows_to_estimate_stats - int
            number of leading feature rows loaded to estimate the statistics
        flatten - bool
            must be true; anything else raises NotImplementedError
        batched_lmap_speed_thresh
            not used by this method

        Returns the tuple (normed_features, xmean, xstd, avg_nrm).
        """

        if not flatten:
            raise NotImplementedError('only flatten is implemented')

        pipeline = self.pipeline
        features_lmap = self.get_image_features_lmap(images)
        n_features = np.prod(features_lmap.shape[1:])

        if xmean is None:
            # -- pull the leading rows into memory to estimate the stats
            head_rows = features_lmap[:n_rows_to_estimate_stats]
            sample = flatten_elems(head_rows)

            xmean, raw_std = mean_and_std(
                sample, remove_std0=pipeline['remove_std0'])
            xstd = np.sqrt(raw_std ** 2 + pipeline['varthresh'])

            avg_nrm = (
                1e-7 + average_row_l2norm((sample - xmean) / xstd)
                if pipeline['divrowl2'] else 1)

        def normalize_many(x):
            # -- batch version: one flattened row per element
            rows = x.reshape((len(x), -1))
            return (rows - xmean) / (xstd * avg_nrm)

        @lmap_info(shape=(n_features, ), dtype=features_lmap.dtype)
        def normalize(x):
            # -- single-element version
            return (x.flatten() - xmean) / (xstd * avg_nrm)

        normed_features = lmap(
            normalize, features_lmap, ragged=False, f_map=normalize_many)

        return normed_features, xmean, xstd, avg_nrm