def test_lprint(): paths = None rgb_imgs = larray.lmap(test_lprint, paths) rgb_imgs2 = larray.lmap(test_lprint, rgb_imgs) s = larray.lprint_str(rgb_imgs2) print s assert s == """lmap(test_lprint, ...)
def __init__(self, *args, **kwargs):
    """Set up the view2 cross-validation splits on top of FullProtocol.

    After the base-class initialisation, each fold of ``self.view2`` is
    used once as the test set while all remaining folds are concatenated
    to form the matching training set.

    Attributes written:
        x, y    -- lazy pair loader / labels over the flattened view2
        splits  -- list with one dotdict per fold, holding ``x``, ``y``,
                   ``train`` (x, y) and ``test`` (x, y)
    """
    FullProtocol.__init__(self, *args, **kwargs)
    view2 = self.view2

    # -- the complete data set, shared by every split
    all_x = lmap(self.load_pair, view2.flatten())
    all_y = self.view2.flatten()['label']

    def complement(held_out):
        # every fold except the one currently held out for testing
        return [fold for fold_j, fold in enumerate(view2)
                if fold_j != held_out]

    splits = []
    for fold_i, test_fold in enumerate(view2):
        # -- test
        test = dotdict(
            x=lmap(self.load_pair, test_fold),
            y=test_fold['label'])
        # -- train
        train = dotdict(
            x=lmap(self.load_pair, np.concatenate(complement(fold_i))),
            y=np.concatenate(
                [fold['label'] for fold in complement(fold_i)]))
        splits.append(dotdict(x=all_x, y=all_y, train=train, test=test))

    self.x = all_x
    self.y = all_y
    self.splits = splits
def __init__(self, *args, **kwargs):
    """Initialise FullProtocol and derive leave-one-fold-out splits.

    ``self.view2`` is iterated fold by fold; the fold at hand becomes the
    test portion and the concatenation of all other folds becomes the
    train portion.  Pair data is wrapped lazily with
    ``lmap(self.load_pair, ...)``; labels come from the ``'label'`` field.

    Sets ``self.x``, ``self.y`` (flattened view2) and ``self.splits``
    (one dotdict per fold).
    """
    FullProtocol.__init__(self, *args, **kwargs)
    view2 = self.view2
    load_pair = self.load_pair

    all_x = lmap(load_pair, view2.flatten())
    all_y = self.view2.flatten()['label']

    splits = []
    for fold_i, test_fold in enumerate(view2):
        # -- test
        test_x = lmap(load_pair, test_fold)
        test_y = test_fold['label']

        # -- train: everything that is not the test fold
        train_folds = []
        train_labels = []
        for fold_j, fold in enumerate(view2):
            if fold_j == fold_i:
                continue
            train_folds.append(fold)
            train_labels.append(fold['label'])
        train_x = lmap(load_pair, np.concatenate(train_folds))
        train_y = np.concatenate(train_labels)

        split = dotdict(
            x=all_x,
            y=all_y,
            train=dotdict(x=train_x, y=train_y),
            test=dotdict(x=test_x, y=test_y),
        )
        splits.append(split)

    self.x = all_x
    self.y = all_y
    self.splits = splits
def test_pprint(): paths = None rgb_imgs = larray.lmap(test_pprint, paths) rgb_imgs2 = larray.lmap(test_pprint, rgb_imgs) s = larray.pprint_str(rgb_imgs2) print s assert s == """lmap(test_pprint, ...)
def getPixelFeatures(objects_oi, normalize_on=False, IMGPATH=IMGPATH_DEFAULT):
    """Compute pixel features on images of objects of interest.

    Parameters
    ----------
    objects_oi : container
        A metadata record is selected when its ``obj`` field is ``in``
        this container.
    normalize_on : bool
        Forwarded to ``ImgLoaderResizer`` as ``normalize``.
    IMGPATH : str
        Prefix joined by plain string concatenation with
        ``"metadata.pkl"`` and ``"images/<id>.png"``, so it must already
        end with a path separator.

    Returns
    -------
    (pixels_features, pixel_meta)
        ``pixels_features`` is the loaded image stack reshaped to
        ``(shape[0], shape[1] * shape[2])``; ``pixel_meta`` is ``meta``
        indexed with the list of selected positions.
    """
    # Pickle data is binary: read it in "rb" mode and release the handle
    # as soon as loading is done.
    # NOTE: unpickling can execute arbitrary code -- only point IMGPATH at
    # trusted metadata files.
    with open(IMGPATH + "metadata.pkl", "rb") as meta_file:
        meta = pk.load(meta_file)

    # -- fix obj field: unwrap it when it is stored as a 1-element container
    if len(meta[0]["obj"]) == 1:
        for i, m in enumerate(meta):
            meta[i]["obj"] = m["obj"][0]

    meta_ind = []
    image_paths = []
    for i, m in enumerate(meta):
        if m["obj"] in objects_oi:
            meta_ind.append(i)
            image_paths.append(IMGPATH + "images/" + m["id"] + ".png")

    loader = ImgLoaderResizer(
        inshape=(256, 256),
        shape=(256, 256),
        dtype="float32",
        normalize=normalize_on,
        mask=None,
    )
    # np.array() forces the lazy map, i.e. this is where images are read
    imgs = np.array(larray.lmap(loader, image_paths))
    ts = imgs.shape
    print(ts)

    # flatten every image into a single row of pixels
    pixels_features = imgs.reshape(ts[0], ts[1] * ts[2])
    pixel_meta = meta[meta_ind]
    return pixels_features, pixel_meta
def __init__(self, x_dtype='uint8', x_height=250, x_width=250,
             max_n_per_class=None, channel_major=False):
    """Index the image pairs of DATASET_CLASS and expose pixels lazily.

    x_dtype: dtype requested from the image loader
    x_height, x_width: spatial shape requested from the image loader
    max_n_per_class: when not None, every pair array is truncated along
        its third axis to this many entries
    channel_major: accepted but not used by this constructor

    Raises NotImplementedError when ``DATASET_CLASS`` is None.
    """
    if self.DATASET_CLASS is None:
        raise NotImplementedError("This is an abstract class")

    # -- build/fetch dataset
    self.dataset = self.DATASET_CLASS()
    # bare attribute access kept on purpose (presumably it forces the
    # dataset to be fetched/built -- TODO confirm)
    self.dataset.meta

    pair_sets = [
        self.dataset.pairsDevTrain,
        self.dataset.pairsDevTest,
        self.dataset.pairsView2,
    ]
    if max_n_per_class is not None:
        pair_sets = [pairs[:, :, :max_n_per_class] for pairs in pair_sets]
    pairsDevTrain, pairsDevTest, pairsView2 = pair_sets

    logging.info('pairsDevTrain shape %s', pairsDevTrain.shape)
    logging.info('pairsDevTest shape %s', pairsDevTest.shape)
    logging.info('pairsView2 shape %s', pairsView2.shape)

    paths_labels_dev_train = paths_labels(pairsDevTrain)
    paths_labels_dev_test = paths_labels(pairsDevTest)
    paths_labels_view2 = paths_labels(pairsView2)

    # one sorted list of relative paths covering all three pair sets
    flattened = [
        pl.flatten()
        for pl in (paths_labels_dev_train,
                   paths_labels_dev_test,
                   paths_labels_view2)
    ]
    rel_paths = sorted_paths(np.concatenate(flattened))

    self.image_paths = [
        self.dataset.home('images', self.dataset.IMAGE_SUBDIR, pth)
        for pth in rel_paths
    ]

    def lookup(pairs):
        return paths_labels_lookup(paths_labels(pairs), rel_paths)

    self.dev_train = lookup(pairsDevTrain)
    self.dev_test = lookup(pairsDevTest)
    self.view2 = lookup(pairsView2)

    # -- lazy array helper function
    if self.dataset.COLOR:
        mode, n_channels = 'RGB', 3
    else:
        mode, n_channels = 'L', 1
    loader = ImgLoader(
        ndim=3,
        dtype=x_dtype,
        mode=mode,
        shape=(x_height, x_width, n_channels))
    self.image_pixels = lmap(loader, self.image_paths)

    self.paths_labels_dev_train = paths_labels_dev_train
    self.paths_labels_dev_test = paths_labels_dev_test
    self.paths_labels_view2 = paths_labels_view2

    # sanity-check the first image (this loads it)
    assert str(self.image_pixels[0].dtype) == x_dtype
    assert self.image_pixels[0].ndim == 3
def get_image_features_lmap(self, images, batched_lmap_speed_thresh=None):
    """Return a batched lazy map of pipeline features for `images`.

    `images` must have a 4-element shape whose last entry (channels) is
    1 or 3.  Every image is transposed to channel-major order and then
    fed through ``self.pipeline['pipe']`` by ``pyll_theano_batched_lmap``.

    batched_lmap_speed_thresh: forwarded as ``speed_thresh``.
    """
    _, height, width, channels = images.shape
    assert channels in (1, 3)

    # -- this loading must be simple, and match the unsup_images
    #    function in lfw. Anything more elaborate must
    #    be included in the pyll pipeline

    def chmajor_fn_f_map(X):
        # batch version: keep axis 0, move the channel axis to position 1
        return np.transpose(X, axes=(0, 3, 1, 2))

    annotate = lmap_info(shape=(channels, height, width), dtype=images.dtype)
    chmajor_fn = annotate(functools.partial(np.transpose, axes=(2, 0, 1)))
    chmajor_fn.f_map = chmajor_fn_f_map

    pipe = scope.partial(scope.callpipe1, self.pipeline['pipe'])
    return pyll_theano_batched_lmap(
        pipe,
        lmap(chmajor_fn, images),
        batchsize=self.batchsize,
        print_progress_every=10,  # -- seconds
        abort_on_rows_larger_than=self.max_n_features,
        speed_thresh=batched_lmap_speed_thresh,
        x_dtype='uint8',  # HAS TO MATCH ./slm.py
    )
def test_usage():
    """Looping over a permuted lazy map yields the elements in permuted order."""
    np.random.seed(123)

    def load_rgb(pth):
        return pth + '_rgb'

    def load_grey(pth):
        return pth + '_grey'

    def to_64x64(img):
        return img + '_64x64'

    # imagine some huge list of image paths
    paths = ['a', 'b', 'c', 'd']

    rgb_imgs = larray.lmap(load_rgb, paths)
    grey_imgs = larray.lmap(load_grey, paths)
    paths_64x64 = larray.lmap(to_64x64, grey_imgs)

    order = np.random.permutation(len(paths))
    train_set = larray.reindex(rgb_imgs, order).loop()

    l10 = list(train_set[range(10)])
    print(l10)

    leading_chars = [item[0] for item in l10[:4]]
    assert ['d', 'a', 'b', 'c'] == leading_chars
def slm_memmap(desc, X, name, basedir=None):
    """
    Build a cache_memmap over the SLM features of every row of `X`.

    desc: description handed to SLMFunction
    X: input array; `X.shape[1:]` is handed to SLMFunction
    name: name of the memmap cache
    basedir: cache directory, defaults to the current working directory
    """
    cache_dir = os.getcwd() if basedir is None else basedir
    features = larray.lmap(SLMFunction(desc, X.shape[1:]), X)
    return larray.cache_memmap(features, name, basedir=cache_dir)
def get_images(self, preproc):
    """Return a lazy array of the images named in ``self.filenames``.

    `preproc` must provide the keys ``"dtype"``, ``"mode"``, ``"size"``
    and ``"normalize"``; they configure the ImgDownloaderResizer that
    loads each file.
    """
    # looked up in the same order as before so a missing key fails alike
    options = dict(
        dtype=preproc["dtype"],
        mode=preproc["mode"],
        shape=tuple(preproc["size"]),
        normalize=preproc["normalize"],
    )
    loader = ImgDownloaderResizer(
        self.home("resources"),
        self.bucket,
        inshape=self.insize,
        **options
    )
    return larray.lmap(loader, self.filenames)
def test_using_precompute():
    """clone() with `given` swaps a precomputed sequence into a lazy graph."""
    np.random.seed(123)

    # example library code starts here
    def load_rgb(pth):
        return pth + '_rgb'

    def load_grey(pth):
        return pth + '_grey'

    def to_64x64(img):
        return img + '_64x64'

    # imagine some huge list of image paths
    paths = ['a', 'b', 'c', 'd']

    grey_imgs = larray.lmap(load_grey, paths)
    paths_64x64 = larray.lmap(to_64x64, grey_imgs)
    order = np.random.permutation(len(paths))
    train_set = larray.reindex(paths_64x64, order).loop()

    # example user code starts here.
    # It is easy to memmap the __array__ of paths_64x64, but
    # it is more difficult to compute derived things using that
    # memmap.

    # pretend this is a memmap of a precomputed quantity, for example.
    use_paths_64x64 = ['stuff', 'i', 'saved', 'from', 'disk']

    # the rest of the original graph (e.g. train_set)
    # doesn't know about our new memmap
    # or mongo-backed proxy, or whatever we're doing.
    substitutions = {paths_64x64: use_paths_64x64}
    new_train_set = larray.clone(train_set, given=substitutions)

    l10 = list(new_train_set[range(10)])
    print(l10)

    one_pass = ['from', 'stuff', 'i', 'saved']
    assert l10 == one_pass + one_pass + one_pass[:2]
def test_using_precompute():
    """Replace one node of a lazy graph with precomputed values via clone()."""
    np.random.seed(123)

    # example library code starts here
    def load_rgb(pth):
        return pth + '_rgb'

    def load_grey(pth):
        return pth + '_grey'

    def to_64x64(img):
        return img + '_64x64'

    paths = ['a', 'b', 'c', 'd']  # imagine some huge list of image paths
    n_paths = len(paths)

    rgb_imgs = larray.lmap(load_rgb, paths)
    grey_imgs = larray.lmap(load_grey, paths)
    paths_64x64 = larray.lmap(to_64x64, grey_imgs)
    shuffled = larray.reindex(paths_64x64, np.random.permutation(n_paths))
    train_set = shuffled.loop()

    # example user code starts here.
    # It is easy to memmap the __array__ of paths_64x64, but
    # it is more difficult to compute derived things using that
    # memmap.

    # pretend this is a memmap of a precomputed quantity, for example.
    use_paths_64x64 = ['stuff', 'i', 'saved', 'from', 'disk']

    # the rest of the original graph (e.g. train_set)
    # doesn't know about our new memmap
    # or mongo-backed proxy, or whatever we're doing.
    new_train_set = larray.clone(
        train_set,
        given={paths_64x64: use_paths_64x64})

    l10 = list(new_train_set[range(10)])
    print(l10)

    expected = [
        'from', 'stuff', 'i', 'saved',
        'from', 'stuff', 'i', 'saved',
        'from', 'stuff',
    ]
    assert l10 == expected
def pairs_memmap(pair_labels, X, comparison_name, name, basedir=None):
    """
    Cache pairwise features in a memmap.

    pair_labels - a (left indexes, right indexes, matches) triple,
        something like comes out of verification_pairs
    X - feature vectors to be combined
    comparison_name - handed to PairFeaturesFn together with X
    name - name of the memmap cache
    basedir - cache directory, defaults to the current working directory

    Returns (cached pair features, matches as an ndarray).
    """
    if basedir is None:
        basedir = os.getcwd()

    left_idxs, right_idxs, matches = pair_labels
    combine = PairFeaturesFn(X, comparison_name)
    pair_features = larray.lmap(combine, left_idxs, right_idxs)
    cached = larray.cache_memmap(pair_features, name, basedir=basedir)
    return cached, np.asarray(matches)
def get_images(self, resize_to=(256, 256), mode='L', dtype='float32',
               crop=None, mask=None, normalize=True):
    """
    Create a lazily reevaluated array over ``self.meta['filename']``.

    All arguments are forwarded to ImgDownloaderResizer (together with
    ``cache=self.img_cache``):

    resize_to: Image is resized to the tuple given here (note: not
        reshaped)
    mode: 'RGB' or 'L' specifies whether or not to store color images
    dtype: The datatype of the image array
    crop: array of [minx, maxx, miny, maxy] crop box applied after resize
    mask: Image object which is used to mask the image
    normalize: If true, then the image is set to zero mean and unit
        standard deviation
    """
    file_names = list(self.meta['filename'])
    loader = ImgDownloaderResizer(
        resize_to=resize_to,
        dtype=dtype,
        normalize=normalize,
        crop=crop,
        mask=mask,
        mode=mode,
        cache=self.img_cache,
    )
    return larray.lmap(loader, file_names)
def get_images(self, preproc, n_jobs=-1, cache=False):
    """
    Create a lazily reevaluated array with preprocessing specified by the
    preprocessing dictionary `preproc`.

    `preproc`, `n_jobs` and `cache` are forwarded unchanged to
    ImgDownloaderPreprocessor; see its documentation for their meaning.
    The same processor object serves as both the per-element function and
    the batch ``f_map``.
    """
    meta = self.meta
    file_names = meta["filename"]
    # positional indices are used as ids (not the metadata 'id' column)
    file_ids = np.arange(meta.shape[0])

    processor = ImgDownloaderPreprocessor(
        source=get_img_source(),
        preproc=preproc,
        n_jobs=n_jobs,
        cache=cache,
        cachedir=self.imagenet_home("images"),
    )
    return larray.lmap(processor, file_names, file_ids, f_map=processor)
def __init__(self, coll, fs, query, preproc):
    """Index the documents matching `query` and expose their images lazily.

    coll: collection object; ``coll.find(query).sort('filename')`` is
        used to fetch the metadata documents
    fs: handed to ImgLoader as its first argument
    query: query passed to ``coll.find``
    preproc: dict-like with optional keys ``'global_normalize'``
        (default True) and ``'size'`` (default ``(200, 200)``)

    Raises AssertionError when the configured size does not have exactly
    two entries.
    """
    self.coll = coll
    self.fs = fs
    self.query = query
    self.preproc = preproc

    # Resolve the image size BEFORE validating it.  The check used to be
    # the first statement of the constructor, where `size` was not yet
    # bound, so every instantiation failed with UnboundLocalError.
    normalize = self.preproc.get('global_normalize', True)
    size = tuple(self.preproc.get('size', (200, 200)))
    assert len(size) == 2, size

    cursor = coll.find(query).sort('filename')
    self.meta = list(cursor)
    self.filenames = [m['filename'] for m in self.meta]

    loader = ImgLoader(
        fs,
        ndim=3,
        shape=size + (3,),  # two spatial entries plus 3 for mode 'RGB'
        mode='RGB',
        normalize=normalize)
    self.imgs = larray.lmap(loader, self.filenames)
def get_stimarray(marray, mname, perm, perm_id, cache_type, base_dir,
                  read_mode='r'):
    """Return a disk-cached, reordered view of `marray`.

    A Reorder2 over `marray` is mapped lazily over `perm` and wrapped in
    an hdf5 or memmap cache named ``<mname>_<perm_id>_<cache_type>``
    inside `base_dir`.  `read_mode` is only used by the hdf5 cache.

    Returns None when `cache_type` is neither 'hdf5' nor 'memmap'.
    """
    reorder = Reorder2(marray)
    reordered = larray.lmap(reorder, perm, f_map=reorder)

    if cache_type == 'memmap':
        new_name = mname + '_' + perm_id + '_memmap'
        print('Getting stimuli from cache memmap at %s/%s '
              % (base_dir, new_name))
        return larray.cache_memmap(reordered, name=new_name,
                                   basedir=base_dir)

    if cache_type == 'hdf5':
        new_name = mname + '_' + perm_id + '_hdf5'
        print('Getting stimuli from cache hdf5 at %s/%s '
              % (base_dir, new_name))
        return larray.cache_hdf5(reordered, name=new_name,
                                 basedir=base_dir, mode=read_mode)

    # unknown cache type: nothing is built
    return None
def get_images(self, preproc, n_jobs=-1, cache=False):
    """
    Create a lazily reevaluated array whose preprocessing is described by
    the dictionary `preproc`.

    The arguments are handed to ImgDownloaderPreprocessor as they are;
    see the documentation of that class.  Images are cached below
    ``self.imagenet_home('images')``.
    """
    file_names = self.meta['filename']
    # ids are simply 0..len(meta)-1, not the 'id' metadata column
    n_records = self.meta.shape[0]
    file_ids = np.arange(n_records)

    img_source = get_img_source()
    cachedir = self.imagenet_home('images')
    settings = dict(
        source=img_source,
        preproc=preproc,
        n_jobs=n_jobs,
        cache=cache,
        cachedir=cachedir,
    )
    processor = ImgDownloaderPreprocessor(**settings)

    # the processor doubles as the batch implementation (f_map)
    return larray.lmap(processor, file_names, file_ids, f_map=processor)
def getPixelFeatures_localized(objects_oi, IMGPATH=IMGPATH_DEFAULT):
    """Compute pixel features localized to a window given by the metadata.

    For every metadata record whose ``obj`` is in `objects_oi`, the image
    ``images/<obj>_<id>.png`` is loaded at 256x256 and a ``2 * win`` by
    ``2 * win`` window (``win = 5``) is cut out around a centre derived
    from the record's ``tz`` (row) and ``ty`` (column) fields.  When the
    window would cross an image border the image is zero-padded first.

    Parameters
    ----------
    objects_oi : container
        Object names to keep.
    IMGPATH : str
        Prefix joined by plain string concatenation with
        ``"metadata.pkl"`` and ``"images/..."``; it must already end with
        a path separator.

    Returns
    -------
    (pixels_features, pixel_meta)
        Array with one flattened window per selected record, and ``meta``
        indexed with the list of selected positions.
    """
    # Pickle data is binary: read it in "rb" mode and close the handle.
    # NOTE: unpickling can execute arbitrary code -- only point IMGPATH at
    # trusted metadata files.
    with open(IMGPATH + "metadata.pkl", "rb") as meta_file:
        meta = pk.load(meta_file)

    meta_ind, pixels_features = [], []
    win = 5
    img_size = 256

    for i, m in enumerate(meta):
        if m["obj"] not in objects_oi:
            continue

        # window centre in pixel coordinates
        ii = int(-m["tz"] * img_size / 2 + img_size / 2)
        jj = int(m["ty"] * img_size / 2 + img_size / 2)
        meta_ind.append(i)

        fn = [IMGPATH + "images/" + m["obj"] + "_" + m["id"] + ".png"]
        img = larray.lmap(
            ImgLoaderResizer(
                inshape=(1024, 1024),
                shape=(img_size, img_size),
                dtype="float32",
                normalize=False,
                mask=None,
            ),
            fn,
        )
        # materialise the single-element lazy map and drop the unit axis
        img = np.squeeze(np.array(img))

        # if image section goes beyond border, add a zero padding
        pad = np.zeros(img.shape)
        if ii - win < 0:
            img = np.concatenate((pad, img), axis=0)
            ii += img_size  # rows shifted down by the padding
        elif (ii + win) >= img_size:
            img = np.concatenate((img, pad), axis=0)

        # the row padding may have changed the shape: rebuild the pad
        pad = np.zeros(img.shape)
        if jj - win < 0:
            img = np.concatenate((pad, img), axis=1)
            jj += img_size  # columns shifted right by the padding
        elif jj + win >= img_size:
            img = np.concatenate((img, pad), axis=1)

        window = img[ii - win:ii + win, jj - win:jj + win]
        pixels_features.append(window.flatten())

    pixels_features = np.array(pixels_features)
    pixel_meta = meta[meta_ind]
    return pixels_features, pixel_meta
def test_usage():
    """reindex(...).loop() cycles through the lazy map in permuted order."""
    np.random.seed(123)

    def load_rgb(pth):
        return pth + '_rgb'

    def load_grey(pth):
        return pth + '_grey'

    def to_64x64(img):
        return img + '_64x64'

    # imagine some huge list of image paths
    paths = ['a', 'b', 'c', 'd']
    rgb_imgs = larray.lmap(load_rgb, paths)

    permutation = np.random.permutation(len(paths))
    train_set = larray.reindex(rgb_imgs, permutation).loop()

    l10 = list(train_set[range(10)])
    print(l10)

    initials = [elem[0] for elem in l10[:4]]
    assert ['d', 'a', 'b', 'c'] == initials
def get_images(dtype, preproc):
    """
    Return a lazy array over the paths of skdata's aligned lfw
    classification task, loaded through ImgLoaderResizer.

    XXX: Should the images really be returned in greyscale?

    dtype : handed to ImgLoaderResizer

    preproc : a dictionary with keys:
        global_normalize - True / False
        size - (height, width)
        crop - (l, t, r, b)
    """
    all_paths = skdata.lfw.Aligned().raw_classification_task()[0]
    loader = ImgLoaderResizer(
        dtype=dtype,
        shape=preproc['size'],
        crop=preproc['crop'],
        normalize=preproc['global_normalize'],
    )
    return larray.lmap(loader, all_paths)
def getPixelFeatures(objects_oi, normalize_on=False):
    """Compute pixel features on images of objects of interest.

    Metadata is read from ``IMGPATH + 'metadata.pkl'`` and the images
    from ``IMGPATH + 'obj64s100/<id>.png'`` (``IMGPATH`` is a module-level
    name and is used by plain string concatenation).

    Parameters
    ----------
    objects_oi : container
        A metadata record is selected when its ``obj`` field is ``in``
        this container.
    normalize_on : bool
        Forwarded to ``ImgLoaderResizer`` as ``normalize``.

    Returns
    -------
    (pixels_features, pixel_meta)
        ``pixels_features`` is the loaded image stack reshaped to
        ``(shape[0], shape[1] * shape[2])``; ``pixel_meta`` is ``meta``
        indexed with the list of selected positions.
    """
    # Pickle data is binary: read it in 'rb' mode and release the handle
    # as soon as loading is done.
    # NOTE: unpickling can execute arbitrary code -- only load trusted
    # metadata files.
    with open(IMGPATH + 'metadata.pkl', 'rb') as meta_file:
        meta = pk.load(meta_file)

    # -- fix obj field: unwrap it when it is stored as a 1-element container
    if len(meta[0]['obj']) == 1:
        for i, m in enumerate(meta):
            meta[i]['obj'] = m['obj'][0]

    meta_ind = []
    image_paths = []
    for i, m in enumerate(meta):
        if m['obj'] in objects_oi:
            meta_ind.append(i)
            image_paths.append(IMGPATH + 'obj64s100/' + m['id'] + '.png')

    loader = ImgLoaderResizer(
        inshape=(256, 256),
        shape=(256, 256),
        dtype='float32',
        normalize=normalize_on,
        mask=None)
    # np.array() forces the lazy map, i.e. this is where images are read
    imgs = np.array(larray.lmap(loader, image_paths))
    ts = imgs.shape
    print(ts)

    # flatten every image into a single row of pixels
    pixels_features = imgs.reshape(ts[0], ts[1] * ts[2])
    pixel_meta = meta[meta_ind]
    return pixels_features, pixel_meta
def get_fg11_features(suffix, expected_shape):
    """Return a memmap-cached lazy array of precomputed features.

    For every path of skdata's aligned lfw classification task the file
    ``feature_root/<name>/<basename><suffix>`` is read with
    ``scipy.io.loadmat`` and its ``'data'`` entry is returned as float32.

    suffix: appended to the image path to locate the feature file; also
        part of the cache name ``'fcache_' + suffix``
    expected_shape: every loaded ``data`` array must have this shape
    """
    paths, _ = skdata.lfw.Aligned().raw_classification_task()

    # -- decorator syntax is equivalent to wrapping load_path by hand
    @larray.lmap_info(shape=expected_shape, dtype='float32')
    def load_path(path):
        basename = os.path.basename(path)
        # cut off the digits and the .jpg
        name = basename[:-9]
        feature_path = os.path.join(feature_root, name, basename) + suffix
        print('loading %s' % feature_path)
        data = scipy.io.loadmat(feature_path)['data']
        assert data.shape == expected_shape
        return np.asarray(data, dtype='float32')

    features = larray.lmap(load_path, paths)
    return larray.cache_memmap(
        features,
        'fcache_' + suffix,
        basedir=os.getcwd())
def get_image_features_lmap(self, images, batched_lmap_speed_thresh=None):
    """Wrap `images` in a batched lazy map computing pipeline features.

    The shape of `images` must unpack into four values, the last of which
    (the channel count) has to be 1 or 3.  Images are converted to
    channel-major layout and pushed through ``self.pipeline['pipe']``.

    batched_lmap_speed_thresh: passed on as ``speed_thresh`` to
        ``pyll_theano_batched_lmap``.
    """
    N, H, W, C = images.shape
    assert C in (1, 3)

    # -- this loading must be simple, and match the unsup_images
    #    function in lfw. Anything more elaborate must
    #    be included in the pyll pipeline
    single_to_chmajor = functools.partial(np.transpose, axes=(2, 0, 1))
    chmajor_fn = lmap_info(shape=(C, H, W), dtype=images.dtype)(
        single_to_chmajor)

    def chmajor_fn_f_map(X):
        # same transposition for a whole batch (leading axis untouched)
        return np.transpose(X, axes=(0, 3, 1, 2))

    chmajor_fn.f_map = chmajor_fn_f_map

    batched_kwargs = dict(
        batchsize=self.batchsize,
        print_progress_every=10,  # -- seconds
        abort_on_rows_larger_than=self.max_n_features,
        speed_thresh=batched_lmap_speed_thresh,
        x_dtype='uint8',  # HAS TO MATCH ./slm.py
    )
    return pyll_theano_batched_lmap(
        scope.partial(scope.callpipe1, self.pipeline['pipe']),
        lmap(chmajor_fn, images),
        **batched_kwargs
    )
def __init__(self, x_dtype='uint8', x_height=250, x_width=250,
             max_n_per_class=None, channel_major=False):
    """Index the image pairs of DATASET_CLASS and expose pixels lazily.

    x_dtype: dtype requested from the image loader
    x_height, x_width: spatial shape requested from the image loader
    max_n_per_class: when not None, every pair array is truncated along
        its third axis to this many entries
    channel_major: accepted but not used by this constructor

    Colour datasets are loaded as 3-d 'RGB' arrays, others as 2-d 'L'
    arrays.  Raises NotImplementedError when ``DATASET_CLASS`` is None.
    """
    if self.DATASET_CLASS is None:
        raise NotImplementedError("This is an abstract class")

    # -- build/fetch dataset
    self.dataset = self.DATASET_CLASS()
    # bare attribute access kept on purpose (presumably it forces the
    # dataset to be fetched/built -- TODO confirm)
    self.dataset.meta

    pairsDevTrain = self.dataset.pairsDevTrain
    pairsDevTest = self.dataset.pairsDevTest
    pairsView2 = self.dataset.pairsView2

    if max_n_per_class is not None:
        keep = slice(None, max_n_per_class)
        pairsDevTrain = pairsDevTrain[:, :, keep]
        pairsDevTest = pairsDevTest[:, :, keep]
        pairsView2 = pairsView2[:, :, keep]

    logging.info('pairsDevTrain shape %s', pairsDevTrain.shape)
    logging.info('pairsDevTest shape %s', pairsDevTest.shape)
    logging.info('pairsView2 shape %s', pairsView2.shape)

    self.paths_labels_dev_train = paths_labels(pairsDevTrain)
    self.paths_labels_dev_test = paths_labels(pairsDevTest)
    self.paths_labels_view2 = paths_labels(pairsView2)

    # one sorted list of relative paths covering all three pair sets
    rel_paths = sorted_paths(np.concatenate([
        self.paths_labels_dev_train.flatten(),
        self.paths_labels_dev_test.flatten(),
        self.paths_labels_view2.flatten(),
    ]))

    self.image_paths = []
    for pth in rel_paths:
        self.image_paths.append(
            self.dataset.home('images', self.dataset.IMAGE_SUBDIR, pth))

    def lookup(pairs):
        return paths_labels_lookup(paths_labels(pairs), rel_paths)

    self.dev_train = lookup(pairsDevTrain)
    self.dev_test = lookup(pairsDevTest)
    self.view2 = lookup(pairsView2)

    # -- lazy array helper function
    if self.dataset.COLOR:
        loader = ImgLoader(ndim=3, dtype=x_dtype, mode='RGB',
                           shape=(x_height, x_width, 3))
    else:
        loader = ImgLoader(ndim=2, dtype=x_dtype, mode='L',
                           shape=(x_height, x_width))
    self.image_pixels = lmap(loader, self.image_paths)
def pyll_theano_batched_lmap(pipeline, seq, batchsize,
        _debug_call_counts=None,
        print_progress_every=float('inf'),
        abort_on_rows_larger_than=None,
        speed_thresh=None,
        x_dtype='float32',
        ):
    """
    This function returns a skdata.larray.lmap object whose function
    is defined by a theano expression.

    The theano expression will be built and compiled specifically for the
    dimensions of the given `seq`. Therefore, in_rows, and out_rows should
    actually be a *pyll* graph, that evaluates to a theano graph.

    pipeline: callable; ``pipeline((s_X, in_shp))`` must give a pyll graph
        that ``pyll.rec_eval`` turns into ``(theano output, output shape)``
    seq: input sequence; ``seq.shape[1:]`` is the shape of one element
    batchsize: fixed number of rows the compiled function computes on
    _debug_call_counts: optional dict with keys 'fn_1' and 'f_map' whose
        values are incremented on every call of the respective function
    print_progress_every: minimum number of seconds between progress logs
    abort_on_rows_larger_than: when truthy, upper bound on the number of
        values per output row
    speed_thresh: optional dict with keys 'elements' and 'seconds'; once
        at least 'elements' rows have been timed, EvalTimeout is raised if
        the observed rows/second is below elements/seconds, otherwise the
        counters are reset
    x_dtype: dtype of the shared input batch buffer

    Raises ValueError when an output row is too long or (on the chunked
    path of the batch function) when features are not finite.
    """
    in_shp = (batchsize,) + seq.shape[1:]
    batch = np.zeros(in_shp, dtype=x_dtype)
    s_ibatch = theano.shared(batch)
    s_xi = theano.tensor.as_tensor_variable(s_ibatch).type()
    s_N = s_xi.shape[0]
    # write the (possibly shorter) input into the head of the fixed buffer
    s_X = theano.tensor.set_subtensor(s_ibatch[:s_N], s_xi)
    thing = pipeline((s_X, in_shp))
    s_obatch, oshp = pyll.rec_eval(thing)
    assert oshp[0] == batchsize
    logger.info('batched_lmap oshp %s' % str(oshp))
    if abort_on_rows_larger_than:
        rowlen = np.prod(oshp[1:])
        if rowlen > abort_on_rows_larger_than:
            raise ValueError('rowlen %i exceeds limit %i' % (
                rowlen, abort_on_rows_larger_than))

    # Compile a function that takes a variable number of elements in,
    # returns the same number of processed elements out,
    # but does all internal computations using a fixed number of elements,
    # because convolutions are fastest when they're hard-coded to a certain
    # size.
    logger.debug('pyll_theano_batched_lmap compiling fn')
    _fn = theano.function([theano.Param(s_xi, strict=True)],
            s_obatch[:s_N],
            updates={
                s_ibatch: s_X,  # this allows the inc_subtensor to be in-place
                })
    logger.debug('pyll_theano_batched_lmap compiling fn -> done')

    # running totals used by the speed check
    sums = {'elems': 0, 'times': 0.0}
    if speed_thresh is None:
        time_fn = _fn
    else:
        def time_fn(X):
            t0 = time.time()
            if str(X.dtype) != x_dtype:
                print('time_fn dtype problem %s %s' % (X.dtype, x_dtype))
            rval = _fn(X)
            dt = time.time() - t0
            sums['elems'] += len(X)
            sums['times'] += dt
            return rval

        def raise_if_slow():
            if sums['elems'] < speed_thresh['elements']:
                return  # not enough rows timed yet
            # float() forces true division: with two ints Python 2 would
            # floor the required rate and make the check too lenient.
            required_ratio = (float(speed_thresh['elements'])
                    / speed_thresh['seconds'])
            elapsed = sums['times']
            # a zero elapsed time (coarse clock) counts as fast enough
            # instead of dividing by zero
            if elapsed > 0 and sums['elems'] / elapsed < required_ratio:
                raise EvalTimeout(
                    'batched_lmap failed to compute %i elements in %f secs'
                    % (speed_thresh['elements'], speed_thresh['seconds']))
            sums['elems'] = 0
            sums['times'] = 0.0

    def fn_1(x):
        if _debug_call_counts:
            _debug_call_counts['fn_1'] += 1
        # add a leading batch axis, compute, and take the single row back
        return time_fn(x[None, :, :, :])[0]

    attrs = {
            'shape': oshp[1:],
            'ndim': len(oshp) - 1,
            'dtype': s_obatch.dtype}

    def rval_getattr(attr, objs):
        # -- objs don't matter to the structure of the return value
        try:
            return attrs[attr]
        except KeyError:
            raise AttributeError(attr)
    fn_1.rval_getattr = rval_getattr

    # one-element list so the nested function can update the timestamp
    last_print_time = [time.time()]

    def check_for_print(offset, X):
        curtime = time.time()
        if (curtime - last_print_time[0]) > print_progress_every:
            logger.info('pyll_theano_batched_lmap.f_map %i %i' % (
                offset, len(X)))
            last_print_time[0] = curtime

        if speed_thresh is not None:
            raise_if_slow()

    def f_map(X):
        if _debug_call_counts:
            _debug_call_counts['f_map'] += 1

        # fast path: exactly one full batch
        # NOTE(review): this path skips the finiteness check done below
        if len(X) == batchsize:
            check_for_print(offset=0, X=X)
            return time_fn(X)

        rval = np.empty((len(X),) + oshp[1:], dtype=s_obatch.dtype)
        offset = 0
        while offset < len(X):
            check_for_print(offset, X)
            xi = X[offset: offset + batchsize]
            fn_i = time_fn(xi)
            if not np.all(np.isfinite(fn_i)):
                raise ValueError('non-finite features')
            rval[offset:offset + len(xi)] = fn_i
            offset += len(xi)
        return rval

    return larray.lmap(fn_1, seq, f_map=f_map)
def normalized_image_features(self, images, xmean, xstd, avg_nrm,
        n_rows_to_estimate_stats=1000,
        flatten=True,
        batched_lmap_speed_thresh=None,
        ):
    """
    Return a lazy array of flattened, normalized pipeline features.

    images - images handed to `get_image_features_lmap`

    xmean, xstd, avg_nrm - normalization statistics.  When `xmean` is
        None all three are estimated from the first
        `n_rows_to_estimate_stats` feature vectors, governed by the
        pipeline entries 'remove_std0', 'varthresh' and 'divrowl2';
        otherwise the given values are used as they are.

    n_rows_to_estimate_stats - int
        number of leading feature vectors used to estimate the statistics

    flatten - bool
        return features flattened to vectors (only True is implemented)

    batched_lmap_speed_thresh - forwarded to `get_image_features_lmap`

    Returns (normed_features, xmean, xstd, avg_nrm).
    Raises NotImplementedError when `flatten` is false.
    """
    if not flatten:
        raise NotImplementedError('only flatten is implemented')

    pipeline = self.pipeline
    # Forward the speed threshold: it used to be accepted by this method
    # but silently dropped, so the slowness check never ran from here.
    features_lmap = self.get_image_features_lmap(
            images,
            batched_lmap_speed_thresh=batched_lmap_speed_thresh)

    n_features = np.prod(features_lmap.shape[1:])

    if xmean is None:
        # -- load enough training data into memory to estimate stats
        cache_train = flatten_elems(
                features_lmap[:n_rows_to_estimate_stats])

        xmean, xstd = mean_and_std(
                cache_train,
                remove_std0=pipeline['remove_std0'])

        # regularize the standard deviation with the variance threshold
        xstd = np.sqrt(xstd ** 2 + pipeline['varthresh'])

        if pipeline['divrowl2']:
            # small constant keeps the later division away from zero
            avg_nrm = 1e-7 + average_row_l2norm(
                    (cache_train - xmean) / xstd)
        else:
            avg_nrm = 1

    def normalize(x):
        # single feature array -> normalized 1-d vector
        return (x.flatten() - xmean) / (xstd * avg_nrm)

    def normalize_many(x):
        # batch version: one flattened row per element
        return (x.reshape((len(x), -1)) - xmean) / (xstd * avg_nrm)

    normed_features = lmap(
            lmap_info(
                shape=(n_features,),
                dtype=features_lmap.dtype)(normalize),
            features_lmap,
            ragged=False,
            f_map=normalize_many)

    return normed_features, xmean, xstd, avg_nrm
def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None, dp_params=None, test=False, read_mode='r', cache_type='memmap'):
    # Build a data provider on top of a dataset named in `dp_params`:
    #   1. import and instantiate the dataset,
    #   2. resolve the permutation and optional subslice,
    #   3. wrap the (reordered) images in an hdf5 or memmap cache,
    #   4. load the batches meta file, or create it (including the running
    #      image mean) when it does not exist yet,
    #   5. delegate to LabeledDataProvider.__init__.
    #
    # dp_params keys read here: 'dataset_name' (module name, attribute
    # name), 'dataset_data' (optional), 'perm_type' (optional), 'subslice'
    # (optional: method name, kwargs), 'batch_size',
    # 'num_batches_for_mean', 'preproc'.
    # read_mode is only used by the hdf5 cache.
    # NOTE(review): the nesting below was reconstructed from flattened
    # source -- confirm block boundaries against the original file.

    # -- load dataset and meta
    modulename, attrname = dp_params['dataset_name']
    module = importlib.import_module(modulename)
    self.dp_params = dp_params
    print('DP_PARAMS', dp_params)
    dataset_obj = getattr(module, attrname)
    print(module, attrname)
    dataset_data = dp_params.get('dataset_data', None)
    if dataset_data is not None:
        dset = dataset_obj(data=dataset_data)
    else:
        dset = dataset_obj()
    self.dset = dset

    # NOTE(review): perm_type is read but not used in this method
    perm_type = dp_params.get('perm_type')
    perm, perm_id = self.get_perm()
    self.perm = perm
    self.perm_id = perm_id

    if 'subslice' in dp_params:
        # self.subslice first holds the raw (method, kwargs) pair and is
        # overwritten with index arrays just below
        subslice_method, subslice_kwargs = self.subslice = dp_params['subslice']
        subslice = getattr(self.dset, subslice_method)(**subslice_kwargs).nonzero()[0]
        if perm is not None:
            # positions within the permutation that fall inside the subslice
            self.subslice = fast.isin(perm, subslice).nonzero()[0]
        else:
            self.subslice = subslice

    metacol = self.metacol = self.get_metacol()
    if hasattr(metacol, 'keys'):
        # dict-like metacol: length of its first value
        # (indexing .values() requires Python 2, where it returns a list)
        mlen = len(metacol.values()[0])
    else:
        mlen = len(metacol)

    # -- compute number of batches
    batch_size = self.batch_size = dp_params['batch_size']
    num_batches = self.num_batches = int(math.ceil(mlen / float(batch_size)))
    num_batches_for_meta = self.num_batches_for_meta = dp_params['num_batches_for_mean']

    # -- locate the cache directory and base name for the stimuli
    images = dset.get_images(preproc=dp_params['preproc'])
    if hasattr(images, 'dirname'):
        base_dir, orig_name = os.path.split(images.dirname)
    else:
        base_dir = dset.home('cache')
        orig_name = 'images_cache_' + get_id(dp_params['preproc'])

    # -- lazily reorder the images by the permutation and cache on disk
    reorder = Reorder(images)
    lmap = larray.lmap(reorder, self.perm, f_map=reorder)
    # NOTE(review): any other cache_type leaves self.stimarray unset
    if cache_type == 'hdf5':
        new_name = orig_name + '_' + self.perm_id + '_hdf5'
        print('Getting stimuli from cache hdf5 at %s/%s ' % (base_dir, new_name))
        self.stimarray = larray.cache_hdf5(lmap, name=new_name, basedir=base_dir, mode=read_mode)
    elif cache_type == 'memmap':
        new_name = orig_name + '_' + self.perm_id + '_memmap'
        print('Getting stimuli from cache memmap at %s/%s ' % (base_dir, new_name))
        self.stimarray = larray.cache_memmap(lmap, name=new_name, basedir=base_dir)

    # -- default data location: derived from the preproc spec and batch size
    if data_dir == '':
        pstring = hashlib.sha1(repr(dp_params['preproc'])).hexdigest() + '_%d' % dp_params['batch_size']
        data_dir = dset.home('convnet_batches', pstring)
    if not os.path.exists(data_dir):
        print('data_dir %s does not exist, creating' % data_dir)
        os.makedirs(data_dir)

    # -- the meta file name depends on the subslice, when there is one
    # NOTE(review): the local `subslice` is only bound when 'subslice' is in
    # dp_params; this relies on hasattr(self, 'subslice') implying that
    if hasattr(self, 'subslice'):
        hashval = get_id(tuple(subslice.tolist()))
        metafile = os.path.join(data_dir, 'batches_%s.meta' % hashval)
    else:
        metafile = os.path.join(data_dir, 'batches.meta')
    self.metafile = metafile

    if os.path.exists(metafile):
        print('Meta file at %s exists, loading' % metafile)
        # NOTE(review): opened in the default text mode and never closed;
        # the load further down uses 'rb'
        bmeta = cPickle.load(open(metafile))
        # assertions checking that the things that need to be the same
        # for these batches to make sense are in fact the same
        assert dp_params['batch_size'] == bmeta['num_cases_per_batch'], (dp_params['batch_size'], bmeta['num_cases_per_batch'])
        if 'subslice' in bmeta or 'subslice' in dp_params:
            assert dp_params['subslice'] == bmeta['subslice']
        if 'dataset_name' in bmeta:
            assert dp_params['dataset_name'] == bmeta['dataset_name'], (dp_params['dataset_name'], bmeta['dataset_name'])
        if 'preproc' in bmeta:
            assert dp_params['preproc'] == bmeta['preproc'], (dp_params['preproc'], bmeta['preproc'])
        if 'dataset_data' in bmeta:
            assert dataset_data == bmeta['dataset_data'], (dataset_data, bmeta['dataset_data'])
    else:
        print('Making batches.meta at %s ...' % metafile)
        imgs_mean = None
        isf = 0
        for bn in range(num_batches_for_meta):
            print('Meta batch %d' % bn)
            # get stimuli and put in the required format
            stims = self.get_stims(bn, batch_size)
            print('Got stims', stims.shape, stims.nbytes)
            if 'float' in repr(stims.dtype):
                # float stimuli are rescaled by 255 and stored as uint8
                stims = n.uint8(n.round(255 * stims))
                print('Converted to uint8', stims.nbytes)
            d = dldata_to_convnet_reformatting(stims, None)
            # add to the mean: running average weighted by isf vs. dlen
            if imgs_mean is None:
                imgs_mean = n.zeros((d['data'].shape[0],))
            # NOTE(review): dlen is the same axis that is stored as
            # 'num_vis' below, not the axis averaged over -- confirm this
            # weight is intended
            dlen = d['data'].shape[0]
            fr = isf / (isf + float(dlen))
            imgs_mean *= fr
            imgs_mean += (1 - fr) * d['data'].mean(axis=1)
            isf += dlen
        # write out batches.meta
        # NOTE(review): `d` is only bound if the loop above ran at least once
        outdict = {'num_cases_per_batch': batch_size, 'label_names': self.labels_unique, 'num_vis': d['data'].shape[0], 'data_mean': imgs_mean, 'dataset_name': dp_params['dataset_name'], 'dataset_data': dataset_data, 'preproc': dp_params['preproc']}
        if dp_params.has_key('subslice'):
            outdict['subslice'] = dp_params['subslice']
        with open(metafile, 'wb') as _f:
            cPickle.dump(outdict, _f)

    # -- (re)load the meta file, whether it pre-existed or was just written
    self.batch_meta = cPickle.load(open(metafile, 'rb'))
    LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch, init_batchnum, dp_params, test)
def pyll_theano_batched_lmap(
        pipeline,
        seq,
        batchsize,
        _debug_call_counts=None,
        print_progress_every=float('inf'),
        abort_on_rows_larger_than=None,
        speed_thresh=None,
        x_dtype='float32',
):
    """
    This function returns a skdata.larray.lmap object whose function
    is defined by a theano expression.

    The theano expression will be built and compiled specifically for the
    dimensions of the given `seq`. Therefore, in_rows, and out_rows should
    actually be a *pyll* graph, that evaluates to a theano graph.

    pipeline: callable; ``pipeline((s_X, in_shp))`` must give a pyll graph
        that ``pyll.rec_eval`` turns into ``(theano output, output shape)``
    seq: input sequence; ``seq.shape[1:]`` is the shape of one element
    batchsize: fixed number of rows the compiled function computes on
    _debug_call_counts: optional dict with keys 'fn_1' and 'f_map' whose
        values are incremented on every call of the respective function
    print_progress_every: minimum number of seconds between progress logs
    abort_on_rows_larger_than: when truthy, upper bound on the number of
        values per output row
    speed_thresh: optional dict with keys 'elements' and 'seconds'; once
        at least 'elements' rows have been timed, EvalTimeout is raised if
        the observed rows/second is below elements/seconds, otherwise the
        counters are reset
    x_dtype: dtype of the shared input batch buffer

    Raises ValueError when an output row is too long or (on the chunked
    path of the batch function) when features are not finite.
    """
    in_shp = (batchsize, ) + seq.shape[1:]
    batch = np.zeros(in_shp, dtype=x_dtype)
    s_ibatch = theano.shared(batch)
    s_xi = theano.tensor.as_tensor_variable(s_ibatch).type()
    s_N = s_xi.shape[0]
    # write the (possibly shorter) input into the head of the fixed buffer
    s_X = theano.tensor.set_subtensor(s_ibatch[:s_N], s_xi)
    thing = pipeline((s_X, in_shp))
    s_obatch, oshp = pyll.rec_eval(thing)
    assert oshp[0] == batchsize
    logger.info('batched_lmap oshp %s' % str(oshp))
    if abort_on_rows_larger_than:
        rowlen = np.prod(oshp[1:])
        if rowlen > abort_on_rows_larger_than:
            raise ValueError('rowlen %i exceeds limit %i' %
                             (rowlen, abort_on_rows_larger_than))

    # Compile a function that takes a variable number of elements in,
    # returns the same number of processed elements out,
    # but does all internal computations using a fixed number of elements,
    # because convolutions are fastest when they're hard-coded to a certain
    # size.
    logger.debug('pyll_theano_batched_lmap compiling fn')
    _fn = theano.function(
        [theano.Param(s_xi, strict=True)],
        s_obatch[:s_N],
        updates={
            s_ibatch: s_X,  # this allows the inc_subtensor to be in-place
        })
    logger.debug('pyll_theano_batched_lmap compiling fn -> done')

    # running totals used by the speed check
    sums = {'elems': 0, 'times': 0.0}
    if speed_thresh is None:
        time_fn = _fn
    else:

        def time_fn(X):
            t0 = time.time()
            if str(X.dtype) != x_dtype:
                print('time_fn dtype problem %s %s' % (X.dtype, x_dtype))
            rval = _fn(X)
            dt = time.time() - t0
            sums['elems'] += len(X)
            sums['times'] += dt
            return rval

        def raise_if_slow():
            if sums['elems'] < speed_thresh['elements']:
                return  # not enough rows timed yet
            # float() forces true division: with two ints Python 2 would
            # floor the required rate and make the check too lenient.
            required_ratio = (float(speed_thresh['elements']) /
                              speed_thresh['seconds'])
            elapsed = sums['times']
            # a zero elapsed time (coarse clock) counts as fast enough
            # instead of dividing by zero
            if elapsed > 0 and sums['elems'] / elapsed < required_ratio:
                raise EvalTimeout(
                    'batched_lmap failed to compute %i elements in %f secs'
                    % (speed_thresh['elements'], speed_thresh['seconds']))
            sums['elems'] = 0
            sums['times'] = 0.0

    def fn_1(x):
        if _debug_call_counts:
            _debug_call_counts['fn_1'] += 1
        # add a leading batch axis, compute, and take the single row back
        return time_fn(x[None, :, :, :])[0]

    attrs = {
        'shape': oshp[1:],
        'ndim': len(oshp) - 1,
        'dtype': s_obatch.dtype
    }

    def rval_getattr(attr, objs):
        # -- objs don't matter to the structure of the return value
        try:
            return attrs[attr]
        except KeyError:
            raise AttributeError(attr)

    fn_1.rval_getattr = rval_getattr

    # one-element list so the nested function can update the timestamp
    last_print_time = [time.time()]

    def check_for_print(offset, X):
        curtime = time.time()
        if (curtime - last_print_time[0]) > print_progress_every:
            logger.info('pyll_theano_batched_lmap.f_map %i %i' %
                        (offset, len(X)))
            last_print_time[0] = curtime

        if speed_thresh is not None:
            raise_if_slow()

    def f_map(X):
        if _debug_call_counts:
            _debug_call_counts['f_map'] += 1

        # fast path: exactly one full batch
        # NOTE(review): this path skips the finiteness check done below
        if len(X) == batchsize:
            check_for_print(offset=0, X=X)
            return time_fn(X)

        rval = np.empty((len(X), ) + oshp[1:], dtype=s_obatch.dtype)
        offset = 0
        while offset < len(X):
            check_for_print(offset, X)
            xi = X[offset:offset + batchsize]
            fn_i = time_fn(xi)
            if not np.all(np.isfinite(fn_i)):
                raise ValueError('non-finite features')
            rval[offset:offset + len(xi)] = fn_i
            offset += len(xi)
        return rval

    return larray.lmap(fn_1, seq, f_map=f_map)
def normalized_image_features(
        self,
        images,
        xmean,
        xstd,
        avg_nrm,
        n_rows_to_estimate_stats=1000,
        flatten=True,
        batched_lmap_speed_thresh=None,
):
    """
    Return a lazy array of flattened, normalized pipeline features.

    images - images handed to `get_image_features_lmap`

    xmean, xstd, avg_nrm - normalization statistics.  When `xmean` is
        None all three are estimated from the first
        `n_rows_to_estimate_stats` feature vectors, governed by the
        pipeline entries 'remove_std0', 'varthresh' and 'divrowl2';
        otherwise the given values are used as they are.

    n_rows_to_estimate_stats - int
        number of leading feature vectors used to estimate the statistics

    flatten - bool
        return features flattened to vectors (only True is implemented)

    batched_lmap_speed_thresh - forwarded to `get_image_features_lmap`

    Returns (normed_features, xmean, xstd, avg_nrm).
    Raises NotImplementedError when `flatten` is false.
    """
    if not flatten:
        raise NotImplementedError('only flatten is implemented')

    pipeline = self.pipeline
    # Forward the speed threshold: it used to be accepted by this method
    # but silently dropped, so the slowness check never ran from here.
    features_lmap = self.get_image_features_lmap(
        images, batched_lmap_speed_thresh=batched_lmap_speed_thresh)

    n_features = np.prod(features_lmap.shape[1:])

    if xmean is None:
        # -- load enough training data into memory to estimate stats
        cache_train = flatten_elems(
            features_lmap[:n_rows_to_estimate_stats])

        xmean, xstd = mean_and_std(cache_train,
                                   remove_std0=pipeline['remove_std0'])

        # regularize the standard deviation with the variance threshold
        xstd = np.sqrt(xstd**2 + pipeline['varthresh'])

        if pipeline['divrowl2']:
            # small constant keeps the later division away from zero
            avg_nrm = 1e-7 + average_row_l2norm(
                (cache_train - xmean) / xstd)
        else:
            avg_nrm = 1

    def normalize(x):
        # single feature array -> normalized 1-d vector
        return (x.flatten() - xmean) / (xstd * avg_nrm)

    def normalize_many(x):
        # batch version: one flattened row per element
        return (x.reshape((len(x), -1)) - xmean) / (xstd * avg_nrm)

    annotated_normalize = lmap_info(
        shape=(n_features, ),
        dtype=features_lmap.dtype)(normalize)
    normed_features = lmap(annotated_normalize,
                           features_lmap,
                           ragged=False,
                           f_map=normalize_many)

    return normed_features, xmean, xstd, avg_nrm