def get_image_features(self, task, batched_lmap_speed_thresh=None):
    if batched_lmap_speed_thresh is None:
        batched_lmap_speed_thresh = self.batched_lmap_speed_thresh
    images = task.images
    try:
        rval, _images, cdict = self.image_features[images]
        # -- currently it is true that all tasks should be indexing into
        # -- the same set of images. Later, when this is not the case,
        # -- delete this over-strict check.
        assert _images is images
    except KeyError:
        feature_lmap = self.get_image_features_lmap(
            task.images, batched_lmap_speed_thresh)
        rval = cache_memmap(
            feature_lmap,
            self.memmap_name + '_image_features_' + task.name,
            del_atexit=self.memmap_del_atexit)
        foobar.append_ndarray_signature(
            rval[0], 'get_image_features features 0', task.name)
        foobar.append_ndarray_signature(
            rval[100], 'get_image_features features 100', task.name)
        cdict = {}
        self.image_features[images] = rval, images, cdict
    return rval, cdict
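# -- Hedged call sketch for the cached feature lookup above. `pipeline` and
# -- `task` are placeholder names for whatever object owns this method and
# -- the task object the caller holds; neither name comes from the source.
feats, cdict = pipeline.get_image_features(task)
feats2, cdict2 = pipeline.get_image_features(task)  # second call hits the cache
assert feats is feats2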
def slm_memmap(desc, X, name, basedir=None):
    """
    Return a cache_memmap object representing the features of the entire
    set of images.
    """
    if basedir is None:
        basedir = os.getcwd()
    feat_fn = SLMFunction(desc, X.shape[1:])
    feat = larray.lmap(feat_fn, X)
    rval = larray.cache_memmap(feat, name, basedir=basedir)
    return rval
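# -- Hedged usage sketch for slm_memmap. `desc` (an SLM pipeline description)
# -- and `images` stand in for objects built elsewhere in the codebase, and
# -- the cache name is illustrative.
feats = slm_memmap(desc, images, name='slm_feats_v1')
batch = feats[:16]  # computes the first 16 feature rows and caches them to disk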
def pairs_memmap(pair_labels, X, comparison_name, name, basedir=None):
    """
    pair_labels - a (lidxs, ridxs, matches) triple, like what comes out of
        verification_pairs
    X - feature vectors to be combined
    comparison_name - name of the comparison function that combines
        X[i] and X[j] into a 1-D feature vector
    """
    if basedir is None:
        basedir = os.getcwd()
    lidxs, ridxs, matches = pair_labels
    pf = larray.lmap(
        PairFeaturesFn(X, comparison_name),
        lidxs, ridxs)
    pf_cache = larray.cache_memmap(pf, name, basedir=basedir)
    return pf_cache, np.asarray(matches)
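# -- Hedged usage sketch for pairs_memmap; verification_pairs appears
# -- elsewhere in this codebase, but the fold name, comparison name, and
# -- feature matrix X are assumptions.
pair_labels = verification_pairs('fold_0')
pf_cache, matches = pairs_memmap(pair_labels, X,
                                 comparison_name='mult',
                                 name='pairs_fold_0')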
def get_stimarray(marray, mname, perm, perm_id, cache_type, base_dir,
                  read_mode='r'):
    reorder = Reorder2(marray)
    lmap = larray.lmap(reorder, perm, f_map=reorder)
    if cache_type == 'hdf5':
        new_name = mname + '_' + perm_id + '_hdf5'
        print('Getting stimuli from cache hdf5 at %s/%s' % (base_dir, new_name))
        return larray.cache_hdf5(lmap, name=new_name, basedir=base_dir,
                                 mode=read_mode)
    elif cache_type == 'memmap':
        new_name = mname + '_' + perm_id + '_memmap'
        print('Getting stimuli from cache memmap at %s/%s' % (base_dir, new_name))
        return larray.cache_memmap(lmap, name=new_name, basedir=base_dir)
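# -- Hedged usage sketch: `imgs` is a placeholder array-like of stimuli and
# -- `perm` a permutation of its indices. With cache_type='memmap' the
# -- reordered stimuli are cached under base_dir; 'hdf5' would use HDF5 instead.
perm = np.random.permutation(len(imgs))
stims = get_stimarray(imgs, 'images_cache', perm, 'shuffle0',
                      cache_type='memmap', base_dir='/tmp')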
def get_fg11_features(suffix, expected_shape):
    dataset = skdata.lfw.Aligned()
    paths, identities = dataset.raw_classification_task()

    def load_path(path):
        basename = os.path.basename(path)
        name = basename[:-9]  # -- cut off the digits and the .jpg
        # -- touch the jpg to make sure it's there
        new_path = os.path.join(feature_root, name, basename)
        feature_path = new_path + suffix
        print('loading %s' % feature_path)
        data = scipy.io.loadmat(feature_path)['data']
        assert data.shape == expected_shape
        return np.asarray(data, dtype='float32')

    # -- apply the lmap_info decorator manually here in nested scope
    load_path = larray.lmap_info(
        shape=expected_shape, dtype='float32')(load_path)

    rval = larray.lmap(load_path, paths)
    rval = larray.cache_memmap(rval, 'fcache_' + suffix, basedir=os.getcwd())
    return rval
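# -- Hedged usage sketch; the suffix and expected shape below are illustrative
# -- guesses at what an FG11 feature file might contain, not values taken
# -- from the source.
feats = get_fg11_features('.ht_l3.mat', expected_shape=(1, 256))
x0 = feats[0]  # loads one .mat file, checks its shape, caches it as float32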
def larray_cache_memmap(obj, name, basedir=None, msg=None):
    return larray.cache_memmap(obj, name, basedir=basedir, msg=msg)
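# -- A minimal, self-contained sketch of the lmap + cache_memmap pattern used
# -- throughout this file, assuming only numpy and the larray module are
# -- importable; `double_row`, the cache name, and basedir are illustrative,
# -- not from the source.
import numpy as np
import larray

def double_row(x):
    return np.asarray(2.0 * x, dtype='float32')

# -- declare per-element shape/dtype, as get_fg11_features does above
double_row = larray.lmap_info(shape=(3,), dtype='float32')(double_row)

X = np.arange(12, dtype='float32').reshape(4, 3)
lazy = larray.lmap(double_row, X)                    # no computation yet
cached = larray.cache_memmap(lazy, 'demo_cache', basedir='/tmp')
row0 = cached[0]        # computes row 0 and writes it to the memmap
row0_again = cached[0]  # now served back from the on-disk cache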
def __init__(self, data_dir, batch_range, init_epoch=1, init_batchnum=None,
             dp_params=None, test=False, read_mode='r', cache_type='memmap'):
    # -- load dataset and meta
    modulename, attrname = dp_params['dataset_name']
    module = importlib.import_module(modulename)
    self.dp_params = dp_params
    print('DP_PARAMS', dp_params)
    dataset_obj = getattr(module, attrname)
    print(module, attrname)
    dataset_data = dp_params.get('dataset_data', None)
    if dataset_data is not None:
        dset = dataset_obj(data=dataset_data)
    else:
        dset = dataset_obj()
    self.dset = dset
    perm_type = dp_params.get('perm_type')
    perm, perm_id = self.get_perm()
    self.perm = perm
    self.perm_id = perm_id
    if 'subslice' in dp_params:
        subslice_method, subslice_kwargs = self.subslice = dp_params['subslice']
        subslice = getattr(self.dset, subslice_method)(**subslice_kwargs).nonzero()[0]
        if perm is not None:
            self.subslice = fast.isin(perm, subslice).nonzero()[0]
        else:
            self.subslice = subslice
    metacol = self.metacol = self.get_metacol()
    if hasattr(metacol, 'keys'):
        mlen = len(list(metacol.values())[0])
    else:
        mlen = len(metacol)

    # -- compute number of batches
    batch_size = self.batch_size = dp_params['batch_size']
    num_batches = self.num_batches = int(math.ceil(mlen / float(batch_size)))
    num_batches_for_meta = self.num_batches_for_meta = \
        dp_params['num_batches_for_mean']

    images = dset.get_images(preproc=dp_params['preproc'])
    if hasattr(images, 'dirname'):
        base_dir, orig_name = os.path.split(images.dirname)
    else:
        base_dir = dset.home('cache')
        orig_name = 'images_cache_' + get_id(dp_params['preproc'])
    reorder = Reorder(images)
    lmap = larray.lmap(reorder, self.perm, f_map=reorder)
    if cache_type == 'hdf5':
        new_name = orig_name + '_' + self.perm_id + '_hdf5'
        print('Getting stimuli from cache hdf5 at %s/%s' % (base_dir, new_name))
        self.stimarray = larray.cache_hdf5(lmap, name=new_name,
                                           basedir=base_dir, mode=read_mode)
    elif cache_type == 'memmap':
        new_name = orig_name + '_' + self.perm_id + '_memmap'
        print('Getting stimuli from cache memmap at %s/%s' % (base_dir, new_name))
        self.stimarray = larray.cache_memmap(lmap, name=new_name,
                                             basedir=base_dir)

    # -- default data location
    if data_dir == '':
        pstring = hashlib.sha1(repr(dp_params['preproc'])).hexdigest() \
            + '_%d' % dp_params['batch_size']
        data_dir = dset.home('convnet_batches', pstring)
    if not os.path.exists(data_dir):
        print('data_dir %s does not exist, creating' % data_dir)
        os.makedirs(data_dir)
    if hasattr(self, 'subslice'):
        hashval = get_id(tuple(subslice.tolist()))
        metafile = os.path.join(data_dir, 'batches_%s.meta' % hashval)
    else:
        metafile = os.path.join(data_dir, 'batches.meta')
    self.metafile = metafile
    if os.path.exists(metafile):
        print('Meta file at %s exists, loading' % metafile)
        bmeta = cPickle.load(open(metafile, 'rb'))
        # -- assertions checking that the things that need to be the same
        # -- for these batches to make sense are in fact the same
        assert dp_params['batch_size'] == bmeta['num_cases_per_batch'], \
            (dp_params['batch_size'], bmeta['num_cases_per_batch'])
        if 'subslice' in bmeta or 'subslice' in dp_params:
            assert dp_params['subslice'] == bmeta['subslice']
        if 'dataset_name' in bmeta:
            assert dp_params['dataset_name'] == bmeta['dataset_name'], \
                (dp_params['dataset_name'], bmeta['dataset_name'])
        if 'preproc' in bmeta:
            assert dp_params['preproc'] == bmeta['preproc'], \
                (dp_params['preproc'], bmeta['preproc'])
        if 'dataset_data' in bmeta:
            assert dataset_data == bmeta['dataset_data'], \
                (dataset_data, bmeta['dataset_data'])
    else:
        print('Making batches.meta at %s ...' % metafile)
        imgs_mean = None
        isf = 0
        for bn in range(num_batches_for_meta):
            print('Meta batch %d' % bn)
            # -- get stimuli and put them in the required format
            stims = self.get_stims(bn, batch_size)
            print('Got stims', stims.shape, stims.nbytes)
            if 'float' in repr(stims.dtype):
                stims = n.uint8(n.round(255 * stims))
                print('Converted to uint8', stims.nbytes)
            d = dldata_to_convnet_reformatting(stims, None)
            # -- add to the running mean
            if imgs_mean is None:
                imgs_mean = n.zeros((d['data'].shape[0],))
            dlen = d['data'].shape[0]
            fr = isf / (isf + float(dlen))
            imgs_mean *= fr
            imgs_mean += (1 - fr) * d['data'].mean(axis=1)
            isf += dlen

        # -- write out batches.meta
        outdict = {'num_cases_per_batch': batch_size,
                   'label_names': self.labels_unique,
                   'num_vis': d['data'].shape[0],
                   'data_mean': imgs_mean,
                   'dataset_name': dp_params['dataset_name'],
                   'dataset_data': dataset_data,
                   'preproc': dp_params['preproc']}
        if 'subslice' in dp_params:
            outdict['subslice'] = dp_params['subslice']
        with open(metafile, 'wb') as _f:
            cPickle.dump(outdict, _f)

    self.batch_meta = cPickle.load(open(metafile, 'rb'))

    LabeledDataProvider.__init__(self, data_dir, batch_range, init_epoch,
                                 init_batchnum, dp_params, test)
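# -- Hedged sketch of the dp_params dict this provider reads, inferred from
# -- the keys used above; the module path, class name, and values are
# -- placeholders, not from the source.
dp_params = {
    'dataset_name': ('my_datasets.module', 'MyDataset'),  # (module, attribute)
    'batch_size': 128,
    'num_batches_for_mean': 20,
    'preproc': {'resize_to': (128, 128)},
    # optional keys also read above: 'dataset_data', 'perm_type', 'subslice'
}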
def train_indexed_image_classification(self, train, valid=None):
    if valid is None:
        train_name = train.name
        valid_name = 'None'
    else:
        train_name = train.name
        valid_name = valid.name
        assert train.all_images is valid.all_images
        assert train.all_labels is valid.all_labels
    info('train_indexed_image_classification: %s/%s'
         % (train_name, valid_name))
    normed_features, xmean, xstd, avg_nrm = \
        self.normalized_image_features(
            train.all_images, None, None, None, flatten=True)
    assert train.name is not None
    if hasattr(self, 'cmemmap'):
        assert train.all_images is self.cmemmap_all_images
    else:
        self.cmemmap_all_images = train.all_images
        self.cmemmap = cache_memmap(
            normed_features,
            self.memmap_name,
            del_atexit=True)
    if not hasattr(self, 'history'):
        self.load_ensemble_history(fields=[])
    svm = self.load_svm(
        train_name, valid_name,
        self.cmemmap.shape[1],
        train.n_classes,
        self.pipeline['l2_reg'])
    svm.feature_xmean = xmean
    svm.feature_xstd = xstd
    svm.feature_avg_nrm = avg_nrm
    svm.train_name = train_name
    svm.valid_name = valid_name
    prev_xw_trn = self.load_prev_xw(
        train_name, train_name, valid_name,
        use_history='using_history')
    info('train_indexed_image_classification: Fitting SVM with prev_xw')
    svm.fit(self.cmemmap[train.idxs],
            train.all_labels[train.idxs],
            prev_xw_trn)
    info('-> loaded alpha %s' % str(svm.alpha))
    info('-> loaded prvl2 %s' % str(svm.prev_l2_regularization))
    info('-> loaded prvw2 %s' % str(svm.prev_w_l2_sqr))
    if valid is None:
        # -- XXX: it is currently a hack to use the existence of the
        #    validation set to decide when to compute an SVM without the
        #    history features... it currently so happens that for the
        #    fit/val split we have a validation set and we want to train
        #    both ways, and for the sel/test split we do not have a
        #    validation set and we only want the fit-with-history training.
        assert train.name == 'sel'
        svm0 = None
    else:
        svm0 = copy.deepcopy(svm)
        if (prev_xw_trn is not None) and prev_xw_trn.size:
            info('Fitting SVM without prev_xw')
            svm0.fit(self.cmemmap[train.idxs],
                     train.all_labels[train.idxs],
                     np.zeros_like(prev_xw_trn))
    self.add_results(
        ['train_indexed_image_classification', train_name, valid_name],
        {
            'train_name': train_name,
            'valid used': (valid is not None),
            'valid_name': valid_name,
        },
        {
            'model0': svm0,
            'model': svm,
        })
    self.loss_indexed_image_classification(svm, train)
    if valid is not None:
        self.loss_indexed_image_classification(svm, valid)
        self.loss_indexed_image_classification(
            svm0, valid, use_history='not_using_history')
    return svm
def train_view2(namebases, basedirs, test=None, use_libsvm=False,
                trace_normalize=False, model_kwargs=None):
    """
    To use precomputed kernels with libsvm, pass
        use_libsvm={'kernel': 'precomputed'}
    Otherwise, use_libsvm=True will use the 'linear' kernel.
    """
    pair_features = [[larray.cache_memmap(None,
                                          name=view2_filename(nb, snum),
                                          basedir=bdir)
                      for snum in range(10)]
                     for nb, bdir in zip(namebases, basedirs)]
    split_data = [verification_pairs('fold_%d' % split_num, test=test)
                  for split_num in range(10)]
    train_errs = []
    test_errs = []
    if model_kwargs is None:
        model_kwargs = {}
    for ind in range(10):
        train_inds = [_ind for _ind in range(10) if _ind != ind]
        print('Constructing stuff for split %d ...' % ind)
        test_X = [pf[ind][:] for pf in pair_features]
        test_y = split_data[ind][2]
        train_X = [np.vstack([pf[_ind][:] for _ind in train_inds])
                   for pf in pair_features]
        train_y = np.concatenate([split_data[_ind][2]
                                  for _ind in train_inds])
        train_decisions = np.zeros(len(train_y))
        test_decisions = np.zeros(len(test_y))
        normalized = [dan_normalize((t0, t1),
                                    trace_normalize=trace_normalize,
                                    data=None)
                      for t0, t1 in zip(train_X, test_X)]
        train_X = np.hstack([n[0] for n in normalized])
        test_X = np.hstack([n[1] for n in normalized])
        train_Xyd_n = (train_X, train_y, train_decisions)
        test_Xyd_n = (test_X, test_y, test_decisions)
        print('Training split %d ...' % ind)
        if use_libsvm:
            if hasattr(use_libsvm, 'keys'):
                kernel = use_libsvm.get('kernel', 'linear')
            else:
                kernel = 'linear'
            if kernel == 'precomputed':
                (_Xtrain, _ytrain, _dtrain) = train_Xyd_n
                print('Computing training kernel ...')
                Ktrain = np.dot(_Xtrain, _Xtrain.T)
                print('... computed training kernel of shape', Ktrain.shape)
                train_Xyd_n = (Ktrain, _ytrain, _dtrain)
                print('Computing test/train kernel ...')
                (_Xtest, _ytest, _dtest) = test_Xyd_n
                Ktest = np.dot(_Xtest, _Xtrain.T)
                print('... computed test/train kernel of shape', Ktest.shape)
                test_Xyd_n = (Ktest, _ytest, _dtest)
            model_kwargs['kernel'] = kernel
            svm, _ = train_scikits(train_Xyd_n,
                                   labelset=[-1, 1],
                                   model_type='svm.SVC',
                                   model_kwargs=model_kwargs,
                                   normalization=False)
        else:
            svm = toyproblem.train_svm(train_Xyd_n,
                                       l2_regularization=1e-3,
                                       max_observations=20000)
        train_predictions = svm.predict(train_Xyd_n[0])
        test_predictions = svm.predict(test_Xyd_n[0])
        train_err = (train_predictions != train_y).mean()
        test_err = (test_predictions != test_y).mean()
        print('split %d train err %f' % (ind, train_err))
        print('split %d test err %f' % (ind, test_err))
        train_errs.append(train_err)
        test_errs.append(test_err)
    train_err_mean = np.mean(train_errs)
    print('train err mean', train_err_mean)
    test_err_mean = np.mean(test_errs)
    print('test err mean', test_err_mean)
    return train_err_mean, test_err_mean
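# -- Hedged call sketch for train_view2 in the precomputed-kernel mode that
# -- its docstring documents; the namebases and basedirs values are
# -- placeholders for whatever cached pair-feature memmaps exist on disk.
trn_err, tst_err = train_view2(namebases=['fg11_top'],
                               basedirs=[os.getcwd()],
                               use_libsvm={'kernel': 'precomputed'})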
def test_memmap_cache(self):
    self.battery(lambda obj: larray.cache_memmap(obj, 'name_foo'))