def report(self, processed, total):
    if processed % self.step != 0:
        return
    elapsed = self.timer.total()
    if processed == 0:
        eta = 0.0
    else:
        eta = elapsed * (total - processed) / processed
    if self.rootOnly:
        mpi.rootprint('{} {}/{}, elapsed {}, eta {}.'.format(
            self.header, processed, total, hms(elapsed), hms(eta)))
    else:
        mpi.nodeprint('{} {}/{}, elapsed {}, eta {}.'.format(
            self.header, processed, total, hms(elapsed), hms(eta)))
def normalize_data(self, m=None, std=None, sabotage=False):
    if self.normalized:
        mpi.rootprint('Warning: you are re-normalizing.')
    if m is None or std is None:
        # if either is None, we recompute the statistics from the local data
        for i in range(self.nCodeLocal):
            for j in range(self.nMetabins):
                self.compute_feature(i, j, normalize=False)
                self.mLocal[i, j] = np.mean(self.featBuffer)
                self.stdLocal[i, j] = np.std(self.featBuffer) + 1e-8
                if sabotage:
                    # sabotage: force mean 0 and std 1 so that normalization is a no-op
                    self.mLocal[i, j] *= 0.
                    self.stdLocal[i, j] *= 0.
                    self.stdLocal[i, j] += 1.
    else:
        self.mLocal[:] = m
        self.stdLocal[:] = std
    self.normalized = True
rank = MPI.COMM_WORLD.Get_rank()

parser = argparse.ArgumentParser(description="Script to test cifar with existing trained dump.",
                                 epilog="Yangqing Jia at NECLA, 2011")
parser.add_argument('-r', '--data_root', default='.',
                    help='the dataset path')
parser.add_argument('-n', '--nBinsPerEdge', type=int, default=0,
                    help='the number of bins per edge')
parser.add_argument('-d', '--nCodes', type=int, default=0,
                    help='the number of codes')
parser.add_argument('-b', '--batch_size', type=int, default=1000,
                    help='the batch size in which the data is stored')
parser.add_argument('-m', '--maxSelFeat', type=int, default=6400,
                    help='max number of selected features')
parser.add_argument('-g', '--gamma', type=float, default=0.01,
                    help='regularization term for classification')
parser.add_argument('-e', '--local_cache_root', default=None,
                    help='local cache root')
parser.add_argument('-l', '--read_local_cache', type=int, default=0,
                    help='whether to read local cache or not')
parser.add_argument('-c', '--nClass', type=int, default=10,
                    help='number of classes')
parser.add_argument('-t', '--random_iterations', type=int, default=1,
                    help='number of random iterations')
parser.add_argument('-s', '--skip_normalization', default=False, action='store_true')

mpi.rootprint(str(sys.argv))
args = parser.parse_args(sys.argv[1:])

# cifar specifications
data_file = 'cifar_tr_{}_{}.mat'
label_file = 'tr_label.mat'
test_data_file = 'cifar_te_{}_{}.mat'
test_label_file = 'te_label.mat'
nTraining = 50000
nTesting = 10000

grafter = grafting_mb.GrafterMPI()
tester = grafting_mb.GrafterMPI()
grafter.init_specs(nTraining, args.nBinsPerEdge, args.nCodes, args.nClass,
                   args.maxSelFeat, args.gamma, np.float64)
tester.init_specs(nTesting, args.nBinsPerEdge, args.nCodes, args.nClass,
                  args.maxSelFeat, args.gamma, np.float64)
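# The rest of the driver is not shown in this excerpt. Below is a minimal sketch
# (an assumption, not the original script) of how the pieces set up above could be
# wired together, using the GrafterMPI method signatures shown later in this section
# (load_data_batch, randomselecttest, graft):
#
#   grafter.load_data_batch(args.data_root, args.batch_size, data_file, label_file,
#                           rootRead=True, isTest=False,
#                           local_cache_root=args.local_cache_root,
#                           read_local_cache=bool(args.read_local_cache),
#                           should_normalize=not args.skip_normalization)
#   tester.load_data_batch(args.data_root, args.batch_size, test_data_file, test_label_file,
#                          rootRead=True, isTest=True,
#                          local_cache_root=args.local_cache_root,
#                          read_local_cache=bool(args.read_local_cache))
#   # either run the random-selection baseline ...
#   grafter.randomselecttest(tester=tester, random_iterations=args.random_iterations,
#                            should_normalize=not args.skip_normalization)
#   # ... or grafting-based feature selection:
#   # grafter.graft(tester=tester, test_every=10, samplePerRun=1)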
def graft(self, dump_every=0, dump_file=None, nActiveSet=None, tester=None,
          test_every=10, samplePerRun=1, fromDumpFile=None):
    '''
    the main grafting algorithm

    ==Parameters==
    dump_every: the frequency with which to dump the current result.
        Pass 0 if you do not want to dump.
    dump_file: dump file name.
    nActiveSet: when retraining, the number of features in the active set.
        Pass None for full retraining (may be slow!);
        pass a positive number to select the last (most recently added) features;
        pass a negative number to select features via their gradient values
        (recommended, much better than the other approaches);
        pass 0 for boosting.
    tester: the GrafterMPI instance that hosts the test data.
    test_every: the frequency with which to compute test accuracy.
    samplePerRun: the proportion of features to sample from in each feature
        selection run. Pass 1 to enumerate all features.
    fromDumpFile: restore from a dump file (not implemented for the mb version yet).
    '''
    self.comm.barrier()
    mpi.rootprint('*'*38)
    mpi.rootprint('*'*15+'grafting'+'*'*15)
    mpi.rootprint('*'*38)
    mpi.rootprint('Number of data: {}'.format(self.nData))
    mpi.rootprint('Number of labels: {}'.format(self.nLabel))
    mpi.rootprint('Number of codes: {}'.format(self.nCodes))
    mpi.rootprint('Bins: {0}x{0}'.format(self.nBinsPerEdge))
    mpi.rootprint('Total pooling areas: {}'.format(self.nMetabins))
    mpi.rootprint('Total features: {}'.format(self.nMetabins*self.nCodes))
    mpi.rootprint('Number of features to select: {}'.format(self.maxGraftDim))
    mpi.rootprint('Graft Settings:')
    mpi.rootprint('dump_every = {}\nnActiveSet={}\ntest_every={}\nsamplePerRun={}'.format(
        dump_every, nActiveSet, test_every, samplePerRun))
    self.comm.barrier()
    if tester is not None:
        # normalize the test data with the statistics of the training data
        tester.normalize_data(self.mLocal, self.stdLocal)
    if fromDumpFile is not None:
        self.restore_from_dump_file(fromDumpFile, tester)
    old_loss = 1e10
    timer = Timer()
    itertimer = Timer()
    for T in range(self.nSelFeats, self.maxGraftDim):
        itertimer.reset()
        mpi.rootprint('*'*15+'Round {}'.format(T)+'*'*15)
        score, codeid, metabinid = self.select_new_feature_by_grad(samplePerRun)
        mpi.rootprint('Selected Feature [code: {}, metabin: {}], score {}'.format(codeid, metabinid, score))
        # add this feature to the selected features
        self.append_feature(codeid, metabinid)
        mpi.rootprint('Number of Features: {}'.format(self.nSelFeats))
        mpi.rootprint('Feature selection took {} secs'.format(itertimer.lap()))
        mpi.rootprint('Retraining the model...')
        loss = self.retrain_model(nActiveSet, samplePerRun)
        mpi.rootprint('Total loss reduction {}/{}={}'.format(loss, old_loss, loss/old_loss))
        mpi.rootprint('Current training accuracy: {}'.format(self.compute_current_accuracy()))
        mpi.rootprint('Model retraining took {} secs'.format(itertimer.lap()))
        old_loss = loss
        if tester is not None:
            tester.append_feature(codeid, metabinid)
            if (T+1) % test_every == 0:
                # report test accuracy
                test_accuracy = tester.compute_test_accuracy(self.weights, self.b)
                mpi.rootprint('Current Testing accuracy: {}'.format(test_accuracy))
        self.safebarrier()
        mpi.rootprint('This round took {} secs, total {} secs'.format(timer.lap(), timer.total()))
        mpi.rootprint('ETA {} secs.'.format(timer.total() * (self.maxGraftDim-T)/(T+1.0e-5)))
        if dump_every > 0 and (T+1) % dump_every == 0 and dump_file is not None:
            mpi.rootprint('*'*15 + 'Dumping' + '*'*15)
            self.dump_current_state(dump_file + str(T) + '.mat')
    mpi.rootprint('*'*15 + 'Finalizing' + '*'*15)
    if dump_file is not None:
        self.dump_current_state(dump_file + 'final.mat')
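# An illustrative call (the parameter values here are assumptions, not taken from
# the original experiments): a negative nActiveSet retrains on features re-selected
# by their gradient values, as recommended in the docstring above.
#
#   grafter.graft(dump_every=100, dump_file='cifar_dump_',
#                 nActiveSet=-50, tester=tester, test_every=10,
#                 samplePerRun=0.1)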
def randomselecttest(self, tester=None, random_iterations=1, should_normalize=True):
    '''
    test the performance of random feature selection

    modified by Ian Goodfellow to use seeded random number generation
    so that results are replicable
    '''
    self.comm.barrier()
    mpi.rootprint('*'*46)
    mpi.rootprint('*'*15+'random selection'+'*'*15)
    mpi.rootprint('*'*46)
    trainaccu = np.zeros(random_iterations)
    testaccu = np.zeros(random_iterations)
    rng = np.random.RandomState([1, 2, 3])
    if tester is not None:
        # normalize the test data with the statistics of the training data
        tester.normalize_data(self.mLocal, self.stdLocal, sabotage=not should_normalize)
    itertimer = Timer()
    for iter in range(random_iterations):
        itertimer.reset()
        mpi.rootprint('*'*15+'Round {}'.format(iter)+'*'*15)
        if self.rank == 0:
            # decide which features we are going to select
            allidx = np.array(range(self.nCodes*self.nMetabins), dtype=np.int)
            rng.shuffle(allidx)
            codeidlist = allidx // self.nMetabins
            metabinidlist = allidx % self.nMetabins
        else:
            codeidlist = None
            metabinidlist = None
        codeidlist = self.comm.bcast(codeidlist, root=0)
        metabinidlist = self.comm.bcast(metabinidlist, root=0)
        self.append_multiple_features(codeidlist[:self.maxGraftDim], metabinidlist[:self.maxGraftDim])
        mpi.rootprint('Feature selection took {} secs'.format(itertimer.lap()))
        mpi.rootprint('Training...')
        loss = self.retrain_model(None)
        trainaccu[iter] = self.compute_current_accuracy()
        mpi.rootprint('Training took {} secs'.format(itertimer.lap()))
        mpi.rootprint('Current training accuracy: {}'.format(trainaccu[iter]))
        if tester is not None:
            tester.append_multiple_features(codeidlist[:self.maxGraftDim], metabinidlist[:self.maxGraftDim])
            testaccu[iter] = tester.compute_test_accuracy(self.weights, self.b)
            mpi.rootprint('Current Testing accuracy: {}'.format(testaccu[iter]))
            mpi.rootprint('Testing selection took {} secs'.format(itertimer.lap()))
        self.safebarrier()
    mpi.rootprint('*'*15+'Summary'+'*'*15)
    mpi.rootprint('Training accuracy: {} +- {}'.format(np.mean(trainaccu), np.std(trainaccu)))
    mpi.rootprint('Testing accuracy: {} +- {}'.format(np.mean(testaccu), np.std(testaccu)))
def train_whole_model(self, tester=None):
    '''
    test the performance using all the features
    may be memory consuming.
    '''
    self.comm.barrier()
    mpi.rootprint('*'*46)
    mpi.rootprint('*'*15+'whole featureset'+'*'*15)
    mpi.rootprint('*'*46)
    if tester is not None:
        # normalize the test data with the statistics of the training data
        tester.normalize_data(self.mLocal, self.stdLocal)
    timer = Timer()
    timer.reset()
    if self.maxGraftDim != self.nMetabins*self.nCodes:
        mpi.rootprint('Please initialize with maxGraftDim=nMetabins*nCodes')
        return
    self.nSelFeats = 0
    self.isSelected[:] = False
    mpi.rootprint('Generating Features...')
    for code in range(self.nCodes):
        for metabin in range(self.nMetabins):
            self.append_feature(code, metabin)
            if tester is not None:
                tester.append_feature(code, metabin)
    mpi.rootprint('Feature generation took {} secs'.format(timer.lap()))
    mpi.rootprint('Training...')
    loss = self.retrain_model(None)
    mpi.rootprint('Training took {} secs'.format(timer.lap()))
    mpi.rootprint('Training accuracy: {}'.format(self.compute_current_accuracy()))
    if tester is not None:
        mpi.rootprint('Current Testing accuracy: {}'.format(tester.compute_test_accuracy(self.weights, self.b)))
def load_data_batch(self, root, batch_size, file_template, labelfile,
                    rootRead=True, isTest=False,
                    local_cache_root=None, read_local_cache=False, should_normalize=True):
    '''
    load the data in batches. file_template should be 'filename_{}_{}.mat',
    where the batch size and batch id will be filled in. The mat file should
    contain a variable called 'feat'. labelfile is the file for labels,
    starting from either 0 or 1 (our code converts the labels to 0 ~ nLabel-1).
    '''
    from scipy import io
    nBatches = int(np.ceil(float(self.nData) / batch_size))
    # deal with both cases: batch ids starting from 0 or from 1
    if os.path.exists(os.path.join(root, file_template.format(batch_size, 0))):
        allrange = range(nBatches)
    else:
        allrange = range(1, nBatches+1)
    if local_cache_root is not None and not os.path.exists(local_cache_root):
        try:
            os.makedirs(local_cache_root)
        except OSError:
            mpi.nodeprint('Warning: I cannot create the necessary directory.')
    if read_local_cache and local_cache_root is not None:
        # load from local cache
        sid = 0
        for bid in allrange:
            mpi.rootprint('From Local Cache: Loading batch {} of {}'.format(bid, nBatches))
            filename = os.path.join(local_cache_root, file_template.format(batch_size, bid))
            matdata = io.loadmat(filename)
            batchNdata = matdata['feat'].shape[2]
            self.featSlice[:, :, sid:sid+batchNdata] = matdata['feat']
            sid += batchNdata
    elif rootRead:
        # the root node reads the file and then propagates the values to the other nodes
        dataid = 0  # current datum id
        dataBuffer = np.zeros(self.nBaseFeat, dtype=self.dtype)
        timer = Timer()
        for bid in allrange:
            mpi.rootprint('RootRead: Loading batch {} of {}'.format(bid, nBatches))
            if self.rank == 0:
                # read only if I am root
                filename = os.path.join(root, file_template.format(batch_size, bid))
                print filename
                matdata = io.loadmat(filename)
                feat = matdata['feat']
                batchNdata = feat.shape[0]
            else:
                feat = None
                batchNdata = 0
            # broadcast the features
            # it seems that doing this one datum at a time is the fastest...
            batchNdata = self.comm.bcast(batchNdata, root=0)
            for batchfeatid in range(batchNdata):
                if self.rank == 0:
                    dataBuffer[:] = feat[batchfeatid]
                self.comm.Bcast(dataBuffer, root=0)
                # the data storage is like
                # [bin1_code1 bin1_code2 ... bin1_codeK bin2_code1 ... binN_codeK]
                # while our local layout is [nCodeLocal, nBins, nData]
                self.featSlice[:, :, dataid] = \
                    dataBuffer.reshape(self.nBins, self.nCodes)[:, self.codeRange[0]:self.codeRange[1]].T
                dataid += 1
            if local_cache_root is not None:
                # write the local cache so we can read it back later
                filename = os.path.join(local_cache_root, file_template.format(batch_size, bid))
                try:
                    io.savemat(filename, {'feat': self.featSlice[:, :, dataid-batchNdata:dataid]}, oned_as='row')
                except Exception as e:
                    mpi.nodeprint('Unable to save to local buffer {}'.format(filename))
            mpi.rootprint('Elapsed {} secs.'.format(timer.lap()))
def init_specs(self, nData, nBinsPerEdge, nCodes, nLabel, maxGraftDim, gamma, dtype,
               metabinGenerator=bd.rectangularBins):
    '''
    Initialize the specs. Specifically, the raw data (for the base bins) is an
    nBinsPerEdge^2 * nCodes * nData cube, and each node will host a subset of
    the codes (all bins for any single code will be hosted on the same node).

    ==Parameters==
    nData: number of data points.
    nBinsPerEdge: number of base bins per edge. For example, for 4x4 base bins, pass 4.
    nCodes: the codebook size.
    nLabel: number of labels.
    maxGraftDim: the maximum number of features to select.
    gamma: regularizer for the classifier.
    dtype: data type. Only np.float64 is supported for now, since some of our
        C code has a double-precision version only.
    metabinGenerator: the function to generate metabins. See bindef.py.
    '''
    # determine feature range and data range
    if nData < self.size or nCodes < self.size:
        print 'Seriously? Is the problem really large scale?'
        # I know it's unethical, but whatever
        exit()
    self.nData = nData
    self.nCodes = nCodes
    self.nBinsPerEdge = nBinsPerEdge
    self.nBins = nBinsPerEdge * nBinsPerEdge
    self.nBaseFeat = self.nCodes * self.nBins
    self.metabins = metabinGenerator(nBinsPerEdge)
    self.nMetabins = self.metabins.shape[0]
    self.nLabel = nLabel
    if maxGraftDim > self.nMetabins * self.nCodes:
        mpi.rootprint('Warning: maxGraftDim should be no more than the number of available features.')
        maxGraftDim = self.nMetabins * self.nCodes
    self.maxGraftDim = maxGraftDim
    self.gamma = gamma
    self.dtype = dtype
    self.ncode_per_node = int(np.ceil(float(nCodes) / self.size))
    self.codeRange = [self.ncode_per_node * self.rank,
                      min(self.ncode_per_node * (self.rank + 1), nCodes)]
    self.nCodeLocal = int(self.codeRange[1] - self.codeRange[0])
    self.mLocal = np.zeros((self.nCodeLocal, self.nMetabins), dtype=self.dtype)
    self.stdLocal = np.zeros((self.nCodeLocal, self.nMetabins), dtype=self.dtype)
    self.normalized = False
    # pre-allocate data space
    self.featSlice = np.zeros([self.nCodeLocal, self.nBins, self.nData], dtype=self.dtype)
    self.labels = -np.ones([self.nLabel, self.nData], dtype=self.dtype)
    self.rawlabels = np.zeros(self.nData, dtype=np.int)
    # pre-allocate the selected-feature cache
    if self.rank < self.nLabel:
        self.dataSel = np.zeros([self.maxGraftDim, self.nData], dtype=self.dtype)  # selected features
    else:
        self.dataSel = None
    # pre-allocate classifier parameters
    self.weights = np.zeros([self.nLabel, self.maxGraftDim], dtype=self.dtype)  # weights
    self.b = np.zeros(self.nLabel, dtype=self.dtype)  # bias
    self.curr_wxb = np.zeros([self.nLabel, self.nData], dtype=self.dtype)  # current prediction
    # pre-allocate feature selection statistics
    self.nSelFeats = 0  # number of selected features
    self.selCodeID = np.zeros(self.maxGraftDim, dtype=np.int)
    self.selMetabinID = np.zeros(self.maxGraftDim, dtype=np.int)
    self.isSelected = np.zeros((self.nCodes, self.nMetabins), dtype=np.bool)  # 0-1 array marking whether a feature is selected
    # pre-allocate mpi buffers
    self.featBuffer = np.zeros(self.nData, dtype=self.dtype)
    self.featBufferPerCode = np.zeros((self.nMetabins, self.nData), dtype=self.dtype)
    # other buffers: the local gradients used for feature selection
    self.localGradMat = np.zeros((self.nCodeLocal, self.nMetabins, self.nLabel), dtype=self.dtype)
    self.scoreVec = np.zeros((self.nCodeLocal, self.nMetabins), dtype=self.dtype)
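    # Worked example of the partitioning above (illustrative numbers only):
    # with nCodes = 1600 and self.size = 8 MPI nodes, ncode_per_node = ceil(1600/8) = 200,
    # so rank 3 hosts codeRange = [600, 800) and nCodeLocal = 200; its featSlice is then
    # a (200, nBins, nData) array holding all base bins for those 200 codes, matching the
    # "all bins for any single code live on the same node" layout described in the docstring.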
    # this sometimes helps python do garbage collection
    matdata = None
    # load the labels
    if self.rank == 0:
        matdata = io.loadmat(os.path.join(root, labelfile))
        # if the labels start from 1, make them start from 0
        if matdata['label'].min() == 1:
            matdata['label'] -= 1
        self.rawlabels[:] = matdata['label'].reshape(matdata['label'].size)[:self.nData]
        matdata = None
    self.comm.Bcast(self.rawlabels, root=0)
    for i in range(self.nData):
        # turn the raw labels into a -1/1 label matrix
        self.labels[self.rawlabels[i], i] = 1
    if not isTest:
        mpi.rootprint('Normalizing training data')
        timer = Timer()
        self.normalize_data(sabotage=not should_normalize)
        mpi.nodeprint('Normalization took {} secs.'.format(timer.lap()))

def append_feature(self, codeid, metabinid):
    '''
    Find the owner of the feature and broadcast it from that node to all the
    others; every node then appends the feature to its currently selected
    features and updates the slice of data it is responsible for.
    '''
    # find the owner
    owner = int(codeid / self.ncode_per_node)
    if self.rank == owner: