def setup_shared_xy(self, borrow=True):
    '''Set up Theano shared variables for input X and output y.

    X is a matrix of num_samples_per_mega_batch x num_features.
    y is a vector of num_samples_per_mega_batch x 1.
    '''
    self.th_train_set = FeatureSet()
    mega_batch_y = np.zeros(self.num_samples / self.num_mega_batches)
    self.th_train_set.yf = theano.shared(
        np.asarray(mega_batch_y, dtype=theano.config.floatX),
        name='train_set.y',
        borrow=borrow,
    )
    self.th_train_set.y = T.cast(self.th_train_set.yf, 'int32')

    # set up theano shared variables
    mega_batch_x = np.zeros((self.num_samples / self.num_mega_batches, self.num_features))
    self.th_train_set.x = theano.shared(
        np.asarray(mega_batch_x, dtype=theano.config.floatX),
        name='train_set.X',
        borrow=borrow,
    )
    logging.info('set up th_train_set.x with shape: {}'.format(
        self.th_train_set.x.get_value(borrow=borrow).shape))
    if self.global_logging_level <= logging.DEBUG2:
        logging.debug2('self.th_train_set.y[:20]={}'.format(self.th_train_set.y.eval()[:20]))
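# Why y is stored as floatX and then cast: a short, self-contained sketch of
# the standard Theano idiom assumed above (only float shared variables can
# live on the GPU, so integer labels are kept as floatX and used everywhere
# through a symbolic int32 view). The label values are made up.
import numpy as np
import theano
import theano.tensor as T

labels = np.array([0, 1, 2, 1])
shared_yf = theano.shared(np.asarray(labels, dtype=theano.config.floatX),
                          borrow=True)
shared_y = T.cast(shared_yf, 'int32')  # symbolic view; set_value goes to shared_yf
print shared_y.eval()                  # -> [0 1 2 1]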
def load_shared_from_ram(self, mega_batch_index, load_y):
    '''Load one mega-batch from main memory into the Theano shared variables.'''
    mega_batch_size = self.num_samples / self.num_mega_batches
    lo, hi = mega_batch_index * mega_batch_size, (mega_batch_index + 1) * mega_batch_size
    if self.global_logging_level <= logging.DEBUG2:
        logging.debug2('Setting train_set.x[{}:{}]'.format(lo, hi))
    self.th_train_set.x.set_value(self.mm_train_set.x[lo:hi])
    if load_y:
        self.th_train_set.yf.set_value(self.mm_train_set.y[lo:hi])
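# A self-contained sketch (with made-up shapes and a toy objective) of the
# mega-batch pattern that setup_shared_xy and load_shared_from_ram implement:
# one mega-batch lives in a Theano shared variable, minibatches are sliced out
# of it symbolically via `givens`, and set_value refills it from host RAM.
import numpy as np
import theano
import theano.tensor as T

num_samples, num_features = 8, 3
num_mega_batches, batch_size = 2, 2
mega_batch_size = num_samples // num_mega_batches

ram_x = np.random.rand(num_samples, num_features).astype(theano.config.floatX)
shared_x = theano.shared(
    np.zeros((mega_batch_size, num_features), dtype=theano.config.floatX),
    borrow=True)

index = T.lscalar('index')
x = T.matrix('x')
toy_cost = T.sum(x)  # stand-in for a real training objective with updates
fn = theano.function(
    [index], toy_cost,
    givens={x: shared_x[index * batch_size:(index + 1) * batch_size]})

for mega in xrange(num_mega_batches):
    lo = mega * mega_batch_size
    shared_x.set_value(ram_x[lo:lo + mega_batch_size])  # one host->device transfer
    for mini in xrange(mega_batch_size // batch_size):
        print fn(mini)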
def test_dbn(
    dataset_file,
    label_file,
    pretrain_model_file,
    finetuned_model_file,
    hidden_layers_sizes=[1024],
    pretraining_epochs=100,
    pretrain_lr=0.01,
    k=1,
    finetune_training_epochs=1000,
    finetune_lr=0.1,
    batch_size=10,
    numpy_rng_seed=4242,
    valid_size=None,
    test_size=None,
):
    """Demonstrates how to train and test a Deep Belief Network.

    :type finetune_lr: float
    :param finetune_lr: learning rate used in the finetune stage

    :type pretraining_epochs: int
    :param pretraining_epochs: number of epochs to do pretraining

    :type pretrain_lr: float
    :param pretrain_lr: learning rate to be used during pre-training

    :type k: int
    :param k: number of Gibbs steps in CD/PCD

    :type finetune_training_epochs: int
    :param finetune_training_epochs: maximal number of iterations to run the optimizer

    :type dataset_file: string
    :param dataset_file: path to the pickled dataset file

    :type batch_size: int
    :param batch_size: the size of a minibatch
    """
    logging.info("THEANO_FLAGS={}".format(os.getenv("THEANO_FLAGS")))
    datasets = load_data(dataset_file, label_file, valid_size=valid_size, test_size=test_size)
    train_set_x, train_set_y = datasets[0]
    num_features = train_set_x.get_value(borrow=True).shape[1]
    num_classes = len(set(train_set_y.eval()))
    logging.info("num_features={}, num_classes={}".format(num_features, num_classes))
    logging.info("hidden_layers_sizes={}".format(hidden_layers_sizes))
    logging.info("pretraining_epochs={}".format(pretraining_epochs))
    logging.info("pretrain_lr={}".format(pretrain_lr))
    logging.info("CD-k={}".format(k))
    logging.info("finetune_training_epochs={}".format(finetune_training_epochs))
    logging.info("finetune_lr={}".format(finetune_lr))
    logging.info("batch_size={}".format(batch_size))
    logging.info("numpy_rng seed={}".format(numpy_rng_seed))

    # numpy random generator
    numpy_rng = numpy.random.RandomState(numpy_rng_seed)
    logging.info("... building the model")
    # construct the Deep Belief Network
    dbn = DBN(numpy_rng=numpy_rng, n_ins=num_features,
              hidden_layers_sizes=hidden_layers_sizes, n_outs=num_classes)

    # compute number of minibatches for training, validation and testing
    n_train_batches = dbn.get_num_batches(train_set_x, batch_size)
    logging.info("n_train_batches={}".format(n_train_batches))

    # start-snippet-2
    #########################
    # PRETRAINING THE MODEL #
    #########################
    logging.info("... getting the pretraining functions")
    pretraining_fns = dbn.pretraining_functions(train_set_x=train_set_x, batch_size=batch_size, k=k)

    logging.info("... pre-training the model")
    start_time = timeit.default_timer()
    # Pre-train layer-wise
    for ii in xrange(dbn.n_layers):
        logging.debug("pretrain layer {}/{}".format(ii + 1, dbn.n_layers))
        # go through pretraining epochs
        for epoch in xrange(pretraining_epochs):
            logging.debug("pretrain layer {}, epoch {}/{}".format(ii + 1, epoch + 1, pretraining_epochs))
            # go through the training set
            cc = []
            for batch_index in xrange(n_train_batches):
                logging.debug(
                    "pretrain layer {}, epoch {}, batch {}/{}".format(
                        ii + 1, epoch + 1, batch_index + 1, n_train_batches
                    )
                )
                cc.append(pretraining_fns[ii](index=batch_index, lr=pretrain_lr))
                logging.debug2(
                    "W={}, hbias={}, vbias={}".format(
                        dbn.rbm_layers[0].W[:2, :2].eval(),
                        dbn.rbm_layers[0].hbias[:4].eval(),
                        dbn.rbm_layers[0].vbias[:4].eval(),
                    )
                )
            logging.debug2("cost={}".format(cc))
            logging.info("Pre-training layer {:d}, epoch {:d}, avg cost {}".format(ii, epoch, numpy.mean(cc)))
    end_time = timeit.default_timer()
    # end-snippet-2
    logging.info("The pretraining code ran for {:.2f}m".format((end_time - start_time) / 60.0))

    if pretrain_model_file:
        logging.info("saving pretrain model file to {}".format(pretrain_model_file))
        dbn.save_model(pretrain_model_file)
    else:
        logging.info("not saving model file")

    ########################
    # FINETUNING THE MODEL #
    ########################
    # get the training, validation and testing function for the model
    logging.info("... getting the finetuning functions")
    train_fn, train_model, validate_model, test_model = dbn.build_finetune_functions(
        datasets=datasets, batch_size=batch_size, learning_rate=finetune_lr
    )

    logging.info("... finetuning the model")
    # early-stopping parameters
    patience = 4 * n_train_batches  # look at this many minibatches regardless
    patience_increase = 2.0  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    # go through this many minibatches before checking the network on the validation set;
    # in this case we check every epoch
    validation_frequency = min(n_train_batches, patience / 2)

    best_validation_loss = numpy.inf
    best_iter = 0
    avg_test_loss = 0.0
    start_time = timeit.default_timer()

    done_looping = False
    epoch = 0
    logging.debug2("W[:2, :4]={}, b[:4]={}".format(dbn.logLayer.W.eval()[:2, :4], dbn.logLayer.b[:4].eval()))
    while (epoch < finetune_training_epochs) and (not done_looping):
        epoch = epoch + 1
        logging.debug("finetune epoch {}/{}".format(epoch, finetune_training_epochs))
        for minibatch_index in xrange(n_train_batches):
            logging.debug("finetune epoch {}, minibatch {}/{}".format(epoch, minibatch_index + 1, n_train_batches))
            # ****** execute the update ******
            minibatch_avg_cost = train_fn(minibatch_index)
            logging.debug2(
                "hiddenLayer0 W[:1, :4]={}, b[:4]={}".format(
                    dbn.sigmoid_layers[0].W[:1, :4].eval(), dbn.sigmoid_layers[0].b[:4].eval()
                )
            )
            logging.debug2(
                "logLayer W[:1, :4]={}, b[:4]={}".format(dbn.logLayer.W[:1, :4].eval(), dbn.logLayer.b[:4].eval())
            )
            # ********************************
            logging.info("minibatch_avg_cost={}".format(minibatch_avg_cost))

            iter_num = (epoch - 1) * n_train_batches + minibatch_index
            if (iter_num + 1) % validation_frequency == 0:
                logging.debug("finetune epoch {}, minibatch {}, iter_num {}".format(epoch, minibatch_index, iter_num))
                validation_losses = validate_model()
                logging.debug2("validation_losses={}".format(validation_losses))
                this_validation_loss = numpy.mean(validation_losses)
                logging.info(
                    "epoch {:d}, minibatch {:d}/{:d}, validation error {:f} %".format(
                        epoch, minibatch_index + 1, n_train_batches, this_validation_loss * 100.0
                    )
                )

                # if we got the best validation score until now
                if this_validation_loss < best_validation_loss:
                    # improve patience if loss improvement is good enough
                    if this_validation_loss < best_validation_loss * improvement_threshold:
                        patience = max(patience, iter_num * patience_increase)
                    # save best validation score and iteration number
                    best_validation_loss = this_validation_loss
                    best_iter = iter_num
                    # test it on the test set
                    test_losses = test_model()
                    avg_test_loss = numpy.mean(test_losses)
                    logging.info(
                        "  epoch {:d}, minibatch {:d}/{:d}, test error of best model {} %".format(
                            epoch, minibatch_index + 1, n_train_batches, avg_test_loss * 100.0
                        )
                    )

            if patience <= iter_num:
                done_looping = True
                break

    train_loss = train_model()
    logging.debug2("train_loss={}".format(train_loss))
    avg_train_loss = numpy.mean(train_loss)
    end_time = timeit.default_timer()
    logging.info(
        (
            "Optimization complete with avg train error of %f, best validation error of %f, "
            "obtained at iteration %i, with avg test error %f"
        )
        % (avg_train_loss, best_validation_loss, best_iter + 1, avg_test_loss)
    )
    logging.info("The fine tuning code ran for {:.2f}m".format((end_time - start_time) / 60.0))
    logging.info("Saving finetuned model file to {}".format(finetuned_model_file))
    dbn.save_model(finetuned_model_file)
    logging.info("done saving")
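# A minimal, Theano-free sketch of the patience-based early stopping used in
# test_dbn above, run on made-up losses. It simplifies by evaluating every
# iteration instead of every `validation_frequency` minibatches: the deadline
# (`patience`) only moves out when the loss improves by more than the relative
# `improvement_threshold`.
def early_stopping_demo(losses, n_train_batches=10):
    patience = 4 * n_train_batches
    patience_increase = 2.0
    improvement_threshold = 0.995
    best_loss = float('inf')
    best_iter = 0
    for iter_num, loss in enumerate(losses):
        if loss < best_loss:
            if loss < best_loss * improvement_threshold:
                patience = max(patience, iter_num * patience_increase)
            best_loss = loss
            best_iter = iter_num
        if patience <= iter_num:
            break  # patience exhausted: stop early
    return best_iter, best_loss

# e.g. losses that stop improving after iteration 50 trip the patience check:
print early_stopping_demo([1.0 / min(i + 1, 50) for i in xrange(500)])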
def train_score(mini_idx_begin, mini_idx_end):
    """Inclusive of mini_idx_begin, exclusive of mini_idx_end."""
    logging.debug2("mini_idx_begin={}, end={}".format(mini_idx_begin, mini_idx_end))
    return [train_score_i(i) for i in xrange(mini_idx_begin, mini_idx_end)]
def compare(self):
    '''Compare server hashes of this AU'''

    def diff(hash_1, hash_2):
        '''Returns the differences and percentage agreement between two hashes'''
        hash_1_entries, hash_2_entries = (
            [entry.rsplit(None, 1) for entry in hash.splitlines()
             if entry and not entry.startswith('#')]
            for hash in (hash_1, hash_2)
        )
        differences = []
        difference_count = 0
        offset_1 = 0
        offset_2 = 0
        while True:
            try:
                hash_1, filename_1 = hash_1_entries[offset_1]
                hash_2, filename_2 = hash_2_entries[offset_2]
            except IndexError:
                # one list is exhausted; everything left in the other is unmatched
                differences.extend('< ' + filename for hash, filename in hash_1_entries[offset_1:])
                differences.extend('> ' + filename for hash, filename in hash_2_entries[offset_2:])
                entry_total = len(hash_1_entries) + len(hash_2_entries)
                difference_count += entry_total - offset_1 - offset_2
                return differences, 100 * (entry_total - difference_count) // entry_total
            if filename_1 == filename_2:
                if hash_1 == 'Hash error (see log)':
                    differences.append('X ' + filename_1)
                    difference_count += 2
                elif hash_1 != hash_2:
                    differences.append('! ' + filename_1)
                    difference_count += 2
                offset_1 += 1
                offset_2 += 1
            elif filename_1 < filename_2:
                differences.append('< ' + filename_1)
                difference_count += 1
                offset_1 += 1
            else:
                differences.append('> ' + filename_2)
                difference_count += 1
                offset_2 += 1

    local_comparison = len(self.hash_records) == 2
    location = 'Local' if local_comparison else 'Remote'
    logging.debug(self.status_message('Comparing hashes of %s on ' + str(self.hash_records[0][0]) + ' to %s'))
    logging.debug2('First hash file:\n' + self.hash_records[0][1])
    logging.debug2('Second hash file:\n' + self.hash_records[-1][1])
    difference, agreement = diff(self.hash_records[0][1], self.hash_records[-1][1])
    self.hash_records[-1] += difference, agreement
    if agreement >= configuration.getint(PROGRAM, 'agreement'):
        logging.info(location + ' hash file match for AU "%s"' % self.AU)
        if local_comparison and self.remote_clients:
            self.state = Content.State.CHECK
            return
        else:
            self.state = Content.State.HASH_MATCH
    else:
        logging.warn(location + ' hash file mismatch for AU "%s"' % self.AU)
        self.state = Content.State.HASH_MISMATCH
    raise Leaving_Pipeline
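# A hypothetical, minimal illustration of diff's marker conventions (the sample
# hashes and filenames below are made up; the expected results were traced by
# hand through the merge above, which walks both entry lists in filename
# order). Hash files consist of '#' comment lines plus "<hash> <filename>"
# entries.
hash_file_1 = '\n'.join((
    '# comments are skipped',
    'AAAA file_a',
    'BBBB file_b',
    'CCCC file_c',
))
hash_file_2 = '\n'.join((
    'AAAA file_a',
    'DDDD file_b',
    'EEEE file_d',
))
# diff(hash_file_1, hash_file_2) would return:
#   differences = ['! file_b',   # same filename, different hash (counts twice)
#                  '< file_c',   # only in the first hash file
#                  '> file_d']   # only in the second hash file
#   agreement   = 100 * (6 - 4) // 6 = 33   # only file_a agrees on both sides
# 'X <filename>' would additionally mark an entry whose recorded hash is
# 'Hash error (see log)'.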
def finetune(self):
    ########################
    # FINETUNING THE MODEL #
    ########################
    # get the training, validation and testing function for the model
    logging.info('... getting the finetuning functions')
    # assert self.valid_size + self.test_size == self.num_samples / self.num_mega_batches
    datasets = [(self.th_train_set.x, self.th_train_set.y),
                (self.th_train_set.x[:self.valid_size], self.th_train_set.y[:self.valid_size]),
                (self.th_train_set.x[self.valid_size:], self.th_train_set.y[self.valid_size:])]
    mega_batch_size = self.num_samples / self.num_mega_batches
    logging.info('mega_batch_size={}'.format(mega_batch_size))
    # num_valid_batches = self.valid_size / self.batch_size  ## @todo
    # num_test_batches = self.test_size / self.batch_size
    train_fn, train_score, indices = self.dbn.build_finetune_functions2(
        datasets=datasets,
        mini_batch_size=self.batch_size,
        mega_batch_size=mega_batch_size,
    )

    logging.info('... finetuning the model')
    # early-stopping parameters
    patience = 4 * self.num_minibatches  # look at this many examples regardless
    patience_increase = 2.  # wait this much longer when a new best is found
    improvement_threshold = 0.995  # a relative improvement of this much is considered significant
    # go through this many minibatches before checking the network on the validation set;
    # in this case we check every epoch
    validation_frequency = min(self.num_minibatches, patience / 2)
    logging.info('patience={}, patience_increase={}, improvement_threshold={:.4f}, validation_frequency={}'.format(
        patience, patience_increase, improvement_threshold, validation_frequency))

    done_looping = False
    epoch = self.finetune_epoch_start
    best_train_loss = np.inf
    if self.global_logging_level <= logging.DEBUG2:
        logging.debug2('W[:2, :4]={}, b[:4]={}'.format(
            self.dbn.logLayer.W[:2, :4].eval(), self.dbn.logLayer.b[:4].eval()))

    # special case optimization: load only once if num_mega_batches == 1
    if self.num_mega_batches == 1:
        logging.info('Single megabatch. Loading it into GPU/CPU')
        self.load_mega_batch(0, load_y=True)

    if self.global_logging_level <= logging.DEBUG:
        # it may take a few seconds to compute the L2 norms
        l2 = [p.norm(2).eval() for p in self.dbn.params]
        l2all = np.sqrt(sum(x * x for x in l2))
        logging.info('epoch {}: L2 norms: all={}, individual={}'.format(
            epoch, l2all, ', '.join(str(x) for x in l2)))
        # match printing of minibatch_avg_cost after each minibatch
        logging.info('delta_t=0, epoch 0, mega 0, mini 0, minibatch_avg_cost={}'.format(
            np.mean(train_score(0, self.num_minibatches_in_mega)) / self.batch_size))

    # fixed learning_rate
    learning_rate = np.asscalar(np.array(self.finetune_lr, dtype=theano.config.floatX))
    start_time = timeit.default_timer()
    while (epoch < self.finetune_training_epochs) and (not done_looping):
        # learning_rate = self.finetune_lr / np.sqrt(epoch + 1)
        # learning_rate = np.asscalar(np.array(learning_rate, dtype=theano.config.floatX))
        epoch = epoch + 1
        logging.info('finetune epoch {}/{}, learning_rate={:.4f}'.format(
            epoch, self.finetune_training_epochs, learning_rate))
        train_loss = []
        for mega_batch_index in xrange(self.num_mega_batches):
            if self.num_mega_batches > 1:
                self.load_mega_batch(mega_batch_index, load_y=True)
            minibatch_avg_cost_arr = np.zeros(self.num_minibatches_in_mega)
            for minibatch_index in xrange(self.num_minibatches_in_mega):
                if self.global_logging_level <= logging.DEBUG:
                    logging.debug('finetune epoch {}, megabatch {}/{}, minibatch {}/{}'.format(
                        epoch, mega_batch_index + 1, self.num_mega_batches,
                        minibatch_index + 1, self.num_minibatches_in_mega))
                # ****** execute the update ******
                if self.global_logging_level <= logging.DEBUG2:
                    logging.debug2('x_begin/end, y_begin/end={}'.format(indices(minibatch_index)))
                minibatch_avg_cost = train_fn(minibatch_index, learning_rate)
                minibatch_avg_cost_arr[minibatch_index] = minibatch_avg_cost
                if self.global_logging_level <= logging.DEBUG2:
                    logging.debug2('hiddenLayer0 W[:1, :4]={}, b[:4]={}'.format(
                        self.dbn.sigmoid_layers[0].W[:1, :4].eval(), self.dbn.sigmoid_layers[0].b[:4].eval()))
                if self.global_logging_level <= logging.DEBUG2:
                    logging.debug2('logLayer W[:1, :4]={}, b[:4]={}'.format(
                        self.dbn.logLayer.W[:1, :4].eval(), self.dbn.logLayer.b[:4].eval()))
                # ********************************
                logging.info('delta_t={:.3f}, epoch {}/{}, mega {}/{}, mini {}/{}, minibatch_avg_cost={}'.format(
                    timeit.default_timer() - start_time, epoch, self.finetune_training_epochs,
                    mega_batch_index + 1, self.num_mega_batches,
                    minibatch_index + 1, self.num_minibatches_in_mega, minibatch_avg_cost))
            # train_loss is computed at the end of a mega-batch.
            # Hence, the values here will likely differ from minibatch_avg_cost even if the error computed
            # in train_loss is the same as the cost in minibatch_avg_cost.
            if self.global_logging_level <= logging.DEBUG:
                train_loss.extend(train_score(0, self.num_minibatches_in_mega))
                if self.global_logging_level <= logging.DEBUG2:
                    logging.debug2('train_loss={}'.format(train_loss))
            logging.info('delta_t={}, avg_minibatch_avg_cost={}'.format(
                timeit.default_timer() - start_time, np.mean(minibatch_avg_cost_arr)))

        if self.global_logging_level <= logging.DEBUG:
            l2 = [p.norm(2).eval() for p in self.dbn.params]
            l2all = np.sqrt(sum(x * x for x in l2))
            logging.info('epoch {}: L2 norms: all={}, individual={}'.format(
                epoch, l2all, ', '.join(str(x) for x in l2)))

        # np.mean(train_loss) gives the mean loss per minibatch;
        # note that train_loss is only collected at DEBUG level or lower
        if self.global_logging_level <= logging.DEBUG:
            logging.debug('train_loss: {}'.format(train_loss[:10]))
        avg_train_loss = np.mean(train_loss) / self.batch_size
        logging.info('epoch {}: avg_train_loss={}'.format(epoch, avg_train_loss))
        if avg_train_loss < best_train_loss:
            if avg_train_loss < best_train_loss * improvement_threshold:
                patience = max(patience, epoch * patience_increase)
            best_train_loss = avg_train_loss
        if patience <= epoch:
            done_looping = True
            break

    logging.info('Optimization complete with avg train error of {:f}'.format(avg_train_loss))
    end_time = timeit.default_timer()
    logging.info('The fine tuning code ran for {:.2f}m'.format((end_time - start_time) / 60.))
    logging.info('Saving finetuned model file to {}'.format(self.finetuned_model_file))
    self.dbn.save_model(self.finetuned_model_file)
    logging.info('done saving')
def pretrain(self):
    # start-snippet-2
    #########################
    # PRETRAINING THE MODEL #
    #########################
    logging.info('... getting the pretraining functions')
    pretraining_fns = self.dbn.pretraining_functions(
        train_set_x=self.th_train_set.x, batch_size=self.batch_size, k=self.k
    )

    logging.info('... pre-training the model')
    start_time = timeit.default_timer()
    if self.global_logging_level <= logging.DEBUG2:
        logging.debug2('Initial weights:')
        for ii, layer in enumerate(self.dbn.rbm_layers):
            logging.debug2('layer {}, W={}, hbias={}, vbias={}'.format(
                ii, layer.W[:2, :2].eval(), layer.hbias[:4].eval(), layer.vbias[:4].eval()))

    # special case optimization: load only once if num_mega_batches == 1
    if self.num_mega_batches == 1:
        logging.info('Single megabatch. Loading it into GPU/CPU')
        self.load_mega_batch(0, load_y=False)

    # Pre-train layer-wise
    for layer_idx in xrange(self.dbn.n_layers):
        logging.info('pretrain layer {}/{}'.format(layer_idx + 1, self.dbn.n_layers))
        # go through pretraining epochs
        for epoch in xrange(self.pretrain_epoch_start, self.pretraining_epochs):
            logging.info('pretrain layer {}, epoch {}/{}'.format(
                layer_idx + 1, epoch + 1, self.pretraining_epochs))
            # go through the training set
            cc = np.zeros(self.num_minibatches_in_mega * self.num_mega_batches)
            cc_idx = 0
            for mega_batch_index in xrange(self.num_mega_batches):
                if self.num_mega_batches > 1:
                    self.load_mega_batch(mega_batch_index, load_y=False)
                for batch_index in xrange(self.num_minibatches_in_mega):
                    if self.global_logging_level <= logging.DEBUG:
                        logging.debug('pretrain layer {}, epoch {}/{}, mega_batch {}/{}, batch {}/{}'.format(
                            layer_idx + 1, epoch + 1, self.pretraining_epochs,
                            mega_batch_index + 1, self.num_mega_batches,
                            batch_index + 1, self.num_minibatches_in_mega))
                    cost = pretraining_fns[layer_idx](index=batch_index, lr=self.pretrain_lr)
                    cc[cc_idx] = cost
                    cc_idx += 1
                    if self.global_logging_level <= logging.DEBUG2:
                        for ii, layer in enumerate(self.dbn.rbm_layers):
                            logging.debug2('layer {}, W={}, hbias={}, vbias={}'.format(
                                ii, layer.W[:2, :2].eval(), layer.hbias[:4].eval(), layer.vbias[:4].eval()))
            if self.global_logging_level <= logging.DEBUG2:
                logging.debug2('cost: {}'.format(cc))
            logging.info('pre-train layer {:d}, epoch {:d}/{}, avg cost {}'.format(
                layer_idx, epoch + 1, self.pretraining_epochs, np.mean(cc)))
            l2 = [p.norm(2).eval() for p in self.dbn.params]
            l2all = np.sqrt(sum(x * x for x in l2))
            logging.info('pre-train layer {:d}, epoch {}/{}, L2 norms: all={}, individual={}'.format(
                layer_idx, epoch + 1, self.pretraining_epochs, l2all, ', '.join(str(x) for x in l2)))
    end_time = timeit.default_timer()
    # end-snippet-2
    logging.info('The pretraining code ran for {:.2f}m'.format((end_time - start_time) / 60.))

    if self.pretrain_model_file:
        logging.info('saving pretrain model file to {}'.format(self.pretrain_model_file))
        self.dbn.save_model(self.pretrain_model_file)
    else:
        logging.info('pretrain model not saved')