def learning_algorithm(args): name = args.algorithm learning_rate = float(args.learning_rate) momentum = args.momentum clipping_threshold = args.clipping clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) if name == 'adam': adam = Adam(learning_rate=learning_rate) step_rule = CompositeRule([adam, clipping]) learning_rate = adam.learning_rate elif name == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate) step_rule = CompositeRule([clipping, rms_prop]) learning_rate = rms_prop.learning_rate elif name == 'momentum': sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) step_rule = CompositeRule([clipping, sgd_momentum]) learning_rate = sgd_momentum.learning_rate elif name == 'sgd': sgd = Scale(learning_rate=learning_rate) step_rule = CompositeRule([clipping, sgd]) learning_rate = sgd.learning_rate else: raise NotImplementedError return step_rule, learning_rate
def test_momentum(): a = shared_floatx([3, 4]) cost = (a**2).sum() steps, updates = Momentum(0.1, 0.5).compute_steps( OrderedDict([(a, tensor.grad(cost, a))])) f = theano.function([], [steps[a]], updates=updates) assert_allclose(f()[0], [0.6, 0.8]) assert_allclose(f()[0], [0.9, 1.2]) assert_allclose(f()[0], [1.05, 1.4])
def learning_algorithm(args): name = args.algorithm learning_rate = float(args.learning_rate) momentum = args.momentum clipping_threshold = args.clipping if name == 'adam': adam = Adam(learning_rate=learning_rate) step_rule = adam elif name == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate, decay_rate=0.9) step_rule = CompositeRule([StepClipping(1.), rms_prop]) else: sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) step_rule = sgd_momentum return step_rule
def learning_algorithm(args): name = args.algorithm learning_rate = float(args.learning_rate) momentum = args.momentum clipping_threshold = args.clipping if name == 'adam': clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) adam = Adam(learning_rate=learning_rate) step_rule = CompositeRule([adam, clipping]) elif name == 'rms_prop': clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) rms_prop = RMSProp(learning_rate=learning_rate) step_rule = CompositeRule([clipping, rms_prop]) else: clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) step_rule = CompositeRule([clipping, sgd_momentum]) return step_rule
def train_model(cost, error_rate, train_stream, load_location=None, save_location=None): cost.name = "Cross_entropy" error_rate.name = 'Error_rate' # Define the model model = Model(cost) # Load the parameters from a dumped model if load_location is not None: logger.info('Loading parameters...') model.set_param_values(load_parameter_values(load_location)) cg = ComputationGraph(cost) step_rule = Momentum(learning_rate=0.1, momentum=0.9) algorithm = GradientDescent(cost=cost, step_rule=step_rule, params=cg.parameters) main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=algorithm, extensions=[ # DataStreamMonitoring([cost], test_stream, prefix='test', # after_epoch=False, every_n_epochs=10), DataStreamMonitoring([cost], train_stream, prefix='train', after_epoch=True), Printing(after_epoch=True) ]) main_loop.run() # Save the main loop if save_location is not None: logger.info('Saving the main loop...') dump_manager = MainLoopDumpManager(save_location) dump_manager.dump(main_loop) logger.info('Saved')
def learning_algorithm(args): name = args.algorithm learning_rate = float(args.learning_rate) momentum = args.momentum clipping_threshold = args.clipping if name == 'adam': clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) adam = Adam(learning_rate=learning_rate) # [adam, clipping] means 'step clipping' # [clipping, adam] means 'gradient clipping' step_rule = CompositeRule([adam, clipping]) elif name == 'rms_prop': clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) rms_prop = RMSProp(learning_rate=learning_rate) rm_non_finite = RemoveNotFinite() step_rule = CompositeRule([clipping, rms_prop, rm_non_finite]) else: clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) rm_non_finite = RemoveNotFinite() step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite]) return step_rule
def learning_algorithm(learning_rate, momentum=0.0, clipping_threshold=100, algorithm='sgd'): if algorithm == 'adam': clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) adam = Adam(learning_rate=learning_rate) # [adam, clipping] means 'step clipping' # [clipping, adam] means 'gradient clipping' step_rule = CompositeRule([adam, clipping]) elif algorithm == 'rms_prop': clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) rms_prop = RMSProp(learning_rate=learning_rate) rm_non_finite = RemoveNotFinite() step_rule = CompositeRule([clipping, rms_prop, rm_non_finite]) elif algorithm == 'sgd': clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold)) sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) rm_non_finite = RemoveNotFinite() step_rule = CompositeRule([clipping, sgd_momentum, rm_non_finite]) else: raise NotImplementedError return step_rule
def setup_algorithms(cost, cg, method, type="ff"): """Setup training algorithm. Parameters ---------- cost : expression cost expression cg : ComputationGraph Computation graph method : string training method: SGD, momentum SGD, AdaGrad, RMSprop learning_rate : float learning rate for learning method Returns ------- algorithm : GradientDescent Gradient Descent algorithm based on different optimization method """ if method == "sgd": step_rule = Scale(learning_rate=0.01) elif method == "momentum": step_rule = Momentum(learning_rate=0.01, momentum=0.95) elif method == "adagrad": step_rule = AdaGrad() elif method == "rmsprop": step_rule = RMSProp() if type == "RNN": step_rule = CompositeRule([StepClipping(1.0), step_rule]) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=step_rule) return algorithm
def main(feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=None, num_batches=None): if feature_maps is None: feature_maps = [32, 48, 64, 96, 96, 128] if mlp_hiddens is None: mlp_hiddens = [1000] if conv_sizes is None: conv_sizes = [9, 7, 5, 3, 2, 1] if pool_sizes is None: pool_sizes = [2, 2, 2, 2, 1, 1] if batch_size is None: batch_size = 64 conv_steps=[2, 1, 1, 1, 1, 1] #same as stride image_size = (128, 128) output_size = 2 learningRate = 0.001 drop_prob = 0.4 weight_noise = 0.75 num_epochs = 150 num_batches = None host_plot='http://*****:*****@ %s' % (graph_name, datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) PLOT_AVAILABLE = True except ImportError: PLOT_AVAILABLE = False extensions.append(Checkpoint(save_to, after_epoch=True, after_training=True, save_separately=['log'])) logger.info("Building the model") model = Model(cost) ########### Loading images ##################### main_loop = MainLoop( algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
cost_monitor = cost_train else: raise ValueError, conf.task recognizer.initialize() cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m]) weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg = apply_noise(cg, weights, conf.weight_noise) #************* training algorithm ************* model = Model(cost_train) if conf.step_rule == 'AdaDelta': step_rule = AdaDelta() elif conf.step_rule == 'Momentum': step_rule = Momentum(learning_rate=conf.learning_rate, momentum=conf.momentum) else: raise ('step_rule not known: {}'.format(conf.step_rule)) step_rule = CompositeRule([step_rule, StepClipping(conf.step_clipping)]) algorithm = GradientDescent(cost=cost_train, parameters=cg.parameters, step_rule=step_rule) #***************** main loop **************** train_monitor = TrainingDataMonitoring([cost_monitor], prefix="train") val_monitor = DataStreamMonitoring([cost_monitor], stream_val, prefix="val") if conf.task == 'CTC': PER_Monitor = PhonemeErrorRate elif conf.task == 'framewise':
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs, test_cost, experiment_path, initialization, init_width, weight_noise, z_prob, z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def onehot(x, numclasses=None): """ Convert integer encoding for class-labels (starting with 0 !) to one-hot encoding. The output is an array whose shape is the shape of the input array plus an extra dimension, containing the 'one-hot'-encoded labels. """ if x.shape == (): x = x[None] if numclasses is None: numclasses = x.max() + 1 result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") z = numpy.zeros(x.shape, dtype="int") for c in range(numclasses): z *= 0 z[numpy.where(x == c)] = 1 result[..., c] += z return result.astype(theano.config.floatX) alphabetsize = 10000 data = np.load('penntree_char_and_word.npz') trainset = data['train_words'] validset = data['valid_words'] testset = data['test_words'] if testing: trainset = trainset[:3000] validset = validset[:3000] if share_mask: if not z_prob: raise ValueError('z_prob must be provided when using share_mask') if z_prob_cells or z_prob_states: raise ValueError( 'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)' ) z_prob_cells = z_prob # we don't want to actually use these masks, so this is to debug z_prob_states = None else: if z_prob: raise ValueError('z_prob is only used with share_mask') z_prob_cells = z_prob_cells or '1' z_prob_states = z_prob_states or '1' # rng = np.random.RandomState(seed) ########################################### # # MAKE STREAMS # ########################################### def prep_dataset(dataset): dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))] dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2)) stream = DataStream( IndexableDataset(indexables=OrderedDict([('data', dataset)])), iteration_scheme=SequentialExampleScheme(dataset.shape[0])) stream = Transpose(stream, [(1, 0)]) stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells, drop_prob_igates, layer_size, num_layers, False, stoch_depth, share_mask, gaussian_drop, alphabetsize) stream.sources = ('data', ) * 3 + stream.sources + ( 'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates') return (stream, ) train_stream, = prep_dataset(trainset) valid_stream, = prep_dataset(validset) test_stream, = prep_dataset(testset) #################### data = train_stream.get_epoch_iterator(as_dict=True).next() #################### ########################################### # # BUILD MODEL # ########################################### print '.. building model' x = T.tensor3('data') y = x zoneouts_states = T.tensor3('zoneouts_states') zoneouts_cells = T.tensor3('zoneouts_cells') zoneouts_igates = T.tensor3('zoneouts_igates') x.tag.test_value = data['data'] zoneouts_states.tag.test_value = data['zoneouts_states'] zoneouts_cells.tag.test_value = data['zoneouts_cells'] zoneouts_igates.tag.test_value = data['zoneouts_igates'] if init_width and not initialization == 'uniform': raise ValueError('Width is only for uniform init, whassup?') if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=init_width) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size * 4, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d' % l, ogates_zoneout=ogates_zoneout) for l in range(num_layers) ] elif rnn_type.lower() == 'gru': in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size * 3, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d' % l) for l in range(num_layers) ] elif rnn_type.lower() == 'srnn': # FIXME!!! make ReLU in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d' % l) for l in range(num_layers) ] else: raise NotImplementedError hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) for layer in in_to_hids: layer.initialize() for layer in recurrent_layers: layer.initialize() hid_to_out.initialize() layer_input = x #in_to_hid.apply(x) init_updates = OrderedDict() for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)): rnn_embedding = in_to_hid.apply(layer_input) if rnn_type.lower() == 'lstm': states_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) cells_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name, cells_init.name = "states_init", "cells_init" states, cells = layer.apply( rnn_embedding, zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size], zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size], zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size], states_init, cells_init) init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])]) elif rnn_type.lower() in ['gru', 'srnn']: # untested! states_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name = "states_init" states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init) init_updates.update([(states_init, states[-1])]) else: raise NotImplementedError layer_input = states y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1])) shape_ = y_hat_pre_softmax.shape y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize))) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost') bpc = (cost / np.log(2.0)).copy(name='bpr') perp = T.exp(cost).copy(name='perp') cost_train = cost.copy(name='train_cost') cg_train = ComputationGraph([cost_train]) ########################################### # # NORM STABILIZER # ########################################### norm_cost = 0. def _magnitude(x, axis=-1): return T.sqrt( T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) elif penalty == 'hids': for l in range(num_layers): assert 'rnn%d_apply_states' % l in [ o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables) ] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): for l in range(num_layers): if output.name == 'rnn%d_apply_states' % l: norms = _magnitude(output) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy( 'cost_train') #should this be cost_train.outputs[0]? no. cg_train = ComputationGraph([cost_train]) ########################################### # # WEIGHT NOISE # ########################################### if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') model = Model(cost_train) learning_rate = float(learning_rate) clipping = StepClipping(threshold=np.cast[floatX](clipping)) if algorithm == 'adam': adam = Adam(learning_rate=learning_rate) learning_rate = adam.learning_rate step_rule = CompositeRule([adam, clipping]) elif algorithm == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate) learning_rate = rms_prop.learning_rate step_rule = CompositeRule([clipping, rms_prop]) elif algorithm == 'momentum': sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) learning_rate = sgd_momentum.learning_rate step_rule = CompositeRule([clipping, sgd_momentum]) elif algorithm == 'sgd': sgd = Scale(learning_rate=learning_rate) learning_rate = sgd.learning_rate step_rule = CompositeRule([clipping, sgd]) else: raise NotImplementedError algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)}) algorithm.add_updates(init_updates) def cond_number(x): _, _, sing_vals = T.nlinalg.svd(x, True, True) sing_mags = abs(sing_vals) return T.max(sing_mags) / T.min(sing_mags) def rms(x): return (x * x).mean().sqrt() whysplode_cond = [] whysplode_rms = [] for i, p in enumerate(init_updates): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append( cond_number(p).copy( 'ini%d:%s_cond(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append( rms(p).copy('ini%d:%s_rms(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) for i, p in enumerate(cg_train.parameters): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append( cond_number(p).copy( 'ini%d:%s_cond(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append( rms(p).copy('ini%d:%s_rms(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) observed_vars = [ cost_train, cost, bpc, perp, learning_rate, aggregation.mean( algorithm.total_gradient_norm).copy("gradient_norm_mean") ] # + whysplode_rms parameters = model.get_parameter_dict() for name, param in parameters.iteritems(): observed_vars.append(param.norm(2).copy(name=name + "_norm")) observed_vars.append( algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring(variables=observed_vars, prefix="train", after_epoch=True) dev_inits = [p.clone() for p in init_updates] cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace( zip(init_updates.keys(), dev_inits)) dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3] dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:])) dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp], data_stream=valid_stream, prefix="dev", updates=dev_init_updates) # noone does this if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: print 'Found: ' + param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name extensions = [] extensions.extend( [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) if test_cost: test_inits = [p.clone() for p in init_updates] cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace( zip(init_updates.keys(), test_inits)) test_cost, test_bpc, test_perp = cg_test.outputs[:3] test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:])) test_monitor = DataStreamMonitoring( variables=[test_cost, test_bpc, test_perp], data_stream=test_stream, prefix="test", updates=test_init_updates) extensions.extend([test_monitor]) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append( SaveParams('dev_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) class RollsExtension(TrainingExtension): """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """ def __init__(self, shvars): self.shvars = shvars def before_epoch(self): for v in self.shvars: v.set_value(np.roll(v.get_value(), 1, 0)) extensions.append( RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else []))) class LearningRateSchedule(TrainingExtension): """ Lets you set a number to divide learning rate by each epoch + when to start doing that """ def __init__(self): self.epoch_number = 0 def after_epoch(self): self.epoch_number += 1 if self.epoch_number > decrease_lr_after_epoch: learning_rate.set_value(learning_rate.get_value() / lr_decay) if bool(lr_decay) != bool(decrease_lr_after_epoch): raise ValueError( 'Need to define both lr_decay and decrease_lr_after_epoch') if lr_decay and decrease_lr_after_epoch: extensions.append(LearningRateSchedule()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) main_loop.run() print "Execution time: %f" % (time.time() - t1)
from blocks.roles import WEIGHT from blocks.graph import ComputationGraph, apply_noise, apply_dropout from datastream import RandomTransposeIt step_rule_name = 'adam' learning_rate = 0.1 momentum = 0.9 if step_rule_name == 'adadelta': step_rule = AdaDelta() elif step_rule_name == 'rmsprop': step_rule = RMSProp() elif step_rule_name == 'momentum': step_rule_name = "mom%s,%s" % (repr(learning_rate), repr(momentum)) step_rule = Momentum(learning_rate=learning_rate, momentum=momentum) elif step_rule_name == 'adam': step_rule = Adam() else: raise ValueError("No such step rule: " + step_rule_name) ibatchsize = None iter_scheme = RandomTransposeIt(ibatchsize, False, None, False) valid_iter_scheme = RandomTransposeIt(ibatchsize, False, None, False) w_noise_std = 0.05 r_dropout = 0.0 s_dropout = 0.0 i_dropout = 0.0 a_dropout = 0.0
def main(save_to, num_epochs, regularization=0.001, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 convnet = create_all_conv_net() x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')) test_components = (ComponentwiseCrossEntropy().apply( y.flatten(), probs).copy(name='components')) test_error_rate = (MisclassificationRate().apply( y.flatten(), probs).copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), probs).copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate, test_components]) # Apply dropout to all layer outputs except final softmax dropout_vars = VariableFilter( roles=[OUTPUT], bricks=[Convolutional], theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(drop_cg.variables) # train_cg = apply_dropout(drop_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(drop_cg, [x], 0.2) train_cg = drop_cg # train_cg = test_cg train_cost, train_error_rate, train_components = train_cg.outputs # Apply regularization to the cost biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) l2_norm = sum([(W**2).sum() for W in weights]) l2_norm.name = 'l2_norm' l2_regularization = regularization * l2_norm l2_regularization.name = 'l2_regularization' test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + regularization * l2_norm train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train", )) #cifar10_train_stream = RandomPadCropFlip( # NormalizeBatchLevels(DataStream.default_stream( # cifar10_train, iteration_scheme=ShuffledScheme( # cifar10_train.num_examples, batch_size)), # which_sources=('features',)), # (32, 32), pad=5, which_sources=('features',)) cifar10_train_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme(cifar10_train.num_examples, batch_size)), which_sources=('features', )) test_batch_size = 1000 cifar10_test = CIFAR10(("test", )) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme(cifar10_test.num_examples, test_batch_size)), which_sources=('features', )) momentum = Momentum(0.002, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # step_rule = CompositeRule([StepClipping(100), momentum]) step_rule = momentum # Train with simple SGD algorithm = GradientDescent(cost=train_cost, parameters=train_cg.parameters, step_rule=step_rule) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [(1, 0.005), (3, 0.01), (5, 0.02), (200, 0.002), (250, 0.0002), (300, 0.00002)]), DataStreamMonitoring([test_cost, test_error_rate, test_confusion], cifar10_test_stream, prefix="test"), TrainingDataMonitoring([ train_cost, train_error_rate, train_cost_without_regularization, l2_regularization, momentum.learning_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", every_n_batches=10), # after_epoch=True), Plot('Training performance for ' + save_to, channels=[ [ 'train_cost_with_regularization', 'train_cost_without_regularization', 'train_l2_regularization' ], ['train_error_rate'], ['train_total_gradient_norm'], ], every_n_batches=10), # after_batch=True), Plot('Test performance for ' + save_to, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing() ] if histogram: attribution = AttributionExtension(components=train_components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(save_to, True, True)) model = Model(train_cost) main_loop = MainLoop(algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def main(args): """Run experiment. """ lr_tag = float_tag(args.learning_rate) x_dim, train_stream, valid_stream, test_stream = datasets.get_streams( args.data, args.batch_size) #------------------------------------------------------------ # Setup model deterministic_act = Tanh deterministic_size = 1. if args.method == 'vae': sizes_tag = args.layer_spec.replace(",", "-") layer_sizes = [int(i) for i in args.layer_spec.split(",")] layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1] name = "%s-%s-%s-lr%s-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag) if args.activation == "tanh": hidden_act = Tanh() elif args.activation == "logistic": hidden_act = Logistic() elif args.activation == "relu": hidden_act = Rectifier() else: raise "Unknown hidden nonlinearity %s" % args.hidden_act model = VAE(x_dim=x_dim, hidden_layers=layer_sizes, hidden_act=hidden_act, z_dim=z_dim, batch_norm=args.batch_normalization) model.initialize() elif args.method == 'dvae': sizes_tag = args.layer_spec.replace(",", "-") layer_sizes = [int(i) for i in args.layer_spec.split(",")] layer_sizes, z_dim = layer_sizes[:-1], layer_sizes[-1] name = "%s-%s-%s-lr%s-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.n_samples, sizes_tag) if args.activation == "tanh": hidden_act = Tanh() elif args.activation == "logistic": hidden_act = Logistic() elif args.activation == "relu": hidden_act = Rectifier() else: raise "Unknown hidden nonlinearity %s" % args.hidden_act model = DVAE(x_dim=x_dim, hidden_layers=layer_sizes, hidden_act=hidden_act, z_dim=z_dim, batch_norm=args.batch_normalization) model.initialize() elif args.method == 'rws': sizes_tag = args.layer_spec.replace(",", "-") qbase = "" if not args.no_qbaseline else "noqb-" name = "%s-%s-%s-%slr%s-dl%d-spl%d-%s" % \ (args.data, args.method, args.name, qbase, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag) p_layers, q_layers = create_layers(args.layer_spec, x_dim, args.deterministic_layers, deterministic_act, deterministic_size) model = ReweightedWakeSleep( p_layers, q_layers, qbaseline=(not args.no_qbaseline), ) model.initialize() elif args.method == 'bihm-rws': sizes_tag = args.layer_spec.replace(",", "-") name = "%s-%s-%s-lr%s-dl%d-spl%d-%s" % \ (args.data, args.method, args.name, lr_tag, args.deterministic_layers, args.n_samples, sizes_tag) p_layers, q_layers = create_layers(args.layer_spec, x_dim, args.deterministic_layers, deterministic_act, deterministic_size) model = BiHM( p_layers, q_layers, l1reg=args.l1reg, l2reg=args.l2reg, ) model.initialize() elif args.method == 'continue': import cPickle as pickle from os.path import basename, splitext with open(args.model_file, 'rb') as f: m = pickle.load(f) if isinstance(m, MainLoop): m = m.model model = m.get_top_bricks()[0] while len(model.parents) > 0: model = model.parents[0] assert isinstance(model, (BiHM, ReweightedWakeSleep, VAE)) mname, _, _ = basename(args.model_file).rpartition("_model.pkl") name = "%s-cont-%s-lr%s-spl%s" % (mname, args.name, lr_tag, args.n_samples) else: raise ValueError("Unknown training method '%s'" % args.method) #------------------------------------------------------------ x = tensor.matrix('features') #------------------------------------------------------------ # Testset monitoring train_monitors = [] valid_monitors = [] test_monitors = [] for s in [1, 10, 100, 1000]: log_p, log_ph = model.log_likelihood(x, s) log_p = -log_p.mean() log_ph = -log_ph.mean() log_p.name = "log_p_%d" % s log_ph.name = "log_ph_%d" % s #train_monitors += [log_p, log_ph] #valid_monitors += [log_p, log_ph] test_monitors += [log_p, log_ph] #------------------------------------------------------------ # Z estimation #for s in [100000]: # z2 = tensor.exp(model.estimate_log_z2(s)) / s # z2.name = "z2_%d" % s # # valid_monitors += [z2] # test_monitors += [z2] #------------------------------------------------------------ # Gradient and training monitoring if args.method in ['vae', 'dvae']: log_p_bound, gradients = model.get_gradients(x, args.n_samples) log_p_bound = -log_p_bound.mean() log_p_bound.name = "log_p_bound" cost = log_p_bound train_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] valid_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] test_monitors += [ log_p_bound, named(model.kl_term.mean(), 'kl_term'), named(model.recons_term.mean(), 'recons_term') ] else: log_p, log_ph, gradients = model.get_gradients(x, args.n_samples) log_p = -log_p.mean() log_ph = -log_ph.mean() log_p.name = "log_p" log_ph.name = "log_ph" cost = log_ph train_monitors += [log_p, log_ph] valid_monitors += [log_p, log_ph] #------------------------------------------------------------ # Detailed monitoring """ n_layers = len(p_layers) log_px, w, log_p, log_q, samples = model.log_likelihood(x, n_samples) exp_samples = [] for l in xrange(n_layers): e = (w.dimshuffle(0, 1, 'x')*samples[l]).sum(axis=1) e.name = "inference_h%d" % l e.tag.aggregation_scheme = aggregation.TakeLast(e) exp_samples.append(e) s1 = samples[1] sh1 = s1.shape s1_ = s1.reshape([sh1[0]*sh1[1], sh1[2]]) s0, _ = model.p_layers[0].sample_expected(s1_) s0 = s0.reshape([sh1[0], sh1[1], s0.shape[1]]) s0 = (w.dimshuffle(0, 1, 'x')*s0).sum(axis=1) s0.name = "inference_h0^" s0.tag.aggregation_scheme = aggregation.TakeLast(s0) exp_samples.append(s0) # Draw P-samples p_samples, _, _ = model.sample_p(100) #weights = model.importance_weights(samples) #weights = weights / weights.sum() for i, s in enumerate(p_samples): s.name = "psamples_h%d" % i s.tag.aggregation_scheme = aggregation.TakeLast(s) # samples = model.sample(100, oversample=100) for i, s in enumerate(samples): s.name = "samples_h%d" % i s.tag.aggregation_scheme = aggregation.TakeLast(s) """ cg = ComputationGraph([cost]) #------------------------------------------------------------ if args.step_rule == "momentum": step_rule = Momentum(args.learning_rate, 0.95) elif args.step_rule == "rmsprop": step_rule = RMSProp(args.learning_rate) elif args.step_rule == "adam": step_rule = Adam(args.learning_rate) else: raise "Unknown step_rule %s" % args.step_rule #parameters = cg.parameters[:4] + cg.parameters[5:] parameters = cg.parameters algorithm = GradientDescent( cost=cost, parameters=parameters, gradients=gradients, step_rule=CompositeRule([ #StepClipping(25), step_rule, #RemoveNotFinite(1.0), ])) #------------------------------------------------------------ train_monitors += [ aggregation.mean(algorithm.total_gradient_norm), aggregation.mean(algorithm.total_step_norm) ] #------------------------------------------------------------ # Live plotting? plotting_extensions = [] if args.live_plotting: plotting_extensions = [ PlotManager( name, [ Plotter(channels=[[ "valid_%s" % cost.name, "valid_log_p" ], ["train_total_gradient_norm", "train_total_step_norm"]], titles=[ "validation cost", "norm of training gradient and step" ]), DisplayImage( [ WeightDisplay(model.p_layers[0].mlp. linear_transformations[0].W, n_weights=100, image_shape=(28, 28)) ] #ImageDataStreamDisplay(test_stream, image_shape=(28,28))] ) ]) ] main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=[ Timing(), ProgressBar(), TrainingDataMonitoring( train_monitors, prefix="train", after_epoch=True), DataStreamMonitoring( valid_monitors, data_stream=valid_stream, prefix="valid"), DataStreamMonitoring(test_monitors, data_stream=test_stream, prefix="test", after_epoch=False, after_training=True, every_n_epochs=10), #SharedVariableModifier( # algorithm.step_rule.components[0].learning_rate, # half_lr_func, # before_training=False, # after_epoch=False, # after_batch=False, # every_n_epochs=half_lr), TrackTheBest('valid_%s' % cost.name), Checkpoint(name + ".pkl", save_separately=['log', 'model']), FinishIfNoImprovementAfter('valid_%s_best_so_far' % cost.name, epochs=args.patience), FinishAfter(after_n_epochs=args.max_epochs), Printing() ] + plotting_extensions) main_loop.run()
def test_momentum_broadcastable(): verify_broadcastable_handling(Momentum(0.5))
def initialaze_algorithm(config, save_path, bokeh_name, params, bokeh_server, bokeh, use_load_ext, load_log, fast_start, recognizer, data, model, cg, regularized_cg, cost, train_cost, parameters, max_norm_rules, observables, batch_size, batch_cost, weights_entropy, labels_mask, labels, gradients=None): primary_observables = observables secondary_observables = [] validation_observables = [] root_path, extension = os.path.splitext(save_path) train_conf = config['training'] # Define the training algorithm. clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) if 'adam' in rule_names: assert len(rule_names) == 1 logger.info("Using Adam for training") core_rules.append( Adam(learning_rate=train_conf.get('scale', 0.002), beta1=train_conf.get('beta1', 0.1), beta2=train_conf.get('beta2', 0.001), epsilon=train_conf.get('epsilon', 1e-8), decay_factor=train_conf.get('decay_rate', (1 - 1e-8)))) burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') #theano_func_kwargs={'mode':NanGuardMode(nan_is_error=True)}) logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances')] + weights_entropy def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name.startswith('weights_entropy'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var, labels_mask[chld_id].sum()), 'weights_entropy_per_label'+ recognizer.children[chld_id].names_postfix)) elif var.name.endswith('_nll'): chld_id = recognizer.child_id_from_postfix(var.name) result.append(rename(aggregation.mean(var.sum(), labels_mask[chld_id].sum()), var.name+'_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False, **data_params_valid), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) additional_patience_notifiers = [] uas = DependencyErrorRate(recognizer.children[0], data, **config['monitoring']['search']) las = AuxiliaryErrorRates(uas, name='LAS') lab = AuxiliaryErrorRates(uas, name='LAB') per_monitoring = DataStreamMonitoring( [uas, las, lab], data.get_one_stream("valid", data.langs[0], batches=False, shuffle=False, **data_params_valid)[0], prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_uas = TrackTheBest( per_monitoring.record_name(uas)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_las = TrackTheBest( per_monitoring.record_name(las)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_lab = TrackTheBest( per_monitoring.record_name(lab)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_uas, track_the_best_las, track_the_best_lab, ] per = uas track_the_best_per = track_the_best_uas additional_patience_notifiers = [track_the_best_lab, track_the_best_las] track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500, num_stds=train_conf.get('clip_stds', 1.0))) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']), # .add_condition(["after_batch"], _gradient_norm_is_none), ] main_postfix = recognizer.children[0].names_postfix channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'+main_postfix), validation._record_name('weights_entropy_per_label'+main_postfix)], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'+main_postfix), validation._record_name('weights_penalty_per_recording'+main_postfix)]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] + additional_patience_notifiers extensions.append(Patience(**patience_conf)) if train_conf.get('min_performance_stops'): extensions.append(EarlyTermination( param_name=track_the_best_per.best_name, min_performance_by_epoch=train_conf['min_performance_stops'])) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
def main(port_data, LR, Init1, Init2): mlp_hiddens = [500] filter_sizes = [(3, 3), (3, 3), (3, 3)] feature_maps = [20, 50, 80] pooling_sizes = [(2, 2), (2, 2), (2, 2)] save_to = "DvC_parcour.pkl" image_size = (128, 128) output_size = 2 learningRate = LR num_epochs = 10 num_batches = None if socket.gethostname() == 'tim-X550JX': host_plot = 'http://*****:*****@ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
params = cg.parameters model = Model([cost]) print "model parameters:" print model.get_parameter_dict() if "adagrad" in config: print "using adagrad" thisRule=AdaGrad(learning_rate=learning_rate) elif "adadelta" in config: print "using adadelta" thisRule=AdaDelta() elif "momentum" in config: print "using momentum" mWeight = float(config["momentum"]) thisRule=Momentum(learning_rate=learning_rate, momentum=mWeight) else: print "using traditional SGD" thisRule=Scale(learning_rate=learning_rate) if "gradientClipping" in config: threshold = float(config["gradientClipping"]) print "using gradient clipping with threshold ", threshold thisRule=CompositeRule([StepClipping(threshold), thisRule]) #step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])()]) algorithm = GradientDescent(cost=cost, parameters=params, step_rule=thisRule,
def main(n_hiddens=1500, n_out=200, noise_std=0.5, learning_rate=2e-1, momentum=0.9, gamma=2.0): ###################### # Model ###################### import theano from theano.sandbox.rng_mrg import MRG_RandomStreams import numpy import theano.tensor as tensor from lvq.lvq import AaronLVQ, SupervisedNG, initialize_prototypes from blocks.bricks import Linear, LinearMaxout from blocks.bricks import Rectifier, Softmax, Logistic from blocks.bricks.cost import MisclassificationRate from blocks.config import config from blocks.initialization import Uniform, Constant from blocks.model import Model from lvq.batch_norm import MLP #from blocks.bricks import MLP seed = config.default_seed rng = MRG_RandomStreams(seed) x = tensor.tensor4('features') flat_x = tensor.flatten(x, outdim=2) flat_x_noise = flat_x + rng.normal(size=flat_x.shape, std=noise_std) y = tensor.imatrix('targets') flat_y = tensor.flatten(y, outdim=1) act = Rectifier() mlp = MLP(dims=[784, n_hiddens, n_hiddens, n_out], activations=[act, act, None]) mlp.weights_init = Uniform(0.0, 0.001) mlp.biases_init = Constant(0.0) mlp.initialize() train_out = mlp.apply(flat_x_noise) test_out = mlp.inference(flat_x) lamb = theano.shared(numpy.float32(10.0)) lvq = SupervisedNG(10, n_out, nonlin=True, gamma=gamma, lamb=lamb, name='lvq') test_loss, test_misclass = lvq.apply(test_out, flat_y, 'test') loss, misclass = lvq.apply(train_out, flat_y, 'train') model = Model(loss) ###################### # Data ###################### #from mnist import MNIST from fuel.datasets import MNIST from fuel.streams import DataStream from fuel.schemes import ShuffledScheme, SequentialScheme from fuel.transformers import ScaleAndShift, ForceFloatX mnist_train = MNIST([ 'train' ]) #MNIST('train', drop_input=False, sources=('features', 'targets')) mnist_test = MNIST([ 'test' ]) # MNIST('test', drop_input=False, sources=('features', 'targets')) batch_size = 100 #Batch size for training batch_size_mon = 2000 # Batch size for monitoring and batch normalization n_batches = int( numpy.ceil(float(mnist_train.num_examples) / batch_size_mon)) num_protos = 10 ind = range(mnist_train.num_examples) train_ind = ind[:50000] val_ind = ind[50000:] def preprocessing(data_stream): return ForceFloatX(ScaleAndShift(data_stream, 1 / 255.0, 0.0, which_sources=('features', )), which_sources=('features', )) train_stream_mon = preprocessing( DataStream(mnist_train, iteration_scheme=ShuffledScheme(train_ind, batch_size_mon))) train_stream_bn = preprocessing( DataStream(mnist_train, iteration_scheme=ShuffledScheme(train_ind, batch_size_mon))) train_stream = preprocessing( DataStream(mnist_train, iteration_scheme=ShuffledScheme(train_ind, batch_size))) valid_stream = preprocessing( DataStream(mnist_train, iteration_scheme=ShuffledScheme(val_ind, batch_size))) test_stream = preprocessing( DataStream(mnist_test, iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size_mon))) initialize_prototypes(lvq, x, train_out, train_stream_mon) ###################### # Training ###################### from blocks.main_loop import MainLoop from blocks.extensions import FinishAfter, Timing, Printing from blocks.extensions.monitoring import DataStreamMonitoring, TrainingDataMonitoring from blocks.algorithms import GradientDescent, Momentum, RMSProp, Adam from blocks.extensions.saveload import Checkpoint from lvq.extensions import EarlyStopping, LRDecay, MomentumSwitchOff, NCScheduler from lvq.batch_norm import BatchNormExtension lr = theano.shared(numpy.float32(1e-3)) step_rule = Momentum( learning_rate, 0.9 ) #Adam(learning_rate=lr) #RMSProp(learning_rate=1e-5, max_scaling=1e4) num_epochs = 100 earlystop = EarlyStopping('valid_lvq_apply_misclass', 100, './exp/alvq.pkl') main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=GradientDescent(cost=model.outputs[0], parameters=model.parameters, step_rule=step_rule), extensions=[ FinishAfter(after_n_epochs=num_epochs), BatchNormExtension(model, train_stream_bn, n_batches), LRDecay(step_rule.learning_rate, [20, 40, 60, 80]), MomentumSwitchOff(step_rule.momentum, num_epochs - 20), NCScheduler(lamb, 30., 0, num_epochs), DataStreamMonitoring(variables=[test_loss, test_misclass], data_stream=train_stream_mon, prefix='train'), DataStreamMonitoring(variables=[test_loss, test_misclass], data_stream=valid_stream, prefix='valid'), DataStreamMonitoring(variables=[test_loss, test_misclass], data_stream=test_stream, prefix='test'), Timing(), earlystop, Printing(after_epoch=True) ]) main_loop.run() return main_loop.status.get('best_test_lvq_apply_misclass', None)
algorithm = GradientDescent(cost=cost, parameters=params, step_rule=AdaGrad(learning_rate=learning_rate), on_unused_sources='warn') elif "adadelta" in config: print "using adadelta" algorithm = GradientDescent(cost=cost, parameters=params, step_rule=AdaDelta(), on_unused_sources='warn') elif "momentum" in config: print "using momentum" mWeight = float(config["momentum"]) algorithm = GradientDescent(cost=cost, parameters=params, step_rule=Momentum(learning_rate=learning_rate, momentum=mWeight), on_unused_sources='warn') else: print "using traditional SGD" algorithm = GradientDescent(cost=cost, parameters=params, step_rule=Scale(learning_rate=learning_rate), on_unused_sources='warn') extensions = [] if "saveModel" in config: print "saving model after training" extensions.append(Checkpoint(path=networkfile, after_n_epochs=n_epochs)) extensions.append(FinishAfter(after_n_epochs=n_epochs)) extensions.append( F1Extension(layer3=layer3, y=y,
def main(save_to, cost_name, learning_rate, momentum, num_epochs): mlp = MLP([None], [784, 10], weights_init=IsotropicGaussian(0.01), biases_init=Constant(0)) mlp.initialize() x = tensor.matrix('features') y = tensor.lmatrix('targets') scores = mlp.apply(x) batch_size = y.shape[0] indices = tensor.arange(y.shape[0]) target_scores = tensor.set_subtensor( tensor.zeros((batch_size, 10))[indices, y.flatten()], 1) score_diff = scores - target_scores # Logistic Regression if cost_name == 'lr': cost = Softmax().categorical_cross_entropy(y.flatten(), scores).mean() # MSE elif cost_name == 'mse': cost = (score_diff**2).mean() # Perceptron elif cost_name == 'perceptron': cost = (scores.max(axis=1) - scores[indices, y.flatten()]).mean() # TLE elif cost_name == 'minmin': cost = abs(score_diff[indices, y.flatten()]).mean() cost += abs(score_diff[indices, scores.argmax(axis=1)]).mean() # TLEcut elif cost_name == 'minmin_cut': # Score of the groundtruth should be greater or equal than its target score cost = tensor.maximum(0, -score_diff[indices, y.flatten()]).mean() # Score of the prediction should be less or equal than its actual score cost += tensor.maximum(0, score_diff[indices, scores.argmax(axis=1)]).mean() # TLE2 elif cost_name == 'minmin2': cost = ((score_diff[tensor.arange(y.shape[0]), y.flatten()])**2).mean() cost += ((score_diff[tensor.arange(y.shape[0]), scores.argmax(axis=1)])**2).mean() # Direct loss minimization elif cost_name == 'direct': epsilon = 0.1 cost = (-scores[indices, (scores + epsilon * target_scores).argmax(axis=1)] + scores[indices, scores.argmax(axis=1)]).mean() cost /= epsilon elif cost_name == 'svm': cost = (scores[indices, (scores - 1 * target_scores).argmax(axis=1)] - scores[indices, y.flatten()]).mean() else: raise ValueError("Unknown cost " + cost) error_rate = MisclassificationRate().apply(y.flatten(), scores) error_rate.name = 'error_rate' cg = ComputationGraph([cost]) cost.name = 'cost' mnist_train = MNIST(("train", )) mnist_test = MNIST(("test", )) if learning_rate == None: learning_rate = 0.0001 if momentum == None: momentum = 0.0 rule = Momentum(learning_rate=learning_rate, momentum=momentum) algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=rule) extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs), DataStreamMonitoring([cost, error_rate], Flatten(DataStream.default_stream( mnist_test, iteration_scheme=SequentialScheme( mnist_test.num_examples, 500)), which_sources=('features', )), prefix="test"), # CallbackExtension( # lambda: rule.learning_rate.set_value(rule.learning_rate.get_value() * 0.9), # after_epoch=True), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm), rule.learning_rate ], prefix="train", after_epoch=True), Checkpoint(save_to), Printing() ] if BLOCKS_EXTRAS_AVAILABLE: extensions.append( Plot('MNIST example', channels=[['test_cost', 'test_error_rate'], ['train_total_gradient_norm']])) main_loop = MainLoop(algorithm, Flatten(DataStream.default_stream( mnist_train, iteration_scheme=SequentialScheme( mnist_train.num_examples, 50)), which_sources=('features', )), model=Model(cost), extensions=extensions) main_loop.run() df = pandas.DataFrame.from_dict(main_loop.log, orient='index') res = { 'cost': cost_name, 'learning_rate': learning_rate, 'momentum': momentum, 'train_cost': df.train_cost.iloc[-1], 'test_cost': df.test_cost.iloc[-1], 'best_test_cost': df.test_cost.min(), 'train_error': df.train_error_rate.iloc[-1], 'test_error': df.test_error_rate.iloc[-1], 'best_test_error': df.test_error_rate.min() } res = { k: float(v) if isinstance(v, numpy.ndarray) else v for k, v in res.items() } json.dump(res, sys.stdout) sys.stdout.flush()
out_soft1 = Flattener(name='Flatt1').apply(conv_sequence1.apply(out8)) predict1 = NDimensionalSoftmax(name='Soft1').apply(out_soft1) cost = CategoricalCrossEntropy(name='Cross1').apply(y.flatten(), predict1).copy(name='cost') error = MisclassificationRate().apply(y.flatten(), predict1) #Little trick to plot the error rate in two different plots (We can't use two time the same data in the plot for a unknow reason) error_rate = error.copy(name='error_rate') error_rate2 = error.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) ########### GET THE DATA ##################### stream_train = ServerDataStream(('image_features','targets'), False, port=5512, hwm=40) stream_valid = ServerDataStream(('image_features','targets'), False, port=5513, hwm=40) ########### DEFINE THE ALGORITHM ############# algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Momentum(learning_rate=0.01, momentum=0.9)) extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs), TrainingDataMonitoring([cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True), DataStreamMonitoring([cost, error_rate, error_rate2], stream_valid, prefix="valid", after_epoch=True), Checkpoint("catsVsDogs_fire2_new.pkl", after_epoch=True, save_separately=['log']), ProgressBar(), Printing()] #Adding a live plot with the bokeh server extensions.append(Plot( 'CatsVsDogs_Fire2', channels=[['train_error_rate', 'valid_error_rate'],
def main(): nclasses = 27 import argparse parser = argparse.ArgumentParser() parser.add_argument("--seed", type=int, default=1) parser.add_argument("--length", type=int, default=180) parser.add_argument("--num-epochs", type=int, default=100) parser.add_argument("--batch-size", type=int, default=64) parser.add_argument("--learning-rate", type=float, default=1e-3) parser.add_argument("--epsilon", type=float, default=1e-5) parser.add_argument("--num-hidden", type=int, default=1000) parser.add_argument("--baseline", action="store_true") parser.add_argument("--initialization", choices="identity glorot orthogonal uniform".split(), default="identity") parser.add_argument("--initial-gamma", type=float, default=1e-1) parser.add_argument("--initial-beta", type=float, default=0) parser.add_argument("--cluster", action="store_true") parser.add_argument("--activation", choices=list(activations.keys()), default="tanh") parser.add_argument("--optimizer", choices="sgdmomentum adam rmsprop", default="rmsprop") parser.add_argument("--continue-from") parser.add_argument("--evaluate") parser.add_argument("--dump-hiddens") args = parser.parse_args() np.random.seed(args.seed) blocks.config.config.default_seed = args.seed if args.continue_from: from blocks.serialization import load main_loop = load(args.continue_from) main_loop.run() sys.exit(0) graphs, extensions, updates = construct_graphs(args, nclasses) ### optimization algorithm definition if args.optimizer == "adam": optimizer = Adam(learning_rate=args.learning_rate) elif args.optimizer == "rmsprop": optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9) elif args.optimizer == "sgdmomentum": optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99) step_rule = CompositeRule([ StepClipping(1.), optimizer, ]) algorithm = GradientDescent(cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule) algorithm.add_updates(updates["training"]) model = Model(graphs["training"].outputs[0]) extensions = extensions["training"] + extensions["inference"] # step monitor step_channels = [] step_channels.extend([ algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name) for name, param in model.get_parameter_dict().items() ]) step_channels.append( algorithm.total_step_norm.copy(name="total_step_norm")) step_channels.append( algorithm.total_gradient_norm.copy(name="total_gradient_norm")) step_channels.extend(graphs["training"].outputs) logger.warning("constructing training data monitor") extensions.append( TrainingDataMonitoring(step_channels, prefix="iteration", after_batch=True)) # parameter monitor extensions.append( DataStreamMonitoring([ param.norm(2).copy(name="parameter.norm:%s" % name) for name, param in model.get_parameter_dict().items() ], data_stream=None, after_epoch=True)) validation_interval = 500 # performance monitor for situation in "training inference".split(): if situation == "inference" and not args.evaluate: # save time when we don't need the inference graph continue for which_set in "train valid test".split(): logger.warning("constructing %s %s monitor" % (which_set, situation)) channels = list(graphs[situation].outputs) extensions.append( DataStreamMonitoring(channels, prefix="%s_%s" % (which_set, situation), every_n_batches=validation_interval, data_stream=get_stream( which_set=which_set, batch_size=args.batch_size, num_examples=10000, length=args.length))) extensions.extend([ TrackTheBest("valid_training_error_rate", "best_valid_training_error_rate"), DumpBest("best_valid_training_error_rate", "best.zip"), FinishAfter(after_n_epochs=args.num_epochs), #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50), Checkpoint("checkpoint.zip", on_interrupt=False, every_n_epochs=1, use_cpickle=True), DumpLog("log.pkl", after_epoch=True) ]) if not args.cluster: extensions.append(ProgressBar()) extensions.extend([ Timing(), Printing(every_n_batches=validation_interval), PrintingTo("log"), ]) main_loop = MainLoop(data_stream=get_stream(which_set="train", batch_size=args.batch_size, length=args.length, augment=True), algorithm=algorithm, extensions=extensions, model=model) if args.dump_hiddens: dump_hiddens(args, main_loop) return if args.evaluate: evaluate(args, main_loop) return main_loop.run()
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph(batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter(applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append(rename(gain_matrix.min(), 'min_gain')) primary_observables.append(rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter(applications=[r.generator.readout.readout], name="output_0")(cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")(cost_cg)[-1] attended, = VariableFilter(applications=[r.generator.transition.apply], name="attended")(cost_cg) attended_mask, = VariableFilter(applications=[ r.generator.transition.apply ], name="attended_mask")(cost_cg) weights, = VariableFilter(applications=[r.generator.evaluate], name="weights")(cost_cg) from blocks.roles import AUXILIARY l2_cost, = VariableFilter(roles=[AUXILIARY], theano_name='l2_cost_aux')(cost_cg) cost_forward, = VariableFilter(roles=[AUXILIARY], theano_name='costs_forward_aux')(cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename( abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density ]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [ p for p in cg.parameters if p not in attention_params ] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! # here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = ( train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model( regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise')) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1] ] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append( AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [ v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable) ] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat( [name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([ name for name, p in parameters.items() if not p in maxnorm_subjects ])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects) ] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold ] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements**0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5 step_norm = algorithm.steps[param].norm(2) / num_elements**0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length ] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty ] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append( rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append( rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append( Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append( TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables + [l2_cost, cost_forward]), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions(before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append( AdaptiveClipping(algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')).add_condition( ["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [ average_monitoring.record_name(train_cost), validation.record_name(cost) ], # Plot 2: gradient norm, [ average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold) ], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [ average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label') ], # Plot 5: training and validation monotonicity penalty [ average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording') ] ] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server), ] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True).add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension, )).add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension, )), ProgressBar() ] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains(labels, cg, recognizer.generator.readout.emitter, data)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name ] extensions.append(Patience(**patience_conf)) extensions.append( Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
def main(save_to, num_epochs, weight_decay=0.0001, noise_pressure=0, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 prior_noise_level = -10 noise_step_rule = Scale(1e-6) noise_rate = theano.shared(numpy.asarray(1e-5, dtype=theano.config.floatX)) convnet = create_res_net(out_noise=True, tied_noise=True, tied_sigma=True, noise_rate=noise_rate, prior_noise_level=prior_noise_level) x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet test_probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), test_probs) .copy(name='cost')) test_error_rate = (MisclassificationRate().apply(y.flatten(), test_probs) .copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), test_probs) .copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate]) # Apply dropout to all layer outputs except final softmax # dropout_vars = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) # drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(test_cg.variables) # train_cg = apply_dropout(test_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(test_cg, [x], 0.2) # train_cg = drop_cg # train_cg = apply_batch_normalization(test_cg) # train_cost, train_error_rate, train_components = train_cg.outputs with batch_normalization(convnet): with training_noise(convnet): train_probs = convnet.apply(x) train_cost = (CategoricalCrossEntropy().apply(y.flatten(), train_probs) .copy(name='cost')) train_components = (ComponentwiseCrossEntropy().apply(y.flatten(), train_probs).copy(name='components')) train_error_rate = (MisclassificationRate().apply(y.flatten(), train_probs).copy(name='error_rate')) train_cg = ComputationGraph([train_cost, train_error_rate, train_components]) population_updates = get_batch_normalization_updates(train_cg) bn_alpha = 0.9 extra_updates = [(p, p * bn_alpha + m * (1 - bn_alpha)) for p, m in population_updates] # for annealing nit_penalty = theano.shared(numpy.asarray(noise_pressure, dtype=theano.config.floatX)) nit_penalty.name = 'nit_penalty' # Compute noise rates for training graph train_logsigma = VariableFilter(roles=[LOG_SIGMA])(train_cg.variables) train_mean_log_sigma = tensor.concatenate([n.flatten() for n in train_logsigma]).mean() train_mean_log_sigma.name = 'mean_log_sigma' train_nits = VariableFilter(roles=[NITS])(train_cg.auxiliary_variables) train_nit_rate = tensor.concatenate([n.flatten() for n in train_nits]).mean() train_nit_rate.name = 'nit_rate' train_nit_regularization = nit_penalty * train_nit_rate train_nit_regularization.name = 'nit_regularization' # Apply regularization to the cost trainable_parameters = VariableFilter(roles=[WEIGHT, BIAS])( train_cg.parameters) mask_parameters = [p for p in trainable_parameters if get_brick(p).name == 'mask'] noise_parameters = VariableFilter(roles=[NOISE])(train_cg.parameters) biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) nonmask_weights = [p for p in weights if get_brick(p).name != 'mask'] l2_norm = sum([(W ** 2).sum() for W in nonmask_weights]) l2_norm.name = 'l2_norm' l2_regularization = weight_decay * l2_norm l2_regularization.name = 'l2_regularization' # testversion test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + l2_regularization + train_nit_regularization train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train",)) cifar10_train_stream = RandomPadCropFlip( NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme( cifar10_train.num_examples, batch_size)), which_sources=('features',)), (32, 32), pad=4, which_sources=('features',)) test_batch_size = 128 cifar10_test = CIFAR10(("test",)) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme( cifar10_test.num_examples, test_batch_size)), which_sources=('features',)) momentum = Momentum(0.01, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # Create a step rule that reduces the learning rate of noise scale_mask = Restrict(noise_step_rule, mask_parameters) step_rule = CompositeRule([scale_mask, momentum]) # from theano.compile.nanguardmode import NanGuardMode # Train with simple SGD algorithm = GradientDescent( cost=train_cost, parameters=trainable_parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) #, # theano_func_kwargs={ # 'mode': NanGuardMode( # nan_is_error=True, inf_is_error=True, big_is_error=True)}) exp_name = save_to.replace('.%d', '') # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), EpochSchedule(momentum.learning_rate, [ (0, 0.01), # Warm up with 0.01 learning rate (50, 0.1), # Then go back to 0.1 (100, 0.01), (150, 0.001) # (83, 0.01), # Follow the schedule in the paper # (125, 0.001) ]), EpochSchedule(noise_step_rule.learning_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4) ]), EpochSchedule(noise_rate, [ (0, 1e-2), (2, 1e-1), (4, 1) # (0, 1e-6), # (2, 1e-5), # (4, 1e-4), # (6, 3e-4), # (8, 1e-3), # Causes nit rate to jump # (10, 3e-3), # (12, 1e-2), # (15, 3e-2), # (19, 1e-1), # (24, 3e-1), # (30, 1) ]), NoiseExtension( noise_parameters=noise_parameters), NoisyDataStreamMonitoring( [test_cost, test_error_rate, test_confusion], cifar10_test_stream, noise_parameters=noise_parameters, prefix="test"), TrainingDataMonitoring( [train_cost, train_error_rate, train_nit_rate, train_cost_without_regularization, l2_regularization, train_nit_regularization, momentum.learning_rate, train_mean_log_sigma, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", every_n_batches=17), # after_epoch=True), Plot('Training performance for ' + exp_name, channels=[ ['train_cost_with_regularization', 'train_cost_without_regularization', 'train_nit_regularization', 'train_l2_regularization'], ['train_error_rate'], ['train_total_gradient_norm'], ['train_mean_log_sigma'], ], every_n_batches=17), Plot('Test performance for ' + exp_name, channels=[[ 'train_error_rate', 'test_error_rate', ]], after_epoch=True), EpochCheckpoint(save_to, use_cpickle=True, after_epoch=True), ProgressBar(), Printing()] if histogram: attribution = AttributionExtension( components=train_components, parameters=cg.parameters, components_size=output_size, after_batch=True) extensions.insert(0, attribution) if resume: extensions.append(Load(exp_name, True, True)) model = Model(train_cost) main_loop = MainLoop( algorithm, cifar10_train_stream, model=model, extensions=extensions) main_loop.run() if histogram: save_attributions(attribution, filename=histogram) with open('execution-log.json', 'w') as outfile: json.dump(main_loop.log, outfile, cls=NumpyEncoder)
def run(model_name, port_train, port_valid): running_on_laptop = socket.gethostname() == 'yop' X = tensor.tensor4('image_features', dtype='float32') T = tensor.matrix('targets', dtype='float32') image_border_size = (100, 100) if running_on_laptop: host_plot = 'http://*****:*****@ %s' % (model_name, datetime.datetime.now(), socket.gethostname()), channels=[['loss'], ['error', 'valid_error']], after_epoch=True, server_url=host_plot), Printing(), Checkpoint('/tmp/train_bn2') ] main_loop = MainLoop(data_stream=train_stream, algorithm=algorithm, extensions=extensions, model=model) main_loop.run()
from blocks.serialization import load main_loop = load(args.continue_from) main_loop.run() sys.exit(0) graphs, extensions, updates = construct_graphs(args, nclasses) ### optimization algorithm definition if args.optimizer == "adam": optimizer = Adam(learning_rate=args.learning_rate) # zzzz optimizer.learning_rate = theano.shared(np.asarray(optimizer.learning_rate, dtype=theano.config.floatX)) elif args.optimizer == "rmsprop": optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9) elif args.optimizer == "sgdmomentum": optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99) step_rule = CompositeRule([ StepClipping(1.), optimizer, ]) algorithm = GradientDescent(cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule) algorithm.add_updates(updates["training"]) model = Model(graphs["training"].outputs[0]) extensions = extensions["training"] + extensions["inference"] extensions.append(SharedVariableModifier( optimizer.learning_rate, functools.partial(learning_rate_decayer, args.learning_rate_decay)))
batch_size_mon))) ###################### # Training ###################### from blocks.main_loop import MainLoop from blocks.extensions import FinishAfter, Timing, Printing from blocks.extensions.monitoring import DataStreamMonitoring from blocks.algorithms import GradientDescent, Momentum, Adam from blocks.extensions.saveload import Checkpoint from lvq.batch_norm import BatchNormExtension from lvq.extensions import EarlyStopping, LRDecay, MomentumSwitchOff learning_rate = theano.shared(numpy.float32(1e-4)) step_rule = Momentum(5e-2, 0.9) main_loop = MainLoop( model=model, data_stream=train_stream, algorithm=GradientDescent(cost=model.outputs[0], parameters=model.parameters, step_rule=step_rule), extensions=[ FinishAfter(after_n_epochs=100), BatchNormExtension(model, train_stream_bn, n_batches), LRDecay(step_rule.learning_rate, [20, 40, 60, 80]), #MomentumSwitchOff(step_rule.momentum, 140), DataStreamMonitoring(variables=[test_loss, test_misclass], data_stream=train_stream_mon, prefix='train'),
n_begin_end_pts = 5 # how many points we consider at the beginning and end of the known trajectory with open(os.path.join(data.path, 'arrival-clusters.pkl')) as f: tgtcls = cPickle.load(f) dim_embeddings = [ ('origin_call', data.origin_call_train_size, 10), ('origin_stand', data.stands_size, 10), ('week_of_year', 52, 10), ('day_of_week', 7, 10), ('qhour_of_day', 24 * 4, 10), ('day_type', 3, 10), ('taxi_id', 448, 10), ] dim_input = n_begin_end_pts * 2 * 2 + sum(x for (_, _, x) in dim_embeddings) dim_hidden = [100] dim_output = tgtcls.shape[0] embed_weights_init = IsotropicGaussian(0.01) mlp_weights_init = IsotropicGaussian(0.1) mlp_biases_init = Constant(0.01) step_rule = Momentum(learning_rate=0.01, momentum=0.9) batch_size = 200 valid_set = 'cuts/test_times_0' max_splits = 100
def main(num_epochs=1000): x = tensor.matrix('features') y = tensor.lmatrix('targets') softmax_regressor = SoftmaxRegressor(input_dim=784, n_classes=10) probs = softmax_regressor.get_probs(features=x) params = softmax_regressor.get_params() weights = softmax_regressor.get_weights() cost = softmax_regressor.get_cost(probs=probs, targets=y).mean() cost.name = 'cost' misclassification = softmax_regressor.get_misclassification( probs=probs, targets=y).mean() misclassification.name = 'misclassification' train_dataset = MNIST('train') test_dataset = MNIST('test') algorithm = GradientDescent(cost=cost, params=params, step_rule=Momentum(learning_rate=0.1, momentum=0.1)) train_data_stream = ForceFloatX( data_stream=DataStream(dataset=train_dataset, iteration_scheme=ShuffledScheme( examples=train_dataset.num_examples, batch_size=100, ))) test_data_stream = ForceFloatX( data_stream=DataStream(dataset=test_dataset, iteration_scheme=SequentialScheme( examples=test_dataset.num_examples, batch_size=1000, ))) model = Model(cost) extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs)) extensions.append( DataStreamMonitoring([cost, misclassification], test_data_stream, prefix='test')) extensions.append( TrainingDataMonitoring([cost, misclassification], prefix='train', after_epoch=True)) plotters = [] plotters.append( Plotter(channels=[[ 'test_cost', 'test_misclassification', 'train_cost', 'train_misclassification' ]], titles=['Costs'])) display_train = ImageDataStreamDisplay( data_stream=copy.deepcopy(train_data_stream), image_shape=(28, 28, 1), axes=(0, 1, 'c'), shift=-0.5, rescale=2., ) weight_display = WeightDisplay(weights=weights, transpose=(1, 0), image_shape=(28, 28, 1), axes=(0, 1, 'c'), shift=-0.5, rescale=2., grid_shape=(1, 10)) images_displayer = DisplayImage( image_getters=[display_train, weight_display], titles=['Training examples', 'Softmax weights']) plotters.append(images_displayer) extensions.append( PlotManager('MNIST softmax examples', plotters=plotters, after_epoch=False, every_n_epochs=10, after_training=True)) extensions.append(Printing()) main_loop = MainLoop(model=model, data_stream=train_data_stream, algorithm=algorithm, extensions=extensions) main_loop.run()