def test_step_clipping_no_threshold_regression():
    """Regression test for #1145: wrong output when ``threshold`` is None.

    A ``StepClipping`` built without arguments must report no threshold,
    return the incoming steps unchanged and produce no updates.
    """
    rule = StepClipping()
    assert rule.threshold is None

    steps = {index: shared_floatx(value)
             for index, value in enumerate((3.0, 4.0))}
    result, updates = rule.compute_steps(steps)

    assert len(updates) == 0
    assert result == steps
def test_step_clipping_no_threshold_regression():
    """Regression test for #1145: wrong output when ``threshold`` is None.

    Without a threshold the rule is expected to pass the steps through
    untouched and to emit an empty collection of updates.
    """
    unbounded_rule = StepClipping()
    assert unbounded_rule.threshold is None

    incoming = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    outcome = unbounded_rule.compute_steps(incoming)
    passed_through, extra_updates = outcome

    assert len(extra_updates) == 0
    assert passed_through == incoming
def test_step_clipping():
    """Check ``StepClipping`` on the step pair (3, 4) for two thresholds.

    With threshold 4 both steps are expected to come back scaled by 4/5;
    with threshold 5 they are expected to come back unchanged.
    """
    # Each case pairs a rule with the values expected for keys 0 and 1.
    cases = [
        (StepClipping(4), (12 / 5.0, 16 / 5.0)),
        (StepClipping(5), (3.0, 4.0)),
    ]
    steps = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}

    for rule, expected in cases:
        clipped, _ = rule.compute_steps(steps)
        for key, value in enumerate(expected):
            assert_allclose(clipped[key].eval(), value)
def learning_algorithm(args):
    """Build the step rule selected by ``args.algorithm``.

    Parameters
    ----------
    args : object
        Must expose ``algorithm`` (one of ``'adam'``, ``'rms_prop'``,
        ``'momentum'``, ``'sgd'``), ``learning_rate``, ``momentum`` and
        ``clipping``.

    Returns
    -------
    step_rule : CompositeRule
        The optimizer combined with a ``StepClipping`` rule.
    learning_rate
        The ``learning_rate`` attribute of the optimizer that was built.

    Raises
    ------
    NotImplementedError
        If ``args.algorithm`` is not one of the supported names.
    """
    algorithm = args.algorithm
    base_rate = float(args.learning_rate)
    momentum = args.momentum
    clipping = StepClipping(threshold=np.cast[floatX](args.clipping))

    if algorithm == 'adam':
        optimizer = Adam(learning_rate=base_rate)
        # Adam is the only optimizer placed *before* the clipping rule.
        rules = [optimizer, clipping]
    elif algorithm == 'rms_prop':
        optimizer = RMSProp(learning_rate=base_rate)
        rules = [clipping, optimizer]
    elif algorithm == 'momentum':
        optimizer = Momentum(learning_rate=base_rate, momentum=momentum)
        rules = [clipping, optimizer]
    elif algorithm == 'sgd':
        optimizer = Scale(learning_rate=base_rate)
        rules = [clipping, optimizer]
    else:
        raise NotImplementedError

    return CompositeRule(rules), optimizer.learning_rate
def learning_algorithm(args):
    """Return a ``CompositeRule`` for the optimizer named by ``args``.

    ``args`` must expose ``algorithm``, ``learning_rate``, ``momentum`` and
    ``clipping``. ``'adam'`` and ``'rms_prop'`` select the matching
    optimizer; any other name falls back to ``Momentum``.
    """
    algorithm = args.algorithm
    rate = float(args.learning_rate)
    momentum = args.momentum
    clipping = StepClipping(threshold=np.cast[floatX](args.clipping))

    if algorithm == 'adam':
        # Adam runs first and its output is clipped afterwards.
        return CompositeRule([Adam(learning_rate=rate), clipping])

    # Every other optimizer receives already-clipped input.
    if algorithm == 'rms_prop':
        optimizer = RMSProp(learning_rate=rate)
    else:
        optimizer = Momentum(learning_rate=rate, momentum=momentum)
    return CompositeRule([clipping, optimizer])
def learning_algorithm(args):
    """Return a ``CompositeRule`` for the optimizer named by ``args``.

    ``args`` must expose ``algorithm``, ``learning_rate``, ``momentum`` and
    ``clipping``. ``'adam'`` and ``'rms_prop'`` select the matching
    optimizer; any other name falls back to ``Momentum``. The RMSProp and
    Momentum rules are additionally followed by ``RemoveNotFinite``.
    """
    algorithm = args.algorithm
    rate = float(args.learning_rate)
    momentum = args.momentum
    clipping = StepClipping(threshold=np.cast[floatX](args.clipping))

    if algorithm == 'adam':
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        return CompositeRule([Adam(learning_rate=rate), clipping])

    if algorithm == 'rms_prop':
        optimizer = RMSProp(learning_rate=rate)
    else:
        optimizer = Momentum(learning_rate=rate, momentum=momentum)
    return CompositeRule([clipping, optimizer, RemoveNotFinite()])
def learning_algorithm(learning_rate, momentum=0.0, clipping_threshold=100,
                       algorithm='sgd'):
    """Return the ``CompositeRule`` for the requested optimizer.

    Parameters
    ----------
    learning_rate
        Learning rate handed to the optimizer.
    momentum
        Momentum value; only used when ``algorithm`` is ``'sgd'``.
    clipping_threshold
        Threshold for the ``StepClipping`` rule (cast to ``floatX``).
    algorithm : str
        ``'adam'``, ``'rms_prop'`` or ``'sgd'`` (``'sgd'`` builds a
        ``Momentum`` rule).

    Raises
    ------
    NotImplementedError
        If ``algorithm`` is none of the names above.
    """
    # Reject unknown names before building anything.
    if algorithm not in ('adam', 'rms_prop', 'sgd'):
        raise NotImplementedError

    clipping = StepClipping(threshold=np.cast[floatX](clipping_threshold))

    if algorithm == 'adam':
        # [adam, clipping] means 'step clipping'
        # [clipping, adam] means 'gradient clipping'
        return CompositeRule([Adam(learning_rate=learning_rate), clipping])

    if algorithm == 'rms_prop':
        optimizer = RMSProp(learning_rate=learning_rate)
    else:
        optimizer = Momentum(learning_rate=learning_rate, momentum=momentum)
    return CompositeRule([clipping, optimizer, RemoveNotFinite()])
def learning_algorithm(args):
    """Return the step rule selected by ``args.algorithm``.

    Parameters
    ----------
    args : object
        Must expose ``algorithm`` and ``learning_rate``; ``momentum`` is
        only read when the Momentum fallback is taken.

    Returns
    -------
    step_rule
        * ``'adam'``: a bare ``Adam`` rule.
        * ``'rms_prop'``: ``StepClipping(1.)`` followed by ``RMSProp`` with
          ``decay_rate=0.9``.
        * anything else: a bare ``Momentum`` rule.

    Notes
    -----
    The clipping threshold is fixed at ``1.`` here; ``args.clipping`` was
    previously read into a local that was never used, so that read has
    been dropped.
    """
    name = args.algorithm
    learning_rate = float(args.learning_rate)

    if name == 'adam':
        return Adam(learning_rate=learning_rate)

    if name == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate, decay_rate=0.9)
        return CompositeRule([StepClipping(1.), rms_prop])

    # Any unrecognised name falls back to momentum SGD.
    return Momentum(learning_rate=learning_rate, momentum=args.momentum)
def test_step_clipping():
    """Check ``StepClipping`` on the step pair (3, 4) for two thresholds.

    Threshold 4 is expected to rescale both steps by 4/5, while threshold 5
    is expected to leave them as they are.
    """
    steps = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    expectations = {
        4: {0: 12 / 5.0, 1: 16 / 5.0},
        5: {0: 3.0, 1: 4.0},
    }

    for threshold in (4, 5):
        clipped, _ = StepClipping(threshold).compute_steps(steps)
        for key in (0, 1):
            assert_allclose(clipped[key].eval(),
                            expectations[threshold][key])
def test_composite_rule():
    """Check that ``CompositeRule`` chains steps and concatenates updates.

    First part: clipping at 4 followed by scaling with 0.1 is applied to
    the step pair (3, 4). Second part: the updates returned by each
    component rule are expected to appear, in order, in the composite
    rule's updates.
    """
    clip_then_scale = CompositeRule([StepClipping(4), Scale(0.1)])
    steps = {0: shared_floatx(3.0), 1: shared_floatx(4.0)}
    scaled, _ = clip_then_scale.compute_steps(steps)
    for key, expected in enumerate((12 / 50.0, 16 / 50.0)):
        assert_allclose(scaled[key].eval(), expected)

    class RuleWithUpdates(StepRule):
        """Pass steps through unchanged and return a fixed update list."""

        def __init__(self, updates):
            self.updates = updates

        def compute_steps(self, previous_steps):
            return previous_steps, self.updates

    first, second = RuleWithUpdates([(1, 2)]), RuleWithUpdates([(3, 4)])
    _, combined_updates = CompositeRule([first, second]).compute_steps(None)
    assert combined_updates == [(1, 2), (3, 4)]
def train(self, data_file, output_data_file, n_epochs=0):
    """Build the main loop on first use and run it.

    Parameters
    ----------
    data_file
        Passed to ``dataset.T_H5PYDataset`` to open the ``'train'`` and
        ``'test'`` sets.
    output_data_file
        Passed to ``Checkpoint`` and ``Plot``.
    n_epochs : int
        Passed to ``FinishAfter(after_n_epochs=...)``.

    Notes
    -----
    NOTE(review): the source arrived with its line structure collapsed.
    The nesting below (all of the setup inside ``if self.MainLoop is None``,
    ``run()`` outside it) is a reconstruction -- confirm against the
    original file.
    """
    training_data = dataset.T_H5PYDataset(data_file, which_sets=('train',))
    test_data = dataset.T_H5PYDataset(data_file, which_sets=('test',))

    # `session` is not referenced again in this method.
    # NOTE(review): presumably kept for a side effect of constructing it
    # (e.g. connecting to the plotting server) -- confirm.
    session = Session(root_url='http://localhost:5006')

    if self.MainLoop is None:
        # RMSProp is applied first; StepClipping(1) then acts on its output.
        step_rules = [RMSProp(learning_rate=0.2, decay_rate=0.95),
                      StepClipping(1)]

        algorithm = GradientDescent(cost=self.Cost,
                                    parameters=self.ComputationGraph.parameters,
                                    step_rule=CompositeRule(step_rules),
                                    on_unused_sources='ignore')

        # Both streams use a sequential scheme with batches of 100.
        train_stream = DataStream.default_stream(
            training_data, iteration_scheme=SequentialScheme(
                training_data.num_examples, batch_size=100))

        test_stream = DataStream.default_stream(
            test_data, iteration_scheme=SequentialScheme(
                test_data.num_examples, batch_size=100))

        self.MainLoop = MainLoop(
            model=Model(self.Cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(after_n_epochs=n_epochs),
                Printing(),
                Checkpoint(output_data_file, every_n_epochs=50),
                # The cost is monitored after every batch on both streams.
                TrainingDataMonitoring([self.Cost], after_batch=True,
                                       prefix='train'),
                DataStreamMonitoring([self.Cost], after_batch=True,
                                     data_stream=test_stream, prefix='test'),
                Plot(output_data_file,
                     channels=[['train_cost', 'test_cost']])
            ])

    self.MainLoop.run()
def train(self, training_data):
    """Run a training main loop over ``training_data``.

    The loop minimises ``self.Cost`` over the parameters of
    ``self.ComputationGraph`` using Adam followed by step clipping at 1.0,
    iterates sequentially in batches of 20 and checkpoints to
    ``'trainingdata.tar'`` every 10 epochs.
    """
    rule = CompositeRule([Adam(), StepClipping(1.0)])
    optimizer = GradientDescent(
        cost=self.Cost,
        parameters=self.ComputationGraph.parameters,
        step_rule=rule)

    batches = SequentialScheme(training_data.num_examples, batch_size=20)
    stream = DataStream.default_stream(training_data,
                                       iteration_scheme=batches)

    hooks = [
        FinishAfter(),
        Printing(),
        Checkpoint('trainingdata.tar', every_n_epochs=10)
    ]
    loop = MainLoop(model=Model(self.Cost),
                    data_stream=stream,
                    algorithm=optimizer,
                    extensions=hooks)
    loop.run()
def setup_algorithms(cost, cg, method, type="ff"):
    """Setup training algorithm.

    Parameters
    ----------
    cost : expression
        cost expression
    cg : ComputationGraph
        Computation graph; its ``parameters`` are the ones being trained
    method : string
        training method: "sgd", "momentum", "adagrad" or "rmsprop"
    type : string
        network type; when it is "RNN" the step rule is preceded by
        ``StepClipping(1.0)``

    Returns
    -------
    algorithm : GradientDescent
        Gradient Descent algorithm based on different optimization method

    Raises
    ------
    ValueError
        If ``method`` is not one of the supported names (previously this
        surfaced as an ``UnboundLocalError`` on ``step_rule``).
    """
    if method == "sgd":
        step_rule = Scale(learning_rate=0.01)
    elif method == "momentum":
        step_rule = Momentum(learning_rate=0.01, momentum=0.95)
    elif method == "adagrad":
        step_rule = AdaGrad()
    elif method == "rmsprop":
        step_rule = RMSProp()
    else:
        raise ValueError("unknown training method: %r" % (method,))

    if type == "RNN":
        # The clipping rule comes first, so the optimizer sees clipped input.
        step_rule = CompositeRule([StepClipping(1.0), step_rule])

    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=step_rule)

    return algorithm
emit = gmm_emitter.emit(h[-2]) emit.name = 'emitter' cg = ComputationGraph(cost) model = Model(cost) ################# # Algorithm ################# n_batches = 139*16 algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(10.0), Adam(lr)])) train_monitor = TrainingDataMonitoring( variables=[cost], every_n_batches = n_batches, prefix="train") valid_monitor = DataStreamMonitoring( [cost], valid_stream, after_epoch = True, #before_first_epoch = False, prefix="valid") extensions = extensions=[ Timing(every_n_batches = n_batches),
data_stream=data_stream_val, prefix='validation', after_epoch=True) monitor_test = DataStreamMonitoring(variables=[error], data_stream=data_stream_test, prefix='test', after_epoch=True) learning_rate = 0.00008 n_epochs = 100 algorithm = GradientDescent(cost=cost, parameters=cg.parameters, on_unused_sources='ignore', step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ])) main_loop = MainLoop(model=Model(cost), data_stream=data_stream_train, algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=n_epochs), monitor, monitor_val, monitor_test, saveSnapshot( '/home/xuehongyang/checkpoints_read/snapshot', save_main_loop=False, after_epoch=True, save_separately=['log', 'model']), ProgressBar(),
def construct_main_loop(name, task_name, patch_shape, batch_size,
                        n_spatial_dims, n_patches, max_epochs,
                        patience_epochs, learning_rate, gradient_limiter,
                        hyperparameters, **kwargs):
    """Assemble the training ``MainLoop`` for a task.

    Parameters
    ----------
    name : str
        Prefix for the dump/log files written by the extensions.
    task_name, patch_shape, batch_size, n_spatial_dims, n_patches
        Accepted for interface compatibility; not referenced in this
        function body.
    max_epochs
        Passed to ``FinishAfter(after_n_epochs=...)``.
    patience_epochs
        Passed to ``FinishIfNoImprovementAfter(epochs=...)``.
    learning_rate
        Learning rate for Adam.
    gradient_limiter : str
        ``"clip"`` selects ``StepClipping(1.)``; ``"compress"`` selects
        ``Compressor()``.
    hyperparameters : dict
        Forwarded to ``tasks.get_task``, ``construct_graphs`` and
        ``construct_monitors``. It is modified in place: ``n_channels`` is
        added and ``location_std``/``scale_std`` are replaced by Theano
        shared variables. It must also provide ``location_std_decay``,
        ``scale_std_decay`` and ``checkpoint_save_path``.
    **kwargs
        Ignored.

    Returns
    -------
    MainLoop

    Raises
    ------
    ValueError
        If ``gradient_limiter`` is neither ``"clip"`` nor ``"compress"``.
    """
    task = tasks.get_task(**hyperparameters)
    hyperparameters["n_channels"] = task.n_channels

    extensions = []

    # let theta noise decay as training progresses
    for key in "location_std scale_std".split():
        hyperparameters[key] = theano.shared(hyperparameters[key], name=key)
        extensions.append(
            util.ExponentialDecay(hyperparameters[key],
                                  hyperparameters["%s_decay" % key],
                                  after_batch=True))

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print("constructing graphs...")
    graphs, outputs, updates = construct_graphs(task=task, **hyperparameters)

    print("setting up main loop...")

    from blocks.model import Model
    model = Model(outputs["train"]["cost"])

    from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam, RMSProp
    from extensions import Compressor
    if gradient_limiter == "clip":
        limiter = StepClipping(1.)
    elif gradient_limiter == "compress":
        limiter = Compressor()
    else:
        raise ValueError(
            "unknown gradient_limiter: %r" % (gradient_limiter,))

    # The limiter is listed before Adam, so Adam sees its output.
    algorithm = GradientDescent(
        cost=outputs["train"]["cost"],
        parameters=graphs["train"].parameters,
        step_rule=CompositeRule([limiter, Adam(learning_rate=learning_rate)]))
    algorithm.add_updates(updates["train"])

    extensions.extend(
        construct_monitors(algorithm=algorithm, task=task, model=model,
                           graphs=graphs, outputs=outputs, updates=updates,
                           **hyperparameters))

    from blocks.extensions import FinishAfter, Printing, ProgressBar, Timing
    from blocks.extensions.stopping import FinishIfNoImprovementAfter
    from blocks.extensions.training import TrackTheBest
    from blocks.extensions.saveload import Checkpoint
    from dump import DumpBest, LightCheckpoint, PrintingTo, DumpGraph, DumpLog
    extensions.extend([
        TrackTheBest("valid_error_rate", "best_valid_error_rate"),
        FinishIfNoImprovementAfter("best_valid_error_rate",
                                   epochs=patience_epochs),
        FinishAfter(after_n_epochs=max_epochs),
        DumpBest("best_valid_error_rate", name + "_best.zip"),
        Checkpoint(hyperparameters["checkpoint_save_path"],
                   on_interrupt=False, every_n_epochs=10,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True),
        ProgressBar(),
        Timing(),
        Printing(),
        PrintingTo(name + "_log"),
        DumpGraph(name + "_grad_graph")
    ])

    from blocks.main_loop import MainLoop
    main_loop = MainLoop(data_stream=task.get_stream("train"),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    from tabulate import tabulate
    print("parameter sizes:")
    # One row per parameter: name, shape as "AxBx...", element count.
    print(tabulate(
        (key, "x".join(map(str, value.get_value().shape)),
         value.get_value().size)
        for key, value in main_loop.model.get_parameter_dict().items()))

    return main_loop
x = tensor.matrix('features', dtype='uint8') y = tensor.matrix('targets', dtype='uint8') y_hat, cost, cells = nn_fprop(x, y, vocab_size, hidden_size, num_layers, model) # COST cg = ComputationGraph(cost) if dropout > 0: # Apply dropout only to the non-recurrent inputs (Zaremba et al. 2015) inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(cg.variables) cg = apply_dropout(cg, inputs, dropout) cost = cg.outputs[0] # Learning algorithm step_rules = [RMSProp(learning_rate=learning_rate, decay_rate=decay_rate), StepClipping(step_clipping)] algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule(step_rules)) # Extensions gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) monitored_vars = [cost, gradient_norm, step_norm] dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True, before_first_epoch=True, data_stream=dev_stream, prefix="dev") train_monitor = TrainingDataMonitoring(variables=monitored_vars, after_batch=True, before_first_epoch=True, prefix='tra') extensions = [dev_monitor, train_monitor, Timing(), Printing(after_batch=True), FinishAfter(after_n_epochs=nepochs),
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None,
         the_track=None):
    """Build the RNN encoder-decoder from ``config`` and train it.

    Parameters
    ----------
    config : dict
        Model and training configuration. ``the_task`` is stored into it
        under the key ``'the_task'``.
    tr_stream
        Training data stream; also handed to the ``Sampler`` extension.
    dev_stream
        Validation stream used by ``AccuracyValidator``.
    use_bokeh, the_track
        Accepted for interface compatibility; not referenced in this body.
    the_task
        Stored as ``config['the_task']``.

    Notes
    -----
    Changes relative to the previous revision:

    * The sampling graph is now also built when only ``config['val_set']``
      is set; before, ``samples``/``search_model`` were undefined in that
      case and the validator setup raised ``NameError``.
    * Parameter shapes are passed through ``str`` before being formatted
      with ``{:15}``; formatting a tuple directly fails on Python 3.
    * ``Selector.get_parameters()`` is used throughout, matching the other
      calls in this function.
    * The epoch count for track 2 uses floor division so it stays an int.
    * Four extension lists that differed only in the epoch count were
      merged; ``config['early_stopping']`` did not influence them.

    NOTE(review): the source arrived with its line structure collapsed.
    Two placements are reconstructions and should be confirmed: the ``rng``
    assignment after the ``identity_init`` branch, and
    ``cost = cg.outputs[0]`` after both regularisation blocks.
    """
    config['the_task'] = the_task

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    # enc_embed is the dimension of the word embedding matrix in the
    # encoder; enc_nhids is the number of hidden units in the encoder GRU.
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])

    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['use_attention'],
        cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    # Not used below. Kept because getTestVar is defined elsewhere and
    # dropping the call could change behaviour.
    testVar = decoder.getTestVar(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    my_rng = numpy.random.RandomState(config['rng_value'])
    if config['identity_init']:
        encoder.weights_init = decoder.weights_init = Identity()
    else:
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
    encoder.rng = decoder.rng = my_rng
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Set after the push so the recurrent bricks keep Orthogonal().
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.bidir.prototype.rng = my_rng
    decoder.transition.weights_init = Orthogonal()
    decoder.transition.rng = my_rng
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        noisy_bricks = [
            encoder.lookup,
            encoder.fwd_fork,
            encoder.back_fork,
            decoder.sequence_generator.readout,
            decoder.sequence_generator.fork,
            decoder.state_init,
        ]
        noisy_params = []
        for brick in noisy_bricks:
            noisy_params.extend(Selector(brick).get_parameters().values())
        # NOTE(review): `seed` receives a RandomState object here; confirm
        # that apply_noise accepts that rather than an integer seed.
        cg = apply_noise(cg, noisy_params, config['weight_noise_ff'],
                         seed=my_rng)

    # Train on the (possibly) regularised graph's output.
    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(
            str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    finish_after = config['finish_after']
    if 'track2' in config['saveto']:
        # less epochs for track 2, because of more data
        finish_after = finish_after // 2
    # The order of the extensions is kept exactly as before.
    extensions = [
        FinishAfter(after_n_epochs=finish_after),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary.
    # Both the sampler and the validator below need them.
    wants_sampler = config['hook_samples'] >= 1
    wants_validator = config['val_set'] is not None
    if wants_sampler or wants_validator:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if wants_sampler:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    # NOTE(review): hard-coded to 8, not taken from
                    # config['src_vocab_size'] -- confirm this is intended.
                    src_vocab_size=8))

    # Add early stopping based on accuracy on the validation set
    if wants_validator:
        logger.info("Building accuracy validator")
        extensions.append(
            AccuracyValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              after_training=True,
                              every_n_epochs=5))
    else:
        logger.info("No validation set given for this language")

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # WARNING: eval() runs config['step_rule'] as Python code. Only use
    # configs from trusted sources.
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
def mainPredict(config, data_to_predict_stream, use_ensemble, lang=None,
                et_version=False, use_bokeh=False, the_track=None):
    """Rebuild the encoder-decoder, attach a saved model and predict.

    Parameters
    ----------
    config : dict
        Model configuration. ``config['reload']`` must be truthy, because
        prediction needs a saved model from ``config['saveto']``.
    data_to_predict_stream
        Stream with the inputs to predict; used as the main loop's data
        stream and forwarded to ``predictByHand``.
    use_ensemble, lang, et_version
        Forwarded unchanged to ``predictByHand``.
    use_bokeh
        Accepted for interface compatibility; not referenced in this body.
    the_track
        Required (must not be ``None``); forwarded to ``predictByHand``.

    Raises
    ------
    AssertionError
        If ``the_track`` is ``None``.
    Exception
        If ``config['reload']`` is falsy.

    Notes
    -----
    Parameter shapes are passed through ``str`` before being formatted with
    ``{:15}``; formatting a tuple directly fails on Python 3.
    """
    assert the_track is not None

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Set after the push so the recurrent bricks keep Orthogonal().
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(
            str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set extensions
    logger.info("Initializing (empty) extensions")
    extensions = []

    logger.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Reload the model (as this is prediction, it is 100% necessary).
    # Without early stopping use LoadOnlyModel here.
    if config['reload']:
        extensions.append(LoadOnlyModel(config['saveto']))
    else:
        raise Exception('No model available for prediction! '
                        '(Check config[\'reload\'] variable)')

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # WARNING: eval() runs config['step_rule'] as Python code. Only use
    # configs from trusted sources.
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=search_model,
        algorithm=algorithm,
        data_stream=data_to_predict_stream,
        extensions=extensions
    )

    predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble,
                  lang, et_version, config, the_track=the_track)
def main(config, tr_stream, dev_stream):
    """Build the character-level encoder-decoder from ``config`` and train it.

    Parameters
    ----------
    config : dict
        Model and training configuration; see the keys read below.
    tr_stream
        Training data stream. Its ``trg_bos`` and ``space_idx['target']``
        attributes are read and handed to the decoder; it is also used by
        the ``Sampler`` extension.
    dev_stream
        Validation stream, used only by ``BleuValidator``.

    Notes
    -----
    NOTE(review): the source arrived with its line structure collapsed. The
    block nesting below (in particular which loops sit under the
    ``trg_igru_depth`` ``else``) is a reconstruction -- confirm against the
    original file.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    # Special indices are taken from the training stream, not from config.
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')

    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])

    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx,
                      target_bos_idx)

    # The representation is reused below for the sampling graph.
    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask, target_word_mask,
                        target_prev_char_seq, target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # The assignments below come after the push, so these bricks get
    # Orthogonal() instead of the Gaussian default set above.
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        encoder.children[
            1 + layer_n].prototype.recurrent.weights_init = Orthogonal()
    # With depth 1 the igru is addressed directly, without `.transitions`.
    if config['trg_igru_depth'] == 1:
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[
                layer_n].weights_init = Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[
            layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # Clipping is listed before the configured step rule.
    # NOTE(review): eval() runs config['step_rule'] as Python code; the
    # config must come from a trusted source.
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm],
                              config=config,
                              after_batch=True,
                              before_first_epoch=True,
                              prefix='tra')
    extensions = [
        train_monitor, Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    # (built when either the sampler or the BLEU validator needs them)
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[config['transition_depth']])
            )  # generated[transition_depth] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model,
                    data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    transition_depth=config['transition_depth'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(source_char_seq,
                          source_sample_matrix,
                          source_char_aux,
                          source_word_mask,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
def train(args, trial=11, no_valid=False):
    """Assemble data streams, network and extensions, then run training.

    Dataset locations and the experiment tag are derived from ``args`` and
    ``trial``. Many other settings (``modData``, ``modDataValid``,
    ``hidden_size``, ``num_layers``, ``rnn_type``, ``boosting``,
    ``truncate_gradient``, ``dropout``, ``step_clipping``, ``onlyPlots``,
    ``learning_rate_decay`` ...) are read from names defined outside this
    function.

    NOTE(review): the indentation of this function was reconstructed from a
    whitespace-collapsed source; confirm the extent of the ``if`` bodies.

    Args:
        args: Options object. Attributes read here: ``data_name``,
            ``train_size``, ``transitions``, ``rnn_type``, ``hidden_size``,
            ``num_layers``, ``dropout``, ``no_valid``, ``truncate_gradient``
            and ``boosting``.
        trial: Trial number embedded in dataset and experiment names.
        no_valid: Never read by the body (``args.no_valid`` is used
            instead); kept so existing callers keep working.
    """
    # Creating unique strings to save for experiments.
    data_valid = ("data/" + args.data_name + "_trial_" + str(trial) +
                  "_valid_size_" + str(args.train_size) +
                  "_transitions_" + str(args.transitions))
    data_test = data_valid.replace("_valid_size", "_test_size")

    # If we want validation set to match modData of test set
    if modDataValid == 1:
        data_valid = data_valid.replace("_trial_", "_" + modData + "_trial_")
        data_test = data_test.replace("_trial_", "_" + modData + "_trial_")

    # By default, it is m0
    data_train = ("data/" + args.data_name + "_trial_" + str(trial) +
                  "_train_size_" + str(args.train_size) +
                  "_transitions_" + str(args.transitions))
    subStr = ("rnn_type_" + args.rnn_type + "_trial_" + str(trial) +
              "_hiddenSize_" + str(args.hidden_size) +
              "_numLayers_" + str(args.num_layers) +
              "_dropout_" + str(args.dropout) +
              "_train_size_" + str(args.train_size) +
              "_transitions_" + str(args.transitions) +
              "_novalid_" + str(args.no_valid))

    if modData == "m1":
        data_train = data_train.replace("_trial_", "_m1_trial_")
        subStr = subStr.replace("_trial_", "_m1_trial_")
    elif modData == "m3":
        data_train = data_train.replace("_trial_", "_m3_trial_")
        subStr = subStr.replace("_trial_", "_m3_trial_")
        # m3 always evaluates on the m3 splits, whatever modDataValid says.
        data_valid = ("data/" + args.data_name + "_m3_trial_" + str(trial) +
                      "_valid_size_" + str(args.train_size) +
                      "_transitions_" + str(args.transitions))
        data_test = ("data/" + args.data_name + "_m3_trial_" + str(trial) +
                     "_test_size_" + str(args.train_size) +
                     "_transitions_" + str(args.transitions))

    print("on test: " + subStr)

    # Perform folder prefixing
    run_tag = (subStr + "_tgrad_" + str(args.truncate_gradient) +
               "_boost_" + bStr(args.boosting))
    # NOTE(review): prefix_path is computed but never used; the paths below
    # are built from `prefix`, which is not defined in this function.
    # Confirm whether `prefix_path` was meant here.
    prefix_path = models_folder + args.data_name + "/" + run_tag
    load_path2 = prefix + load_path
    save_path2 = prefix + save_path
    last_path2 = prefix + last_path
    plots_output2 = plots_output + args.data_name + "/" + run_tag

    # obtain vocabulary size
    ix_to_char, char_to_ix, vocab_size = get_metadata(
        data_test.replace("_test", ""))
    print("vocab_size: " + str(vocab_size))

    # Get train, valid, test streams
    sharedDataTrain, train_stream = get_stream_inGPU(
        data_train, sharedName='sharedData')
    train_streamCopy = copy.deepcopy(train_stream)
    sharedDataValid, dev_stream = get_stream_inGPU(
        data_valid, sharedName='sharedData')
    valid_streamCopy = copy.deepcopy(dev_stream)
    sharedDataTest, test_stream = get_stream_inGPU(
        data_test, sharedName='sharedData')
    test_streamCopy = copy.deepcopy(test_stream)

    # Create dummy sums (accumulated by fRR below)
    sharedMRRSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedTOTSUM = shared(np.array(0.0, dtype=theano.config.floatX))
    sharedSUMVARs = {
        'sharedMRRSUM': sharedMRRSUM,
        'sharedTOTSUM': sharedTOTSUM
    }

    # Initialize batches
    batch_index_From = T.scalar('int_stream_From', dtype='int32')
    batch_index_To = T.scalar('int_stream_To', dtype='int32')

    def batch_slice(key):
        """Return columns [From:To) of the shared train variable `key`."""
        sliced = sharedDataTrain[key][:, batch_index_From:batch_index_To]
        sliced.name = key
        return sliced

    # Index theano variables
    x = batch_slice('x')
    x_mask = batch_slice('x_mask')
    x_mask_o = batch_slice('x_mask_o')
    x_mask_o_mask = batch_slice('x_mask_o_mask')
    y = batch_slice('y')
    y_mask = batch_slice('y_mask')
    y_mask_o = batch_slice('y_mask_o')
    y_mask_o_mask = batch_slice('y_mask_o_mask')
    lens = batch_slice('lens')

    def zero_placeholder(dtype):
        """Return a fresh 2x1 zero shared variable of the given dtype."""
        return shared(np.array([[0], [0]], dtype=dtype))

    # Generate temp shared vars (handed to track_best)
    tempSharedData = {}
    tempSharedData[theano.config.floatX] = [
        zero_placeholder(theano.config.floatX) for _ in range(6)
    ]
    tempSharedData['uint8'] = [zero_placeholder('uint8') for _ in range(3)]

    # Final mask is due to the generated mask and the input mask
    x_mask_final = x_mask * x_mask_o * x_mask_o_mask
    y_mask_final = y_mask * y_mask_o * y_mask_o_mask

    # Build neural network
    linear_output, cost = nn_fprop(
        x, x_mask_final, y, y_mask_final, lens, vocab_size, hidden_size,
        num_layers, rnn_type, boosting=boosting,
        scan_kwargs={'truncate_gradient': truncate_gradient})

    # Keep a constant in gpu memory
    constant1 = shared(np.float32(1.0))
    cost_int, ymasksum = RR_cost(y, linear_output, y_mask_final, constant1)

    # Validation calculations: each call adds one batch to the running sums.
    fRR = function(
        inputs=[
            theano.In(batch_index_From, borrow=True),
            theano.In(batch_index_To, borrow=True)
        ],
        updates=[(sharedMRRSUM, sharedMRRSUM + cost_int),
                 (sharedTOTSUM, sharedTOTSUM + ymasksum)])

    # COST
    cg = ComputationGraph(cost)
    if dropout > 0:
        # Apply dropout only to the non-recurrent inputs
        # (Zaremba et al. 2015)
        inputs = VariableFilter(theano_name_regex=r'.*apply_input.*')(
            cg.variables)
        cg = apply_dropout(cg, inputs, dropout)
        cost = cg.outputs[0]

    # Learning algorithm: the RMSProp step is computed first, then clipped.
    step_rules = [
        RMSProp(learning_rate=rmsPropLearnRate, decay_rate=decay_rate),
        StepClipping(step_clipping)
    ]
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule(step_rules))

    # Extensions
    # This is for tracking our best result
    trackbest = track_best('valid_MRR', save_path2, last_path2, num_epochs,
                           nepochs, maxIterations, epsilon, tempSharedData)

    if onlyPlots:
        prefixes = ["train_cross", "valid_cross", "test_cross"]
        gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
        step_norm = aggregation.mean(algorithm.total_step_norm)
        monitored_vars = [cost, gradient_norm, step_norm]
        # The original notes this monitor "is faster" than the alternative
        # of a DataStreamMonitoringPlot over train_streamCopy.
        train_monitor = myTrainingDataMonitoring(
            variables=monitored_vars,
            prefix=prefixes[0],
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        valid_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=valid_streamCopy,
            prefix=prefixes[1],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataValid,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        test_monitor = DataStreamMonitoringPlot(
            variables=[cost],
            data_stream=test_streamCopy,
            prefix=prefixes[2],
            sharedDataTrain=sharedDataTrain,
            sharedDataActualTest=sharedDataTest,
            after_batch=True,
            saveEveryXIteration=saveEveryXIteration)
        # Element 1 of track_best's result is left out in plot-only mode.
        trackbest = [trackbest[0], trackbest[2], trackbest[3], trackbest[4]]
        plot = Plot('Live Plotting',
                    saveFolder=plots_output2,
                    channels=[
                        'train_cross_cost', 'valid_cross_cost',
                        'test_cross_cost'
                    ],
                    numProcesses=numProcesses,
                    saveEveryXIteration=saveEveryXIteration,
                    after_batch=True)
        extensions = [
            train_monitor,
            valid_monitor,
            test_monitor,
            plot,
            Printing(),
            ProgressBar(),
        ] + trackbest
    else:
        dev_monitor = myDataStreamMonitoring(
            after_epoch=True,
            before_epoch=False,
            data_stream=dev_stream,
            prefix="valid",
            fRR=fRR,
            sharedVars=sharedSUMVARs,
            sharedDataTrain=sharedDataTrain,
            sharedDataValid=sharedDataValid)
        extensions = [
            dev_monitor,
            Printing(),
            ProgressBar(),
        ] + trackbest

    # A decay factor of 0 or 1 means "leave the learning rate alone".
    if learning_rate_decay not in (0, 1):
        extensions.append(
            SharedVariableModifier(
                step_rules[0].learning_rate,
                lambda n, lr: np.cast[theano.config.floatX](
                    learning_rate_decay * lr),
                after_epoch=True,
                after_batch=False))

    # Single-argument call form: same output on Python 2, valid on Python 3.
    print('number of parameters in the model: ' + str(
        T.sum([p.size for p in cg.parameters]).eval()))

    # Finally build the main loop and train the model
    main_loop = MainLoop(data_stream=train_stream,
                         algorithm=algorithm,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
def main(config, tr_stream, dev_stream, source_vocab, target_vocab,
         use_bokeh=False):
    """Build the initial-context encoder-decoder and run its training loop.

    NOTE(review): the indentation of this function was reconstructed from a
    whitespace-collapsed source; confirm the extent of the ``if`` bodies.

    Args:
        config: Mapping of experiment settings. Keys read with ``[]`` are
            required; keys read with ``.get`` are optional.
        tr_stream: Training data stream passed to the main loop and sampler.
        dev_stream: Validation stream passed to the BLEU/Meteor validators.
        source_vocab: Source vocabulary forwarded to sampler and validators.
        target_vocab: Target vocabulary forwarded to sampler and validators.
        use_bokeh: Add a live plot when True and ``BOKEH_AVAILABLE`` is set.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    initial_context = tensor.matrix('initial_context')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    # let user specify the target transition class name in config,
    # eval it and pass to decoder
    # NOTE(review): eval() executes whatever text the config holds; only
    # safe while configuration files are trusted.
    target_transition_name = config.get(
        'target_transition', 'GRUInitialStateWithInitialStateSumContext')
    target_transition = eval(target_transition_name)
    logger.info('Using target transition: {}'.format(target_transition_name))

    decoder = InitialContextDecoder(config['trg_vocab_size'],
                                    config['dec_embed'], config['dec_nhids'],
                                    config['enc_nhids'] * 2,
                                    config['context_dim'], target_transition)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask, initial_context)
    cost.name = 'decoder_cost'

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent weights override the pushed Gaussian initialisation.
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: validate performance with/without regularization
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())
        # The summed expression is a new variable, so it is named again.
        cost.name = 'decoder_cost_cost'
        cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to
        # make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # str(): a tuple rejects the '{:15}' width spec on Python 3.
        logger.info(' {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if
    # directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    # TODO: add checking for existing model and loading
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Create the theano variables that we need for the sampling graph
    sampling_input = tensor.lmatrix('input')
    sampling_context = tensor.matrix('context_input')

    # Set up beam search and sampling computation graphs if necessary.
    # The Meteor validator consumes `samples`/`search_model` too, so it must
    # also trigger construction (previously a NameError for Meteor-only
    # configurations).
    wants_bleu = config.get('bleu_script', None) is not None
    wants_meteor = config.get('meteor_directory', None) is not None
    if config['hook_samples'] >= 1 or wants_bleu or wants_meteor:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input, sampling_representation,
                                     sampling_context)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(
                model=search_model,
                data_stream=tr_stream,
                hook_samples=config['hook_samples'],
                every_n_batches=config['sampling_freq'],
                src_vocab=source_vocab,
                trg_vocab=target_vocab,
                src_vocab_size=config['src_vocab_size'],
            ))

    # Add early stopping based on bleu
    if wants_bleu:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input,
                          sampling_context,
                          samples=samples,
                          config=config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Add early stopping based on Meteor (reuses the BLEU frequency and
    # normalisation settings, as in the original)
    if wants_meteor:
        logger.info("Building meteor validator")
        extensions.append(
            MeteorValidator(sampling_input,
                            sampling_context,
                            samples=samples,
                            config=config,
                            model=search_model,
                            data_stream=dev_stream,
                            src_vocab=source_vocab,
                            trg_vocab=target_vocab,
                            normalize=config['normalized_bleu'],
                            every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[[
                     'decoder_cost', 'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the
    # modified graph
    uses_modified_graph = (config['dropout'] < 1.0 or
                           config['weight_noise_ff'] > 0.0)
    training_cost = cg.outputs[0] if uses_modified_graph else cost
    # NOTE(review): config['step_rule'] is eval()'d; trusted configs only.
    algorithm = GradientDescent(cost=training_cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
def main(name, dataset, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim, oldmodel):
    """Train a DRAW model on ``dataset`` and checkpoint it under ``name``.

    NOTE(review): the indentation of this function was reconstructed from a
    whitespace-collapsed source; confirm the extent of the ``if`` bodies.

    Args:
        name: Experiment name; falls back to ``dataset`` when None.
        dataset: Dataset identifier handed to ``datasets.get_data``.
        epochs: Number of epochs after which training finishes.
        batch_size: Batch size of the sequential iteration schemes.
        learning_rate: Learning rate given to Adam.
        attention: ``"read_N,write_N"`` for attentive reader/writer, or an
            empty string for the full-image reader/writer.
        n_iter: Number of DRAW iterations.
        enc_dim: Encoder LSTM dimension.
        dec_dim: Decoder LSTM dimension.
        z_dim: Latent dimension.
        oldmodel: Optional path to a pickled model whose parameter values
            initialise this one.
    """
    image_size, data_train, data_valid, data_test = datasets.get_data(dataset)

    def sequential_stream(split):
        """Wrap a dataset split in a flattened, sequentially-batched stream."""
        scheme = SequentialScheme(split.num_examples, batch_size)
        return Flatten(DataStream(split, iteration_scheme=scheme))

    train_stream = sequential_stream(data_train)
    valid_stream = sequential_stream(data_valid)
    test_stream = sequential_stream(data_test)

    if name is None:
        name = dataset

    img_height, img_width = image_size
    x_dim = img_height * img_width

    # Orthogonal() was tried as an alternative weights_init for both groups.
    rnn_init_kwargs = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }
    brick_init_kwargs = {
        'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    if attention == "":
        # No attention windows: reader and writer work on the full image.
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **brick_init_kwargs)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim,
                        **brick_init_kwargs)
        attention_tag = "full"
    else:
        read_N, write_N = attention.split(',')
        read_N = int(read_N)
        write_N = int(write_N)
        read_dim = 2 * read_N**2
        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **brick_init_kwargs)
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=write_N, **brick_init_kwargs)
        attention_tag = "r%d-w%d" % (read_N, write_N)

    # ------------------------------------------------------------------
    # Learning rate
    def lr_tag(value):
        """Encode a float as '<leading digit><negated decimal exponent>'.

        Examples: 0.1 -> "11", 0.01 -> "12", 0.001 -> "13", 0.005 -> "53".
        """
        exponent = np.floor(np.log10(value))
        first_digit = ("%e" % value)[0]
        return "%s%d" % (first_digit, -exponent)

    lr_str = lr_tag(learning_rate)

    subdir = time.strftime("%Y%m%d-%H%M%S") + "-" + name
    longname = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (
        dataset, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str)
    pickle_file = subdir + "/" + longname + ".pkl"

    print("\nRunning experiment %s" % longname)
    print(" dataset: %s" % dataset)
    print(" subdirectory: %s" % subdir)
    print(" learning rate: %g" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print(" batch size: %d" % batch_size)
    print(" epochs: %d" % epochs)
    print()

    # ------------------------------------------------------------------
    # Bricks
    encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnn_init_kwargs)
    decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnn_init_kwargs)
    # Identity MLPs emit the 4*dim pre-activations fed to the LSTMs.
    encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **brick_init_kwargs)
    decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **brick_init_kwargs)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim,
                         **brick_init_kwargs)

    draw = DrawModel(n_iter,
                     reader=reader,
                     encoder_mlp=encoder_mlp,
                     encoder_rnn=encoder_rnn,
                     sampler=q_sampler,
                     decoder_mlp=decoder_mlp,
                     decoder_rnn=decoder_rnn,
                     writer=writer)
    draw.initialize()

    # ------------------------------------------------------------------
    # Cost: reconstruction term plus the KL terms summed over iterations.
    x = tensor.matrix('features')
    x_recons, kl_terms = draw.reconstruct(x)

    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    # ------------------------------------------------------------------
    # Optimiser: clip first, then Adam (RMSProp and Momentum were the
    # alternatives left commented out in the original).
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)
    step_rule = CompositeRule([
        StepClipping(10.),
        Adam(learning_rate),
    ])
    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)

    # ------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t
        monitors.append(kl_term_t)

    train_monitors = list(monitors)
    train_monitors.append(aggregation.mean(algorithm.total_gradient_norm))
    train_monitors.append(aggregation.mean(algorithm.total_step_norm))

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]
    ]

    # ------------------------------------------------------------------
    if not os.path.exists(subdir):
        os.makedirs(subdir)

    # Validation-stream monitoring was commented out in the original; only
    # the training and test streams are monitored.
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=epochs),
        TrainingDataMonitoring(train_monitors, prefix="train",
                               after_epoch=True),
        DataStreamMonitoring(monitors, test_stream, prefix="test"),
        Checkpoint(name, before_training=False, after_epoch=True,
                   save_separately=['log', 'model']),
        Plot(name, channels=plot_channels),
        ProgressBar(),
        Printing()
    ]
    main_loop = MainLoop(model=Model(cost),
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)

    if oldmodel is not None:
        print("Initializing parameters with old model %s" % oldmodel)
        # NOTE(review): pickle.load runs code embedded in the file; only
        # load checkpoints from trusted sources.
        with open(oldmodel, "rb") as f:
            previous_model = pickle.load(f)
        main_loop.model.set_param_values(previous_model.get_param_values())
        # Drop the reference to the loaded model before training starts.
        del previous_model

    main_loop.run()
def main(exp_config, source_vocab, target_vocab, dev_stream, use_bokeh=True):
    """Continue training a saved IMT model with optional validators.

    NOTE(review): the indentation of this function was reconstructed from a
    whitespace-collapsed source; confirm the extent of the ``if`` bodies.

    Args:
        exp_config: Mapping of experiment settings. Keys read with ``[]``
            are required; keys read with ``.get`` are optional.
        source_vocab: Source vocabulary forwarded to the model setup and
            the validators.
        target_vocab: Target vocabulary forwarded likewise.
        dev_stream: Validation stream used by the BLEU / IMT-F1 validators.
        use_bokeh: Add a live plot when True and ``BOKEH_AVAILABLE`` is set.
    """
    (train_encoder, train_decoder, theano_sampling_source_input,
     theano_sampling_context_input, generated,
     masked_stream) = setup_model_and_stream(exp_config, source_vocab,
                                             target_vocab)
    cost = create_model(train_encoder, train_decoder,
                        exp_config.get('imt_smoothing_constant', 0.005))

    # Set up training model
    logger.info("Building model")
    train_model = Model(cost)

    # Set the parameters from a trained models (.npz file)
    logger.info("Loading parameters from model: {}".format(
        exp_config['saved_parameters']))
    # Note the brick delimeter='-' is here for legacy reasons because blocks
    # changed the serialization API
    param_values = LoadNMT.load_parameter_values(
        exp_config['saved_parameters'],
        brick_delimiter=exp_config.get('brick_delimiter', None))
    LoadNMT.set_model_parameters(train_model, param_values)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    if exp_config.get('l2_regularization', False) is True:
        l2_reg_alpha = exp_config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())
        # The summed expression is a new variable, so it is named again.
        cost.name = 'decoder_cost_cost'
        cg = ComputationGraph(cost)

    # apply dropout for regularization
    # Note dropout variables are hard-coded here
    if exp_config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to
        # make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, exp_config['dropout'])

    # create the training directory, and copy this config there if
    # directory doesn't exist
    if not os.path.isdir(exp_config['saveto']):
        os.makedirs(exp_config['saveto'])
        # TODO: mv the actual config file once we switch to .yaml for
        # min-risk
        shutil.copy(exp_config['config_file'], exp_config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=exp_config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(exp_config['saveto'],
                      every_n_batches=exp_config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary.
    # Every validator below consumes `samples` and `search_model`, so each
    # of them must trigger construction (an IMT-F1-only configuration used
    # to fail with NameError).
    wants_bleu = exp_config.get('bleu_script', None) is not None
    wants_imt_f1 = exp_config.get('imt_f1_validation', False) is not False
    if exp_config['hook_samples'] >= 1 or wants_bleu or wants_imt_f1:
        logger.info("Building sampling model")
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[train_decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # TODO: sampling (the Sampler extension) is broken for min-risk and is
    # therefore not added here.
    # TODO: use multimodal meteor and BLEU validator
    # TODO: add 'validator' key to IMT config

    # Add early stopping based on bleu
    if wants_bleu:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(theano_sampling_source_input,
                          theano_sampling_context_input,
                          samples=samples,
                          config=exp_config,
                          model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=exp_config['normalized_bleu'],
                          every_n_batches=exp_config['bleu_val_freq']))

    # IMT F1 validation reuses the BLEU frequency/normalisation settings.
    if wants_imt_f1:
        logger.info("Building imt F1 validator")
        extensions.append(
            IMT_F1_Validator(theano_sampling_source_input,
                             theano_sampling_context_input,
                             samples=samples,
                             config=exp_config,
                             model=search_model,
                             data_stream=dev_stream,
                             src_vocab=source_vocab,
                             trg_vocab=target_vocab,
                             normalize=exp_config['normalized_bleu'],
                             every_n_batches=exp_config['bleu_val_freq']))

    # TODO: Meteor-based early stopping is not wired up for this entry point.

    # Reload model if necessary
    if exp_config['reload']:
        extensions.append(LoadNMT(exp_config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(exp_config['model_save_directory'],
                 channels=[[
                     'decoder_cost_cost', 'validation_set_imt_f1_score',
                     'validation_set_bleu_score',
                     'validation_set_meteor_score'
                 ]],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # With dropout the gradient must be taken on the modified graph's
    # output; otherwise the plain cost is used.
    uses_modified_graph = exp_config['dropout'] < 1.0
    extra_algorithm_kwargs = {}
    # WORKING: try to catch and fix nan. As in the original, the NaN guard
    # is only honoured when dropout is active.
    if uses_modified_graph and exp_config.get('nan_guard', False):
        from theano.compile.nanguardmode import NanGuardMode
        extra_algorithm_kwargs['theano_func_kwargs'] = {
            'mode': NanGuardMode(nan_is_error=True, inf_is_error=True)
        }

    # NOTE(review): exp_config['step_rule'] is eval()'d; only safe while
    # configuration files are trusted.
    algorithm = GradientDescent(
        cost=cg.outputs[0] if uses_modified_graph else cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([
            StepClipping(exp_config['step_clipping']),
            eval(exp_config['step_rule'])()
        ]),
        on_unused_sources='warn',
        **extra_algorithm_kwargs)

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=train_model,
                         algorithm=algorithm,
                         data_stream=masked_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
# Output layer and cost. `lookup_input`, `linear_input`, `rnn`,
# `linear_output`, `x` and `y` are defined earlier in the script.
softmax = NDimensionalSoftmax(name='ndim_softmax')

activation_input = lookup_input.apply(x)
hidden = rnn.apply(linear_input.apply(activation_input))
activation_output = linear_output.apply(hidden)
y_est = softmax.apply(activation_output, extra_ndim=1)

# The cross entropy is computed from the pre-softmax activations.
cost = softmax.categorical_cross_entropy(y, activation_output,
                                         extra_ndim=1).mean()

from blocks.graph import ComputationGraph
# RMSProp, StepClipping and CompositeRule are used just below, so they are
# imported here together with the names the original line already imported.
from blocks.algorithms import (Adam, CompositeRule, GradientDescent, RMSProp,
                               StepClipping)

cg = ComputationGraph([cost])

# The RMSProp step is computed first, then clipped to norm 1.0.
step_rules = [RMSProp(learning_rate=0.002, decay_rate=0.95),
              StepClipping(1.0)]

algorithm = GradientDescent(cost=cost,
                            parameters=cg.parameters,
                            step_rule=CompositeRule(step_rules),
                            on_unused_sources='ignore')

from blocks.extensions import Timing, FinishAfter, Printing, ProgressBar
from blocks.extensions.monitoring import TrainingDataMonitoring
from fuel.streams import DataStream
from fuel.schemes import SequentialScheme
from blocks.main_loop import MainLoop
from blocks.extensions.saveload import Checkpoint
from blocks.model import Model
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs, test_cost, experiment_path, initialization, init_width, weight_noise, z_prob, z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def onehot(x, numclasses=None): """ Convert integer encoding for class-labels (starting with 0 !) to one-hot encoding. The output is an array whose shape is the shape of the input array plus an extra dimension, containing the 'one-hot'-encoded labels. """ if x.shape == (): x = x[None] if numclasses is None: numclasses = x.max() + 1 result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") z = numpy.zeros(x.shape, dtype="int") for c in range(numclasses): z *= 0 z[numpy.where(x == c)] = 1 result[..., c] += z return result.astype(theano.config.floatX) alphabetsize = 10000 data = np.load('penntree_char_and_word.npz') trainset = data['train_words'] validset = data['valid_words'] testset = data['test_words'] if testing: trainset = trainset[:3000] validset = validset[:3000] if share_mask: if not z_prob: raise ValueError('z_prob must be provided when using share_mask') if z_prob_cells or z_prob_states: raise ValueError( 'z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)' ) z_prob_cells = z_prob # we don't want to actually use these masks, so this is to debug z_prob_states = None else: if z_prob: raise ValueError('z_prob is only used with share_mask') z_prob_cells = z_prob_cells or '1' z_prob_states = z_prob_states or '1' # rng = np.random.RandomState(seed) ########################################### # # MAKE STREAMS # ########################################### 
def prep_dataset(dataset): dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))] dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2)) stream = DataStream( IndexableDataset(indexables=OrderedDict([('data', dataset)])), iteration_scheme=SequentialExampleScheme(dataset.shape[0])) stream = Transpose(stream, [(1, 0)]) stream = SampleDropsNPWord(stream, z_prob_states, z_prob_cells, drop_prob_igates, layer_size, num_layers, False, stoch_depth, share_mask, gaussian_drop, alphabetsize) stream.sources = ('data', ) * 3 + stream.sources + ( 'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates') return (stream, ) train_stream, = prep_dataset(trainset) valid_stream, = prep_dataset(validset) test_stream, = prep_dataset(testset) #################### data = train_stream.get_epoch_iterator(as_dict=True).next() #################### ########################################### # # BUILD MODEL # ########################################### print '.. building model' x = T.tensor3('data') y = x zoneouts_states = T.tensor3('zoneouts_states') zoneouts_cells = T.tensor3('zoneouts_cells') zoneouts_igates = T.tensor3('zoneouts_igates') x.tag.test_value = data['data'] zoneouts_states.tag.test_value = data['zoneouts_states'] zoneouts_cells.tag.test_value = data['zoneouts_cells'] zoneouts_igates.tag.test_value = data['zoneouts_igates'] if init_width and not initialization == 'uniform': raise ValueError('Width is only for uniform init, whassup?') if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=init_width) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size * 4, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ 
DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d' % l, ogates_zoneout=ogates_zoneout) for l in range(num_layers) ] elif rnn_type.lower() == 'gru': in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size * 3, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d' % l) for l in range(num_layers) ] elif rnn_type.lower() == 'srnn': # FIXME!!! make ReLU in_to_hids = [ Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d' % l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers) ] recurrent_layers = [ DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d' % l) for l in range(num_layers) ] else: raise NotImplementedError hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) for layer in in_to_hids: layer.initialize() for layer in recurrent_layers: layer.initialize() hid_to_out.initialize() layer_input = x #in_to_hid.apply(x) init_updates = OrderedDict() for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)): rnn_embedding = in_to_hid.apply(layer_input) if rnn_type.lower() == 'lstm': states_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) cells_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name, cells_init.name = "states_init", "cells_init" states, cells = layer.apply( rnn_embedding, zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size], zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size], zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size], states_init, cells_init) init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])]) elif rnn_type.lower() in ['gru', 'srnn']: # untested! 
states_init = theano.shared( np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name = "states_init" states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init) init_updates.update([(states_init, states[-1])]) else: raise NotImplementedError layer_input = states y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1])) shape_ = y_hat_pre_softmax.shape y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize))) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost') bpc = (cost / np.log(2.0)).copy(name='bpr') perp = T.exp(cost).copy(name='perp') cost_train = cost.copy(name='train_cost') cg_train = ComputationGraph([cost_train]) ########################################### # # NORM STABILIZER # ########################################### norm_cost = 0. def _magnitude(x, axis=-1): return T.sqrt( T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) elif penalty == 'hids': for l in range(num_layers): assert 'rnn%d_apply_states' % l in [ o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables) ] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): for l in range(num_layers): if output.name == 'rnn%d_apply_states' % l: norms = _magnitude(output) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy( 'cost_train') #should this be cost_train.outputs[0]? no. 
cg_train = ComputationGraph([cost_train]) ########################################### # # WEIGHT NOISE # ########################################### if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') model = Model(cost_train) learning_rate = float(learning_rate) clipping = StepClipping(threshold=np.cast[floatX](clipping)) if algorithm == 'adam': adam = Adam(learning_rate=learning_rate) learning_rate = adam.learning_rate step_rule = CompositeRule([adam, clipping]) elif algorithm == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate) learning_rate = rms_prop.learning_rate step_rule = CompositeRule([clipping, rms_prop]) elif algorithm == 'momentum': sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) learning_rate = sgd_momentum.learning_rate step_rule = CompositeRule([clipping, sgd_momentum]) elif algorithm == 'sgd': sgd = Scale(learning_rate=learning_rate) learning_rate = sgd.learning_rate step_rule = CompositeRule([clipping, sgd]) else: raise NotImplementedError algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)}) algorithm.add_updates(init_updates) def cond_number(x): _, _, sing_vals = T.nlinalg.svd(x, True, True) sing_mags = abs(sing_vals) return T.max(sing_mags) / T.min(sing_mags) def rms(x): return (x * x).mean().sqrt() whysplode_cond = [] whysplode_rms = [] for i, p in enumerate(init_updates): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append( cond_number(p).copy( 'ini%d:%s_cond(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append( rms(p).copy('ini%d:%s_rms(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) for i, p in enumerate(cg_train.parameters): v = p.get_value() if p.get_value().shape == 2: 
whysplode_cond.append( cond_number(p).copy( 'ini%d:%s_cond(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append( rms(p).copy('ini%d:%s_rms(%s)' % (i, p.name, "x".join(map(str, p.get_value().shape))))) observed_vars = [ cost_train, cost, bpc, perp, learning_rate, aggregation.mean( algorithm.total_gradient_norm).copy("gradient_norm_mean") ] # + whysplode_rms parameters = model.get_parameter_dict() for name, param in parameters.iteritems(): observed_vars.append(param.norm(2).copy(name=name + "_norm")) observed_vars.append( algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring(variables=observed_vars, prefix="train", after_epoch=True) dev_inits = [p.clone() for p in init_updates] cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace( zip(init_updates.keys(), dev_inits)) dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3] dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:])) dev_monitor = DataStreamMonitoring(variables=[dev_cost, dev_bpc, dev_perp], data_stream=valid_stream, prefix="dev", updates=dev_init_updates) # noone does this if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: print 'Found: ' + param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name extensions = [] extensions.extend( [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) if test_cost: test_inits = [p.clone() for p in init_updates] cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace( zip(init_updates.keys(), test_inits)) test_cost, test_bpc, test_perp = 
cg_test.outputs[:3] test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:])) test_monitor = DataStreamMonitoring( variables=[test_cost, test_bpc, test_perp], data_stream=test_stream, prefix="test", updates=test_init_updates) extensions.extend([test_monitor]) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append( SaveParams('dev_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) class RollsExtension(TrainingExtension): """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """ def __init__(self, shvars): self.shvars = shvars def before_epoch(self): for v in self.shvars: v.set_value(np.roll(v.get_value(), 1, 0)) extensions.append( RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else []))) class LearningRateSchedule(TrainingExtension): """ Lets you set a number to divide learning rate by each epoch + when to start doing that """ def __init__(self): self.epoch_number = 0 def after_epoch(self): self.epoch_number += 1 if self.epoch_number > decrease_lr_after_epoch: learning_rate.set_value(learning_rate.get_value() / lr_decay) if bool(lr_decay) != bool(decrease_lr_after_epoch): raise ValueError( 'Need to define both lr_decay and decrease_lr_after_epoch') if lr_decay and decrease_lr_after_epoch: extensions.append(LearningRateSchedule()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) main_loop.run() print "Execution time: %f" % (time.time() - t1)
np.random.seed(args.seed) blocks.config.config.default_seed = args.seed if args.continue_from: from blocks.serialization import load main_loop = load(args.continue_from) main_loop.run() sys.exit(0) graphs, extensions, updates = construct_graphs(args, nclasses, sequence_length) #graph, extension, update = construct_graphs(args, nclasses, sequence_length) ### optimization algorithm definition step_rule = CompositeRule([ StepClipping(1.), #Momentum(learning_rate=args.learning_rate, momentum=0.9), RMSProp(learning_rate=args.learning_rate, decay_rate=0.5), ]) algorithm = GradientDescent(cost=graphs["training"].outputs[0], parameters=graphs["training"].parameters, step_rule=step_rule) algorithm.add_updates(updates["training"]) model = Model(graphs["training"].outputs[0]) extensions = extensions["training"] + extensions["inference"] # step monitor (after epoch to limit the log size) step_channels = [] step_channels.extend([ algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
def main(config,
         tr_stream,
         dev_stream,
         use_bokeh=False,
         slim_iteration_state=False,
         switch_controller=None,
         reset_epoch=False):
    """Set up the NMT model, its extensions and algorithm, then train.

    Largely follows the ``main`` method of the original Blocks
    implementation in blocks-examples, with these differences:

    - word embeddings can be excluded from training
    - dropout fix, see
      https://github.com/mila-udem/blocks-examples/issues/46
    - the switch controller extension is added when one is supplied

    Args:
        config (dict): NMT config
        tr_stream (DataStream): Training data stream
        dev_stream (DataStream): Validation data stream
        use_bokeh (bool): Whether to use bokeh for plotting
        slim_iteration_state (bool): Whether to store the full iteration
                                     state or only the epoch iterator
                                     without data stream state
        switch_controller (SourceSwitchController): Controlling strategy
                                                    if monolingual data
                                                    is used as well
        reset_epoch (bool): Set epoch_started in main loop status to
                            false. Sometimes required if you change
                            training parameters such as
                            mono_data_integration
    """
    nmt_model = NMTModel(config)
    nmt_model.set_up()

    # ---- extensions -----------------------------------------------------
    logging.info("Initializing extensions")
    extensions = []
    extensions.append(FinishAfter(after_n_batches=config['finish_after']))
    extensions.append(
        TrainingDataMonitoring([nmt_model.cost], after_batch=True))
    extensions.append(Printing(after_batch=True))
    extensions.append(
        CheckpointNMT(config['saveto'],
                      slim_iteration_state,
                      every_n_batches=config['save_freq']))

    # Early stopping on BLEU, only when a scoring script is configured.
    if config['bleu_script'] is not None:
        logging.info("Building bleu validator")
        bleu_validator = BleuValidator(
            nmt_model.sampling_input,
            samples=nmt_model.samples,
            config=config,
            model=nmt_model.search_model,
            data_stream=dev_stream,
            normalize=config['normalized_bleu'],
            store_full_main_loop=config['store_full_main_loop'],
            every_n_batches=config['bleu_val_freq'])
        extensions.append(bleu_validator)

    if switch_controller:
        switch_controller.beam_search = BeamSearch(
            samples=nmt_model.samples)
        switch_controller.src_sentence = nmt_model.sampling_input
        extensions.append(switch_controller)

    # Reload a previously saved model if requested.
    if config['reload']:
        extensions.append(
            LoadNMT(config['saveto'], slim_iteration_state, reset_epoch))

    # Optional bokeh plot of the decoding cost.
    if use_bokeh and BOKEH_AVAILABLE:
        cost_plot = Plot('Decoding cost',
                         channels=[['decoder_cost_cost']],
                         after_batch=True)
        extensions.append(cost_plot)

    # Correct handling of SIGTERM and SIGINT.
    extensions.append(AlwaysEpochInterrupt(every_n_batches=1))

    # ---- training algorithm ---------------------------------------------
    logging.info("Initializing training algorithm")
    # https://github.com/mila-udem/blocks-examples/issues/46
    train_params = nmt_model.cg.parameters
    # fs439: fix embeddings?
    if config['fix_embeddings']:
        frozen_annotations = [
            'softmax1', 'softmax0', 'maxout_bias', 'embeddings',
            'lookuptable', 'transform_feedback'
        ]
        trainable = []
        for param in nmt_model.cg.parameters:
            for annotation in param.tag.annotations:
                if annotation.name in frozen_annotations:
                    logging.info("Do not train %s due to annotation %s" %
                                 (param, annotation))
                    break
            else:
                # No annotation matched: the parameter stays trainable.
                trainable.append(param)
        train_params = trainable

    # Change cost=cost to cg.outputs[0] ?
    if config['dropout'] < 1.0:
        training_cost = nmt_model.cg.outputs[0]
    else:
        training_cost = nmt_model.cost
    clipping_rule = StepClipping(config['step_clipping'])
    # NOTE: config['step_rule'] is evaluated with eval(); it must only ever
    # come from a trusted configuration source.
    update_rule = eval(config['step_rule'])()
    algorithm = GradientDescent(
        cost=training_cost,
        parameters=train_params,
        step_rule=CompositeRule([clipping_rule, update_rule]))

    # ---- main loop ------------------------------------------------------
    logging.info("Initializing main loop")
    main_loop = MainLoop(model=nmt_model.training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Reset epoch
    if reset_epoch:
        main_loop.status['epoch_started'] = False

    # Train!
    main_loop.run()
def test_step_clipping_broadcastable():
    """Run the shared broadcastable-handling check on ``StepClipping(0.4)``."""
    clipping_rule = StepClipping(0.4)
    verify_broadcastable_handling(clipping_rule)
def train_language_model(new_training_job, config, save_path, params,
                         fast_start, fuel_server, seed):
    """Assemble the language-model training loop and run it.

    Builds the cost from ``lm.apply``, selects which parameters are trained
    and which are saved, configures Adam (optionally preceded by step
    clipping), wires up monitoring, checkpointing and early stopping, and
    finally runs the Blocks main loop.

    ``new_training_job`` disables loading of a previous state before
    training; ``params`` optionally names a file with parameter values to
    load; ``fast_start`` skips the initial validation and the before/after
    training checkpoints; ``fuel_server`` switches the training stream to a
    ``ServerDataStream``; a truthy ``seed`` is installed as the default seed
    of both Fuel and Blocks. All paths live under ``save_path``.
    """
    c = config
    if seed:
        fuel.config.default_seed = seed
        blocks.config.config.default_seed = seed

    data, lm, retrieval = initialize_data_and_model(config)

    # Either the full main loop is saved...
    main_loop_path = os.path.join(save_path, 'main_loop.tar')
    # ...or only the state (log + params), which avoids pickling embeddings.
    state_path = os.path.join(save_path, 'training_state.tar')
    stream_path = os.path.join(save_path, 'stream.pkl')
    best_tar_path = os.path.join(save_path, "best_model.tar")

    # ---- symbolic inputs and cost ---------------------------------------
    words = tensor.ltensor3('words')
    words_mask = tensor.matrix('words_mask')
    if theano.config.compute_test_value != 'off':
        sample_stream = data.get_stream('train', batch_size=4, max_length=5)
        test_value_data = next(sample_stream.get_epoch_iterator())
        words.tag.test_value = test_value_data[0]
        words_mask.tag.test_value = test_value_data[1]

    costs, updates = lm.apply(words, words_mask)
    cost = rename(costs.mean(), 'mean_cost')

    cg = Model(cost)
    if params:
        logger.debug("Load parameters from {}".format(params))
        with open(params) as src:
            cg.set_parameter_values(load_parameters(src))

    # ---- monitored variables --------------------------------------------
    length = rename(words.shape[1], 'length')
    perplexity, = VariableFilter(name='perplexity')(cg)
    perplexities = VariableFilter(name_regex='perplexity.*')(cg)
    monitored_vars = [length, cost] + perplexities
    if c['dict_path']:
        num_definitions, = VariableFilter(name='num_definitions')(cg)
        monitored_vars.append(num_definitions)

    # ---- parameter selection --------------------------------------------
    parameters = cg.get_parameter_dict()
    trained_parameters = parameters.values()
    saved_parameters = parameters.values()
    if c['embedding_path']:
        logger.debug("Exclude word embeddings from the trained parameters")
        trained_parameters = [
            p for p in trained_parameters
            if not p == lm.get_def_embeddings_params()
        ]
        saved_parameters = [
            p for p in saved_parameters
            if not p == lm.get_def_embeddings_params()
        ]

    uses_cache = c['cache_size'] != 0
    if uses_cache:
        logger.debug("Enable fake recursivity for looking up embeddings")
        trained_parameters = [
            p for p in trained_parameters if not p == lm.get_cache_params()
        ]

    # One "<name> <shape> trained|frozen" entry per parameter, sorted by name.
    parameter_report = []
    for key in sorted(parameters.keys()):
        shape_text = str(parameters[key].get_value().shape)
        if parameters[key] in trained_parameters:
            status = 'trained'
        else:
            status = 'frozen'
        parameter_report.append(" ".join((key, shape_text, status)))
    logger.info("Cost parameters" + "\n" +
                pprint.pformat(parameter_report, width=120))

    # ---- optimisation ---------------------------------------------------
    rules = []
    if c['grad_clip_threshold']:
        rules.append(StepClipping(c['grad_clip_threshold']))
    rules.append(Adam(learning_rate=c['learning_rate'],
                      beta1=c['momentum']))
    algorithm = GradientDescent(cost=cost,
                                parameters=trained_parameters,
                                step_rule=CompositeRule(rules))

    if uses_cache:
        algorithm.add_updates(updates)

    train_monitored_vars = list(monitored_vars)
    if c['grad_clip_threshold']:
        train_monitored_vars.append(algorithm.total_gradient_norm)

    word_emb_RMS, = VariableFilter(name='word_emb_RMS')(cg)
    main_rnn_in_RMS, = VariableFilter(name='main_rnn_in_RMS')(cg)
    train_monitored_vars.append(word_emb_RMS)
    train_monitored_vars.append(main_rnn_in_RMS)

    if c['monitor_parameters']:
        train_monitored_vars.extend(parameter_stats(parameters, algorithm))

    # ---- data streams ---------------------------------------------------
    # We use a completely random seed on purpose. With Fuel server
    # it's currently not possible to restore the state of the training
    # stream. That's why it's probably better to just have it stateless.
    if fuel_server:
        stream_seed = numpy.random.randint(0, 10000000)
    else:
        stream_seed = None
    training_stream = data.get_stream('train',
                                      batch_size=c['batch_size'],
                                      max_length=c['max_length'],
                                      seed=stream_seed)
    valid_stream = data.get_stream('valid',
                                   batch_size=c['batch_size_valid'],
                                   max_length=c['max_length'],
                                   seed=stream_seed)
    original_training_stream = training_stream
    if fuel_server:
        # the port will be configured by the StartFuelServer extension
        training_stream = ServerDataStream(
            sources=training_stream.sources,
            produces_examples=training_stream.produces_examples)

    # ---- validation and best-model tracking -----------------------------
    validation = DataStreamMonitoring(monitored_vars,
                                      valid_stream,
                                      prefix="valid")
    validation = validation.set_conditions(
        before_first_epoch=not fast_start,
        on_resumption=True,
        every_n_batches=c['mon_freq_valid'])
    track_the_best = TrackTheBest(validation.record_name(perplexity),
                                  choose_best=min)
    track_the_best = track_the_best.set_conditions(
        on_resumption=True,
        after_epoch=True,
        every_n_batches=c['mon_freq_valid'])

    # ---- checkpointing --------------------------------------------------
    # The fast variant stores only the training state instead of the whole
    # main loop, to avoid pickling everything.
    if c['fast_checkpoint']:
        loader_class = LoadNoUnpickling
        checkpoint_path = state_path
        cp_args = {
            'save_main_loop': False,
            'save_separately': ['log', 'iteration_state'],
            'parameters': saved_parameters
        }
    else:
        loader_class = Load
        checkpoint_path = main_loop_path
        cp_args = {
            'save_separately': ['iteration_state'],
            'parameters': saved_parameters
        }

    load = loader_class(checkpoint_path,
                        load_iteration_state=True,
                        load_log=True)
    load = load.set_conditions(before_training=not new_training_job)
    checkpoint = Checkpoint(checkpoint_path,
                            before_training=not fast_start,
                            every_n_batches=c['save_freq_batches'],
                            after_training=not fast_start,
                            **cp_args)
    if c['checkpoint_every_n_batches']:
        intermediate_cp = IntermediateCheckpoint(
            checkpoint_path,
            every_n_batches=c['checkpoint_every_n_batches'],
            after_training=False,
            **cp_args)

    # Additionally write to best_tar_path whenever track_the_best posts its
    # notification in the log.
    checkpoint = checkpoint.add_condition(
        ['after_batch', 'after_epoch'],
        OnLogRecord(track_the_best.notification_name), (best_tar_path, ))

    # ---- extensions, in execution order ---------------------------------
    extensions = [load]
    extensions.append(
        StartFuelServer(original_training_stream,
                        stream_path,
                        before_training=fuel_server))
    extensions.append(Timing(every_n_batches=c['mon_freq_train']))

    if retrieval:
        extensions.append(
            RetrievalPrintStats(retrieval=retrieval,
                                every_n_batches=c['mon_freq_train'],
                                before_training=not fast_start))

    extensions.append(
        TrainingDataMonitoring(train_monitored_vars,
                               prefix="train",
                               every_n_batches=c['mon_freq_train']))
    extensions.append(validation)
    extensions.append(track_the_best)
    extensions.append(checkpoint)
    if c['checkpoint_every_n_batches']:
        extensions.append(intermediate_cp)
    extensions.append(
        DumpTensorflowSummaries(save_path,
                                every_n_batches=c['mon_freq_train'],
                                after_training=True))
    extensions.append(
        Printing(on_resumption=True, every_n_batches=c['mon_freq_train']))
    extensions.append(
        FinishIfNoImprovementAfter(track_the_best.notification_name,
                                   iterations=50 * c['mon_freq_valid'],
                                   every_n_batches=c['mon_freq_valid']))
    extensions.append(FinishAfter(after_n_batches=c['n_batches']))

    logger.info("monitored variables during training:" + "\n" +
                pprint.pformat(train_monitored_vars, width=120))
    logger.info("monitored variables during valid:" + "\n" +
                pprint.pformat(monitored_vars, width=120))

    main_loop = MainLoop(algorithm,
                         training_stream,
                         model=Model(cost),
                         extensions=extensions)
    main_loop.run()
x_mask, attended=mlp_context.apply(context)) cost = cost_matrix.sum() / x_mask.sum() cost.name = "sequence_log_likelihood" cg = ComputationGraph(cost) model = Model(cost) ################# # Algorithm ################# algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule( [StepClipping(10.0), Adam(lr)])) train_monitor = TrainingDataMonitoring(variables=[cost], after_epoch=True, prefix="train") extensions = extensions = [ train_monitor, TrackTheBest('train_sequence_log_likelihood'), Printing(after_epoch=True) ] main_loop = MainLoop(model=model, data_stream=data_stream, algorithm=algorithm,
def main():
    """Parse the command line, build the training setup and run it.

    Depending on the arguments this either resumes a serialized main loop
    (``--continue-from``), dumps hidden states (``--dump-hiddens``),
    evaluates (``--evaluate``) or trains from scratch.

    Raises:
        ValueError: If the selected optimizer is not one of ``adam``,
            ``rmsprop`` or ``sgdmomentum`` (argparse already rejects other
            values; this guards against the chain and the ``choices`` list
            drifting apart).
    """
    nclasses = 27

    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument("--seed", type=int, default=1)
    parser.add_argument("--length", type=int, default=180)
    parser.add_argument("--num-epochs", type=int, default=100)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--learning-rate", type=float, default=1e-3)
    parser.add_argument("--epsilon", type=float, default=1e-5)
    parser.add_argument("--num-hidden", type=int, default=1000)
    parser.add_argument("--baseline", action="store_true")
    parser.add_argument("--initialization",
                        choices="identity glorot orthogonal uniform".split(),
                        default="identity")
    parser.add_argument("--initial-gamma", type=float, default=1e-1)
    parser.add_argument("--initial-beta", type=float, default=0)
    parser.add_argument("--cluster", action="store_true")
    parser.add_argument("--activation",
                        choices=list(activations.keys()),
                        default="tanh")
    # `choices` must be a list: with a plain string argparse performs a
    # substring test, so values such as "sgd" or "a" were accepted and then
    # left `optimizer` undefined below.
    parser.add_argument("--optimizer",
                        choices="sgdmomentum adam rmsprop".split(),
                        default="rmsprop")
    parser.add_argument("--continue-from")
    parser.add_argument("--evaluate")
    parser.add_argument("--dump-hiddens")
    args = parser.parse_args()

    np.random.seed(args.seed)
    blocks.config.config.default_seed = args.seed

    # Resuming: load the serialized main loop, run it and exit.
    if args.continue_from:
        from blocks.serialization import load
        main_loop = load(args.continue_from)
        main_loop.run()
        sys.exit(0)

    graphs, extensions, updates = construct_graphs(args, nclasses)

    ### optimization algorithm definition
    if args.optimizer == "adam":
        optimizer = Adam(learning_rate=args.learning_rate)
    elif args.optimizer == "rmsprop":
        optimizer = RMSProp(learning_rate=args.learning_rate, decay_rate=0.9)
    elif args.optimizer == "sgdmomentum":
        optimizer = Momentum(learning_rate=args.learning_rate, momentum=0.99)
    else:
        raise ValueError("unknown optimizer: %r" % (args.optimizer, ))
    step_rule = CompositeRule([
        StepClipping(1.),
        optimizer,
    ])

    algorithm = GradientDescent(cost=graphs["training"].outputs[0],
                                parameters=graphs["training"].parameters,
                                step_rule=step_rule)
    algorithm.add_updates(updates["training"])
    model = Model(graphs["training"].outputs[0])
    extensions = extensions["training"] + extensions["inference"]

    # step monitor
    step_channels = []
    step_channels.extend([
        algorithm.steps[param].norm(2).copy(name="step_norm:%s" % name)
        for name, param in model.get_parameter_dict().items()
    ])
    step_channels.append(
        algorithm.total_step_norm.copy(name="total_step_norm"))
    step_channels.append(
        algorithm.total_gradient_norm.copy(name="total_gradient_norm"))
    step_channels.extend(graphs["training"].outputs)
    logger.warning("constructing training data monitor")
    extensions.append(
        TrainingDataMonitoring(step_channels,
                               prefix="iteration",
                               after_batch=True))

    # parameter monitor
    extensions.append(
        DataStreamMonitoring([
            param.norm(2).copy(name="parameter.norm:%s" % name)
            for name, param in model.get_parameter_dict().items()
        ],
                             data_stream=None,
                             after_epoch=True))

    validation_interval = 500
    # performance monitor
    for situation in "training inference".split():
        if situation == "inference" and not args.evaluate:
            # save time when we don't need the inference graph
            continue

        for which_set in "train valid test".split():
            logger.warning("constructing %s %s monitor" %
                           (which_set, situation))
            channels = list(graphs[situation].outputs)
            extensions.append(
                DataStreamMonitoring(
                    channels,
                    prefix="%s_%s" % (which_set, situation),
                    every_n_batches=validation_interval,
                    data_stream=get_stream(which_set=which_set,
                                           batch_size=args.batch_size,
                                           num_examples=10000,
                                           length=args.length)))

    extensions.extend([
        TrackTheBest("valid_training_error_rate",
                     "best_valid_training_error_rate"),
        DumpBest("best_valid_training_error_rate", "best.zip"),
        FinishAfter(after_n_epochs=args.num_epochs),
        #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50),
        Checkpoint("checkpoint.zip",
                   on_interrupt=False,
                   every_n_epochs=1,
                   use_cpickle=True),
        DumpLog("log.pkl", after_epoch=True)
    ])

    # The progress bar is left out when --cluster is given.
    if not args.cluster:
        extensions.append(ProgressBar())

    extensions.extend([
        Timing(),
        Printing(every_n_batches=validation_interval),
        PrintingTo("log"),
    ])
    main_loop = MainLoop(data_stream=get_stream(which_set="train",
                                                batch_size=args.batch_size,
                                                length=args.length,
                                                augment=True),
                         algorithm=algorithm,
                         extensions=extensions,
                         model=model)

    if args.dump_hiddens:
        dump_hiddens(args, main_loop)
        return

    if args.evaluate:
        evaluate(args, main_loop)
        return

    main_loop.run()