def test_main_loop():

    class TestDataStream(object):

        def __init__(self):
            self.epochs = self._generate_data()

        def _generate_data(self):

            def wrap_in_dicts(iterable):
                for x in iterable:
                    yield dict(data=x)

            yield iter(wrap_in_dicts([1, 2, 3]))
            yield iter(wrap_in_dicts([4, 5]))
            yield iter(wrap_in_dicts([6, 7, 8, 9]))

        def get_epoch_iterator(self, as_dict):
            assert as_dict is True
            return next(self.epochs)

    finish_extension = FinishAfter()
    finish_extension.add_condition(
        'after_epoch',
        predicate=lambda log: log.status['epochs_done'] == 2)
    main_loop = MainLoop(MockAlgorithm(), TestDataStream(),
                         extensions=[WriteBatchExtension(),
                                     finish_extension])
    main_loop.run()
    assert main_loop.log.status['iterations_done'] == 5
    assert main_loop.log.status['_epoch_ends'] == [3, 5]
    assert len(main_loop.log) == 5
    for i in range(1, 6):
        assert main_loop.log[i]['batch'] == dict(data=i)
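# --- Hedged aside (not part of the test above) ---
# The FinishAfter/add_condition pattern the test exercises also works in a
# real training loop. A minimal sketch, assuming only stock Blocks and Fuel;
# all data and names below are illustrative:
import numpy
import theano
from theano import tensor
from blocks.algorithms import GradientDescent, Scale
from blocks.extensions import FinishAfter
from blocks.main_loop import MainLoop
from blocks.utils import shared_floatx
from fuel.datasets import IterableDataset

features = [numpy.array(f, dtype=theano.config.floatX)
            for f in [[1, 2], [3, 4]]]
dataset = IterableDataset(dict(features=features))
x = tensor.vector('features')
W = shared_floatx([0, 0], name='W')
cost = ((x * W).sum() ** 2).copy(name='cost')
finish = FinishAfter()
# The predicate receives the log and may inspect any status entry.
finish.add_condition(['after_epoch'],
                     predicate=lambda log: log.status['epochs_done'] == 2)
MainLoop(algorithm=GradientDescent(cost=cost, parameters=[W],
                                   step_rule=Scale(0.01)),
         data_stream=dataset.get_example_stream(),
         extensions=[finish]).run()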
dev_monitor = DataStreamMonitoring(variables=[cost], after_epoch=True,
                                   before_first_epoch=True,
                                   data_stream=dev_stream, prefix="dev")
train_monitor = TrainingDataMonitoring(variables=monitored_vars,
                                       after_batch=True,
                                       before_first_epoch=True,
                                       prefix='tra')
extensions = [dev_monitor, train_monitor, Timing(),
              Printing(after_batch=True),
              FinishAfter(after_n_epochs=nepochs),
              saveload.Load(load_path),
              saveload.Checkpoint(last_path),
              ] + track_best('dev_cost', save_path)

if learning_rate_decay not in (0, 1):
    extensions.append(
        SharedVariableModifier(step_rules[0].learning_rate,
                               lambda n, lr: numpy.cast[theano.config.floatX]
                               (learning_rate_decay * lr),
                               after_epoch=True, after_batch=False))

print 'number of parameters in the model: ' + str(
    tensor.sum([p.size for p in cg.parameters]).eval())

# Finally build the main loop and train the model
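# --- Hedged aside ---
# SharedVariableModifier, as used above, calls its function with the number
# of iterations done and the shared variable's current value. A
# self-contained sketch with an assumed decay factor (track_best above is a
# project helper, presumably wrapping TrackTheBest plus a conditioned
# Checkpoint):
import numpy
import theano
from blocks.algorithms import Scale
from blocks.extensions.training import SharedVariableModifier

learning_rate_decay = 0.95  # assumed value
step_rule = Scale(learning_rate=0.1)
lr_decay = SharedVariableModifier(
    step_rule.learning_rate,
    lambda n, lr: numpy.cast[theano.config.floatX](learning_rate_decay * lr),
    after_epoch=True, after_batch=False)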
def main(job_id, params):
    config = ConfigParser.ConfigParser()
    config.readfp(open('./params'))
    max_epoch = int(config.get('hyperparams', 'max_iter', 100))
    base_lr = float(config.get('hyperparams', 'base_lr', 0.01))
    train_batch = int(config.get('hyperparams', 'train_batch', 256))
    valid_batch = int(config.get('hyperparams', 'valid_batch', 512))
    # Was 'valid_batch' in the original -- almost certainly a copy-paste slip.
    test_batch = int(config.get('hyperparams', 'test_batch', 512))
    W_sd = float(config.get('hyperparams', 'W_sd', 0.01))
    W_mu = float(config.get('hyperparams', 'W_mu', 0.0))
    b_sd = float(config.get('hyperparams', 'b_sd', 0.01))
    b_mu = float(config.get('hyperparams', 'b_mu', 0.0))
    hidden_units = int(config.get('hyperparams', 'hidden_units', 32))
    input_dropout_ratio = float(
        config.get('hyperparams', 'input_dropout_ratio', 0.2))
    dropout_ratio = float(config.get('hyperparams', 'dropout_ratio', 0.2))
    weight_decay = float(config.get('hyperparams', 'weight_decay', 0.001))
    max_norm = float(config.get('hyperparams', 'max_norm', 100.0))
    solver = config.get('hyperparams', 'solver_type', 'rmsprop')
    data_file = config.get('hyperparams', 'data_file')
    side = config.get('hyperparams', 'side', 'b')

    # Spearmint optimization parameters:
    if params:
        base_lr = float(params['base_lr'][0])
        dropout_ratio = float(params['dropout_ratio'][0])
        hidden_units = params['hidden_units'][0]
        weight_decay = params['weight_decay'][0]

    if 'adagrad' in solver:
        solver_type = CompositeRule([AdaGrad(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])
    else:
        solver_type = CompositeRule([RMSProp(learning_rate=base_lr),
                                     VariableClipping(threshold=max_norm)])

    input_dim = {'l': 11427, 'r': 10519, 'b': 10519 + 11427}

    if 'b' in side:
        train = H5PYDataset(data_file, which_set='train')
        valid = H5PYDataset(data_file, which_set='valid')
        test = H5PYDataset(data_file, which_set='test')
        x_l = tensor.matrix('l_features')
        x_r = tensor.matrix('r_features')
        x = tensor.concatenate([x_l, x_r], axis=1)
    else:
        train = H5PYDataset(data_file, which_set='train',
                            sources=['{}_features'.format(side), 'targets'])
        valid = H5PYDataset(data_file, which_set='valid',
                            sources=['{}_features'.format(side), 'targets'])
        test = H5PYDataset(data_file, which_set='test',
                           sources=['{}_features'.format(side), 'targets'])
        x = tensor.matrix('{}_features'.format(side))

    y = tensor.lmatrix('targets')

    # Define a feed-forward net with an input, two hidden layers, and a
    # softmax output:
    model = MLP(activations=[Rectifier(name='h1'),
                             Rectifier(name='h2'),
                             Softmax(name='output')],
                dims=[input_dim[side], hidden_units, hidden_units, 2],
                weights_init=IsotropicGaussian(std=W_sd, mean=W_mu),
                biases_init=IsotropicGaussian(b_sd, b_mu))

    # Don't forget to initialize params:
    model.initialize()

    # y_hat is the output of the neural net with x as its inputs
    y_hat = model.apply(x)

    # Define a cost function to optimize, and a classification error rate.
    # Also apply the outputs from the net and corresponding targets:
    cost = CategoricalCrossEntropy().apply(y.flatten(), y_hat)
    error = MisclassificationRate().apply(y.flatten(), y_hat)
    error.name = 'error'

    # This is the model before applying dropout:
    model = Model(cost)

    # Define the computation graph for the cost function:
    cost_graph = ComputationGraph([cost])

    # This returns a list of weight vectors for each layer
    W = VariableFilter(roles=[WEIGHT])(cost_graph.variables)

    # Add some regularization to this model:
    cost += weight_decay * l2_norm(W)
    cost.name = 'entropy'

    # Computational graph with l2 regularization:
    cost_graph = ComputationGraph([cost])

    # Apply dropout to inputs:
    inputs = VariableFilter([INPUT])(cost_graph.variables)
    dropout_inputs = [input for input in inputs
                      if input.name.startswith('linear_')]
    dropout_graph = apply_dropout(cost_graph, [dropout_inputs[0]],
                                  input_dropout_ratio)
    dropout_graph = apply_dropout(dropout_graph, dropout_inputs[1:],
                                  dropout_ratio)
    dropout_cost = dropout_graph.outputs[0]
    dropout_cost.name = 'dropout_entropy'

    # Learning algorithm (note: we use the dropout cost for learning):
    algo = GradientDescent(step_rule=solver_type,
                           params=dropout_graph.parameters,
                           cost=dropout_cost)
    # algo.step_rule.learning_rate.name = 'learning_rate'

    # Data stream used for training the model:
    training_stream = Flatten(
        DataStream.default_stream(
            dataset=train,
            iteration_scheme=ShuffledScheme(train.num_examples,
                                            batch_size=train_batch)))
    training_monitor = TrainingDataMonitoring(
        [dropout_cost,
         aggregation.mean(error),
         aggregation.mean(algo.total_gradient_norm)],
        after_batch=True)

    # Use the 'valid' set for validation during training:
    validation_stream = Flatten(
        DataStream.default_stream(
            dataset=valid,
            iteration_scheme=ShuffledScheme(valid.num_examples,
                                            batch_size=valid_batch)))
    validation_monitor = DataStreamMonitoring(variables=[cost, error],
                                              data_stream=validation_stream,
                                              prefix='validation',
                                              after_epoch=True)

    test_stream = Flatten(
        DataStream.default_stream(
            dataset=test,
            iteration_scheme=ShuffledScheme(test.num_examples,
                                            batch_size=test_batch)))
    test_monitor = DataStreamMonitoring(variables=[error],
                                        data_stream=test_stream,
                                        prefix='test',
                                        after_training=True)

    plotting = Plot('AdniNet_{}'.format(side),
                    channels=[['dropout_entropy', 'validation_entropy'],
                              ['error', 'validation_error']],
                    after_batch=False)

    # Checkpoint class used to save the model and log:
    stamp = datetime.datetime.fromtimestamp(
        time.time()).strftime('%Y-%m-%d-%H:%M')
    checkpoint = Checkpoint('./models/{}net/{}'.format(side, stamp),
                            save_separately=['model', 'log'],
                            every_n_epochs=1)

    # Home-brewed class for early stopping when we detect that we have
    # started to overfit:
    early_stopper = FinishIfOverfitting(error_name='error',
                                        validation_name='validation_error',
                                        threshold=0.1,
                                        epochs=5,
                                        burn_in=100)

    # The main loop will train the network and output reports, etc.
    main_loop = MainLoop(data_stream=training_stream,
                         model=model,
                         algorithm=algo,
                         extensions=[validation_monitor,
                                     training_monitor,
                                     plotting,
                                     FinishAfter(after_n_epochs=max_epoch),
                                     early_stopper,
                                     Printing(),
                                     ProgressBar(),
                                     checkpoint,
                                     test_monitor])
    main_loop.run()

    ve = float(main_loop.log.last_epoch_row['validation_error'])
    te = float(main_loop.log.last_epoch_row['error'])
    spearmint_loss = ve + abs(te - ve)
    print 'Spearmint Loss: {}'.format(spearmint_loss)
    return spearmint_loss
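# --- Hedged aside ---
# The weight-decay step above in isolation: collect WEIGHT-role variables
# from the cost graph and add a scaled L2 penalty via blocks'
# theano_expressions.l2_norm. A minimal runnable sketch; the dims and the
# 0.001 coefficient are assumptions:
from theano import tensor
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph
from blocks.initialization import Constant, IsotropicGaussian
from blocks.roles import WEIGHT
from blocks.theano_expressions import l2_norm

x = tensor.matrix('features')
y = tensor.lmatrix('targets')
mlp = MLP(activations=[Rectifier(), Softmax()], dims=[10, 8, 2],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
cost = CategoricalCrossEntropy().apply(y.flatten(), mlp.apply(x))
weights = VariableFilter(roles=[WEIGHT])(ComputationGraph([cost]).variables)
cost = cost + 0.001 * l2_norm(weights)  # weight_decay assumed to be 0.001
cost.name = 'entropy'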
cost.name = "sequence_cost" algorithm = GradientDescent( cost=cost, parameters=list(Selector(generator).get_parameters().values()), step_rule=Adam(), # because we want use all the stuff in the training data on_unused_sources='ignore') main_loop = MainLoop(algorithm=algorithm, data_stream=DataStream( train_data, iteration_scheme=SequentialScheme( train_data.num_examples, batch_size=20)), model=Model(cost), extensions=[ FinishAfter(), TrainingDataMonitoring([cost], prefix="this_step", after_batch=True), TrainingDataMonitoring([cost], prefix="average", every_n_batches=100), Checkpoint(args.model, every_n_epochs=5), Printing(every_n_batches=100) ]) main_loop.run() elif args.mode == "retrain": main_loop = load(open(args.model, "rb")) main_loop.run()
def main(config, tr_stream, dev_stream, use_bokeh=False):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Apply dropout for regularization
    if config['dropout'] < 1.0:
        # Dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input,
                                     sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('En-Zh', channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions)

    # Train!
    main_loop.run()
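# --- Hedged aside ---
# eval(config['step_rule'])() above turns a config string into a step-rule
# class. An explicit lookup table achieves the same without eval'ing
# arbitrary config text; the config values here are assumptions:
from blocks.algorithms import (AdaDelta, Adam, CompositeRule, RMSProp,
                               StepClipping)

STEP_RULES = {'AdaDelta': AdaDelta, 'Adam': Adam, 'RMSProp': RMSProp}
config = {'step_rule': 'AdaDelta', 'step_clipping': 1.0}
step_rule = CompositeRule([StepClipping(config['step_clipping']),
                           STEP_RULES[config['step_rule']]()])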
x1 = encoder.apply(f1)
x2 = encoder.apply(f2)

from cost import ContrastiveLoss
cost = ContrastiveLoss(q=dims[-1]).apply(x1=x1, x2=x2, y1=y1, y2=y2)
cost.name = 'contrastive_loss'
cost_test = cost.copy('contrastive_loss_test')

cg = ComputationGraph(cost)
algorithm = GradientDescent(cost=cost, step_rule=AdaDelta(),
                            parameters=cg.parameters)
main_loop = MainLoop(
    algorithm=algorithm,
    data_stream=train_stream,
    extensions=[TrainingDataMonitoring(variables=[cost], after_epoch=True),
                DataStreamMonitoring(data_stream=test_stream,
                                     variables=[cost_test],
                                     after_epoch=True),
                Printing(after_epoch=True),
                FinishAfter(after_n_epochs=50),
                Plot(document='siamese network larger',
                     channels=[[cost.name, cost_test.name]],
                     start_server=True,
                     after_epoch=True)])
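# --- Hedged aside ---
# Plot here is the Bokeh plotting extension; in newer setups it lives in
# blocks-extras. The channels argument groups curves into figures, and
# cost.copy('...') above exists precisely so the train and test curves get
# distinct channel names. The import path below is an assumption:
from blocks_extras.extensions.plot import Plot

plot = Plot(document='siamese network larger',
            channels=[['contrastive_loss', 'contrastive_loss_test']],
            after_epoch=True)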
def main(config, tr_stream, dev_stream, use_bokeh=False,
         slim_iteration_state=False, switch_controller=None,
         reset_epoch=False):
    """This method largely corresponds to the ``main`` method in the
    original Blocks implementation in blocks-examples, and most of the code
    is copied from there. The following modifications have been made:

    - Support for fixing word embeddings during training
    - Dropout fix https://github.com/mila-udem/blocks-examples/issues/46
    - If necessary, add the exp3s extension

    Args:
        config (dict): NMT config
        tr_stream (DataStream): Training data stream
        dev_stream (DataStream): Validation data stream
        use_bokeh (bool): Whether to use bokeh for plotting
        slim_iteration_state (bool): Whether to store the full iteration
                                     state or only the epoch iterator
                                     without the data stream state
        switch_controller (SourceSwitchController): Controlling strategy
                                                    if monolingual data is
                                                    used as well
        reset_epoch (bool): Set epoch_started in the main loop status to
                            False. Sometimes required if you change
                            training parameters such as
                            mono_data_integration
    """
    nmt_model = NMTModel(config)
    nmt_model.set_up(make_prunable=(args.prune_every > 0))

    # Set extensions
    logging.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([nmt_model.cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      slim_iteration_state,
                      every_n_batches=config['save_freq'])
    ]

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logging.info("Building bleu validator")
        extensions.append(
            BleuValidator(nmt_model.sampling_input,
                          samples=nmt_model.samples,
                          config=config,
                          model=nmt_model.search_model,
                          data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          store_full_main_loop=config['store_full_main_loop'],
                          every_n_batches=config['bleu_val_freq']))

    if switch_controller:
        switch_controller.beam_search = BeamSearch(samples=nmt_model.samples)
        switch_controller.src_sentence = nmt_model.sampling_input
        extensions.append(switch_controller)

    # Reload model if necessary
    if config['reload']:
        extensions.append(
            LoadNMT(config['saveto'], slim_iteration_state, reset_epoch))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Decoding cost', channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Add an extension for correct handling of SIGTERM and SIGINT
    extensions.append(AlwaysEpochInterrupt(every_n_batches=1))

    # Set up training algorithm
    logging.info("Initializing training algorithm")
    # https://github.com/mila-udem/blocks-examples/issues/46
    train_params = nmt_model.cg.parameters
    # fs439: fix embeddings?
    if config['fix_embeddings']:
        train_params = []
        embedding_params = ['softmax1', 'softmax0', 'maxout_bias',
                            'embeddings', 'lookuptable',
                            'transform_feedback']
        for p in nmt_model.cg.parameters:
            add_param = True
            for ann in p.tag.annotations:
                if ann.name in embedding_params:
                    logging.info("Do not train %s: %s" % (p, ann))
                    add_param = False
                    break
            if add_param:
                train_params.append(p)

    # Change cost=cost to cg.outputs[0] ?
    cost_func = nmt_model.cg.outputs[0] if config['dropout'] < 1.0 \
        else nmt_model.cost
    if config['step_rule'] in ['AdaGrad', 'Adam']:
        step_rule = eval(config['step_rule'])(
            learning_rate=args.learning_rate)
    else:
        step_rule = eval(config['step_rule'])()
    step_rule = CompositeRule(
        [StepClipping(config['step_clipping']), step_rule])
    if args.prune_every < 1:
        algorithm = GradientDescent(cost=cost_func,
                                    parameters=train_params,
                                    step_rule=step_rule)
    else:
        algorithm = PruningGradientDescent(
            prune_layer_configs=args.prune_layers.split(','),
            prune_layout_path=args.prune_layout_path,
            prune_n_steps=args.prune_n_steps,
            prune_every=args.prune_every,
            prune_reset_every=args.prune_reset_every,
            nmt_model=nmt_model,
            cost=cost_func,
            parameters=train_params,
            step_rule=step_rule)

    # Initialize main loop
    logging.info("Initializing main loop")
    main_loop = MainLoop(model=nmt_model.training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Reset epoch
    if reset_epoch:
        main_loop.status['epoch_started'] = False

    # Train!
    main_loop.run()
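# --- Hedged aside ---
# The embedding-freezing filter above in isolation: Blocks bricks attach
# themselves to their parameters' tag.annotations, so parameters can be
# excluded by brick name. A runnable sketch with an assumed toy brick:
from theano import tensor
from blocks.bricks import Linear
from blocks.graph import ComputationGraph
from blocks.initialization import Constant, IsotropicGaussian

x = tensor.matrix('x')
lookup = Linear(2, 3, name='lookuptable',
                weights_init=IsotropicGaussian(0.01),
                biases_init=Constant(0))
lookup.initialize()
cg = ComputationGraph(lookup.apply(x).sum())
frozen_names = ['lookuptable', 'embeddings']
train_params = [p for p in cg.parameters
                if not any(ann.name in frozen_names
                           for ann in p.tag.annotations)]
assert train_params == []  # both W and b belong to 'lookuptable'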
def test_training_data_monitoring():
    weights = numpy.array([-1, 1], dtype=theano.config.floatX)
    features = [numpy.array(f, dtype=theano.config.floatX)
                for f in [[1, 2], [3, 5], [5, 8]]]
    targets = numpy.array([(weights * f).sum() for f in features])
    n_batches = 3
    dataset = IterableDataset(dict(features=features, targets=targets))

    x = tensor.vector('features')
    y = tensor.scalar('targets')
    W = shared_floatx([0, 0], name='W')
    V = shared_floatx(7, name='V')
    W_sum = W.sum().copy(name='W_sum')
    cost = ((x * W).sum() - y) ** 2
    cost.name = 'cost'

    class TrueCostExtension(TrainingExtension):

        def before_batch(self, data):
            self.main_loop.log.current_row['true_cost'] = (
                ((W.get_value() * data["features"]).sum()
                 - data["targets"]) ** 2)

    # Note that, unlike a Theano variable, a monitored quantity can't be
    # reused in more than one TrainingDataMonitoring.
    ftt1 = MeanFeaturesTimesTarget(requires=[x, y], name='ftt1')
    ftt2 = MeanFeaturesTimesTarget(requires=[x, y], name='ftt2')

    main_loop = MainLoop(
        model=None,
        data_stream=dataset.get_example_stream(),
        algorithm=GradientDescent(cost=cost, parameters=[W],
                                  step_rule=Scale(0.001)),
        extensions=[
            FinishAfter(after_n_epochs=1),
            TrainingDataMonitoring([W_sum, cost, V, ftt1], prefix="train1",
                                   after_batch=True),
            TrainingDataMonitoring([aggregation.mean(W_sum), cost, ftt2],
                                   prefix="train2", after_epoch=True),
            TrueCostExtension()])
    main_loop.run()

    # Check monitoring of a shared variable
    assert_allclose(main_loop.log.current_row['train1_V'], 7.0)

    for i in range(n_batches):
        # The ground truth is written to the log before the batch is
        # processed, whereas the extension writes after the batch is
        # processed. This is why the iteration numbers differ here.
        assert_allclose(main_loop.log[i]['true_cost'],
                        main_loop.log[i + 1]['train1_cost'])
    assert_allclose(
        main_loop.log[n_batches]['train2_cost'],
        sum([main_loop.log[i]['true_cost']
             for i in range(n_batches)]) / n_batches)
    assert_allclose(
        main_loop.log[n_batches]['train2_W_sum'],
        sum([main_loop.log[i]['train1_W_sum']
             for i in range(1, n_batches + 1)]) / n_batches)

    # Check monitoring of non-Theano quantities
    for i in range(n_batches):
        assert_allclose(main_loop.log[i + 1]['train1_ftt1'],
                        features[i] * targets[i])
    assert_allclose(main_loop.log[n_batches]['train2_ftt2'],
                    (features * targets[:, None]).mean(axis=0))
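# --- Hedged aside ---
# MeanFeaturesTimesTarget is defined in the Blocks test utilities; a rough
# reconstruction of such a non-Theano monitored quantity could look like
# this (method names follow blocks.monitoring.aggregation.MonitoredQuantity):
from blocks.monitoring.aggregation import MonitoredQuantity


class MeanFeaturesTimesTargetSketch(MonitoredQuantity):
    """Average of features * target over the examples seen so far."""

    def initialize(self):
        self.total, self.examples = 0., 0

    def aggregate(self, features, targets):
        self.total += features * targets
        self.examples += 1

    def get_aggregated_value(self):
        return self.total / self.examples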
from blocks.algorithms import GradientDescent, Momentum, Adam
from blocks.extensions.saveload import Checkpoint
from lvq.batch_norm import BatchNormExtension
from lvq.extensions import EarlyStopping, LRDecay, MomentumSwitchOff

learning_rate = theano.shared(numpy.float32(1e-4))
step_rule = Momentum(5e-2, 0.9)

main_loop = MainLoop(
    model=model,
    data_stream=train_stream,
    algorithm=GradientDescent(cost=model.outputs[0],
                              parameters=model.parameters,
                              step_rule=step_rule),
    extensions=[
        FinishAfter(after_n_epochs=100),
        BatchNormExtension(model, train_stream_bn, n_batches),
        LRDecay(step_rule.learning_rate, [20, 40, 60, 80]),
        # MomentumSwitchOff(step_rule.momentum, 140),
        DataStreamMonitoring(variables=[test_loss, test_misclass],
                             data_stream=train_stream_mon,
                             prefix='train'),
        DataStreamMonitoring(variables=[test_loss, test_misclass],
                             data_stream=valid_stream,
                             prefix='valid'),
        DataStreamMonitoring(variables=[test_loss, test_misclass],
                             data_stream=test_stream,
                             prefix='test'),
        EarlyStopping('valid_misclass', 100, './exp/softmax_2_bn_noise.pkl'),
        Timing(),
        Printing(after_epoch=True)])
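# --- Hedged aside ---
# lvq.extensions.EarlyStopping is project-specific. A stock-Blocks
# equivalent of "stop when valid_misclass hasn't improved for a while"
# combines TrackTheBest with FinishIfNoImprovementAfter (sketch; the
# channel name is taken from the monitor above):
from blocks.extensions.stopping import FinishIfNoImprovementAfter
from blocks.extensions.training import TrackTheBest

early_stopping = [
    TrackTheBest('valid_misclass'),
    FinishIfNoImprovementAfter('valid_misclass_best_so_far', epochs=100),
]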
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None,
         the_track=None, use_embeddings=False, lang='german'):
    config['the_task'] = the_task

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    # enc_embed is the dimension of the word embedding matrix in the
    # encoder; enc_nhids is the number of hidden units in the encoder GRU.
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['use_attention'],
        cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    testVar = decoder.getTestVar(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    my_rng = numpy.random.RandomState(config['rng_value'])
    if config['identity_init']:
        encoder.weights_init = decoder.weights_init = Identity()
    else:
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
    encoder.rng = decoder.rng = my_rng
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.bidir.prototype.rng = my_rng
    decoder.transition.weights_init = Orthogonal()
    decoder.transition.rng = my_rng
    encoder.initialize()
    decoder.initialize()

    # Apply dropout for regularization
    if config['dropout'] < 1.0:
        # Dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'], seed=my_rng)
    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    # Set extensions
    logger.info("Initializing extensions")
    # The original built this list in four identical copies nested under
    # `'track2' in config['saveto']` and `config['early_stopping']`; the
    # early_stopping branches did not differ, so only the epoch budget
    # actually varies. Track 2 trains for half the epochs because it has
    # more data.
    if 'track2' in config['saveto']:
        n_epochs = config['finish_after'] / 2
    else:
        n_epochs = config['finish_after']
    extensions = [
        FinishAfter(after_n_epochs=n_epochs),
        # FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input,
                                     sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    # every_n_batches=1,
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=8))
                    # src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on validation accuracy
    if config['val_set'] is not None:
        logger.info("Building accuracy validator")
        extensions.append(
            AccuracyValidator(sampling_input, samples=samples,
                              config=config,
                              model=search_model, data_stream=dev_stream,
                              after_training=True,
                              # after_epoch=True))
                              every_n_epochs=5))
    else:
        logger.info("No validation set given for this language")

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Load pretrained embeddings if necessary; after the other parameters;
    # ORDER MATTERS
    if use_embeddings:
        extensions.append(LoadEmbeddings(config['embeddings'][0] + lang
                                         + config['embeddings'][1]))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions)

    # Train!
    main_loop.run()
def main(mode, config, use_bokeh=False):

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'],
        config['enc_nhids'], name='word_encoder')
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # Here the attended dim (representation_dim) of the decoder is
    # 2 * enc_nhids because the context given by the encoder is a
    # bidirectional context.

    if mode == "train":

        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        dev_source = tensor.lmatrix('dev_source')
        dev_target = tensor.lmatrix('dev_target')

        # Get training and development set streams
        tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_with_grdTruth(**config)

        # Get cost of the model: stack the source and context
        # representations and masks along a new leading axis.
        sentence_representations_list = encoder.apply(
            source_sentence, source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)
        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list, target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        costs_computer = function(
            context_sentences + context_sentence_masks +
            [target_sentence, target_sentence_mask,
             source_sentence, source_sentence_mask],
            perplexity)
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # Apply dropout for regularization
        if config['dropout'] < 1.0:
            # Dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape,
                                               name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'], config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(sampling_input,
                                         sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        model_name=config['model_name'],
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu (disabled)
        if False:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples,
                              config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq'],
                              n_best=3,
                              track_n_models=6))

        logger.info("Building perplexity validator")
        extensions.append(
            pplValidation(dev_source, dev_target, config=config,
                          model=costs_computer, data_stream=dev_stream,
                          model_name=config['model_name'],
                          every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'ppl':

        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        # Get the development set stream
        # tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_withContext_grdTruth(**config)

        # Get cost of the model
        sentence_representations_list = encoder.apply(
            source_sentence, source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)
        cost = decoder.cost(sentence_representations_list,
                            sentence_masks_list, target_sentence,
                            target_sentence_mask)

        logger.info('Creating computational graph')
        costs_computer = function(
            context_sentences + context_sentence_masks +
            [target_sentence, target_sentence_mask,
             source_sentence, source_sentence_mask],
            cost)

        logger.info("Loading the model..")
        model = Model(cost)
        # loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started Validation: ")
        ts = dev_stream.get_epoch_iterator()
        total_cost = 0.0
        total_tokens = 0.0
        # pbar = ProgressBar(max_value=len(ts)).start()  # modified
        pbar = ProgressBar(max_value=10000).start()
        for i, (ctx_0, ctx_0_mask, ctx_1, ctx_1_mask, ctx_2, ctx_2_mask,
                src, src_mask, trg, trg_mask) in enumerate(ts):
            costs = costs_computer(*[ctx_0, ctx_1, ctx_2,
                                     ctx_0_mask, ctx_1_mask, ctx_2_mask,
                                     trg, trg_mask, src, src_mask])
            cost = costs.sum()
            total_cost += cost
            total_tokens += trg_mask.sum()
            pbar.update(i + 1)
        total_cost /= total_tokens
        pbar.finish()
        # dev_stream.reset()

        # Run afterprocess
        # self.ap.main()
        # Note: the training monitor reports e**cost as 'perplexity' while
        # this branch reports 2**cost; the two bases are inconsistent in
        # the original code.
        total_cost = 2 ** total_cost
        print("Average validation cost: " + str(total_cost))

    elif mode == 'translate':

        logger.info('Creating theano variables')
        context_sentences = []
        context_sentence_masks = []
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)))
            context_sentence_masks.append(
                tensor.matrix('context_' + str(i) + '_mask'))
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}
        config['batch_size'] = 1

        sentence_representations_list = encoder.apply(
            source_sentence, source_sentence_mask)
        sentence_representations_list = \
            sentence_representations_list.dimshuffle(['x', 0, 1, 2])
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1])
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i],
                                    context_sentence_masks[i])
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2])
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0)
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])],
                axis=0)
        generated = decoder.generate(sentence_representations_list,
                                     sentence_masks_list)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        # loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_withContext(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw[3].split()
            seqs = []
            input_ = []
            input_mask = []
            for j in range(config['ctx_num'] + 1):
                seqs.append(sutils._oov_to_unk(
                    line[2 * j][0], config['src_vocab_size'], unk_idx))
                input_mask.append(numpy.tile(line[2 * j + 1][0],
                                             (config['beam_size'], 1)))
                input_.append(numpy.tile(seqs[j],
                                         (config['beam_size'], 1)))
            # v = costs_computer(input_[0])

            # Draw a sample, checking to ensure we don't get an empty
            # string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={
                        source_sentence: input_[3],
                        source_sentence_mask: input_mask[3],
                        context_sentences[0]: input_[0],
                        context_sentence_masks[0]: input_mask[0],
                        context_sentences[1]: input_[1],
                        context_sentence_masks[1]: input_mask[1],
                        context_sentences[2]: input_[2],
                        context_sentence_masks[2]: input_mask[2]},
                    max_length=3 * len(seqs[2]),
                    eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # Normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            b = numpy.argsort(costs)[0]
            # best = numpy.argsort(costs)[0:config['beam_size']]
            # for b in best:
            try:
                total_cost += costs[b]
                trans_out = trans[b]
                totalLen = 4 * len(line[0][0])
                # weight = weights[b][:, :totalLen]
                weight = weights
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'
            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)
        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
{}".format(total_cost)) cPickle.dump(saved_weights, open(config['attention_weights'], 'wb')) ftrans_original.close() ap = afterprocesser(config) ap.main()
def train(step_rule, state_dim, epochs, seed, experiment_path,
          initialization, to_watch, patience, static_mask, batch_size,
          rnn_type, num_layers, augment, seq_len, drop_prob,
          drop_prob_states, drop_prob_cells, drop_prob_igates,
          ogates_zoneout, stoch_depth, share_mask, gaussian_drop,
          weight_noise, norm_cost_coeff, penalty, input_drop, **kwargs):

    print '.. cPTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    def numpy_rng(random_seed=None):
        if random_seed is None:
            random_seed = 1223
        return numpy.random.RandomState(random_seed)

    ###########################################
    #
    # MAKE DATA STREAMS
    #
    ###########################################
    rng = np.random.RandomState(seed)

    if share_mask:
        drop_prob_cells = drop_prob
        # We don't want to actually use these masks, so this is to debug
        drop_prob_states = None

    print '.. initializing iterators'
    if static_mask:
        train_stream = get_static_mask_ptb_stream(
            'train', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, False, augment=augment)
        train_stream_evaluation = get_static_mask_ptb_stream(
            'train', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, True, augment=augment)
        dev_stream = get_static_mask_ptb_stream(
            'valid', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, True, augment=augment)
    else:
        train_stream = get_ptb_stream(
            'train', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, False, augment=augment)
        train_stream_evaluation = get_ptb_stream(
            'train', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, True, augment=augment)
        dev_stream = get_ptb_stream(
            'valid', batch_size, seq_len, drop_prob_states, drop_prob_cells,
            drop_prob_igates, state_dim, True, augment=augment)

    data = train_stream.get_epoch_iterator(as_dict=True).next()
    # import ipdb; ipdb.set_trace()

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'
    x = T.tensor3('features', dtype=floatX)
    x, y = x[:-1], x[1:]
    drops_states = T.tensor3('drops_states')
    drops_cells = T.tensor3('drops_cells')
    drops_igates = T.tensor3('drops_igates')

    x.tag.test_value = data['features']
    # y.tag.test_value = data['outputs']
    drops_states.tag.test_value = data['drops_states']
    drops_cells.tag.test_value = data['drops_cells']
    drops_igates.tag.test_value = data['drops_igates']

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=.2)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hid = Linear(50, state_dim * 4, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutLSTM(dim=state_dim,
                                      weights_init=weights_init,
                                      activation=Tanh(),
                                      model_type=6, name='rnn',
                                      ogates_zoneout=ogates_zoneout)
    elif rnn_type.lower() == 'gru':
        in_to_hid = Linear(50, state_dim * 3, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutGRU(dim=state_dim,
                                     weights_init=weights_init,
                                     activation=Tanh(), name='rnn')
    elif rnn_type.lower() == 'srnn':
        in_to_hid = Linear(50, state_dim, name='in_to_hid',
                           weights_init=weights_init,
                           biases_init=Constant(0.0))
        recurrent_layer = ZoneoutSimpleRecurrent(dim=state_dim,
                                                 weights_init=weights_init,
                                                 activation=Rectifier(),
                                                 name='rnn')
    else:
        raise NotImplementedError

    hid_to_out = Linear(state_dim, 50, name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    in_to_hid.initialize()
    recurrent_layer.initialize()
    hid_to_out.initialize()

    h = in_to_hid.apply(x)
    if rnn_type.lower() == 'lstm':
        yh = recurrent_layer.apply(h, drops_states, drops_cells,
                                   drops_igates)[0]
    else:
        yh = recurrent_layer.apply(h, drops_states, drops_cells,
                                   drops_igates)
    y_hat_pre_softmax = hid_to_out.apply(yh)
    shape_ = y_hat_pre_softmax.shape
    # y_hat = Softmax().apply(
    #     y_hat_pre_softmax.reshape((-1, shape_[-1])))  # .reshape(shape_)

    ###########################################
    #
    # SET UP COSTS, MONITORS, and REGULARIZATION
    #
    ###########################################
    # cost = CategoricalCrossEntropy().apply(
    #     y.flatten().astype('int64'), y_hat)

    def crossentropy_lastaxes(yhat, y):
        # For sequences of distributions/targets
        return -(y * T.log(yhat)).sum(axis=yhat.ndim - 1)

    def softmax_lastaxis(x):
        # For sequences of distributions
        return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape)

    yhat = softmax_lastaxis(y_hat_pre_softmax)
    cross_entropies = crossentropy_lastaxes(yhat, y)
    cross_entropy = cross_entropies.mean().copy(name="cross_entropy")
    cost = cross_entropy.copy(name="cost")

    batch_cost = cost.copy(name='batch_cost')
    nll_cost = cost.copy(name='nll_cost')
    # Named 'bpr' in the original; 'bpc' (bits per character) is presumably
    # what was meant, so the dev monitor logs it as 'dev_bpc'.
    bpc = (nll_cost / np.log(2.0)).copy(name='bpc')
    # nll_cost = aggregation.mean(batch_cost, batch_size).copy(name='nll_cost')

    cost_monitor = aggregation.mean(
        batch_cost, batch_size).copy(name='sequence_cost_monitor')
    cost_per_character = aggregation.mean(
        batch_cost, (seq_len - 1) * batch_size).copy(name='character_cost')
    cost_train = cost.copy(name='train_batch_cost')
    cost_train_monitor = cost_monitor.copy('train_batch_cost_monitor')
    cg_train = ComputationGraph([cost_train, cost_train_monitor])

    ##################
    # NORM STABILIZER
    ##################
    norm_cost = 0.
    def _magnitude(x, axis=-1):
        return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis),
                                numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1]) ** 2, axis=0) / (seq_len - 1))
        ## Debugging-NaNs leftovers:
        # gr = T.grad(norm_cost, cg_train.parameters,
        #             disconnected_inputs='ignore')
        # grf = theano.function([x, input_mask], gr)
        # grz = grf(x.tag.test_value, input_mask.tag.test_value)
        # params = cg_train.parameters
        # mynanz = [(pp, np.sum(gg)) for pp, gg in zip(params, grz)
        #           if np.isnan(np.sum(gg))]
        # for mm in mynanz: print mm
        # import ipdb; ipdb.set_trace()
    elif penalty == 'hids':
        assert 'rnn_apply_states' in [
            o.name
            for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            if output.name == 'rnn_apply_states':
                norms = _magnitude(output)
                norm_cost += T.mean(
                    T.sum((norms[1:] - norms[:-1]) ** 2, axis=0)
                    / (seq_len - 1))

    norm_cost.name = 'norm_cost'
    cost_train += norm_cost_coeff * norm_cost
    cost_train = cost_train.copy('cost_train')
    # Should this be cost_train.outputs[0]?
    cg_train = ComputationGraph([cost_train, cost_train_monitor])  # , norm_cost])

    ##################
    # WEIGHT NOISE
    ##################
    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')
        cost_train_monitor = cg_train.outputs[1].copy(
            'train_batch_cost_monitor')

    # if 'l2regularization' in kwargs:
    #     weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
    #     cost_train += kwargs['l2regularization'] * sum([
    #         (weight ** 2).sum() for weight in weights])
    #     cost_train.name = 'cost_train'
    #     cg_train = ComputationGraph(cost_train)

    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(
        cost_train_monitor,
        (seq_len - 1) * batch_size).copy(name='train_character_cost')

    algorithm = GradientDescent(step_rule=step_rule, cost=cost_train,
                                parameters=cg_train.parameters)

    observed_vars = [cost_train, cost_train_monitor,
                     train_cost_per_character,
                     aggregation.mean(algorithm.total_gradient_norm)]
    # parameters = model.get_parameter_dict()
    # for name, param in parameters.iteritems():
    #     observed_vars.append(param.norm(2).copy(name=name + "_norm"))
    #     observed_vars.append(algorithm.gradients[param].norm(2).copy(
    #         name=name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train", after_epoch=True)

    dev_monitor = DataStreamMonitoring(variables=[nll_cost, bpc],
                                       data_stream=dev_stream, prefix="dev")

    extensions = []
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)
    extensions.append(SaveParams('dev_nll_cost', model, experiment_path,
                                 every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    ###########################################
    #
    # MAIN LOOOOOOOOOOOP
    #
    ###########################################
    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)

    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    # if write_predictions:
    #     with open('predicted.txt', 'w') as f_pred:
    #         with open('targets.txt', 'w') as f_targets:
    #             evaluator = CTCEvaluator(eol_symbol, x, input_mask, y_hat,
    #                                      phoneme_dict, black_list)
    #             evaluator.evaluate(dev_stream, file_pred=f_pred,
    #                                file_targets=f_targets)
    #     return

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
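# --- Hedged aside ---
# SaveParams/SaveLog above are project-specific. A stock-Blocks way to
# checkpoint only when the tracked channel improves (sketch; the path and
# channel name are assumptions based on the monitors above):
from blocks.extensions.predicates import OnLogRecord
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.training import TrackTheBest

checkpoint = Checkpoint('best.tar', after_epoch=False)
checkpoint.add_condition(['after_epoch'],
                         predicate=OnLogRecord('dev_nll_cost_best_so_far'))
best_tracking_extensions = [TrackTheBest('dev_nll_cost'), checkpoint]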
def main(config, tr_stream, dev_stream, use_bokeh=False,
         src_vocab=None, trg_vocab=None):

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: allow the user to remove some params from the graph, for
    # example if embeddings should be kept static
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W ** 2).sum())

    # The cost must be (re)named because summing in the penalty produces a
    # new, unnamed variable; 'decoder_cost_cost' matches the name the
    # decoder brick's cost application gives the original graph, which the
    # monitoring and plotting channels below rely on.
    cost.name = 'decoder_cost_cost'
    cg = ComputationGraph(cost)

    # Apply dropout for regularization
    if config['dropout'] < 1.0:
        # Dropout is applied to the output of maxout in ghog; this is the
        # probability of dropping out, so you probably want to make it <= 0.5
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Allow user to externally initialize some params
    model_params = training_model.get_parameter_dict()
    if config.get('external_embeddings', None) is not None:
        for key in config['external_embeddings']:
            path_to_params = config['external_embeddings'][key]
            logger.info(
                'Replacing {} parameters with external params at: {}'.format(
                    key, path_to_params))
            external_params = numpy.load(path_to_params)
            len_external_idx = external_params.shape[0]
            print(external_params.shape)
            # Working: look in the dictionary and overwrite the correct rows
            existing_params = model_params[key].get_value()
            if key == '/bidirectionalencoder/embeddings.W':
                vocab = src_vocab
            elif key == ('/decoder/sequencegenerator/readout/'
                         'lookupfeedbackwmt15/lookuptable.W'):
                vocab = trg_vocab
            else:
                raise KeyError(
                    'Unknown embedding parameter key: {}'.format(key))
            for k, i in vocab.items():
                if i < len_external_idx:
                    existing_params[i] = external_params[i]
            # model_params_shape = model_params[key].get_value().shape
            # assert (model_params[key].get_value().shape
            #         == external_params.shape), \
            #     ("Parameter dims must not change, shapes {} and {} do "
            #      "not match".format(model_params_shape,
            #                         external_params.shape))
# format(model_params_shape, # external_params.shape)) model_params[key].set_value(existing_params) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) # note that generated containes several different outputs generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling # Note: this is broken for unicode chars #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # WORKING: remove these validators in favor of Async # TODO: implement burn-in in the validation extension (don't fire until we're past the burn-in iteration) # Add early stopping based on bleu # if config.get('bleu_script', None) is not None: # logger.info("Building bleu validator") # extensions.append( # BleuValidator(sampling_input, samples=samples, config=config, # model=search_model, data_stream=dev_stream, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Add early stopping based on Meteor # if config.get('meteor_directory', None) is not None: # logger.info("Building meteor validator") # extensions.append( # MeteorValidator(sampling_input, samples=samples, config=config, # model=search_model, data_stream=dev_stream, # normalize=config['normalized_bleu'], # every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Set up training algorithm logger.info("Initializing training algorithm") # if there is dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # enrich the logged information extensions.extend([ Timing(every_n_batches=100), FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ]) # External non-blocking validation extensions.append( RunExternalValidation(config=config, every_n_batches=config['bleu_val_freq'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[['decoder_cost_cost'], ['validation_set_bleu_score'], ['validation_set_meteor_score']], every_n_batches=1)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, 
algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
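# The two GradientDescent branches above build their step rule with
# eval(config['step_rule'])(), which executes arbitrary strings from the
# config file. A minimal sketch of a safer alternative, assuming the step
# rules named in these configs ship with Blocks (`build_step_rule` is a
# hypothetical helper, not part of Blocks):
from blocks.algorithms import AdaDelta, Adam, RMSProp, Scale

STEP_RULES = {'AdaDelta': AdaDelta, 'Adam': Adam,
              'RMSProp': RMSProp, 'Scale': Scale}

def build_step_rule(name):
    # Fail loudly on unknown names instead of eval-ing them
    if name not in STEP_RULES:
        raise ValueError('Unknown step rule: {}'.format(name))
    return STEP_RULES[name]()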
def train(model, configs): get_streams = configs['get_streams'] save_path = configs['save_path'] num_epochs = configs['num_epochs'] batch_size = configs['batch_size'] lrs = configs['lrs'] until_which_epoch = configs['until_which_epoch'] grad_clipping = configs['grad_clipping'] monitorings = model.monitorings # Training if configs['weight_noise'] > 0: cg = ComputationGraph(model.cost) weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg = apply_noise(cg, weights, configs['weight_noise']) model.cost = cg.outputs[0].copy(name='CE') if configs['l2_reg'] > 0: cg = ComputationGraph(model.cost) weights = VariableFilter(roles=[WEIGHT])(cg.variables) new_cost = model.cost + configs['l2_reg'] * sum([ (weight ** 2).sum() for weight in weights]) model.cost = new_cost.copy(name='CE') blocks_model = Model(model.cost) all_params = blocks_model.parameters print "Number of found parameters:" + str(len(all_params)) print all_params default_lr = np.float32(configs['lrs'][0]) lr_var = theano.shared(default_lr, name="learning_rate") clipping = StepClipping(threshold=np.cast[floatX](grad_clipping)) # sgd_momentum = Momentum( # learning_rate=0.0001, # momentum=0.95) # step_rule = CompositeRule([clipping, sgd_momentum]) adam = Adam(learning_rate=lr_var) step_rule = CompositeRule([clipping, adam]) training_algorithm = GradientDescent( cost=model.cost, parameters=all_params, step_rule=step_rule) monitored_variables = [ lr_var, aggregation.mean(training_algorithm.total_gradient_norm)] + monitorings for param in all_params: name = param.tag.annotations[0].name + "." + param.name to_monitor = training_algorithm.gradients[param].norm(2) to_monitor.name = name + "_grad_norm" monitored_variables.append(to_monitor) to_monitor = param.norm(2) to_monitor.name = name + "_norm" monitored_variables.append(to_monitor) train_data_stream, valid_data_stream = get_streams(batch_size) train_monitoring = TrainingDataMonitoring( variables=monitored_variables, prefix="train", after_epoch=True) valid_monitoring = DataStreamMonitoring( variables=monitored_variables, data_stream=valid_data_stream, prefix="valid", after_epoch=True) main_loop = MainLoop( algorithm=training_algorithm, data_stream=train_data_stream, model=blocks_model, extensions=[ train_monitoring, valid_monitoring, FinishAfter(after_n_epochs=num_epochs), SaveParams('valid_CE', blocks_model, save_path, after_epoch=True), SaveLog(after_epoch=True), ProgressBar(), LRDecay(lr_var, lrs, until_which_epoch, after_epoch=True), Printing(after_epoch=True)]) main_loop.run()
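# `LRDecay`, `SaveParams` and `SaveLog` above are project-specific
# extensions. As an illustration, here is a minimal sketch of the
# learning-rate decay idea built on Blocks' SimpleExtension; the scheduling
# semantics assumed here (use lrs[i] until epoch until_which_epoch[i]) are
# an assumption, not the original implementation:
import numpy as np
from blocks.extensions import SimpleExtension

class SimpleLRDecay(SimpleExtension):
    def __init__(self, lr_var, lrs, until_which_epoch, **kwargs):
        kwargs.setdefault('after_epoch', True)
        super(SimpleLRDecay, self).__init__(**kwargs)
        self.lr_var = lr_var
        self.schedule = list(zip(lrs, until_which_epoch))

    def do(self, which_callback, *args):
        epoch = self.main_loop.log.status['epochs_done']
        for lr, until in self.schedule:
            if epoch < until:
                self.lr_var.set_value(np.float32(lr))
                break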
iteration_scheme=SequentialScheme( data_test.num_examples, batch_size=bs)) learning_rate = 0.0002 n_epochs = 100 algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ])) print('..loading...') load = Load('/home/xuehongyang/checkpoints_open/snapshot_18') predictor = PredictDataStream(data_stream=data_stream_test, output_tensor=result, path='/home/xuehongyang/RESULT_MAIN', before_training=True, after_epoch=False, after_training=False) main_loop = MainLoop( model=Model(cost), data_stream=data_stream_train, algorithm=algorithm, extensions=[Timing(), FinishAfter(after_n_epochs=1), load, predictor]) print('start prediction ...') main_loop.run()
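# `PredictDataStream` above is project code. A rough sketch of the same
# idea - compile a function once, run it over a stream before training, and
# dump the stacked outputs - with assumed names and semantics throughout:
import numpy
import theano
from blocks.extensions import SimpleExtension

class DumpPredictions(SimpleExtension):
    def __init__(self, data_stream, inputs, output_tensor, path, **kwargs):
        kwargs.setdefault('before_training', True)
        super(DumpPredictions, self).__init__(**kwargs)
        self.data_stream = data_stream
        self.path = path
        self.function = theano.function(inputs, output_tensor)

    def do(self, which_callback, *args):
        outputs = [self.function(*batch)
                   for batch in self.data_stream.get_epoch_iterator()]
        numpy.save(self.path, numpy.concatenate(outputs))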
def train(cli_params): fn = 'noname' if 'save_to' in nodefaultargs or not cli_params.get('load_from'): fn = cli_params['save_to'] cli_params['save_dir'] = prepare_dir(fn) nodefaultargs.append('save_dir') logfile = os.path.join(cli_params['save_dir'], 'log.txt') # Log also DEBUG to a file fh = logging.FileHandler(filename=logfile) fh.setLevel(logging.DEBUG) logger.addHandler(fh) logger.info('Logging into %s' % logfile) p, loaded = load_and_log_params(cli_params) in_dim, data, whiten, cnorm = setup_data(p, test_set=False) if not loaded: # Set the zero layer to match input dimensions p.encoder_layers = (in_dim, ) + p.encoder_layers ladder = setup_model(p) # Training all_params = ComputationGraph([ladder.costs.total]).parameters logger.info('Found the following parameters: %s' % str(all_params)) # Fetch all batch normalization updates. They are in the clean path. # you can turn off BN by setting is_normalizing = False in ladder.py bn_updates = ComputationGraph([ladder.costs.class_clean]).updates assert not bn_updates or 'counter' in [u.name for u in bn_updates.keys()], \ 'No batch norm params in graph - the graph has been cut?' training_algorithm = GradientDescent( cost=ladder.costs.total, parameters=all_params, step_rule=Adam(learning_rate=ladder.lr)) # In addition to actual training, also do BN variable approximations if bn_updates: training_algorithm.add_updates(bn_updates) short_prints = { "train": OrderedDict([ ('T_E', ladder.error.clean), ('T_O', ladder.oos.clean), ('T_C_class', ladder.costs.class_corr), ('T_C_de', ladder.costs.denois.values()), ('T_T', ladder.costs.total), ]), "valid_approx": OrderedDict([ ('V_C_class', ladder.costs.class_clean), ('V_E', ladder.error.clean), ('V_O', ladder.oos.clean), ('V_C_de', ladder.costs.denois.values()), ('V_T', ladder.costs.total), ]), "valid_final": OrderedDict([ ('VF_C_class', ladder.costs.class_clean), ('VF_E', ladder.error.clean), ('VF_O', ladder.oos.clean), ('VF_C_de', ladder.costs.denois.values()), ('V_T', ladder.costs.total), ]), } if len(data.valid_ind): main_loop = MainLoop( training_algorithm, # Datastream used for training make_datastream(data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, n_unlabeled=p.unlabeled_samples, whiten=whiten, cnorm=cnorm, balanced_classes=p.balanced_classes, dseed=p.dseed), model=Model(ladder.costs.total), extensions=[ FinishAfter(after_n_epochs=p.num_epochs), # This will estimate the validation error using # running average estimates of the batch normalization # parameters, mean and variance ApproxTestMonitoring([ ladder.costs.class_clean, ladder.error.clean, ladder.oos.clean, ladder.costs.total ] + ladder.costs.denois.values(), make_datastream( data.valid, data.valid_ind, p.valid_batch_size, whiten=whiten, cnorm=cnorm, balanced_classes=p.balanced_classes, scheme=ShuffledScheme), prefix="valid_approx"), # This Monitor is slower, but more accurate since it will first # estimate batch normalization parameters from training data and # then do another pass to calculate the validation error. 
FinalTestMonitoring( [ ladder.costs.class_clean, ladder.error.clean, ladder.oos.clean, ladder.costs.total ] + ladder.costs.denois.values(), make_datastream(data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, whiten=whiten, cnorm=cnorm, balanced_classes=p.balanced_classes, scheme=ShuffledScheme), make_datastream(data.valid, data.valid_ind, p.valid_batch_size, n_labeled=len(data.valid_ind), whiten=whiten, cnorm=cnorm, balanced_classes=p.balanced_classes, scheme=ShuffledScheme), prefix="valid_final", after_n_epochs=p.num_epochs, after_training=True), TrainingDataMonitoring([ ladder.error.clean, ladder.oos.clean, ladder.costs.total, ladder.costs.class_corr, training_algorithm.total_gradient_norm ] + ladder.costs.denois.values(), prefix="train", after_epoch=True), # ladder.costs.class_clean - save model whenever we have best validation result another option `('train',ladder.costs.total)` SaveParams(('valid_approx', ladder.error.clean), all_params, p.save_dir, after_epoch=True), SaveExpParams(p, p.save_dir, before_training=True), SaveLog(p.save_dir, after_training=True), ShortPrinting(short_prints), LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs, lrmin=p.lrmin, after_epoch=True), ]) else: main_loop = MainLoop( training_algorithm, # Datastream used for training make_datastream(data.train, data.train_ind, p.batch_size, n_labeled=p.labeled_samples, n_unlabeled=p.unlabeled_samples, whiten=whiten, cnorm=cnorm, balanced_classes=p.balanced_classes, dseed=p.dseed), model=Model(ladder.costs.total), extensions=[ FinishAfter(after_n_epochs=p.num_epochs), TrainingDataMonitoring([ ladder.error.clean, ladder.oos.clean, ladder.costs.total, ladder.costs.class_corr, training_algorithm.total_gradient_norm ] + ladder.costs.denois.values(), prefix="train", after_epoch=True), # ladder.costs.class_clean - save model whenever we have best validation result another option `('train',ladder.costs.total)` SaveParams(('train', ladder.error.clean), all_params, p.save_dir, after_epoch=True), SaveExpParams(p, p.save_dir, before_training=True), SaveLog(p.save_dir, after_training=True), ShortPrinting(short_prints), LRDecay(ladder.lr, p.num_epochs * p.lrate_decay, p.num_epochs, lrmin=p.lrmin, after_epoch=True), ]) main_loop.run() # Get results if len(data.valid_ind) == 0: return None df = DataFrame.from_dict(main_loop.log, orient='index') col = 'valid_final_error_rate_clean' logger.info('%s %g' % (col, df[col].iloc[-1])) if main_loop.log.status['epoch_interrupt_received']: return None return df
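# The pandas conversion at the end of train() is useful on its own; a small
# usage sketch (assumes a finished `main_loop` and the monitor prefixes
# defined above):
from pandas import DataFrame

def last_value(main_loop, col='valid_final_error_rate_clean'):
    df = DataFrame.from_dict(main_loop.log, orient='index')
    # Channels are absent (NaN) on iterations where the monitor did not fire
    return df[col].dropna().iloc[-1]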
def main(name, epochs, batch_size, learning_rate, attention, n_iter, enc_dim, dec_dim, z_dim, oldmodel, image_size): datasource = name if datasource == 'mnist': if image_size is not None: raise Exception('image size for data source %s is pre configured' % datasource) image_size = 28 else: if image_size is None: raise Exception('Undefined image size for data source %s' % datasource) x_dim = image_size * image_size img_height = img_width = image_size rnninits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } inits = { #'weights_init': Orthogonal(), 'weights_init': IsotropicGaussian(0.01), 'biases_init': Constant(0.), } if attention != "": read_N, write_N = attention.split(',') read_N = int(read_N) write_N = int(write_N) read_dim = 2 * read_N**2 reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim, width=img_width, height=img_height, N=read_N, **inits) writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim, width=img_width, height=img_height, N=write_N, **inits) attention_tag = "r%d-w%d" % (read_N, write_N) else: read_dim = 2 * x_dim reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits) writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits) attention_tag = "full" #---------------------------------------------------------------------- # Learning rate def lr_tag(value): """ Convert a float into a short tag-usable string representation. E.g.: 0.1 -> 11 0.01 -> 12 0.001 -> 13 0.005 -> 53 """ exp = np.floor(np.log10(value)) leading = ("%e" % value)[0] return "%s%d" % (leading, -exp) lr_str = lr_tag(learning_rate) name = "%s-%s-t%d-enc%d-dec%d-z%d-lr%s" % (name, attention_tag, n_iter, enc_dim, dec_dim, z_dim, lr_str) print("\nRunning experiment %s" % name) print(" learning rate: %g" % learning_rate) print(" attention: %s" % attention) print(" n_iterations: %d" % n_iter) print(" encoder dimension: %d" % enc_dim) print(" z dimension: %d" % z_dim) print(" decoder dimension: %d" % dec_dim) print(" batch size: %d" % batch_size) print() #---------------------------------------------------------------------- encoder_rnn = LSTM(dim=enc_dim, name="RNN_enc", **rnninits) decoder_rnn = LSTM(dim=dec_dim, name="RNN_dec", **rnninits) encoder_mlp = MLP([Identity()], [(read_dim + dec_dim), 4 * enc_dim], name="MLP_enc", **inits) decoder_mlp = MLP([Identity()], [z_dim, 4 * dec_dim], name="MLP_dec", **inits) q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits) draw = DrawModel(n_iter, reader=reader, encoder_mlp=encoder_mlp, encoder_rnn=encoder_rnn, sampler=q_sampler, decoder_mlp=decoder_mlp, decoder_rnn=decoder_rnn, writer=writer) draw.initialize() #------------------------------------------------------------------------ x = tensor.matrix('features') #x_recons = 1. 
+ x x_recons, kl_terms = draw.reconstruct(x) #x_recons, _, _, _, _ = draw.silly(x, n_steps=10, batch_size=100) #x_recons = x_recons[-1,:,:] #samples = draw.sample(100) #x_recons = samples[-1, :, :] #x_recons = samples[-1, :, :] recons_term = BinaryCrossEntropy().apply(x, x_recons) recons_term.name = "recons_term" cost = recons_term + kl_terms.sum(axis=0).mean() cost.name = "nll_bound" #------------------------------------------------------------ cg = ComputationGraph([cost]) params = VariableFilter(roles=[PARAMETER])(cg.variables) algorithm = GradientDescent( cost=cost, params=params, step_rule=CompositeRule([ StepClipping(10.), Adam(learning_rate), ]) #step_rule=RMSProp(learning_rate), #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95) ) #algorithm.add_updates(scan_updates) #------------------------------------------------------------------------ # Setup monitors monitors = [cost] for t in range(n_iter): kl_term_t = kl_terms[t, :].mean() kl_term_t.name = "kl_term_%d" % t #x_recons_t = T.nnet.sigmoid(c[t,:,:]) #recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t) #recons_term_t = recons_term_t.mean() #recons_term_t.name = "recons_term_%d" % t monitors += [kl_term_t] train_monitors = monitors[:] train_monitors += [aggregation.mean(algorithm.total_gradient_norm)] train_monitors += [aggregation.mean(algorithm.total_step_norm)] # Live plotting... plot_channels = [ ["train_nll_bound", "test_nll_bound"], ["train_kl_term_%d" % t for t in range(n_iter)], #["train_recons_term_%d" % t for t in range(n_iter)], ["train_total_gradient_norm", "train_total_step_norm"] ] #------------------------------------------------------------ if datasource == 'mnist': train_ds = BinarizedMNIST("train", sources=['features'], flatten=['features']) test_ds = BinarizedMNIST("test", sources=['features'], flatten=['features']) else: datasource_fname = os.path.join(fuel.config.data_path, datasource, datasource + '.hdf5') train_ds = H5PYDataset(datasource_fname, which_set='train', sources=['features'], flatten=['features']) test_ds = H5PYDataset(datasource_fname, which_set='test', sources=['features'], flatten=['features']) train_stream = DataStream(train_ds, iteration_scheme=SequentialScheme( train_ds.num_examples, batch_size)) test_stream = DataStream(test_ds, iteration_scheme=SequentialScheme( test_ds.num_examples, batch_size)) main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=[ Timing(), FinishAfter(after_n_epochs=epochs), TrainingDataMonitoring(train_monitors, prefix="train", after_epoch=True), # DataStreamMonitoring( # monitors, # valid_stream, ## updates=scan_updates, # prefix="valid"), DataStreamMonitoring( monitors, test_stream, # updates=scan_updates, prefix="test"), MyCheckpoint(image_size=image_size, path=name + ".pkl", before_training=False, after_epoch=True, save_separately=['log', 'model']), #Dump(name), # Plot(name, channels=plot_channels), ProgressBar(), Printing() ]) if oldmodel is not None: print("Initializing parameters with old model %s" % oldmodel) with open(oldmodel, "rb") as f: oldmodel = pickle.load(f) main_loop.model.set_param_values(oldmodel.get_param_values()) del oldmodel main_loop.run()
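# Blocks monitors identify channels by the Theano variable's name, which is
# why recons_term, cost, and every kl_term_t are explicitly named above. A
# tiny self-contained illustration of the convention:
from theano import tensor

z = tensor.matrix('z')
kl_term_0 = (z ** 2).mean()
kl_term_0.name = 'kl_term_0'
# TrainingDataMonitoring with prefix="train" then logs 'train_kl_term_0',
# matching the plot channel names constructed earlier.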
def main(model, cost, config, tr_stream, dev_stream, use_bokeh=False): # Set the parameters from a trained models (.npz file) logger.info("Loading parameters from model: {}".format( exp_config['saved_parameters'])) # Note the brick delimeter='-' is here for legacy reasons because blocks changed the serialization API param_values = LoadNMT.load_parameter_values( exp_config['saved_parameters'], brick_delimiter=exp_config.get('brick_delimiter', None)) LoadNMT.set_model_parameters(model, param_values) logger.info('Creating computational graph') cg = ComputationGraph(cost) # GRAPH TRANSFORMATIONS FOR BETTER TRAINING if config.get('l2_regularization', False) is True: l2_reg_alpha = config['l2_regularization_alpha'] logger.info( 'Applying l2 regularization with alpha={}'.format(l2_reg_alpha)) model_weights = VariableFilter(roles=[WEIGHT])(cg.variables) for W in model_weights: cost = cost + (l2_reg_alpha * (W**2).sum()) # why do we need to rename the cost variable? Where did the original name come from? cost.name = 'decoder_cost_cost' cg = ComputationGraph(cost) # apply dropout for regularization # Note dropout variables are hard-coded here if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) # TODO: mv the actual config file once we switch to .yaml for min-risk # shutil.copy(config['config_file'], config['saveto']) # shutil.copy(config['config_file'], config['saveto']) # TODO: this breaks when we directly reference a class in the config obj instead of using reflection with codecs.open(os.path.join(config['saveto'], 'config.yaml'), 'w', encoding='utf8') as yaml_out: yaml_out.write(yaml.dump(config)) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary # TODO: change the if statement here if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = train_encoder.apply( theano_sampling_source_input, tensor.ones(theano_sampling_source_input.shape)) # TODO: the generated output actually contains several more values, ipdb to see what they are generated = train_decoder.generate(theano_sampling_source_input, sampling_representation, theano_sampling_context_input) search_model = Model(generated) _, samples = VariableFilter( bricks=[train_decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling -- TODO: sampling is broken for min-risk #if config['hook_samples'] >= 1: # logger.info("Building sampler") # extensions.append( # Sampler(model=search_model, data_stream=tr_stream, # hook_samples=config['hook_samples'], # every_n_batches=config['sampling_freq'], # src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu # TODO: use multimodal meteor and BLEU validator # Add early stopping based on bleu if config.get('bleu_script', None) 
is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=src_vocab, trg_vocab=trg_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Add early stopping based on Meteor if config.get('meteor_directory', None) is not None: logger.info("Building meteor validator") extensions.append( MeteorValidator(theano_sampling_source_input, theano_sampling_context_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=src_vocab, trg_vocab=trg_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[[ 'decoder_cost_cost', 'validation_set_bleu_score', 'validation_set_meteor_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # if there is l2_regularization, dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0: algorithm = GradientDescent(cost=cg.outputs[0], parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='warn') else: algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='warn') #algorithm = GradientDescent( # cost=cost, parameters=cg.parameters, # step_rule=CompositeRule([StepClipping(config['step_clipping']), # eval(config['step_rule'])()], # ), # on_unused_sources='warn' #) # enrich the logged information extensions.append(Timing(every_n_batches=100)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
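# The warm-start done at the top of this function, isolated as a sketch for
# clarity (LoadNMT is this project's extension, so its import path is
# assumed; brick_delimiter bridges the older Blocks parameter-name
# serialization mentioned in the comment above):
def warm_start(model, params_path, brick_delimiter=None):
    param_values = LoadNMT.load_parameter_values(
        params_path, brick_delimiter=brick_delimiter)
    LoadNMT.set_model_parameters(model, param_values)
    return model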
def run(discriminative_regularization=True): streams = create_celeba_streams(training_batch_size=100, monitoring_batch_size=500, include_targets=False) main_loop_stream, train_monitor_stream, valid_monitor_stream = streams[:3] # Compute parameter updates for the batch normalization population # statistics. They are updated following an exponential moving average. rval = create_training_computation_graphs(discriminative_regularization) cg, bn_cg, variance_parameters = rval pop_updates = list( set(get_batch_normalization_updates(bn_cg, allow_duplicates=True))) decay_rate = 0.05 extra_updates = [(p, m * decay_rate + p * (1 - decay_rate)) for p, m in pop_updates] model = Model(bn_cg.outputs[0]) selector = Selector( find_bricks( model.top_bricks, lambda brick: brick.name in ('encoder_convnet', 'encoder_mlp', 'decoder_convnet', 'decoder_mlp' ))) parameters = list(selector.get_parameters().values()) + variance_parameters # Prepare algorithm step_rule = Adam() algorithm = GradientDescent(cost=bn_cg.outputs[0], parameters=parameters, step_rule=step_rule) algorithm.add_updates(extra_updates) # Prepare monitoring monitored_quantities_list = [] for graph in [bn_cg, cg]: cost, kl_term, reconstruction_term = graph.outputs cost.name = 'nll_upper_bound' avg_kl_term = kl_term.mean(axis=0) avg_kl_term.name = 'avg_kl_term' avg_reconstruction_term = -reconstruction_term.mean(axis=0) avg_reconstruction_term.name = 'avg_reconstruction_term' monitored_quantities_list.append( [cost, avg_kl_term, avg_reconstruction_term]) train_monitoring = DataStreamMonitoring(monitored_quantities_list[0], train_monitor_stream, prefix="train", updates=extra_updates, after_epoch=False, before_first_epoch=False, every_n_epochs=5) valid_monitoring = DataStreamMonitoring(monitored_quantities_list[1], valid_monitor_stream, prefix="valid", after_epoch=False, before_first_epoch=False, every_n_epochs=5) # Prepare checkpoint save_path = 'celeba_vae_{}regularization.zip'.format( '' if discriminative_regularization else 'no_') checkpoint = Checkpoint(save_path, every_n_epochs=5, use_cpickle=True) extensions = [ Timing(), FinishAfter(after_n_epochs=75), train_monitoring, valid_monitoring, checkpoint, Printing(), ProgressBar() ] main_loop = MainLoop(data_stream=main_loop_stream, algorithm=algorithm, extensions=extensions) main_loop.run()
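# The population-statistics update above is an exponential moving average,
# new_p = decay_rate * batch_stat + (1 - decay_rate) * old_p. A toy
# illustration of the same recurrence on plain floats:
decay_rate = 0.05
pop_mean = 0.0
for batch_mean in (1.0, 1.0, 1.0):
    pop_mean = decay_rate * batch_mean + (1 - decay_rate) * pop_mean
# pop_mean creeps toward the batch statistic at rate decay_rate per update.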
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=500, num_batches=None): if feature_maps is None: feature_maps = [20, 50] if mlp_hiddens is None: mlp_hiddens = [500] if conv_sizes is None: conv_sizes = [5, 5] if pool_sizes is None: pool_sizes = [2, 2] image_size = (32, 23) batch_size = 50 output_size = 2 learningRate = 0.1 num_epochs = 10 num_batches = None # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 3, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('image_features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) cost = (CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) cg = ComputationGraph([cost, error_rate]) ########### Loading images ##################### from fuel.datasets.dogs_vs_cats import DogsVsCats from fuel.streams import DataStream, ServerDataStream from fuel.schemes import ShuffledScheme from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation from fuel.transformers import Flatten, Cast, ScaleAndShift def create_data(data): stream = DataStream.default_stream(data, iteration_scheme=ShuffledScheme( data.num_examples, batch_size)) stream_downscale = MinimumImageDimensions( stream, image_size, which_sources=('image_features', )) #stream_rotate = Random2DRotation(stream_downscale, which_sources=('image_features',)) stream_max = ScikitResize(stream_downscale, image_size, which_sources=('image_features', )) stream_scale = ScaleAndShift(stream_max, 1. / 255, 0, which_sources=('image_features', )) stream_cast = Cast(stream_scale, dtype='float32', which_sources=('image_features', )) #stream_flat = Flatten(stream_scale, which_sources=('image_features',)) return stream_cast stream_data_train = create_data( DogsVsCats(('train', ), subset=slice(0, 20000))) stream_data_test = create_data( DogsVsCats(('train', ), subset=slice(20000, 25000))) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=learningRate)) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. 
extensions = [] extensions.append(Timing()) extensions.append( FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches)) extensions.append( DataStreamMonitoring([cost, error_rate], stream_data_test, prefix="valid")) extensions.append( TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True)) extensions.append(Checkpoint(save_to)) extensions.append(ProgressBar()) extensions.append(Printing()) model = Model(cost) ########### Loading images ##################### main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
channels, prefix="%s_%s" % (which_set, situation), after_epoch=True, data_stream=get_stream(which_set=which_set, batch_size=args.batch_size)))#, num_examples=1000))) for situation in "inference".split(): # add inference for which_set in "valid test".split(): logger.warning("constructing %s %s monitor" % (which_set, situation)) channels = list(graphs[situation].outputs) extensions.append(DataStreamMonitoring( channels, prefix="%s_%s" % (which_set, situation), after_epoch=True, data_stream=get_stream(which_set=which_set, batch_size=args.batch_size)))#, num_examples=1000))) extensions.extend([ TrackTheBest("valid_training_error_rate", "best_valid_training_error_rate"), DumpBest("best_valid_training_error_rate", "best.zip"), FinishAfter(after_n_epochs=args.num_epochs), #FinishIfNoImprovementAfter("best_valid_error_rate", epochs=50), Checkpoint("checkpoint.zip", on_interrupt=False, every_n_epochs=1, use_cpickle=True), DumpLog("log.pkl", after_epoch=True)]) if not args.cluster: extensions.append(ProgressBar()) extensions.extend([ Timing(), Printing(), PrintingTo("log"), ]) main_loop = MainLoop( data_stream=get_stream(which_set="train", batch_size=args.batch_size), algorithm=algorithm, extensions=extensions, model=model)
def create_main_loop(save_to, num_epochs, unit_order=None, batch_size=500, num_batches=None): image_size = (28, 28) output_size = 10 convnet = create_lenet_5() x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) case_costs = CasewiseCrossEntropy().apply(y.flatten(), probs) cost = case_costs.mean().copy(name='cost') error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) cg = ComputationGraph([cost, error_rate, case_costs]) # Apply regularization to the cost weights = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + sum([0.0003 * (W**2).sum() for W in weights]) cost.name = 'cost_with_regularization' mnist_train = MNIST(("train", )) mnist_train_stream = DataStream.default_stream( mnist_train, iteration_scheme=ShuffledScheme(mnist_train.num_examples, batch_size)) mnist_test = MNIST(("test", )) mnist_test_stream = DataStream.default_stream( mnist_test, iteration_scheme=ShuffledScheme(mnist_test.num_examples, batch_size)) # Generate pics for biases biases = VariableFilter(roles=[BIAS])(cg.parameters) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=AdaDelta()) synpic_extension = SynpicExtension(synpic_parameters=biases, case_costs=case_costs, case_labels=y, pics=x, batch_size=batch_size, pic_size=image_size, label_count=output_size, after_batch=True) # Impose an orderint for the SaveImages extension if unit_order is not None: with open(unit_order, 'rb') as handle: histograms = pickle.load(handle) unit_order = compute_unit_order(histograms) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [ Timing(), FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches), synpic_extension, SaveImages(picsources=[synpic_extension], title="LeNet-5: batch {i}, " + "cost {cost_with_regularization:.2f}, " + "trainerr {error_rate:.3f}", data=[cost, error_rate], graph='error_rate', graph_len=500, unit_order=unit_order, after_batch=True), DataStreamMonitoring([cost, error_rate], mnist_test_stream, prefix="test"), TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True), Checkpoint(save_to), ProgressBar(), Printing() ] model = Model(cost) main_loop = MainLoop(algorithm, mnist_train_stream, model=model, extensions=extensions) main_loop.synpic = synpic_extension return main_loop
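# Usage sketch for the factory above (file names are illustrative):
if __name__ == '__main__':
    loop = create_main_loop(save_to='lenet5_checkpoint.tar', num_epochs=5)
    loop.run()
    # The attached extension stays reachable for post-hoc inspection:
    print(loop.synpic)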
def main(config, tr_stream, dev_stream, source_vocab, target_vocab, use_bokeh=False): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') # Note that the _names_ are changed from normal NMT # for IMT training, we use only the suffix as the reference target_sentence = tensor.lmatrix('target_suffix') target_sentence_mask = tensor.matrix('target_suffix_mask') # TODO: change names back to *_suffix, there is currently a theano function name error # TODO: in the GradientDescent Algorithm target_prefix = tensor.lmatrix('target_prefix') target_prefix_mask = tensor.matrix('target_prefix_mask') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = NMTPrefixDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, loss_function='cross_entropy') # rename to match baseline NMT systems decoder.name = 'decoder' # TODO: change the name of `target_sentence` to `target_suffix` for more clarity cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask, target_prefix, target_prefix_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # INITIALIZATION logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog # this is the probability of dropping out, so you probably want to make it <=0.5 logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) trainable_params = cg.parameters # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W'] # trainable_params.remove(source_embeddings) # trainable_params.remove(target_embeddings) # TODO: fixed dropout mask for recurrent params? 
# Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # create the training directory, and copy this config there if directory doesn't exist if not os.path.isdir(config['saveto']): os.makedirs(config['saveto']) shutil.copy(config['config_file'], config['saveto']) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring(trainable_params, after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up the sampling graph for validation during training # Theano variables for the sampling graph sampling_vars = load_params_and_get_beam_search(config, encoder=encoder, decoder=decoder) beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab=source_vocab, trg_vocab=target_vocab, src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, sampling_prefix, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # TODO: add first-word accuracy validation # TODO: add IMT meteor early stopping if config.get('imt_f1_validation', None) is not None: logger.info("Building imt F1 validator") extensions.append( IMT_F1_Validator(sampling_input, sampling_prefix, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_vocab=source_vocab, trg_vocab=target_vocab, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot(config['model_save_directory'], channels=[['decoder_cost_cost'], [ 'validation_set_bleu_score', 'validation_set_imt_f1_score' ]], every_n_batches=10)) # Set up training algorithm logger.info("Initializing training algorithm") # WORKING: implement confidence model # if there is dropout or random noise, we need to use the output of the modified graph if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0: algorithm = GradientDescent( cost=cg.outputs[0], parameters=trainable_params, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]), on_unused_sources='warn') else: algorithm = 
GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ]), on_unused_sources='warn') # END WORKING: implement confidence model # enrich the logged information extensions.append(Timing(every_n_batches=100)) # for i, (k,v) in enumerate(algorithm.updates): # v.name = k.name + '_{}'.format(i) # # aux_vars = [v for v in cg.auxiliary_variables[-3:]] # import ipdb; ipdb.set_trace() extensions.extend([ TrainingDataMonitoring([cost], after_batch=True), # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True), # TrainingDataMonitoring(aux_vars, after_batch=True), TrainingDataMonitoring(trainable_params, after_batch=True), Printing(after_batch=True) ]) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
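# The commented-out lines before the parameter printout sketch freezing the
# embedding parameters; spelled out as a helper (the parameter paths depend
# on the brick hierarchy and are illustrative):
def exclude_parameters(cg_parameters, model, frozen_keys):
    param_dict = model.get_parameter_dict()
    frozen = set(param_dict[k] for k in frozen_keys if k in param_dict)
    return [p for p in cg_parameters if p not in frozen]

# trainable_params = exclude_parameters(
#     cg.parameters, training_model,
#     ['/bidirectionalencoder/embeddings.W'])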
thisRule = Scale(learning_rate=learning_rate)
if "gradientClipping" in config:
    threshold = float(config["gradientClipping"])
    print "using gradient clipping with threshold ", threshold
    thisRule = CompositeRule([StepClipping(threshold), thisRule])
# step_rule=CompositeRule([StepClipping(config['step_clipping']),
#                          eval(config['step_rule'])()])
algorithm = GradientDescent(cost=cost, parameters=params,
                            step_rule=thisRule, on_unused_sources='warn')

extensions = []
if "saveModel" in config:
    print "saving model after training"
    extensions.append(Checkpoint(path=networkfile, after_n_epochs=n_epochs))
extensions.append(FinishAfter(after_n_epochs=n_epochs))
extensions.append(F1Extension(layer3=layer3, y=y, model=model,
                              data_stream=data_stream_test,
                              num_samples=numSamplesTest,
                              batch_size=1, every_n_epochs=1))
extensions.append(ModelResults(layer3=layer3, y=y, model=model,
                               data_stream=data_stream_test,
                               num_samples=numSamplesTest,
                               batch_size=1, after_n_epochs=n_epochs))
my_loop = MainLoop(model=model, data_stream=data_stream,
                   algorithm=algorithm, extensions=extensions)
time1 = time.time()
my_loop.run()
time2 = time.time()
print "time for training network: " + str(time2 - time1)
def main(save_to, num_epochs, regularization=0.001, subset=None, num_batches=None, batch_size=None, histogram=None, resume=False): output_size = 10 convnet = create_all_conv_net() x = tensor.tensor4('features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) test_cost = (CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')) test_components = (ComponentwiseCrossEntropy().apply( y.flatten(), probs).copy(name='components')) test_error_rate = (MisclassificationRate().apply( y.flatten(), probs).copy(name='error_rate')) test_confusion = (ConfusionMatrix().apply(y.flatten(), probs).copy(name='confusion')) test_confusion.tag.aggregation_scheme = Sum(test_confusion) test_cg = ComputationGraph([test_cost, test_error_rate, test_components]) # Apply dropout to all layer outputs except final softmax dropout_vars = VariableFilter( roles=[OUTPUT], bricks=[Convolutional], theano_name_regex="^conv_[25]_apply_output$")(test_cg.variables) drop_cg = apply_dropout(test_cg, dropout_vars, 0.5) # Apply 0.2 dropout to the pre-averaging layer # dropout_vars_2 = VariableFilter( # roles=[OUTPUT], bricks=[Convolutional], # theano_name_regex="^conv_8_apply_output$")(drop_cg.variables) # train_cg = apply_dropout(drop_cg, dropout_vars_2, 0.2) # Apply 0.2 dropout to the input, as in the paper # train_cg = apply_dropout(drop_cg, [x], 0.2) train_cg = drop_cg # train_cg = test_cg train_cost, train_error_rate, train_components = train_cg.outputs # Apply regularization to the cost biases = VariableFilter(roles=[BIAS])(train_cg.parameters) weights = VariableFilter(roles=[WEIGHT])(train_cg.variables) l2_norm = sum([(W**2).sum() for W in weights]) l2_norm.name = 'l2_norm' l2_regularization = regularization * l2_norm l2_regularization.name = 'l2_regularization' test_cost = test_cost + l2_regularization test_cost.name = 'cost_with_regularization' # Training version of cost train_cost_without_regularization = train_cost train_cost_without_regularization.name = 'cost_without_regularization' train_cost = train_cost + regularization * l2_norm train_cost.name = 'cost_with_regularization' cifar10_train = CIFAR10(("train", )) #cifar10_train_stream = RandomPadCropFlip( # NormalizeBatchLevels(DataStream.default_stream( # cifar10_train, iteration_scheme=ShuffledScheme( # cifar10_train.num_examples, batch_size)), # which_sources=('features',)), # (32, 32), pad=5, which_sources=('features',)) cifar10_train_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_train, iteration_scheme=ShuffledScheme(cifar10_train.num_examples, batch_size)), which_sources=('features', )) test_batch_size = 1000 cifar10_test = CIFAR10(("test", )) cifar10_test_stream = NormalizeBatchLevels(DataStream.default_stream( cifar10_test, iteration_scheme=ShuffledScheme(cifar10_test.num_examples, test_batch_size)), which_sources=('features', )) momentum = Momentum(0.002, 0.9) # Create a step rule that doubles the learning rate of biases, like Caffe. # scale_bias = Restrict(Scale(2), biases) # step_rule = CompositeRule([scale_bias, momentum]) # step_rule = CompositeRule([StepClipping(100), momentum]) step_rule = momentum # Train with simple SGD algorithm = GradientDescent(cost=train_cost, parameters=train_cg.parameters, step_rule=step_rule) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. 
    extensions = [
        Timing(),
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches),
        EpochSchedule(momentum.learning_rate,
                      [(1, 0.005), (3, 0.01), (5, 0.02),
                       (200, 0.002), (250, 0.0002), (300, 0.00002)]),
        DataStreamMonitoring([test_cost, test_error_rate, test_confusion],
                             cifar10_test_stream, prefix="test"),
        TrainingDataMonitoring([
            train_cost, train_error_rate, train_cost_without_regularization,
            l2_regularization, momentum.learning_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ], prefix="train", every_n_batches=10),  # after_epoch=True),
        Plot('Training performance for ' + save_to,
             channels=[
                 ['train_cost_with_regularization',
                  'train_cost_without_regularization',
                  'train_l2_regularization'],
                 ['train_error_rate'],
                 ['train_total_gradient_norm'],
             ], every_n_batches=10),  # after_batch=True),
        Plot('Test performance for ' + save_to,
             channels=[['train_error_rate', 'test_error_rate']],
             after_epoch=True),
        Checkpoint(save_to),
        ProgressBar(),
        Printing()
    ]
    if histogram:
        attribution = AttributionExtension(components=train_components,
                                           parameters=train_cg.parameters,
                                           components_size=output_size,
                                           after_batch=True)
        extensions.insert(0, attribution)
    if resume:
        extensions.append(Load(save_to, True, True))
    model = Model(train_cost)
    main_loop = MainLoop(algorithm, cifar10_train_stream, model=model,
                         extensions=extensions)
    main_loop.run()
    if histogram:
        save_attributions(attribution, filename=histogram)
    with open('execution-log.json', 'w') as outfile:
        json.dump(main_loop.log, outfile, cls=NumpyEncoder)
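# The commented-out Caffe-style rule in main() above can be spelled out with
# Blocks' Restrict, which limits a step rule to a subset of parameters
# (a sketch; `biases` is the list computed earlier in main()):
from blocks.algorithms import CompositeRule, Momentum, Restrict, Scale

def caffe_style_rule(biases, base_lr=0.002, momentum_coeff=0.9):
    scale_bias = Restrict(Scale(2.0), biases)  # biases learn twice as fast
    return CompositeRule([scale_bias, Momentum(base_lr, momentum_coeff)])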
def run(epochs=1, corpus="data/", HIDDEN_DIMS=100, path="./"): brown = BrownDataset(corpus) INPUT_DIMS = brown.get_vocabulary_size() OUTPUT_DIMS = brown.get_vocabulary_size() # These are theano variables x = tensor.lmatrix('context') y = tensor.ivector('output') # Construct the graph input_to_hidden = LookupTable(name='input_to_hidden', length=INPUT_DIMS, dim=HIDDEN_DIMS) # Compute the weight matrix for every word in the context and then compute # the average. h = tensor.mean(input_to_hidden.apply(x), axis=1) hidden_to_output = Linear(name='hidden_to_output', input_dim=HIDDEN_DIMS, output_dim=OUTPUT_DIMS) y_hat = Softmax().apply(hidden_to_output.apply(h)) # And initialize with random varibales and set the bias vector to 0 weights = IsotropicGaussian(0.01) input_to_hidden.weights_init = hidden_to_output.weights_init = weights input_to_hidden.biases_init = hidden_to_output.biases_init = Constant(0) input_to_hidden.initialize() hidden_to_output.initialize() # And now the cost function cost = CategoricalCrossEntropy().apply(y, y_hat) cg = ComputationGraph(cost) W1, W2 = VariableFilter(roles=[WEIGHT])(cg.variables) cost = cost + 0.01 * (W1 ** 2).sum() + 0.01 * (W2 ** 2).sum() cost.name = 'cost_with_regularization' mini_batch = SequentialScheme(brown.num_instances(), 512) data_stream = DataStream.default_stream(brown, iteration_scheme=mini_batch) # Now we tie up lose ends and construct the algorithm for the training # and define what happens in the main loop. algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=0.1)) extensions = [ ProgressBar(), FinishAfter(after_n_epochs=epochs), Printing(), # TrainingDataMonitoring(variables=[cost]), SaveWeights(layers=[input_to_hidden, hidden_to_output], prefixes=['%sfirst' % path, '%ssecond' % path]), # Plot( # 'Word Embeddings', # channels=[ # [ # 'cost_with_regularization' # ] # ]) ] logger.info("Starting main loop...") main = MainLoop(data_stream=data_stream, algorithm=algorithm, extensions=extensions) main.run() pickle.dump(cg, open('%scg.pickle' % path, 'wb'))
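# After training, the learned word vectors are the rows of the LookupTable
# weight matrix; a small usage sketch (`word_to_index` is an assumed
# vocabulary mapping, e.g. from BrownDataset):
def get_embedding(word, word_to_index, lookup_brick):
    W = lookup_brick.W.get_value()  # shape: (vocabulary_size, HIDDEN_DIMS)
    return W[word_to_index[word]]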
    variables=[cost],
    data_stream=test_stream,
    prefix='test')
plotting = Plot('AdniNet_{}'.format(side),
                channels=[
                    ['entropy', 'validation_entropy'],
                    ['error', 'validation_error'],
                ],
                after_batch=False)

# The main loop will train the network and output reports, etc.
stamp = datetime.datetime.fromtimestamp(
    time.time()).strftime('%Y-%m-%d-%H:%M')
main = MainLoop(
    data_stream=training_stream,
    model=autoencoder,
    algorithm=algo,
    extensions=[
        FinishAfter(after_n_epochs=max_iter),
        FinishIfNoImprovementAfter(notification_name='validation_error',
                                   epochs=3),
        Printing(),
        validation_monitor,
        training_monitor,
        test_monitor,
        plotting,
        Checkpoint('./models/{}_{}'.format(side, stamp))
    ])
main.run()
def train(config, tr_stream, logprob_stream): trng = RandomStreams(config['seed'] if 'seed' in config else 1234) enc_ids, dec_ids = get_enc_dec_ids(config['cgs']) # Create Theano variables floatX = theano.config.floatX src_sel = tensor.matrix('src_selector', dtype=floatX) trg_sel = tensor.matrix('trg_selector', dtype=floatX) x = tensor.lmatrix('source') y = tensor.lmatrix('target') x_mask = tensor.matrix('source_mask') y_mask = tensor.matrix('target_mask') x_sampling = tensor.matrix('source', dtype='int64') y_sampling = tensor.vector('target', dtype='int64') prev_state = tensor.matrix('prev_state', dtype=floatX) src_sel_sampling = tensor.matrix('src_selector', dtype=floatX) trg_sel_sampling = tensor.matrix('trg_selector', dtype=floatX) # for multi source - maximum is 10 for now xs = [tensor.lmatrix('source%d' % i) for i in range(10)] x_masks = [tensor.matrix('source%d_mask' % i) for i in range(10)] xs_sampling = [ tensor.matrix('source%d' % i, dtype='int64') for i in range(10) ] # test values """ import numpy theano.config.compute_test_value = 'warn' src_sel.tag.test_value = numpy.zeros((80, 3), dtype=floatX) trg_sel.tag.test_value = numpy.zeros((80, 1), dtype=floatX) x.tag.test_value = numpy.random.randint(0, 10, size=(12, 80)) y.tag.test_value = numpy.random.randint(0, 10, size=(14, 80)) x_mask.tag.test_value = numpy.ones_like(x.tag.test_value).astype(floatX) y_mask.tag.test_value = numpy.ones_like(y.tag.test_value).astype(floatX) x_sampling.tag.test_value = numpy.random.randint(0, 10, size=(1, 1)) y_sampling.tag.test_value = numpy.random.randint(0, 10, size=(1,)) prev_state.tag.test_value = numpy.random.randn(1, 1000).astype(floatX) src_sel_sampling.tag.test_value = numpy.zeros((1, 3), dtype=floatX) trg_sel_sampling.tag.test_value = numpy.zeros((1, 1), dtype=floatX) for i in range(10): xs[i].tag.test_value = numpy.random.randint(0, 10, size=(12, 80)) x_masks[i].tag.test_value = \ numpy.ones_like(x.tag.test_value).astype(floatX) xs_sampling[i].tag.test_value = \ numpy.random.randint(0, 10, size=(1, 1)) """ # Create encoder-decoder architecture enc_dec = EncoderDecoder(encoder=MultiEncoder(enc_ids=enc_ids, **config), decoder=MultiDecoder(**config)) # Build training computational graphs probs, opt_rets = enc_dec.build_models(x, x_mask, y, y_mask, src_sel, trg_sel, xs=xs, x_masks=x_masks, trng=trng) # Get costs costs = enc_dec.get_costs(probs, y, y_mask, decay_cs=config.get('decay_c', None), opt_rets=opt_rets) # Computation graphs cgs = enc_dec.get_computational_graphs(costs) # Build sampling models f_inits, f_nexts, f_next_states = enc_dec.build_sampling_models( x_sampling, y_sampling, src_sel_sampling, trg_sel_sampling, prev_state, trng=trng, xs=xs_sampling) # Some printing enc_dec.print_params(cgs) # Get training parameters with optional excludes training_params, excluded_params = enc_dec.get_training_params( cgs, exclude_encs=config['exclude_encs'], additional_excludes=config['additional_excludes'], readout_only=config.get('readout_only', None), train_shared=config.get('train_shared', None)) # Some more printing enc_dec.print_training_params(cgs, training_params) # Set up training algorithm algorithm = SGDMultiCG(costs=costs, tparams=training_params, drop_input=config['drop_input'], step_rule=config['step_rule'], learning_rate=config['learning_rate'], clip_c=config['step_clipping'], step_rule_kwargs=config.get('step_rule_kwargs', {})) # Set up training model training_models = OrderedDict() for k, v in costs.iteritems(): training_models[k] = Model(costs[k]) # Set extensions extensions = 
[ Timing(after_batch=True), FinishAfter(after_n_batches=config['finish_after']), CostMonitoringWithMultiCG(after_batch=True), Printing(after_batch=True), PrintMultiStream(after_batch=True), DumpWithMultiCG(saveto=config['saveto'], save_accumulators=config['save_accumulators'], every_n_batches=config['save_freq'], no_blocks=True) ] # Reload model if necessary if config['reload'] and os.path.exists(config['saveto']): extensions.append( LoadFromDumpMultiCG(saveto=config['saveto'], load_accumulators=config['load_accumulators'], no_blocks=True)) # Add sampling to computational graphs for i, (cg_name, cg) in enumerate(cgs.iteritems()): eid, did = p_(cg_name) if config['hook_samples'] > 0: extensions.append( Sampler(f_init=f_inits[cg_name], f_next=f_nexts[cg_name], data_stream=tr_stream, num_samples=config['hook_samples'], src_eos_idx=config['src_eos_idxs'][eid], trg_eos_idx=config['trg_eos_idxs'][did], enc_id=eid, dec_id=did, every_n_batches=config['sampling_freq'], cond_init_trg=config.get('cond_init_trg', False), f_next_state=f_next_states.get(cg_name, None))) # Save parameters incrementally without overwriting if config.get('incremental_dump', False): extensions.append( IncrementalDump(saveto=config['saveto'], burnin=config['val_burn_in'], every_n_batches=config['save_freq'])) # Compute log probability on dev set if 'log_prob_freq' in config: extensions.append( LogProbComputer(cgs=config['cgs'], f_log_probs=enc_dec.build_f_log_probs( probs, x, x_mask, y, y_mask, src_sel, trg_sel, xs=xs, x_masks=x_masks), streams=logprob_stream, every_n_batches=config['log_prob_freq'])) # Initialize main loop main_loop = MainLoopWithMultiCGnoBlocks(models=training_models, algorithm=algorithm, data_stream=tr_stream, extensions=extensions, num_encs=config['num_encs'], num_decs=config['num_decs']) # Train! main_loop.run() # Be patient, after a month :-) print 'done'
def main(mode, save_path, num_batches, data_path=None):
    reverser = WordReverser(100, len(char2code), name="reverser")
    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" + pprint.pformat(
            [(key, value.get_value().shape)
             for key, value in params.items()],
            width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
.add_condition("after_batch", _is_nan), Plot(os.path.basename(save_path), [[average_monitoring.record_name(cost)], [average_monitoring.record_name(cost_per_character)]], every_n_batches=10), # Saving the model and the log separately is convenient, # because loading the whole pickle takes quite some time. Checkpoint(save_path, every_n_batches=500, save_separately=["model", "log"]), Printing(every_n_batches=1)]) main_loop.run() elif mode == "sample" or mode == "beam_search": chars = tensor.lmatrix("input") generated = reverser.generate(chars) model = Model(generated) logger.info("Loading the model..") model.set_param_values(load_parameter_values(save_path)) def generate(input_): """Generate output sequences for an input sequence. Incapsulates most of the difference between sampling and beam search. Returns ------- outputs : list of lists Trimmed output sequences. costs : list The negative log-likelihood of generating the respective sequences. """ if mode == "beam_search": samples, = VariableFilter( bricks=[reverser.generator], name="outputs")( ComputationGraph(generated[1])) # NOTE: this will recompile beam search functions # every time user presses Enter. Do not create # a new `BeamSearch` object every time if # speed is important for you. beam_search = BeamSearch(input_.shape[1], samples) outputs, costs = beam_search.search( {chars: input_}, char2code['</S>'], 3 * input_.shape[0]) else: _1, outputs, _2, _3, costs = ( model.get_theano_function()(input_)) outputs = list(outputs.T) costs = list(costs.T) for i in range(len(outputs)): outputs[i] = list(outputs[i]) try: true_length = outputs[i].index(char2code['</S>']) + 1 except ValueError: true_length = len(outputs[i]) outputs[i] = outputs[i][:true_length] costs[i] = costs[i][:true_length].sum() return outputs, costs while True: line = input("Enter a sentence\n") message = ("Enter the number of samples\n" if mode == "sample" else "Enter the beam size\n") batch_size = int(input(message)) encoded_input = [char2code.get(char, char2code["<UNK>"]) for char in line.lower().strip()] encoded_input = ([char2code['<S>']] + encoded_input + [char2code['</S>']]) print("Encoder input:", encoded_input) target = reverse_words((encoded_input,))[0] print("Target: ", target) samples, costs = generate( numpy.repeat(numpy.array(encoded_input)[:, None], batch_size, axis=1)) messages = [] for sample, cost in equizip(samples, costs): message = "({})".format(cost) message += "".join(code2char[code] for code in sample) if sample == target: message += " CORRECT!" messages.append((cost, message)) messages.sort(key=operator.itemgetter(0), reverse=True) for _, message in messages: print(message)
def main(config, tr_stream, dev_stream):
    # Create Theano variables
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Test values
    '''
    theano.config.compute_test_value = 'warn'
    source_sentence.tag.test_value = numpy.random.randint(10, size=(10, 10))
    target_sentence.tag.test_value = numpy.random.randint(10, size=(10, 10))
    source_sentence_mask.tag.test_value = \
        numpy.random.rand(10, 10).astype('float32')
    target_sentence_mask.tag.test_value = \
        numpy.random.rand(10, 10).astype('float32')
    sampling_input.tag.test_value = numpy.random.randint(10, size=(10, 10))
    '''

    # Construct model
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    # Initialize model
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    cg = ComputationGraph(cost)

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    print('Parameter shapes')
    for shape, count in Counter(shapes).most_common():
        print('    {:15}: {}'.format(shape, count))

    # Set up training algorithm
    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()]))

    # Set up beam search and sampling computation graphs
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    samples, = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is the next_outputs

    # Set up training model
    training_model = Model(cost)

    # Load the reference GroundHog model and copy its parameters into the
    # encoder and decoder, matching Blocks parameter paths to GroundHog
    # parameter names
    enc_param_dict = Selector(encoder).get_params()
    dec_param_dict = Selector(decoder).get_params()

    gh_model_name = '/data/lisatmp3/firatorh/nmt/wmt15/trainedModels/blocks/sanity/refGHOG_adadelta_40k_best_bleu_model.npz'
    tmp_file = numpy.load(gh_model_name)
    gh_model = dict(tmp_file)
    tmp_file.close()

    for key in enc_param_dict:
        print '{:15}: {}'.format(enc_param_dict[key].get_value().shape, key)
    for key in dec_param_dict:
        print '{:15}: {}'.format(dec_param_dict[key].get_value().shape, key)

    enc_mapping = {
        '/bidirectionalencoder/embeddings.W': 'W_0_enc_approx_embdr',
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_state':
            'W_enc_transition_0',
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_update':
            'G_enc_transition_0',
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_reset':
            'R_enc_transition_0',
        '/bidirectionalencoder/fwd_fork/fork_inputs.W':
            'W_0_enc_input_embdr_0',
        '/bidirectionalencoder/fwd_fork/fork_inputs.b':
            'b_0_enc_input_embdr_0',
        '/bidirectionalencoder/fwd_fork/fork_update_inputs.W':
            'W_0_enc_update_embdr_0',
        '/bidirectionalencoder/fwd_fork/fork_reset_inputs.W':
            'W_0_enc_reset_embdr_0',
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_state':
            'W_back_enc_transition_0',
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_update':
            'G_back_enc_transition_0',
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_reset':
            'R_back_enc_transition_0',
        '/bidirectionalencoder/back_fork/fork_inputs.W':
            'W_0_back_enc_input_embdr_0',
        '/bidirectionalencoder/back_fork/fork_inputs.b':
            'b_0_back_enc_input_embdr_0',
        '/bidirectionalencoder/back_fork/fork_update_inputs.W':
            'W_0_back_enc_update_embdr_0',
        '/bidirectionalencoder/back_fork/fork_reset_inputs.W':
            'W_0_back_enc_reset_embdr_0',
    }

    dec_mapping = {
        '/decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W':
            'W_0_dec_approx_embdr',
        # '/decoder/sequencegenerator/readout/lookupfeedback/lookuptable.W':
        #     'W_0_dec_approx_embdr',
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b':
            'b_0_dec_hid_readout_0',
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax0.W':
            'W1_dec_deep_softmax',  # Missing W1
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.W':
            'W2_dec_deep_softmax',
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.b':
            'b_dec_deep_softmax',
        '/decoder/sequencegenerator/readout/merge/transform_states.W':
            'W_0_dec_hid_readout_0',
        '/decoder/sequencegenerator/readout/merge/transform_feedback.W':
            'W_0_dec_prev_readout_0',
        '/decoder/sequencegenerator/readout/merge/transform_weighted_averages.W':
            'W_0_dec_repr_readout',
        '/decoder/sequencegenerator/readout/merge/transform_weighted_averages.b':
            'b_0_dec_repr_readout',
        '/decoder/sequencegenerator/fork/fork_inputs.b':
            'b_0_dec_input_embdr_0',
        '/decoder/sequencegenerator/fork/fork_inputs.W':
            'W_0_dec_input_embdr_0',
        '/decoder/sequencegenerator/fork/fork_update_inputs.W':
            'W_0_dec_update_embdr_0',
        '/decoder/sequencegenerator/fork/fork_reset_inputs.W':
            'W_0_dec_reset_embdr_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_inputs.W':
            'W_0_dec_dec_inputter_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_inputs.b':
            'b_0_dec_dec_inputter_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_update_inputs.W':
            'W_0_dec_dec_updater_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_update_inputs.b':
            'b_0_dec_dec_updater_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_reset_inputs.W':
            'W_0_dec_dec_reseter_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_reset_inputs.b':
            'b_0_dec_dec_reseter_0',
        '/decoder/sequencegenerator/att_trans/decoder.state_to_state':
            'W_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/decoder.state_to_update':
            'G_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/decoder.state_to_reset':
            'R_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/attention/state_trans/transform_states.W':
            'B_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/attention/preprocess.W':
            'A_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/attention/energy_comp/linear.W':
            'D_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/decoder/state_initializer/linear_0.W':
            'W_0_dec_initializer_0',
        '/decoder/sequencegenerator/att_trans/decoder/state_initializer/linear_0.b':
            'b_0_dec_initializer_0',
    }

    for path, name in enc_mapping.items():
        enc_param_dict[path].set_value(gh_model[name])
    for path, name in dec_mapping.items():
        dec_param_dict[path].set_value(gh_model[name])

    config['val_burn_in'] = -1

    # Initialize main loop
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=[
            FinishAfter(after_n_batches=1),
            Sampler(model=search_model, config=config,
                    data_stream=tr_stream,
                    every_n_batches=config['sampling_freq']),
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          src_eos_idx=config['src_eos_idx'],
                          trg_eos_idx=config['trg_eos_idx'],
                          before_training=True, before_batch=True),
                          # every_n_batches=config['bleu_val_freq']),
            TrainingDataMonitoring([cost], after_batch=True),
            # Plot('En-Fr', channels=[['decoder_cost_cost']],
            #      after_batch=True),
            Printing(after_batch=True)
        ])

    # Train!
    main_loop.run()
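# When porting parameters between frameworks as above, a shape check before
# each set_value() catches a wrong mapping immediately instead of at the
# first forward pass. A minimal sketch (`copy_params_checked` is not part of
# the original script; it takes a Selector param dict, a path-to-name mapping
# like the ones above, and the loaded .npz dict):
def copy_params_checked(param_dict, mapping, source):
    for path, name in mapping.items():
        expected = param_dict[path].get_value().shape
        if source[name].shape != expected:
            raise ValueError('shape mismatch for {}: model {} vs file {}'
                             .format(path, expected, source[name].shape))
        param_dict[path].set_value(source[name])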
def main(max_seq_length, lstm_dim, batch_size, num_batches, num_epochs):
    dataset_train = IterableDataset(
        generate_data(max_seq_length, batch_size, num_batches))
    dataset_test = IterableDataset(
        generate_data(max_seq_length, batch_size, 100))

    stream_train = DataStream(dataset=dataset_train)
    stream_test = DataStream(dataset=dataset_test)

    x = T.tensor3('x')
    y = T.matrix('y')

    # We need to feed the LSTM layer an input of size 4 * lstm_dim: one slice
    # each for the input gate, forget gate, cell candidate, and output gate
    # (see the LSTM layer documentation for the explanation)
    x_to_h = Linear(1, lstm_dim * 4, name='x_to_h',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))
    lstm = LSTM(lstm_dim, name='lstm',
                weights_init=IsotropicGaussian(),
                biases_init=Constant(0.0))
    h_to_o = Linear(lstm_dim, 1, name='h_to_o',
                    weights_init=IsotropicGaussian(),
                    biases_init=Constant(0.0))

    x_transform = x_to_h.apply(x)
    h, c = lstm.apply(x_transform)

    # Only the hidden units of the last time step are used for
    # the classification
    y_hat = h_to_o.apply(h[-1])
    y_hat = Logistic().apply(y_hat)

    cost = BinaryCrossEntropy().apply(y, y_hat)
    cost.name = 'cost'

    lstm.initialize()
    x_to_h.initialize()
    h_to_o.initialize()

    cg = ComputationGraph(cost)

    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Adam())
    test_monitor = DataStreamMonitoring(variables=[cost],
                                        data_stream=stream_test,
                                        prefix="test")
    train_monitor = TrainingDataMonitoring(variables=[cost], prefix="train",
                                           after_epoch=True)

    main_loop = MainLoop(algorithm, stream_train,
                         extensions=[test_monitor, train_monitor,
                                     FinishAfter(after_n_epochs=num_epochs),
                                     Printing(), ProgressBar()])
    main_loop.run()

    print('Learned weights:')
    for layer in (x_to_h, lstm, h_to_o):
        print("Layer '%s':" % layer.name)
        for param in layer.parameters:
            print(param.name, ': ', param.get_value())
        print()

    return main_loop
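# The script above relies on a `generate_data` helper that is defined
# elsewhere. A hypothetical stand-in follows; the task (labelling whether a
# random sequence has a positive mean) is an illustrative assumption, not
# the original dataset.
import numpy


def generate_data(max_seq_length, batch_size, num_batches):
    # x has shape (num_batches, max_seq_length, batch_size, 1);
    # IterableDataset iterates over the leading axis, so each batch is a
    # (max_seq_length, batch_size, 1) tensor3, as the model expects
    x = numpy.random.randn(
        num_batches, max_seq_length, batch_size, 1).astype('float32')
    # One binary label per sequence: shape (num_batches, batch_size, 1)
    y = (x.mean(axis=1) > 0).astype('float32')
    return {'x': x, 'y': y}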