class OrthogonalGlorot(GlorotBengio):
    """Glorot-style initializer that emits (blocks of) orthogonal matrices.

    Vectors fall back to the parent Glorot scheme.  A 2-D shape
    ``(N, M*N)`` is filled with ``M`` independent ``N x N`` orthogonal
    tiles concatenated along the last axis; any other shape is delegated
    to a plain ``Orthogonal`` initializer.
    """

    def __init__(self, *args, **kwargs):
        super(OrthogonalGlorot, self).__init__(*args, **kwargs)
        self.orth = Orthogonal()

    def generate(self, rng, shape):
        # 1-D parameters: keep the Glorot behaviour of the parent class.
        if len(shape) == 1:
            return super(OrthogonalGlorot, self).generate(rng, shape)

        rows = shape[0]
        blocks = shape[1] // rows
        # A wide 2-D matrix made of whole square tiles: draw one
        # orthogonal tile per block and stack them side by side.
        if blocks > 1 and len(shape) == 2 and shape[1] == blocks * rows:
            tiles = [self.orth.generate(rng, (rows, rows))
                     for _ in range(blocks)]
            return numpy.concatenate(tiles, axis=-1)
        return self.orth.generate(rng, shape)
def main(nvis, nhid, encoding_lstm_dim, decoding_lstm_dim, T=1):
    """Train a DRAW model on binarized MNIST.

    Args:
        nvis (int): dimensionality of the visible layer.
        nhid (int): dimensionality of the latent layer.
        encoding_lstm_dim (int): hidden size of the encoder LSTM.
        decoding_lstm_dim (int): hidden size of the decoder LSTM.
        T (int): number of DRAW time steps.
    """
    def _sequential_stream(dataset, batch_size):
        # One sequential pass over `dataset` in fixed-size batches,
        # with data cast to floatX.  Factored out because the original
        # repeated this construction four times.
        return ForceFloatX(
            DataStream(dataset=dataset,
                       iteration_scheme=SequentialScheme(
                           dataset.num_examples, batch_size)))

    x = tensor.matrix('features')

    # Construct and initialize model
    encoding_mlp = MLP([Tanh()], [None, None])
    decoding_mlp = MLP([Tanh()], [None, None])
    encoding_lstm = LSTM(dim=encoding_lstm_dim)
    decoding_lstm = LSTM(dim=decoding_lstm_dim)
    draw = DRAW(nvis=nvis, nhid=nhid, T=T, encoding_mlp=encoding_mlp,
                decoding_mlp=decoding_mlp, encoding_lstm=encoding_lstm,
                decoding_lstm=decoding_lstm, biases_init=Constant(0),
                weights_init=Orthogonal())
    draw.push_initialization_config()
    # The LSTMs get small Gaussian weights instead of the orthogonal
    # default pushed down by DRAW.
    encoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    decoding_lstm.weights_init = IsotropicGaussian(std=0.001)
    draw.initialize()

    # Compute cost: negative variational lower bound, averaged over the
    # batch (an upper bound on the negative log-likelihood).
    cost = -draw.log_likelihood_lower_bound(x).mean()
    cost.name = 'nll_upper_bound'
    model = Model(cost)

    # Datasets and data streams
    mnist_train = BinarizedMNIST('train')
    train_loop_stream = _sequential_stream(mnist_train, 100)
    train_monitor_stream = _sequential_stream(mnist_train, 500)
    valid_monitor_stream = _sequential_stream(BinarizedMNIST('valid'), 500)
    test_monitor_stream = _sequential_stream(BinarizedMNIST('test'), 500)

    # Get parameters and monitoring channels
    computation_graph = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(computation_graph.variables)
    # Auxiliary variables whose names end in 'term': monitor their means.
    monitoring_channels = {
        'avg_' + channel.tag.name: channel.mean()
        for channel in VariableFilter(
            name='.*term$')(computation_graph.auxiliary_variables)}
    for name, channel in monitoring_channels.items():
        channel.name = name
    # list(...) is required on Python 3, where dict.values() returns a
    # view that cannot be concatenated to a list with '+'.
    monitored_quantities = list(monitoring_channels.values()) + [cost]

    # Training loop
    step_rule = RMSProp(learning_rate=1e-3, decay_rate=0.95)
    algorithm = GradientDescent(cost=cost, params=params,
                                step_rule=step_rule)
    algorithm.add_updates(computation_graph.updates)
    main_loop = MainLoop(
        model=model, data_stream=train_loop_stream, algorithm=algorithm,
        extensions=[
            Timing(),
            SerializeMainLoop('vae.pkl', save_separately=['model']),
            FinishAfter(after_n_epochs=200),
            DataStreamMonitoring(monitored_quantities, train_monitor_stream,
                                 prefix="train",
                                 updates=computation_graph.updates),
            DataStreamMonitoring(monitored_quantities, valid_monitor_stream,
                                 prefix="valid",
                                 updates=computation_graph.updates),
            DataStreamMonitoring(monitored_quantities, test_monitor_stream,
                                 prefix="test",
                                 updates=computation_graph.updates),
            ProgressBar(),
            Printing()
        ])
    main_loop.run()
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None,
         the_track=None):
    """Build, initialize and train the RNN encoder-decoder.

    Args:
        config (dict): NMT configuration (vocabulary/layer sizes,
            regularization and training hyper-parameters).
        tr_stream: training data stream.
        dev_stream: development stream used by the accuracy validator.
        use_bokeh (bool): unused; kept for interface compatibility.
        the_task: task identifier, stored into ``config['the_task']``.
        the_track: track identifier; unused, kept for compatibility.
    """
    config['the_task'] = the_task
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    # enc_embed is the dimension of the word embedding matrix in the
    # encoder; enc_nhids the number of hidden units in the encoder GRU.
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, config['use_attention'],
        cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    # NOTE(review): testVar is never used below; kept because
    # getTestVar() may register auxiliary variables on the graph —
    # confirm before removing.
    testVar = decoder.getTestVar(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model
    logger.info('Initializing model')
    my_rng = numpy.random.RandomState(config['rng_value'])
    if config['identity_init']:
        encoder.weights_init = decoder.weights_init = Identity()
    else:
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
    encoder.rng = decoder.rng = my_rng
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    # Recurrent weights are orthogonal regardless of the scheme above.
    encoder.bidir.prototype.weights_init = Orthogonal()
    encoder.bidir.prototype.rng = my_rng
    decoder.transition.weights_init = Orthogonal()
    decoder.transition.rng = my_rng
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        # NOTE(review): seed is given a RandomState instance here —
        # confirm apply_noise accepts that rather than an int seed.
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'], seed=my_rng)

    cost = cg.outputs[0]

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # str() is required: a tuple rejects the ':15' format spec on
        # Python 3 (consistent with the other training script here).
        logger.info(' {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions.  The four branches of the original code built
    # byte-identical extension lists whether or not early stopping was
    # enabled; only the epoch budget differed between tracks, so the
    # branches are collapsed here (config['early_stopping'] had no
    # observable effect and is no longer consulted).
    logger.info("Initializing extensions")
    finish_after = config['finish_after']
    if 'track2' in config['saveto']:
        # Fewer epochs for track 2, because of more data.  Floor
        # division keeps after_n_epochs an int under Python 3 as well.
        finish_after //= 2
    extensions = [
        FinishAfter(after_n_epochs=finish_after),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary.
    # The sampling graph is also needed by the accuracy validator, so it
    # is built whenever a validation set is given (the original built it
    # only for hook_samples >= 1, leaving `samples` and `search_model`
    # undefined when validation was requested without sampling).
    if config['hook_samples'] >= 1 or config['val_set'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input,
                                     sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    # NOTE(review): hard-coded debug value; the intended
                    # value is probably config['src_vocab_size'].
                    src_vocab_size=8))

    # Add early stopping based on validation accuracy
    if config['val_set'] is not None:
        logger.info("Building accuracy validator")
        extensions.append(
            AccuracyValidator(sampling_input, samples=samples,
                              config=config,
                              model=search_model,
                              data_stream=dev_stream,
                              after_training=True,
                              every_n_epochs=5))
    else:
        logger.info("No validation set given for this language")

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE(review): eval() executes config['step_rule'] as code; only
    # acceptable because the configuration file is trusted input.
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=extensions
    )

    # Train!
    main_loop.run()
# Read examples and look up the right surface form data_stream = Mapping(data_stream, morph_lookup, add_sources=("targets", )) # Read in 10 samples at a time data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10)) # Pad the examples data_stream = Padding(data_stream) data_stream = Mapping(data_stream, _transpose) # Initialisation settings m.weights_init = IsotropicGaussian(0.1) m.biases_init = Constant(0.0) m.push_initialization_config() m.encoder.weights_init = Orthogonal() m.generator.transition.weights_init = Orthogonal() # Build the cost computation graph chars = tensor.lmatrix("features") chars_mask = tensor.matrix("features_mask") targets = tensor.lmatrix("targets") targets_mask = tensor.matrix("targets_mask") batch_cost = m.cost(chars, chars_mask, targets, targets_mask).sum() batch_size = chars.shape[1].copy(name="batch_size") cost = aggregation.mean(batch_cost, batch_size) cost.name = "sequence_log_likelihood" print("Cost graph is built", file=sys.stderr) model = Model(cost)
def main(config, tr_stream, dev_stream):
    """Build, initialize and train the character-aware NMT model.

    Args:
        config (dict): model and training hyper-parameters.
        tr_stream: training data stream; also provides the target
            beginning-of-sentence and space indices.
        dev_stream: development stream for BLEU validation.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_char_seq = tensor.lmatrix('source_char_seq')
    source_sample_matrix = tensor.btensor3('source_sample_matrix')
    source_char_aux = tensor.bmatrix('source_char_aux')
    source_word_mask = tensor.bmatrix('source_word_mask')
    target_char_seq = tensor.lmatrix('target_char_seq')
    target_char_aux = tensor.bmatrix('target_char_aux')
    target_char_mask = tensor.bmatrix('target_char_mask')
    target_sample_matrix = tensor.btensor3('target_sample_matrix')
    target_word_mask = tensor.bmatrix('target_word_mask')
    target_resample_matrix = tensor.btensor3('target_resample_matrix')
    target_prev_char_seq = tensor.lmatrix('target_prev_char_seq')
    target_prev_char_aux = tensor.bmatrix('target_prev_char_aux')
    # Special indices supplied by the training stream.
    target_bos_idx = tr_stream.trg_bos
    target_space_idx = tr_stream.space_idx['target']

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['src_dgru_nhids'],
                                   config['enc_nhids'],
                                   config['src_dgru_depth'],
                                   config['bidir_encoder_depth'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['trg_dgru_nhids'], config['trg_igru_nhids'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['transition_depth'], config['trg_igru_depth'],
                      config['trg_dgru_depth'], target_space_idx,
                      target_bos_idx)
    representation = encoder.apply(source_char_seq, source_sample_matrix,
                                   source_char_aux, source_word_mask)
    cost = decoder.cost(representation, source_word_mask, target_char_seq,
                        target_sample_matrix, target_resample_matrix,
                        target_char_aux, target_char_mask,
                        target_word_mask, target_prev_char_seq,
                        target_prev_char_aux)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model: global Gaussian weights / zero biases, then
    # orthogonal recurrent transitions at every depth after the global
    # scheme has been pushed down.
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    for layer_n in range(config['src_dgru_depth']):
        encoder.decimator.dgru.transitions[layer_n].weights_init = \
            Orthogonal()
    for layer_n in range(config['bidir_encoder_depth']):
        # Bidirectional layers start at children[1]; children[0] is
        # presumably the decimator — confirm against the encoder brick.
        encoder.children[
            1 + layer_n].prototype.recurrent.weights_init = Orthogonal()
    if config['trg_igru_depth'] == 1:
        # A depth-1 IGRU is a single brick, not a stack of transitions.
        decoder.interpolator.igru.weights_init = Orthogonal()
    else:
        for layer_n in range(config['trg_igru_depth']):
            decoder.interpolator.igru.transitions[
                layer_n].weights_init = Orthogonal()
    for layer_n in range(config['trg_dgru_depth']):
        decoder.interpolator.feedback_brick.dgru.transitions[
            layer_n].weights_init = Orthogonal()
    for layer_n in range(config['transition_depth']):
        decoder.transition.transitions[layer_n].weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info(' {:15}: {}'.format(str(shape), count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(str(value.get_value().shape), name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE(review): eval() executes config['step_rule'] as code; only
    # safe for trusted configuration files.
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Set extensions
    logger.info("Initializing extensions")
    # Extensions
    gradient_norm = aggregation.mean(algorithm.total_gradient_norm)
    step_norm = aggregation.mean(algorithm.total_step_norm)
    train_monitor = CostCurve([cost, gradient_norm, step_norm],
                              config=config,
                              after_batch=True,
                              before_first_epoch=True,
                              prefix='tra')
    extensions = [
        train_monitor, Timing(),
        Printing(every_n_batches=config['print_freq']),
        FinishAfter(after_n_batches=config['finish_after']),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        generated = decoder.generate(representation, source_word_mask)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[config['transition_depth']])
        )  # generated[transition_depth] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    transition_depth=config['transition_depth'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(source_char_seq, source_sample_matrix,
                          source_char_aux, source_word_mask,
                          samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
def set_up(self, config=None, make_prunable=False):
    """Loads and initializes all the theano variables for the training
    model and the decoding model.

    Args:
        config (dict): NMT configuration; falls back to ``self.config``
            when not given, and is stored on ``self`` when given.
        make_prunable (bool): forwarded to ``Decoder`` in the
            dense-target case.
    """
    if config:
        self.config = config
    else:
        config = self.config
    # Create Theano variables
    logging.debug('Creating theano variables')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence_mask = tensor.matrix('target_mask')

    # Construct model (fs439: Add NoLookup options)
    # NOTE(review): logging.fatal only logs — it does not abort, so
    # execution continues even for unsupported layer counts.
    if config['dec_layers'] != 1:
        logging.fatal("Only dec_layers=1 supported.")
    logging.debug('Building RNN encoder-decoder')
    if config['src_sparse_feat_map']:
        if config['enc_layers'] != 1:
            logging.fatal("Only enc_layers=1 supported for sparse "
                          "source features.")
        # Sparse source features: inputs are dense feature tensors, so
        # there is no embedding lookup on the encoder side.
        source_sentence = tensor.tensor3('source')
        self.sampling_input = tensor.tensor3('input')
        encoder = NoLookupEncoder(config['enc_embed'],
                                  config['enc_nhids'])
    else:
        source_sentence = tensor.lmatrix('source')
        self.sampling_input = tensor.lmatrix('input')
        if config['enc_layers'] > 1 and not config['enc_share_weights']:
            encoder = DeepBidirectionalEncoder(
                config['src_vocab_size'], config['enc_embed'],
                config['enc_layers'], config['enc_skip_connections'],
                config['enc_nhids'])
        else:
            encoder = BidirectionalEncoder(config['src_vocab_size'],
                                           config['enc_embed'],
                                           config['enc_layers'],
                                           config['enc_skip_connections'],
                                           config['enc_nhids'])
    if config['trg_sparse_feat_map']:
        target_sentence = tensor.tensor3('target')
        decoder = NoLookupDecoder(
            config['trg_vocab_size'], config['dec_embed'],
            config['dec_nhids'], config['att_nhids'],
            config['maxout_nhids'], config['enc_nhids'] * 2,
            config['attention'], config['dec_attention_sources'],
            config['dec_readout_sources'], config['memory'],
            config['memory_size'], config['seq_len'],
            config['dec_init'])
    else:
        target_sentence = tensor.lmatrix('target')
        decoder = Decoder(config['trg_vocab_size'],
                          config['dec_embed'], config['dec_nhids'],
                          config['att_nhids'], config['maxout_nhids'],
                          config['enc_nhids'] * 2,
                          config['attention'],
                          config['dec_attention_sources'],
                          config['dec_readout_sources'],
                          config['memory'], config['memory_size'],
                          config['seq_len'], config['dec_init'],
                          make_prunable=make_prunable)
    if config['annotations'] != 'direct':
        # Wrap the encoder so extra annotation strategies can be mixed
        # with (or replace) the direct encoder annotations.
        annotators = []
        add_direct = False
        for name in config['annotations'].split(','):
            if name == 'direct':
                add_direct = True
            elif name == 'hierarchical':
                annotators.append(HierarchicalAnnotator(encoder))
            else:
                logging.fatal("Annotation strategy %s unknown" % name)
        encoder = EncoderWithAnnotators(encoder,
                                        annotators,
                                        add_direct)
    annotations, annotations_mask = encoder.apply(source_sentence,
                                                  source_sentence_mask)
    self.cost = decoder.cost(annotations,
                             annotations_mask,
                             target_sentence,
                             target_sentence_mask)

    logging.info('Creating computational graph')
    self.cg = ComputationGraph(self.cost)

    # Initialize model
    logging.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    try:
        encoder.bidir.prototype.weights_init = Orthogonal()
    except AttributeError:
        pass  # Its fine, no bidirectional encoder
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logging.info('Applying dropout')
        dropout_inputs = [
            x for x in self.cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        self.cg = apply_dropout(self.cg, dropout_inputs,
                                config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logging.info('Applying weight noise to ff layers')
        # NOTE(review): as reconstructed, the whole noise application is
        # guarded by `if encoder.lookup:` — i.e. no weight noise at all
        # for no-lookup encoders.  If apply_noise was instead meant to
        # run unconditionally, enc_params needs a default ([]) to avoid
        # a NameError; confirm against the original repository layout.
        if encoder.lookup:
            enc_params = Selector(encoder.lookup).get_parameters().values()
            enc_params += Selector(
                encoder.fwd_fork).get_parameters().values()
            enc_params += Selector(
                encoder.back_fork).get_parameters().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_parameters().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_parameters().values()
            self.cg = apply_noise(self.cg,
                                  enc_params + dec_params,
                                  config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in self.cg.parameters]
    logging.debug("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # NOTE(review): '{:15}'.format(tuple) raises TypeError on
        # Python 3; str(shape) would be required there.
        logging.debug(' {:15}: {}'.format(shape, count))
    logging.debug("Total number of CG parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logging.debug("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logging.debug(' {:15}: {}'.format(value.get_value().shape, name))
    logging.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logging.info("Building model")
    self.training_model = Model(self.cost)

    logging.info("Building sampling model")
    src_shape = (self.sampling_input.shape[-2],
                 self.sampling_input.shape[-1])  # batch_size x sen_length
    sampling_representation, _ = encoder.apply(self.sampling_input,
                                               tensor.ones(src_shape))
    generated = decoder.generate(src_shape, sampling_representation)
    self.search_model = Model(generated)
    generated_outputs = VariableFilter(
        bricks=[decoder.sequence_generator],
        name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs
    self.samples = generated_outputs[1]

    # Expose the bricks for use by decoding / pruning code.
    self.encoder = encoder
    self.decoder = decoder
def main(config, tr_stream, dev_stream, use_bokeh=False):
    """Build, initialize and train the word-level NMT model.

    Args:
        config (dict): model and training hyper-parameters.
        tr_stream: training data stream.
        dev_stream: development stream for BLEU validation.
        use_bokeh (bool): plot the cost curve with bokeh if available.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence,
                                      source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model: global Gaussian weights / zero biases, but
    # orthogonal recurrent weights for encoder and decoder transitions.
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        # NOTE(review): get_params() is the legacy Blocks API, while
        # get_parameters() is used below — confirm the installed Blocks
        # version supports both.
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # NOTE(review): '{:15}'.format(tuple) raises TypeError on
        # Python 3; str(shape) would be required there.
        logger.info(' {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input,
                                     sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Cs-En', channels=[['decoder_cost_cost']],
                 after_batch=True))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # NOTE(review): eval() executes config['step_rule'] as code; only
    # safe for trusted configuration files.
    algorithm = GradientDescent(cost=cost,
                                parameters=cg.parameters,
                                step_rule=CompositeRule([
                                    StepClipping(config['step_clipping']),
                                    eval(config['step_rule'])()
                                ]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
def main(mode, config, use_bokeh=False):
    """Entry point for the topical-attention NMT experiment (Theano/Blocks).

    Builds a bidirectional RNN encoder, a topical-embedding transformer and a
    decoder, then dispatches on `mode`:

      * "train"     -- build the training graph, apply regularization, and run
                       the Blocks `MainLoop`.
      * "translate" -- load saved parameters and beam-search-decode the test
                       set, writing hypotheses to `<test_set>.trans.out`.
      * "rerank"    -- score (source, target) pairs from the validation set
                       with the model cost and write scores to
                       `<val_set>.scores.out`.

    Parameters
    ----------
    mode : str
        One of "train", "translate", "rerank". Any other value is a no-op.
    config : dict
        Experiment configuration (vocab sizes, dims, paths, training
        hyperparameters). Mutated in "rerank" mode (batch_size, data paths).
    use_bokeh : bool
        If True and bokeh is importable, add a live `Plot` extension.
    """
    # Construct model bricks. These are shared by all three modes; parameter
    # *values* are only loaded from disk in "translate"/"rerank" modes.
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['topical_embedding_dim'])
    topical_transformer = topicalq_transformer(config['topical_vocab_size'],
                                               config['topical_embedding_dim'],
                                               config['enc_nhids'],
                                               config['topical_word_num'],
                                               config['batch_size'])

    if mode == "train":
        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topicalq(**config)
        dev_stream = get_dev_stream_with_topicalq(**config)
        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get cost of the model
        representation = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        # Second half of the first-timestep encoder state is used as the
        # "content" embedding.
        # NOTE(review): `/ 2` is integer division only under Python 2
        # semantics; under `from __future__ import division` this becomes a
        # float index expression -- confirm intended behavior.
        content_embedding = representation[0, :, (representation.shape[2] / 2):]
        cost = decoder.cost(representation, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            topic_embedding, content_embedding)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model: Gaussian weights / zero biases everywhere, then
        # override recurrent transitions with orthogonal initialization.
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        # don't know whether the initialize is for
        topical_transformer.push_allocation_config()
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        # Overwrite the topical lookup table with pretrained embeddings.
        # NOTE(review): the file handle from open() is never closed here.
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        # Clearing the roles presumably freezes this table w.r.t. role-based
        # graph transformations -- TODO confirm.
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization (feed-forward params only)
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]
        # NOTE(review): the following sampling / BLEU-validation setup is
        # intentionally disabled (dead string literal, not executed).
        '''
        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

            # Add sampling
            if config['hook_samples'] >= 1:
                logger.info("Building sampler")
                extensions.append(
                    Sampler(model=search_model, data_stream=tr_stream,
                            hook_samples=config['hook_samples'],
                            every_n_batches=config['sampling_freq'],
                            src_vocab_size=config['src_vocab_size']))

            # Add early stopping based on bleu
            if config['bleu_script'] is not None:
                logger.info("Building bleu validator")
                extensions.append(
                    BleuValidator(sampling_input, samples=samples, config=config,
                                  model=search_model, data_stream=dev_stream,
                                  normalize=config['normalized_bleu'],
                                  every_n_batches=config['bleu_val_freq']))
        '''
        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        # NOTE(review): eval() of a config string -- config files must be
        # trusted; this executes arbitrary code.
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    on_unused_sources='warn',
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':
        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')

        # Get test set stream
        test_stream = get_dev_stream_with_topicalq(
            config['test_set'], config['src_vocab'], config['src_vocab_size'],
            config['topical_test_set'], config['topical_vocab'],
            config['topical_vocab_size'], config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        topic_embedding = topical_transformer.apply(source_topical_word)
        # All-ones mask: translation inputs are unpadded single sentences.
        representation = encoder.apply(source_sentence,
                                       tensor.ones(source_sentence.shape))
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :, (representation.shape[2] / 2):]
        generated = decoder.generate(source_sentence,
                                     representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(pickle.load(
            open(config['trg_vocab'], 'rb')),
                                           bos_idx=0,
                                           eos_idx=trg_eos_idx,
                                           unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq2 = line[1]
            # Replicate the input beam_size times, one row per hypothesis.
            input_ = numpy.tile(seq, (config['beam_size'], 1))
            input_topical = numpy.tile(seq2, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs = \
                beam_search.search(
                    input_values={source_sentence: input_,source_topical_word:input_topical},
                    max_length=10*len(seq), eol_symbol=src_eos_idx,
                    ignore_first_eol=True)
            '''
            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            '''
            # Write out the whole n-best list (not just the 1-best).
            #best = numpy.argsort(costs)[0]
            best = numpy.argsort(costs)[0:config['beam_size']]
            for b in best:
                try:
                    total_cost += costs[b]
                    trans_out = trans[b]
                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(i + 1))
                    trans_out = '<UNK>'
                print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info("Translated {} lines of test set...".format(i))
        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()

    elif mode == 'rerank':
        # Score validation-set sentence pairs with the model cost.
        ftrans = open(config['val_set'] + '.scores.out', 'w')
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        # Re-point the training-stream reader at the validation data, one
        # sentence per batch, unsorted so output order matches the input file.
        config['src_data'] = config['val_set']
        config['trg_data'] = config['val_set_grndtruth']
        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        test_stream = get_tr_stream_unsorted(**config)

        logger.info("Building sampling model")
        representations = encoder.apply(source_sentence,
                                        source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask)

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Compile a function computing the per-pair cost.
        costs_computer = function([
            source_sentence, source_sentence_mask, target_sentence,
            target_sentence_mask
        ], costs)
        iterator = test_stream.get_epoch_iterator()

        scores = []
        for i, (src, src_mask, trg, trg_mask) in enumerate(iterator):
            costs = costs_computer(*[src, src_mask, trg, trg_mask])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)
            ftrans.write(str(cost) + "\n")
        ftrans.close()
def main(config, tr_stream, dev_stream, use_bokeh=False, src_vocab=None,
         trg_vocab=None):
    """Train an attention NMT model (Theano/Blocks) with optional l2
    regularization, externally-initialized embeddings, and non-blocking
    external validation.

    Parameters
    ----------
    config : dict
        Experiment configuration (vocab sizes, dims, paths, hyperparameters).
    tr_stream, dev_stream
        Fuel data streams for training and development data.
    use_bokeh : bool
        If True and bokeh is importable, add a live `Plot` extension.
    src_vocab, trg_vocab : dict or None
        word -> index maps; required only when
        `config['external_embeddings']` is set (used to map pretrained
        embedding rows onto model rows).
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    # Initialize model: Gaussian weights / zero biases, orthogonal recurrent
    # transitions.
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # GRAPH TRANSFORMATIONS FOR BETTER TRAINING
    # TODO: allow user to remove some params from the graph, for example if
    # embeddings should be kept static
    if config.get('l2_regularization', False) is True:
        l2_reg_alpha = config['l2_regularization_alpha']
        logger.info(
            'Applying l2 regularization with alpha={}'.format(l2_reg_alpha))
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)

        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

    # why do we need to name the cost variable? Where did the original name
    # come from? (NOTE(review): the name must match the 'decoder_cost_cost'
    # channel used by the Plot extension below.)
    cost.name = 'decoder_cost_cost'
    # Rebuild the graph since `cost` may have been replaced above.
    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to
        # make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # allow user to externally initialize some params
    model_params = training_model.get_parameter_dict()
    if config.get('external_embeddings', None) is not None:
        for key in config['external_embeddings']:
            path_to_params = config['external_embeddings'][key]
            logger.info(
                'Replacing {} parameters with external params at: {}'.format(
                    key, path_to_params))
            external_params = numpy.load(path_to_params)
            len_external_idx = external_params.shape[0]
            print(external_params.shape)
            # Working: look in the dictionary and overwrite the correct rows
            existing_params = model_params[key].get_value()
            # Only these two embedding tables are supported; the key encodes
            # which vocabulary indexes its rows.
            if key == '/bidirectionalencoder/embeddings.W':
                vocab = src_vocab
            elif key == '/decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W':
                vocab = trg_vocab
            else:
                raise KeyError(
                    'Unknown embedding parameter key: {}'.format(key))
            # Copy row-by-row; external rows assumed to share indexing with
            # `vocab` -- TODO confirm alignment with the embedding file.
            for k, i in vocab.items():
                if i < len_external_idx:
                    existing_params[i] = external_params[i]

            # model_params_shape = model_params[key].get_value().shape
            # assert model_params[key].get_value().shape == external_params.shape, ("Parameter dims must not change,"
            #                                                                       "shapes {} and {} do not match".
            #                                                                       format(model_params_shape,
            #                                                                              external_params.shape))
            model_params[key].set_value(existing_params)

    # create the training directory, and copy this config there if directory
    # doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = []

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        # note that generated containes several different outputs
        generated = decoder.generate(sampling_input,
                                     sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Add sampling
    # Note: this is broken for unicode chars
    #if config['hook_samples'] >= 1:
    #    logger.info("Building sampler")
    #    extensions.append(
    #        Sampler(model=search_model, data_stream=tr_stream,
    #                hook_samples=config['hook_samples'],
    #                every_n_batches=config['sampling_freq'],
    #                src_vocab_size=config['src_vocab_size']))

    # WORKING: remove these validators in favor of Async
    # TODO: implement burn-in in the validation extension (don't fire until
    # we're past the burn-in iteration)
    # NOTE(review): in-process BLEU / Meteor validators are disabled here in
    # favor of the RunExternalValidation extension added below.

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Set up training algorithm
    # NOTE(review): eval() of a config string -- config files must be
    # trusted; this executes arbitrary code.
    logger.info("Initializing training algorithm")
    # if there is dropout or random noise, we need to use the output of the
    # modified graph
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(cost=cg.outputs[0],
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))
    else:
        algorithm = GradientDescent(cost=cost,
                                    parameters=cg.parameters,
                                    step_rule=CompositeRule([
                                        StepClipping(config['step_clipping']),
                                        eval(config['step_rule'])()
                                    ]))

    # enrich the logged information
    extensions.extend([
        Timing(every_n_batches=100),
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ])

    # External non-blocking validation
    extensions.append(
        RunExternalValidation(config=config,
                              every_n_batches=config['bleu_val_freq']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[['decoder_cost_cost'],
                           ['validation_set_bleu_score'],
                           ['validation_set_meteor_score']],
                 every_n_batches=1))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
import argparse parser = argparse.ArgumentParser() parser.add_argument("--hyperparameters", help="YAML file from which to load hyperparameters") parser.add_argument("--parameters", help="npy/npz file from which to load parameters") args = parser.parse_args() with open(os.path.join(os.path.dirname(__file__), "defaults.yaml"), "rb") as f: hyperparameters = yaml.load(f) if args.hyperparameters: with open(args.hyperparameters, "rb") as f: hyperparameters.update(yaml.load(f)) hyperparameters["n_spatial_dims"] = len(hyperparameters["patch_shape"]) hyperparameters["initargs"] = dict(weights_init=Orthogonal(), biases_init=Constant(0)) hyperparameters["hyperparameters"] = hyperparameters main_loop = construct_main_loop(**hyperparameters) if args.parameters: load_model_parameters(args.parameters, main_loop.model) print "training..." main_loop.run()
def main(config, tr_stream, dev_stream, source_vocab, target_vocab,
         use_bokeh=False):
    """Train a word-level confidence model for interactive MT (IMT).

    Tags each timestep of the reference suffix as GOOD/BAD using a pretrained
    prediction function, then trains `decoder.confidence_cost` on the model's
    readouts to predict those tags.

    WARNING(review): this function contains live `ipdb.set_trace()` calls --
    it will stop at a debugger prompt when run non-interactively. Remove them
    before unattended training.

    Parameters
    ----------
    config : dict
        Experiment configuration.
    tr_stream, dev_stream
        Fuel data streams; `tr_stream` is wrapped with prediction-function
        mappings below.
    source_vocab, target_vocab : dict
        word -> index maps (used by the disabled sampler/validators).
    use_bokeh : bool
        If True and bokeh is importable, add a live `Plot` extension.
    """
    # add the tags from this function to the IMT datastream
    # prediction function signature
    # [target_suffix, source_mask, source, target_prefix_mask, target_prefix,
    #  target_suffix_mask]
    prediction_function = get_prediction_function(exp_config=config)

    tr_stream = Mapping(
        tr_stream,
        CallPredictionFunctionOnStream(prediction_function,
                                       [1, 0, 5, 4, 7, 6]),
        #tr_stream = Mapping(tr_stream, CallFunctionOnStream(prediction_function, [6, 1, 0, 5, 4, 7]),
        add_sources=('predictions', 'orig_readouts', 'prediction_tags'))
    # now datastream has 11 things
    import ipdb
    ipdb.set_trace()

    # WORKING: call prediction function twice to get new readouts on
    # predictions instead of reference suffs -- the only difference is the
    # index of the suffix (index 8 = 'predictions' added above).
    tr_stream = Mapping(tr_stream,
                        CallPredictionFunctionOnStream(prediction_function,
                                                       [1, 0, 5, 4, 7, 8]),
                        add_sources=('dummy_predictions', 'readouts',
                                     'dummy_prediction_tags'))
    import ipdb
    ipdb.set_trace()

    # Create the prediction confidence model
    # the first draft of this model uses the readout output (before the
    # post-merge step) as the per-timestep state vector

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    target_sentence = tensor.lmatrix('target_suffix')
    target_sentence_mask = tensor.matrix('target_suffix_mask')

    target_prefix = tensor.lmatrix('target_prefix')
    target_prefix_mask = tensor.matrix('target_prefix_mask')

    # symbolic variable which tags each timestep as GOOD/BAD
    # Note: later this might be tags for a hypothesis i.e. from TER(p), right
    # now the timesteps are actually determined by the reference.
    # By zipping the confidence model output with the reference, we get the
    # model's confidence that this reference word will be predicted correctly.
    prediction_tags = tensor.matrix('prediction_tags')
    readouts = tensor.tensor3('readouts')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])

    decoder = NMTPrefixDecoder(config['trg_vocab_size'], config['dec_embed'],
                               config['dec_nhids'], config['enc_nhids'] * 2,
                               loss_function='cross_entropy')

    # rename to match baseline NMT systems
    decoder.name = 'decoder'

    cost = decoder.confidence_cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask,
        target_prefix, target_prefix_mask, readouts, prediction_tags)

    # WORKING: add l2 regularization

    logger.info('Creating computational graph')
    # working: implement cost for confidence model
    cg = ComputationGraph(cost)

    # INITIALIZATION
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    #cost_cg = ComputationGraph(cost)
    if config['l2_reg']:
        l2_reg_alpha = config['l2_reg_alpha']
        model_weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        for W in model_weights:
            cost = cost + (l2_reg_alpha * (W**2).sum())

    # do we need to name the cost variable again?
    cost.name = 'cost'
    # Rebuild the graph since `cost` may have been replaced above.
    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to
        # make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name in set([
                'confidence_model1_apply_output',
                'confidence_model2_apply_output',
                'confidence_model3_apply_output'
            ])
        ]
        # if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # WORKING: implement confidence -- remove all params except output model
    cost_model = Model(cost)
    model_params = cost_model.get_parameter_dict()
    trainable_params = cg.parameters
    import ipdb
    ipdb.set_trace()
    print('trainable params')
    # NOTE(review): filtering of non-confidence params is currently disabled;
    # all graph parameters are trained.
    #params_to_remove = [model_params[k] for k in model_params.keys() if 'confidence' not in k]
    #for p in params_to_remove:
    #    trainable_params.remove(p)
    # TODO: fixed dropout mask for recurrent params?

    # NOTE(review): parameter shape/name logging is currently commented out
    # (see the parallel NMT training function for the active version).

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory
    # doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        # Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # WORKING: confidence prediction
    # monitor everything that could possibly be relevant

    # Set up the sampling graph for validation during training
    # Theano variables for the sampling graph
    # Note this also loads the model parameters
    sampling_vars = load_params_and_get_beam_search(config,
                                                    encoder=encoder,
                                                    decoder=decoder)
    beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars

    # NOTE(review): the Sampler hook and BLEU / IMT-F1 early-stopping
    # validators are currently disabled for this confidence-model experiment.
    # TODO: add first-word accuracy validation
    # TODO: add IMT meteor early stopping

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # TODO: hacking here: get the predictions of the confidence model using
    # the `readouts` source of the data_stream. Note that the parameters of
    # that model must be pretrained, otherwise this doesn't make sense.
    # (disabled confidence_prediction_model construction removed -- see VCS
    # history)

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            # Plot(config['model_save_directory'], channels=[['decoder_confidence_cost_cost']],
            Plot(config['model_save_directory'], channels=[['cost']],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # WORKING: implement confidence model
    # if there is dropout or random noise, we need to use the output of the
    # modified graph
    # NOTE(review): eval() of a config string -- config files must be
    # trusted; this executes arbitrary code.
    algorithm = GradientDescent(
        cost=cg.outputs[0],
        parameters=trainable_params,
        step_rule=CompositeRule([
            StepClipping(config['step_clipping']),
            eval(config['step_rule'])()
        ]),
        # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
        on_unused_sources='warn')
    # END WORKING: implement confidence model

    import ipdb
    ipdb.set_trace()

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # NOTE(review): a large block of disabled forced-alignment /
    # prefix-prediction-accuracy debugging code (dumping sources, prefixes,
    # suffixes, predictions and per-word correctness to files under
    # OUTPUT_DIR) previously lived here -- recover from VCS history if needed.
    # TODO: next step -- print gradients and weights during training, find
    # out where nan is coming from
    # TODO: look at the gradient of this function with respect to parameters?
    # see: http://deeplearning.net/software/theano/tutorial/gradients.html
    # TODO: function which adds right/wrong tags for model predictions to the
    # datastream. In this case we can learn a simple linear model as a
    # baseline
    # TODO: print predictions for each batch for each timestep to file --
    # _dont shuffle_ so that we get the right order

    # Give each update a unique, parameter-derived name for monitoring.
    for i, (k, v) in enumerate(algorithm.updates):
        v.name = k.name + '_{}'.format(i)

    aux_vars = [v for v in cg.auxiliary_variables[-3:]]
    # import ipdb; ipdb.set_trace()

    extensions.extend([
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True),
        # TrainingDataMonitoring(aux_vars, after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        Printing(after_batch=True)
    ])

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    import ipdb
    ipdb.set_trace()

    # Train!
    main_loop.run()
def train(step_rule, label_dim, state_dim, epochs, seed, dropout, test_cost,
          experiment_path, features, weight_noise, to_watch, patience,
          batch_size, batch_norm, **kwargs):
    """Train an LSTM+CTC phoneme recognizer on TIMIT and monitor it.

    Arguments visibly used below:
      step_rule    -- Blocks step rule handed to GradientDescent
      label_dim    -- number of phoneme labels (output layer is label_dim+1
                      units; the extra unit is presumably the CTC blank --
                      confirm against cpu_ctc_th's convention)
      state_dim    -- LSTM state dimensionality
      epochs       -- epoch count before FinishAfter stops training
      seed         -- RNG seed for stream construction
      test_cost    -- if True, additionally monitor cost on the test stream
      features     -- TIMIT feature kind forwarded to the dataset
      to_watch, patience -- early-stopping record name and patience
      batch_size, batch_norm -- batching / batch-norm switches
      kwargs       -- may carry 'load_path' to resume from a checkpoint
    NOTE(review): dropout, weight_noise and experiment_path are accepted but
    never used in the visible body.
    """
    print '.. TIMIT experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    # ------------------------------------------------------------------------
    # Streams
    rng = np.random.RandomState(seed)
    stream_args = dict(rng=rng, batch_size=batch_size)
    print '.. initializing iterators'
    train_dataset = Timit('train', features=features)
    train_stream = construct_stream(train_dataset, **stream_args)
    dev_dataset = Timit('dev', features=features)
    dev_stream = construct_stream(dev_dataset, **stream_args)
    test_dataset = Timit('test', features=features)
    test_stream = construct_stream(test_dataset, **stream_args)
    # NOTE(review): update_stream is built but never used below.
    update_stream = construct_stream(train_dataset, n_batches=100,
                                     **stream_args)
    phone_dict = train_dataset.get_phoneme_dict()
    # Collapse phones to phonemes where a mapping exists; keep as-is otherwise.
    phoneme_dict = {k: phone_to_phoneme_dict[v]
                    if v in phone_to_phoneme_dict else v
                    for k, v in phone_dict.iteritems()}
    ind_to_phoneme = {v: k for k, v in phoneme_dict.iteritems()}
    eol_symbol = ind_to_phoneme['<STOP>']

    # ------------------------------------------------------------------------
    # Graph
    print '.. building model'
    # Judging by the test values below, tensors are time-major:
    # x is (time, batch, feature_dim) and masks are (time, batch) -- confirm
    # against construct_stream.
    x = T.tensor3('features')
    y = T.matrix('phonemes')
    input_mask = T.matrix('features_mask')
    output_mask = T.matrix('phonemes_mask')

    theano.config.compute_test_value = 'off'
    x.tag.test_value = np.random.randn(100, 24, 123).astype(floatX)
    y.tag.test_value = np.ones((30, 24), dtype=floatX)
    input_mask.tag.test_value = np.ones((100, 24), dtype=floatX)
    output_mask.tag.test_value = np.ones((30, 24), dtype=floatX)

    # NOTE(review): seq_len and recurrent_init are assigned but never used.
    seq_len = 100
    input_dim = 123
    activation = Tanh()
    recurrent_init = IdentityInit(0.99)

    rec1 = TimLSTM(not batch_norm, input_dim, state_dim, activation,
                   name='LSTM')
    rec1.initialize()
    l1 = Linear(state_dim, label_dim + 1, name='out_linear',
                weights_init=Orthogonal(), biases_init=Constant(0.0))
    l1.initialize()
    o1 = rec1.apply(x)
    y_hat_o = l1.apply(o1)
    shape = y_hat_o.shape
    # Softmax over the label axis, applied per (time, batch) position.
    y_hat = Softmax().apply(y_hat_o.reshape((-1, shape[-1]))).reshape(shape)
    y_mask = output_mask
    y_hat_mask = input_mask

    # ------------------------------------------------------------------------
    # Costs and Algorithm
    # Labels are shifted by +1, presumably to free index 0 for the CTC
    # blank -- confirm against cpu_ctc_th's label convention.
    ctc_cost = T.sum(ctc.cpu_ctc_th(y_hat_o, T.sum(y_hat_mask, axis=0),
                                    y + T.ones_like(y),
                                    T.sum(y_mask, axis=0)))
    batch_cost = ctc_cost.copy(name='batch_cost')

    bs = y.shape[1]
    cost_train = aggregation.mean(batch_cost, bs).copy("sequence_cost")
    cost_per_character = aggregation.mean(
        batch_cost, output_mask.sum()).copy("character_cost")
    cg_train = ComputationGraph(cost_train)
    model = Model(cost_train)
    train_cost_per_character = aggregation.mean(
        cost_train, output_mask.sum()).copy("train_character_cost")

    algorithm = GradientDescent(step_rule=step_rule, cost=cost_train,
                                parameters=cg_train.parameters,
                                on_unused_sources='warn')

    # ------------------------------------------------------------------------
    # Monitoring and extensions
    parameters = model.get_parameter_dict()
    observed_vars = [cost_train, train_cost_per_character,
                     aggregation.mean(algorithm.total_gradient_norm)]
    # Track a 2-norm for every parameter and its gradient.
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name + "_grad_norm"))
    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train", after_epoch=True)

    dev_monitor = DataStreamMonitoring(
        variables=[cost_train, cost_per_character],
        data_stream=dev_stream, prefix="dev")
    train_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                      train_stream, prefix='train',
                                      every_n_epochs=1, before_training=True,
                                      phoneme_dict=phoneme_dict,
                                      black_list=black_list, train=True)
    dev_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                    dev_stream, prefix='dev',
                                    every_n_epochs=1,
                                    phoneme_dict=phoneme_dict,
                                    black_list=black_list)

    extensions = []
    # Optionally resume from a checkpoint.
    if 'load_path' in kwargs:
        extensions.append(Load(kwargs['load_path']))

    extensions.extend([FinishAfter(after_n_epochs=epochs),
                       train_monitor,
                       dev_monitor,
                       train_ctc_monitor,
                       dev_ctc_monitor])

    if test_cost:
        test_monitor = DataStreamMonitoring(
            variables=[cost_train, cost_per_character],
            data_stream=test_stream,
            prefix="test")
        test_ctc_monitor = CTCMonitoring(x, input_mask, y_hat, eol_symbol,
                                         test_stream, prefix='test',
                                         every_n_epochs=1,
                                         phoneme_dict=phoneme_dict,
                                         black_list=black_list)
        extensions.append(test_monitor)
        extensions.append(test_ctc_monitor)

    #if not os.path.exists(experiment_path):
    #    os.makedirs(experiment_path)
    #best_path = os.path.join(experiment_path, 'best/')
    #if not os.path.exists(best_path):
    #    os.mkdir(best_path)
    #best_path = os.path.join(best_path, 'model.bin')
    # NOTE(review): model saving is disabled above -- EarlyStopping writes to
    # '/dev/null', so the best model is tracked but never persisted.
    extensions.append(EarlyStopping(to_watch, patience, '/dev/null'))
    extensions.extend([ProgressBar(), Printing()])

    # ------------------------------------------------------------------------
    # Main Loop
    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)

    print "Building time: %f" % (time.time() - t0)
    # if write_predictions:
    #     with open('predicted.txt', 'w') as f_pred:
    #         with open('targets.txt', 'w') as f_targets:
    #             evaluator = CTCEvaluator(eol_symbol, x, input_mask, y_hat,
    #                                      phoneme_dict, black_list)
    #             evaluator.evaluate(dev_stream, file_pred=f_pred,
    #                                file_targets=f_targets)
    #             return
    main_loop.run()
def main(mode, config, use_bokeh=False):
    """Drive a topic-aware NMT encoder-decoder experiment.

    Parameters
    ----------
    mode : str
        'train'     -- build the cost graph and run the Blocks main loop;
        'translate' -- beam-search decode a validation set and dump
                       attention weights;
        'score'     -- compute per-sentence costs on a test stream.
    config : dict
        Experiment configuration (vocabulary sizes, dimensions, file paths,
        training hyper-parameters, ...).
    use_bokeh : bool
        If True (and BOKEH_AVAILABLE), live-plot the decoder cost.
    """
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    topical_transformer = topicalq_transformer(
        config['source_topic_vocab_size'], config['topical_embedding_dim'],
        config['enc_nhids'], config['topical_word_num'],
        config['batch_size'])
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      topicWord_size=config['trg_topic_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      topical_dim=config['topical_embedding_dim'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # here attended dim (representation_dim) of decoder is 2*enc_nhinds
    # because the context given by the encoder is a bidirectional context
    if mode == "train":
        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        #target_topic_sentence_mask=tensor.lmatrix('target_topic_mask');
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topic_target(**config)
        #dev_stream = get_dev_tr_stream_with_topic_target(**config)

        # Get cost of the model
        representations = encoder.apply(source_sentence,
                                        source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        # Takes the second half of the first state -- presumably the
        # backward-direction features of the bidirectional encoder; note the
        # Python-2 integer division on shape[2] / 2.
        content_embedding = representations[0, :,
                                            (representations.shape[2] / 2):]
        cost = decoder.cost(representations, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            target_topic_sentence,
                            target_topic_binary_sentence, topic_embedding,
                            content_embedding)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        cg = ComputationGraph(cost)
        # Compiled perplexity evaluator (used by the disabled pplValidation
        # extension below).
        costs_computer = function([target_sentence, target_sentence_mask,
                                   source_sentence, source_sentence_mask,
                                   source_topical_word,
                                   target_topic_sentence,
                                   target_topic_binary_sentence],
                                  (perplexity), on_unused_input='ignore')

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        # Recurrent transitions get orthogonal init.
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()  # don't know whether the initialize is for
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        # Overwrite the lookup table with pre-trained topical embeddings.
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        # Clearing the roles presumably freezes these embeddings (they stop
        # being picked up as trainable parameters) -- confirm.
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape,
                                               name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'], config['model_name'],
                          every_n_batches=config['save_freq'])]

        # # Set up beam search and sampling computation graphs if necessary
        # if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        #     logger.info("Building sampling model")
        #     sampling_representation = encoder.apply(
        #         sampling_input, tensor.ones(sampling_input.shape))
        #     generated = decoder.generate(
        #         sampling_input, sampling_representation)
        #     search_model = Model(generated)
        #     _, samples = VariableFilter(
        #         bricks=[decoder.sequence_generator], name="outputs")(
        #             ComputationGraph(generated[1]))
        #
        # # Add sampling
        # if config['hook_samples'] >= 1:
        #     logger.info("Building sampler")
        #     extensions.append(
        #         Sampler(model=search_model, data_stream=tr_stream,
        #                 model_name=config['model_name'],
        #                 hook_samples=config['hook_samples'],
        #                 every_n_batches=config['sampling_freq'],
        #                 src_vocab_size=config['src_vocab_size']))
        #
        # # Add early stopping based on bleu
        # if False:
        #     logger.info("Building bleu validator")
        #     extensions.append(
        #         BleuValidator(sampling_input, samples=samples,
        #                       config=config,
        #                       model=search_model, data_stream=dev_stream,
        #                       normalize=config['normalized_bleu'],
        #                       every_n_batches=config['bleu_val_freq'],
        #                       n_best=3,
        #                       track_n_models=6))
        #
        # logger.info("Building perplexity validator")
        # extensions.append(
        #     pplValidation( config=config,
        #                   model=costs_computer, data_stream=dev_stream,
        #                   model_name=config['model_name'],
        #                   every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # On reload, resume the learning rate from the previous run's log.
        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        # NOTE(review): eval() on config['step_rule'] executes arbitrary
        # text from the config file -- acceptable only for trusted configs.
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]),
            on_unused_sources='ignore')

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model,
                             algorithm=algorithm,
                             data_stream=tr_stream,
                             extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':
        logger.info('Creating theano variables')
        sampling_input = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')
        tw_vocab_overlap = tensor.lmatrix('tw_vocab_overlap')
        tw_vocab_overlap_matrix = cPickle.load(
            open(config['tw_vocab_overlap'], 'rb'))
        tw_vocab_overlap_matrix = numpy.array(tw_vocab_overlap_matrix,
                                              dtype='int32')
        #tw_vocab_overlap=shared(tw_vocab_overlap_matrix);

        topic_embedding = topical_transformer.apply(source_topical_word)

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        # Inverse vocabulary: index -> word.
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        topic_embedding = topical_transformer.apply(source_topical_word)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = sampling_representation[
            0, :, (sampling_representation.shape[2] / 2):]
        generated = decoder.generate(sampling_input,
                                     sampling_representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_with_topicalq(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw.split()
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq1 = line[1]
            # Replicate the source (and its topical words) across the beam.
            input_topical = numpy.tile(seq1, (config['beam_size'], 1))
            input_ = numpy.tile(seq, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={sampling_input: input_,
                                  source_topical_word: input_topical,
                                  tw_vocab_overlap: tw_vocab_overlap_matrix},
                    tw_vocab_overlap=tw_vocab_overlap_matrix,
                    max_length=3 * len(seq), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)

            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths

            best = numpy.argsort(costs)[0]
            try:
                total_cost += costs[best]
                trans_out = trans[best]
                weight = weights[best][:, :len(trans_in)]
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'

            # NOTE(review): if ValueError fires on the very first sentence,
            # `weight` is undefined here (NameError); on later sentences the
            # previous iteration's weights are silently re-appended.
            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)

        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights,
                     open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        # ap = afterprocesser(config)
        # ap.main()

    elif mode == 'score':
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        target_topic_sentence = tensor.lmatrix('target_topic')
        target_topic_binary_sentence = tensor.lmatrix('target_binary_topic')
        source_topical_word = tensor.lmatrix('source_topical')

        topic_embedding = topical_transformer.apply(source_topical_word)
        # Get cost of the model
        representations = encoder.apply(source_sentence,
                                        source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask,
                             target_topic_sentence,
                             target_topic_binary_sentence, topic_embedding)

        # Score one sentence at a time.
        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        # Get test set stream
        test_stream = get_tr_stream_with_topic_target(**config)

        logger.info("Building sampling model")

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['validation_load'])
        loader.set_model_parameters(model, loader.load_parameters_default())

        costs_computer = function([target_sentence, target_sentence_mask,
                                   source_sentence, source_sentence_mask,
                                   source_topical_word,
                                   target_topic_sentence,
                                   target_topic_binary_sentence],
                                  (costs), on_unused_input='ignore')
        iterator = test_stream.get_epoch_iterator()

        scores = []
        att_weights = []
        for i, (src, src_mask, trg, trg_mask, te, te_mask, tt, tt_mask, tb,
                tb_mask) in enumerate(iterator):
            costs = costs_computer(*[trg, trg_mask, src, src_mask, te, tt,
                                     tb])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)
        # NOTE(review): 10007 is a hard-coded test-set size; should be
        # len(scores) to stay correct for other datasets.
        print(sum(scores) / 10007)
def Fire(image_shape, num_input, conv1, conv2, conv3, out, i):
    """SqueezeNet-style Fire module built from Blocks bricks.

    Squeeze: four parallel 1x1 Conv->BatchNorm->ReLU stacks (conv1 filters
    each) applied to `out`, concatenated on the channel axis.
    Expand: four parallel 1x1 stacks (conv2 filters each) and four parallel
    3x3 stacks (conv3 filters each) applied to the squeezed tensor,
    concatenated on the channel axis.

    Parameters
    ----------
    image_shape : tuple
        Spatial (height, width) fed to every ConvolutionalSequence.
    num_input : int
        Channel count of `out`.
    conv1, conv2, conv3 : int
        Filters per squeeze 1x1 / expand 1x1 / expand 3x3 branch.
    out : theano variable
        Input feature map.
    i : int
        Starting index used to generate unique brick names
        (conv_{i}, batch_{i}, convSeq_{i}); twelve consecutive indices
        are consumed, in the same order as the original unrolled code.

    Returns
    -------
    theano variable
        Channel-wise concatenation of the eight expand branches, i.e.
        conv2 * 4 + conv3 * 4 output channels.
    """
    def _conv_block(inp, in_channels, n_filters, filter_size, idx):
        # One Conv -> BatchNorm -> ReLU stack in its own
        # ConvolutionalSequence (orthogonal init, no biases), mirroring one
        # unrolled branch of the original implementation.
        layers = [
            Convolutional(filter_size=filter_size,
                          num_channels=in_channels,
                          num_filters=n_filters,
                          image_size=image_shape,
                          border_mode='half',
                          name='conv_{}'.format(idx)),
            BatchNormalization(name='batch_{}'.format(idx)),
            Rectifier(),
        ]
        seq = ConvolutionalSequence(layers, num_channels=in_channels,
                                    image_size=image_shape,
                                    weights_init=Orthogonal(),
                                    use_bias=False,
                                    name='convSeq_{}'.format(idx))
        seq.initialize()
        return seq.apply(inp)

    ############# SQUEEZE ###########
    # Four parallel 1x1 convolutions on the module input (indices i..i+3).
    squeeze_outs = []
    for _ in range(4):
        squeeze_outs.append(_conv_block(out, num_input, conv1, (1, 1), i))
        i = i + 1
    squeezed = T.concatenate(squeeze_outs, axis=1)

    ####### EXPAND #####
    # The squeeze stage concatenated four conv1-filter branches.
    num_input2 = conv1 * 4
    expand_outs = []
    # Four parallel 1x1 convolutions (indices i+4..i+7).
    for _ in range(4):
        expand_outs.append(_conv_block(squeezed, num_input2, conv2,
                                       (1, 1), i))
        i = i + 1
    # Four parallel 3x3 convolutions (indices i+8..i+11).
    for _ in range(4):
        expand_outs.append(_conv_block(squeezed, num_input2, conv3,
                                       (3, 3), i))
        i = i + 1

    # Merge all expand branches channel-wise.
    return T.concatenate(expand_outs, axis=1)
#Define the parameters #Create the symbolics variable x = tensor.tensor4('image_features') y = tensor.lmatrix('targets') num_epochs = 1000 layers = [] ###############FIRST STAGE####################### #Create the convolutions layers layers.append(Convolutional(filter_size=(7,7), step=(2,2), num_filters=96, border_mode='half', name='conv_0')) layers.append(BatchNormalization(name='batch_0')) layers.append(Rectifier()) layers.append(MaxPooling((3,3), step=(2,2), padding=(1,1), name='pool_0')) convSeq = ConvolutionalSequence(layers, num_channels=3, image_size=(220,220), weights_init=Orthogonal(), use_bias=False, name='ConvSeq') convSeq.initialize() out = convSeq.apply(x) #FIRE MODULES out1 = Fire((55,55), 96, 16, 16, 16, out, 10) out2 = Fire((55,55), 128, 16, 16, 16, out1, 25) out3 = Fire((55,55), 128, 32, 32, 32, out2, 300) out31 = MaxPooling((3,3), step=(2,2), padding=(1,1), name='poolLow').apply(out3) out4 = Fire((28,28), 256, 32, 32, 32, out31, 45) out5 = Fire((28,28), 256, 48, 48, 48, out4, 500) out6 = Fire((28,28), 384, 48, 48, 48, out5, 65) out7 = Fire((28,28), 384, 64, 64, 64, out6, 700) out71 = MaxPooling((3,3), step=(2,2), padding=(1,1), name='poolLow2').apply(out7) out8 = Fire((14,14), 512, 64, 64, 64, out71, 85)
    def __init__(self, *args, **kwargs):
        """Forward all arguments to the GlorotBengio base initializer and
        keep an Orthogonal initializer for drawing square blocks."""
        super(OrthogonalGlorot,self).__init__(*args, **kwargs)
        # Used by generate() to sample orthogonal (sub-)matrices.
        self.orth = Orthogonal()
def main(mode, save_path, num_batches, from_dump):
    """Train or sample from a word-reversal sequence-to-sequence model.

    Parameters
    ----------
    mode : str
        'train' -- build the attention model on One Billion Word character
        data and run the main loop; 'test' -- load a pickled model from
        `save_path` and sample reversals interactively.
    save_path : str
        Where the main loop is serialized (train) / loaded from (test).
    num_batches : int
        Batch count before FinishAfter stops training.
    from_dump : str or None
        Optional dump to resume training from.
    """
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline: 1B-word characters -> filter long
        # sentences -> add reversed-words targets -> batches of 10 -> pad ->
        # transpose to time-major.
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets",),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99], char2code,
                                level="character",
                                preprocess=str.lower).get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(GatedRecurrent(dim=dimension,
                                               activation=Tanh()),
                                weights_init=Orthogonal())
        encoder.initialize()
        # Fork the embedded input into the recurrent transition's sequence
        # inputs (everything except the mask).
        fork = Fork([name for name in encoder.prototype.apply.sequences
                     if name != 'mask'],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension, dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        # attended_dim is 2*dimension: the encoder is bidirectional.
        transition = Transition(activation=Tanh(), dim=dimension,
                                attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension, name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(
                                    readout_dimension, dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        # Override: recurrent weights are orthogonal.
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True),
                             mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(
            VariableFilter(application=readout.readout,
                           name="output")(cg.variables),
            singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(),
                                     "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost,
            step_rule=CompositeRule([GradientClipping(10.0),
                                     SteepestDescent(0.01)]))

        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        # Per-parameter weight and gradient norms.
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(observables, prefix="average",
                                       every_n_batches=10),
                # Stop after num_batches, or earlier if the gradient norm
                # goes NaN.
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch",
                    lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path, every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "test":
        # Load the pickled bricks and build a sampling graph.
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0], batch_size=chars.shape[1],
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            # Encode the sentence, mapping unknown characters to <UNK> and
            # wrapping with <S>...</S>.
            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)
            # Sample batch_size reversals of the same input.
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                # Truncate at the first </S>, if any.
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            # Print samples from highest to lowest cost.
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
def main(mode, config, use_bokeh=False):
    """Entry point for a context-aware NMT system (Blocks/Theano).

    Parameters
    ----------
    mode : str
        One of ``"train"`` (fit the model), ``"ppl"`` (compute validation
        perplexity for a saved model) or ``"translate"`` (beam-search decode
        a test set with a saved model).
    config : dict
        Experiment configuration (vocab sizes, layer dims, paths, training
        hyper-parameters).  Keys used are indexed directly throughout.
    use_bokeh : bool, optional
        If True (and bokeh is importable), live-plot the training cost.
    """
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'],
        config['enc_nhids'], name='word_encoder')
    decoder = Decoder(vocab_size=config['trg_vocab_size'],
                      embedding_dim=config['dec_embed'],
                      state_dim=config['dec_nhids'],
                      representation_dim=config['enc_nhids'] * 2,
                      match_function=config['match_function'],
                      use_doubly_stochastic=config['use_doubly_stochastic'],
                      lambda_ds=config['lambda_ds'],
                      use_local_attention=config['use_local_attention'],
                      window_size=config['window_size'],
                      use_step_decay_cost=config['use_step_decay_cost'],
                      use_concentration_cost=config['use_concentration_cost'],
                      lambda_ct=config['lambda_ct'],
                      use_stablilizer=config['use_stablilizer'],
                      lambda_st=config['lambda_st'])
    # here attended dim (representation_dim) of decoder is 2*enc_nhinds
    # because the context given by the encoder is a bidirectional context

    if mode == "train":
        # Create Theano variables
        logger.info('Creating theano variables')
        # One (sentence, mask) symbolic pair per preceding context sentence.
        context_sentences = [];
        context_sentence_masks = [];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)));
            context_sentence_masks.append(tensor.matrix('context_' + str(i) + '_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        dev_source = tensor.lmatrix('dev_source')
        dev_target = tensor.lmatrix('dev_target')

        # Get training and development set streams
        tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_with_grdTruth(**config)

        # Get cost of the model.
        # The source representation is stacked together with the context
        # representations along a new leading axis (one slice per sentence).
        sentence_representations_list = encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list = sentence_representations_list.dimshuffle(['x', 0, 1, 2]);
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1]);
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i], context_sentence_masks[i]);
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2]);
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0);
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])], axis=0);
        cost = decoder.cost(sentence_representations_list, sentence_masks_list,
                            target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        perplexity = tensor.exp(cost)
        perplexity.name = 'perplexity'
        # Compiled evaluator used later by the perplexity validator.
        # Input order: contexts, context masks, then target/source pairs.
        costs_computer = function(
            context_sentences + context_sentence_masks +
            [target_sentence, target_sentence_mask,
             source_sentence, source_sentence_mask], (perplexity))
        cg = ComputationGraph(cost)

        # Initialize model.  NOTE: weights_init/biases_init must be set
        # before push_initialization_config()/initialize() for them to apply.
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        # Recurrent transitions get orthogonal init (overrides the Gaussian).
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [x for x in cg.intermediary_variables
                              if x.name == 'maxout_apply_output']
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(
                cg, enc_params + dec_params, config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            # NOTE(review): '{:15}'.format(shape) on a tuple raises TypeError
            # on some Python versions — consider formatting str(shape); verify.
            logger.info(' {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                                   Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info(' {:15}: {}'.format(value.get_value().shape, name))
        logger.info("Total number of parameters: {}"
                    .format(len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([perplexity], after_batch=True),
            CheckpointNMT(config['saveto'],
                          config['model_name'],
                          every_n_batches=config['save_freq'])
        ]

        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

            # Add sampling
            if config['hook_samples'] >= 1:
                logger.info("Building sampler")
                extensions.append(
                    Sampler(model=search_model, data_stream=tr_stream,
                            model_name=config['model_name'],
                            hook_samples=config['hook_samples'],
                            every_n_batches=config['sampling_freq'],
                            src_vocab_size=config['src_vocab_size']))

            # Add early stopping based on bleu
            # NOTE: deliberately disabled (`if False`); kept for reference.
            if False:
                logger.info("Building bleu validator")
                extensions.append(
                    BleuValidator(sampling_input, samples=samples,
                                  config=config, model=search_model,
                                  data_stream=dev_stream,
                                  normalize=config['normalized_bleu'],
                                  every_n_batches=config['bleu_val_freq'],
                                  n_best=3, track_n_models=6))

            logger.info("Building perplexity validator")
            extensions.append(
                pplValidation(dev_source, dev_target, config=config,
                              model=costs_computer, data_stream=dev_stream,
                              model_name=config['model_name'],
                              every_n_batches=config['sampling_freq']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Resume the learning rate from the previous run's log, if present.
        initial_learning_rate = config['initial_learning_rate']
        log_path = os.path.join(config['saveto'], 'log')
        if config['reload'] and os.path.exists(log_path):
            with open(log_path, 'rb') as source:
                log = cPickle.load(source)
                last = max(log.keys()) - 1
                if 'learning_rate' in log[last]:
                    initial_learning_rate = log[last]['learning_rate']

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        # NOTE(review): eval() of a config string — fine for trusted configs,
        # but do not feed untrusted input through config['step_rule'].
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([Scale(initial_learning_rate),
                                     StepClipping(config['step_clipping']),
                                     eval(config['step_rule'])()]))

        _learning_rate = algorithm.step_rule.components[0].learning_rate
        if config['learning_rate_decay']:
            extensions.append(
                LearningRateHalver(record_name='validation_cost',
                                   comparator=lambda x, y: x > y,
                                   learning_rate=_learning_rate,
                                   patience_default=3))
        else:
            extensions.append(OldModelRemover(saveto=config['saveto']))

        if config['learning_rate_grow']:
            extensions.append(
                LearningRateDoubler(record_name='validation_cost',
                                    comparator=lambda x, y: x < y,
                                    learning_rate=_learning_rate,
                                    patience_default=3))

        extensions.append(
            SimplePrinting(config['model_name'], after_batch=True))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(
            model=training_model,
            algorithm=algorithm,
            data_stream=tr_stream,
            extensions=extensions
        )

        # Train!
        main_loop.run()

    elif mode == 'ppl':
        # Create Theano variables
        # Create Theano variables
        logger.info('Creating theano variables')
        context_sentences = [];
        context_sentence_masks = [];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)));
            context_sentence_masks.append(tensor.matrix('context_' + str(i) + '_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        # Get training and development set streams
        #tr_stream = get_tr_stream_withContext(**config)
        dev_stream = get_dev_stream_withContext_grdTruth(**config)

        # Get cost of the model (same stacked-context graph as in training).
        sentence_representations_list = encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list = sentence_representations_list.dimshuffle(['x', 0, 1, 2]);
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1]);
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i], context_sentence_masks[i]);
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2]);
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0);
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])], axis=0);
        cost = decoder.cost(sentence_representations_list, sentence_masks_list,
                            target_sentence, target_sentence_mask)

        logger.info('Creating computational graph')
        # Here the evaluator returns the raw cost (not exp(cost) as in train).
        costs_computer = function(
            context_sentences + context_sentence_masks +
            [target_sentence, target_sentence_mask,
             source_sentence, source_sentence_mask], (cost))

        logger.info("Loading the model..")
        model = Model(cost)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load']);
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started Validation: ")
        ts = dev_stream.get_epoch_iterator()
        total_cost = 0.0
        total_tokens = 0.0
        #pbar = ProgressBar(max_value=len(ts)).start()#modified
        # Hard-coded max_value: the iterator has no len(); 10000 is a cap.
        pbar = ProgressBar(max_value=10000).start();
        for i, (ctx_0, ctx_0_mask, ctx_1, ctx_1_mask, ctx_2, ctx_2_mask,
                src, src_mask, trg, trg_mask) in enumerate(ts):
            # Reorder stream fields to match the compiled input order:
            # contexts, context masks, target pair, source pair.
            costs = costs_computer(*[ctx_0, ctx_1, ctx_2,
                                     ctx_0_mask, ctx_1_mask, ctx_2_mask,
                                     trg, trg_mask, src, src_mask])
            cost = costs.sum()
            total_cost += cost
            total_tokens += trg_mask.sum()
            pbar.update(i + 1)
        total_cost /= total_tokens;
        pbar.finish()
        #dev_stream.reset()

        # run afterprocess
        # self.ap.main()
        # Report perplexity as 2**(average per-token cost).
        total_cost = 2 ** total_cost;
        print("Average validation cost: " + str(total_cost));

    elif mode == 'translate':
        logger.info('Creating theano variables')
        context_sentences = [];
        context_sentence_masks = [];
        for i in range(config['ctx_num']):
            context_sentences.append(tensor.lmatrix('context_' + str(i)));
            context_sentence_masks.append(tensor.matrix('context_' + str(i) + '_mask'));
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')

        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1
        trg_vocab = _ensure_special_tokens(
            cPickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        # Inverse vocabulary: index -> token, for detokenizing samples.
        trg_ivocab = {v: k for k, v in trg_vocab.items()}
        config['batch_size'] = 1

        # Same stacked-context representation graph as above.
        sentence_representations_list = encoder.apply(source_sentence, source_sentence_mask);
        sentence_representations_list = sentence_representations_list.dimshuffle(['x', 0, 1, 2]);
        sentence_masks_list = source_sentence_mask.T.dimshuffle(['x', 0, 1]);
        for i in range(config['ctx_num']):
            tmp_rep = encoder.apply(context_sentences[i], context_sentence_masks[i]);
            tmp_rep = tmp_rep.dimshuffle(['x', 0, 1, 2]);
            sentence_representations_list = tensor.concatenate(
                [sentence_representations_list, tmp_rep], axis=0);
            sentence_masks_list = tensor.concatenate(
                [sentence_masks_list,
                 context_sentence_masks[i].T.dimshuffle(['x', 0, 1])], axis=0);
        generated = decoder.generate(sentence_representations_list,
                                     sentence_masks_list)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        #loader = LoadNMT(config['saveto'])
        loader = LoadNMT(config['validation_load']);
        loader.set_model_parameters(model, loader.load_parameters_default())

        logger.info("Started translation: ")
        test_stream = get_dev_stream_withContext(**config)
        ts = test_stream.get_epoch_iterator()
        rts = open(config['val_set_source']).readlines()
        ftrans_original = open(config['val_output_orig'], 'w')
        saved_weights = []
        total_cost = 0.0

        pbar = ProgressBar(max_value=len(rts)).start()
        for i, (line, line_raw) in enumerate(zip(ts, rts)):
            trans_in = line_raw[3].split()
            seqs = [];
            input_ = [];
            input_mask = [];
            # Tile each sentence/mask beam_size times for the beam search.
            for j in range(config['ctx_num'] + 1):
                seqs.append(sutils._oov_to_unk(
                    line[2 * j][0], config['src_vocab_size'], unk_idx))
                input_mask.append(numpy.tile(line[2 * j + 1][0],
                                             (config['beam_size'], 1)))
                input_.append(numpy.tile(seqs[j], (config['beam_size'], 1)))
            #v=costs_computer(input_[0]);
            # draw sample, checking to ensure we don't get an empty string back
            trans, costs, attendeds, weights = \
                beam_search.search(
                    input_values={source_sentence: input_[3],
                                  source_sentence_mask: input_mask[3],
                                  context_sentences[0]: input_[0],
                                  context_sentence_masks[0]: input_mask[0],
                                  context_sentences[1]: input_[1],
                                  context_sentence_masks[1]: input_mask[1],
                                  context_sentences[2]: input_[2],
                                  context_sentence_masks[2]: input_mask[2]},
                    max_length=3 * len(seqs[2]), eol_symbol=trg_eos_idx,
                    ignore_first_eol=True)
            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            # Pick the lowest-cost hypothesis.
            b = numpy.argsort(costs)[0]
            #best=numpy.argsort(costs)[0:config['beam_size']];
            #for b in best:
            try:
                total_cost += costs[b]
                trans_out = trans[b]
                totalLen = 4 * len(line[0][0]);
                #weight = weights[b][:, :totalLen]
                weight = weights
                trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
            except ValueError:
                logger.info(
                    "Can NOT find a translation for line: {}".format(i + 1))
                trans_out = '<UNK>'
            saved_weights.append(weight)
            print(' '.join(trans_out), file=ftrans_original)
            pbar.update(i + 1)
        pbar.finish()
        logger.info("Total cost of the test: {}".format(total_cost))
        cPickle.dump(saved_weights, open(config['attention_weights'], 'wb'))
        ftrans_original.close()
        ap = afterprocesser(config)
        ap.main()
dictionary={ 'a': 1, 'b': 2, 'c': 3, '<UNK>': 4 }, bos_token=None, eos_token=None, unk_token='<UNK>', level='character') alphabet_size = 4 lstm_dim = 2 lstm1 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) lstm2 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) rnn = RecurrentStack([lstm1, lstm2], name="transition") readout = Readout(readout_dim=alphabet_size, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedback_brick=LookupFeedback(alphabet_size, feedback_dim=alphabet_size, name="feedback"), name="readout") seq_gen = SequenceGenerator(readout=readout, transition=rnn, weights_init=IsotropicGaussian(0.01),
def load_params_and_get_beam_search(exp_config, decoder=None, encoder=None,
                                    brick_delimiter=None):
    """Build (or reuse) an encoder/decoder, wire up the sampling graph, and
    return a ready-to-use beam search.

    Parameters
    ----------
    exp_config : dict
        Experiment configuration (vocab sizes, layer dims, optional
        'n_steps', and optional saved-parameter paths).
    decoder : NMTPrefixDecoder, optional
        Reused when provided (validation during training); constructed here
        when None (pure prediction).
    encoder : BidirectionalEncoder, optional
        Same reuse semantics as `decoder`.
    brick_delimiter : str, optional
        Passed through to LoadNMT when loading parameter files.

    Returns
    -------
    tuple
        (beam_search, search_model, samples, sampling_input, sampling_prefix)
    """
    if encoder is None:
        encoder = BidirectionalEncoder(exp_config['src_vocab_size'],
                                       exp_config['enc_embed'],
                                       exp_config['enc_nhids'])

    # Note: decoder should be None when we are just doing prediction, not validation
    if decoder is None:
        decoder = NMTPrefixDecoder(exp_config['trg_vocab_size'],
                                   exp_config['dec_embed'],
                                   exp_config['dec_nhids'],
                                   exp_config['enc_nhids'] * 2,
                                   loss_function='cross_entropy')

    # rename to match baseline NMT systems so that params can be transparently initialized
    decoder.name = 'decoder'

    # Create Theano variables
    logger.info('Creating theano variables')
    sampling_input = tensor.lmatrix('sampling_input')
    sampling_prefix = tensor.lmatrix('sampling_target_prefix')

    # Get beam search
    logger.info("Building sampling model")
    sampling_representation = encoder.apply(sampling_input,
                                            tensor.ones(sampling_input.shape))
    # Note: prefix can be empty if we want to simulate baseline NMT
    n_steps = exp_config.get('n_steps', None)
    generated = decoder.generate(sampling_input, sampling_representation,
                                 target_prefix=sampling_prefix,
                                 n_steps=n_steps)

    # create the 1-step sampling graph
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(ComputationGraph(
            generated[1]))  # generated[1] is next_outputs

    # set up beam search
    beam_search = BeamSearch(samples=samples)

    logger.info("Creating Search Model...")
    search_model = Model(generated)

    # optionally set beam search model parameter values from an .npz file
    # Note: we generally would set the model params in this way when doing only prediction/evaluation
    # Go ahead and initialize to some random values -- this is because the confidence model params below are optional
    if not hasattr(encoder, 'initialized'):
        encoder.push_initialization_config()
        # FIX: weights_init must be assigned BEFORE initialize(), otherwise
        # the orthogonal init is silently ignored (the original code set it
        # after initialize(); the decoder branch below has the correct order).
        encoder.bidir.prototype.weights_init = Orthogonal()
        encoder.initialize()
    if not hasattr(decoder, 'initialized'):
        decoder.push_initialization_config()
        decoder.transition.weights_init = Orthogonal()
        decoder.initialize()

    if exp_config.get('load_from_saved_parameters', False):
        logger.info("Loading parameters from model: {}".format(
            exp_config['saved_parameters']))
        param_values = LoadNMT.load_parameter_values(
            exp_config['saved_parameters'], brick_delimiter=brick_delimiter)
        LoadNMT.set_model_parameters(search_model, param_values)

    # TODO: CONFIDENCE PREDICTION SHOULD BE OPTIONAL -- RIGHT NOW IT'S HARD-CODED INTO BEAM SEARCH
    if exp_config.get('confidence_saved_parameters', False):
        param_values = LoadNMT.load_parameter_values(
            exp_config['confidence_saved_parameters'],
            brick_delimiter=brick_delimiter)
        LoadNMT.set_model_parameters(search_model, param_values)

    return beam_search, search_model, samples, sampling_input, sampling_prefix
def set_up_predictor(self, nmt_model_path):
    """Initializes the predictor with the given NMT model.

    Code following ``blocks.machine_translation.main``.

    Builds the full encoder-decoder training graph (cost is constructed only
    so the graph/regularizers match training), initializes parameters, then
    builds the sampling graph and loads weights from `nmt_model_path`.
    Side effects: sets ``self.source_sentence``, ``self.samples``,
    ``self.model``, ``self.normalize``, ``self.verbose``,
    ``self.best_models``, ``self.val_bleu_curve`` and
    ``self.search_algorithm``.
    """
    self.src_vocab_size = self.config['src_vocab_size']
    self.trgt_vocab_size = self.config['trg_vocab_size']

    # Create Theano variables
    logging.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logging.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                   self.config['enc_embed'],
                                   self.config['enc_nhids'])
    decoder = Decoder(self.config['trg_vocab_size'],
                      self.config['dec_embed'],
                      self.config['dec_nhids'],
                      self.config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logging.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model (TODO: really necessary?)
    # NOTE: init config must be set before push/initialize for it to apply.
    logging.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        self.config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Apply dropout for regularization (TODO: remove?)
    if self.config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logging.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

    # Apply weight noise for regularization (TODO: remove?)
    if self.config['weight_noise_ff'] > 0.0:
        logging.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         self.config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logging.debug("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # NOTE(review): '{:15}'.format(shape) on a tuple may raise TypeError
        # on some Python versions — consider str(shape); verify.
        logging.debug(' {:15}: {}'.format(shape, count))
    logging.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logging.debug("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logging.debug(' {:15}: {}'.format(value.get_value().shape, name))
    logging.info("Total number of parameters: {}"
                 .format(len(enc_dec_param_dict)))

    # Set up training model
    logging.info("Building model")

    # Set extensions
    logging.info("Initializing extensions")

    # Set up beam search and sampling computation graphs if necessary
    logging.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Follows blocks.machine_translation.BleuValidator.__init__
    self.source_sentence = sampling_input
    self.samples = samples
    self.model = search_model
    self.normalize = True
    self.verbose = self.config.get('val_set_out', None)

    # Reload model if necessary
    if self.config['reload']:
        loader = LoadNMT(nmt_model_path, self.config['saveto'], search_model)
        loader.load_weights()

    self.best_models = []
    self.val_bleu_curve = []
    self.search_algorithm = MyopticSearch(samples=samples)
    self.search_algorithm.compile()
def main(config, tr_stream, dev_stream, source_vocab, target_vocab,
         use_bokeh=False):
    """Train an interactive-MT (prefix-conditioned) NMT model.

    Parameters
    ----------
    config : dict
        Experiment configuration (vocab sizes, layer dims, paths,
        regularization and training hyper-parameters).
    tr_stream, dev_stream
        Fuel data streams for training and validation.
    source_vocab, target_vocab : dict
        Token -> index vocabularies, passed to the sampler/validators.
    use_bokeh : bool, optional
        If True (and bokeh is importable), live-plot training/validation.
    """
    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')

    # Note that the _names_ are changed from normal NMT
    # for IMT training, we use only the suffix as the reference
    target_sentence = tensor.lmatrix('target_suffix')
    target_sentence_mask = tensor.matrix('target_suffix_mask')
    # TODO: change names back to *_suffix, there is currently a theano function name error
    # TODO: in the GradientDescent Algorithm

    target_prefix = tensor.lmatrix('target_prefix')
    target_prefix_mask = tensor.matrix('target_prefix_mask')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'],
                                   config['enc_nhids'])
    decoder = NMTPrefixDecoder(config['trg_vocab_size'],
                               config['dec_embed'],
                               config['dec_nhids'],
                               config['enc_nhids'] * 2,
                               loss_function='cross_entropy')
    # rename to match baseline NMT systems
    decoder.name = 'decoder'

    # TODO: change the name of `target_sentence` to `target_suffix` for more clarity
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask, target_prefix,
                        target_prefix_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # INITIALIZATION
    # NOTE: weights_init/biases_init must be set before initialize().
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        # this is the probability of dropping out, so you probably want to make it <=0.5
        logger.info('Applying dropout')
        dropout_inputs = [
            x for x in cg.intermediary_variables
            if x.name == 'maxout_apply_output'
        ]
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    trainable_params = cg.parameters
    # target_embeddings = model.get_parameter_dict()['/target_recurrent_lm_with_alignments/target_embeddings.W']
    # trainable_params.remove(source_embeddings)
    # trainable_params.remove(target_embeddings)

    # TODO: fixed dropout mask for recurrent params?

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        # NOTE(review): '{:15}'.format(shape) on a tuple may raise TypeError
        # on some Python versions — consider str(shape); verify.
        logger.info(' {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(
        Selector(encoder).get_parameters(),
        Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}".format(
        len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)

    # create the training directory, and copy this config there if directory doesn't exist
    if not os.path.isdir(config['saveto']):
        os.makedirs(config['saveto'])
        shutil.copy(config['config_file'], config['saveto'])

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring(trainable_params, after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'], every_n_batches=config['save_freq'])
    ]

    # Set up the sampling graph for validation during training
    # Theano variables for the sampling graph
    sampling_vars = load_params_and_get_beam_search(config, encoder=encoder,
                                                    decoder=decoder)
    beam_search, search_model, samples, sampling_input, sampling_prefix = sampling_vars

    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(
            Sampler(model=search_model, data_stream=tr_stream,
                    hook_samples=config['hook_samples'],
                    every_n_batches=config['sampling_freq'],
                    src_vocab=source_vocab,
                    trg_vocab=target_vocab,
                    src_vocab_size=config['src_vocab_size']))

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, sampling_prefix, samples=samples,
                          config=config, model=search_model,
                          data_stream=dev_stream,
                          src_vocab=source_vocab,
                          trg_vocab=target_vocab,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # TODO: add first-word accuracy validation
    # TODO: add IMT meteor early stopping
    if config.get('imt_f1_validation', None) is not None:
        logger.info("Building imt F1 validator")
        extensions.append(
            IMT_F1_Validator(sampling_input, sampling_prefix,
                             samples=samples, config=config,
                             model=search_model, data_stream=dev_stream,
                             src_vocab=source_vocab,
                             trg_vocab=target_vocab,
                             normalize=config['normalized_bleu'],
                             every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot(config['model_save_directory'],
                 channels=[['decoder_cost_cost'],
                           ['validation_set_bleu_score',
                            'validation_set_imt_f1_score']],
                 every_n_batches=10))

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # WORKING: implement confidence model
    # if there is dropout or random noise, we need to use the output of the modified graph
    # NOTE(review): eval() of a config string — fine for trusted configs,
    # but do not feed untrusted input through config['step_rule'].
    if config['dropout'] < 1.0 or config['weight_noise_ff'] > 0.0:
        algorithm = GradientDescent(
            cost=cg.outputs[0], parameters=trainable_params,
            step_rule=CompositeRule([
                StepClipping(config['step_clipping']),
                eval(config['step_rule'])()
            ]),
            # step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]),
            on_unused_sources='warn')
    else:
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            step_rule=CompositeRule([
                StepClipping(config['step_clipping']),
                eval(config['step_rule'])()
            ]),
            on_unused_sources='warn')
    # END WORKING: implement confidence model

    # enrich the logged information
    extensions.append(Timing(every_n_batches=100))

    # for i, (k,v) in enumerate(algorithm.updates):
    #     v.name = k.name + '_{}'.format(i)
    #
    # aux_vars = [v for v in cg.auxiliary_variables[-3:]]
    # import ipdb; ipdb.set_trace()
    extensions.extend([
        TrainingDataMonitoring([cost], after_batch=True),
        # TrainingDataMonitoring([v for k,v in algorithm.updates[:2]], after_batch=True),
        # TrainingDataMonitoring(aux_vars, after_batch=True),
        TrainingDataMonitoring(trainable_params, after_batch=True),
        Printing(after_batch=True)
    ])

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(model=training_model,
                         algorithm=algorithm,
                         data_stream=tr_stream,
                         extensions=extensions)

    # Train!
    main_loop.run()
def test_integer_sequence_generator():
    """Test a sequence generator with integer outputs.

    Such sequence generators can be used to e.g. model language.

    Exercises ``cost_matrix``, ``cost``, the AUXILIARY
    ``per_sequence_element`` variable, ``generate``, and mask handling.
    The numeric reference values are pinned to the fixed seeds below
    (rng 1234, emitter theano_seed 1234, generator seed 1234).
    """
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=readout_dim, source_names=["states"],
                emitter=SoftmaxEmitter(theano_seed=1234),
                feedback_brick=LookupFeedback(readout_dim, feedback_dim)),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])
    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 482.827, rtol=1e-5)

    # Test 'cost' method
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 16.0942, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join(
        [generator.name, generator.cost.name, 'per_sequence_element'])
    cost_per_el = [
        el for el in var_filter(cg.variables) if el.name == aux_var_name
    ][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask],
                                      [cost_per_el])(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5)

    # Test generate
    states, outputs, costs = generator.generate(iterate=True,
                                                batch_size=batch_size,
                                                n_steps=n_steps)
    # The RNG updates produced by sampling live on the new graph's updates.
    cg = ComputationGraph(states + outputs + costs)
    states_val, outputs_val, costs_val = theano.function(
        [], [states, outputs, costs], updates=cg.updates)()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -17.91811, rtol=1e-5)
    assert_allclose(costs_val.sum(), 482.863, rtol=1e-5)
    assert outputs_val.sum() == 630

    # Test masks agnostic results of cost:
    # a masked-out trailing step must not change a sequence's cost.
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]],
                      [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
def main(mode, save_path, num_batches, data_path=None):
    """Train a word-reversal model or sample/beam-search from it.

    Parameters
    ----------
    mode : str
        Either ``"train"``, ``"sample"`` or ``"beam_search"``.
    save_path : str
        Path for checkpointing (train) or for loading parameters
        (sample/beam_search).
    num_batches : int
        Stop training after this many batches.
    data_path : str, optional
        Path to a text file to train on; defaults to One Billion Word.
    """
    reverser = WordReverser(100, len(char2code), name="reverser")

    if mode == "train":
        # Data processing pipeline
        dataset_options = dict(dictionary=char2code, level="character",
                               preprocess=_lower)
        if data_path:
            dataset = TextFile(data_path, **dataset_options)
        else:
            dataset = OneBillionWord("training", [99], **dataset_options)
        data_stream = dataset.get_example_stream()
        data_stream = Filter(data_stream, _filter_long)
        data_stream = Mapping(data_stream, reverse_words,
                              add_sources=("targets",))
        data_stream = Batch(data_stream, iteration_scheme=ConstantScheme(10))
        data_stream = Padding(data_stream)
        data_stream = Mapping(data_stream, _transpose)

        # Initialization settings
        reverser.weights_init = IsotropicGaussian(0.1)
        reverser.biases_init = Constant(0.0)
        reverser.push_initialization_config()
        # BUGFIX: was `weghts_init` (typo) — the misspelled attribute was
        # silently ignored, so the encoder kept the Gaussian init instead
        # of the intended orthogonal one.
        reverser.encoder.weights_init = Orthogonal()
        reverser.generator.transition.weights_init = Orthogonal()

        # Build the cost computation graph
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")
        batch_cost = reverser.cost(
            chars, chars_mask, targets, targets_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        # Cost normalized per sequence, so it is comparable across batches.
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Give an idea of what's going on
        model = Model(cost)
        params = model.get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

        # Define the training algorithm.
        cg = ComputationGraph(cost)
        algorithm = GradientDescent(
            cost=cost, params=cg.parameters,
            step_rule=CompositeRule([StepClipping(10.0), Scale(0.01)]))

        # Fetch variables useful for debugging
        generator = reverser.generator
        (energies,) = VariableFilter(
            application=generator.readout.readout,
            name="output")(cg.variables)
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name=generator.transition.apply.states[0])(cg.variables)
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        mean_activation = named_copy(abs(activations).mean(),
                                     "mean_activation")
        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        # Also monitor the norm of every parameter and of its gradient.
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        # Construct the main loop and start training!
        average_monitoring = TrainingDataMonitoring(
            observables, prefix="average", every_n_batches=10)
        main_loop = MainLoop(
            model=model,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=[
                Timing(),
                TrainingDataMonitoring(observables, after_batch=True),
                average_monitoring,
                FinishAfter(after_n_batches=num_batches)
                # This shows a way to handle NaN emerging during
                # training: simply finish it.
                .add_condition("after_batch", _is_nan),
                Plot(os.path.basename(save_path),
                     [[average_monitoring.record_name(cost)],
                      [average_monitoring.record_name(cost_per_character)]],
                     every_n_batches=10),
                # Saving the model and the log separately is convenient,
                # because loading the whole pickle takes quite some time.
                Checkpoint(save_path, every_n_batches=500,
                           save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "sample" or mode == "beam_search":
        chars = tensor.lmatrix("input")
        generated = reverser.generate(chars)
        model = Model(generated)
        logger.info("Loading the model..")
        model.set_param_values(load_parameter_values(save_path))

        def generate(input_):
            """Generate output sequences for an input sequence.

            Incapsulates most of the difference between sampling and
            beam search.

            Returns
            -------
            outputs : list of lists
                Trimmed output sequences.
            costs : list
                The negative log-likelihood of generating the respective
                sequences.

            """
            if mode == "beam_search":
                samples, = VariableFilter(
                    bricks=[reverser.generator], name="outputs")(
                        ComputationGraph(generated[1]))
                # NOTE: this will recompile beam search functions
                # every time user presses Enter. Do not create
                # a new `BeamSearch` object every time if
                # speed is important for you.
                beam_search = BeamSearch(input_.shape[1], samples)
                outputs, costs = beam_search.search(
                    {chars: input_}, char2code['</S>'],
                    3 * input_.shape[0])
            else:
                _1, outputs, _2, _3, costs = (
                    model.get_theano_function()(input_))
                outputs = list(outputs.T)
                costs = list(costs.T)
                # Trim each sampled sequence at its end-of-sequence marker
                # and sum the per-character costs up to that point.
                for i in range(len(outputs)):
                    outputs[i] = list(outputs[i])
                    try:
                        true_length = outputs[i].index(char2code['</S>']) + 1
                    except ValueError:
                        true_length = len(outputs[i])
                    outputs[i] = outputs[i][:true_length]
                    costs[i] = costs[i][:true_length].sum()
            return outputs, costs

        while True:
            line = input("Enter a sentence\n")
            message = ("Enter the number of samples\n" if mode == "sample"
                       else "Enter the beam size\n")
            batch_size = int(input(message))

            # Encode the sentence, mapping unknown characters to <UNK>
            # and wrapping it in <S> ... </S>.
            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)

            # Replicate the input `batch_size` times so every sample /
            # beam hypothesis sees the same sentence.
            samples, costs = generate(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))
            messages = []
            for sample, cost in equizip(samples, costs):
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=operator.itemgetter(0), reverse=True)
            for _, message in messages:
                print(message)
# Experiment configuration; read as attributes (e.g. config.ctx_lstm_size)
# by the model-building code.

# Data handling flags.
shuffle_entities = True          # presumably shuffles entity markers per example -- TODO confirm against the data reader
concat_ctx_and_question = False  # feed context and question as a single sequence -- confirm semantics with reader
concat_question_before = False   # if concatenating, place the question first -- confirm semantics with reader

# Word-embedding dimensionality.
embed_size = 200

# Context encoder: one entry per bidirectional LSTM layer.
ctx_lstm_size = [256, 256]
# If True, concatenate the states of all layers instead of only the last.
ctx_skip_connections = False

# Question encoder layer sizes and skip-connection flag.
question_lstm_size = [256]
question_skip_connections = True

# Attention MLP: hidden layer sizes and their activations.
attention_mlp_hidden = [200]
attention_mlp_activations = [Tanh()]

# Optimizer: RMSProp combined with plain momentum.
step_rule = CompositeRule([RMSProp(decay_rate=0.95, learning_rate=5e-5),
                           BasicMomentum(momentum=0.9)])

# Regularization: dropout probability and weight-noise std (0. disables).
dropout = 0.2
w_noise = 0.

# Bookkeeping frequencies -- presumably in batches; confirm against the
# training loop that consumes them.
valid_freq = 10000
save_freq = 10000
print_freq = 1000

# Default parameter-initialization schemes.
weights_init = IsotropicGaussian(0.01)
biases_init = Constant(0.)
transition_weights_init = Orthogonal()
def __init__(self, config, vocab_size):
    """Build the question-answering graph: a bidirectional-LSTM context
    encoder followed by an attention-based sequence generator whose
    emitter is masked to the words present in the context.

    Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars``
    and ``self.monitor_vars_valid``, and initializes all bricks.

    Parameters
    ----------
    config : object
        Hyperparameter namespace (embed_size, ctx_lstm_size, ...).
    vocab_size : int
        Size of the vocabulary, used for embedding and readout.
    """
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    # Transpose from (batch, time) to (time, batch), the layout the
    # recurrent bricks below consume.
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    # Bag-of-words view of the context; passed to the masked emitter so
    # only words occurring in the context can be generated.
    context_bag = to_bag(context, vocab_size)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    embed.weights_init = IsotropicGaussian(0.01)
    #embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt')
    #embed.weights_init = Constant(embeddings_initial_value)

    # Calculate context encoding (concatenate layer1)
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size,
        context_mask.astype(theano.config.floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms
    if config.ctx_skip_connections:
        # With skip connections: concatenate every layer's states.
        cenc_dim = 2 * sum(config.ctx_lstm_size)  #2 : fw & bw
        cenc = tensor.concatenate(chidden_list, axis=2)
    else:
        # Otherwise only the last layer's forward+backward states.
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        cenc = tensor.concatenate(chidden_list[-2:], axis=2)
    cenc.name = 'cenc'

    # Build the encoder bricks
    transition = GatedRecurrent(activation=Tanh(),
                                dim=config.generator_lstm_size,
                                name="transition")
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=cenc_dim,
        match_dim=config.generator_lstm_size,
        name="attention")
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0],
                      attention.take_glimpses.outputs[0]],
        emitter=MaskedSoftmaxEmitter(context_bag=context_bag,
                                     name='emitter'),
        feedback_brick=LookupFeedback(vocab_size, config.feedback_size),
        name="readout")
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  attention=attention, name="generator")

    # Training cost over the answer sequence, attending over the
    # encoded context.
    cost = generator.cost(
        answer, answer_mask.astype(theano.config.floatX),
        attended=cenc,
        attended_mask=context_mask.astype(theano.config.floatX),
        name="cost")
    # generate(...)[1] are the sampled outputs; n_steps=7 caps the
    # predicted answer length at 7 tokens.
    self.predictions = generator.generate(
        n_steps=7, batch_size=config.batch_size, attended=cenc,
        attended_mask=context_mask.astype(theano.config.floatX),
        iterate=True)[1]

    # Apply dropout
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, chidden_list, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # initialize new stuff manually (change!)
    generator.weights_init = IsotropicGaussian(0.01)
    generator.biases_init = Constant(0)
    generator.push_allocation_config()
    generator.push_initialization_config()
    # Override the transition's init after the push so it sticks.
    transition.weights_init = Orthogonal()
    generator.initialize()

    # Initialize bricks
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def mainPredict(config, data_to_predict_stream, use_ensemble, lang=None,
                et_version=False, use_bokeh=False, the_track=None):
    """Rebuild the NMT encoder-decoder, reload saved parameters and run
    interactive prediction via ``predictByHand``.

    Parameters
    ----------
    config : dict
        Experiment configuration (vocab sizes, dims, paths, ...).
    data_to_predict_stream : data stream
        Stream of sentences to predict on.
    use_ensemble, lang, et_version, config, the_track
        Forwarded to ``predictByHand``.
    use_bokeh : bool, optional
        Unused in this function (kept for interface compatibility).

    Raises
    ------
    AssertionError
        If ``the_track`` is not supplied.
    Exception
        If ``config['reload']`` is falsy (no model to load).
    """
    # Create Theano variables
    # IDIOM FIX: compare to None with `is not`, never `!=` (PEP 8).
    assert the_track is not None
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2, cost_type=config['error_fct'])
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model (values are overwritten by the reload below; this
    # just allocates the parameters with the right shapes).
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info(' {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info(' {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set extensions
    logger.info("Initializing (empty) extensions")
    extensions = [
    ]

    logger.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Reload the model (as this is prediction, it is 100% necessary):
    if config['reload']:
        #extensions.append(LoadOnlyBestModel(config['saveto'])) # without early stopping use LoadOnlyModel here!
        extensions.append(LoadOnlyModel(config['saveto'])) # without early stopping use LoadOnlyModel here!
    else:
        raise Exception('No model available for prediction! (Check config[\'reload\'] variable)')

    # Set up training algorithm (required by MainLoop even though we only
    # predict).
    logger.info("Initializing training algorithm")
    # SECURITY NOTE: eval() on config['step_rule'] executes arbitrary
    # code from the config file -- acceptable only for trusted configs.
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()])
    )

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=search_model,
        algorithm=algorithm,
        #algorithm=None,
        data_stream=data_to_predict_stream,
        extensions=extensions
    )

    predictByHand(main_loop, decoder, data_to_predict_stream, use_ensemble,
                  lang, et_version, config, the_track=the_track)
def main():
    """Train an RNN on a synthetic Markov chain, or sample from a
    trained model and compare the sampled statistics with the chain's
    true equilibrium and transition probabilities.
    """
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s: %(name)s: %(levelname)s: %(message)s")

    parser = argparse.ArgumentParser(
        "Case study of generating a Markov chain with RNN.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument(
        "mode", choices=["train", "sample"],
        help="The mode to run. Use `train` to train a new model"
             " and `sample` to sample a sequence generated by an"
             " existing one.")
    parser.add_argument(
        "prefix", default="sine",
        help="The prefix for model, timing and state files")
    parser.add_argument(
        "--steps", type=int, default=100,
        help="Number of steps to plot")
    args = parser.parse_args()

    # Model sizes; num_states is taken from the synthetic data source.
    dim = 10
    num_states = ChainIterator.num_states
    feedback_dim = 8

    transition = GatedRecurrent(name="transition", activation=Tanh(),
                                dim=dim)
    generator = SequenceGenerator(
        LinearReadout(readout_dim=num_states, source_names=["states"],
                      emitter=SoftmaxEmitter(name="emitter"),
                      feedbacker=LookupFeedback(
                          num_states, feedback_dim, name='feedback'),
                      name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
        name="generator")
    generator.allocate()
    logger.debug("Parameters:\n" +
                 pprint.pformat(
                     [(key, value.get_value().shape) for key, value
                      in Selector(generator).get_params().items()],
                     width=120))

    if args.mode == "train":
        rng = numpy.random.RandomState(1)
        batch_size = 50

        generator.push_initialization_config()
        # Override the transition's init after the push so it sticks.
        transition.weights_init = Orthogonal()
        generator.initialize()

        logger.debug("transition.weights_init={}".format(
            transition.weights_init))

        cost = generator.cost(tensor.lmatrix('x')).sum()

        # Training is delegated to the Groundhog SGD trainer.
        gh_model = GroundhogModel(generator, cost)
        state = GroundhogState(args.prefix, batch_size,
                               learning_rate=0.0001).as_dict()
        data = ChainIterator(rng, 100, batch_size)
        trainer = SGD(gh_model, state, data)
        main_loop = MainLoop(data, None, None, gh_model, trainer, state,
                             None)
        main_loop.main()
    elif args.mode == "sample":
        # NOTE(review): file name is prefix + "model.npz" with no
        # separator -- presumably matches how the trainer saves; confirm.
        load_params(generator, args.prefix + "model.npz")

        sample = ComputationGraph(generator.generate(
            n_steps=args.steps, batch_size=1, iterate=True)).function()

        # batch_size=1, so take column 0 of each returned array.
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        # Empirical state frequencies vs the chain's equilibrium.
        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(freqs,
                                               ChainIterator.equilibrium))

        # Empirical transition frequencies vs the true transition matrix.
        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, ChainIterator.trans_prob))
    else:
        # Unreachable: argparse restricts mode to train/sample.
        assert False
def main(name, epochs, batch_size, learning_rate, attention, n_iter,
         enc_dim, dec_dim, z_dim):
    """Train a DRAW-style recurrent VAE on binarized MNIST.

    Parameters
    ----------
    name : str or None
        Experiment name; auto-generated from the settings if None.
    epochs, batch_size, learning_rate : training hyperparameters.
    attention : bool
        Use attention-based read/write heads instead of full-image ones.
    n_iter : int
        Number of canvas refinement iterations (scan steps).
    enc_dim, dec_dim, z_dim : int
        Encoder LSTM, decoder LSTM and latent dimensionalities.
    """
    if name is None:
        tag = "watt" if attention else "woatt"
        name = "%s-t%d-enc%d-dec%d-z%d" % (tag, n_iter, enc_dim, dec_dim,
                                           z_dim)

    print("\nRunning experiment %s" % name)
    print(" learning rate: %5.3f" % learning_rate)
    print(" attention: %s" % attention)
    print(" n_iterations: %d" % n_iter)
    print(" encoder dimension: %d" % enc_dim)
    print(" z dimension: %d" % z_dim)
    print(" decoder dimension: %d" % dec_dim)
    print()

    #------------------------------------------------------------------------
    x_dim = 28 * 28
    img_height, img_width = (28, 28)

    # Initialization schemes for recurrent vs feed-forward bricks.
    rnninits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.001),
        'biases_init': Constant(0.),
    }
    inits = {
        'weights_init': Orthogonal(),
        #'weights_init': IsotropicGaussian(0.01),
        'biases_init': Constant(0.),
    }

    # Standard-normal prior over the latents.
    prior_mu = T.zeros([z_dim])
    prior_log_sigma = T.zeros([z_dim])

    if attention:
        read_N = 4
        write_N = 6
        read_dim = 2 * read_N**2

        reader = AttentionReader(x_dim=x_dim, dec_dim=dec_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
        # NOTE(review): write_N is defined but unused -- the writer is
        # built with N=read_N. Presumably N=write_N was intended; confirm
        # before changing, as it alters the parameter shapes.
        writer = AttentionWriter(input_dim=dec_dim, output_dim=x_dim,
                                 width=img_width, height=img_height,
                                 N=read_N, **inits)
    else:
        # No attention: read the full image and its error image.
        read_dim = 2 * x_dim
        reader = Reader(x_dim=x_dim, dec_dim=dec_dim, **inits)
        writer = Writer(input_dim=dec_dim, output_dim=x_dim, **inits)

    encoder = LSTM(dim=enc_dim, name="RNN_enc", **rnninits)
    decoder = LSTM(dim=dec_dim, name="RNN_dec", **rnninits)
    # The MLPs produce the 4*dim gate pre-activations the LSTMs expect.
    encoder_mlp = MLP([Tanh()], [(read_dim + dec_dim), 4 * enc_dim],
                      name="MLP_enc", **inits)
    decoder_mlp = MLP([Tanh()], [z_dim, 4 * dec_dim],
                      name="MLP_dec", **inits)
    q_sampler = Qsampler(input_dim=enc_dim, output_dim=z_dim, **inits)

    for brick in [reader, writer, encoder, decoder,
                  encoder_mlp, decoder_mlp, q_sampler]:
        brick.allocate()
        brick.initialize()

    #------------------------------------------------------------------------
    x = tensor.matrix('features')

    # This is one iteration
    def one_iteration(c, h_enc, c_enc, z_mean, z_log_sigma, z,
                      h_dec, c_dec, x):
        # Read the error image, encode, sample z, decode, update canvas.
        x_hat = x - T.nnet.sigmoid(c)
        r = reader.apply(x, x_hat, h_dec)
        i_enc = encoder_mlp.apply(T.concatenate([r, h_dec], axis=1))
        h_enc, c_enc = encoder.apply(states=h_enc, cells=c_enc,
                                     inputs=i_enc, iterate=False)
        z_mean, z_log_sigma, z = q_sampler.apply(h_enc)
        i_dec = decoder_mlp.apply(z)
        h_dec, c_dec = decoder.apply(states=h_dec, cells=c_dec,
                                     inputs=i_dec, iterate=False)
        c = c + writer.apply(h_dec)
        return c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec

    # Initial values for all recurrent quantities.
    outputs_info = [
        T.zeros([batch_size, x_dim]),     # c
        T.zeros([batch_size, enc_dim]),   # h_enc
        T.zeros([batch_size, enc_dim]),   # c_enc
        T.zeros([batch_size, z_dim]),     # z_mean
        T.zeros([batch_size, z_dim]),     # z_log_sigma
        T.zeros([batch_size, z_dim]),     # z
        T.zeros([batch_size, dec_dim]),   # h_dec
        T.zeros([batch_size, dec_dim]),   # c_dec
    ]

    outputs, scan_updates = theano.scan(fn=one_iteration,
                                        sequences=[],
                                        outputs_info=outputs_info,
                                        non_sequences=[x],
                                        n_steps=n_iter)
    c, h_enc, c_enc, z_mean, z_log_sigma, z, h_dec, c_dec = outputs

    # KL(q(z|x) || N(0, I)) per iteration, summed over latent dims.
    kl_terms = (prior_log_sigma - z_log_sigma +
                0.5 * (tensor.exp(2 * z_log_sigma) +
                       (z_mean - prior_mu)**2) /
                tensor.exp(2 * prior_log_sigma) - 0.5).sum(axis=-1)

    # Reconstruction from the final canvas.
    x_recons = T.nnet.sigmoid(c[-1, :, :])
    recons_term = BinaryCrossEntropy().apply(x, x_recons)
    recons_term.name = "recons_term"

    # Negative variational lower bound.
    cost = recons_term + kl_terms.sum(axis=0).mean()
    cost.name = "nll_bound"

    #------------------------------------------------------------
    cg = ComputationGraph([cost])
    params = VariableFilter(roles=[PARAMETER])(cg.variables)

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule([
            #StepClipping(3.),
            Adam(learning_rate),
        ])
        #step_rule=RMSProp(learning_rate),
        #step_rule=Momentum(learning_rate=learning_rate, momentum=0.95)
    )
    # The scan's RNG updates must be applied during training too.
    algorithm.add_updates(scan_updates)

    #------------------------------------------------------------------------
    # Setup monitors
    monitors = [cost]
    # Per-iteration KL and reconstruction terms.
    for t in range(n_iter):
        kl_term_t = kl_terms[t, :].mean()
        kl_term_t.name = "kl_term_%d" % t

        x_recons_t = T.nnet.sigmoid(c[t, :, :])
        recons_term_t = BinaryCrossEntropy().apply(x, x_recons_t)
        recons_term_t = recons_term_t.mean()
        recons_term_t.name = "recons_term_%d" % t

        monitors += [kl_term_t, recons_term_t]
    train_monitors = monitors[:]
    train_monitors += [aggregation.mean(algorithm.total_gradient_norm)]
    train_monitors += [aggregation.mean(algorithm.total_step_norm)]

    # Live plotting...
    plot_channels = [
        ["train_nll_bound", "test_nll_bound"],
        ["train_kl_term_%d" % t for t in range(n_iter)],
        ["train_recons_term_%d" % t for t in range(n_iter)],
        ["train_total_gradient_norm", "train_total_step_norm"]]

    #------------------------------------------------------------
    mnist_train = BinarizedMNIST("train", sources=['features'])
    mnist_test = BinarizedMNIST("test", sources=['features'])
    #mnist_train = MNIST("train", binary=True, sources=['features'])
    #mnist_test = MNIST("test", binary=True, sources=['features'])

    main_loop = MainLoop(
        model=None,
        data_stream=ForceFloatX(
            DataStream(mnist_train,
                       iteration_scheme=SequentialScheme(
                           mnist_train.num_examples, batch_size))),
        algorithm=algorithm,
        extensions=[
            Timing(),
            FinishAfter(after_n_epochs=epochs),
            DataStreamMonitoring(
                monitors,
                ForceFloatX(
                    DataStream(mnist_test,
                               iteration_scheme=SequentialScheme(
                                   mnist_test.num_examples, batch_size))),
                updates=scan_updates,
                prefix="test"),
            TrainingDataMonitoring(train_monitors, prefix="train",
                                   after_every_epoch=True),
            SerializeMainLoop(name + ".pkl"),
            Plot(name, channels=plot_channels),
            ProgressBar(),
            Printing()])
    main_loop.run()
def build_theano_functions(self):
    """Build the stacked-LSTM mixture-density network (Graves-style) and
    compile its training and prediction functions.

    NOTE: this is Python 2 code (print statements).

    Returns
    -------
    gradf : theano function
        Training step: (x, y, lr) -> [LL] (plus pis/mus/sig in debug
        mode); applies plain SGD updates to all parameters.
    f : theano function
        Prediction: (x) -> [pis, sig, mus], the GMM parameters.
    """
    x = T.ftensor3('x')  # shape of input : batch X time X value
    # assumes y is (batch, sequence, 1, time) after the broadcast below
    # -- TODO confirm against the caller
    y = T.ftensor4('y')

    layers_input = [x]
    # dims[0] is the input size; the rest are the LSTM layer sizes.
    dims = np.array([self.time_dim])
    for dim in self.lstm_layers_dim:
        dims = np.append(dims, dim)
    print "Dimensions =", dims

    # layer is just an index of the layer
    for layer in range(len(self.lstm_layers_dim)):

        # before the cell, input, forget and output gates, x needs to
        # be transformed
        linear = Linear(dims[layer], dims[layer + 1] * 4,
                        weights_init=Orthogonal(self.orth_scale),
                        #weights_init=IsotropicGaussian(mean=1.,std=1),
                        biases_init=Constant(0),
                        name="linear" + str(layer))
        linear.initialize()
        lstm_input = linear.apply(layers_input[layer])

        # the lstm wants batch X time X value
        lstm = LSTM(dim=dims[layer + 1],
                    weights_init=IsotropicGaussian(mean=0., std=0.5),
                    biases_init=Constant(1),
                    name="lstm" + str(layer))
        lstm.initialize()
        # hack to use Orthogonal on lstm w_state
        lstm.W_state.set_value(
            self.orth_scale * Orthogonal().generate(
                np.random, lstm.W_state.get_value().shape))
        h, _dummy = lstm.apply(lstm_input)

        layers_input.append(h)

    # this is where Alex Graves' paper starts
    # The output layer sees the concatenation of all LSTM layers' states.
    print "Last linear transform dim :", dims[1:].sum()
    output_transform = Linear(dims[1:].sum(), self.output_dim,
                              weights_init=Orthogonal(self.orth_scale),
                              #weights_init=IsotropicGaussian(mean=0., std=1),
                              use_bias=False,
                              name="output_transform")
    output_transform.initialize()
    if len(self.lstm_layers_dim) == 1:
        print "hallo there, only one layer speaking"
        y_hat = output_transform.apply(layers_input[-1])
    else:
        y_hat = output_transform.apply(
            T.concatenate(layers_input[1:], axis=2))

    # transforms to find each gmm params (mu, pi, sig)
    # small hack to softmax a 3D tensor
    #pis = T.reshape(
    #    T.nnet.softmax(
    #        T.nnet.sigmoid(
    #            T.reshape(y_hat[:,:,0:self.gmm_dim], (self.time_dim*self.batch_dim, self.gmm_dim)))),
    #    (self.batch_dim, self.time_dim, self.gmm_dim))
    # Mixture weights: softmax over the first gmm_dim outputs, reshaped
    # to 2D because T.nnet.softmax only handles matrices.
    pis = T.reshape(
        T.nnet.softmax(
            T.reshape(y_hat[:, :, :self.gmm_dim],
                      (self.sequence_dim * self.batch_dim,
                       self.gmm_dim))),
        (self.batch_dim, self.sequence_dim, self.gmm_dim))
    # Std devs: exp keeps them positive; +1e-6 avoids division by zero.
    sig = T.exp(y_hat[:, :, self.gmm_dim:self.gmm_dim * 2]) + 1e-6
    #sig = T.nnet.relu(y_hat[:,:,self.gmm_dim:self.gmm_dim*2])+0.1
    #mus = 2.*T.tanh(y_hat[:,:,self.gmm_dim*2:])
    mus = y_hat[:, :, self.gmm_dim * 2:]

    # Add a trailing axis so the GMM params broadcast against y.
    pis = pis[:, :, :, np.newaxis]
    mus = mus[:, :, :, np.newaxis]
    sig = sig[:, :, :, np.newaxis]
    #y = y[:,:,np.newaxis,:]

    y = T.patternbroadcast(y, (False, False, True, False))
    mus = T.patternbroadcast(mus, (False, False, False, True))
    sig = T.patternbroadcast(sig, (False, False, False, True))

    # sum likelihood with targets
    # see blog for this crazy Pr() = sum log sum prod
    # axes :: (batch, sequence, mixture, time)
    expo_term = -0.5 * ((y - mus)**2) / sig**2
    coeff = T.log(T.maximum(1. / (T.sqrt(2. * np.pi) * sig), EPS))
    #coeff = T.log(1./(T.sqrt(2.*np.pi)*sig))
    sequences = coeff + expo_term
    log_sequences = T.log(pis + EPS) + T.sum(sequences, axis=3,
                                             keepdims=True)

    # Log-sum-exp over the mixture axis for numerical stability.
    log_sequences_max = T.max(log_sequences, axis=2, keepdims=True)

    LL = -(log_sequences_max + T.log(EPS + T.sum(
        T.exp(log_sequences - log_sequences_max), axis=2,
        keepdims=True))).mean()

    model = Model(LL)
    self.model = model
    parameters = model.parameters
    grads = T.grad(LL, parameters)
    updates = []
    # Plain SGD with a runtime-supplied learning rate.
    lr = T.scalar('lr')
    for i in range(len(grads)):
        #updates.append(tuple([parameters[i], parameters[i] - self.lr*grads[i]]))
        updates.append(
            tuple([parameters[i], parameters[i] - lr * grads[i]]))

    #gradf = theano.function([x, y],[LL],updates=updates, mode=NanGuardMode(nan_is_error=True, inf_is_error=True, big_is_error=False))
    if self.debug:
        gradf = theano.function([x, y, lr], [LL, pis, mus, sig],
                                updates=updates)
    else:
        #gradf = theano.function([x, y, z],[zLL],updates=updates)
        gradf = theano.function([x, y, lr], [LL], updates=updates)

    f = theano.function([x], [pis, sig, mus])

    return gradf, f
def test_sequence_generator():
    """Test a sequence generator with no contexts and continuous outputs.

    Such sequence generators can be used to model e.g. dynamical systems.

    Builds a SimpleRecurrent-based SequenceGenerator with a test emitter
    and checks `cost_matrix`, `cost`, the auxiliary per-sequence-element
    cost, and deterministic `generate` against exact numeric values
    fixed by the seeds below.
    """
    # Fixed seeds: the assert_allclose reference values depend on them.
    rng = numpy.random.RandomState(1234)

    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = SimpleRecurrent(activation=Tanh(), dim=dim,
                                 weights_init=Orthogonal())
    generator = SequenceGenerator(
        Readout(readout_dim=output_dim, source_names=["states"],
                emitter=TestEmitter()),
        transition,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0.0),
        seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method: per-(step, example) costs.
    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    y_test = rng.uniform(size=(n_steps, batch_size,
                               output_dim)).astype(floatX)
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = theano.function([y, mask], [costs])(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 115.593, rtol=1e-5)

    # Test 'cost' method: scalar mean cost over the batch.
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_val = theano.function([y, mask], [cost])(y_test, m_test)
    assert_allclose(cost_val, 3.8531, rtol=1e-5)

    # Test 'AUXILIARY' variable 'per_sequence_element' in 'cost' method:
    # locate it by its auto-generated name in the computation graph.
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join([generator.name, generator.cost.name,
                             'per_sequence_element'])
    cost_per_el = [el for el in var_filter(cg.variables)
                   if el.name == aux_var_name][0]
    assert cost_per_el.ndim == 0
    cost_per_el_val = theano.function([y, mask], [cost_per_el])(y_test,
                                                                m_test)
    assert_allclose(cost_per_el_val, 0.38531, rtol=1e-5)

    # Test 'generate' method: roll the generator forward from random
    # initial states.
    states, outputs, costs = [
        variable.eval() for variable in generator.generate(
            states=rng.uniform(size=(batch_size, dim)).astype(floatX),
            iterate=True, batch_size=batch_size, n_steps=n_steps)]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
    assert_allclose(outputs.sum(), -0.33683, rtol=1e-5)
    assert_allclose(states.sum(), 15.7909, rtol=1e-5)
    # There is no generation cost in this case, since generation is
    # deterministic
    assert_allclose(costs.sum(), 0.0)
def main(config, tr_stream, dev_stream):
    """Build an NMT model, load GroundHog weights into it, and train.

    Parameters
    ----------
    config : dict
        Experiment configuration (vocabulary sizes, embedding/hidden
        dimensions, step rule name, clipping threshold, sampling and
        BLEU-validation settings).
    tr_stream : data stream
        Training data stream consumed by the main loop.
    dev_stream : data stream
        Development data stream used by the BLEU validator.

    Side effects: mutates ``config['val_burn_in']``, prints parameter
    shapes, loads a pre-trained GroundHog ``.npz`` checkpoint from a
    hard-coded path, and runs the Blocks main loop (stopped after one
    batch by ``FinishAfter``).
    """
    # Create Theano variables
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2)
    cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask),
                        source_sentence_mask, target_sentence,
                        target_sentence_mask)

    # Initialize model: Gaussian weights and zero biases everywhere, but
    # orthogonal matrices for the recurrent transitions.
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    cg = ComputationGraph(cost)

    # Print a histogram of parameter shapes.  str() is applied before
    # formatting so the width spec works on Python 3 as well (a bare tuple
    # only accepts a non-empty format spec on Python 2).
    shapes = [param.get_value().shape for param in cg.parameters]
    print('Parameter shapes')
    for shape, count in Counter(shapes).most_common():
        print(' {:15}: {}'.format(str(shape), count))

    # Set up training algorithm.
    # NOTE(review): eval() of the step-rule name executes arbitrary code if
    # the config is not fully trusted; a dispatch dict of step rules would
    # be safer.
    algorithm = GradientDescent(
        cost=cost, params=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()]))

    # Set up beam search and sampling computation graphs
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    samples, = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is the next_outputs

    # Set up training model
    training_model = Model(cost)

    # Overwrite the fresh initialization with the weights of a pre-trained
    # GroundHog model, so the Blocks implementation can be sanity-checked
    # against the reference run.
    enc_param_dict = Selector(encoder).get_params()
    dec_param_dict = Selector(decoder).get_params()

    # TODO(review): hard-coded absolute checkpoint path; move into `config`.
    gh_model_name = '/data/lisatmp3/firatorh/nmt/wmt15/trainedModels/blocks/sanity/refGHOG_adadelta_40k_best_bleu_model.npz'
    tmp_file = numpy.load(gh_model_name)
    gh_model = dict(tmp_file)
    tmp_file.close()

    # Dump every parameter's shape next to its path (print-as-function for
    # consistency with the rest of this function).
    for key in enc_param_dict:
        print('{:15}: {}'.format(
            str(enc_param_dict[key].get_value().shape), key))
    for key in dec_param_dict:
        print('{:15}: {}'.format(
            str(dec_param_dict[key].get_value().shape), key))

    # Blocks parameter path -> key in the GroundHog .npz file (encoder).
    enc_mapping = {
        '/bidirectionalencoder/embeddings.W':
            'W_0_enc_approx_embdr',
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_state':
            'W_enc_transition_0',
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_update':
            'G_enc_transition_0',
        '/bidirectionalencoder/bidirectionalwmt15/forward.state_to_reset':
            'R_enc_transition_0',
        '/bidirectionalencoder/fwd_fork/fork_inputs.W':
            'W_0_enc_input_embdr_0',
        '/bidirectionalencoder/fwd_fork/fork_inputs.b':
            'b_0_enc_input_embdr_0',
        '/bidirectionalencoder/fwd_fork/fork_update_inputs.W':
            'W_0_enc_update_embdr_0',
        '/bidirectionalencoder/fwd_fork/fork_reset_inputs.W':
            'W_0_enc_reset_embdr_0',
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_state':
            'W_back_enc_transition_0',
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_update':
            'G_back_enc_transition_0',
        '/bidirectionalencoder/bidirectionalwmt15/backward.state_to_reset':
            'R_back_enc_transition_0',
        '/bidirectionalencoder/back_fork/fork_inputs.W':
            'W_0_back_enc_input_embdr_0',
        '/bidirectionalencoder/back_fork/fork_inputs.b':
            'b_0_back_enc_input_embdr_0',
        '/bidirectionalencoder/back_fork/fork_update_inputs.W':
            'W_0_back_enc_update_embdr_0',
        '/bidirectionalencoder/back_fork/fork_reset_inputs.W':
            'W_0_back_enc_reset_embdr_0',
    }

    # Blocks parameter path -> key in the GroundHog .npz file (decoder).
    dec_mapping = {
        '/decoder/sequencegenerator/readout/lookupfeedbackwmt15/lookuptable.W':
            'W_0_dec_approx_embdr',
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/maxout_bias.b':
            'b_0_dec_hid_readout_0',
        # Missing W1
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax0.W':
            'W1_dec_deep_softmax',
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.W':
            'W2_dec_deep_softmax',
        '/decoder/sequencegenerator/readout/initializablefeedforwardsequence/softmax1.b':
            'b_dec_deep_softmax',
        '/decoder/sequencegenerator/readout/merge/transform_states.W':
            'W_0_dec_hid_readout_0',
        '/decoder/sequencegenerator/readout/merge/transform_feedback.W':
            'W_0_dec_prev_readout_0',
        '/decoder/sequencegenerator/readout/merge/transform_weighted_averages.W':
            'W_0_dec_repr_readout',
        '/decoder/sequencegenerator/readout/merge/transform_weighted_averages.b':
            'b_0_dec_repr_readout',
        '/decoder/sequencegenerator/fork/fork_inputs.b':
            'b_0_dec_input_embdr_0',
        '/decoder/sequencegenerator/fork/fork_inputs.W':
            'W_0_dec_input_embdr_0',
        '/decoder/sequencegenerator/fork/fork_update_inputs.W':
            'W_0_dec_update_embdr_0',
        '/decoder/sequencegenerator/fork/fork_reset_inputs.W':
            'W_0_dec_reset_embdr_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_inputs.W':
            'W_0_dec_dec_inputter_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_inputs.b':
            'b_0_dec_dec_inputter_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_update_inputs.W':
            'W_0_dec_dec_updater_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_update_inputs.b':
            'b_0_dec_dec_updater_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_reset_inputs.W':
            'W_0_dec_dec_reseter_0',
        '/decoder/sequencegenerator/att_trans/distribute/fork_reset_inputs.b':
            'b_0_dec_dec_reseter_0',
        '/decoder/sequencegenerator/att_trans/decoder.state_to_state':
            'W_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/decoder.state_to_update':
            'G_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/decoder.state_to_reset':
            'R_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/attention/state_trans/transform_states.W':
            'B_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/attention/preprocess.W':
            'A_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/attention/energy_comp/linear.W':
            'D_dec_transition_0',
        '/decoder/sequencegenerator/att_trans/decoder/state_initializer/linear_0.W':
            'W_0_dec_initializer_0',
        '/decoder/sequencegenerator/att_trans/decoder/state_initializer/linear_0.b':
            'b_0_dec_initializer_0',
    }

    # Copy the checkpoint values in.  A missing path or key raises
    # KeyError, exactly as the previous per-parameter statements did.
    for path, gh_key in enc_mapping.items():
        enc_param_dict[path].set_value(gh_model[gh_key])
    for path, gh_key in dec_mapping.items():
        dec_param_dict[path].set_value(gh_model[gh_key])

    # Disable the BLEU-validation burn-in for this sanity-check run.
    config['val_burn_in'] = -1

    # Initialize main loop
    main_loop = MainLoop(
        model=training_model,
        algorithm=algorithm,
        data_stream=tr_stream,
        extensions=[
            FinishAfter(after_n_batches=1),
            Sampler(model=search_model, config=config, data_stream=tr_stream,
                    every_n_batches=config['sampling_freq']),
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          src_eos_idx=config['src_eos_idx'],
                          trg_eos_idx=config['trg_eos_idx'],
                          before_training=True, before_batch=True),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True)
        ])

    # Train!
    main_loop.run()