def construct_model(input_dim, output_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[input_dim] + hidden_dims + [2])
    weights = mlp.apply(r)
    final = tensor.dot(x, weights)

    cost = Softmax().categorical_cross_entropy(y, final).mean()
    pred = final.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in [mlp]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply noise (apply_noise returns a new graph, so reassign cg)
    cg = ComputationGraph([cost, error_rate])
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    cg = apply_noise(cg, noise_vars, noise_std)
    [cost, error_rate] = cg.outputs

    return cost, error_rate
def construct_model(input_dim, output_dim):
    # Construct the model
    r = tensor.fmatrix('r')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    nx = x.shape[0]
    nj = x.shape[1]  # also is r.shape[0]
    nr = r.shape[1]

    # r is nj x nr
    # x is nx x nj
    # y is nx x 1

    # r_rep is nx x nj x nr
    r_rep = r[None, :, :].repeat(axis=0, repeats=nx)
    # x3 is nx x nj x 1
    x3 = x[:, :, None]

    # concat is nx x nj x (nr + 1)
    concat = tensor.concatenate([r_rep, x3], axis=2)
    mlp_input = concat.reshape((nx * nj, nr + 1))

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[input_dim + 1] + hidden_dims + [output_dim])
    activations = mlp.apply(mlp_input)
    act_sh = activations.reshape((nx, nj, output_dim))
    final = act_sh.mean(axis=1)

    cost = Softmax().categorical_cross_entropy(y, final).mean()
    pred = final.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in [mlp]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply noise (apply_noise returns a new graph, so reassign cg)
    cg = ComputationGraph([cost, error_rate])
    noise_vars = VariableFilter(roles=[WEIGHT])(cg)
    cg = apply_noise(cg, noise_vars, noise_std)
    [cost_reg, error_rate_reg] = cg.outputs

    return cost_reg, error_rate_reg, cost, error_rate
def test_apply_noise():
    x = tensor.scalar()
    y = tensor.scalar()
    z = x + y

    cg = ComputationGraph([z])
    noised_cg = apply_noise(cg, [y], 1, 1)
    assert_allclose(
        noised_cg.outputs[0].eval({x: 1., y: 1.}),
        2 + MRG_RandomStreams(1).normal(tuple()).eval())
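# A minimal, self-contained sketch of the weight-noise pattern the snippets
# in this file keep repeating: build a cost, wrap it in a ComputationGraph,
# pick out the WEIGHT-tagged variables with VariableFilter, and let
# apply_noise return a new graph whose outputs use noisy copies of those
# weights. Dimensions, variable names and the noise level below are
# illustrative assumptions, not values taken from any of the snippets.
from theano import tensor
from blocks.bricks import MLP, Tanh, Softmax
from blocks.initialization import IsotropicGaussian, Constant
from blocks.graph import ComputationGraph, apply_noise
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT

# Symbolic inputs (names and dimensions are illustrative only)
features = tensor.fmatrix('features')
targets = tensor.ivector('targets')

# A small classifier brick
mlp = MLP(activations=[Tanh(), None], dims=[100, 50, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()

logits = mlp.apply(features)
cost = Softmax().categorical_cross_entropy(targets, logits).mean()
cost.name = 'cost'

# Collect every WEIGHT-tagged variable in the graph and replace it with a
# noisy copy; the returned graph holds the regularized cost.
cg = ComputationGraph([cost])
weights = VariableFilter(roles=[WEIGHT])(cg.variables)
noisy_cg = apply_noise(cg, weights, 0.01)
cost_with_noise = noisy_cg.outputs[0].copy(name='cost_with_noise')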
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None,
         conv_sizes=None, pool_sizes=None, batch_size=200, num_batches=None):
    if feature_maps is None:
        feature_maps = [32, 32, 64, 64, 128, 128]
    if mlp_hiddens is None:
        mlp_hiddens = [1000]
    if conv_sizes is None:
        conv_sizes = [7, 5, 5, 5, 3, 3]
    if pool_sizes is None:
        pool_sizes = [2, 2, 2, 2, 2, 2]

    image_size = (128, 128)
    batch_size = 64
    output_size = 2
    learningRate = 0.01
    drop_prob = 0.4
    weight_noise = 0.75
    num_epochs = 150
    num_batches = None

    # Use ReLUs everywhere and softmax for the final prediction
    conv_activations = [Rectifier() for _ in feature_maps]
    mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()]
    convnet = LeNet(conv_activations, 3, image_size,
                    filter_sizes=zip(conv_sizes, conv_sizes),
                    feature_maps=feature_maps,
                    pooling_sizes=zip(pool_sizes, pool_sizes),
                    top_mlp_activations=mlp_activations,
                    top_mlp_dims=mlp_hiddens + [output_size],
                    border_mode='full',
                    weights_init=IsotropicGaussian(std=0.2, mean=0),
                    biases_init=IsotropicGaussian(std=0.09, mean=0))

    # We push initialization config to set different initialization schemes
    # for convolutional layers.
    convnet.push_initialization_config()
    convnet.layers[0].weights_init = IsotropicGaussian(std=0.2, mean=0)
    convnet.layers[1].weights_init = IsotropicGaussian(std=0.09, mean=0)
    convnet.top_mlp.linear_transformations[0].weights_init = \
        IsotropicGaussian(std=0.8, mean=0)
    convnet.top_mlp.linear_transformations[1].weights_init = \
        IsotropicGaussian(std=0.11, mean=0)
    convnet.initialize()

    logging.info(
        "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_')))
    for i, layer in enumerate(convnet.layers):
        if isinstance(layer, Activation):
            logging.info("Layer {} ({})".format(i, layer.__class__.__name__))
        else:
            logging.info("Layer {} ({}) dim: {} {} {}".format(
                i, layer.__class__.__name__, *layer.get_dim('output')))

    x = tensor.tensor4('image_features')
    y = tensor.lmatrix('targets')

    # Normalize input and apply the convnet
    probs = convnet.apply(x)
    cost = (CategoricalCrossEntropy().apply(y.flatten(), probs)
            .copy(name='cost'))
    error_rate = (MisclassificationRate().apply(y.flatten(), probs)
                  .copy(name='error_rate'))
    error_rate2 = error_rate.copy(name='error_rate2')

    cg = ComputationGraph([cost, error_rate])
    weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables)

    ############# Dropout #############
    logger.info('Applying dropout')
    # Dropout only on the fully-connected layer (last weight matrix)
    cg = apply_dropout(cg, weights[-1:], drop_prob)
    dropped_out = VariableFilter(roles=[DROPOUT])(cg.variables)

    ############# Gaussian Noise #############
    logger.info('Applying Gaussian noise')
    cg = apply_noise(cg, weights, weight_noise)

    ########### Loading images #####################
    from fuel.datasets.dogs_vs_cats import DogsVsCats
    from fuel.streams import DataStream, ServerDataStream
    from fuel.schemes import ShuffledScheme
    from fuel.transformers.image import (RandomFixedSizeCrop,
                                         MinimumImageDimensions,
                                         Random2DRotation)
    from fuel.transformers import Flatten, Cast, ScaleAndShift

    # MaximumImageDimensions, RandomHorizontalSwap and ScikitResize are
    # assumed to be custom transformers defined elsewhere in the project.
    def create_data(data):
        stream = DataStream(data, iteration_scheme=ShuffledScheme(
            data.num_examples, batch_size))
        stream = MinimumImageDimensions(stream, image_size,
                                        which_sources=('image_features',))
        stream = MaximumImageDimensions(stream, image_size,
                                        which_sources=('image_features',))
        stream = RandomHorizontalSwap(stream,
                                      which_sources=('image_features',))
        stream = Random2DRotation(stream, which_sources=('image_features',))
        # stream = ScikitResize(stream, image_size,
        #                       which_sources=('image_features',))
        stream = ScaleAndShift(stream, 1. / 255, 0,
                               which_sources=('image_features',))
        stream = Cast(stream, dtype='float32',
                      which_sources=('image_features',))
        return stream

    # stream_data_train = ServerDataStream(('image_features', 'targets'),
    #                                      False, port=5560)
    # stream_data_valid = ServerDataStream(('image_features', 'targets'),
    #                                      False, port=5561)
    stream_data_train = create_data(
        DogsVsCats(('train',), subset=slice(0, 22500)))
    stream_data_valid = create_data(
        DogsVsCats(('train',), subset=slice(22500, 25000)))
    # stream_data_train = create_data(DogsVsCats(('train',), subset=slice(0, 10)))
    # stream_data_valid = create_data(DogsVsCats(('train',), subset=slice(10, 12)))

    # Train with simple SGD.
    # On the importance of initialization and momentum in deep learning:
    # choose lowest momentum w/ lowest error
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Momentum(
                                    learning_rate=learningRate,
                                    momentum=0.995))
    # algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
    #                             step_rule=Scale(learning_rate=learningRate))
    # algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
    #                             step_rule=Adam(0.001))

    # `Timing` extension reports time for reading data, aggregating a batch
    # and monitoring;
    # `ProgressBar` displays a nice progress bar during training.
    extensions = []
    extensions.append(Timing())
    extensions.append(
        FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches))
    extensions.append(
        DataStreamMonitoring([cost, error_rate], stream_data_valid,
                             prefix="valid"))
    extensions.append(
        TrainingDataMonitoring([
            cost, error_rate,
            aggregation.mean(algorithm.total_gradient_norm)
        ], prefix="train", after_epoch=True))
    extensions.append(
        Checkpoint("Model1_isogaussian_init.pkl", after_epoch=True,
                   after_training=True, save_separately=['log']))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    host_plot = 'http://*****:*****'
    extensions.append(
        Plot('%s %s @ %s' % ('CNN1_isogaussian', datetime.datetime.now(),
                             socket.gethostname()),
             channels=[['train_error_rate', 'valid_error_rate'],
                       ['train_total_gradient_norm']],
             after_epoch=True, server_url=host_plot))

    logger.info("Building the model")
    model = Model(cost)

    main_loop = MainLoop(algorithm, stream_data_train, model=model,
                         extensions=extensions)
    main_loop.run()
def set_up_predictor(self, nmt_model_path):
    """Initializes the predictor with the given NMT model. Code following
    ``blocks.machine_translation.main``.
    """
    self.src_vocab_size = self.config['src_vocab_size']
    self.trgt_vocab_size = self.config['trg_vocab_size']

    # Create Theano variables
    logging.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')

    # Construct model
    logging.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(self.config['src_vocab_size'],
                                   self.config['enc_embed'],
                                   self.config['enc_nhids'])
    decoder = Decoder(self.config['trg_vocab_size'],
                      self.config['dec_embed'],
                      self.config['dec_nhids'],
                      self.config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)

    logging.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # Initialize model (TODO: really necessary?)
    logging.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        self.config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()

    # Apply dropout for regularization (TODO: remove?)
    if self.config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logging.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, self.config['dropout'])

    # Apply weight noise for regularization (TODO: remove?)
    if self.config['weight_noise_ff'] > 0.0:
        logging.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         self.config['weight_noise_ff'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logging.debug("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logging.debug('    {:15}: {}'.format(shape, count))
    logging.info("Total number of parameters: {}".format(len(shapes)))

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logging.debug("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logging.debug('    {:15}: {}'.format(value.get_value().shape, name))
    logging.info("Total number of parameters: {}"
                 .format(len(enc_dec_param_dict)))

    # Set up training model
    logging.info("Building model")

    # Set extensions
    logging.info("Initializing extensions")

    # Set up beam search and sampling computation graphs if necessary
    logging.info("Building sampling model")
    sampling_representation = encoder.apply(
        sampling_input, tensor.ones(sampling_input.shape))
    generated = decoder.generate(sampling_input, sampling_representation)
    search_model = Model(generated)
    _, samples = VariableFilter(
        bricks=[decoder.sequence_generator], name="outputs")(
            ComputationGraph(generated[1]))  # generated[1] is next_outputs

    # Follows blocks.machine_translation.BleuValidator.__init__
    self.source_sentence = sampling_input
    self.samples = samples
    self.model = search_model
    self.normalize = True
    self.verbose = self.config.get('val_set_out', None)

    # Reload model if necessary
    if self.config['reload']:
        loader = LoadNMT(nmt_model_path, self.config['saveto'], search_model)
        loader.load_weights()

    self.best_models = []
    self.val_bleu_curve = []
    self.search_algorithm = MyopticSearch(samples=samples)
    self.search_algorithm.compile()
def main(mode, config, use_bokeh=False):
    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(config['src_vocab_size'],
                                   config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(config['trg_vocab_size'], config['dec_embed'],
                      config['dec_nhids'], config['enc_nhids'] * 2,
                      config['topical_embedding_dim'])
    topical_transformer = topicalq_transformer(
        config['topical_vocab_size'], config['topical_embedding_dim'],
        config['enc_nhids'], config['topical_word_num'],
        config['batch_size'])

    if mode == "train":
        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')
        sampling_input = tensor.lmatrix('input')
        source_topical_word = tensor.lmatrix('source_topical')
        source_topical_mask = tensor.matrix('source_topical_mask')

        # Get training and development set streams
        tr_stream = get_tr_stream_with_topicalq(**config)
        dev_stream = get_dev_stream_with_topicalq(**config)

        topic_embedding = topical_transformer.apply(source_topical_word)

        # Get cost of the model
        representation = encoder.apply(source_sentence, source_sentence_mask)
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :,
                                           (representation.shape[2] / 2):]
        cost = decoder.cost(representation, source_sentence_mask,
                            tw_representation, source_topical_mask,
                            target_sentence, target_sentence_mask,
                            topic_embedding, content_embedding)

        logger.info('Creating computational graph')
        cg = ComputationGraph(cost)

        # Initialize model
        logger.info('Initializing model')
        encoder.weights_init = decoder.weights_init = IsotropicGaussian(
            config['weight_scale'])
        encoder.biases_init = decoder.biases_init = Constant(0)
        encoder.push_initialization_config()
        decoder.push_initialization_config()
        encoder.bidir.prototype.weights_init = Orthogonal()
        decoder.transition.weights_init = Orthogonal()
        encoder.initialize()
        decoder.initialize()

        topical_transformer.weights_init = IsotropicGaussian(
            config['weight_scale'])
        topical_transformer.biases_init = Constant(0)
        topical_transformer.push_allocation_config()  # don't know whether the initialize is for
        topical_transformer.look_up.weights_init = Orthogonal()
        topical_transformer.transformer.weights_init = Orthogonal()
        topical_transformer.initialize()
        word_topical_embedding = cPickle.load(
            open(config['topical_embeddings'], 'rb'))
        np_word_topical_embedding = numpy.array(word_topical_embedding,
                                                dtype='float32')
        topical_transformer.look_up.W.set_value(np_word_topical_embedding)
        topical_transformer.look_up.W.tag.role = []

        # apply dropout for regularization
        if config['dropout'] < 1.0:
            # dropout is applied to the output of maxout in ghog
            logger.info('Applying dropout')
            dropout_inputs = [
                x for x in cg.intermediary_variables
                if x.name == 'maxout_apply_output'
            ]
            cg = apply_dropout(cg, dropout_inputs, config['dropout'])

        # Apply weight noise for regularization
        if config['weight_noise_ff'] > 0.0:
            logger.info('Applying weight noise to ff layers')
            enc_params = Selector(encoder.lookup).get_params().values()
            enc_params += Selector(encoder.fwd_fork).get_params().values()
            enc_params += Selector(encoder.back_fork).get_params().values()
            dec_params = Selector(
                decoder.sequence_generator.readout).get_params().values()
            dec_params += Selector(
                decoder.sequence_generator.fork).get_params().values()
            dec_params += Selector(decoder.state_init).get_params().values()
            cg = apply_noise(cg, enc_params + dec_params,
                             config['weight_noise_ff'])

        # Print shapes
        shapes = [param.get_value().shape for param in cg.parameters]
        logger.info("Parameter shapes: ")
        for shape, count in Counter(shapes).most_common():
            logger.info('    {:15}: {}'.format(shape, count))
        logger.info("Total number of parameters: {}".format(len(shapes)))

        # Print parameter names
        enc_dec_param_dict = merge(
            Selector(encoder).get_parameters(),
            Selector(decoder).get_parameters())
        logger.info("Parameter names: ")
        for name, value in enc_dec_param_dict.items():
            logger.info('    {:15}: {}'.format(value.get_value().shape,
                                               name))
        logger.info("Total number of parameters: {}".format(
            len(enc_dec_param_dict)))

        # Set up training model
        logger.info("Building model")
        training_model = Model(cost)

        # Set extensions
        logger.info("Initializing extensions")
        extensions = [
            FinishAfter(after_n_batches=config['finish_after']),
            TrainingDataMonitoring([cost], after_batch=True),
            Printing(after_batch=True),
            CheckpointNMT(config['saveto'],
                          every_n_batches=config['save_freq'])
        ]
        '''
        # Set up beam search and sampling computation graphs if necessary
        if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
            logger.info("Building sampling model")
            sampling_representation = encoder.apply(
                sampling_input, tensor.ones(sampling_input.shape))
            generated = decoder.generate(
                sampling_input, sampling_representation)
            search_model = Model(generated)
            _, samples = VariableFilter(
                bricks=[decoder.sequence_generator], name="outputs")(
                    ComputationGraph(generated[1]))

        # Add sampling
        if config['hook_samples'] >= 1:
            logger.info("Building sampler")
            extensions.append(
                Sampler(model=search_model, data_stream=tr_stream,
                        hook_samples=config['hook_samples'],
                        every_n_batches=config['sampling_freq'],
                        src_vocab_size=config['src_vocab_size']))

        # Add early stopping based on bleu
        if config['bleu_script'] is not None:
            logger.info("Building bleu validator")
            extensions.append(
                BleuValidator(sampling_input, samples=samples, config=config,
                              model=search_model, data_stream=dev_stream,
                              normalize=config['normalized_bleu'],
                              every_n_batches=config['bleu_val_freq']))
        '''

        # Reload model if necessary
        if config['reload']:
            extensions.append(LoadNMT(config['saveto']))

        # Plot cost in bokeh if necessary
        if use_bokeh and BOKEH_AVAILABLE:
            extensions.append(
                Plot('Cs-En', channels=[['decoder_cost_cost']],
                     after_batch=True))

        # Set up training algorithm
        logger.info("Initializing training algorithm")
        algorithm = GradientDescent(
            cost=cost, parameters=cg.parameters,
            on_unused_sources='warn',
            step_rule=CompositeRule([
                StepClipping(config['step_clipping']),
                eval(config['step_rule'])()
            ]))

        # Initialize main loop
        logger.info("Initializing main loop")
        main_loop = MainLoop(model=training_model, algorithm=algorithm,
                             data_stream=tr_stream, extensions=extensions)

        # Train!
        main_loop.run()

    elif mode == 'translate':
        # Create Theano variables
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_topical_word = tensor.lmatrix('source_topical')

        # Get test set stream
        test_stream = get_dev_stream_with_topicalq(
            config['test_set'], config['src_vocab'],
            config['src_vocab_size'], config['topical_test_set'],
            config['topical_vocab'], config['topical_vocab_size'],
            config['unk_id'])
        ftrans = open(config['test_set'] + '.trans.out', 'w')

        # Helper utilities
        sutils = SamplingBase()
        unk_idx = config['unk_id']
        src_eos_idx = config['src_vocab_size'] - 1
        trg_eos_idx = config['trg_vocab_size'] - 1

        # Get beam search
        logger.info("Building sampling model")
        topic_embedding = topical_transformer.apply(source_topical_word)
        representation = encoder.apply(source_sentence,
                                       tensor.ones(source_sentence.shape))
        tw_representation = topical_transformer.look_up.apply(
            source_topical_word.T)
        content_embedding = representation[0, :,
                                           (representation.shape[2] / 2):]
        generated = decoder.generate(source_sentence, representation,
                                     tw_representation,
                                     topical_embedding=topic_embedding,
                                     content_embedding=content_embedding)

        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        beam_search = BeamSearch(samples=samples)

        logger.info("Loading the model..")
        model = Model(generated)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        # Get target vocabulary
        trg_vocab = _ensure_special_tokens(
            pickle.load(open(config['trg_vocab'], 'rb')),
            bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx)
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

        logger.info("Started translation: ")
        total_cost = 0.0

        for i, line in enumerate(test_stream.get_epoch_iterator()):
            seq = sutils._oov_to_unk(line[0], config['src_vocab_size'],
                                     unk_idx)
            seq2 = line[1]
            input_ = numpy.tile(seq, (config['beam_size'], 1))
            input_topical = numpy.tile(seq2, (config['beam_size'], 1))

            # draw sample, checking to ensure we don't get an empty string
            # back
            trans, costs = \
                beam_search.search(
                    input_values={source_sentence: input_,
                                  source_topical_word: input_topical},
                    max_length=10 * len(seq), eol_symbol=src_eos_idx,
                    ignore_first_eol=True)
            '''
            # normalize costs according to the sequence lengths
            if config['normalized_bleu']:
                lengths = numpy.array([len(s) for s in trans])
                costs = costs / lengths
            '''
            # best = numpy.argsort(costs)[0]
            best = numpy.argsort(costs)[0:config['beam_size']]
            for b in best:
                try:
                    total_cost += costs[b]
                    trans_out = trans[b]
                    # convert idx to words
                    trans_out = sutils._idx_to_word(trans_out, trg_ivocab)
                except ValueError:
                    logger.info(
                        "Can NOT find a translation for line: {}".format(
                            i + 1))
                    trans_out = '<UNK>'
                print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()

    elif mode == 'rerank':
        # Create Theano variables
        ftrans = open(config['val_set'] + '.scores.out', 'w')
        logger.info('Creating theano variables')
        source_sentence = tensor.lmatrix('source')
        source_sentence_mask = tensor.matrix('source_mask')
        target_sentence = tensor.lmatrix('target')
        target_sentence_mask = tensor.matrix('target_mask')

        config['src_data'] = config['val_set']
        config['trg_data'] = config['val_set_grndtruth']
        config['batch_size'] = 1
        config['sort_k_batches'] = 1
        test_stream = get_tr_stream_unsorted(**config)

        logger.info("Building sampling model")
        representations = encoder.apply(source_sentence,
                                        source_sentence_mask)
        costs = decoder.cost(representations, source_sentence_mask,
                             target_sentence, target_sentence_mask)

        logger.info("Loading the model..")
        model = Model(costs)
        loader = LoadNMT(config['saveto'])
        loader.set_model_parameters(model, loader.load_parameters())

        costs_computer = function([
            source_sentence, source_sentence_mask, target_sentence,
            target_sentence_mask
        ], costs)

        iterator = test_stream.get_epoch_iterator()

        scores = []
        for i, (src, src_mask, trg, trg_mask) in enumerate(iterator):
            costs = costs_computer(*[src, src_mask, trg, trg_mask])
            cost = costs.sum()
            print(i, cost)
            scores.append(cost)
            ftrans.write(str(cost) + "\n")
        ftrans.close()
def __init__(self, ref_data, output_dim):
    input_dim = ref_data.shape[1]

    ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                name='ref_data')

    rng = RandomStreams()

    # Build a stack of denoising autoencoders on the reference data
    ae_bricks = []
    ae_input = ref_data_sh
    ae_costs = []
    for i, (idim, odim) in enumerate(zip([input_dim] + ae_dims[:-1],
                                         ae_dims)):
        ae_mlp = MLP(activations=[ae_activations[i]],
                     dims=[idim, odim],
                     name='enc%i' % i)
        enc = ae_mlp.apply(ae_input)
        enc_n = ae_mlp.apply(
            ae_input + rng.normal(size=ae_input.shape, std=ae_f_noise_std))
        ae_mlp_dec = MLP(activations=[ae_activations[i]],
                         dims=[odim, idim],
                         name='dec%i' % i)
        dec = ae_mlp_dec.apply(enc_n)

        cost = tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() + \
            ae_l1_pen * abs(enc).sum(axis=1).mean()
        ae_costs.append(cost)

        ae_input = enc
        ae_bricks = ae_bricks + [ae_mlp, ae_mlp_dec]

    self.ae_costs = ae_costs

    ref_data_enc = ae_input

    # Construct the model
    j = tensor.lvector('j')
    r = ref_data_enc[j, :]
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[ae_dims[-1]] + hidden_dims + [n_inter],
              name='inter_gen')
    mlp2 = MLP(activations=activation_functions_2 + [None],
               dims=[n_inter] + hidden_dims_2 + [output_dim],
               name='end_mlp')

    inter_weights = mlp.apply(r)

    if inter_bias is None:
        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = ibias.apply(tensor.dot(x, inter_weights))
    else:
        inter = tensor.dot(x, inter_weights) - inter_bias
    inter = inter_act_fun.apply(inter)

    final = mlp2.apply(inter)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)

    pred = final.argmax(axis=1)
    # error_rate = tensor.neq(y, pred).mean()
    ber = balanced_error_rate.ber(y, pred)

    # Initialize parameters
    for brick in ae_bricks + [mlp, mlp2]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply regularization
    cg = ComputationGraph([cost, ber])

    if r_dropout != 0:
        # - dropout on input vector r : r_dropout
        cg = apply_dropout(cg, [r], r_dropout)

    if x_dropout != 0:
        cg = apply_dropout(cg, [x], x_dropout)

    s_dropout_vars = []
    if s_dropout != 0:
        # - dropout on intermediate layers of first mlp : s_dropout
        s_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([inter_weights]))) -
            set([inter_weights]))
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)

    if i_dropout != 0:
        # - dropout on input to second mlp : i_dropout
        cg = apply_dropout(cg, [inter], i_dropout)

    if a_dropout != 0:
        # - dropout on hidden layers of second mlp : a_dropout
        a_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([final]))) -
            set([inter_weights]) - set(s_dropout_vars))
        cg = apply_dropout(cg, a_dropout_vars, a_dropout)

    if r_noise_std != 0:
        cg = apply_noise(cg, [r], r_noise_std)

    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)

    [cost_reg, ber_reg] = cg.outputs

    if s_l1pen != 0:
        s_weights = VariableFilter(bricks=mlp.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + s_l1pen * sum(abs(w).sum() for w in s_weights)
    if i_l1pen != 0:
        cost_reg = cost_reg + i_l1pen * abs(inter).sum()
    if a_l1pen != 0:
        a_weights = VariableFilter(bricks=mlp2.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + a_l1pen * sum(abs(w).sum() for w in a_weights)

    self.cost = cost
    self.cost_reg = cost_reg
    self.ber = ber
    self.ber_reg = ber_reg
    self.pred = pred
    self.confidence = confidence
def train(model, configs):
    get_streams = configs['get_streams']
    save_path = configs['save_path']
    num_epochs = configs['num_epochs']
    batch_size = configs['batch_size']
    lrs = configs['lrs']
    until_which_epoch = configs['until_which_epoch']
    grad_clipping = configs['grad_clipping']
    monitorings = model.monitorings

    # Training
    if configs['weight_noise'] > 0:
        cg = ComputationGraph(model.cost)
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        cg = apply_noise(cg, weights, configs['weight_noise'])
        model.cost = cg.outputs[0].copy(name='CE')

    if configs['l2_reg'] > 0:
        cg = ComputationGraph(model.cost)
        weights = VariableFilter(roles=[WEIGHT])(cg.variables)
        new_cost = model.cost + configs['l2_reg'] * sum([
            (weight ** 2).sum() for weight in weights])
        model.cost = new_cost.copy(name='CE')

    blocks_model = Model(model.cost)
    all_params = blocks_model.parameters
    print "Number of found parameters:" + str(len(all_params))
    print all_params

    default_lr = np.float32(configs['lrs'][0])
    lr_var = theano.shared(default_lr, name="learning_rate")

    clipping = StepClipping(threshold=np.cast[floatX](grad_clipping))
    # sgd_momentum = Momentum(
    #     learning_rate=0.0001,
    #     momentum=0.95)
    # step_rule = CompositeRule([clipping, sgd_momentum])
    adam = Adam(learning_rate=lr_var)
    step_rule = CompositeRule([clipping, adam])
    training_algorithm = GradientDescent(
        cost=model.cost, parameters=all_params,
        step_rule=step_rule, on_unused_sources='warn')

    monitored_variables = [
        lr_var,
        aggregation.mean(training_algorithm.total_gradient_norm)] + monitorings

    for param in all_params:
        name = param.tag.annotations[0].name + "." + param.name
        to_monitor = training_algorithm.gradients[param].norm(2)
        to_monitor.name = name + "_grad_norm"
        monitored_variables.append(to_monitor)
        to_monitor = param.norm(2)
        to_monitor.name = name + "_norm"
        monitored_variables.append(to_monitor)

    train_data_stream, valid_data_stream = get_streams(batch_size)

    train_monitoring = TrainingDataMonitoring(
        variables=monitored_variables,
        prefix="train",
        after_epoch=True)

    valid_monitoring = DataStreamMonitoring(
        variables=monitored_variables,
        data_stream=valid_data_stream,
        prefix="valid",
        after_epoch=True)

    main_loop = MainLoop(
        algorithm=training_algorithm,
        data_stream=train_data_stream,
        model=blocks_model,
        extensions=[
            train_monitoring,
            valid_monitoring,
            FinishAfter(after_n_epochs=num_epochs),
            SaveParams('valid_CE', blocks_model, save_path,
                       after_epoch=True),
            SaveLog(after_epoch=True),
            ProgressBar(),
            # ErrorPerVideo(model, after_epoch=True, on_interrupt=True),
            LRDecay(lr_var, lrs, until_which_epoch, after_epoch=True),
            Printing(after_epoch=True)])

    main_loop.run()
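# A hypothetical call to the train() function above. The dictionary keys are
# exactly the ones that function reads; every value, the get_streams helper
# and the model object (which must expose .cost and .monitorings) are
# illustrative placeholders, not settings from the original experiment.
def get_streams(batch_size):
    # Assumed to return a (train_stream, valid_stream) pair of Fuel streams.
    raise NotImplementedError

configs = {
    'get_streams': get_streams,
    'save_path': 'results/experiment1',  # hypothetical output directory
    'num_epochs': 100,
    'batch_size': 64,
    'lrs': [1e-3, 1e-4],                 # learning rates passed to LRDecay
    'until_which_epoch': [50, 100],      # epochs passed to LRDecay
    'grad_clipping': 1.0,
    'weight_noise': 0.01,                # std of the Gaussian weight noise
    'l2_reg': 1e-4,
}
# train(my_model, configs)  # my_model is assumed to provide cost/monitorings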
def __init__(self):
    inp = tensor.lmatrix('bytes')

    # Make state vars
    state_vars = {}
    for i, d in enumerate(hidden_dims):
        state_vars['states%d' % i] = theano.shared(
            numpy.zeros((num_seqs, d)).astype(theano.config.floatX),
            name='states%d' % i)
        state_vars['cells%d' % i] = theano.shared(
            numpy.zeros((num_seqs, d)).astype(theano.config.floatX),
            name='cells%d' % i)

    # Construct brick
    cchlstm = CCHLSTM(io_dim=io_dim,
                      hidden_dims=hidden_dims,
                      cond_cert=cond_cert,
                      activation=activation_function)

    # Random pass
    passdict = {}
    for i, p in enumerate(block_prob):
        passdict['pass%d' % i] = rng.binomial(size=(inp.shape[1],
                                                    inp.shape[0]),
                                              p=1-p)

    # Apply it
    outs = cchlstm.apply(inputs=inp.dimshuffle(1, 0),
                         **dict(state_vars.items() + passdict.items()))
    states = []
    active_prop = []
    for i in range(len(hidden_dims)):
        states.append((state_vars['states%d' % i], outs[3*i+1][-1, :, :]))
        states.append((state_vars['cells%d' % i], outs[3*i+2][-1, :, :]))
        active_prop.append(outs[3*i+3].mean())
        active_prop[-1].name = 'active_prop_%d' % i

    out = outs[0].dimshuffle(1, 0, 2)

    # Do prediction and calculate cost
    pred = out.argmax(axis=2)
    cost = Softmax().categorical_cross_entropy(
        inp[:, 1:].flatten(),
        out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1), io_dim)))
    error_rate = tensor.neq(inp[:, 1:].flatten(),
                            pred[:, :-1].flatten()).mean()

    # Initialize all bricks
    for brick in [cchlstm]:
        brick.weights_init = IsotropicGaussian(0.1)
        brick.biases_init = Constant(0.)
        brick.initialize()

    # Apply noise on weight variables
    cg = ComputationGraph([cost, error_rate])
    if w_noise_std > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, w_noise_std)
    [cost_reg, error_rate_reg] = cg.outputs

    self.sgd_cost = cost_reg

    self.monitor_vars = [[cost, cost_reg],
                         [error_rate, error_rate_reg],
                         active_prop]

    cost.name = 'cost'
    cost_reg.name = 'cost_reg'
    error_rate.name = 'error_rate'
    error_rate_reg.name = 'error_rate_reg'

    self.out = out
    self.pred = pred

    self.states = states
def __init__(self):
    inp = tensor.lmatrix('bytes')

    in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16')
                                .reshape((1, 1, io_dim)),
                          inp[:, :, None])
    in_onehot.name = 'in_onehot'

    dgsrnn = DGSRNN(input_dim=io_dim,
                    state_dim=state_dim,
                    act=activation,
                    transition_h=transition_hidden,
                    tr_h_activations=transition_hidden_activations,
                    name='dgsrnn')

    prev_state = theano.shared(
        numpy.zeros((num_seqs, state_dim)).astype(theano.config.floatX),
        name='state')

    states, resets = dgsrnn.apply(
        inputs=in_onehot.dimshuffle(1, 0, 2),
        drop_updates_mask=rng.binomial(size=(inp.shape[1], inp.shape[0],
                                             state_dim),
                                       p=1-drop_update,
                                       dtype=theano.config.floatX),
        state=prev_state)
    states = states.dimshuffle(1, 0, 2)
    resets = resets.dimshuffle(1, 0, 2)

    self.states = [(prev_state, states[:, -1, :])]

    out_mlp = MLP(dims=[state_dim] + output_hidden + [io_dim],
                  activations=output_hidden_activations + [None],
                  name='output_mlp')
    states_sh = states.reshape((inp.shape[0]*inp.shape[1], state_dim))
    out = out_mlp.apply(states_sh).reshape((inp.shape[0], inp.shape[1],
                                            io_dim))

    # Do prediction and calculate cost
    pred = out.argmax(axis=2)

    cost = Softmax().categorical_cross_entropy(
        inp[:, 1:].flatten(),
        out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1), io_dim)))
    error_rate = tensor.neq(inp[:, 1:].flatten(),
                            pred[:, :-1].flatten()).mean()

    # Initialize all bricks
    for brick in [dgsrnn, out_mlp]:
        brick.weights_init = IsotropicGaussian(0.001)
        brick.biases_init = Constant(0.0)
        brick.initialize()

    # Apply noise and dropout
    cg = ComputationGraph([cost, error_rate, states, resets])
    if weight_noise_std > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, weight_noise_std)
    if output_h_dropout > 0:
        dv = VariableFilter(name='input_',
                            bricks=out_mlp.linear_transformations)(cg)
        print "Output H dropout on", len(dv), "vars"
        cg = apply_dropout(cg, dv, output_h_dropout)
    [cost_reg, error_rate_reg, states, resets] = cg.outputs

    if l1_state > 0:
        cost_reg = cost_reg + l1_state * abs(states).mean()
    if l1_reset > 0:
        cost_reg = cost_reg + l1_reset * abs(resets).mean()

    self.cost = cost
    self.error_rate = error_rate
    self.cost_reg = cost_reg
    self.error_rate_reg = error_rate_reg

    self.out = out
    self.pred = pred
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs,
          test_cost, experiment_path, initialization, init_width,
          weight_noise, z_prob, z_prob_states, z_prob_cells,
          drop_prob_igates, ogates_zoneout, batch_size, stoch_depth,
          share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff,
          penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay,
          **kwargs):

    print '.. PTB experiment'
    print '.. arguments:', ' '.join(sys.argv)
    t0 = time.time()

    ###########################################
    #
    # LOAD DATA
    #
    ###########################################

    def onehot(x, numclasses=None):
        """Convert integer encoding for class-labels (starting with 0 !)
        to one-hot encoding.

        The output is an array whose shape is the shape of the input array
        plus an extra dimension, containing the 'one-hot'-encoded labels.
        """
        if x.shape == ():
            x = x[None]
        if numclasses is None:
            numclasses = x.max() + 1
        result = numpy.zeros(list(x.shape) + [numclasses], dtype="int")
        z = numpy.zeros(x.shape, dtype="int")
        for c in range(numclasses):
            z *= 0
            z[numpy.where(x == c)] = 1
            result[..., c] += z
        return result.astype(theano.config.floatX)

    alphabetsize = 10000
    data = np.load('penntree_char_and_word.npz')
    trainset = data['train_words']
    validset = data['valid_words']
    testset = data['test_words']

    if testing:
        trainset = trainset[:3000]
        validset = validset[:3000]

    if share_mask:
        if not z_prob:
            raise ValueError('z_prob must be provided when using share_mask')
        if z_prob_cells or z_prob_states:
            raise ValueError(
                'z_prob_states and z_prob_cells must not be provided when '
                'using share_mask (use z_prob instead)')
        z_prob_cells = z_prob
        # we don't want to actually use these masks, so this is to debug
        z_prob_states = None
    else:
        if z_prob:
            raise ValueError('z_prob is only used with share_mask')
        z_prob_cells = z_prob_cells or '1'
        z_prob_states = z_prob_states or '1'

    # rng = np.random.RandomState(seed)

    ###########################################
    #
    # MAKE STREAMS
    #
    ###########################################

    def prep_dataset(dataset):
        dataset = dataset[:(len(dataset) - (len(dataset) %
                                            (seq_len * batch_size)))]
        dataset = dataset.reshape(batch_size, -1,
                                  seq_len).transpose((1, 0, 2))

        stream = DataStream(
            IndexableDataset(indexables=OrderedDict([('data', dataset)])),
            iteration_scheme=SequentialExampleScheme(dataset.shape[0]))
        stream = Transpose(stream, [(1, 0)])
        stream = SampleDropsNPWord(
            stream, z_prob_states, z_prob_cells, drop_prob_igates,
            layer_size, num_layers, False, stoch_depth, share_mask,
            gaussian_drop, alphabetsize)
        stream.sources = ('data',) * 3 + stream.sources + (
            'zoneouts_states', 'zoneouts_cells', 'zoneouts_igates')
        return (stream,)

    train_stream, = prep_dataset(trainset)
    valid_stream, = prep_dataset(validset)
    test_stream, = prep_dataset(testset)

    ####################

    data = train_stream.get_epoch_iterator(as_dict=True).next()

    ####################

    ###########################################
    #
    # BUILD MODEL
    #
    ###########################################
    print '.. building model'

    x = T.tensor3('data')
    y = x
    zoneouts_states = T.tensor3('zoneouts_states')
    zoneouts_cells = T.tensor3('zoneouts_cells')
    zoneouts_igates = T.tensor3('zoneouts_igates')

    x.tag.test_value = data['data']
    zoneouts_states.tag.test_value = data['zoneouts_states']
    zoneouts_cells.tag.test_value = data['zoneouts_cells']
    zoneouts_igates.tag.test_value = data['zoneouts_igates']

    if init_width and not initialization == 'uniform':
        raise ValueError('Width is only for uniform init, whassup?')

    if initialization == 'glorot':
        weights_init = NormalizedInitialization()
    elif initialization == 'uniform':
        weights_init = Uniform(width=init_width)
    elif initialization == 'ortho':
        weights_init = OrthogonalInitialization()
    else:
        raise ValueError('No such initialization')

    if rnn_type.lower() == 'lstm':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size * 4,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropLSTM(dim=layer_size, weights_init=weights_init,
                     activation=Tanh(), model_type=6, name='rnn%d' % l,
                     ogates_zoneout=ogates_zoneout)
            for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'gru':
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size * 3,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropGRU(dim=layer_size, weights_init=weights_init,
                    activation=Tanh(), name='rnn%d' % l)
            for l in range(num_layers)
        ]
    elif rnn_type.lower() == 'srnn':  # FIXME!!! make ReLU
        in_to_hids = [
            Linear(layer_size if l > 0 else alphabetsize, layer_size,
                   name='in_to_hid%d' % l,
                   weights_init=weights_init,
                   biases_init=Constant(0.0)) for l in range(num_layers)
        ]
        recurrent_layers = [
            DropSimpleRecurrent(dim=layer_size, weights_init=weights_init,
                                activation=Rectifier(), name='rnn%d' % l)
            for l in range(num_layers)
        ]
    else:
        raise NotImplementedError

    hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out',
                        weights_init=weights_init,
                        biases_init=Constant(0.0))

    for layer in in_to_hids:
        layer.initialize()
    for layer in recurrent_layers:
        layer.initialize()
    hid_to_out.initialize()

    layer_input = x  # in_to_hid.apply(x)

    init_updates = OrderedDict()
    for l, (in_to_hid, layer) in enumerate(zip(in_to_hids,
                                               recurrent_layers)):
        rnn_embedding = in_to_hid.apply(layer_input)
        if rnn_type.lower() == 'lstm':
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            cells_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name, cells_init.name = "states_init", "cells_init"
            states, cells = layer.apply(
                rnn_embedding,
                zoneouts_states[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_cells[:, :, l * layer_size:(l + 1) * layer_size],
                zoneouts_igates[:, :, l * layer_size:(l + 1) * layer_size],
                states_init, cells_init)
            init_updates.update([(states_init, states[-1]),
                                 (cells_init, cells[-1])])
        elif rnn_type.lower() in ['gru', 'srnn']:
            # untested!
            states_init = theano.shared(
                np.zeros((batch_size, layer_size), dtype=floatX))
            states_init.name = "states_init"
            states = layer.apply(rnn_embedding, zoneouts_states,
                                 zoneouts_igates, states_init)
            init_updates.update([(states_init, states[-1])])
        else:
            raise NotImplementedError

        layer_input = states

    y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init],
                                                states[:-1]))
    shape_ = y_hat_pre_softmax.shape
    y_hat = Softmax().apply(y_hat_pre_softmax.reshape((-1, alphabetsize)))

    ####################

    ###########################################
    #
    # SET UP COSTS AND MONITORS
    #
    ###########################################

    cost = CategoricalCrossEntropy().apply(
        y.reshape((-1, alphabetsize)), y_hat).copy('cost')
    bpc = (cost / np.log(2.0)).copy(name='bpr')
    perp = T.exp(cost).copy(name='perp')

    cost_train = cost.copy(name='train_cost')
    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # NORM STABILIZER
    #
    ###########################################
    norm_cost = 0.

    def _magnitude(x, axis=-1):
        return T.sqrt(
            T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny))

    if penalty == 'cells':
        assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables)
        for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables):
            norms = _magnitude(cell)
            norm_cost += T.mean(
                T.sum((norms[1:] - norms[:-1]) ** 2, axis=0) /
                (seq_len - 1))
    elif penalty == 'hids':
        for l in range(num_layers):
            assert 'rnn%d_apply_states' % l in [
                o.name for o in VariableFilter(roles=[OUTPUT])(
                    cg_train.variables)
            ]
        for output in VariableFilter(roles=[OUTPUT])(cg_train.variables):
            for l in range(num_layers):
                if output.name == 'rnn%d_apply_states' % l:
                    norms = _magnitude(output)
                    norm_cost += T.mean(
                        T.sum((norms[1:] - norms[:-1]) ** 2, axis=0) /
                        (seq_len - 1))

    norm_cost.name = 'norm_cost'
    # cost_valid = cost_train
    cost_train += norm_cost_coeff * norm_cost
    # should this be cost_train.outputs[0]? no.
    cost_train = cost_train.copy('cost_train')

    cg_train = ComputationGraph([cost_train])

    ###########################################
    #
    # WEIGHT NOISE
    #
    ###########################################

    if weight_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg_train.variables)
        cg_train = apply_noise(cg_train, weights, weight_noise)
        cost_train = cg_train.outputs[0].copy(name='cost_train')

    model = Model(cost_train)

    learning_rate = float(learning_rate)
    clipping = StepClipping(threshold=np.cast[floatX](clipping))
    if algorithm == 'adam':
        adam = Adam(learning_rate=learning_rate)
        learning_rate = adam.learning_rate
        step_rule = CompositeRule([adam, clipping])
    elif algorithm == 'rms_prop':
        rms_prop = RMSProp(learning_rate=learning_rate)
        learning_rate = rms_prop.learning_rate
        step_rule = CompositeRule([clipping, rms_prop])
    elif algorithm == 'momentum':
        sgd_momentum = Momentum(learning_rate=learning_rate,
                                momentum=momentum)
        learning_rate = sgd_momentum.learning_rate
        step_rule = CompositeRule([clipping, sgd_momentum])
    elif algorithm == 'sgd':
        sgd = Scale(learning_rate=learning_rate)
        learning_rate = sgd.learning_rate
        step_rule = CompositeRule([clipping, sgd])
    else:
        raise NotImplementedError

    algorithm = GradientDescent(step_rule=step_rule,
                                cost=cost_train,
                                parameters=cg_train.parameters)
    # theano_func_kwargs={"mode": theano.compile.MonitorMode(
    #     post_func=detect_nan)})

    algorithm.add_updates(init_updates)

    def cond_number(x):
        _, _, sing_vals = T.nlinalg.svd(x, True, True)
        sing_mags = abs(sing_vals)
        return T.max(sing_mags) / T.min(sing_mags)

    def rms(x):
        return (x * x).mean().sqrt()

    whysplode_cond = []
    whysplode_rms = []
    for i, p in enumerate(init_updates):
        v = p.get_value()
        if len(p.get_value().shape) == 2:  # only weight matrices
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(
                rms(p).copy('ini%d:%s_rms(%s)' %
                            (i, p.name,
                             "x".join(map(str, p.get_value().shape)))))
    for i, p in enumerate(cg_train.parameters):
        v = p.get_value()
        if len(p.get_value().shape) == 2:  # only weight matrices
            whysplode_cond.append(
                cond_number(p).copy(
                    'ini%d:%s_cond(%s)' %
                    (i, p.name, "x".join(map(str, p.get_value().shape)))))
            whysplode_rms.append(
                rms(p).copy('ini%d:%s_rms(%s)' %
                            (i, p.name,
                             "x".join(map(str, p.get_value().shape)))))

    observed_vars = [
        cost_train, cost, bpc, perp, learning_rate,
        aggregation.mean(
            algorithm.total_gradient_norm).copy("gradient_norm_mean")
    ]  # + whysplode_rms

    parameters = model.get_parameter_dict()
    for name, param in parameters.iteritems():
        observed_vars.append(param.norm(2).copy(name=name + "_norm"))
        observed_vars.append(
            algorithm.gradients[param].norm(2).copy(name=name +
                                                    "_grad_norm"))

    train_monitor = TrainingDataMonitoring(variables=observed_vars,
                                           prefix="train",
                                           after_epoch=True)

    dev_inits = [p.clone() for p in init_updates]
    cg_dev = ComputationGraph([cost, bpc, perp] +
                              init_updates.values()).replace(
                                  zip(init_updates.keys(), dev_inits))
    dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3]
    dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:]))

    dev_monitor = DataStreamMonitoring(
        variables=[dev_cost, dev_bpc, dev_perp],
        data_stream=valid_stream, prefix="dev",
        updates=dev_init_updates)

    # noone does this
    if 'load_path' in kwargs:
        with open(kwargs['load_path']) as f:
            loaded = np.load(f)
            model = Model(cost_train)
            params_dicts = model.get_parameter_dict()
            params_names = params_dicts.keys()
            for param_name in params_names:
                param = params_dicts[param_name]
                # '/f_6_.W' --> 'f_6_.W'
                slash_index = param_name.find('/')
                param_name = param_name[slash_index + 1:]
                if param.get_value().shape == loaded[param_name].shape:
                    print 'Found: ' + param_name
                    param.set_value(loaded[param_name])
                else:
                    print 'Not found: ' + param_name

    extensions = []
    extensions.extend(
        [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor])

    if test_cost:
        test_inits = [p.clone() for p in init_updates]
        cg_test = ComputationGraph([cost, bpc, perp] +
                                   init_updates.values()).replace(
                                       zip(init_updates.keys(), test_inits))
        test_cost, test_bpc, test_perp = cg_test.outputs[:3]
        test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:]))

        test_monitor = DataStreamMonitoring(
            variables=[test_cost, test_bpc, test_perp],
            data_stream=test_stream, prefix="test",
            updates=test_init_updates)
        extensions.extend([test_monitor])

    if not os.path.exists(experiment_path):
        os.makedirs(experiment_path)
    log_path = os.path.join(experiment_path, 'log.txt')
    fh = logging.FileHandler(filename=log_path)
    fh.setLevel(logging.DEBUG)
    logger.addHandler(fh)

    extensions.append(
        SaveParams('dev_cost', model, experiment_path, every_n_epochs=1))
    extensions.append(SaveLog(every_n_epochs=1))
    extensions.append(ProgressBar())
    extensions.append(Printing())

    class RollsExtension(TrainingExtension):
        """Rolls the cell and state activations between epochs so that the
        first batch gets correct initial activations."""
        def __init__(self, shvars):
            self.shvars = shvars

        def before_epoch(self):
            for v in self.shvars:
                v.set_value(np.roll(v.get_value(), 1, 0))

    extensions.append(
        RollsExtension(init_updates.keys() + dev_init_updates.keys() +
                       (test_init_updates.keys() if test_cost else [])))

    class LearningRateSchedule(TrainingExtension):
        """Lets you set a number to divide the learning rate by each epoch,
        plus when to start doing that."""
        def __init__(self):
            self.epoch_number = 0

        def after_epoch(self):
            self.epoch_number += 1
            if self.epoch_number > decrease_lr_after_epoch:
                learning_rate.set_value(learning_rate.get_value() / lr_decay)

    if bool(lr_decay) != bool(decrease_lr_after_epoch):
        raise ValueError(
            'Need to define both lr_decay and decrease_lr_after_epoch')
    if lr_decay and decrease_lr_after_epoch:
        extensions.append(LearningRateSchedule())

    main_loop = MainLoop(model=model, data_stream=train_stream,
                         algorithm=algorithm, extensions=extensions)
    t1 = time.time()
    print "Building time: %f" % (t1 - t0)

    main_loop.run()
    print "Execution time: %f" % (time.time() - t1)
def main(config, tr_stream, dev_stream, use_bokeh=False):
    print("~def main")

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    sampling_input = tensor.lmatrix('input')
    print("~sampling_input = tensor.lmatrix")

    # Construct model
    logger.info('Building RNN encoder-decoder')
    encoder = BidirectionalEncoder(
        config['src_vocab_size'], config['enc_embed'], config['enc_nhids'])
    decoder = Decoder(
        config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'],
        config['enc_nhids'] * 2)
    cost = decoder.cost(
        encoder.apply(source_sentence, source_sentence_mask),
        source_sentence_mask, target_sentence, target_sentence_mask)
    print("~source_sentence_mask, target_sentence, target_sentence_mask")

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)
    print("~ComputationGraph")

    # Initialize model
    logger.info('Initializing model')
    encoder.weights_init = decoder.weights_init = IsotropicGaussian(
        config['weight_scale'])
    encoder.biases_init = decoder.biases_init = Constant(0)
    encoder.push_initialization_config()
    decoder.push_initialization_config()
    encoder.bidir.prototype.weights_init = Orthogonal()
    decoder.transition.weights_init = Orthogonal()
    encoder.initialize()
    decoder.initialize()
    print("~decoder.initialize()")

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])
        print("~cg = apply_dropout")

    # Apply weight noise for regularization
    if config['weight_noise_ff'] > 0.0:
        logger.info('Applying weight noise to ff layers')
        enc_params = Selector(encoder.lookup).get_params().values()
        enc_params += Selector(encoder.fwd_fork).get_params().values()
        enc_params += Selector(encoder.back_fork).get_params().values()
        dec_params = Selector(
            decoder.sequence_generator.readout).get_params().values()
        dec_params += Selector(
            decoder.sequence_generator.fork).get_params().values()
        dec_params += Selector(decoder.state_init).get_params().values()
        cg = apply_noise(cg, enc_params + dec_params,
                         config['weight_noise_ff'])
        print("~cg = apply_noise")

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))
    print("~logger.info")

    # Print parameter names
    enc_dec_param_dict = merge(Selector(encoder).get_parameters(),
                               Selector(decoder).get_parameters())
    logger.info("Parameter names: ")
    for name, value in enc_dec_param_dict.items():
        logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    logger.info("Total number of parameters: {}"
                .format(len(enc_dec_param_dict)))

    # Set up training model
    logger.info("Building model")
    training_model = Model(cost)
    print("~training_model")

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        TrainingDataMonitoring([cost], after_batch=True),
        Printing(after_batch=True),
        CheckpointNMT(config['saveto'],
                      every_n_batches=config['save_freq'])
    ]
    print("~every_n_batches=config")

    # Set up beam search and sampling computation graphs if necessary
    if config['hook_samples'] >= 1 or config['bleu_script'] is not None:
        logger.info("Building sampling model")
        sampling_representation = encoder.apply(
            sampling_input, tensor.ones(sampling_input.shape))
        generated = decoder.generate(sampling_input,
                                     sampling_representation)
        search_model = Model(generated)
        _, samples = VariableFilter(
            bricks=[decoder.sequence_generator], name="outputs")(
                ComputationGraph(generated[1]))  # generated[1] is next_outputs
        sample = Sampler(model=search_model, data_stream=tr_stream,
                         hook_samples=config['hook_samples'],
                         every_n_batches=config['sampling_freq'],
                         src_vocab_size=config['src_vocab_size'])

    # Add sampling
    if config['hook_samples'] >= 1:
        logger.info("Building sampler")
        extensions.append(sample)

    # Add early stopping based on bleu
    if config['bleu_script'] is not None:
        logger.info("Building bleu validator")
        extensions.append(
            BleuValidator(sampling_input, samples=samples, config=config,
                          model=search_model, data_stream=dev_stream,
                          normalize=config['normalized_bleu'],
                          every_n_batches=config['bleu_val_freq']))

    # Reload model if necessary
    if config['reload']:
        extensions.append(LoadNMT(config['saveto']))

    # Plot cost in bokeh if necessary
    if use_bokeh and BOKEH_AVAILABLE:
        extensions.append(
            Plot('Cs-En', channels=[['decoder_cost_cost']],
                 after_batch=True))

    sampling_fn = search_model.get_theano_function()

    print(" - - - - - - - - - - - - - - ")

    sort_k_batches = 12
    batch_size = 80
    seq_len = 50
    trg_ivocab = None
    src_vocab_size = config['src_vocab_size']
    trg_vocab_size = config['trg_vocab_size']
    unk_id = config['unk_id']

    src_vocab = config['src_vocab']
    trg_vocab = config['trg_vocab']
    src_vocab = ensure_special_tokens(
        src_vocab if isinstance(src_vocab, dict)
        else cPickle.load(open(src_vocab)),
        bos_idx=0, eos_idx=src_vocab_size - 1, unk_idx=unk_id)
    trg_vocab = ensure_special_tokens(
        trg_vocab if isinstance(trg_vocab, dict)
        else cPickle.load(open(trg_vocab)),
        bos_idx=0, eos_idx=trg_vocab_size - 1, unk_idx=unk_id)
    if not trg_ivocab:
        trg_ivocab = {v: k for k, v in trg_vocab.items()}

    src_data = config['src_data']
    trg_data = config['trg_data']

    src_dataset = TextFile([src_data], src_vocab, None)
    trg_dataset = TextFile([trg_data], trg_vocab, None)

    inputstringfile = "inputstringfile.cs"
    input_dataset = TextFile([inputstringfile], src_vocab, None)

    stream = Merge([input_dataset.get_example_stream(),
                    trg_dataset.get_example_stream()],
                   ('source', 'target'))

    stream2 = Filter(stream, predicate=_too_long(seq_len=seq_len))
    stream3 = Mapping(stream2,
                      _oov_to_unk(src_vocab_size=src_vocab_size,
                                  trg_vocab_size=trg_vocab_size,
                                  unk_id=unk_id))
    stream4 = Batch(stream3,
                    iteration_scheme=ConstantScheme(
                        batch_size * sort_k_batches))
    stream5 = Mapping(stream4, SortMapping(_length))
    stream6 = Unpack(stream5)
    stream7 = Batch(stream6,
                    iteration_scheme=ConstantScheme(batch_size))

    input_stream = DataStream(input_dataset)
    print("dev_stream : ", type(dev_stream))
    print("input_stream : ", type(input_stream))

    epochone = input_stream.get_epoch_iterator()
    vocab = input_stream.dataset.dictionary
    unk_sym = input_stream.dataset.unk_token
    eos_sym = input_stream.dataset.eos_token

    for i, line in enumerate(epochone):
        seq = oov_to_unk(line[0], config['src_vocab_size'], unk_id)
        input_ = numpy.tile(seq, (1, 1))
        print("seq : ", type(seq), seq)
        print("input_ : ", type(input_), input_, inspect.getmembers(input_))

        _1, outputs, _2, _3, costs = (sampling_fn(input_))
        outputs = outputs.flatten()
        costs = costs.T

        print(" outputs : ", outputs, type(outputs))
        print("idx_to_word: ", idx_to_word(outputs, trg_ivocab))

    print(" - - - - - - - - - - - - - - ")
def __init__(self):
    inp = tensor.tensor3('input')[:, :, 0]
    target = tensor.matrix('target')
    target = target.reshape((target.shape[0],))
    product = tensor.lvector('product')
    missing = tensor.eq(inp, 0)
    train_input_mean = 1470614.1
    train_input_std = 3256577.0

    # Fill missing values with the mean of their two temporal neighbours
    trans_1 = tensor.concatenate((inp[:, 1:],
                                  tensor.zeros((inp.shape[0], 1))), axis=1)
    trans_2 = tensor.concatenate((tensor.zeros((inp.shape[0], 1)),
                                  inp[:, :-1]), axis=1)
    inp = tensor.switch(missing, (trans_1 + trans_2) / 2, inp)

    lookup = LookupTable(length=352, dim=hidden_dim)
    product_embed = lookup.apply(product)

    salut = tensor.concatenate((inp, missing), axis=1)
    linear = Linear(input_dim=108, output_dim=hidden_dim, name="MLP_in")
    inter = linear.apply(salut)
    inter = inter + product_embed

    mlp1 = MLP(activations=[Rectifier(), Rectifier(), Rectifier()],
               dims=[hidden_dim, hidden_dim, hidden_dim, hidden_dim],
               name="premier")
    inter1 = mlp1.apply(inter)

    linear2 = Linear(input_dim=hidden_dim, output_dim=out_dim,
                     name="ouput_linear")
    pred = linear2.apply(inter1) * train_input_std + train_input_mean
    pred = pred.reshape((product.shape[0],))

    cost = tensor.mean(abs((pred - target) / target))

    # Initialize all bricks
    for brick in [linear, linear2, mlp1, lookup]:
        brick.weights_init = IsotropicGaussian(0.1)
        brick.biases_init = Constant(0.)
        brick.initialize()

    # Apply noise and dropout
    cg = ComputationGraph([cost])
    if w_noise_std > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, w_noise_std)
    if i_dropout > 0:
        cg = apply_dropout(cg, [inter1], i_dropout)
    [cost_reg] = cg.outputs

    # Make cost_reg a distinct variable even when no regularization was
    # applied, so the identity check below tells the two costs apart
    cost_reg += 1e-20

    if cost_reg is not cost:
        self.cost = cost
        self.cost_reg = cost_reg

        cost_reg.name = 'cost_reg'
        cost.name = 'cost'

        self.sgd_cost = cost_reg

        self.monitor_vars = [[cost, cost_reg]]
    else:
        self.cost = cost
        cost.name = 'cost'

        self.sgd_cost = cost

        self.monitor_vars = [[cost]]

    self.pred = pred
    pred.name = 'pred'
cost_monitor = ctc.cost(y, y_hat_softmax, y_m, x_m).mean() cost_monitor.name = "cost_monitor" elif conf.task=='framewise': cost_train = categorical_crossentropy_batch().apply(y_hat_softmax, y, x_m) cost_train.name='cost' cost_monitor = cost_train else: raise ValueError(conf.task) recognizer.initialize() cg = ComputationGraph([cost_train, y_hat, x_m, y, y_m]) weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg = apply_noise(cg, weights, conf.weight_noise) #************* training algorithm ************* model = Model(cost_train) if conf.step_rule=='AdaDelta': step_rule = AdaDelta() elif conf.step_rule=='Momentum': step_rule = Momentum(learning_rate=conf.learning_rate, momentum=conf.momentum) else: raise ValueError('step_rule not known: {}'.format(conf.step_rule)) step_rule = CompositeRule([step_rule, StepClipping(conf.step_clipping)]) algorithm = GradientDescent(cost=cost_train, parameters=cg.parameters, step_rule = step_rule)
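The snippet above picks a step rule from a config value and always wraps it in StepClipping via CompositeRule. A minimal sketch of that pattern follows; conf is a stand-in object, and note that GradientDescent's keyword argument is parameters in recent Blocks releases and params in older ones (both spellings appear in these snippets).
from blocks.algorithms import (GradientDescent, CompositeRule,
                               StepClipping, AdaDelta, Momentum)

def make_algorithm(cost, parameters, conf):
    if conf.step_rule == 'AdaDelta':
        step_rule = AdaDelta()
    elif conf.step_rule == 'Momentum':
        step_rule = Momentum(learning_rate=conf.learning_rate,
                             momentum=conf.momentum)
    else:
        raise ValueError('step_rule not known: {}'.format(conf.step_rule))
    # always clip the step on top of whatever rule was chosen
    step_rule = CompositeRule([step_rule, StepClipping(conf.step_clipping)])
    return GradientDescent(cost=cost, parameters=parameters,
                           step_rule=step_rule)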
edit_distance = edit_distances.mean() edit_distance.name = "edit_distance" errors_per_char = (edit_distances / y_len).mean() errors_per_char.name = "errors_per_char" is_error = tensor.neq(dl, y) * tensor.lt(tensor.arange(L)[:, None], y_len[None, :]) is_error = tensor.switch(is_error.sum(axis=0), tensor.ones((B,)), tensor.neq(y_len, dl_length)) error_rate = is_error.mean() error_rate.name = "error_rate" # REGULARIZATION cg = ComputationGraph([cost, error_rate]) if weight_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, weight_noise) for vfilter, p in dropout_locs: cg = apply_dropout(cg, vfilter(cg), p) [cost_reg, error_rate_reg] = cg.outputs ctc_reg = cost_reg + 1e-24 ctc_reg.name = "CTC" if l2_output_bias > 0: cost_reg += l2_output_bias * sum(x.norm(2) for x in VariableFilter(roles=[BIAS], bricks=[rec_to_o])(cg)) if l2_output_weight > 0: cost_reg += l2_output_weight * sum(x.norm(2) for x in VariableFilter(roles=[WEIGHT], bricks=[rec_to_o])(cg)) if l2_all_bias > 0: cost_reg += l2_all_bias * sum(x.norm(2) for x in VariableFilter(roles=[BIAS])(cg)) if l2_all_weight > 0: cost_reg += l2_all_weight * sum(x.norm(2) for x in VariableFilter(roles=[WEIGHT])(cg))
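The L2 terms above are added only to selected parameters by filtering on roles (WEIGHT, BIAS) and, for the output layer, on a specific brick. A small illustrative helper in the same spirit; output_brick and the coefficients are placeholders, not values from the snippet.
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT, BIAS

def add_l2_penalties(cost_reg, cg, output_brick=None,
                     l2_output_weight=0.0, l2_all_weight=0.0,
                     l2_all_bias=0.0):
    # penalty restricted to the weights of one brick (e.g. the output layer)
    if output_brick is not None and l2_output_weight > 0:
        cost_reg = cost_reg + l2_output_weight * sum(
            w.norm(2) for w in VariableFilter(
                roles=[WEIGHT], bricks=[output_brick])(cg.variables))
    # penalties over all weights / all biases in the graph
    if l2_all_weight > 0:
        cost_reg = cost_reg + l2_all_weight * sum(
            w.norm(2) for w in VariableFilter(roles=[WEIGHT])(cg.variables))
    if l2_all_bias > 0:
        cost_reg = cost_reg + l2_all_bias * sum(
            b.norm(2) for b in VariableFilter(roles=[BIAS])(cg.variables))
    return cost_reg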
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph( batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter( applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter( applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append( rename(gain_matrix.min(), 'min_gain')) primary_observables.append( rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")( cost_cg)[-1] attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = (train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model(regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise') ) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1]] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. 
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append( BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
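A compact sketch of the composite step rule assembled above: global step clipping, a core update rule, an optional max-norm constraint restricted to weight matrices via Restrict/VariableClipping, and RemoveNotFinite so that parameters are left unchanged when a step is not finite. The thresholds and learning-rate values are illustrative only.
from blocks.algorithms import (GradientDescent, CompositeRule, StepClipping,
                               Momentum, Restrict, VariableClipping,
                               RemoveNotFinite)
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT

def make_training_algorithm(train_cost, cg, max_norm=1.0):
    weights = VariableFilter(roles=[WEIGHT])(cg.parameters)
    rules = [StepClipping(100.0),    # global gradient clipping
             Momentum(0.01, 0.9)]    # core update rule
    if max_norm > 0:
        # constrain the column norms of weight matrices only
        rules.append(Restrict(VariableClipping(max_norm, axis=0), weights))
    rules.append(RemoveNotFinite(0.0))  # do nothing when the step is NaN/Inf
    return GradientDescent(cost=train_cost, parameters=cg.parameters,
                           step_rule=CompositeRule(rules))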
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( primary_observables, after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')) .add_condition(["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [average_monitoring.record_name(train_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server),] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar()] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( LogInputsGains( labels, cg, recognizer.generator.readout.emitter, data)) if 
train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name] extensions.append(Patience(**patience_conf)) extensions.append(Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
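The extension list built above relies on the "track the best record, checkpoint on improvement" idiom. A minimal sketch of just that idiom, with placeholder record names and paths:
from blocks.extensions.training import TrackTheBest
from blocks.extensions.saveload import Checkpoint
from blocks.extensions.predicates import OnLogRecord

def best_model_extensions(record_name='valid_cost', save_path='model.zip',
                          best_path='model_best.zip'):
    # track the best value of the monitored record after every epoch
    tracker = TrackTheBest(record_name).set_conditions(
        before_first_epoch=True, after_epoch=True)
    # save a regular checkpoint, plus a copy whenever the record improves
    checkpoint = Checkpoint(save_path, after_epoch=True).add_condition(
        ['after_epoch'], OnLogRecord(tracker.notification_name),
        (best_path,))
    return [tracker, checkpoint]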
def train_model(cost, cross_entropy, updates, train_stream, valid_stream, args, gate_values=None): step_rule = learning_algorithm(args) cg = ComputationGraph(cost) # ADD REGULARIZATION # WEIGHT NOISE weight_noise = args.weight_noise if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg_train = apply_noise(cg, weights, weight_noise) cost = cg_train.outputs[0] cost.name = "cost_with_weight_noise" cg = ComputationGraph(cost) logger.info(cg.parameters) algorithm = GradientDescent(cost=cost, step_rule=step_rule, params=cg.parameters) algorithm.add_updates(updates) # extensions to be added extensions = [] if args.load_path is not None: extensions.append(Load(args.load_path)) outputs = [ variable for variable in cg.variables if variable.name == "presoft"] if args.generate: extensions.append(TextGenerationExtension( outputs=outputs, generation_length=args.generated_text_lenght, initial_text_length=args.initial_text_length, every_n_batches=args.monitoring_freq, ploting_path=os.path.join(args.save_path, 'prob_plot.png'), softmax_sampling=args.softmax_sampling, dataset=args.dataset, updates=updates, interactive_mode=args.interactive_mode)) extensions.extend([ TrainingDataMonitoring([cost], prefix='train', every_n_batches=args.monitoring_freq, after_epoch=True), DataStreamMonitoring([cost, cross_entropy], valid_stream, args.mini_batch_size_valid, state_updates=updates, prefix='valid', before_first_epoch=not(args.visualize_gates), every_n_batches=args.monitoring_freq), ResetStates([v for v, _ in updates], every_n_batches=100), ProgressBar()]) # Creating directory for saving model. if not args.interactive_mode: if not os.path.exists(args.save_path): os.makedirs(args.save_path) else: raise Exception('Directory already exists') early_stopping = EarlyStopping('valid_cross_entropy', args.patience, args.save_path, every_n_batches=args.monitoring_freq) # Visualizing extensions if args.interactive_mode: extensions.append(InteractiveMode()) if args.visualize_gates and (gate_values is not None): if args.rnn_type == "lstm": extensions.append(VisualizeGateLSTM(gate_values, updates, args.dataset, ploting_path=None)) elif args.rnn_type == "soft": extensions.append(VisualizeGateSoft(gate_values, updates, args.dataset, ploting_path=None)) else: assert(False) extensions.append(early_stopping) extensions.append(Printing(every_n_batches=args.monitoring_freq)) main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=extensions ) main_loop.run()
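A stripped-down sketch of the monitoring and main-loop wiring used in train_model above: the (possibly noise-regularized) cost is monitored on both the training and validation streams and everything is handed to a MainLoop. The streams, step rule and monitoring frequency are placeholders.
from blocks.algorithms import GradientDescent, Scale
from blocks.extensions import Printing, ProgressBar
from blocks.extensions.monitoring import (TrainingDataMonitoring,
                                           DataStreamMonitoring)
from blocks.main_loop import MainLoop
from blocks.model import Model

def run_training(cost, cg, train_stream, valid_stream, monitoring_freq=100):
    algorithm = GradientDescent(cost=cost, parameters=cg.parameters,
                                step_rule=Scale(0.01))
    extensions = [
        TrainingDataMonitoring([cost], prefix='train',
                               every_n_batches=monitoring_freq),
        DataStreamMonitoring([cost], valid_stream, prefix='valid',
                             every_n_batches=monitoring_freq),
        ProgressBar(),
        Printing(every_n_batches=monitoring_freq),
    ]
    MainLoop(model=Model(cost), data_stream=train_stream,
             algorithm=algorithm, extensions=extensions).run()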
def main(config, tr_stream, dev_stream, use_bokeh=False, the_task=None, the_track=None): config['the_task'] = the_task # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( # end_embed is dimension of word embedding matrix in encoder; enc_nhids number of hidden units in encoder GRU config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2, config['use_attention'], cost_type=config['error_fct']) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) testVar = decoder.getTestVar( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') my_rng = numpy.random.RandomState(config['rng_value']) if config['identity_init']: encoder.weights_init = decoder.weights_init = Identity() else: encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.rng = decoder.rng = my_rng encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() encoder.bidir.prototype.rng = my_rng decoder.transition.weights_init = Orthogonal() decoder.transition.rng = my_rng encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff'], seed=my_rng) cost = cg.outputs[0] # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions 
logger.info("Initializing extensions") # this is ugly code and done, because I am not sure if the order of the extensions is important if 'track2' in config['saveto']: # less epochs for track 2, because of more data if config['early_stopping']: extensions = [ FinishAfter(after_n_epochs=config['finish_after']/2), #FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] else: extensions = [ FinishAfter(after_n_epochs=config['finish_after']/2), #FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] else: if config['early_stopping']: extensions = [ FinishAfter(after_n_epochs=config['finish_after']), #FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] else: extensions = [ FinishAfter(after_n_epochs=config['finish_after']), #FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], #every_n_batches=1, every_n_batches=config['sampling_freq'], src_vocab_size=8)) #src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['val_set'] is not None: logger.info("Building accuracy validator") extensions.append( AccuracyValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, after_training=True, #after_epoch=True)) every_n_epochs=5)) else: logger.info("No validation set given for this language") # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
def set_up(self, config=None, make_prunable=False): """Loads and initializes all the theano variables for the training model and the decoding model. Args: config (dict): NMT configuration """ if config: self.config = config else: config = self.config # Create Theano variables logging.debug('Creating theano variables') source_sentence_mask = tensor.matrix('source_mask') target_sentence_mask = tensor.matrix('target_mask') # Construct model (fs439: Add NoLookup options) if config['dec_layers'] != 1: logging.fatal("Only dec_layers=1 supported.") logging.debug('Building RNN encoder-decoder') if config['src_sparse_feat_map']: if config['enc_layers'] != 1: logging.fatal("Only enc_layers=1 supported for sparse " "source features.") source_sentence = tensor.tensor3('source') self.sampling_input = tensor.tensor3('input') encoder = NoLookupEncoder(config['enc_embed'], config['enc_nhids']) else: source_sentence = tensor.lmatrix('source') self.sampling_input = tensor.lmatrix('input') if config['enc_layers'] > 1 and not config['enc_share_weights']: encoder = DeepBidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_layers'], config['enc_skip_connections'], config['enc_nhids']) else: encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_layers'], config['enc_skip_connections'], config['enc_nhids']) if config['trg_sparse_feat_map']: target_sentence = tensor.tensor3('target') decoder = NoLookupDecoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['att_nhids'], config['maxout_nhids'], config['enc_nhids'] * 2, config['attention'], config['dec_attention_sources'], config['dec_readout_sources'], config['memory'], config['memory_size'], config['seq_len'], config['dec_init']) else: target_sentence = tensor.lmatrix('target') decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['att_nhids'], config['maxout_nhids'], config['enc_nhids'] * 2, config['attention'], config['dec_attention_sources'], config['dec_readout_sources'], config['memory'], config['memory_size'], config['seq_len'], config['dec_init'], make_prunable=make_prunable) if config['annotations'] != 'direct': annotators = [] add_direct = False for name in config['annotations'].split(','): if name == 'direct': add_direct = True elif name == 'hierarchical': annotators.append(HierarchicalAnnotator(encoder)) else: logging.fatal("Annotation strategy %s unknown" % name) encoder = EncoderWithAnnotators(encoder, annotators, add_direct) annotations, annotations_mask = encoder.apply(source_sentence, source_sentence_mask) self.cost = decoder.cost(annotations, annotations_mask, target_sentence, target_sentence_mask) logging.info('Creating computational graph') self.cg = ComputationGraph(self.cost) # Initialize model logging.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() try: encoder.bidir.prototype.weights_init = Orthogonal() except AttributeError: pass # Its fine, no bidirectional encoder decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logging.info('Applying dropout') dropout_inputs = [ x for x in self.cg.intermediary_variables if x.name == 'maxout_apply_output' ] self.cg = 
apply_dropout(self.cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logging.info('Applying weight noise to ff layers') if encoder.lookup: enc_params = Selector(encoder.lookup).get_parameters().values() enc_params += Selector(encoder.fwd_fork).get_parameters().values() enc_params += Selector(encoder.back_fork).get_parameters().values() dec_params = Selector( decoder.sequence_generator.readout).get_parameters().values() dec_params += Selector( decoder.sequence_generator.fork).get_parameters().values() self.cg = apply_noise(self.cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in self.cg.parameters] logging.debug("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logging.debug(' {:15}: {}'.format(shape, count)) logging.debug("Total number of CG parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logging.debug("Parameter names: ") for name, value in enc_dec_param_dict.items(): logging.debug(' {:15}: {}'.format(value.get_value().shape, name)) logging.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logging.info("Building model") self.training_model = Model(self.cost) logging.info("Building sampling model") src_shape = (self.sampling_input.shape[-2], self.sampling_input.shape[-1]) # batch_size x sen_length sampling_representation, _ = encoder.apply(self.sampling_input, tensor.ones(src_shape)) generated = decoder.generate(src_shape, sampling_representation) self.search_model = Model(generated) generated_outputs = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs self.samples = generated_outputs[1] self.encoder = encoder self.decoder = decoder
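Both NMT set_up functions in this collection apply weight noise to hand-picked sub-bricks by collecting their parameters with Selector instead of filtering by role. A generic sketch of that pattern follows; the bricks named in the usage comment mirror the snippet above and are hypothetical here, and Selector.get_parameters is spelled get_params in older Blocks releases (both spellings appear in these snippets).
from blocks.select import Selector
from blocks.graph import apply_noise

def noise_on_bricks(cg, bricks, level):
    # Collect every parameter owned by the given bricks (and their children)
    # and inject Gaussian noise only into those.
    params = []
    for brick in bricks:
        params.extend(Selector(brick).get_parameters().values())
    return apply_noise(cg, params, level)

# e.g. (hypothetical bricks mirroring the snippet above):
# cg = noise_on_bricks(cg, [encoder.lookup, encoder.fwd_fork,
#                           encoder.back_fork,
#                           decoder.sequence_generator.readout,
#                           decoder.sequence_generator.fork],
#                      config['weight_noise_ff'])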
def __init__(self, config): inp = tensor.imatrix('bytes') embed = theano.shared(config.embedding_matrix.astype(theano.config.floatX), name='embedding_matrix') in_repr = embed[inp.flatten(), :].reshape((inp.shape[0], inp.shape[1], config.repr_dim)) in_repr.name = 'in_repr' bricks = [] states = [] # Construct predictive GRU hierarchy hidden = [] costs = [] next_target = in_repr.dimshuffle(1, 0, 2) for i, (hdim, cf, q) in enumerate(zip(config.hidden_dims, config.cost_factors, config.hidden_q)): init_state = theano.shared(numpy.zeros((config.num_seqs, hdim)).astype(theano.config.floatX), name='st0_%d'%i) linear = Linear(input_dim=config.repr_dim, output_dim=3*hdim, name="lstm_in_%d"%i) lstm = GatedRecurrent(dim=hdim, activation=config.activation_function, name="lstm_rec_%d"%i) linear2 = Linear(input_dim=hdim, output_dim=config.repr_dim, name='lstm_out_%d'%i) tanh = Tanh('lstm_out_tanh_%d'%i) bricks += [linear, lstm, linear2, tanh] if i > 0: linear1 = Linear(input_dim=config.hidden_dims[i-1], output_dim=3*hdim, name='lstm_in2_%d'%i) bricks += [linear1] next_target = tensor.cast(next_target, dtype=theano.config.floatX) inter = linear.apply(theano.gradient.disconnected_grad(next_target)) if i > 0: inter += linear1.apply(theano.gradient.disconnected_grad(hidden[-1][:-1,:,:])) new_hidden = lstm.apply(inputs=inter[:,:,:hdim], gate_inputs=inter[:,:,hdim:], states=init_state) states.append((init_state, new_hidden[-1, :, :])) hidden += [tensor.concatenate([init_state[None,:,:], new_hidden],axis=0)] pred = tanh.apply(linear2.apply(hidden[-1][:-1,:,:])) costs += [numpy.float32(cf) * (-next_target * pred).sum(axis=2).mean()] costs += [numpy.float32(cf) * q * abs(pred).sum(axis=2).mean()] diff = next_target - pred next_target = tensor.ge(diff, 0.5) - tensor.le(diff, -0.5) # Construct output from hidden states hidden = [s.dimshuffle(1, 0, 2) for s in hidden] out_parts = [] out_dims = config.out_hidden + [config.io_dim] for i, (dim, state) in enumerate(zip(config.hidden_dims, hidden)): pred_linear = Linear(input_dim=dim, output_dim=out_dims[0], name='pred_linear_%d'%i) bricks.append(pred_linear) lin = theano.gradient.disconnected_grad(state) out_parts.append(pred_linear.apply(lin)) # Do prediction and calculate cost out = sum(out_parts) if len(out_dims) > 1: out = config.out_hidden_act[0](name='out_act0').apply(out) mlp = MLP(dims=out_dims, activations=[x(name='out_act%d'%i) for i, x in enumerate(config.out_hidden_act[1:])] +[Identity()], name='out_mlp') bricks.append(mlp) out = mlp.apply(out.reshape((inp.shape[0]*(inp.shape[1]+1),-1)) ).reshape((inp.shape[0],inp.shape[1]+1,-1)) pred = out.argmax(axis=2) cost = Softmax().categorical_cross_entropy(inp.flatten(), out[:,:-1,:].reshape((inp.shape[0]*inp.shape[1], config.io_dim))).mean() error_rate = tensor.neq(inp.flatten(), pred[:,:-1].flatten()).mean() sgd_cost = cost + sum(costs) # Initialize all bricks for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize() # apply noise cg = ComputationGraph([sgd_cost, cost, error_rate]+costs) if config.weight_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.weight_noise) sgd_cost = cg.outputs[0] cost = cg.outputs[1] error_rate = cg.outputs[2] costs = cg.outputs[3:] # put stuff into self that is usefull for training or extensions self.sgd_cost = sgd_cost sgd_cost.name = 'sgd_cost' for i in range(len(costs)): costs[i].name = 'pred_cost_%d'%i cost.name = 'cost' error_rate.name = 'error_rate' self.monitor_vars = 
[costs, [cost], [error_rate]] self.out = out[:,1:,:] self.pred = pred[:,1:] self.states = states
cost = model.cost(**inputs) cg = ComputationGraph(cost) monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables)) valid_monitored = monitored if hasattr(model, 'valid_cost'): valid_cost = model.valid_cost(**inputs) valid_cg = ComputationGraph(valid_cost) valid_monitored = set([valid_cost] + VariableFilter( roles=[roles.COST])(valid_cg.variables)) if hasattr(config, 'dropout') and config.dropout < 1.0: cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout) if hasattr(config, 'noise') and config.noise > 0.0: cg = apply_noise(cg, config.noise_inputs(cg), config.noise) cost = cg.outputs[0] cg = Model(cost) logger.info('# Parameter shapes:') parameters_size = 0 for value in cg.parameters: logger.info(' %20s %s' % (value.get_value().shape, value.name)) parameters_size += reduce(operator.mul, value.get_value().shape, 1) logger.info('Total number of parameters: %d in %d matrices' % (parameters_size, len(cg.parameters))) if hasattr(config, 'step_rule'): step_rule = config.step_rule else: step_rule = AdaDelta()
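Here dropout and noise are switched on by config attributes, and the config itself supplies the callables that pick which variables to regularize. A minimal sketch of such a config-driven hook, with an illustrative config object (its values and selectors are placeholders, not taken from the snippet):
from blocks.graph import ComputationGraph, apply_dropout, apply_noise
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT, INPUT

def regularize_from_config(cost, config):
    cg = ComputationGraph(cost)
    if getattr(config, 'dropout', 1.0) < 1.0:
        cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout)
    if getattr(config, 'noise', 0.0) > 0.0:
        cg = apply_noise(cg, config.noise_inputs(cg), config.noise)
    return cg.outputs[0], cg

class ExampleConfig(object):
    dropout = 0.5
    noise = 0.01

    @staticmethod
    def dropout_inputs(cg):
        return VariableFilter(roles=[INPUT])(cg.variables)

    @staticmethod
    def noise_inputs(cg):
        return VariableFilter(roles=[WEIGHT])(cg.variables)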
def initialize_graph(recognizer, data, config, params): # Separate attention_params to be handled differently # when regularization is applied attentions = recognizer.all_children().generator.transition.attention.get() attention_params = [Selector(attention).get_parameters().values() for attention in attentions] logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) observables = [] # monitored each batch cg = recognizer.get_cost_graph(batch=True) labels = [] labels_mask = [] for chld in recognizer.children: lbls = VariableFilter(applications=[chld.cost], name='labels'+chld.names_postfix)(cg) lbls_mask = VariableFilter(applications=[chld.cost], name='labels_mask'+chld.names_postfix)(cg) if len(lbls) == 1: labels += lbls labels_mask += lbls_mask batch_cost = cg.outputs[0].sum() batch_size = rename(labels[0].shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. cost_cg = ComputationGraph(cost) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=recognizer.all_children().bottom.apply.get(), name_regex="output")( cost_cg) attended = VariableFilter( applications=recognizer.all_children().generator.transition.apply.get(), name="attended")( cost_cg) attended_mask = VariableFilter( applications=recognizer.all_children().generator.transition.apply.get(), name="attended_mask")( cost_cg) weights = VariableFilter( applications=recognizer.all_children().generator.evaluate.get(), name="weights")( cost_cg) def get_renamed_list(rlist, elem_func, elem_name): return [rename(elem_func(elem), elem_name+chld.names_postfix) for elem,chld in zip(rlist, recognizer.children)] max_sentence_lengths = get_renamed_list(bottom_output, lambda e: e.shape[0], "max_sentence_length") max_attended_mask_lengths = get_renamed_list(attended_mask, lambda e: e.shape[0], "max_attended_mask_length") max_attended_lengths = get_renamed_list(attended, lambda e: e.shape[0], "max_attended_length") max_num_characters = get_renamed_list(labels, lambda e: e.shape[0], "max_num_characters") mean_attended = get_renamed_list(attended, lambda e: abs(e).mean(), "mean_attended") mean_bottom_output = get_renamed_list(bottom_output, lambda e: abs(e).mean(), "mean_bottom_output") mask_density = get_renamed_list(labels_mask, lambda e: e.mean(), "mask_density") weights_entropy = [rename(entropy(w, lm), "weights_entropy"+chld.names_postfix) for w, lm, chld in zip(weights, labels_mask, recognizer.children)] observables += max_attended_lengths + max_attended_mask_lengths + max_sentence_lengths # # Monitoring of cost terms is tricky because of Blocks #514 - since the # costs are annotations that are not part of the original output graph, # they are unaffected by replacements such as dropout!! 
# cost_terms = [] for chld in recognizer.children: chld_cost_terms = VariableFilter(applications=[chld.generator.evaluate], name_regex='.*_nll')(cost_cg) chld_cost_terms = [rename(var, var.name[:-4] + chld.names_postfix + '_nll') for var in chld_cost_terms] cost_terms += chld_cost_terms cg = ComputationGraph([cost, batch_size] + weights_entropy + mean_attended + mean_bottom_output + max_num_characters + mask_density + cost_terms) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config['regularization'] regularized_cg = cg if reg_config.get('dropout'): drop_conf = reg_config['dropout'] bot_drop = drop_conf.get('bottom', 0.0) if bot_drop: logger.info('apply bottom dropout') regularized_cg = apply_dropout(regularized_cg, bottom_output, bot_drop) enc_drop = drop_conf.get('encoder', 0.0) if enc_drop: logger.info('apply encoder dropout') enc_bricks = reduce(lambda acc,x: acc+list(x), recognizer.all_children().encoder.children.get(), []) enc_states = VariableFilter(bricks=enc_bricks, name_regex='states')(regularized_cg) regularized_cg = apply_dropout(regularized_cg, enc_states, enc_drop) post_merge_drop = drop_conf.get('post_merge', 0.0) if post_merge_drop: logger.info('apply post_merge dropout') pm_bricks = [] for chld in recognizer.children: cpm_bricks = list(chld.generator.readout.post_merge.children) cpm_bricks += cpm_bricks[-1].children cpm_bricks = [b for b in cpm_bricks if isinstance(b, type(chld.post_merge_activation))] pm_bricks += cpm_bricks regularized_cg = apply_dropout( regularized_cg, VariableFilter(bricks=pm_bricks, name='output')(regularized_cg), post_merge_drop) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = (train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2) train_cost = train_cost.copy(name='train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=SpeechModel(regularized_cg.outputs[0] ).get_parameter_dict().values(), **reg_config.get('adaptive_noise') ) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1]] # model prior variance if len(cost_terms): # Please note - the aggragation (mean) is done in # "attach_aggregation_schemes" ct_names = [v.name for v in cost_terms] for v in regularized_cg.outputs: if v.name in ct_names: observables.append(rename(v.sum()/batch_size, v.name)) for chld in recognizer.children: if chld.train_tags: tags_cost = VariableFilter(applications=[chld.addTagCost], name='output')(regularized_cg)[0] observables += [rename(tags_cost.sum()/batch_size, 'tags_nll'+chld.names_postfix)] # Model is weird class, we spend lots of time arguing with Bart # what it should be. However it can already nice things, e.g. # one extract all the parameters from the computation graphs # and give them hierahical names. This help to notice when a # because of some bug a parameter is not in the computation # graph. 
model = SpeechModel(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that does not have # shared variables added by adaptive weight noise param_values = load_parameter_values(params) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in parameters.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] return { 'observables': observables, 'max_norm_rules': max_norm_rules, 'cg': cg, 'regularized_cg' : regularized_cg, 'train_cost' : train_cost, 'cost' : cost, 'batch_size' : batch_size, 'batch_cost' : batch_cost, 'parameters' : parameters, 'gradients': gradients, 'model' : model, 'data' : data, 'recognizer' : recognizer, 'weights_entropy' : weights_entropy, 'labels_mask' : labels_mask, 'labels' : labels }
def main(config, tr_stream, dev_stream, use_bokeh=True): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( 
bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: logger.info("Adding bokeh plot extension") extensions.append( Plot('De-En', channels=[['decoder_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
def __init__(self): inp = tensor.lmatrix('bytes') in_onehot = tensor.eq(tensor.arange(io_dim, dtype='int16').reshape((1, 1, io_dim)), inp[:, :, None]) in_onehot.name = 'in_onehot' gfgru = GFGRU(input_dim=io_dim, recurrent_blocks=recurrent_blocks, control_hidden=control_hidden, control_hidden_activations=control_hidden_activations) hidden_total_dim = sum(x for (x, _, _, _) in recurrent_blocks) prev_states_dict = {} for i, (dim, _, _, _) in enumerate(recurrent_blocks): prev_state = theano.shared(numpy.zeros((num_seqs, dim)).astype(theano.config.floatX), name='states_save') prev_states_dict['init_state_%d'%i] = prev_state states = [x.dimshuffle(1, 0, 2) for x in gfgru.apply(in_onehot.dimshuffle(1, 0, 2), **prev_states_dict)] self.states = [] for i, _ in enumerate(recurrent_blocks): self.states.append((prev_states_dict['init_state_%d'%i], states[i][:, -1, :])) states_concat = tensor.concatenate(states, axis=2) out_mlp = MLP(dims=[hidden_total_dim] + output_hidden + [io_dim], activations=output_hidden_activations + [None], name='output_mlp') states_sh = states_concat.reshape((inp.shape[0]*inp.shape[1], hidden_total_dim)) out = out_mlp.apply(states_sh).reshape((inp.shape[0], inp.shape[1], io_dim)) # Do prediction and calculate cost pred = out.argmax(axis=2) cost = Softmax().categorical_cross_entropy(inp[:, 1:].flatten(), out[:, :-1, :].reshape((inp.shape[0]*(inp.shape[1]-1), io_dim))) error_rate = tensor.neq(inp[:, 1:].flatten(), pred[:, :-1].flatten()).mean() # Initialize all bricks for brick in [gfgru, out_mlp]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() # Apply noise and dropout cg = ComputationGraph([cost, error_rate]) if weight_noise_std > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, weight_noise_std) if recurrent_h_dropout > 0: dv = gfgru.recurrent_h_dropout_vars(cg) print "Recurrent H dropout on", len(dv), "vars" cg = apply_dropout(cg, dv, recurrent_h_dropout) if control_h_dropout > 0: dv = gfgru.control_h_dropout_vars(cg) print "Control H dropout on", len(dv), "vars" cg = apply_dropout(cg, dv, control_h_dropout) if output_h_dropout > 0: dv = VariableFilter(name='input_', bricks=out_mlp.linear_transformations)(cg) print "Output H dropout on", len(dv), "vars" cg = apply_dropout(cg, dv, output_h_dropout) [cost_reg, error_rate_reg] = cg.outputs self.cost = cost self.error_rate = error_rate self.cost_reg = cost_reg self.error_rate_reg = error_rate_reg self.out = out self.pred = pred
def __init__(self, ref_data, output_dim): input_dim = ref_data.shape[1] ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name="ref_data") # Construct the model j = tensor.lvector("j") r = ref_data_sh[j, :] x = tensor.fmatrix("x") y = tensor.ivector("y") # input_dim must be nr mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name="e0") mlp0vs = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name="de0") mlp1 = MLP( activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name="inter_gen" ) mlp2 = MLP( activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name="end_mlp" ) encod = mlp0.apply(r) rprime = mlp0vs.apply(encod) inter_weights = mlp1.apply(encod) ibias = Bias(n_inter) ibias.biases_init = Constant(0) ibias.initialize() inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights))) final = mlp2.apply(inter) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in [mlp0, mlp0vs, mlp1, mlp2]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() # apply regularization cg = ComputationGraph([cost, error_rate]) if r_dropout != 0: # - dropout on input vector r : r_dropout cg = apply_dropout(cg, [r], r_dropout) if s_dropout != 0: # - dropout on intermediate layers of first mlp : s_dropout s_dropout_vars = list( set(VariableFilter(bricks=[Tanh], name="output")(ComputationGraph([inter_weights]))) - set([inter_weights]) ) cg = apply_dropout(cg, s_dropout_vars, s_dropout) if i_dropout != 0: # - dropout on input to second mlp : i_dropout cg = apply_dropout(cg, [inter], i_dropout) if a_dropout != 0: # - dropout on hidden layers of second mlp : a_dropout a_dropout_vars = list( set(VariableFilter(bricks=[Tanh], name="output")(ComputationGraph([final]))) - set([inter_weights]) - set(s_dropout_vars) ) cg = apply_dropout(cg, a_dropout_vars, a_dropout) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) [cost_reg, error_rate_reg] = cg.outputs # add reconstruction penalty for AE part penalty_val = tensor.sqrt(((r - rprime) ** 2).sum(axis=1)).mean() cost_reg = cost_reg + reconstruction_penalty * penalty_val self.cost = cost self.cost_reg = cost_reg self.error_rate = error_rate self.error_rate_reg = error_rate_reg self.pred = pred self.confidence = confidence
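The snippet above chooses its dropout sites by filtering Tanh outputs inside a sub-graph and subtracting variables that are already handled elsewhere. A small self-contained sketch of that selection trick, using two placeholder MLPs (the brick names, dims and dropout rate are illustrative only):
from theano import tensor
from blocks.bricks import MLP, Tanh, Identity
from blocks.initialization import IsotropicGaussian, Constant
from blocks.graph import ComputationGraph, apply_dropout
from blocks.filter import VariableFilter

x = tensor.matrix('x')
mlp_a = MLP([Tanh(), Tanh()], [20, 50, 30], name='mlp_a',
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.))
mlp_b = MLP([Tanh(), Identity()], [30, 50, 10], name='mlp_b',
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.))
mlp_a.initialize()
mlp_b.initialize()
hidden = mlp_a.apply(x)
out = mlp_b.apply(hidden)

cg = ComputationGraph([out])
# Tanh outputs inside mlp_a only: scoping the filter to ComputationGraph([hidden])
# keeps mlp_b's Tanh out, and subtracting {hidden} mirrors the snippet's
# exclusion of the sub-network's final output.
a_sites = list(set(VariableFilter(bricks=[Tanh], name='output')(
    ComputationGraph([hidden]).variables)) - {hidden})
cg = apply_dropout(cg, a_sites, 0.5)
[out_reg] = cg.outputs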
def __init__(self, config, vocab_size): question = tensor.imatrix('question') # set up 32-bit integer matrices question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.ivector('answer') candidates = tensor.imatrix('candidates') candidates_mask = tensor.imatrix('candidates_mask') # and the multple choice answers: ans1 = tensor.ivector('ans1') ans1_mask = tensor.ivector('ans1_mask') ans2 = tensor.ivector('ans2') ans2_mask = tensor.ivector('ans2_mask') ans3 = tensor.ivector('ans3') ans3_mask = tensor.ivector('ans3_mask') ans4 = tensor.ivector('ans4') ans4_mask = tensor.ivector('ans4_mask') bricks = [] # inverts 1st and 2nd dimensions of matrix question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) # Embed questions and cntext embed = LookupTable(vocab_size, config.embed_size, name='question_embed') bricks.append(embed) qembed = embed.apply(question) cembed = embed.apply(context) a1embed = embed.apply(ans1) a2embed = embed.apply(ans2) a3embed = embed.apply(ans3) a4embed = embed.apply(ans4) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + qlstms + clstms # Calculate question encoding (concatenate layer1) if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) if config.ctx_skip_connections: cenc_dim = 2*sum(config.ctx_lstm_size) cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2*config.ctx_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP attention_mlp = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp') attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') bricks += [attention_mlp, attention_qlinear, attention_clinear] layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2]))) .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0])) + attention_qlinear.apply(qenc)[None, :, :]) layer1.name = 'layer1' att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2]))) att_weights.name = 'att_weights_0' att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1])) att_weights.name = 'att_weights' attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0) attended.name = 'attended' # Now we can calculate our output out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities], activations=config.out_mlp_activations + [Identity()], name='out_mlp') bricks += [out_mlp] probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1)) 
probs.name = 'probs' # not needed anymore, since we're not only looking at entities # is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :], # tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1) # probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs)) # Calculate prediction, cost and error rate # vocab = tensor.arange(10) # probs = numpy.asarray([0, 0.8, 0, 0.2], dtype=numpy.float32) # context = numpy.asarray([3, 2, 8, 1], dtype=numpy.int32) # ans3 = numpy.asarray([2, 8, 1], dtype=numpy.int32) # ans1 = numpy.asarray([1, 3, 4], dtype=numpy.int32) # ans2 = numpy.asarray([1, 1, 4], dtype=numpy.int32) # convert probs vector to one that's the same size as vocab, with all zeros except probs: # probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs)) probsPadded = tensor.zeros_like(vocab_size, dtype=numpy.float32) probsSubset = probsPadded[cembed] #TODO this should be masked b = tensor.set_subtensor(probsSubset, probs) # get the similarity score of each (masked) answer with the context probs: ans1probs = b[a1enc] ans1score = tensor.switch(ans1_mask, ans1probs, tensor.zeros_like(ans1probs)).sum() ans2probs = b[a2enc] ans2score = ans2probs.sum() ans3probs = b[a3enc] ans3score = ans3probs.sum() ans4probs = b[a4enc] ans4score = ans4probs.sum() # and pick the best one: allans = tensor.stacklists([ans1score, ans2score, ans3score, ans4score]) pred = tensor.argmax(allans) cg = ComputationGraph([ans1probs, ans1score, ans2probs, ans2score, ans3probs, ans3score, ans4probs, ans4score, allans, pred]) f = cg.get_theano_function() out = f() #pred = probs.argmax(axis=1) #print "pred" #print pred TODO CHANGE THIS! cost = Softmax().categorical_cross_entropy(answer, probs).mean() error_rate = tensor.neq(answer, pred).mean() # Apply dropout cg = ComputationGraph([cost, error_rate]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg, error_rate_reg] = cg.outputs # Other stuff cost_reg.name = cost.name = 'cost' error_rate_reg.name = error_rate.name = 'error_rate' self.probs = probs self.probs.name = "probs" self.cost = cost self.cost.name = "cost" # self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg], [error_rate_reg]] self.monitor_vars_valid = [[cost], [error_rate]] # Initialize bricks for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
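The candidate-scoring idea sketched (and partly commented out) above is easier to see in isolation: per-context-token probabilities are scattered into a vocabulary-sized vector with tensor.set_subtensor, and each answer is scored by summing the probabilities of its (masked) tokens. Below is a minimal standalone sketch; every name and size in it is an illustrative assumption, not one of the model's actual tensors.

# Scatter per-token probabilities into a vocab-sized vector, then score answers.
import numpy
import theano
from theano import tensor

vocab_size = 10
context = tensor.ivector('context')              # token ids of the context
context_probs = tensor.vector('context_probs')   # one probability per context token
ans = tensor.ivector('ans')                      # token ids of one candidate answer
ans_mask = tensor.vector('ans_mask')             # 1 for real tokens, 0 for padding

vocab_probs = tensor.zeros((vocab_size,), dtype=theano.config.floatX)
vocab_probs = tensor.set_subtensor(vocab_probs[context], context_probs)
# Sum the probabilities of the answer's tokens, ignoring padded positions.
ans_score = (vocab_probs[ans] * ans_mask).sum()

f = theano.function([context, context_probs, ans, ans_mask], ans_score)
print(f(numpy.array([3, 2, 8, 1], dtype='int32'),
        numpy.array([0.1, 0.6, 0.2, 0.1], dtype=theano.config.floatX),
        numpy.array([2, 8, 0], dtype='int32'),
        numpy.array([1., 1., 0.], dtype=theano.config.floatX)))  # 0.6 + 0.2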
def set_up(self, config = None, make_prunable = False): """Loads and initializes all the theano variables for the training model and the decoding model. Args: config (dict): NMT configuration """ if config: self.config = config else: config = self.config # Create Theano variables logging.debug('Creating theano variables') source_sentence_mask = tensor.matrix('source_mask') target_sentence_mask = tensor.matrix('target_mask') # Construct model (fs439: Add NoLookup options) if config['dec_layers'] != 1: logging.fatal("Only dec_layers=1 supported.") logging.debug('Building RNN encoder-decoder') if config['src_sparse_feat_map']: if config['enc_layers'] != 1: logging.fatal("Only enc_layers=1 supported for sparse " "source features.") source_sentence = tensor.tensor3('source') self.sampling_input = tensor.tensor3('input') encoder = NoLookupEncoder(config['enc_embed'], config['enc_nhids']) else: source_sentence = tensor.lmatrix('source') self.sampling_input = tensor.lmatrix('input') if config['enc_layers'] > 1 and not config['enc_share_weights']: encoder = DeepBidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_layers'], config['enc_skip_connections'], config['enc_nhids']) else: encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_layers'], config['enc_skip_connections'], config['enc_nhids']) if config['trg_sparse_feat_map']: target_sentence = tensor.tensor3('target') decoder = NoLookupDecoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['att_nhids'], config['maxout_nhids'], config['enc_nhids'] * 2, config['attention'], config['dec_attention_sources'], config['dec_readout_sources'], config['memory'], config['memory_size'], config['seq_len'], config['dec_init']) else: target_sentence = tensor.lmatrix('target') decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['att_nhids'], config['maxout_nhids'], config['enc_nhids'] * 2, config['attention'], config['dec_attention_sources'], config['dec_readout_sources'], config['memory'], config['memory_size'], config['seq_len'], config['dec_init'], make_prunable=make_prunable) if config['annotations'] != 'direct': annotators = [] add_direct = False for name in config['annotations'].split(','): if name == 'direct': add_direct = True elif name == 'hierarchical': annotators.append(HierarchicalAnnotator(encoder)) else: logging.fatal("Annotation strategy %s unknown" % name) encoder = EncoderWithAnnotators(encoder, annotators, add_direct) annotations, annotations_mask = encoder.apply(source_sentence, source_sentence_mask) self.cost = decoder.cost(annotations, annotations_mask, target_sentence, target_sentence_mask) logging.info('Creating computational graph') self.cg = ComputationGraph(self.cost) # Initialize model logging.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() try: encoder.bidir.prototype.weights_init = Orthogonal() except AttributeError: pass # Its fine, no bidirectional encoder decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logging.info('Applying dropout') dropout_inputs = [x for x in self.cg.intermediary_variables if x.name == 'maxout_apply_output'] self.cg = 
apply_dropout(self.cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logging.info('Applying weight noise to ff layers') if encoder.lookup: enc_params = Selector(encoder.lookup).get_parameters().values() enc_params += Selector(encoder.fwd_fork).get_parameters().values() enc_params += Selector(encoder.back_fork).get_parameters().values() dec_params = Selector( decoder.sequence_generator.readout).get_parameters().values() dec_params += Selector( decoder.sequence_generator.fork).get_parameters().values() self.cg = apply_noise(self.cg, enc_params+dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in self.cg.parameters] logging.debug("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logging.debug(' {:15}: {}'.format(shape, count)) logging.debug("Total number of CG parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logging.debug("Parameter names: ") for name, value in enc_dec_param_dict.items(): logging.debug(' {:15}: {}'.format(value.get_value().shape, name)) logging.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logging.info("Building model") self.training_model = Model(self.cost) logging.info("Building sampling model") src_shape = (self.sampling_input.shape[-2], self.sampling_input.shape[-1]) # batch_size x sen_length sampling_representation,_ = encoder.apply(self.sampling_input, tensor.ones(src_shape)) generated = decoder.generate(src_shape, sampling_representation) self.search_model = Model(generated) generated_outputs = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs self.samples = generated_outputs[1] self.encoder = encoder self.decoder = decoder
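The parameter bookkeeping in set_up (Selector(...).get_parameters() plus the Counter over shapes) can be exercised on its own. Below is a small sketch with two toy Linear bricks standing in for the encoder and decoder; merge is assumed to come from toolz, which Blocks already depends on, and the same .values() lists are what gets handed to apply_noise above.

# Collect parameters from several bricks and summarise their shapes.
from collections import Counter
from toolz import merge
from blocks.bricks import Linear
from blocks.initialization import IsotropicGaussian, Constant
from blocks.select import Selector

enc = Linear(10, 20, name='encoder_proj',
             weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
dec = Linear(20, 10, name='decoder_proj',
             weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
enc.initialize()
dec.initialize()

# {path: shared variable}, keyed by the brick hierarchy (e.g. '/encoder_proj.W').
param_dict = merge(Selector(enc).get_parameters(), Selector(dec).get_parameters())
shapes = [p.get_value().shape for p in param_dict.values()]
for shape, count in Counter(shapes).most_common():
    print('{0}: {1}'.format(shape, count))
# list(param_dict.values()) is the kind of list passed to apply_noise above.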
def main(save_to, num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=200, num_batches=None): if feature_maps is None: feature_maps = [32, 32, 64, 64, 128, 128] if mlp_hiddens is None: mlp_hiddens = [1000] if conv_sizes is None: conv_sizes = [7, 5, 5, 5, 3, 3] if pool_sizes is None: pool_sizes = [2, 2, 2, 2, 2, 2] image_size = (128, 128) batch_size = 64 output_size = 2 learningRate = 0.01 drop_prob = 0.4 weight_noise = 0.75 num_epochs = 150 num_batches = None # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 3, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info("Input dim: {} {} {}".format( *convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format( i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('image_features') y = tensor.lmatrix('targets') # Normalize input and apply the convnet probs = convnet.apply(x) # Save on csv # numpy.save(probs) cost = (CategoricalCrossEntropy().apply(y.flatten(), probs) .copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs) .copy(name='error_rate')) error_rate2 = error_rate.copy(name='error_rate2') cg = ComputationGraph([cost, error_rate]) weights = VariableFilter(roles=[FILTER, WEIGHT])(cg.variables) ############# Dropout ############# logger.info('Applying dropout') cg = apply_dropout(cg, weights[0:3], drop_prob) dropped_out = VariableFilter(roles=[DROPOUT])(cg.variables) ############# Guaussian Noise ############# logger.info('Applying Gaussian noise') cg = apply_noise(cg, weights, weight_noise) ########### Loading images ##################### from fuel.datasets.dogs_vs_cats import DogsVsCats from fuel.streams import DataStream, ServerDataStream from fuel.schemes import ShuffledScheme from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation from fuel.transformers import Flatten, Cast, ScaleAndShift def create_data(data): stream = DataStream(data, iteration_scheme=ShuffledScheme(data.num_examples, batch_size)) stream = MinimumImageDimensions(stream, image_size, which_sources=('image_features',)) stream = MaximumImageDimensions(stream, image_size, which_sources=('image_features',)) stream = RandomHorizontalSwap(stream, which_sources=('image_features',)) stream = Random2DRotation(stream, which_sources=('image_features',)) #stream = ScikitResize(stream, image_size, which_sources=('image_features',)) stream = ScaleAndShift(stream, 1./255, 0, which_sources=('image_features',)) stream = Cast(stream, dtype='float32', 
which_sources=('image_features',)) return stream stream_data_train = create_data(DogsVsCats(('train',), subset=slice(0, 22500))) stream_data_test = create_data(DogsVsCats(('train',), subset=slice(22500, 25000))) #stream_data_train = create_data(DogsVsCats(('train',), subset=slice(0, 10))) #stream_data_test = create_data(DogsVsCats(('train',), subset=slice(10, 12))) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Momentum(learning_rate=learningRate, momentum=0.7)) #algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Scale(learning_rate=learningRate)) #algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Adam(0.001)) # `Timing` extension reports time for reading data, aggregating a batch # and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs,after_n_batches=num_batches)) extensions.append(DataStreamMonitoring([cost, error_rate],stream_data_test,prefix="valid")) extensions.append(TrainingDataMonitoring([cost, error_rate,aggregation.mean(algorithm.total_gradient_norm)],prefix="train",after_epoch=True)) extensions.append(Checkpoint("Model1_uniform_init.pkl", after_epoch=True, after_training=True, save_separately=['log'])) extensions.append(ProgressBar()) extensions.append(Printing()) host_plot='http://hades.calculquebec.ca:5090' extensions.append(Plot('5C 3*3C 2*2P 204080...F 004LR 09Mom %s %s @ %s' % ('CNN ', datetime.datetime.now(), socket.gethostname()), channels=[['train_error_rate', 'valid_error_rate'], ['train_total_gradient_norm']], after_epoch=True, server_url=host_plot)) logger.info("Building the model") model = Model(cost) ########### Loading images ##################### main_loop = MainLoop( algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
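create_data above also uses transformers that do not appear in its import list (MaximumImageDimensions, RandomHorizontalSwap, ScikitResize). A minimal version of the same pipeline pattern using only transformers that ship with Fuel might look as follows; it assumes the DogsVsCats HDF5 file has already been prepared with fuel-download/fuel-convert, and the crop size and batch size are illustrative.

# A self-contained Fuel pipeline: shuffle, resize, crop, rescale, cast.
from fuel.datasets.dogs_vs_cats import DogsVsCats
from fuel.streams import DataStream
from fuel.schemes import ShuffledScheme
from fuel.transformers.image import MinimumImageDimensions, RandomFixedSizeCrop
from fuel.transformers import ScaleAndShift, Cast

def build_stream(dataset, image_size=(128, 128), batch_size=64):
    stream = DataStream(dataset,
                        iteration_scheme=ShuffledScheme(dataset.num_examples,
                                                        batch_size))
    # Upscale images smaller than the target size, then take a random crop.
    stream = MinimumImageDimensions(stream, image_size,
                                    which_sources=('image_features',))
    stream = RandomFixedSizeCrop(stream, image_size,
                                 which_sources=('image_features',))
    # Map pixel values into [0, 1] and cast to the dtype the model expects.
    stream = ScaleAndShift(stream, 1. / 255, 0,
                           which_sources=('image_features',))
    return Cast(stream, dtype='float32', which_sources=('image_features',))

train_stream = build_stream(DogsVsCats(('train',), subset=slice(0, 22500)))
valid_stream = build_stream(DogsVsCats(('train',), subset=slice(22500, 25000)))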
def initialize_all(config, save_path, bokeh_name, params, bokeh_server, bokeh, test_tag, use_load_ext, load_log, fast_start): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) train_conf = config['training'] recognizer = create_model(config, data, test_tag) # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) prediction, prediction_mask = add_exploration(recognizer, data, train_conf) # # Observables: # primary_observables = [] # monitored each batch secondary_observables = [] # monitored every 10 batches validation_observables = [] # monitored on the validation set cg = recognizer.get_cost_graph(batch=True, prediction=prediction, prediction_mask=prediction_mask) labels, = VariableFilter(applications=[recognizer.cost], name='labels')(cg) labels_mask, = VariableFilter(applications=[recognizer.cost], name='labels_mask')(cg) gain_matrix = VariableFilter( theano_name=RewardRegressionEmitter.GAIN_MATRIX)(cg) if len(gain_matrix): gain_matrix, = gain_matrix primary_observables.append(rename(gain_matrix.min(), 'min_gain')) primary_observables.append(rename(gain_matrix.max(), 'max_gain')) batch_cost = cg.outputs[0].sum() batch_size = rename(recognizer.labels.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_total_cost" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter(applications=[r.generator.readout.readout], name="output_0")(cost_cg) bottom_output = VariableFilter( # We need name_regex instead of name because LookupTable calls itsoutput output_0 applications=[r.bottom.apply], name_regex="output")(cost_cg)[-1] attended, = VariableFilter(applications=[r.generator.transition.apply], name="attended")(cost_cg) attended_mask, = VariableFilter(applications=[ r.generator.transition.apply ], name="attended_mask")(cost_cg) weights, = VariableFilter(applications=[r.generator.evaluate], name="weights")(cost_cg) from blocks.roles import AUXILIARY l2_cost, = VariableFilter(roles=[AUXILIARY], theano_name='l2_cost_aux')(cost_cg) cost_forward, = VariableFilter(roles=[AUXILIARY], theano_name='costs_forward_aux')(cost_cg) max_recording_length = rename(bottom_output.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = rename(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = rename(attended.shape[0], "max_attended_length") max_num_phonemes = rename(labels.shape[0], "max_num_phonemes") min_energy = rename(energies.min(), "min_energy") max_energy = rename(energies.max(), "max_energy") mean_attended = rename(abs(attended).mean(), "mean_attended") mean_bottom_output = rename( abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = rename(monotonicity_penalty(weights, labels_mask), "weights_penalty") weights_entropy = rename(entropy(weights, labels_mask), "weights_entropy") mask_density = rename(labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density ]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config.get('regularization', dict()) regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [ p for p in cg.parameters if p not in attention_params ] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) train_cost = regularized_cg.outputs[0] if reg_config.get("penalty_coof", .0) > 0: # big warning!!! 
# here we assume that: # regularized_weights_penalty = regularized_cg.outputs[1] train_cost = (train_cost + reg_config.get("penalty_coof", .0) * regularized_cg.outputs[1] / batch_size) if reg_config.get("decay", .0) > 0: train_cost = ( train_cost + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters))**2) train_cost = rename(train_cost, 'train_cost') gradients = None if reg_config.get('adaptive_noise'): logger.info('apply adaptive noise') if ((reg_config.get("penalty_coof", .0) > 0) or (reg_config.get("decay", .0) > 0)): logger.error('using adaptive noise with alignment weight panalty ' 'or weight decay is probably stupid') train_cost, regularized_cg, gradients, noise_brick = apply_adaptive_noise( cg, cg.outputs[0], variables=cg.parameters, num_examples=data.get_dataset('train').num_examples, parameters=Model( regularized_cg.outputs[0]).get_parameter_dict().values(), **reg_config.get('adaptive_noise')) train_cost.name = 'train_cost' adapt_noise_cg = ComputationGraph(train_cost) model_prior_mean = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_mean')(adapt_noise_cg)[0], 'model_prior_mean') model_cost = rename( VariableFilter(applications=[noise_brick.apply], name='model_cost')(adapt_noise_cg)[0], 'model_cost') model_prior_variance = rename( VariableFilter(applications=[noise_brick.apply], name='model_prior_variance')(adapt_noise_cg)[0], 'model_prior_variance') regularized_cg = ComputationGraph( [train_cost, model_cost] + regularized_cg.outputs + [model_prior_mean, model_prior_variance]) primary_observables += [ regularized_cg.outputs[1], # model cost regularized_cg.outputs[2], # task cost regularized_cg.outputs[-2], # model prior mean regularized_cg.outputs[-1] ] # model prior variance model = Model(train_cost) if params: logger.info("Load parameters from " + params) # please note: we cannot use recognizer.load_params # as it builds a new computation graph that dies not have # shapred variables added by adaptive weight noise with open(params, 'r') as src: param_values = load_parameters(src) model.set_parameter_values(param_values) parameters = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat([(key, parameters[key].get_value().shape) for key in sorted(parameters.keys())], width=120)) # Define the training algorithm. 
clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append( AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False) > 0: logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [ v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable) ] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat( [name for name, p in parameters.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([ name for name, p in parameters.items() if not p in maxnorm_subjects ])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects) ] burn_in = [] if train_conf.get('burn_in_steps', 0): burn_in.append(BurnIn(num_steps=train_conf['burn_in_steps'])) algorithm = GradientDescent( cost=train_cost, parameters=parameters.values(), gradients=gradients, step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)] + burn_in), on_unused_sources='warn') logger.debug("Scan Ops in the gradients") gradient_cg = ComputationGraph(algorithm.gradients.values()) for op in ComputationGraph(gradient_cg).scans: logger.debug(op) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. secondary_observables += list(regularized_cg.outputs) if not 'train_cost' in [v.name for v in secondary_observables]: secondary_observables += [train_cost] secondary_observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold ] for name, param in parameters.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements**0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements**0.5 step_norm = algorithm.steps[param].norm(2) / num_elements**0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' secondary_observables.append(stats) primary_observables += [ train_cost, algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length ] validation_observables += [ rename(aggregation.mean(batch_cost, batch_size), cost.name), rename(aggregation.sum_(batch_size), 'num_utterances'), weights_entropy, weights_penalty ] def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append( rename(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append( rename(aggregation.mean(var, labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result mon_conf = config['monitoring'] # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append( Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append( TrainingDataMonitoring(primary_observables + [l2_cost, cost_forward], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(secondary_observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes(validation_observables + [l2_cost, cost_forward]), data.get_stream("valid", shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['validate_every_epochs'], every_n_batches=mon_conf['validate_every_batches'], after_training=False) extensions.append(validation) per = PhonemeErrorRate(recognizer, data, **config['monitoring']['search']) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=mon_conf['search_every_epochs'], every_n_batches=mon_conf['search_every_batches'], after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_cost = TrackTheBest( validation.record_name(cost)).set_conditions(before_first_epoch=True, after_epoch=True) extensions += [track_the_best_cost, track_the_best_per] extensions.append( AdaptiveClipping(algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter( data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf.get('num_batches'), after_n_epochs=train_conf.get('num_epochs')).add_condition( ["after_batch"], _gradient_norm_is_none), ] channels = [ # Plot 1: training and validation costs [ average_monitoring.record_name(train_cost), validation.record_name(cost) ], # Plot 2: gradient norm, [ average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold) ], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [ average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label') ], # Plot 5: training and validation monotonicity penalty [ average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording') ] ] if bokeh: extensions += [ Plot(bokeh_name if bokeh_name else os.path.basename(save_path), channels, every_n_batches=10, server_url=bokeh_server), ] extensions += [ Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True).add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension, )).add_condition( ['after_epoch'], OnLogRecord(track_the_best_cost.notification_name), (root_path + "_best_ll" + extension, )), ProgressBar() ] extensions.append(EmbedIPython(use_main_loop_run_caller_env=True)) if config['net']['criterion']['name'].startswith('mse'): extensions.append( 
LogInputsGains(labels, cg, recognizer.generator.readout.emitter, data)) if train_conf.get('patience'): patience_conf = train_conf['patience'] if not patience_conf.get('notification_names'): # setdefault will not work for empty list patience_conf['notification_names'] = [ track_the_best_per.notification_name, track_the_best_cost.notification_name ] extensions.append(Patience(**patience_conf)) extensions.append( Printing(every_n_batches=1, attribute_filter=PrintingFilterList())) return model, algorithm, data, extensions
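The best-model bookkeeping in the extension list above follows a standard Blocks pattern: TrackTheBest writes a notification into the log when a monitored channel improves, and Checkpoint.add_condition saves an extra copy whenever that notification appears. A minimal sketch, with an assumed channel name and file names:

# Save a separate checkpoint whenever the validation error rate hits a new best.
from blocks.extensions.training import TrackTheBest
from blocks.extensions.predicates import OnLogRecord
from blocks.extensions.saveload import Checkpoint

track_best = TrackTheBest('valid_error_rate').set_conditions(
    before_first_epoch=True, after_epoch=True)
checkpoint = Checkpoint('model.tar', after_epoch=True).add_condition(
    ['after_epoch'],
    OnLogRecord(track_best.notification_name),
    ('model_best.tar',))
# Both go into the MainLoop's extension list, TrackTheBest before Checkpoint.
extensions = [track_best, checkpoint]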
def train_model(cost, unregularized_cost, updates, train_stream, valid_stream, args, gate_values=None): step_rule = learning_algorithm(args) cg = ComputationGraph(cost) # ADD REGULARIZATION # WEIGHT NOISE weight_noise = args.weight_noise if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg_train = apply_noise(cg, weights, weight_noise) cost = cg_train.outputs[0] cost.name = "cost_with_weight_noise" cg = ComputationGraph(cost) logger.info(cg.parameters) # Define algorithm algorithm = GradientDescent(cost=cost, step_rule=step_rule, parameters=cg.parameters) # Add the updates to carry the hidden state algorithm.add_updates(updates) # Extensions to be added extensions = [] # Load from a dumped model if args.load_path is not None: extensions.append(Load(args.load_path)) # Generation extension if args.generate: extensions.append(TextGenerationExtension( cost=cost, generation_length=args.generated_text_lenght, initial_text_length=args.initial_text_length, every_n_batches=1, ploting_path=os.path.join(args.save_path, 'prob_plot.png'), softmax_sampling=args.softmax_sampling, dataset=args.dataset, updates=updates, interactive_mode=args.interactive_mode)) # Training and Validation score monitoring extensions.extend([ TrainingDataMonitoring([cost], prefix='train', every_n_batches=args.monitoring_freq), DataStreamMonitoring([cost, unregularized_cost], valid_stream, args.mini_batch_size_valid, args.dataset, state_updates=updates, prefix='valid', before_first_epoch=(args.visualize == "nothing"), every_n_batches=args.monitoring_freq)]) # Creating directory for saving model. if not args.interactive_mode: if not os.path.exists(args.save_path): os.makedirs(args.save_path) elif 'test' in args.save_path: print "Rewriting in " + args.save_path else: raise Exception('Directory already exists') # Early stopping extensions.append(EarlyStopping('valid_' + unregularized_cost.name, args.patience, args.save_path, every_n_batches=args.monitoring_freq)) # Printing extensions.append(ProgressBar()) extensions.append(Printing(every_n_batches=args.monitoring_freq)) # Reset the initial states if args.dataset == "sine": reset_frequency = 1 else: reset_frequency = 100 extensions.append(ResetStates([v for v, _ in updates], every_n_batches=reset_frequency)) # Visualizing extensions if args.interactive_mode: extensions.append(InteractiveMode()) main_loop = MainLoop( model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=extensions ) main_loop.run()
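The `updates` argument that train_model forwards to algorithm.add_updates is what carries the recurrent state from one batch to the next. Below is a minimal sketch of that mechanism with a toy SimpleRecurrent layer; the dimensions, the stand-in cost and the assumption that the inputs are already projected to the state dimension are all illustrative.

# Carry the last hidden state of each batch into the next one via add_updates.
import numpy
import theano
from theano import tensor
from blocks.bricks import Tanh
from blocks.bricks.recurrent import SimpleRecurrent
from blocks.initialization import IsotropicGaussian, Constant
from blocks.graph import ComputationGraph
from blocks.algorithms import GradientDescent, Scale

batch_size, dim = 32, 100
x = tensor.tensor3('features')  # (time, batch, dim), already projected to dim
init_state = theano.shared(
    numpy.zeros((batch_size, dim), dtype=theano.config.floatX),
    name='init_state')

rnn = SimpleRecurrent(dim=dim, activation=Tanh(),
                      weights_init=IsotropicGaussian(0.01),
                      biases_init=Constant(0))
rnn.initialize()
# Passing `states=` sets the initial state of the scan loop.
states = rnn.apply(inputs=x, states=init_state)
cost = states.sum()  # stand-in for a real cost

algorithm = GradientDescent(cost=cost,
                            parameters=ComputationGraph(cost).parameters,
                            step_rule=Scale(0.01))
# After every parameter update, remember the final state for the next batch.
algorithm.add_updates([(init_state, states[-1])])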
def __init__(self): inp = tensor.tensor3('input') inp = inp.dimshuffle(1,0,2) target = tensor.matrix('target') target = target.reshape((target.shape[0],)) product = tensor.lvector('product') missing = tensor.eq(inp, 0) train_input_mean = 1470614.1 train_input_std = 3256577.0 trans_1 = tensor.concatenate((inp[1:,:,:],tensor.zeros((1,inp.shape[1],inp.shape[2]))), axis=0) trans_2 = tensor.concatenate((tensor.zeros((1,inp.shape[1],inp.shape[2])), inp[:-1,:,:]), axis=0) inp = tensor.switch(missing,(trans_1+trans_2)/2, inp) lookup = LookupTable(length = 352, dim=4*hidden_dim) product_embed= lookup.apply(product) salut = tensor.concatenate((inp, missing),axis =2) linear = Linear(input_dim=input_dim+1, output_dim=4*hidden_dim, name="lstm_in") inter = linear.apply(salut) inter = inter + product_embed[None,:,:] lstm = LSTM(dim=hidden_dim, activation=activation_function, name="lstm") hidden, cells = lstm.apply(inter) linear2= Linear(input_dim = hidden_dim, output_dim = out_dim, name="ouput_linear") pred = linear2.apply(hidden[-1])*train_input_std + train_input_mean pred = pred.reshape((product.shape[0],)) cost = tensor.mean(abs((pred-target)/target)) # Initialize all bricks for brick in [linear, linear2, lstm, lookup]: brick.weights_init = IsotropicGaussian(0.1) brick.biases_init = Constant(0.) brick.initialize() # Apply noise and dropout cg = ComputationGraph([cost]) if w_noise_std > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, w_noise_std) if i_dropout > 0: cg = apply_dropout(cg, [hidden], i_dropout) [cost_reg] = cg.outputs cost_reg += 1e-20 if cost_reg is not cost: self.cost = cost self.cost_reg = cost_reg cost_reg.name = 'cost_reg' cost.name = 'cost' self.sgd_cost = cost_reg self.monitor_vars = [[cost, cost_reg]] else: self.cost = cost cost.name = 'cost' self.sgd_cost = cost self.monitor_vars = [[cost]] self.pred = pred pred.name = 'pred'
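The imputation trick at the top of this model (zeros are treated as missing and replaced by the mean of the neighbouring time steps) is worth seeing on its own. A standalone sketch with toy shapes:

# Replace zero entries of a (time, batch, features) sequence by the mean of
# their temporal neighbours, built from shifted copies of the sequence.
import numpy
import theano
from theano import tensor

seq = tensor.tensor3('seq')
missing = tensor.eq(seq, 0)

pad = tensor.zeros((1, seq.shape[1], seq.shape[2]))
next_step = tensor.concatenate([seq[1:], pad], axis=0)   # value at t + 1
prev_step = tensor.concatenate([pad, seq[:-1]], axis=0)  # value at t - 1
filled = tensor.switch(missing, (next_step + prev_step) / 2, seq)

f = theano.function([seq], filled)
data = numpy.array([[[1.]], [[0.]], [[3.]]], dtype=theano.config.floatX)
print(f(data))  # the middle time step becomes (1 + 3) / 2 = 2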
def main( num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=500, num_batches=None ): ############# Architecture ############# if feature_maps is None: feature_maps = [20, 50] if mlp_hiddens is None: mlp_hiddens = [500] if conv_sizes is None: conv_sizes = [5, 5] if pool_sizes is None: pool_sizes = [2, 2] image_size = (32, 32) batch_size = 50 output_size = 2 learningRate = 0.1 num_epochs = 10 num_batches = None delta = 0.01 drop_prob = 0.5 weight_noise = 0.75 # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet( conv_activations, 3, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode="full", weights_init=Uniform(width=0.2), biases_init=Constant(0), ) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=0.2) convnet.layers[1].weights_init = Uniform(width=0.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=0.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=0.11) convnet.initialize() logging.info("Input dim: {} {} {}".format(*convnet.children[0].get_dim("input_"))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format(i, layer.__class__.__name__, *layer.get_dim("output"))) x = tensor.tensor4("image_features") y = tensor.lmatrix("targets") probs = (convnet.apply(x)).copy(name="probs") # Computational Graph just for cost for drop_out and noise application cg_probs = ComputationGraph([probs]) inputs = VariableFilter(roles=[INPUT])(cg_probs.variables) weights = VariableFilter(roles=[FILTER, WEIGHT])(cg_probs.variables) ############# Regularization ############# # regularization = 0 logger.info("Applying regularization") regularization = delta * sum([(W ** 2).mean() for W in weights]) probs.name = "reg_probs" ############# Guaussian Noise ############# logger.info("Applying Gaussian noise") cg_train = apply_noise(cg_probs, weights, weight_noise) ############# Dropout ############# logger.info("Applying dropout") cg_probs = apply_dropout(cg_probs, inputs, drop_prob) dropped_out = VariableFilter(roles=[DROPOUT])(cg_probs.variables) inputs_referenced = [var.tag.replacement_of for var in dropped_out] set(inputs) == set(inputs_referenced) ############# Batch normalization ############# # recalculate probs after dropout and noise and regularization: probs = cg_probs.outputs[0] + regularization cost = CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name="cost") error_rate = MisclassificationRate().apply(y.flatten(), probs).copy(name="error_rate") cg = ComputationGraph([probs, cost, error_rate]) cg = apply_batch_normalization(cg) ########### Loading images ##################### from fuel.datasets.dogs_vs_cats import DogsVsCats from fuel.streams import DataStream, ServerDataStream from fuel.schemes import ShuffledScheme from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation from fuel.transformers import Flatten, Cast, ScaleAndShift def create_data(data): stream = DataStream(data, 
iteration_scheme=ShuffledScheme(data.num_examples, batch_size)) stream_downscale = MinimumImageDimensions(stream, image_size, which_sources=("image_features",)) stream_rotate = Random2DRotation(stream_downscale, which_sources=("image_features",)) stream_max = ScikitResize(stream_rotate, image_size, which_sources=("image_features",)) stream_scale = ScaleAndShift(stream_max, 1.0 / 255, 0, which_sources=("image_features",)) stream_cast = Cast(stream_scale, dtype="float32", which_sources=("image_features",)) # stream_flat = Flatten(stream_scale, which_sources=('image_features',)) return stream_cast stream_data_train = create_data(DogsVsCats(("train",), subset=slice(0, 20))) stream_data_test = create_data(DogsVsCats(("train",), subset=slice(20, 30))) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=learningRate)) # algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Adam(0.001)) # algorithm.add_updates(extra_updates) # `Timing` extension reports time for reading data, aggregating a batch and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [] extensions.append(Timing()) extensions.append(FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches)) extensions.append(DataStreamMonitoring([cost, error_rate], stream_data_test, prefix="valid")) extensions.append( TrainingDataMonitoring( [cost, error_rate, aggregation.mean(algorithm.total_gradient_norm)], prefix="train", after_epoch=True ) ) # extensions.append(Checkpoint(save_to)) extensions.append(ProgressBar()) extensions.append(Printing()) logger.info("Building the model") model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
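Batch normalisation in Blocks is a two-step affair: apply_batch_normalization rewrites the graph to use minibatch statistics for training, and get_batch_normalization_updates returns the pairs needed to keep the population statistics (used at inference time) current, which then go to algorithm.add_updates, as in the commented-out add_updates(extra_updates) line above. A small sketch, under the assumption that the installed Blocks version ships BatchNormalizedMLP and these two helpers; the layer sizes and smoothing factor are arbitrary:

# Train-time batch normalisation: rewrite the graph, then collect the updates
# that maintain the population statistics used at inference time.
from theano import tensor
from blocks.bricks import BatchNormalizedMLP, Rectifier, Softmax
from blocks.bricks.cost import CategoricalCrossEntropy
from blocks.initialization import IsotropicGaussian, Constant
from blocks.graph import (ComputationGraph, apply_batch_normalization,
                          get_batch_normalization_updates)

x = tensor.matrix('features')
y = tensor.lmatrix('targets')
mlp = BatchNormalizedMLP([Rectifier(), Softmax()], [784, 100, 10],
                         weights_init=IsotropicGaussian(0.01),
                         biases_init=Constant(0))
mlp.initialize()
cost = CategoricalCrossEntropy().apply(y.flatten(), mlp.apply(x))

train_cg = apply_batch_normalization(ComputationGraph([cost]))
train_cost = train_cg.outputs[0]
# (population statistic, minibatch statistic) pairs, smoothed into running
# estimates; pass extra_updates to GradientDescent(...).add_updates(...).
pop_updates = get_batch_normalization_updates(train_cg)
alpha = 0.1
extra_updates = [(p, m * alpha + p * (1 - alpha)) for p, m in pop_updates]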
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') better = tensor.imatrix('better') better_mask = tensor.imatrix('better_mask') worse = tensor.imatrix('worse') worse_mask = tensor.imatrix('worse_mask') b_left = tensor.imatrix('b_left') b_left_mask = tensor.imatrix('b_left_mask') b_right = tensor.imatrix('b_right') b_right_mask = tensor.imatrix('b_right_mask') w_left = tensor.imatrix('w_left') w_left_mask = tensor.imatrix('w_left_mask') w_right = tensor.imatrix('w_right') w_right_mask = tensor.imatrix('w_right_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) better = better.dimshuffle(1, 0) better_mask = better_mask.dimshuffle(1, 0) worse = worse.dimshuffle(1, 0) worse_mask = worse_mask.dimshuffle(1, 0) b_left = b_left.dimshuffle(1, 0) b_left_mask = b_left_mask.dimshuffle(1, 0) b_right = b_right.dimshuffle(1, 0) b_right_mask = b_right_mask.dimshuffle(1, 0) w_left = w_left.dimshuffle(1, 0) w_left_mask = w_left_mask.dimshuffle(1, 0) w_right = w_right.dimshuffle(1, 0) w_right_mask = w_right_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # candidate encoders candidates_hidden_list = [] candidate_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_fwd_lstm_in_0_0') candidate_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_fwd_lstm_0') candidate_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='candidate_bwd_lstm_in_0_0') candidate_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='candidate_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [candidate_fwd_lstm, candidate_bwd_lstm, candidate_fwd_lstm_ins, candidate_bwd_lstm_ins] #computing better encoding better_embed = embed.apply(better) better_fwd_tmp = candidate_fwd_lstm_ins.apply(better_embed) better_bwd_tmp = candidate_bwd_lstm_ins.apply(better_embed) better_fwd_hidden, _ = candidate_fwd_lstm.apply(better_fwd_tmp, mask=better_mask.astype(theano.config.floatX)) better_bwd_hidden, _ = candidate_bwd_lstm.apply(better_bwd_tmp[::-1], mask=better_mask.astype(theano.config.floatX)[::-1]) better_hidden_list = [better_fwd_hidden, better_bwd_hidden] better_enc_dim = 2*sum(config.ctx_lstm_size) better_enc = tensor.concatenate([h[-1,:,:] for h in better_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_enc.name = 'better_enc' candidates_hidden_list = candidates_hidden_list + [better_fwd_hidden, better_bwd_hidden] #computing worse encoding worse_embed = embed.apply(worse) 
worse_fwd_tmp = candidate_fwd_lstm_ins.apply(worse_embed) worse_bwd_tmp = candidate_bwd_lstm_ins.apply(worse_embed) worse_fwd_hidden, _ = candidate_fwd_lstm.apply(worse_fwd_tmp, mask=worse_mask.astype(theano.config.floatX)) worse_bwd_hidden, _ = candidate_bwd_lstm.apply(worse_bwd_tmp[::-1], mask=worse_mask.astype(theano.config.floatX)[::-1]) worse_hidden_list = [worse_fwd_hidden, worse_bwd_hidden] worse_enc_dim = 2*sum(config.ctx_lstm_size) worse_enc = tensor.concatenate([h[-1,:,:] for h in worse_hidden_list], axis=1) worse_enc.name = 'worse_enc' candidates_hidden_list = candidates_hidden_list + [worse_fwd_hidden, worse_bwd_hidden] #left encoders left_context_hidden_list = [] left_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_fwd_lstm_in_0_0') left_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_fwd_lstm_0') left_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='left_context_bwd_lstm_in_0_0') left_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='left_context_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [left_context_fwd_lstm, left_context_bwd_lstm, left_context_fwd_lstm_ins, left_context_bwd_lstm_ins] #right encoders right_context_hidden_list = [] right_context_fwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_fwd_lstm_in_0_0') right_context_fwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_fwd_lstm_0') right_context_bwd_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4*config.ctx_lstm_size[0], name='right_context_bwd_lstm_in_0_0') right_context_bwd_lstm = LSTM(dim=config.ctx_lstm_size[0], activation=Tanh(), name='right_context_bwd_lstm_0') #adding encoding bricks for initialization bricks = bricks + [right_context_fwd_lstm, right_context_bwd_lstm, right_context_fwd_lstm_ins, right_context_bwd_lstm_ins] #left half encodings better_left_embed = embed.apply(b_left) better_left_fwd_tmp = left_context_fwd_lstm_ins.apply(better_left_embed) better_left_bwd_tmp = left_context_bwd_lstm_ins.apply(better_left_embed) better_left_fwd_hidden, _ = left_context_fwd_lstm.apply(better_left_fwd_tmp, mask=b_left_mask.astype(theano.config.floatX)) better_left_bwd_hidden, _ = left_context_bwd_lstm.apply(better_left_bwd_tmp[::-1], mask=b_left_mask.astype(theano.config.floatX)[::-1]) better_left_hidden_list = [better_left_fwd_hidden, better_left_bwd_hidden] better_left_enc_dim = 2*sum(config.ctx_lstm_size) better_left_enc = tensor.concatenate([h[-1,:,:] for h in better_left_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_left_enc.name = 'better_left_enc' left_context_hidden_list = left_context_hidden_list + [better_left_fwd_hidden, better_left_bwd_hidden] worse_left_embed = embed.apply(w_left) worse_left_fwd_tmp = left_context_fwd_lstm_ins.apply(worse_left_embed) worse_left_bwd_tmp = left_context_bwd_lstm_ins.apply(worse_left_embed) worse_left_fwd_hidden, _ = left_context_fwd_lstm.apply(worse_left_fwd_tmp, mask=w_left_mask.astype(theano.config.floatX)) worse_left_bwd_hidden, _ = left_context_bwd_lstm.apply(worse_left_bwd_tmp[::-1], mask=w_left_mask.astype(theano.config.floatX)[::-1]) worse_left_hidden_list = [worse_left_fwd_hidden, worse_left_bwd_hidden] worse_left_enc_dim = 2*sum(config.ctx_lstm_size) worse_left_enc = tensor.concatenate([h[-1,:,:] 
for h in worse_left_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size worse_left_enc.name = 'worse_left_enc' left_context_hidden_list = left_context_hidden_list + [worse_left_fwd_hidden, worse_left_bwd_hidden] #right half encoding better_right_embed = embed.apply(b_right) better_right_fwd_tmp = right_context_fwd_lstm_ins.apply(better_right_embed) better_right_bwd_tmp = right_context_bwd_lstm_ins.apply(better_right_embed) better_right_fwd_hidden, _ = right_context_fwd_lstm.apply(better_right_fwd_tmp, mask=b_right_mask.astype(theano.config.floatX)) better_right_bwd_hidden, _ = right_context_bwd_lstm.apply(better_right_bwd_tmp[::-1], mask=b_right_mask.astype(theano.config.floatX)[::-1]) better_right_hidden_list = [better_right_fwd_hidden, better_right_bwd_hidden] better_right_enc_dim = 2*sum(config.ctx_lstm_size) better_right_enc = tensor.concatenate([h[-1,:,:] for h in better_right_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size better_right_enc.name = 'better_right_enc' right_context_hidden_list = right_context_hidden_list + [better_right_fwd_hidden, better_right_bwd_hidden] worse_right_embed = embed.apply(w_right) worse_right_fwd_tmp = right_context_fwd_lstm_ins.apply(worse_right_embed) worse_right_bwd_tmp = right_context_bwd_lstm_ins.apply(worse_right_embed) worse_right_fwd_hidden, _ = right_context_fwd_lstm.apply(worse_right_fwd_tmp, mask=w_right_mask.astype(theano.config.floatX)) worse_right_bwd_hidden, _ = right_context_bwd_lstm.apply(worse_right_bwd_tmp[::-1], mask=w_right_mask.astype(theano.config.floatX)[::-1]) worse_right_hidden_list = [worse_right_fwd_hidden, worse_right_bwd_hidden] worse_right_enc_dim = 2*sum(config.ctx_lstm_size) worse_right_enc = tensor.concatenate([h[-1,:,:] for h in worse_right_hidden_list], axis=1) #concating last state of fwd and bwd LSTMs 2*dim * batch_size worse_right_enc.name = 'worse_right_enc' right_context_hidden_list = right_context_hidden_list + [worse_right_fwd_hidden, worse_right_bwd_hidden] # F1 prediction MLP prediction_mlp = MLP(dims=config.prediction_mlp_hidden + [1], activations=config.prediction_mlp_activations[1:] + [Identity()], name='prediction_mlp') prediction_qlinear = Linear(input_dim=qenc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, name='preq') prediction_cand_linear = Linear(input_dim=worse_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='precand') prediction_left_half_linear = Linear(input_dim=better_left_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='preleft') prediction_right_half_linear = Linear(input_dim=better_right_enc_dim, output_dim=config.prediction_mlp_hidden[0]/4.0, use_bias=False, name='preright') bricks += [prediction_mlp, prediction_qlinear, prediction_cand_linear, prediction_left_half_linear, prediction_right_half_linear] better_layer1 = Tanh('tan1').apply(tensor.concatenate([prediction_cand_linear.apply(better_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(better_left_enc), prediction_right_half_linear.apply(better_right_enc)],axis=1)) better_layer1.name = 'better_layer1' worse_layer1 = Tanh('tan2').apply(tensor.concatenate([prediction_cand_linear.apply(worse_enc), prediction_qlinear.apply(qenc), prediction_left_half_linear.apply(worse_left_enc), prediction_right_half_linear.apply(worse_right_enc)],axis=1)) worse_layer1.name = 'worse_layer1' better_pred_weights = Tanh('rec1').apply(prediction_mlp.apply(better_layer1)) #batch_size worse_pred_weights = 
Tanh('rec2').apply(prediction_mlp.apply(worse_layer1)) #batch_size # numpy.set_printoptions(edgeitems=500) # better_pred_weights = theano.printing.Print('better')(better_pred_weights) # worse_pred_weights = theano.printing.Print('better')(worse_pred_weights) # #cost : max(0,- score-better + score-worse + margin) margin = config.margin conditions = tensor.lt(better_pred_weights, worse_pred_weights + margin).astype(theano.config.floatX) self.predictions = conditions cost = (-better_pred_weights + worse_pred_weights + margin) * conditions cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + candidates_hidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
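The cost above is a pairwise margin ranking loss: the 'better' candidate's score must exceed the 'worse' one's by at least the margin, otherwise the shortfall is penalised. A standalone sketch with made-up scores, written with tensor.maximum (equivalent to the lt-mask formulation used above):

# Pairwise margin ranking loss: max(0, margin - (score_better - score_worse)).
import numpy
import theano
from theano import tensor

better_score = tensor.vector('better_score')
worse_score = tensor.vector('worse_score')
margin = 0.1

cost = tensor.maximum(0, worse_score - better_score + margin).mean()

f = theano.function([better_score, worse_score], cost)
print(f(numpy.array([0.9, 0.2], dtype=theano.config.floatX),
        numpy.array([0.1, 0.4], dtype=theano.config.floatX)))
# Only the second pair violates the margin, contributing 0.3; the mean is 0.15.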
def construct_model(input_dim, out_dim): # Construct the model r = tensor.fmatrix('r') x = tensor.fmatrix('x') y = tensor.ivector('y') nx = x.shape[0] nj = x.shape[1] # also is r.shape[0] nr = r.shape[1] # r is nj x nr # x is nx x nj # y is nx # r_rep is nx x nj x nr r_rep = r[None, :, :].repeat(axis=0, repeats=nx) # x3 is nx x nj x 1 x3 = x[:, :, None] # concat is nx x nj x (nr + 1) concat = tensor.concatenate([r_rep, x3], axis=2) # Change concat from Batch x Time x Features to T X B x F mlp_input = concat.dimshuffle(1, 0, 2) if use_ensembling: # Split time dimension into batches of size num_feats # Join that dimension with the B dimension ens_shape = (num_feats, mlp_input.shape[0]/num_feats, mlp_input.shape[1]) mlp_input = mlp_input.reshape(ens_shape + (input_dim+1,)) mlp_input = mlp_input.reshape((ens_shape[0], ens_shape[1] * ens_shape[2], input_dim+1)) mlp = MLP(dims=[input_dim+1] + mlp_hidden_dims, activations=[activation_function for _ in mlp_hidden_dims], name='mlp') lstm_bot_linear = Linear(input_dim=mlp_hidden_dims[-1], output_dim=4 * lstm_hidden_dim, name="lstm_input_linear") lstm = LSTM(dim=lstm_hidden_dim, activation=activation_function, name="hidden_recurrent") lstm_top_linear = Linear(input_dim=lstm_hidden_dim, output_dim=out_dim, name="out_linear") rnn_input = mlp.apply(mlp_input) pre_rnn = lstm_bot_linear.apply(rnn_input) states = lstm.apply(pre_rnn)[0] activations = lstm_top_linear.apply(states) if use_ensembling: activations = activations.reshape(ens_shape + (out_dim,)) # Unsplit batches (ensembling) activations = tensor.mean(activations, axis=1) # Mean over time activations = tensor.mean(activations, axis=0) cost = Softmax().categorical_cross_entropy(y, activations) pred = activations.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in (mlp, lstm_bot_linear, lstm, lstm_top_linear): brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.) brick.initialize() # apply dropout and noise (both return a new ComputationGraph) cg = ComputationGraph([cost, error_rate]) cg = apply_dropout(cg, [rnn_input], dropout) noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, noise_std) [cost_reg, error_rate_reg] = cg.outputs return cost_reg, error_rate_reg, cost, error_rate
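apply_noise and apply_dropout return a new ComputationGraph rather than modifying the one passed in, so the regularised cost has to be read from the returned graph's outputs; dropout is applied first because its targets are intermediate variables of the original graph, while the weights are shared variables that survive into the rewritten graph. A toy sketch (the MLP, its sizes and the noise/dropout levels are assumptions):

# Chain dropout and weight noise on a toy graph and read the regularised cost
# from the graph that the helpers return.
from theano import tensor
from blocks.bricks import MLP, Rectifier, Identity
from blocks.initialization import IsotropicGaussian, Constant
from blocks.graph import ComputationGraph, apply_noise, apply_dropout
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT, INPUT

x = tensor.matrix('x')
mlp = MLP([Rectifier(), Identity()], [10, 20, 2],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
cost = mlp.apply(x).sum()  # stand-in for a real cost

cg = ComputationGraph([cost])
layer_inputs = VariableFilter(roles=[INPUT],
                              bricks=mlp.linear_transformations)(cg.variables)
weights = VariableFilter(roles=[WEIGHT])(cg.variables)

cg = apply_dropout(cg, layer_inputs, 0.5)   # dropout on the layers' inputs
cg = apply_noise(cg, weights, 0.01)         # Gaussian noise on the weights
regularized_cost = cg.outputs[0]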
def main(config, tr_stream, dev_stream, use_bokeh=False): # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [ x for x in cg.intermediary_variables if x.name == 'maxout_apply_output' ] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( 
bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
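The step rule above is assembled with eval(config['step_rule'])() wrapped in step clipping; written out explicitly for one concrete choice of optimiser (the threshold and learning rate are illustrative), the same construction looks like this:

# Gradient clipping composed with Adam into a single step rule.
from blocks.algorithms import GradientDescent, CompositeRule, StepClipping, Adam

def build_algorithm(cost, parameters, threshold=1.0, learning_rate=0.002):
    step_rule = CompositeRule([StepClipping(threshold),
                               Adam(learning_rate=learning_rate)])
    return GradientDescent(cost=cost, parameters=parameters,
                           step_rule=step_rule)

# Typical call, mirroring the snippet above:
# algorithm = build_algorithm(cost, cg.parameters)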
def train(step_rule, state_dim, epochs, seed, experiment_path, initialization, to_watch, patience, static_mask, batch_size, rnn_type, num_layers, augment, seq_len, drop_prob, drop_prob_states, drop_prob_cells, drop_prob_igates, ogates_zoneout, stoch_depth, share_mask, gaussian_drop, weight_noise, norm_cost_coeff, penalty, input_drop, **kwargs): print '.. cPTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() def numpy_rng(random_seed=None): if random_seed == None: random_seed = 1223 return numpy.random.RandomState(random_seed) ########################################### # # MAKE DATA STREAMS # ########################################### rng = np.random.RandomState(seed) if share_mask: drop_prob_cells = drop_prob # we don't want to actually use these masks, so this is to debug drop_prob_states = None print '.. initializing iterators' if static_mask: train_stream = get_static_mask_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, False, augment=augment) train_stream_evaluation = get_static_mask_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) dev_stream = get_static_mask_ptb_stream('valid', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) else: train_stream = get_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, False, augment=augment) train_stream_evaluation = get_ptb_stream('train', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) dev_stream = get_ptb_stream('valid', batch_size, seq_len, drop_prob_states, drop_prob_cells, drop_prob_igates, state_dim, True, augment=augment) data = train_stream.get_epoch_iterator(as_dict=True).next() #import ipdb; ipdb.set_trace() ########################################### # # BUILD MODEL # ########################################### print '.. 
building model' x = T.tensor3('features', dtype=floatX) x, y = x[:-1], x[1:] drops_states = T.tensor3('drops_states') drops_cells = T.tensor3('drops_cells') drops_igates = T.tensor3('drops_igates') x.tag.test_value = data['features'] #y.tag.test_value = data['outputs'] drops_states.tag.test_value = data['drops_states'] drops_cells.tag.test_value = data['drops_cells'] drops_igates.tag.test_value = data['drops_igates'] if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=.2) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hid = Linear(50, state_dim * 4, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = ZoneoutLSTM(dim=state_dim, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn', ogates_zoneout=ogates_zoneout) elif rnn_type.lower() == 'gru': in_to_hid = Linear(50, state_dim * 3, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = ZoneoutGRU(dim=state_dim, weights_init=weights_init, activation=Tanh(), name='rnn') elif rnn_type.lower() == 'srnn': in_to_hid = Linear(50, state_dim, name='in_to_hid', weights_init=weights_init, biases_init=Constant(0.0)) recurrent_layer = ZoneoutSimpleRecurrent(dim=state_dim, weights_init=weights_init, activation=Rectifier(), name='rnn') else: raise NotImplementedError hid_to_out = Linear(state_dim, 50, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) in_to_hid.initialize() recurrent_layer.initialize() hid_to_out.initialize() h = in_to_hid.apply(x) if rnn_type.lower() == 'lstm': yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates)[0] else: yh = recurrent_layer.apply(h, drops_states, drops_cells, drops_igates) y_hat_pre_softmax = hid_to_out.apply(yh) shape_ = y_hat_pre_softmax.shape # y_hat = Softmax().apply( # y_hat_pre_softmax.reshape((-1, shape_[-1])))# .reshape(shape_) ########################################### # # SET UP COSTS, MONITORS, and REGULARIZATION # ########################################### # cost = CategoricalCrossEntropy().apply(y.flatten().astype('int64'), y_hat) def crossentropy_lastaxes(yhat, y): # for sequence of distributions/targets return -(y * T.log(yhat)).sum(axis=yhat.ndim - 1) def softmax_lastaxis(x): # for sequence of distributions return T.nnet.softmax(x.reshape((-1, x.shape[-1]))).reshape(x.shape) yhat = softmax_lastaxis(y_hat_pre_softmax) cross_entropies = crossentropy_lastaxes(yhat, y) cross_entropy = cross_entropies.mean().copy(name="cross_entropy") cost = cross_entropy.copy(name="cost") batch_cost = cost.copy(name='batch_cost') nll_cost = cost.copy(name='nll_cost') bpc = (nll_cost / np.log(2.0)).copy(name='bpr') #nll_cost = aggregation.mean(batch_cost, batch_size).copy(name='nll_cost') cost_monitor = aggregation.mean( batch_cost, batch_size).copy(name='sequence_cost_monitor') cost_per_character = aggregation.mean( batch_cost, (seq_len - 1) * batch_size).copy(name='character_cost') cost_train = cost.copy(name='train_batch_cost') cost_train_monitor = cost_monitor.copy('train_batch_cost_monitor') cg_train = ComputationGraph([cost_train, cost_train_monitor]) ################## # NORM STABILIZER ################## norm_cost = 0. 
def _magnitude(x, axis=-1): return T.sqrt( T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) ## debugging nans stuff #gr = T.grad(norm_cost, cg_train.parameters, disconnected_inputs='ignore') #grf = theano.function([x, input_mask], gr) #grz = grf(x.tag.test_value, input_mask.tag.test_value) #params = cg_train.parameters #mynanz = [(pp, np.sum(gg)) for pp,gg in zip(params, grz) if np.isnan(np.sum(gg))] #for mm in mynanz: print mm ##import ipdb; ipdb.set_trace() elif penalty == 'hids': assert 'rnn_apply_states' in [ o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables) ] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): if output.name == 'rnn_apply_states': norms = _magnitude(output) norm_cost += T.mean( T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) norm_cost.name = 'norm_cost' cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy( 'cost_train') #should this be cost_train.outputs[0]? cg_train = ComputationGraph([cost_train, cost_train_monitor]) #, norm_cost]) ################## # WEIGHT NOISE ################## if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') cost_train_monitor = cg_train.outputs[1].copy( 'train_batch_cost_monitor') # if 'l2regularization' in kwargs: # weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) # cost_train += kwargs['l2regularization'] * sum([ # (weight ** 2).sum() for weight in weights]) # cost_train.name = 'cost_train' # cg_train = ComputationGraph(cost_train) model = Model(cost_train) train_cost_per_character = aggregation.mean( cost_train_monitor, (seq_len - 1) * batch_size).copy(name='train_character_cost') algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) observed_vars = [ cost_train, cost_train_monitor, train_cost_per_character, aggregation.mean(algorithm.total_gradient_norm) ] # parameters = model.get_parameter_dict() # for name, param in parameters.iteritems(): # observed_vars.append(param.norm(2).copy(name=name + "_norm")) # observed_vars.append( # algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring(variables=observed_vars, prefix="train", after_epoch=True) dev_monitor = DataStreamMonitoring(variables=[nll_cost, bpc], data_stream=dev_stream, prefix="dev") extensions = [] if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: print 'Found: ' + param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name extensions.extend( [FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) 
extensions.append( SaveParams('dev_nll_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) ########################################### # # MAIN LOOOOOOOOOOOP # ########################################### main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) # if write_predictions: # with open('predicted.txt', 'w') as f_pred: # with open('targets.txt', 'w') as f_targets: # evaluator = CTCEvaluator( # eol_symbol, x, input_mask, y_hat, phoneme_dict, black_list) # evaluator.evaluate(dev_stream, file_pred=f_pred, # file_targets=f_targets) # return main_loop.run() print "Execution time: %f" % (time.time() - t1)
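# Minimal sketch (shapes and the 0.1 coefficient are illustrative, not taken from
# the script above) of the norm-stabilizer penalty used there: penalize changes in
# the L2 norm of the hidden state between consecutive time steps.
import numpy
import theano
import theano.tensor as T

def magnitude(x, axis=-1):
    # L2 norm along the last axis, kept away from zero for a stable sqrt.
    return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis),
                            numpy.finfo(theano.config.floatX).tiny))

states = T.tensor3('states')      # (time, batch, dim) hidden states of the RNN
task_cost = (states ** 2).mean()  # stand-in for the real sequence NLL
norms = magnitude(states)         # (time, batch)
norm_cost = T.mean(T.sum((norms[1:] - norms[:-1]) ** 2, axis=0)
                   / (states.shape[0] - 1))
cost_train = task_cost + 0.1 * norm_cost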
def main(num_epochs, feature_maps=None, mlp_hiddens=None, conv_sizes=None, pool_sizes=None, batch_size=500, num_batches=None): ############# Architecture ############# if feature_maps is None: feature_maps = [20, 50] if mlp_hiddens is None: mlp_hiddens = [500] if conv_sizes is None: conv_sizes = [5, 5] if pool_sizes is None: pool_sizes = [2, 2] image_size = (32, 32) batch_size = 50 output_size = 2 learningRate = 0.1 num_epochs = 10 num_batches = None delta = 0.01 drop_prob = 0.5 weight_noise = 0.75 # Use ReLUs everywhere and softmax for the final prediction conv_activations = [Rectifier() for _ in feature_maps] mlp_activations = [Rectifier() for _ in mlp_hiddens] + [Softmax()] convnet = LeNet(conv_activations, 3, image_size, filter_sizes=zip(conv_sizes, conv_sizes), feature_maps=feature_maps, pooling_sizes=zip(pool_sizes, pool_sizes), top_mlp_activations=mlp_activations, top_mlp_dims=mlp_hiddens + [output_size], border_mode='full', weights_init=Uniform(width=.2), biases_init=Constant(0)) # We push initialization config to set different initialization schemes # for convolutional layers. convnet.push_initialization_config() convnet.layers[0].weights_init = Uniform(width=.2) convnet.layers[1].weights_init = Uniform(width=.09) convnet.top_mlp.linear_transformations[0].weights_init = Uniform(width=.08) convnet.top_mlp.linear_transformations[1].weights_init = Uniform(width=.11) convnet.initialize() logging.info( "Input dim: {} {} {}".format(*convnet.children[0].get_dim('input_'))) for i, layer in enumerate(convnet.layers): if isinstance(layer, Activation): logging.info("Layer {} ({})".format(i, layer.__class__.__name__)) else: logging.info("Layer {} ({}) dim: {} {} {}".format( i, layer.__class__.__name__, *layer.get_dim('output'))) x = tensor.tensor4('image_features') y = tensor.lmatrix('targets') probs = (convnet.apply(x)).copy(name='probs') # Computational Graph just for cost for drop_out and noise application cg_probs = ComputationGraph([probs]) inputs = VariableFilter(roles=[INPUT])(cg_probs.variables) weights = VariableFilter(roles=[FILTER, WEIGHT])(cg_probs.variables) ############# Regularization ############# #regularization = 0 logger.info('Applying regularization') regularization = delta * sum([(W**2).mean() for W in weights]) probs.name = "reg_probs" ############# Guaussian Noise ############# logger.info('Applying Gaussian noise') cg_train = apply_noise(cg_probs, weights, weight_noise) ############# Dropout ############# logger.info('Applying dropout') cg_probs = apply_dropout(cg_probs, inputs, drop_prob) dropped_out = VariableFilter(roles=[DROPOUT])(cg_probs.variables) inputs_referenced = [var.tag.replacement_of for var in dropped_out] set(inputs) == set(inputs_referenced) ############# Batch normalization ############# # recalculate probs after dropout and noise and regularization: probs = cg_probs.outputs[0] + regularization cost = (CategoricalCrossEntropy().apply(y.flatten(), probs).copy(name='cost')) error_rate = (MisclassificationRate().apply(y.flatten(), probs).copy(name='error_rate')) cg = ComputationGraph([probs, cost, error_rate]) cg = apply_batch_normalization(cg) ########### Loading images ##################### from fuel.datasets.dogs_vs_cats import DogsVsCats from fuel.streams import DataStream, ServerDataStream from fuel.schemes import ShuffledScheme from fuel.transformers.image import RandomFixedSizeCrop, MinimumImageDimensions, Random2DRotation from fuel.transformers import Flatten, Cast, ScaleAndShift def create_data(data): stream = DataStream(data, 
iteration_scheme=ShuffledScheme( data.num_examples, batch_size)) stream_downscale = MinimumImageDimensions( stream, image_size, which_sources=('image_features', )) stream_rotate = Random2DRotation(stream_downscale, which_sources=('image_features', )) stream_max = ScikitResize(stream_rotate, image_size, which_sources=('image_features', )) stream_scale = ScaleAndShift(stream_max, 1. / 255, 0, which_sources=('image_features', )) stream_cast = Cast(stream_scale, dtype='float32', which_sources=('image_features', )) #stream_flat = Flatten(stream_scale, which_sources=('image_features',)) return stream_cast stream_data_train = create_data( DogsVsCats(('train', ), subset=slice(0, 20))) stream_data_test = create_data( DogsVsCats(('train', ), subset=slice(20, 30))) # Train with simple SGD algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=Scale(learning_rate=learningRate)) #algorithm = GradientDescent(cost=cost, parameters=cg.parameters,step_rule=Adam(0.001)) #algorithm.add_updates(extra_updates) # `Timing` extension reports time for reading data, aggregating a batch and monitoring; # `ProgressBar` displays a nice progress bar during training. extensions = [] extensions.append(Timing()) extensions.append( FinishAfter(after_n_epochs=num_epochs, after_n_batches=num_batches)) extensions.append( DataStreamMonitoring([cost, error_rate], stream_data_test, prefix="valid")) extensions.append( TrainingDataMonitoring([ cost, error_rate, aggregation.mean(algorithm.total_gradient_norm) ], prefix="train", after_epoch=True)) #extensions.append(Checkpoint(save_to)) extensions.append(ProgressBar()) extensions.append(Printing()) logger.info("Building the model") model = Model(cost) main_loop = MainLoop(algorithm, stream_data_train, model=model, extensions=extensions) main_loop.run()
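# Sketch of chaining dropout and weight noise so that both end up in the trained
# cost (in the snippet above the noised graph `cg_train` is never reused, so only
# dropout and the L2 term reach the final cost). The small MLP, the 0.75 noise
# level and the 0.5 drop probability are illustrative assumptions.
import theano.tensor as tensor
from blocks.bricks import MLP, Rectifier, Softmax
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_dropout, apply_noise
from blocks.initialization import Constant, IsotropicGaussian
from blocks.roles import INPUT, WEIGHT

x = tensor.matrix('x')
y = tensor.ivector('y')
mlp = MLP([Rectifier(), None], [32, 64, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.initialize()
cost = Softmax().categorical_cross_entropy(y, mlp.apply(x)).mean()
cg = ComputationGraph(cost)
# Dropout first on the layer inputs of the original graph, then noise on the
# shared WEIGHT parameters, each call reusing the graph returned by the previous one.
dropout_inputs = VariableFilter(roles=[INPUT],
                                bricks=mlp.linear_transformations)(cg.variables)
cg = apply_dropout(cg, dropout_inputs, 0.5)
cg = apply_noise(cg, VariableFilter(roles=[WEIGHT])(cg.variables), 0.75)
train_cost = cg.outputs[0]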
def main(mode, config, use_bokeh=False): # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids'],name='word_encoder') decoder = Decoder(vocab_size=config['trg_vocab_size'], embedding_dim=config['dec_embed'], state_dim=config['dec_nhids'], representation_dim=config['enc_nhids'] * 2, match_function=config['match_function'], use_doubly_stochastic=config['use_doubly_stochastic'], lambda_ds=config['lambda_ds'], use_local_attention=config['use_local_attention'], window_size=config['window_size'], use_step_decay_cost=config['use_step_decay_cost'], use_concentration_cost=config['use_concentration_cost'], lambda_ct=config['lambda_ct'], use_stablilizer=config['use_stablilizer'], lambda_st=config['lambda_st']) # here attended dim (representation_dim) of decoder is 2*enc_nhinds # because the context given by the encoder is a bidirectional context if mode == "train": # Create Theano variables logger.info('Creating theano variables') context_sentences=[]; context_sentence_masks=[]; for i in range(config['ctx_num']): context_sentences.append(tensor.lmatrix('context_'+str(i))); context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask')); source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') dev_source = tensor.lmatrix('dev_source') dev_target=tensor.lmatrix('dev_target') # Get training and development set streams tr_stream = get_tr_stream_withContext(**config) dev_stream = get_dev_stream_with_grdTruth(**config) # Get cost of the model sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask); sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]); sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]); for i in range(config['ctx_num']): tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]); tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]); sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0); sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0); cost = decoder.cost(sentence_representations_list, sentence_masks_list, target_sentence, target_sentence_mask) logger.info('Creating computational graph') perplexity = tensor.exp(cost) perplexity.name = 'perplexity' costs_computer = function(context_sentences+context_sentence_masks+[target_sentence, target_sentence_mask, source_sentence, source_sentence_mask], (perplexity)) cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init =decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init =decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight 
noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise( cg, enc_params+dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([perplexity], after_batch=True), CheckpointNMT(config['saveto'], config['model_name'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate( sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, model_name=config['model_name'], hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if False: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'], n_best=3, track_n_models=6)) logger.info("Building perplexity validator") extensions.append( pplValidation(dev_source,dev_target, config=config, model=costs_computer, data_stream=dev_stream, model_name=config['model_name'], every_n_batches=config['sampling_freq'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) initial_learning_rate = config['initial_learning_rate'] log_path = os.path.join(config['saveto'], 'log') if config['reload'] and os.path.exists(log_path): with open(log_path, 'rb') as source: log = cPickle.load(source) last = max(log.keys()) - 1 if 'learning_rate' in log[last]: initial_learning_rate = log[last]['learning_rate'] # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, 
step_rule=CompositeRule([Scale(initial_learning_rate), StepClipping(config['step_clipping']), eval(config['step_rule'])()])) _learning_rate = algorithm.step_rule.components[0].learning_rate if config['learning_rate_decay']: extensions.append( LearningRateHalver(record_name='validation_cost', comparator=lambda x, y: x > y, learning_rate=_learning_rate, patience_default=3)) else: extensions.append(OldModelRemover(saveto=config['saveto'])) if config['learning_rate_grow']: extensions.append( LearningRateDoubler(record_name='validation_cost', comparator=lambda x, y: x < y, learning_rate=_learning_rate, patience_default=3)) extensions.append( SimplePrinting(config['model_name'], after_batch=True)) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run() elif mode == 'ppl': # Create Theano variables # Create Theano variables logger.info('Creating theano variables') context_sentences=[]; context_sentence_masks=[]; for i in range(config['ctx_num']): context_sentences.append(tensor.lmatrix('context_'+str(i))); context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask')); source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') # Get training and development set streams #tr_stream = get_tr_stream_withContext(**config) dev_stream = get_dev_stream_withContext_grdTruth(**config) # Get cost of the model sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask); sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]); sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]); for i in range(config['ctx_num']): tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]); tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]); sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0); sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0); cost = decoder.cost(sentence_representations_list, sentence_masks_list, target_sentence, target_sentence_mask) logger.info('Creating computational graph') costs_computer = function(context_sentences+context_sentence_masks+[target_sentence, target_sentence_mask, source_sentence, source_sentence_mask], (cost)) logger.info("Loading the model..") model = Model(cost) #loader = LoadNMT(config['saveto']) loader = LoadNMT(config['validation_load']); loader.set_model_parameters(model, loader.load_parameters_default()) logger.info("Started Validation: ") ts = dev_stream.get_epoch_iterator() total_cost = 0.0 total_tokens=0.0 #pbar = ProgressBar(max_value=len(ts)).start()#modified pbar = ProgressBar(max_value=10000).start(); for i, (ctx_0,ctx_0_mask,ctx_1,ctx_1_mask,ctx_2,ctx_2_mask,src, src_mask, trg, trg_mask) in enumerate(ts): costs = costs_computer(*[ctx_0,ctx_1,ctx_2,ctx_0_mask,ctx_1_mask,ctx_2_mask,trg, trg_mask,src, src_mask]) cost = costs.sum() total_cost+=cost total_tokens+=trg_mask.sum() pbar.update(i + 1) total_cost/=total_tokens; pbar.finish() #dev_stream.reset() # run afterprocess # self.ap.main() total_cost=2**total_cost; print("Average validation cost: " + str(total_cost)); elif mode == 'translate': logger.info('Creating theano variables') context_sentences=[]; context_sentence_masks=[]; for i in range(config['ctx_num']): 
context_sentences.append(tensor.lmatrix('context_'+str(i))); context_sentence_masks.append(tensor.matrix('context_'+str(i)+'_mask')); source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') sutils = SamplingBase() unk_idx = config['unk_id'] src_eos_idx = config['src_vocab_size'] - 1 trg_eos_idx = config['trg_vocab_size'] - 1 trg_vocab = _ensure_special_tokens( cPickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx) trg_ivocab = {v: k for k, v in trg_vocab.items()} config['batch_size'] = 1 sentence_representations_list=encoder.apply(source_sentence, source_sentence_mask); sentence_representations_list=sentence_representations_list.dimshuffle(['x',0,1,2]); sentence_masks_list=source_sentence_mask.T.dimshuffle(['x',0,1]); for i in range(config['ctx_num']): tmp_rep=encoder.apply(context_sentences[i],context_sentence_masks[i]); tmp_rep=tmp_rep.dimshuffle(['x',0,1,2]); sentence_representations_list=tensor.concatenate([sentence_representations_list,tmp_rep],axis=0); sentence_masks_list=tensor.concatenate([sentence_masks_list,context_sentence_masks[i].T.dimshuffle(['x',0,1])],axis=0); generated = decoder.generate(sentence_representations_list,sentence_masks_list) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs beam_search = BeamSearch(samples=samples) logger.info("Loading the model..") model = Model(generated) #loader = LoadNMT(config['saveto']) loader = LoadNMT(config['validation_load']); loader.set_model_parameters(model, loader.load_parameters_default()) logger.info("Started translation: ") test_stream = get_dev_stream_withContext(**config) ts = test_stream.get_epoch_iterator() rts = open(config['val_set_source']).readlines() ftrans_original = open(config['val_output_orig'], 'w') saved_weights = [] total_cost = 0.0 pbar = ProgressBar(max_value=len(rts)).start() for i, (line, line_raw) in enumerate(zip(ts, rts)): trans_in = line_raw[3].split() seqs=[]; input_=[]; input_mask=[]; for j in range(config['ctx_num']+1): seqs.append(sutils._oov_to_unk( line[2*j][0], config['src_vocab_size'], unk_idx)) input_mask.append(numpy.tile(line[2*j+1][0],(config['beam_size'], 1))) input_.append(numpy.tile(seqs[j], (config['beam_size'], 1))) #v=costs_computer(input_[0]); # draw sample, checking to ensure we don't get an empty string back trans, costs, attendeds, weights = \ beam_search.search( input_values={source_sentence: input_[3],source_sentence_mask:input_mask[3], context_sentences[0]: input_[0],context_sentence_masks[0]:input_mask[0], context_sentences[1]: input_[1],context_sentence_masks[1]:input_mask[1], context_sentences[2]: input_[2],context_sentence_masks[2]:input_mask[2]}, max_length=3*len(seqs[2]), eol_symbol=trg_eos_idx, ignore_first_eol=True) # normalize costs according to the sequence lengths if config['normalized_bleu']: lengths = numpy.array([len(s) for s in trans]) costs = costs / lengths b = numpy.argsort(costs)[0] #best=numpy.argsort(costs)[0:config['beam_size']]; #for b in best: try: total_cost += costs[b] trans_out = trans[b] totalLen=4*len(line[0][0]); #weight = weights[b][:, :totalLen] weight=weights trans_out = sutils._idx_to_word(trans_out, trg_ivocab) except ValueError: logger.info( "Can NOT find a translation for line: {}".format(i+1)) trans_out = '<UNK>' saved_weights.append(weight) print(' '.join(trans_out), file=ftrans_original) pbar.update(i + 1) pbar.finish() logger.info("Total cost of the test: 
{}".format(total_cost)) cPickle.dump(saved_weights, open(config['attention_weights'], 'wb')) ftrans_original.close() ap = afterprocesser(config) ap.main()
def __init__(self, ref_data, output_dim): input_dim = ref_data.shape[1] ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data') # Construct the model j = tensor.lvector('j') r = ref_data_sh[j, :] x = tensor.fmatrix('x') y = tensor.ivector('y') # input_dim must be nr mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name='e0') mlp0vs = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name='de0') mlp1 = MLP(activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name='inter_gen') mlp2 = MLP(activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name='end_mlp') encod = mlp0.apply(r) rprime = mlp0vs.apply(encod) inter_weights = mlp1.apply(encod) ibias = Bias(n_inter) ibias.biases_init = Constant(0) ibias.initialize() inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights))) final = mlp2.apply(inter) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # Initialize parameters for brick in [mlp0, mlp0vs, mlp1, mlp2]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() # apply regularization cg = ComputationGraph([cost, error_rate]) if r_dropout != 0: # - dropout on input vector r : r_dropout cg = apply_dropout(cg, [r], r_dropout) if s_dropout != 0: # - dropout on intermediate layers of first mlp : s_dropout s_dropout_vars = list( set( VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([inter_weights]))) - set([inter_weights])) cg = apply_dropout(cg, s_dropout_vars, s_dropout) if i_dropout != 0: # - dropout on input to second mlp : i_dropout cg = apply_dropout(cg, [inter], i_dropout) if a_dropout != 0: # - dropout on hidden layers of second mlp : a_dropout a_dropout_vars = list( set( VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([final]))) - set([inter_weights]) - set(s_dropout_vars)) cg = apply_dropout(cg, a_dropout_vars, a_dropout) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) [cost_reg, error_rate_reg] = cg.outputs # add reconstruction penalty for AE part penalty_val = tensor.sqrt(((r - rprime)**2).sum(axis=1)).mean() cost_reg = cost_reg + reconstruction_penalty * penalty_val self.cost = cost self.cost_reg = cost_reg self.error_rate = error_rate self.error_rate_reg = error_rate_reg self.pred = pred self.confidence = confidence
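# Minimal sketch of the reconstruction-penalty pattern above: the regularized
# classification cost gets the mean Euclidean distance between the reference
# input r and its autoencoder reconstruction rprime added to it. The 0.5
# coefficient stands in for reconstruction_penalty and is an assumption.
import theano.tensor as tensor

r = tensor.fmatrix('r')
rprime = tensor.fmatrix('rprime')
task_cost = tensor.scalar('task_cost')
penalty_val = tensor.sqrt(((r - rprime) ** 2).sum(axis=1)).mean()
cost_reg = task_cost + 0.5 * penalty_val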
def train(config, save_path, bokeh_name, params, bokeh_server, test_tag, use_load_ext, load_log, fast_start, validation_epochs, validation_batches, per_epochs, per_batches): root_path, extension = os.path.splitext(save_path) data = Data(**config['data']) # Build the main brick and initialize all parameters. recognizer = SpeechRecognizer( data.recordings_source, data.labels_source, data.eos_label, data.num_features, data.num_labels, name="recognizer", data_prepend_eos=data.prepend_eos, character_map=data.character_map, **config["net"]) for brick_path, attribute_dict in sorted( config['initialization'].items(), key=lambda (k, v): -k.count('/')): for attribute, value in attribute_dict.items(): brick, = Selector(recognizer).select(brick_path).bricks setattr(brick, attribute, value) brick.push_initialization_config() recognizer.initialize() # Separate attention_params to be handled differently # when regularization is applied attention = recognizer.generator.transition.attention attention_params = Selector(attention).get_parameters().values() logger.info( "Initialization schemes for all bricks.\n" "Works well only in my branch with __repr__ added to all them,\n" "there is an issue #463 in Blocks to do that properly.") def show_init_scheme(cur): result = dict() for attr in dir(cur): if attr.endswith('_init'): result[attr] = getattr(cur, attr) for child in cur.children: result[child.name] = show_init_scheme(child) return result logger.info(pprint.pformat(show_init_scheme(recognizer))) if params: logger.info("Load parameters from " + params) recognizer.load_params(params) if test_tag: tensor.TensorVariable.__str__ = tensor.TensorVariable.__repr__ __stream = data.get_stream("train") __data = next(__stream.get_epoch_iterator(as_dict=True)) recognizer.recordings.tag.test_value = __data[data.recordings_source] recognizer.recordings_mask.tag.test_value = __data[data.recordings_source + '_mask'] recognizer.labels.tag.test_value = __data[data.labels_source] recognizer.labels_mask.tag.test_value = __data[data.labels_source + '_mask'] theano.config.compute_test_value = 'warn' batch_cost = recognizer.get_cost_graph().sum() batch_size = named_copy(recognizer.recordings.shape[1], "batch_size") # Assumes constant batch size. `aggregation.mean` is not used because # of Blocks #514. cost = batch_cost / batch_size cost.name = "sequence_log_likelihood" logger.info("Cost graph is built") # Fetch variables useful for debugging. # It is important not to use any aggregation schemes here, # as it's currently impossible to spread the effect of # regularization on their variables, see Blocks #514. 
cost_cg = ComputationGraph(cost) r = recognizer energies, = VariableFilter( applications=[r.generator.readout.readout], name="output_0")( cost_cg) bottom_output, = VariableFilter( applications=[r.bottom.apply], name="output")( cost_cg) attended, = VariableFilter( applications=[r.generator.transition.apply], name="attended")( cost_cg) attended_mask, = VariableFilter( applications=[r.generator.transition.apply], name="attended_mask")( cost_cg) weights, = VariableFilter( applications=[r.generator.evaluate], name="weights")( cost_cg) max_recording_length = named_copy(r.recordings.shape[0], "max_recording_length") # To exclude subsampling related bugs max_attended_mask_length = named_copy(attended_mask.shape[0], "max_attended_mask_length") max_attended_length = named_copy(attended.shape[0], "max_attended_length") max_num_phonemes = named_copy(r.labels.shape[0], "max_num_phonemes") min_energy = named_copy(energies.min(), "min_energy") max_energy = named_copy(energies.max(), "max_energy") mean_attended = named_copy(abs(attended).mean(), "mean_attended") mean_bottom_output = named_copy(abs(bottom_output).mean(), "mean_bottom_output") weights_penalty = named_copy(monotonicity_penalty(weights, r.labels_mask), "weights_penalty") weights_entropy = named_copy(entropy(weights, r.labels_mask), "weights_entropy") mask_density = named_copy(r.labels_mask.mean(), "mask_density") cg = ComputationGraph([ cost, weights_penalty, weights_entropy, min_energy, max_energy, mean_attended, mean_bottom_output, batch_size, max_num_phonemes, mask_density]) # Regularization. It is applied explicitly to all variables # of interest, it could not be applied to the cost only as it # would not have effect on auxiliary variables, see Blocks #514. reg_config = config['regularization'] regularized_cg = cg if reg_config.get('dropout'): logger.info('apply dropout') regularized_cg = apply_dropout(cg, [bottom_output], 0.5) if reg_config.get('noise'): logger.info('apply noise') noise_subjects = [p for p in cg.parameters if p not in attention_params] regularized_cg = apply_noise(cg, noise_subjects, reg_config['noise']) regularized_cost = regularized_cg.outputs[0] regularized_weights_penalty = regularized_cg.outputs[1] # Model is weird class, we spend lots of time arguing with Bart # what it should be. However it can already nice things, e.g. # one extract all the parameters from the computation graphs # and give them hierahical names. This help to notice when a # because of some bug a parameter is not in the computation # graph. model = SpeechModel(regularized_cost) params = model.get_parameter_dict() logger.info("Parameters:\n" + pprint.pformat( [(key, params[key].get_value().shape) for key in sorted(params.keys())], width=120)) # Define the training algorithm. 
train_conf = config['training'] clipping = StepClipping(train_conf['gradient_threshold']) clipping.threshold.name = "gradient_norm_threshold" rule_names = train_conf.get('rules', ['momentum']) core_rules = [] if 'momentum' in rule_names: logger.info("Using scaling and momentum for training") core_rules.append(Momentum(train_conf['scale'], train_conf['momentum'])) if 'adadelta' in rule_names: logger.info("Using AdaDelta for training") core_rules.append(AdaDelta(train_conf['decay_rate'], train_conf['epsilon'])) max_norm_rules = [] if reg_config.get('max_norm', False): logger.info("Apply MaxNorm") maxnorm_subjects = VariableFilter(roles=[WEIGHT])(cg.parameters) if reg_config.get('max_norm_exclude_lookup', False): maxnorm_subjects = [v for v in maxnorm_subjects if not isinstance(get_brick(v), LookupTable)] logger.info("Parameters covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if p in maxnorm_subjects])) logger.info("Parameters NOT covered by MaxNorm:\n" + pprint.pformat([name for name, p in params.items() if not p in maxnorm_subjects])) max_norm_rules = [ Restrict(VariableClipping(reg_config['max_norm'], axis=0), maxnorm_subjects)] algorithm = GradientDescent( cost=regularized_cost + reg_config.get("penalty_coof", .0) * regularized_weights_penalty / batch_size + reg_config.get("decay", .0) * l2_norm(VariableFilter(roles=[WEIGHT])(cg.parameters)) ** 2, parameters=params.values(), step_rule=CompositeRule( [clipping] + core_rules + max_norm_rules + # Parameters are not changed at all # when nans are encountered. [RemoveNotFinite(0.0)])) # More variables for debugging: some of them can be added only # after the `algorithm` object is created. observables = regularized_cg.outputs observables += [ algorithm.total_step_norm, algorithm.total_gradient_norm, clipping.threshold] for name, param in params.items(): num_elements = numpy.product(param.get_value().shape) norm = param.norm(2) / num_elements ** 0.5 grad_norm = algorithm.gradients[param].norm(2) / num_elements ** 0.5 step_norm = algorithm.steps[param].norm(2) / num_elements ** 0.5 stats = tensor.stack(norm, grad_norm, step_norm, step_norm / grad_norm) stats.name = name + '_stats' observables.append(stats) def attach_aggregation_schemes(variables): # Aggregation specification has to be factored out as a separate # function as it has to be applied at the very last stage # separately to training and validation observables. result = [] for var in variables: if var.name == 'weights_penalty': result.append(named_copy(aggregation.mean(var, batch_size), 'weights_penalty_per_recording')) elif var.name == 'weights_entropy': result.append(named_copy(aggregation.mean( var, recognizer.labels_mask.sum()), 'weights_entropy_per_label')) else: result.append(var) return result # Build main loop. 
logger.info("Initialize extensions") extensions = [] if use_load_ext and params: extensions.append(Load(params, load_iteration_state=True, load_log=True)) if load_log and params: extensions.append(LoadLog(params)) extensions += [ Timing(after_batch=True), CGStatistics(), #CodeVersion(['lvsr']), ] extensions.append(TrainingDataMonitoring( [observables[0], algorithm.total_gradient_norm, algorithm.total_step_norm, clipping.threshold, max_recording_length, max_attended_length, max_attended_mask_length], after_batch=True)) average_monitoring = TrainingDataMonitoring( attach_aggregation_schemes(observables), prefix="average", every_n_batches=10) extensions.append(average_monitoring) validation = DataStreamMonitoring( attach_aggregation_schemes([cost, weights_entropy, weights_penalty]), data.get_stream("valid"), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=validation_epochs, every_n_batches=validation_batches, after_training=False) extensions.append(validation) recognizer.init_beam_search(10) per = PhonemeErrorRate(recognizer, data.get_dataset("valid")) per_monitoring = DataStreamMonitoring( [per], data.get_stream("valid", batches=False, shuffle=False), prefix="valid").set_conditions( before_first_epoch=not fast_start, every_n_epochs=per_epochs, every_n_batches=per_batches, after_training=False) extensions.append(per_monitoring) track_the_best_per = TrackTheBest( per_monitoring.record_name(per)).set_conditions( before_first_epoch=True, after_epoch=True) track_the_best_likelihood = TrackTheBest( validation.record_name(cost)).set_conditions( before_first_epoch=True, after_epoch=True) extensions += [track_the_best_likelihood, track_the_best_per] extensions.append(AdaptiveClipping( algorithm.total_gradient_norm.name, clipping, train_conf['gradient_threshold'], decay_rate=0.998, burnin_period=500)) extensions += [ SwitchOffLengthFilter(data.length_filter, after_n_batches=train_conf.get('stop_filtering')), FinishAfter(after_n_batches=train_conf['num_batches'], after_n_epochs=train_conf['num_epochs']) .add_condition(["after_batch"], _gradient_norm_is_none), # Live plotting: requires launching `bokeh-server` # and allows to see what happens online. 
Plot(bokeh_name if bokeh_name else os.path.basename(save_path), [# Plot 1: training and validation costs [average_monitoring.record_name(regularized_cost), validation.record_name(cost)], # Plot 2: gradient norm, [average_monitoring.record_name(algorithm.total_gradient_norm), average_monitoring.record_name(clipping.threshold)], # Plot 3: phoneme error rate [per_monitoring.record_name(per)], # Plot 4: training and validation mean weight entropy [average_monitoring._record_name('weights_entropy_per_label'), validation._record_name('weights_entropy_per_label')], # Plot 5: training and validation monotonicity penalty [average_monitoring._record_name('weights_penalty_per_recording'), validation._record_name('weights_penalty_per_recording')]], every_n_batches=10, server_url=bokeh_server), Checkpoint(save_path, before_first_epoch=not fast_start, after_epoch=True, every_n_batches=train_conf.get('save_every_n_batches'), save_separately=["model", "log"], use_cpickle=True) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_per.notification_name), (root_path + "_best" + extension,)) .add_condition( ['after_epoch'], OnLogRecord(track_the_best_likelihood.notification_name), (root_path + "_best_ll" + extension,)), ProgressBar(), Printing(every_n_batches=1, attribute_filter=PrintingFilterList() )] # Save the config into the status log = TrainingLog() log.status['_config'] = repr(config) main_loop = MainLoop( model=model, log=log, algorithm=algorithm, data_stream=data.get_stream("train"), extensions=extensions) main_loop.run()
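# Sketch (hypothetical helper, mirroring the selective noise above) of applying
# Gaussian weight noise to every parameter of a graph except an excluded subset,
# such as the attention parameters that the script above keeps noise-free.
from blocks.graph import apply_noise

def apply_noise_except(cg, excluded_parameters, level):
    subjects = [p for p in cg.parameters if p not in excluded_parameters]
    return apply_noise(cg, subjects, level)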
def main(config, tr_stream, dev_stream): # Create Theano variables logger.info('Creating theano variables') source_char_seq = tensor.lmatrix('source_char_seq') source_sample_matrix = tensor.btensor3('source_sample_matrix') source_char_aux = tensor.bmatrix('source_char_aux') source_word_mask = tensor.bmatrix('source_word_mask') target_char_seq = tensor.lmatrix('target_char_seq') target_char_aux = tensor.bmatrix('target_char_aux') target_char_mask = tensor.bmatrix('target_char_mask') target_sample_matrix = tensor.btensor3('target_sample_matrix') target_word_mask = tensor.bmatrix('target_word_mask') target_resample_matrix = tensor.btensor3('target_resample_matrix') target_prev_char_seq = tensor.lmatrix('target_prev_char_seq') target_prev_char_aux = tensor.bmatrix('target_prev_char_aux') target_bos_idx = tr_stream.trg_bos target_space_idx = tr_stream.space_idx['target'] # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['char_enc_nhids'], config['enc_nhids'], config['encoder_layers']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['char_dec_nhids'], config['dec_nhids'], config['enc_nhids'] * 2, config['transition_layers'], target_space_idx, target_bos_idx) representation = encoder.apply(source_char_seq, source_sample_matrix, source_char_aux, source_word_mask) cost = decoder.cost(representation, source_word_mask, target_char_seq, target_sample_matrix, target_resample_matrix, target_char_aux, target_char_mask, target_word_mask, target_prev_char_seq, target_prev_char_aux) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() for layer_n in range(config['encoder_layers']): encoder.decimator.dgru.transitions[layer_n].weights_init = Orthogonal() encoder.children[ 1 + layer_n].prototype.recurrent.weights_init = Orthogonal() decoder.interpolator.igru.weights_init = Orthogonal() decoder.interpolator.feedback_brick.dgru.transitions[ 0].weights_init = Orthogonal() for layer_n in range(config['transition_layers']): decoder.transition.transitions[layer_n].weights_init = Orthogonal() encoder.initialize() decoder.initialize() # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise(cg, enc_params + dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(str(shape), count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge( Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): 
logger.info(' {:15}: {}'.format(str(value.get_value().shape), name)) logger.info("Total number of parameters: {}".format( len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([ StepClipping(config['step_clipping']), eval(config['step_rule'])() ])) # Set extensions logger.info("Initializing extensions") # Extensions gradient_norm = aggregation.mean(algorithm.total_gradient_norm) step_norm = aggregation.mean(algorithm.total_step_norm) train_monitor = CostCurve([cost, gradient_norm, step_norm], config=config, after_batch=True, before_first_epoch=True, prefix='tra') extensions = [ train_monitor, Timing(), Printing(after_batch=True), FinishAfter(after_n_batches=config['finish_after']), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") generated = decoder.generate(representation, source_word_mask) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[config['transition_layers']]) ) # generated[transition_layers] is next_outputs # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], transition_layers=config['transition_layers'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(source_char_seq, source_sample_matrix, source_char_aux, source_word_mask, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop(model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions) # Train! main_loop.run()
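# Sketch of the initialization pattern above on a small MLP instead of the
# encoder-decoder (sizes are illustrative): set a generic scheme, push it down
# to the children, override one child with Orthogonal, then initialize.
from blocks.bricks import MLP, Tanh
from blocks.initialization import Constant, IsotropicGaussian, Orthogonal

mlp = MLP([Tanh(), None], [100, 100, 10],
          weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
mlp.push_initialization_config()
# The first Linear child is 100x100, so an orthogonal matrix fits exactly.
mlp.linear_transformations[0].weights_init = Orthogonal()
mlp.initialize()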
def __init__(self, ref_data, output_dim): ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data') # Construct the model j = tensor.lvector('j') x = tensor.fmatrix('x') y = tensor.ivector('y') last_outputs = [] s_dropout_vars = [] r_dropout_vars = [] i_dropout_vars = [] penalties = [] for i in range(nparts): fs = numpy.random.binomial(1, part_r_proba, size=(ref_data.shape[1],)) input_dim = int(fs.sum()) fs_sh = theano.shared(fs) r = ref_data_sh[j, :][:, fs_sh.nonzero()[0]] mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name='enc%d'%i) mlp0r = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name='dec%d'%i) mlp1 = MLP(activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name='inter_gen_%d'%i) mlp2 = MLP(activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name='end_mlp_%d'%i) encod = mlp0.apply(r) rprime = mlp0r.apply(encod) inter_weights = mlp1.apply(encod) ibias = Bias(n_inter, name='inter_bias_%d'%i) inter = ibias.apply(tensor.dot(x, inter_weights)) inter = inter_act_fun.apply(inter) out = mlp2.apply(inter) penalties.append(tensor.sqrt(((rprime - r)**2).sum(axis=1)).mean()[None]) last_outputs.append(out) r_dropout_vars.append(r) s_dropout_vars = s_dropout_vars + ( VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([inter_weights])) ) i_dropout_vars.append(inter) # Initialize parameters for brick in [mlp0, mlp0r, mlp1, mlp2, ibias]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() final = tensor.concatenate([x[:, :, None] for x in last_outputs], axis=2).mean(axis=2) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # apply regularization cg = ComputationGraph([cost, error_rate]) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) if s_dropout != 0: cg = apply_dropout(cg, s_dropout_vars, s_dropout) if r_dropout != 0: cg = apply_dropout(cg, r_dropout_vars, r_dropout) if i_dropout != 0: cg = apply_dropout(cg, i_dropout_vars, i_dropout) [cost_reg, error_rate_reg] = cg.outputs cost_reg = cost_reg + reconstruction_penalty * tensor.concatenate(penalties, axis=0).sum() self.cost = cost self.cost_reg = cost_reg self.error_rate = error_rate self.error_rate_reg = error_rate_reg self.pred = pred self.confidence = confidence
output = fc.apply(data2.reshape((data2.shape[0], 25*50))) # COST AND ERROR MEASURE cost = Softmax().categorical_cross_entropy(label, output).mean() cost.name = 'cost' error_rate = tensor.neq(tensor.argmax(output, axis=1), label).mean() error_rate.name = 'error_rate' # REGULARIZATION cg = ComputationGraph([cost, error_rate]) if weight_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, weight_noise) if dropout > 0: cg = apply_dropout(cg, [eeg1, eeg2, data1, data2] + VariableFilter(name='output', bricks=fc.linear_transformations[:-1])(cg), dropout) # for vfilter, p in dropout_locs: # cg = apply_dropout(cg, vfilter(cg), p) [cost_reg, error_rate_reg] = cg.outputs # INITIALIZATION for brick in [conv_eeg, maxpool_eeg, conv_eeg2, maxpool_eeg2, conv, maxpool, conv2, maxpool2, fc]: brick.weights_init = weights_init brick.biases_init = biases_init brick.initialize() # ==========================================================================================
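# Short sketch (hypothetical helper) of the dropout selection above: take the
# 'output' variables of every Linear layer of an MLP except the last one and
# drop them out; the raw input variables from the snippet are omitted here.
from blocks.filter import VariableFilter
from blocks.graph import apply_dropout

def dropout_hidden_outputs(cg, mlp_brick, drop_prob):
    hidden_outputs = VariableFilter(
        name='output',
        bricks=mlp_brick.linear_transformations[:-1])(cg.variables)
    return apply_dropout(cg, hidden_outputs, drop_prob)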
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='embed') embed.weights_init = IsotropicGaussian(0.01) # embed.weights_init = Constant(init_embedding_table(filename='embeddings/vocab_embeddings.txt')) # one directional LSTM encoding q_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='q_lstm_in') q_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='q_lstm') c_lstm_ins = Linear(input_dim=config.embed_size, output_dim=4 * config.pre_lstm_size, name='c_lstm_in') c_lstm = LSTM(dim=config.pre_lstm_size, activation=Tanh(), name='c_lstm') bricks += [q_lstm, c_lstm, q_lstm_ins, c_lstm_ins] q_tmp = q_lstm_ins.apply(embed.apply(question)) c_tmp = c_lstm_ins.apply(embed.apply(context)) q_hidden, _ = q_lstm.apply(q_tmp, mask=question_mask.astype( theano.config.floatX)) # lq, bs, dim c_hidden, _ = c_lstm.apply(c_tmp, mask=context_mask.astype( theano.config.floatX)) # lc, bs, dim # Attention mechanism Bilinear question attention_question = Linear(input_dim=config.pre_lstm_size, output_dim=config.pre_lstm_size, name='att_question') bricks += [attention_question] att_weights_question = q_hidden[ None, :, :, :] * attention_question.apply( c_hidden.reshape( (c_hidden.shape[0] * c_hidden.shape[1], c_hidden.shape[2]))).reshape( (c_hidden.shape[0], c_hidden.shape[1], c_hidden.shape[2]))[:, None, :, :] # --> lc,lq,bs,dim att_weights_question = att_weights_question.sum( axis=3) # sum over axis 3 -> dimensions --> lc,lq,bs att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,bs,lq att_weights_question = att_weights_question.reshape( (att_weights_question.shape[0] * att_weights_question.shape[1], att_weights_question.shape[2])) # --> lc*bs,lq att_weights_question = tensor.nnet.softmax( att_weights_question ) # softmax over axis 1 -> length of question # --> lc*bs,lq att_weights_question = att_weights_question.reshape( (c_hidden.shape[0], q_hidden.shape[1], q_hidden.shape[0])) # --> lc,bs,lq att_weights_question = att_weights_question.dimshuffle( 0, 2, 1) # --> lc,lq,bs question_context_attention = att_weights_question.dimshuffle(2, 1, 0) question_context_attention.name = "question_context_attention" self.analyse_vars = [question_context_attention] attended_question = tensor.sum( q_hidden[None, :, :, :] * att_weights_question[:, :, :, None], axis=1) # sum over axis 1 -> length of question --> lc,bs,dim attended_question.name = 'attended_question' # Match LSTM cqembed = tensor.concatenate([c_hidden, attended_question], axis=2) mlstms, mhidden_list = make_bidir_lstm_stack( cqembed, 2 * config.pre_lstm_size, context_mask.astype(theano.config.floatX), config.match_lstm_size, config.match_skip_connections, 'match') bricks = bricks + mlstms if config.match_skip_connections: 
menc_dim = 2 * sum(config.match_lstm_size) menc = tensor.concatenate(mhidden_list, axis=2) else: menc_dim = 2 * config.match_lstm_size[-1] menc = tensor.concatenate(mhidden_list[-2:], axis=2) menc.name = 'menc' #pointer networks decoder LSTM and Attention parameters params = init_params(data_dim=config.decoder_data_dim, lstm_dim=config.decoder_lstm_output_dim) tparams = init_tparams(params) self.theano_params = [] add_role(tparams['lstm_de_W'], WEIGHT) add_role(tparams['lstm_de_U'], WEIGHT) add_role(tparams['lstm_de_b'], BIAS) add_role(tparams['ptr_b1'], BIAS) add_role(tparams['ptr_b2'], BIAS) add_role(tparams['ptr_v'], WEIGHT) add_role(tparams['ptr_W1'], WEIGHT) add_role(tparams['ptr_W2'], WEIGHT) self.theano_params = tparams.values() #n_steps = length , n_samples = batch_size n_steps = ans_indices.shape[0] n_samples = ans_indices.shape[1] preds, generations = ptr_network( tparams, cqembed, context_mask.astype(theano.config.floatX), ans_indices, ans_indices_mask.astype(theano.config.floatX), config.decoder_lstm_output_dim, menc) self.generations = generations idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), tensor.ones((n_samples, ), dtype='int64')) idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'), tensor.arange(n_samples, dtype='int64')) probs = preds[idx_steps, ans_indices, idx_samples] # probs *= y_mask off = 1e-8 if probs.dtype == 'float16': off = 1e-6 # probs += (1 - y_mask) # change unmasked position to 1, since log(1) = 0 probs += off # probs_printed = theano.printing.Print('this is probs')(probs) cost = -tensor.log(probs) cost *= ans_indices_mask cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0) cost = cost.mean() # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, mhidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' # self.predictions.name = 'pred' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # self.analyse_vars= [cost, self.predictions, att_weights_start, att_weights_end, att_weights, att_target] # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
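# Sketch of the role-tagging pattern above: a raw theano shared variable only
# becomes visible to VariableFilter(roles=[WEIGHT]), and hence to the noise
# regularization, after add_role tags it. Shapes and the 0.05 level are
# illustrative assumptions.
import numpy
import theano
import theano.tensor as tensor
from blocks.filter import VariableFilter
from blocks.graph import ComputationGraph, apply_noise
from blocks.roles import WEIGHT, add_role

W = theano.shared(numpy.zeros((8, 8), dtype=theano.config.floatX), name='ptr_W1')
add_role(W, WEIGHT)
x = tensor.matrix('x')
cost = (tensor.dot(x, W) ** 2).mean()
cg = ComputationGraph([cost])
cg = apply_noise(cg, VariableFilter(roles=[WEIGHT])(cg.variables), 0.05)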
def train_model(cost, unregularized_cost, updates, train_stream, valid_stream, args, gate_values=None): step_rule = learning_algorithm(args) cg = ComputationGraph(cost) # ADD REGULARIZATION # WEIGHT NOISE weight_noise = args.weight_noise if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg.variables) cg_train = apply_noise(cg, weights, weight_noise) cost = cg_train.outputs[0] cost.name = "cost_with_weight_noise" cg = ComputationGraph(cost) logger.info(cg.parameters) # Define algorithm algorithm = GradientDescent(cost=cost, step_rule=step_rule, parameters=cg.parameters) # Add the updates to carry the hidden state algorithm.add_updates(updates) # Extensions to be added extensions = [] # Load from a dumped model if args.load_path is not None: if args.fine_tuning: cost = fine_tuning(cost, args) else: extensions.append(Load(args.load_path)) # Generation extension if args.generate: extensions.append( TextGenerationExtension( cost=cost, generation_length=args.generated_text_lenght, initial_text_length=args.initial_text_length, every_n_batches=1, ploting_path=os.path.join(args.save_path, 'prob_plot.png'), softmax_sampling=args.softmax_sampling, dataset=args.dataset, updates=updates, interactive_mode=args.interactive_mode)) # Training and Validation score monitoring extensions.extend([ TrainingDataMonitoring([cost], prefix='train', every_n_batches=args.monitoring_freq), DataStreamMonitoring([cost, unregularized_cost], valid_stream, args.mini_batch_size_valid, args.dataset, state_updates=updates, prefix='valid', before_first_epoch=(args.visualize is None), every_n_batches=args.monitoring_freq) ]) # Creating directory for saving model. if not args.interactive_mode: if not os.path.exists(args.save_path): os.makedirs(args.save_path) elif 'test' in args.save_path: print("Rewriting in " + args.save_path) else: raise Exception('Directory already exists') # Early stopping extensions.append( EarlyStopping('valid_' + unregularized_cost.name, args.patience, args.save_path, every_n_batches=args.monitoring_freq)) # Printing extensions.append(ProgressBar()) extensions.append(Printing(every_n_batches=args.monitoring_freq)) # Reset the initial states if args.dataset == "sine": reset_frequency = 1 else: reset_frequency = 100 extensions.append( ResetStates([v for v, _ in updates], every_n_batches=reset_frequency)) # Visualizing extensions if args.interactive_mode: extensions.append(InteractiveMode()) main_loop = MainLoop(model=Model(cost), data_stream=train_stream, algorithm=algorithm, extensions=extensions) # This is where the magic happens! main_loop.run()
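# ----------------------------------------------------------------------------
# Note: the weight-noise branch of train_model follows a standard Blocks
# pattern: filter the WEIGHT-role variables out of the graph, rebuild the
# graph with apply_noise, and train on the noisy cost. A minimal
# self-contained sketch of that pattern (toy Linear brick and dimensions,
# not the actual model above):
import numpy
import theano
from theano import tensor
from blocks.bricks import Linear
from blocks.initialization import IsotropicGaussian, Constant
from blocks.graph import ComputationGraph, apply_noise
from blocks.filter import VariableFilter
from blocks.roles import WEIGHT

x = tensor.matrix('x')
linear = Linear(input_dim=4, output_dim=3,
                weights_init=IsotropicGaussian(0.01), biases_init=Constant(0))
linear.initialize()
cost = linear.apply(x).sum()

cg = ComputationGraph(cost)
weights = VariableFilter(roles=[WEIGHT])(cg.variables)
cg_noisy = apply_noise(cg, weights, 0.05)   # add zero-mean Gaussian noise, std 0.05
noisy_cost = cg_noisy.outputs[0]

f = theano.function([x], [cost, noisy_cost])
print(f(numpy.ones((2, 4), dtype=theano.config.floatX)))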
    valid_stream = stream.valid(req_vars)

    cost = model.cost(**inputs)
    cg = ComputationGraph(cost)
    monitored = set([cost] + VariableFilter(roles=[roles.COST])(cg.variables))
    valid_monitored = monitored
    if hasattr(model, 'valid_cost'):
        valid_cost = model.valid_cost(**inputs)
        valid_cg = ComputationGraph(valid_cost)
        valid_monitored = set([valid_cost] +
                              VariableFilter(roles=[roles.COST])(valid_cg.variables))

    if hasattr(config, 'dropout') and config.dropout < 1.0:
        cg = apply_dropout(cg, config.dropout_inputs(cg), config.dropout)
    if hasattr(config, 'noise') and config.noise > 0.0:
        cg = apply_noise(cg, config.noise_inputs(cg), config.noise)
    cost = cg.outputs[0]
    cg = Model(cost)

    logger.info('# Parameter shapes:')
    parameters_size = 0
    for value in cg.parameters:
        logger.info(' %20s %s' % (value.get_value().shape, value.name))
        parameters_size += reduce(operator.mul, value.get_value().shape, 1)
    logger.info('Total number of parameters: %d in %d matrices' %
                (parameters_size, len(cg.parameters)))

    if hasattr(config, 'step_rule'):
        step_rule = config.step_rule
    else:
        step_rule = AdaDelta()
def __init__(self, ref_data, output_dim): input_dim = ref_data.shape[1] ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data') rng = RandomStreams() ae_bricks = [] ae_input = ref_data_sh ae_costs = [] for i, (idim, odim) in enumerate(zip([input_dim] + ae_dims[:-1], ae_dims)): ae_mlp = MLP(activations=[ae_activations[i]], dims=[idim, odim], name='enc%i'%i) enc = ae_mlp.apply(ae_input) enc_n = ae_mlp.apply(ae_input + rng.normal(size=ae_input.shape, std=ae_f_noise_std)) ae_mlp_dec = MLP(activations=[ae_activations[i]], dims=[odim, idim], name='dec%i'%i) dec = ae_mlp_dec.apply(enc_n) cost = tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() + \ ae_l1_pen * abs(enc).sum(axis=1).mean() ae_costs.append(cost) ae_input = enc ae_bricks = ae_bricks + [ae_mlp, ae_mlp_dec] self.ae_costs = ae_costs ref_data_enc = ae_input # Construct the model j = tensor.lvector('j') r = ref_data_enc[j, :] x = tensor.fmatrix('x') y = tensor.ivector('y') # input_dim must be nr mlp = MLP(activations=activation_functions, dims=[ae_dims[-1]] + hidden_dims + [n_inter], name='inter_gen') mlp2 = MLP(activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name='end_mlp') inter_weights = mlp.apply(r) if inter_bias == None: ibias = Bias(n_inter) ibias.biases_init = Constant(0) ibias.initialize() inter = ibias.apply(tensor.dot(x, inter_weights)) else: inter = tensor.dot(x, inter_weights) - inter_bias inter = inter_act_fun.apply(inter) final = mlp2.apply(inter) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) # error_rate = tensor.neq(y, pred).mean() ber = balanced_error_rate.ber(y, pred) # Initialize parameters for brick in ae_bricks + [mlp, mlp2]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() # apply regularization cg = ComputationGraph([cost, ber]) if r_dropout != 0: # - dropout on input vector r : r_dropout cg = apply_dropout(cg, [r], r_dropout) if x_dropout != 0: cg = apply_dropout(cg, [x], x_dropout) if s_dropout != 0: # - dropout on intermediate layers of first mlp : s_dropout s_dropout_vars = list(set(VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([inter_weights]))) - set([inter_weights])) cg = apply_dropout(cg, s_dropout_vars, s_dropout) if i_dropout != 0: # - dropout on input to second mlp : i_dropout cg = apply_dropout(cg, [inter], i_dropout) if a_dropout != 0: # - dropout on hidden layers of second mlp : a_dropout a_dropout_vars = list(set(VariableFilter(bricks=[Tanh], name='output') (ComputationGraph([final]))) - set([inter_weights]) - set(s_dropout_vars)) cg = apply_dropout(cg, a_dropout_vars, a_dropout) if r_noise_std != 0: cg = apply_noise(cg, [r], r_noise_std) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) [cost_reg, ber_reg] = cg.outputs if s_l1pen != 0: s_weights = VariableFilter(bricks=mlp.linear_transformations, roles=[WEIGHT])(cg) cost_reg = cost_reg + s_l1pen * sum(abs(w).sum() for w in s_weights) if i_l1pen != 0: cost_reg = cost_reg + i_l1pen * abs(inter).sum() if a_l1pen != 0: a_weights = VariableFilter(bricks=mlp2.linear_transformations, roles=[WEIGHT])(cg) cost_reg = cost_reg + a_l1pen * sum(abs(w).sum() for w in a_weights) self.cost = cost self.cost_reg = cost_reg self.ber = ber self.ber_reg = ber_reg self.pred = pred self.confidence = confidence
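# ----------------------------------------------------------------------------
# Note: each autoencoder layer above minimizes a reconstruction term on a
# noise-corrupted code plus an L1 sparsity penalty on the clean code. The
# sketch below isolates that per-layer cost with toy dimensions and
# hypothetical names; it is not the original constructor.
import theano
from theano import tensor
from theano.sandbox.rng_mrg import MRG_RandomStreams
from blocks.bricks import MLP, Tanh
from blocks.initialization import IsotropicGaussian, Constant

rng = MRG_RandomStreams(1)
ae_input = tensor.matrix('ae_input')

enc_mlp = MLP(activations=[Tanh()], dims=[16, 8], name='enc0',
              weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.001))
dec_mlp = MLP(activations=[Tanh()], dims=[8, 16], name='dec0',
              weights_init=IsotropicGaussian(0.01), biases_init=Constant(0.001))
enc_mlp.initialize()
dec_mlp.initialize()

enc = enc_mlp.apply(ae_input)                 # clean code, fed to the next layer
enc_noisy = enc_mlp.apply(ae_input + rng.normal(size=ae_input.shape, std=0.1))
dec = dec_mlp.apply(enc_noisy)                # reconstruct from the noisy code

l1_pen = 0.01
ae_cost = (tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() +
           l1_pen * abs(enc).sum(axis=1).mean())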
def train(algorithm, learning_rate, clipping, momentum, layer_size, epochs, test_cost, experiment_path, initialization, init_width, weight_noise, z_prob, z_prob_states, z_prob_cells, drop_prob_igates, ogates_zoneout, batch_size, stoch_depth, share_mask, gaussian_drop, rnn_type, num_layers, norm_cost_coeff, penalty, testing, seq_len, decrease_lr_after_epoch, lr_decay, **kwargs): print '.. PTB experiment' print '.. arguments:', ' '.join(sys.argv) t0 = time.time() ########################################### # # LOAD DATA # ########################################### def onehot(x, numclasses=None): """ Convert integer encoding for class-labels (starting with 0 !) to one-hot encoding. The output is an array whose shape is the shape of the input array plus an extra dimension, containing the 'one-hot'-encoded labels. """ if x.shape == (): x = x[None] if numclasses is None: numclasses = x.max() + 1 result = numpy.zeros(list(x.shape) + [numclasses], dtype="int") z = numpy.zeros(x.shape, dtype="int") for c in range(numclasses): z *= 0 z[numpy.where(x == c)] = 1 result[..., c] += z return result.astype(theano.config.floatX) alphabetsize = 10000 data = np.load('penntree_char_and_word.npz') trainset = data['train_words'] validset = data['valid_words'] testset = data['test_words'] if testing: trainset = trainset[:3000] validset = validset[:3000] if share_mask: if not z_prob: raise ValueError('z_prob must be provided when using share_mask') if z_prob_cells or z_prob_states: raise ValueError('z_prob_states and z_prob_cells must not be provided when using share_mask (use z_prob instead)') z_prob_cells = z_prob # we don't want to actually use these masks, so this is to debug z_prob_states = None else: if z_prob: raise ValueError('z_prob is only used with share_mask') z_prob_cells = z_prob_cells or '1' z_prob_states = z_prob_states or '1' # rng = np.random.RandomState(seed) ########################################### # # MAKE STREAMS # ########################################### def prep_dataset(dataset): dataset = dataset[:(len(dataset) - (len(dataset) % (seq_len * batch_size)))] dataset = dataset.reshape(batch_size, -1, seq_len).transpose((1, 0, 2)) stream = DataStream(IndexableDataset(indexables=OrderedDict([ ('data', dataset)])), iteration_scheme=SequentialExampleScheme(dataset.shape[0])) stream = Transpose(stream, [(1, 0)]) stream = SampleDropsNPWord( stream, z_prob_states, z_prob_cells, drop_prob_igates, layer_size, num_layers, False, stoch_depth, share_mask, gaussian_drop, alphabetsize) stream.sources = ('data',) * 3 + stream.sources + ('zoneouts_states', 'zoneouts_cells', 'zoneouts_igates') return (stream,) train_stream, = prep_dataset(trainset) valid_stream, = prep_dataset(validset) test_stream, = prep_dataset(testset) #################### data = train_stream.get_epoch_iterator(as_dict=True).next() #################### ########################################### # # BUILD MODEL # ########################################### print '.. 
building model' x = T.tensor3('data') y = x zoneouts_states = T.tensor3('zoneouts_states') zoneouts_cells = T.tensor3('zoneouts_cells') zoneouts_igates = T.tensor3('zoneouts_igates') x.tag.test_value = data['data'] zoneouts_states.tag.test_value = data['zoneouts_states'] zoneouts_cells.tag.test_value = data['zoneouts_cells'] zoneouts_igates.tag.test_value = data['zoneouts_igates'] if init_width and not initialization == 'uniform': raise ValueError('Width is only for uniform init, whassup?') if initialization == 'glorot': weights_init = NormalizedInitialization() elif initialization == 'uniform': weights_init = Uniform(width=init_width) elif initialization == 'ortho': weights_init = OrthogonalInitialization() else: raise ValueError('No such initialization') if rnn_type.lower() == 'lstm': in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*4, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropLSTM(dim=layer_size, weights_init=weights_init, activation=Tanh(), model_type=6, name='rnn%d'%l, ogates_zoneout=ogates_zoneout) for l in range(num_layers)] elif rnn_type.lower() == 'gru': in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size*3, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropGRU(dim=layer_size, weights_init=weights_init, activation=Tanh(), name='rnn%d'%l) for l in range(num_layers)] elif rnn_type.lower() == 'srnn': # FIXME!!! make ReLU in_to_hids = [Linear(layer_size if l > 0 else alphabetsize, layer_size, name='in_to_hid%d'%l, weights_init=weights_init, biases_init=Constant(0.0)) for l in range(num_layers)] recurrent_layers = [DropSimpleRecurrent(dim=layer_size, weights_init=weights_init, activation=Rectifier(), name='rnn%d'%l) for l in range(num_layers)] else: raise NotImplementedError hid_to_out = Linear(layer_size, alphabetsize, name='hid_to_out', weights_init=weights_init, biases_init=Constant(0.0)) for layer in in_to_hids: layer.initialize() for layer in recurrent_layers: layer.initialize() hid_to_out.initialize() layer_input = x #in_to_hid.apply(x) init_updates = OrderedDict() for l, (in_to_hid, layer) in enumerate(zip(in_to_hids, recurrent_layers)): rnn_embedding = in_to_hid.apply(layer_input) if rnn_type.lower() == 'lstm': states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) cells_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name, cells_init.name = "states_init", "cells_init" states, cells = layer.apply(rnn_embedding, zoneouts_states[:, :, l * layer_size : (l + 1) * layer_size], zoneouts_cells[:, :, l * layer_size : (l + 1) * layer_size], zoneouts_igates[:, :, l * layer_size : (l + 1) * layer_size], states_init, cells_init) init_updates.update([(states_init, states[-1]), (cells_init, cells[-1])]) elif rnn_type.lower() in ['gru', 'srnn']: # untested! 
states_init = theano.shared(np.zeros((batch_size, layer_size), dtype=floatX)) states_init.name = "states_init" states = layer.apply(rnn_embedding, zoneouts_states, zoneouts_igates, states_init) init_updates.update([(states_init, states[-1])]) else: raise NotImplementedError layer_input = states y_hat_pre_softmax = hid_to_out.apply(T.join(0, [states_init], states[:-1])) shape_ = y_hat_pre_softmax.shape y_hat = Softmax().apply( y_hat_pre_softmax.reshape((-1, alphabetsize))) #################### ########################################### # # SET UP COSTS AND MONITORS # ########################################### cost = CategoricalCrossEntropy().apply(y.reshape((-1, alphabetsize)), y_hat).copy('cost') bpc = (cost/np.log(2.0)).copy(name='bpr') perp = T.exp(cost).copy(name='perp') cost_train = cost.copy(name='train_cost') cg_train = ComputationGraph([cost_train]) ########################################### # # NORM STABILIZER # ########################################### norm_cost = 0. def _magnitude(x, axis=-1): return T.sqrt(T.maximum(T.sqr(x).sum(axis=axis), numpy.finfo(x.dtype).tiny)) if penalty == 'cells': assert VariableFilter(roles=[MEMORY_CELL])(cg_train.variables) for cell in VariableFilter(roles=[MEMORY_CELL])(cg_train.variables): norms = _magnitude(cell) norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) elif penalty == 'hids': for l in range(num_layers): assert 'rnn%d_apply_states'%l in [o.name for o in VariableFilter(roles=[OUTPUT])(cg_train.variables)] for output in VariableFilter(roles=[OUTPUT])(cg_train.variables): for l in range(num_layers): if output.name == 'rnn%d_apply_states'%l: norms = _magnitude(output) norm_cost += T.mean(T.sum((norms[1:] - norms[:-1])**2, axis=0) / (seq_len - 1)) norm_cost.name = 'norm_cost' #cost_valid = cost_train cost_train += norm_cost_coeff * norm_cost cost_train = cost_train.copy('cost_train') #should this be cost_train.outputs[0]? no. 
cg_train = ComputationGraph([cost_train]) ########################################### # # WEIGHT NOISE # ########################################### if weight_noise > 0: weights = VariableFilter(roles=[WEIGHT])(cg_train.variables) cg_train = apply_noise(cg_train, weights, weight_noise) cost_train = cg_train.outputs[0].copy(name='cost_train') model = Model(cost_train) learning_rate = float(learning_rate) clipping = StepClipping(threshold=np.cast[floatX](clipping)) if algorithm == 'adam': adam = Adam(learning_rate=learning_rate) learning_rate = adam.learning_rate step_rule = CompositeRule([adam, clipping]) elif algorithm == 'rms_prop': rms_prop = RMSProp(learning_rate=learning_rate) learning_rate = rms_prop.learning_rate step_rule = CompositeRule([clipping, rms_prop]) elif algorithm == 'momentum': sgd_momentum = Momentum(learning_rate=learning_rate, momentum=momentum) learning_rate = sgd_momentum.learning_rate step_rule = CompositeRule([clipping, sgd_momentum]) elif algorithm == 'sgd': sgd = Scale(learning_rate=learning_rate) learning_rate = sgd.learning_rate step_rule = CompositeRule([clipping, sgd]) else: raise NotImplementedError algorithm = GradientDescent(step_rule=step_rule, cost=cost_train, parameters=cg_train.parameters) # theano_func_kwargs={"mode": theano.compile.MonitorMode(post_func=detect_nan)}) algorithm.add_updates(init_updates) def cond_number(x): _, _, sing_vals = T.nlinalg.svd(x, True, True) sing_mags = abs(sing_vals) return T.max(sing_mags) / T.min(sing_mags) def rms(x): return (x*x).mean().sqrt() whysplode_cond = [] whysplode_rms = [] for i, p in enumerate(init_updates): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) for i, p in enumerate(cg_train.parameters): v = p.get_value() if p.get_value().shape == 2: whysplode_cond.append(cond_number(p).copy('ini%d:%s_cond(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) whysplode_rms.append(rms(p).copy('ini%d:%s_rms(%s)'%(i, p.name, "x".join(map(str, p.get_value().shape))))) observed_vars = [cost_train, cost, bpc, perp, learning_rate, aggregation.mean(algorithm.total_gradient_norm).copy("gradient_norm_mean")] # + whysplode_rms parameters = model.get_parameter_dict() for name, param in parameters.iteritems(): observed_vars.append(param.norm(2).copy(name=name + "_norm")) observed_vars.append( algorithm.gradients[param].norm(2).copy(name=name + "_grad_norm")) train_monitor = TrainingDataMonitoring( variables=observed_vars, prefix="train", after_epoch=True ) dev_inits = [p.clone() for p in init_updates] cg_dev = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), dev_inits)) dev_cost, dev_bpc, dev_perp = cg_dev.outputs[:3] dev_init_updates = OrderedDict(zip(dev_inits, cg_dev.outputs[3:])) dev_monitor = DataStreamMonitoring( variables=[dev_cost, dev_bpc, dev_perp], data_stream=valid_stream, prefix="dev", updates=dev_init_updates ) # noone does this if 'load_path' in kwargs: with open(kwargs['load_path']) as f: loaded = np.load(f) model = Model(cost_train) params_dicts = model.get_parameter_dict() params_names = params_dicts.keys() for param_name in params_names: param = params_dicts[param_name] # '/f_6_.W' --> 'f_6_.W' slash_index = param_name.find('/') param_name = param_name[slash_index + 1:] if param.get_value().shape == loaded[param_name].shape: print 'Found: ' 
+ param_name param.set_value(loaded[param_name]) else: print 'Not found: ' + param_name extensions = [] extensions.extend([FinishAfter(after_n_epochs=epochs), train_monitor, dev_monitor]) if test_cost: test_inits = [p.clone() for p in init_updates] cg_test = ComputationGraph([cost, bpc, perp] + init_updates.values()).replace(zip(init_updates.keys(), test_inits)) test_cost, test_bpc, test_perp = cg_test.outputs[:3] test_init_updates = OrderedDict(zip(test_inits, cg_test.outputs[3:])) test_monitor = DataStreamMonitoring( variables=[test_cost, test_bpc, test_perp], data_stream=test_stream, prefix="test", updates=test_init_updates ) extensions.extend([test_monitor]) if not os.path.exists(experiment_path): os.makedirs(experiment_path) log_path = os.path.join(experiment_path, 'log.txt') fh = logging.FileHandler(filename=log_path) fh.setLevel(logging.DEBUG) logger.addHandler(fh) extensions.append(SaveParams('dev_cost', model, experiment_path, every_n_epochs=1)) extensions.append(SaveLog(every_n_epochs=1)) extensions.append(ProgressBar()) extensions.append(Printing()) class RollsExtension(TrainingExtension): """ rolls the cell and state activations between epochs so that first batch gets correct initial activations """ def __init__(self, shvars): self.shvars = shvars def before_epoch(self): for v in self.shvars: v.set_value(np.roll(v.get_value(), 1, 0)) extensions.append(RollsExtension(init_updates.keys() + dev_init_updates.keys() + (test_init_updates.keys() if test_cost else []))) class LearningRateSchedule(TrainingExtension): """ Lets you set a number to divide learning rate by each epoch + when to start doing that """ def __init__(self): self.epoch_number = 0 def after_epoch(self): self.epoch_number += 1 if self.epoch_number > decrease_lr_after_epoch: learning_rate.set_value(learning_rate.get_value()/lr_decay) if bool(lr_decay) != bool(decrease_lr_after_epoch): raise ValueError('Need to define both lr_decay and decrease_lr_after_epoch') if lr_decay and decrease_lr_after_epoch: extensions.append(LearningRateSchedule()) main_loop = MainLoop(model=model, data_stream=train_stream, algorithm=algorithm, extensions=extensions) t1 = time.time() print "Building time: %f" % (t1 - t0) main_loop.run() print "Execution time: %f" % (time.time() - t1)
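# ----------------------------------------------------------------------------
# Note: the norm_cost term assembled in the training function above
# (penalty = 'cells' or 'hids') is a norm-stabilizer penalty on consecutive
# hidden-state norms, roughly
#     norm_cost = mean_batch( sum_t (||h_t|| - ||h_{t-1}||)^2 / (T - 1) ),
# scaled by norm_cost_coeff. A compact numpy restatement with toy shapes:
import numpy as np

T, batch, dim, norm_cost_coeff = 4, 2, 3, 50.0
h = np.random.randn(T, batch, dim)            # stand-in for states or cells

norms = np.sqrt((h ** 2).sum(axis=-1))        # (T, batch) per-step L2 norms
norm_cost = norm_cost_coeff * np.mean(
    ((norms[1:] - norms[:-1]) ** 2).sum(axis=0) / (T - 1))
print(norm_cost)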
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') ans_indices = tensor.imatrix('ans_indices') # n_steps * n_samples ans_indices_mask = tensor.imatrix('ans_indices_mask') context_bag = tensor.eq(context[:, :, None], tensor.arange(vocab_size)).sum(axis=1).clip( 0, 1) bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) ans_indices = ans_indices.dimshuffle(1, 0) ans_indices_mask = ans_indices_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt') # embed.weights_init = Constant(embeddings_initial_value) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' #embed size: 200, lstm_size = 256 #qenc: length * batch_size * (2*lstm_size) # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate( [ cembed, tensor.extra_ops.repeat( qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2 ) #length * batch_size * (embed+2*lstm_size) this is what goes into encoder clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' #cenc: length * batch_size * (2*lstm_size) #pointer networks decoder LSTM and Attention parameters params = init_params(data_dim=config.decoder_data_dim, lstm_dim=config.decoder_lstm_output_dim) tparams = init_tparams(params) self.theano_params = [] add_role(tparams['lstm_de_W'], WEIGHT) add_role(tparams['lstm_de_U'], WEIGHT) add_role(tparams['lstm_de_b'], BIAS) add_role(tparams['ptr_v'], WEIGHT) add_role(tparams['ptr_W1'], WEIGHT) add_role(tparams['ptr_W2'], WEIGHT) self.theano_params = tparams.values() # for p in tparams.values(): # add_role(p, WEIGHT) # self.theano_params.append(p) #n_steps = length , n_samples = batch_size n_steps = ans_indices.shape[0] n_samples = ans_indices.shape[1] preds, generations = ptr_network( tparams, cqembed, context_mask.astype(theano.config.floatX), ans_indices, ans_indices_mask.astype(theano.config.floatX), config.decoder_lstm_output_dim, cenc) self.generations = generations idx_steps = tensor.outer(tensor.arange(n_steps, dtype='int64'), 
                                 tensor.ones((n_samples, ), dtype='int64'))
        idx_samples = tensor.outer(tensor.ones((n_steps, ), dtype='int64'),
                                   tensor.arange(n_samples, dtype='int64'))
        probs = preds[idx_steps, ans_indices, idx_samples]
        # probs *= y_mask
        off = 1e-8
        if probs.dtype == 'float16':
            off = 1e-6
        # probs += (1 - y_mask) # change unmasked position to 1, since log(1) = 0
        probs += off
        # probs_printed = theano.printing.Print('this is probs')(probs)
        cost = -tensor.log(probs)
        cost *= ans_indices_mask
        cost = cost.sum(axis=0) / ans_indices_mask.sum(axis=0)
        cost = cost.mean()

        # Apply regularization: weight noise and dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
def __init__(self, config, vocab_size): context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) context_bag = to_bag(context, vocab_size) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='embed') embed.weights_init = IsotropicGaussian(0.01) #embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt') #embed.weights_init = Constant(embeddings_initial_value) # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) clstms, chidden_list = make_bidir_lstm_stack( cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.ctx_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Build the encoder bricks transition = GatedRecurrent(activation=Tanh(), dim=config.generator_lstm_size, name="transition") attention = SequenceContentAttention( state_names=transition.apply.states, attended_dim=cenc_dim, match_dim=config.generator_lstm_size, name="attention") readout = Readout(readout_dim=vocab_size, source_names=[ transition.apply.states[0], attention.take_glimpses.outputs[0] ], emitter=MaskedSoftmaxEmitter(context_bag=context_bag, name='emitter'), feedback_brick=LookupFeedback( vocab_size, config.feedback_size), name="readout") generator = SequenceGenerator(readout=readout, transition=transition, attention=attention, name="generator") cost = generator.cost(answer, answer_mask.astype(theano.config.floatX), attended=cenc, attended_mask=context_mask.astype( theano.config.floatX), name="cost") self.predictions = generator.generate( n_steps=7, batch_size=config.batch_size, attended=cenc, attended_mask=context_mask.astype(theano.config.floatX), iterate=True)[1] # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # initialize new stuff manually (change!) generator.weights_init = IsotropicGaussian(0.01) generator.biases_init = Constant(0) generator.push_allocation_config() generator.push_initialization_config() transition.weights_init = Orthogonal() generator.initialize() # Initialize bricks embed.initialize() for brick in bricks: brick.weights_init = config.weights_init brick.biases_init = config.biases_init brick.initialize()
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) clstms, chidden_list = make_bidir_lstm_stack( cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP attention_mlp = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp') attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') bricks += [attention_mlp, attention_qlinear, attention_clinear] layer1 = Tanh().apply( attention_clinear.apply( cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2] ))).reshape((cenc.shape[0], cenc.shape[1], config.attention_mlp_hidden[0])) + attention_qlinear.apply(qenc)[None, :, :]) layer1.name = 'layer1' att_weights = attention_mlp.apply( layer1.reshape( (layer1.shape[0] * layer1.shape[1], layer1.shape[2]))) att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1])) att_weights = tensor.nnet.sigmoid(att_weights.T).T att_weights.name = 'att_weights' att_target = tensor.eq( tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)), tensor.tile(context[:, None, :], (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1) cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum() self.predictions = tensor.gt(att_weights, 0.1) * context # Apply dropout cg = ComputationGraph([cost]) if config.w_noise > 0: noise_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, noise_vars, config.w_noise) if config.dropout > 0: cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout) [cost_reg] = cg.outputs # Other stuff cost.name = 'cost' cost_reg.name = 'cost_reg' self.sgd_cost = cost_reg self.monitor_vars = [[cost_reg]] self.monitor_vars_valid = [[cost_reg]] # 
Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
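# ----------------------------------------------------------------------------
# Note: att_target in the model above is a per-position indicator of whether
# the context token occurs anywhere in that example's answer. A small numpy
# sketch with made-up token ids shows the broadcasting:
import numpy as np

context = np.array([[5, 1], [7, 2], [9, 2], [7, 3]])   # (ctx_len, batch)
answer = np.array([[7, 2], [9, 4]])                     # (ans_len, batch)

att_target = (context[:, None, :] == answer[None, :, :]).sum(axis=1).clip(0, 1)
print(att_target)
# [[0 0]
#  [1 1]
#  [1 1]
#  [1 0]]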
def main(mode, config, use_bokeh=False): # Construct model logger.info('Building RNN encoder-decoder') encoder = BidirectionalEncoder( config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder( config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) if mode == "train": # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Get training and development set streams tr_stream = get_tr_stream(**config) dev_stream = get_dev_stream(**config) # Get cost of the model cost = decoder.cost( encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) logger.info('Creating computational graph') cg = ComputationGraph(cost) # Initialize model logger.info('Initializing model') encoder.weights_init = decoder.weights_init = IsotropicGaussian( config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: logger.info('Applying weight noise to ff layers') enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector( decoder.sequence_generator.readout).get_params().values() dec_params += Selector( decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.state_init).get_params().values() cg = apply_noise( cg, enc_params+dec_params, config['weight_noise_ff']) # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.items(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}" .format(len(enc_dec_param_dict))) # Set up training model logger.info("Building model") training_model = Model(cost) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), TrainingDataMonitoring([cost], after_batch=True), Printing(after_batch=True), CheckpointNMT(config['saveto'], every_n_batches=config['save_freq']) ] # Set up beam search and sampling computation graphs if necessary if config['hook_samples'] >= 1 or config['bleu_script'] is not None: logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated 
= decoder.generate( sampling_input, sampling_representation) search_model = Model(generated) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # Add sampling if config['hook_samples'] >= 1: logger.info("Building sampler") extensions.append( Sampler(model=search_model, data_stream=tr_stream, hook_samples=config['hook_samples'], every_n_batches=config['sampling_freq'], src_vocab_size=config['src_vocab_size'])) # Add early stopping based on bleu if config['bleu_script'] is not None: logger.info("Building bleu validator") extensions.append( BleuValidator(sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, normalize=config['normalized_bleu'], every_n_batches=config['bleu_val_freq'])) # Reload model if necessary if config['reload']: extensions.append(LoadNMT(config['saveto'])) # Plot cost in bokeh if necessary if use_bokeh and BOKEH_AVAILABLE: extensions.append( Plot('Cs-En', channels=[['decoder_cost_cost']], after_batch=True)) # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent( cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run() elif mode == 'translate': # Create Theano variables logger.info('Creating theano variables') sampling_input = tensor.lmatrix('source') # Get test set stream test_stream = get_dev_stream( config['test_set'], config['src_vocab'], config['src_vocab_size'], config['unk_id']) ftrans = open(config['test_set'] + '.trans.out', 'w') # Helper utilities sutils = SamplingBase() unk_idx = config['unk_id'] src_eos_idx = config['src_vocab_size'] - 1 trg_eos_idx = config['trg_vocab_size'] - 1 # Get beam search logger.info("Building sampling model") sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) _, samples = VariableFilter( bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is next_outputs beam_search = BeamSearch(samples=samples) logger.info("Loading the model..") model = Model(generated) loader = LoadNMT(config['saveto']) loader.set_model_parameters(model, loader.load_parameters()) # Get target vocabulary trg_vocab = _ensure_special_tokens( pickle.load(open(config['trg_vocab'], 'rb')), bos_idx=0, eos_idx=trg_eos_idx, unk_idx=unk_idx) trg_ivocab = {v: k for k, v in trg_vocab.items()} logger.info("Started translation: ") total_cost = 0.0 for i, line in enumerate(test_stream.get_epoch_iterator()): seq = sutils._oov_to_unk( line[0], config['src_vocab_size'], unk_idx) input_ = numpy.tile(seq, (config['beam_size'], 1)) # draw sample, checking to ensure we don't get an empty string back trans, costs = \ beam_search.search( input_values={sampling_input: input_}, max_length=3*len(seq), eol_symbol=src_eos_idx, ignore_first_eol=True) # normalize costs according to the sequence lengths if config['normalized_bleu']: lengths = numpy.array([len(s) for s in trans]) costs = costs / lengths best = numpy.argsort(costs)[0] try: total_cost += costs[best] trans_out = trans[best] # convert idx to words trans_out = sutils._idx_to_word(trans_out, trg_ivocab) except ValueError: logger.info( "Can NOT find a translation 
for line: {}".format(i+1))
                trans_out = '<UNK>'

            print(trans_out, file=ftrans)

            if i != 0 and i % 100 == 0:
                logger.info(
                    "Translated {} lines of test set...".format(i))

        logger.info("Total cost of the test: {}".format(total_cost))
        ftrans.close()
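# ----------------------------------------------------------------------------
# Note: when normalized_bleu is set, the translate branch above divides each
# beam hypothesis' total cost by its length before picking the best one.
# Toy numbers, showing only that selection step:
import numpy as np

trans = [[4, 9, 2], [4, 9, 11, 7, 2], [4, 2]]   # hypothetical beam outputs
costs = np.array([6.3, 8.0, 5.9])               # total (unnormalized) costs

lengths = np.array([len(s) for s in trans])
best = np.argsort(costs / lengths)[0]
print(best, trans[best])   # lowest per-token cost wins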
def __init__(self, config, vocab_size): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.imatrix('answer') answer_mask = tensor.imatrix('answer_mask') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) answer = answer.dimshuffle(1, 0) answer_mask = answer_mask.dimshuffle(1, 0) # Embed questions and context embed = LookupTable(vocab_size, config.embed_size, name='question_embed') embed.weights_init = IsotropicGaussian(0.01) # Calculate question encoding (concatenate layer1) qembed = embed.apply(question) qlstms, qhidden_list = make_bidir_lstm_stack( qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') bricks = bricks + qlstms if config.question_skip_connections: qenc_dim = 2 * sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list], axis=1) else: qenc_dim = 2 * config.question_lstm_size[-1] qenc = tensor.concatenate([h[-1, :, :] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) cembed = embed.apply(context) cqembed = tensor.concatenate([ cembed, tensor.extra_ops.repeat(qenc[None, :, :], cembed.shape[0], axis=0) ], axis=2) clstms, chidden_list = make_bidir_lstm_stack( cqembed, config.embed_size + qenc_dim, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + clstms if config.ctx_skip_connections: cenc_dim = 2 * sum(config.ctx_lstm_size) #2 : fw & bw cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2 * config.question_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism Bilinear attention_clinear_1 = Linear(input_dim=cenc_dim, output_dim=qenc_dim, name='attc_1') bricks += [attention_clinear_1] att_start = qenc[None, :, :] * attention_clinear_1.apply( cenc.reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape( (cenc.shape[0], cenc.shape[1], cenc.shape[2])) att_start = att_start.sum(axis=2) att_start = tensor.nnet.softmax(att_start.T).T attention_clinear_2 = Linear(input_dim=cenc_dim, output_dim=qenc_dim, name='attc_2') bricks += [attention_clinear_2] att_end = qenc[None, :, :] * attention_clinear_2.apply( cenc.reshape( (cenc.shape[0] * cenc.shape[1], cenc.shape[2]))).reshape( (cenc.shape[0], cenc.shape[1], cenc.shape[2])) att_end = att_end.sum(axis=2) att_end = tensor.nnet.softmax(att_end.T).T att_start = tensor.dot( tensor.le( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_start) att_end = tensor.dot( tensor.ge( tensor.tile( theano.tensor.arange(context.shape[0])[None, :], (context.shape[0], 1)), tensor.tile( theano.tensor.arange(context.shape[0])[:, None], (1, context.shape[0]))), att_end) # add attention from left and right att_weights = att_start * att_end att_target = tensor.eq( tensor.tile(answer[None, :, :], (context.shape[0], 1, 1)), tensor.tile(context[:, None, :], (1, answer.shape[0], 1))).sum(axis=1).clip(0, 1) self.predictions = tensor.gt(att_weights, 0.25) * context att_target = att_target / (att_target.sum(axis=0) + 0.00001) att_weights = att_weights / (att_weights.sum(axis=0) 
+ 0.00001)

        #cost = (tensor.nnet.binary_crossentropy(att_weights, att_target) * context_mask).sum() / context_mask.sum()
        cost = (((att_weights - att_target)**2) * context_mask).sum() / context_mask.sum()

        # Apply regularization: weight noise and dropout
        cg = ComputationGraph([cost])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg] = cg.outputs

        # Other stuff
        cost.name = 'cost'
        cost_reg.name = 'cost_reg'
        att_start.name = 'att_start'
        att_end.name = 'att_end'
        att_weights.name = 'att_weights'
        att_target.name = 'att_target'
        self.predictions.name = 'pred'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg]]
        self.monitor_vars_valid = [[cost_reg]]
        self.analyse_vars = [
            cost, self.predictions, att_start, att_end, att_weights, att_target
        ]

        # Initialize bricks
        embed.initialize()
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
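# ----------------------------------------------------------------------------
# Note: the tensor.dot calls with the le/ge index matrices above turn the
# start and end softmaxes into cumulative distributions, so every context
# position is weighted by the probability mass of spans [start, end] that
# contain it. A numpy sketch with made-up probabilities for one example:
import numpy as np

att_start = np.array([0.7, 0.2, 0.05, 0.05])   # P(answer starts at position j)
att_end = np.array([0.05, 0.1, 0.6, 0.25])     # P(answer ends at position j)

L = len(att_start)
idx = np.arange(L)
start_cum = (idx[None, :] <= idx[:, None]).astype(float).dot(att_start)  # P(start <= i)
end_cum = (idx[None, :] >= idx[:, None]).astype(float).dot(att_end)      # P(end >= i)

att_weights = start_cum * end_cum   # mass of spans containing position i
print(att_weights.round(3))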
def main(config, tr_stream, dev_stream): # Create Theano variables source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') sampling_input = tensor.lmatrix('input') # Construct model encoder = BidirectionalEncoder(config['src_vocab_size'], config['enc_embed'], config['enc_nhids']) decoder = Decoder(config['trg_vocab_size'], config['dec_embed'], config['dec_nhids'], config['enc_nhids'] * 2) cost = decoder.cost(encoder.apply(source_sentence, source_sentence_mask), source_sentence_mask, target_sentence, target_sentence_mask) # Initialize model encoder.weights_init = decoder.weights_init = IsotropicGaussian(config['weight_scale']) encoder.biases_init = decoder.biases_init = Constant(0) encoder.push_initialization_config() decoder.push_initialization_config() encoder.bidir.prototype.weights_init = Orthogonal() decoder.transition.weights_init = Orthogonal() encoder.initialize() decoder.initialize() cg = ComputationGraph(cost) # apply dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) # Apply weight noise for regularization if config['weight_noise_ff'] > 0.0: enc_params = Selector(encoder.lookup).get_params().values() enc_params += Selector(encoder.fwd_fork).get_params().values() enc_params += Selector(encoder.back_fork).get_params().values() dec_params = Selector(decoder.sequence_generator.readout).get_params().values() dec_params += Selector(decoder.sequence_generator.fork).get_params().values() dec_params += Selector(decoder.transition.initial_transformer).get_params().values() cg = apply_noise(cg, enc_params+dec_params, config['weight_noise_ff']) cost = cg.outputs[0] # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) # Print parameter names enc_dec_param_dict = merge(Selector(encoder).get_params(), Selector(decoder).get_params()) logger.info("Parameter names: ") for name, value in enc_dec_param_dict.iteritems(): logger.info(' {:15}: {}'.format(value.get_value().shape, name)) logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict))) # Set up training algorithm if args.subtensor_fix: assert config['step_rule'] == 'AdaDelta' from subtensor_gradient import GradientDescent_SubtensorFix, AdaDelta_SubtensorFix, subtensor_params lookups = subtensor_params(cg, [encoder.lookup, decoder.sequence_generator.readout.feedback_brick.lookup]) algorithm = GradientDescent_SubtensorFix( subtensor_params=lookups, cost=cost, params=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), RemoveNotFinite(0.9), AdaDelta_SubtensorFix(subtensor_params=lookups)]) ) else: algorithm = GradientDescent( cost=cost, params=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), RemoveNotFinite(0.9), eval(config['step_rule'])()]) ) # Set up beam search and sampling computation graphs sampling_representation = encoder.apply( sampling_input, tensor.ones(sampling_input.shape)) generated = decoder.generate(sampling_input, sampling_representation) search_model = Model(generated) samples, = VariableFilter( 
bricks=[decoder.sequence_generator], name="outputs")( ComputationGraph(generated[1])) # generated[1] is the next_outputs # Set up training model training_model = Model(cost) # Set extensions extensions = [ Sampler( model=search_model, config=config, data_stream=tr_stream, src_eos_idx=config['src_eos_idx'], trg_eos_idx=config['trg_eos_idx'], every_n_batches=config['sampling_freq']), BleuValidator( sampling_input, samples=samples, config=config, model=search_model, data_stream=dev_stream, src_eos_idx=config['src_eos_idx'], trg_eos_idx=config['trg_eos_idx'], every_n_batches=config['bleu_val_freq']), TrainingDataMonitoring([cost], after_batch=True), #Plot('En-Fr', channels=[['decoder_cost_cost']], # after_batch=True), Printing(after_batch=True), Dump(config['saveto'], every_n_batches=config['save_freq']) ] # Reload model if necessary if config['reload']: extensions += [LoadFromDumpWMT15(config['saveto'])] # Initialize main loop main_loop = MainLoop( model=training_model, algorithm=algorithm, data_stream=tr_stream, extensions=extensions ) # Train! main_loop.run()
def __init__(self, config, vocab_size, id_to_vocab, logger): question = tensor.imatrix('question') question_mask = tensor.imatrix('question_mask') context = tensor.imatrix('context') context_mask = tensor.imatrix('context_mask') answer = tensor.ivector('answer') candidates = tensor.imatrix('candidates') candidates_mask = tensor.imatrix('candidates_mask') # question_actual = tensor.imatrix('question_actual') # context_actual = tensor.imatrix('context_actual') # answer_actual = tensor.imatrix('answer_actual') bricks = [] question = question.dimshuffle(1, 0) question_mask = question_mask.dimshuffle(1, 0) context = context.dimshuffle(1, 0) context_mask = context_mask.dimshuffle(1, 0) # Embed questions and cntext embed = LookupTable(vocab_size, config.embed_size, name='question_embed') bricks.append(embed) qembed = embed.apply(question) cembed = embed.apply(context) qlstms, qhidden_list = make_bidir_lstm_stack(qembed, config.embed_size, question_mask.astype(theano.config.floatX), config.question_lstm_size, config.question_skip_connections, 'q') clstms, chidden_list = make_bidir_lstm_stack(cembed, config.embed_size, context_mask.astype(theano.config.floatX), config.ctx_lstm_size, config.ctx_skip_connections, 'ctx') bricks = bricks + qlstms + clstms # Calculate question encoding (concatenate layer1) if config.question_skip_connections: qenc_dim = 2*sum(config.question_lstm_size) qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list], axis=1) else: qenc_dim = 2*config.question_lstm_size[-1] #u qenc = tensor.concatenate([h[-1,:,:] for h in qhidden_list[-2:]], axis=1) qenc.name = 'qenc' # Calculate context encoding (concatenate layer1) if config.ctx_skip_connections: cenc_dim = 2*sum(config.ctx_lstm_size) cenc = tensor.concatenate(chidden_list, axis=2) else: cenc_dim = 2*config.ctx_lstm_size[-1] cenc = tensor.concatenate(chidden_list[-2:], axis=2) cenc.name = 'cenc' # Attention mechanism MLP attention_mlp = MLP(dims=config.attention_mlp_hidden + [1], activations=config.attention_mlp_activations[1:] + [Identity()], name='attention_mlp') attention_qlinear = Linear(input_dim=qenc_dim, output_dim=config.attention_mlp_hidden[0], name='attq') attention_clinear = Linear(input_dim=cenc_dim, output_dim=config.attention_mlp_hidden[0], use_bias=False, name='attc') bricks += [attention_mlp, attention_qlinear, attention_clinear] layer1 = Tanh().apply(attention_clinear.apply(cenc.reshape((cenc.shape[0]*cenc.shape[1], cenc.shape[2]))) .reshape((cenc.shape[0],cenc.shape[1],config.attention_mlp_hidden[0])) + attention_qlinear.apply(qenc)[None, :, :]) layer1.name = 'layer1' att_weights = attention_mlp.apply(layer1.reshape((layer1.shape[0]*layer1.shape[1], layer1.shape[2]))) att_weights.name = 'att_weights_0' att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1])) att_weights.name = 'att_weights' #r attended = tensor.sum(cenc * tensor.nnet.softmax(att_weights.T).T[:, :, None], axis=0) attended.name = 'attended' # Now we can calculate our output out_mlp = MLP(dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden + [config.n_entities], activations=config.out_mlp_activations + [Identity()], name='out_mlp') bricks += [out_mlp] # g^AR probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1)) probs.name = 'probs' is_candidate = tensor.eq(tensor.arange(config.n_entities, dtype='int32')[None, None, :], tensor.switch(candidates_mask, candidates, -tensor.ones_like(candidates))[:, :, None]).sum(axis=1) probs = tensor.switch(is_candidate, probs, -1000 * tensor.ones_like(probs)) # Calculate prediction, 
cost and error rate
        pred = probs.argmax(axis=1)
        cost = Softmax().categorical_cross_entropy(answer, probs).mean()
        error_rate = tensor.neq(answer, pred).mean()

        # Apply regularization: weight noise and dropout
        cg = ComputationGraph([cost, error_rate])
        if config.w_noise > 0:
            noise_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, noise_vars, config.w_noise)
        if config.dropout > 0:
            cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
        [cost_reg, error_rate_reg] = cg.outputs

        # Other stuff
        cost_reg.name = cost.name = 'cost'
        error_rate_reg.name = error_rate.name = 'error_rate'

        self.sgd_cost = cost_reg
        self.monitor_vars = [[cost_reg], [error_rate_reg]]
        self.monitor_vars_valid = [[cost], [error_rate]]

        # Initialize bricks
        for brick in bricks:
            brick.weights_init = config.weights_init
            brick.biases_init = config.biases_init
            brick.initialize()
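# ----------------------------------------------------------------------------
# Note: the is_candidate switch in the attentive-reader model above restricts
# the final softmax to the entities offered as candidates for each example by
# pushing every other logit to -1000. In numpy terms (toy ids and logits):
import numpy as np

probs = np.random.randn(2, 6)                        # logits over 6 entities
candidates = np.array([[0, 3, 5], [1, 2, 0]])        # per-example candidate ids
candidates_mask = np.array([[1, 1, 1], [1, 1, 0]])   # last entry of ex. 1 is padding

masked_cands = np.where(candidates_mask, candidates, -1)
is_candidate = (np.arange(6)[None, None, :] == masked_cands[:, :, None]).sum(axis=1)
probs = np.where(is_candidate, probs, -1000.0)       # non-candidates can never win
print(probs.argmax(axis=1))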
def __init__(self, ref_data, output_dim): ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data') # Construct the model j = tensor.lvector('j') x = tensor.fmatrix('x') y = tensor.ivector('y') last_outputs = [] s_dropout_vars = [] r_dropout_vars = [] i_dropout_vars = [] penalties = [] for i in range(nparts): fs = numpy.random.binomial(1, part_r_proba, size=(ref_data.shape[1], )) input_dim = int(fs.sum()) fs_sh = theano.shared(fs) r = ref_data_sh[j, :][:, fs_sh.nonzero()[0]] mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name='enc%d' % i) mlp0r = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name='dec%d' % i) mlp1 = MLP(activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name='inter_gen_%d' % i) mlp2 = MLP(activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name='end_mlp_%d' % i) encod = mlp0.apply(r) rprime = mlp0r.apply(encod) inter_weights = mlp1.apply(encod) ibias = Bias(n_inter, name='inter_bias_%d' % i) inter = ibias.apply(tensor.dot(x, inter_weights)) inter = inter_act_fun.apply(inter) out = mlp2.apply(inter) penalties.append( tensor.sqrt(((rprime - r)**2).sum(axis=1)).mean()[None]) last_outputs.append(out) r_dropout_vars.append(r) s_dropout_vars = s_dropout_vars + (VariableFilter( bricks=[Tanh], name='output')(ComputationGraph([inter_weights ]))) i_dropout_vars.append(inter) # Initialize parameters for brick in [mlp0, mlp0r, mlp1, mlp2, ibias]: brick.weights_init = IsotropicGaussian(0.01) brick.biases_init = Constant(0.001) brick.initialize() final = tensor.concatenate([x[:, :, None] for x in last_outputs], axis=2).mean(axis=2) cost = Softmax().categorical_cross_entropy(y, final) confidence = Softmax().apply(final) pred = final.argmax(axis=1) error_rate = tensor.neq(y, pred).mean() # apply regularization cg = ComputationGraph([cost, error_rate]) if w_noise_std != 0: # - apply noise on weight variables weight_vars = VariableFilter(roles=[WEIGHT])(cg) cg = apply_noise(cg, weight_vars, w_noise_std) if s_dropout != 0: cg = apply_dropout(cg, s_dropout_vars, s_dropout) if r_dropout != 0: cg = apply_dropout(cg, r_dropout_vars, r_dropout) if i_dropout != 0: cg = apply_dropout(cg, i_dropout_vars, i_dropout) [cost_reg, error_rate_reg] = cg.outputs cost_reg = cost_reg + reconstruction_penalty * tensor.concatenate( penalties, axis=0).sum() self.cost = cost self.cost_reg = cost_reg self.error_rate = error_rate self.error_rate_reg = error_rate_reg self.pred = pred self.confidence = confidence