def bilstm_layer(in_dim, inp, h_dim, n):
    linear = Linear(input_dim=in_dim, output_dim=h_dim * 4,
                    name='linear' + str(n) + inp.name)
    lstm = LSTM(dim=h_dim, name='lstm' + str(n) + inp.name)
    bilstm = Bidirectional(prototype=lstm)
    bilstm.name = 'bilstm' + str(n) + inp.name
    initialize([linear, bilstm])
    return bilstm.apply(linear.apply(inp))[0]
class TestBidirectional(unittest.TestCase):
    def setUp(self):
        self.bidir = Bidirectional(weights_init=Orthogonal(),
                                   prototype=Recurrent(dim=3,
                                                       activation=Tanh()))
        self.simple = Recurrent(dim=3, weights_init=Orthogonal(),
                                activation=Tanh())
        self.bidir.initialize()
        self.simple.initialize()
        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))), dtype=floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=floatX)
        self.mask_val[12:24, 3] = 0

    def test(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        calc_bidir = theano.function([x, mask],
                                     [self.bidir.apply(x, mask=mask)])
        calc_simple = theano.function([x, mask],
                                      [self.simple.apply(x, mask=mask)])
        h_bidir = calc_bidir(self.x_val, self.mask_val)[0]
        h_simple = calc_simple(self.x_val, self.mask_val)[0]
        h_simple_rev = calc_simple(self.x_val[::-1], self.mask_val[::-1])[0]

        assert_allclose(h_simple, h_bidir[..., :3], rtol=1e-04)
        assert_allclose(h_simple_rev, h_bidir[::-1, ..., 3:], rtol=1e-04)
def bilstm_layer(in_dim, inp, h_dim, n, pref=""):
    linear = Linear(input_dim=in_dim, output_dim=h_dim * 4,
                    name='linear' + str(n) + pref)
    lstm = LSTM(dim=h_dim, name='lstm' + str(n) + pref)
    bilstm = Bidirectional(prototype=lstm)
    bilstm.name = 'bilstm' + str(n) + pref
    initialize([linear, bilstm])
    return bilstm.apply(linear.apply(inp))[0]
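# Hypothetical usage sketch for the bilstm_layer variant above, stacking two
# layers; the `embedded` input variable and all dimensions are invented for
# illustration only.
h1 = bilstm_layer(100, embedded, 64, 1, pref='_src')   # (time, batch, 2 * 64)
h2 = bilstm_layer(2 * 64, h1, 64, 2, pref='_src')      # second layer reads both directions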
class TestBidirectional(unittest.TestCase):
    def setUp(self):
        self.bidir = Bidirectional(weights_init=Orthogonal(),
                                   prototype=SimpleRecurrent(
                                       dim=3, activation=Tanh()))
        self.simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                                      activation=Tanh(), seed=1)
        self.bidir.allocate()
        self.simple.initialize()
        self.bidir.children[0].params[0].set_value(
            self.simple.params[0].get_value())
        self.bidir.children[1].params[0].set_value(
            self.simple.params[0].get_value())
        self.x_val = 0.1 * numpy.asarray(
            list(itertools.permutations(range(4))), dtype=floatX)
        self.x_val = (numpy.ones((24, 4, 3), dtype=floatX) *
                      self.x_val[..., None])
        self.mask_val = numpy.ones((24, 4), dtype=floatX)
        self.mask_val[12:24, 3] = 0

    def test(self):
        x = tensor.tensor3('x')
        mask = tensor.matrix('mask')
        calc_bidir = theano.function([x, mask],
                                     [self.bidir.apply(x, mask=mask)])
        calc_simple = theano.function([x, mask],
                                      [self.simple.apply(x, mask=mask)])
        h_bidir = calc_bidir(self.x_val, self.mask_val)[0]
        h_simple = calc_simple(self.x_val, self.mask_val)[0]
        h_simple_rev = calc_simple(self.x_val[::-1], self.mask_val[::-1])[0]

        assert_allclose(h_simple, h_bidir[..., :3], rtol=1e-04)
        assert_allclose(h_simple_rev, h_bidir[::-1, ..., 3:], rtol=1e-04)
def example5():
    """Bidirectional + SimpleRecurrent. Adapted from a unit test in Blocks."""
    bidir = Bidirectional(weights_init=Orthogonal(),
                          prototype=SimpleRecurrent(dim=3, activation=Tanh()))
    simple = SimpleRecurrent(dim=3, weights_init=Orthogonal(),
                             activation=Tanh(), seed=1)
    bidir.allocate()
    simple.initialize()
    bidir.children[0].parameters[0].set_value(
        simple.parameters[0].get_value())
    bidir.children[1].parameters[0].set_value(
        simple.parameters[0].get_value())

    # Initialize theano variables and functions
    x = tensor.tensor3('x')
    mask = tensor.matrix('mask')
    calc_bidir = theano.function([x, mask], [bidir.apply(x, mask=mask)])
    calc_simple = theano.function([x, mask], [simple.apply(x, mask=mask)])

    # Testing time
    x_val = 0.1 * np.asarray(list(itertools.permutations(range(4))),
                             dtype=theano.config.floatX)
    x_val = (np.ones((24, 4, 3), dtype=theano.config.floatX) *
             x_val[..., None])
    mask_val = np.ones((24, 4), dtype=theano.config.floatX)
    mask_val[12:24, 3] = 0

    h_bidir = calc_bidir(x_val, mask_val)[0]
    h_simple = calc_simple(x_val, mask_val)[0]
    h_simple_rev = calc_simple(x_val[::-1], mask_val[::-1])[0]

    print(h_bidir)
    print(h_simple)
    print(h_simple_rev)
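# The printed arrays relate exactly as the unit tests above assert: the first
# `dim` features of h_bidir come from the forward pass and match h_simple,
# while the last `dim` features come from the backward pass and, once
# re-reversed in time, match h_simple_rev. A minimal check, assuming
# numpy.testing is available:
from numpy.testing import assert_allclose

assert_allclose(h_simple, h_bidir[..., :3], rtol=1e-04)
assert_allclose(h_simple_rev, h_bidir[::-1, ..., 3:], rtol=1e-04)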
class Encoder(Initializable):
    def __init__(self, encoder_type, num_characters, input_dim, encoder_dim,
                 **kwargs):
        assert encoder_type in [None, 'bidirectional']
        self.encoder_type = encoder_type
        super(Encoder, self).__init__(**kwargs)

        self.children = []
        if encoder_type in ['lookup', 'bidirectional']:
            self.embed_label = LookupTable(num_characters, input_dim,
                                           name='embed_label')
            self.children += [self.embed_label]
        else:
            # If there is no encoder.
            assert num_characters == input_dim

        if encoder_type == 'bidirectional':
            transition = RecurrentWithFork(
                GatedRecurrent(dim=encoder_dim).apply,
                input_dim, name='encoder_transition')
            self.encoder = Bidirectional(transition, name='encoder')
            self.children.append(self.encoder)

    @application
    def apply(self, x, x_mask=None):
        if self.encoder_type is None:
            return x

        if self.encoder_type in ['lookup', 'bidirectional']:
            embed_x = self.embed_label.apply(x)

        if self.encoder_type == 'lookup':
            encoded_x = embed_x

        if self.encoder_type == 'bidirectional':
            encoded_x = self.encoder.apply(embed_x, x_mask)

        return encoded_x
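# Hypothetical wiring of the Encoder above; all dimensions, initializers, and
# variable names are assumptions for illustration, and the usual Blocks/Theano
# imports (IsotropicGaussian, Constant, tensor) are assumed.
encoder = Encoder(encoder_type='bidirectional', num_characters=50,
                  input_dim=32, encoder_dim=64,
                  weights_init=IsotropicGaussian(0.01),
                  biases_init=Constant(0))
encoder.initialize()

x = tensor.lmatrix('labels')            # (time, batch) integer labels
x_mask = tensor.matrix('labels_mask')   # (time, batch) 0/1 mask
encoded_x = encoder.apply(x, x_mask)    # (time, batch, 2 * encoder_dim)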
class Decimator(Initializable):
    """Source word encoder, mapping a character-level word to a vector.

    This encoder is able to learn the morphology. For compatibility with
    the previous version, we call it Decimator.
    """

    def __init__(self, vocab_size, embedding_dim, dgru_state_dim, dgru_depth,
                 **kwargs):
        super(Decimator, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.dgru_state_dim = dgru_state_dim
        self.lookup = LookupTable(name='embeddings')
        self.dgru_depth = dgru_depth
        # representation
        self.dgru = RecurrentStack(
            [DGRU(activation=Tanh(), dim=self.dgru_state_dim)
             for _ in range(dgru_depth)],
            skip_connections=True)
        # importance of this representation
        self.bidir_w = Bidirectional(
            RecurrentWithFork(DGRU(activation=Tanh(),
                                   dim=self.dgru_state_dim // 2),
                              self.embedding_dim,
                              name='src_word_with_fork'),
            name='bidir_src_word_encoder')
        self.gru_fork = Fork(
            [name for name in self.dgru.apply.sequences if name != 'mask'],
            prototype=Linear(), name='gru_fork')
        # map to an energy scalar
        self.wl = Linear(input_dim=dgru_state_dim, output_dim=1)

        self.children = [self.lookup, self.dgru, self.gru_fork,
                         self.bidir_w, self.wl]

    def _push_allocation_config(self):
        self.lookup.length = self.vocab_size
        self.lookup.dim = self.embedding_dim

        self.gru_fork.input_dim = self.embedding_dim
        self.gru_fork.output_dims = [self.dgru.get_dim(name)
                                     for name in self.gru_fork.output_names]

    @application(inputs=['char_seq', 'sample_matrix', 'char_aux'],
                 outputs=['representation', 'weight'])
    def apply(self, char_seq, sample_matrix, char_aux):
        # Time as the first dimension
        embeddings = self.lookup.apply(char_seq)
        gru_out = self.dgru.apply(
            **merge(self.gru_fork.apply(embeddings, as_dict=True),
                    {'mask': char_aux}))
        wgru_out = tensor.exp(
            self.wl.apply(self.bidir_w.apply(embeddings, char_aux)))

        if self.dgru_depth > 1:
            gru_out = gru_out[-1]

        gru_out = tensor.addbroadcast(wgru_out, 2) * gru_out
        sampled_representation = tensor.tanh(
            tensor.batched_dot(sample_matrix, gru_out.dimshuffle([1, 0, 2])))
        return sampled_representation.dimshuffle([1, 0, 2]), wgru_out

    def get_dim(self, name):
        if name == 'output':
            return self.dgru_state_dim
        return super(Decimator, self).get_dim(name)
def main(mode, save_path, num_batches, from_dump):
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets",),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99], char2code,
                                level="character",
                                preprocess=str.lower)
                            .get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(
            GatedRecurrent(dim=dimension, activation=Tanh()),
            weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork([name for name in encoder.prototype.apply.sequences
                     if name != 'mask'],
                    weights_init=IsotropicGaussian(0.1),
                    biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension, dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(activation=Tanh(),
                                dim=dimension, attended_dim=2 * dimension,
                                name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension, name="attention")
        readout = LinearReadout(readout_dim=readout_dimension,
                                source_names=["states"],
                                emitter=SoftmaxEmitter(name="emitter"),
                                feedbacker=LookupFeedback(readout_dimension,
                                                          dimension),
                                name="readout")
        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      weights_init=IsotropicGaussian(0.1),
                                      biases_init=Constant(0),
                                      name="generator")
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape)
                         for key, value in params.items()],
                        width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True),
                             mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(
            VariableFilter(application=readout.readout,
                           name="output")(cg.variables),
            singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations,) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost,
            step_rule=CompositeRule([GradientClipping(10.0),
                                     SteepestDescent(0.01)]))

        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        for name, param in params.items():
            observables.append(named_copy(param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) + [
                Timing(),
                TrainingDataMonitoring(observables, after_every_batch=True),
                TrainingDataMonitoring(observables, prefix="average",
                                       every_n_batches=10),
                FinishAfter(after_n_batches=num_batches).add_condition(
                    "after_batch",
                    lambda log: math.isnan(
                        log.current_row.total_gradient_norm)),
                Plot(os.path.basename(save_path),
                     [["average_" + cost.name],
                      ["average_" + cost_per_character.name]],
                     every_n_batches=10),
                SerializeMainLoop(save_path, every_n_batches=500,
                                  save_separately=["model", "log"]),
                Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "test":
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0], batch_size=chars.shape[1],
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                                        return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        logging.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
def main(num_epochs=100):
    x = tensor.matrix('features')
    m = tensor.matrix('features_mask')
    y = tensor.imatrix('targets')

    x_int = x.astype(dtype='int32').T - 2
    train_dataset = IMDB()
    idx_sort = numpy.argsort(
        [len(s) for s in
         train_dataset.indexables[train_dataset.sources.index('features')]])
    n_voc = len(train_dataset.dict.keys())
    for idx in xrange(len(train_dataset.sources)):
        train_dataset.indexables[idx] = train_dataset.indexables[idx][idx_sort]

    n_h = 10
    linear_embedding = LookupTable(
        length=n_voc,
        dim=4 * n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.))
    linear_embedding.initialize()
    lstm_biases = numpy.zeros(4 * n_h).astype(dtype=theano.config.floatX)
    lstm_biases[n_h:(2 * n_h)] = 4.
    rnn = Bidirectional(LSTM(
        dim=n_h,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.)))
    rnn.initialize()
    score_layer = Linear(
        input_dim=2 * n_h,
        output_dim=1,
        weights_init=Uniform(std=0.01),
        biases_init=Constant(0.))
    score_layer.initialize()

    embedding = linear_embedding.apply(x_int) * tensor.shape_padright(m.T)
    rnn_out = rnn.apply(embedding)
    rnn_out_mean_pooled = tensor.mean(rnn_out[0], axis=0)

    probs = Sigmoid().apply(score_layer.apply(rnn_out_mean_pooled))

    cost = - (y * tensor.log(probs) +
              (1 - y) * tensor.log(1 - probs)).mean()
    cost.name = 'cost'

    misclassification = (y * (probs < 0.5) +
                         (1 - y) * (probs > 0.5)).mean()
    misclassification.name = 'misclassification'

    cg = ComputationGraph([cost])
    params = cg.parameters

    algorithm = GradientDescent(
        cost=cost,
        params=params,
        step_rule=CompositeRule(
            components=[StepClipping(threshold=10.),
                        Adam()]))

    n_train = int(numpy.floor(.8 * train_dataset.num_examples))
    n_valid = int(numpy.floor(.1 * train_dataset.num_examples))

    train_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100),
                batch_size=10)),
        mask_sources=('features',))
    valid_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(100, 110),
                batch_size=10)),
        mask_sources=('features',))
    test_data_stream = Padding(
        data_stream=DataStream(
            dataset=train_dataset,
            iteration_scheme=BatchwiseShuffledScheme(
                examples=range(110, 120),
                batch_size=10)),
        mask_sources=('features',))

    model = Model(cost)

    extensions = []
    extensions.append(Timing())
    extensions.append(FinishAfter(after_n_epochs=num_epochs))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], test_data_stream, prefix='test'))
    extensions.append(DataStreamMonitoring(
        [cost, misclassification], valid_data_stream, prefix='valid'))
    extensions.append(TrainingDataMonitoring(
        [cost, misclassification], prefix='train', after_epoch=True))

    plotters = []
    plotters.append(Plotter(
        channels=[['train_cost', 'train_misclassification',
                   'valid_cost', 'valid_misclassification']],
        titles=['Costs']))
    extensions.append(PlotManager('IMDB classification example',
                                  plotters=plotters,
                                  after_epoch=True,
                                  after_training=True))
    extensions.append(Printing())

    main_loop = MainLoop(model=model,
                         data_stream=train_data_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
# Define the shape of x specifically... :: the data has format (batch, features).
x.tag.test_value = np.random.randint(
    vocab_size, size=batch_of_sentences).astype(np.int32)
x_extra.tag.test_value = np.zeros(
    (max_sentence_length, mini_batch_size, 1)).astype(np.float32)
x_mask.tag.test_value = np.random.choice(
    [0.0, 1.0], size=batch_of_sentences).astype(np.float32)

print("x shape", x.shape.tag.test_value)
# array([29, 16])

word_embedding = lookup.apply(x)
print("word_embedding shape", word_embedding.shape.tag.test_value)
# array([29, 16, 100])

print("x_extra shape", x_extra.shape.tag.test_value)
# array([29, 16, 1])

embedding_extended = tensor.concatenate([word_embedding, x_extra], axis=-1)
print("embedding_extended shape", embedding_extended.shape.tag.test_value)
# array([29, 16, 101])

rnn_outputs = rnn.apply(embedding_extended, mask=x_mask)
print("rnn_outputs shape", rnn_outputs.shape.tag.test_value)
# array([29, 16, 202])

### So: need to reshape the RNN outputs to produce suitable input here...
# Convert the tensor into a long stream of vectors.
# The shape actually depends on the specific batch
# (for instance, the last one in an epoch may be smaller):
# rnn_outputs_reshaped = rnn_outputs.reshape(
#     (max_sentence_length * mini_batch_size, hidden_dim * 2))  # not parameterized properly
rnn_outputs_reshaped = rnn_outputs.reshape(
    (x.shape[0] * x.shape[1], hidden_dim * 2))
print("rnn_outputs_reshaped shape", rnn_outputs_reshaped.shape.tag.test_value)
# array([464, 202])

labels_raw = gather.apply(rnn_outputs_reshaped)  # This is pre-softmaxing
print("labels_raw shape", labels_raw.shape.tag.test_value)
# array([464, 5])


def examine_embedding(embedding):
def main(config):
    vocab_src, _ = text_to_dict([config['train_src'],
                                 config['dev_src'], config['test_src']])
    vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
                                     config['dev_tgt']])

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')
    source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
                                      [1, 4, 8, 4, 8, 4, 8]]
    source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
                                           [1, 0, 1, 0, 1, 0, 1]]
    target_sentence.tag.test_value = [[0, 1, 1, 5],
                                      [2, 0, 1, 0]]
    target_sentence_mask.tag.test_value = [[0, 1, 1, 0],
                                           [1, 1, 1, 0]]

    logger.info('Building RNN encoder-decoder')

    ### Building Encoder
    embedder = LookupTable(
        length=len(vocab_src),
        dim=config['embed_src'],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='embedder')
    transformer = Linear(
        config['embed_src'],
        config['hidden_src'] * 4,
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='transformer')

    lstminit = np.asarray([0.0] * config['hidden_src'] +
                          [0.0] * config['hidden_src'] +
                          [1.0] * config['hidden_src'] +
                          [0.0] * config['hidden_src'])
    encoder = Bidirectional(
        LSTM(
            dim=config['hidden_src'],
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(lstminit)),
        name='encoderBiLSTM')
    encoder.prototype.weights_init = Orthogonal()

    ### Building Decoder
    lstminit = np.asarray([0.0] * config['hidden_tgt'] +
                          [0.0] * config['hidden_tgt'] +
                          [1.0] * config['hidden_tgt'] +
                          [0.0] * config['hidden_tgt'])
    transition = LSTM2GO(
        attended_dim=config['hidden_tgt'],
        dim=config['hidden_tgt'],
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(lstminit),
        name='decoderLSTM')

    attention = SequenceContentAttention(
        state_names=transition.apply.states,  # default activation is Tanh
        state_dims=[config['hidden_tgt']],
        attended_dim=config['hidden_src'] * 2,
        match_dim=config['hidden_tgt'],
        name="attention")

    readout = Readout(
        source_names=['states', 'feedback',
                      attention.take_glimpses.outputs[0]],
        readout_dim=len(vocab_tgt),
        emitter=SoftmaxEmitter(name='emitter'),
        feedback_brick=LookupFeedback(
            num_outputs=len(vocab_tgt),
            feedback_dim=config['embed_tgt'],
            name='feedback'),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=config['hidden_tgt'], name='softmax_bias').apply,
            Linear(input_dim=config['hidden_tgt'],
                   output_dim=config['embed_tgt'],
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=config['embed_tgt'], name='softmax1').apply]),
        merged_dim=config['hidden_tgt'])

    decoder = SequenceGenerator(
        readout=readout,
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator",
        fork=Fork(
            [name for name in transition.apply.sequences if name != 'mask'],
            prototype=Linear()),
        add_contexts=True)
    decoder.transition.weights_init = Orthogonal()
    # printchildren(encoder, 1)

    # Initialize model
    logger.info('Initializing model')
    embedder.initialize()
    transformer.initialize()
    encoder.initialize()
    decoder.initialize()

    # Apply model
    embedded = embedder.apply(source_sentence)
    transformed = transformer.apply(embedded)
    encoded = encoder.apply(transformed)[0]
    generated = decoder.generate(
        n_steps=2 * source_sentence.shape[1],
        batch_size=source_sentence.shape[0],
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=tensor.ones(source_sentence.shape).T)
    print 'Generated: ', generated
    # generator_generate_outputs
    # samples = generated[1]  # For GRU
    samples = generated[2]  # For LSTM
    samples.name = 'samples'
    # samples_cost = generated[4]  # For GRU
    samples_cost = generated[5]  # For LSTM
    samples_cost.name = 'sampling_cost'

    cost = decoder.cost(
        mask=target_sentence_mask.T,
        outputs=target_sentence.T,
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=source_sentence_mask.T)
    cost.name = 'target_cost'
    cost.tag.aggregation_scheme = TakeLast(cost)
    model = Model(cost)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    ########
    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    printchildren(embedder, 1)
    printchildren(transformer, 1)
    printchildren(encoder, 1)
    printchildren(decoder, 1)
    # Print parameter names
    # enc_dec_param_dict = merge(Selector(embedder).get_parameters(),
    #                            Selector(encoder).get_parameters(),
    #                            Selector(decoder).get_parameters())
    # enc_dec_param_dict = merge(Selector(decoder).get_parameters())
    # logger.info("Parameter names: ")
    # for name, value in enc_dec_param_dict.items():
    #     logger.info('    {:15}: {}'.format(value.get_value().shape, name))
    # logger.info("Total number of parameters: {}"
    #             .format(len(enc_dec_param_dict)))
    ##########

    # Training data
    train_stream = get_train_stream(config,
                                    [config['train_src'], ],
                                    [config['train_tgt'], ],
                                    vocab_src, vocab_tgt)
    dev_stream = get_dev_stream(
        [config['dev_src'], ], [config['dev_tgt'], ],
        vocab_src, vocab_tgt)
    test_stream = get_test_stream([config['test_src'], ], vocab_src)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        ProgressBar(),
        TrainingDataMonitoring([cost], prefix="tra", after_batch=True),
        DataStreamMonitoring(variables=[cost], data_stream=dev_stream,
                             prefix="dev", after_batch=True),
        Sampler(
            model=Model(samples), data_stream=dev_stream,
            vocab=cabvo,
            saveto=config['saveto'] + 'dev',
            every_n_batches=config['save_freq']),
        Sampler(
            model=Model(samples), data_stream=test_stream,
            vocab=cabvo,
            saveto=config['saveto'] + 'test',
            after_n_batches=1,
            on_resumption=True,
            before_training=True),
        Plotter(saveto=config['saveto'], after_batch=True),
        Printing(after_batch=True),
        Checkpoint(
            path=config['saveto'],
            parameters=cg.parameters,
            save_main_loop=False,
            every_n_batches=config['save_freq'])]
    if BOKEH_AVAILABLE:
        Plot('Training cost', channels=[['target_cost']], after_batch=True)
    if config['reload']:
        extensions.append(Load(path=config['saveto'],
                               load_iteration_state=False,
                               load_log=False))
    else:
        with open(config['saveto'] + '.txt', 'w') as f:
            pass

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    algorithm = GradientDescent(
        cost=cost, parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 eval(config['step_rule'])()]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=model,
        algorithm=algorithm,
        data_stream=train_stream,
        extensions=extensions)
    main_loop.run()
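# Hypothetical config dict for main() above, covering the keys the function
# reads; every value here is a placeholder, and 'AdaDelta' is just one
# step-rule name that eval(config['step_rule']) could resolve to.
config = {
    'train_src': 'train.src', 'train_tgt': 'train.tgt',
    'dev_src': 'dev.src', 'dev_tgt': 'dev.tgt', 'test_src': 'test.src',
    'embed_src': 256, 'hidden_src': 256,
    'embed_tgt': 256, 'hidden_tgt': 256,
    'dropout': 0.5,
    'step_clipping': 1.0, 'step_rule': 'AdaDelta',
    'saveto': 'model_', 'save_freq': 500,
    'finish_after': 10000, 'reload': False,
}
main(config)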