def __init__(self, dimension, alphabet_size, **kwargs):
    """Assemble the encoder, fork, lookup table and attention generator.

    Parameters
    ----------
    dimension : int
        Size used for the recurrent layers, the lookup table embeddings
        and the attention match space. The attention attends to
        ``2 * dimension`` features.
    alphabet_size : int
        Number of symbols; used for the lookup table, the readout
        dimension and the feedback lookup.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(WordReverser, self).__init__(**kwargs)

    # Encoder side: bidirectional recurrent network fed through a fork.
    self.encoder = Bidirectional(
        SimpleRecurrent(dim=dimension, activation=Tanh()))
    non_mask_sequences = [
        sequence
        for sequence in self.encoder.prototype.apply.sequences
        if sequence != 'mask']
    self.fork = Fork(non_mask_sequences)
    self.fork.input_dim = dimension
    self.fork.output_dims = [dimension] * len(self.fork.input_names)
    self.lookup = LookupTable(alphabet_size, dimension)

    # Decoder side: recurrent transition with content-based attention.
    recurrent = SimpleRecurrent(
        activation=Tanh(), dim=dimension, name="transition")
    content_attention = SequenceContentAttention(
        state_names=recurrent.apply.states,
        attended_dim=2 * dimension,
        match_dim=dimension,
        name="attention")
    readout_sources = [
        recurrent.apply.states[0],
        content_attention.take_glimpses.outputs[0],
    ]
    output_layer = Readout(
        readout_dim=alphabet_size,
        source_names=readout_sources,
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(alphabet_size, dimension),
        name="readout")
    self.generator = SequenceGenerator(
        readout=output_layer,
        transition=recurrent,
        attention=content_attention,
        name="generator")

    self.children = [
        self.lookup, self.fork, self.encoder, self.generator]
def __init__(self, dimen, vocab_size, **kwargs):
    """Build the encoder, fork, lookup table and attention generator.

    Parameters
    ----------
    dimen : int
        Size used for the recurrent layers, the lookup table embeddings
        and the attention match space. The attention attends to
        ``2 * dimen`` features.
    vocab_size : int
        Number of symbols; used for the lookup table, the readout
        dimension and the feedback lookup.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    # The parent constructor has to run (the original author observed that
    # "allocated" is otherwise not set). It must receive keyword
    # configuration only: the previous call passed the instance itself as
    # the first positional argument.
    # NOTE(review): in Blocks that positional slot appears to be the brick
    # name -- confirm against the installed version.
    super(MorphGen, self).__init__(**kwargs)

    # The encoder
    encoder = Bidirectional(SimpleRecurrent(dim=dimen, activation=Tanh()))

    # The fork produces one output per encoder input sequence, skipping
    # the mask; each output is sized to what the encoder prototype reports.
    fork = Fork([name for name in encoder.prototype.apply.sequences
                 if name != 'mask'])
    fork.input_dim = dimen
    fork.output_dims = [encoder.prototype.get_dim(name)
                        for name in fork.input_names]

    lookup = LookupTable(vocab_size, dimen)

    transition = SimpleRecurrent(dim=dimen, activation=Tanh(),
                                 name="transition")
    atten = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=2 * dimen,
        match_dim=dimen,
        name="attention")
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0],
                      atten.take_glimpses.outputs[0]],
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(vocab_size, dimen),
        name="readout")
    generator = SequenceGenerator(
        readout=readout,
        transition=transition,
        attention=atten,
        name="generator")

    self.lookup = lookup
    self.fork = fork
    self.encoder = encoder
    self.generator = generator
    self.children = [lookup, fork, encoder, generator]
def __init__(self, dimension, alphabet_size, **kwargs):
    """Create the lookup table and the attention-based generator.

    Parameters
    ----------
    dimension : int
        Size used for the lookup table embeddings, the recurrent
        transition, and both the attended and match dimensions of the
        attention.
    alphabet_size : int
        Number of symbols; used for the lookup table, the readout
        dimension and the feedback lookup.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(SimpleGenerator, self).__init__(**kwargs)

    self.lookup = LookupTable(alphabet_size, dimension)

    recurrent = SimpleRecurrent(
        activation=Tanh(), dim=dimension, name="transition")
    content_attention = SequenceContentAttention(
        state_names=recurrent.apply.states,
        attended_dim=dimension,
        match_dim=dimension,
        name="attention")

    # The readout consumes the first transition state and the first
    # glimpse output of the attention.
    readout_sources = [
        recurrent.apply.states[0],
        content_attention.take_glimpses.outputs[0],
    ]
    output_layer = Readout(
        readout_dim=alphabet_size,
        source_names=readout_sources,
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(alphabet_size, dimension),
        name="readout")

    self.generator = SequenceGenerator(
        readout=output_layer,
        transition=recurrent,
        attention=content_attention,
        name="generator")
    self.children = [self.lookup, self.generator]
def __init__(self, vocab_size, embedding_dim, state_dim,
             representation_dim, **kwargs):
    """Build the attention decoder and its sequence generator.

    Parameters
    ----------
    vocab_size : int
        Readout dimension and size of the feedback lookup.
    embedding_dim : int
        Dimension of the feedback embeddings and of the layer between
        the maxout and the final linear layer of the readout.
    state_dim : int
        Dimension of the recurrent transition, of the attention match
        space and of the merged readout input.
    representation_dim : int
        Dimension of the attended representation.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim

    self.transition = GRUInitialState(
        attended_dim=state_dim, dim=state_dim,
        activation=Tanh(), name='decoder')

    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim, name="attention")

    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Floor division keeps the dimension an integer on Python 3,
            # where ``/`` would produce a float.
            Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim,
        merge_prototype=Linear(use_bias=True))

    # The fork feeds every transition input sequence except the mask.
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'], prototype=Linear()))

    self.children = [self.sequence_generator]
def __init__(self, vocab_size, embedding_dim, state_dim,
             representation_dim, theano_seed=None, **kwargs):
    """Build the attention decoder and its sequence generator.

    Parameters
    ----------
    vocab_size : int
        Readout dimension and size of the feedback lookup.
    embedding_dim : int
        Dimension of the feedback embeddings and of the layer between
        the maxout and the final linear layer of the readout.
    state_dim : int
        Dimension of the recurrent transition, of the attention match
        space and of the merged readout input.
    representation_dim : int
        Dimension of the attended representation.
    theano_seed : optional
        Stored on the instance and passed to the softmax emitter.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = GRUInitialState(
        attended_dim=state_dim, dim=state_dim,
        activation=Tanh(), name='decoder')

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim, name="attention")

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence(
            [Bias(dim=state_dim, name='maxout_bias').apply,
             Maxout(num_pieces=2, name='maxout').apply,
             # Floor division keeps the dimension an integer on Python 3,
             # where ``/`` would produce a float.
             Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                    use_bias=False, name='softmax0').apply,
             Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim)

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'], prototype=Linear())
    )

    self.children = [self.sequence_generator]
def test_sequence_content_attention():
    """Check output shapes and basic weight properties of the attention.

    The attended sequences and the states are all zeros, so only
    dimensions and normalisation/masking properties of the weights are
    verified, not the values of the glimpses.
    """
    # Disclaimer: only check dimensions, not values
    rng = numpy.random.RandomState([2014, 12, 2])

    seq_len = 5
    batch_size = 6
    state_dim = 2
    attended_dim = 3
    match_dim = 4

    attention = SequenceContentAttention(
        state_names=["states"], state_dims=[state_dim],
        attended_dim=attended_dim, match_dim=match_dim,
        weights_init=IsotropicGaussian(0.5),
        biases_init=Constant(0))
    attention.initialize()

    sequences = tensor.tensor3('sequences')
    states = tensor.matrix('states')
    mask = tensor.matrix('mask')
    glimpses, weights = attention.take_glimpses(
        sequences, attended_mask=mask, states=states)
    assert glimpses.ndim == 2
    assert weights.ndim == 2

    seq_values = numpy.zeros((seq_len, batch_size, attended_dim),
                             dtype=theano.config.floatX)
    states_values = numpy.zeros((batch_size, state_dim),
                                dtype=theano.config.floatX)
    mask_values = numpy.zeros((seq_len, batch_size),
                              dtype=theano.config.floatX)
    # randomly generate a sensible mask: every batch column keeps at
    # least its first position switched on
    for seq_idx in range(batch_size):
        mask_values[:rng.randint(1, seq_len), seq_idx] = 1

    glimpses_values, weight_values = theano.function(
        [sequences, states, mask], [glimpses, weights])(
            seq_values, states_values, mask_values)
    assert glimpses_values.shape == (batch_size, attended_dim)
    assert weight_values.shape == (batch_size, seq_len)
    assert numpy.all(weight_values >= 0)
    assert numpy.all(weight_values <= 1)
    # Normalised floating-point weights only sum to one up to rounding,
    # so compare with a tolerance instead of exact equality.
    assert numpy.allclose(weight_values.sum(axis=1), 1)
    # A weight is exactly zero precisely where the mask is zero.
    assert numpy.all((weight_values.T == 0) == (mask_values == 0))
def test_compute_weights_with_zero_mask():
    """Weights computed under an all-zero mask must still be finite."""
    state_dim, attended_dim, match_dim = 2, 3, 4
    attended_length, batch_size = 5, 6
    shape = (attended_length, batch_size)

    attention = SequenceContentAttention(
        state_names=["states"],
        state_dims=[state_dim],
        attended_dim=attended_dim,
        match_dim=match_dim,
        weights_init=IsotropicGaussian(0.5),
        biases_init=Constant(0))
    attention.initialize()

    # Random energies, but every position is masked out.
    random_energies = numpy.random.rand(*shape)
    all_zero_mask = numpy.zeros(shape)
    energies = tensor.as_tensor_variable(random_energies)
    mask = tensor.as_tensor_variable(all_zero_mask)

    weights = attention.compute_weights(energies, mask).eval()
    assert numpy.isfinite(weights).all()
def test_stable_attention_weights():
    """Weights computed from very large energies must still be finite."""
    state_dim, attended_dim, match_dim = 2, 3, 4
    attended_length, batch_size = 5, 6
    shape = (attended_length, batch_size)

    attention = SequenceContentAttention(
        state_names=["states"],
        state_dims=[state_dim],
        attended_dim=attended_dim,
        match_dim=match_dim,
        weights_init=IsotropicGaussian(0.5),
        biases_init=Constant(0))
    attention.initialize()

    # Random high energies with mu=800, sigma=50
    noise = numpy.random.randn(*shape)
    energies_val = (50. * noise + 800).astype(theano.config.floatX)
    energies = tensor.as_tensor_variable(energies_val)
    # Nothing is masked out in this test.
    mask = tensor.as_tensor_variable(numpy.ones(shape))

    weights = attention.compute_weights(energies, mask).eval()
    assert numpy.isfinite(weights).all()
def __init__(self, base_encoder, state_dim=1000, self_attendable=False,
             **kwargs):
    """Set up the attention-driven recurrent transition.

    Args:
        base_encoder (Brick): Low level encoder network which produces
                              annotations to attend to.
        state_dim (int): Size of the recurrent layer. The value stored
                         in ``self.state_dim`` (and used for every
                         brick built here) is twice this number.
        self_attendable (bool): If true, a
                                ``SelfAttendableContentAttention`` is
                                used, so the annotator can attend to
                                its own previous states. If false, a
                                plain ``SequenceContentAttention`` is
                                used and it can only attend to base
                                annotations.
    """
    super(HierarchicalAnnotator, self).__init__(**kwargs)
    self.base_encoder = base_encoder
    self.self_attendable = self_attendable
    self.state_dim = state_dim * 2

    recurrent_core = GatedRecurrent(activation=Tanh(), dim=self.state_dim)

    # Both attention flavours share this configuration.
    shared_config = dict(
        state_names=recurrent_core.apply.states,
        attended_dim=self.state_dim,
        match_dim=self.state_dim,
        name="hier_attention")
    if self_attendable:
        self.attention = SelfAttendableContentAttention(
            num_steps=10, **shared_config)
    else:
        self.attention = SequenceContentAttention(**shared_config)

    self.transition = AttentionRecurrent(
        recurrent_core, self.attention, name="hier_att_trans")
    self.children = [self.transition]
def __init__(self, vocab_size, embedding_dim, dgru_state_dim,
             igru_state_dim, state_dim, representation_dim,
             transition_depth, trg_igru_depth, trg_dgru_depth,
             trg_space_idx, trg_bos, theano_seed=None, **kwargs):
    """Build the stacked transition, attention, interpolator and generator.

    Parameters
    ----------
    vocab_size, embedding_dim : int
        Passed to the interpolator and to the target word encoder.
    dgru_state_dim : int
        State size handed to the target word encoder.
    igru_state_dim : int
        State size handed to the interpolator.
    state_dim : int
        Dimension of every layer of the recurrent stack and of the
        attention match space.
    representation_dim : int
        Dimension of the attended representation.
    transition_depth : int
        Number of layers of the recurrent stack; the first one is a
        ``GRUInitialState``, the remaining ones are ``GatedRecurrent``.
    trg_igru_depth, trg_dgru_depth : int
        Depths forwarded to the interpolator, the target word encoder
        and the sequence generator.
    trg_space_idx
        Forwarded to the sequence generator.
    trg_bos
        Used as the initial output of the softmax emitter.
    theano_seed : optional
        Stored on the instance and passed to the softmax emitter.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.dgru_state_dim = dgru_state_dim
    self.igru_state_dim = igru_state_dim
    self.state_dim = state_dim
    self.trg_space_idx = trg_space_idx
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Recurrent stack: one GRU with a special initial state, followed by
    # plain gated recurrent layers numbered from 1.
    bottom_layer = GRUInitialState(
        attended_dim=state_dim,
        dim=state_dim,
        activation=Tanh(),
        name='decoder_gru_withinit')
    upper_layers = [
        GatedRecurrent(dim=state_dim,
                       activation=Tanh(),
                       name='decoder_gru' + str(level))
        for level in range(1, transition_depth)
    ]
    self.transition = RecurrentStack(
        [bottom_layer] + upper_layers, skip_connections=False)

    # Attention mechanism over the attended representation.
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")

    # The interpolator plays the role of the readout of the generator.
    readout_sources = [
        'states', 'feedback', self.attention.take_glimpses.outputs[0]]
    bos_emitter = SoftmaxEmitter(
        initial_output=trg_bos, theano_seed=theano_seed)
    word_encoder = TargetWordEncoder(
        vocab_size, embedding_dim, self.dgru_state_dim, trg_dgru_depth)
    self.interpolator = Interpolator(
        vocab_size=vocab_size,
        embedding_dim=embedding_dim,
        igru_state_dim=igru_state_dim,
        igru_depth=trg_igru_depth,
        trg_dgru_depth=trg_dgru_depth,
        source_names=readout_sources,
        readout_dim=self.vocab_size,
        emitter=bos_emitter,
        feedback_brick=word_encoder)

    # Sequence generator; its fork feeds every transition input
    # sequence except the mask.
    forked_names = [
        sequence for sequence in self.transition.apply.sequences
        if sequence != 'mask']
    self.sequence_generator = SequenceGeneratorDCNMT(
        trg_space_idx=self.trg_space_idx,
        readout=self.interpolator,
        transition=self.transition,
        attention=self.attention,
        transition_depth=transition_depth,
        igru_depth=trg_igru_depth,
        trg_dgru_depth=trg_dgru_depth,
        fork=Fork(forked_names, prototype=Linear()))

    self.children = [self.sequence_generator]
def test_attention_recurrent():
    """Regression test for ``AttentionRecurrent`` with content attention.

    Checks output ranks and shapes, the parameter count, how the
    attention weights relate to both masks, and finally a set of frozen
    sums. The frozen sums depend on the exact order of the random draws
    below, so statements must not be reordered.
    """
    rng = numpy.random.RandomState(1234)

    dim = 5
    batch_size = 4
    input_length = 20

    attended_dim = 10
    attended_length = 15

    wrapped = SimpleRecurrent(dim, Identity())
    attention = SequenceContentAttention(
        state_names=wrapped.apply.states,
        attended_dim=attended_dim, match_dim=attended_dim)
    recurrent = AttentionRecurrent(wrapped, attention, seed=1234)
    recurrent.weights_init = IsotropicGaussian(0.5)
    recurrent.biases_init = Constant(0)
    recurrent.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")
    outputs = recurrent.apply(
        inputs=inputs, mask=inputs_mask,
        attended=attended, attended_mask=attended_mask)
    states, glimpses, weights = outputs
    assert states.ndim == 3
    assert glimpses.ndim == 3
    assert weights.ndim == 3

    # Helper producing uniform random values of the given shape.
    def rand(size):
        return rng.uniform(size=size).astype(floatX)

    # Helper producing a mask: ones followed by zeros in every column.
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=floatX)
        # To make it look like real data: each column is switched off
        # from a random position (at least 1) onwards.
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    input_vals = rand((input_length, batch_size, dim))
    input_mask_vals = generate_mask(input_length, batch_size)
    attended_vals = rand((attended_length, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_length, batch_size)

    func = theano.function(
        [inputs, inputs_mask, attended, attended_mask],
        [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(
        input_vals, input_mask_vals,
        attended_vals, attended_mask_vals)
    assert states_vals.shape == (input_length, batch_size, dim)
    assert glimpses_vals.shape == (input_length, batch_size, attended_dim)

    # Every shared variable of the graph must be a parameter of the brick.
    assert (len(ComputationGraph(outputs).shared_variables) ==
            len(Selector(recurrent).get_params()))

    # Weights at positions where the attended mask is 0 must be zero.
    assert numpy.all(weight_vals * (1 - attended_mask_vals.T) == 0)
    # Weights at positions where the attended mask is 1 must be non-zero.
    assert numpy.all(abs(weight_vals + (1 - attended_mask_vals.T)) > 1e-5)
    # weights from different steps should be noticeably different
    assert (abs(weight_vals[0] - weight_vals[1])).sum() > 1e-2
    # From the first step where the input mask is off (index `last`, the
    # number of unmasked input steps) onwards, the weights must not change.
    for i in range(batch_size):
        last = int(input_mask_vals[:, i].sum())
        for j in range(last, input_length):
            assert_allclose(weight_vals[last, i], weight_vals[j, i])

    # Frozen sums: regression values for the seeds used above.
    assert_allclose(weight_vals.sum(), input_length * batch_size, 1e-5)
    assert_allclose(states_vals.sum(), 113.429, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 415.901, rtol=1e-5)
def main(mode, save_path, num_batches, from_dump):
    """Train the word-reversing model or sample from a saved one.

    Parameters
    ----------
    mode : str
        ``"train"`` builds the model and runs the training main loop;
        ``"test"`` loads saved bricks and enters an interactive sampling
        loop. Any other value makes the function return without doing
        anything.
    save_path : str
        In training mode: where the main loop is serialized; its base
        name is also used for the plot. In test mode: the file the
        bricks are loaded from.
    num_batches : int
        Number of batches after which training finishes.
    from_dump
        If truthy, a ``LoadFromDump`` extension for this dump is added
        in front of the other extensions.

    """
    if mode == "train":
        # Experiment configuration
        dimension = 100
        readout_dimension = len(char2code)

        # Data processing pipeline (read inside-out): load the corpus,
        # keep examples whose first source has at most 100 items, add the
        # "targets" source via `reverse_words`, batch by 10, pad, and
        # finally transpose every array.
        data_stream = DataStreamMapping(
            mapping=lambda data: tuple(array.T for array in data),
            data_stream=PaddingDataStream(
                BatchDataStream(
                    iteration_scheme=ConstantScheme(10),
                    data_stream=DataStreamMapping(
                        mapping=reverse_words,
                        add_sources=("targets", ),
                        data_stream=DataStreamFilter(
                            predicate=lambda data: len(data[0]) <= 100,
                            data_stream=OneBillionWord(
                                "training", [99], char2code,
                                level="character",
                                preprocess=str.lower)
                            .get_default_stream())))))

        # Build the model
        chars = tensor.lmatrix("features")
        chars_mask = tensor.matrix("features_mask")
        targets = tensor.lmatrix("targets")
        targets_mask = tensor.matrix("targets_mask")

        encoder = Bidirectional(
            GatedRecurrent(dim=dimension, activation=Tanh()),
            weights_init=Orthogonal())
        encoder.initialize()
        fork = Fork(
            [name for name in encoder.prototype.apply.sequences
             if name != 'mask'],
            weights_init=IsotropicGaussian(0.1),
            biases_init=Constant(0))
        fork.input_dim = dimension
        fork.fork_dims = {name: dimension for name in fork.fork_names}
        fork.initialize()
        lookup = LookupTable(readout_dimension, dimension,
                             weights_init=IsotropicGaussian(0.1))
        lookup.initialize()
        transition = Transition(
            activation=Tanh(),
            dim=dimension, attended_dim=2 * dimension, name="transition")
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            match_dim=dimension, name="attention")
        readout = LinearReadout(
            readout_dim=readout_dimension, source_names=["states"],
            emitter=SoftmaxEmitter(name="emitter"),
            feedbacker=LookupFeedback(readout_dimension, dimension),
            name="readout")
        generator = SequenceGenerator(
            readout=readout, transition=transition, attention=attention,
            weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
            name="generator")
        # Push the generator-wide initialization down first, then override
        # the transition's weights before the actual initialization.
        generator.push_initialization_config()
        transition.weights_init = Orthogonal()
        generator.initialize()
        bricks = [encoder, fork, lookup, generator]

        # Give an idea of what's going on
        params = Selector(bricks).get_params()
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in params.items()],
                        width=120))

        # Build the cost computation graph
        batch_cost = generator.cost(
            targets, targets_mask,
            attended=encoder.apply(
                **dict_union(
                    fork.apply(lookup.lookup(chars), return_dict=True),
                    mask=chars_mask)),
            attended_mask=chars_mask).sum()
        batch_size = named_copy(chars.shape[1], "batch_size")
        cost = aggregation.mean(batch_cost, batch_size)
        cost.name = "sequence_log_likelihood"
        logger.info("Cost graph is built")

        # Fetch variables useful for debugging
        max_length = named_copy(chars.shape[0], "max_length")
        cost_per_character = named_copy(
            aggregation.mean(batch_cost, batch_size * max_length),
            "character_log_likelihood")
        cg = ComputationGraph(cost)
        energies = unpack(
            VariableFilter(application=readout.readout, name="output")(
                cg.variables),
            singleton=True)
        min_energy = named_copy(energies.min(), "min_energy")
        max_energy = named_copy(energies.max(), "max_energy")
        (activations, ) = VariableFilter(
            application=generator.transition.apply,
            name="states")(cg.variables)
        mean_activation = named_copy(activations.mean(), "mean_activation")

        # Define the training algorithm.
        algorithm = GradientDescent(
            cost=cost, step_rule=CompositeRule([GradientClipping(10.0),
                                                SteepestDescent(0.01)]))

        observables = [
            cost, min_energy, max_energy, mean_activation,
            batch_size, max_length, cost_per_character,
            algorithm.total_step_norm, algorithm.total_gradient_norm]
        # Also monitor the norm of every parameter and of its gradient.
        for name, param in params.items():
            observables.append(named_copy(
                param.norm(2), name + "_norm"))
            observables.append(named_copy(
                algorithm.gradients[param].norm(2), name + "_grad_norm"))

        main_loop = MainLoop(
            model=bricks,
            data_stream=data_stream,
            algorithm=algorithm,
            extensions=([LoadFromDump(from_dump)] if from_dump else []) +
            [Timing(),
             TrainingDataMonitoring(observables, after_every_batch=True),
             TrainingDataMonitoring(observables, prefix="average",
                                    every_n_batches=10),
             # Stop after `num_batches`, or as soon as the gradient norm
             # becomes NaN.
             FinishAfter(after_n_batches=num_batches)
             .add_condition(
                 "after_batch",
                 lambda log:
                     math.isnan(log.current_row.total_gradient_norm)),
             Plot(os.path.basename(save_path),
                  [["average_" + cost.name],
                   ["average_" + cost_per_character.name]],
                  every_n_batches=10),
             SerializeMainLoop(save_path, every_n_batches=500,
                               save_separately=["model", "log"]),
             Printing(every_n_batches=1)])
        main_loop.run()
    elif mode == "test":
        # NOTE(review): unpickling executes arbitrary code; only load
        # files from a trusted source.
        with open(save_path, "rb") as source:
            encoder, fork, lookup, generator = dill.load(source)
        logger.info("Model is loaded")
        chars = tensor.lmatrix("features")
        generated = generator.generate(
            n_steps=3 * chars.shape[0], batch_size=chars.shape[1],
            attended=encoder.apply(
                **dict_union(fork.apply(lookup.lookup(chars),
                             return_dict=True))),
            attended_mask=tensor.ones(chars.shape))
        sample_function = ComputationGraph(generated).get_theano_function()
        # Use the module logger like every other message in this function
        # (this one previously went to the root logger).
        logger.info("Sampling function is compiled")

        while True:
            # Python 2-3 compatibility
            line = input("Enter a sentence\n")
            batch_size = int(input("Enter a number of samples\n"))
            encoded_input = [char2code.get(char, char2code["<UNK>"])
                             for char in line.lower().strip()]
            encoded_input = ([char2code['<S>']] + encoded_input +
                             [char2code['</S>']])
            print("Encoder input:", encoded_input)
            target = reverse_words((encoded_input,))[0]
            print("Target: ", target)
            # The same input is repeated once per requested sample.
            states, samples, glimpses, weights, costs = sample_function(
                numpy.repeat(numpy.array(encoded_input)[:, None],
                             batch_size, axis=1))

            messages = []
            for i in range(samples.shape[1]):
                sample = list(samples[:, i])
                # Cut the sample right after the first end-of-sequence
                # code, if there is one.
                try:
                    true_length = sample.index(char2code['</S>']) + 1
                except ValueError:
                    true_length = len(sample)
                sample = sample[:true_length]
                cost = costs[:true_length, i].sum()
                message = "({})".format(cost)
                message += "".join(code2char[code] for code in sample)
                if sample == target:
                    message += " CORRECT!"
                messages.append((cost, message))
            # Print samples in order of decreasing cost.
            messages.sort(key=lambda tuple_: -tuple_[0])
            for _, message in messages:
                print(message)
def test_attention_transition():
    """Shape checks for ``AttentionTransition`` and a generator using it.

    All numeric inputs are zeros and all masks are ones, so only ranks
    and shapes are verified.
    """
    inp_dim, inp_len = 2, 10
    attended_dim, attended_len = 3, 11
    batch_size = 4
    n_steps = 30

    transition = TestTransition(
        dim=inp_dim, attended_dim=attended_dim, name="transition")
    attention = SequenceContentAttention(
        transition.apply.states, match_dim=inp_dim, name="attention")
    mixed_sequences = [
        sequence for sequence in transition.apply.sequences
        if sequence != 'mask']
    mixer = Mixer(
        mixed_sequences, attention.take_look.outputs[0], name="mixer")
    att_trans = AttentionTransition(
        transition, attention, mixer, name="att_trans")
    att_trans.weights_init = IsotropicGaussian(0.01)
    att_trans.biases_init = Constant(0)
    att_trans.initialize()

    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    inputs = tensor.tensor3("inputs")
    inputs_mask = tensor.matrix("inputs_mask")

    applied = att_trans.apply(
        input_=inputs, mask=inputs_mask,
        attended=attended, attended_mask=attended_mask)
    states, glimpses, weights = applied
    for symbolic in (states, glimpses, weights):
        assert symbolic.ndim == 3

    input_vals = numpy.zeros(
        (inp_len, batch_size, inp_dim), dtype=floatX)
    input_mask_vals = numpy.ones((inp_len, batch_size), dtype=floatX)
    attended_vals = numpy.zeros(
        (attended_len, batch_size, attended_dim), dtype=floatX)
    attended_mask_vals = numpy.ones(
        (attended_len, batch_size), dtype=floatX)

    func = theano.function(
        [inputs, inputs_mask, attended, attended_mask],
        [states, glimpses, weights])
    states_vals, glimpses_vals, weight_vals = func(
        input_vals, input_mask_vals, attended_vals, attended_mask_vals)

    assert states_vals.shape == input_vals.shape
    assert glimpses_vals.shape == (inp_len, batch_size, attended_dim)
    assert weight_vals.shape == (inp_len, batch_size, attended_len)

    # Test SequenceGenerator using AttentionTransition
    readout = LinearReadout(
        readout_dim=inp_dim,
        source_names=["state"],
        emitter=TestEmitter(name="emitter"),
        name="readout")
    generator = SequenceGenerator(
        readout,
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator")

    outputs = tensor.tensor3('outputs')
    costs = generator.cost(
        outputs, attended=attended, attended_mask=attended_mask)
    cost_feed = {
        outputs: input_vals,
        attended: attended_vals,
        attended_mask: attended_mask_vals,
    }
    costs_vals = costs.eval(cost_feed)
    assert costs_vals.shape == (inp_len, batch_size)

    results = generator.generate(
        n_steps=n_steps, batch_size=attended.shape[1],
        attended=attended, attended_mask=attended_mask)
    assert len(results) == 5

    generate = theano.function([attended, attended_mask], results)
    (states_vals, outputs_vals, glimpses_vals,
     weights_vals, costs_vals) = generate(attended_vals, attended_mask_vals)

    assert states_vals.shape == (n_steps, batch_size, inp_dim)
    assert states_vals.shape == outputs_vals.shape
    assert glimpses_vals.shape == (n_steps, batch_size, attended_dim)
    assert weights_vals.shape == (n_steps, batch_size, attended_len)
    assert costs_vals.shape == (n_steps, batch_size)
def test_with_attention():
    """Test a sequence generator with continuous outputs and attention.

    Both the ``cost_matrix`` and the ``generate`` methods are exercised.
    Besides shapes, the test compares against frozen sums that depend on
    the exact order of the random draws below, so statements must not be
    reordered.
    """
    rng = numpy.random.RandomState(1234)

    inp_dim = 2
    inp_len = 10
    attended_dim = 3
    attended_len = 11
    batch_size = 4
    n_steps = 30

    # Helper producing uniform random values of the given shape.
    def rand(size):
        return rng.uniform(size=size).astype(floatX)

    # Helper producing a mask: ones followed by zeros in every column.
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=floatX)
        # To make it look like real data: each column is switched off
        # from a random position (at least 1) onwards.
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    output_vals = rand((inp_len, batch_size, inp_dim))
    output_mask_vals = generate_mask(inp_len, batch_size)
    attended_vals = rand((attended_len, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_len, batch_size)

    transition = TestTransition(
        dim=inp_dim, attended_dim=attended_dim, activation=Identity())
    attention = SequenceContentAttention(
        state_names=transition.apply.states, match_dim=inp_dim)
    generator = SequenceGenerator(
        Readout(
            readout_dim=inp_dim,
            source_names=[transition.apply.states[0],
                          attention.take_glimpses.outputs[0]],
            emitter=TestEmitter()),
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        add_contexts=False, seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    outputs = tensor.tensor3('outputs')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(outputs, mask,
                                  attended=attended,
                                  attended_mask=attended_mask)
    costs_vals = costs.eval({outputs: output_vals,
                             mask: output_mask_vals,
                             attended: attended_vals,
                             attended_mask: attended_mask_vals})
    assert costs_vals.shape == (inp_len, batch_size)
    # Frozen regression value for the seeds used above.
    assert_allclose(costs_vals.sum(), 13.5042, rtol=1e-5)

    # Test `generate` method
    results = (
        generator.generate(n_steps=n_steps, batch_size=attended.shape[1],
                           attended=attended, attended_mask=attended_mask))
    assert len(results) == 5
    states_vals, outputs_vals, glimpses_vals, weights_vals, costs_vals = (
        theano.function([attended, attended_mask], results)
        (attended_vals, attended_mask_vals))
    assert states_vals.shape == (n_steps, batch_size, inp_dim)
    assert states_vals.shape == outputs_vals.shape
    assert glimpses_vals.shape == (n_steps, batch_size, attended_dim)
    assert weights_vals.shape == (n_steps, batch_size, attended_len)
    assert costs_vals.shape == (n_steps, batch_size)
    # Frozen regression values for the seeds used above.
    assert_allclose(states_vals.sum(), 23.4172, rtol=1e-5)
    # There is no generation cost in this case, since generation is
    # deterministic
    assert_allclose(costs_vals.sum(), 0.0, rtol=1e-5)
    # 120 equals n_steps * batch_size (30 * 4).
    assert_allclose(weights_vals.sum(), 120.0, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 199.2402, rtol=1e-5)
    assert_allclose(outputs_vals.sum(), -11.6008, rtol=1e-5)
skip_connections=True) emitter = BivariateGMMEmitter(k=k) source_names = [name for name in transition.apply.states if 'states' in name] #68 characters from blocks.bricks.attention import SequenceContentAttention from blocks.bricks.lookup import LookupTable lookup = LookupTable(68, 100) embed = lookup.apply(context) attention = SequenceContentAttention( state_names=source_names, attended_dim=100, #or is it 68 match_dim=30, name="attention") readout = Readout(readout_dim=readout_size, source_names=source_names + [attention.take_glimpses.outputs[0]], emitter=emitter, name="readout") generator = SequenceGenerator(readout=readout, attention=attention, transition=transition, name="generator") generator.weights_init = IsotropicGaussian(0.01)
def __init__(self, input_dims, input_num_chars, bos_label, eos_label,
             num_labels, dim_dec, dims_bidir, enc_transition,
             dec_transition, use_states_for_readout, attention_type,
             criterion, bottom, lm=None, token_map=None, bidir=True,
             window_size=None, max_length=None, subsample=None,
             dims_top=None, extra_input_dim=None, prior=None, conv_n=None,
             post_merge_activation=None, post_merge_dims=None,
             dim_matcher=None, embed_outputs=True,
             dim_output_embedding=None,
             reuse_bottom_lookup_table=False, dec_stack=1,
             conv_num_filters=1, data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=1,
             # for criterions involving generation of outputs, whether
             # or not they should be generated by the recognizer itself
             generate_predictions=True, compute_targets=True,
             extra_generation_steps=3, **kwargs):
    """Assemble the bottom, encoder, top and attention-based generator.

    Only what this method visibly does with its arguments is described.

    Parameters
    ----------
    bottom : dict
        Must contain 'bottom_class'. That entry is popped (the caller's
        dict is mutated) and the remaining entries are passed to the class
        as keyword arguments together with `input_dims` and
        `input_num_chars`.
    dims_bidir, window_size
        Select the encoder: a truthy `dims_bidir` builds an ``Encoder``,
        otherwise a truthy `window_size` builds a ``ConvEncoder``.
    dims_top
        When truthy, an ``MLP`` "top" is built; otherwise ``Identity``.
    dec_stack
        1 builds a single `dec_transition`; any other value builds a
        ``RecurrentStack`` of `dec_stack` transitions and requires
        `extra_input_dim` to be falsy.
    attention_type
        "content" or "content_and_conv".
    criterion : dict
        'name' selects the readout class ('log_likelihood', 'critic',
        'reinforce', 'sarsa' or 'actor_critic'). When a 'reward' entry is
        present and the name is not 'log_likelihood', a reward brick is
        added to the readout configuration. The remaining entries are
        read with ``criterion.get``.
    dim_matcher
        Defaults to `dim_dec` when None.
    post_merge_activation
        Defaults to ``Tanh()`` when None.
    **kwargs
        Forwarded to the parent constructor.

    Raises
    ------
    ValueError
        If neither `dims_bidir` nor `window_size` is truthy; for an
        unknown `attention_type`, reward type or criterion name; if
        `embed_outputs` is false; or if `lm` is truthy.
    """
    # Snapshot of the constructor arguments. It has to be taken before any
    # other local is created, because locals() captures every local name.
    # For the 'actor_critic' criterion it is reused below to build the
    # critic as a second EncoderDecoder.
    all_arguments = copy.deepcopy(locals())
    all_arguments.update(copy.deepcopy(kwargs))
    del all_arguments['kwargs']
    del all_arguments['self']

    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(EncoderDecoder, self).__init__(**kwargs)
    self.bos_label = bos_label
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion
    self.generate_predictions = generate_predictions
    self.extra_generation_steps = extra_generation_steps
    self.compute_targets = compute_targets

    self.max_decoded_length_scale = max_decoded_length_scale

    # NOTE(review): this self-assignment has no effect.
    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN.
    # pop() mutates the caller's dict; `all_arguments` still holds the
    # 'bottom_class' entry because it was deep-copied above.
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(
        input_dims=input_dims, input_num_chars=input_num_chars,
        name='bottom',
        **bottom)

    # BiRNN
    if dims_bidir:
        if not subsample:
            # Default: a factor of 1 for every encoder layer
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
    elif window_size:
        encoder = ConvEncoder(
            max_length, bottom.get_dim(bottom.apply.outputs[0]), window_size)
    else:
        raise ValueError("Don't know which Encoder to use")
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [dim_encoded] + dims_top + [dim_encoded], name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        # Extra inputs are not combined with a stacked decoder
        assert not extra_input_dim
        transitions = [self.dec_transition(dim=dim_dec,
                                           activation=Tanh(),
                                           name="transition_{}".format(trans_level))
                       for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)
    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=dim_encoded, match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=dim_encoded, match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError("Unknown attention type {}"
                         .format(attention_type))
    if not embed_outputs:
        raise ValueError("embed_outputs=False is not supported any more")
    if not reuse_bottom_lookup_table:
        embedding = LookupTable(num_labels + 1,
                                dim_dec if
                                dim_output_embedding is None
                                else dim_output_embedding)
    else:
        # Reuse the first child brick of the bottom as the output embedding
        embedding = bottom.children[0]
    feedback = Feedback(
        embedding=embedding,
        output_names=[s for s in transition.apply.sequences
                      if s != 'mask'])

    # Create a readout
    readout_config = dict(
        num_tokens=num_labels,
        input_names=(transition.apply.states if use_states_for_readout else [])
                    + [attention.take_glimpses.outputs[0]],
        name="readout")
    if post_merge_dims:
        readout_config['merge_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence([
            Bias(post_merge_dims[0]).apply,
            post_merge_activation.apply,
            MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                # MLP was designed to support Maxout as an activation
                # (because Maxout in a way is not one). However
                # a single layer Maxout network works with the trick below.
                # For a deeper Maxout network one has to use the
                # Sequence brick.
                [d//getattr(post_merge_activation, 'num_pieces', 1)
                 for d in post_merge_dims] + [num_labels]).apply,
        ], name='post_merge')
    # Reward brick, for every reward-based criterion
    if 'reward' in criterion and criterion['name'] != 'log_likelihood':
        if criterion['reward'] == 'edit_distance':
            readout_config['reward_brick'] = EditDistanceReward(
                self.bos_label, self.eos_label)
        elif criterion['reward'] == 'delta_edit_distance':
            readout_config['reward_brick'] = EditDistanceReward(
                self.bos_label, self.eos_label, deltas=True)
        elif criterion['reward'] == 'bleu':
            readout_config['reward_brick'] = BleuReward(
                self.bos_label, self.eos_label, deltas=False)
        elif criterion['reward'] == 'delta_bleu':
            readout_config['reward_brick'] = BleuReward(
                self.bos_label, self.eos_label, deltas=True)
        else:
            raise ValueError("Unknown reward type")
    # Readout class and its criterion-specific configuration
    if criterion['name'] == 'log_likelihood':
        readout_class = SoftmaxReadout
    elif criterion['name'] == 'critic':
        readout_class = CriticReadout
        # Every criterion entry except 'name' becomes a readout argument
        criterion_copy = dict(criterion)
        del criterion_copy['name']
        readout_config.update(**criterion_copy)
    elif criterion['name'] == 'reinforce':
        readout_class = ReinforceReadout
        # merge_names is copied before 'attended' and 'attended_mask'
        # are appended to input_names
        readout_config['merge_names'] = list(readout_config['input_names'])
        readout_config['entropy'] = criterion.get('entropy')
        readout_config['input_names'] += ['attended', 'attended_mask']
    elif criterion['name'] in ['sarsa', 'actor_critic']:
        readout_class = ActorCriticReadout
        if criterion['name'] == 'actor_critic':
            # The critic is another EncoderDecoder built from a modified
            # copy of this constructor's own arguments.
            critic_arguments = dict(all_arguments)
            # No worries, critic will not compute log likelihood values.
            critic_arguments['criterion'] = {
                'name': 'critic',
                'value_softmax': criterion.get('value_softmax'),
                'same_value_for_wrong': criterion.get('same_value_for_wrong'),
                'groundtruth_word_bonus': criterion.get('groundtruth_word_bonus'),
                'dueling_outputs': criterion.get('dueling_outputs')}
            critic_arguments['name'] = 'critic'
            if criterion.get('critic_uses_actor_states'):
                critic_arguments['extra_input_dim'] = dim_dec
            if (criterion.get('value_softmax')
                    or criterion.get('same_value_for_wrong')
                    or criterion.get('dueling_outputs')):
                # Add an extra output for the critic
                critic_arguments['num_labels'] = num_labels + 1
            if criterion.get('force_bidir'):
                critic_arguments['dims_bidir'] = [dim_dec]
            critic_arguments['reuse_bottom_lookup_table'] = True
            critic_arguments['input_num_chars'] = {'inputs': num_labels}
            if criterion.get('downsize_critic'):
                critic_arguments = _downsize_config(
                    critic_arguments, criterion['downsize_critic'])
            critic = EncoderDecoder(**critic_arguments)
            readout_config['critic'] = critic
        readout_config['merge_names'] = list(readout_config['input_names'])
        readout_config['freeze_actor'] = criterion.get('freeze_actor')
        readout_config['freeze_critic'] = criterion.get('freeze_critic')
        readout_config['critic_uses_actor_states'] = criterion.get('critic_uses_actor_states')
        readout_config['critic_uses_groundtruth'] = criterion.get('critic_uses_groundtruth')
        readout_config['critic_burnin_steps'] = criterion.get('critic_burnin_steps')
        readout_config['critic_loss'] = criterion.get('critic_loss')
        readout_config['discount'] = criterion.get('discount')
        readout_config['entropy_reward_coof'] = criterion.get('entropy_reward_coof')
        readout_config['cross_entropy_reward_coof'] = criterion.get('cross_entropy_reward_coof')
        readout_config['value_penalty'] = criterion.get('value_penalty')
        readout_config['value_penalty_type'] = criterion.get('value_penalty_type')
        readout_config['critic_policy_t'] = criterion.get('critic_policy_t')
        readout_config['bos_token'] = bos_label
        readout_config['accumulate_outputs'] = criterion.get('accumulate_outputs')
        readout_config['use_value_biases'] = criterion.get('use_value_biases')
        readout_config['actor_grad_estimate'] = criterion.get('actor_grad_estimate')
        readout_config['input_names'] += ['attended', 'attended_mask']
        # Note, that settings below are for the "clean" mode.
        # When get_cost_graph() is run with training=True, they
        # are temporarily overridden with the "real" settings from
        # "criterion"
        readout_config['compute_targets'] = True
        readout_config['trpo_coef'] = 0.0
        readout_config['solve_bellman'] = True
    else:
        raise ValueError("Unknown criterion {}".format(criterion['name']))
    readout = readout_class(**readout_config)

    if lm:
        raise ValueError("LM is currently not supported")

    recurrent = AttentionRecurrent(transition, attention)
    if extra_input_dim:
        # Wrap the recurrent brick so that it also takes "extra_inputs"
        recurrent = RecurrentWithExtraInput(
            recurrent, "extra_inputs", extra_input_dim, name="with_extra_inputs")
    generator = SequenceGenerator(
        recurrent=recurrent, readout=readout, feedback=feedback,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.softmax = Softmax()
    self.children = [encoder, top, bottom, generator, self.softmax]

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask

    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.predicted_labels = tensor.lmatrix('predicted_labels')
    self.predicted_mask = tensor.matrix('predicted_mask')
    self.prefix_labels = tensor.lmatrix('prefix_labels')
    self.prefix_steps = tensor.lscalar('prefix_steps')

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.single_predicted_labels = tensor.lvector('predicted_labels')
    self.n_steps = tensor.lscalar('n_steps')

    # Configure mixed_generate: its states, outputs and contexts are the
    # generator's own plus the critic's, the latter prefixed with 'critic_'
    if criterion['name'] == 'actor_critic':
        critic = self.generator.readout.critic
        self.mixed_generate.sequences = []
        self.mixed_generate.states = (
            ['step'] +
            self.generator.recurrent.apply.states +
            ['critic_' + name for name in critic.generator.recurrent.apply.states])
        self.mixed_generate.outputs = (
            ['samples', 'step'] +
            self.generator.recurrent.apply.outputs +
            ['critic_' + name for name in critic.generator.recurrent.apply.outputs])
        self.mixed_generate.contexts = (
            self.generator.recurrent.apply.contexts +
            ['critic_' + name for name in critic.generator.recurrent.apply.contexts] +
            ['groundtruth', 'groundtruth_mask'])
        self.initial_states.outputs = self.mixed_generate.states

    # Configure prefix_generate from the generator's recurrent brick alone
    self.prefix_generate.sequences = []
    self.prefix_generate.states = ['step'] + self.generator.recurrent.apply.states
    self.prefix_generate.outputs = ['samples', 'step'] + self.generator.recurrent.apply.outputs
    self.prefix_generate.contexts = self.generator.recurrent.apply.contexts
def __init__(
        self, input_dims, input_num_chars, eos_label, num_phonemes, dim_dec,
        dims_bidir, enc_transition, dec_transition, use_states_for_readout,
        attention_type, criterion, bottom, lm=None, character_map=None,
        bidir=True, subsample=None, dims_top=None, prior=None, conv_n=None,
        post_merge_activation=None, post_merge_dims=None, dim_matcher=None,
        embed_outputs=True, dim_output_embedding=None, dec_stack=1,
        conv_num_filters=1, data_prepend_eos=True,
        # softmax is the default set in SequenceContentAndConvAttention
        energy_normalizer=None,
        # for speech this is the approximate phoneme duration in frames
        max_decoded_length_scale=1, **kwargs):
    """Build the shared bottom/encoder and two attention-based generators.

    The decoder side (top, transition, attention, readout, generator) is
    built twice, with brick names suffixed by the generator index 0 or 1.

    Parameters
    ----------
    bottom : dict
        Must contain 'bottom_class'. That entry is popped (the caller's
        dict is mutated) and the remaining entries are passed to the class
        as keyword arguments.
    attention_type
        "content" or "content_and_conv".
    criterion : dict
        'name' must be 'log_likelihood' or start with 'mse'. For the
        latter, 'min_reward' is read with a default of -1.0.
    lm : dict or None
        When truthy and holding a truthy 'path', a ``LanguageModel`` and a
        ``ShallowFusionReadout`` are created for each generator. The
        entries 'weight', 'normalize_am_weights', 'normalize_lm_weights',
        'normalize_tot_weights' and 'am_beta' configure the readout; all
        remaining entries are passed to ``LanguageModel``.
    dim_matcher
        Defaults to `dim_dec` when None.
    post_merge_activation
        Defaults to ``Tanh()`` when None.
    **kwargs
        Forwarded to the parent constructor.

    Raises
    ------
    ValueError
        For an unknown `attention_type` or criterion name.
    """
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion

    self.max_decoded_length_scale = max_decoded_length_scale

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(input_dims=input_dims,
                          input_num_chars=input_num_chars,
                          name='bottom',
                          **bottom)

    # BiRNN
    if not subsample:
        # Default: a factor of 1 for every encoder layer
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      bottom.get_dim(bottom.apply.outputs[0]),
                      subsample, bidir=bidir)
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    generators = [None, None]
    for i in range(2):
        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded],
                      name="top{}".format(i))
        else:
            top = Identity(name='top{}'.format(i))

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition{}".format(i))
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}_{}".format(
                                        i, trans_level))
                for trans_level in xrange(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            # The name is built with format(): `"cont_att" + i` would add
            # a str to an int and raise TypeError.
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded,
                match_dim=dim_matcher,
                name="cont_att{}".format(i))
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded,
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att{}".format(i))
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(
                num_phonemes + 1, dim_dec
                if dim_output_embedding is None else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter{}".format(i))
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(criterion['name'],
                                              eos_label,
                                              num_phonemes,
                                              criterion.get(
                                                  'min_reward', -1.0),
                                              name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(
                criterion['name']))
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states
                          if use_states_for_readout else []) +
            [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout{}".format(i))
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was designed to support Maxout as an
                        # activation (because Maxout in a way is not
                        # one). However a single layer Maxout network
                        # works with the trick below. For a deeper Maxout
                        # network one has to use the Sequence brick.
                        [
                            d // getattr(post_merge_activation,
                                         'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge{}'.format(i))
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            # Pop from a per-generator copy. Popping from `lm` itself
            # would strip these settings on the first pass and leave the
            # second generator with the defaults.
            lm_config = dict(lm)
            lm_weight = lm_config.pop('weight', 0.0)
            normalize_am_weights = lm_config.pop(
                'normalize_am_weights', True)
            normalize_lm_weights = lm_config.pop(
                'normalize_lm_weights', False)
            normalize_tot_weights = lm_config.pop(
                'normalize_tot_weights', False)
            am_beta = lm_config.pop('am_beta', 1.0)
            # The three flags are summed as integers: this warns when
            # none of them is set.
            if (normalize_am_weights + normalize_lm_weights +
                    normalize_tot_weights) < 1:
                logger.warning(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map,
                                           **lm_config)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generators[i] = SequenceGenerator(readout=readout,
                                          transition=transition,
                                          attention=attention,
                                          language_model=language_model,
                                          name="generator{}".format(i))

    self.generator = generators[0]

    self.forward_to_backward = Linear(dim_dec, dim_dec)

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    # NOTE(review): only the `top` built in the last loop iteration is
    # kept here and in `children`; confirm this is intended.
    self.top = top
    self.generators = generators
    self.children = [self.forward_to_backward, encoder, top, bottom
                     ] + generators

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask
    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.n_steps = tensor.lscalar('n_steps')
def main(config):
    """Build an attention-based encoder-decoder and run its training loop.

    Parameters
    ----------
    config : dict
        Keys read here: 'train_src', 'dev_src', 'test_src', 'train_tgt',
        'dev_tgt' (corpus files), 'embed_src', 'embed_tgt', 'hidden_src',
        'hidden_tgt' (dimensions), 'dropout', 'finish_after', 'saveto',
        'save_freq', 'reload', 'step_clipping' and 'step_rule'.
        'step_rule' is a string evaluated with ``eval`` to obtain a step
        rule class, so it must come from a trusted source.
    """
    vocab_src, _ = text_to_dict([config['train_src'],
                                 config['dev_src'], config['test_src']])
    vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
                                     config['dev_tgt']])

    # Create Theano variables
    logger.info('Creating theano variables')
    source_sentence = tensor.lmatrix('source')
    source_sentence_mask = tensor.matrix('source_mask')
    target_sentence = tensor.lmatrix('target')
    target_sentence_mask = tensor.matrix('target_mask')

    source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
                                      [1, 4, 8, 4, 8, 4, 8]]
    source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
                                           [1, 0, 1, 0, 1, 0, 1]]
    target_sentence.tag.test_value = [[0, 1, 1, 5],
                                      [2, 0, 1, 0]]
    target_sentence_mask.tag.test_value = [[0, 1, 1, 0],
                                           [1, 1, 1, 0]]

    def lstm_bias_init(dim):
        # Four dim-sized segments; only the third one starts at 1.0
        return np.asarray([0.0] * dim + [0.0] * dim +
                          [1.0] * dim + [0.0] * dim)

    logger.info('Building RNN encoder-decoder')
    # Building Encoder
    embedder = LookupTable(
        length=len(vocab_src),
        dim=config['embed_src'],
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='embedder')
    transformer = Linear(
        config['embed_src'],
        config['hidden_src'] * 4,
        weights_init=IsotropicGaussian(),
        biases_init=Constant(0.0),
        name='transformer')

    encoder = Bidirectional(
        LSTM(
            dim=config['hidden_src'],
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(lstm_bias_init(config['hidden_src']))),
        name='encoderBiLSTM')
    encoder.prototype.weights_init = Orthogonal()

    # Building Decoder
    transition = LSTM2GO(
        attended_dim=config['hidden_tgt'],
        dim=config['hidden_tgt'],
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(lstm_bias_init(config['hidden_tgt'])),
        name='decoderLSTM')

    attention = SequenceContentAttention(
        state_names=transition.apply.states,  # default activation is Tanh
        state_dims=[config['hidden_tgt']],
        attended_dim=config['hidden_src'] * 2,
        match_dim=config['hidden_tgt'],
        name="attention")

    readout = Readout(
        source_names=['states',
                      'feedback',
                      attention.take_glimpses.outputs[0]],
        readout_dim=len(vocab_tgt),
        emitter=SoftmaxEmitter(
            name='emitter'),
        feedback_brick=LookupFeedback(
            num_outputs=len(vocab_tgt),
            feedback_dim=config['embed_tgt'],
            name='feedback'),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=config['hidden_tgt'],
                 name='softmax_bias').apply,
            Linear(input_dim=config['hidden_tgt'],
                   output_dim=config['embed_tgt'],
                   use_bias=False,
                   name='softmax0').apply,
            Linear(input_dim=config['embed_tgt'],
                   name='softmax1').apply]),
        merged_dim=config['hidden_tgt'])

    decoder = SequenceGenerator(
        readout=readout,
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator",
        fork=Fork(
            [name for name in transition.apply.sequences if name != 'mask'],
            prototype=Linear()),
        add_contexts=True)
    decoder.transition.weights_init = Orthogonal()

    # Initialize model
    logger.info('Initializing model')
    embedder.initialize()
    transformer.initialize()
    encoder.initialize()
    decoder.initialize()

    # Apply model
    embedded = embedder.apply(source_sentence)
    transformed = transformer.apply(embedded)
    encoded = encoder.apply(transformed)[0]
    generated = decoder.generate(
        n_steps=2 * source_sentence.shape[1],
        batch_size=source_sentence.shape[0],
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=tensor.ones(source_sentence.shape).T)
    # Single-argument form gives the same output as the old print statement
    print('Generated:  {}'.format(generated))
    # samples = generated[1]  # For GRU
    samples = generated[2]  # For LSTM
    samples.name = 'samples'
    # samples_cost = generated[4]  # For GRU
    samples_cost = generated[5]  # For LSTM
    # Name the variable; plain assignment of the string would replace it.
    samples_cost.name = 'sampling_cost'
    cost = decoder.cost(
        mask=target_sentence_mask.T,
        outputs=target_sentence.T,
        attended=encoded.dimshuffle(1, 0, 2),
        attended_mask=source_sentence_mask.T)
    cost.name = 'target_cost'
    cost.tag.aggregation_scheme = TakeLast(cost)
    model = Model(cost)

    logger.info('Creating computational graph')
    cg = ComputationGraph(cost)

    # apply dropout for regularization
    if config['dropout'] < 1.0:
        # dropout is applied to the output of maxout in ghog
        logger.info('Applying dropout')
        dropout_inputs = [x for x in cg.intermediary_variables
                          if x.name == 'maxout_apply_output']
        cg = apply_dropout(cg, dropout_inputs, config['dropout'])

    # Print shapes
    shapes = [param.get_value().shape for param in cg.parameters]
    logger.info("Parameter shapes: ")
    for shape, count in Counter(shapes).most_common():
        logger.info('    {:15}: {}'.format(shape, count))
    logger.info("Total number of parameters: {}".format(len(shapes)))

    printchildren(embedder, 1)
    printchildren(transformer, 1)
    printchildren(encoder, 1)
    printchildren(decoder, 1)

    # Training data
    train_stream = get_train_stream(config,
                                    [config['train_src']],
                                    [config['train_tgt']],
                                    vocab_src, vocab_tgt)
    dev_stream = get_dev_stream(
        [config['dev_src']], [config['dev_tgt']],
        vocab_src, vocab_tgt)
    test_stream = get_test_stream([config['test_src']], vocab_src)

    # Set extensions
    logger.info("Initializing extensions")
    extensions = [
        FinishAfter(after_n_batches=config['finish_after']),
        ProgressBar(),
        TrainingDataMonitoring([cost],
                               prefix="tra",
                               after_batch=True),
        DataStreamMonitoring(variables=[cost],
                             data_stream=dev_stream,
                             prefix="dev",
                             after_batch=True),
        Sampler(
            model=Model(samples),
            data_stream=dev_stream,
            vocab=cabvo,
            saveto=config['saveto'] + 'dev',
            every_n_batches=config['save_freq']),
        Sampler(
            model=Model(samples),
            data_stream=test_stream,
            vocab=cabvo,
            saveto=config['saveto'] + 'test',
            after_n_batches=1,
            on_resumption=True,
            before_training=True),
        Plotter(saveto=config['saveto'], after_batch=True),
        Printing(after_batch=True),
        Checkpoint(
            path=config['saveto'],
            parameters=cg.parameters,
            save_main_loop=False,
            every_n_batches=config['save_freq'])]
    if BOKEH_AVAILABLE:
        # Register the plot; creating it without appending had no effect.
        extensions.append(
            Plot('Training cost', channels=[['target_cost']],
                 after_batch=True))
    if config['reload']:
        extensions.append(Load(path=config['saveto'],
                               load_iteration_state=False,
                               load_log=False))
    else:
        # Create the '<saveto>.txt' file, or empty an existing one
        with open(config['saveto'] + '.txt', 'w'):
            pass

    # Set up training algorithm
    logger.info("Initializing training algorithm")
    # SECURITY NOTE: eval() runs arbitrary code taken from the config;
    # only use configuration files from a trusted source.
    step_rule_class = eval(config['step_rule'])
    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=CompositeRule([StepClipping(config['step_clipping']),
                                 step_rule_class()]))

    # Initialize main loop
    logger.info("Initializing main loop")
    main_loop = MainLoop(
        model=model,
        algorithm=algorithm,
        data_stream=train_stream,
        extensions=extensions)
    main_loop.run()
def __init__(self, vocab_size, embedding_dim, state_dim,
             representation_dim, context_dim, target_transition,
             theano_seed=None, loss_function='cross_entropy', **kwargs):
    """Build the transition, attention, readout and sequence generator.

    Parameters
    ----------
    vocab_size, embedding_dim, state_dim, representation_dim
        Stored on the instance and used as brick dimensions below.
    context_dim
        Passed to `target_transition` as ``context_dim``.
    target_transition : callable
        Called with ``attended_dim``, ``context_dim``, ``dim``,
        ``activation`` and ``name`` to create the decoder transition.
    theano_seed
        Stored on the instance and passed to the ``SoftmaxEmitter``.
    loss_function : str
        'cross_entropy' or 'min_risk'; selects the generator class.
    **kwargs
        Forwarded to the parent constructor.

    Raises
    ------
    ValueError
        If `loss_function` is neither 'cross_entropy' nor 'min_risk'.
    """
    super(InitialContextDecoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = target_transition(attended_dim=state_dim,
                                        context_dim=context_dim,
                                        dim=state_dim,
                                        activation=Tanh(),
                                        name='decoder')

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    readout = Readout(
        source_names=[
            'states',
            'feedback',
            # Chris: it's key that we're taking the first output of
            # self.attention.take_glimpses.outputs: the first output is
            # the weighted avgs, the second is the weights in
            # (batch, time)
            self.attention.take_glimpses.outputs[0]
        ],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1,
                               theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Floor division keeps the dimension an int under true
            # division as well (the Maxout above uses 2 pieces).
            Linear(input_dim=state_dim // 2,
                   output_dim=embedding_dim,
                   use_bias=False,
                   name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply
        ]),
        merged_dim=state_dim)

    # Build sequence generator accordingly
    if loss_function == 'cross_entropy':
        generator_class = InitialContextSequenceGenerator
    elif loss_function == 'min_risk':
        generator_class = MinRiskInitialContextSequenceGenerator
    else:
        raise ValueError(
            'The decoder does not support the loss function: {}'.format(
                loss_function))

    self.sequence_generator = generator_class(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([
            name for name in self.transition.apply.sequences
            if name != 'mask'
        ], prototype=Linear()))

    if loss_function == 'min_risk':
        # the name is important, because it lets us match the brick
        # hierarchy names for the vanilla SequenceGenerator to load
        # pretrained models
        # TODO: quick hack to fix bug
        self.sequence_generator.name = 'initialcontextsequencegenerator'

    # TODO: uncomment this!!
    # self.sequence_generator.name = 'sequencegenerator'

    self.children = [self.sequence_generator]
def __init__(self, config, vocab_size):
    """Build the context encoder, the attention-based answer generator,
    the training cost and the prediction graph.

    Reads from `config`: ``embed_size``, ``ctx_lstm_size``,
    ``ctx_skip_connections``, ``generator_lstm_size``, ``feedback_size``,
    ``batch_size``, ``w_noise``, ``dropout``, ``weights_init`` and
    ``biases_init``.

    Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars``
    and ``self.monitor_vars_valid``.
    """
    float_x = theano.config.floatX

    # Symbolic inputs, each transposed with dimshuffle(1, 0)
    context = tensor.imatrix('context').dimshuffle(1, 0)
    context_mask = tensor.imatrix('context_mask').dimshuffle(1, 0)
    answer = tensor.imatrix('answer').dimshuffle(1, 0)
    answer_mask = tensor.imatrix('answer_mask').dimshuffle(1, 0)

    context_bag = to_bag(context, vocab_size)

    # Embed questions and context
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    embed.weights_init = IsotropicGaussian(0.01)
    #embeddings_initial_value = init_embedding_table(filename='embeddings/vocab_embeddings.txt')
    #embed.weights_init = Constant(embeddings_initial_value)

    # Calculate context encoding (concatenate layer1)
    context_embedded = embed.apply(context)
    ctx_lstms, ctx_hidden = make_bidir_lstm_stack(
        context_embedded,
        config.embed_size,
        context_mask.astype(float_x),
        config.ctx_lstm_size,
        config.ctx_skip_connections,
        'ctx')
    # Bricks that receive the config-provided initialisation at the end
    bricks = list(ctx_lstms)

    if config.ctx_skip_connections:
        # Factor 2: forward and backward directions
        cenc_dim = 2 * sum(config.ctx_lstm_size)
        hidden_to_join = ctx_hidden
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        hidden_to_join = ctx_hidden[-2:]
    cenc = tensor.concatenate(hidden_to_join, axis=2)
    cenc.name = 'cenc'

    # Build the generator bricks
    transition = GatedRecurrent(activation=Tanh(),
                                dim=config.generator_lstm_size,
                                name="transition")
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=cenc_dim,
        match_dim=config.generator_lstm_size,
        name="attention")
    emitter = MaskedSoftmaxEmitter(context_bag=context_bag, name='emitter')
    feedback = LookupFeedback(vocab_size, config.feedback_size)
    readout = Readout(readout_dim=vocab_size,
                      source_names=[transition.apply.states[0],
                                    attention.take_glimpses.outputs[0]],
                      emitter=emitter,
                      feedback_brick=feedback,
                      name="readout")
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  attention=attention,
                                  name="generator")

    cost = generator.cost(answer,
                          answer_mask.astype(float_x),
                          attended=cenc,
                          attended_mask=context_mask.astype(float_x),
                          name="cost")
    generated = generator.generate(
        n_steps=7,
        batch_size=config.batch_size,
        attended=cenc,
        attended_mask=context_mask.astype(float_x),
        iterate=True)
    self.predictions = generated[1]

    # Apply weight noise and dropout to the cost graph
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        weights = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weights, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, ctx_hidden, config.dropout)
    [cost_reg] = cg.outputs

    # Other stuff
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # initialize new stuff manually (change!)
    generator.weights_init = IsotropicGaussian(0.01)
    generator.biases_init = Constant(0)
    generator.push_allocation_config()
    generator.push_initialization_config()
    # Override the transition's weight init after the generator-wide
    # configuration has been pushed down
    transition.weights_init = Orthogonal()
    generator.initialize()

    # Initialize bricks
    embed.initialize()
    for ctx_brick in bricks:
        ctx_brick.weights_init = config.weights_init
        ctx_brick.biases_init = config.biases_init
        ctx_brick.initialize()
def _initialize_attention(attention_strategy,
                          seq_len,
                          transition,
                          representation_dim,
                          att_dim,
                          attention_sources='s',
                          readout_sources='sfa',
                          memory="none",
                          memory_size=500):
    """Initializes the attention model according the configuration.

    Args:
        attention_strategy (string): "none" disables attention
                                     "content" is vanilla content-based
                                     attention (cf. Bahdanau, 2015)
                                     "content-N" is content-based
                                     attention with N attention weights
                                     "nbest-N" is content-based attention
                                     in which all alignment weights
                                     except the N best are set to zero.
                                     "coverage-N" is content-based
                                     attention with maximum fertility N
                                     "parameterized" uses a trainable
                                     alignment matrix (cf. Neural
                                     Alignment model)
        seq_len (int): Maximum sentence length
        transition (Recurrent): Recurrent transition brick of the decoder
                                network which is to be equipped with an
                                attention mechanism
        representation_dim (int): Dimension of source annotations
        att_dim (int): Number of hidden units in match vector
        attention_sources (string): Defines the sources used by the
                                    attention model 's' for decoder
                                    states, 'f' for feedback
        readout_sources (string): Defines the sources used in the
                                  readout network. 's' for decoder
                                  states, 'f' for feedback, 'a' for
                                  attention (context vector)
        memory (string): Defines the external memory structure which is
                         available to the decoder network. "none" does
                         not use any memory, "stack" enables a neural
                         stack. Only "content" and "nbest-N" support
                         "stack".
        memory_size (int): Size of the memory structure. For example,
                           dimension of the vectors on the neural stack

    Returns:
        Tuple. First element is the attention, the second element is a
        list of source names for the readout network. The attention is
        ``None`` when it is disabled, when the strategy is unknown, or
        when the strategy cannot be combined with the requested memory
        (the last two are logged). In that case the readout source list
        contains no attention entry even if 'a' is requested.
    """
    attention = None
    att_src_names = []
    readout_src_names = []
    if 's' in readout_sources:
        readout_src_names.append('states')
    if 'f' in readout_sources:
        readout_src_names.append('feedback')
    if 's' in attention_sources:
        att_src_names.extend(transition.apply.states)
    if 'f' in attention_sources:
        att_src_names.append('feedback')

    # Keyword arguments shared by all content-based attention bricks
    content_kwargs = dict(state_names=att_src_names,
                          attended_dim=representation_dim,
                          match_dim=att_dim,
                          name="attention")
    use_stack = memory == 'stack'

    if attention_strategy != 'none':
        if attention_strategy == 'parameterized':
            attention = AlignmentAttention(
                seq_len=seq_len,
                state_names=transition.apply.states,
                attended_dim=representation_dim,
                name="attention")
        elif attention_strategy == 'content':
            if use_stack:
                attention = PushDownSequenceContentAttention(
                    stack_dim=memory_size, **content_kwargs)
            else:
                attention = SequenceContentAttention(**content_kwargs)
        elif 'content-' in attention_strategy:
            if use_stack:
                logging.error("Memory 'stack' cannot used in combination "
                              "with multi content attention strategy (not "
                              "implemented yet)")
            else:
                _, n = attention_strategy.split('-')
                attention = SequenceMultiContentAttention(
                    n_att_weights=int(n), **content_kwargs)
        elif 'nbest-' in attention_strategy:
            _, n = attention_strategy.split('-')
            if use_stack:
                attention = PushDownThresholdedAttention(
                    stack_dim=memory_size, nbest=int(n), **content_kwargs)
            else:
                attention = ThresholdedSequenceContentAttention(
                    nbest=int(n), **content_kwargs)
        elif 'coverage-' in attention_strategy:
            _, n = attention_strategy.split('-')
            if use_stack:
                logging.error("Memory 'stack' cannot used in combination "
                              "with coverage attention strategy (not "
                              "implemented yet)")
            else:
                attention = CoverageContentAttention(
                    max_fertility=int(n), **content_kwargs)
        else:
            logging.fatal("Unknown attention strategy '%s'",
                          attention_strategy)

    # Without an attention brick there are no glimpses to read from;
    # dereferencing None here would raise AttributeError.
    if attention is not None and 'a' in readout_sources:
        readout_src_names.append(attention.take_glimpses.outputs[0])
    return attention, readout_src_names
def __init__(
        self, recordings_source, labels_source, eos_label,
        num_features, num_phonemes, dim_dec, dims_bidir, dims_bottom,
        enc_transition, dec_transition, use_states_for_readout,
        attention_type, lm=None, character_map=None, subsample=None,
        dims_top=None, prior=None, conv_n=None, bottom_activation=None,
        post_merge_activation=None, post_merge_dims=None,
        dim_matcher=None, embed_outputs=True, dec_stack=1,
        conv_num_filters=1, data_prepend_eos=True,
        # softmax is the default set in SequenceContentAndConvAttention
        energy_normalizer=None,
        **kwargs):
    """Assemble the bottom / encoder / top / generator bricks and inputs.

    Parameters
    ----------
    recordings_source, labels_source
        Names given to the symbolic input variables created at the end
        of the constructor; the mask variables use the same name with a
        ``"_mask"`` suffix.
    eos_label, data_prepend_eos
        Stored on the instance unchanged; not otherwise used here.
    num_features
        Input dimension of the bottom MLP, and of the encoder when
        `dims_bottom` is empty.
    num_phonemes
        Readout dimension. The feedback brick is built with
        ``num_phonemes + 1`` entries and the softmax emitter uses
        ``num_phonemes`` as its initial output.
    dim_dec
        Dimension of the decoder transition(s) and of the lookup
        feedback; also the default for `dim_matcher`.
    dims_bidir
        Dimensions handed to ``Encoder``. The attended dimension is
        ``2 * dims_bidir[-1]``.
    dims_bottom
        Layer dimensions of the bottom MLP. When empty or ``None`` an
        ``Identity`` brick is used instead.
    enc_transition
        Passed through to ``Encoder``.
    dec_transition
        Callable invoked as ``dec_transition(dim=..., activation=...,
        name=...)`` to build each decoder transition.
    use_states_for_readout
        When true, the transition states are added to the readout
        sources in front of the attention glimpse.
    attention_type
        Either ``"content"`` or ``"content_and_conv"``.
    lm
        Optional mapping with language model options. The keys
        ``weight``, ``normalize_am_weights``, ``normalize_lm_weights``,
        ``normalize_tot_weights`` and ``am_beta`` are consumed here, the
        rest is forwarded to ``LanguageModel``. The mapping passed in is
        not modified.
    character_map
        Forwarded to ``LanguageModel`` as ``nn_char_map``.
    subsample
        Passed to ``Encoder``; defaults to ``[1] * len(dims_bidir)``.
    dims_top
        Hidden dimensions of the top MLP. When empty or ``None`` an
        ``Identity`` brick is used instead.
    prior, conv_n, conv_num_filters, energy_normalizer
        Forwarded to ``SequenceContentAndConvAttention``; unused for
        ``"content"`` attention.
    bottom_activation, post_merge_activation
        Activation bricks; each defaults to a fresh ``Tanh()``.
    post_merge_dims
        When non-empty, configures ``merged_dim`` and a ``post_merge``
        sequence for the readout.
    dim_matcher
        ``match_dim`` of the attention; defaults to `dim_dec`.
    embed_outputs
        Selects ``LookupFeedback`` (true) or ``OneOfNFeedback`` (false).
    dec_stack
        Number of decoder transitions. With more than one, they are
        wrapped in a ``RecurrentStack`` with skip connections.
    **kwargs
        Forwarded to the parent constructor.

    Raises
    ------
    ValueError
        If `attention_type` is not one of the supported values.

    """
    if bottom_activation is None:
        bottom_activation = Tanh()
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.recordings_source = recordings_source
    self.labels_source = labels_source
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    if dims_bottom:
        bottom = MLP([bottom_activation] * len(dims_bottom),
                     [num_features] + dims_bottom,
                     name="bottom")
    else:
        bottom = Identity(name='bottom')

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    # Truthiness (rather than len()) so that dims_bottom=None is handled
    # the same way as by the `if dims_bottom:` check above.
    encoder_input_dim = dims_bottom[-1] if dims_bottom else num_features
    encoder = Encoder(self.enc_transition, dims_bidir, encoder_input_dim,
                      subsample)

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        # NOTE(review): a single activation is supplied while the dims
        # list describes len(dims_top) + 1 layers -- confirm this is
        # what MLP expects.
        top = MLP([Tanh()],
                  [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                  name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        transitions = [
            self.dec_transition(
                dim=dim_dec, activation=Tanh(),
                name="transition_{}".format(trans_level))
            for trans_level in range(dec_stack)
        ]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError(
            "Unknown attention type {}".format(attention_type))

    if embed_outputs:
        feedback = LookupFeedback(num_phonemes + 1, dim_dec)
    else:
        feedback = OneOfNFeedback(num_phonemes + 1)

    if lm:
        # In case we use LM it is Readout that is responsible
        # for normalization.
        emitter = LMEmitter()
    else:
        emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                 name="emitter")

    readout_config = dict(
        readout_dim=num_phonemes,
        source_names=(transition.apply.states
                      if use_states_for_readout else []) +
                     [attention.take_glimpses.outputs[0]],
        emitter=emitter,
        feedback_brick=feedback,
        name="readout")
    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence(
            [
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP(
                    [post_merge_activation] * (len(post_merge_dims) - 1) +
                    [Identity()],
                    # MLP was not designed to support Maxout as an
                    # activation (because Maxout in a way is not one).
                    # However a single layer Maxout network works with
                    # the trick below. For deeper Maxout network one has
                    # to use the Sequence brick.
                    [
                        d // getattr(post_merge_activation,
                                     'num_pieces', 1)
                        for d in post_merge_dims
                    ] + [num_phonemes]).apply,
            ],
            name='post_merge')
    readout = Readout(**readout_config)

    language_model = None
    if lm:
        # Work on a copy: popping from the caller's mapping would strip
        # these options from a configuration that may be reused.
        lm_options = dict(lm)
        lm_weight = lm_options.pop('weight', 0.0)
        normalize_am_weights = lm_options.pop('normalize_am_weights', True)
        normalize_lm_weights = lm_options.pop('normalize_lm_weights', False)
        normalize_tot_weights = lm_options.pop('normalize_tot_weights',
                                               False)
        am_beta = lm_options.pop('am_beta', 1.0)
        if (normalize_am_weights + normalize_lm_weights +
                normalize_tot_weights < 1):
            logger.warning(
                "Beam search is prone to fail with no log-prob normalization"
            )
        language_model = LanguageModel(nn_char_map=character_map,
                                       **lm_options)
        readout = ShallowFusionReadout(
            lm_costs_name='lm_add',
            lm_weight=lm_weight,
            normalize_am_weights=normalize_am_weights,
            normalize_lm_weights=normalize_lm_weights,
            normalize_tot_weights=normalize_tot_weights,
            am_beta=am_beta,
            **readout_config)

    generator = SequenceGenerator(
        readout=readout, transition=transition, attention=attention,
        language_model=language_model,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]

    # Create input variables
    self.recordings = tensor.tensor3(self.recordings_source)
    self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
    self.labels = tensor.lmatrix(self.labels_source)
    self.labels_mask = tensor.matrix(self.labels_source + "_mask")
    # The batch inputs are the symbolic variables; the second entry is
    # the recordings mask variable, not the source name string.
    self.batch_inputs = [self.recordings, self.recordings_mask,
                         self.labels, self.labels_mask]
    self.single_recording = tensor.matrix(self.recordings_source)
    self.single_transcription = tensor.lvector(self.labels_source)