def __init__(self, dimension, alphabet_size, **kwargs):
    """Build the encoder, input fork, lookup table and attentive generator.

    Parameters
    ----------
    dimension : int
        Size handed to both recurrent bricks, the lookup table, the fork
        and the attention match layer. The attention brick is told that
        the attended sequence has ``2 * dimension`` features.
    alphabet_size : int
        Number of symbols; used for the lookup table, the readout
        dimension and the feedback lookup.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(WordReverser, self).__init__(**kwargs)

    # --- encoder side -----------------------------------------------------
    bidir_encoder = Bidirectional(
        SimpleRecurrent(dim=dimension, activation=Tanh()))
    # Every sequence input of the encoder prototype except the mask gets
    # its own fork output.
    forked_names = [seq
                    for seq in bidir_encoder.prototype.apply.sequences
                    if seq != 'mask']
    input_fork = Fork(forked_names)
    input_fork.input_dim = dimension
    input_fork.output_dims = [dimension for _ in input_fork.input_names]
    char_lookup = LookupTable(alphabet_size, dimension)

    # --- decoder side -----------------------------------------------------
    decoder_rnn = SimpleRecurrent(
        activation=Tanh(), dim=dimension, name="transition")
    content_attention = SequenceContentAttention(
        state_names=decoder_rnn.apply.states,
        attended_dim=2 * dimension,
        match_dim=dimension,
        name="attention")
    # The readout sees the first decoder state and the first glimpse.
    readout_sources = [
        decoder_rnn.apply.states[0],
        content_attention.take_glimpses.outputs[0],
    ]
    symbol_readout = Readout(
        readout_dim=alphabet_size,
        source_names=readout_sources,
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(alphabet_size, dimension),
        name="readout")
    seq_generator = SequenceGenerator(
        readout=symbol_readout,
        transition=decoder_rnn,
        attention=content_attention,
        name="generator")

    self.lookup = char_lookup
    self.fork = input_fork
    self.encoder = bidir_encoder
    self.generator = seq_generator
    self.children = [char_lookup, input_fork, bidir_encoder, seq_generator]
def __init__(self, dimen, vocab_size, **kwargs):
    """Build the encoder, input fork, lookup table and attentive generator.

    Parameters
    ----------
    dimen : int
        Size handed to both recurrent bricks, the lookup table and the
        attention match layer. The attention brick is told that the
        attended sequence has ``2 * dimen`` features.
    vocab_size : int
        Number of symbols; used for the lookup table, the readout
        dimension and the feedback lookup.
    **kwargs
        Forwarded unchanged to the parent constructor (new, optional;
        existing two-argument callers are unaffected).

    """
    # The parent constructor has to run: per the original author's note,
    # the ``allocated`` attribute is otherwise never set. It used to be
    # called as ``__init__(self)``, which handed the instance itself to the
    # parent as its first positional argument; it takes no such argument
    # here, so only the optional keyword arguments are forwarded.
    super(MorphGen, self).__init__(**kwargs)

    # The encoder.
    encoder = Bidirectional(SimpleRecurrent(dim=dimen, activation=Tanh()))

    # The fork gets one output per sequence input of the encoder prototype
    # (the mask excluded); each output is sized by asking the prototype for
    # the dimension of the input it feeds.
    fork = Fork([name for name in encoder.prototype.apply.sequences
                 if name != 'mask'])
    fork.input_dim = dimen
    fork.output_dims = [encoder.prototype.get_dim(name)
                        for name in fork.input_names]

    lookup = LookupTable(vocab_size, dimen)

    # The decoder.
    transition = SimpleRecurrent(dim=dimen, activation=Tanh(),
                                 name="transition")
    atten = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=2 * dimen,
        match_dim=dimen,
        name="attention")
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0],
                      atten.take_glimpses.outputs[0]],
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(vocab_size, dimen),
        name="readout")
    generator = SequenceGenerator(
        readout=readout,
        transition=transition,
        attention=atten,
        name="generator")

    self.lookup = lookup
    self.fork = fork
    self.encoder = encoder
    self.generator = generator
    self.children = [lookup, fork, encoder, generator]
def __init__(self, dimension, alphabet_size, **kwargs):
    """Build a lookup table and an attentive sequence generator.

    Parameters
    ----------
    dimension : int
        Size used for the lookup table, the recurrent transition and the
        attention brick (both its attended and match dimensions).
    alphabet_size : int
        Number of symbols; used for the lookup table, the readout
        dimension and the feedback lookup.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(SimpleGenerator, self).__init__(**kwargs)

    symbol_lookup = LookupTable(alphabet_size, dimension)

    rnn = SimpleRecurrent(activation=Tanh(), dim=dimension,
                          name="transition")
    content_attention = SequenceContentAttention(
        state_names=rnn.apply.states,
        attended_dim=dimension,
        match_dim=dimension,
        name="attention")

    # The readout combines the first recurrent state with the first glimpse.
    readout_inputs = [rnn.apply.states[0],
                      content_attention.take_glimpses.outputs[0]]
    symbol_readout = Readout(
        readout_dim=alphabet_size,
        source_names=readout_inputs,
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(alphabet_size, dimension),
        name="readout")

    seq_generator = SequenceGenerator(
        readout=symbol_readout,
        transition=rnn,
        attention=content_attention,
        name="generator")

    self.lookup = symbol_lookup
    self.generator = seq_generator
    self.children = [symbol_lookup, seq_generator]
def getRnnGenerator(vocab_size,hidden_dim,input_dim=512):
    """Create and initialize a softmax sequence generator over a vocabulary.

    Parameters
    ----------
    vocab_size : int
        Readout dimension and number of entries in the feedback lookup;
        it has to be known before the network can be built.
    hidden_dim : int
        Dimension of the ``MySimpleRecurrent`` transition.
    input_dim : int, optional
        Dimension of the feedback lookup, 512 by default.

    Returns
    -------
    The initialized ``SequenceGenerator``.

    """
    feedback_lookup = LookupFeedback(vocab_size, input_dim, name='feedback')
    vocab_readout = Readout(
        readout_dim=vocab_size,
        source_names=["states"],
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=feedback_lookup,
        name="readout")
    recurrent = MySimpleRecurrent(
        name="transition", activation=Tanh(), dim=hidden_dim)

    generator = SequenceGenerator(
        vocab_readout,
        recurrent,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator")

    # Hand the generator-level initializers down to the children, then
    # explicitly (re)assign the transition's weight initializer before the
    # parameters are actually initialized.
    generator.push_initialization_config()
    generator.transition.weights_init = IsotropicGaussian(0.01)
    generator.initialize()
    return generator
def __init__(self,hidden_size_recurrent, k, **kwargs):
    """Build a three-layer GRU stack feeding a bivariate GMM emitter.

    Parameters
    ----------
    hidden_size_recurrent : int
        Dimension of each of the three ``GatedRecurrent`` layers.
    k : int
        Passed to ``BivariateGMMEmitter``; the readout dimension is
        ``6 * k + 1``.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(Scribe, self).__init__(**kwargs)

    gru_layers = [
        GatedRecurrent(dim=hidden_size_recurrent,
                       name="gru_{}".format(layer))
        for layer in range(3)
    ]
    gru_stack = RecurrentStack(gru_layers, name="transition",
                               skip_connections=True)

    # Only the stack states whose name contains 'states' feed the readout.
    state_sources = [state for state in gru_stack.apply.states
                     if 'states' in state]
    gmm_readout = Readout(
        readout_dim=6 * k + 1,
        source_names=state_sources,
        emitter=BivariateGMMEmitter(k=k),
        name="readout")

    self.generator = SequenceGenerator(
        readout=gmm_readout, transition=gru_stack, name="generator")
    self.children = [self.generator]
def __init__(self, path, nn_char_map, no_transition_cost=1e12, **kwargs):
    """Wrap an FST in the sequence-generator interface.

    Parameters
    ----------
    path
        Passed to ``FST`` to load the transducer.
    nn_char_map : dict
        Indexed by the characters of the FST input symbol table; the
        looked-up values become the keys of the remapping table handed to
        ``FSTTransition``.
    no_transition_cost : float, optional
        Forwarded to ``FSTTransition``.
    **kwargs
        Forwarded to the parent constructor.

    Raises
    ------
    ValueError
        If the FST input alphabet (with ``<eps>`` removed) and
        ``nn_char_map`` do not have the same number of entries.

    """
    # Only FSTs are supported so far, which is why there is no model-type
    # argument to inspect.
    fst = FST(path)
    fst_char_map = dict(fst.fst.isyms.items())
    del fst_char_map['<eps>']
    if len(fst_char_map) != len(nn_char_map):
        raise ValueError(
            "FST alphabet size ({}) does not match nn_char_map size ({})"
            .format(len(fst_char_map), len(nn_char_map)))
    remap_table = {
        nn_char_map[character]: fst_code
        for character, fst_code in fst_char_map.items()
    }
    transition = FSTTransition(fst, remap_table, no_transition_cost)

    # This SequenceGenerator will be used only in a very limited way.
    # That's why it is sufficient to equip it with a completely
    # fake readout.
    dummy_readout = Readout(
        source_names=['add'],
        readout_dim=len(remap_table),
        merge=Merge(input_names=['costs'], prototype=Identity()),
        post_merge=Identity(),
        emitter=SoftmaxEmitter())

    # Fork every sequence input of the transition except the mask.
    fork = Fork(
        output_names=[name for name in transition.apply.sequences
                      if name != 'mask'],
        prototype=Identity())
    super(LanguageModel, self).__init__(
        transition=transition,
        fork=fork,
        readout=dummy_readout,
        **kwargs)
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim,topical_dim,theano_seed=None, **kwargs):
    """Build a GRU decoder with content attention and topical attention.

    Parameters
    ----------
    vocab_size : int
        Readout dimension and size of the feedback lookup.
    embedding_dim : int
        Dimension of the feedback lookup and of the layer between the two
        final ``Linear`` bricks of the post-merge stack.
    state_dim : int
        Dimension of the transition, match dimension of both attention
        bricks and merged dimension of the readout.
    representation_dim : int
        Attended dimension of the content attention.
    topical_dim : int
        Attended dimension of the topical attention.
    theano_seed : optional
        Stored on the instance and passed to the ``SoftmaxEmitter``.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = GRUInitialState(
        attended_dim=state_dim, dim=state_dim,
        activation=Tanh(), name='decoder')

    # Initialize the attention mechanisms
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")
    # NOTE(review): the original author was not sure whether this match
    # dimension is correct.
    self.topical_attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=topical_dim,
        match_dim=state_dim,
        name="topical_attention")

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    readout = Readout(
        # NOTE(review): the original author flagged this list with "check!".
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Floor division keeps the dimension an integer even when
            # ``/`` is true division (Python 3 or
            # ``from __future__ import division``).
            Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim)

    # Build sequence generator accordingly
    # NOTE(review): 'topical_embeddingq' (trailing "q") looks like a typo;
    # left unchanged because the name is consumed elsewhere - confirm.
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        topical_attention=self.topical_attention,
        topical_name='topical_embeddingq',
        content_name='content_embedding',
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'], prototype=Linear())
    )

    self.children = [self.sequence_generator]
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim, **kwargs):
    """Build a GRU decoder with content attention and a maxout readout.

    Parameters
    ----------
    vocab_size : int
        Readout dimension and size of the feedback lookup.
    embedding_dim : int
        Dimension of the feedback lookup and of the layer between the two
        final ``Linear`` bricks of the post-merge stack.
    state_dim : int
        Dimension of the transition, match dimension of the attention and
        merged dimension of the readout.
    representation_dim : int
        Attended dimension of the attention brick.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim

    self.transition = GRUInitialState(
        attended_dim=state_dim, dim=state_dim,
        activation=Tanh(), name='decoder')
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")

    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        # The emitter produces -1 as its initial output.
        emitter=SoftmaxEmitter(initial_output=-1),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Floor division keeps the dimension an integer even when
            # ``/`` is true division (Python 3 or
            # ``from __future__ import division``).
            Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply
        ]),
        merged_dim=state_dim,
        merge_prototype=Linear(use_bias=True))

    # Fork every sequence input of the transition except the mask.
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'], prototype=Linear()))

    self.children = [self.sequence_generator]
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim, **kwargs):
    """Build a context-conditioned GRU decoder with a maxout readout.

    Parameters
    ----------
    vocab_size : int
        Readout dimension and size of the feedback lookup.
    embedding_dim : int
        Dimension of the feedback lookup.
    state_dim : int
        Dimension of the ``GatedRecurrentWithContext`` transition.
    representation_dim : int
        Only stored on the instance by this constructor.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim

    # NOTE(review): the merged/bias size (1000) and the bottleneck size
    # (100) are hard-coded while the first Linear brick is sized from
    # ``state_dim``; presumably this assumes state_dim == 1000 - confirm
    # before using another state size.
    readout = Readout(
        source_names=['states', 'feedback', 'readout_context'],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(),
        feedback_brick=LookupFeedback(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=1000).apply,
            Maxout(num_pieces=2).apply,
            # Floor division keeps the dimension an integer even when
            # ``/`` is true division (Python 3 or
            # ``from __future__ import division``).
            Linear(input_dim=state_dim // 2, output_dim=100,
                   use_bias=False).apply,
            Linear(input_dim=100).apply
        ]),
        merged_dim=1000)

    self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                name='decoder')

    # Readout will apply the linear transformation to 'readout_context'
    # with a Merge brick, so no need to fork it here
    self.fork = Fork([
        name for name in (self.transition.apply.contexts +
                          self.transition.apply.states)
        if name != 'readout_context'
    ], prototype=Linear())

    self.tanh = Tanh()

    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        fork_inputs=[name for name in self.transition.apply.sequences
                     if name != 'mask'],
    )

    self.children = [self.fork, self.sequence_generator, self.tanh]
def __init__(self, batch_size, frame_size, k, depth, size, **kwargs):
    """Build the GMM sequence generator and the context MLP of one layer.

    Parameters
    ----------
    batch_size
        Accepted but not used by this constructor.
    frame_size : int
        Input size of the feedback and context MLPs and output size of
        the GMM emitter.
    k : int
        Passed to the GMM bricks; the GMM target size is
        ``frame_size * k``.
    depth : int
        Number of layers of each MLP; the recurrent stack gets
        ``depth - 1`` GRU layers.
    size : int
        Width multiplier: MLP hidden layers and the context have
        ``32 * size`` units, the recurrent layers ``32 * size * 3``.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(PyramidLayer, self).__init__(**kwargs)

    # Sizes derived from the arguments.
    mlp_width = 32*size
    recurrent_width = 32*size*3
    context_width = 32*size
    gmm_target_size = frame_size * k
    n_recurrent_layers = depth-1

    # Feedback path: an MLP mapping a frame to 4 * recurrent_width units.
    feedback_activations = [Rectifier()]*depth
    feedback_dims = ([frame_size] + [mlp_width]*(depth-1) +
                     [4*recurrent_width])

    # Emitter path: an MLP mapping the recurrent width to the MLP width.
    theta_activations = [Rectifier()]*depth
    theta_dims = [recurrent_width] + [mlp_width]*depth

    # Context path: an MLP mapping a frame to the context width.
    context_activations = [Rectifier()]*depth
    context_dims = ([frame_size] + [mlp_width]*(depth-1) +
                    [context_width])

    feedback_mlp = MLP(activations=feedback_activations,
                       dims=feedback_dims, name="mlp_x")
    feedback_brick = DeepTransitionFeedback(mlp=feedback_mlp)

    gru_layers = [
        GatedRecurrent(dim=recurrent_width, use_bias=True,
                       name="gru_{}".format(layer))
        for layer in range(n_recurrent_layers)
    ]
    gru_stack = RecurrentStack(gru_layers, name="transition",
                               skip_connections=True)
    self.transition = gru_stack

    theta_mlp = MLP(activations=theta_activations, dims=theta_dims,
                    name="mlp_theta")
    gmm_wrapper = GMMMLP(mlp=theta_mlp, dim=gmm_target_size, k=k,
                         const=0.00001, name="gmm_wrap")
    emitter = GMMEmitter(gmmmlp=gmm_wrapper, output_size=frame_size, k=k)

    # Only the stack states whose name contains 'states' are used.
    state_sources = [state for state in gru_stack.apply.states
                     if 'states' in state]

    attention = SimpleSequenceAttention(
        state_names=state_sources,
        state_dims=[recurrent_width],
        attended_dim=context_width,
        name="attention")

    # Verify source names
    readout = Readout(
        readout_dim=recurrent_width,
        source_names=state_sources + ['feedback'] + ['glimpses'],
        emitter=emitter,
        feedback_brick=feedback_brick,
        name="readout")

    self.generator = SequenceGenerator(
        readout=readout,
        transition=gru_stack,
        attention=attention,
        name="generator")
    self.mlp_context = MLP(activations=context_activations,
                           dims=context_dims)

    self.children = [self.generator, self.mlp_context]
    self.final_states = []
def test_with_attention():
    """Test a sequence generator with continuous outputs and attention.

    Builds a generator from the test transition/emitter bricks plus a
    content attention, then checks shapes and summed values of both
    ``cost_matrix`` and ``generate`` against fixed reference numbers.

    """
    # All random inputs come from this single seeded generator; the
    # reference numbers asserted below were presumably recorded with this
    # exact sequence of draws, so do not reorder the calls that use it.
    rng = numpy.random.RandomState(1234)

    inp_dim = 2
    inp_len = 10
    attended_dim = 3
    attended_len = 11
    batch_size = 4
    n_steps = 30

    # For values: uniform samples of the requested shape, cast to floatX.
    def rand(size):
        return rng.uniform(size=size).astype(floatX)

    # For masks: ones, zeroed from a random position onwards per column.
    def generate_mask(length, batch_size):
        mask = numpy.ones((length, batch_size), dtype=floatX)
        # To make it look like real data: every batch column keeps at
        # least its first entry and is zeroed from a random cut-off on.
        for i in range(batch_size):
            mask[1 + rng.randint(0, length - 1):, i] = 0.0
        return mask

    output_vals = rand((inp_len, batch_size, inp_dim))
    output_mask_vals = generate_mask(inp_len, batch_size)
    attended_vals = rand((attended_len, batch_size, attended_dim))
    attended_mask_vals = generate_mask(attended_len, batch_size)

    transition = TestTransition(
        dim=inp_dim, attended_dim=attended_dim, activation=Identity())
    attention = SequenceContentAttention(
        state_names=transition.apply.states, match_dim=inp_dim)
    generator = SequenceGenerator(
        Readout(
            readout_dim=inp_dim,
            source_names=[transition.apply.states[0],
                          attention.take_glimpses.outputs[0]],
            emitter=TestEmitter()),
        transition=transition,
        attention=attention,
        weights_init=IsotropicGaussian(0.1), biases_init=Constant(0),
        add_contexts=False, seed=1234)
    generator.initialize()

    # Test 'cost_matrix' method
    attended = tensor.tensor3("attended")
    attended_mask = tensor.matrix("attended_mask")
    outputs = tensor.tensor3('outputs')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(outputs, mask,
                                  attended=attended,
                                  attended_mask=attended_mask)
    costs_vals = costs.eval({outputs: output_vals,
                             mask: output_mask_vals,
                             attended: attended_vals,
                             attended_mask: attended_mask_vals})
    assert costs_vals.shape == (inp_len, batch_size)
    assert_allclose(costs_vals.sum(), 13.5042, rtol=1e-5)

    # Test `generate` method; the batch size is taken symbolically from
    # the second axis of the attended input.
    results = (
        generator.generate(n_steps=n_steps, batch_size=attended.shape[1],
                           attended=attended, attended_mask=attended_mask))
    assert len(results) == 5
    states_vals, outputs_vals, glimpses_vals, weights_vals, costs_vals = (
        theano.function([attended, attended_mask], results)
        (attended_vals, attended_mask_vals))
    assert states_vals.shape == (n_steps, batch_size, inp_dim)
    assert states_vals.shape == outputs_vals.shape
    assert glimpses_vals.shape == (n_steps, batch_size, attended_dim)
    assert weights_vals.shape == (n_steps, batch_size, attended_len)
    assert costs_vals.shape == (n_steps, batch_size)
    assert_allclose(states_vals.sum(), 23.4172, rtol=1e-5)
    # There is no generation cost in this case, since generation is
    # deterministic
    assert_allclose(costs_vals.sum(), 0.0, rtol=1e-5)
    assert_allclose(weights_vals.sum(), 120.0, rtol=1e-5)
    assert_allclose(glimpses_vals.sum(), 199.2402, rtol=1e-5)
    assert_allclose(outputs_vals.sum(), -11.6008, rtol=1e-5)
unk_token='<UNK>', level='character') alphabet_size = 4 lstm_dim = 2 lstm1 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) lstm2 = LSTM(dim=lstm_dim, use_bias=False, weights_init=Orthogonal()) rnn = RecurrentStack([lstm1, lstm2], name="transition") readout = Readout(readout_dim=alphabet_size, source_names=["states"], emitter=SoftmaxEmitter(name="emitter"), feedback_brick=LookupFeedback(alphabet_size, feedback_dim=alphabet_size, name="feedback"), name="readout") seq_gen = SequenceGenerator(readout=readout, transition=rnn, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator") seq_gen.push_initialization_config() rnn.weights_init = Orthogonal() seq_gen.initialize() # z markov_tutorial
def __init__(self, vocab_size, topicWord_size, embedding_dim, state_dim, topical_dim, representation_dim, match_function='SumMacthFunction', use_doubly_stochastic=False, lambda_ds=0.001, use_local_attention=False, window_size=10, use_step_decay_cost=False, use_concentration_cost=False, lambda_ct=10, use_stablilizer=False, lambda_st=50, theano_seed=None, **kwargs):
    """Build a GRU decoder with content/topical attention and two readouts.

    Parameters
    ----------
    vocab_size : int
        Dimension of the main readout and size of its feedback lookup.
    topicWord_size : int
        Dimension of the topic-word readout.
    embedding_dim : int
        Dimension of the feedback lookup and of the layer between the two
        final ``Linear`` bricks of the post-merge stack.
    state_dim : int
        Dimension of the transition, match dimension of both attention
        bricks, merged dimension of the main readout and ``q_dim`` of the
        sequence generator.
    topical_dim : int
        Attended dimension of the topical attention.
    representation_dim : int
        Attended dimension of the content attention.
    match_function : str, optional
        Name looked up in this module's globals; the object found is
        called with ``name='energy_comp'`` and shared by both attention
        bricks as their energy computer.
    use_local_attention, window_size
        Forwarded to both attention bricks.
    use_doubly_stochastic, lambda_ds, use_step_decay_cost,
    use_concentration_cost, lambda_ct, use_stablilizer, lambda_st
        Forwarded to the ``SequenceGenerator``.
    theano_seed : optional
        Stored on the instance and passed to both ``SoftmaxEmitter``
        bricks.
    **kwargs
        Forwarded unchanged to the parent constructor.

    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.topicWord_size = topicWord_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = GRU(attended_dim=state_dim,
                          dim=state_dim,
                          activation=Tanh(),
                          name='decoder')

    # The energy computer class is resolved by name at construction time.
    self.energy_computer = globals()[match_function](name='energy_comp')

    # Initialize the attention mechanisms
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        energy_computer=self.energy_computer,
        use_local_attention=use_local_attention,
        window_size=window_size,
        name="attention")
    # NOTE(review): the original author was not sure whether this match
    # dimension is correct.
    self.topical_attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=topical_dim,
        match_dim=state_dim,
        energy_computer=self.energy_computer,
        use_local_attention=use_local_attention,
        window_size=window_size,
        name="topical_attention")

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1,
                               theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Floor division keeps the dimension an integer even when
            # ``/`` is true division (Python 3 or
            # ``from __future__ import division``).
            Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply
        ]),
        merged_dim=state_dim,
        name='readout')

    # Readout of the topic words: no specific feedback brick and no
    # explicit merge / post_merge are passed, so the Readout defaults are
    # used for them.
    topicWordReadout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.topicWord_size,
        emitter=SoftmaxEmitter(initial_output=-1,
                               theano_seed=theano_seed),
        name='twReadout')

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        topicWordReadout=topicWordReadout,
        topic_vector_names=['topicSumVector'],
        transition=self.transition,
        attention=self.attention,
        topical_attention=self.topical_attention,
        q_dim=self.state_dim,
        topical_name='topic_embedding',
        content_name='content_embedding',
        use_step_decay_cost=use_step_decay_cost,
        use_doubly_stochastic=use_doubly_stochastic,
        lambda_ds=lambda_ds,
        use_concentration_cost=use_concentration_cost,
        lambda_ct=lambda_ct,
        use_stablilizer=use_stablilizer,
        lambda_st=lambda_st,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'], prototype=Linear()))

    self.children = [self.sequence_generator]
class TrivialEmitter2(TrivialEmitter):
    """Trivial emitter whose initial outputs are a constant-filled matrix."""

    @application
    def initial_outputs(self, batch_size):
        # A (batch_size, readout_dim) tensor of ones scaled by the
        # INITIAL_OUTPUTS_CONSTANT defined elsewhere in this file.
        return INITIAL_OUTPUTS_CONSTANT*tensor.ones((batch_size, self.readout_dim))

from blocks.bricks.parallel import Fork

# Recurrent transition with an identity activation; `dimension` is defined
# elsewhere in this file.
transition = SimpleRecurrent2(dim = dimension, activation = Identity())

# The readout is fed by the states and the feedback, passes the merged
# value through an identity post-merge and uses the trivial emitter and
# feedback bricks, all with the same dimension.
readout = Readout(
    readout_dim=dimension,
    source_names=['states', 'feedback'],
    emitter=TrivialEmitter2(readout_dim = dimension),
    feedback_brick=TrivialFeedback(output_dim = dimension),
    #merge = Merge(),
    post_merge = Identity(),
    merged_dim = dimension,
    name="readout")

# The 'inputs' sequence is forked through an identity prototype.
generator = SequenceGenerator(
    readout=readout,
    transition=transition,
    fork = Fork(['inputs'], prototype=Identity()),
    weights_init = initialization.Identity(1.),
    biases_init = initialization.Constant(0.),
    name="generator")

# Push the generator-level initializers to the children first, then
# override the weight initializer of the inner transition so the override
# is not overwritten by the push.
generator.push_initialization_config()
generator.transition.transition.weights_init = initialization.Identity(2.)
def train():
    """Resume the main loop from 'trainingdata.tar' or build one, then run.

    When the checkpoint file exists the main loop is loaded from it.
    Otherwise 'warpeace_input.txt' is encoded to 'warpeace.hdf5', a GRU
    character-level sequence generator with 512 hidden units is built and
    wrapped in a new main loop that checkpoints to 'trainingdata.tar'
    every 10 epochs. In both cases the main loop is run.

    """
    if not os.path.isfile('trainingdata.tar'):
        hidden_size = 512

        # Encode the corpus and write it out before asking for its size.
        filename = 'warpeace.hdf5'
        encoder = HDF5CharEncoder('warpeace_input.txt', 1000)
        encoder.write(filename)
        alphabet_len = encoder.length

        x = theano.tensor.lmatrix('x')

        char_readout = Readout(
            readout_dim=alphabet_len,
            feedback_brick=LookupFeedback(alphabet_len, hidden_size,
                                          name='feedback'),
            source_names=['states'],
            emitter=RandomSoftmaxEmitter(),
            name='readout')
        gru = GatedRecurrent(activation=Tanh(), dim=hidden_size)
        gru.weights_init = IsotropicGaussian(0.01)

        gen = SequenceGenerator(
            readout=char_readout,
            transition=gru,
            weights_init=IsotropicGaussian(0.01),
            biases_init=Constant(0),
            name='sequencegenerator')
        gen.push_initialization_config()
        gen.initialize()

        cost = gen.cost(outputs=x)
        cost.name = 'cost'
        graph = ComputationGraph(cost)

        algorithm = GradientDescent(cost=cost,
                                    parameters=graph.parameters,
                                    step_rule=Scale(0.5))

        train_set = encoder.get_dataset()
        scheme = SequentialScheme(train_set.num_examples, batch_size=128)
        train_stream = DataStream.default_stream(
            train_set, iteration_scheme=scheme)

        main = MainLoop(
            model=Model(cost),
            data_stream=train_stream,
            algorithm=algorithm,
            extensions=[
                FinishAfter(),
                Printing(),
                Checkpoint('trainingdata.tar', every_n_epochs=10),
                ShowOutput(every_n_epochs=10)
            ])
    else:
        with open('trainingdata.tar', 'rb') as checkpoint:
            main = load(checkpoint)

    main.run()
def __init__(self, vocab_size, embedding_dim, state_dim, att_dim, maxout_dim, representation_dim, attention_strategy='content', attention_sources='s', readout_sources='sfa', memory='none', memory_size=500, seq_len=50, init_strategy='last', theano_seed=None, **kwargs):
    """Creates a new decoder brick without embedding.

    Args:
        vocab_size (int): Target language vocabulary size
        embedding_dim (int): Size of feedback embedding layer; also the
                             readout dimension of this decoder
        state_dim (int): Number of hidden units
        att_dim (int): Size of attention match vector. Non-positive
                       values fall back to ``state_dim``
        maxout_dim (int): Size of maxout layer. Non-positive values
                          fall back to ``state_dim``
        representation_dim (int): Dimension of source annotations
        attention_strategy (string): Which attention should be used
                                     cf.  ``_initialize_attention``
        attention_sources (string): Defines the sources used by the
                                    attention model 's' for decoder
                                    states, 'f' for feedback
        readout_sources (string): Defines the sources used in the
                                  readout network. 's' for decoder
                                  states, 'f' for feedback, 'a' for
                                  attention (context vector)
        memory (string): Which external memory should be used
                         (cf.  ``_initialize_attention``)
        memory_size (int): Size of the external memory structure
        seq_len (int): Maximum sentence length
        init_strategy (string): How to initialize the RNN state
                                (cf.  ``GRUInitialState``)
        theano_seed: Random seed
    """
    super(NoLookupDecoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = GRUInitialState(
        attended_dim=state_dim,
        init_strategy=init_strategy,
        dim=state_dim,
        activation=Tanh(),
        name='decoder')

    # Initialize the attention mechanism
    att_dim = att_dim if att_dim > 0 else state_dim
    self.attention, src_names = _initialize_attention(
        attention_strategy, seq_len, self.transition,
        representation_dim, att_dim, attention_sources,
        readout_sources, memory, memory_size)

    # Initialize the readout. The emitter works directly on vectors of
    # size ``embedding_dim`` with a squared-error cost and a trivial
    # (pass-through) feedback brick.
    maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
    readout = Readout(
        source_names=src_names,
        readout_dim=embedding_dim,
        emitter=NoLookupEmitter(initial_output=-1,
                                readout_dim=embedding_dim,
                                cost_brick=SquaredError()),
        feedback_brick=TrivialFeedback(output_dim=embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=maxout_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Floor division keeps the dimension an integer even when
            # ``/`` is true division (Python 3 or
            # ``from __future__ import division``).
            Linear(input_dim=maxout_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Logistic(name='softmax1').apply
        ]),
        merged_dim=maxout_dim)

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'], prototype=Linear()))

    self.children = [self.sequence_generator]
def __init__(
        self, input_dims, input_num_chars, eos_label, num_phonemes,
        dim_dec, dims_bidir, enc_transition, dec_transition,
        use_states_for_readout, attention_type, criterion, bottom,
        lm=None, character_map=None, bidir=True, subsample=None,
        dims_top=None, prior=None, conv_n=None,
        post_merge_activation=None, post_merge_dims=None,
        dim_matcher=None, embed_outputs=True, dim_output_embedding=None,
        dec_stack=1, conv_num_filters=1, data_prepend_eos=True,
        # softmax is the default set in SequenceContentAndConvAttention
        energy_normalizer=None,
        # for speech this is the approximate phoneme duration in frames
        max_decoded_length_scale=1,
        **kwargs):
    """Build the bottom, the encoder and two attention-based generators.

    Two ``SequenceGenerator`` bricks (indices 0 and 1) are created with
    the same configuration; ``self.generator`` refers to the first one.

    Parameters
    ----------
    input_dims, input_num_chars
        Forwarded to the bottom brick class.
    eos_label
        Stored on the instance; also passed to the reward-regression
        emitter.
    num_phonemes : int
        Readout dimension; feedback bricks use ``num_phonemes + 1``.
    dim_dec : int
        Dimension of the decoder transition(s).
    dims_bidir : list
        Passed to ``Encoder``; its length sizes the default ``subsample``.
    enc_transition, dec_transition
        Transition classes for the encoder and the decoder.
    use_states_for_readout : bool
        Whether the transition states are readout sources in addition to
        the first attention glimpse.
    attention_type : str
        ``"content"`` or ``"content_and_conv"``.
    criterion : dict
        Its ``'name'`` selects the emitter: ``'log_likelihood'`` or a
        name starting with ``'mse'`` (optional key ``'min_reward'``).
    bottom : dict
        Must contain ``'bottom_class'``; the remaining items are keyword
        arguments for that class. The dictionary is not modified.
    lm : dict, optional
        Language-model options. When it has a truthy ``'path'`` a
        ``LanguageModel`` and a ``ShallowFusionReadout`` are used. The
        keys ``'weight'``, ``'normalize_am_weights'``,
        ``'normalize_lm_weights'``, ``'normalize_tot_weights'`` and
        ``'am_beta'`` configure the fusion; the rest goes to
        ``LanguageModel``. The dictionary is not modified.
    character_map : optional
        Passed to ``LanguageModel`` as ``nn_char_map``.
    post_merge_activation : brick, optional
        Defaults to ``Tanh()``.
    dim_matcher : int, optional
        Attention match dimension, defaults to ``dim_dec``.
    **kwargs
        Forwarded to the parent constructor.

    The remaining arguments are forwarded to the bricks they are named
    after (see the body).

    Raises
    ------
    ValueError
        For an unknown ``attention_type`` or criterion name.

    """
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion

    self.max_decoded_length_scale = max_decoded_length_scale

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN. Work on a copy so the caller's
    # configuration dictionary keeps its 'bottom_class' entry.
    bottom_config = dict(bottom)
    bottom_class = bottom_config.pop('bottom_class')
    bottom = bottom_class(
        input_dims=input_dims, input_num_chars=input_num_chars,
        name='bottom',
        **bottom_config)

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(
        self.enc_transition, dims_bidir,
        bottom.get_dim(bottom.apply.outputs[0]),
        subsample, bidir=bidir)
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    generators = [None, None]
    for i in range(2):
        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded],
                      name="top{}".format(i))
        else:
            top = Identity(name='top{}'.format(i))

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(),
                name="transition{}".format(i))
        else:
            transitions = [
                self.dec_transition(
                    dim=dim_dec, activation=Tanh(),
                    name="transition_{}_{}".format(i, trans_level))
                for trans_level in range(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)

        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                # Formatted like every other per-generator name; the
                # previous ``"cont_att" + i`` raised TypeError (str + int).
                name="cont_att{}".format(i))
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att{}".format(i))
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))

        if embed_outputs:
            feedback = LookupFeedback(
                num_phonemes + 1,
                dim_dec if dim_output_embedding is None
                else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)

        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter{}".format(i))
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(
                criterion['name'], eos_label, num_phonemes,
                criterion.get('min_reward', -1.0),
                name="emitter")
        else:
            raise ValueError(
                "Unknown criterion {}".format(criterion['name']))

        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states
                          if use_states_for_readout else []) +
            [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout{}".format(i))
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was not designed to support Maxout as an
                        # activation (because Maxout in a way is not
                        # one). However a single layer Maxout network
                        # works with the trick below. For deeper Maxout
                        # network one has to use the Sequence brick.
                        [
                            d // getattr(post_merge_activation,
                                         'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge{}'.format(i))
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            # Pop the fusion options from a per-generator copy. Popping
            # from ``lm`` itself removed them during the first iteration,
            # so the second generator silently got the defaults (e.g. a
            # weight of 0.0), and the caller's dictionary was altered.
            lm_config = dict(lm)
            lm_weight = lm_config.pop('weight', 0.0)
            normalize_am_weights = lm_config.pop(
                'normalize_am_weights', True)
            normalize_lm_weights = lm_config.pop(
                'normalize_lm_weights', False)
            normalize_tot_weights = lm_config.pop(
                'normalize_tot_weights', False)
            am_beta = lm_config.pop('am_beta', 1.0)
            if (normalize_am_weights + normalize_lm_weights +
                    normalize_tot_weights) < 1:
                logger.warning(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map,
                                           **lm_config)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generators[i] = SequenceGenerator(
            readout=readout,
            transition=transition,
            attention=attention,
            language_model=language_model,
            name="generator{}".format(i))

    self.generator = generators[0]

    self.forward_to_backward = Linear(dim_dec, dim_dec)

    # Remember child bricks
    # NOTE(review): only the ``top`` brick of the last loop iteration is
    # kept and registered as a child - confirm this is intended.
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generators = generators
    self.children = [self.forward_to_backward, encoder, top, bottom
                     ] + generators

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask
    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.n_steps = tensor.lscalar('n_steps')
def main_rnn(config):
    """Train a four-layer LSTM sequence generator on the MFCC stream.

    Parameters
    ----------
    config : dict
        Read for the keys 'target_size', 'lstm_hidden_size',
        'learning_rate', 'batch_size', 'source_size', 'num_examples' and
        'num_batches'.

    """
    x = tensor.tensor3('features')
    y = tensor.matrix('targets')

    emitter = TestEmitter()
    print(type(emitter.cost))

    steps = 2
    n_samples = config['target_size']
    hidden_size = config['lstm_hidden_size']

    # Four stacked LSTM layers without skip connections.
    lstm_layers = [LSTM(hidden_size) for _ in range(4)]
    lstm_stack = RecurrentStack(lstm_layers, name="transition",
                                skip_connections=False)

    # Only the stack states whose name contains 'states' feed the readout.
    state_sources = [state for state in lstm_stack.apply.states
                     if 'states' in state]
    readout = Readout(
        emitter,
        readout_dim=hidden_size,
        source_names=state_sources,
        feedback_brick=None,
        merge=None,
        merge_prototype=None,
        post_merge=None,
        merged_dim=None)

    seqgen = SequenceGenerator(readout, lstm_stack, attention=None,
                               add_contexts=False)

    # Generator-wide initializers first, then a separate bias initializer
    # for the transition, pushed down to its own children.
    seqgen.weights_init = IsotropicGaussian(0.01)
    seqgen.biases_init = Constant(0.)
    seqgen.push_initialization_config()
    seqgen.transition.biases_init = IsotropicGaussian(0.01,1)
    seqgen.transition.push_initialization_config()
    seqgen.initialize()

    # Every output of the transition is supplied as a zero-filled shared
    # variable when the cost is built.
    states = seqgen.transition.apply.outputs
    print('states',states)
    states = {
        name: shared_floatx_zeros((n_samples, hidden_size))
        for name in states
    }

    cost = seqgen.cost_matrix(x, **states).mean()
    cost.name = "nll"

    cg = ComputationGraph(cost)
    model = Model(cost)

    algorithm = GradientDescent(
        cost=cost,
        parameters=cg.parameters,
        step_rule=Scale(learning_rate=config['learning_rate']))

    # Getting the stream
    train_stream = MFCC.get_stream(
        config['batch_size'],
        config['source_size'],
        config['target_size'],
        config['num_examples'])

    # Monitoring stuff
    extensions = [
        Timing(),
        FinishAfter(after_n_batches=config['num_batches']),
        TrainingDataMonitoring([cost], prefix="train", every_n_batches=1),
        ProgressBar(),
        Printing(every_n_batches=1),
    ]

    main_loop = MainLoop(algorithm, train_stream, extensions=extensions)
    main_loop.run()
def main(mode, save_path, steps, num_batches):
    """Train a Markov-chain sequence generator or sample from a saved one.

    Parameters
    ----------
    mode : str
        ``"train"`` builds and trains the generator; ``"sample"`` loads a
        pickled main loop from ``save_path`` and prints sampling statistics.
        Any other value trips an assertion.
    save_path : str
        Checkpoint destination when training, pickle to load when sampling.
    steps : int
        Number of generation steps (``"sample"`` mode only).
    num_batches : int
        Number of batches after which training finishes (``"train"`` only).
    """
    num_states = MarkovChainDataset.num_states

    if mode == "train":
        # Experiment configuration
        rng = numpy.random.RandomState(1)
        batch_size = 50
        seq_len = 100
        dim = 10
        feedback_dim = 8

        # Build the bricks and initialize them
        transition = GatedRecurrent(name="transition", dim=dim,
                                    activation=Tanh())
        generator = SequenceGenerator(
            Readout(readout_dim=num_states, source_names=["states"],
                    emitter=SoftmaxEmitter(name="emitter"),
                    feedback_brick=LookupFeedback(
                        num_states, feedback_dim, name='feedback'),
                    name="readout"),
            transition,
            weights_init=IsotropicGaussian(0.01), biases_init=Constant(0),
            name="generator")
        generator.push_initialization_config()
        # Override the pushed scheme for the recurrent weights only.
        transition.weights_init = Orthogonal()
        generator.initialize()

        # Give an idea of what's going on.
        logger.info("Parameters:\n" +
                    pprint.pformat(
                        [(key, value.get_value().shape) for key, value
                         in Selector(generator).get_params().items()],
                        width=120))
        logger.info("Markov chain entropy: {}".format(
            MarkovChainDataset.entropy))
        logger.info("Expected min error: {}".format(
            -MarkovChainDataset.entropy * seq_len))

        # Build the cost computation graph.
        x = tensor.lmatrix('data')
        cost = aggregation.mean(
            generator.cost_matrix(x[:, :]).sum(), x.shape[1])
        cost.name = "sequence_log_likelihood"

        algorithm = GradientDescent(
            cost=cost,
            params=list(Selector(generator).get_params().values()),
            step_rule=Scale(0.001))
        main_loop = MainLoop(
            algorithm=algorithm,
            data_stream=DataStream(
                MarkovChainDataset(rng, seq_len),
                iteration_scheme=ConstantScheme(batch_size)),
            model=Model(cost),
            extensions=[
                FinishAfter(after_n_batches=num_batches),
                TrainingDataMonitoring([cost], prefix="this_step",
                                       after_batch=True),
                TrainingDataMonitoring([cost], prefix="average",
                                       every_n_batches=100),
                Checkpoint(save_path, every_n_batches=500),
                Printing(every_n_batches=100),
            ])
        main_loop.run()
    elif mode == "sample":
        # SECURITY NOTE: unpickling executes arbitrary code; only load
        # checkpoints that come from a trusted source.
        # The context manager closes the file handle, which the previous
        # `cPickle.load(open(...))` form never did.
        with open(save_path, "rb") as source:
            main_loop = cPickle.load(source)
        # NOTE(review): training stores `Model(cost)` as the main loop's
        # model, yet `.generate` is called on it here -- confirm that the
        # stored object really exposes `generate`.
        generator = main_loop.model

        sample = ComputationGraph(generator.generate(
            n_steps=steps, batch_size=1, iterate=True)).get_theano_function()

        # batch_size is 1, so keep only the single sequence of each output.
        states, outputs, costs = [data[:, 0] for data in sample()]

        numpy.set_printoptions(precision=3, suppress=True)
        print("Generation cost:\n{}".format(costs.sum()))

        freqs = numpy.bincount(outputs).astype(floatX)
        freqs /= freqs.sum()
        print("Frequencies:\n {} vs {}".format(
            freqs, MarkovChainDataset.equilibrium))

        # Empirical transition matrix from consecutive output pairs,
        # normalised per source state (row).
        trans_freqs = numpy.zeros((num_states, num_states), dtype=floatX)
        for a, b in zip(outputs, outputs[1:]):
            trans_freqs[a, b] += 1
        trans_freqs /= trans_freqs.sum(axis=1)[:, None]
        print("Transition frequencies:\n{}\nvs\n{}".format(
            trans_freqs, MarkovChainDataset.trans_prob))
    else:
        assert False, "unknown mode: {!r}".format(mode)
# The alphabet has 68 characters; each one is embedded into 100 dimensions.
from blocks.bricks.attention import SequenceContentAttention
from blocks.bricks.lookup import LookupTable

lookup = LookupTable(68, 100)
embed = lookup.apply(context)

# NOTE(review): the original author was unsure whether attended_dim should
# be 100 (embedding size) or 68 (alphabet size) -- confirm.
attention = SequenceContentAttention(
    state_names=source_names,
    attended_dim=100,
    match_dim=30,
    name="attention",
)

# The readout combines the transition sources with the first glimpse output.
readout = Readout(
    readout_dim=readout_size,
    source_names=source_names + [attention.take_glimpses.outputs[0]],
    emitter=emitter,
    name="readout",
)

generator = SequenceGenerator(
    readout=readout,
    attention=attention,
    transition=transition,
    name="generator",
)

# Initialisation schemes: the generator pushes its config to its children,
# the lookup table is initialised right away.
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.001)
generator.push_initialization_config()

lookup.weights_init = IsotropicGaussian(0.01)
lookup.biases_init = Constant(0.001)
lookup.initialize()
def main(name, epochs, batch_size, learning_rate, dim, mix_dim,
         old_model_name, max_length, bokeh, GRU, dropout, depth, max_grad,
         step_method, epsilon, sample):
    """Train (or sample from) a sketch sequence generator.

    Parameters
    ----------
    name : str
        Dataset name; the data is read from
        ``<fuel data_path>/<name>/<name>.hdf5``.
    epochs : int
        Number of epochs after which training finishes.
    batch_size, learning_rate, dim, mix_dim, dropout, depth, max_grad,
    epsilon :
        Hyper-parameters; they also take part in the job name.
    old_model_name : str
        ``'continue'`` resumes from the dump named after the job, any other
        non-empty value loads parameters from that dump, and an empty value
        initialises the bricks from scratch.
    max_length : int
        Sequences are truncated to this many steps.
    bokeh : bool
        Add a live ``Plot`` extension.
    GRU : bool
        Use a ``GatedRecurrent`` transition (only valid with ``depth == 1``).
    step_method : str
        One of ``'adam'``, ``'rmsprop'``, ``'adagrad'``, ``'adadelta'``,
        ``'scale'``.
    sample : bool
        Draw samples from a previously saved model and exit.

    Raises
    ------
    Exception
        If ``step_method`` is not one of the supported names.
    """
    #----------------------------------------------------------------------
    datasource = name

    def shnum(x):
        """ Convert a positive float into a short tag-usable string
             E.g.: 0 -> 0, 0.005 -> 53, 100 -> 1-2
        """
        return '0' if x <= 0 else '%s%d' % (
            ("%e" % x)[0], -np.floor(np.log10(x)))

    jobname = "%s-%dX%dm%dd%dr%sb%de%s" % (
        datasource, depth, dim, mix_dim, int(dropout * 10),
        shnum(learning_rate), batch_size, shnum(epsilon))
    # Only non-default settings are appended to the job name.
    if max_length != 600:
        jobname += '-L%d' % max_length
    if GRU:
        jobname += 'g'
    if max_grad != 5.:
        jobname += 'G%g' % max_grad
    if step_method != 'adam':
        jobname += step_method

    if sample:
        print("Sampling")
    else:
        print("\nRunning experiment %s" % jobname)

    #----------------------------------------------------------------------
    if depth > 1:
        transition = LSTMstack(dim=dim, depth=depth, name="transition",
                               lstm_name="transition")
        assert not GRU
    elif GRU:
        transition = GatedRecurrent(dim=dim, name="transition")
    else:
        transition = LSTM(dim=dim, name="transition")

    emitter = SketchEmitter(mix_dim=mix_dim, epsilon=epsilon, name="emitter")
    readout = Readout(readout_dim=emitter.get_dim('inputs'),
                      source_names=['states'],
                      emitter=emitter,
                      name="readout")
    # A distinct loop variable keeps the `name` argument from being
    # rebound (list comprehensions leak their variable on Python 2).
    normal_inputs = [seq_name for seq_name in transition.apply.sequences
                     if 'mask' not in seq_name]
    fork = Fork(normal_inputs, prototype=Linear(use_bias=True))
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  fork=fork)

    # Initialization settings
    generator.weights_init = OrthogonalGlorot()
    generator.biases_init = Constant(0)

    # Build the cost computation graph [steps, batch_size, 3]
    x = T.tensor3('features', dtype=floatX)[:max_length, :, :]
    x.tag.test_value = np.ones((max_length, batch_size, 3)).astype(np.float32)
    cost = generator.cost(x)
    cost.name = "sequence_log_likelihood"

    # Give an idea of what's going on
    model = Model(cost)
    params = model.get_params()
    logger.info("Parameters:\n" +
                pprint.pformat(
                    [(key, value.get_value().shape)
                     for key, value in params.items()],
                    width=120))
    # Count every element of every parameter, whatever its rank (the
    # previous formula only handled 1-D and 2-D shapes).
    model_size = sum(int(np.prod(value.get_value().shape))
                     for value in params.values())
    logger.info("Total number of parameters %d" % model_size)

    #------------------------------------------------------------
    extensions = []
    if old_model_name == 'continue':
        extensions.append(LoadFromDump(jobname))
    elif old_model_name:
        # or you can just load the weights without state using:
        old_params = LoadFromDump(old_model_name).manager.load_parameters()
        model.set_param_values(old_params)
    else:
        # Initialize parameters
        for brick in model.get_top_bricks():
            brick.initialize()

    if sample:
        assert old_model_name and old_model_name != 'continue'
        Sample(generator, steps=max_length, path='.').do(None)
        exit(0)

    #------------------------------------------------------------
    # Define the training algorithm.
    cg = ComputationGraph(cost)
    if dropout > 0.:
        from blocks.roles import OUTPUT
        dropout_target = VariableFilter(roles=[OUTPUT],
                                        bricks=[transition],
                                        name_regex='states')(cg.variables)
        cg = apply_dropout(cg, dropout_target, dropout)
        cost = cg.outputs[0]

    if step_method == 'adam':
        step_rule = Adam(learning_rate)
    elif step_method == 'rmsprop':
        step_rule = RMSProp(learning_rate, decay_rate=0.95)
    elif step_method == 'adagrad':
        step_rule = AdaGrad(learning_rate)
    elif step_method == 'adadelta':
        step_rule = AdaDelta()
    elif step_method == 'scale':
        # NOTE(review): this branch ignores `learning_rate` and uses a
        # fixed 0.1 -- confirm that this is intentional.
        step_rule = Scale(learning_rate=0.1)
    else:
        raise Exception('Unknown step method %s' % step_method)

    # Clip the gradient step before applying the chosen rule.
    step_rule = CompositeRule([StepClipping(max_grad), step_rule])
    algorithm = GradientDescent(cost=cost,
                                params=cg.parameters,
                                step_rule=step_rule)

    #------------------------------------------------------------
    observables = [cost]

    # Fetch variables useful for debugging
    (energies, ) = VariableFilter(
        applications=[generator.readout.readout],
        name_regex="output")(cg.variables)
    (activations, ) = VariableFilter(
        applications=[generator.transition.apply],
        name=generator.transition.apply.states[0])(cg.variables)
    min_energy = named_copy(energies.min(), "min_energy")
    max_energy = named_copy(energies.max(), "max_energy")
    mean_activation = named_copy(abs(activations).mean(), "mean_activation")
    observables += [min_energy, max_energy, mean_activation]
    observables += [algorithm.total_step_norm, algorithm.total_gradient_norm]
    for param_name, param in params.items():
        observables.append(named_copy(param.norm(2), param_name + "_norm"))
        observables.append(
            named_copy(algorithm.gradients[param].norm(2),
                       param_name + "_grad_norm"))

    #------------------------------------------------------------
    datasource_fname = os.path.join(fuel.config.data_path, datasource,
                                    datasource + '.hdf5')

    train_ds = H5PYDataset(datasource_fname,
                           which_set='train',
                           sources=('features', ),
                           load_in_memory=True)
    train_stream = DataStream(train_ds,
                              iteration_scheme=ShuffledScheme(
                                  train_ds.num_examples, batch_size))

    test_ds = H5PYDataset(datasource_fname,
                          which_set='test',
                          sources=('features', ),
                          load_in_memory=True)
    test_stream = DataStream(test_ds,
                             iteration_scheme=SequentialScheme(
                                 test_ds.num_examples, batch_size))

    train_stream = Mapping(train_stream, _transpose)
    test_stream = Mapping(test_stream, _transpose)

    def stream_stats(ds, label):
        """Print how many batches and examples one epoch of `ds` yields."""
        itr = ds.get_epoch_iterator(as_dict=True)
        batch_count = 0
        examples_count = 0
        for batch in itr:
            batch_count += 1
            examples_count += batch['features'].shape[1]
        print('%s #batch %d #examples %d' %
              (label, batch_count, examples_count))

    stream_stats(train_stream, 'train')
    stream_stats(test_stream, 'test')

    extensions += [
        Timing(every_n_batches=10),
        TrainingDataMonitoring(observables, prefix="train",
                               every_n_batches=10),
        DataStreamMonitoring(
            [cost], test_stream,
            prefix="test",
            on_resumption=True,
            after_epoch=False,  # by default this is True
            every_n_batches=100),
        # all monitored data is ready so print it...
        # (next steps may take more time and we want to see the
        # results as soon as possible so print as soon as you can)
        Printing(every_n_batches=10),
        # perform multiple dumps at different intervals
        # so if one of them breaks (has nan) we can hopefully
        # find a model from few batches ago in the other
        Dump(jobname, every_n_batches=11),
        Dump(jobname + '.test', every_n_batches=100),
        Sample(generator, steps=max_length,
               path=jobname + '.test',
               every_n_batches=100),
        ProgressBar(),
        FinishAfter(after_n_epochs=epochs)
        # This shows a way to handle NaN emerging during
        # training: simply finish it.
        .add_condition("after_batch", _is_nan),
    ]

    if bokeh:
        extensions.append(Plot('sketch', channels=[['cost'], ]))

    # Construct the main loop and start training!
    main_loop = MainLoop(model=model,
                         data_stream=train_stream,
                         algorithm=algorithm,
                         extensions=extensions)
    main_loop.run()
# Emitter built from the GMM MLP, `frame_size` outputs and `k` components.
gmm_emitter = GMMEmitter(gmmmlp=mlp_gmm, output_size=frame_size, k=k)

# Only the 'states' entries of the transition feed attention and readout.
source_names = [name for name in transition.apply.states
                if 'states' in name]

attention = SimpleSequenceAttention(
    state_names=source_names,
    state_dims=[hidden_size_recurrent],
    attended_dim=context_size,
    name="attention",
)

# NOTE(review): the original author left a reminder to verify these
# source names.
readout = Readout(
    readout_dim=hidden_size_recurrent,
    source_names=source_names + ['feedback', 'glimpses'],
    emitter=gmm_emitter,
    feedback_brick=feedback,
    name="readout",
)

generator = SequenceGenerator(
    readout=readout,
    transition=transition,
    attention=attention,
    name="generator",
)

mlp_context = MLP(activations=activations_context, dims=dims_context)

# Bricks that receive the Gaussian weight initialisation below.
bricks = [mlp_context]
for brick in bricks:
    brick.weights_init = IsotropicGaussian(0.01)
def __init__(self, config, vocab_size):
    """Build the symbolic graph of an attention-based question/context model.

    Creates the Theano input variables, embeds question and context with a
    shared lookup table, encodes both with bidirectional LSTM stacks,
    attends over the context encoding, and scores ``config.n_entities``
    outputs restricted to the given candidates.

    Sets ``self.generator``, ``self.sgd_cost``, ``self.monitor_vars`` and
    ``self.monitor_vars_valid``; every collected brick is initialised with
    ``config.weights_init`` / ``config.biases_init``.
    """
    floatX = theano.config.floatX

    # Symbolic inputs.
    question = tensor.imatrix('question')
    question_mask = tensor.imatrix('question_mask')
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.ivector('answer')
    candidates = tensor.imatrix('candidates')
    candidates_mask = tensor.imatrix('candidates_mask')

    bricks = []

    # Swap the two axes of the sequence inputs and their masks.
    question = question.dimshuffle(1, 0)
    question_mask = question_mask.dimshuffle(1, 0)
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)

    # One lookup table embeds both question and context.
    embed = LookupTable(vocab_size, config.embed_size, name='question_embed')
    bricks.append(embed)
    qembed = embed.apply(question)
    cembed = embed.apply(context)

    qlstms, qhidden_list = make_bidir_lstm_stack(
        qembed, config.embed_size, question_mask.astype(floatX),
        config.question_lstm_size, config.question_skip_connections, 'q')
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size, context_mask.astype(floatX),
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + qlstms + clstms

    # Question encoding: last time step of the selected hidden sequences.
    if config.question_skip_connections:
        qenc_dim = 2 * sum(config.question_lstm_size)
        q_layers = qhidden_list
    else:
        qenc_dim = 2 * config.question_lstm_size[-1]
        q_layers = qhidden_list[-2:]
    qenc = tensor.concatenate([h[-1, :, :] for h in q_layers], axis=1)
    qenc.name = 'qenc'

    # Context encoding: full hidden sequences concatenated on the last axis.
    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # factor 2: fw & bw
        c_layers = chidden_list
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        c_layers = chidden_list[-2:]
    cenc = tensor.concatenate(c_layers, axis=2)
    cenc.name = 'cenc'

    # Attention bricks: two linear projections followed by an MLP that
    # ends in a single Identity-activated unit.
    first_hidden = config.attention_mlp_hidden[0]
    attention_mlp = MLP(
        dims=config.attention_mlp_hidden + [1],
        activations=config.attention_mlp_activations[1:] + [Identity()],
        name='attention_mlp')
    attention_qlinear = Linear(input_dim=qenc_dim,
                               output_dim=first_hidden,
                               name='attq')  # Wum
    attention_clinear = Linear(input_dim=cenc_dim,
                               output_dim=first_hidden,
                               use_bias=False,
                               name='attc')  # Wym
    bricks.extend([attention_mlp, attention_qlinear, attention_clinear])

    # Project the flattened context, restore its leading axes, then add
    # the projected question broadcast over the first axis.
    ctx_flat = cenc.reshape((cenc.shape[0] * cenc.shape[1], cenc.shape[2]))
    ctx_proj = attention_clinear.apply(ctx_flat).reshape(
        (cenc.shape[0], cenc.shape[1], first_hidden))
    q_proj = attention_qlinear.apply(qenc)[None, :, :]
    layer1 = Tanh().apply(ctx_proj + q_proj)
    layer1.name = 'layer1'

    att_weights = attention_mlp.apply(
        layer1.reshape((layer1.shape[0] * layer1.shape[1], layer1.shape[2])))
    att_weights.name = 'att_weights_0'
    att_weights = att_weights.reshape((layer1.shape[0], layer1.shape[1]))
    att_weights.name = 'att_weights'

    # Softmax over the first axis, then a weighted sum of the context.
    normalized = tensor.nnet.softmax(att_weights.T).T
    attended = tensor.sum(cenc * normalized[:, :, None], axis=0)
    attended.name = 'attended'
    # NOTE(review): `attended.shape` is a symbolic expression, so this %d
    # formatting looks like leftover debugging -- confirm it runs.
    print("attended shape: %d" % attended.shape)

    # Sequence generator sized to the combined encoding width.
    dimension = qenc_dim + cenc_dim
    transition = SimpleRecurrent(activation=Tanh(), dim=dimension,
                                 name="transition")
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0]],
        emitter=SoftmaxEmitter(name="emitter"),
        feedback_brick=LookupFeedback(vocab_size, dimension),
        name="readout")
    generator = SequenceGenerator(readout=readout, transition=transition,
                                  name="generator")
    self.generator = generator
    bricks.append(generator)
    # NOTE(review): called without arguments and overwritten further down
    # -- looks unfinished; confirm the intended inputs.
    cost = self.generator.cost()

    # Output MLP over the attended context joined with the question code.
    out_mlp = MLP(
        dims=[cenc_dim + qenc_dim] + config.out_mlp_hidden
        + [config.n_entities],
        activations=config.out_mlp_activations + [Identity()],
        name='out_mlp')
    bricks.append(out_mlp)
    probs = out_mlp.apply(tensor.concatenate([attended, qenc], axis=1))
    probs.name = 'probs'

    # Entities that are not valid candidates get a score of -1000.
    entity_ids = tensor.arange(config.n_entities, dtype='int32')
    masked_candidates = tensor.switch(candidates_mask, candidates,
                                      -tensor.ones_like(candidates))
    is_candidate = tensor.eq(entity_ids[None, None, :],
                             masked_candidates[:, :, None]).sum(axis=1)
    probs = tensor.switch(is_candidate, probs,
                          -1000 * tensor.ones_like(probs))

    # Prediction, cost and error rate.
    pred = probs.argmax(axis=1)
    cost = Softmax().categorical_cross_entropy(answer, probs).mean()
    error_rate = tensor.neq(answer, pred).mean()

    # Optional weight noise and dropout on the recurrent hidden states.
    cg = ComputationGraph([cost, error_rate])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, qhidden_list + chidden_list, config.dropout)
    cost_reg, error_rate_reg = cg.outputs

    # Names and monitored quantities.
    cost_reg.name = 'cost'
    cost.name = 'cost'
    error_rate_reg.name = 'error_rate'
    error_rate.name = 'error_rate'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg], [error_rate_reg]]
    self.monitor_vars_valid = [[cost], [error_rate]]

    # Initialise every collected brick with the configured schemes.
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def test_integer_sequence_generator():
    """Check a sequence generator that emits integer outputs.

    Generators of this kind can be used, for example, to model language.
    """
    rng = numpy.random.RandomState(1234)

    readout_dim = 5
    feedback_dim = 3
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = GatedRecurrent(dim=dim, activation=Tanh(),
                                weights_init=Orthogonal())
    readout = Readout(
        readout_dim=readout_dim,
        source_names=["states"],
        emitter=SoftmaxEmitter(theano_seed=1234),
        feedback_brick=LookupFeedback(readout_dim, feedback_dim))
    generator = SequenceGenerator(
        readout, transition,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0),
        seed=1234)
    generator.initialize()

    # --- 'cost_matrix' method -------------------------------------------
    y = tensor.lmatrix('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2
    costs_fun = theano.function([y, mask], [costs])

    y_test = rng.randint(readout_dim, size=(n_steps, batch_size))
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 482.827, rtol=1e-5)

    # --- 'cost' method --------------------------------------------------
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_fun = theano.function([y, mask], [cost])
    cost_val = cost_fun(y_test, m_test)
    assert_allclose(cost_val, 16.0942, rtol=1e-5)

    # --- AUXILIARY variable 'per_sequence_element' of 'cost' -------------
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join(
        [generator.name, generator.cost.name, 'per_sequence_element'])
    matching = [var for var in var_filter(cg.variables)
                if var.name == aux_var_name]
    cost_per_el = matching[0]
    assert cost_per_el.ndim == 0
    per_el_fun = theano.function([y, mask], [cost_per_el])
    cost_per_el_val = per_el_fun(y_test, m_test)
    assert_allclose(cost_per_el_val, 1.60942, rtol=1e-5)

    # --- 'generate' method ----------------------------------------------
    states, outputs, costs = generator.generate(
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    cg = ComputationGraph(states + outputs + costs)
    generate_fun = theano.function([], [states, outputs, costs],
                                   updates=cg.updates)
    states_val, outputs_val, costs_val = generate_fun()
    assert states_val.shape == (n_steps, batch_size, dim)
    assert outputs_val.shape == (n_steps, batch_size)
    assert outputs_val.dtype == 'int64'
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(states_val.sum(), -17.91811, rtol=1e-5)
    assert_allclose(costs_val.sum(), 482.863, rtol=1e-5)
    assert outputs_val.sum() == 630

    # --- masked-out steps must not influence the cost -------------------
    # Column 1 of the second batch holds the same two labels as the first
    # batch, followed by one step whose mask is 0.
    cost1 = costs_fun([[1], [2]], [[1], [1]])[0]
    cost2 = costs_fun([[3, 1], [4, 2], [2, 0]],
                      [[1, 1], [1, 1], [1, 0]])[0]
    assert_allclose(cost1.sum(), cost2[:, 1].sum(), rtol=1e-5)
def test_sequence_generator():
    """Check a sequence generator without contexts and continuous outputs.

    Generators of this kind can be used, for example, to model dynamical
    systems.
    """
    rng = numpy.random.RandomState(1234)

    output_dim = 1
    dim = 20
    batch_size = 30
    n_steps = 10

    transition = SimpleRecurrent(activation=Tanh(), dim=dim,
                                 weights_init=Orthogonal())
    readout = Readout(readout_dim=output_dim,
                      source_names=["states"],
                      emitter=TestEmitter())
    generator = SequenceGenerator(
        readout, transition,
        weights_init=IsotropicGaussian(0.1),
        biases_init=Constant(0.0),
        seed=1234)
    generator.initialize()

    # --- 'cost_matrix' method -------------------------------------------
    y = tensor.tensor3('y')
    mask = tensor.matrix('mask')
    costs = generator.cost_matrix(y, mask)
    assert costs.ndim == 2

    y_test = rng.uniform(
        size=(n_steps, batch_size, output_dim)).astype(floatX)
    m_test = numpy.ones((n_steps, batch_size), dtype=floatX)
    costs_fun = theano.function([y, mask], [costs])
    costs_val = costs_fun(y_test, m_test)[0]
    assert costs_val.shape == (n_steps, batch_size)
    assert_allclose(costs_val.sum(), 115.593, rtol=1e-5)

    # --- 'cost' method --------------------------------------------------
    cost = generator.cost(y, mask)
    assert cost.ndim == 0
    cost_fun = theano.function([y, mask], [cost])
    cost_val = cost_fun(y_test, m_test)
    assert_allclose(cost_val, 3.8531, rtol=1e-5)

    # --- AUXILIARY variable 'per_sequence_element' of 'cost' -------------
    cg = ComputationGraph([cost])
    var_filter = VariableFilter(roles=[AUXILIARY])
    aux_var_name = '_'.join(
        [generator.name, generator.cost.name, 'per_sequence_element'])
    matching = [var for var in var_filter(cg.variables)
                if var.name == aux_var_name]
    cost_per_el = matching[0]
    assert cost_per_el.ndim == 0
    per_el_fun = theano.function([y, mask], [cost_per_el])
    cost_per_el_val = per_el_fun(y_test, m_test)
    assert_allclose(cost_per_el_val, 0.38531, rtol=1e-5)

    # --- 'generate' method ----------------------------------------------
    initial_states = rng.uniform(size=(batch_size, dim)).astype(floatX)
    generated = generator.generate(
        states=initial_states,
        iterate=True, batch_size=batch_size, n_steps=n_steps)
    states, outputs, costs = [variable.eval() for variable in generated]
    assert states.shape == (n_steps, batch_size, dim)
    assert outputs.shape == (n_steps, batch_size, output_dim)
    assert costs.shape == (n_steps, batch_size)
    assert_allclose(outputs.sum(), -0.33683, rtol=1e-5)
    assert_allclose(states.sum(), 15.7909, rtol=1e-5)
    # Generation is deterministic here, so it carries no cost.
    assert_allclose(costs.sum(), 0.0)
def __init__(self, config, vocab_size):
    """Build the symbolic graph of a context-to-answer sequence generator.

    The context is embedded and encoded with a bidirectional LSTM stack; an
    attention-equipped ``SequenceGenerator`` with a ``GatedRecurrent``
    transition is then trained to produce the answer sequence.

    Sets ``self.predictions``, ``self.sgd_cost``, ``self.monitor_vars`` and
    ``self.monitor_vars_valid``, and initialises all bricks it creates.
    """
    floatX = theano.config.floatX

    # Symbolic inputs.
    context = tensor.imatrix('context')
    context_mask = tensor.imatrix('context_mask')
    answer = tensor.imatrix('answer')
    answer_mask = tensor.imatrix('answer_mask')

    bricks = []

    # Swap the two axes of every input and mask.
    context = context.dimshuffle(1, 0)
    context_mask = context_mask.dimshuffle(1, 0)
    answer = answer.dimshuffle(1, 0)
    answer_mask = answer_mask.dimshuffle(1, 0)

    context_bag = to_bag(context, vocab_size)
    ctx_mask_float = context_mask.astype(floatX)

    # Embedding table. Initialising it from pre-trained embeddings
    # ('embeddings/vocab_embeddings.txt') is currently disabled.
    embed = LookupTable(vocab_size, config.embed_size, name='embed')
    embed.weights_init = IsotropicGaussian(0.01)

    # Context encoding from the bidirectional LSTM stack.
    cembed = embed.apply(context)
    clstms, chidden_list = make_bidir_lstm_stack(
        cembed, config.embed_size, ctx_mask_float,
        config.ctx_lstm_size, config.ctx_skip_connections, 'ctx')
    bricks = bricks + clstms

    if config.ctx_skip_connections:
        cenc_dim = 2 * sum(config.ctx_lstm_size)  # factor 2: fw & bw
        encoded_layers = chidden_list
    else:
        cenc_dim = 2 * config.ctx_lstm_size[-1]
        encoded_layers = chidden_list[-2:]
    cenc = tensor.concatenate(encoded_layers, axis=2)
    cenc.name = 'cenc'

    # Generator bricks.
    transition = GatedRecurrent(activation=Tanh(),
                                dim=config.generator_lstm_size,
                                name="transition")
    attention = SequenceContentAttention(
        state_names=transition.apply.states,
        attended_dim=cenc_dim,
        match_dim=config.generator_lstm_size,
        name="attention")
    emitter = MaskedSoftmaxEmitter(context_bag=context_bag, name='emitter')
    feedback = LookupFeedback(vocab_size, config.feedback_size)
    readout = Readout(
        readout_dim=vocab_size,
        source_names=[transition.apply.states[0],
                      attention.take_glimpses.outputs[0]],
        emitter=emitter,
        feedback_brick=feedback,
        name="readout")
    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  attention=attention,
                                  name="generator")

    cost = generator.cost(answer,
                          answer_mask.astype(floatX),
                          attended=cenc,
                          attended_mask=ctx_mask_float,
                          name="cost")

    # generate() returns several outputs; index 1 is kept as predictions.
    generated = generator.generate(n_steps=7,
                                   batch_size=config.batch_size,
                                   attended=cenc,
                                   attended_mask=ctx_mask_float,
                                   iterate=True)
    self.predictions = generated[1]

    # Optional weight noise and dropout on the encoder hidden states.
    cg = ComputationGraph([cost])
    if config.w_noise > 0:
        noise_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, noise_vars, config.w_noise)
    if config.dropout > 0:
        cg = apply_dropout(cg, chidden_list, config.dropout)
    (cost_reg,) = cg.outputs

    # Names and monitored quantities.
    cost.name = 'cost'
    cost_reg.name = 'cost_reg'

    self.sgd_cost = cost_reg
    self.monitor_vars = [[cost_reg]]
    self.monitor_vars_valid = [[cost_reg]]

    # The generator is initialised by hand (the original comment marks
    # this as something to change).
    generator.weights_init = IsotropicGaussian(0.01)
    generator.biases_init = Constant(0)
    generator.push_allocation_config()
    generator.push_initialization_config()
    # Set after the push so only the transition weights are overridden.
    transition.weights_init = Orthogonal()
    generator.initialize()

    # Remaining bricks.
    embed.initialize()
    for brick in bricks:
        brick.weights_init = config.weights_init
        brick.biases_init = config.biases_init
        brick.initialize()
def __init__(self, vocab_size, embedding_dim, state_dim,
             representation_dim, context_dim, target_transition,
             theano_seed=None, loss_function='cross_entropy', **kwargs):
    """Assemble the decoder: transition, attention, readout and generator.

    Parameters
    ----------
    vocab_size : int
        Readout dimension, also passed to the feedback lookup.
    embedding_dim : int
        Feedback embedding size and width of the pre-softmax layer.
    state_dim : int
        Size of the recurrent state; also used as the attention match
        dimension and the merged readout dimension.
    representation_dim : int
        Dimension of the attended representation.
    context_dim : int
        Passed through to ``target_transition``.
    target_transition : callable
        Factory for the recurrent transition; called with
        ``attended_dim``, ``context_dim``, ``dim``, ``activation`` and
        ``name`` keyword arguments.
    theano_seed : int, optional
        Seed handed to the softmax emitter.
    loss_function : str
        ``'cross_entropy'`` or ``'min_risk'``; selects the generator class.
    **kwargs
        Forwarded to the parent initialiser.

    Raises
    ------
    ValueError
        If ``loss_function`` is not one of the supported names.
    """
    super(InitialContextDecoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = target_transition(attended_dim=state_dim,
                                        context_dim=context_dim,
                                        dim=state_dim,
                                        activation=Tanh(),
                                        name='decoder')

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    readout = Readout(
        source_names=[
            'states',
            'feedback',
            # Chris: it's key that we're taking the first output of
            # self.attention.take_glimpses.outputs: the first output is
            # the weighted avgs, the second is the weights in (batch, time)
            self.attention.take_glimpses.outputs[0],
        ],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Floor division keeps the dimension an integer even under
            # true division (Python 3 / `from __future__ import division`).
            Linear(input_dim=state_dim // 2,
                   output_dim=embedding_dim,
                   use_bias=False,
                   name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply,
        ]),
        merged_dim=state_dim)

    def _make_fork():
        # Fork over every transition sequence input except the mask.
        return Fork([seq_name
                     for seq_name in self.transition.apply.sequences
                     if seq_name != 'mask'],
                    prototype=Linear())

    # Build sequence generator accordingly
    if loss_function == 'cross_entropy':
        self.sequence_generator = InitialContextSequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=_make_fork())
    elif loss_function == 'min_risk':
        self.sequence_generator = MinRiskInitialContextSequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=_make_fork())
        # the name is important, because it lets us match the brick
        # hierarchy names for the vanilla SequenceGenerator
        # to load pretrained models
        # TODO: quick hack to fix bug
        self.sequence_generator.name = 'initialcontextsequencegenerator'
    else:
        raise ValueError(
            'The decoder does not support the loss function: {}'.format(
                loss_function))

    # TODO: uncomment this!!
    # self.sequence_generator.name = 'sequencegenerator'
    self.children = [self.sequence_generator]
def __init__(self,
             recordings_source, labels_source, eos_label,
             num_features, num_phonemes,
             dim_dec, dims_bidir, dims_bottom,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             lm=None, character_map=None,
             subsample=None,
             dims_top=None,
             prior=None, conv_n=None,
             bottom_activation=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             **kwargs):
    """Build the encoder, attention, readout and generator bricks.

    Parameters
    ----------
    recordings_source, labels_source
        Names used for the Theano input variables created at the end of
        the constructor (the masks are named ``<source>_mask``).
    eos_label, data_prepend_eos
        Stored on the instance unchanged.
    num_features
        Input dimension of the bottom MLP, or of the encoder when
        ``dims_bottom`` is empty.
    num_phonemes
        Readout dimension. The feedback bricks are sized
        ``num_phonemes + 1`` and the softmax emitter's initial output is
        ``num_phonemes``.
    dim_dec
        Dimension of each decoder transition and of the feedback lookup;
        also the default for ``dim_matcher``.
    dims_bidir
        Passed to ``Encoder``; twice its last entry is the attended
        dimension.
    dims_bottom
        Layer dimensions of the bottom MLP. When empty or ``None`` an
        ``Identity`` brick is used instead.
    enc_transition, dec_transition
        Transition passed to ``Encoder`` and callable used to build the
        decoder transition(s), respectively.
    use_states_for_readout
        When true, the transition states are added to the readout sources.
    attention_type
        ``"content"`` or ``"content_and_conv"``.
    lm
        Optional mapping configuring a language model. The keys
        ``weight``, ``normalize_am_weights``, ``normalize_lm_weights``,
        ``normalize_tot_weights`` and ``am_beta`` are consumed here; the
        rest is forwarded to ``LanguageModel``. The caller's mapping is
        not modified.
    character_map
        Forwarded to ``LanguageModel`` as ``nn_char_map``.
    subsample
        Passed to ``Encoder``; defaults to ``[1] * len(dims_bidir)``.
    dims_top
        Hidden dimensions of the optional MLP placed on top of the
        encoder.
    prior, conv_n, conv_num_filters, energy_normalizer
        Forwarded to ``SequenceContentAndConvAttention``.
    bottom_activation, post_merge_activation
        Activation bricks; each defaults to a fresh ``Tanh()``.
    post_merge_dims
        When given, a post-merge network with these dimensions is added
        to the readout.
    dim_matcher
        ``match_dim`` of the attention brick.
    embed_outputs
        Selects ``LookupFeedback`` (true) or ``OneOfNFeedback`` (false).
    dec_stack
        Number of decoder transitions; values other than 1 build a
        ``RecurrentStack`` with skip connections.

    Raises
    ------
    ValueError
        If ``attention_type`` is not recognized.
    """
    if bottom_activation is None:
        bottom_activation = Tanh()
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.recordings_source = recordings_source
    self.labels_source = labels_source
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    if dims_bottom:
        bottom = MLP([bottom_activation] * len(dims_bottom),
                     [num_features] + dims_bottom,
                     name="bottom")
    else:
        bottom = Identity(name='bottom')

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    # Truthiness (not ``len``) so that ``dims_bottom=None`` is handled the
    # same way as by the ``if dims_bottom`` check above.
    encoder = Encoder(self.enc_transition, dims_bidir,
                      dims_bottom[-1] if dims_bottom else num_features,
                      subsample)

    # The top part, on top of BiRNN but before the attention
    # NOTE(review): a single activation is paired with len(dims_top) + 2
    # dimensions here -- confirm that MLP accepts this combination.
    if dims_top:
        top = MLP([Tanh()],
                  [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                  name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        transitions = [
            self.dec_transition(dim=dim_dec, activation=Tanh(),
                                name="transition_{}".format(trans_level))
            for trans_level in range(dec_stack)
        ]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError(
            "Unknown attention type {}".format(attention_type))

    if embed_outputs:
        feedback = LookupFeedback(num_phonemes + 1, dim_dec)
    else:
        feedback = OneOfNFeedback(num_phonemes + 1)

    if lm:
        # In case we use LM it is Readout that is responsible
        # for normalization.
        emitter = LMEmitter()
    else:
        emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                 name="emitter")

    readout_config = dict(
        readout_dim=num_phonemes,
        source_names=(transition.apply.states
                      if use_states_for_readout else []) +
                     [attention.take_glimpses.outputs[0]],
        emitter=emitter,
        feedback_brick=feedback,
        name="readout")
    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence(
            [
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP(
                    [post_merge_activation] * (len(post_merge_dims) - 1) +
                    [Identity()],
                    # MLP was designed to support Maxout as activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick
                    # below. For deeper Maxout network one has to use the
                    # Sequence brick.
                    [
                        d // getattr(post_merge_activation,
                                     'num_pieces', 1)
                        for d in post_merge_dims
                    ] + [num_phonemes]).apply,
            ],
            name='post_merge')
    # When ``lm`` is given this plain Readout is replaced further down.
    readout = Readout(**readout_config)

    language_model = None
    if lm:
        # Work on a copy so the caller's configuration is left intact and
        # can be reused to build another recognizer.
        lm = dict(lm)
        lm_weight = lm.pop('weight', 0.0)
        normalize_am_weights = lm.pop('normalize_am_weights', True)
        normalize_lm_weights = lm.pop('normalize_lm_weights', False)
        normalize_tot_weights = lm.pop('normalize_tot_weights', False)
        am_beta = lm.pop('am_beta', 1.0)
        if (normalize_am_weights + normalize_lm_weights +
                normalize_tot_weights) < 1:
            logger.warning(
                "Beam search is prone to fail with no log-prob normalization"
            )
        language_model = LanguageModel(nn_char_map=character_map, **lm)
        readout = ShallowFusionReadout(
            lm_costs_name='lm_add',
            lm_weight=lm_weight,
            normalize_am_weights=normalize_am_weights,
            normalize_lm_weights=normalize_lm_weights,
            normalize_tot_weights=normalize_tot_weights,
            am_beta=am_beta,
            **readout_config)

    generator = SequenceGenerator(
        readout=readout, transition=transition, attention=attention,
        language_model=language_model,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]

    # Create input variables
    self.recordings = tensor.tensor3(self.recordings_source)
    self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
    self.labels = tensor.lmatrix(self.labels_source)
    self.labels_mask = tensor.matrix(self.labels_source + "_mask")
    # All four entries are symbolic variables: the recordings mask, not
    # the source-name string, belongs next to the recordings.
    self.batch_inputs = [self.recordings, self.recordings_mask,
                         self.labels, self.labels_mask]
    self.single_recording = tensor.matrix(self.recordings_source)
    self.single_transcription = tensor.lvector(self.labels_source)
# Build and initialize a sequence generator whose emitter is a GMMEmitter.
# hidden_size_recurrent, activations_theta, dims_theta, target_size, k,
# frame_size, feedback and context_size are defined elsewhere in the file.

# Recurrent transition of the generator.
transition = LSTM(dim=hidden_size_recurrent)

# The MLP is wrapped by GMMMLP, which in turn is handed to the emitter.
# NOTE(review): k is presumably the number of mixture components -- confirm.
mlp_theta = MLP(activations=activations_theta, dims=dims_theta)
mlp_gmm = GMMMLP(mlp=mlp_theta, dim=target_size, k=k, const=0.00001)
emitter = GMMEmitter(gmmmlp=mlp_gmm, output_size=frame_size, k=k,
                     name="emitter")

# The same list of names is used as the readout sources and as the
# attention state names.
source_names = ['states']
readout = Readout(readout_dim=hidden_size_recurrent,
                  source_names=source_names,
                  emitter=emitter,
                  feedback_brick=feedback,
                  name="readout")
attention = SimpleSequenceAttention(state_names=source_names,
                                    state_dims=[hidden_size_recurrent],
                                    attended_dim=context_size)

generator = SequenceGenerator(readout=readout,
                              transition=transition,
                              attention=attention,
                              name="generator")

# Set the initialization schemes on the generator, then initialize it.
generator.weights_init = IsotropicGaussian(0.01)
generator.biases_init = Constant(0.)
generator.initialize()
# Load the training data; createDataset also returns the vocabulary size,
# which sizes the readout and the feedback lookup below.
train_data, vocab_size = createDataset(corpus=corpus, sequence_length=750,
                                       repeat=20)

if args.mode == "train":
    seq_len = 100
    dim = 100
    feedback_dim = 100

    # Build the bricks and initialize them
    transition = GatedRecurrent(name="transition", dim=dim,
                                activation=Tanh())
    generator = SequenceGenerator(
        Readout(readout_dim=vocab_size,
                source_names=["states"],  # transition.apply.states ???
                emitter=SoftmaxEmitter(name="emitter"),
                feedback_brick=LookupFeedback(vocab_size, feedback_dim,
                                              name='feedback'),
                name="readout"),
        transition,
        weights_init=IsotropicGaussian(0.01),
        biases_init=Constant(0),
        name="generator")
    # The transition's weights_init is set to Orthogonal() after the
    # generator's initialization config has been pushed and before
    # initialize() is called.
    generator.push_initialization_config()
    transition.weights_init = Orthogonal()
    generator.initialize()

    # Build the cost computation graph.
    x = tensor.lmatrix('inchar')
    cost = generator.cost(outputs=x)