def test_variable_filter():
    """Exercise VariableFilter selection by role, brick, name, regex
    and application on a small Linear -> Sigmoid -> Bias graph."""
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Sigmoid(name='sigm')
    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    y = brick2.apply(h2)
    cg = ComputationGraph(y)
    parameters = [brick1.W, brick1.b, brick2.params[0]]
    bias = [brick1.b, brick2.params[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances.  (The original repeated this
    # identical check a second time under a slightly different comment;
    # the duplicate was removed.)
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application, list form
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)
def _build_bricks(self, *args, **kwargs):
    """Construct and initialize every brick of the model: the word and
    hashtag lookup tables, the MLSTM text encoder, the hashtag-to-word
    projection, and the character-level sparse word encoder.

    All dimensions come from ``self.config``; vocabulary sizes come from
    ``self.dataset``.  ``*args``/``**kwargs`` are accepted for interface
    compatibility but not used here.
    """
    # Build lookup tables
    self.word_embed = self._embed(len(self.dataset.word2index),
                                  self.config.word_embed_dim,
                                  name='word_embed')
    self.hashtag_embed = self._embed(len(self.dataset.hashtag2index),
                                     self.config.lstm_dim,
                                     name='hashtag_embed')
    # Build text encoder: a linear projection into the four LSTM gate
    # blocks (hence the 4 * lstm_dim output) followed by the MLSTM.
    self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim,
                            output_dim=4 * self.config.lstm_dim,
                            name='mlstm_in')
    # He-style scaling: sqrt(2) over sqrt(fan-in + fan-out)
    self.mlstm_ins.weights_init = IsotropicGaussian(
        std=numpy.sqrt(2) / numpy.sqrt(self.config.word_embed_dim +
                                       self.config.lstm_dim))
    self.mlstm_ins.biases_init = Constant(0)
    self.mlstm_ins.initialize()
    self.mlstm = MLSTM(self.config.lstm_time, self.config.lstm_dim,
                       shared=False)
    self.mlstm.weights_init = IsotropicGaussian(
        std=numpy.sqrt(2) / numpy.sqrt(self.config.word_embed_dim +
                                       self.config.lstm_dim))
    self.mlstm.biases_init = Constant(0)
    self.mlstm.initialize()
    # Project a hashtag embedding into word-embedding space.
    self.hashtag2word = MLP(
        activations=[Tanh('hashtag2word_tanh')],
        dims=[self.config.lstm_dim, self.config.word_embed_dim],
        name='hashtag2word_mlp')
    self.hashtag2word.weights_init = IsotropicGaussian(
        std=1 / numpy.sqrt(self.config.word_embed_dim))
    self.hashtag2word.biases_init = Constant(0)
    self.hashtag2word.initialize()
    self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
    self.hashtag2word_bias.biases_init = Constant(0)
    self.hashtag2word_bias.initialize()
    # Build character embedding
    self.char_embed = self._embed(len(self.dataset.char2index),
                                  self.config.char_embed_dim,
                                  name='char_embed')
    # Build sparse word encoder: char embeddings -> RNN over characters.
    self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                          output_dim=self.config.word_embed_dim,
                          name='rnn_in')
    self.rnn_ins.weights_init = IsotropicGaussian(
        std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim +
                                       self.config.word_embed_dim))
    self.rnn_ins.biases_init = Constant(0)
    self.rnn_ins.initialize()
    self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                               activation=Tanh())
    # NOTE(review): no biases_init is set for self.rnn before initialize();
    # presumably SimpleRecurrent has no biases here — confirm.
    self.rnn.weights_init = IsotropicGaussian(
        std=1 / numpy.sqrt(self.config.word_embed_dim))
    self.rnn.initialize()
def __init__(self, emitter=None, feedback_brick=None, merge=None,
             merge_prototype=None, post_merge=None, merged_dim=None,
             **kwargs):
    """Assemble a Readout from its four components.

    Any component that is not supplied is replaced by a trivial default
    sized from ``kwargs['readout_dim']`` (and ``kwargs['source_names']``
    for the merge brick).  The components are appended to ``children``
    before delegating to the parent constructor.
    """
    # Substitute trivial defaults for anything the caller left out.
    if not emitter:
        emitter = TrivialEmitter(kwargs['readout_dim'])
    if not feedback_brick:
        feedback_brick = TrivialFeedback(kwargs['readout_dim'])
    if not merge:
        merge = Merge(input_names=kwargs['source_names'],
                      prototype=merge_prototype)
    if not post_merge:
        post_merge = Bias(dim=kwargs['readout_dim'])
    if not merged_dim:
        merged_dim = kwargs['readout_dim']

    # Record the resolved components on the instance.
    self.emitter = emitter
    self.feedback_brick = feedback_brick
    self.merge = merge
    self.post_merge = post_merge
    self.merged_dim = merged_dim

    # Register the sub-bricks so the parent class manages them.
    children = [self.emitter, self.feedback_brick,
                self.merge, self.post_merge]
    kwargs.setdefault('children', []).extend(children)
    super(Readout, self).__init__(**kwargs)
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim,
             theano_seed=None, **kwargs):
    """Build the decoder: a GRU transition with a learned initial state,
    content-based attention, and a sequence generator wired around a
    Readout with a maxout post-merge stack.

    :param vocab_size: size of the target vocabulary.
    :param embedding_dim: dimensionality of target word embeddings.
    :param state_dim: decoder hidden-state dimensionality.
    :param representation_dim: dimensionality of the attended encoder
        representation.
    :param theano_seed: optional seed forwarded to the emitter.
    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state.
    self.transition = GRUInitialState(attended_dim=state_dim,
                                      dim=state_dim,
                                      activation=Tanh(),
                                      name='decoder')

    # Initialize the attention mechanism.
    self.attention = SequenceContentAttention2(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")

    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=NewSoftmaxEmitter(initial_output=-1,
                                  theano_seed=theano_seed),
        feedback_brick=NewLookupFeedback(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Maxout with two pieces halves the dimension; use floor
            # division so the dim stays an int under Python 3 (was
            # `state_dim / 2`, which yields a float there).
            Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim)

    # Build sequence generator accordingly.
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()),
        cost_type='categorical_cross_entropy')
    self.children = [self.sequence_generator]
def __init__(self, emitter=None, feedback_brick=None, merge=None,
             merge_prototype=None, post_merge=None, merged_dim=None,
             **kwargs):
    """Assemble a Readout from its four components.

    The parent constructor runs first (it sets ``self.readout_dim`` and
    ``self.source_names`` from ``kwargs``); components that were not
    supplied are then replaced by trivial defaults sized from those
    attributes.
    """
    super(Readout, self).__init__(**kwargs)

    # Substitute trivial defaults for anything the caller left out.
    if not emitter:
        emitter = TrivialEmitter(self.readout_dim)
    if not feedback_brick:
        feedback_brick = TrivialFeedback(self.readout_dim)
    if not merge:
        merge = Merge(input_names=self.source_names,
                      prototype=merge_prototype)
    if not post_merge:
        post_merge = Bias(dim=self.readout_dim)
    if not merged_dim:
        merged_dim = self.readout_dim

    # Record the resolved components and register them as children.
    self.emitter = emitter
    self.feedback_brick = feedback_brick
    self.merge = merge
    self.post_merge = post_merge
    self.merged_dim = merged_dim
    self.children = [self.emitter, self.feedback_brick,
                     self.merge, self.post_merge]
def _build_bricks(self, *args, **kwargs):
    """Build the parent model's bricks, then add the word-shift MLP and
    bias that combine the user embedding with each word embedding.
    """
    super(AttentionEUTHM2, self)._build_bricks()
    # MLP mapping a concatenated [user_embed, word_embed] vector back to
    # word-embedding space.
    self.word_shift = MLP(
        activations=[Tanh('word_shift_tanh')],
        dims=[
            self.config.user_embed_dim + self.config.word_embed_dim,
            self.config.word_embed_dim
        ],
        name='word_shift_mlp')
    # Scale by 1/sqrt(fan-in) of the concatenated input.
    self.word_shift.weights_init = IsotropicGaussian(
        std=1 / numpy.sqrt(self.config.word_embed_dim +
                           self.config.user_embed_dim))
    self.word_shift.biases_init = Constant(0)
    self.word_shift.initialize()
    # Scalar bias added to the shifted word vectors.
    self.word_shift_bias = Bias(dim=1, name='word_shift_bias')
    self.word_shift_bias.biases_init = Constant(0)
    self.word_shift_bias.initialize()
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim,
             topical_dim, theano_seed=None, **kwargs):
    """Build a decoder with both content attention over the encoder
    representation and a second attention over topical embeddings.

    :param vocab_size: size of the target vocabulary.
    :param embedding_dim: dimensionality of target word embeddings.
    :param state_dim: decoder hidden-state dimensionality.
    :param representation_dim: dimensionality of the attended encoder
        representation.
    :param topical_dim: dimensionality of the attended topical sequence.
    :param theano_seed: optional seed forwarded to the emitter.
    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed
    # self.topical_dim = topical_dim

    # Initialize gru with special initial state
    self.transition = GRUInitialState(
        attended_dim=state_dim,
        dim=state_dim,
        activation=Tanh(),
        name='decoder')

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")
    # Original author's note: not sure whether the match dim is correct.
    self.topical_attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=topical_dim,
        match_dim=state_dim,
        name="topical_attention")

    # Initialize the readout; SoftmaxEmitter emits -1 for initial
    # outputs, which is what LookupFeedbackWMT15 expects.
    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence(
            [Bias(dim=state_dim, name='maxout_bias').apply,
             Maxout(num_pieces=2, name='maxout').apply,
             # Maxout halves the dimension; `//` keeps it an int under
             # Python 3 (was `state_dim / 2`, a float there).
             Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                    use_bias=False, name='softmax0').apply,
             Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim)

    # Build sequence generator accordingly.
    # NOTE(review): 'topical_embeddingq' looks like a typo for
    # 'topical_embedding' — left unchanged because it must match the name
    # SequenceGenerator expects; confirm against that class.
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        topical_attention=self.topical_attention,
        topical_name='topical_embeddingq',
        content_name='content_embedding',
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()))
    self.children = [self.sequence_generator]
def __init__(self, vocab_size, embedding_dim, igru_state_dim, igru_depth,
             trg_dgru_depth, emitter, feedback_brick, merge=None,
             merge_prototype=None, post_merge=None, **kwargs):
    """Build the interpolator: an IGRU stack over character states, a
    merge/post-merge readout path, an embedding lookup and the final
    softmax projection.

    :param vocab_size: size of the output vocabulary.
    :param embedding_dim: dimensionality of the lookup embeddings.
    :param igru_state_dim: hidden dimensionality of the IGRU (also used
        as the merged readout dimension).
    :param igru_depth: number of stacked IGRU layers.
    :param trg_dgru_depth: depth of the target decoder GRU (stored only).
    :param emitter: brick producing outputs from readouts.
    :param feedback_brick: brick feeding outputs back into the network.
    :param merge: optional merge brick; defaults to a Merge over
        ``kwargs['source_names']``.
    :param merge_prototype: prototype transformation for the default merge.
    :param post_merge: optional post-merge brick; defaults to a Bias.
    """
    merged_dim = igru_state_dim
    if not merge:
        merge = Merge(input_names=kwargs['source_names'],
                      prototype=merge_prototype)
    if not post_merge:
        post_merge = Bias(dim=merged_dim)
    # for compatible: a depth-1 stack is just a single IGRU; deeper
    # stacks add UpperIGRU layers with skip connections.
    if igru_depth == 1:
        self.igru = IGRU(dim=igru_state_dim)
    else:
        self.igru = RecurrentStack(
            [IGRU(dim=igru_state_dim, name='igru')] + [
                UpperIGRU(dim=igru_state_dim,
                          activation=Tanh(),
                          name='upper_igru' + str(i))
                for i in range(1, igru_depth)
            ],
            skip_connections=True)
    self.embedding_dim = embedding_dim
    self.emitter = emitter
    self.feedback_brick = feedback_brick
    self.merge = merge
    self.post_merge = post_merge
    self.merged_dim = merged_dim
    self.igru_depth = igru_depth
    self.trg_dgru_depth = trg_dgru_depth
    self.lookup = LookupTable(name='embeddings')
    self.vocab_size = vocab_size
    self.igru_state_dim = igru_state_dim
    # Final projection from IGRU states to vocabulary logits.
    self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                 output_dim=vocab_size)
    # Fork every IGRU input sequence except the mask and the recurrent
    # input states.
    self.gru_fork = Fork([
        name for name in self.igru.apply.sequences
        if name != 'mask' and name != 'input_states'
    ],
                         prototype=Linear(),
                         name='gru_fork')
    # Register all sub-bricks before delegating to the parent.
    children = [
        self.emitter, self.feedback_brick, self.merge, self.post_merge,
        self.igru, self.lookup, self.gru_to_softmax, self.gru_fork
    ]
    kwargs.setdefault('children', []).extend(children)
    super(Interpolator, self).__init__(**kwargs)
class AttentionEUTHM2(AttentionEUTHM):
    """User-text-hashtag model variant that shifts word embeddings by the
    user embedding before encoding."""

    def __init__(self, config, dataset, *args, **kwargs):
        '''
        Define a user-text-hashtag model with negative sampling.

        :param config: model configuration object.
        :param dataset: User-text-hashtag dataset.
        '''
        # NOTE(review): *args/**kwargs are accepted but not forwarded to
        # the parent constructor — confirm this is intentional.
        super(AttentionEUTHM2, self).__init__(config, dataset)

    def _get_doc_embed(self, *args, **kwargs):
        """Return the document embedding: word vectors concatenated with
        the (broadcast) user vector, passed through the word-shift MLP
        plus scalar bias, then encoded."""
        text_vec = self._get_text_vec()
        user_vec = self.user_embed.apply(self.user)
        # Broadcast the per-batch user vector across all time steps by
        # indexing a 1 x batch x dim view with a zero index per step.
        text_vec = tensor.concatenate([
            text_vec, user_vec[None, :, :][tensor.zeros(
                shape=(text_vec.shape[0], ), dtype='int32')]
        ],
                                      axis=2)
        text_vec = self.word_shift.apply(text_vec) + \
            self.word_shift_bias.parameters[0][0]
        return self._encode_text_vec(text_vec)

    def _build_bricks(self, *args, **kwargs):
        """Build the parent bricks, then the word-shift MLP and bias."""
        super(AttentionEUTHM2, self)._build_bricks()
        # MLP mapping [user_embed, word_embed] back to word-embedding
        # space.
        self.word_shift = MLP(
            activations=[Tanh('word_shift_tanh')],
            dims=[
                self.config.user_embed_dim + self.config.word_embed_dim,
                self.config.word_embed_dim
            ],
            name='word_shift_mlp')
        # Scale by 1/sqrt(fan-in) of the concatenated input.
        self.word_shift.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim +
                               self.config.user_embed_dim))
        self.word_shift.biases_init = Constant(0)
        self.word_shift.initialize()
        # Scalar bias added to the shifted word vectors.
        self.word_shift_bias = Bias(dim=1, name='word_shift_bias')
        self.word_shift_bias.biases_init = Constant(0)
        self.word_shift_bias.initialize()
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim,
             **kwargs):
    """Build the decoder: GRU transition with learned initial state,
    content attention, and a sequence generator around a Readout whose
    merge uses a biased Linear prototype.

    :param vocab_size: size of the target vocabulary.
    :param embedding_dim: dimensionality of target word embeddings.
    :param state_dim: decoder hidden-state dimensionality.
    :param representation_dim: dimensionality of the attended encoder
        representation.
    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim

    self.transition = GRUInitialState(attended_dim=state_dim,
                                      dim=state_dim,
                                      activation=Tanh(),
                                      name='decoder')
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        name="attention")

    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Maxout halves the dimension; `//` keeps it an int under
            # Python 3 (was `state_dim / 2`, a float there).
            Linear(input_dim=state_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim,
        merge_prototype=Linear(use_bias=True))

    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([name for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()))
    self.children = [self.sequence_generator]
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim,
             **kwargs):
    """Build a decoder around GatedRecurrentWithContext: a Readout fed
    by states, feedback and a readout context, plus a fork over the
    transition's contexts and states.

    :param vocab_size: size of the target vocabulary.
    :param embedding_dim: dimensionality of target word embeddings.
    :param state_dim: decoder hidden-state dimensionality.
    :param representation_dim: dimensionality of the context
        representation.
    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim

    readout = Readout(
        source_names=['states', 'feedback', 'readout_context'],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(),
        feedback_brick=LookupFeedback(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=1000).apply,
            Maxout(num_pieces=2).apply,
            # Maxout halves the dimension; `//` keeps it an int under
            # Python 3 (was `state_dim / 2`, a float there).
            Linear(input_dim=state_dim // 2, output_dim=100,
                   use_bias=False).apply,
            Linear(input_dim=100).apply
        ]),
        merged_dim=1000)

    self.transition = GatedRecurrentWithContext(Tanh(), dim=state_dim,
                                                name='decoder')
    # Readout will apply the linear transformation to 'readout_context'
    # with a Merge brick, so no need to fork it here
    self.fork = Fork([
        name for name in self.transition.apply.contexts +
        self.transition.apply.states if name != 'readout_context'
    ],
                     prototype=Linear())
    self.tanh = Tanh()
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        fork_inputs=[
            name for name in self.transition.apply.sequences
            if name != 'mask'
        ],
    )
    self.children = [self.fork, self.sequence_generator, self.tanh]
def __init__(self, ref_data, output_dim):
    """Build a stacked denoising autoencoder over the reference data,
    then a classifier whose intermediate layer weights are generated
    from the encoded reference rows, with optional dropout/noise/L1
    regularization (controlled by module-level hyper-parameters).

    :param ref_data: 2-D array of reference features (rows are items).
    :param output_dim: number of output classes.
    """
    input_dim = ref_data.shape[1]
    ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                name='ref_data')
    rng = RandomStreams()

    # --- Stacked denoising autoencoder over the reference data ---
    ae_bricks = []
    ae_input = ref_data_sh
    ae_costs = []
    for i, (idim, odim) in enumerate(zip([input_dim] + ae_dims[:-1],
                                         ae_dims)):
        ae_mlp = MLP(activations=[ae_activations[i]],
                     dims=[idim, odim],
                     name='enc%i' % i)
        enc = ae_mlp.apply(ae_input)
        # Decoder reconstructs from a *noisy* encoding (denoising AE).
        enc_n = ae_mlp.apply(ae_input + rng.normal(size=ae_input.shape,
                                                   std=ae_f_noise_std))
        ae_mlp_dec = MLP(activations=[ae_activations[i]],
                         dims=[odim, idim],
                         name='dec%i' % i)
        dec = ae_mlp_dec.apply(enc_n)
        # Reconstruction error plus L1 sparsity penalty on the encoding.
        cost = tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() + \
            ae_l1_pen * abs(enc).sum(axis=1).mean()
        ae_costs.append(cost)
        ae_input = enc
        ae_bricks = ae_bricks + [ae_mlp, ae_mlp_dec]
    self.ae_costs = ae_costs
    ref_data_enc = ae_input

    # --- Construct the model ---
    j = tensor.lvector('j')
    r = ref_data_enc[j, :]
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[ae_dims[-1]] + hidden_dims + [n_inter],
              name='inter_gen')
    mlp2 = MLP(activations=activation_functions_2 + [None],
               dims=[n_inter] + hidden_dims_2 + [output_dim],
               name='end_mlp')

    inter_weights = mlp.apply(r)

    # Fixed: was `inter_bias == None`; identity comparison is the
    # correct (and PEP 8) way to test for None.
    if inter_bias is None:
        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = ibias.apply(tensor.dot(x, inter_weights))
    else:
        inter = tensor.dot(x, inter_weights) - inter_bias
    inter = inter_act_fun.apply(inter)

    final = mlp2.apply(inter)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)

    pred = final.argmax(axis=1)
    # error_rate = tensor.neq(y, pred).mean()
    ber = balanced_error_rate.ber(y, pred)

    # Initialize parameters
    for brick in ae_bricks + [mlp, mlp2]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply regularization
    cg = ComputationGraph([cost, ber])
    # Fixed: s_dropout_vars was only defined inside the s_dropout branch
    # but read in the a_dropout branch (NameError when s_dropout == 0).
    s_dropout_vars = []

    if r_dropout != 0:
        # - dropout on input vector r : r_dropout
        cg = apply_dropout(cg, [r], r_dropout)
    if x_dropout != 0:
        cg = apply_dropout(cg, [x], x_dropout)
    if s_dropout != 0:
        # - dropout on intermediate layers of first mlp : s_dropout
        s_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([inter_weights]))) -
            set([inter_weights]))
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)
    if i_dropout != 0:
        # - dropout on input to second mlp : i_dropout
        cg = apply_dropout(cg, [inter], i_dropout)
    if a_dropout != 0:
        # - dropout on hidden layers of second mlp : a_dropout
        a_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([final]))) -
            set([inter_weights]) - set(s_dropout_vars))
        cg = apply_dropout(cg, a_dropout_vars, a_dropout)
    if r_noise_std != 0:
        cg = apply_noise(cg, [r], r_noise_std)
    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)
    [cost_reg, ber_reg] = cg.outputs

    # Optional L1 penalties on the two MLPs and the intermediate layer.
    if s_l1pen != 0:
        s_weights = VariableFilter(bricks=mlp.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + s_l1pen * sum(abs(w).sum()
                                            for w in s_weights)
    if i_l1pen != 0:
        cost_reg = cost_reg + i_l1pen * abs(inter).sum()
    if a_l1pen != 0:
        a_weights = VariableFilter(bricks=mlp2.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + a_l1pen * sum(abs(w).sum()
                                            for w in a_weights)

    self.cost = cost
    self.cost_reg = cost_reg
    self.ber = ber
    self.ber_reg = ber_reg
    self.pred = pred
    self.confidence = confidence
def test_variable_filter():
    """Exercise VariableFilter selection by role, brick, name, regex,
    theano name and application, including Merge output filtering."""
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name="linear1")
    brick2 = Bias(2, name="bias1")
    activation = Logistic(name="sigm")

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances.  (The original repeated this
    # identical check a second time; the duplicate was removed.)
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name="W_norm")
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex="W_no.?m")
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name="h2act")
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex="h2a.?t")
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application, list form
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)

    # Testing filtering of Merge outputs
    input1 = tensor.matrix("input1")
    input2 = tensor.matrix("input2")
    merge = Merge(["input1", "input2"], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(roles=[OUTPUT],
                             bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
def __init__(self, ref_data, output_dim):
    """Build an encoder/decoder over the reference data plus a
    classifier whose intermediate weights are generated from the encoded
    reference rows; regularized cost includes a reconstruction penalty.

    :param ref_data: 2-D array of reference features (rows are items).
    :param output_dim: number of output classes.
    """
    input_dim = ref_data.shape[1]
    ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                name='ref_data')

    # Construct the model
    j = tensor.lvector('j')
    r = ref_data_sh[j, :]
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp0 = MLP(activations=activation_functions_0,
               dims=[input_dim] + hidden_dims_0,
               name='e0')
    # Linear decoder used only for the reconstruction penalty.
    mlp0vs = MLP(activations=[None],
                 dims=[hidden_dims_0[-1], input_dim],
                 name='de0')
    mlp1 = MLP(activations=activation_functions_1,
               dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter],
               name='inter_gen')
    mlp2 = MLP(activations=activation_functions_2 + [None],
               dims=[n_inter] + hidden_dims_2 + [output_dim],
               name='end_mlp')

    encod = mlp0.apply(r)
    rprime = mlp0vs.apply(encod)
    inter_weights = mlp1.apply(encod)

    ibias = Bias(n_inter)
    ibias.biases_init = Constant(0)
    ibias.initialize()
    inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights)))

    final = mlp2.apply(inter)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)

    pred = final.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in [mlp0, mlp0vs, mlp1, mlp2]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply regularization
    cg = ComputationGraph([cost, error_rate])
    # Fixed: s_dropout_vars was only defined inside the s_dropout branch
    # but read in the a_dropout branch (NameError when s_dropout == 0).
    s_dropout_vars = []

    if r_dropout != 0:
        # - dropout on input vector r : r_dropout
        cg = apply_dropout(cg, [r], r_dropout)
    if s_dropout != 0:
        # - dropout on intermediate layers of first mlp : s_dropout
        s_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([inter_weights]))) -
            set([inter_weights]))
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)
    if i_dropout != 0:
        # - dropout on input to second mlp : i_dropout
        cg = apply_dropout(cg, [inter], i_dropout)
    if a_dropout != 0:
        # - dropout on hidden layers of second mlp : a_dropout
        a_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([final]))) -
            set([inter_weights]) - set(s_dropout_vars))
        cg = apply_dropout(cg, a_dropout_vars, a_dropout)
    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)
    [cost_reg, error_rate_reg] = cg.outputs

    # add reconstruction penalty for AE part
    penalty_val = tensor.sqrt(((r - rprime) ** 2).sum(axis=1)).mean()
    cost_reg = cost_reg + reconstruction_penalty * penalty_val

    self.cost = cost
    self.cost_reg = cost_reg
    self.error_rate = error_rate
    self.error_rate_reg = error_rate_reg
    self.pred = pred
    self.confidence = confidence
def test_variable_filter():
    """Exercise VariableFilter selection by role, brick, name, regex,
    theano name, application (bound and unbound) and call identifier."""
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Logistic(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x, call_id='brick1_call_id')
    h2 = activation.apply(h1, call_id='act')
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances.  (The original repeated this
    # identical check a second time; the duplicate was removed.)
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name='h2act')
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex='h2a.?t')
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    brick1_apply_variables = [cg.variables[1], cg.variables[8]]

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    assert brick1_apply_variables == appli_filter(cg.variables)

    # Testing filtering by unbound application
    unbound_appli_filter = VariableFilter(applications=[Linear.apply])
    assert brick1_apply_variables == unbound_appli_filter(cg.variables)

    # Testing filtering by call identifier
    call_id_filter = VariableFilter(call_id='brick1_call_id')
    assert brick1_apply_variables == call_id_filter(cg.variables)

    # Testing filtering of Merge outputs
    input1 = tensor.matrix('input1')
    input2 = tensor.matrix('input2')
    merge = Merge(['input1', 'input2'], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(
        roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
def __init__(self, ref_data, output_dim):
    """Build a stacked denoising autoencoder over the reference data,
    then a classifier with generated intermediate-layer weights and
    optional dropout/noise/L1 regularization (controlled by module-level
    hyper-parameters).

    :param ref_data: 2-D array of reference features (rows are items).
    :param output_dim: number of output classes.
    """
    input_dim = ref_data.shape[1]
    ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                name='ref_data')
    rng = RandomStreams()

    # --- Stacked denoising autoencoder over the reference data ---
    ae_bricks = []
    ae_input = ref_data_sh
    ae_costs = []
    for i, (idim, odim) in enumerate(zip([input_dim] + ae_dims[:-1],
                                         ae_dims)):
        ae_mlp = MLP(activations=[ae_activations[i]],
                     dims=[idim, odim],
                     name='enc%i' % i)
        enc = ae_mlp.apply(ae_input)
        # Decoder reconstructs from a *noisy* encoding (denoising AE).
        enc_n = ae_mlp.apply(
            ae_input + rng.normal(size=ae_input.shape, std=ae_f_noise_std))
        ae_mlp_dec = MLP(activations=[ae_activations[i]],
                         dims=[odim, idim],
                         name='dec%i' % i)
        dec = ae_mlp_dec.apply(enc_n)
        # Reconstruction error plus L1 sparsity penalty on the encoding.
        cost = tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() + \
            ae_l1_pen * abs(enc).sum(axis=1).mean()
        ae_costs.append(cost)
        ae_input = enc
        ae_bricks = ae_bricks + [ae_mlp, ae_mlp_dec]
    self.ae_costs = ae_costs
    ref_data_enc = ae_input

    # --- Construct the model ---
    j = tensor.lvector('j')
    r = ref_data_enc[j, :]
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[ae_dims[-1]] + hidden_dims + [n_inter],
              name='inter_gen')
    mlp2 = MLP(activations=activation_functions_2 + [None],
               dims=[n_inter] + hidden_dims_2 + [output_dim],
               name='end_mlp')

    inter_weights = mlp.apply(r)

    # Fixed: was `inter_bias == None`; identity comparison is the
    # correct (and PEP 8) way to test for None.
    if inter_bias is None:
        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = ibias.apply(tensor.dot(x, inter_weights))
    else:
        inter = tensor.dot(x, inter_weights) - inter_bias
    inter = inter_act_fun.apply(inter)

    final = mlp2.apply(inter)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)

    pred = final.argmax(axis=1)
    # error_rate = tensor.neq(y, pred).mean()
    ber = balanced_error_rate.ber(y, pred)

    # Initialize parameters
    for brick in ae_bricks + [mlp, mlp2]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply regularization
    cg = ComputationGraph([cost, ber])
    # Fixed: s_dropout_vars was only defined inside the s_dropout branch
    # but read in the a_dropout branch (NameError when s_dropout == 0).
    s_dropout_vars = []

    if r_dropout != 0:
        # - dropout on input vector r : r_dropout
        cg = apply_dropout(cg, [r], r_dropout)
    if x_dropout != 0:
        cg = apply_dropout(cg, [x], x_dropout)
    if s_dropout != 0:
        # - dropout on intermediate layers of first mlp : s_dropout
        s_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([inter_weights]))) -
            set([inter_weights]))
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)
    if i_dropout != 0:
        # - dropout on input to second mlp : i_dropout
        cg = apply_dropout(cg, [inter], i_dropout)
    if a_dropout != 0:
        # - dropout on hidden layers of second mlp : a_dropout
        a_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([final]))) -
            set([inter_weights]) - set(s_dropout_vars))
        cg = apply_dropout(cg, a_dropout_vars, a_dropout)
    if r_noise_std != 0:
        cg = apply_noise(cg, [r], r_noise_std)
    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)
    [cost_reg, ber_reg] = cg.outputs

    # Optional L1 penalties on the two MLPs and the intermediate layer.
    if s_l1pen != 0:
        s_weights = VariableFilter(bricks=mlp.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + s_l1pen * sum(
            abs(w).sum() for w in s_weights)
    if i_l1pen != 0:
        cost_reg = cost_reg + i_l1pen * abs(inter).sum()
    if a_l1pen != 0:
        a_weights = VariableFilter(bricks=mlp2.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + a_l1pen * sum(
            abs(w).sum() for w in a_weights)

    self.cost = cost
    self.cost_reg = cost_reg
    self.ber = ber
    self.ber_reg = ber_reg
    self.pred = pred
    self.confidence = confidence
def __init__(self, ref_data, output_dim):
    """Build an ensemble of ``nparts`` sub-models, each seeing a random
    Bernoulli-sampled subset of the reference-data columns, and average
    their outputs.

    :param ref_data: 2-D array-like of reference data (rows indexed by ``j``).
    :param output_dim: number of output classes.

    Side effects: sets ``self.cost, self.cost_reg, self.error_rate,
    self.error_rate_reg, self.pred, self.confidence``.
    NOTE: relies on module-level configuration globals (``nparts``,
    ``part_r_proba``, ``hidden_dims_*``, dropout settings, ...).
    """
    ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                name='ref_data')

    # Construct the model
    j = tensor.lvector('j')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    last_outputs = []
    s_dropout_vars = []
    r_dropout_vars = []
    i_dropout_vars = []
    penalties = []
    for i in range(nparts):
        # Random binary column mask: each part sees its own feature subset.
        # NOTE(review): numpy.random is sampled at graph-construction time,
        # so the subsets are fixed for the life of the model.
        fs = numpy.random.binomial(1, part_r_proba,
                                   size=(ref_data.shape[1],))
        input_dim = int(fs.sum())
        fs_sh = theano.shared(fs)
        r = ref_data_sh[j, :][:, fs_sh.nonzero()[0]]

        # Per-part bricks: encoder, decoder (reconstruction), interaction
        # generator and final classifier.
        mlp0 = MLP(activations=activation_functions_0,
                   dims=[input_dim] + hidden_dims_0,
                   name='enc%d' % i)
        mlp0r = MLP(activations=[None],
                    dims=[hidden_dims_0[-1], input_dim],
                    name='dec%d' % i)
        mlp1 = MLP(activations=activation_functions_1,
                   dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter],
                   name='inter_gen_%d' % i)
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp_%d' % i)

        encod = mlp0.apply(r)
        rprime = mlp0r.apply(encod)
        inter_weights = mlp1.apply(encod)

        ibias = Bias(n_inter, name='inter_bias_%d' % i)
        inter = ibias.apply(tensor.dot(x, inter_weights))
        inter = inter_act_fun.apply(inter)

        out = mlp2.apply(inter)

        # Reconstruction penalty for this part ([None] keeps it 1-D so the
        # penalties can be concatenated below).
        penalties.append(
            tensor.sqrt(((rprime - r) ** 2).sum(axis=1)).mean()[None])

        last_outputs.append(out)
        r_dropout_vars.append(r)
        s_dropout_vars = s_dropout_vars + (
            VariableFilter(bricks=[Tanh], name='output')
            (ComputationGraph([inter_weights])))
        i_dropout_vars.append(inter)

        # Initialize parameters of this part's bricks.
        # NOTE(review): placement inside the per-part loop inferred from the
        # fact that the bricks are loop-locals — confirm against original
        # formatting.
        for brick in [mlp0, mlp0r, mlp1, mlp2, ibias]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

    # Average the per-part outputs along a new third axis.
    # NOTE(review): the comprehension variable `x` leaks in Python 2 and
    # shadows the input matrix `x` after this line; harmless here because
    # `x` is not used again, but fragile.
    final = tensor.concatenate([x[:, :, None] for x in last_outputs],
                               axis=2).mean(axis=2)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)
    pred = final.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # apply regularization
    cg = ComputationGraph([cost, error_rate])
    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)
    if s_dropout != 0:
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)
    if r_dropout != 0:
        cg = apply_dropout(cg, r_dropout_vars, r_dropout)
    if i_dropout != 0:
        cg = apply_dropout(cg, i_dropout_vars, i_dropout)
    [cost_reg, error_rate_reg] = cg.outputs

    # Add the summed per-part reconstruction penalties to the regularized cost.
    cost_reg = cost_reg + reconstruction_penalty * \
        tensor.concatenate(penalties, axis=0).sum()

    self.cost = cost
    self.cost_reg = cost_reg
    self.error_rate = error_rate
    self.error_rate_reg = error_rate_reg
    self.pred = pred
    self.confidence = confidence
def __init__(self, input_dims, input_num_chars,
             bos_label, eos_label, num_labels,
             dim_dec, dims_bidir,
             enc_transition, dec_transition,
             use_states_for_readout,
             attention_type,
             criterion,
             bottom,
             lm=None, token_map=None, bidir=True, window_size=None,
             max_length=None, subsample=None,
             dims_top=None, extra_input_dim=None,
             prior=None, conv_n=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dim_output_embedding=None,
             reuse_bottom_lookup_table=False,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=1,
             # for criterions involving generation of outputs, whether
             # or not they should be generated by the recognizer itself
             generate_predictions=True,
             compute_targets=True,
             extra_generation_steps=3,
             **kwargs):
    """Build the full encoder-decoder recognizer: bottom feature network,
    (Bi)RNN encoder, optional top MLP, attention, and a sequence generator
    whose readout is selected by ``criterion['name']``.

    For the 'actor_critic' criterion a second, recursively-constructed
    EncoderDecoder is embedded as the critic.
    """
    # Snapshot of all constructor arguments — reused below to build the
    # critic network for the actor-critic criterion.
    all_arguments = copy.deepcopy(locals())
    all_arguments.update(copy.deepcopy(kwargs))
    del all_arguments['kwargs']
    del all_arguments['self']

    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(EncoderDecoder, self).__init__(**kwargs)
    self.bos_label = bos_label
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion
    self.generate_predictions = generate_predictions
    self.extra_generation_steps = extra_generation_steps
    self.compute_targets = compute_targets

    self.max_decoded_length_scale = max_decoded_length_scale

    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN.  `bottom` arrives as a config dict whose
    # 'bottom_class' entry names the brick class to instantiate.
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(
        input_dims=input_dims, input_num_chars=input_num_chars,
        name='bottom',
        **bottom)

    # BiRNN (or convolutional encoder when only window_size is given)
    if dims_bidir:
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition, dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample, bidir=bidir)
    elif window_size:
        encoder = ConvEncoder(
            max_length, bottom.get_dim(bottom.apply.outputs[0]), window_size)
    else:
        raise ValueError("Don't know which Encoder to use")
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [dim_encoded] + dims_top + [dim_encoded], name="top")
    else:
        top = Identity(name='top')

    # Decoder transition: a single RNN, or a stack with skip connections.
    if dec_stack == 1:
        transition = self.dec_transition(
            dim=dim_dec, activation=Tanh(), name="transition")
    else:
        # extra inputs are not supported for stacked decoders
        assert not extra_input_dim
        transitions = [self.dec_transition(
                           dim=dim_dec,
                           activation=Tanh(),
                           name="transition_{}".format(trans_level))
                       for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=dim_encoded, match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=dim_encoded, match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError("Unknown attention type {}"
                         .format(attention_type))

    if not embed_outputs:
        raise ValueError("embed_outputs=False is not supported any more")
    if not reuse_bottom_lookup_table:
        embedding = LookupTable(num_labels + 1,
                                dim_dec
                                if dim_output_embedding is None
                                else dim_output_embedding)
    else:
        # Share the bottom's lookup table as the output embedding
        embedding = bottom.children[0]
    feedback = Feedback(
        embedding=embedding,
        output_names=[s for s in transition.apply.sequences
                      if s != 'mask'])

    # Create a readout
    readout_config = dict(
        num_tokens=num_labels,
        input_names=(transition.apply.states
                     if use_states_for_readout
                     else []) + [attention.take_glimpses.outputs[0]],
        name="readout")
    if post_merge_dims:
        readout_config['merge_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence([
            Bias(post_merge_dims[0]).apply,
            post_merge_activation.apply,
            MLP([post_merge_activation] * (len(post_merge_dims) - 1)
                + [Identity()],
                # MLP was designed to support Maxout is activation
                # (because Maxout in a way is not one). However
                # a single layer Maxout network works with the trick below.
                # For deeper Maxout network one has to use the
                # Sequence brick.
                [d // getattr(post_merge_activation, 'num_pieces', 1)
                 for d in post_merge_dims]
                + [num_labels]).apply,
        ], name='post_merge')

    # Reward brick for reward-based criterions
    if 'reward' in criterion and criterion['name'] != 'log_likelihood':
        if criterion['reward'] == 'edit_distance':
            readout_config['reward_brick'] = EditDistanceReward(
                self.bos_label, self.eos_label)
        elif criterion['reward'] == 'delta_edit_distance':
            readout_config['reward_brick'] = EditDistanceReward(
                self.bos_label, self.eos_label, deltas=True)
        elif criterion['reward'] == 'bleu':
            readout_config['reward_brick'] = BleuReward(
                self.bos_label, self.eos_label, deltas=False)
        elif criterion['reward'] == 'delta_bleu':
            readout_config['reward_brick'] = BleuReward(
                self.bos_label, self.eos_label, deltas=True)
        else:
            raise ValueError("Unknown reward type")

    # Select the readout class and its extra configuration
    if criterion['name'] == 'log_likelihood':
        readout_class = SoftmaxReadout
    elif criterion['name'] == 'critic':
        readout_class = CriticReadout
        criterion_copy = dict(criterion)
        del criterion_copy['name']
        readout_config.update(**criterion_copy)
    elif criterion['name'] == 'reinforce':
        readout_class = ReinforceReadout
        readout_config['merge_names'] = list(readout_config['input_names'])
        readout_config['entropy'] = criterion.get('entropy')
        readout_config['input_names'] += ['attended', 'attended_mask']
    elif criterion['name'] in ['sarsa', 'actor_critic']:
        readout_class = ActorCriticReadout
        if criterion['name'] == 'actor_critic':
            # Build a critic network by re-invoking this constructor with
            # modified arguments.
            critic_arguments = dict(all_arguments)
            # No worries, critic will not compute log likelihood values.
            critic_arguments['criterion'] = {
                'name': 'critic',
                'value_softmax': criterion.get('value_softmax'),
                'same_value_for_wrong': criterion.get('same_value_for_wrong'),
                'groundtruth_word_bonus': criterion.get('groundtruth_word_bonus'),
                'dueling_outputs': criterion.get('dueling_outputs')}
            critic_arguments['name'] = 'critic'
            if criterion.get('critic_uses_actor_states'):
                critic_arguments['extra_input_dim'] = dim_dec
            if (criterion.get('value_softmax')
                    or criterion.get('same_value_for_wrong')
                    or criterion.get('dueling_outputs')):
                # Add an extra output for the critic
                critic_arguments['num_labels'] = num_labels + 1
            if criterion.get('force_bidir'):
                critic_arguments['dims_bidir'] = [dim_dec]
            critic_arguments['reuse_bottom_lookup_table'] = True
            critic_arguments['input_num_chars'] = {'inputs': num_labels}
            if criterion.get('downsize_critic'):
                critic_arguments = _downsize_config(
                    critic_arguments, criterion['downsize_critic'])
            critic = EncoderDecoder(**critic_arguments)
            readout_config['critic'] = critic
        readout_config['merge_names'] = list(readout_config['input_names'])
        readout_config['freeze_actor'] = criterion.get('freeze_actor')
        readout_config['freeze_critic'] = criterion.get('freeze_critic')
        readout_config['critic_uses_actor_states'] = criterion.get('critic_uses_actor_states')
        readout_config['critic_uses_groundtruth'] = criterion.get('critic_uses_groundtruth')
        readout_config['critic_burnin_steps'] = criterion.get('critic_burnin_steps')
        readout_config['critic_loss'] = criterion.get('critic_loss')
        readout_config['discount'] = criterion.get('discount')
        readout_config['entropy_reward_coof'] = criterion.get('entropy_reward_coof')
        readout_config['cross_entropy_reward_coof'] = criterion.get('cross_entropy_reward_coof')
        readout_config['value_penalty'] = criterion.get('value_penalty')
        readout_config['value_penalty_type'] = criterion.get('value_penalty_type')
        readout_config['critic_policy_t'] = criterion.get('critic_policy_t')
        readout_config['bos_token'] = bos_label
        readout_config['accumulate_outputs'] = criterion.get('accumulate_outputs')
        readout_config['use_value_biases'] = criterion.get('use_value_biases')
        readout_config['actor_grad_estimate'] = criterion.get('actor_grad_estimate')
        readout_config['input_names'] += ['attended', 'attended_mask']
        # Note, that settings below are for the "clean" mode.
        # When get_cost_graph() is run with training=True, they
        # are temporarily overriden with the "real" settings from
        # "criterion"
        readout_config['compute_targets'] = True
        readout_config['trpo_coef'] = 0.0
        readout_config['solve_bellman'] = True
    else:
        raise ValueError("Unknown criterion {}".format(criterion['name']))
    readout = readout_class(**readout_config)

    if lm:
        raise ValueError("LM is currently not supported")

    recurrent = AttentionRecurrent(transition, attention)
    if extra_input_dim:
        recurrent = RecurrentWithExtraInput(
            recurrent, "extra_inputs", extra_input_dim,
            name="with_extra_inputs")
    generator = SequenceGenerator(
        recurrent=recurrent, readout=readout, feedback=feedback,
        name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.softmax = Softmax()
    self.children = [encoder, top, bottom, generator, self.softmax]

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask

    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.predicted_labels = tensor.lmatrix('predicted_labels')
    self.predicted_mask = tensor.matrix('predicted_mask')
    self.prefix_labels = tensor.lmatrix('prefix_labels')
    self.prefix_steps = tensor.lscalar('prefix_steps')

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.single_predicted_labels = tensor.lvector('predicted_labels')
    self.n_steps = tensor.lscalar('n_steps')

    # Configure mixed_generate: interleave actor states with critic states
    # (prefixed 'critic_') so both networks are stepped together.
    if criterion['name'] == 'actor_critic':
        critic = self.generator.readout.critic
        self.mixed_generate.sequences = []
        self.mixed_generate.states = (
            ['step'] +
            self.generator.recurrent.apply.states +
            ['critic_' + name
             for name in critic.generator.recurrent.apply.states])
        self.mixed_generate.outputs = (
            ['samples', 'step'] +
            self.generator.recurrent.apply.outputs +
            ['critic_' + name
             for name in critic.generator.recurrent.apply.outputs])
        self.mixed_generate.contexts = (
            self.generator.recurrent.apply.contexts +
            ['critic_' + name
             for name in critic.generator.recurrent.apply.contexts] +
            ['groundtruth', 'groundtruth_mask'])
        self.initial_states.outputs = self.mixed_generate.states

    self.prefix_generate.sequences = []
    self.prefix_generate.states = ['step'] + \
        self.generator.recurrent.apply.states
    self.prefix_generate.outputs = ['samples', 'step'] + \
        self.generator.recurrent.apply.outputs
    self.prefix_generate.contexts = self.generator.recurrent.apply.contexts
def __init__(self, vocab_size, topicWord_size, embedding_dim, state_dim,
             topical_dim, representation_dim,
             match_function='SumMacthFunction',
             use_doubly_stochastic=False, lambda_ds=0.001,
             use_local_attention=False, window_size=10,
             use_step_decay_cost=False,
             use_concentration_cost=False, lambda_ct=10,
             use_stablilizer=False, lambda_st=50,
             theano_seed=None, **kwargs):
    """Build a GRU decoder with two content attentions (over the source
    representation and over a topical representation) and two readouts:
    one over the main vocabulary and one over topic words.

    :param match_function: name of an energy-computer class looked up in
        this module's globals().
    """
    super(Decoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.topicWord_size = topicWord_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = GRU(attended_dim=state_dim,
                          dim=state_dim,
                          activation=Tanh(),
                          name='decoder')

    # Energy computer resolved by name from module globals
    self.energy_computer = globals()[match_function](name='energy_comp')

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim,
        energy_computer=self.energy_computer,
        use_local_attention=use_local_attention,
        window_size=window_size,
        name="attention")

    self.topical_attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=topical_dim,
        match_dim=state_dim,
        energy_computer=self.energy_computer,
        use_local_attention=use_local_attention,
        window_size=window_size,
        name="topical_attention")  # not sure whether the match dim would be correct.

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    readout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # NOTE(review): `state_dim / 2` is integer floor division under
            # Python 2; matches Maxout(num_pieces=2) halving the dimension.
            Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply]),
        merged_dim=state_dim,
        name='readout')

    # calculate the readout of topic word,
    # no specific feedback brick, use the trivial feedback brick
    # no post_merge and merge, use Bias and Linear
    topicWordReadout = Readout(
        source_names=['states', 'feedback',
                      self.attention.take_glimpses.outputs[0]],
        readout_dim=self.topicWord_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        name='twReadout')

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        topicWordReadout=topicWordReadout,
        topic_vector_names=['topicSumVector'],
        transition=self.transition,
        attention=self.attention,
        topical_attention=self.topical_attention,
        q_dim=self.state_dim,
        # q_name='topic_embedding',
        topical_name='topic_embedding',
        content_name='content_embedding',
        use_step_decay_cost=use_step_decay_cost,
        use_doubly_stochastic=use_doubly_stochastic,
        lambda_ds=lambda_ds,
        use_concentration_cost=use_concentration_cost,
        lambda_ct=lambda_ct,
        use_stablilizer=use_stablilizer,
        lambda_st=lambda_st,
        fork=Fork([name
                   for name in self.transition.apply.sequences
                   if name != 'mask'],
                  prototype=Linear()))

    self.children = [self.sequence_generator]
class EUTHM(UTHM):
    '''
    UTH model with extended information: augments the base UTH model with
    user mentions (@a), hashtag words (#h) and sparse words encoded from
    characters.
    '''

    def __init__(self, config, dataset, *args, **kwargs):
        super(EUTHM, self).__init__(config, dataset)

    def _define_inputs(self, *args, **kwargs):
        """Declare the extra symbolic inputs on top of the base model's.

        Each of the three substitution mechanisms (user word, hashtag word,
        sparse word) gets: the token ids, a float mask selecting which
        positions to substitute, and (right, left) index pairs locating the
        positions inside the text matrix.
        """
        super(EUTHM, self)._define_inputs()
        self.user_word = tensor.ivector('user_word')
        self.user_word_sparse_mask = tensor.vector('user_word_sparse_mask',
                                                   dtype=theano.config.floatX)
        self.user_word_left_idx = tensor.ivector('user_word_idx_left_idx')
        self.user_word_right_idx = tensor.ivector('user_word_idx_right_idx')
        self.hashtag_word = tensor.ivector('hashtag_word')
        self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask',
                                                 dtype=theano.config.floatX)
        self.hashtag_word_left_idx = tensor.ivector(
            'hashtag_word_idx_left_idx')
        self.hashtag_word_right_idx = tensor.ivector(
            'hashtag_word_idx_right_idx')
        self.sparse_word = tensor.imatrix('sparse_word')
        self.sparse_word_sparse_mask = tensor.vector(
            'sparse_word_sparse_mask', dtype=theano.config.floatX)
        self.sparse_word_mask = tensor.matrix('sparse_word_mask',
                                              dtype=theano.config.floatX)
        self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx')
        self.sparse_word_right_idx = tensor.ivector(
            'sparse_word_idx_right_idx')

    def _build_bricks(self, *args, **kwargs):
        """Build the extra bricks: user->word and hashtag->word MLPs with
        their scalar biases, the character embedding, and the character-level
        RNN used to encode sparse words.
        """
        # Build lookup tables
        super(EUTHM, self)._build_bricks()
        self.user2word = MLP(
            activations=[Tanh('user2word_tanh')],
            dims=[self.config.user_embed_dim, self.config.word_embed_dim],
            name='user2word_mlp')
        self.user2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.user2word.biases_init = Constant(0)
        self.user2word.initialize()
        self.hashtag2word = MLP(
            activations=[Tanh('hashtag2word_tanh')],
            dims=[
                self.config.user_embed_dim + self.config.word_embed_dim,
                self.config.word_embed_dim
            ],
            name='hashtag2word_mlp')
        self.hashtag2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.hashtag2word.biases_init = Constant(0)
        self.hashtag2word.initialize()
        self.user2word_bias = Bias(dim=1, name='user2word_bias')
        self.user2word_bias.biases_init = Constant(0)
        self.user2word_bias.initialize()
        self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
        self.hashtag2word_bias.biases_init = Constant(0)
        self.hashtag2word_bias.initialize()
        # Build character embedding
        self.char_embed = self._embed(len(self.dataset.char2index),
                                      self.config.char_embed_dim,
                                      name='char_embed')
        # Build sparse word encoder
        self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                              output_dim=self.config.word_embed_dim,
                              name='rnn_in')
        self.rnn_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.char_embed_dim +
                       self.config.word_embed_dim))
        self.rnn_ins.biases_init = Constant(0)
        self.rnn_ins.initialize()
        self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                                   activation=Tanh())
        self.rnn.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.rnn.initialize()

    def _set_OV_value(self, *args, **kwargs):
        '''Train a <unk> representation'''
        # NOTE(review): tensor.set_subtensor returns a NEW symbolic variable;
        # the result is discarded here, so this method is effectively a
        # no-op as written — confirm whether the intent was to zero the
        # <unk> row of char_embed.W (e.g. via a shared-variable set_value).
        tensor.set_subtensor(
            self.char_embed.W[self.dataset.char2index['<unk>']],
            numpy.zeros(self.config.char_embed_dim,
                        dtype=theano.config.floatX))

    def _get_text_vec(self, *args, **kwargs):
        """Return the text embedding matrix with user-word, hashtag-word and
        sparse-word substitutions applied (time-major layout)."""
        # Transpose text to time-major (time, batch)
        self.text = self.text.dimshuffle(1, 0)
        self.text_mask = self.text_mask.dimshuffle(1, 0)
        self.sparse_word = self.sparse_word.dimshuffle(1, 0)
        self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0)
        # Turn word, user and hashtag into vector representation
        text_vec = self.word_embed.apply(self.text)
        # Apply user word, hashtag word and url
        text_vec = self._apply_user_word(text_vec)
        text_vec = self._apply_hashtag_word(text_vec)
        text_vec = self._apply_sparse_word(text_vec)
        return text_vec

    # NOTE(review): the three methods below carry @abstractmethod yet have
    # concrete bodies and are called directly above; the decorator appears
    # spurious (the class is instantiated) — confirm intent.
    @abstractmethod
    def _apply_user_word(self, text_vec, *args, **kwargs):
        '''
        Replace @a with transformed author vector
        :param text_vec: (time, batch, word_embed_dim) text embeddings
        :return: text_vec with masked positions replaced by the transformed
                 user vectors
        '''
        user_word_vec = self.user2word.apply(
            self.user_embed.apply(self.user_word)) + \
            self.user2word_bias.parameters[0][0]
        # Blend original and substituted vectors per-position via the mask
        text_vec = tensor.set_subtensor(
            text_vec[self.user_word_right_idx, self.user_word_left_idx],
            text_vec[self.user_word_right_idx, self.user_word_left_idx] *
            (1 - self.user_word_sparse_mask[:, None]) +
            user_word_vec * self.user_word_sparse_mask[:, None])
        return text_vec

    @abstractmethod
    def _apply_hashtag_word(self, text_vec, *args, **kwargs):
        '''
        Replace #h with transformed hashtag vector
        :param text_vec: (time, batch, word_embed_dim) text embeddings
        :return: text_vec with masked positions replaced by the transformed
                 hashtag vectors
        '''
        hashtag_word_vec = self.hashtag2word.apply(
            self.hashtag_embed.apply(self.hashtag_word)) + \
            self.hashtag2word_bias.parameters[0][0]
        text_vec = tensor.set_subtensor(
            text_vec[self.hashtag_word_right_idx,
                     self.hashtag_word_left_idx],
            text_vec[self.hashtag_word_right_idx,
                     self.hashtag_word_left_idx] *
            (1 - self.hashtag_sparse_mask[:, None]) +
            hashtag_word_vec * self.hashtag_sparse_mask[:, None])
        return text_vec

    @abstractmethod
    def _apply_sparse_word(self, text_vec, *args, **kwargs):
        '''
        Replace sparse word encoding with character embedding. (maybe lstm)
        :param text_vec: (time, batch, word_embed_dim) text embeddings
        :return: text_vec with masked positions replaced by the RNN-encoded
                 character representations
        '''
        sparse_word_vec = self.char_embed.apply(self.sparse_word)
        sparse_word_hiddens = self.rnn.apply(
            inputs=self.rnn_ins.apply(sparse_word_vec),
            mask=self.sparse_word_mask)
        # Last hidden state summarizes the character sequence
        tmp = sparse_word_hiddens[-1]
        text_vec = tensor.set_subtensor(
            text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx],
            text_vec[self.sparse_word_right_idx,
                     self.sparse_word_left_idx] *
            (1 - self.sparse_word_sparse_mask[:, None]) +
            tmp * self.sparse_word_sparse_mask[:, None])
        return text_vec
def main(config): vocab_src, _ = text_to_dict([config['train_src'], config['dev_src'], config['test_src']]) vocab_tgt, cabvo = text_to_dict([config['train_tgt'], config['dev_tgt']]) # Create Theano variables logger.info('Creating theano variables') source_sentence = tensor.lmatrix('source') source_sentence_mask = tensor.matrix('source_mask') target_sentence = tensor.lmatrix('target') target_sentence_mask = tensor.matrix('target_mask') source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0], [1, 4, 8, 4, 8, 4, 8],] source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0], [1, 0, 1, 0, 1, 0, 1],] target_sentence.tag.test_value = [[0,1,1,5], [2,0,1,0],] target_sentence_mask.tag.test_value = [[0,1,1,0], [1,1,1,0],] logger.info('Building RNN encoder-decoder') ### Building Encoder embedder = LookupTable( length=len(vocab_src), dim=config['embed_src'], weights_init=IsotropicGaussian(), biases_init=Constant(0.0), name='embedder') transformer = Linear( config['embed_src'], config['hidden_src']*4, weights_init=IsotropicGaussian(), biases_init=Constant(0.0), name='transformer') lstminit = np.asarray([0.0,]*config['hidden_src']+[0.0,]*config['hidden_src']+[1.0,]*config['hidden_src']+[0.0,]*config['hidden_src']) encoder = Bidirectional( LSTM( dim=config['hidden_src'], weights_init=IsotropicGaussian(0.01), biases_init=Constant(lstminit)), name='encoderBiLSTM' ) encoder.prototype.weights_init = Orthogonal() ### Building Decoder lstminit = np.asarray([0.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']+[1.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']) transition = LSTM2GO( attended_dim=config['hidden_tgt'], dim=config['hidden_tgt'], weights_init=IsotropicGaussian(0.01), biases_init=Constant(lstminit), name='decoderLSTM') attention = SequenceContentAttention( state_names=transition.apply.states, # default activation is Tanh state_dims=[config['hidden_tgt']], attended_dim=config['hidden_src']*2, match_dim=config['hidden_tgt'], name="attention") readout = Readout( 
source_names=['states', 'feedback', attention.take_glimpses.outputs[0]], readout_dim=len(vocab_tgt), emitter = SoftmaxEmitter( name='emitter'), feedback_brick = LookupFeedback( num_outputs=len(vocab_tgt), feedback_dim=config['embed_tgt'], name='feedback'), post_merge=InitializableFeedforwardSequence([ Bias(dim=config['hidden_tgt'], name='softmax_bias').apply, Linear(input_dim=config['hidden_tgt'], output_dim=config['embed_tgt'], use_bias=False, name='softmax0').apply, Linear(input_dim=config['embed_tgt'], name='softmax1').apply]), merged_dim=config['hidden_tgt']) decoder = SequenceGenerator( readout=readout, transition=transition, attention=attention, weights_init=IsotropicGaussian(0.01), biases_init=Constant(0), name="generator", fork=Fork( [name for name in transition.apply.sequences if name != 'mask'], prototype=Linear()), add_contexts=True) decoder.transition.weights_init = Orthogonal() #printchildren(encoder, 1) # Initialize model logger.info('Initializing model') embedder.initialize() transformer.initialize() encoder.initialize() decoder.initialize() # Apply model embedded = embedder.apply(source_sentence) tansformed = transformer.apply(embedded) encoded = encoder.apply(tansformed)[0] generated = decoder.generate( n_steps=2*source_sentence.shape[1], batch_size=source_sentence.shape[0], attended = encoded.dimshuffle(1,0,2), attended_mask=tensor.ones(source_sentence.shape).T ) print 'Generated: ', generated # generator_generate_outputs #samples = generated[1] # For GRU samples = generated[2] # For LSTM samples.name = 'samples' #samples_cost = generated[4] # For GRU samples_cost = generated[5] # For LSTM samples_cost = 'sampling_cost' cost = decoder.cost( mask = target_sentence_mask.T, outputs = target_sentence.T, attended = encoded.dimshuffle(1,0,2), attended_mask = source_sentence_mask.T) cost.name = 'target_cost' cost.tag.aggregation_scheme = TakeLast(cost) model = Model(cost) logger.info('Creating computational graph') cg = ComputationGraph(cost) # apply 
dropout for regularization if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog logger.info('Applying dropout') dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output'] cg = apply_dropout(cg, dropout_inputs, config['dropout']) ######## # Print shapes shapes = [param.get_value().shape for param in cg.parameters] logger.info("Parameter shapes: ") for shape, count in Counter(shapes).most_common(): logger.info(' {:15}: {}'.format(shape, count)) logger.info("Total number of parameters: {}".format(len(shapes))) printchildren(embedder, 1) printchildren(transformer, 1) printchildren(encoder, 1) printchildren(decoder, 1) # Print parameter names # enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters()) # enc_dec_param_dict = merge(Selector(decoder).get_parameters()) # logger.info("Parameter names: ") # for name, value in enc_dec_param_dict.items(): # logger.info(' {:15}: {}'.format(value.get_value().shape, name)) # logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict))) ########## # Training data train_stream = get_train_stream(config, [config['train_src'],], [config['train_tgt'],], vocab_src, vocab_tgt) dev_stream = get_dev_stream( [config['dev_src'],], [config['dev_tgt'],], vocab_src, vocab_tgt) test_stream = get_test_stream([config['test_src'],], vocab_src) # Set extensions logger.info("Initializing extensions") extensions = [ FinishAfter(after_n_batches=config['finish_after']), ProgressBar(), TrainingDataMonitoring([cost], prefix="tra", after_batch=True), DataStreamMonitoring(variables=[cost], data_stream=dev_stream, prefix="dev", after_batch=True), Sampler( model=Model(samples), data_stream=dev_stream, vocab=cabvo, saveto=config['saveto']+'dev', every_n_batches=config['save_freq']), Sampler( model=Model(samples), data_stream=test_stream, vocab=cabvo, saveto=config['saveto']+'test', after_n_batches=1, 
on_resumption=True, before_training=True), Plotter(saveto=config['saveto'], after_batch=True), Printing(after_batch=True), Checkpoint( path=config['saveto'], parameters = cg.parameters, save_main_loop=False, every_n_batches=config['save_freq'])] if BOKEH_AVAILABLE: Plot('Training cost', channels=[['target_cost']], after_batch=True) if config['reload']: extensions.append(Load(path=config['saveto'], load_iteration_state=False, load_log=False)) else: with open(config['saveto']+'.txt', 'w') as f: pass # Set up training algorithm logger.info("Initializing training algorithm") algorithm = GradientDescent(cost=cost, parameters=cg.parameters, step_rule=CompositeRule([StepClipping(config['step_clipping']), eval(config['step_rule'])()]) ) # Initialize main loop logger.info("Initializing main loop") main_loop = MainLoop( model=model, algorithm=algorithm, data_stream=train_stream, extensions=extensions) main_loop.run()
def __init__(self, vocab_size, embedding_dim, state_dim, representation_dim,
             context_dim, target_transition, theano_seed=None,
             loss_function='cross_entropy', **kwargs):
    """Build a decoder whose recurrent transition is conditioned on an
    initial context, choosing the sequence generator by ``loss_function``
    ('cross_entropy' or 'min_risk').

    :param target_transition: transition brick class instantiated with the
        attended/context/state dimensions below.
    :raises ValueError: for an unsupported ``loss_function``.
    """
    super(InitialContextDecoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = target_transition(
        attended_dim=state_dim, context_dim=context_dim, dim=state_dim,
        activation=Tanh(), name='decoder')

    # self.transition = GRUInitialStateWithInitialStateConcatContext(
    #     attended_dim=state_dim, context_dim=context_dim, dim=state_dim,
    #     activation=Tanh(), name='decoder')

    # Initialize the attention mechanism
    self.attention = SequenceContentAttention(
        state_names=self.transition.apply.states,
        attended_dim=representation_dim,
        match_dim=state_dim, name="attention")

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    readout = Readout(
        source_names=[
            'states',
            'feedback',
            # Chris: it's key that we're taking the first output of
            # self.attention.take_glimpses.outputs
            # Chris: the first output is the weighted avgs, the second is
            # the weights in (batch, time)
            self.attention.take_glimpses.outputs[0]
        ],
        readout_dim=self.vocab_size,
        emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
        feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=state_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # NOTE(review): `state_dim / 2` is integer floor division under
            # Python 2, matching Maxout's halved output dimension.
            Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Linear(input_dim=embedding_dim, name='softmax1').apply
        ]),
        merged_dim=state_dim)

    # Build sequence generator accordingly
    if loss_function == 'cross_entropy':
        self.sequence_generator = InitialContextSequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name for name in self.transition.apply.sequences
                if name != 'mask'
            ], prototype=Linear()))
    elif loss_function == 'min_risk':
        self.sequence_generator = MinRiskInitialContextSequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name for name in self.transition.apply.sequences
                if name != 'mask'
            ], prototype=Linear()))
        # the name is important, because it lets us match the brick
        # hierarchy names for the vanilla SequenceGenerator
        # to load pretrained models
        # TODO: quick hack to fix bug
        self.sequence_generator.name = 'initialcontextsequencegenerator'
    else:
        raise ValueError(
            'The decoder does not support the loss function: {}'.format(
                loss_function))

    # TODO: uncomment this!!
    # self.sequence_generator.name = 'sequencegenerator'

    self.children = [self.sequence_generator]
class ETHM(EUTHM):
    '''Model with only textual-hashtag information.

    Variant of EUTHM that drops user/author features and builds the
    prediction from the tweet text (word, hashtag-word and sparse-word
    embeddings) alone.
    '''

    def __init__(self, config, dataset, *args, **kwargs):
        # *args/**kwargs are accepted for signature compatibility but not
        # forwarded; all construction is delegated to the EUTHM base class.
        super(ETHM, self).__init__(config, dataset)

    def _build_model(self, *args, **kwargs):
        # Assemble the full computation graph:
        # inputs -> embeddings -> mLSTM encoder -> cost.
        # Define inputs
        self._define_inputs()
        self._build_bricks()
        # Provided by the base class (not visible in this file); presumably
        # initializes the out-of-vocabulary embedding values — TODO confirm.
        self._set_OV_value()
        # Transpose text so the time axis comes first: (batch, time) ->
        # (time, batch), the layout recurrent bricks expect.
        self.text = self.text.dimshuffle(1, 0)
        self.text_mask = self.text_mask.dimshuffle(1, 0)
        self.sparse_word = self.sparse_word.dimshuffle(1, 0)
        self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0)
        # Turn word, and hashtag into vector representation
        text_vec = self.word_embed.apply(self.text)
        # Apply word and hashtag word and url
        text_vec = self._apply_hashtag_word(text_vec)
        text_vec = self._apply_sparse_word(text_vec)
        # Encode text; mask is cast to the configured float dtype so the
        # recurrent step can multiply by it.
        mlstm_hidden, mlstm_cell = self.mlstm.apply(
            inputs=self.mlstm_ins.apply(text_vec),
            mask=self.text_mask.astype(theano.config.floatX))
        # Use the hidden state at the last time step as the text encoding.
        text_encodes = mlstm_hidden[-1]
        input_vec = text_encodes
        self._get_cost(input_vec, None, None)

    def _define_inputs(self, *args, **kwargs):
        # Declare the symbolic Theano input variables of the model.
        self.hashtag = tensor.ivector('hashtag')
        self.text = tensor.imatrix('text')
        self.text_mask = tensor.matrix('text_mask', dtype=theano.config.floatX)
        # Indices and masks locating hashtag words within the text.
        self.hashtag_word = tensor.ivector('hashtag_word')
        self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask',
                                                 dtype=theano.config.floatX)
        self.hashtag_word_left_idx = tensor.ivector(
            'hashtag_word_idx_left_idx')
        self.hashtag_word_right_idx = tensor.ivector(
            'hashtag_word_idx_right_idx')
        # Character-level inputs for sparse (rare / OOV) words.
        self.sparse_word = tensor.imatrix('sparse_word')
        self.sparse_word_sparse_mask = tensor.vector(
            'sparse_word_sparse_mask', dtype=theano.config.floatX)
        self.sparse_word_mask = tensor.matrix('sparse_word_mask',
                                              dtype=theano.config.floatX)
        self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx')
        self.sparse_word_right_idx = tensor.ivector(
            'sparse_word_idx_right_idx')

    def _build_bricks(self, *args, **kwargs):
        # Construct and initialize every brick (layer) of the model.
        # Build lookup tables
        self.word_embed = self._embed(len(self.dataset.word2index),
                                      self.config.word_embed_dim,
                                      name='word_embed')
        self.hashtag_embed = self._embed(len(self.dataset.hashtag2index),
                                         self.config.lstm_dim,
                                         name='hashtag_embed')
        # Build text encoder: a linear projection into the 4 gates of the
        # (multi-layer) LSTM, followed by the MLSTM itself.
        self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim,
                                output_dim=4 * self.config.lstm_dim,
                                name='mlstm_in')
        self.mlstm_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
        self.mlstm_ins.biases_init = Constant(0)
        self.mlstm_ins.initialize()
        self.mlstm = MLSTM(self.config.lstm_time,
                           self.config.lstm_dim,
                           shared=False)
        self.mlstm.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
        self.mlstm.biases_init = Constant(0)
        self.mlstm.initialize()
        # MLP mapping a hashtag embedding into word-embedding space.
        self.hashtag2word = MLP(
            activations=[Tanh('hashtag2word_tanh')],
            dims=[self.config.lstm_dim, self.config.word_embed_dim],
            name='hashtag2word_mlp')
        self.hashtag2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.hashtag2word.biases_init = Constant(0)
        self.hashtag2word.initialize()
        self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
        self.hashtag2word_bias.biases_init = Constant(0)
        self.hashtag2word_bias.initialize()
        # Build character embedding
        self.char_embed = self._embed(len(self.dataset.char2index),
                                      self.config.char_embed_dim,
                                      name='char_embed')
        # Build sparse word encoder: char embeddings -> linear -> simple RNN
        # whose final state stands in for the word embedding.
        self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                              output_dim=self.config.word_embed_dim,
                              name='rnn_in')
        self.rnn_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.char_embed_dim +
                       self.config.word_embed_dim))
        self.rnn_ins.biases_init = Constant(0)
        self.rnn_ins.initialize()
        self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                                   activation=Tanh())
        self.rnn.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.rnn.initialize()

    def _apply_dropout(self, outputs, *args, **kwargs):
        # Apply dropout to the two embedding weight matrices and return the
        # transformed graph outputs. Fixed seed keeps the graph
        # deterministic across rebuilds.
        variables = [self.word_embed.W, self.hashtag_embed.W]
        cgs = ComputationGraph(outputs)
        cg_dropouts = apply_dropout(cgs,
                                    variables,
                                    drop_prob=self.config.dropout_prob,
                                    seed=123).outputs
        return cg_dropouts

    def _apply_reg(self, cost, params=None, *args, **kwargs):
        # Optionally add an L2 penalty on the embedding matrices.
        # NOTE(review): the broad try/except silently skips regularization
        # when self.config has no l2_norm attribute (best-effort by design),
        # but it would also hide genuine errors in the penalty expression.
        try:
            if self.config.l2_norm > 0:
                cost = cost + self.config.l2_norm * \
                    theano_expressions.l2_norm(
                        tensors=[self.hashtag_embed.W, self.word_embed.W])**2
            else:
                pass
        except Exception:
            pass
        return cost
def __init__(self, ref_data, output_dim):
    """Build the classification graph with optional PCA, dropout and noise.

    Args:
        ref_data (ndarray): Reference data matrix (rows are examples); an
            optional PCA projection reduces it to ``pca_dims`` components.
        output_dim (int): Number of output classes.

    Hyperparameters (``pca_dims``, ``hidden_dims``, dropout/noise rates,
    L1 penalties, ...) are read from module-level globals defined elsewhere
    in this file.
    """
    if pca_dims is not None:
        # Project ref_data onto its top-``pca_dims`` principal components.
        covmat = numpy.dot(ref_data.T, ref_data)
        ev, evec = numpy.linalg.eig(covmat)
        best_i = ev.argsort()[-pca_dims:]
        best_evecs = evec[:, best_i]
        # normalize each eigenvector to unit length
        best_evecs = best_evecs / numpy.sqrt((best_evecs**2).sum(axis=0))
        ref_data = numpy.dot(ref_data, best_evecs)

    input_dim = ref_data.shape[1]

    ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                name='ref_data')

    # Construct the model
    j = tensor.lvector('j')
    r = ref_data_sh[j, :]
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    # input_dim must be nr
    mlp = MLP(activations=activation_functions,
              dims=[input_dim] + hidden_dims + [n_inter],
              name='inter_gen')
    mlp2 = MLP(activations=activation_functions_2 + [None],
               dims=[n_inter] + hidden_dims_2 + [output_dim],
               name='end_mlp')

    inter_weights = mlp.apply(r)

    # FIX: identity comparison with None (was ``inter_bias == None``).
    if inter_bias is None:
        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = ibias.apply(tensor.dot(x, inter_weights))
    else:
        inter = tensor.dot(x, inter_weights) - inter_bias
    inter = inter_act_fun.apply(inter)

    final = mlp2.apply(inter)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)

    pred = final.argmax(axis=1)
    # error_rate = tensor.neq(y, pred).mean()
    ber = balanced_error_rate.ber(y, pred)

    # Initialize parameters
    for brick in [mlp, mlp2]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply regularization
    cg = ComputationGraph([cost, ber])

    # FIX: s_dropout_vars is referenced in the a_dropout branch below;
    # previously it was only bound when s_dropout != 0, so enabling
    # a_dropout without s_dropout raised a NameError.
    s_dropout_vars = []

    if r_dropout != 0:
        # - dropout on input vector r : r_dropout
        cg = apply_dropout(cg, [r], r_dropout)
    if x_dropout != 0:
        cg = apply_dropout(cg, [x], x_dropout)
    if s_dropout != 0:
        # - dropout on intermediate layers of first mlp : s_dropout
        s_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([inter_weights]))) - set([inter_weights]))
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)
    if i_dropout != 0:
        # - dropout on input to second mlp : i_dropout
        cg = apply_dropout(cg, [inter], i_dropout)
    if a_dropout != 0:
        # - dropout on hidden layers of second mlp : a_dropout
        a_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name='output')
                (ComputationGraph([final]))) - set([inter_weights]) -
            set(s_dropout_vars))
        cg = apply_dropout(cg, a_dropout_vars, a_dropout)
    if r_noise_std != 0:
        cg = apply_noise(cg, [r], r_noise_std)
    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)
    [cost_reg, ber_reg] = cg.outputs

    # Optional L1 penalties on the generator weights, the intermediate
    # activations and the end-MLP weights.
    if s_l1pen != 0:
        s_weights = VariableFilter(bricks=mlp.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + s_l1pen * sum(abs(w).sum() for w in s_weights)
    if i_l1pen != 0:
        cost_reg = cost_reg + i_l1pen * abs(inter).sum()
    if a_l1pen != 0:
        a_weights = VariableFilter(bricks=mlp2.linear_transformations,
                                   roles=[WEIGHT])(cg)
        cost_reg = cost_reg + a_l1pen * sum(abs(w).sum() for w in a_weights)

    self.cost = cost
    self.cost_reg = cost_reg
    self.ber = ber
    self.ber_reg = ber_reg
    self.pred = pred
    self.confidence = confidence
def __init__(self, ref_data, output_dim):
    """Build the autoencoder-regularized classification graph.

    Args:
        ref_data (ndarray): Reference data matrix; encoded by ``mlp0`` and
            reconstructed by ``mlp0vs`` (reconstruction error is added to
            the regularized cost).
        output_dim (int): Number of output classes.

    Hyperparameters (hidden dims, activations, dropout/noise rates,
    ``reconstruction_penalty``, ...) come from module-level globals.
    """
    input_dim = ref_data.shape[1]

    ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                name="ref_data")

    # Construct the model
    j = tensor.lvector("j")
    r = ref_data_sh[j, :]
    x = tensor.fmatrix("x")
    y = tensor.ivector("y")

    # input_dim must be nr
    mlp0 = MLP(activations=activation_functions_0,
               dims=[input_dim] + hidden_dims_0,
               name="e0")
    mlp0vs = MLP(activations=[None],
                 dims=[hidden_dims_0[-1], input_dim],
                 name="de0")
    mlp1 = MLP(activations=activation_functions_1,
               dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter],
               name="inter_gen")
    mlp2 = MLP(activations=activation_functions_2 + [None],
               dims=[n_inter] + hidden_dims_2 + [output_dim],
               name="end_mlp")

    # Autoencoder path: encode r, then reconstruct it for the penalty.
    encod = mlp0.apply(r)
    rprime = mlp0vs.apply(encod)
    inter_weights = mlp1.apply(encod)

    ibias = Bias(n_inter)
    ibias.biases_init = Constant(0)
    ibias.initialize()
    inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights)))

    final = mlp2.apply(inter)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)

    pred = final.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # Initialize parameters
    for brick in [mlp0, mlp0vs, mlp1, mlp2]:
        brick.weights_init = IsotropicGaussian(0.01)
        brick.biases_init = Constant(0.001)
        brick.initialize()

    # apply regularization
    cg = ComputationGraph([cost, error_rate])

    # FIX: s_dropout_vars is referenced in the a_dropout branch below;
    # previously it was only bound when s_dropout != 0, so enabling
    # a_dropout without s_dropout raised a NameError.
    s_dropout_vars = []

    if r_dropout != 0:
        # - dropout on input vector r : r_dropout
        cg = apply_dropout(cg, [r], r_dropout)
    if s_dropout != 0:
        # - dropout on intermediate layers of first mlp : s_dropout
        s_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name="output")(
                ComputationGraph([inter_weights]))) - set([inter_weights]))
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)
    if i_dropout != 0:
        # - dropout on input to second mlp : i_dropout
        cg = apply_dropout(cg, [inter], i_dropout)
    if a_dropout != 0:
        # - dropout on hidden layers of second mlp : a_dropout
        a_dropout_vars = list(
            set(VariableFilter(bricks=[Tanh], name="output")(
                ComputationGraph([final]))) - set([inter_weights]) -
            set(s_dropout_vars))
        cg = apply_dropout(cg, a_dropout_vars, a_dropout)
    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)
    [cost_reg, error_rate_reg] = cg.outputs

    # add reconstruction penalty for AE part
    penalty_val = tensor.sqrt(((r - rprime) ** 2).sum(axis=1)).mean()
    cost_reg = cost_reg + reconstruction_penalty * penalty_val

    self.cost = cost
    self.cost_reg = cost_reg
    self.error_rate = error_rate
    self.error_rate_reg = error_rate_reg
    self.pred = pred
    self.confidence = confidence
def __init__(self,
             input_dims,
             input_num_chars,
             eos_label,
             num_phonemes,
             dim_dec,
             dims_bidir,
             enc_transition,
             dec_transition,
             use_states_for_readout,
             attention_type,
             criterion,
             bottom,
             lm=None,
             character_map=None,
             bidir=True,
             subsample=None,
             dims_top=None,
             prior=None,
             conv_n=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dim_output_embedding=None,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             # for speech this is the approximate phoneme duration in frames
             max_decoded_length_scale=1,
             **kwargs):
    """Build a recognizer with two sequence generators sharing one encoder.

    The loop below constructs two parallel (top, transition, attention,
    readout, generator) stacks, indexed 0 and 1; ``self.generator`` is the
    first one and ``self.forward_to_backward`` links their state spaces.

    FIX: the content-attention branch used ``name="cont_att" + i`` which is
    a TypeError (str + int); it now uses ``.format(i)`` like every sibling
    branch.
    """
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion

    self.max_decoded_length_scale = max_decoded_length_scale

    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(input_dims=input_dims,
                          input_num_chars=input_num_chars,
                          name='bottom',
                          **bottom)

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition, dims_bidir,
                      bottom.get_dim(bottom.apply.outputs[0]),
                      subsample, bidir=bidir)
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    generators = [None, None]
    for i in range(2):
        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded],
                      name="top{}".format(i))
        else:
            top = Identity(name='top{}'.format(i))

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition{}".format(i))
        else:
            transitions = [self.dec_transition(
                dim=dim_dec,
                activation=Tanh(),
                name="transition_{}_{}".format(i, trans_level))
                for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att{}".format(i))  # was "cont_att" + i: TypeError
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att{}".format(i))
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(
                num_phonemes + 1,
                dim_dec if dim_output_embedding is None
                else dim_output_embedding)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter{}".format(i))
            if lm:
                # In case we use LM it is Readout that is responsible
                # for normalization.
                emitter = LMEmitter()
        elif criterion['name'].startswith('mse'):
            emitter = RewardRegressionEmitter(criterion['name'], eos_label,
                                              num_phonemes,
                                              criterion.get('min_reward',
                                                            -1.0),
                                              name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(
                criterion['name']))
        readout_config = dict(
            readout_dim=num_phonemes,
            source_names=(transition.apply.states
                          if use_states_for_readout else []) +
            [attention.take_glimpses.outputs[0]],
            emitter=emitter,
            feedback_brick=feedback,
            name="readout{}".format(i))
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was designed to support Maxout is activation
                        # (because Maxout in a way is not one). However
                        # a single layer Maxout network works with the
                        # trick below. For deeper Maxout network one has
                        # to use the Sequence brick.
                        [d // getattr(post_merge_activation, 'num_pieces', 1)
                         for d in post_merge_dims] + [num_phonemes]).apply,
                ],
                name='post_merge{}'.format(i))
        readout = Readout(**readout_config)

        language_model = None
        if lm and lm.get('path'):
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + \
                    normalize_tot_weights < 1:
                logger.warn(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generators[i] = SequenceGenerator(readout=readout,
                                          transition=transition,
                                          attention=attention,
                                          language_model=language_model,
                                          name="generator{}".format(i))

    self.generator = generators[0]

    self.forward_to_backward = Linear(dim_dec, dim_dec)

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generators = generators
    self.children = [self.forward_to_backward, encoder, top, bottom
                     ] + generators

    # Create input variables
    self.inputs = self.bottom.batch_inputs
    self.inputs_mask = self.bottom.mask

    self.labels = tensor.lmatrix('labels')
    self.labels_mask = tensor.matrix("labels_mask")

    self.single_inputs = self.bottom.single_inputs
    self.single_labels = tensor.lvector('labels')
    self.n_steps = tensor.lscalar('n_steps')
def __init__(self, vocab_size, embedding_dim, state_dim, att_dim,
             maxout_dim, representation_dim, attention_strategy='content',
             attention_sources='s', readout_sources='sfa', memory='none',
             memory_size=500, seq_len=50, init_strategy='last',
             theano_seed=None, **kwargs):
    """Creates a new decoder brick without embedding.

    Args:
        vocab_size (int): Target language vocabulary size
        embedding_dim (int): Size of feedback embedding layer
        state_dim (int): Number of hidden units
        att_dim (int): Size of attention match vector
        maxout_dim (int): Size of maxout layer
        representation_dim (int): Dimension of source annotations
        attention_strategy (string): Which attention should be used
                                     cf. ``_initialize_attention``
        attention_sources (string): Defines the sources used by the
                                    attention model 's' for decoder states,
                                    'f' for feedback
        readout_sources (string): Defines the sources used in the readout
                                  network. 's' for decoder states, 'f' for
                                  feedback, 'a' for attention (context
                                  vector)
        memory (string): Which external memory should be used
                         (cf. ``_initialize_attention``)
        memory_size (int): Size of the external memory structure
        seq_len (int): Maximum sentence length
        init_strategy (string): How to initialize the RNN state
                                (cf. ``GRUInitialState``)
        theano_seed: Random seed
    """
    super(NoLookupDecoder, self).__init__(**kwargs)
    self.vocab_size = vocab_size
    self.embedding_dim = embedding_dim
    self.state_dim = state_dim
    self.representation_dim = representation_dim
    self.theano_seed = theano_seed

    # Initialize gru with special initial state
    self.transition = GRUInitialState(attended_dim=state_dim,
                                      init_strategy=init_strategy,
                                      dim=state_dim,
                                      activation=Tanh(),
                                      name='decoder')

    # Initialize the attention mechanism; fall back to state_dim when no
    # explicit attention dimension is given.
    att_dim = att_dim if att_dim > 0 else state_dim
    self.attention, src_names = _initialize_attention(
        attention_strategy, seq_len, self.transition, representation_dim,
        att_dim, attention_sources, readout_sources, memory, memory_size)

    # Initialize the readout, note that SoftmaxEmitter emits -1 for
    # initial outputs which is used by LookupFeedBackWMT15
    maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
    readout = Readout(
        source_names=src_names,
        readout_dim=embedding_dim,
        emitter=NoLookupEmitter(initial_output=-1,
                                readout_dim=embedding_dim,
                                cost_brick=SquaredError()),
        #                        cost_brick=CategoricalCrossEntropy()),
        feedback_brick=TrivialFeedback(output_dim=embedding_dim),
        post_merge=InitializableFeedforwardSequence([
            Bias(dim=maxout_dim, name='maxout_bias').apply,
            Maxout(num_pieces=2, name='maxout').apply,
            # Maxout with two pieces halves the dimension; ``//`` keeps the
            # dim an int under Python 3 (``/`` would produce a float and
            # break Linear's allocation). Identical on Python 2 ints.
            Linear(input_dim=maxout_dim // 2, output_dim=embedding_dim,
                   use_bias=False, name='softmax0').apply,
            Logistic(name='softmax1').apply
        ]),
        merged_dim=maxout_dim)

    # Build sequence generator accordingly
    self.sequence_generator = SequenceGenerator(
        readout=readout,
        transition=self.transition,
        attention=self.attention,
        fork=Fork([
            name for name in self.transition.apply.sequences
            if name != 'mask'
        ], prototype=Linear()))

    self.children = [self.sequence_generator]
def __init__(self,
             recordings_source,
             labels_source,
             eos_label,
             num_features,
             num_phonemes,
             dim_dec,
             dims_bidir,
             dims_bottom,
             enc_transition,
             dec_transition,
             use_states_for_readout,
             attention_type,
             lm=None,
             character_map=None,
             subsample=None,
             dims_top=None,
             prior=None,
             conv_n=None,
             bottom_activation=None,
             post_merge_activation=None,
             post_merge_dims=None,
             dim_matcher=None,
             embed_outputs=True,
             dec_stack=1,
             conv_num_filters=1,
             data_prepend_eos=True,
             # softmax is the default set in SequenceContentAndConvAttention
             energy_normalizer=None,
             **kwargs):
    """Build the speech recognizer: bottom MLP, BiRNN encoder, attention
    decoder, and the symbolic input variables.

    FIX: ``self.batch_inputs`` previously contained
    ``self.recordings_source`` (a plain string) where the mask variable
    ``self.recordings_mask`` was clearly intended — every other element is
    a Theano variable.
    """
    if bottom_activation is None:
        bottom_activation = Tanh()
    if post_merge_activation is None:
        post_merge_activation = Tanh()
    super(SpeechRecognizer, self).__init__(**kwargs)
    self.recordings_source = recordings_source
    self.labels_source = labels_source
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    bottom_activation = bottom_activation
    post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN
    if dims_bottom:
        bottom = MLP([bottom_activation] * len(dims_bottom),
                     [num_features] + dims_bottom,
                     name="bottom")
    else:
        bottom = Identity(name='bottom')

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(
        self.enc_transition, dims_bidir,
        dims_bottom[-1] if len(dims_bottom) else num_features,
        subsample)

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()],
                  [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                  name="top")
    else:
        top = Identity(name='top')

    if dec_stack == 1:
        transition = self.dec_transition(dim=dim_dec,
                                         activation=Tanh(),
                                         name="transition")
    else:
        transitions = [self.dec_transition(
            dim=dim_dec,
            activation=Tanh(),
            name="transition_{}".format(trans_level))
            for trans_level in xrange(dec_stack)]
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)
    # Choose attention mechanism according to the configuration
    if attention_type == "content":
        attention = SequenceContentAttention(
            state_names=transition.apply.states,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            name="cont_att")
    elif attention_type == "content_and_conv":
        attention = SequenceContentAndConvAttention(
            state_names=transition.apply.states,
            conv_n=conv_n,
            conv_num_filters=conv_num_filters,
            attended_dim=2 * dims_bidir[-1], match_dim=dim_matcher,
            prior=prior,
            energy_normalizer=energy_normalizer,
            name="conv_att")
    else:
        raise ValueError(
            "Unknown attention type {}".format(attention_type))
    if embed_outputs:
        feedback = LookupFeedback(num_phonemes + 1, dim_dec)
    else:
        feedback = OneOfNFeedback(num_phonemes + 1)
    if lm:
        # In case we use LM it is Readout that is responsible
        # for normalization.
        emitter = LMEmitter()
    else:
        emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                 name="emitter")
    readout_config = dict(
        readout_dim=num_phonemes,
        source_names=(transition.apply.states
                      if use_states_for_readout else []) +
        [attention.take_glimpses.outputs[0]],
        emitter=emitter,
        feedback_brick=feedback,
        name="readout")
    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence(
            [
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP(
                    [post_merge_activation] *
                    (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was designed to support Maxout is activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick
                    # below. For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d // getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_phonemes]).apply,
            ],
            name='post_merge')
    readout = Readout(**readout_config)

    language_model = None
    if lm:
        lm_weight = lm.pop('weight', 0.0)
        normalize_am_weights = lm.pop('normalize_am_weights', True)
        normalize_lm_weights = lm.pop('normalize_lm_weights', False)
        normalize_tot_weights = lm.pop('normalize_tot_weights', False)
        am_beta = lm.pop('am_beta', 1.0)
        if normalize_am_weights + normalize_lm_weights + \
                normalize_tot_weights < 1:
            logger.warn(
                "Beam search is prone to fail with no log-prob normalization"
            )
        language_model = LanguageModel(nn_char_map=character_map, **lm)
        readout = ShallowFusionReadout(
            lm_costs_name='lm_add',
            lm_weight=lm_weight,
            normalize_am_weights=normalize_am_weights,
            normalize_lm_weights=normalize_lm_weights,
            normalize_tot_weights=normalize_tot_weights,
            am_beta=am_beta,
            **readout_config)

    generator = SequenceGenerator(readout=readout,
                                  transition=transition,
                                  attention=attention,
                                  language_model=language_model,
                                  name="generator")

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]

    # Create input variables
    self.recordings = tensor.tensor3(self.recordings_source)
    self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
    self.labels = tensor.lmatrix(self.labels_source)
    self.labels_mask = tensor.matrix(self.labels_source + "_mask")
    # Was: [..., self.recordings_source, ...] — a string, not a variable.
    self.batch_inputs = [
        self.recordings, self.recordings_mask, self.labels, self.labels_mask
    ]
    self.single_recording = tensor.matrix(self.recordings_source)
    self.single_transcription = tensor.lvector(self.labels_source)
def __init__(self, ref_data, output_dim):
    """Build an ensemble of ``nparts`` autoencoder-regularized sub-models,
    each trained on a random feature subset of ``ref_data``, and average
    their outputs.

    Args:
        ref_data (ndarray): Reference data matrix.
        output_dim (int): Number of output classes.

    Hyperparameters (``nparts``, ``part_r_proba``, hidden dims, dropout and
    noise rates, ``reconstruction_penalty``, ...) come from module-level
    globals defined elsewhere in this file.
    """
    ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                name='ref_data')

    # Construct the model
    j = tensor.lvector('j')
    x = tensor.fmatrix('x')
    y = tensor.ivector('y')

    last_outputs = []
    s_dropout_vars = []
    r_dropout_vars = []
    i_dropout_vars = []
    penalties = []
    for i in range(nparts):
        # Random binary feature mask for this part (Bernoulli per column).
        fs = numpy.random.binomial(1, part_r_proba,
                                   size=(ref_data.shape[1], ))
        input_dim = int(fs.sum())
        fs_sh = theano.shared(fs)

        # Select the masked feature columns of the reference rows.
        r = ref_data_sh[j, :][:, fs_sh.nonzero()[0]]

        mlp0 = MLP(activations=activation_functions_0,
                   dims=[input_dim] + hidden_dims_0,
                   name='enc%d' % i)
        mlp0r = MLP(activations=[None],
                    dims=[hidden_dims_0[-1], input_dim],
                    name='dec%d' % i)
        mlp1 = MLP(activations=activation_functions_1,
                   dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter],
                   name='inter_gen_%d' % i)
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp_%d' % i)

        encod = mlp0.apply(r)
        rprime = mlp0r.apply(encod)
        inter_weights = mlp1.apply(encod)

        ibias = Bias(n_inter, name='inter_bias_%d' % i)
        inter = ibias.apply(tensor.dot(x, inter_weights))
        inter = inter_act_fun.apply(inter)

        out = mlp2.apply(inter)

        # Per-part autoencoder reconstruction penalty (kept 1-d via [None]
        # so the penalties can be concatenated below).
        penalties.append(
            tensor.sqrt(((rprime - r)**2).sum(axis=1)).mean()[None])

        last_outputs.append(out)
        r_dropout_vars.append(r)
        s_dropout_vars = s_dropout_vars + (VariableFilter(
            bricks=[Tanh], name='output')(ComputationGraph([inter_weights
                                                            ])))
        i_dropout_vars.append(inter)

        # Initialize parameters
        for brick in [mlp0, mlp0r, mlp1, mlp2, ibias]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

    # FIX: the comprehension variable was named ``x``, shadowing the input
    # matrix ``x`` (and, under Python 2 comprehension-scope leakage,
    # rebinding it after this statement).
    final = tensor.concatenate([out_i[:, :, None] for out_i in last_outputs],
                               axis=2).mean(axis=2)

    cost = Softmax().categorical_cross_entropy(y, final)
    confidence = Softmax().apply(final)
    pred = final.argmax(axis=1)
    error_rate = tensor.neq(y, pred).mean()

    # apply regularization
    cg = ComputationGraph([cost, error_rate])

    if w_noise_std != 0:
        # - apply noise on weight variables
        weight_vars = VariableFilter(roles=[WEIGHT])(cg)
        cg = apply_noise(cg, weight_vars, w_noise_std)
    if s_dropout != 0:
        cg = apply_dropout(cg, s_dropout_vars, s_dropout)
    if r_dropout != 0:
        cg = apply_dropout(cg, r_dropout_vars, r_dropout)
    if i_dropout != 0:
        cg = apply_dropout(cg, i_dropout_vars, i_dropout)
    [cost_reg, error_rate_reg] = cg.outputs

    cost_reg = cost_reg + reconstruction_penalty * tensor.concatenate(
        penalties, axis=0).sum()

    self.cost = cost
    self.cost_reg = cost_reg
    self.error_rate = error_rate
    self.error_rate_reg = error_rate_reg
    self.pred = pred
    self.confidence = confidence
def __init__(
        self,
        input_sources,
        input_sources_dims,
        eos_label,
        num_phonemes,
        dim_dec,
        dims_bidir,
        enc_transition,
        dec_transition,
        use_states_for_readout,
        attention_type,
        criterion,
        bottom,
        # NOTE(review): mutable default arguments ({} below) are shared
        # across calls; safe only if never mutated — confirm.
        enc_transition_params={},
        dec_transition_params={},
        names_postfix='',
        lm=None,
        character_map=None,
        bidir=True,
        bidir_aggregation='concat',
        subsample=None,
        dims_top=None,
        prior=None,
        conv_n=None,
        post_merge_activation=None,
        post_merge_dims=None,
        dim_matcher=None,
        embed_outputs=True,
        dim_output_embedding=None,
        dec_stack=1,
        conv_num_filters=1,
        data_prepend_eos=False,
        # softmax is the default set in SequenceContentAndConvAttention
        energy_normalizer=None,
        # for speech this is the approximate phoneme duration in frames
        max_decoded_length_scale=3,
        use_dependent_words_for_labels=False,
        use_dependent_words_for_attention=False,
        reproduce_rec_weight_init_bug=True,
        pointers_weight=0.5,
        tags_weight=1.0,
        tag_layer=-1,  # -1 is last, 0 is after first bidir layer
        dependency_type='recurrent_soft',
        **kwargs):
    """Assemble the full recognizer: bottom -> BiRNN encoder -> top ->
    attention-based generator, plus per-source tagging MLPs.

    Consumes from ``kwargs`` (via ``pop``, before calling ``super``):
    ``additional_sources``, ``additional_sources_dims``,
    ``pointers_soften``; the remaining kwargs go to the parent
    constructor.

    Raises ``ValueError`` for an unknown ``dependency_type``,
    ``attention_type``, or ``criterion['name']``.

    NOTE(review): several parameters (``lm``, ``character_map``,
    ``prior``, ``conv_n``, ``embed_outputs``, ``dim_output_embedding``,
    ``conv_num_filters``, ``energy_normalizer``, ``tag_layer``) are
    accepted but not used in this constructor body — presumably consumed
    elsewhere or kept for config compatibility; verify.
    """
    if post_merge_activation is None:
        post_merge_activation = Tanh()

    self.regularization_bricks = []
    possible_regularization_bricks = []

    self.names_postfix = names_postfix
    self.mask_dict = {}
    self.pointers_name = 'pointers' + names_postfix
    # These kwargs are consumed here so they never reach super().__init__.
    self.additional_sources = kwargs.pop('additional_sources')
    self.additional_sources_dims = kwargs.pop('additional_sources_dims')
    self.pointer_weight = pointers_weight
    self.soft_pointer_val = kwargs.pop('pointers_soften', 0.0)
    self.soft_pointer = self.soft_pointer_val > 0.0
    self.tags_weight = tags_weight
    self.tag_layer = tag_layer
    # Tag training is disabled when the weight is negative or there is at
    # most one additional source (nothing to tag).
    self.train_tags = True
    if self.tags_weight < 0 or len(self.additional_sources) <= 1:
        self.train_tags = False

    self.dependency_type = dependency_type

    super(DependencyRecognizer, self).__init__(**kwargs)

    self.reproduce_rec_weight_init_bug = reproduce_rec_weight_init_bug
    self.eos_label = eos_label
    self.data_prepend_eos = data_prepend_eos

    self.rec_weights_init = None
    self.initial_states_init = None

    self.enc_transition = enc_transition
    self.dec_transition = dec_transition
    self.dec_stack = dec_stack

    self.criterion = criterion

    self.max_decoded_length_scale = max_decoded_length_scale

    self.post_merge_activation = post_merge_activation

    if dim_matcher is None:
        dim_matcher = dim_dec

    # The bottom part, before BiRNN.
    # `bottom` arrives as a config dict; the class is popped out and the
    # rest becomes constructor kwargs.
    bottom_class = bottom.pop('bottom_class')
    bottom = bottom_class(input_sources=input_sources,
                          input_sources_dims=input_sources_dims,
                          name='bottom',
                          pointers_soften=self.soft_pointer,
                          additional_sources=self.additional_sources,
                          **bottom)

    # BiRNN
    if not subsample:
        subsample = [1] * len(dims_bidir)
    encoder = Encoder(self.enc_transition,
                      dims_bidir,
                      bottom.output_dim,
                      subsample,
                      bidir=bidir,
                      bidir_aggregation=bidir_aggregation,
                      enc_transition_params=enc_transition_params)
    possible_regularization_bricks += encoder.enc_transitions
    dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

    # The top part, on top of BiRNN but before the attention
    if dims_top:
        top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                  name="top")
    else:
        top = Identity(name='top')

    # One softmax MLP per additional (non-pointer) source, predicting the
    # source's labels from the encoded representation.
    self.additional_sources_mlp = {}
    ndim_softmax = NDimensionalSoftmax()
    # presumably needed so the softmax treats the extra time dimension
    # correctly — confirm against NDimensionalSoftmax usage.
    ndim_softmax._extra_ndim = 1
    for source in self.additional_sources:
        if source != self.pointers_name:
            # Strip the postfix so brick names stay stable across
            # postfixed configurations.
            if len(self.names_postfix) > 0:
                source_glob_name = source[:-len(self.names_postfix)]
            else:
                source_glob_name = source
            self.additional_sources_mlp[source] = \
                MLP([ndim_softmax],
                    [dim_encoded, self.additional_sources_dims[source]],
                    name='additional_' + source_glob_name)

    # Decoder transition: single cell, or a stack with skip connections.
    if dec_stack == 1:
        transition = self.dec_transition(dim=dim_dec,
                                         activation=Tanh(),
                                         name="transition",
                                         **dec_transition_params)
        possible_regularization_bricks += [transition]
    else:
        transitions = [
            self.dec_transition(dim=dim_dec,
                                activation=Tanh(),
                                name="transition_{}".format(trans_level),
                                **dec_transition_params)
            for trans_level in xrange(dec_stack)
        ]
        possible_regularization_bricks += transitions
        transition = RecurrentStack(transitions=transitions,
                                    skip_connections=True)

    # Choose attention mechanism according to the configuration
    attention_class = ParsingAttention
    attention_kwargs = {}
    transition_with_att_class = ParsingAttentionRecurrent

    if self.dependency_type == "recurrent_soft":
        attention_kwargs['use_pointers'] = None
    elif self.dependency_type == "recurrent_hard":
        attention_kwargs['use_pointers'] = 'hard'
    elif self.dependency_type == "recurrent_semihard":
        attention_kwargs['use_pointers'] = 'semihard'
    else:
        raise ValueError("Unknown dependency type {}".format(
            self.dependency_type))

    if attention_type == "content":
        pass
    elif attention_type == "content_hard":
        attention_kwargs['hard_attention'] = True
    else:
        raise ValueError(
            "Unknown attention type {}".format(attention_type))

    if use_dependent_words_for_attention:
        attention_kwargs['use_word_annotations'] = True
        # NOTE(review): 'annontation' is misspelled but must match the
        # key expected by the attention class — do not "fix" one side only.
        attention_kwargs['word_annontation_dim'] = dim_encoded

    attention = attention_class(state_names=transition.apply.states,
                                attended_dim=dim_encoded,
                                match_dim=dim_matcher,
                                name="cont_att",
                                **attention_kwargs)

    # +1 presumably accounts for the EOS/initial-output symbol — confirm.
    feedback = AttendedFeedback(num_phonemes + 1, dim_encoded)

    if criterion['name'] == 'log_likelihood':
        emitter = SoftmaxMultiEmitter(initial_output=num_phonemes,
                                      name="emitter")
    else:
        raise ValueError("Unknown criterion {}".format(criterion['name']))

    readout_source_names = (transition.apply.states
                            if use_states_for_readout else
                            []) + [attention.take_glimpses.outputs[0]]

    if use_dependent_words_for_labels:
        readout_source_names.append('attended')

    readout_config = dict(readout_dim=num_phonemes,
                          source_names=readout_source_names,
                          emitter=emitter,
                          feedback_brick=feedback,
                          name="readout")

    if post_merge_dims:
        readout_config['merged_dim'] = post_merge_dims[0]
        readout_config['post_merge'] = InitializableSequence(
            [
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP(
                    [post_merge_activation] * (len(post_merge_dims) - 1)
                    + [Identity()],
                    # MLP was designed to support Maxout as activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick
                    # below.
                    # For deeper Maxout network one has to use the
                    # Sequence brick.
                    [
                        d // getattr(post_merge_activation, 'num_pieces', 1)
                        for d in post_merge_dims
                    ] + [num_phonemes]).apply,
            ],
            name='post_merge')

    readout = Readout(**readout_config)

    generator = Generator(
        readout=readout,
        transition=transition,
        attention=attention,
        dim_dec=dim_dec,
        pointer_weight=self.pointer_weight,
        transition_with_att_class=transition_with_att_class,
        name="generator")

    # Only bricks exposing a `regularize` attribute participate in
    # regularization.
    for brick in possible_regularization_bricks:
        if 'regularize' in dir(brick):
            self.regularization_bricks += [brick]

    logger.info("Regularization bricks: {}".format(
        str(self.regularization_bricks)))

    # Remember child bricks
    self.encoder = encoder
    self.bottom = bottom
    self.top = top
    self.generator = generator
    self.children = [encoder, top, bottom, generator]
    self.children.extend(self.additional_sources_mlp.values())

    # Create input variables
    self.inputs = self.bottom.get_batch_inputs()
    self.inputs_mask = self.bottom.get_mask()

    # NOTE(review): this rebinds self.additional_sources from a list of
    # source names to the bottom brick's batch variables — intentional,
    # but easy to miss.
    self.additional_sources = self.bottom.get_batch_additional_sources()

    self.labels = tensor.lmatrix('labels' + names_postfix)
    self.labels_mask = tensor.matrix('labels' + names_postfix + '_mask')
    #self.labels_mask = tensor.matrix('labels_mask'+names_postfix)

    self.single_inputs = self.bottom.get_single_sequence_inputs()
    self.single_labels = tensor.lvector('labels' + names_postfix)
    self.single_additional_sources = self.bottom.get_single_additional_sources(
    )
    self.n_steps = tensor.lscalar('n_steps' + names_postfix)