Ejemplo n.º 1
0
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Sigmoid(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.params[0]]
    bias = [brick1.b, brick2.params[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)
Ejemplo n.º 2
0
    def _build_bricks(self, *args, **kwargs):
        # Build lookup tables
        self.word_embed = self._embed(len(self.dataset.word2index),
                                      self.config.word_embed_dim,
                                      name='word_embed')

        self.hashtag_embed = self._embed(len(self.dataset.hashtag2index),
                                         self.config.lstm_dim,
                                         name='hashtag_embed')
        # Build text encoder
        self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim,
                                output_dim=4 * self.config.lstm_dim,
                                name='mlstm_in')
        self.mlstm_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
        self.mlstm_ins.biases_init = Constant(0)
        self.mlstm_ins.initialize()
        self.mlstm = MLSTM(self.config.lstm_time,
                           self.config.lstm_dim,
                           shared=False)
        self.mlstm.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
        self.mlstm.biases_init = Constant(0)
        self.mlstm.initialize()
        self.hashtag2word = MLP(
            activations=[Tanh('hashtag2word_tanh')],
            dims=[self.config.lstm_dim, self.config.word_embed_dim],
            name='hashtag2word_mlp')
        self.hashtag2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.hashtag2word.biases_init = Constant(0)
        self.hashtag2word.initialize()
        self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
        self.hashtag2word_bias.biases_init = Constant(0)
        self.hashtag2word_bias.initialize()
        #Build character embedding
        self.char_embed = self._embed(len(self.dataset.char2index),
                                      self.config.char_embed_dim,
                                      name='char_embed')
        # Build sparse word encoder
        self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                              output_dim=self.config.word_embed_dim,
                              name='rnn_in')
        self.rnn_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim +
                                           self.config.word_embed_dim))
        self.rnn_ins.biases_init = Constant(0)
        self.rnn_ins.initialize()
        self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                                   activation=Tanh())
        self.rnn.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.rnn.initialize()
Ejemplo n.º 3
0
    def __init__(self, emitter=None, feedback_brick=None,
                 merge=None, merge_prototype=None,
                 post_merge=None, merged_dim=None, **kwargs):

        if not emitter:
            emitter = TrivialEmitter(kwargs['readout_dim'])
        if not feedback_brick:
            feedback_brick = TrivialFeedback(kwargs['readout_dim'])
        if not merge:
            merge = Merge(input_names=kwargs['source_names'],
                          prototype=merge_prototype)
        if not post_merge:
            post_merge = Bias(dim=kwargs['readout_dim'])
        if not merged_dim:
            merged_dim = kwargs['readout_dim']
        self.emitter = emitter
        self.feedback_brick = feedback_brick
        self.merge = merge
        self.post_merge = post_merge
        self.merged_dim = merged_dim

        children = [self.emitter, self.feedback_brick, self.merge,
                    self.post_merge]
        kwargs.setdefault('children', []).extend(children)
        super(Readout, self).__init__(**kwargs)
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 representation_dim,
                 theano_seed=None,
                 **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state.
        self.transition = GRUInitialState(attended_dim=state_dim,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism.
        self.attention = SequenceContentAttention2(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        readout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                          readout_dim=self.vocab_size,
                          emitter=NewSoftmaxEmitter(initial_output=-1,
                                                    theano_seed=theano_seed),
                          feedback_brick=NewLookupFeedback(
                              vocab_size, embedding_dim),
                          post_merge=InitializableFeedforwardSequence([
                              Bias(dim=state_dim, name='maxout_bias').apply,
                              Maxout(num_pieces=2, name='maxout').apply,
                              Linear(input_dim=state_dim / 2,
                                     output_dim=embedding_dim,
                                     use_bias=False,
                                     name='softmax0').apply,
                              Linear(input_dim=embedding_dim,
                                     name='softmax1').apply
                          ]),
                          merged_dim=state_dim)

        # Build sequence generator accordingly.
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()),
            cost_type='categorical_cross_entropy')

        self.children = [self.sequence_generator]
Ejemplo n.º 5
0
    def __init__(self,
                 emitter=None,
                 feedback_brick=None,
                 merge=None,
                 merge_prototype=None,
                 post_merge=None,
                 merged_dim=None,
                 **kwargs):
        super(Readout, self).__init__(**kwargs)

        if not emitter:
            emitter = TrivialEmitter(self.readout_dim)
        if not feedback_brick:
            feedback_brick = TrivialFeedback(self.readout_dim)
        if not merge:
            merge = Merge(input_names=self.source_names,
                          prototype=merge_prototype)
        if not post_merge:
            post_merge = Bias(dim=self.readout_dim)
        if not merged_dim:
            merged_dim = self.readout_dim
        self.emitter = emitter
        self.feedback_brick = feedback_brick
        self.merge = merge
        self.post_merge = post_merge
        self.merged_dim = merged_dim

        self.children = [
            self.emitter, self.feedback_brick, self.merge, self.post_merge
        ]
Ejemplo n.º 6
0
 def _build_bricks(self, *args, **kwargs):
     super(AttentionEUTHM2, self)._build_bricks()
     self.word_shift = MLP(
         activations=[Tanh('word_shift_tanh')],
         dims=[
             self.config.user_embed_dim + self.config.word_embed_dim,
             self.config.word_embed_dim
         ],
         name='word_shift_mlp')
     self.word_shift.weights_init = IsotropicGaussian(
         std=1 / numpy.sqrt(self.config.word_embed_dim +
                            self.config.user_embed_dim))
     self.word_shift.biases_init = Constant(0)
     self.word_shift.initialize()
     self.word_shift_bias = Bias(dim=1, name='word_shift_bias')
     self.word_shift_bias.biases_init = Constant(0)
     self.word_shift_bias.initialize()
Ejemplo n.º 7
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim,topical_dim,theano_seed=None, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        #self.topical_dim=topical_dim;

        # Initialize gru with special initial state
        self.transition = GRUInitialState(
            attended_dim=state_dim, dim=state_dim,
            activation=Tanh(), name='decoder')


        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim, name="attention")

        self.topical_attention=SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=topical_dim,
            match_dim=state_dim, name="topical_attention")#not sure whether the match dim would be correct.


        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(
            source_names=['states', 'feedback',
                          self.attention.take_glimpses.outputs[0]],#check!
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence(
                [Bias(dim=state_dim, name='maxout_bias').apply,
                 Maxout(num_pieces=2, name='maxout').apply,
                 Linear(input_dim=state_dim / 2, output_dim=embedding_dim,
                        use_bias=False, name='softmax0').apply,
                 Linear(input_dim=embedding_dim, name='softmax1').apply]),
            merged_dim=state_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            topical_attention=self.topical_attention,
            topical_name='topical_embeddingq',
            content_name='content_embedding',
            fork=Fork([name for name in self.transition.apply.sequences
                       if name != 'mask'], prototype=Linear())
        )

        self.children = [self.sequence_generator]
Ejemplo n.º 8
0
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 igru_state_dim,
                 igru_depth,
                 trg_dgru_depth,
                 emitter,
                 feedback_brick,
                 merge=None,
                 merge_prototype=None,
                 post_merge=None,
                 **kwargs):
        merged_dim = igru_state_dim
        if not merge:
            merge = Merge(input_names=kwargs['source_names'],
                          prototype=merge_prototype)
        if not post_merge:
            post_merge = Bias(dim=merged_dim)

        # for compatible
        if igru_depth == 1:
            self.igru = IGRU(dim=igru_state_dim)
        else:
            self.igru = RecurrentStack(
                [IGRU(dim=igru_state_dim, name='igru')] + [
                    UpperIGRU(dim=igru_state_dim,
                              activation=Tanh(),
                              name='upper_igru' + str(i))
                    for i in range(1, igru_depth)
                ],
                skip_connections=True)
        self.embedding_dim = embedding_dim
        self.emitter = emitter
        self.feedback_brick = feedback_brick
        self.merge = merge
        self.post_merge = post_merge
        self.merged_dim = merged_dim
        self.igru_depth = igru_depth
        self.trg_dgru_depth = trg_dgru_depth
        self.lookup = LookupTable(name='embeddings')
        self.vocab_size = vocab_size
        self.igru_state_dim = igru_state_dim
        self.gru_to_softmax = Linear(input_dim=igru_state_dim,
                                     output_dim=vocab_size)
        self.gru_fork = Fork([
            name for name in self.igru.apply.sequences
            if name != 'mask' and name != 'input_states'
        ],
                             prototype=Linear(),
                             name='gru_fork')

        children = [
            self.emitter, self.feedback_brick, self.merge, self.post_merge,
            self.igru, self.lookup, self.gru_to_softmax, self.gru_fork
        ]
        kwargs.setdefault('children', []).extend(children)
        super(Interpolator, self).__init__(**kwargs)
Ejemplo n.º 9
0
class AttentionEUTHM2(AttentionEUTHM):
    def __init__(self, config, dataset, *args, **kwargs):
        '''
        Define User-text-hashtag model with negtive sampling
        :param config:
        :param dataset: User-text-hashtag dataset
        '''
        super(AttentionEUTHM2, self).__init__(config, dataset)

    def _get_doc_embed(self, *args, **kwargs):
        text_vec = self._get_text_vec()
        user_vec = self.user_embed.apply(self.user)
        text_vec = tensor.concatenate([
            text_vec, user_vec[None, :, :][tensor.zeros(
                shape=(text_vec.shape[0], ), dtype='int32')]
        ],
                                      axis=2)
        text_vec = self.word_shift.apply(text_vec) + \
                        self.word_shift_bias.parameters[0][0]
        return self._encode_text_vec(text_vec)

    def _build_bricks(self, *args, **kwargs):
        super(AttentionEUTHM2, self)._build_bricks()
        self.word_shift = MLP(
            activations=[Tanh('word_shift_tanh')],
            dims=[
                self.config.user_embed_dim + self.config.word_embed_dim,
                self.config.word_embed_dim
            ],
            name='word_shift_mlp')
        self.word_shift.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim +
                               self.config.user_embed_dim))
        self.word_shift.biases_init = Constant(0)
        self.word_shift.initialize()
        self.word_shift_bias = Bias(dim=1, name='word_shift_bias')
        self.word_shift_bias.biases_init = Constant(0)
        self.word_shift_bias.initialize()
Ejemplo n.º 10
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        self.transition = GRUInitialState(attended_dim=state_dim,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        readout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=InitializableFeedforwardSequence([
                              Bias(dim=state_dim, name='maxout_bias').apply,
                              Maxout(num_pieces=2, name='maxout').apply,
                              Linear(input_dim=state_dim / 2,
                                     output_dim=embedding_dim,
                                     use_bias=False,
                                     name='softmax0').apply,
                              Linear(input_dim=embedding_dim,
                                     name='softmax1').apply
                          ]),
                          merged_dim=state_dim,
                          merge_prototype=Linear(use_bias=True))

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Ejemplo n.º 11
0
    def __init__(self, vocab_size, embedding_dim, state_dim,
                 representation_dim, **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim

        readout = Readout(
            source_names=['states', 'feedback', 'readout_context'],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(),
            feedback_brick=LookupFeedback(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=1000).apply,
                Maxout(num_pieces=2).apply,
                Linear(input_dim=state_dim / 2, output_dim=100,
                       use_bias=False).apply,
                Linear(input_dim=100).apply
            ]),
            merged_dim=1000)

        self.transition = GatedRecurrentWithContext(Tanh(),
                                                    dim=state_dim,
                                                    name='decoder')
        # Readout will apply the linear transformation to 'readout_context'
        # with a Merge brick, so no need to fork it here
        self.fork = Fork([
            name for name in self.transition.apply.contexts +
            self.transition.apply.states if name != 'readout_context'
        ],
                         prototype=Linear())
        self.tanh = Tanh()

        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            fork_inputs=[
                name for name in self.transition.apply.sequences
                if name != 'mask'
            ],
        )

        self.children = [self.fork, self.sequence_generator, self.tanh]
Ejemplo n.º 12
0
    def __init__(self, ref_data, output_dim):
        input_dim = ref_data.shape[1]
        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data')

        rng = RandomStreams()

        ae_bricks = []
        ae_input = ref_data_sh
        ae_costs = []
        for i, (idim, odim) in enumerate(zip([input_dim] + ae_dims[:-1], ae_dims)):
            ae_mlp = MLP(activations=[ae_activations[i]],
                         dims=[idim, odim],
                         name='enc%i'%i)
            enc = ae_mlp.apply(ae_input)
            enc_n = ae_mlp.apply(ae_input + rng.normal(size=ae_input.shape, std=ae_f_noise_std))
            ae_mlp_dec = MLP(activations=[ae_activations[i]],
                             dims=[odim, idim],
                             name='dec%i'%i)
            dec = ae_mlp_dec.apply(enc_n)

            cost = tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() + \
                        ae_l1_pen * abs(enc).sum(axis=1).mean()
            ae_costs.append(cost)

            ae_input = enc
            ae_bricks = ae_bricks + [ae_mlp, ae_mlp_dec]

        self.ae_costs = ae_costs

        ref_data_enc = ae_input

        # Construct the model
        j = tensor.lvector('j')
        r = ref_data_enc[j, :]
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        # input_dim must be nr
        mlp = MLP(activations=activation_functions,
                  dims=[ae_dims[-1]] + hidden_dims + [n_inter], name='inter_gen')
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp')

        inter_weights = mlp.apply(r)

        if inter_bias == None:
            ibias = Bias(n_inter)
            ibias.biases_init = Constant(0)
            ibias.initialize()
            inter = ibias.apply(tensor.dot(x, inter_weights))
        else:
            inter = tensor.dot(x, inter_weights) - inter_bias
        inter = inter_act_fun.apply(inter)

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        # error_rate = tensor.neq(y, pred).mean()
        ber = balanced_error_rate.ber(y, pred)

        # Initialize parameters
        for brick in ae_bricks + [mlp, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, ber])

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        if x_dropout != 0:
            cg = apply_dropout(cg, [x], x_dropout)

        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            s_dropout_vars = list(set(VariableFilter(bricks=[Tanh], name='output')
                                                     (ComputationGraph([inter_weights])))
                                 - set([inter_weights]))
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(set(VariableFilter(bricks=[Tanh], name='output')
                                                     (ComputationGraph([final])))
                                 - set([inter_weights]) - set(s_dropout_vars))
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if r_noise_std != 0:
            cg = apply_noise(cg, [r], r_noise_std)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, ber_reg] = cg.outputs
        
        if s_l1pen != 0:
            s_weights = VariableFilter(bricks=mlp.linear_transformations, roles=[WEIGHT])(cg)
            cost_reg = cost_reg + s_l1pen * sum(abs(w).sum() for w in s_weights)
        if i_l1pen != 0:
            cost_reg = cost_reg + i_l1pen * abs(inter).sum()
        if a_l1pen != 0:
            a_weights = VariableFilter(bricks=mlp2.linear_transformations, roles=[WEIGHT])(cg)
            cost_reg = cost_reg + a_l1pen * sum(abs(w).sum() for w in a_weights)


        self.cost = cost
        self.cost_reg = cost_reg
        self.ber = ber
        self.ber_reg = ber_reg
        self.pred = pred
        self.confidence = confidence
Ejemplo n.º 13
0
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name="linear1")
    brick2 = Bias(2, name="bias1")
    activation = Logistic(name="sigm")

    x = tensor.vector()
    h1 = brick1.apply(x)
    h2 = activation.apply(h1)
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name="W_norm")
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex="W_no.?m")
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name="h2act")
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex="h2a.?t")
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    variables = [cg.variables[1], cg.variables[8]]
    assert variables == appli_filter(cg.variables)

    # Testing filtering by application
    appli_filter_list = VariableFilter(applications=[brick1.apply])
    assert variables == appli_filter_list(cg.variables)

    input1 = tensor.matrix("input1")
    input2 = tensor.matrix("input2")
    merge = Merge(["input1", "input2"], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
Ejemplo n.º 14
0
    def __init__(self, ref_data, output_dim):
        input_dim = ref_data.shape[1]

        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                    name='ref_data')

        # Construct the model
        j = tensor.lvector('j')
        r = ref_data_sh[j, :]
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        # input_dim must be nr
        mlp0 = MLP(activations=activation_functions_0,
                   dims=[input_dim] + hidden_dims_0,
                   name='e0')
        mlp0vs = MLP(activations=[None],
                     dims=[hidden_dims_0[-1], input_dim],
                     name='de0')
        mlp1 = MLP(activations=activation_functions_1,
                   dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter],
                   name='inter_gen')
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp')

        encod = mlp0.apply(r)
        rprime = mlp0vs.apply(encod)
        inter_weights = mlp1.apply(encod)

        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights)))

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        error_rate = tensor.neq(y, pred).mean()

        # Initialize parameters
        for brick in [mlp0, mlp0vs, mlp1, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, error_rate])

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            s_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([inter_weights]))) -
                set([inter_weights]))
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([final]))) - set([inter_weights]) -
                set(s_dropout_vars))
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, error_rate_reg] = cg.outputs

        # add reconstruction penalty for AE part
        penalty_val = tensor.sqrt(((r - rprime)**2).sum(axis=1)).mean()
        cost_reg = cost_reg + reconstruction_penalty * penalty_val

        self.cost = cost
        self.cost_reg = cost_reg
        self.error_rate = error_rate
        self.error_rate_reg = error_rate_reg
        self.pred = pred
        self.confidence = confidence
Ejemplo n.º 15
0
def test_variable_filter():
    # Creating computation graph
    brick1 = Linear(input_dim=2, output_dim=2, name='linear1')
    brick2 = Bias(2, name='bias1')
    activation = Logistic(name='sigm')

    x = tensor.vector()
    h1 = brick1.apply(x, call_id='brick1_call_id')
    h2 = activation.apply(h1, call_id='act')
    h2.name = "h2act"
    y = brick2.apply(h2)
    cg = ComputationGraph(y)

    parameters = [brick1.W, brick1.b, brick2.parameters[0]]
    bias = [brick1.b, brick2.parameters[0]]
    brick1_bias = [brick1.b]

    # Testing filtering by role
    role_filter = VariableFilter(roles=[PARAMETER])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[FILTER])
    assert [] == role_filter(cg.variables)

    # Testing filtering by role using each_role flag
    role_filter = VariableFilter(roles=[PARAMETER, BIAS])
    assert parameters == role_filter(cg.variables)
    role_filter = VariableFilter(roles=[PARAMETER, BIAS], each_role=True)
    assert not parameters == role_filter(cg.variables)
    assert bias == role_filter(cg.variables)

    # Testing filtering by bricks classes
    brick_filter = VariableFilter(roles=[BIAS], bricks=[Linear])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by bricks instances
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by brick instance
    brick_filter = VariableFilter(roles=[BIAS], bricks=[brick1])
    assert brick1_bias == brick_filter(cg.variables)

    # Testing filtering by name
    name_filter = VariableFilter(name='W_norm')
    assert [cg.variables[2]] == name_filter(cg.variables)

    # Testing filtering by name regex
    name_filter_regex = VariableFilter(name_regex='W_no.?m')
    assert [cg.variables[2]] == name_filter_regex(cg.variables)

    # Testing filtering by theano name
    theano_name_filter = VariableFilter(theano_name='h2act')
    assert [cg.variables[11]] == theano_name_filter(cg.variables)

    # Testing filtering by theano name regex
    theano_name_filter_regex = VariableFilter(theano_name_regex='h2a.?t')
    assert [cg.variables[11]] == theano_name_filter_regex(cg.variables)

    brick1_apply_variables = [cg.variables[1], cg.variables[8]]
    # Testing filtering by application
    appli_filter = VariableFilter(applications=[brick1.apply])
    assert brick1_apply_variables == appli_filter(cg.variables)

    # Testing filtering by unbound application
    unbound_appli_filter = VariableFilter(applications=[Linear.apply])
    assert brick1_apply_variables == unbound_appli_filter(cg.variables)

    # Testing filtering by call identifier
    call_id_filter = VariableFilter(call_id='brick1_call_id')
    assert brick1_apply_variables == call_id_filter(cg.variables)

    input1 = tensor.matrix('input1')
    input2 = tensor.matrix('input2')
    merge = Merge(['input1', 'input2'], [5, 6], 2)
    merged = merge.apply(input1, input2)
    merge_cg = ComputationGraph(merged)
    outputs = VariableFilter(
        roles=[OUTPUT], bricks=[merge])(merge_cg.variables)
    assert merged in outputs
    assert len(outputs) == 3

    outputs_application = VariableFilter(
        roles=[OUTPUT], applications=[merge.apply])(merge_cg.variables)
    assert outputs_application == [merged]
Ejemplo n.º 16
0
    def __init__(self, ref_data, output_dim):
        input_dim = ref_data.shape[1]
        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                    name='ref_data')

        rng = RandomStreams()

        ae_bricks = []
        ae_input = ref_data_sh
        ae_costs = []
        for i, (idim,
                odim) in enumerate(zip([input_dim] + ae_dims[:-1], ae_dims)):
            ae_mlp = MLP(activations=[ae_activations[i]],
                         dims=[idim, odim],
                         name='enc%i' % i)
            enc = ae_mlp.apply(ae_input)
            enc_n = ae_mlp.apply(
                ae_input + rng.normal(size=ae_input.shape, std=ae_f_noise_std))
            ae_mlp_dec = MLP(activations=[ae_activations[i]],
                             dims=[odim, idim],
                             name='dec%i' % i)
            dec = ae_mlp_dec.apply(enc_n)

            cost = tensor.sqrt(((ae_input - dec) ** 2).sum(axis=1)).mean() + \
                        ae_l1_pen * abs(enc).sum(axis=1).mean()
            ae_costs.append(cost)

            ae_input = enc
            ae_bricks = ae_bricks + [ae_mlp, ae_mlp_dec]

        self.ae_costs = ae_costs

        ref_data_enc = ae_input

        # Construct the model
        j = tensor.lvector('j')
        r = ref_data_enc[j, :]
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        # input_dim must be nr
        mlp = MLP(activations=activation_functions,
                  dims=[ae_dims[-1]] + hidden_dims + [n_inter],
                  name='inter_gen')
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp')

        inter_weights = mlp.apply(r)

        if inter_bias == None:
            ibias = Bias(n_inter)
            ibias.biases_init = Constant(0)
            ibias.initialize()
            inter = ibias.apply(tensor.dot(x, inter_weights))
        else:
            inter = tensor.dot(x, inter_weights) - inter_bias
        inter = inter_act_fun.apply(inter)

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        # error_rate = tensor.neq(y, pred).mean()
        ber = balanced_error_rate.ber(y, pred)

        # Initialize parameters
        for brick in ae_bricks + [mlp, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, ber])

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        if x_dropout != 0:
            cg = apply_dropout(cg, [x], x_dropout)

        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            s_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([inter_weights]))) -
                set([inter_weights]))
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([final]))) - set([inter_weights]) -
                set(s_dropout_vars))
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if r_noise_std != 0:
            cg = apply_noise(cg, [r], r_noise_std)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, ber_reg] = cg.outputs

        if s_l1pen != 0:
            s_weights = VariableFilter(bricks=mlp.linear_transformations,
                                       roles=[WEIGHT])(cg)
            cost_reg = cost_reg + s_l1pen * sum(
                abs(w).sum() for w in s_weights)
        if i_l1pen != 0:
            cost_reg = cost_reg + i_l1pen * abs(inter).sum()
        if a_l1pen != 0:
            a_weights = VariableFilter(bricks=mlp2.linear_transformations,
                                       roles=[WEIGHT])(cg)
            cost_reg = cost_reg + a_l1pen * sum(
                abs(w).sum() for w in a_weights)

        self.cost = cost
        self.cost_reg = cost_reg
        self.ber = ber
        self.ber_reg = ber_reg
        self.pred = pred
        self.confidence = confidence
Ejemplo n.º 17
0
    def __init__(self, ref_data, output_dim):
        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name='ref_data')

        # Construct the model
        j = tensor.lvector('j')
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        last_outputs = []
        s_dropout_vars = []
        r_dropout_vars = []
        i_dropout_vars = []
        penalties = []

        for i in range(nparts):
            fs = numpy.random.binomial(1, part_r_proba, size=(ref_data.shape[1],))
            input_dim = int(fs.sum())

            fs_sh = theano.shared(fs)
            r = ref_data_sh[j, :][:, fs_sh.nonzero()[0]]

            mlp0 = MLP(activations=activation_functions_0,
                      dims=[input_dim] + hidden_dims_0, name='enc%d'%i)
            mlp0r = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name='dec%d'%i)
            mlp1 = MLP(activations=activation_functions_1,
                      dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name='inter_gen_%d'%i)
            mlp2 = MLP(activations=activation_functions_2 + [None],
                       dims=[n_inter] + hidden_dims_2 + [output_dim],
                       name='end_mlp_%d'%i)

            encod = mlp0.apply(r)
            rprime = mlp0r.apply(encod)
            inter_weights = mlp1.apply(encod)

            ibias = Bias(n_inter, name='inter_bias_%d'%i)
            inter = ibias.apply(tensor.dot(x, inter_weights))
            inter = inter_act_fun.apply(inter)

            out = mlp2.apply(inter)

            penalties.append(tensor.sqrt(((rprime - r)**2).sum(axis=1)).mean()[None])

            last_outputs.append(out)

            r_dropout_vars.append(r)
            s_dropout_vars = s_dropout_vars + (
                                    VariableFilter(bricks=[Tanh], name='output')
                                                  (ComputationGraph([inter_weights]))
                            )
            i_dropout_vars.append(inter)

            # Initialize parameters
            for brick in [mlp0, mlp0r, mlp1, mlp2, ibias]:
                brick.weights_init = IsotropicGaussian(0.01)
                brick.biases_init = Constant(0.001)
                brick.initialize()

        final = tensor.concatenate([x[:, :, None] for x in last_outputs], axis=2).mean(axis=2)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        error_rate = tensor.neq(y, pred).mean()

        # apply regularization
        cg = ComputationGraph([cost, error_rate])

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        if s_dropout != 0:
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)
        if r_dropout != 0:
            cg = apply_dropout(cg, r_dropout_vars, r_dropout)
        if i_dropout != 0:
            cg = apply_dropout(cg, i_dropout_vars, i_dropout)

        [cost_reg, error_rate_reg] = cg.outputs

        cost_reg = cost_reg + reconstruction_penalty * tensor.concatenate(penalties, axis=0).sum()

        self.cost = cost
        self.cost_reg = cost_reg
        self.error_rate = error_rate
        self.error_rate_reg = error_rate_reg
        self.pred = pred
        self.confidence = confidence
Ejemplo n.º 18
0
    def __init__(self,
                 input_dims,
                 input_num_chars,
                 bos_label, eos_label,
                 num_labels,
                 dim_dec, dims_bidir,
                 enc_transition, dec_transition,
                 use_states_for_readout,
                 attention_type,
                 criterion,
                 bottom,
                 lm=None, token_map=None,
                 bidir=True, window_size=None,
                 max_length=None, subsample=None,
                 dims_top=None, extra_input_dim=None,
                 prior=None, conv_n=None,
                 post_merge_activation=None,
                 post_merge_dims=None,
                 dim_matcher=None,
                 embed_outputs=True,
                 dim_output_embedding=None,
                 reuse_bottom_lookup_table=False,
                 dec_stack=1,
                 conv_num_filters=1,
                 data_prepend_eos=True,
                 # softmax is the default set in SequenceContentAndConvAttention
                 energy_normalizer=None,
                 # for speech this is the approximate phoneme duration in frames
                 max_decoded_length_scale=1,
                 # for criterions involving generation of outputs, whether
                 # or not they should be generated by the recognizer itself
                 generate_predictions=True,
                 compute_targets=True,
                 extra_generation_steps=3,
                 **kwargs):
        all_arguments = copy.deepcopy(locals())
        all_arguments.update(copy.deepcopy(kwargs))
        del all_arguments['kwargs']
        del all_arguments['self']

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(EncoderDecoder, self).__init__(**kwargs)
        self.bos_label = bos_label
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion
        self.generate_predictions = generate_predictions
        self.extra_generation_steps = extra_generation_steps
        self.compute_targets = compute_targets

        self.max_decoded_length_scale = max_decoded_length_scale

        post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(
            input_dims=input_dims, input_num_chars=input_num_chars,
            name='bottom',
            **bottom)

        # BiRNN
        if dims_bidir:
            if not subsample:
                subsample = [1] * len(dims_bidir)
            encoder = Encoder(self.enc_transition, dims_bidir,
                            bottom.get_dim(bottom.apply.outputs[0]),
                            subsample, bidir=bidir)
        elif window_size:
            encoder = ConvEncoder(
                max_length, bottom.get_dim(bottom.apply.outputs[0]), window_size)
        else:
            raise ValueError("Don't know which Encoder to use")
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [dim_encoded] + dims_top + [dim_encoded], name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(
                dim=dim_dec, activation=Tanh(), name="transition")
        else:
            assert not extra_input_dim
            transitions = [self.dec_transition(dim=dim_dec,
                                               activation=Tanh(),
                                               name="transition_{}".format(trans_level))
                           for trans_level in xrange(dec_stack)]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=dim_encoded, match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError("Unknown attention type {}"
                             .format(attention_type))
        if not embed_outputs:
            raise ValueError("embed_outputs=False is not supported any more")
        if not reuse_bottom_lookup_table:
            embedding = LookupTable(num_labels + 1,
                            dim_dec if
                            dim_output_embedding is None
                            else dim_output_embedding)
        else:
            embedding = bottom.children[0]
        feedback = Feedback(
            embedding=embedding,
            output_names=[s for s in transition.apply.sequences
                           if s != 'mask'])

        # Create a readout
        readout_config = dict(
            num_tokens=num_labels,
            input_names=(transition.apply.states if use_states_for_readout else [])
                         + [attention.take_glimpses.outputs[0]],
            name="readout")
        if post_merge_dims:
            readout_config['merge_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence([
                Bias(post_merge_dims[0]).apply,
                post_merge_activation.apply,
                MLP([post_merge_activation] * (len(post_merge_dims) - 1) + [Identity()],
                    # MLP was designed to support Maxout is activation
                    # (because Maxout in a way is not one). However
                    # a single layer Maxout network works with the trick below.
                    # For deeper Maxout network one has to use the
                    # Sequence brick.
                    [d//getattr(post_merge_activation, 'num_pieces', 1)
                     for d in post_merge_dims] + [num_labels]).apply,
            ], name='post_merge')
        if 'reward' in criterion and criterion['name'] != 'log_likelihood':
            if criterion['reward'] == 'edit_distance':
                readout_config['reward_brick'] = EditDistanceReward(
                    self.bos_label, self.eos_label)
            elif criterion['reward'] == 'delta_edit_distance':
                readout_config['reward_brick'] = EditDistanceReward(
                    self.bos_label, self.eos_label, deltas=True)
            elif criterion['reward'] == 'bleu':
                readout_config['reward_brick'] = BleuReward(
                    self.bos_label, self.eos_label, deltas=False)
            elif criterion['reward'] == 'delta_bleu':
                readout_config['reward_brick'] = BleuReward(
                    self.bos_label, self.eos_label, deltas=True)
            else:
                raise ValueError("Unknown reward type")
        if criterion['name'] == 'log_likelihood':
            readout_class = SoftmaxReadout
        elif criterion['name'] == 'critic':
            readout_class = CriticReadout
            criterion_copy = dict(criterion)
            del criterion_copy['name']
            readout_config.update(**criterion_copy)
        elif criterion['name'] == 'reinforce':
            readout_class = ReinforceReadout
            readout_config['merge_names'] = list(readout_config['input_names'])
            readout_config['entropy'] = criterion.get('entropy')
            readout_config['input_names'] += ['attended', 'attended_mask']
        elif criterion['name'] in ['sarsa', 'actor_critic']:
            readout_class = ActorCriticReadout
            if criterion['name'] == 'actor_critic':
                critic_arguments = dict(all_arguments)
                # No worries, critic will not compute log likelihood values.
                # We
                critic_arguments['criterion'] = {
                    'name': 'critic',
                    'value_softmax': criterion.get('value_softmax'),
                    'same_value_for_wrong': criterion.get('same_value_for_wrong'),
                    'groundtruth_word_bonus': criterion.get('groundtruth_word_bonus'),
                    'dueling_outputs':  criterion.get('dueling_outputs')}
                critic_arguments['name'] = 'critic'
                if criterion.get('critic_uses_actor_states'):
                    critic_arguments['extra_input_dim'] = dim_dec
                if (criterion.get('value_softmax')
                        or criterion.get('same_value_for_wrong')
                        or criterion.get('dueling_outputs')):
                    # Add an extra output for the critic
                    critic_arguments['num_labels'] = num_labels + 1
                if criterion.get('force_bidir'):
                    critic_arguments['dims_bidir'] = [dim_dec]
                critic_arguments['reuse_bottom_lookup_table'] = True
                critic_arguments['input_num_chars'] = {'inputs': num_labels}
                if criterion.get('downsize_critic'):
                    critic_arguments = _downsize_config(
                        critic_arguments, criterion['downsize_critic'])
                critic = EncoderDecoder(**critic_arguments)
                readout_config['critic'] = critic
            readout_config['merge_names'] = list(readout_config['input_names'])
            readout_config['freeze_actor'] = criterion.get('freeze_actor')
            readout_config['freeze_critic'] = criterion.get('freeze_critic')
            readout_config['critic_uses_actor_states'] = criterion.get('critic_uses_actor_states')
            readout_config['critic_uses_groundtruth'] = criterion.get('critic_uses_groundtruth')
            readout_config['critic_burnin_steps'] = criterion.get('critic_burnin_steps')
            readout_config['critic_loss'] = criterion.get('critic_loss')
            readout_config['discount'] = criterion.get('discount')
            readout_config['entropy_reward_coof'] = criterion.get('entropy_reward_coof')
            readout_config['cross_entropy_reward_coof'] = criterion.get('cross_entropy_reward_coof')
            readout_config['value_penalty'] = criterion.get('value_penalty')
            readout_config['value_penalty_type'] = criterion.get('value_penalty_type')
            readout_config['critic_policy_t'] = criterion.get('critic_policy_t')
            readout_config['bos_token'] = bos_label
            readout_config['accumulate_outputs'] = criterion.get('accumulate_outputs')
            readout_config['use_value_biases'] = criterion.get('use_value_biases')
            readout_config['actor_grad_estimate'] = criterion.get('actor_grad_estimate')
            readout_config['input_names'] += ['attended', 'attended_mask']
            # Note, that settings below are for the "clean" mode.
            # When get_cost_graph() is run with training=True, they
            # are temporarily overriden with the "real" settings from
            # "criterion"
            readout_config['compute_targets'] = True
            readout_config['trpo_coef'] = 0.0
            readout_config['solve_bellman'] = True
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout = readout_class(**readout_config)

        if lm:
            raise ValueError("LM is currently not supported")

        recurrent = AttentionRecurrent(transition, attention)
        if extra_input_dim:
            recurrent = RecurrentWithExtraInput(
                recurrent, "extra_inputs", extra_input_dim, name="with_extra_inputs")
        generator = SequenceGenerator(
            recurrent=recurrent, readout=readout, feedback=feedback,
            name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.softmax = Softmax()
        self.children = [encoder, top, bottom, generator, self.softmax]

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.predicted_labels = tensor.lmatrix('predicted_labels')
        self.predicted_mask = tensor.matrix('predicted_mask')
        self.prefix_labels = tensor.lmatrix('prefix_labels')
        self.prefix_steps = tensor.lscalar('prefix_steps')

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.single_predicted_labels = tensor.lvector('predicted_labels')
        self.n_steps = tensor.lscalar('n_steps')

        # Configure mixed_generate
        if criterion['name'] == 'actor_critic':
            critic = self.generator.readout.critic
            self.mixed_generate.sequences = []
            self.mixed_generate.states = (
                ['step'] +
                self.generator.recurrent.apply.states +
                ['critic_' + name for name in critic.generator.recurrent.apply.states])
            self.mixed_generate.outputs = (
                ['samples', 'step'] +
                self.generator.recurrent.apply.outputs +
                ['critic_' + name for name in critic.generator.recurrent.apply.outputs])
            self.mixed_generate.contexts = (
                self.generator.recurrent.apply.contexts +
                ['critic_' + name for name in critic.generator.recurrent.apply.contexts]
                + ['groundtruth', 'groundtruth_mask'])
            self.initial_states.outputs = self.mixed_generate.states

        self.prefix_generate.sequences = []
        self.prefix_generate.states = ['step'] + self.generator.recurrent.apply.states
        self.prefix_generate.outputs = ['samples', 'step'] + self.generator.recurrent.apply.outputs
        self.prefix_generate.contexts = self.generator.recurrent.apply.contexts
Ejemplo n.º 19
0
    def __init__(self,
                 vocab_size,
                 topicWord_size,
                 embedding_dim,
                 state_dim,
                 topical_dim,
                 representation_dim,
                 match_function='SumMacthFunction',
                 use_doubly_stochastic=False,
                 lambda_ds=0.001,
                 use_local_attention=False,
                 window_size=10,
                 use_step_decay_cost=False,
                 use_concentration_cost=False,
                 lambda_ct=10,
                 use_stablilizer=False,
                 lambda_st=50,
                 theano_seed=None,
                 **kwargs):
        super(Decoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.topicWord_size = topicWord_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRU(attended_dim=state_dim,
                              dim=state_dim,
                              activation=Tanh(),
                              name='decoder')

        self.energy_computer = globals()[match_function](name='energy_comp')

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            energy_computer=self.energy_computer,
            use_local_attention=use_local_attention,
            window_size=window_size,
            name="attention")

        self.topical_attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=topical_dim,
            match_dim=state_dim,
            energy_computer=self.energy_computer,
            use_local_attention=use_local_attention,
            window_size=window_size,
            name="topical_attention"
        )  #not sure whether the match dim would be correct.

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                          readout_dim=self.vocab_size,
                          emitter=SoftmaxEmitter(initial_output=-1,
                                                 theano_seed=theano_seed),
                          feedback_brick=LookupFeedbackWMT15(
                              vocab_size, embedding_dim),
                          post_merge=InitializableFeedforwardSequence([
                              Bias(dim=state_dim, name='maxout_bias').apply,
                              Maxout(num_pieces=2, name='maxout').apply,
                              Linear(input_dim=state_dim / 2,
                                     output_dim=embedding_dim,
                                     use_bias=False,
                                     name='softmax0').apply,
                              Linear(input_dim=embedding_dim,
                                     name='softmax1').apply
                          ]),
                          merged_dim=state_dim,
                          name='readout')

        # calculate the readout of topic word,
        # no specific feedback brick, use the trival feedback break
        # no post_merge and merge, use Bias and Linear
        topicWordReadout = Readout(source_names=[
            'states', 'feedback', self.attention.take_glimpses.outputs[0]
        ],
                                   readout_dim=self.topicWord_size,
                                   emitter=SoftmaxEmitter(
                                       initial_output=-1,
                                       theano_seed=theano_seed),
                                   name='twReadout')

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            topicWordReadout=topicWordReadout,
            topic_vector_names=['topicSumVector'],
            transition=self.transition,
            attention=self.attention,
            topical_attention=self.topical_attention,
            q_dim=self.state_dim,
            #q_name='topic_embedding',
            topical_name='topic_embedding',
            content_name='content_embedding',
            use_step_decay_cost=use_step_decay_cost,
            use_doubly_stochastic=use_doubly_stochastic,
            lambda_ds=lambda_ds,
            use_concentration_cost=use_concentration_cost,
            lambda_ct=lambda_ct,
            use_stablilizer=use_stablilizer,
            lambda_st=lambda_st,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Ejemplo n.º 20
0
class EUTHM(UTHM):
    '''
    UTH model with extend information
    '''
    def __init__(self, config, dataset, *args, **kwargs):
        super(EUTHM, self).__init__(config, dataset)

    def _define_inputs(self, *args, **kwargs):
        super(EUTHM, self)._define_inputs()
        self.user_word = tensor.ivector('user_word')
        self.user_word_sparse_mask = tensor.vector('user_word_sparse_mask',
                                                   dtype=theano.config.floatX)
        self.user_word_left_idx = tensor.ivector('user_word_idx_left_idx')
        self.user_word_right_idx = tensor.ivector('user_word_idx_right_idx')
        self.hashtag_word = tensor.ivector('hashtag_word')
        self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask',
                                                 dtype=theano.config.floatX)
        self.hashtag_word_left_idx = tensor.ivector(
            'hashtag_word_idx_left_idx')
        self.hashtag_word_right_idx = tensor.ivector(
            'hashtag_word_idx_right_idx')
        self.sparse_word = tensor.imatrix('sparse_word')
        self.sparse_word_sparse_mask = tensor.vector(
            'sparse_word_sparse_mask', dtype=theano.config.floatX)
        self.sparse_word_mask = tensor.matrix('sparse_word_mask',
                                              dtype=theano.config.floatX)
        self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx')
        self.sparse_word_right_idx = tensor.ivector(
            'sparse_word_idx_right_idx')

    def _build_bricks(self, *args, **kwargs):
        # Build lookup tables
        super(EUTHM, self)._build_bricks()
        self.user2word = MLP(
            activations=[Tanh('user2word_tanh')],
            dims=[self.config.user_embed_dim, self.config.word_embed_dim],
            name='user2word_mlp')
        self.user2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.user2word.biases_init = Constant(0)
        self.user2word.initialize()
        self.hashtag2word = MLP(
            activations=[Tanh('hashtag2word_tanh')],
            dims=[
                self.config.user_embed_dim + self.config.word_embed_dim,
                self.config.word_embed_dim
            ],
            name='hashtag2word_mlp')
        self.hashtag2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.hashtag2word.biases_init = Constant(0)
        self.hashtag2word.initialize()
        self.user2word_bias = Bias(dim=1, name='user2word_bias')
        self.user2word_bias.biases_init = Constant(0)
        self.user2word_bias.initialize()
        self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
        self.hashtag2word_bias.biases_init = Constant(0)
        self.hashtag2word_bias.initialize()
        #Build character embedding
        self.char_embed = self._embed(len(self.dataset.char2index),
                                      self.config.char_embed_dim,
                                      name='char_embed')
        # Build sparse word encoder
        self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                              output_dim=self.config.word_embed_dim,
                              name='rnn_in')
        self.rnn_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim +
                                           self.config.word_embed_dim))
        self.rnn_ins.biases_init = Constant(0)
        self.rnn_ins.initialize()
        self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                                   activation=Tanh())
        self.rnn.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.rnn.initialize()

    def _set_OV_value(self, *args, **kwargs):
        '''Train a <unk> representation'''
        tensor.set_subtensor(
            self.char_embed.W[self.dataset.char2index['<unk>']],
            numpy.zeros(self.config.char_embed_dim,
                        dtype=theano.config.floatX))

    def _get_text_vec(self, *args, **kwargs):
        # Transpose text
        self.text = self.text.dimshuffle(1, 0)
        self.text_mask = self.text_mask.dimshuffle(1, 0)
        self.sparse_word = self.sparse_word.dimshuffle(1, 0)
        self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0)
        # Turn word, user and hashtag into vector representation
        text_vec = self.word_embed.apply(self.text)
        # Apply user word, hashtag word and url
        text_vec = self._apply_user_word(text_vec)
        text_vec = self._apply_hashtag_word(text_vec)
        text_vec = self._apply_sparse_word(text_vec)
        return text_vec

    @abstractmethod
    def _apply_user_word(self, text_vec, *args, **kwargs):
        '''
        Replace @a with transformed author vector
        :param text_vec:
        :param args:
        :param kwargs:
        :return:
        '''
        user_word_vec = self.user2word.apply(self.user_embed.apply(self.user_word)) + \
                        self.user2word_bias.parameters[0][0]
        text_vec = tensor.set_subtensor(
            text_vec[self.user_word_right_idx, self.user_word_left_idx],
            text_vec[self.user_word_right_idx, self.user_word_left_idx] *
            (1 - self.user_word_sparse_mask[:, None]) +
            user_word_vec * self.user_word_sparse_mask[:, None])
        return text_vec

    @abstractmethod
    def _apply_hashtag_word(self, text_vec, *args, **kwargs):
        '''
        Replace #h with transformed hashtag vector
        :param text_vec:
        :param args:
        :param kwargs:
        :return:
        '''
        hashtag_word_vec = self.hashtag2word.apply(self.hashtag_embed.apply(self.hashtag_word)) +\
                           self.hashtag2word_bias.parameters[0][0]
        text_vec = tensor.set_subtensor(
            text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx],
            text_vec[self.hashtag_word_right_idx, self.hashtag_word_left_idx] *
            (1 - self.hashtag_sparse_mask[:, None]) +
            hashtag_word_vec * self.hashtag_sparse_mask[:, None])
        return text_vec

    @abstractmethod
    def _apply_sparse_word(self, text_vec, *args, **kwargs):
        '''
        Replace sparse word encoding with character embedding. (maybe lstm)
        :param text_vec:
        :param args:
        :param kwargs:
        :return:
        '''
        sparse_word_vec = self.char_embed.apply(self.sparse_word)
        sparse_word_hiddens = self.rnn.apply(
            inputs=self.rnn_ins.apply(sparse_word_vec),
            mask=self.sparse_word_mask)
        tmp = sparse_word_hiddens[-1]
        text_vec = tensor.set_subtensor(
            text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx],
            text_vec[self.sparse_word_right_idx, self.sparse_word_left_idx] *
            (1 - self.sparse_word_sparse_mask[:, None]) +
            tmp * self.sparse_word_sparse_mask[:, None])
        return text_vec
Ejemplo n.º 21
0
def main(config): 
	vocab_src, _ = text_to_dict([config['train_src'],
		config['dev_src'], config['test_src']])
	vocab_tgt, cabvo = text_to_dict([config['train_tgt'],
		config['dev_tgt']])

	# Create Theano variables
	logger.info('Creating theano variables')
	source_sentence = tensor.lmatrix('source')
	source_sentence_mask = tensor.matrix('source_mask')
	target_sentence = tensor.lmatrix('target')
	target_sentence_mask = tensor.matrix('target_mask')
	source_sentence.tag.test_value = [[13, 20, 0, 20, 0, 20, 0],
										[1, 4, 8, 4, 8, 4, 8],]
	source_sentence_mask.tag.test_value = [[0, 1, 0, 1, 0, 1, 0],
											[1, 0, 1, 0, 1, 0, 1],]
	target_sentence.tag.test_value = [[0,1,1,5],
										[2,0,1,0],]
	target_sentence_mask.tag.test_value = [[0,1,1,0],
											[1,1,1,0],]


	logger.info('Building RNN encoder-decoder')
	### Building Encoder 
	embedder = LookupTable(
		length=len(vocab_src), 
		dim=config['embed_src'], 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='embedder')
	transformer = Linear(
		config['embed_src'], 
		config['hidden_src']*4, 
		weights_init=IsotropicGaussian(),
		biases_init=Constant(0.0), 
		name='transformer')

	lstminit = np.asarray([0.0,]*config['hidden_src']+[0.0,]*config['hidden_src']+[1.0,]*config['hidden_src']+[0.0,]*config['hidden_src'])
	encoder = Bidirectional(
		LSTM(
			dim=config['hidden_src'], 
			weights_init=IsotropicGaussian(0.01),
			biases_init=Constant(lstminit)),
		name='encoderBiLSTM'
		)
	encoder.prototype.weights_init = Orthogonal()
	
	### Building Decoder 
	lstminit = np.asarray([0.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt']+[1.0,]*config['hidden_tgt']+[0.0,]*config['hidden_tgt'])
	transition = LSTM2GO(
		attended_dim=config['hidden_tgt'], 
		dim=config['hidden_tgt'], 
		weights_init=IsotropicGaussian(0.01),
		biases_init=Constant(lstminit), 
		name='decoderLSTM')

	attention = SequenceContentAttention( 
		state_names=transition.apply.states, # default activation is Tanh
		state_dims=[config['hidden_tgt']],
		attended_dim=config['hidden_src']*2,
		match_dim=config['hidden_tgt'], 
		name="attention")

	readout = Readout(
		source_names=['states', 
			'feedback', 
			attention.take_glimpses.outputs[0]],
		readout_dim=len(vocab_tgt),
		emitter = SoftmaxEmitter(
			name='emitter'), 
		feedback_brick = LookupFeedback(
			num_outputs=len(vocab_tgt), 
			feedback_dim=config['embed_tgt'], 
			name='feedback'), 
		post_merge=InitializableFeedforwardSequence([
			Bias(dim=config['hidden_tgt'], 
				name='softmax_bias').apply,
			Linear(input_dim=config['hidden_tgt'], 
				output_dim=config['embed_tgt'],
				use_bias=False, 
				name='softmax0').apply,
			Linear(input_dim=config['embed_tgt'], 
				name='softmax1').apply]),
		merged_dim=config['hidden_tgt'])

	decoder = SequenceGenerator(
		readout=readout, 
		transition=transition, 
		attention=attention, 
		weights_init=IsotropicGaussian(0.01), 
		biases_init=Constant(0),
		name="generator",
		fork=Fork(
			[name for name in transition.apply.sequences if name != 'mask'], 
			prototype=Linear()),
		add_contexts=True)
	decoder.transition.weights_init = Orthogonal()

	#printchildren(encoder, 1)
	# Initialize model
	logger.info('Initializing model')
	embedder.initialize()
	transformer.initialize()
	encoder.initialize()
	decoder.initialize()
	
	# Apply model 
	embedded = embedder.apply(source_sentence)
	tansformed = transformer.apply(embedded)
	encoded = encoder.apply(tansformed)[0]
	generated = decoder.generate(
		n_steps=2*source_sentence.shape[1], 
		batch_size=source_sentence.shape[0], 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask=tensor.ones(source_sentence.shape).T
		)
	print 'Generated: ', generated
	# generator_generate_outputs
	#samples = generated[1] # For GRU 
	samples = generated[2] # For LSTM
	samples.name = 'samples'
	#samples_cost = generated[4] # For GRU 
	samples_cost = generated[5] # For LSTM
	samples_cost = 'sampling_cost'
	cost = decoder.cost(
		mask = target_sentence_mask.T, 
		outputs = target_sentence.T, 
		attended = encoded.dimshuffle(1,0,2), 
		attended_mask = source_sentence_mask.T)
	cost.name = 'target_cost'
	cost.tag.aggregation_scheme = TakeLast(cost)
	model = Model(cost)
	
	logger.info('Creating computational graph')
	cg = ComputationGraph(cost)
	
	# apply dropout for regularization
	if config['dropout'] < 1.0: # dropout is applied to the output of maxout in ghog
		logger.info('Applying dropout')
		dropout_inputs = [x for x in cg.intermediary_variables if x.name == 'maxout_apply_output']
		cg = apply_dropout(cg, dropout_inputs, config['dropout'])

	######## 
	# Print shapes
	shapes = [param.get_value().shape for param in cg.parameters]
	logger.info("Parameter shapes: ")
	for shape, count in Counter(shapes).most_common():
		logger.info('	{:15}: {}'.format(shape, count))
	logger.info("Total number of parameters: {}".format(len(shapes)))

	printchildren(embedder, 1)
	printchildren(transformer, 1)
	printchildren(encoder, 1)
	printchildren(decoder, 1)
	# Print parameter names
	# enc_dec_param_dict = merge(Selector(embedder).get_parameters(), Selector(encoder).get_parameters(), Selector(decoder).get_parameters())
	# enc_dec_param_dict = merge(Selector(decoder).get_parameters())
	# logger.info("Parameter names: ")
	# for name, value in enc_dec_param_dict.items():
	# 	logger.info('	{:15}: {}'.format(value.get_value().shape, name))
	# logger.info("Total number of parameters: {}".format(len(enc_dec_param_dict)))
	##########

	# Training data 
	train_stream = get_train_stream(config, 
		[config['train_src'],], [config['train_tgt'],], 
		vocab_src, vocab_tgt)
	dev_stream = get_dev_stream(
		[config['dev_src'],], [config['dev_tgt'],], 
		vocab_src, vocab_tgt)
	test_stream = get_test_stream([config['test_src'],], vocab_src)

	# Set extensions
	logger.info("Initializing extensions")
	extensions = [
		FinishAfter(after_n_batches=config['finish_after']),
		ProgressBar(),
		TrainingDataMonitoring([cost], 
			prefix="tra", 
			after_batch=True),
		DataStreamMonitoring(variables=[cost], 
			data_stream=dev_stream, 
			prefix="dev", 
			after_batch=True), 
		Sampler(
			model=Model(samples), 
			data_stream=dev_stream,
			vocab=cabvo,
			saveto=config['saveto']+'dev',
			every_n_batches=config['save_freq']), 
		Sampler(
			model=Model(samples), 
			data_stream=test_stream,
			vocab=cabvo,
			saveto=config['saveto']+'test',
			after_n_batches=1, 
			on_resumption=True,
			before_training=True), 
		Plotter(saveto=config['saveto'], after_batch=True),
		Printing(after_batch=True),
		Checkpoint(
			path=config['saveto'], 
			parameters = cg.parameters,
			save_main_loop=False,
			every_n_batches=config['save_freq'])]
	if BOKEH_AVAILABLE: 
		Plot('Training cost', channels=[['target_cost']], after_batch=True)
	if config['reload']: 
		extensions.append(Load(path=config['saveto'], 
			load_iteration_state=False, 
			load_log=False))
	else: 
		with open(config['saveto']+'.txt', 'w') as f: 
			pass 

	# Set up training algorithm
	logger.info("Initializing training algorithm")
	algorithm = GradientDescent(cost=cost, 
		parameters=cg.parameters,
		step_rule=CompositeRule([StepClipping(config['step_clipping']), 
			eval(config['step_rule'])()])
    )

	# Initialize main loop
	logger.info("Initializing main loop")
	main_loop = MainLoop(
		model=model,
		algorithm=algorithm,
		data_stream=train_stream,
		extensions=extensions)
	main_loop.run()
Ejemplo n.º 22
0
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 representation_dim,
                 context_dim,
                 target_transition,
                 theano_seed=None,
                 loss_function='cross_entropy',
                 **kwargs):
        super(InitialContextDecoder, self).__init__(**kwargs)

        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = target_transition(attended_dim=state_dim,
                                            context_dim=context_dim,
                                            dim=state_dim,
                                            activation=Tanh(),
                                            name='decoder')

        # self.transition = GRUInitialStateWithInitialStateConcatContext(
        #     attended_dim=state_dim, context_dim=context_dim, dim=state_dim,
        #     activation=Tanh(), name='decoder')

        # Initialize the attention mechanism
        self.attention = SequenceContentAttention(
            state_names=self.transition.apply.states,
            attended_dim=representation_dim,
            match_dim=state_dim,
            name="attention")

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        readout = Readout(
            source_names=[
                'states',
                'feedback',
                # Chris: it's key that we're taking the first output of self.attention.take_glimpses.outputs
                # Chris: the first output is the weighted avgs, the second is the weights in (batch, time)
                self.attention.take_glimpses.outputs[0]
            ],
            readout_dim=self.vocab_size,
            emitter=SoftmaxEmitter(initial_output=-1, theano_seed=theano_seed),
            feedback_brick=LookupFeedbackWMT15(vocab_size, embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=state_dim, name='maxout_bias').apply,
                Maxout(num_pieces=2, name='maxout').apply,
                Linear(input_dim=state_dim / 2,
                       output_dim=embedding_dim,
                       use_bias=False,
                       name='softmax0').apply,
                Linear(input_dim=embedding_dim, name='softmax1').apply
            ]),
            merged_dim=state_dim)

        # Build sequence generator accordingly
        if loss_function == 'cross_entropy':
            self.sequence_generator = InitialContextSequenceGenerator(
                readout=readout,
                transition=self.transition,
                attention=self.attention,
                fork=Fork([
                    name for name in self.transition.apply.sequences
                    if name != 'mask'
                ],
                          prototype=Linear()))
        elif loss_function == 'min_risk':
            self.sequence_generator = MinRiskInitialContextSequenceGenerator(
                readout=readout,
                transition=self.transition,
                attention=self.attention,
                fork=Fork([
                    name for name in self.transition.apply.sequences
                    if name != 'mask'
                ],
                          prototype=Linear()))
            # the name is important, because it lets us match the brick hierarchy names for the vanilla SequenceGenerator
            # to load pretrained models
            # TODO: quick hack to fix bug
            self.sequence_generator.name = 'initialcontextsequencegenerator'

        else:
            raise ValueError(
                'The decoder does not support the loss function: {}'.format(
                    loss_function))

        # TODO: uncomment this!!
        # self.sequence_generator.name = 'sequencegenerator'

        self.children = [self.sequence_generator]
Ejemplo n.º 23
0
class ETHM(EUTHM):
    '''Model with only textual-hashtag information'''
    def __init__(self, config, dataset, *args, **kwargs):
        super(ETHM, self).__init__(config, dataset)

    def _build_model(self, *args, **kwargs):
        # Define inputs
        self._define_inputs()
        self._build_bricks()
        self._set_OV_value()
        # Transpose text
        self.text = self.text.dimshuffle(1, 0)
        self.text_mask = self.text_mask.dimshuffle(1, 0)
        self.sparse_word = self.sparse_word.dimshuffle(1, 0)
        self.sparse_word_mask = self.sparse_word_mask.dimshuffle(1, 0)
        # Turn word, and hashtag into vector representation
        text_vec = self.word_embed.apply(self.text)
        # Apply word and hashtag word and url
        text_vec = self._apply_hashtag_word(text_vec)
        text_vec = self._apply_sparse_word(text_vec)
        # Encode text
        mlstm_hidden, mlstm_cell = self.mlstm.apply(
            inputs=self.mlstm_ins.apply(text_vec),
            mask=self.text_mask.astype(theano.config.floatX))
        text_encodes = mlstm_hidden[-1]
        input_vec = text_encodes
        self._get_cost(input_vec, None, None)

    def _define_inputs(self, *args, **kwargs):
        self.hashtag = tensor.ivector('hashtag')
        self.text = tensor.imatrix('text')
        self.text_mask = tensor.matrix('text_mask', dtype=theano.config.floatX)
        self.hashtag_word = tensor.ivector('hashtag_word')
        self.hashtag_sparse_mask = tensor.vector('hashtag_word_sparse_mask',
                                                 dtype=theano.config.floatX)
        self.hashtag_word_left_idx = tensor.ivector(
            'hashtag_word_idx_left_idx')
        self.hashtag_word_right_idx = tensor.ivector(
            'hashtag_word_idx_right_idx')
        self.sparse_word = tensor.imatrix('sparse_word')
        self.sparse_word_sparse_mask = tensor.vector(
            'sparse_word_sparse_mask', dtype=theano.config.floatX)
        self.sparse_word_mask = tensor.matrix('sparse_word_mask',
                                              dtype=theano.config.floatX)
        self.sparse_word_left_idx = tensor.ivector('sparse_word_idx_left_idx')
        self.sparse_word_right_idx = tensor.ivector(
            'sparse_word_idx_right_idx')

    def _build_bricks(self, *args, **kwargs):
        # Build lookup tables
        self.word_embed = self._embed(len(self.dataset.word2index),
                                      self.config.word_embed_dim,
                                      name='word_embed')

        self.hashtag_embed = self._embed(len(self.dataset.hashtag2index),
                                         self.config.lstm_dim,
                                         name='hashtag_embed')
        # Build text encoder
        self.mlstm_ins = Linear(input_dim=self.config.word_embed_dim,
                                output_dim=4 * self.config.lstm_dim,
                                name='mlstm_in')
        self.mlstm_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
        self.mlstm_ins.biases_init = Constant(0)
        self.mlstm_ins.initialize()
        self.mlstm = MLSTM(self.config.lstm_time,
                           self.config.lstm_dim,
                           shared=False)
        self.mlstm.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) /
            numpy.sqrt(self.config.word_embed_dim + self.config.lstm_dim))
        self.mlstm.biases_init = Constant(0)
        self.mlstm.initialize()
        self.hashtag2word = MLP(
            activations=[Tanh('hashtag2word_tanh')],
            dims=[self.config.lstm_dim, self.config.word_embed_dim],
            name='hashtag2word_mlp')
        self.hashtag2word.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.hashtag2word.biases_init = Constant(0)
        self.hashtag2word.initialize()
        self.hashtag2word_bias = Bias(dim=1, name='hashtag2word_bias')
        self.hashtag2word_bias.biases_init = Constant(0)
        self.hashtag2word_bias.initialize()
        #Build character embedding
        self.char_embed = self._embed(len(self.dataset.char2index),
                                      self.config.char_embed_dim,
                                      name='char_embed')
        # Build sparse word encoder
        self.rnn_ins = Linear(input_dim=self.config.char_embed_dim,
                              output_dim=self.config.word_embed_dim,
                              name='rnn_in')
        self.rnn_ins.weights_init = IsotropicGaussian(
            std=numpy.sqrt(2) / numpy.sqrt(self.config.char_embed_dim +
                                           self.config.word_embed_dim))
        self.rnn_ins.biases_init = Constant(0)
        self.rnn_ins.initialize()
        self.rnn = SimpleRecurrent(dim=self.config.word_embed_dim,
                                   activation=Tanh())
        self.rnn.weights_init = IsotropicGaussian(
            std=1 / numpy.sqrt(self.config.word_embed_dim))
        self.rnn.initialize()

    def _apply_dropout(self, outputs, *args, **kwargs):
        variables = [self.word_embed.W, self.hashtag_embed.W]
        cgs = ComputationGraph(outputs)
        cg_dropouts = apply_dropout(cgs,
                                    variables,
                                    drop_prob=self.config.dropout_prob,
                                    seed=123).outputs
        return cg_dropouts

    def _apply_reg(self, cost, params=None, *args, **kwargs):
        try:
            if self.config.l2_norm > 0:
                cost = cost + self.config.l2_norm * theano_expressions.l2_norm(
                    tensors=[self.hashtag_embed.W, self.word_embed.W])**2

            else:
                pass
        except Exception:
            pass
        return cost
Ejemplo n.º 24
0
    def __init__(self, ref_data, output_dim):
        if pca_dims is not None:
            covmat = numpy.dot(ref_data.T, ref_data)
            ev, evec = numpy.linalg.eig(covmat)
            best_i = ev.argsort()[-pca_dims:]
            best_evecs = evec[:, best_i]
            best_evecs = best_evecs / numpy.sqrt(
                (best_evecs**2).sum(axis=0))  #normalize
            ref_data = numpy.dot(ref_data, best_evecs)

        input_dim = ref_data.shape[1]

        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                    name='ref_data')

        # Construct the model
        j = tensor.lvector('j')
        r = ref_data_sh[j, :]
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        # input_dim must be nr
        mlp = MLP(activations=activation_functions,
                  dims=[input_dim] + hidden_dims + [n_inter],
                  name='inter_gen')
        mlp2 = MLP(activations=activation_functions_2 + [None],
                   dims=[n_inter] + hidden_dims_2 + [output_dim],
                   name='end_mlp')

        inter_weights = mlp.apply(r)

        if inter_bias == None:
            ibias = Bias(n_inter)
            ibias.biases_init = Constant(0)
            ibias.initialize()
            inter = ibias.apply(tensor.dot(x, inter_weights))
        else:
            inter = tensor.dot(x, inter_weights) - inter_bias
        inter = inter_act_fun.apply(inter)

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        # error_rate = tensor.neq(y, pred).mean()
        ber = balanced_error_rate.ber(y, pred)

        # Initialize parameters
        for brick in [mlp, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, ber])

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        if x_dropout != 0:
            cg = apply_dropout(cg, [x], x_dropout)

        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            s_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([inter_weights]))) -
                set([inter_weights]))
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(
                set(
                    VariableFilter(bricks=[Tanh], name='output')
                    (ComputationGraph([final]))) - set([inter_weights]) -
                set(s_dropout_vars))
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if r_noise_std != 0:
            cg = apply_noise(cg, [r], r_noise_std)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, ber_reg] = cg.outputs

        if s_l1pen != 0:
            s_weights = VariableFilter(bricks=mlp.linear_transformations,
                                       roles=[WEIGHT])(cg)
            cost_reg = cost_reg + s_l1pen * sum(
                abs(w).sum() for w in s_weights)
        if i_l1pen != 0:
            cost_reg = cost_reg + i_l1pen * abs(inter).sum()
        if a_l1pen != 0:
            a_weights = VariableFilter(bricks=mlp2.linear_transformations,
                                       roles=[WEIGHT])(cg)
            cost_reg = cost_reg + a_l1pen * sum(
                abs(w).sum() for w in a_weights)

        self.cost = cost
        self.cost_reg = cost_reg
        self.ber = ber
        self.ber_reg = ber_reg
        self.pred = pred
        self.confidence = confidence
Ejemplo n.º 25
0
    def __init__(self, ref_data, output_dim):
        input_dim = ref_data.shape[1]

        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32), name="ref_data")

        # Construct the model
        j = tensor.lvector("j")
        r = ref_data_sh[j, :]
        x = tensor.fmatrix("x")
        y = tensor.ivector("y")

        # input_dim must be nr
        mlp0 = MLP(activations=activation_functions_0, dims=[input_dim] + hidden_dims_0, name="e0")
        mlp0vs = MLP(activations=[None], dims=[hidden_dims_0[-1], input_dim], name="de0")
        mlp1 = MLP(
            activations=activation_functions_1, dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter], name="inter_gen"
        )
        mlp2 = MLP(
            activations=activation_functions_2 + [None], dims=[n_inter] + hidden_dims_2 + [output_dim], name="end_mlp"
        )

        encod = mlp0.apply(r)
        rprime = mlp0vs.apply(encod)
        inter_weights = mlp1.apply(encod)

        ibias = Bias(n_inter)
        ibias.biases_init = Constant(0)
        ibias.initialize()
        inter = inter_act_fun.apply(ibias.apply(tensor.dot(x, inter_weights)))

        final = mlp2.apply(inter)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        error_rate = tensor.neq(y, pred).mean()

        # Initialize parameters
        for brick in [mlp0, mlp0vs, mlp1, mlp2]:
            brick.weights_init = IsotropicGaussian(0.01)
            brick.biases_init = Constant(0.001)
            brick.initialize()

        # apply regularization
        cg = ComputationGraph([cost, error_rate])

        if r_dropout != 0:
            # - dropout on input vector r : r_dropout
            cg = apply_dropout(cg, [r], r_dropout)

        if s_dropout != 0:
            # - dropout on intermediate layers of first mlp : s_dropout
            s_dropout_vars = list(
                set(VariableFilter(bricks=[Tanh], name="output")(ComputationGraph([inter_weights])))
                - set([inter_weights])
            )
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)

        if i_dropout != 0:
            # - dropout on input to second mlp : i_dropout
            cg = apply_dropout(cg, [inter], i_dropout)

        if a_dropout != 0:
            # - dropout on hidden layers of second mlp : a_dropout
            a_dropout_vars = list(
                set(VariableFilter(bricks=[Tanh], name="output")(ComputationGraph([final])))
                - set([inter_weights])
                - set(s_dropout_vars)
            )
            cg = apply_dropout(cg, a_dropout_vars, a_dropout)

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        [cost_reg, error_rate_reg] = cg.outputs

        # add reconstruction penalty for AE part
        penalty_val = tensor.sqrt(((r - rprime) ** 2).sum(axis=1)).mean()
        cost_reg = cost_reg + reconstruction_penalty * penalty_val

        self.cost = cost
        self.cost_reg = cost_reg
        self.error_rate = error_rate
        self.error_rate_reg = error_rate_reg
        self.pred = pred
        self.confidence = confidence
Ejemplo n.º 26
0
    def __init__(
            self,
            input_dims,
            input_num_chars,
            eos_label,
            num_phonemes,
            dim_dec,
            dims_bidir,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            criterion,
            bottom,
            lm=None,
            character_map=None,
            bidir=True,
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dim_output_embedding=None,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=True,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            # for speech this is the approximate phoneme duration in frames
            max_decoded_length_scale=1,
            **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(input_dims=input_dims,
                              input_num_chars=input_num_chars,
                              name='bottom',
                              **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition,
                          dims_bidir,
                          bottom.get_dim(bottom.apply.outputs[0]),
                          subsample,
                          bidir=bidir)
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        generators = [None, None]
        for i in range(2):
            # The top part, on top of BiRNN but before the attention
            if dims_top:
                top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                          name="top{}".format(i))
            else:
                top = Identity(name='top{}'.format(i))

            if dec_stack == 1:
                transition = self.dec_transition(dim=dim_dec,
                                                 activation=Tanh(),
                                                 name="transition{}".format(i))
            else:
                transitions = [
                    self.dec_transition(dim=dim_dec,
                                        activation=Tanh(),
                                        name="transition_{}_{}".format(
                                            i, trans_level))
                    for trans_level in xrange(dec_stack)
                ]
                transition = RecurrentStack(transitions=transitions,
                                            skip_connections=True)
            # Choose attention mechanism according to the configuration
            if attention_type == "content":
                attention = SequenceContentAttention(
                    state_names=transition.apply.states,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    name="cont_att" + i)
            elif attention_type == "content_and_conv":
                attention = SequenceContentAndConvAttention(
                    state_names=transition.apply.states,
                    conv_n=conv_n,
                    conv_num_filters=conv_num_filters,
                    attended_dim=dim_encoded,
                    match_dim=dim_matcher,
                    prior=prior,
                    energy_normalizer=energy_normalizer,
                    name="conv_att{}".format(i))
            else:
                raise ValueError(
                    "Unknown attention type {}".format(attention_type))
            if embed_outputs:
                feedback = LookupFeedback(
                    num_phonemes + 1, dim_dec
                    if dim_output_embedding is None else dim_output_embedding)
            else:
                feedback = OneOfNFeedback(num_phonemes + 1)
            if criterion['name'] == 'log_likelihood':
                emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                         name="emitter{}".format(i))
                if lm:
                    # In case we use LM it is Readout that is responsible
                    # for normalization.
                    emitter = LMEmitter()
            elif criterion['name'].startswith('mse'):
                emitter = RewardRegressionEmitter(criterion['name'],
                                                  eos_label,
                                                  num_phonemes,
                                                  criterion.get(
                                                      'min_reward', -1.0),
                                                  name="emitter")
            else:
                raise ValueError("Unknown criterion {}".format(
                    criterion['name']))
            readout_config = dict(
                readout_dim=num_phonemes,
                source_names=(transition.apply.states if use_states_for_readout
                              else []) + [attention.take_glimpses.outputs[0]],
                emitter=emitter,
                feedback_brick=feedback,
                name="readout{}".format(i))
            if post_merge_dims:
                readout_config['merged_dim'] = post_merge_dims[0]
                readout_config['post_merge'] = InitializableSequence(
                    [
                        Bias(post_merge_dims[0]).apply,
                        post_merge_activation.apply,
                        MLP(
                            [post_merge_activation] *
                            (len(post_merge_dims) - 1) + [Identity()],
                            # MLP was designed to support Maxout is activation
                            # (because Maxout in a way is not one). However
                            # a single layer Maxout network works with the trick below.
                            # For deeper Maxout network one has to use the
                            # Sequence brick.
                            [
                                d //
                                getattr(post_merge_activation, 'num_pieces', 1)
                                for d in post_merge_dims
                            ] + [num_phonemes]).apply,
                    ],
                    name='post_merge{}'.format(i))
            readout = Readout(**readout_config)

            language_model = None
            if lm and lm.get('path'):
                lm_weight = lm.pop('weight', 0.0)
                normalize_am_weights = lm.pop('normalize_am_weights', True)
                normalize_lm_weights = lm.pop('normalize_lm_weights', False)
                normalize_tot_weights = lm.pop('normalize_tot_weights', False)
                am_beta = lm.pop('am_beta', 1.0)
                if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                    logger.warn(
                        "Beam search is prone to fail with no log-prob normalization"
                    )
                language_model = LanguageModel(nn_char_map=character_map, **lm)
                readout = ShallowFusionReadout(
                    lm_costs_name='lm_add',
                    lm_weight=lm_weight,
                    normalize_am_weights=normalize_am_weights,
                    normalize_lm_weights=normalize_lm_weights,
                    normalize_tot_weights=normalize_tot_weights,
                    am_beta=am_beta,
                    **readout_config)

            generators[i] = SequenceGenerator(readout=readout,
                                              transition=transition,
                                              attention=attention,
                                              language_model=language_model,
                                              name="generator{}".format(i))

        self.generator = generators[0]

        self.forward_to_backward = Linear(dim_dec, dim_dec)

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generators = generators
        self.children = [self.forward_to_backward, encoder, top, bottom
                         ] + generators

        # Create input variables
        self.inputs = self.bottom.batch_inputs
        self.inputs_mask = self.bottom.mask

        self.labels = tensor.lmatrix('labels')
        self.labels_mask = tensor.matrix("labels_mask")

        self.single_inputs = self.bottom.single_inputs
        self.single_labels = tensor.lvector('labels')
        self.n_steps = tensor.lscalar('n_steps')
Ejemplo n.º 27
0
    def __init__(self,
                 vocab_size,
                 embedding_dim,
                 state_dim,
                 att_dim,
                 maxout_dim,
                 representation_dim,
                 attention_strategy='content',
                 attention_sources='s',
                 readout_sources='sfa',
                 memory='none',
                 memory_size=500,
                 seq_len=50,
                 init_strategy='last',
                 theano_seed=None,
                 **kwargs):
        """Creates a new decoder brick without embedding.
        
        Args:
            vocab_size (int): Target language vocabulary size
            embedding_dim (int): Size of feedback embedding layer
            state_dim (int): Number of hidden units
            att_dim (int): Size of attention match vector
            maxout_dim (int): Size of maxout layer
            representation_dim (int): Dimension of source annotations
            attention_strategy (string): Which attention should be used
                                         cf.  ``_initialize_attention``
            attention_sources (string): Defines the sources used by the 
                                        attention model 's' for decoder
                                        states, 'f' for feedback
            readout_sources (string): Defines the sources used in the 
                                      readout network. 's' for decoder
                                      states, 'f' for feedback, 'a' for
                                      attention (context vector)
            memory (string): Which external memory should be used
                             (cf.  ``_initialize_attention``)
            memory_size (int): Size of the external memory structure
            seq_len (int): Maximum sentence length
            init_strategy (string): How to initialize the RNN state
                                    (cf.  ``GRUInitialState``)
            theano_seed: Random seed
        """
        super(NoLookupDecoder, self).__init__(**kwargs)
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.state_dim = state_dim
        self.representation_dim = representation_dim
        self.theano_seed = theano_seed

        # Initialize gru with special initial state
        self.transition = GRUInitialState(attended_dim=state_dim,
                                          init_strategy=init_strategy,
                                          dim=state_dim,
                                          activation=Tanh(),
                                          name='decoder')

        # Initialize the attention mechanism
        att_dim = att_dim if att_dim > 0 else state_dim
        self.attention, src_names = _initialize_attention(
            attention_strategy, seq_len, self.transition, representation_dim,
            att_dim, attention_sources, readout_sources, memory, memory_size)

        # Initialize the readout, note that SoftmaxEmitter emits -1 for
        # initial outputs which is used by LookupFeedBackWMT15
        maxout_dim = maxout_dim if maxout_dim > 0 else state_dim
        readout = Readout(
            source_names=src_names,
            readout_dim=embedding_dim,
            emitter=NoLookupEmitter(initial_output=-1,
                                    readout_dim=embedding_dim,
                                    cost_brick=SquaredError()),
            #                        cost_brick=CategoricalCrossEntropy()),
            feedback_brick=TrivialFeedback(output_dim=embedding_dim),
            post_merge=InitializableFeedforwardSequence([
                Bias(dim=maxout_dim, name='maxout_bias').apply,
                Maxout(num_pieces=2, name='maxout').apply,
                Linear(input_dim=maxout_dim / 2,
                       output_dim=embedding_dim,
                       use_bias=False,
                       name='softmax0').apply,
                Logistic(name='softmax1').apply
            ]),
            merged_dim=maxout_dim)

        # Build sequence generator accordingly
        self.sequence_generator = SequenceGenerator(
            readout=readout,
            transition=self.transition,
            attention=self.attention,
            fork=Fork([
                name
                for name in self.transition.apply.sequences if name != 'mask'
            ],
                      prototype=Linear()))

        self.children = [self.sequence_generator]
Ejemplo n.º 28
0
    def __init__(
        self,
        recordings_source,
        labels_source,
        eos_label,
        num_features,
        num_phonemes,
        dim_dec,
        dims_bidir,
        dims_bottom,
        enc_transition,
        dec_transition,
        use_states_for_readout,
        attention_type,
        lm=None,
        character_map=None,
        subsample=None,
        dims_top=None,
        prior=None,
        conv_n=None,
        bottom_activation=None,
        post_merge_activation=None,
        post_merge_dims=None,
        dim_matcher=None,
        embed_outputs=True,
        dec_stack=1,
        conv_num_filters=1,
        data_prepend_eos=True,
        energy_normalizer=None,  # softmax is th edefault set in SequenceContentAndConvAttention
        **kwargs):
        if bottom_activation is None:
            bottom_activation = Tanh()
        if post_merge_activation is None:
            post_merge_activation = Tanh()
        super(SpeechRecognizer, self).__init__(**kwargs)
        self.recordings_source = recordings_source
        self.labels_source = labels_source
        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        bottom_activation = bottom_activation
        post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        if dims_bottom:
            bottom = MLP([bottom_activation] * len(dims_bottom),
                         [num_features] + dims_bottom,
                         name="bottom")
        else:
            bottom = Identity(name='bottom')

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(
            self.enc_transition, dims_bidir,
            dims_bottom[-1] if len(dims_bottom) else num_features, subsample)

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()],
                      [2 * dims_bidir[-1]] + dims_top + [2 * dims_bidir[-1]],
                      name="top")
        else:
            top = Identity(name='top')

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition")
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level))
                for trans_level in xrange(dec_stack)
            ]
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        if attention_type == "content":
            attention = SequenceContentAttention(
                state_names=transition.apply.states,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                name="cont_att")
        elif attention_type == "content_and_conv":
            attention = SequenceContentAndConvAttention(
                state_names=transition.apply.states,
                conv_n=conv_n,
                conv_num_filters=conv_num_filters,
                attended_dim=2 * dims_bidir[-1],
                match_dim=dim_matcher,
                prior=prior,
                energy_normalizer=energy_normalizer,
                name="conv_att")
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))
        if embed_outputs:
            feedback = LookupFeedback(num_phonemes + 1, dim_dec)
        else:
            feedback = OneOfNFeedback(num_phonemes + 1)
        if lm:
            # In case we use LM it is Readout that is responsible
            # for normalization.
            emitter = LMEmitter()
        else:
            emitter = SoftmaxEmitter(initial_output=num_phonemes,
                                     name="emitter")
        readout_config = dict(readout_dim=num_phonemes,
                              source_names=(transition.apply.states if
                                            use_states_for_readout else []) +
                              [attention.take_glimpses.outputs[0]],
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was designed to support Maxout is activation
                        # (because Maxout in a way is not one). However
                        # a single layer Maxout network works with the trick below.
                        # For deeper Maxout network one has to use the
                        # Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        language_model = None
        if lm:
            lm_weight = lm.pop('weight', 0.0)
            normalize_am_weights = lm.pop('normalize_am_weights', True)
            normalize_lm_weights = lm.pop('normalize_lm_weights', False)
            normalize_tot_weights = lm.pop('normalize_tot_weights', False)
            am_beta = lm.pop('am_beta', 1.0)
            if normalize_am_weights + normalize_lm_weights + normalize_tot_weights < 1:
                logger.warn(
                    "Beam search is prone to fail with no log-prob normalization"
                )
            language_model = LanguageModel(nn_char_map=character_map, **lm)
            readout = ShallowFusionReadout(
                lm_costs_name='lm_add',
                lm_weight=lm_weight,
                normalize_am_weights=normalize_am_weights,
                normalize_lm_weights=normalize_lm_weights,
                normalize_tot_weights=normalize_tot_weights,
                am_beta=am_beta,
                **readout_config)

        generator = SequenceGenerator(readout=readout,
                                      transition=transition,
                                      attention=attention,
                                      language_model=language_model,
                                      name="generator")

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]

        # Create input variables
        self.recordings = tensor.tensor3(self.recordings_source)
        self.recordings_mask = tensor.matrix(self.recordings_source + "_mask")
        self.labels = tensor.lmatrix(self.labels_source)
        self.labels_mask = tensor.matrix(self.labels_source + "_mask")
        self.batch_inputs = [
            self.recordings, self.recordings_source, self.labels,
            self.labels_mask
        ]
        self.single_recording = tensor.matrix(self.recordings_source)
        self.single_transcription = tensor.lvector(self.labels_source)
Ejemplo n.º 29
0
    def __init__(self, ref_data, output_dim):
        ref_data_sh = theano.shared(numpy.array(ref_data, dtype=numpy.float32),
                                    name='ref_data')

        # Construct the model
        j = tensor.lvector('j')
        x = tensor.fmatrix('x')
        y = tensor.ivector('y')

        last_outputs = []
        s_dropout_vars = []
        r_dropout_vars = []
        i_dropout_vars = []
        penalties = []

        for i in range(nparts):
            fs = numpy.random.binomial(1,
                                       part_r_proba,
                                       size=(ref_data.shape[1], ))
            input_dim = int(fs.sum())

            fs_sh = theano.shared(fs)
            r = ref_data_sh[j, :][:, fs_sh.nonzero()[0]]

            mlp0 = MLP(activations=activation_functions_0,
                       dims=[input_dim] + hidden_dims_0,
                       name='enc%d' % i)
            mlp0r = MLP(activations=[None],
                        dims=[hidden_dims_0[-1], input_dim],
                        name='dec%d' % i)
            mlp1 = MLP(activations=activation_functions_1,
                       dims=[hidden_dims_0[-1]] + hidden_dims_1 + [n_inter],
                       name='inter_gen_%d' % i)
            mlp2 = MLP(activations=activation_functions_2 + [None],
                       dims=[n_inter] + hidden_dims_2 + [output_dim],
                       name='end_mlp_%d' % i)

            encod = mlp0.apply(r)
            rprime = mlp0r.apply(encod)
            inter_weights = mlp1.apply(encod)

            ibias = Bias(n_inter, name='inter_bias_%d' % i)
            inter = ibias.apply(tensor.dot(x, inter_weights))
            inter = inter_act_fun.apply(inter)

            out = mlp2.apply(inter)

            penalties.append(
                tensor.sqrt(((rprime - r)**2).sum(axis=1)).mean()[None])

            last_outputs.append(out)

            r_dropout_vars.append(r)
            s_dropout_vars = s_dropout_vars + (VariableFilter(
                bricks=[Tanh], name='output')(ComputationGraph([inter_weights
                                                                ])))
            i_dropout_vars.append(inter)

            # Initialize parameters
            for brick in [mlp0, mlp0r, mlp1, mlp2, ibias]:
                brick.weights_init = IsotropicGaussian(0.01)
                brick.biases_init = Constant(0.001)
                brick.initialize()

        final = tensor.concatenate([x[:, :, None] for x in last_outputs],
                                   axis=2).mean(axis=2)

        cost = Softmax().categorical_cross_entropy(y, final)
        confidence = Softmax().apply(final)

        pred = final.argmax(axis=1)
        error_rate = tensor.neq(y, pred).mean()

        # apply regularization
        cg = ComputationGraph([cost, error_rate])

        if w_noise_std != 0:
            # - apply noise on weight variables
            weight_vars = VariableFilter(roles=[WEIGHT])(cg)
            cg = apply_noise(cg, weight_vars, w_noise_std)

        if s_dropout != 0:
            cg = apply_dropout(cg, s_dropout_vars, s_dropout)
        if r_dropout != 0:
            cg = apply_dropout(cg, r_dropout_vars, r_dropout)
        if i_dropout != 0:
            cg = apply_dropout(cg, i_dropout_vars, i_dropout)

        [cost_reg, error_rate_reg] = cg.outputs

        cost_reg = cost_reg + reconstruction_penalty * tensor.concatenate(
            penalties, axis=0).sum()

        self.cost = cost
        self.cost_reg = cost_reg
        self.error_rate = error_rate
        self.error_rate_reg = error_rate_reg
        self.pred = pred
        self.confidence = confidence
Ejemplo n.º 30
0
    def __init__(
            self,
            input_sources,
            input_sources_dims,
            eos_label,
            num_phonemes,
            dim_dec,
            dims_bidir,
            enc_transition,
            dec_transition,
            use_states_for_readout,
            attention_type,
            criterion,
            bottom,
            enc_transition_params={},
            dec_transition_params={},
            names_postfix='',
            lm=None,
            character_map=None,
            bidir=True,
            bidir_aggregation='concat',
            subsample=None,
            dims_top=None,
            prior=None,
            conv_n=None,
            post_merge_activation=None,
            post_merge_dims=None,
            dim_matcher=None,
            embed_outputs=True,
            dim_output_embedding=None,
            dec_stack=1,
            conv_num_filters=1,
            data_prepend_eos=False,
            # softmax is the default set in SequenceContentAndConvAttention
            energy_normalizer=None,
            # for speech this is the approximate phoneme duration in frames
            max_decoded_length_scale=3,
            use_dependent_words_for_labels=False,
            use_dependent_words_for_attention=False,
            reproduce_rec_weight_init_bug=True,
            pointers_weight=0.5,
            tags_weight=1.0,
            tag_layer=-1,  # -1 is last, 0 is after first bidir layer
            dependency_type='recurrent_soft',
            **kwargs):

        if post_merge_activation is None:
            post_merge_activation = Tanh()

        self.regularization_bricks = []
        possible_regularization_bricks = []

        self.names_postfix = names_postfix

        self.mask_dict = {}

        self.pointers_name = 'pointers' + names_postfix

        self.additional_sources = kwargs.pop('additional_sources')
        self.additional_sources_dims = kwargs.pop('additional_sources_dims')

        self.pointer_weight = pointers_weight
        self.soft_pointer_val = kwargs.pop('pointers_soften', 0.0)
        self.soft_pointer = self.soft_pointer_val > 0.0

        self.tags_weight = tags_weight
        self.tag_layer = tag_layer
        self.train_tags = True
        if self.tags_weight < 0 or len(self.additional_sources) <= 1:
            self.train_tags = False

        self.dependency_type = dependency_type

        super(DependencyRecognizer, self).__init__(**kwargs)

        self.reproduce_rec_weight_init_bug = reproduce_rec_weight_init_bug

        self.eos_label = eos_label
        self.data_prepend_eos = data_prepend_eos

        self.rec_weights_init = None
        self.initial_states_init = None

        self.enc_transition = enc_transition
        self.dec_transition = dec_transition
        self.dec_stack = dec_stack

        self.criterion = criterion

        self.max_decoded_length_scale = max_decoded_length_scale

        self.post_merge_activation = post_merge_activation

        if dim_matcher is None:
            dim_matcher = dim_dec

        # The bottom part, before BiRNN
        bottom_class = bottom.pop('bottom_class')
        bottom = bottom_class(input_sources=input_sources,
                              input_sources_dims=input_sources_dims,
                              name='bottom',
                              pointers_soften=self.soft_pointer,
                              additional_sources=self.additional_sources,
                              **bottom)

        # BiRNN
        if not subsample:
            subsample = [1] * len(dims_bidir)
        encoder = Encoder(self.enc_transition,
                          dims_bidir,
                          bottom.output_dim,
                          subsample,
                          bidir=bidir,
                          bidir_aggregation=bidir_aggregation,
                          enc_transition_params=enc_transition_params)
        possible_regularization_bricks += encoder.enc_transitions
        dim_encoded = encoder.get_dim(encoder.apply.outputs[0])

        # The top part, on top of BiRNN but before the attention
        if dims_top:
            top = MLP([Tanh()], [dim_encoded] + dims_top + [dim_encoded],
                      name="top")
        else:
            top = Identity(name='top')

        self.additional_sources_mlp = {}
        ndim_softmax = NDimensionalSoftmax()
        ndim_softmax._extra_ndim = 1
        for source in self.additional_sources:
            if source != self.pointers_name:
                if len(self.names_postfix) > 0:
                    source_glob_name = source[:-len(self.names_postfix)]
                else:
                    source_glob_name = source
                self.additional_sources_mlp[source] = \
                    MLP([ndim_softmax], [dim_encoded, self.additional_sources_dims[source]],
                        name='additional_'+source_glob_name)

        if dec_stack == 1:
            transition = self.dec_transition(dim=dim_dec,
                                             activation=Tanh(),
                                             name="transition",
                                             **dec_transition_params)
            possible_regularization_bricks += [transition]
        else:
            transitions = [
                self.dec_transition(dim=dim_dec,
                                    activation=Tanh(),
                                    name="transition_{}".format(trans_level),
                                    **dec_transition_params)
                for trans_level in xrange(dec_stack)
            ]
            possible_regularization_bricks += transitions
            transition = RecurrentStack(transitions=transitions,
                                        skip_connections=True)
        # Choose attention mechanism according to the configuration
        attention_class = ParsingAttention
        attention_kwargs = {}
        transition_with_att_class = ParsingAttentionRecurrent

        if self.dependency_type == "recurrent_soft":
            attention_kwargs['use_pointers'] = None
        elif self.dependency_type == "recurrent_hard":
            attention_kwargs['use_pointers'] = 'hard'
        elif self.dependency_type == "recurrent_semihard":
            attention_kwargs['use_pointers'] = 'semihard'
        else:
            raise ValueError("Unknown dependency type {}".format(
                self.dependency_type))

        if attention_type == "content":
            pass
        elif attention_type == "content_hard":
            attention_kwargs['hard_attention'] = True
        else:
            raise ValueError(
                "Unknown attention type {}".format(attention_type))

        if use_dependent_words_for_attention:
            attention_kwargs['use_word_annotations'] = True
            attention_kwargs['word_annontation_dim'] = dim_encoded

        attention = attention_class(state_names=transition.apply.states,
                                    attended_dim=dim_encoded,
                                    match_dim=dim_matcher,
                                    name="cont_att",
                                    **attention_kwargs)

        feedback = AttendedFeedback(num_phonemes + 1, dim_encoded)
        if criterion['name'] == 'log_likelihood':
            emitter = SoftmaxMultiEmitter(initial_output=num_phonemes,
                                          name="emitter")
        else:
            raise ValueError("Unknown criterion {}".format(criterion['name']))
        readout_source_names = (transition.apply.states
                                if use_states_for_readout else
                                []) + [attention.take_glimpses.outputs[0]]

        if use_dependent_words_for_labels:
            readout_source_names.append('attended')

        readout_config = dict(readout_dim=num_phonemes,
                              source_names=readout_source_names,
                              emitter=emitter,
                              feedback_brick=feedback,
                              name="readout")
        if post_merge_dims:
            readout_config['merged_dim'] = post_merge_dims[0]
            readout_config['post_merge'] = InitializableSequence(
                [
                    Bias(post_merge_dims[0]).apply,
                    post_merge_activation.apply,
                    MLP(
                        [post_merge_activation] *
                        (len(post_merge_dims) - 1) + [Identity()],
                        # MLP was designed to support Maxout is activation
                        # (because Maxout in a way is not one). However
                        # a single layer Maxout network works with the trick below.
                        # For deeper Maxout network one has to use the
                        # Sequence brick.
                        [
                            d //
                            getattr(post_merge_activation, 'num_pieces', 1)
                            for d in post_merge_dims
                        ] + [num_phonemes]).apply,
                ],
                name='post_merge')
        readout = Readout(**readout_config)

        generator = Generator(
            readout=readout,
            transition=transition,
            attention=attention,
            dim_dec=dim_dec,
            pointer_weight=self.pointer_weight,
            transition_with_att_class=transition_with_att_class,
            name="generator")

        for brick in possible_regularization_bricks:
            if 'regularize' in dir(brick):
                self.regularization_bricks += [brick]

        logger.info("Regularization bricks: {}".format(
            str(self.regularization_bricks)))

        # Remember child bricks
        self.encoder = encoder
        self.bottom = bottom
        self.top = top
        self.generator = generator
        self.children = [encoder, top, bottom, generator]
        self.children.extend(self.additional_sources_mlp.values())

        # Create input variables
        self.inputs = self.bottom.get_batch_inputs()
        self.inputs_mask = self.bottom.get_mask()

        self.additional_sources = self.bottom.get_batch_additional_sources()

        self.labels = tensor.lmatrix('labels' + names_postfix)
        self.labels_mask = tensor.matrix('labels' + names_postfix + '_mask')
        #self.labels_mask = tensor.matrix('labels_mask'+names_postfix)

        self.single_inputs = self.bottom.get_single_sequence_inputs()
        self.single_labels = tensor.lvector('labels' + names_postfix)
        self.single_additional_sources = self.bottom.get_single_additional_sources(
        )
        self.n_steps = tensor.lscalar('n_steps' + names_postfix)