Example 1
def ln_linear(inputs, size, bias, concat=False, dtype=None, scope=None):
    if not isinstance(size, (list, tuple)):
        raise ValueError("size argument must be (input_size, output_size)")

    input_size, output_size = size

    if not isinstance(input_size, (list, tuple)):
        input_size = [input_size]

    if not isinstance(inputs, (list, tuple)):
        inputs = [inputs]

    if len(inputs) != len(input_size):
        raise RuntimeError("unmatched elements found: inputs and input_size")

    results = []

    with variable_scope(scope):
        if concat:
            input_size = sum(input_size)
            inputs = theano.tensor.concatenate(inputs, -1)

            shape = [input_size, output_size]
            matrix = get_variable("matrix", shape, dtype=dtype)
            res = theano.dot(inputs, matrix)
            with variable_scope("layer_norm"):
                alpha = get_variable("gains", shape=(output_size,), dtype=dtype, initializer=ones_initializer())
                beta = get_variable("biases", shape=(output_size,), dtype=dtype, initializer=zeros_initializer())

            res = layer_normalize(res, alpha, beta)
            results.append(res)
        else:
            for i in range(len(input_size)):
                shape = [input_size[i], output_size]
                name = "matrix_%d" % i
                matrix = get_variable(name, shape, dtype=dtype)
                res = theano.dot(inputs[i], matrix)
                with variable_scope("layer_norm"):
                    alpha = get_variable("gains_%d" % i, shape=(output_size,), dtype=dtype,
                                         initializer=ones_initializer())
                    beta = get_variable("biases_%d" % i, shape=(output_size,), dtype=dtype,
                                        initializer=zeros_initializer())

                res = layer_normalize(res, alpha, beta)
                results.append(res)

        if bias:
            shape = [output_size]
            bias = get_variable("bias", shape, dtype=dtype)
            results.append(bias)

    if len(results) == 1:
        return results[0]

    return reduce(theano.tensor.add, results)
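
The layer_normalize helper is not defined in this snippet. A minimal numpy sketch of what it presumably computes, normalizing over the feature axis and then applying the learned gains (alpha) and biases (beta); the eps stabilizer is an assumption here:

import numpy as np

def layer_normalize(x, alpha, beta, eps=1e-5):
    # normalize each row over the last (feature) axis
    mean = x.mean(axis=-1, keepdims=True)
    var = x.var(axis=-1, keepdims=True)
    normed = (x - mean) / np.sqrt(var + eps)
    return normed * alpha + beta    # per-feature gain and shift

x = np.random.randn(4, 8)
out = layer_normalize(x, np.ones(8), np.zeros(8))
assert np.allclose(out.mean(-1), 0.0, atol=1e-6)
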
Example 2
def encoder(cell, inputs, mask, initial_state=None, dtype=None, scope=None):
    with ops.variable_scope(scope or "encoder"):
        with ops.variable_scope("forward"):
            fd_states = gru_encoder(cell, inputs, mask, initial_state, dtype)
        with ops.variable_scope("backward"):
            inputs = inputs[::-1]
            mask = mask[::-1]
            bd_states = gru_encoder(cell, inputs, mask, initial_state, dtype)
            bd_states = bd_states[::-1]

    return fd_states, bd_states
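
The backward encoder is just the forward encoder run on time-reversed inputs, with the resulting states reversed again so both directions line up per time step. A toy numpy sketch of the pattern, using a stand-in step function instead of a real GRU:

import numpy as np

def toy_encoder(step, inputs, mask, state):
    # run step over the time axis; keep the old state where mask == 0
    states = []
    for x_t, m_t in zip(inputs, mask):
        state = (1.0 - m_t) * state + m_t * step(x_t, state)
        states.append(state)
    return np.stack(states)

step = lambda x, s: np.tanh(x + s)   # stand-in for a GRU step
inputs = np.random.randn(5, 3)       # (time, batch)
mask = np.ones((5, 3))
fd_states = toy_encoder(step, inputs, mask, np.zeros(3))
bd_states = toy_encoder(step, inputs[::-1], mask[::-1], np.zeros(3))[::-1]
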
Example 3
def decoder(cell, inputs, mask, initial_state, attention_states,
            attention_mask, attn_size, dtype=None, scope=None):
    input_size, states_size = cell.input_size
    output_size = cell.output_size
    dtype = dtype or inputs.dtype

    # non-sequences should be passed to scan explicitly, DO NOT use closure
    def loop_fn(inputs, mask, state, attn_states, attn_mask, m_states):
        mask = mask[:, None]
        alpha = attention(state, m_states, output_size, attn_size, attn_mask)
        context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
        output, next_state = cell([inputs, context], state)
        next_state = (1.0 - mask) * state + mask * next_state

        return [next_state, context]

    with ops.variable_scope(scope or "decoder"):
        mapped_states = map_attention_states(attention_states, states_size,
                                             attn_size)
        seq = [inputs, mask]
        outputs_info = [initial_state, None]
        non_seq = [attention_states, attention_mask, mapped_states]
        (states, contexts) = ops.scan(loop_fn, seq, outputs_info, non_seq)

    return states, contexts
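
About the closure warning: theano.scan (which ops.scan presumably wraps) passes arguments to the step function positionally, in the order sequences, recurrent outputs, then non-sequences; outputs_info entries that are None (here, the context) are produced but not fed back. A minimal runnable illustration of that ordering, assuming Theano is installed:

import theano
import theano.tensor as T

xs = T.matrix("xs")        # sequence, (time, batch)
ms = T.matrix("mask")      # sequence
init = T.vector("init")    # recurrent output
scale = T.scalar("scale")  # non-sequence

def step(x_t, m_t, acc, k):
    # argument order: sequences, then outputs_info, then non_sequences
    return acc + k * x_t * m_t

acc, _ = theano.scan(step, sequences=[xs, ms],
                     outputs_info=init, non_sequences=scale)
masked_cumsum = theano.function([xs, ms, init, scale], acc)
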
Example 4
    def __call__(self, inputs, state, scope=None):
        if not isinstance(inputs, (list, tuple)):
            inputs = [inputs]

        input_size = self.input_size
        output_size = self.output_size

        if len(inputs) != len(input_size):
            raise RuntimeError("unmatched elements: inputs and input_size")

        size = [list(input_size) + [output_size], 4 * output_size]

        with variable_scope(scope or "lstm"):
            c, h = state
            new_inputs = list(inputs[:]) + [h]
            concat = linear(new_inputs, size, True, concat=True, scope="gates")

            i, j, f, o = theano.tensor.split(concat, [output_size] * 4, 4, -1)

            j = theano.tensor.tanh(j)
            # input, forget, output gate
            i = theano.tensor.nnet.sigmoid(i)
            f = theano.tensor.nnet.sigmoid(f)
            o = theano.tensor.nnet.sigmoid(o)

            new_c = c * f + i * j
            # no output activation
            new_h = new_c * o
            new_state = (new_c, new_h)

        return new_h, new_state
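
For reference, the gate arithmetic above in plain numpy: a sketch with the four pre-activations already split out and, as in the cell above, no tanh on the cell output:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lstm_step(i, j, f, o, c):
    # i, f, o: gate pre-activations; j: candidate pre-activation
    i, f, o = sigmoid(i), sigmoid(f), sigmoid(o)
    j = np.tanh(j)
    new_c = c * f + i * j
    new_h = new_c * o    # no output activation, matching the cell above
    return new_c, new_h

pre = np.random.randn(4, 8)   # stand-ins for the four split chunks
new_c, new_h = lstm_step(pre[0], pre[1], pre[2], pre[3], np.zeros(8))
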
Example 5
def attention(query,
              mapped_states,
              state_size,
              attn_size,
              attention_mask=None,
              scope=None):
    with ops.variable_scope(scope or "attention"):
        mapped_query = nn.linear(query, [state_size, attn_size],
                                 False,
                                 scope="query_w")

        mapped_query = mapped_query[None, :, :]
        hidden = theano.tensor.tanh(mapped_query + mapped_states)

        score = nn.linear(hidden, [attn_size, 1], False, scope="attention_v")
        score = score.reshape([score.shape[0], score.shape[1]])

        exp_score = theano.tensor.exp(score)

        if attention_mask is not None:
            exp_score = exp_score * attention_mask

        alpha = exp_score / theano.tensor.sum(exp_score, 0)

    return alpha
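
Multiplying the exponentiated scores by the mask before normalizing is a masked softmax over the key axis, but exponentiating raw scores can overflow. A numerically safer numpy sketch, equivalent because softmax is invariant to subtracting a constant per column:

import numpy as np

def masked_softmax(score, mask, axis=0):
    # score: (n_key, n_query); mask: same shape, 1.0 for valid keys
    score = score - score.max(axis=axis, keepdims=True)
    e = np.exp(score) * mask
    return e / e.sum(axis=axis, keepdims=True)
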
Example 6
    def build_attention(self, src_seq, src_mask, target_inputs, tgt_seq,
                        tgt_mask, keys, values, initial_state):
        # attention graph, this feature is optional
        def attention_loop(inputs, mask, state, keys, values, key_mask):
            mask = mask[:, None]
            alpha = attention(state, keys, key_mask, self.dim_hid,
                              self.dim_key)
            context = T.sum(alpha[:, :, None] * values, 0)
            output, next_state = self.cell([inputs, context], state)
            next_state = (1.0 - mask) * state + mask * next_state

            return [alpha, next_state]

        with ops.variable_scope("decoder"):
            seq = [target_inputs, tgt_mask]
            outputs_info = [None, initial_state]
            nonseq = [keys, values, src_mask]
            (alpha, state), updates = theano.scan(attention_loop, seq,
                                                  outputs_info, nonseq)
            attention_score = alpha

        alignment_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        alignment_outputs = attention_score
        align = theano.function(alignment_inputs, alignment_outputs)
        return align
Example 7
def attention(query,
              keys,
              key_mask,
              dim_query,
              dim_key,
              dtype=None,
              scope=None):
    with ops.variable_scope(scope or "attention", dtype=dtype):
        # content-based addressing
        # e_i = v_a^T tanh(W query + key_i)
        # alpha = softmax({e_i})
        # (n_query, dim_query) -> (n_query, dim_key)
        mapped_query = nn.linear(query, [dim_query, dim_key],
                                 False,
                                 scope="map-query")
        # (n_key, n_query, dim_key)
        act = T.tanh(mapped_query[None, :, :] + keys)
        # (n_key, n_query, 1)
        e = nn.linear(act, [dim_key, 1], False,
                      scope="pre-alpha")  # (n_key, n_query, 1)
        # (n_key, n_query)
        e = T.reshape(e, e.shape[:2])
        e = e.T  # (n_query, n_key)
        # match dimension
        key_mask = key_mask.T
        alpha = nn.masked_softmax(e, key_mask)  # (n_query, n_key)
        alpha = alpha.T  # (n_key, n_query)
    return alpha
Example 8
def map_attention_states(attention_states, input_size, attn_size, scope=None):
    with ops.variable_scope(scope or "attention"):
        mapped_states = nn.linear(attention_states, [input_size, attn_size],
                                  False,
                                  scope="attention_w")

    return mapped_states
Example 9
    def __call__(self, inputs, state, scope=None):
        if not isinstance(inputs, (list, tuple)):
            inputs = [inputs]

        input_size = self.input_size
        output_size = self.output_size

        if len(inputs) != len(input_size):
            raise RuntimeError("unmatched elements: inputs and input_size")

        size = [list(input_size) + [output_size], output_size]

        with variable_scope(scope or "gru_cell"):
            new_inputs = list(inputs[:]) + [state]
            r = feedforward(new_inputs, size, False, scope="reset_gate")
            u = feedforward(new_inputs, size, False, scope="update_gate")
            new_inputs = list(inputs[:]) + [r * state]
            c = feedforward(new_inputs,
                            size,
                            True,
                            activation=theano.tensor.tanh,
                            scope="candidate")

            new_state = (1.0 - u) * state + u * c

        return new_state, new_state
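
The candidate branch feeds r * state back through the feedforward layer, giving the usual GRU update (note this cell blends with u on the candidate side). The same step as a numpy sketch, with weights acting on the concatenated input and state and biases omitted:

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def gru_step(x, h, Wr, Wu, Wc):
    xh = np.concatenate([x, h])
    r = sigmoid(xh @ Wr)                           # reset gate
    u = sigmoid(xh @ Wu)                           # update gate
    c = np.tanh(np.concatenate([x, r * h]) @ Wc)   # candidate
    return (1.0 - u) * h + u * c                   # same convention as above

dx, dh = 4, 6
x, h = np.random.randn(dx), np.zeros(dh)
Wr, Wu, Wc = (np.random.randn(dx + dh, dh) for _ in range(3))
h = gru_step(x, h, Wr, Wu, Wc)
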
Example 10
def coarseattention(query,
                    keys,
                    key_mask,
                    dim_query,
                    dim_key,
                    dtype=None,
                    scope=None):
    with ops.variable_scope(scope or "coarseattention", dtype=dtype):
        # content-based addressing
        # e_i = v_a^T tanh(W query + key_i)
        # alpha = softmax({e_i})
        # (n_query, dim_query) -> (n_query, dim_key)

        e = []
        for i in range(len(keys)):
            mapped_query = nn.linear(query, [dim_query, dim_key[i]],
                                     False,
                                     scope="map-query_{}".format(i))
            # (n_key, n_query, dim_key)
            act = T.tanh(mapped_query[None, :, :] + keys[i])
            # (n_key, n_query, 1)
            em = nn.linear(
                act, [dim_key[i], 1], False,
                scope="pre-alpha_{}".format(i))  # (n_key, n_query, 1)
            e.append(em)

        e = reduce(T.add, e)
        # (n_key, n_query)
        e = T.reshape(e, e.shape[:2])
        e = e.T  # (n_query, n_key)
        # match dimension
        key_mask = key_mask.T
        alpha = nn.masked_softmax(e, key_mask)  # (n_query, n_key)
        alpha = alpha.T  # (n_key, n_query)
    return alpha
Example 11
    def build_sampling(self, src_seq, src_mask, target_embedding, target_bias, keys, values, initial_state):
        # sampling graph, this feature is optional
        max_len = T.iscalar()

        def sampling_loop(inputs, state, keys, values, key_mask):
            _, state_prime = self.cell1(inputs, state, scope="gru1")
            alpha = attention(state_prime, keys, key_mask, self.dim_hid, self.dim_key)
            context = T.sum(alpha[:, :, None] * values, 0)
            output, next_state = self.cell2(context, state_prime, scope="gru2")
            probs = self.prediction(inputs, next_state, context)  # p(y_j) \propto f(y_{j-1}, c_j, s_j)
            next_words = ops.random.multinomial(probs).argmax(axis=1)
            new_inputs = nn.embedding_lookup(target_embedding, next_words)
            new_inputs = new_inputs + target_bias

            return [next_words, new_inputs, next_state]

        with ops.variable_scope("decoder"):
            batch = src_seq.shape[1]
            initial_inputs = T.zeros([batch, self.dim_y], theano.config.floatX)

            outputs_info = [None, initial_inputs, initial_state]
            nonseq = [keys, values, src_mask]
            outputs, updates = theano.scan(sampling_loop, [], outputs_info,
                                           nonseq, n_steps=max_len)
            sampled_words = outputs[0]

        sampling_inputs = [src_seq, src_mask, max_len]
        sampling_outputs = sampled_words
        sample = theano.function(sampling_inputs, sampling_outputs,
                                 updates=updates)
        return sample
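
ops.random.multinomial(probs).argmax(axis=1) draws one word per row: multinomial returns a one-hot vector for each distribution and argmax recovers the sampled index. A numpy sketch of the same draw:

import numpy as np

def sample_words(probs, rng=np.random):
    # probs: (batch, vocab); renormalize to guard against float drift
    return np.array([rng.multinomial(1, p / p.sum()).argmax() for p in probs])
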
Example 12
    def build_attention(self, src_seq, src_mask, target_inputs, tgt_seq, tgt_mask, keys, values, initial_state):
        # attention graph, this feature is optional
        def attention_loop(inputs, mask, state, keys, values, key_mask):
            mask = mask[:, None]
            # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
            _, state_prime = self.cell1(inputs, state, scope="gru1")
            # c_j = att(H, s_j^{\prime})
            alpha = attention(state_prime, keys, key_mask, self.dim_hid, self.dim_key)
            context = T.sum(alpha[:, :, None] * values, 0)
            # s_j = GRU^2(c_j, s_j^{\prime})
            output, next_state = self.cell2(context, state_prime, scope="gru2")
            next_state = (1.0 - mask) * state + mask * next_state
            return [alpha, next_state]

        with ops.variable_scope("decoder"):
            seq = [target_inputs, tgt_mask]
            outputs_info = [None, initial_state]
            nonseq = [keys, values, src_mask]
            (alpha, state), updates = theano.scan(attention_loop, seq,
                                                  outputs_info, nonseq)
            attention_score = alpha

        alignment_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        alignment_outputs = attention_score
        align = theano.function(alignment_inputs, alignment_outputs)
        return align
Example 13
def decoder(cell,
            inputs,
            mask,
            initial_state,
            attention_states,
            attention_mask,
            attn_size,
            mapped_states=None,
            dtype=None,
            scope=None):
    input_size, states_size = cell.input_size

    output_size = cell.output_size
    dtype = dtype or inputs.dtype
    att_size = [output_size, states_size, attn_size]

    def loop_fn(inputs, mask, state, attn_states, attn_mask, mapped_states):
        mask = mask[:, None]
        alpha = attention(state, None, mapped_states, attn_mask, att_size)
        context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
        output, next_state = cell([inputs, context], state)
        next_state = (1.0 - mask) * state + mask * next_state

        return [next_state, context]

    with ops.variable_scope(scope or "decoder"):
        if mapped_states is None:
            mapped_states = attention(None, attention_states, None, None,
                                      att_size)
        seq = [inputs, mask]
        outputs_info = [initial_state, None]
        non_seq = [attention_states, attention_mask, mapped_states]
        (states, contexts) = ops.scan(loop_fn, seq, outputs_info, non_seq)

    return states, contexts
Example 14
    def prediction(self, y_emb, state, context, keep_prob=1.0):
        """
        readout -> softmax
        p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})
        :param y_emb: 
        :param state: 
        :param context: 
        :param keep_prob: 
        :return: 
        """
        features = [state, y_emb, context]
        readout = nn.feedforward(
            features,
            [[self.dim_hid, self.dim_y, self.dim_value], self.dim_readout],
            True,
            activation=T.tanh,
            scope="readout")

        if keep_prob < 1.0:
            readout = nn.dropout(readout, keep_prob=keep_prob)

        with ops.variable_scope(self.tiescope, reuse=True):
            target_embedding = ops.get_variable(
                "embedding", [self.n_y_vocab, self.dim_readout])
            target_embedding = target_embedding.T
            logits = T.dot(readout, target_embedding)

        # logits = nn.linear(readout, [self.dim_readout, self.n_y_vocab], True,
        #                    scope="logits")
        if logits.ndim == 3:
            new_shape = [logits.shape[0] * logits.shape[1], -1]
            logits = logits.reshape(new_shape)

        probs = T.nnet.softmax(logits)
        return probs
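
Rather than a separate output projection (the commented-out nn.linear), the logits reuse the transposed target embedding matrix, i.e. the output weights are tied to the input embedding, which saves a vocabulary-sized parameter matrix. The core operation is just a matrix product; the dimensions below are illustrative:

import numpy as np

readout = np.random.randn(32, 620)         # (batch, dim_readout)
embedding = np.random.randn(30000, 620)    # (n_y_vocab, dim_readout)
logits = readout @ embedding.T             # (batch, n_y_vocab)
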
Example 15
    def forward(self,
                y_seq,
                y_emb,
                mask,
                keys,
                key_mask,
                values,
                initial_state,
                domain_keys,
                domain_annot,
                tag_seq,
                keep_prob=1.0):
        # shift embedding
        y_shifted = T.zeros_like(y_emb)
        y_shifted = T.set_subtensor(y_shifted[1:], y_emb[:-1])
        y_emb = y_shifted
        # feed
        states, contexts = Decoder.scan(self, y_emb, mask, keys, key_mask,
                                        values, initial_state, domain_keys,
                                        domain_annot)

        with ops.variable_scope("DSAdec"):
            newmask = T.set_subtensor(
                mask[T.cast(T.sum(mask, 0) - 1, 'int32'),
                     T.arange(mask.shape[1])], 0.0)
            # domain_alpha = domain_sensitive_attention(states, newmask, self.dim_hid, self.dim_domain)
            domain_alpha = attention(states[-1], states, newmask, self.dim_hid,
                                     self.dim_hid)
            domain_states = states * domain_alpha[:, :, None]

            # batch * (shdim * 2)
            domain_context = T.sum(domain_states, 0)
            # batch * feadim1
            feature = nn.feedforward(domain_context,
                                     [self.dim_hid, self.feadim],
                                     True,
                                     activation=T.tanh,
                                     scope="feature")

            dscores = nn.feedforward(feature, [self.feadim, self.dnum],
                                     True,
                                     activation=T.tanh,
                                     scope="score")
            # (batch, 4)
            dprobs = T.nnet.softmax(dscores)
            pred_tag = T.argmax(dprobs, 1)
            didx = T.arange(tag_seq.flatten().shape[0])
            dce = -T.log(dprobs[didx, tag_seq.flatten()])
            domaincost = T.mean(dce)

        # p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})
        probs = self.prediction(y_emb, states, contexts, keep_prob)

        # compute cost
        cost, snt_cost = self.get_cost(y_seq, mask, probs, domain_alpha)
        return states, contexts, cost, domaincost, pred_tag, snt_cost
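
Two dense spots in this method, unpacked in numpy. The shift at the top feeds position j the embedding of y_{j-1} (teacher forcing), and the newmask construction finds each column's last real token (mask sum minus one) and zeroes it, so the domain attention ignores the sentence-final state:

import numpy as np

y_emb = np.random.randn(7, 2, 4)            # (time, batch, dim)
y_shifted = np.zeros_like(y_emb)
y_shifted[1:] = y_emb[:-1]                  # row 0 stays zero: <s>

mask = np.array([[1., 1.], [1., 1.], [1., 0.]])   # (time, batch)
last = mask.sum(0).astype(int) - 1                # index of last real token
newmask = mask.copy()
newmask[last, np.arange(mask.shape[1])] = 0.0     # drop final positions
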
Example 16
    def __call__(self, inputs, state, c_inputs=None, scope=None):
        with variable_scope(scope or "multi_rnn_cell"):
            cur_inp = inputs
            new_states = []
            for i, cell in enumerate(self._cells):
                with variable_scope("cell_%d" % i):
                    cur_state = state[i]

                    if c_inputs:
                        if not isinstance(inputs, (list, tuple)):
                            cur_inp = [inputs]
                        if not isinstance(c_inputs, (list, tuple)):
                            c_inputs = [c_inputs]
                        cur_inp = list(cur_inp) + list(c_inputs)

                    cur_inp, new_state = cell(cur_inp, cur_state)
                    new_states.append(new_state)
        new_states = tuple(new_states)
        return cur_inp, new_states
Example 17
    def forward(self,
                x_embedded,
                mask,
                initial_state=None,
                dtype=None,
                scope=None):
        scope = scope or "encoder"
        cell = self.cell
        with ops.variable_scope(scope, dtype=dtype):
            with ops.variable_scope("forward"):
                fd_states = gru_encoder(cell, x_embedded, mask, initial_state,
                                        dtype)
            with ops.variable_scope("backward"):
                x_embedded = x_embedded[::-1]
                mask = mask[::-1]
                bd_states = gru_encoder(cell, x_embedded, mask, initial_state,
                                        dtype)
                bd_states = bd_states[::-1]

        return fd_states, bd_states
Example 18
def domain_sensitive_attention(keys, key_mask, dim_key, dim_domain, dtype=None, scope=None):
    with ops.variable_scope(scope or "domain_sensitive_attention", dtype=dtype):

        mapped_keys = nn.linear(keys, [dim_key, dim_domain], True, scope="map-key")
        act = T.tanh(mapped_keys)
        # (n_key, n_query, 1)
        e = nn.linear(act, [dim_domain, 1], False, scope="pre-alpha")  # (n_key, n_query, 1)
        # (n_key, n_query)
        e = T.reshape(e, e.shape[:2])
        e = e.T  # (n_query, n_key)
        # match dimension
        key_mask = key_mask.T
        alpha = nn.masked_softmax(e, key_mask)  # (n_query, n_key)
        alpha = alpha.T  # (n_key, n_query)
    return alpha
Example 19
    def step(self, y_prev, mask, state, *args):
        n_src = self.n_src
        assert len(args) == self.n_src * 3
        src_keys = args[:n_src]
        src_values = args[n_src:2 * n_src]
        src_masks = args[2 * n_src:]

        mask = mask[:, None]
        # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
        _, state_prime = self.cell1(y_prev, state, scope="gru1")
        state_prime = (1.0 - mask) * state + mask * state_prime
        # c_j = att(H, s_j^{\prime})
        contexts = []
        for i, _key, _val, _mask in zip(itertools.count(), src_keys,
                                        src_values, src_masks):
            alpha = attention(state_prime,
                              _key,
                              _mask,
                              self.dim_hid,
                              self.dim_key,
                              scope='attn_alpha_%d' % i)
            context = theano.tensor.sum(alpha[:, :, None] * _val, 0)
            contexts.append(context)
        if self.method == "attn":
            contexts = T.reshape(T.concatenate(contexts, 0),
                                 [n_src] + list(contexts[0].shape))
            with ops.variable_scope("beta"):
                beta_keys = map_key(contexts, self.dim_value, self.dim_key)

            beta = attention(state_prime,
                             beta_keys,
                             T.ones(contexts.shape[:2]),
                             self.dim_hid,
                             self.dim_key,
                             scope='beta')
            context = T.sum(beta[:, :, None] * contexts, 0)
        elif self.method == "concat":
            context = T.concatenate(contexts, -1)

        # s_j = GRU^2(c_j, s_j^{\prime})
        output, next_state = self.cell2(context, state_prime, scope="gru2")
        next_state = (1.0 - mask) * state + mask * next_state
        return next_state, context
Example 20
def linear(inputs, size, bias, concat=False, dtype=None, scope=None):
    if not isinstance(size, (list, tuple)):
        raise ValueError("size argument must be (input_size, output_size)")

    input_size, output_size = size

    if not isinstance(input_size, (list, tuple)):
        input_size = [input_size]

    if not isinstance(inputs, (list, tuple)):
        inputs = [inputs]

    if len(inputs) != len(input_size):
        raise RuntimeError("unmatched elements found: inputs and input_size")

    results = []

    with variable_scope(scope):
        if concat:
            input_size = sum(input_size)
            inputs = theano.tensor.concatenate(inputs, -1)

            shape = [input_size, output_size]
            matrix = get_variable("matrix", shape, dtype=dtype)
            results.append(theano.dot(inputs, matrix))
        else:
            for i in range(len(input_size)):
                shape = [input_size[i], output_size]
                name = "matrix_%d" % i
                matrix = get_variable(name, shape, dtype=dtype)
                results.append(theano.dot(inputs[i], matrix))

        if bias:
            shape = [output_size]
            bias = get_variable("bias", shape, dtype=dtype)
            results.append(bias)

    if len(results) == 1:
        return results[0]

    return reduce(theano.tensor.add, results)
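
With concat=False each input gets its own matrix and the partial products are summed; with concat=True the inputs are concatenated and multiplied by one stacked matrix. The two are mathematically identical when the stacked matrix is the vertical stack of the per-input matrices (only the variable names differ), as this numpy check shows:

import numpy as np

x1, x2 = np.random.randn(3, 4), np.random.randn(3, 5)
W1, W2 = np.random.randn(4, 6), np.random.randn(5, 6)

separate = x1 @ W1 + x2 @ W2
stacked = np.concatenate([x1, x2], -1) @ np.vstack([W1, W2])
assert np.allclose(separate, stacked)
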
Example 21
    def step(self, y_prev, mask, state, src_words_keys, src_pos_keys, pos_words_keys, pos_pos_keys, values, key_mask):
        mask = mask[:, None]

        # s_j^{\prime} = GRU^1(y_{j-1}, s_{j-1})
        _, state_prime = self.cell1(y_prev, state, scope="gru1")
        state_prime = (1.0 - mask) * state + mask * state_prime
        # c_j = att(H, s_j^{\prime})
        alpha = coarseattention(state_prime, [src_words_keys, src_pos_keys], key_mask, self.dim_query, [self.dim_key, self.dim_word2pos])
        context = T.sum(alpha[:, :, None] * values, 0)
        # s_j = GRU^2(c_j, s_j^{\prime})
        output, next_state = self.cell2(context, state_prime, scope="gru2")
        next_state = (1.0 - mask) * state + mask * next_state

        # y_pos_{j} = nn(s_{j})
        tempstate = nn.feedforward(next_state,
                                   [self.dim_hid, self.posnndim],
                                   True, activation=T.nnet.relu,
                                   scope="ttaggerstates")

        score = nn.linear(tempstate, [self.posnndim, self.n_y_tagvocab], True, scope="ttaggerscores")
        prob = T.nnet.softmax(score)

        with ops.variable_scope("tgttag_embedding"):
            tgttag_embedding = ops.get_variable("embedding", [self.n_y_tagvocab, self.poshdim])
            tgttag_bias = ops.get_variable("bias", [self.poshdim])

        y_pos_state = T.dot(prob, tgttag_embedding) + tgttag_bias

        posquery = T.concatenate([tempstate, y_pos_state], -1)
        beta = fineattention([state_prime, posquery], [[src_words_keys, src_pos_keys], [pos_words_keys, pos_pos_keys]], key_mask,
                             [self.dim_query, self.dim_posquery], [[self.dim_key, self.dim_word2pos], [self.dim_pos2word, self.dim_pos2pos]])

        finalalpha = beta
        # adl = ops.get_variable("adaptive", [])
        # walpha = T.exp(adl)
        # wbeta = T.exp(1.0 - adl)
        # finalalpha = walpha/(walpha+wbeta) * alpha + wbeta/(walpha+wbeta) * beta
        #
        # finalalpha = 0.5 * beta + 0.5 * alpha
        context = T.sum(finalalpha[:, :, None] * values, 0)

        return next_state, context, y_pos_state, prob
Example 22
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        decoderType = eval("Decoder{}".format(option["decoder"]))
        decoder = decoderType(tedim, thdim, ahdim, 2 * shdim, dim_maxout=maxdim, max_part=maxpart, dim_readout=deephid,
                              n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope, initializer=initializer,
                                regularizer=regularizer, dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding") as tgtembscope:
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                # target_bias = ops.get_variable("bias", [tedim])
                decoder.tiescope = tgtembscope

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = r_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

                _, _, cost, _ = decoder.forward(tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                                                annotation, initial_state, keep_prob)


        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        training_outputs = [cost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            # target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            # decoder
            final_state = r_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            # prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(prev_inputs, mask, initial_state, mapped_keys, annotation, src_mask)
                if option["decoder"] == "GruSimple":
                    probs = decoder.prediction(prev_inputs, initial_state, context)
                elif option["decoder"] == "GruCond":
                    probs = decoder.prediction(prev_inputs, next_state, context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [annotation, initial_state, mapped_keys]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            prediction_inputs = [prev_words, initial_state, annotation,
                                 mapped_keys, src_mask]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            prediction_inputs = [prev_words, initial_state, annotation,
                                 mapped_keys, src_mask]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict

        # optional graph
        '''
        with ops.variable_scope(scope, reuse=True):
            sample = decoder.build_sampling(src_seq, src_mask, target_embedding, target_bias, mapped_keys,
                                            annotation, initial_state)
            align = decoder.build_attention(src_seq, src_mask, target_inputs, tgt_seq, tgt_mask, mapped_keys,
                                            annotation, initial_state)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

                _, _, _,snt_cost  = decoder.forward(tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                                                    annotation, initial_state, 1.0)
            get_snt_cost = theano.function(training_inputs, snt_cost)
        '''
        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        # self.align = align
        # self.sample = sample
        self.encode = encode

        # self.get_snt_cost = get_snt_cost
        self.option = option
Example 23
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        scope = option["scope"]
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
            features = [prev_state, prev_inputs, context]
            maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                               maxpart, True)
            readout = nn.linear(maxhid, [maxdim, deephid], False,
                                scope="deepout")

            if keep_prob < 1.0:
                readout = nn.dropout(readout, keep_prob=keep_prob)

            logits = nn.linear(readout, [deephid, tvsize], True,
                               scope="logits")

            if logits.ndim == 3:
                new_shape = [logits.shape[0] * logits.shape[1], -1]
                logits = logits.reshape(new_shape)

            probs = theano.tensor.nnet.softmax(logits)

            return probs

        # training graph
        with ops.variable_scope(scope, initializer=initializer,
                                regularizer=regularizer, dtype=dtype):
            src_seq = theano.tensor.imatrix("source_sequence")
            src_mask = theano.tensor.matrix("source_sequence_mask")
            tgt_seq = theano.tensor.imatrix("target_sequence")
            tgt_mask = theano.tensor.matrix("target_sequence_mask")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias
            target_inputs = target_inputs + target_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            cell = nn.rnn_cell.gru_cell([sedim, shdim])

            if keep_prob < 1.0:
                cell = nn.rnn_cell.dropout_wrapper(cell)

            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=theano.tensor.tanh)

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            if keep_prob < 1.0:
                cell = nn.rnn_cell.dropout_wrapper(cell)

            # run decoder
            decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                      initial_state, annotation, src_mask,
                                      ahdim)
            all_output, all_context = decoder_outputs

            shift_inputs = theano.tensor.zeros_like(target_inputs)
            shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                       target_inputs[:-1])

            init_state = initial_state[None, :, :]
            all_states = theano.tensor.concatenate([init_state, all_output], 0)
            prev_states = all_states[:-1]

            with ops.variable_scope("decoder"):
                probs = prediction(shift_inputs, prev_states, all_context,
                                   keep_prob=keep_prob)

            # compute cost
            idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
            cost = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
            cost = cost.reshape(tgt_seq.shape)
            cost = theano.tensor.sum(cost * tgt_mask, 0)
            cost = theano.tensor.mean(cost)

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        training_outputs = [cost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = theano.tensor.ivector("prev_words")

            # encoder, disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias

            cell = nn.rnn_cell.gru_cell([sedim, shdim])
            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # decoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=theano.tensor.tanh)

            inputs = nn.embedding_lookup(target_embedding, prev_words)
            inputs = inputs + target_bias

            cond = theano.tensor.neq(prev_words, 0)
            # zeros out embedding if y is 0
            inputs = inputs * cond[:, None]

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            with ops.variable_scope("decoder"):
                mapped_states = map_attention_states(annotation, 2 * shdim,
                                                     ahdim)
                alpha = attention(initial_state, mapped_states, thdim, ahdim,
                                  src_mask)
                context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
                output, next_state = cell([inputs, context], initial_state)
                probs = prediction(inputs, initial_state, context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [annotation, initial_state, mapped_states]
        encode = theano.function(encoding_inputs, encoding_outputs)

        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_states, src_mask]
        prediction_outputs = [probs, context, alpha]
        predict = theano.function(prediction_inputs, prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.encode = encode
        self.predict = predict
        self.generate = generate
        self.option = option
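
The cost block above picks out, for each target position, the probability assigned to the reference word, masks out padding, sums over time, and averages over the batch. The same computation as a numpy sketch:

import numpy as np

def masked_nll(probs, tgt_seq, tgt_mask):
    # probs: (time * batch, vocab); tgt_seq, tgt_mask: (time, batch)
    idx = np.arange(tgt_seq.size)
    ce = -np.log(probs[idx, tgt_seq.flatten()])
    ce = ce.reshape(tgt_seq.shape)
    return (ce * tgt_mask).sum(0).mean()
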
Example 24
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim, xposhdim, yposhdim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim, xposnn, yposnn, word2pos, pos2word, pos2pos = option[
            "hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab, tagvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab

        stag2id, ttag2id = tagvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)
        stagsize, ttagsize = len(stag2id), len(ttag2id)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        decoderType = eval("Decoder{}".format(option["decoder"]))
        decoder = decoderType(tedim,
                              thdim,
                              ahdim,
                              2 * shdim + xposhdim,
                              dim_maxout=maxdim,
                              max_part=maxpart,
                              dim_readout=deephid,
                              n_y_vocab=tvsize,
                              n_y_tagvocab=ttagsize,
                              poshdim=yposhdim,
                              posnndim=yposnn,
                              word2pos=word2pos,
                              pos2word=pos2word,
                              pos2pos=pos2pos)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            src_pos = T.imatrix("source_postag")
            tgt_pos = T.imatrix("target_postag")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            with ops.variable_scope("srctag_embedding"):
                srctag_embedding = ops.get_variable("embedding",
                                                    [stagsize, xposhdim])
                srctag_bias = ops.get_variable("bias", [xposhdim])

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias
            target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("srcpostagger"):
                tempstates = nn.feedforward(annotation, [shdim * 2, xposnn],
                                            True,
                                            scope="staggerstates",
                                            activation=T.nnet.relu)
                scores = nn.linear(tempstates, [xposnn, stagsize],
                                   True,
                                   scope="staggerscores")

                new_shape = [scores.shape[0] * scores.shape[1], -1]
                scores = scores.reshape(new_shape)
                srcposprobs = T.nnet.softmax(scores)

                srctaggerstates = T.dot(srcposprobs,
                                        srctag_embedding) + srctag_bias
                srctaggerstates = srctaggerstates.reshape(
                    [annotation.shape[0], annotation.shape[1], -1])

                idx = T.arange(src_pos.flatten().shape[0])
                ce = -T.log(srcposprobs[idx, src_pos.flatten()])
                ce = ce.reshape(src_pos.shape)
                ce = T.sum(ce * src_mask, 0)
                srcpos_cost = T.mean(ce)

            tempposkeys = T.concatenate([srctaggerstates, tempstates], -1)

            src_words_keys = map_key(annotation, 2 * shdim, ahdim,
                                     "srcwordkeys")
            src_pos_keys = map_key(tempposkeys, xposnn + xposhdim, word2pos,
                                   "srcposkeys")

            pos_words_keys = map_key(annotation, 2 * shdim, pos2word,
                                     "pos2wordkeys")
            pos_pos_keys = map_key(tempposkeys, xposnn + xposhdim, pos2pos,
                                   "pos2poskeys")

            annotation = T.concatenate([annotation, srctaggerstates], -1)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = T.concatenate([r_states[0], srctaggerstates[0]], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state,
                                               [shdim + xposhdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)

                _, _, transcost, _, tgtpos_cost = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask, src_words_keys,
                    src_pos_keys, pos_words_keys, pos_pos_keys, src_mask,
                    annotation, initial_state, tgt_pos, keep_prob)

        lambx = theano.shared(numpy.asarray(option["lambda"][0], dtype),
                              "lambdax")
        lamby = theano.shared(numpy.asarray(option["lambda"][1], dtype),
                              "lambday")

        totalcost = transcost + lambx * srcpos_cost + lamby * tgtpos_cost
        training_inputs = [
            src_seq, src_mask, tgt_seq, tgt_mask, src_pos, tgt_pos
        ]
        training_outputs = [srcpos_cost, tgtpos_cost, transcost, totalcost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("srcpostagger"):
                tempstates = nn.feedforward(annotation, [shdim * 2, xposnn],
                                            True,
                                            scope="staggerstates",
                                            activation=T.nnet.relu)
                scores = nn.linear(tempstates, [xposnn, stagsize],
                                   True,
                                   scope="staggerscores")

                new_shape = [scores.shape[0] * scores.shape[1], -1]
                scores = scores.reshape(new_shape)
                srcposprobs = T.nnet.softmax(scores)

                srctaggerstates = T.dot(srcposprobs,
                                        srctag_embedding) + srctag_bias
                srctaggerstates = srctaggerstates.reshape(
                    [annotation.shape[0], annotation.shape[1], -1])

            tempposkeys = T.concatenate([srctaggerstates, tempstates], -1)

            src_words_keys = map_key(annotation, 2 * shdim, ahdim,
                                     "srcwordkeys")
            src_pos_keys = map_key(tempposkeys, xposnn + xposhdim, word2pos,
                                   "srcposkeys")

            pos_words_keys = map_key(annotation, 2 * shdim, pos2word,
                                     "pos2wordkeys")
            pos_pos_keys = map_key(tempposkeys, xposnn + xposhdim, pos2pos,
                                   "pos2poskeys")

            annotation = T.concatenate([annotation, srctaggerstates], -1)

            # decoder
            final_state = T.concatenate([r_states[0], srctaggerstates[0]], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state,
                                               [shdim + xposhdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context, next_pos, tgtposprob = decoder.step(
                    prev_inputs, mask, initial_state, src_words_keys,
                    src_pos_keys, pos_words_keys, pos_pos_keys, annotation,
                    src_mask)
                if option["decoder"] == "GruSimple":
                    probs = decoder.prediction(prev_inputs, initial_state,
                                               context)
                elif option["decoder"] == "GruCond":
                    probs = decoder.prediction(prev_inputs, next_state,
                                               context, next_pos)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            annotation, initial_state, src_words_keys, src_pos_keys,
            pos_words_keys, pos_pos_keys, srcposprobs
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            prediction_inputs = [
                prev_words, initial_state, annotation, src_words_keys,
                src_pos_keys, pos_words_keys, pos_pos_keys, src_mask
            ]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            prediction_inputs = [
                prev_words, initial_state, annotation, src_words_keys,
                src_pos_keys, pos_words_keys, pos_pos_keys, src_mask
            ]
            prediction_outputs = [probs, next_state, tgtposprob]
            predict = theano.function(prediction_inputs,
                                      prediction_outputs,
                                      on_unused_input='warn')
            self.predict = predict

        self.cost = totalcost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        self.encode = encode
        self.option = option
Example 25
    def __init__(self, **option):

        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder2"

        encoder = Encoder(sedim, shdim)
        import decoder2
        decoder = decoder2.DecoderGruCond(2,
                                          option['method'],
                                          tedim,
                                          thdim,
                                          ahdim,
                                          2 * shdim + thdim,
                                          dim_readout=deephid,
                                          n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            byseq = T.imatrix("backward_target_sequence")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding,
                                                src_seq) + source_bias
            target_inputs = nn.embedding_lookup(target_embedding,
                                                tgt_seq) + target_bias
            by_inputs = nn.embedding_lookup(target_embedding,
                                            byseq) + target_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)
                by_inputs = nn.dropout(by_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            import softdec
            soft_decoder = softdec.SoftDecoder(option["eosid"],
                                               option["softk"],
                                               tedim,
                                               thdim,
                                               ahdim,
                                               2 * shdim,
                                               dim_readout=deephid,
                                               n_y_vocab=tvsize)
            with ops.variable_scope('soft_decoder'):
                initial_state = nn.feedforward(states[-1], [shdim, thdim],
                                               True,
                                               scope='initial',
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)
                soft_states, _, _, soft_mask = soft_decoder.infer(
                    mapped_keys, src_mask, annotation, initial_state,
                    target_embedding, target_bias, keep_prob)

            with ops.variable_scope('soft_decoder', reuse=True):
                _, _, soft_cost, _ = soft_decoder.forward(
                    byseq, by_inputs, tgt_mask, mapped_keys, src_mask,
                    annotation, initial_state, keep_prob)
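            # note the parameter reuse: infer() above decodes freely from the
            # model's own predictions to build soft target states, while
            # forward() (reuse=True) scores the backward reference sequence
            # byseq to produce soft_cost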

            # compute initial state for decoder
            # first state of backward encoder
            # initialize with only encoder state
            final_state = r_states[0]

            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                with ops.variable_scope('map-key-src'):
                    mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
                with ops.variable_scope('map-key-soft'):
                    mapped_keys_soft = map_key(soft_states, thdim, ahdim)

                _, _, _, snt_cost = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask,
                    [mapped_keys_src, mapped_keys_soft], [src_mask, soft_mask],
                    [annotation, soft_states], initial_state, keep_prob)

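            # the objective interpolates the soft (inferred-prefix) cost with
            # the true sentence-level cost below; e.g. lambda = 0.3 gives
            # 0.3 * soft_cost + 0.7 * true_cost, and lambda = 0 recovers
            # plain MLE training on the reference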
            ce = snt_cost
            true_cost = T.mean(ce)
            lamb = theano.shared(numpy.asarray(option['lambda'], dtype),
                                 'lambda')
            cost = lamb * soft_cost + (1 - lamb) * true_cost

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, byseq]
        training_outputs = [cost, soft_cost, true_cost]

        # compiling the per-sentence cost function is optional; left disabled
        # get_snt_cost = theano.function(training_inputs[:4], snt_cost)
        get_snt_cost = None

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope('soft_decoder'):
                initial_state = nn.feedforward(states[-1], [shdim, thdim],
                                               True,
                                               scope='initial',
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)
                soft_states, soft_contexts, soft_probs, soft_mask = soft_decoder.infer(
                    mapped_keys, src_mask, annotation, initial_state,
                    target_embedding, target_bias, 1.0)

            # decoder
            final_state = r_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                with ops.variable_scope('map-key-src'):
                    mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
                with ops.variable_scope('map-key-soft'):
                    mapped_keys_soft = map_key(soft_states, thdim, ahdim)

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(
                    prev_inputs, mask, initial_state, *[
                        mapped_keys_src, mapped_keys_soft, annotation,
                        soft_states, src_mask, soft_mask
                    ])
                probs = decoder.prediction(prev_inputs, next_state, context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            initial_state, annotation, soft_states, mapped_keys_src,
            mapped_keys_soft, soft_mask
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            raise ValueError()
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask
            ]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys_src,
                src_mask, soft_states, mapped_keys_soft, soft_mask
            ]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        self.align = None
        self.sample = None
        self.encode = encode

        self.get_snt_cost = get_snt_cost
        self.option = option
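The compiled `encode`/`predict` pair above follows the usual two-phase search protocol: encode once per source sentence, then call `predict` step by step, feeding back the state it returns. A greedy-decoding sketch wired to the exact `encoding_outputs` and GruCond `prediction_inputs` orders listed above (`model`, `eos_id`, and `max_len` are illustrative names; a real system would run beam search instead of argmax):

import numpy

def greedy_decode(model, src, src_mask, eos_id, max_len=100):
    # run the encoder once; unpacking follows encoding_outputs above
    (state, annotation, soft_states, keys_src,
     keys_soft, soft_mask) = model.encode(src, src_mask)
    prev_words = numpy.zeros([src.shape[1]], dtype="int32")  # 0 marks <s>
    hyp = []
    for _ in range(max_len):
        # argument order follows prediction_inputs above (GruCond branch)
        probs, state = model.predict(prev_words, state, annotation, keys_src,
                                     src_mask, soft_states, keys_soft,
                                     soft_mask)
        prev_words = probs.argmax(axis=1).astype("int32")
        hyp.append(prev_words)
        if numpy.all(prev_words == eos_id):
            break
    return numpy.array(hyp)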
Example n. 26
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim, domaindim, feadim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)
        dnum = option['dnum']

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        # resolve the decoder class explicitly instead of via eval()
        decoder_types = {"GruSimple": DecoderGruSimple,
                         "GruCond": DecoderGruCond}
        decoderType = decoder_types[option["decoder"]]
        decoder = decoderType(tedim,
                              thdim,
                              ahdim,
                              2 * shdim,
                              dnum=dnum,
                              dim_maxout=maxdim,
                              max_part=maxpart,
                              dim_readout=deephid,
                              dim_domain=domaindim,
                              feadim=feadim,
                              n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            tag_seq = T.imatrix("domain_tag")
            # nsrc_mask = T.set_subtensor(src_mask[T.cast(T.sum(src_mask, 0) - 1, 'int32'),
            #                                      T.arange(src_mask.shape[1])], 0.0)

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding") as tgtembscope:
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                # target_bias = ops.get_variable("bias", [tedim])
                decoder.tiescope = tgtembscope

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("Specific"):
                domain_alpha = domain_sensitive_attention(
                    annotation, src_mask, shdim * 2, domaindim)
                # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
                #                          shdim,
                #                          shdim * 2)
                domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                       0)
                dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                          True,
                                          activation=T.tanh,
                                          scope="feature1")

                dscores = nn.feedforward(dfeature, [feadim, dnum],
                                         True,
                                         activation=T.tanh,
                                         scope="score")
                # (batch, 2)
                dprobs = T.nnet.softmax(dscores)
                dpred_tag = T.argmax(dprobs, 1)
                didx = T.arange(tag_seq.flatten().shape[0])
                dce = -T.log(dprobs[didx, tag_seq.flatten()])
                dcost = T.mean(dce)

            share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                     shdim * 2, domaindim)
            # share_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                         shdim,
            #                         shdim * 2)
            share_context = T.sum(annotation * share_alpha[:, :, None], 0)
            sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                      True,
                                      activation=T.tanh,
                                      scope="feature1")

            with ops.variable_scope("Shared"):
                sscores = nn.feedforward(sfeature, [feadim, dnum],
                                         True,
                                         activation=T.tanh,
                                         scope="score")
                # (batch, 2)
                sprobs = T.nnet.softmax(sscores)
                spred_tag = T.argmax(sprobs, 1)
                sidx = T.arange(tag_seq.flatten().shape[0])
                sce = -T.log(sprobs[sidx, tag_seq.flatten()])
                scost = T.mean(sce)
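                # adversarial term: -p * log(p) on the true-domain
                # probability is an entropy-style penalty; since final_cost
                # subtracts lamb * adv_scost, minimizing it pushes the shared
                # features toward being uninformative about the domain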
                adv_sce = -sprobs[sidx, tag_seq.flatten()] * T.log(
                    sprobs[sidx, tag_seq.flatten()])
                adv_scost = T.mean(adv_sce)

            domain_gate = nn.feedforward([dfeature, annotation],
                                         [[feadim, shdim * 2], shdim * 2],
                                         True,
                                         scope="domain_gate")
            domain_annotation = annotation * domain_gate
            domain_annotation = nn.dropout(domain_annotation,
                                           keep_prob=keep_prob)
            share_gate = nn.feedforward([sfeature, annotation],
                                        [[feadim, shdim * 2], shdim * 2],
                                        True,
                                        scope="share_gate")
            annotation = annotation * share_gate
            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            # compute initial state for decoder: concatenate the backward
            # halves of the shared and domain annotations (first time step),
            # giving a batch * (2 * shdim) tensor
            final_state = T.concatenate([
                annotation[0, :, annotation.shape[-1] // 2:],
                domain_annotation[0, :, annotation.shape[-1] // 2:]
            ], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
                mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                             ahdim, "domain")

                _, _, cost, tgtdcost, tpred_tag, _ = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                    annotation, initial_state, mapped_domain_keys,
                    domain_annotation, tag_seq, keep_prob)

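        # multi-task objective: translation cost plus the source- and
        # target-side domain classification costs, minus the adversarial
        # entropy bonus that keeps shared encoder features domain-invariant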
        lamb = theano.shared(numpy.asarray(option["lambda"], dtype), "lambda")
        # cwscost *= lamb
        final_cost = cost + dcost + tgtdcost - lamb * adv_scost

        tag_inputs = [src_seq, src_mask]
        tag_outputs = [dpred_tag, spred_tag]
        tag_predict = theano.function(tag_inputs, tag_outputs)
        self.tag_predict = tag_predict

        tgt_tag_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        tgt_tag_outputs = [tpred_tag]
        tgt_tag_predict = theano.function(tgt_tag_inputs, tgt_tag_outputs)
        self.tgt_tag_predict = tgt_tag_predict

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, tag_seq]
        training_outputs = [cost, dcost, adv_scost, tgtdcost]

        self.cost_cla = scost
        self.inputs_cla = [src_seq, src_mask, tag_seq]
        self.outputs_cla = [scost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("Specific"):
                domain_alpha = domain_sensitive_attention(
                    annotation, src_mask, shdim * 2, domaindim)
                # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
                #                          shdim,
                #                          shdim * 2)
                domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                       0)
                dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                          True,
                                          activation=T.tanh,
                                          scope="feature1")

            share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                     shdim * 2, domaindim)
            # share_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                         shdim,
            #                         shdim * 2)
            share_context = T.sum(annotation * share_alpha[:, :, None], 0)
            sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                      True,
                                      activation=T.tanh,
                                      scope="feature1")

            domain_gate = nn.feedforward([dfeature, annotation],
                                         [[feadim, shdim * 2], shdim * 2],
                                         True,
                                         scope="domain_gate")
            domain_annotation = annotation * domain_gate
            share_gate = nn.feedforward([sfeature, annotation],
                                        [[feadim, shdim * 2], shdim * 2],
                                        True,
                                        scope="share_gate")
            annotation = annotation * share_gate

            # decoder
            final_state = T.concatenate([
                annotation[0, :, annotation.shape[-1] // 2:],
                domain_annotation[0, :, annotation.shape[-1] // 2:]
            ], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
                mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                             ahdim, "domain")

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            # prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(prev_inputs, mask,
                                                   initial_state, mapped_keys,
                                                   annotation, src_mask,
                                                   mapped_domain_keys,
                                                   domain_annotation)
                if option["decoder"] == "GruSimple":
                    probs = decoder.prediction(prev_inputs, initial_state,
                                               context)
                elif option["decoder"] == "GruCond":
                    probs = decoder.prediction(prev_inputs, next_state,
                                               context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            annotation, initial_state, mapped_keys, mapped_domain_keys,
            domain_annotation
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask
            ]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask,
                mapped_domain_keys, domain_annotation
            ]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict

        self.cost = final_cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        # self.align = align
        # self.sample = sample
        self.encode = encode
        # self.get_snt_cost = get_snt_cost
        self.option = option
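`domain_sensitive_attention` above collapses the annotation sequence into a single context vector via learned per-position weights. One plausible self-attentive formulation, reusing the `nn.feedforward`/`nn.linear` helper signatures seen elsewhere in this file (a sketch of the idea, not necessarily the file's actual definition):

def domain_sensitive_attention(states, mask, state_size, hidden_size):
    # states: (len, batch, state_size); mask: (len, batch)
    with ops.variable_scope("domain_attention"):
        hidden = nn.feedforward(states, [state_size, hidden_size], True,
                                activation=T.tanh, scope="hidden")
        scores = nn.linear(hidden, [hidden_size, 1], False, scope="score")
        scores = scores[:, :, 0]
        # mask out padding, then normalize over the length axis
        scores = scores - T.max(scores, 0)
        weights = T.exp(scores) * mask
        alpha = weights / T.sum(weights, 0)
    return alpha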
Example n. 27
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "criterion" not in option:
            option["criterion"] = "mle"

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        scope = option["scope"]
        criterion = option["criterion"]
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        # MRT mode does not use dropout
        if criterion == "mrt":
            keep_prob = 1.0

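        # deep-output prediction: maxout over [state, input, context],
        # a linear "deepout" projection, then a softmax over the target
        # vocabulary; readout dropout is applied only during training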
        def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
            features = [prev_state, prev_inputs, context]
            maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                               maxpart, True)
            readout = nn.linear(maxhid, [maxdim, deephid],
                                False,
                                scope="deepout")

            if keep_prob < 1.0:
                readout = nn.dropout(readout, keep_prob=keep_prob)

            logits = nn.linear(readout, [deephid, tvsize],
                               True,
                               scope="logits")

            if logits.ndim == 3:
                new_shape = [logits.shape[0] * logits.shape[1], -1]
                logits = logits.reshape(new_shape)

            probs = theano.tensor.nnet.softmax(logits)

            return probs

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = theano.tensor.imatrix("source_sequence")
            src_mask = theano.tensor.matrix("source_sequence_mask")
            tgt_seq = theano.tensor.imatrix("target_sequence")
            tgt_mask = theano.tensor.matrix("target_sequence_mask")

            if criterion == "mrt":
                loss = theano.tensor.vector("loss_score")
                sharp = theano.tensor.scalar("sharpness")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias
            target_inputs = target_inputs + target_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            cell = nn.rnn_cell.gru_cell([sedim, shdim])

            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=theano.tensor.tanh)

            # run decoder
            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            if criterion == "mrt":
                # in MRT training, src_seq and src_mask are assumed to have
                # shape [len, 1]
                batch = tgt_seq.shape[1]
                with ops.variable_scope("decoder"):
                    mapped_states = attention(None, annotation, None, None,
                                              [thdim, 2 * shdim, ahdim])
                b_src_mask = theano.tensor.repeat(src_mask, batch, 1)
                b_annotation = theano.tensor.repeat(annotation, batch, 1)
                b_mapped_states = theano.tensor.repeat(mapped_states, batch, 1)
                b_initial_state = theano.tensor.repeat(initial_state, batch, 0)

                decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                          b_initial_state, b_annotation,
                                          b_src_mask, ahdim, b_mapped_states)
            else:
                decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                          initial_state, annotation, src_mask,
                                          ahdim)

            all_output, all_context = decoder_outputs
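            # teacher forcing: shift the target inputs right by one step so
            # that position t is predicted from the embedding of word t - 1
            # (the first step sees a zero vector)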
            shift_inputs = theano.tensor.zeros_like(target_inputs)
            shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                       target_inputs[:-1])

            if criterion == "mrt":
                init_state = b_initial_state[None, :, :]
            else:
                init_state = initial_state[None, :, :]

            all_states = theano.tensor.concatenate([init_state, all_output], 0)
            prev_states = all_states[:-1]

            with ops.variable_scope("decoder"):
                probs = prediction(shift_inputs,
                                   prev_states,
                                   all_context,
                                   keep_prob=keep_prob)

            # compute cost
            idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
            ce = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
            ce = ce.reshape(tgt_seq.shape)
            ce = theano.tensor.sum(ce * tgt_mask, 0)

            if criterion == "mle":
                cost = theano.tensor.mean(ce)
            else:
                # ce is positive here
                logp = -ce
                score = sharp * logp
                # safe softmax
                score = score - theano.tensor.max(score)
                score = theano.tensor.exp(score)
                qprob = score / theano.tensor.sum(score)
                risk = theano.tensor.sum(qprob * loss)
                cost = risk

        if criterion == "mle":
            training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        else:
            training_inputs = [
                src_seq, src_mask, tgt_seq, tgt_mask, loss, sharp
            ]
        training_outputs = [cost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = theano.tensor.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            target_inputs = target_inputs + target_bias

            cell = nn.rnn_cell.gru_cell([sedim, shdim])
            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # decoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=theano.tensor.tanh)

            inputs = nn.embedding_lookup(target_embedding, prev_words)
            inputs = inputs + target_bias

            cond = theano.tensor.neq(prev_words, 0)
            # zeros out embedding if y is 0
            inputs = inputs * cond[:, None]

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            # encode -> prediction -> generation
            # prediction: prev_word + prev_state => context, next_word
            # generation: curr_word + context + prev_state => next_state
            # here, initial_state is merely a placeholder
            with ops.variable_scope("decoder"):
                # used in encoding
                mapped_states = attention(None, annotation, None, None,
                                          [thdim, 2 * shdim, ahdim])
                # used in prediction
                alpha = attention(initial_state, None, mapped_states, src_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
                probs = prediction(inputs, initial_state, context)
                # used in generation
                output, next_state = cell([inputs, context], initial_state)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [annotation, initial_state, mapped_states]
        encode = theano.function(encoding_inputs, encoding_outputs)

        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_states, src_mask
        ]
        prediction_outputs = [probs, context, alpha]
        predict = theano.function(prediction_inputs, prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        # sampling graph, this feature is optional
        with ops.variable_scope(scope, reuse=True):
            max_len = theano.tensor.iscalar()

            def sampling_loop(inputs, state, attn_states, attn_mask, m_states):
                alpha = attention(state, None, m_states, attn_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
                probs = prediction(inputs, state, context)
                next_words = ops.random.multinomial(probs).argmax(axis=1)
                new_inputs = nn.embedding_lookup(target_embedding, next_words)
                new_inputs = new_inputs + target_bias
                output, next_state = cell([new_inputs, context], state)

                return [next_words, new_inputs, next_state]

            with ops.variable_scope("decoder"):
                batch = src_seq.shape[1]
                initial_inputs = theano.tensor.zeros([batch, tedim],
                                                     dtype=dtype)

                outputs_info = [None, initial_inputs, initial_state]
                nonseq = [annotation, src_mask, mapped_states]
                outputs, updates = theano.scan(sampling_loop, [],
                                               outputs_info,
                                               nonseq,
                                               n_steps=max_len)
                sampled_words = outputs[0]

        sampling_inputs = [src_seq, src_mask, max_len]
        sampling_outputs = sampled_words
        sample = theano.function(sampling_inputs,
                                 sampling_outputs,
                                 updates=updates)

        # attention graph, this feature is optional
        with ops.variable_scope(scope, reuse=True):

            def attention_loop(inputs, mask, state, attn_states, attn_mask,
                               m_states):
                mask = mask[:, None]
                alpha = attention(state, None, m_states, attn_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
                output, next_state = cell([inputs, context], state)
                next_state = (1.0 - mask) * state + mask * next_state

                return [alpha, next_state]

            with ops.variable_scope("decoder"):
                seq = [target_inputs, tgt_mask]
                outputs_info = [None, initial_state]
                nonseq = [annotation, src_mask, mapped_states]
                (alpha, state), updates = theano.scan(attention_loop, seq,
                                                      outputs_info, nonseq)
                attention_score = alpha

        alignment_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        alignment_outputs = attention_score
        align = theano.function(alignment_inputs, alignment_outputs)

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        self.align = align
        self.sample = sample
        self.encode = encode
        self.predict = predict
        self.generate = generate
        self.option = option
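For reference, the MRT branch above converts per-candidate log-probabilities into a renormalized distribution and minimizes expected loss. The same risk computation in plain NumPy (the `logp`, `loss`, and `sharp` values are made up for illustration):

import numpy

logp = numpy.array([-2.0, -3.5, -1.0])  # log P(candidate | source)
loss = numpy.array([0.4, 0.9, 0.1])     # e.g. 1 - sentence-level BLEU
sharp = 5e-3                            # sharpness hyperparameter

score = sharp * logp
score = score - score.max()             # safe softmax, as in the graph above
qprob = numpy.exp(score) / numpy.exp(score).sum()
risk = (qprob * loss).sum()             # expected loss under q
print(risk)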