def _infer_step(self, y_prev, mask, state, keys, values, key_mask,
                embedding, embedding_bias, keep_prob):
    """Run one decoding step and build the input of the following step.

    The decoder is advanced with ``self.step``, the vocabulary is scored
    with ``self.prediction`` and, depending on ``self.softk``, the resulting
    distribution is turned into the next input embedding:

    * ``softk == 1``: embedding of the arg-max word (greedy),
    * ``softk == 0``: probability-weighted sum over all rows of
      ``embedding``,
    * ``softk > 1``: renormalised weighted sum over the ``softk`` entries
      returned by ``max_k``.

    Args:
        y_prev: input embedding fed to this step.
        mask: mask of the previous step; forwarded to ``self.step`` and its
            column 0 is combined with the end-of-sentence indicator.
        state: previous decoder state.
        keys: attention keys, forwarded to ``self.step``.
        values: attention values, forwarded to ``self.step``.
        key_mask: mask over the keys, forwarded to ``self.step``.
        embedding: embedding matrix used to build the next input.
        embedding_bias: bias added to the looked-up / mixed embeddings.
        keep_prob: forwarded to ``self.prediction``.

    Returns:
        A pair ``(outputs, condition)``.  ``outputs`` is the tuple
        ``(next_inputs, curr_mask, next_state, context, probs)`` and
        ``condition`` is a ``theano.scan_module.until`` that is true once
        ``curr_mask`` sums to zero.

    Raises:
        RuntimeError: if ``self.softk`` is not 0, 1 or greater than 1.
    """
    softk = self.softk
    # Fail fast, before any graph node is created, and say what is wrong
    # instead of raising an empty RuntimeError at the end of the branch list.
    if not (softk == 0 or softk == 1 or softk > 1):
        raise RuntimeError(
            "unsupported softk %r: expected 0 (soft over the whole "
            "vocabulary), 1 (greedy) or k > 1 (soft over the k best)"
            % (softk,))

    next_state, context = self.step(y_prev, mask, state, keys, values,
                                    key_mask)
    probs = self.prediction(y_prev[None, :, :], next_state[None, :, :],
                            context[None, :, :], keep_prob)
    pred_idx = T.argmax(probs, 1)  # (batch,)

    # An entry stays active only if it was active before this step and the
    # word predicted now is not the end-of-sentence symbol.
    iseos = T.eq(pred_idx, self.eosid)
    curr_mask = mask[:, 0] * (1.0 - iseos)

    if softk == 1:  # greedy
        next_inputs = nn.embedding_lookup(
            embedding, pred_idx) + embedding_bias  # (batch, emb_dim)
    elif softk == 0:  # soft over all
        next_inputs = T.dot(probs, embedding) + embedding_bias
    else:  # softk > 1 (validated above): soft over k-best
        # (batch, k), (batch, k)
        max_k_probs, max_k_idx = max_k(probs, softk)
        # renormalise the selected probabilities so each row sums to one
        max_k_probs = max_k_probs / T.sum(max_k_probs, 1)[:, None]
        # (batch, k, dim)
        max_k_embeddings = nn.embedding_lookup(embedding,
                                               max_k_idx) + embedding_bias
        next_inputs = T.sum(max_k_probs[:, :, None] * max_k_embeddings, 1)

    curr_mask = curr_mask[:, None]
    return (next_inputs, curr_mask, next_state, context,
            probs), theano.scan_module.until(T.eq(T.sum(curr_mask), 0.0))
        def sampling_loop(inputs, state, keys, values, key_mask):
            """Sample the next words and advance the decoder state once.

            Attention weights computed from ``state`` give a context vector;
            ``self.prediction`` maps the current inputs, the state and the
            context to word probabilities from which the next words are
            drawn.  Returns ``[next_words, new_inputs, next_state]``.
            """
            # Attend over the keys and collapse the values along axis 0.
            attn_weights = attention(state, keys, key_mask, self.dim_hid,
                                     self.dim_key)
            context = T.sum(attn_weights[:, :, None] * values, 0)

            # Draw from the predicted distribution; the argmax over axis 1
            # turns each draw into a word id (assumes the draw is one-hot
            # per row -- TODO confirm against ops.random.multinomial).
            word_probs = self.prediction(inputs, state, context)
            draws = ops.random.multinomial(word_probs)
            next_words = draws.argmax(axis=1)

            # Embed the sampled words; they are returned as the new inputs.
            sampled_embeddings = nn.embedding_lookup(target_embedding,
                                                     next_words)
            new_inputs = sampled_embeddings + target_bias

            # The recurrent update consumes the inputs this step received,
            # not the freshly sampled ones; the cell output is not needed.
            _, next_state = self.cell([inputs, context], state)

            return [next_words, new_inputs, next_state]
Exemple #3
0
        def sampling_loop(inputs, state, keys, values, key_mask):
            """Sample the next words with the two-cell ("gru1"/"gru2") decoder.

            ``self.cell1`` first produces an intermediate state from the
            inputs; that state queries the attention, and ``self.cell2``
            combines the resulting context with it to give the next state.
            Returns ``[next_words, new_inputs, next_state]``.
            """
            # First recurrent cell: only its state is used.
            _, intermediate_state = self.cell1(inputs, state, scope="gru1")

            # Attend with the intermediate state and collapse the values
            # along axis 0 into the context.
            attn_weights = attention(intermediate_state, keys, key_mask,
                                     self.dim_hid, self.dim_key)
            context = T.sum(attn_weights[:, :, None] * values, 0)

            # Second recurrent cell: context in, next state out.
            _, next_state = self.cell2(context, intermediate_state,
                                       scope="gru2")

            # p(y_j) \propto f(y_{j-1}, c_j, s_j)
            word_probs = self.prediction(inputs, next_state, context)

            # Draw words; argmax over axis 1 turns each draw into a word id
            # (assumes the draw is one-hot per row -- TODO confirm).
            draws = ops.random.multinomial(word_probs)
            next_words = draws.argmax(axis=1)

            # Embed the sampled words; they are returned as the new inputs.
            sampled_embeddings = nn.embedding_lookup(target_embedding,
                                                     next_words)
            new_inputs = sampled_embeddings + target_bias

            return [next_words, new_inputs, next_state]
Exemple #4
0
            def sampling_loop(inputs, state, attn_states, attn_mask, m_states):
                """Sample the next words and advance the decoder state once.

                The words are predicted from the current inputs, the current
                state and the attention context; the state is then updated
                with the embedding of the words that were just sampled.
                Returns ``[next_words, new_inputs, next_state]``.
                """
                # Sizes handed to the attention helper, in the order the
                # original call used them.
                attn_dims = [thdim, 2 * shdim, ahdim]
                attn_weights = attention(state, None, m_states, attn_mask,
                                         attn_dims)
                # Collapse the attended states along axis 0.
                weighted_states = attn_weights[:, :, None] * attn_states
                context = theano.tensor.sum(weighted_states, 0)

                # Draw words; argmax over axis 1 turns each draw into a word
                # id (assumes the draw is one-hot per row -- TODO confirm).
                word_probs = prediction(inputs, state, context)
                draws = ops.random.multinomial(word_probs)
                next_words = draws.argmax(axis=1)

                # Embed the sampled words.
                sampled_embeddings = nn.embedding_lookup(target_embedding,
                                                         next_words)
                new_inputs = sampled_embeddings + target_bias

                # Here the recurrent update is fed the newly sampled inputs;
                # the cell output is not needed.
                _, next_state = cell([new_inputs, context], state)

                return [next_words, new_inputs, next_state]
Exemple #5
0
    def __init__(self, **option):
        """Build the training and decoding graphs and compile the decoding
        functions.

        Keys read from ``option``:

        * ``"embdim"``: (source, target) embedding sizes.
        * ``"hidden"``: (source, target, attention) hidden sizes.
        * ``"maxhid"`` / ``"maxpart"``: maxout hidden size and maxout part.
        * ``"deephid"``: deep-output hidden size.
        * ``"vocabulary"``: (source, target) pair, each itself a
          (word-to-id, id-to-word) pair; only the id-to-word lengths are
          used here.
        * ``"scope"`` (defaults to ``"rnnsearch"``), ``"initializer"`` and
          ``"regularizer"`` (default ``None``), ``"keep_prob"`` (defaults to
          1.0).  Missing defaults are written back into ``option``.

        Attributes set: ``cost``, ``inputs``, ``outputs`` (training graph),
        ``encode``, ``predict``, ``generate`` (compiled Theano functions)
        and ``option``.
        """
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        # fill in defaults for the optional settings
        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        scope = option["scope"]
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        # a falsy keep_prob (None, 0) is treated as 1.0, i.e. no dropout
        keep_prob = option["keep_prob"] or 1.0

        def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
            """Return softmax probabilities over the ``tvsize`` target words.

            The state, inputs and context go through a maxout layer, a
            linear "deepout" layer (followed by dropout when
            ``keep_prob < 1.0``) and a linear "logits" layer.
            """
            features = [prev_state, prev_inputs, context]
            maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                               maxpart, True)
            readout = nn.linear(maxhid, [maxdim, deephid], False,
                                scope="deepout")

            if keep_prob < 1.0:
                readout = nn.dropout(readout, keep_prob=keep_prob)

            logits = nn.linear(readout, [deephid, tvsize], True,
                               scope="logits")

            # merge the first two axes of 3-D logits so that the softmax
            # always receives a matrix
            if logits.ndim == 3:
                new_shape = [logits.shape[0] * logits.shape[1], -1]
                logits = logits.reshape(new_shape)

            probs = theano.tensor.nnet.softmax(logits)

            return probs

        # training graph
        with ops.variable_scope(scope, initializer=initializer,
                                regularizer=regularizer, dtype=dtype):
            # NOTE(review): "soruce" in the two variable names below looks
            # like a misspelling of "source"; left untouched because the
            # names are runtime strings.
            src_seq = theano.tensor.imatrix("soruce_sequence")
            src_mask = theano.tensor.matrix("soruce_sequence_mask")
            tgt_seq = theano.tensor.imatrix("target_sequence")
            tgt_mask = theano.tensor.matrix("target_sequence_mask")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias
            target_inputs = target_inputs + target_bias

            # dropout on the embeddings, training graph only
            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            cell = nn.rnn_cell.gru_cell([sedim, shdim])

            # NOTE(review): dropout_wrapper is called without any keep
            # probability here and for the decoder cell below -- confirm
            # that its defaults actually apply dropout.
            if keep_prob < 1.0:
                cell = nn.rnn_cell.dropout_wrapper(cell)

            # encoder returns a pair of state sequences that are joined
            # along axis 2 to form the annotation
            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=theano.tensor.tanh)

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            if keep_prob < 1.0:
                cell = nn.rnn_cell.dropout_wrapper(cell)

            # run decoder
            decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                      initial_state, annotation, src_mask,
                                      ahdim)
            all_output, all_context = decoder_outputs

            # shift the target inputs by one position along axis 0: slot 0
            # is all zeros and slot i holds the input of position i - 1
            shift_inputs = theano.tensor.zeros_like(target_inputs)
            shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                       target_inputs[:-1])

            # state available before each step: the initial state followed
            # by every decoder output except the last
            init_state = initial_state[None, :, :]
            all_states = theano.tensor.concatenate([init_state, all_output], 0)
            prev_states = all_states[:-1]

            with ops.variable_scope("decoder"):
                probs = prediction(shift_inputs, prev_states, all_context,
                                   keep_prob=keep_prob)

            # compute cost
            # negative log-probability of each reference word (probs is
            # flattened, hence the flat index), masked, summed over axis 0
            # and averaged over the remaining axis
            idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
            cost = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
            cost = cost.reshape(tgt_seq.shape)
            cost = theano.tensor.sum(cost * tgt_mask, 0)
            cost = theano.tensor.mean(cost)

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        training_outputs = [cost]

        # decoding graph
        # the same scope is re-entered with reuse=True; the graph below is
        # rebuilt without any dropout
        with ops.variable_scope(scope, reuse=True):
            prev_words = theano.tensor.ivector("prev_words")

            # encoder, disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias

            cell = nn.rnn_cell.gru_cell([sedim, shdim])
            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # decoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=theano.tensor.tanh)

            inputs = nn.embedding_lookup(target_embedding, prev_words)
            inputs = inputs + target_bias

            cond = theano.tensor.neq(prev_words, 0)
            # zeros out embedding if y is 0
            inputs = inputs * cond[:, None]

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            # a single decoding step: attention, state update, prediction
            with ops.variable_scope("decoder"):
                mapped_states = map_attention_states(annotation, 2 * shdim,
                                                     ahdim)
                alpha = attention(initial_state, mapped_states, thdim, ahdim,
                                  src_mask)
                context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
                output, next_state = cell([inputs, context], initial_state)
                # prediction uses the state from before the cell update
                probs = prediction(inputs, initial_state, context)

        # encoding
        # encode: source batch -> annotation, initial decoder state and the
        # mapped attention states
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [annotation, initial_state, mapped_states]
        encode = theano.function(encoding_inputs, encoding_outputs)

        # predict: previous words and current state -> word probabilities,
        # context and attention weights; intermediate graph variables
        # (initial_state, annotation, mapped_states) are used as inputs
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_states, src_mask]
        prediction_outputs = [probs, context, alpha]
        predict = theano.function(prediction_inputs, prediction_outputs)

        # generate: words, state and context -> next decoder state
        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.encode = encode
        self.predict = predict
        self.generate = generate
        self.option = option
Exemple #6
0
    def __init__(self, **option):
        """Build the soft-decoder model graphs and compile its functions.

        A soft decoder is run first; its states are offered to the main
        decoder as a second attention memory next to the source annotation.
        The training cost is
        ``lambda * soft_cost + (1 - lambda) * mean(snt_cost)``.

        Keys read from ``option``:

        * ``"embdim"``: (source, target) embedding sizes.
        * ``"hidden"``: (source, target, attention) hidden sizes.
        * ``"maxhid"``, ``"maxpart"``: required, but not used by this
          constructor beyond being read.
        * ``"deephid"``: readout size handed to both decoders.
        * ``"vocabulary"``: (source, target) pair, each a
          (word-to-id, id-to-word) pair.
        * ``"decoder"``: only ``"GruCond"`` gets a ``predict`` function.
        * ``"method"``: forwarded to ``decoder2.DecoderGruCond``.
        * ``"eosid"``, ``"softk"``: forwarded to ``softdec.SoftDecoder``.
        * ``"lambda"``: interpolation weight of the soft cost.
        * ``"scope"`` (defaults to ``"rnnsearch"``), ``"initializer"`` and
          ``"regularizer"`` (default ``None``), ``"keep_prob"`` (defaults to
          1.0).  Missing defaults are written back into ``option``.

        Attributes set: ``cost``, ``inputs``, ``outputs``, ``updates``,
        ``align``, ``sample``, ``encode``, ``get_snt_cost``, ``option`` and,
        for the ``"GruCond"`` decoder, ``predict``.

        Raises:
            ValueError: if ``option["decoder"]`` is ``"GruSimple"``.
        """
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        # Reject the unsupported decoder before any graph is built or any
        # function compiled.  This used to be raised only at the very end,
        # in front of unreachable code.
        decoder_type = option["decoder"]
        if decoder_type == "GruSimple":
            raise ValueError("the 'GruSimple' decoder is not supported by "
                             "this model; use 'GruCond'")

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        # a falsy keep_prob (None, 0) is treated as 1.0, i.e. no dropout
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder2"

        encoder = Encoder(sedim, shdim)
        import decoder2
        decoder = decoder2.DecoderGruCond(2,
                                          option['method'],
                                          tedim,
                                          thdim,
                                          ahdim,
                                          2 * shdim + thdim,
                                          dim_readout=deephid,
                                          n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            byseq = T.imatrix("backward_target_sequence")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding,
                                                src_seq) + source_bias
            target_inputs = nn.embedding_lookup(target_embedding,
                                                tgt_seq) + target_bias
            by_inputs = nn.embedding_lookup(target_embedding,
                                            byseq) + target_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)
                by_inputs = nn.dropout(by_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            # NOTE(review): unlike the embeddings above, this dropout is not
            # guarded by `keep_prob < 1.0` -- confirm nn.dropout is a no-op
            # for keep_prob == 1.0.
            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            import softdec
            soft_decoder = softdec.SoftDecoder(option["eosid"],
                                               option["softk"],
                                               tedim,
                                               thdim,
                                               ahdim,
                                               2 * shdim,
                                               dim_readout=deephid,
                                               n_y_vocab=tvsize)
            # the soft decoder starts from the last forward encoder state
            with ops.variable_scope('soft_decoder'):
                initial_state = nn.feedforward(states[-1], [shdim, thdim],
                                               True,
                                               scope='initial',
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)
                soft_states, _, _, soft_mask = soft_decoder.infer(
                    mapped_keys, src_mask, annotation, initial_state,
                    target_embedding, target_bias, keep_prob)

            # same soft decoder (variables reused), run on the
            # "backward_target_sequence" input to obtain its cost
            with ops.variable_scope('soft_decoder', reuse=True):
                _, _, soft_cost, _ = soft_decoder.forward(
                    byseq, by_inputs, tgt_mask, mapped_keys, src_mask,
                    annotation, initial_state, keep_prob)

            # compute initial state for decoder
            # first state of backward encoder
            # initialize with only encoder state
            final_state = r_states[0]

            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                with ops.variable_scope('map-key-src'):
                    mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
                with ops.variable_scope('map-key-soft'):
                    mapped_keys_soft = map_key(soft_states, thdim, ahdim)

                # the main decoder attends over two memories: the source
                # annotation and the soft decoder states
                _, _, _, snt_cost = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask,
                    [mapped_keys_src, mapped_keys_soft], [src_mask, soft_mask],
                    [annotation, soft_states], initial_state, keep_prob)

            ce = snt_cost
            true_cost = T.mean(ce)
            # shared variable, so the interpolation weight can be changed
            # without rebuilding the graph
            lamb = theano.shared(numpy.asarray(option['lambda'], dtype),
                                 'lambda')
            cost = lamb * soft_cost + (1 - lamb) * true_cost

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, byseq]
        training_outputs = [cost, soft_cost, true_cost]

        # no sentence-level cost function is compiled; the attribute is
        # still set (to None) further down
        get_snt_cost = None

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            # NOTE(review): target_inputs is rebuilt here but not used by
            # the decoding graph below.
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope('soft_decoder'):
                initial_state = nn.feedforward(states[-1], [shdim, thdim],
                                               True,
                                               scope='initial',
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)
                soft_states, soft_contexts, soft_probs, soft_mask = soft_decoder.infer(
                    mapped_keys, src_mask, annotation, initial_state,
                    target_embedding, target_bias, 1.0)

            # decoder
            final_state = r_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                with ops.variable_scope('map-key-src'):
                    mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
                with ops.variable_scope('map-key-soft'):
                    mapped_keys_soft = map_key(soft_states, thdim, ahdim)

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            # a single decoding step with an all-ones mask
            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(
                    prev_inputs, mask, initial_state, *[
                        mapped_keys_src, mapped_keys_soft, annotation,
                        soft_states, src_mask, soft_mask
                    ])
                probs = decoder.prediction(prev_inputs, next_state, context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            initial_state, annotation, soft_states, mapped_keys_src,
            mapped_keys_soft, soft_mask
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        # As before, any decoder name other than "GruCond" leaves
        # self.predict unset ("GruSimple" was already rejected above).
        if decoder_type == "GruCond":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys_src,
                src_mask, soft_states, mapped_keys_soft, soft_mask
            ]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        self.align = None
        self.sample = None
        self.encode = encode

        self.get_snt_cost = get_snt_cost
        self.option = option
Exemple #7
0
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim, xposhdim, yposhdim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim, xposnn, yposnn, word2pos, pos2word, pos2pos = option[
            "hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab, tagvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab

        stag2id, ttag2id = tagvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)
        stagsize, ttagsize = len(stag2id), len(ttag2id)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        decoderType = eval("Decoder{}".format(option["decoder"]))
        decoder = decoderType(tedim,
                              thdim,
                              ahdim,
                              2 * shdim + xposhdim,
                              dim_maxout=maxdim,
                              max_part=maxpart,
                              dim_readout=deephid,
                              n_y_vocab=tvsize,
                              n_y_tagvocab=ttagsize,
                              poshdim=yposhdim,
                              posnndim=yposnn,
                              word2pos=word2pos,
                              pos2word=pos2word,
                              pos2pos=pos2pos)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            src_pos = T.imatrix("source_postag")
            tgt_pos = T.imatrix("target_postag")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            with ops.variable_scope("srctag_embedding"):
                srctag_embedding = ops.get_variable("embedding",
                                                    [stagsize, xposhdim])
                srctag_bias = ops.get_variable("bias", [xposhdim])

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias
            target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("srcpostagger"):
                tempstates = nn.feedforward(annotation, [shdim * 2, xposnn],
                                            True,
                                            scope="staggerstates",
                                            activation=T.nnet.relu)
                scores = nn.linear(tempstates, [xposnn, stagsize],
                                   True,
                                   scope="staggerscores")

                new_shape = [scores.shape[0] * scores.shape[1], -1]
                scores = scores.reshape(new_shape)
                srcposprobs = T.nnet.softmax(scores)

                srctaggerstates = T.dot(srcposprobs,
                                        srctag_embedding) + srctag_bias
                srctaggerstates = srctaggerstates.reshape(
                    [annotation.shape[0], annotation.shape[1], -1])

                idx = T.arange(src_pos.flatten().shape[0])
                ce = -T.log(srcposprobs[idx, src_pos.flatten()])
                ce = ce.reshape(src_pos.shape)
                ce = T.sum(ce * src_mask, 0)
                srcpos_cost = T.mean(ce)

            tempposkeys = T.concatenate([srctaggerstates, tempstates], -1)

            src_words_keys = map_key(annotation, 2 * shdim, ahdim,
                                     "srcwordkeys")
            src_pos_keys = map_key(tempposkeys, xposnn + xposhdim, word2pos,
                                   "srcposkeys")

            pos_words_keys = map_key(annotation, 2 * shdim, pos2word,
                                     "pos2wordkeys")
            pos_pos_keys = map_key(tempposkeys, xposnn + xposhdim, pos2pos,
                                   "pos2poskeys")

            annotation = T.concatenate([annotation, srctaggerstates], -1)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = T.concatenate([r_states[0], srctaggerstates[0]], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state,
                                               [shdim + xposhdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)

                _, _, transcost, _, tgtpos_cost = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask, src_words_keys,
                    src_pos_keys, pos_words_keys, pos_pos_keys, src_mask,
                    annotation, initial_state, tgt_pos, keep_prob)

        lambx = theano.shared(numpy.asarray(option["lambda"][0], dtype),
                              "lambdax")
        lamby = theano.shared(numpy.asarray(option["lambda"][1], dtype),
                              "lambday")

        totalcost = transcost + lambx * srcpos_cost + lamby * tgtpos_cost
        training_inputs = [
            src_seq, src_mask, tgt_seq, tgt_mask, src_pos, tgt_pos
        ]
        training_outputs = [srcpos_cost, tgtpos_cost, transcost, totalcost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("srcpostagger"):
                tempstates = nn.feedforward(annotation, [shdim * 2, xposnn],
                                            True,
                                            scope="staggerstates",
                                            activation=T.nnet.relu)
                scores = nn.linear(tempstates, [xposnn, stagsize],
                                   True,
                                   scope="staggerscores")

                new_shape = [scores.shape[0] * scores.shape[1], -1]
                scores = scores.reshape(new_shape)
                srcposprobs = T.nnet.softmax(scores)

                srctaggerstates = T.dot(srcposprobs,
                                        srctag_embedding) + srctag_bias
                srctaggerstates = srctaggerstates.reshape(
                    [annotation.shape[0], annotation.shape[1], -1])

            tempposkeys = T.concatenate([srctaggerstates, tempstates], -1)

            src_words_keys = map_key(annotation, 2 * shdim, ahdim,
                                     "srcwordkeys")
            src_pos_keys = map_key(tempposkeys, xposnn + xposhdim, word2pos,
                                   "srcposkeys")

            pos_words_keys = map_key(annotation, 2 * shdim, pos2word,
                                     "pos2wordkeys")
            pos_pos_keys = map_key(tempposkeys, xposnn + xposhdim, pos2pos,
                                   "pos2poskeys")

            annotation = T.concatenate([annotation, srctaggerstates], -1)

            # decoder
            final_state = T.concatenate([r_states[0], srctaggerstates[0]], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state,
                                               [shdim + xposhdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context, next_pos, tgtposprob = decoder.step(
                    prev_inputs, mask, initial_state, src_words_keys,
                    src_pos_keys, pos_words_keys, pos_pos_keys, annotation,
                    src_mask)
                if option["decoder"] == "GruSimple":
                    probs = decoder.prediction(prev_inputs, initial_state,
                                               context)
                elif option["decoder"] == "GruCond":
                    probs = decoder.prediction(prev_inputs, next_state,
                                               context, next_pos)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            annotation, initial_state, src_words_keys, src_pos_keys,
            pos_words_keys, pos_pos_keys, srcposprobs
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask
            ]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            prediction_inputs = [
                prev_words, initial_state, annotation, src_words_keys,
                src_pos_keys, pos_words_keys, pos_pos_keys, src_mask
            ]
            prediction_outputs = [probs, next_state, tgtposprob]
            predict = theano.function(prediction_inputs,
                                      prediction_outputs,
                                      on_unused_input='warn')
            self.predict = predict

        self.cost = totalcost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        self.encode = encode
        self.option = option
Exemple #8
0
    def __init__(self, **option):
        """Build and compile the Theano graphs of an attentional seq2seq model.

        Constructs the training graph (cost), then re-uses the same variables
        to compile the encoding, prediction, generation, sampling and
        alignment functions, and stores all of them on the instance.

        Keys read from ``option``:
            embdim: pair (source, target) of embedding dims.
            hidden: triple (source, target, attention) of hidden dims.
            maxhid: maxout hidden dim.
            maxpart: maxout part.
            deephid: deepout hidden dim.
            vocabulary: ``((src_w2id, src_id2w), (tgt_w2id, tgt_id2w))``; only
                the lengths of the id->word tables are used here.
            scope: variable scope name, defaults to "rnnsearch".
            initializer, regularizer: forwarded to ``ops.variable_scope``,
                default to None.
            criterion: "mle" (default) or "mrt".
            keep_prob: dropout keep probability, defaults to 1.0; a falsy
                value is treated as 1.0 and MRT always uses 1.0.

        Missing optional keys are written back into ``option`` with their
        defaults, and ``option`` is stored as ``self.option``.

        Attributes set: ``cost``, ``inputs``, ``outputs``, ``updates``,
        ``align``, ``sample``, ``encode``, ``predict``, ``generate``,
        ``option``.

        Raises:
            ValueError: if ``criterion`` is neither "mle" nor "mrt".
        """
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        # only the id->word tables are needed (for the vocabulary sizes)
        _, sid2w = svocab
        _, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        # fill in defaults for optional settings
        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        option.setdefault("initializer", None)
        option.setdefault("regularizer", None)
        option.setdefault("criterion", "mle")
        option.setdefault("keep_prob", 1.0)

        dtype = theano.config.floatX
        scope = option["scope"]
        criterion = option["criterion"]
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        # Fail early: the graph below only defines the extra MRT inputs
        # (loss, sharp) for "mrt", so any other non-"mle" value would
        # otherwise surface later as a NameError.
        if criterion not in ("mle", "mrt"):
            raise ValueError(
                "unknown criterion: {!r} (expected 'mle' or 'mrt')".format(
                    criterion))

        # MRT mode do not use dropout
        if criterion == "mrt":
            keep_prob = 1.0

        def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
            """Return softmax probabilities over the target vocabulary.

            The three features go through maxout -> "deepout" linear ->
            optional dropout -> "logits" linear -> softmax.  A 3-d logits
            tensor is flattened to 2-d (first two axes merged) before the
            softmax.
            """
            features = [prev_state, prev_inputs, context]
            maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                               maxpart, True)
            readout = nn.linear(maxhid, [maxdim, deephid],
                                False,
                                scope="deepout")

            if keep_prob < 1.0:
                readout = nn.dropout(readout, keep_prob=keep_prob)

            logits = nn.linear(readout, [deephid, tvsize],
                               True,
                               scope="logits")

            if logits.ndim == 3:
                new_shape = [logits.shape[0] * logits.shape[1], -1]
                logits = logits.reshape(new_shape)

            probs = theano.tensor.nnet.softmax(logits)

            return probs

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = theano.tensor.imatrix("soruce_sequence")
            src_mask = theano.tensor.matrix("soruce_sequence_mask")
            tgt_seq = theano.tensor.imatrix("target_sequence")
            tgt_mask = theano.tensor.matrix("target_sequence_mask")

            if criterion == "mrt":
                # extra symbolic inputs used only by the MRT cost
                loss = theano.tensor.vector("loss_score")
                sharp = theano.tensor.scalar("sharpness")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias
            target_inputs = target_inputs + target_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            cell = nn.rnn_cell.gru_cell([sedim, shdim])

            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # Guarded like every other dropout call in this graph, so that
            # nn.dropout is not invoked at all when dropout is disabled
            # (keep_prob == 1.0, which is always the case for MRT).
            if keep_prob < 1.0:
                annotation = nn.dropout(annotation, keep_prob=keep_prob)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=theano.tensor.tanh)

            # run decoder
            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            if criterion == "mrt":
                # In MRT training, shape of src_seq and src_mask are assumed
                # to have [len, 1]
                batch = tgt_seq.shape[1]
                with ops.variable_scope("decoder"):
                    mapped_states = attention(None, annotation, None, None,
                                              [thdim, 2 * shdim, ahdim])
                # replicate the single source sentence once per candidate
                # target so that it lines up with the target batch
                b_src_mask = theano.tensor.repeat(src_mask, batch, 1)
                b_annotation = theano.tensor.repeat(annotation, batch, 1)
                b_mapped_states = theano.tensor.repeat(mapped_states, batch, 1)
                b_initial_state = theano.tensor.repeat(initial_state, batch, 0)

                decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                          b_initial_state, b_annotation,
                                          b_src_mask, ahdim, b_mapped_states)
            else:
                decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                          initial_state, annotation, src_mask,
                                          ahdim)

            all_output, all_context = decoder_outputs
            # shift target embeddings one step along axis 0; the first step
            # sees an all-zero input
            shift_inputs = theano.tensor.zeros_like(target_inputs)
            shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                       target_inputs[:-1])

            if criterion == "mrt":
                init_state = b_initial_state[None, :, :]
            else:
                init_state = initial_state[None, :, :]

            # state available *before* each step: initial state followed by
            # all decoder outputs except the last one
            all_states = theano.tensor.concatenate([init_state, all_output], 0)
            prev_states = all_states[:-1]

            with ops.variable_scope("decoder"):
                probs = prediction(shift_inputs,
                                   prev_states,
                                   all_context,
                                   keep_prob=keep_prob)

            # compute cost
            # probs is 2-d (flattened by prediction), so pick the probability
            # of each reference word by (row, word id)
            idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
            ce = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
            ce = ce.reshape(tgt_seq.shape)
            ce = theano.tensor.sum(ce * tgt_mask, 0)

            if criterion == "mle":
                cost = theano.tensor.mean(ce)
            else:
                # criterion == "mrt" (validated above)
                # ce is positive here
                logp = -ce
                score = sharp * logp
                # safe softmax
                score = score - theano.tensor.max(score)
                score = theano.tensor.exp(score)
                qprob = score / theano.tensor.sum(score)
                risk = theano.tensor.sum(qprob * loss)
                cost = risk

        if criterion == "mle":
            training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        else:
            training_inputs = [
                src_seq, src_mask, tgt_seq, tgt_mask, loss, sharp
            ]
        training_outputs = [cost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = theano.tensor.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            target_inputs = target_inputs + target_bias

            cell = nn.rnn_cell.gru_cell([sedim, shdim])
            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # decoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=theano.tensor.tanh)

            inputs = nn.embedding_lookup(target_embedding, prev_words)
            inputs = inputs + target_bias

            cond = theano.tensor.neq(prev_words, 0)
            # zeros out embedding if y is 0
            inputs = inputs * cond[:, None]

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            # encode -> prediction -> generation
            # prediction: prev_word + prev_state => context, next_word
            # generation: curr_word + context + prev_state => next_state
            # here, initial_state is merely a placeholder
            with ops.variable_scope("decoder"):
                # used in encoding
                mapped_states = attention(None, annotation, None, None,
                                          [thdim, 2 * shdim, ahdim])
                # used in prediction
                alpha = attention(initial_state, None, mapped_states, src_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
                probs = prediction(inputs, initial_state, context)
                # used in generation
                output, next_state = cell([inputs, context], initial_state)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [annotation, initial_state, mapped_states]
        encode = theano.function(encoding_inputs, encoding_outputs)

        # prediction: intermediate variables of the decoding graph
        # (initial_state, annotation, mapped_states) are fed in as inputs
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_states, src_mask
        ]
        prediction_outputs = [probs, context, alpha]
        predict = theano.function(prediction_inputs, prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        # sampling graph, this feature is optional
        with ops.variable_scope(scope, reuse=True):
            max_len = theano.tensor.iscalar()

            def sampling_loop(inputs, state, attn_states, attn_mask, m_states):
                """One scan step: sample a word, then advance the state."""
                alpha = attention(state, None, m_states, attn_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
                probs = prediction(inputs, state, context)
                next_words = ops.random.multinomial(probs).argmax(axis=1)
                new_inputs = nn.embedding_lookup(target_embedding, next_words)
                new_inputs = new_inputs + target_bias
                # the freshly sampled word (not the previous one) drives the
                # state update, matching predict -> generate above
                output, next_state = cell([new_inputs, context], state)

                return [next_words, new_inputs, next_state]

            with ops.variable_scope("decoder"):
                batch = src_seq.shape[1]
                initial_inputs = theano.tensor.zeros([batch, tedim],
                                                     dtype=dtype)

                outputs_info = [None, initial_inputs, initial_state]
                nonseq = [annotation, src_mask, mapped_states]
                outputs, updates = theano.scan(sampling_loop, [],
                                               outputs_info,
                                               nonseq,
                                               n_steps=max_len)
                sampled_words = outputs[0]

        sampling_inputs = [src_seq, src_mask, max_len]
        sampling_outputs = sampled_words
        # the scan updates are passed on to the compiled function
        sample = theano.function(sampling_inputs,
                                 sampling_outputs,
                                 updates=updates)

        # attention graph, this feature is optional
        with ops.variable_scope(scope, reuse=True):

            def attention_loop(inputs, mask, state, attn_states, attn_mask,
                               m_states):
                """One scan step: attention weights and the masked new state."""
                mask = mask[:, None]
                alpha = attention(state, None, m_states, attn_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
                output, next_state = cell([inputs, context], state)
                # keep the old state wherever the target mask is 0
                next_state = (1.0 - mask) * state + mask * next_state

                return [alpha, next_state]

            with ops.variable_scope("decoder"):
                seq = [target_inputs, tgt_mask]
                outputs_info = [None, initial_state]
                nonseq = [annotation, src_mask, mapped_states]
                # only the attention weights are needed; the per-step states
                # and the scan updates are discarded
                (attention_score, _), _ = theano.scan(attention_loop, seq,
                                                      outputs_info, nonseq)

        alignment_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        alignment_outputs = attention_score
        align = theano.function(alignment_inputs, alignment_outputs)

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        self.align = align
        self.sample = sample
        self.encode = encode
        self.predict = predict
        self.generate = generate
        self.option = option
Exemple #9
0
    def __init__(self, **option):
        """Build the training and decoding graphs around Encoder/Decoder objects.

        Keys read from ``option``:
            embdim: pair (source, target) of embedding dims.
            hidden: triple (source, target, attention) of hidden dims.
            maxhid: maxout hidden dim.
            maxpart: maxout part.
            deephid: deepout hidden dim.
            vocabulary: ``((src_w2id, src_id2w), (tgt_w2id, tgt_id2w))``; only
                the lengths of the id->word tables are used here.
            decoder: suffix of the decoder class name; the class named
                ``"Decoder" + option["decoder"]`` is instantiated.
            scope: variable scope name, defaults to "rnnsearch".
            initializer, regularizer: forwarded to ``ops.variable_scope``,
                default to None.
            keep_prob: dropout keep probability, defaults to 1.0; a falsy
                value is treated as 1.0.

        Missing optional keys are written back into ``option`` with their
        defaults, and ``option`` is stored as ``self.option``.

        Attributes set: ``cost``, ``inputs``, ``outputs``, ``updates``,
        ``encode``, ``option``; ``predict`` only when ``option["decoder"]``
        is "GruSimple" or "GruCond"; ``generate`` only for "GruSimple".
        """
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        # fill in defaults for optional settings (written back into option)
        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        # a falsy keep_prob (None, 0) means "no dropout"
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        # NOTE(review): eval() executes whatever string option["decoder"]
        # holds; this is only safe if that value never comes from untrusted
        # input. An explicit name -> class mapping would avoid the risk.
        decoderType = eval("Decoder{}".format(option["decoder"]))
        decoder = decoderType(tedim, thdim, ahdim, 2 * shdim, dim_maxout=maxdim, max_part=maxpart, dim_readout=deephid,
                              n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope, initializer=initializer,
                                regularizer=regularizer, dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding") as tgtembscope:
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                # target_bias = ops.get_variable("bias", [tedim])
                # hand the target embedding scope to the decoder
                # (presumably so it can tie its output weights to the
                # embedding -- confirm in the Decoder classes)
                decoder.tiescope = tgtembscope

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            # only the source side has an embedding bias in this variant
            source_inputs = source_inputs + source_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = r_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

                # only the third value returned by decoder.forward (the
                # training cost) is used here
                _, _, cost,_  = decoder.forward(tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                                                    annotation, initial_state, keep_prob)


        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        training_outputs = [cost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            # NOTE(review): target_inputs is rebuilt here but is only
            # referenced by the disabled "optional graph" further down
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            # target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            # decoder
            final_state = r_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            # prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                # all-ones mask handed to decoder.step for this single step
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(prev_inputs, mask, initial_state, mapped_keys, annotation, src_mask)
                # GruSimple predicts from the state before the step,
                # GruCond from the state after it
                if option["decoder"] == "GruSimple":
                    probs = decoder.prediction(prev_inputs, initial_state, context)
                elif option["decoder"] == "GruCond":
                    probs = decoder.prediction(prev_inputs, next_state, context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [annotation, initial_state, mapped_keys]
        encode = theano.function(encoding_inputs, encoding_outputs)

        # NOTE(review): for any other decoder name neither self.predict nor
        # self.generate is set, and no error is raised here
        if option["decoder"] == "GruSimple":
            # two-stage interface: predict gives (probs, context), then
            # generate turns (word, state, context) into the next state
            prediction_inputs = [prev_words, initial_state, annotation,
                                 mapped_keys, src_mask]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            # single-stage interface: predict returns probs and next state
            prediction_inputs = [prev_words, initial_state, annotation,
                                 mapped_keys, src_mask]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict

        # optional graph
        # (disabled: the block below is a bare string literal, so the
        # sampling, alignment and sentence-cost functions are never built)
        '''
        with ops.variable_scope(scope, reuse=True):
            sample = decoder.build_sampling(src_seq, src_mask, target_embedding, target_bias, mapped_keys,
                                            annotation, initial_state)
            align = decoder.build_attention(src_seq, src_mask, target_inputs, tgt_seq, tgt_mask, mapped_keys,
                                            annotation, initial_state)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

                _, _, _,snt_cost  = decoder.forward(tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                                                    annotation, initial_state, 1.0)
            get_snt_cost = theano.function(training_inputs, snt_cost)
        '''
        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        # self.align = align
        # self.sample = sample
        self.encode = encode

        # self.get_snt_cost = get_snt_cost
        self.option = option
Exemple #10
0
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim, domaindim, feadim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)
        dnum = option['dnum']

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        decoderType = eval("Decoder{}".format(option["decoder"]))
        decoder = decoderType(tedim,
                              thdim,
                              ahdim,
                              2 * shdim,
                              dnum=dnum,
                              dim_maxout=maxdim,
                              max_part=maxpart,
                              dim_readout=deephid,
                              dim_domain=domaindim,
                              feadim=feadim,
                              n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            tag_seq = T.imatrix("domain_tag")
            # nsrc_mask = T.set_subtensor(src_mask[T.cast(T.sum(src_mask, 0) - 1, 'int32'),
            #                                      T.arange(src_mask.shape[1])], 0.0)

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding") as tgtembscope:
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                # target_bias = ops.get_variable("bias", [tedim])
                decoder.tiescope = tgtembscope

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("Specific"):
                domain_alpha = domain_sensitive_attention(
                    annotation, src_mask, shdim * 2, domaindim)
                # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
                #                          shdim,
                #                          shdim * 2)
                domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                       0)
                dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                          True,
                                          activation=T.tanh,
                                          scope="feature1")

                dscores = nn.feedforward(dfeature, [feadim, dnum],
                                         True,
                                         activation=T.tanh,
                                         scope="score")
                # (batch, 2)
                dprobs = T.nnet.softmax(dscores)
                dpred_tag = T.argmax(dprobs, 1)
                didx = T.arange(tag_seq.flatten().shape[0])
                dce = -T.log(dprobs[didx, tag_seq.flatten()])
                dcost = T.mean(dce)

            share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                     shdim * 2, domaindim)
            # share_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                         shdim,
            #                         shdim * 2)
            share_context = T.sum(annotation * share_alpha[:, :, None], 0)
            sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                      True,
                                      activation=T.tanh,
                                      scope="feature1")

            with ops.variable_scope("Shared"):
                sscores = nn.feedforward(sfeature, [feadim, dnum],
                                         True,
                                         activation=T.tanh,
                                         scope="score")
                # (batch, 2)
                sprobs = T.nnet.softmax(sscores)
                spred_tag = T.argmax(sprobs, 1)
                sidx = T.arange(tag_seq.flatten().shape[0])
                sce = -T.log(sprobs[sidx, tag_seq.flatten()])
                scost = T.mean(sce)
                adv_sce = -sprobs[sidx, tag_seq.flatten()] * T.log(
                    sprobs[sidx, tag_seq.flatten()])
                adv_scost = T.mean(adv_sce)

            domain_gate = nn.feedforward([dfeature, annotation],
                                         [[feadim, shdim * 2], shdim * 2],
                                         True,
                                         scope="domain_gate")
            domain_annotation = annotation * domain_gate
            domain_annotation = nn.dropout(domain_annotation,
                                           keep_prob=keep_prob)
            share_gate = nn.feedforward([sfeature, annotation],
                                        [[feadim, shdim * 2], shdim * 2],
                                        True,
                                        scope="share_gate")
            annotation = annotation * share_gate
            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            # compute initial state for decoder
            # first state of backward encoder
            # batch * shdim
            final_state = T.concatenate([
                annotation[0, :, annotation.shape[-1] / 2:],
                domain_annotation[0, :, annotation.shape[-1] / 2:]
            ], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
                mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                             ahdim, "domain")

                _, _, cost, tgtdcost, tpred_tag, _ = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                    annotation, initial_state, mapped_domain_keys,
                    domain_annotation, tag_seq, keep_prob)

        lamb = theano.shared(numpy.asarray(option["lambda"], dtype), "lambda")
        # cwscost *= lamb
        final_cost = cost + dcost + tgtdcost - lamb * adv_scost

        tag_inputs = [src_seq, src_mask]
        tag_outputs = [dpred_tag, spred_tag]
        tag_predict = theano.function(tag_inputs, tag_outputs)
        self.tag_predict = tag_predict

        tgt_tag_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        tgt_tag_outputs = [tpred_tag]
        tgt_tag_predict = theano.function(tgt_tag_inputs, tgt_tag_outputs)
        self.tgt_tag_predict = tgt_tag_predict

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, tag_seq]
        training_outputs = [cost, dcost, adv_scost, tgtdcost]

        self.cost_cla = scost
        self.inputs_cla = [src_seq, src_mask, tag_seq]
        self.outputs_cla = [scost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("Specific"):
                domain_alpha = domain_sensitive_attention(
                    annotation, src_mask, shdim * 2, domaindim)
                # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
                #                          shdim,
                #                          shdim * 2)
                domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                       0)
                dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                          True,
                                          activation=T.tanh,
                                          scope="feature1")

            share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                     shdim * 2, domaindim)
            # share_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                         shdim,
            #                         shdim * 2)
            share_context = T.sum(annotation * share_alpha[:, :, None], 0)
            sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                      True,
                                      activation=T.tanh,
                                      scope="feature1")

            domain_gate = nn.feedforward([dfeature, annotation],
                                         [[feadim, shdim * 2], shdim * 2],
                                         True,
                                         scope="domain_gate")
            domain_annotation = annotation * domain_gate
            share_gate = nn.feedforward([sfeature, annotation],
                                        [[feadim, shdim * 2], shdim * 2],
                                        True,
                                        scope="share_gate")
            annotation = annotation * share_gate

            # decoder
            final_state = T.concatenate([
                annotation[0, :, annotation.shape[-1] / 2:],
                domain_annotation[0, :, annotation.shape[-1] / 2:]
            ], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
                mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                             ahdim, "domain")

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            # prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(prev_inputs, mask,
                                                   initial_state, mapped_keys,
                                                   annotation, src_mask,
                                                   mapped_domain_keys,
                                                   domain_annotation)
                if option["decoder"] == "GruSimple":
                    probs = decoder.prediction(prev_inputs, initial_state,
                                               context)
                elif option["decoder"] == "GruCond":
                    probs = decoder.prediction(prev_inputs, next_state,
                                               context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            annotation, initial_state, mapped_keys, mapped_domain_keys,
            domain_annotation
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask
            ]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask,
                mapped_domain_keys, domain_annotation
            ]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict

        self.cost = final_cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        # self.align = align
        # self.sample = sample
        self.encode = encode
        # self.get_snt_cost = get_snt_cost
        self.option = option