Beispiel #1
0
    def network(self, input, keep_prob=0.5, reuse=None):
        """Build the VGG-style classification graph and return the fc8 output.

        Thirteen conv layers in five stages (each stage followed by a max
        pool) feed three fully connected layers.  ``conv1_*`` and ``conv2_*``
        are created with ``trainable=False``; later layers use the default.

        :param input: input tensor.  It is shifted by a constant and
            concatenated with itself three times along axis 3, so presumably
            a single-channel NHWC batch -- TODO confirm against the caller.
        :param keep_prob: value forwarded to ``nn.dropout`` after fc6 and fc7.
        :param reuse: forwarded to ``tf.variable_scope('network', ...)``.
        :return: output of ``fc8`` (``self.label_dim`` features, no ReLU).
        """
        with tf.variable_scope('network', reuse=reuse):

            def pool_(x):
                return nn.max_pool(x, 2, 2)

            def max_out_(x):
                return nn.max_out(x, 16)

            def conv_(x, output_depth, name, trainable=True):
                # The positional 3 and 1 are presumably filter size and
                # stride -- TODO confirm against nn.conv.
                return nn.conv(
                    x,
                    3,
                    output_depth,
                    1,
                    self.weight_decay,
                    name=name,
                    trainable=trainable)

            def fc_(x, features, name, relu=True):
                return nn.fc(x, features, self.weight_decay, name, relu=relu)

            # Subtract the constant 24 and stack three copies of the result
            # along axis 3.  No RGB->BGR channel reordering happens here.
            input = tf.concat([
                input - 24,
                input - 24,
                input - 24,
            ], axis=3)

            # Stage 1 and 2: created as non-trainable.
            conv_1_1 = conv_(input, 64, 'conv1_1', trainable=False)
            conv_1_2 = conv_(conv_1_1, 64, 'conv1_2', trainable=False)
            pool_1 = pool_(conv_1_2)

            conv_2_1 = conv_(pool_1, 128, 'conv2_1', trainable=False)
            conv_2_2 = conv_(conv_2_1, 128, 'conv2_2', trainable=False)
            pool_2 = pool_(conv_2_2)

            # Stage 3 to 5: trainable (the conv_ default).
            conv_3_1 = conv_(pool_2, 256, 'conv3_1')
            conv_3_2 = conv_(conv_3_1, 256, 'conv3_2')
            conv_3_3 = conv_(conv_3_2, 256, 'conv3_3')
            pool_3 = pool_(conv_3_3)

            conv_4_1 = conv_(pool_3, 512, 'conv4_1')
            conv_4_2 = conv_(conv_4_1, 512, 'conv4_2')
            conv_4_3 = conv_(conv_4_2, 512, 'conv4_3')
            pool_4 = pool_(conv_4_3)

            conv_5_1 = conv_(pool_4, 512, 'conv5_1')
            conv_5_2 = conv_(conv_5_1, 512, 'conv5_2')
            conv_5_3 = conv_(conv_5_2, 512, 'conv5_3')
            pool_5 = pool_(conv_5_3)

            # Optional max-out step before flattening.
            if self.maxout:
                max_5 = max_out_(pool_5)
                flattened = tf.contrib.layers.flatten(max_5)
            else:
                flattened = tf.contrib.layers.flatten(pool_5)

            fc_6 = nn.dropout(fc_(flattened, 4096, 'fc6'), keep_prob)
            fc_7 = nn.dropout(fc_(fc_6, 4096, 'fc7'), keep_prob)
            fc_8 = fc_(fc_7, self.label_dim, 'fc8', relu=False)
            return fc_8
Beispiel #2
0
    def network(self, input, keep_prob=0.5, reuse=None):
        """Build the VGG-style classification graph and return the fc8 output.

        Thirteen conv layers in five stages (each stage followed by a max
        pool) feed three fully connected layers.  The max-out variant is not
        supported here: the last pooling output is flattened directly.

        :param input: input tensor.  ``self.config.mean`` is subtracted and
            three copies are concatenated along axis 3, so presumably a
            single-channel NHWC batch -- TODO confirm against the caller.
        :param keep_prob: value forwarded to ``nn.dropout`` after fc6 and fc7.
        :param reuse: forwarded to ``tf.variable_scope("network", ...)``.
        :return: output of ``fc8`` (``config.label_dim`` features, no ReLU).
        """
        with tf.variable_scope("network", reuse=reuse):
            config = self.config

            def pool_(x):
                return nn.max_pool(x, 2, 2)

            def conv_(x, output_depth, name, stride=1, padding="SAME",
                      relu=True, filter_size=3):
                return conv(
                    x,
                    filter_size,
                    output_depth,
                    stride,
                    name=name,
                    padding=padding,
                    relu=relu,
                )

            def fc_(x, features, name, relu=True):
                return fc(x, features, name, relu=relu)

            # The same mean is used for all three stacked copies.
            mean = config.mean
            input = tf.concat([input - mean, input - mean, input - mean],
                              axis=3)

            conv_1_1 = conv_(input, 64, "conv1_1")
            conv_1_2 = conv_(conv_1_1, 64, "conv1_2")
            pool_1 = pool_(conv_1_2)

            conv_2_1 = conv_(pool_1, 128, "conv2_1")
            conv_2_2 = conv_(conv_2_1, 128, "conv2_2")
            pool_2 = pool_(conv_2_2)

            conv_3_1 = conv_(pool_2, 256, "conv3_1")
            conv_3_2 = conv_(conv_3_1, 256, "conv3_2")
            conv_3_3 = conv_(conv_3_2, 256, "conv3_3")
            pool_3 = pool_(conv_3_3)

            conv_4_1 = conv_(pool_3, 512, "conv4_1")
            conv_4_2 = conv_(conv_4_1, 512, "conv4_2")
            conv_4_3 = conv_(conv_4_2, 512, "conv4_3")
            pool_4 = pool_(conv_4_3)

            conv_5_1 = conv_(pool_4, 512, "conv5_1")
            conv_5_2 = conv_(conv_5_1, 512, "conv5_2")
            conv_5_3 = conv_(conv_5_2, 512, "conv5_3")
            pool_5 = pool_(conv_5_3)

            # No max-out branch: flatten the final pooling output directly.
            flattened = tf.contrib.layers.flatten(pool_5)

            fc_6 = nn.dropout(fc_(flattened, 4096, "fc6"), keep_prob)
            fc_7 = nn.dropout(fc_(fc_6, 4096, "fc7"), keep_prob)
            fc_8 = fc_(fc_7, config.label_dim, "fc8", relu=False)
            return fc_8
Beispiel #3
0
    def prediction(self, y_emb, state, context, y_pos, keep_prob=1.0):
        r"""Compute output probabilities: enhanced state -> readout -> softmax.

        p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})

        The decoder state is first combined with ``y_pos`` through a tanh
        feed-forward layer ("enhancedstate"); the result, together with
        ``y_emb``, ``context`` and ``y_pos``, feeds the tanh "readout" layer.

        :param y_emb: feature declared to the readout layer with size
            ``self.dim_y``.
        :param state: feature declared with size ``self.dim_hid``.
        :param context: feature declared with size ``self.dim_value``.
        :param y_pos: feature declared with size ``self.poshdim``.
        :param keep_prob: dropout is applied to the readout only when this is
            below 1.0.
        :return: softmax probabilities over ``self.n_y_vocab`` entries; a 3-D
            logits tensor is flattened to 2-D (first two axes merged) first.
        """
        state = nn.feedforward(
            [state, y_pos],
            [[self.dim_hid, self.poshdim], self.dim_hid],
            True,
            activation=T.tanh,
            scope="enhancedstate")

        features = [state, y_emb, context, y_pos]
        readout = nn.feedforward(
            features,
            [[self.dim_hid, self.dim_y, self.dim_value, self.poshdim],
             self.dim_readout],
            True,
            activation=T.tanh,
            scope="readout")

        if keep_prob < 1.0:
            readout = nn.dropout(readout, keep_prob=keep_prob)

        logits = nn.linear(readout, [self.dim_readout, self.n_y_vocab], True,
                           scope="logits")

        # T.nnet.softmax is applied to a matrix, so merge the two leading
        # axes of a 3-D logits tensor.
        if logits.ndim == 3:
            new_shape = [logits.shape[0] * logits.shape[1], -1]
            logits = logits.reshape(new_shape)

        probs = T.nnet.softmax(logits)
        return probs
Beispiel #4
0
    def prediction(self, y_emb, state, context, keep_prob=1.0):
        r"""Compute output probabilities: readout -> tied projection -> softmax.

        p(y_j) \propto f(y_{j-1}, s_{j}, c_{j})

        The logits are the dot product of the readout with the transpose of
        the ``"embedding"`` variable fetched (with ``reuse=True``) from the
        ``self.tiescope`` scope, rather than a separate linear "logits"
        layer; no bias is added.

        :param y_emb: feature declared to the readout layer with size
            ``self.dim_y``.
        :param state: feature declared with size ``self.dim_hid``.
        :param context: feature declared with size ``self.dim_value``.
        :param keep_prob: dropout is applied to the readout only when this is
            below 1.0.
        :return: softmax probabilities over ``self.n_y_vocab`` entries; a 3-D
            logits tensor is flattened to 2-D (first two axes merged) first.
        """
        features = [state, y_emb, context]
        readout = nn.feedforward(
            features,
            [[self.dim_hid, self.dim_y, self.dim_value], self.dim_readout],
            True,
            activation=T.tanh,
            scope="readout")

        if keep_prob < 1.0:
            readout = nn.dropout(readout, keep_prob=keep_prob)

        # Output weights are tied to an existing embedding matrix of shape
        # [n_y_vocab, dim_readout]; it is transposed to project the readout.
        with ops.variable_scope(self.tiescope, reuse=True):
            target_embedding = ops.get_variable(
                "embedding", [self.n_y_vocab, self.dim_readout])
            target_embedding = target_embedding.T
            logits = T.dot(readout, target_embedding)

        # T.nnet.softmax is applied to a matrix, so merge the two leading
        # axes of a 3-D logits tensor.
        if logits.ndim == 3:
            new_shape = [logits.shape[0] * logits.shape[1], -1]
            logits = logits.reshape(new_shape)

        probs = T.nnet.softmax(logits)
        return probs
Beispiel #5
0
    def prediction(self, y_emb, state, context, keep_prob=1.0):
        r"""Compute output probabilities: maxout -> readout -> softmax.

        p(y_j) \propto f(y_{j-1}, s_{j-1}, c_{j})

        :param y_emb: feature declared to the maxout layer with size
            ``self.dim_y``.
        :param state: feature declared with size ``self.dim_hid``.
        :param context: feature declared with size ``self.dim_value``.
        :param keep_prob: dropout is applied to the readout only when this is
            below 1.0.
        :return: softmax probabilities over ``self.n_y_vocab`` entries; a 3-D
            logits tensor is flattened to 2-D (first two axes merged) first.
        """
        features = [state, y_emb, context]
        maxhid = nn.maxout(
            features,
            [[self.dim_hid, self.dim_y, self.dim_value], self.dim_maxout],
            self.max_part, True)
        readout = nn.linear(maxhid, [self.dim_maxout, self.dim_readout],
                            False,
                            scope="readout")

        if keep_prob < 1.0:
            readout = nn.dropout(readout, keep_prob=keep_prob)

        logits = nn.linear(readout, [self.dim_readout, self.n_y_vocab],
                           True,
                           scope="logits")

        # T.nnet.softmax is applied to a matrix, so merge the two leading
        # axes of a 3-D logits tensor.
        if logits.ndim == 3:
            new_shape = [logits.shape[0] * logits.shape[1], -1]
            logits = logits.reshape(new_shape)

        probs = T.nnet.softmax(logits)

        return probs
def F(inputs, d, activation=tf.nn.relu, kernel_initializer=None, scope=None,
      use_bias=True, input_keep_prob=1.0, wd=0.0, is_train=None):
    """Apply input dropout followed by a dense projection to ``d`` units.

    Args:
        inputs: tensor passed to ``dropout`` and then ``tf.layers.dense``.
        d: number of output units of the dense layer.
        activation: activation handed to ``tf.layers.dense``.
        kernel_initializer: initializer handed to ``tf.layers.dense``.
        scope: variable scope name; falls back to ``"projection"`` when falsy.
        use_bias: whether the dense layer adds a bias.
        input_keep_prob: keep probability handed to ``dropout``.
        wd: when truthy, ``add_wd(wd)`` is called inside the scope.
        is_train: training flag handed to ``dropout``.

    Returns:
        The output of the dense layer.
    """
    dropped = dropout(inputs, input_keep_prob, is_train)
    with tf.variable_scope(scope or "projection"):
        projected = tf.layers.dense(
            dropped,
            d,
            activation=activation,
            use_bias=use_bias,
            kernel_initializer=kernel_initializer,
        )
        if wd:
            add_wd(wd)
    return projected
Beispiel #7
0
 def siamese(self, input1, input2, keep_prob = 0.5,
             weight_decay = weight_decay_factor, reuse = None):
     """Score a pair of inputs with two ``self.vgg`` towers and an FC head.

     Both inputs go through ``self.vgg(x, keep_prob, True)``; the first
     element of each result is concatenated along axis 1 and passed through
     'fc8' (4096 units, dropout) and 'fc9' (2 units, no ReLU) inside the
     'network' variable scope.

     :param input1: first input handed to ``self.vgg``.
     :param input2: second input handed to ``self.vgg``.
     :param keep_prob: forwarded to ``self.vgg`` and to ``nn.dropout``.
     :param weight_decay: forwarded to ``nn.fc`` for both FC layers.
     :param reuse: forwarded to ``tf.variable_scope('network', ...)``.
     :return: output of the 'fc9' layer.
     """
     def dense(x, features, name, relu = True):
         return nn.fc(x, features, weight_decay, name, relu = relu)

     left, _ = self.vgg(input1, keep_prob, True)
     right, _ = self.vgg(input2, keep_prob, True)

     with tf.variable_scope('network', reuse = reuse):
         joined = tf.concat((left, right), 1)
         hidden = nn.dropout(dense(joined, 4096, 'fc8'), keep_prob)
         return dense(hidden, 2, 'fc9', relu = False)
Beispiel #8
0
        def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
            """Return softmax probabilities: maxout -> deepout -> logits.

            Dimensions (thdim, tedim, shdim, maxdim, maxpart, deephid,
            tvsize) come from the enclosing scope.  Dropout is applied to
            the "deepout" output only when ``keep_prob`` is below 1.0, and
            3-D logits are flattened to 2-D before the softmax.
            """
            hidden = nn.maxout([prev_state, prev_inputs, context],
                               [[thdim, tedim, 2 * shdim], maxdim],
                               maxpart, True)
            deep = nn.linear(hidden, [maxdim, deephid], False,
                             scope="deepout")

            if keep_prob < 1.0:
                deep = nn.dropout(deep, keep_prob=keep_prob)

            scores = nn.linear(deep, [deephid, tvsize], True,
                               scope="logits")

            # Merge the two leading axes so the softmax sees a matrix.
            if scores.ndim == 3:
                rows = scores.shape[0] * scores.shape[1]
                scores = scores.reshape([rows, -1])

            return theano.tensor.nnet.softmax(scores)
def cudnn_rnn(rnn_type, inputs, length, hidden_size, num_layers=1,
        dropout_keep_prob=1.0, concat=True, initial_state=None,
        kernel_initializer=tf.random_normal_initializer(stddev=0.1), wd=0.0, is_train=False, scope=None):
    """Run a cuDNN GRU or LSTM over ``inputs`` and return its outputs.

    Args:
        rnn_type: string ending in ``'gru'`` or ``'lstm'``; if it contains
            ``'bi'`` the layer is built as bidirectional.
        inputs: 3-D tensor; its first two axes are swapped before and after
            the RNN call (presumably batch-major in, batch-major out --
            TODO confirm against callers).
        length: accepted for interface compatibility; not used here.
        hidden_size: number of units per layer.
        num_layers: number of stacked layers.
        dropout_keep_prob: keep probability; used for the input dropout and,
            as ``1 - dropout_keep_prob``, for the cuDNN layer's own dropout.
        concat: accepted for interface compatibility; not used here.
        initial_state: accepted for interface compatibility; not used here.
        kernel_initializer: accepted for interface compatibility; not used
            here.
        wd: when truthy, ``add_wd(wd)`` is called inside the scope.
        is_train: training flag handed to ``dropout``.
        scope: variable scope name; falls back to ``'cudnn_rnn'`` when falsy.

    Returns:
        ``(outputs, None)``: the RNN outputs with the first two axes swapped
        back, and a placeholder ``None`` for the final hidden state.

    Raises:
        NotImplementedError: if ``rnn_type`` ends in neither ``'gru'`` nor
            ``'lstm'``.
    """
    with tf.variable_scope(scope or 'cudnn_rnn'):
        direction = "bidirectional" if 'bi' in rnn_type else "unidirectional"

        # Both layer classes take the same constructor arguments, so pick
        # the class first and build it once.
        if rnn_type.endswith('gru'):
            rnn_cls = CudnnGRU
        elif rnn_type.endswith('lstm'):
            rnn_cls = CudnnLSTM
        else:
            raise NotImplementedError("{} is not supported.".format(rnn_type))

        rnn = rnn_cls(num_layers=num_layers, num_units=hidden_size,
                      input_mode='linear_input', direction=direction,
                      dropout=1 - dropout_keep_prob, name='rnn')

        inputs = dropout(inputs, dropout_keep_prob, is_train)
        # Swap the first two axes for the RNN call and swap them back after.
        outputs, _ = rnn(tf.transpose(inputs, [1, 0, 2]))
        outputs = tf.transpose(outputs, [1, 0, 2])

        # The final state is discarded; None keeps the two-value return shape.
        output_h = None
        if wd:
            add_wd(wd)
        return outputs, output_h
Beispiel #10
0
    def __init__(self, **option):
        """Build the training and decoding graphs and compile Theano functions.

        Required keys in ``option``: "embdim", "hidden", "maxhid", "maxpart",
        "deephid" and "vocabulary".  Optional keys "scope", "initializer",
        "regularizer" and "keep_prob" are filled with defaults (and written
        back into ``option``) when absent.

        Sets ``self.cost``, ``self.inputs``, ``self.outputs``,
        ``self.encode``, ``self.predict``, ``self.generate`` and
        ``self.option``.  ``encoder``, ``decoder``, ``attention`` and
        ``map_attention_states`` are defined outside this block.
        """
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        # each vocabulary is a (word->id, id->word) pair; only the id->word
        # halves are used below (for their length)
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        scope = option["scope"]
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        # a falsy keep_prob (None or 0) falls back to 1.0, i.e. no dropout
        keep_prob = option["keep_prob"] or 1.0

        # output layer shared by the training and decoding graphs:
        # maxout -> linear "deepout" -> optional dropout -> linear "logits"
        # -> softmax
        def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
            features = [prev_state, prev_inputs, context]
            maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                               maxpart, True)
            readout = nn.linear(maxhid, [maxdim, deephid], False,
                                scope="deepout")

            if keep_prob < 1.0:
                readout = nn.dropout(readout, keep_prob=keep_prob)

            logits = nn.linear(readout, [deephid, tvsize], True,
                               scope="logits")

            # merge the two leading axes so the softmax sees a matrix
            if logits.ndim == 3:
                new_shape = [logits.shape[0] * logits.shape[1], -1]
                logits = logits.reshape(new_shape)

            probs = theano.tensor.nnet.softmax(logits)

            return probs

        # training graph
        with ops.variable_scope(scope, initializer=initializer,
                                regularizer=regularizer, dtype=dtype):
            # NOTE(review): "soruce" in the two variable names below looks
            # like a typo for "source"; the names are runtime strings and
            # are left unchanged here.
            src_seq = theano.tensor.imatrix("soruce_sequence")
            src_mask = theano.tensor.matrix("soruce_sequence_mask")
            tgt_seq = theano.tensor.imatrix("target_sequence")
            tgt_mask = theano.tensor.matrix("target_sequence_mask")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias
            target_inputs = target_inputs + target_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            cell = nn.rnn_cell.gru_cell([sedim, shdim])

            # NOTE(review): keep_prob is not passed to dropout_wrapper here;
            # confirm the wrapper's default is the intended value.
            if keep_prob < 1.0:
                cell = nn.rnn_cell.dropout_wrapper(cell)

            # outputs is indexed as a pair below and its elements are joined
            # along axis 2 to form the annotation
            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=theano.tensor.tanh)

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            # NOTE(review): same as above, no keep_prob is passed.
            if keep_prob < 1.0:
                cell = nn.rnn_cell.dropout_wrapper(cell)

            # run decoder
            decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                      initial_state, annotation, src_mask,
                                      ahdim)
            all_output, all_context = decoder_outputs

            # shift the target embeddings by one position along axis 0:
            # position 0 gets zeros, position i gets the embedding at i - 1
            shift_inputs = theano.tensor.zeros_like(target_inputs)
            shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                       target_inputs[:-1])

            # prepend the initial state and drop the last output so each
            # position is paired with the state that preceded it
            init_state = initial_state[None, :, :]
            all_states = theano.tensor.concatenate([init_state, all_output], 0)
            prev_states = all_states[:-1]

            with ops.variable_scope("decoder"):
                probs = prediction(shift_inputs, prev_states, all_context,
                                   keep_prob=keep_prob)

            # compute cost
            # pick the probability of each target word from the flattened
            # probs matrix, then mask, sum over axis 0 and average
            idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
            cost = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
            cost = cost.reshape(tgt_seq.shape)
            cost = theano.tensor.sum(cost * tgt_mask, 0)
            cost = theano.tensor.mean(cost)

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        training_outputs = [cost]

        # decoding graph
        # rebuilt under the same scope with reuse=True; the names
        # source_inputs, cell, outputs, annotation, final_state,
        # initial_state and probs are rebound from here on
        with ops.variable_scope(scope, reuse=True):
            prev_words = theano.tensor.ivector("prev_words")

            # encoder, disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias

            cell = nn.rnn_cell.gru_cell([sedim, shdim])
            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # decoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=theano.tensor.tanh)

            inputs = nn.embedding_lookup(target_embedding, prev_words)
            inputs = inputs + target_bias

            cond = theano.tensor.neq(prev_words, 0)
            # zeros out embedding if y is 0
            inputs = inputs * cond[:, None]

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            with ops.variable_scope("decoder"):
                mapped_states = map_attention_states(annotation, 2 * shdim,
                                                     ahdim)
                alpha = attention(initial_state, mapped_states, thdim, ahdim,
                                  src_mask)
                # attention-weighted sum of the annotations over axis 0
                context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
                # `output` is not used afterwards; only next_state is kept
                output, next_state = cell([inputs, context], initial_state)
                probs = prediction(inputs, initial_state, context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [annotation, initial_state, mapped_states]
        encode = theano.function(encoding_inputs, encoding_outputs)

        # one decoding step: intermediate graph variables (initial_state,
        # annotation, mapped_states) are passed as function inputs
        prediction_inputs = [prev_words, initial_state, annotation,
                             mapped_states, src_mask]
        prediction_outputs = [probs, context, alpha]
        predict = theano.function(prediction_inputs, prediction_outputs)

        # state update given the chosen previous word and the context
        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.encode = encode
        self.predict = predict
        self.generate = generate
        self.option = option
Beispiel #11
0
    def __init__(self, **option):
        """Build the training and decoding graphs and compile Theano functions.

        Required keys in ``option``: "embdim", "hidden", "maxhid", "maxpart",
        "deephid", "vocabulary", "method", "eosid", "softk", "lambda" and
        "decoder".  Optional keys "scope", "initializer", "regularizer" and
        "keep_prob" are filled with defaults (and written back into
        ``option``) when absent.

        The training cost is ``lambda * soft_cost + (1 - lambda) * true_cost``
        where ``soft_cost`` comes from the soft decoder and ``true_cost`` is
        the mean of the main decoder's sentence cost.

        ``Encoder`` and ``map_key`` are defined outside this block;
        ``decoder2`` and ``softdec`` are imported locally.

        Raises:
            ValueError: if ``option["decoder"] == "GruSimple"``.
        """

        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        # NOTE: maxdim and maxpart are read but not used in this block
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        # each vocabulary is a (word->id, id->word) pair; only the id->word
        # halves are used below (for their length)
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        # a falsy keep_prob (None or 0) falls back to 1.0, i.e. no dropout
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder2"

        encoder = Encoder(sedim, shdim)
        import decoder2
        # the main decoder attends over two memories (source annotations
        # and soft-decoder states), passed as lists to decoder.forward below
        decoder = decoder2.DecoderGruCond(2,
                                          option['method'],
                                          tedim,
                                          thdim,
                                          ahdim,
                                          2 * shdim + thdim,
                                          dim_readout=deephid,
                                          n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            byseq = T.imatrix("backward_target_sequence")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding,
                                                src_seq) + source_bias
            target_inputs = nn.embedding_lookup(target_embedding,
                                                tgt_seq) + target_bias
            by_inputs = nn.embedding_lookup(target_embedding,
                                            byseq) + target_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)
                by_inputs = nn.dropout(by_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            # NOTE(review): unlike the input dropout above, this call is not
            # guarded by `keep_prob < 1.0` -- confirm nn.dropout is a no-op
            # at keep_prob == 1.0.
            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            import softdec
            soft_decoder = softdec.SoftDecoder(option["eosid"],
                                               option["softk"],
                                               tedim,
                                               thdim,
                                               ahdim,
                                               2 * shdim,
                                               dim_readout=deephid,
                                               n_y_vocab=tvsize)
            # soft decoder: initialised from the last forward encoder state
            with ops.variable_scope('soft_decoder'):
                initial_state = nn.feedforward(states[-1], [shdim, thdim],
                                               True,
                                               scope='initial',
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)
                soft_states, _, _, soft_mask = soft_decoder.infer(
                    mapped_keys, src_mask, annotation, initial_state,
                    target_embedding, target_bias, keep_prob)

            # same scope again with reuse=True: run the soft decoder on
            # byseq to obtain its cost
            with ops.variable_scope('soft_decoder', reuse=True):
                _, _, soft_cost, _ = soft_decoder.forward(
                    byseq, by_inputs, tgt_mask, mapped_keys, src_mask,
                    annotation, initial_state, keep_prob)

            # compute initial state for decoder
            # first state of backward encoder
            # initialize with only encoder state
            final_state = r_states[0]

            with ops.variable_scope(decoder_scope):
                # rebinds initial_state: from here on it is the main
                # decoder's initial state, not the soft decoder's
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                with ops.variable_scope('map-key-src'):
                    mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
                with ops.variable_scope('map-key-soft'):
                    mapped_keys_soft = map_key(soft_states, thdim, ahdim)

                _, _, _, snt_cost = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask,
                    [mapped_keys_src, mapped_keys_soft], [src_mask, soft_mask],
                    [annotation, soft_states], initial_state, keep_prob)

            ce = snt_cost
            true_cost = T.mean(ce)
            # interpolation weight stored as a shared variable named 'lambda'
            lamb = theano.shared(numpy.asarray(option['lambda'], dtype),
                                 'lambda')
            cost = lamb * soft_cost + (1 - lamb) * true_cost

        # import utils.ttensor
        # print 'true_cost %d:' % len(utils.ttensor.find_inputs_and_params(true_cost)[0])
        # for xxx in utils.ttensor.find_inputs_and_params(true_cost)[0]:
        #     print '\t', xxx
        # print 'soft_cost %d:' % len(utils.ttensor.find_inputs_and_params(soft_cost)[0])
        # for xxx in utils.ttensor.find_inputs_and_params(soft_cost)[0]:
        #     print '\t', xxx
        # print 'tot_cost: %d' % len(utils.ttensor.find_inputs_and_params(cost)[0])
        # for xxx in utils.ttensor.find_inputs_and_params(cost)[0]:
        #     print '\t', xxx
        # print 'snt_cost: %d' % len(utils.ttensor.find_inputs_and_params(snt_cost)[0])
        # for xxx in utils.ttensor.find_inputs_and_params(snt_cost)[0]:
        #     print '\t', xxx

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, byseq]
        training_outputs = [cost, soft_cost, true_cost]

        # get_snt_cost = theano.function(training_inputs[:4], snt_cost)
        get_snt_cost = None

        # decoding graph
        # rebuilt under the same scope with reuse=True and without dropout;
        # most graph variables from the training section are rebound here
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            # NOTE(review): target_inputs is recomputed here but not read
            # again in the rest of this method.
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            target_inputs = target_inputs + target_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope('soft_decoder'):
                initial_state = nn.feedforward(states[-1], [shdim, thdim],
                                               True,
                                               scope='initial',
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)
                # keep_prob is fixed to 1.0 for decoding
                soft_states, soft_contexts, soft_probs, soft_mask = soft_decoder.infer(
                    mapped_keys, src_mask, annotation, initial_state,
                    target_embedding, target_bias, 1.0)

            # decoder
            final_state = r_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                with ops.variable_scope('map-key-src'):
                    mapped_keys_src = map_key(annotation, 2 * shdim, ahdim)
                with ops.variable_scope('map-key-soft'):
                    mapped_keys_soft = map_key(soft_states, thdim, ahdim)

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                # single decoding step with an all-ones mask
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(
                    prev_inputs, mask, initial_state, *[
                        mapped_keys_src, mapped_keys_soft, annotation,
                        soft_states, src_mask, soft_mask
                    ])
                probs = decoder.prediction(prev_inputs, next_state, context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            initial_state, annotation, soft_states, mapped_keys_src,
            mapped_keys_soft, soft_mask
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            # GruSimple is rejected outright; everything after the raise in
            # this branch is unreachable.
            raise ValueError()
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask
            ]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            # one decoding step: intermediate graph variables are passed as
            # function inputs
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys_src,
                src_mask, soft_states, mapped_keys_soft, soft_mask
            ]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict
        # NOTE(review): for any other option["decoder"] value self.predict
        # is never assigned in this method.

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        self.align = None
        self.sample = None
        self.encode = encode

        self.get_snt_cost = get_snt_cost
        self.option = option
Beispiel #12
0
  def build(self):
    """Build the model graph and store placeholders and result tensors on self.

    Creates the input placeholders, a sentence reader with positional
    encoding, an input-fusion layer, a question encoder, an episodic memory
    loop, an answer layer, the loss, accuracy and the Adam training op.

    Sets ``self.att``, the placeholders ``self.x``, ``self.xm``, ``self.q``,
    ``self.y``, ``self.fc``, ``self.is_training`` and the tensors
    ``self.total_loss``, ``self.num_corrects``, ``self.accuracy``,
    ``self.opt_op``.  ``weight``, ``bias``, ``dropout``, ``batch_norm`` and
    ``EpisodeModule`` are defined outside this block.
    """
    params = self.params
    # V: embedding size (not used below), d: hidden size, A: vocabulary size
    V, d, A = params.embed_size, params.hidden_size, self.words.vocab_size

    # initialize self
    # placeholders
    input = tf.placeholder('int32', shape=[self.params.batch_size, self.params.max_fact_count, self.params.max_sent_size], name='x')  # [num_batch, fact_count, sentence_len]
    question = tf.placeholder('int32', shape=[self.params.batch_size, self.params.max_ques_size], name='q')  # [num_batch, question_len]
    answer = tf.placeholder('int32', shape=[self.params.batch_size], name='y')  # [num_batch] - one word answer
    fact_counts = tf.placeholder('int64', shape=[self.params.batch_size], name='fc')
    input_mask = tf.placeholder('float32', shape=[self.params.batch_size, self.params.max_fact_count, self.params.max_sent_size,self.params.embed_size], name='xm')
    is_training = tf.placeholder(tf.bool)
    self.att = tf.constant(0.)

    # Prepare parameters
    # NOTE(review): this single GRUCell object is used for the forward and
    # backward fact RNNs, the question RNN and the 'gru' memory update;
    # whether the weights are actually shared depends on the scopes below.
    gru = tf.nn.rnn_cell.GRUCell(self.params.hidden_size)
    l = self.positional_encoding()
    # NOTE(review): `1 / 2` is 0 under Python 2 integer division (range
    # becomes 1) and 0.5 under Python 3 (range becomes sqrt(3)) -- confirm
    # which is intended.
    embedding = weight('embedding', [self.words.vocab_size, self.params.embed_size], init='uniform', range=3 ** (1 / 2))

    with tf.name_scope('SentenceReader'):
      input_list = tf.unstack(tf.transpose(input))  # L x [F, N]
      input_embed = []
      for facts in input_list:
        facts = tf.unstack(facts)
        embed = tf.stack([tf.nn.embedding_lookup(embedding, w) for w in facts])  # [F, N, V]
        input_embed.append(embed)

      # apply positional encoding
      input_embed = tf.transpose(tf.stack(input_embed), [2, 1, 0, 3])  # [N, F, L, V]
      # weight each word embedding by the positional encoding and the mask,
      # then sum over the word axis to get one vector per fact
      encoded = l * input_embed * input_mask
      facts = tf.reduce_sum(encoded, 2)  # [N, F, V]

    # dropout time
    facts = dropout(facts, params.keep_prob, is_training)

    with tf.name_scope('InputFusion'):
      # Bidirectional RNN
      with tf.variable_scope('Forward'):
        forward_states, _ = tf.nn.dynamic_rnn(gru, facts, fact_counts, dtype=tf.float32)

      with tf.variable_scope('Backward'):
        facts_reverse = tf.reverse_sequence(facts, fact_counts, 1)
        # NOTE(review): backward_states are not reversed back before being
        # summed with forward_states below -- confirm this is intended.
        backward_states, _ = tf.nn.dynamic_rnn(gru, facts_reverse, fact_counts, dtype=tf.float32)

      # Use forward and backward states both
      facts = forward_states + backward_states  # [N, F, d]

    with tf.variable_scope('Question'):
      tf.logging.info(question)
      # transpose so the list iterates over question positions
      ques_list = tf.unstack(tf.transpose(question))
      tf.logging.info(ques_list)
      ques_embed = tf.stack([tf.nn.embedding_lookup(embedding, w) for w in ques_list])
      #ques_embed = tf.expand_dims(ques_embed, 0)
      tf.logging.info(ques_embed)
      initial_state = gru.zero_state(self.params.batch_size, dtype=tf.float32)
      # time_major=True because ques_embed was stacked position-first;
      # only the final state is kept as the question vector
      _, question_vec = tf.nn.dynamic_rnn(gru, ques_embed,initial_state=initial_state, dtype=tf.float32,time_major=True)

    # Episodic Memory
    with tf.variable_scope('Episodic'):
      episode = EpisodeModule(self.params.hidden_size, question_vec, facts, is_training, self.params.batch_norm)
      # memory starts as a copy of the question vector
      memory = tf.identity(question_vec)

      for t in range(params.memory_step):
        with tf.variable_scope('Layer%d' % t) as scope:
          if params.memory_update == 'gru':
            memory = gru(episode.new(memory), memory)[0]
          else:
            # ReLU update
            c = episode.new(memory)
            concated = tf.concat([memory, c, question_vec],1)

            w_t = weight('w_t', [3 * d, d])
            z = tf.matmul(concated, w_t)
            # the bias is only added when batch norm is off
            if params.batch_norm:
              z = batch_norm(z, is_training)
            else:
              b_t = bias('b_t', d)
              z = z + b_t
            memory = tf.nn.relu(z)  # [N, d]

          scope.reuse_variables()

    # Regularizations
    if params.batch_norm:
      memory = batch_norm(memory, is_training=is_training)
    memory = dropout(memory, params.keep_prob, is_training)

    with tf.name_scope('Answer'):
      # Answer module : feed-forward version (for it is one word answer)
      w_a = weight('w_a', [d, A], init='xavier')
      logits = tf.matmul(memory, w_a)  # [N, A]

    with tf.name_scope('Loss'):
      # Cross-Entropy loss
      cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=answer)
      loss = tf.reduce_mean(cross_entropy)
      # adds the weighted sum of everything in the 'l2' collection;
      # presumably populated by the weight() helper -- TODO confirm
      total_loss = loss + params.weight_decay * tf.add_n(tf.get_collection('l2'))

    with tf.variable_scope('Accuracy'):
      # Accuracy
      predicts = tf.cast(tf.argmax(logits, 1), 'int32')
      corrects = tf.equal(predicts, answer)
      num_corrects = tf.reduce_sum(tf.cast(corrects, tf.float32))
      accuracy = tf.reduce_mean(tf.cast(corrects, tf.float32))

    # Training
    optimizer = tf.train.AdamOptimizer(params.learning_rate)
    opt_op = optimizer.minimize(total_loss, global_step=self.global_step)

    # placeholders
    self.x = input
    self.xm = input_mask
    self.q = question
    self.y = answer
    self.fc = fact_counts
    self.is_training = is_training

    # tensors
    self.total_loss = total_loss
    self.num_corrects = num_corrects
    self.accuracy = accuracy
    self.opt_op = opt_op
Beispiel #13
0
def model_spec(x,
               keep_prob=0.5,
               deterministic=False,
               init=False,
               use_weight_normalization=False,
               use_batch_normalization=False,
               use_mean_only_batch_normalization=False):
    """Assemble the convolutional classifier graph on top of ``x``.

    Layer order::

        gaussian noise
        -> conv1..conv3 (96 filters)  -> max-pool -> dropout
        -> conv4..conv6 (192 filters) -> max-pool -> dropout
        -> conv7 (192 filters, 'VALID' padding)
        -> Nin1, Nin2 (192 units)
        -> global average pooling
        -> dense (10 units, no nonlinearity)

    Args:
        x: Input tensor handed to ``nn.gaussian_noise``. The 4-element
            pooling windows suggest a rank-4 image batch -- TODO confirm
            the expected layout against the caller.
        keep_prob: Forwarded to both ``nn.dropout`` layers.
        deterministic: Forwarded to the noise layer, both dropout layers and
            every conv / NiN / dense layer.
        init: Forwarded to every conv / NiN / dense layer.
        use_weight_normalization: Forwarded to every conv / NiN / dense
            layer.
        use_batch_normalization: Forwarded to every conv / NiN / dense layer.
        use_mean_only_batch_normalization: Forwarded to every conv / NiN /
            dense layer.

    Returns:
        The output of the final 10-unit ``nn.dense`` layer.
    """
    # Keyword arguments that every parameterised layer receives unchanged.
    layer_opts = dict(
        init=init,
        use_weight_normalization=use_weight_normalization,
        use_batch_normalization=use_batch_normalization,
        use_mean_only_batch_normalization=use_mean_only_batch_normalization,
        deterministic=deterministic)

    def conv(h, filters, name, **extra):
        # Leaky-ReLU convolution; `extra` carries per-layer overrides (pad).
        return nn.conv2d(h,
                         num_filters=filters,
                         name=name,
                         nonlinearity=nn.lRelu,
                         **dict(layer_opts, **extra))

    def nin(h, name):
        # 192-unit network-in-network layer with leaky ReLU.
        return nn.NiN(h,
                      num_units=192,
                      nonlinearity=nn.lRelu,
                      name=name,
                      **layer_opts)

    def pool_then_drop(h, pool_name, drop_name):
        # 2x2 max-pool with stride 2 and 'SAME' padding, followed by dropout.
        h = tf.nn.max_pool(h,
                           ksize=[1, 2, 2, 1],
                           strides=[1, 2, 2, 1],
                           padding='SAME',
                           name=pool_name)
        return nn.dropout(h,
                          keep_prob=keep_prob,
                          deterministic=deterministic,
                          name=drop_name)

    net = nn.gaussian_noise(x,
                            deterministic=deterministic,
                            name='gaussian_noise')

    for layer_name in ('conv1', 'conv2', 'conv3'):
        net = conv(net, 96, layer_name)
    net = pool_then_drop(net, 'max_pool_1', 'drop1')

    for layer_name in ('conv4', 'conv5', 'conv6'):
        net = conv(net, 192, layer_name)
    net = pool_then_drop(net, 'max_pool_2', 'drop2')

    net = conv(net, 192, 'conv7', pad='VALID')

    for layer_name in ('Nin1', 'Nin2'):
        net = nin(net, layer_name)

    net = nn.globalAvgPool(net, name='Globalavgpool1')

    return nn.dense(net,
                    num_units=10,
                    nonlinearity=None,
                    name='output_dense',
                    **layer_opts)
Beispiel #14
0
    def __init__(self, **option):
        """Build the training graph and compile the inference functions.

        Required keys in ``option`` (a missing one raises ``KeyError``):
            embdim: pair ``(source_embedding_dim, target_embedding_dim)``.
            hidden: triple ``(source_hidden, target_hidden, attention_hidden)``.
            maxhid: maxout hidden dim.
            maxpart: maxout part.
            deephid: deepout hidden dim.
            vocabulary: pair ``(svocab, tvocab)``; each is itself a pair whose
                second element is measured with ``len`` to get the
                vocabulary size.

        Optional keys (defaults are written back into ``option``):
            scope: variable scope name, default ``"rnnsearch"``.
            initializer: default ``None``.
            regularizer: default ``None``.
            criterion: ``"mle"`` (default) or ``"mrt"``.
            keep_prob: dropout keep probability, default ``1.0``; forced to
                ``1.0`` when ``criterion == "mrt"``.

        Attributes set on ``self``:
            cost: symbolic training cost.
            inputs: symbolic training inputs
                ``[src_seq, src_mask, tgt_seq, tgt_mask]``, plus
                ``[loss, sharp]`` for MRT.
            outputs: ``[cost]``.
            updates: empty list.
            encode: compiled ``(src_seq, src_mask) ->
                [annotation, initial_state, mapped_states]``.
            predict: compiled ``(prev_words, state, annotation,
                mapped_states, src_mask) -> [probs, context, alpha]``.
            generate: compiled ``(prev_words, state, context) -> next_state``.
            sample: compiled ``(src_seq, src_mask, max_len) -> sampled words``.
            align: compiled ``(src_seq, src_mask, tgt_seq, tgt_mask) ->
                attention weights``.
            option: the (default-filled) option dict.

        Raises:
            ValueError: if ``criterion`` is neither ``"mle"`` nor ``"mrt"``.
        """
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        # only the second element of each vocabulary pair is used below
        _, sid2w = svocab
        _, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "criterion" not in option:
            option["criterion"] = "mle"

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        scope = option["scope"]
        criterion = option["criterion"]
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        # Only these two objectives are implemented. Any other value used to
        # skip creation of the MRT inputs yet still enter the MRT cost branch,
        # failing with an UnboundLocalError in the middle of graph building.
        if criterion not in ("mle", "mrt"):
            raise ValueError(
                "unsupported criterion: %r (expected 'mle' or 'mrt')"
                % (criterion,))

        # MRT mode do not use dropout
        if criterion == "mrt":
            keep_prob = 1.0

        def prediction(prev_inputs, prev_state, context, keep_prob=1.0):
            # Readout: maxout over (state, previous input, context), a linear
            # "deepout" layer, optional dropout, then a softmax over the
            # target vocabulary.
            features = [prev_state, prev_inputs, context]
            maxhid = nn.maxout(features, [[thdim, tedim, 2 * shdim], maxdim],
                               maxpart, True)
            readout = nn.linear(maxhid, [maxdim, deephid],
                                False,
                                scope="deepout")

            if keep_prob < 1.0:
                readout = nn.dropout(readout, keep_prob=keep_prob)

            logits = nn.linear(readout, [deephid, tvsize],
                               True,
                               scope="logits")

            # softmax below is applied to a matrix, so merge the two leading
            # axes of a 3-d logits tensor first
            if logits.ndim == 3:
                new_shape = [logits.shape[0] * logits.shape[1], -1]
                logits = logits.reshape(new_shape)

            probs = theano.tensor.nnet.softmax(logits)

            return probs

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = theano.tensor.imatrix("soruce_sequence")
            src_mask = theano.tensor.matrix("soruce_sequence_mask")
            tgt_seq = theano.tensor.imatrix("target_sequence")
            tgt_mask = theano.tensor.matrix("target_sequence_mask")

            # extra symbolic inputs needed only by the MRT objective
            if criterion == "mrt":
                loss = theano.tensor.vector("loss_score")
                sharp = theano.tensor.scalar("sharpness")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding"):
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                target_bias = ops.get_variable("bias", [tedim])

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias
            target_inputs = target_inputs + target_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            cell = nn.rnn_cell.gru_cell([sedim, shdim])

            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # NOTE(review): unlike the embedding dropout above, this call is
            # not guarded by `keep_prob < 1.0`, so it also runs with
            # keep_prob == 1.0 -- confirm nn.dropout is an identity there.
            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            # compute initial state for decoder
            # first state of backward encoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=theano.tensor.tanh)

            # run decoder
            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            if criterion == "mrt":
                # In MRT training, shape of src_seq and src_mask are assumed
                # to have [len, 1]
                batch = tgt_seq.shape[1]
                with ops.variable_scope("decoder"):
                    mapped_states = attention(None, annotation, None, None,
                                              [thdim, 2 * shdim, ahdim])
                # repeat the single source sentence once per target candidate
                b_src_mask = theano.tensor.repeat(src_mask, batch, 1)
                b_annotation = theano.tensor.repeat(annotation, batch, 1)
                b_mapped_states = theano.tensor.repeat(mapped_states, batch, 1)
                b_initial_state = theano.tensor.repeat(initial_state, batch, 0)

                decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                          b_initial_state, b_annotation,
                                          b_src_mask, ahdim, b_mapped_states)
            else:
                decoder_outputs = decoder(cell, target_inputs, tgt_mask,
                                          initial_state, annotation, src_mask,
                                          ahdim)

            all_output, all_context = decoder_outputs
            # shift target embeddings one step forward along axis 0; the
            # first step sees zeros
            shift_inputs = theano.tensor.zeros_like(target_inputs)
            shift_inputs = theano.tensor.set_subtensor(shift_inputs[1:],
                                                       target_inputs[:-1])

            if criterion == "mrt":
                init_state = b_initial_state[None, :, :]
            else:
                init_state = initial_state[None, :, :]

            # state available *before* each step: initial state followed by
            # all decoder outputs except the last
            all_states = theano.tensor.concatenate([init_state, all_output], 0)
            prev_states = all_states[:-1]

            with ops.variable_scope("decoder"):
                probs = prediction(shift_inputs,
                                   prev_states,
                                   all_context,
                                   keep_prob=keep_prob)

            # compute cost
            idx = theano.tensor.arange(tgt_seq.flatten().shape[0])
            ce = -theano.tensor.log(probs[idx, tgt_seq.flatten()])
            ce = ce.reshape(tgt_seq.shape)
            # masked sum over axis 0 gives one value per column of tgt_seq
            ce = theano.tensor.sum(ce * tgt_mask, 0)

            if criterion == "mle":
                cost = theano.tensor.mean(ce)
            else:
                # ce is positive here
                logp = -ce
                score = sharp * logp
                # safe softmax
                score = score - theano.tensor.max(score)
                score = theano.tensor.exp(score)
                qprob = score / theano.tensor.sum(score)
                risk = theano.tensor.sum(qprob * loss)
                cost = risk

        if criterion == "mle":
            training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        else:
            training_inputs = [
                src_seq, src_mask, tgt_seq, tgt_mask, loss, sharp
            ]
        training_outputs = [cost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = theano.tensor.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)
            target_inputs = target_inputs + target_bias

            cell = nn.rnn_cell.gru_cell([sedim, shdim])
            outputs = encoder(cell, source_inputs, src_mask)
            annotation = theano.tensor.concatenate(outputs, 2)

            # decoder
            final_state = outputs[1][0]
            with ops.variable_scope("decoder"):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=theano.tensor.tanh)

            inputs = nn.embedding_lookup(target_embedding, prev_words)
            inputs = inputs + target_bias

            cond = theano.tensor.neq(prev_words, 0)
            # zeros out embedding if y is 0
            inputs = inputs * cond[:, None]

            cell = nn.rnn_cell.gru_cell([[tedim, 2 * shdim], thdim])

            # encode -> prediction -> generation
            # prediction: prev_word + prev_state => context, next_word
            # generation: curr_word + context + prev_state => next_state
            # here, initial_state is merely a placeholder
            with ops.variable_scope("decoder"):
                # used in encoding
                mapped_states = attention(None, annotation, None, None,
                                          [thdim, 2 * shdim, ahdim])
                # used in prediction
                alpha = attention(initial_state, None, mapped_states, src_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * annotation, 0)
                probs = prediction(inputs, initial_state, context)
                # used in generation
                _, next_state = cell([inputs, context], initial_state)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [annotation, initial_state, mapped_states]
        encode = theano.function(encoding_inputs, encoding_outputs)

        # The prediction/generation functions take intermediate graph
        # variables (initial_state, annotation, mapped_states, context) as
        # inputs, so callers supply those values directly.
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_states, src_mask
        ]
        prediction_outputs = [probs, context, alpha]
        predict = theano.function(prediction_inputs, prediction_outputs)

        generation_inputs = [prev_words, initial_state, context]
        generation_outputs = next_state
        generate = theano.function(generation_inputs, generation_outputs)

        # sampling graph, this feature is optional
        with ops.variable_scope(scope, reuse=True):
            max_len = theano.tensor.iscalar()

            def sampling_loop(inputs, state, attn_states, attn_mask, m_states):
                # One sampling step: attend, predict, draw the next word and
                # advance the decoder state with its embedding.
                alpha = attention(state, None, m_states, attn_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
                probs = prediction(inputs, state, context)
                next_words = ops.random.multinomial(probs).argmax(axis=1)
                new_inputs = nn.embedding_lookup(target_embedding, next_words)
                new_inputs = new_inputs + target_bias
                _, next_state = cell([new_inputs, context], state)

                return [next_words, new_inputs, next_state]

            with ops.variable_scope("decoder"):
                batch = src_seq.shape[1]
                initial_inputs = theano.tensor.zeros([batch, tedim],
                                                     dtype=dtype)

                # first output (sampled words) is not fed back into the loop
                outputs_info = [None, initial_inputs, initial_state]
                nonseq = [annotation, src_mask, mapped_states]
                outputs, updates = theano.scan(sampling_loop, [],
                                               outputs_info,
                                               nonseq,
                                               n_steps=max_len)
                sampled_words = outputs[0]

        # the scan updates are passed on so the compiled function applies them
        sampling_inputs = [src_seq, src_mask, max_len]
        sampling_outputs = sampled_words
        sample = theano.function(sampling_inputs,
                                 sampling_outputs,
                                 updates=updates)

        # attention graph, this feature is optional
        with ops.variable_scope(scope, reuse=True):

            def attention_loop(inputs, mask, state, attn_states, attn_mask,
                               m_states):
                # One forced-decoding step that also returns the attention
                # weights; where mask is 0 the previous state is kept.
                mask = mask[:, None]
                alpha = attention(state, None, m_states, attn_mask,
                                  [thdim, 2 * shdim, ahdim])
                context = theano.tensor.sum(alpha[:, :, None] * attn_states, 0)
                _, next_state = cell([inputs, context], state)
                next_state = (1.0 - mask) * state + mask * next_state

                return [alpha, next_state]

            with ops.variable_scope("decoder"):
                seq = [target_inputs, tgt_mask]
                outputs_info = [None, initial_state]
                nonseq = [annotation, src_mask, mapped_states]
                # the updates returned by this scan are not used
                (alpha, state), _ = theano.scan(attention_loop, seq,
                                                outputs_info, nonseq)
                attention_score = alpha

        alignment_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        alignment_outputs = attention_score
        align = theano.function(alignment_inputs, alignment_outputs)

        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        self.align = align
        self.sample = sample
        self.encode = encode
        self.predict = predict
        self.generate = generate
        self.option = option
Beispiel #15
0
    def __init__(self, **option):
        """Build the training graph and compile the decoding functions.

        Required keys in ``option``: ``embdim`` (source/target embedding
        dims), ``hidden`` (source/target/attention hidden dims), ``maxhid``,
        ``maxpart``, ``deephid``, ``vocabulary`` (pair of vocabulary pairs)
        and ``decoder`` (suffix of the ``Decoder*`` class to instantiate).

        Optional keys, filled in on ``option`` when absent: ``scope``
        (``"rnnsearch"``), ``initializer`` (``None``), ``regularizer``
        (``None``) and ``keep_prob`` (``1.0``).

        Attributes set on ``self``: ``cost``, ``inputs``, ``outputs``,
        ``updates`` (empty list), ``encode`` and ``option``. ``predict`` is
        set for the ``"GruSimple"`` and ``"GruCond"`` decoders, and
        ``generate`` only for ``"GruSimple"``.
        """
        # ---- model dimensions -------------------------------------------
        sedim, tedim = option["embdim"]  # source / target embedding dim
        shdim, thdim, ahdim = option["hidden"]  # source / target / attention
        maxdim = option["maxhid"]  # maxout hidden dim
        maxpart = option["maxpart"]  # maxout part
        deephid = option["deephid"]  # deepout hidden dim

        svocab, tvocab = option["vocabulary"]
        _, sid2w = svocab
        _, tid2w = tvocab
        svsize = len(sid2w)  # source vocabulary size
        tvsize = len(tid2w)  # target vocabulary size

        # ---- defaults for optional settings -----------------------------
        if option.get("scope") is None:
            option["scope"] = "rnnsearch"
        option.setdefault("initializer", None)
        option.setdefault("regularizer", None)
        option.setdefault("keep_prob", 1.0)

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        # a falsy keep_prob (None, 0) means "no dropout"
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        decoder_kind = option["decoder"]
        # NOTE(review): eval() resolves the decoder class from a configuration
        # string; this is unsafe if `option["decoder"]` can come from
        # untrusted input.
        decoder_cls = eval("Decoder{}".format(decoder_kind))
        decoder = decoder_cls(tedim,
                              thdim,
                              ahdim,
                              2 * shdim,
                              dim_maxout=maxdim,
                              max_part=maxpart,
                              dim_readout=deephid,
                              n_y_vocab=tvsize)

        # ---- training graph ---------------------------------------------
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            # No target bias is created here; the scope object is handed to
            # the decoder through its `tiescope` attribute.
            with ops.variable_scope("target_embedding") as tgtembscope:
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                decoder.tiescope = tgtembscope

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            fwd_states, bwd_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([fwd_states, bwd_states], 2)

            # The decoder starts from the first state of the backward encoder.
            final_state = bwd_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state,
                                               [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

                # only the third of the four values returned is used
                _, _, cost, _ = decoder.forward(tgt_seq, target_inputs,
                                                tgt_mask, mapped_keys,
                                                src_mask, annotation,
                                                initial_state, keep_prob)

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        training_outputs = [cost]

        # ---- decoding graph (variables reused, dropout disabled) --------
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            fwd_states, bwd_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([fwd_states, bwd_states], 2)

            final_state = bwd_states[0]
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state,
                                               [shdim, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)

            # zeros out embedding if y is 0, which indicates <s>
            is_real_word = T.neq(prev_words, 0)
            prev_inputs = prev_inputs * is_real_word[:, None]

            with ops.variable_scope(decoder_scope):
                step_mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(prev_inputs, step_mask,
                                                   initial_state, mapped_keys,
                                                   annotation, src_mask)
                # GruSimple predicts from the state before the step, GruCond
                # from the state after it; other decoders get no `probs`.
                if decoder_kind in ("GruSimple", "GruCond"):
                    if decoder_kind == "GruSimple":
                        query_state = initial_state
                    else:
                        query_state = next_state
                    probs = decoder.prediction(prev_inputs, query_state,
                                               context)

        # ---- compiled functions -----------------------------------------
        encode = theano.function([src_seq, src_mask],
                                 [annotation, initial_state, mapped_keys])

        # Intermediate graph variables are passed as inputs here, so callers
        # supply their values directly.
        prediction_inputs = [
            prev_words, initial_state, annotation, mapped_keys, src_mask
        ]
        if decoder_kind == "GruSimple":
            predict = theano.function(prediction_inputs, [probs, context])
            generate = theano.function([prev_words, initial_state, context],
                                       next_state)

            self.predict = predict
            self.generate = generate
        elif decoder_kind == "GruCond":
            self.predict = theano.function(prediction_inputs,
                                           [probs, next_state])

        # Optional sampling / alignment / sentence-cost graphs, currently
        # disabled by being wrapped in a string literal.
        '''
        with ops.variable_scope(scope, reuse=True):
            sample = decoder.build_sampling(src_seq, src_mask, target_embedding, target_bias, mapped_keys,
                                            annotation, initial_state)
            align = decoder.build_attention(src_seq, src_mask, target_inputs, tgt_seq, tgt_mask, mapped_keys,
                                            annotation, initial_state)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim, thdim],
                                               True, scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim)

                _, _, _,snt_cost  = decoder.forward(tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                                                    annotation, initial_state, 1.0)
            get_snt_cost = theano.function(training_inputs, snt_cost)
        '''
        self.cost = cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        # `align`, `sample` and `get_snt_cost` are not exposed while the
        # optional graph above stays disabled.
        self.encode = encode

        self.option = option
Beispiel #16
0
    def __init__(self, **option):
        # source and target embedding dim
        sedim, tedim = option["embdim"]
        # source, target and attention hidden dim
        shdim, thdim, ahdim, domaindim, feadim = option["hidden"]
        # maxout hidden dim
        maxdim = option["maxhid"]
        # maxout part
        maxpart = option["maxpart"]
        # deepout hidden dim
        deephid = option["deephid"]
        svocab, tvocab = option["vocabulary"]
        sw2id, sid2w = svocab
        tw2id, tid2w = tvocab
        # source and target vocabulary size
        svsize, tvsize = len(sid2w), len(tid2w)
        dnum = option['dnum']

        if "scope" not in option or option["scope"] is None:
            option["scope"] = "rnnsearch"

        if "initializer" not in option:
            option["initializer"] = None

        if "regularizer" not in option:
            option["regularizer"] = None

        if "keep_prob" not in option:
            option["keep_prob"] = 1.0

        dtype = theano.config.floatX
        initializer = option["initializer"]
        regularizer = option["regularizer"]
        keep_prob = option["keep_prob"] or 1.0

        scope = option["scope"]
        decoder_scope = "decoder"

        encoder = Encoder(sedim, shdim)
        decoderType = eval("Decoder{}".format(option["decoder"]))
        decoder = decoderType(tedim,
                              thdim,
                              ahdim,
                              2 * shdim,
                              dnum=dnum,
                              dim_maxout=maxdim,
                              max_part=maxpart,
                              dim_readout=deephid,
                              dim_domain=domaindim,
                              feadim=feadim,
                              n_y_vocab=tvsize)

        # training graph
        with ops.variable_scope(scope,
                                initializer=initializer,
                                regularizer=regularizer,
                                dtype=dtype):
            src_seq = T.imatrix("source_sequence")
            src_mask = T.matrix("source_sequence_mask")
            tgt_seq = T.imatrix("target_sequence")
            tgt_mask = T.matrix("target_sequence_mask")
            tag_seq = T.imatrix("domain_tag")
            # nsrc_mask = T.set_subtensor(src_mask[T.cast(T.sum(src_mask, 0) - 1, 'int32'),
            #                                      T.arange(src_mask.shape[1])], 0.0)

            with ops.variable_scope("source_embedding"):
                source_embedding = ops.get_variable("embedding",
                                                    [svsize, sedim])
                source_bias = ops.get_variable("bias", [sedim])

            with ops.variable_scope("target_embedding") as tgtembscope:
                target_embedding = ops.get_variable("embedding",
                                                    [tvsize, tedim])
                # target_bias = ops.get_variable("bias", [tedim])
                decoder.tiescope = tgtembscope

            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            target_inputs = nn.embedding_lookup(target_embedding, tgt_seq)

            source_inputs = source_inputs + source_bias

            if keep_prob < 1.0:
                source_inputs = nn.dropout(source_inputs, keep_prob=keep_prob)
                target_inputs = nn.dropout(target_inputs, keep_prob=keep_prob)

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("Specific"):
                domain_alpha = domain_sensitive_attention(
                    annotation, src_mask, shdim * 2, domaindim)
                # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
                #                          shdim,
                #                          shdim * 2)
                domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                       0)
                dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                          True,
                                          activation=T.tanh,
                                          scope="feature1")

                dscores = nn.feedforward(dfeature, [feadim, dnum],
                                         True,
                                         activation=T.tanh,
                                         scope="score")
                # (batch, 2)
                dprobs = T.nnet.softmax(dscores)
                dpred_tag = T.argmax(dprobs, 1)
                didx = T.arange(tag_seq.flatten().shape[0])
                dce = -T.log(dprobs[didx, tag_seq.flatten()])
                dcost = T.mean(dce)

            share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                     shdim * 2, domaindim)
            # share_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                         shdim,
            #                         shdim * 2)
            share_context = T.sum(annotation * share_alpha[:, :, None], 0)
            sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                      True,
                                      activation=T.tanh,
                                      scope="feature1")

            with ops.variable_scope("Shared"):
                sscores = nn.feedforward(sfeature, [feadim, dnum],
                                         True,
                                         activation=T.tanh,
                                         scope="score")
                # (batch, 2)
                sprobs = T.nnet.softmax(sscores)
                spred_tag = T.argmax(sprobs, 1)
                sidx = T.arange(tag_seq.flatten().shape[0])
                sce = -T.log(sprobs[sidx, tag_seq.flatten()])
                scost = T.mean(sce)
                adv_sce = -sprobs[sidx, tag_seq.flatten()] * T.log(
                    sprobs[sidx, tag_seq.flatten()])
                adv_scost = T.mean(adv_sce)

            domain_gate = nn.feedforward([dfeature, annotation],
                                         [[feadim, shdim * 2], shdim * 2],
                                         True,
                                         scope="domain_gate")
            domain_annotation = annotation * domain_gate
            domain_annotation = nn.dropout(domain_annotation,
                                           keep_prob=keep_prob)
            share_gate = nn.feedforward([sfeature, annotation],
                                        [[feadim, shdim * 2], shdim * 2],
                                        True,
                                        scope="share_gate")
            annotation = annotation * share_gate
            annotation = nn.dropout(annotation, keep_prob=keep_prob)

            # compute initial state for decoder
            # first state of backward encoder
            # batch * shdim
            final_state = T.concatenate([
                annotation[0, :, annotation.shape[-1] / 2:],
                domain_annotation[0, :, annotation.shape[-1] / 2:]
            ], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                # keys for query
                mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
                mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                             ahdim, "domain")

                _, _, cost, tgtdcost, tpred_tag, _ = decoder.forward(
                    tgt_seq, target_inputs, tgt_mask, mapped_keys, src_mask,
                    annotation, initial_state, mapped_domain_keys,
                    domain_annotation, tag_seq, keep_prob)

        lamb = theano.shared(numpy.asarray(option["lambda"], dtype), "lambda")
        # cwscost *= lamb
        final_cost = cost + dcost + tgtdcost - lamb * adv_scost

        tag_inputs = [src_seq, src_mask]
        tag_outputs = [dpred_tag, spred_tag]
        tag_predict = theano.function(tag_inputs, tag_outputs)
        self.tag_predict = tag_predict

        tgt_tag_inputs = [src_seq, src_mask, tgt_seq, tgt_mask]
        tgt_tag_outputs = [tpred_tag]
        tgt_tag_predict = theano.function(tgt_tag_inputs, tgt_tag_outputs)
        self.tgt_tag_predict = tgt_tag_predict

        training_inputs = [src_seq, src_mask, tgt_seq, tgt_mask, tag_seq]
        training_outputs = [cost, dcost, adv_scost, tgtdcost]

        self.cost_cla = scost
        self.inputs_cla = [src_seq, src_mask, tag_seq]
        self.outputs_cla = [scost]

        # decoding graph
        with ops.variable_scope(scope, reuse=True):
            prev_words = T.ivector("prev_words")

            # disable dropout
            source_inputs = nn.embedding_lookup(source_embedding, src_seq)
            source_inputs = source_inputs + source_bias

            states, r_states = encoder.forward(source_inputs, src_mask)
            annotation = T.concatenate([states, r_states], 2)

            with ops.variable_scope("Specific"):
                domain_alpha = domain_sensitive_attention(
                    annotation, src_mask, shdim * 2, domaindim)
                # domain_alpha = attention(r_states[0], annotation, nsrc_mask,
                #                          shdim,
                #                          shdim * 2)
                domain_context = T.sum(annotation * domain_alpha[:, :, None],
                                       0)
                dfeature = nn.feedforward(domain_context, [shdim * 2, feadim],
                                          True,
                                          activation=T.tanh,
                                          scope="feature1")

            share_alpha = domain_sensitive_attention(annotation, src_mask,
                                                     shdim * 2, domaindim)
            # share_alpha = attention(r_states[0], annotation, nsrc_mask,
            #                         shdim,
            #                         shdim * 2)
            share_context = T.sum(annotation * share_alpha[:, :, None], 0)
            sfeature = nn.feedforward(share_context, [shdim * 2, feadim],
                                      True,
                                      activation=T.tanh,
                                      scope="feature1")

            domain_gate = nn.feedforward([dfeature, annotation],
                                         [[feadim, shdim * 2], shdim * 2],
                                         True,
                                         scope="domain_gate")
            domain_annotation = annotation * domain_gate
            share_gate = nn.feedforward([sfeature, annotation],
                                        [[feadim, shdim * 2], shdim * 2],
                                        True,
                                        scope="share_gate")
            annotation = annotation * share_gate

            # decoder
            final_state = T.concatenate([
                annotation[0, :, annotation.shape[-1] / 2:],
                domain_annotation[0, :, annotation.shape[-1] / 2:]
            ], -1)
            with ops.variable_scope(decoder_scope):
                initial_state = nn.feedforward(final_state, [shdim * 2, thdim],
                                               True,
                                               scope="initial",
                                               activation=T.tanh)
                mapped_keys = map_key(annotation, 2 * shdim, ahdim, "semantic")
                mapped_domain_keys = map_key(domain_annotation, 2 * shdim,
                                             ahdim, "domain")

            prev_inputs = nn.embedding_lookup(target_embedding, prev_words)
            # prev_inputs = prev_inputs + target_bias

            cond = T.neq(prev_words, 0)
            # zeros out embedding if y is 0, which indicates <s>
            prev_inputs = prev_inputs * cond[:, None]

            with ops.variable_scope(decoder_scope):
                mask = T.ones_like(prev_words, dtype=dtype)
                next_state, context = decoder.step(prev_inputs, mask,
                                                   initial_state, mapped_keys,
                                                   annotation, src_mask,
                                                   mapped_domain_keys,
                                                   domain_annotation)
                if option["decoder"] == "GruSimple":
                    probs = decoder.prediction(prev_inputs, initial_state,
                                               context)
                elif option["decoder"] == "GruCond":
                    probs = decoder.prediction(prev_inputs, next_state,
                                               context)

        # encoding
        encoding_inputs = [src_seq, src_mask]
        encoding_outputs = [
            annotation, initial_state, mapped_keys, mapped_domain_keys,
            domain_annotation
        ]
        encode = theano.function(encoding_inputs, encoding_outputs)

        if option["decoder"] == "GruSimple":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask
            ]
            prediction_outputs = [probs, context]
            predict = theano.function(prediction_inputs, prediction_outputs)

            generation_inputs = [prev_words, initial_state, context]
            generation_outputs = next_state
            generate = theano.function(generation_inputs, generation_outputs)

            self.predict = predict
            self.generate = generate
        elif option["decoder"] == "GruCond":
            prediction_inputs = [
                prev_words, initial_state, annotation, mapped_keys, src_mask,
                mapped_domain_keys, domain_annotation
            ]
            prediction_outputs = [probs, next_state]
            predict = theano.function(prediction_inputs, prediction_outputs)
            self.predict = predict

        self.cost = final_cost
        self.inputs = training_inputs
        self.outputs = training_outputs
        self.updates = []
        # self.align = align
        # self.sample = sample
        self.encode = encode
        # self.get_snt_cost = get_snt_cost
        self.option = option