Example #1
    def call(self, inputs, **kwargs):
        """Following the routing algorithm from Hinton's paper,
        but replace b = b + <u,v> with b = <u,v>.

        This change can improve the feature representation of the capsule.

        However, you can replace
            b = K.batch_dot(outputs, hat_inputs, [2, 3])
        with
            b += K.batch_dot(outputs, hat_inputs, [2, 3])
        to get standard routing.
        """

        if self.share_weights:
            hat_inputs = K.conv1d(inputs, self.kernel)
        else:
            hat_inputs = K.local_conv1d(inputs, self.kernel, [1], [1])

        batch_size = K.shape(inputs)[0]
        input_num_capsule = K.shape(inputs)[1]
        hat_inputs = K.reshape(hat_inputs,
                               (batch_size, input_num_capsule,
                                self.num_capsule, self.dim_capsule))
        hat_inputs = K.permute_dimensions(hat_inputs, (0, 2, 1, 3))

        b = K.zeros_like(hat_inputs[:, :, :, 0])
        for i in range(self.routings):
            c = K.softmax(b, 1)
            o = self.activation(K.batch_dot(c, hat_inputs, [2, 2]))
            if i < self.routings - 1:
                b = K.batch_dot(o, hat_inputs, [2, 3])
                if K.backend() == 'theano':
                    o = K.sum(o, axis=1)
        return o
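A quick numpy sketch (not part of the original layer) of what the two batch_dot calls in the routing loop compute, following the shape comments in Example #17 below; einsum serves as the reference:

import numpy as np

batch, num_caps, in_caps, dim_caps = 2, 10, 6, 16
c = np.random.rand(batch, num_caps, in_caps)               # coupling coefficients
hat = np.random.rand(batch, num_caps, in_caps, dim_caps)   # prediction vectors

# K.batch_dot(c, hat_inputs, [2, 2]) contracts the input-capsule axis:
o = np.einsum('bni,bnid->bnd', c, hat)   # (batch, num_caps, dim_caps)

# K.batch_dot(o, hat_inputs, [2, 3]) produces the new routing logits:
b = np.einsum('bnd,bnid->bni', o, hat)   # (batch, num_caps, in_caps)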
Example #2
 def call(self, input):
     for i in range(self.num_layer):
         if i == 0:
             cross = Lambda(lambda x: Add()([
                 K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), x),
                       1, keepdims=True),
                 self.bias[i], x]))(input)
         else:
             cross = Lambda(lambda x: Add()([
                 K.sum(self.W[i] * K.batch_dot(K.reshape(x, (-1, self.input_dim, 1)), input),
                       1, keepdims=True),
                 self.bias[i], input]))(cross)
     return Flatten()(cross)
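Each iteration appears to implement one step of the Deep & Cross Network's cross layer, x_{l+1} = x0 xl^T w + b + xl. A numpy sketch under that reading (sizes and names are illustrative, not from the project):

import numpy as np

batch, dim = 4, 8
x0 = np.random.rand(batch, dim)      # original input features
xl = np.random.rand(batch, dim)      # output of the previous cross layer
w = np.random.rand(dim)              # layer weight vector
bias = np.zeros(dim)                 # layer bias

outer = np.einsum('bi,bj->bij', x0, xl)   # x0 xl^T, shape (batch, dim, dim)
x_next = outer @ w + bias + xl            # (batch, dim)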
Example #3
def self_attn_block(inp, n_c, squeeze_factor=8):
    """ GAN Self Attention Block
    Code borrows from https://github.com/taki0112/Self-Attention-GAN-Tensorflow
    """
    msg = "Input channels must be >= {}, recieved nc={}".format(squeeze_factor, n_c)
    assert n_c // squeeze_factor > 0, msg
    var_x = inp
    shape_x = var_x.get_shape().as_list()

    var_f = Conv2D(n_c // squeeze_factor, 1,
                   kernel_regularizer=regularizers.l2(GAN22_REGULARIZER))(var_x)
    var_g = Conv2D(n_c // squeeze_factor, 1,
                   kernel_regularizer=regularizers.l2(GAN22_REGULARIZER))(var_x)
    var_h = Conv2D(n_c, 1, kernel_regularizer=regularizers.l2(GAN22_REGULARIZER))(var_x)

    shape_f = var_f.get_shape().as_list()
    shape_g = var_g.get_shape().as_list()
    shape_h = var_h.get_shape().as_list()
    flat_f = Reshape((-1, shape_f[-1]))(var_f)
    flat_g = Reshape((-1, shape_g[-1]))(var_g)
    flat_h = Reshape((-1, shape_h[-1]))(var_h)

    var_s = Lambda(lambda var_x: K.batch_dot(var_x[0],
                                             Permute((2, 1))(var_x[1])))([flat_g, flat_f])

    beta = Softmax(axis=-1)(var_s)
    var_o = Lambda(lambda var_x: K.batch_dot(var_x[0], var_x[1]))([beta, flat_h])
    var_o = Reshape(shape_x[1:])(var_o)
    var_o = Scale()(var_o)

    out = add([var_o, inp])
    return out
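For reference, a small numpy sketch (assumed sizes, single image) of the attention map built from flat_f, flat_g and flat_h above:

import numpy as np

hw, n_c, n_sq = 16, 32, 4       # h*w positions, channels, squeezed channels
flat_f = np.random.rand(hw, n_sq)
flat_g = np.random.rand(hw, n_sq)
flat_h = np.random.rand(hw, n_c)

s = flat_g @ flat_f.T                        # (hw, hw) attention energies
beta = np.exp(s - s.max(-1, keepdims=True))
beta /= beta.sum(-1, keepdims=True)          # softmax over the last axis
o = beta @ flat_h                            # (hw, n_c) attended features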
Example #4
    def simple_context(X, mask, n=activation_rnn_size):
        """Reduce the input just to its headline part (second half).

        For each word in this part it concatenate the output of the previous layer (RNN)
        with a weighted average of the outputs of the description part.
        In this only the last `rnn_size - activation_rnn_size` are used from each output.
        The first `activation_rnn_size` output is used to computer the weights for the averaging.
        """
        desc, head = X[:, :maxlend, :], X[:, maxlend:, :]
        head_activations, head_words = head[:, :, :n], head[:, :, n:]
        desc_activations, desc_words = desc[:, :, :n], desc[:, :, n:]

        # RTFM http://deeplearning.net/software/theano/library/tensor/basic.html#theano.tensor.batched_tensordot
        # activation for every head word and every desc word
        activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2, 2))
        # make sure we dont use description words that are masked out
        activation_energies = activation_energies + -1e20 * K.expand_dims(
            1. - K.cast(mask[:, :maxlend], 'float32'), 1)

        # for every head word compute weights for every desc word
        activation_energies = K.reshape(activation_energies, (-1, maxlend))
        activation_weights = K.softmax(activation_energies)
        activation_weights = K.reshape(activation_weights, (-1, maxlenh, maxlend))

        # for every head word compute weighted average of desc words
        desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2, 1))
        return K.concatenate((desc_avg_word, head_words))
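The axes=(2, 2) call above contracts the shared activation axis; a shape sketch with illustrative sizes:

import numpy as np

batch, maxlenh, maxlend, n = 2, 5, 7, 3
head_activations = np.random.rand(batch, maxlenh, n)
desc_activations = np.random.rand(batch, maxlend, n)

# K.batch_dot(head_activations, desc_activations, axes=(2, 2)):
energies = np.einsum('bhn,bdn->bhd', head_activations, desc_activations)
print(energies.shape)   # (2, 5, 7): one energy per (head word, desc word) pair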
Example #5
    def call(self, x):
        assert isinstance(x, list)
        inp_a, inp_b = x

        outp_a = K.l2_normalize(inp_a, -1)
        outp_b = K.l2_normalize(inp_b, -1)
        alpha = K.batch_dot(outp_b, outp_a, axes=[2, 2])
        alpha = K.l2_normalize(alpha, 1)
        alpha = K.one_hot(K.argmax(alpha, 1), K.int_shape(inp_a)[1])
        hmax = K.batch_dot(alpha, outp_b, axes=[1, 1])
        kcon = K.eye(K.int_shape(inp_a)[1], dtype='float32')

        m = []
        for i in range(self.output_dim):
            outp_a = inp_a * self.W[i]
            outp_hmax = hmax * self.W[i]
            outp_a = K.l2_normalize(outp_a, -1)
            outp_hmax = K.l2_normalize(outp_hmax, -1)
            outp = K.batch_dot(outp_hmax, outp_a, axes=[2, 2])
            outp = K.sum(outp * kcon, -1, keepdims=True)
            m.append(outp)
        if self.output_dim > 1:
            persp = K.concatenate(m, 2)
        else:
            persp = m[0]  # a single perspective: unwrap the one-element list
        return [persp, persp]
Example #6
File: dqn.py Project: noe/keras-rl
        def A_network_output(x):
            # The input of this layer is [L, mu, a] in concatenated form. We first split
            # those up.
            idx = 0
            L_flat = x[:, idx:idx + (self.nb_actions * self.nb_actions + self.nb_actions) // 2]
            idx += (self.nb_actions * self.nb_actions + self.nb_actions) // 2
            mu = x[:, idx:idx + self.nb_actions]
            idx += self.nb_actions
            a = x[:, idx:idx + self.nb_actions]
            idx += self.nb_actions

            # Create L and L^T matrix, which we use to construct the positive-definite matrix P.
            Ls = []
            LTs = []
            for idx in range(self.batch_size):
                L = K.zeros((self.nb_actions, self.nb_actions))
                L = T.set_subtensor(L[np.tril_indices(self.nb_actions)], L_flat[idx, :])
                diag = K.exp(T.diag(L))
                L = T.set_subtensor(L[np.diag_indices(self.nb_actions)], diag)
                Ls.append(L)
                LTs.append(K.transpose(L))
                # TODO: diagonal elements exp
            L = K.pack(Ls)  # K.pack was renamed K.stack in later Keras versions
            LT = K.pack(LTs)
            P = K.batch_dot(L, LT, axes=(1, 2))
            assert K.ndim(P) == 3

            # Combine a, mu and P into a scalar (over the batches).
            A = -.5 * K.batch_dot(K.batch_dot(a - mu, P, axes=(1, 2)), a - mu, axes=1)
            assert K.ndim(A) == 2
            return A
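A numpy check (illustrative sizes) of the quadratic form computed at the end, A = -0.5 (a - mu)^T P (a - mu) with P = L L^T built from a lower-triangular L:

import numpy as np

nb_actions = 3
L = np.tril(np.random.rand(nb_actions, nb_actions))
np.fill_diagonal(L, np.exp(np.diag(L)))    # exponentiated diagonal, as in the code
P = L @ L.T                                # positive definite by construction

a = np.random.rand(nb_actions)
mu = np.random.rand(nb_actions)
A = -0.5 * (a - mu) @ P @ (a - mu)
print(A)    # a scalar advantage, always <= 0 since P is positive definite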
Example #7
 def call(self, x):
     #If only Q_seq, K_seq, V_seq are passed in, no mask is applied
     #If Q_seq, K_seq, V_seq, Q_len, V_len are all passed in, the padding parts are masked
     if len(x) == 3:
         Q_seq,K_seq,V_seq = x
         Q_len,V_len = None,None
     elif len(x) == 5:
         Q_seq,K_seq,V_seq,Q_len,V_len = x
     #Apply linear transformations to Q, K and V
     Q_seq = K.dot(Q_seq, self.WQ)
     Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
     Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
     K_seq = K.dot(K_seq, self.WK)
     K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
     K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
     V_seq = K.dot(V_seq, self.WV)
     V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
     V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
     #Compute the dot products, then mask, then softmax
     A = K.batch_dot(Q_seq, K_seq, axes=[3,3])
     A = K.permute_dimensions(A, (0,3,2,1))
     A = self.Mask(A, V_len, 'add')
     A = K.permute_dimensions(A, (0,3,2,1))    
     A = K.softmax(A)
     #Compute the output and mask it
     O_seq = K.batch_dot(A, V_seq, axes=[3,2])
     O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
     O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
     O_seq = self.Mask(O_seq, Q_len, 'mul')
     return O_seq
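A numpy sketch of the per-head attention above (sizes assumed; the leading batch and head axes are treated as batch dimensions, following the reshape/permute layout in the code):

import numpy as np

batch, heads, len_q, len_k, d_head = 2, 4, 5, 6, 8
Q = np.random.rand(batch, heads, len_q, d_head)
Kt = np.random.rand(batch, heads, len_k, d_head)
V = np.random.rand(batch, heads, len_k, d_head)

A = np.einsum('bhqd,bhkd->bhqk', Q, Kt)     # K.batch_dot(Q_seq, K_seq, axes=[3, 3])
A = np.exp(A - A.max(-1, keepdims=True))
A = A / A.sum(-1, keepdims=True)            # softmax over the key positions
O = np.einsum('bhqk,bhkd->bhqd', A, V)      # K.batch_dot(A, V_seq, axes=[3, 2])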
Example #8
    def call(self, x, mask=None):
        q, k, v = x
        d_k = q.shape.as_list()[2]

        # in pure tensorflow:
        # weights = tf.matmul(x_batch, tf.transpose(y_batch, perm=[0, 2, 1]))
        # normalized_weights = tf.nn.softmax(weights/scaling)
        # output = tf.matmul(normalized_weights, x_batch)
        
        weights = K.batch_dot(q,  k, axes=[2, 2])

        if mask is not None:
            # add mask weights
            if isinstance(mask, (list, tuple)):
                if len(mask) != 1:
                    raise ValueError("mask can only be a Tensor or a list of length 1 containing a tensor.")

                mask = mask[0]

            weights += -1e10*(1-mask)

        normalized_weights = K.softmax(weights / np.sqrt(d_k))
        output = K.batch_dot(normalized_weights, v)
        
        if self._return_attention:
            return [output, normalized_weights]
        else:
            return output
Example #9
def recurrence(y_i, h):
    h_permute = K.permute_dimensions(h, [0, 2, 1])  # (batch_size, encoding_dim, input_length)
    e = K.l2_normalize(
        K.batch_dot(h_permute, s, axes=1),  # (batch_size, input_length)
        axis=1)  # (batch_size, input_length)

    # eqn 6
    alpha = K.softmax(e)  # (batch_size, input_length)

    # eqn 5
    c = K.batch_dot(h, alpha, axes=1)  # (batch_size, encoding_dim)

    recurrence_result = K.expand_dims(
        K.concatenate([c, y_i], axis=1),
        axis=1)  # (batch_size, 1, 2 * encoding_dim)

    expanded_h = Input(shape=(1, 2 * encoding_dim),
                       name='expanded_h')
    gru = Sequential([
        GRU(output_dim,
            return_sequences=False,
            input_shape=(1, 2 * encoding_dim))
    ])
    model = Model(inputs=[expanded_h],
                  outputs=[gru(expanded_h)])  # (batch_size, 1, output_dim)
    return model(recurrence_result)
Example #10
def semantic_matrix(argv):
	assert len(argv) == 2
	q = argv[0]
	a = argv[1]
	q_sqrt = K.sqrt((q ** 2).sum(axis=2, keepdims=True))
	a_sqrt = K.sqrt((a ** 2).sum(axis=2, keepdims=True))
	denominator = K.batch_dot(q_sqrt, K.permute_dimensions(a_sqrt, [0,2,1]))
	return K.batch_dot(q, K.permute_dimensions(a, [0,2,1])) / (denominator + SAFE_EPSILON)
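An equivalent numpy formulation (illustrative sizes; SAFE_EPSILON is defined elsewhere in the project, a small constant is assumed here):

import numpy as np

SAFE_EPSILON = 1e-8   # assumed value
batch, len_q, len_a, dim = 2, 4, 5, 8
q = np.random.rand(batch, len_q, dim)
a = np.random.rand(batch, len_a, dim)

q_sqrt = np.sqrt((q ** 2).sum(axis=2, keepdims=True))   # (batch, len_q, 1)
a_sqrt = np.sqrt((a ** 2).sum(axis=2, keepdims=True))   # (batch, len_a, 1)
denominator = q_sqrt @ a_sqrt.transpose(0, 2, 1)        # (batch, len_q, len_a)
numerator = q @ a.transpose(0, 2, 1)                    # pairwise dot products
sim = numerator / (denominator + SAFE_EPSILON)          # cosine similarity matrix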
Example #11
    def call(self, x, mask=None):
        stride_row, stride_col = self.subsample
        _, feature_dim, nb_filter = self.W_shape

        if self.dim_ordering == 'th':
            if K._backend == 'theano':
                output = []
                for i in range(self.output_row):
                    for j in range(self.output_col):
                        slice_row = slice(i * stride_row,
                                          i * stride_row + self.nb_row)
                        slice_col = slice(j * stride_col,
                                          j * stride_col + self.nb_col)
                        x_flatten = K.reshape(x[:, :, slice_row, slice_col], (1, -1, feature_dim))
                        output.append(K.dot(x_flatten, self.W[i * self.output_col + j, :, :]))
                output = K.concatenate(output, axis=0)
            else:
                xs = []
                for i in range(self.output_row):
                    for j in range(self.output_col):
                        slice_row = slice(i * stride_row,
                                          i * stride_row + self.nb_row)
                        slice_col = slice(j * stride_col,
                                          j * stride_col + self.nb_col)
                        xs.append(K.reshape(x[:, :, slice_row, slice_col], (1, -1, feature_dim)))
                x_aggregate = K.concatenate(xs, axis=0)
                output = K.batch_dot(x_aggregate, self.W)
            output = K.reshape(output, (self.output_row, self.output_col, -1, nb_filter))
            output = K.permute_dimensions(output, (2, 3, 0, 1))
        elif self.dim_ordering == 'tf':
            xs = []
            for i in range(self.output_row):
                for j in range(self.output_col):
                    slice_row = slice(i * stride_row,
                                      i * stride_row + self.nb_row)
                    slice_col = slice(j * stride_col,
                                      j * stride_col + self.nb_col)
                    xs.append(K.reshape(x[:, slice_row, slice_col, :], (1, -1, feature_dim)))
            x_aggregate = K.concatenate(xs, axis=0)
            output = K.batch_dot(x_aggregate, self.W)
            output = K.reshape(output, (self.output_row, self.output_col, -1, nb_filter))
            output = K.permute_dimensions(output, (2, 0, 1, 3))
        else:
            raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

        if self.bias:
            if self.dim_ordering == 'th':
                output += K.reshape(self.b, (1, nb_filter, self.output_row, self.output_col))
            elif self.dim_ordering == 'tf':
                output += K.reshape(self.b, (1, self.output_row, self.output_col, nb_filter))
            else:
                raise Exception('Invalid dim_ordering: ' + self.dim_ordering)

        output = self.activation(output)
        return output
Example #12
    def call(self, inputs, mask=None):
        if type(inputs) is not list or len(inputs) <= 1:
            raise Exception('Merge must be called on a list of tensors '
                            '(at least 2). Got: ' + str(inputs))
        # case: "mode" is a lambda or function.
        if hasattr(self.mode, '__call__'):
            # TODO: consider making it possible to
            # pass custom arguments to lambda.
            arguments = {}
            return self.mode(inputs, **arguments)

        if self.mode == 'sum' or self.mode == 'ave':
            s = inputs[0]
            for i in range(1, len(inputs)):
                s += inputs[i]
            if self.mode == 'ave':
                s /= len(inputs)
            return s

        elif self.mode == 'concat':
            return K.concatenate(inputs, axis=self.concat_axis)

        elif self.mode == 'mul':
            s = inputs[0]
            for i in range(1, len(inputs)):
                s *= inputs[i]
            return s

        elif self.mode == 'dot':
            l1 = inputs[0]
            l2 = inputs[1]
            output = K.batch_dot(l1, l2, self.dot_axes)
            return output

        elif self.mode == 'cos':
            l1 = inputs[0]
            l2 = inputs[1]
            denominator = K.sqrt(K.batch_dot(l1, l1, self.dot_axes) *
                                 K.batch_dot(l2, l2, self.dot_axes))
            output = K.batch_dot(l1, l2, self.dot_axes) / denominator
            output = K.expand_dims(output, 1)
            return output

        elif self.mode == 'abs':
            s = inputs[0] * inputs[0]
            for i in range(1, len(inputs)):
                s += inputs[i] * inputs[i]
            return K.sqrt(s)

        elif self.mode == 'atan2':
            return T.tensor.arctan2(inputs[1], inputs[0])

        else:
            raise Exception('Unknown merge mode.')
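On 2-D inputs the 'dot' and 'cos' branches reduce to per-sample dot products and cosine similarities; a small numpy sketch (shapes assumed):

import numpy as np

l1 = np.random.rand(4, 8)
l2 = np.random.rand(4, 8)
dot = (l1 * l2).sum(axis=1, keepdims=True)   # K.batch_dot(l1, l2, axes=1) -> (batch, 1)
denominator = np.sqrt((l1 * l1).sum(1, keepdims=True) * (l2 * l2).sum(1, keepdims=True))
cos = dot / denominator                      # per-sample cosine similarity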
Example #13
def euclidDist( inputs ):
	assert len( inputs ) == 2, "euclidDist requires 2 inputs"
	l1 = inputs[ 0 ]
	l2 = inputs[ 1 ]
	x = l1 - l2
	output = K.batch_dot( x, x, axes = 1 )  # squared Euclidean distance, shape (batch, 1)
	return output
Example #14
 def call(self, x, mask = None):
     tupleEmbed    = self.tupleEmbed
     relationEmbed = self.relationEmbed
     nb = x.shape[0]
     entity_embeddings   = tupleEmbed[x[:, 0], x[:, 2]]
     relation_embeddings = relationEmbed[x[:, 1]]
     dot_prod = K.batch_dot(entity_embeddings, relation_embeddings, axes = 1)
     return self.activation(dot_prod)
Example #15
    def step(self, x, states):

        ytm, stm = states

        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(stm, self.timesteps)

        # now multiply the weight matrix with the repeated hidden state
        _Wxstm = K.dot(_stm, self.W_a)

        # calculate the attention probabilities
        # this relates how much other timesteps contributed to this one.
        et = K.dot(activations.tanh(_Wxstm + self._uxpb),
                   K.expand_dims(self.V_a))
        at = K.exp(et)
        at_sum = K.sum(at, axis=1)
        at_sum_repeated = K.repeat(at_sum, self.timesteps)
        at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)
        # ~~~> calculate new hidden state
        # first calculate the "r" gate:

        rt = activations.sigmoid(
            K.dot(ytm, self.W_r)
            + K.dot(stm, self.U_r)
            + K.dot(context, self.C_r)
            + self.b_r)

        # now calculate the "z" gate
        zt = activations.sigmoid(
            K.dot(ytm, self.W_z)
            + K.dot(stm, self.U_z)
            + K.dot(context, self.C_z)
            + self.b_z)

        # calculate the proposal hidden state:
        s_tp = activations.tanh(
            K.dot(ytm, self.W_p)
            + K.dot((rt * stm), self.U_p)
            + K.dot(context, self.C_p)
            + self.b_p)

        # new hidden state:
        st = (1-zt)*stm + zt * s_tp

        yt = activations.softmax(
            K.dot(ytm, self.W_o)
            + K.dot(stm, self.U_o)
            + K.dot(context, self.C_o)
            + self.b_o)

        if self.return_probabilities:
            return at, [yt, st]
        else:
            return yt, [yt, st]
Example #16
    def get_similarity(self):
        ''' Specify similarity in configuration under 'similarity_params' -> 'mode'
        If a parameter is needed for the model, specify it in 'similarity_params'

        Example configuration:

        config = {
            ... other parameters ...
            'similarity_params': {
                'mode': 'gesd',
                'gamma': 1,
                'c': 1,
            }
        }

        cosine: dot(a, b) / sqrt(dot(a, a) * dot(b, b))
        polynomial: (gamma * dot(a, b) + c) ^ d
        sigmoid: tanh(gamma * dot(a, b) + c)
        rbf: exp(-gamma * l2_norm(a-b) ^ 2)
        euclidean: 1 / (1 + l2_norm(a - b))
        exponential: exp(-gamma * l2_norm(a - b))
        gesd: euclidean * sigmoid
        aesd: (euclidean + sigmoid) / 2
        '''

        params = self.similarity_params
        similarity = params['mode']

        axis = lambda a: len(a._keras_shape) - 1
        dot = lambda a, b: K.batch_dot(a, b, axes=axis(a))
        l2_norm = lambda a, b: K.sqrt(K.sum((a - b) ** 2, axis=axis(a), keepdims=True))
        l1_norm = lambda a, b: K.sum(K.abs(a - b), axis=axis(a), keepdims=True)

        if similarity == 'cosine':
            return lambda x: dot(x[0], x[1]) / K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1]))
        elif similarity == 'polynomial':
            return lambda x: (params['gamma'] * dot(x[0], x[1]) + params['c']) ** params['d']
        elif similarity == 'sigmoid':
            return lambda x: K.tanh(params['gamma'] * dot(x[0], x[1]) + params['c'])
        elif similarity == 'rbf':
            return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]) ** 2)
        elif similarity == 'euclidean':
            return lambda x: 1 / (1 + l2_norm(x[0], x[1]))
        elif similarity == 'l1':
            return lambda x: -l1_norm(x[0], x[1])
        elif similarity == 'exponential':
            return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]))
        elif similarity == 'gesd':
            euclidean = lambda x: 1 / (1 + l2_norm(x[0], x[1]))
            sigmoid = lambda x: 1 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c'])))
            return lambda x: euclidean(x) * sigmoid(x)
        elif similarity == 'aesd':
            euclidean = lambda x: 0.5 / (1 + l2_norm(x[0], x[1]))
            sigmoid = lambda x: 0.5 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c'])))
            return lambda x: euclidean(x) + sigmoid(x)
        else:
            raise Exception('Invalid similarity: {}'.format(similarity))
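A numpy check of the 'gesd' branch (illustrative vectors; gamma=1, c=1 as in the example configuration):

import numpy as np

a, b = np.random.rand(8), np.random.rand(8)
gamma, c = 1.0, 1.0
euclidean = 1.0 / (1.0 + np.linalg.norm(a - b))
sigmoid = 1.0 / (1.0 + np.exp(-gamma * (a @ b + c)))
gesd = euclidean * sigmoid   # in (0, 1); large when a and b are close and aligned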
Example #17
    def call(self, inputs, training=None):
        # inputs.shape=[None, input_num_capsule, input_dim_capsule]
        # inputs_expand.shape=[None, 1, input_num_capsule, input_dim_capsule]
        inputs_expand = K.expand_dims(inputs, 1)

        # Replicate num_capsule dimension to prepare being multiplied by W
        # inputs_tiled.shape=[None, num_capsule, input_num_capsule, input_dim_capsule]
        inputs_tiled = K.tile(inputs_expand, [1, self.num_capsule, 1, 1])

        # Compute `inputs * W` by scanning inputs_tiled on dimension 0.
        # x.shape=[num_capsule, input_num_capsule, input_dim_capsule]
        # W.shape=[num_capsule, input_num_capsule, dim_capsule, input_dim_capsule]
        # Regard the first two dimensions as `batch` dimension,
        # then matmul: [input_dim_capsule] x [dim_capsule, input_dim_capsule]^T -> [dim_capsule].
        # inputs_hat.shape = [None, num_capsule, input_num_capsule, dim_capsule]
        inputs_hat = K.map_fn(lambda x: K.batch_dot(x, self.W, [2, 3]), elems=inputs_tiled)

        # Begin: Routing algorithm ---------------------------------------------------------------------#
        # The prior for coupling coefficient, initialized as zeros.
        # b.shape = [None, self.num_capsule, self.input_num_capsule].
        b = tf.zeros(shape=[K.shape(inputs_hat)[0], self.num_capsule, self.input_num_capsule])

        assert self.routings > 0, 'The routings should be > 0.'
        for i in range(self.routings):
            # c.shape=[batch_size, num_capsule, input_num_capsule]
            c = tf.nn.softmax(b, dim=1)

            # c.shape =  [batch_size, num_capsule, input_num_capsule]
            # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
            # Treating the first two dimensions as the `batch` dimension,
            # then matmul: [input_num_capsule] x [input_num_capsule, dim_capsule] -> [dim_capsule].
            # outputs.shape=[None, num_capsule, dim_capsule]
            outputs = squash(K.batch_dot(c, inputs_hat, [2, 2]))  # [None, 10, 16]

            if i < self.routings - 1:
                # outputs.shape =  [None, num_capsule, dim_capsule]
                # inputs_hat.shape=[None, num_capsule, input_num_capsule, dim_capsule]
                # Treating the first two dimensions as the `batch` dimension,
                # then matmul: [dim_capsule] x [input_num_capsule, dim_capsule]^T -> [input_num_capsule].
                # b.shape=[batch_size, num_capsule, input_num_capsule]
                b += K.batch_dot(outputs, inputs_hat, [2, 3])
        # End: Routing algorithm -----------------------------------------------------------------------#

        return outputs
Example #18
 def _normalize_attention(attmat):
     att = attmat[0]
     mat = attmat[1]
     if transpose:
         att = K.permute_dimensions(att,(0, 2, 1))
     # 3d softmax
     e = K.exp(att - K.max(att, axis=-1, keepdims=True))
     s = K.sum(e, axis=-1, keepdims=True)
     sm_att = e / s
     return K.batch_dot(sm_att, mat)
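The exp/sum pair above is a numerically stable softmax (subtracting the row maximum prevents overflow); an equivalent numpy snippet:

import numpy as np

att = np.random.randn(2, 3, 4)
e = np.exp(att - att.max(axis=-1, keepdims=True))
sm_att = e / e.sum(axis=-1, keepdims=True)   # rows sum to 1 along the last axis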
Example #19
    def call(self, x, mask = None):
        # x has shape nb x (2*context_size + 3): x[:,0] are the words,
        # x[:,1] is the context word, x[:,2] is the INDEX of the context word
        # in the context, and x[:,3:] is the context itself

        W_g = self.W_g
        W_s = self.W_s
        nb = x.shape[0]
        context_length = self.input_dim - 3
        actual_word_indx = (self.input_dim+3)//2 # same as context_size + 3
        right_senses,ignore_updates = theano.scan(disambiguate, sequences = x[:,3:], non_sequences = [context_length, W_g, W_s])
        words_sense_vector = W_s[x[:,0], right_senses[:,actual_word_indx]]
        a, ignore_updates = theano.scan(get_sense_vector, sequences = [x[:,0],right_senses,x[:,2]], non_sequences = [W_g, W_s])
        contexts_sense_vector = a
        sense_dot_prod = K.batch_dot(words_sense_vector, contexts_sense_vector, axes = 1)
        global_dot_prod = K.batch_dot(W_g[x[:,0]], W_g[x[:,1]], axes = 1)
        dot_prod = sense_dot_prod + global_dot_prod
        # return a[0]
        # return self.activation(K.log(K.sigmoid(dot_prod)))
        return self.activation(dot_prod)
Example #20
def cos_sim_matvec(values):
	mat, vec = values
	#mat = K.l2_normalize(mat, axis=-1)
	#vec = K.l2_normalize(vec, axis=-1)
	mat = myl2(mat, axis=-1)
	vec = myl2(vec, axis=-1)
	dodo =  K.batch_dot(mat,vec,axes=[2,1])
	#dodo = dodo.dimshuffle((0,1,'x'))
	#return T.extra_ops.repeat()
	return dodo
Example #21
    def _additive_similarity(self, source, query):
        concatenation = K.concatenate([source, query], axis=2)
        nonlinearity = K.tanh(K.dot(concatenation, self._weights["w_a"]))
        
        # tile the weight vector (1, 1, dim) for each time step and each element of the batch -> (bs, T, dim)
        source_shape = K.shape(source)
        vaeff = K.tile(K.expand_dims(self._weights["v_a"], 0), [source_shape[0], source_shape[1], 1])

        similarity = K.batch_dot(K.permute_dimensions(vaeff, [0, 2, 1]), nonlinearity, axes=[1, 2])
        
        return similarity
Example #22
 def call(self,inputs,**kwargs):
     if type(inputs) is list:
         assert len(inputs) == 2
         inputs,mask = inputs
     else:
         x = inputs
         # enlarge the range of values in x so that max(new_x) = 1 and the others < 0
         x = (x - K.max(x,1,True)) / K.epsilon() + 1
         mask = K.clip(x,self.clip_value[0],self.clip_value[1]) # clip values between 0 and 1
     masked_input = K.batch_dot(inputs, mask, [1,1])
     return masked_input
Example #23
 def _transform(self, X, affine_transformation, output_size):
     batch_size, num_channels = K.shape(X)[0], K.shape(X)[3]
     transformations = K.reshape(affine_transformation,
                                 shape=(batch_size, 2, 3))
     # transformations = K.cast(affine_transformation[:, 0:2, :], 'float32')
     regular_grids = self._make_regular_grids(batch_size, *output_size)
     sampled_grids = K.batch_dot(transformations, regular_grids)
     interpolated_image = self._interpolate(X, sampled_grids, output_size)
     new_shape = (batch_size, output_size[0], output_size[1], num_channels)
     interpolated_image = K.reshape(interpolated_image, new_shape)
     return interpolated_image
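A numpy sketch of the batched affine transform (assuming _make_regular_grids returns homogeneous (x, y, 1) coordinates of shape (batch, 3, H*W)):

import numpy as np

batch, n_points = 2, 6
transformations = np.random.rand(batch, 2, 3)     # one 2x3 affine matrix per sample
regular_grids = np.random.rand(batch, 3, n_points)

sampled_grids = transformations @ regular_grids   # K.batch_dot(transformations, regular_grids)
print(sampled_grids.shape)                        # (2, 2, 6): transformed (x, y) per point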
Example #24
    def call(self, xy, mask=None):
        if not isinstance(xy, list) or len(xy) != 2:
            raise Exception('Inner attention must be called on 2 inputs.'
                            ' Got: ' + str(xy))
        x, y = xy
        assert K.ndim(x) == 3, "x should be 3d (m x d), but got %d"%(K.ndim(x))
        assert K.ndim(y) == 3, "y should be 3d (n x d), but got %d"%(K.ndim(y))
        #assert d1 == d2, "x and y should be of same dimension, but dim(x)=%d, dim(y)=%d"%(d1, d2)

        z = K.batch_dot(x, y, axes=2)

        # Softmax
        e = K.exp(z - K.max(z, axis=-1, keepdims=True))
        s = K.sum(e, axis=-1, keepdims=True)
        z = e / s

        # z should be 10
        z = K.batch_dot(z, x, axes=1)

        return z
Example #25
def routing(u_hat_vecs, beta_a, iterations, output_capsule_num, i_activations):
    b = keras.backend.zeros_like(u_hat_vecs[:,:,:,0])
    if i_activations is not None:
        i_activations = i_activations[...,tf.newaxis]
    for i in range(iterations):
        if False:
            leak = tf.zeros_like(b, optimize=True)
            leak = tf.reduce_sum(leak, axis=1, keep_dims=True)
            leaky_logits = tf.concat([leak, b], axis=1)
            leaky_routing = tf.nn.softmax(leaky_logits, dim=1)        
            c = tf.split(leaky_routing, [1, output_capsule_num], axis=1)[1]
        else:
            c = softmax(b, 1)   
#        if i_activations is not None:
#            tf.transpose(tf.transpose(c, perm=[0,2,1]) * i_activations, perm=[0,2,1]) 
        outputs = squash_v1(K.batch_dot(c, u_hat_vecs, [2, 2]))
        if i < iterations - 1:
            b = b + K.batch_dot(outputs, u_hat_vecs, [2, 3])                                    
    poses = outputs 
    activations = K.sqrt(K.sum(K.square(poses), 2))
    return poses, activations
Example #26
    def call(self, X, mask=None):
        input_shape = self.input_spec[0].shape

        w = self.attention
        input_length = K.shape(X)[1]
        X = K.reshape(X, (-1, ) + input_shape[2:])  # (nb_samples * timesteps, ...)
        w = K.reshape(w, (-1, ) + input_shape[2:3])  # (nb_samples * timesteps, ...)
        y = K.batch_dot(w,X)
        # Not sure why this is not working, but I should use the layer
        #y = self.layer.call(w,X)
        # (nb_samples, timesteps, ...)
        y = K.reshape(y, (-1, input_length) + input_shape[3:])
        return y
Example #27
 def call(self, x):
     source, query = x
     
     similarity = self._similarity(source, query)
     expected_similarity_shape = [source.shape.as_list()[0], source.shape.as_list()[1], source.shape.as_list()[1]]
    
     if similarity.shape.as_list() != expected_similarity_shape:
         raise RuntimeError("The similarity function has returned a similarity with shape {0}, but expected {1}".format(similarity.shape.as_list()[:2], expected_similarity_shape))
     
     score = K.softmax(similarity)
     output = K.batch_dot(score, source, axes=[1, 1])
     
     return output
Example #28
 def _pairwise_distances(self, inputs: List[Tensor]) -> Tensor:
     emb_c, emb_r = inputs
     bs = K.shape(emb_c)[0]
     embeddings = K.concatenate([emb_c, emb_r], 0)
     dot_product = K.dot(embeddings, K.transpose(embeddings))
     square_norm = K.batch_dot(embeddings, embeddings, axes=1)
     distances = K.transpose(square_norm) - 2.0 * dot_product + square_norm
     distances = K.slice(distances, (0, bs), (bs, bs))
     distances = K.clip(distances, 0.0, None)
     mask = K.cast(K.equal(distances, 0.0), K.dtype(distances))
     distances = distances + mask * 1e-16
     distances = K.sqrt(distances)
     distances = distances * (1.0 - mask)
     return distances
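A numpy check of the expansion used above, ||x - y||^2 = ||x||^2 - 2 x.y + ||y||^2 (illustrative sizes):

import numpy as np

E = np.random.rand(5, 4)                           # 5 embeddings of dimension 4
dot_product = E @ E.T
square_norm = (E * E).sum(axis=1, keepdims=True)   # column of squared norms
distances = square_norm.T - 2.0 * dot_product + square_norm
reference = ((E[:, None, :] - E[None, :, :]) ** 2).sum(-1)
assert np.allclose(distances, reference)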
Example #29
    def call(self, inputs, **kwargs):
        # use true label to select target capsule, shape=[batch_size, num_capsule]
        if type(inputs) is list:  # true label is provided with shape = [batch_size, n_classes], i.e. one-hot code.
            assert len(inputs) == 2
            inputs, mask = inputs
        else:  # if no true label, mask by the max length of vectors of capsules
            x = inputs
            # Enlarge the range of values in x to make max(new_x)=1 and others < 0
            x = (x - K.max(x, 1, True)) / K.epsilon() + 1
            mask = K.clip(x, 0, 1)  # the max value in x clipped to 1 and other to 0

        # masked inputs, shape = [batch_size, dim_vector]
        inputs_masked = K.batch_dot(inputs, mask, [1, 1])
        return inputs_masked
Example #30
    def call(self, inputs, training=None):
        # inputs.shape=[None, input_num_capsule, input_dim_vector]
        # Expand dims to [None, input_num_capsule, 1, 1, input_dim_vector]
        inputs_expand = K.expand_dims(K.expand_dims(inputs, 2), 2)

        # Replicate num_capsule dimension to prepare being multiplied by W
        # Now it has shape = [None, input_num_capsule, num_capsule, 1, input_dim_vector]
        inputs_tiled = K.tile(inputs_expand, [1, 1, self.num_capsule, 1, 1])

        """  
        # Compute `inputs * W` by expanding the first dim of W. This is more time-consuming and needs batch_size.
        # Now W has shape  = [batch_size, input_num_capsule, num_capsule, input_dim_vector, dim_vector]
        w_tiled = K.tile(K.expand_dims(self.W, 0), [self.batch_size, 1, 1, 1, 1])
        
        # Transformed vectors, inputs_hat.shape = [None, input_num_capsule, num_capsule, 1, dim_vector]
        inputs_hat = K.batch_dot(inputs_tiled, w_tiled, [4, 3])
        """
        # Compute `inputs * W` by scanning inputs_tiled on dimension 0. This is faster but requires Tensorflow.
        # inputs_hat.shape = [None, input_num_capsule, num_capsule, 1, dim_vector]
        inputs_hat = tf.scan(lambda ac, x: K.batch_dot(x, self.W, [3, 2]),
                             elems=inputs_tiled,
                             initializer=K.zeros([self.input_num_capsule, self.num_capsule, 1, self.dim_vector]))
        """
        # Routing algorithm V1. Use tf.while_loop in a dynamic way.
        def body(i, b, outputs):
            c = tf.nn.softmax(self.bias, dim=2)  # dim=2 is the num_capsule dimension
            outputs = squash(K.sum(c * inputs_hat, 1, keepdims=True))
            b = b + K.sum(inputs_hat * outputs, -1, keepdims=True)
            return [i-1, b, outputs]

        cond = lambda i, b, inputs_hat: i > 0
        loop_vars = [K.constant(self.num_routing), self.bias, K.sum(inputs_hat, 1, keepdims=True)]
        _, _, outputs = tf.while_loop(cond, body, loop_vars)
        """
        # Routing algorithm V2. Use iteration. V2 and V1 both work without much difference on performance
        assert self.num_routing > 0, 'The num_routing should be > 0.'

        for i in range(self.num_routing):
            c = tf.nn.softmax(self.bias, dim=2)  # dim=2 is the num_capsule dimension
            # outputs.shape=[None, 1, num_capsule, 1, dim_vector]
            outputs = squash(K.sum(c * inputs_hat, 1, keepdims=True))

            # The last iteration need not update the bias, which will not be passed to the graph anyway.
            if i != self.num_routing - 1:
                # self.bias = K.update_add(self.bias, K.sum(inputs_hat * outputs, [0, -1], keepdims=True))
                self.bias += K.sum(inputs_hat * outputs, -1, keepdims=True)
            # tf.summary.histogram('BigBee', self.bias)  # for debugging
        return K.reshape(outputs, [-1, self.num_capsule, self.dim_vector])
Example #31
    def step_do(self, step_in, states):  # defines a single step of the iteration

        in_value = step_in
        if 0 < self.dropout < 1.:
            self._dropout_mask = K.in_train_phase(
                K.dropout(K.ones_like(step_in), self.dropout),
                K.ones_like(step_in))
            in_value = step_in * self._dropout_mask

        # hist = K.tanh(K.dot(states[0], self.rec_kernel))
        # hist = K.tanh(states[0])

        in_value = K.expand_dims(in_value, axis=-2)

        l_state = K.expand_dims(states[0], axis=-2)
        l_inp = K.concatenate([l_state, in_value], axis=-2)

        s_state = K.expand_dims(states[1], axis=-2)
        s_inp = K.concatenate([s_state, in_value], axis=-2)

        l_query = K.dot(l_inp, self.query_kernel)

        l_key = K.dot(l_inp, self.key_kernel)

        l_value = K.dot(l_inp, self.value_kernel)

        l_attention_prob = K.batch_dot(l_query, l_key, axes=[2, 2]) / np.sqrt(
            self.units)
        l_attention_prob = K.softmax(l_attention_prob)
        l_outputs = K.batch_dot(l_attention_prob, l_value)
        l_outputs = K.tanh(l_outputs)

        s_query = K.dot(s_inp, self.query_kernel)

        s_key = K.dot(s_inp, self.key_kernel)

        s_value = K.dot(s_inp, self.value_kernel)

        s_attention_prob = K.batch_dot(s_query, s_key, axes=[2, 2]) / np.sqrt(
            self.units)
        s_attention_prob = K.softmax(s_attention_prob)
        s_outputs = K.batch_dot(s_attention_prob, s_value)
        s_outputs = K.tanh(s_outputs)

        lt = K.expand_dims(l_outputs[:, 0], axis=-2)
        st = K.expand_dims(s_outputs[:, 1], axis=-2)
        outputs = K.concatenate([lt, st], axis=-2)

        query = K.dot(outputs, self.query_kernel)

        key = K.dot(outputs, self.key_kernel)

        value = K.dot(outputs, self.value_kernel)

        attention_prob = K.batch_dot(query, key, axes=[2, 2]) / np.sqrt(
            self.units)
        attention_prob = K.softmax(attention_prob)
        att_out = K.batch_dot(attention_prob, value, axes=[2, 1])

        # outputs = K.concatenate([l_outputs[:,0], s_outputs[:,1]], axis=-1)
        # outputs = 0.5*l_outputs[:,0] + 0.5*s_outputs[:,1]
        return att_out[:, 0], [att_out[:, 0], att_out[:, 1]]
Example #32
    def call(self, inputs):

        if not isinstance(inputs, list):
            raise ValueError('This layer should be called '
                             'on a list of 2/3 inputs.')

        if len(inputs) != 3 and len(inputs) != 2:
            raise ValueError('This layer should be called '
                             'on a list of 2/3 inputs.'
                             'Got ' + str(len(inputs)) + ' inputs.')

        # if len(inputs) != 1:
        #     raise ValueError('This layer should be called '
        #                      'on only 1 input.'
        #                      'Got ' + str(len(input)) + ' inputs.')
        input_real = inputs[0]
        input_imag = inputs[1]

        ndims = len(inputs[0].shape)
        if self.average_weights:
            output_real = K.mean(input_real, axis=ndims - 2, keepdims=False)
            output_imag = K.mean(input_imag, axis=ndims - 2, keepdims=False)
        else:
            #For embedding layer inputs[2] is (None, embedding_dim,1)
            #For test inputs[2] is (None, embedding_dim)
            if len(inputs[2].shape) == ndims - 1:
                weight = K.expand_dims(inputs[2])
            else:
                weight = inputs[2]

            weight = K.repeat_elements(weight,
                                       input_real.shape[-1],
                                       axis=ndims - 1)

            output_real = input_real * weight  #shape: (None, 300, 300)
            output_real = K.sum(output_real, axis=ndims - 2)
            output_imag = input_imag * weight
            output_imag = K.sum(output_imag, axis=ndims - 2)

        output_real_transpose = K.expand_dims(output_real, axis=ndims - 2)
        output_imag_transpose = K.expand_dims(output_imag, axis=ndims - 2)

        #        output_real_transpose = K.permute_dimensions(output_real, (0,2,1))
        #        output_imag_transpose = K.permute_dimensions(output_imag, (0,2,1))

        output_real = K.expand_dims(output_real)
        output_imag = K.expand_dims(output_imag)


        output_r = K.batch_dot(
            output_real, output_real_transpose,
            axes=[ndims - 1, ndims]) + K.batch_dot(
                output_imag, output_imag_transpose, axes=[ndims - 1, ndims])
        output_i = K.batch_dot(
            output_imag, output_real_transpose,
            axes=[ndims - 1, ndims]) - K.batch_dot(
                output_real, output_imag_transpose, axes=[ndims - 1, ndims])

        return [output_r, output_i]
Example #33
    def call(self, inputs):
        z = inputs # z.shape=(batch_size, latent_dim)
        z = K.expand_dims(z, 1)
        return z - K.expand_dims(self.mean, 0)
    
    def compute_output_shape(self, input_shape):
        return (None, self.num_classes, input_shape[-1])

gaussian = Gaussian(num_classes, name='priors')
z_prior_mean = gaussian(z)

clvae = Model([x, y_in], [x_recon, z_prior_mean])

z_mean = K.expand_dims(z_mean, 1)
z_log_var = K.expand_dims(z_log_var, 1)

lamb = 0.5
xent_loss = 0.5 * K.mean((x - x_recon)**2, 0)
kl_loss = - 0.5 * (z_log_var - K.square(z_prior_mean))
kl_loss = K.mean(K.batch_dot(K.expand_dims(y_in, 1), kl_loss), 0)
clvae_loss = lamb * K.sum(xent_loss) + K.sum(kl_loss)

clvae.add_loss(clvae_loss)
clvae.compile(optimizer='adam')
clvae.summary()

clvae_history = clvae.fit([x_train, to_categorical(y_train)],
        shuffle=True,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=([x_test, to_categorical(y_test)], None))
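The batch_dot in kl_loss uses the one-hot y_in as a row vector to pick out each sample's per-class KL term; a shape sketch with assumed sizes:

import numpy as np

batch, num_classes, latent_dim = 2, 3, 4
y_in = np.eye(num_classes)[np.random.randint(num_classes, size=batch)]  # one-hot
kl = np.random.rand(batch, num_classes, latent_dim)

# K.batch_dot(K.expand_dims(y_in, 1), kl) -> (batch, 1, latent_dim):
picked = np.einsum('bc,bcl->bl', y_in, kl)   # the selected class's KL terms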
Example #34
	def cosine(self, x):
		axis = len(x[0]._keras_shape)-1
		dot = lambda a, b: K.batch_dot(a, b, axes=axis)
		return dot(x[0], x[1]) / K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1]))
Example #35
 def _g_var_chol(p, epsilon):
     mu, var, chol = p
     epsilon = K.batch_dot(epsilon, chol, axes=(1, 2))
     return mu + K.sqrt(K.abs(var)) * epsilon
Example #36
 def _g_logvar_chol_2D1(p, epsilon):
     mu, logvar, chol = p
     epsilon = K.batch_dot(epsilon, chol, axes=(1, 1))
     return mu + K.exp(logvar/2) * epsilon
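Both helpers above implement the Cholesky reparameterization trick: multiplying standard-normal noise by a Cholesky factor yields correlated samples. A numpy sketch with assumed per-sample factors (axes=(1, 2) in Example #35 computes chol @ epsilon per sample; axes=(1, 1) in Example #36 contracts the other factor axis, giving chol^T @ epsilon):

import numpy as np

batch, dim = 4, 3
M = np.random.randn(batch, dim, dim)
cov = M @ M.transpose(0, 2, 1) + np.eye(dim)   # a batch of SPD covariance matrices
chol = np.linalg.cholesky(cov)                 # (batch, dim, dim)

epsilon = np.random.randn(batch, dim)
# K.batch_dot(epsilon, chol, axes=(1, 2)), per sample:
z = np.einsum('bij,bj->bi', chol, epsilon)     # zero-mean samples with covariance cov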
Example #37
    def call(self, x, mask=None):
        # TODO: validate input shape

        assert (len(x) == 3)
        L_flat = x[0]
        mu = x[1]
        a = x[2]

        if self.mode == 'full':
            # Create L and L^T matrix, which we use to construct the positive-definite matrix P.
            L = None
            LT = None
            if K.backend() == 'theano':
                import theano.tensor as T
                import theano

                def fn(x, L_acc, LT_acc):
                    x_ = K.zeros((self.nb_actions, self.nb_actions))
                    x_ = T.set_subtensor(x_[np.tril_indices(self.nb_actions)],
                                         x)
                    diag = K.exp(T.diag(x_)) + K.epsilon()
                    x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)],
                                         diag)
                    return x_, x_.T

                outputs_info = [
                    K.zeros((self.nb_actions, self.nb_actions)),
                    K.zeros((self.nb_actions, self.nb_actions)),
                ]
                results, _ = theano.scan(fn=fn,
                                         sequences=L_flat,
                                         outputs_info=outputs_info)
                L, LT = results
            elif K.backend() == 'tensorflow':
                import tensorflow as tf

                # Number of elements in a triangular matrix.
                nb_elems = (self.nb_actions * self.nb_actions +
                            self.nb_actions) // 2

                # Create mask for the diagonal elements in L_flat. This is used to exponentiate
                # only the diagonal elements, which is done before gathering.
                diag_indices = [0]
                for row in range(1, self.nb_actions):
                    diag_indices.append(diag_indices[-1] + (row + 1))
                diag_mask = np.zeros(1 + nb_elems)  # +1 for the leading zero
                diag_mask[np.array(diag_indices) + 1] = 1
                diag_mask = K.variable(diag_mask)

                # Add leading zero element to each element in the L_flat. We use this zero
                # element when gathering L_flat into a lower triangular matrix L.
                nb_rows = tf.shape(L_flat)[0]
                zeros = tf.expand_dims(tf.tile(K.zeros((1, )), [nb_rows]), 1)
                try:
                    # Old TF behavior.
                    L_flat = tf.concat(1, [zeros, L_flat])
                except TypeError:
                    # New TF behavior
                    L_flat = tf.concat([zeros, L_flat], 1)

                # Create mask that can be used to gather elements from L_flat and put them
                # into a lower triangular matrix.
                tril_mask = np.zeros((self.nb_actions, self.nb_actions),
                                     dtype='int32')
                tril_mask[np.tril_indices(self.nb_actions)] = range(
                    1, nb_elems + 1)

                # Finally, process each element of the batch.
                init = [
                    K.zeros((self.nb_actions, self.nb_actions)),
                    K.zeros((self.nb_actions, self.nb_actions)),
                ]

                def fn(a, x):
                    # Exponentiate everything. This is much easier than only exponentiating
                    # the diagonal elements, and, usually, the action space is relatively small.
                    x_ = K.exp(x) + K.epsilon()
                    # Only keep the diagonal elements.
                    x_ *= diag_mask
                    # Add the original, non-diagonal elements.
                    x_ += x * (1. - diag_mask)
                    # Finally, gather everything into a lower triangular matrix.
                    L_ = tf.gather(x_, tril_mask)
                    return [L_, tf.transpose(L_)]

                tmp = tf.scan(fn, L_flat, initializer=init)
                if isinstance(tmp, (list, tuple)):
                    # TensorFlow 0.10 now returns a tuple of tensors.
                    L, LT = tmp
                else:
                    # Old TensorFlow < 0.10 returns a shared tensor.
                    L = tmp[:, 0, :, :]
                    LT = tmp[:, 1, :, :]
            else:
                raise RuntimeError('Unknown Keras backend "{}".'.format(
                    K.backend()))
            assert L is not None
            assert LT is not None
            P = K.batch_dot(L, LT)
        elif self.mode == 'diag':
            if K.backend() == 'theano':
                import theano.tensor as T
                import theano

                def fn(x, P_acc):
                    x_ = K.zeros((self.nb_actions, self.nb_actions))
                    x_ = T.set_subtensor(x_[np.diag_indices(self.nb_actions)],
                                         x)
                    return x_

                outputs_info = [
                    K.zeros((self.nb_actions, self.nb_actions)),
                ]
                P, _ = theano.scan(fn=fn,
                                   sequences=L_flat,
                                   outputs_info=outputs_info)
            elif K.backend() == 'tensorflow':
                import tensorflow as tf

                # Create mask that can be used to gather elements from L_flat and put them
                # into a diagonal matrix.
                diag_mask = np.zeros((self.nb_actions, self.nb_actions),
                                     dtype='int32')
                diag_mask[np.diag_indices(self.nb_actions)] = range(
                    1, self.nb_actions + 1)

                # Add leading zero element to each element in the L_flat. We use this zero
                # element when gathering L_flat into a lower triangular matrix L.
                nb_rows = tf.shape(L_flat)[0]
                zeros = tf.expand_dims(tf.tile(K.zeros((1, )), [nb_rows]), 1)
                try:
                    # Old TF behavior.
                    L_flat = tf.concat(1, [zeros, L_flat])
                except TypeError:
                    # New TF behavior
                    L_flat = tf.concat([zeros, L_flat], 1)

                # Finally, process each element of the batch.
                def fn(a, x):
                    x_ = tf.gather(x, diag_mask)
                    return x_

                P = tf.scan(fn,
                            L_flat,
                            initializer=K.zeros(
                                (self.nb_actions, self.nb_actions)))
            else:
                raise RuntimeError('Unknown Keras backend "{}".'.format(
                    K.backend()))
        assert P is not None
        assert K.ndim(P) == 3

        # Combine a, mu and P into a scalar (over the batches). What we compute here is
        # -.5 * (a - mu)^T * P * (a - mu), where * denotes the dot-product. Unfortunately
        # TensorFlow handles vector * P slightly suboptimal, hence we convert the vectors to
        # 1xd/dx1 matrices and finally flatten the resulting 1x1 matrix into a scalar. All
        # operations happen over the batch size, which is dimension 0.
        prod = K.batch_dot(K.expand_dims(a - mu, 1), P)
        prod = K.batch_dot(prod, K.expand_dims(a - mu, -1))
        A = -.5 * K.batch_flatten(prod)
        assert K.ndim(A) == 2
        return A
Example #38
def backend_dot(x):
    return K.batch_dot(x[0], x[1])
Example #39
def test():
    ## arguments
    train_path = sys.argv[1]
    test_path = sys.argv[2]
    predict_path = sys.argv[3]
    model_name = sys.argv[4]
    char_embed_path = sys.argv[5]
    word_embed_path = sys.argv[6]
    pos_embed_path = sys.argv[7]
    dict_path = sys.argv[8]

    train_rate = 0.9
    max_char_ctx_len = 1160
    max_word_ctx_len = 680

    char_ctx_len = 1160
    char_qus_len = 240

    word_ctx_len = 400
    word_qus_len = 40

    word_char_len = 5

    char_embed_size = 128
    word_embed_size = 128
    pos_embed_size = 32
    hidden_size = 64
    model_size = 64

    max_epochs = 50
    batch_size = 8

    lr = 0.001
    drop_rate = 0.5
    recur_drop_rate = 0.0
    patience = 20

    ## load data
    print("load data")
    st = time.time()
    train_raw_data = data_utils.load_json_data(train_path)
    test_raw_data = data_utils.load_json_data(test_path)
    #    # load pos data
    #    train_gen_pos_data = data_utils.load_json_data(train_pos_path)
    #    test_gen_pos_data = data_utils.load_json_data(test_pos_path)
    # load embedding
    char_embedding = word2vec.Word2Vec.load(char_embed_path)
    word_embedding = word2vec.Word2Vec.load(word_embed_path)
    pos_embedding = word2vec.Word2Vec.load(pos_embed_path)
    et = time.time()
    print("cost time:", et - st)

    ## process data
    print("process data")
    st = time.time()
    train_data = data_utils.make_train_data(
        train_raw_data
    )  # data format: (id, context, question, answer_start, answer_end)
    test_data = data_utils.make_test_data(
        test_raw_data)  # data format: (id, context, question)
    train_context = [data[1] for data in train_data]
    train_question = [data[2] for data in train_data]
    train_char_answer_start = [data[3] for data in train_data]
    train_char_answer_end = [data[4] for data in train_data]
    #    train_context_poss = [data['context'] for data in train_gen_pos_data['data']]
    #    train_question_poss = [data['question'] for data in train_gen_pos_data['data']]
    test_id = [data[0] for data in test_data]
    test_context = [data[1] for data in test_data]
    test_question = [data[2] for data in test_data]
    #    test_context_poss = [data['context'] for data in test_gen_pos_data['data']]
    #    test_question_poss = [data['question'] for data in test_gen_pos_data['data']]
    del train_data
    del test_data
    et = time.time()
    print("cost time:", et - st)

    ## load vocabulary
    print("load vocabulary")
    st = time.time()
    char_vocab = data_utils.load_json_data('model_%s_char_vocab.json' %
                                           model_name)
    word_vocab = data_utils.load_json_data('model_%s_word_vocab.json' %
                                           model_name)
    pos_vocab = data_utils.load_json_data('model_%s_pos_vocab.json' %
                                          model_name)
    #    poss = train_context_poss + train_question_poss + test_context_poss + test_question_poss
    #    pos_vocab, rev_pos_vocab = data_utils.build_vocabulary_with_embedding(poss, pos_embedding)
    char_vocab_size = len(char_vocab)
    word_vocab_size = len(word_vocab)
    pos_vocab_size = len(pos_vocab)
    et = time.time()
    print("char vocab size:", char_vocab_size)
    print("word vocab size:", word_vocab_size)
    print("pos vocab size:", pos_vocab_size)
    print("cost time:", et - st)

    ## tokenize data
    print("tokenize data")
    st = time.time()
    train_context_chars = data_utils.tokenize_to_chars(train_context)
    train_question_chars = data_utils.tokenize_to_chars(train_question)
    test_context_chars = data_utils.tokenize_to_chars(test_context)
    test_question_chars = data_utils.tokenize_to_chars(test_question)
    train_context_words = data_utils.tokenize_to_words(train_context,
                                                       init_dict=True,
                                                       dict_path=dict_path)
    train_question_words = data_utils.tokenize_to_words(train_question,
                                                        init_dict=True,
                                                        dict_path=dict_path)
    test_context_words = data_utils.tokenize_to_words(test_context,
                                                      init_dict=True,
                                                      dict_path=dict_path)
    test_question_words = data_utils.tokenize_to_words(test_question,
                                                       init_dict=True,
                                                       dict_path=dict_path)
    train_context_poss = data_utils.tokenize_to_poss(train_context,
                                                     init_dict=True,
                                                     dict_path=dict_path)
    train_question_poss = data_utils.tokenize_to_poss(train_question,
                                                      init_dict=True,
                                                      dict_path=dict_path)
    test_context_poss = data_utils.tokenize_to_poss(test_context,
                                                    init_dict=True,
                                                    dict_path=dict_path)
    test_question_poss = data_utils.tokenize_to_poss(test_question,
                                                     init_dict=True,
                                                     dict_path=dict_path)
    et = time.time()
    print("cost time:", et - st)

    ## select data
    # select the data which sequence lengths satisfy length constraints
    print("select data")
    st = time.time()
    select_indices = data_utils.select_data_by_lengths(train_context_words,
                                                       train_question_words,
                                                       word_ctx_len,
                                                       word_qus_len)
    train_context_chars = [train_context_chars[i] for i in select_indices]
    train_context_words = [train_context_words[i] for i in select_indices]
    train_context_poss = [train_context_poss[i] for i in select_indices]
    train_question_chars = [train_question_chars[i] for i in select_indices]
    train_question_words = [train_question_words[i] for i in select_indices]
    train_question_poss = [train_question_poss[i] for i in select_indices]
    train_char_answer_start = [
        train_char_answer_start[i] for i in select_indices
    ]
    train_char_answer_end = [train_char_answer_end[i] for i in select_indices]
    et = time.time()
    print("cost time:", et - st)

    ## set answer
    # it should be done after tokenize sentences to words
    print("set answer")
    st = time.time()
    train_word_answer_start, train_word_answer_end = data_utils.set_word_answer(
        train_context_words, train_char_answer_start, train_char_answer_end,
        word_ctx_len)
    train_answer_start, train_answer_end = train_word_answer_start, train_word_answer_end
    et = time.time()
    print("cost time:", et - st)

    ## pad data
    print("pad data")
    st = time.time()
    # clip words to chars
    # it should be done after build vocab (add PAD)
    train_context_clip_chars = data_utils.clip_words_to_chars(
        train_context_words, word_char_len)
    train_question_clip_chars = data_utils.clip_words_to_chars(
        train_question_words, word_char_len)
    test_context_clip_chars = data_utils.clip_words_to_chars(
        test_context_words, word_char_len)
    test_question_clip_chars = data_utils.clip_words_to_chars(
        test_question_words, word_char_len)
    #    print("Debug: tarin_context_clip_chars[0]:")
    #    print(train_context_clip_chars[0])
    #    print("Debug: train_question_clip_chars[0]:")
    #    print(train_question_clip_chars[0])

    # padding
    train_context_pad_chars = data_utils.pad_sequences(
        train_context_clip_chars, word_ctx_len * word_char_len)
    train_question_pad_chars = data_utils.pad_sequences(
        train_question_clip_chars, word_qus_len * word_char_len)
    train_context_pad_words = data_utils.pad_sequences(train_context_words,
                                                       word_ctx_len)
    train_question_pad_words = data_utils.pad_sequences(
        train_question_words, word_qus_len)
    train_context_pad_poss = data_utils.pad_sequences(train_context_poss,
                                                      word_ctx_len)
    train_question_pad_poss = data_utils.pad_sequences(train_question_poss,
                                                       word_qus_len)
    test_context_pad_chars = data_utils.pad_sequences(
        test_context_clip_chars, word_ctx_len * word_char_len)
    test_question_pad_chars = data_utils.pad_sequences(
        test_question_clip_chars, word_qus_len * word_char_len)
    test_context_pad_words = data_utils.pad_sequences(test_context_words,
                                                      word_ctx_len)
    test_question_pad_words = data_utils.pad_sequences(test_question_words,
                                                       word_qus_len)
    test_context_pad_poss = data_utils.pad_sequences(test_context_poss,
                                                     word_ctx_len)
    test_question_pad_poss = data_utils.pad_sequences(test_question_poss,
                                                      word_qus_len)
    et = time.time()
    print("cost time:", et - st)
    ## make arrays
    print("make arrays")
    st = time.time()
    # map vocab to index
    #    print("Debug: train_context_pad_words[0]:")
    #    print(train_context_pad_words[0])
    #    print("Debug: train_question_pad_words[0]:")
    #    print(train_question_pad_words[0])
    train_context_char_indices = data_utils.map_vocabulary_index(
        train_context_pad_chars, char_vocab)
    train_question_char_indices = data_utils.map_vocabulary_index(
        train_question_pad_chars, char_vocab)
    train_context_word_indices = data_utils.map_vocabulary_index(
        train_context_pad_words, word_vocab)
    train_question_word_indices = data_utils.map_vocabulary_index(
        train_question_pad_words, word_vocab)
    train_context_pos_indices = data_utils.map_vocabulary_index(
        train_context_pad_poss, pos_vocab)
    train_question_pos_indices = data_utils.map_vocabulary_index(
        train_question_pad_poss, pos_vocab)
    test_context_char_indices = data_utils.map_vocabulary_index(
        test_context_pad_chars, char_vocab)
    test_question_char_indices = data_utils.map_vocabulary_index(
        test_question_pad_chars, char_vocab)
    test_context_word_indices = data_utils.map_vocabulary_index(
        test_context_pad_words, word_vocab)
    test_question_word_indices = data_utils.map_vocabulary_index(
        test_question_pad_words, word_vocab)
    test_context_pos_indices = data_utils.map_vocabulary_index(
        test_context_pad_poss, pos_vocab)
    test_question_pos_indices = data_utils.map_vocabulary_index(
        test_question_pad_poss, pos_vocab)
    # make one-hot label
    train_answer_start_onehot = data_utils.one_hot_encoding(
        train_answer_start, word_ctx_len)
    train_answer_end_onehot = data_utils.one_hot_encoding(
        train_answer_end, word_ctx_len)
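
    # What one_hot_encoding plausibly computes, as a hypothetical stand-in
    # (np is the numpy module already used below):
    def _one_hot_sketch(indices, size):
        out = np.zeros((len(indices), size), dtype=np.int32)
        out[np.arange(len(indices)), indices] = 1
        return out
    # _one_hot_sketch([2], 5) -> [[0, 0, 1, 0, 0]]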
    # to array
    # X1: context chars; X2: context words; X3: context poss;
    # X4: question chars; X5: question words; X6: question poss;
    # Y1: answer_start, Y2: answer_end
    train_X1 = np.array(train_context_char_indices, dtype=np.int32)
    train_X2 = np.array(train_context_word_indices, dtype=np.int32)
    train_X3 = np.array(train_context_pos_indices, dtype=np.int32)
    train_X4 = np.array(train_question_char_indices, dtype=np.int32)
    train_X5 = np.array(train_question_word_indices, dtype=np.int32)
    train_X6 = np.array(train_question_pos_indices, dtype=np.int32)
    train_Y1 = np.array(train_answer_start_onehot, dtype=np.int32)
    train_Y2 = np.array(train_answer_end_onehot, dtype=np.int32)
    train_word_ans1 = np.array(train_answer_start, dtype=np.int32)
    train_word_ans2 = np.array(train_answer_end, dtype=np.int32)
    train_ans1 = np.array(train_char_answer_start, dtype=np.int32)
    train_ans2 = np.array(train_char_answer_end, dtype=np.int32)
    test_X1 = np.array(test_context_char_indices, dtype=np.int32)
    test_X2 = np.array(test_context_word_indices, dtype=np.int32)
    test_X3 = np.array(test_context_pos_indices, dtype=np.int32)
    test_X4 = np.array(test_question_char_indices, dtype=np.int32)
    test_X5 = np.array(test_question_word_indices, dtype=np.int32)
    test_X6 = np.array(test_question_pos_indices, dtype=np.int32)
    # make embedding weight matrix
    word_embed_matrix = data_utils.make_embedding_matrix(
        word_embedding, word_vocab, word_embed_size)
    char_embed_matrix = data_utils.make_embedding_matrix(
        char_embedding, char_vocab, char_embed_size)
    pos_embed_matrix = data_utils.make_embedding_matrix(
        pos_embedding, pos_vocab, pos_embed_size)
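
    # A hypothetical sketch of make_embedding_matrix: row i holds the
    # pretrained vector of vocabulary entry i, while words absent from the
    # pretrained embedding (e.g. PAD) stay all-zero. The real helper may
    # treat unknown words differently.
    def _make_embedding_matrix_sketch(embedding, vocab, embed_size):
        matrix = np.zeros((len(vocab), embed_size), dtype=np.float32)
        for word, index in vocab.items():
            if word in embedding:
                matrix[index] = embedding[word]
        return matrix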

    # delete data for releasing memory
    del train_context, train_question, test_context, test_question
    del train_context_chars, train_question_chars, test_context_chars, test_question_chars
    #    del train_context_words, train_question_words, test_context_words, test_question_words
    del train_context_clip_chars, train_question_clip_chars, test_context_clip_chars, test_question_clip_chars
    del train_context_char_indices, train_question_char_indices, test_context_char_indices, test_question_char_indices
    del train_context_word_indices, train_question_word_indices, test_context_word_indices, test_question_word_indices
    del train_context_pos_indices, train_question_pos_indices, test_context_pos_indices, test_question_pos_indices
    del train_word_answer_start, train_word_answer_end, train_char_answer_start, train_char_answer_end
    del train_answer_start_onehot, train_answer_end_onehot
    et = time.time()
    print("train shape:", train_X1.shape, train_X2.shape, train_X3.shape,
          train_X4.shape, train_X5.shape, train_X6.shape, train_Y1.shape,
          train_Y2.shape)
    print("test shape:", test_X1.shape, test_X2.shape, test_X3.shape,
          test_X4.shape, test_X5.shape, test_X6.shape)
    print("cost time:", et - st)

    ## XXX build model
    print("build model")
    st = time.time()
    # input layers
    # X1: context chars; X2: context words; X3: context poss;
    # X4: question chars; X5: question words; X6: question poss;
    # Y1: answer_start; Y2: answer_end
    var_x1_input = Input(shape=(word_ctx_len * word_char_len, ),
                         dtype=np.int32)
    var_x2_input = Input(shape=(word_ctx_len, ), dtype=np.int32)
    var_x3_input = Input(shape=(word_ctx_len, ), dtype=np.int32)
    var_x4_input = Input(shape=(word_qus_len * word_char_len, ),
                         dtype=np.int32)
    var_x5_input = Input(shape=(word_qus_len, ), dtype=np.int32)
    var_x6_input = Input(shape=(word_qus_len, ), dtype=np.int32)

    # embedding layers
    var_x1_embed = Embedding(
        input_dim=char_vocab_size,
        output_dim=char_embed_size,
        weights=[char_embed_matrix],
        input_length=word_ctx_len * word_char_len,
        trainable=False
    )(var_x1_input)  # shape: (None, ctx_length * word_length, char_embed_size)
    var_x2_embed = Embedding(
        input_dim=word_vocab_size,
        output_dim=word_embed_size,
        weights=[word_embed_matrix],
        input_length=word_ctx_len,
        trainable=False)(
            var_x2_input)  # shape: (None, ctx_length, word_embed_size)
    var_x3_embed = Embedding(
        input_dim=pos_vocab_size,
        output_dim=pos_embed_size,
        weights=[pos_embed_matrix],
        input_length=word_ctx_len,
        trainable=False)(
            var_x3_input)  # shape: (None, ctx_length, pos_embed_size)
    var_x4_embed = Embedding(
        input_dim=char_vocab_size,
        output_dim=char_embed_size,
        weights=[char_embed_matrix],
        input_length=word_qus_len * word_char_len,
        trainable=False
    )(var_x4_input)  # shape: (None, qus_length * word_length, char_embed_size)
    var_x5_embed = Embedding(
        input_dim=word_vocab_size,
        output_dim=word_embed_size,
        weights=[word_embed_matrix],
        input_length=word_qus_len,
        trainable=False)(
            var_x5_input)  # shape: (None, qus_length, word_embed_size)
    var_x6_embed = Embedding(
        input_dim=pos_vocab_size,
        output_dim=pos_embed_size,
        weights=[pos_embed_matrix],
        input_length=word_qus_len,
        trainable=False)(
            var_x6_input)  # shape: (None, qus_length, pos_embed_size)

    var_x1_embed = Reshape([word_ctx_len, word_char_len * char_embed_size])(
        var_x1_embed
    )  # shape: (None, ctx_length, word_length * char_embed_size)
    var_x4_embed = Reshape([word_qus_len, word_char_len * char_embed_size])(
        var_x4_embed
    )  # shape: (None, qus_length, word_length * char_embed_size)
    var_char_embed_layer = Dense(units=word_embed_size)
    var_x1_embed = TimeDistributed(
        var_char_embed_layer,
        input_shape=(word_ctx_len, word_char_len * char_embed_size))(
            var_x1_embed)  # shape: (None, ctx_length, word_embed_size)
    var_x1_embed = Activation('relu')(var_x1_embed)
    #    var_x1_embed = Dropout(rate=drop_rate)(var_x1_embed)
    var_x4_embed = TimeDistributed(
        var_char_embed_layer,
        input_shape=(word_qus_len, word_char_len * char_embed_size))(
            var_x4_embed)  # shape: (None, qus_length, word_embed_size)
    var_x4_embed = Activation('relu')(var_x4_embed)
    #    var_x4_embed = Dropout(rate=drop_rate)(var_x4_embed)

    #XXX concatenate word embedding and pos embedding directly
    var_ctx_embed = concatenate(
        [var_x1_embed, var_x2_embed, var_x3_embed], axis=2
    )  # shape: (None, ctx_length, word_embed_size * 2 + pos_embed_size)
    var_qus_embed = concatenate(
        [var_x4_embed, var_x5_embed, var_x6_embed], axis=2
    )  # shape: (None, qus_length, word_embed_size * 2 + pos_embed_size)
    var_ctx_embed = Dropout(rate=drop_rate)(var_ctx_embed)
    var_qus_embed = Dropout(rate=drop_rate)(var_qus_embed)

    var_ctx_lstm = Bidirectional(
        LSTM(units=hidden_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_ctx_embed)  # shape: (None, ctx_length, hidden_size * 2)
    var_qus_lstm = Bidirectional(
        LSTM(units=hidden_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_qus_embed)  # shape: (None, qus_length, hidden_size * 2)
    # dropout ?
    #    var_ctx_lstm = Dropout(rate=drop_rate)(var_ctx_lstm)
    #    var_qus_lstm = Dropout(rate=drop_rate)(var_qus_lstm)

    # attention layers
    var_ctx_flatten = Flatten()(
        var_ctx_lstm)  # shape: (None, ctx_length * hidden_size * 2)
    var_qus_flatten = Flatten()(
        var_qus_lstm)  # shape: (None, qus_length * hidden_size * 2)
    var_ctx_repeat = RepeatVector(word_qus_len)(
        var_ctx_flatten
    )  # shape: (None, qus_length, ctx_length * hidden_size * 2)
    var_qus_repeat = RepeatVector(word_ctx_len)(
        var_qus_flatten
    )  # shape: (None, ctx_length, qus_length * hidden_size * 2)
    var_ctx_repeat = Reshape([word_qus_len, word_ctx_len, hidden_size * 2])(
        var_ctx_repeat
    )  # shape: (None, qus_length, ctx_length, hidden_size * 2)
    var_qus_repeat = Reshape([word_ctx_len, word_qus_len, hidden_size * 2])(
        var_qus_repeat
    )  # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_ctx_repeat = Permute(
        [2, 1, 3])(var_ctx_repeat
                   )  # shape: (None, ctx_length, qus_length, hidden_size * 2)
    var_mul_repeat = multiply([
        var_ctx_repeat, var_qus_repeat
    ])  # shape: (None, ctx_length, qus_length, hidden_size * 2)

    var_sim_repeat = concatenate(
        [var_ctx_repeat, var_qus_repeat, var_mul_repeat],
        axis=3)  # shape: (None, ctx_length, qus_length, hidden_size * 6)
    var_sim_sequence = Reshape([word_ctx_len * word_qus_len, hidden_size * 6])(
        var_sim_repeat
    )  # shape: (None, ctx_length * qus_length, hidden_size * 6)
    # dropout ?
    #    var_sim_sequence = Dropout(rate=drop_rate)(var_sim_sequence)
    var_similarity = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len * word_qus_len, hidden_size * 6))(
            var_sim_sequence)  # shape: (None, ctx_length * qus_length, 1)
    var_similarity = Reshape([word_ctx_len, word_qus_len])(
        var_similarity)  # shape: (None, ctx_length, qus_length)
    var_similarity = Activation('relu')(var_similarity)
    # dropout ?
    #    var_similarity = Dropout(rate=drop_rate)(var_similarity)

    var_c2qatt_weight = TimeDistributed(
        Activation('softmax'), input_shape=(word_ctx_len, word_qus_len))(
            var_similarity)  # shape: (None, ctx_length, qus_length)
    var_c2qatt_ctx = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[2, 1]))(
        [var_c2qatt_weight,
         var_qus_lstm])  # shape: (None, ctx_length, hidden_size * 2)

    var_q2catt_weight = Lambda(lambda x: K.max(x, axis=2))(
        var_similarity)  # shape: (None, ctx_length)
    var_q2catt_weight = RepeatVector(hidden_size * 2)(
        var_q2catt_weight)  # shape: (None, hidden_size * 2, ctx_length)
    var_q2catt_weight = Permute([2, 1])(
        var_q2catt_weight)  # shape: (None, ctx_length, hidden_size * 2)
    var_q2catt_ctx = multiply([var_q2catt_weight, var_ctx_lstm
                               ])  # shape: (None, ctx_length, hidden_size * 2)

    var_c2qctx_attmul = multiply(
        [var_ctx_lstm,
         var_c2qatt_ctx])  # shape: (None, ctx_length, hidden_size * 2)
    var_q2cctx_attmul = multiply(
        [var_ctx_lstm,
         var_q2catt_ctx])  # shape: (None, ctx_length, hidden_size * 2)
    var_attention = concatenate(
        [var_ctx_lstm, var_c2qatt_ctx, var_c2qctx_attmul, var_q2cctx_attmul],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8)
    var_attention = Activation('relu')(var_attention)
    #    # dropout ?
    #    var_attention = Dropout(rate=drop_rate)(var_attention)

    # model layers
    var_model1_lstm = Bidirectional(
        LSTM(units=model_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_attention)  # shape: (None, ctx_length, model_size * 2)
    var_model1_att = concatenate(
        [var_attention, var_model1_lstm],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8 + model_size * 2)
    # dropout ?
    #    var_model1_att = Dropout(rate=drop_rate)(var_model1_att)

    var_model2_lstm = Bidirectional(
        LSTM(units=model_size,
             recurrent_dropout=recur_drop_rate,
             return_sequences=True))(
                 var_model1_lstm)  # shape: (None, ctx_length, model_size * 2)
    var_model2_att = concatenate(
        [var_attention, var_model2_lstm],
        axis=2)  # shape: (None, ctx_length, hidden_size * 8 + model_size * 2)
    # dropout ?
    #    var_model2_att = Dropout(rate=drop_rate)(var_model2_att)

    # output layers
    var_pointer1_weight = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))(
            var_model1_att)  # shape: (None, ctx_length, 1)
    var_pointer1_weight = Flatten()(
        var_pointer1_weight)  # shape: (None, ctx_length)
    var_pointer1 = Activation('softmax')(
        var_pointer1_weight)  # shape: (None, ctx_length)

    var_pointer2_weight = TimeDistributed(
        Dense(units=1),
        input_shape=(word_ctx_len, hidden_size * 8 + model_size * 2))(
            var_model2_att)  # shape: (None, ctx_length, 1)
    var_pointer2_weight = Flatten()(
        var_pointer2_weight)  # shape: (None, ctx_length)
    var_pointer2 = Activation('softmax')(
        var_pointer2_weight)  # shape: (None, ctx_length)

    model = Model(inputs=[
        var_x1_input, var_x2_input, var_x3_input, var_x4_input, var_x5_input,
        var_x6_input
    ],
                  outputs=[var_pointer1, var_pointer2])

    adam = Adam(lr=lr)

    #    # Set loss functions ?
    #    def two_pointers_crossentropy(y_true, y_pred):
    #        p1_true, p1_pred = y_true[0], y_pred[0]
    #        p2_true, p2_pred = y_true[:,1], y_pred[1]
    #        p1_loss = categorical_crops
    # XXX use multiple loss
    model.compile(
        optimizer=adam,
        loss=['categorical_crossentropy', 'categorical_crossentropy'],
        loss_weights=[0.5, 0.5],
        metrics=['accuracy'])
    et = time.time()
    print("cost time:", et - st)

    ## evaluate
    print("evaluate")
    st = time.time()
    model = load_model('model_%s.h5' % model_name, custom_objects={'tf': tf})
    # compute predict
    print("predict")
    st = time.time()
    train_Y1_hat, train_Y2_hat = model.predict(
        [train_X1, train_X2, train_X3, train_X4, train_X5, train_X6],
        batch_size=batch_size)
    et = time.time()
    print("cost time:", et - st)
    train_Y1_word_pred, train_Y2_word_pred = model_utils.constraint_predict(
        train_Y1_hat, train_Y2_hat)
    train_Y1_pred, train_Y2_pred = data_utils.set_char_answer(
        train_context_words, train_Y1_word_pred, train_Y2_word_pred)
    train_Y1_pred = np.array(train_Y1_pred, dtype=np.int32)
    train_Y2_pred = np.array(train_Y2_pred, dtype=np.int32)
    # evaluate predict with setting answer (word answer)
    train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy(
        train_word_ans1, train_Y1_word_pred, train_word_ans2,
        train_Y2_word_pred)
    train_prec, train_rec, train_f1 = evaluation.compute_scores(
        train_word_ans1, train_Y1_word_pred, train_word_ans2,
        train_Y2_word_pred, word_ctx_len)
    print("word-level train accuracy:", train_acc1, train_acc2, train_accuracy)
    print("word-level train prec rec:", train_prec, train_rec)
    print("word-level train f1:", train_f1)
    # evaluate predict with real answer (char answer)
    train_acc1, train_acc2, train_accuracy = evaluation.compute_accuracy(
        train_ans1, train_Y1_pred, train_ans2, train_Y2_pred)
    train_prec, train_rec, train_f1 = evaluation.compute_scores(
        train_ans1, train_Y1_pred, train_ans2, train_Y2_pred, max_char_ctx_len)
    print("char-level train accuracy:", train_acc1, train_acc2, train_accuracy)
    print("char-level train prec rec:", train_prec, train_rec)
    print("char-level train f1:", train_f1)
    et = time.time()
    print("cost time:", et - st)

    ## test
    print("test")
    st = time.time()
    test_Y1_hat, test_Y2_hat = model.predict(
        [test_X1, test_X2, test_X3, test_X4, test_X5, test_X6],
        batch_size=batch_size)
    # compute predict
    test_Y1_word_pred, test_Y2_word_pred = model_utils.constraint_predict(
        test_Y1_hat, test_Y2_hat)
    test_Y1_pred, test_Y2_pred = data_utils.set_char_answer(
        test_context_words, test_Y1_word_pred, test_Y2_word_pred)
    test_Y1_pred = np.array(test_Y1_pred, dtype=np.int32)
    test_Y2_pred = np.array(test_Y2_pred, dtype=np.int32)
    data_utils.write_predict(predict_path, test_id, test_Y1_pred, test_Y2_pred)
    et = time.time()
    print("cost time:", et - st)
Example #40
vec[0][0] = 1

In = []
for j in range(n_data):
    In.append(Input(shape=[len_feature]))
    In.append(Input(shape=(neighbors, n_data)))
In.append(Input(shape=(1, neighbors)))
feature = []
for j in range(n_data):
    feature.append(encoder(In[j * 2]))

feature_ = Concatenate(axis=1)(feature)

relation1 = []
for j in range(n_data):
    T = Lambda(lambda x: K.batch_dot(x[0], x[1]))([In[j * 2 + 1], feature_])
    relation1.append(m1([T, T, T, In[n_data * 2]]))

relation1_ = Concatenate(axis=1)(relation1)

relation2 = []
for j in range(n_data):
    T = Lambda(lambda x: K.batch_dot(x[0], x[1]))([In[j * 2 + 1], relation1_])
    relation2.append(m2([T, T, T, In[n_data * 2]]))

V = []
for j in range(n_data):
    V.append(q_net([feature[j], relation1[j], relation2[j]]))

model = Model(inputs=In, outputs=V)
model.compile(optimizer=Adam(lr=0.0001), loss='mse')
Example #41
def gram(cnn):
    # gram3 = []
    gram = K.batch_dot(cnn, cnn, axes=[3, 3])
    gram = K.reshape(gram, (-1, 2500))
    return gram
Example #42
 def call(self, inputs):
     q, a = inputs
     # https://github.com/wglassly/cnnormaliztion/blob/master/src/nn_layers.py#L822
     return K.batch_dot(q, K.dot(a, K.transpose(self.M)), axes=1)
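
For 2-D inputs, K.batch_dot(q, m, axes=1) contracts the feature axis of every
sample and keeps the batch axis, returning shape (batch, 1). A minimal numpy
rendering of that contract (an illustration, not part of the example):

import numpy as np

q = np.random.rand(4, 7)  # (batch, features)
m = np.random.rand(4, 7)
out = np.einsum('bi,bi->b', q, m)[:, None]  # (4, 1), matching K.batch_dot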
Example #43
 def gram_matrix_b(x):
     x = K.permute_dimensions(x, (0, 3, 1, 2))
     s = K.shape(x)
     feat = K.reshape(x, (s[0], s[1], s[2] * s[3]))
     return K.batch_dot(feat, K.permute_dimensions(
         feat, (0, 2, 1))) / K.prod(K.cast(s[1:], K.floatx()))
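
The same normalized Gram matrix in plain numpy, useful for sanity-checking the
backend version above (channels-last input assumed, as the permute implies):

import numpy as np

def gram_matrix_np(x):
    # x: (batch, height, width, channels)
    b, h, w, c = x.shape
    feat = x.transpose(0, 3, 1, 2).reshape(b, c, h * w)
    return np.einsum('bij,bkj->bik', feat, feat) / (c * h * w)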
Example #44
 def dot_funxtion(x):
     return K.batch_dot(x[0], x[1])
Example #45
 def _g_logvar_chol_3D(p, epsilon):
     mu, logvar, chol = p
     epsilon = K.batch_dot(epsilon * K.exp(logvar/2), chol, axes=(2, 1))
     return mu + epsilon
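
The same reparameterized draw in plain numpy, with shapes assumed from the
axes=(2, 1) contraction: mu, logvar and epsilon are (batch, steps, dim) and
chol is (batch, dim, dim):

import numpy as np

def g_logvar_chol_3d_np(mu, logvar, chol, epsilon):
    scaled = epsilon * np.exp(logvar / 2.0)
    return mu + np.einsum('bsd,bde->bse', scaled, chol)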
Example #46
    def attention(self,
                  pre_q,
                  pre_v,
                  pre_k,
                  out_seq_len,
                  d_model,
                  attn_mask=None,
                  training=None):
        """
        Calculates the output of the attention once the affine transformations
        of the inputs are done. Here's the shapes of the arguments:
        :param pre_q: (batch_size, q_seq_len, num_heads, d_model // num_heads)
        :param pre_v: (batch_size, v_seq_len, num_heads, d_model // num_heads)
        :param pre_k: (batch_size, k_seq_len, num_heads, d_model // num_heads)
        :param out_seq_len: the length of the output sequence
        :param d_model: dimensionality of the model (by the paper)
        :param training: Passed by Keras. Should not be defined manually.
          Optional scalar tensor indicating if we're in training
          or inference phase.
        """
        # shaping Q and V into (batch_size, num_heads, seq_len, d_model//heads)
        q = K.permute_dimensions(pre_q, [0, 2, 1, 3])
        v = K.permute_dimensions(pre_v, [0, 2, 1, 3])

        if self.compression_window_size is None:
            k_transposed = K.permute_dimensions(pre_k, [0, 2, 3, 1])
        else:
            # Memory-compressed attention described in paper
            # "Generating Wikipedia by Summarizing Long Sequences"
            # (https://arxiv.org/pdf/1801.10198.pdf)
            # It compresses keys and values using 1D-convolution which reduces
            # the size of Q * K_transposed from roughly seq_len^2
            # to convoluted_seq_len^2. If we use strided convolution with
            # window size = 3 and stride = 3, memory requirements of such
            # memory-compressed attention will be 9 times smaller than
            # that of the original version.
            if self.use_masking:
                raise NotImplementedError(
                    "Masked memory-compressed attention has not "
                    "been implemented yet")
            k = K.permute_dimensions(pre_k, [0, 2, 1, 3])
            k, v = [
                K.reshape(
                    # Step 3: Return the result to its original dimensions
                    # (batch_size, num_heads, seq_len, d_model//heads)
                    K.bias_add(
                        # Step 3: ... and add bias
                        K.conv1d(
                            # Step 2: we "compress" K and V using strided conv
                            K.reshape(
                                # Step 1: we reshape K and V to
                                # (batch * num_heads,  seq_len, d_model//heads)
                                item,
                                (-1, K.int_shape(item)[-2],
                                 d_model // self.num_heads)),
                            kernel,
                            strides=self.compression_window_size,
                            padding='valid',
                            data_format='channels_last'),
                        bias,
                        data_format='channels_last'),
                    # new shape
                    K.concatenate([
                        K.shape(item)[0],
                        K.shape(item)[1],  # shape: (batch_size, num_heads)
                        [-1, d_model // self.num_heads]
                    ]))  # shape: (seq_len, d_model//num_heads)
                for item, kernel, bias in ((k, self.k_conv_kernel,
                                            self.k_conv_bias),
                                           (v, self.v_conv_kernel,
                                            self.v_conv_bias))
            ]
            k_transposed = K.permute_dimensions(k, [0, 1, 3, 2])
        # shaping K into (batch_size, num_heads, d_model//heads, seq_len)
        # for further matrix multiplication
        sqrt_d = K.sqrt(K.cast(d_model, dtype=K.floatx()) / self.num_heads)
        q_shape = K.shape(q)
        k_t_shape = K.shape(k_transposed)
        v_shape = K.shape(v)

        #q_shape = K.int_shape(q)
        #k_t_shape = K.int_shape(k_transposed)
        #v_shape = K.int_shape(v)

        # before performing batch_dot all tensors are being converted to 3D
        # shape (batch_size * num_heads, tar_seq_len, d_model//num_heads) to make sure batch_dot
        # performs identically on all backends
        attention_heads = K.reshape(
            K.batch_dot(
                self.apply_dropout_if_needed(
                    K.softmax(
                        # mask the attention for the prediction process
                        #self.mask_attention_if_needed(
                        self.mask_attention(
                            # core scaled dot product
                            K.
                            batch_dot(  # (batch_size * num_heads, tar_seq_len, src_seq_len)
                                K.reshape(
                                    q, (-1, q_shape[-2], q_shape[-1])
                                ),  # q_shape: (batch_size*num_heads, q_seq_len, d_model//heads)
                                K.reshape(
                                    k_transposed,  # k_transposed: (batch_size*num_heads, d_model//heads, k_seq_len)
                                    (-1, k_t_shape[-2], k_t_shape[-1]))) /
                            sqrt_d,
                            attn_mask)),
                    training=training),
                K.reshape(v, (-1, v_shape[-2], v_shape[-1]))
            ),  # shape: (batch_size * num_heads, v_seq_len, d_model//heads)
            (-1, self.num_heads, q_shape[-2], q_shape[-1]))
        # shape: (batch_size * seq_length, d_model)
        attention_heads_merged = K.reshape(
            # shape (batch_size, q_seq_length, num_heads, d_model // num_heads) to make sure batch_dot
            K.permute_dimensions(attention_heads, [0, 2, 1, 3]),
            (-1, d_model))
        # shape: (batch_size, out_seq_len, d_model). Generally, out_seq_len should be q_seq_len
        attention_out = K.reshape(
            K.dot(attention_heads_merged, self.output_weights),
            (-1, out_seq_len, d_model))
        return attention_out
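
Stripped of the multi-head reshaping, masking, dropout and the optional memory
compression, the core of the method above is plain scaled dot-product
attention; a minimal backend-level sketch (not this class's API):

from keras import backend as K

def scaled_dot_attention(q, k, v, d_k):
    # q, k, v: (batch, seq_len, d_k)
    scores = K.batch_dot(q, K.permute_dimensions(k, (0, 2, 1)))
    weights = K.softmax(scores / K.sqrt(K.cast(d_k, K.floatx())))
    return K.batch_dot(weights, v)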
Example #47
 def _g_std_chol(p, epsilon):
     mu, s, chol = p
     epsilon = K.batch_dot(epsilon, chol, axes=(1, 2))
     return mu + K.abs(s) * epsilon
Example #48
    def build(self, embedding_matrix):
        if self.config['rnn'] == 'gru' and self.config['gpu']:
            RNN = CuDNNGRU(self.config['rnn_output_size'],
                           return_sequences=True)
        elif self.config['rnn'] == 'lstm' and self.config['gpu']:
            RNN = CuDNNLSTM(self.config['rnn_output_size'],
                            return_sequences=True)
        elif self.config['rnn'] == 'gru' and not self.config['gpu']:
            RNN = GRU(self.config['rnn_output_size'],
                      return_sequences=True,
                      dropout=self.config['dropout_rate'],
                      recurrent_dropout=self.config['dropout_rate'])
        else:
            RNN = LSTM(self.config['rnn_output_size'],
                       return_sequences=True,
                       dropout=self.config['dropout_rate'],
                       recurrent_dropout=self.config['dropout_rate'])
        self.sentence_input = Input(shape=(self.config['max_length'], ),
                                    dtype='int32',
                                    name='sentence_input')
        embed = Embedding(embedding_matrix.shape[0],
                          embedding_matrix.shape[1],
                          trainable=self.config['embed_trainable'],
                          weights=[embedding_matrix])(self.sentence_input)
        embed = SpatialDropout1D(self.config['spatial_dropout_rate'])(embed)
        convs = []
        for ksz in self.config['kernel_sizes']:
            conv = Conv1D(self.config['filters'],
                          ksz,
                          activation='relu',
                          padding='same')(embed)
            convs.append(conv)
        cnn_out = concatenate(convs, axis=-1)

        if self.config['bidirectional']:
            rnn_out = Bidirectional(RNN)(embed)
        else:
            rnn_out = RNN(embed)

        capsule_cnn = Capsule(num_capsule=self.config['num_capsule'],
                              dim_capsule=self.config['dim_capsule'],
                              routings=self.config['routings'],
                              share_weights=True,
                              name='capsule_cnn')(cnn_out)
        capsule_cnn = Flatten()(capsule_cnn)

        capsule_rnn = Capsule(num_capsule=self.config['num_capsule'],
                              dim_capsule=self.config['dim_capsule'],
                              routings=self.config['routings'],
                              share_weights=True,
                              name='capsule_rnn')(rnn_out)
        capsule_rnn = Flatten()(capsule_rnn)

        cnn_u = TimeDistributed(
            Dense(self.config['hidden_dims'], activation='tanh',
                  use_bias=True))(cnn_out)
        cnn_alpha = Dense(1)(cnn_u)
        cnn_alpha = Flatten()(cnn_alpha)
        cnn_alpha = Activation(activation='softmax')(cnn_alpha)
        cnn_att_rep = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[1, 1]))(
            [cnn_out, cnn_alpha])

        rnn_u = TimeDistributed(
            Dense(self.config['hidden_dims'], activation='tanh',
                  use_bias=True))(rnn_out)
        rnn_alpha = Dense(1)(rnn_u)
        rnn_alpha = Flatten()(rnn_alpha)
        rnn_alpha = Activation(activation='softmax')(rnn_alpha)
        rnn_att_rep = Lambda(lambda x: K.batch_dot(x[0], x[1], axes=[1, 1]))(
            [rnn_out, rnn_alpha])

        cnn_concat = concatenate([capsule_cnn, cnn_att_rep], axis=-1)
        rnn_concat = concatenate([capsule_rnn, rnn_att_rep], axis=-1)

        rep = concatenate([cnn_concat, rnn_concat], axis=-1)
        return rep
Example #49
    def MultiHeadsAttModel(self,
                           In_agent,
                           In_neighbor,
                           l=5,
                           d=128,
                           dv=16,
                           dout=128,
                           nv=8,
                           suffix=-1):
        """
        input: [batch, agent, 128]
        output:
        -hidden state: [batch,agent,32]
        -attention: [batch,agent,neighbor]
        """
        """
        agent repr
        """
        print("In_agent.shape,In_neighbor.shape,l, d, dv, dout, nv",
              In_agent.shape, In_neighbor.shape, l, d, dv, dout, nv)
        #[batch,agent,dim]->[batch,agent,1,dim]
        agent_repr = Reshape((self.num_agents, 1, d))(In_agent)
        """
        neighbor repr
        """
        #[batch,agent,dim]->(reshape)[batch,1,agent,dim]->(tile)[batch,agent,agent,dim]
        neighbor_repr = RepeatVector3D(self.num_agents)(In_agent)
        print("neighbor_repr.shape", neighbor_repr.shape)
        #[batch,agent,neighbor,agent]x[batch,agent,agent,dim]->[batch,agent,neighbor,dim]
        neighbor_repr = Lambda(lambda x: K.batch_dot(x[0], x[1]))(
            [In_neighbor, neighbor_repr])
        print("neighbor_repr.shape", neighbor_repr.shape)
        """
        attention computation
        """
        #multi-head
        #[batch,agent,1,dim]->[batch,agent,1,dv*nv]
        agent_repr_head = Dense(dv * nv,
                                activation='relu',
                                kernel_initializer='random_normal',
                                name='agent_repr_%d' % suffix)(agent_repr)
        #[batch,agent,1,dv,nv]->[batch,agent,nv,1,dv]
        agent_repr_head = Reshape(
            (self.num_agents, 1, dv, nv))(agent_repr_head)
        agent_repr_head = Lambda(lambda x: K.permute_dimensions(
            x, (0, 1, 4, 2, 3)))(agent_repr_head)
        #agent_repr_head=Lambda(lambda x:K.permute_dimensions(K.reshape(x,(-1,self.num_agents,1,dv,nv)),(0,1,4,2,3)))(agent_repr_head)
        #[batch,agent,neighbor,dim]->[batch,agent,neighbor,dv*nv]

        neighbor_repr_head = Dense(dv * nv,
                                   activation='relu',
                                   kernel_initializer='random_normal',
                                   name='neighbor_repr_%d' %
                                   suffix)(neighbor_repr)
        #[batch,agent,neighbor,dv,nv]->[batch,agent,nv,neighbor,dv]
        print("DEBUG", neighbor_repr_head.shape)
        print("self.num_agents,self.num_neighbors,dv,nv", self.num_agents,
              self.num_neighbors, dv, nv)
        neighbor_repr_head = Reshape(
            (self.num_agents, self.num_neighbors, dv, nv))(neighbor_repr_head)
        neighbor_repr_head = Lambda(lambda x: K.permute_dimensions(
            x, (0, 1, 4, 2, 3)))(neighbor_repr_head)
        #neighbor_repr_head=Lambda(lambda x:K.permute_dimensions(K.reshape(x,(-1,self.num_agents,self.num_neighbors,dv,nv)),(0,1,4,2,3)))(neighbor_repr_head)
        #[batch,agent,nv,1,dv]x[batch,agent,nv,neighbor,dv]->[batch,agent,nv,1,neighbor]
        att = Lambda(
            lambda x: K.softmax(K.batch_dot(x[0], x[1], axes=[4, 4])))(
                [agent_repr_head, neighbor_repr_head])
        #[batch,agent,nv,1,neighbor]->[batch,agent,nv,neighbor]
        att_record = Reshape((self.num_agents, nv, self.num_neighbors))(att)

        #self embedding again
        neighbor_hidden_repr_head = Dense(dv * nv,
                                          activation='relu',
                                          kernel_initializer='random_normal',
                                          name='neighbor_hidden_repr_%d' %
                                          suffix)(neighbor_repr)
        neighbor_hidden_repr_head = Reshape(
            (self.num_agents, self.num_neighbors, dv,
             nv))(neighbor_hidden_repr_head)
        neighbor_hidden_repr_head = Lambda(lambda x: K.permute_dimensions(
            x, (0, 1, 4, 2, 3)))(neighbor_hidden_repr_head)
        out = Lambda(lambda x: K.mean(K.batch_dot(x[0], x[1]), axis=2))(
            [att, neighbor_hidden_repr_head])
        out = Reshape((self.num_agents, dv))(out)
        out = Dense(dout,
                    activation="relu",
                    kernel_initializer='random_normal',
                    name='MLP_after_relation_%d' % suffix)(out)
        return out, att_record
Example #50
 def _outer(AB):
     att_ji = K.batch_dot(AB[1], K.permute_dimensions(AB[0], (0, 2, 1)))
     return K.permute_dimensions(att_ji, (0, 2, 1))
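
Since transposing B A^T yields A B^T, _outer returns the pairwise similarities
between the rows of AB[0] and the rows of AB[1]. A quick numpy check
(illustrative only):

import numpy as np

A = np.random.rand(2, 3, 5)
B = np.random.rand(2, 4, 5)
att = np.einsum('bjd,bid->bij', B, A)  # == _outer([A, B]); shape (2, 3, 4)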
Example #51
    def step(self, x, states):
        (
            h_p,
            h_v,  # 0:parent, 1:traversal 
            x_type,  # 2:treetype(ins/sub,left/right); ints of size (B,). \in {0,1,2,3}
            B_U,
            B_W) = states  # 3:Udropoutmask, 4:Wdropoutmask

        #### matrix x has all 4 x computations in it
        ## per move
        this_Wx = self.W_x[x_type]  ## B, I, 4*O
        matrix_x = K.batch_dot(x * B_W[0], this_Wx) + self.b_x
        x_zp = matrix_x[:, :self.output_dim]
        x_rp = matrix_x[:, self.output_dim:2 * self.output_dim]
        x_rv = matrix_x[:, 2 * self.output_dim:3 * self.output_dim]
        x_ih = matrix_x[:, 3 * self.output_dim:]

        #### matrix p has zp, rp; matrix v has zv, rv
        matrix_p = K.dot(h_p * B_U[0], self.U_p[:, :2 * self.output_dim])

        # zp is for the parent unit update (resulting in child unit)
        inner_zp = matrix_p[:, :self.output_dim]
        z_p = self.inner_activation(x_zp + inner_zp)

        # rp is for gating to the intermediate unit of parent
        inner_rp = matrix_p[:, self.output_dim:2 * self.output_dim]
        r_p = self.inner_activation(x_rp + inner_rp)

        matrix_v = K.dot(h_v * B_U[0], self.U_v[:, :2 * self.output_dim])
        # rv is for the intermediate gate on the traversal unit
        # this gets reused for both the parent's and its own intermediate
        inner_rv = matrix_v[:, self.output_dim:2 * self.output_dim]
        r_v = self.inner_activation(x_rv + inner_rv)

        # the actual recurrence calculations
        # h_p * U and h_v * U ; as gated by their r gates
        inner_hp = K.dot(r_p * h_p * B_U[0], self.U_p[:, 2 * self.output_dim:])
        inner_hv = K.dot(r_v * h_v * B_U[0], self.U_v[:, 2 * self.output_dim:])
        # h_c_tilde is the intermediate state
        h_c_tilde = self.activation(x_ih + inner_hp + inner_hv)
        # h_c is the new child state
        h_c = z_p * h_c_tilde + (1 - z_p) * h_p

        matrix_c = K.dot(h_c * B_U[0], self.U_c) + self.b_c

        hc_zv = matrix_c[:, :self.output_dim]
        hc_rv = matrix_c[:, self.output_dim:2 * self.output_dim]
        hc_ih = matrix_c[:, 2 * self.output_dim:]

        ### zv -> gate h_v  and h_v_tilde
        ### rv -> gate h_v's contribution to h_v_tilde
        ### ih -> h_c's contribution to h_v_tilde

        # zv is for the traversal unit update.
        inner_zv = matrix_v[:, :self.output_dim]
        z_v = self.inner_activation(hc_zv + inner_zv)
        ## r_v is calculated with h_c rather than x
        r_v = self.inner_activation(hc_rv + inner_rv)

        inner_hvplus = K.dot(r_v * h_v * B_U[0],
                             self.U_v[:, 2 * self.output_dim:])
        h_vplus_tilde = self.activation(hc_ih + inner_hvplus)
        h_vplus = z_v * h_v + (1 - z_v) * h_vplus_tilde

        return h_c, h_vplus
Example #52
    def call(self, X, mask=None):
        # input: D (sample,c,w,d)
        proj_input = self.activation(tf.tensordot(X, self.att_proj, axes=[[3],[0]])) # tanh(dot(D,P))=Dl,(sample,c,w,p)
        if self.context == 'word':
            raw_att_scores = tf.tensordot(proj_input, self.att_scorer, axes=[[3],[0]]) # (sample,c,w)
        elif self.context == 'clause':
            def step(X, states):
                new_state = activations.tanh(tf.tensordot(X,self.encoder_weight, axes=[[2],[0]]) \
                    + tf.tensordot(states[0],self.recurrent_weight, axes=[[2],[0]]))
                return new_state, [new_state]
            # Make all-zero initial state. 
            # Directly obtaining the first input dimension is not allowed, so this is the work-around.
            initial_state = tf.tensordot(K.max(proj_input*0,axis=2),K.zeros((self.proj_dim, self.rec_hid_dim)), axes = [[2],[0]])
            proj_input_permute = K.permute_dimensions(proj_input,(0,2,1,3))
            _,all_rnn_out,_ = K.rnn(step,proj_input_permute,[initial_state])
            raw_att_scores = tf.tensordot(K.permute_dimensions(all_rnn_out,(0,2,1,3)), 
                                                self.att_scorer, axes=[[3],[0]])
        
        elif self.context == 'bidirectional_clause':
            def step_forward(X, states):
                new_state = activations.tanh(tf.tensordot(X,self.encoder_weight_forward, axes=[[2],[0]]) \
                    + tf.tensordot(states[0],self.recurrent_weight_forward, axes=[[2],[0]]))
                return new_state, [new_state]
            def step_backward(X, states):
                new_state = activations.tanh(tf.tensordot(X,self.encoder_weight_backward, axes=[[2],[0]]) \
                    + tf.tensordot(states[0],self.recurrent_weight_backward, axes=[[2],[0]]))
                return new_state, [new_state]
            # Make all-zero initial state. 
            # Directly obtaining the first input dimension is not allowed, so this is the work-around.
            initial_state = tf.tensordot(K.max(proj_input*0,axis=2),K.zeros((self.proj_dim, self.rec_hid_dim)), axes = [[2],[0]]) 
            proj_input_permute = K.permute_dimensions(proj_input,(0,2,1,3))
            proj_input_permute_backward = K.reverse(proj_input_permute, 1)
            _,all_rnn_out_forward,_ = K.rnn(step_forward,proj_input_permute,[initial_state])
            _,all_rnn_out_backward,_ = K.rnn(step_backward,proj_input_permute,[initial_state])
            all_rnn_out = all_rnn_out_forward+all_rnn_out_backward
            raw_att_scores = tf.tensordot(K.permute_dimensions(all_rnn_out,(0,2,1,3)), 
                                                self.att_scorer, axes=[[3],[0]])

        elif self.context == 'LSTM_clause':
            def step(inputs, states):
                h_tm1 = states[0]  # previous memory state
                c_tm1 = states[1]  # previous carry state

                x_i = tf.tensordot(inputs, self.kernel_i,axes=[[2],[0]])
                x_f = tf.tensordot(inputs, self.kernel_f,axes=[[2],[0]])
                x_c = tf.tensordot(inputs, self.kernel_c,axes=[[2],[0]])
                x_o = tf.tensordot(inputs, self.kernel_o,axes=[[2],[0]])
                x_i = K.bias_add(x_i, self.bias_i)
                x_f = K.bias_add(x_f, self.bias_f)
                x_c = K.bias_add(x_c, self.bias_c)
                x_o = K.bias_add(x_o, self.bias_o)
                i = activations.hard_sigmoid(x_i + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_i,axes=[[2],[0]]))
                f = activations.hard_sigmoid(x_f + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_f,axes=[[2],[0]]))
                c = f * c_tm1 + i * activations.tanh(x_c + tf.tensordot(h_tm1,
                                                                self.recurrent_kernel_c,axes=[[2],[0]]))
                o = activations.hard_sigmoid(x_o + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_o,axes=[[2],[0]]))
                h = o * activations.tanh(c)

                return h, [h, c]
            # Make all-zero initial state. 
            # Directly obtaining the first input dimension is not allowed, so this is the work-around.
            initial_state = tf.tensordot(K.max(proj_input*0,axis=2),K.zeros((self.proj_dim, self.rec_hid_dim)), axes = [[2],[0]])
            proj_input_permute = K.permute_dimensions(proj_input,(0,2,1,3))
            _,all_rnn_out,_ = K.rnn(step,proj_input_permute,[initial_state,initial_state])
            raw_att_scores = tf.tensordot(K.permute_dimensions(all_rnn_out,(0,2,1,3)), 
                                                self.att_scorer, axes=[[3],[0]])
        elif self.context == 'biLSTM_clause':
            def step_forward(inputs, states):
                h_tm1 = states[0]  # previous memory state
                c_tm1 = states[1]  # previous carry state

                x_i = tf.tensordot(inputs, self.kernel_i_forward,axes=[[2],[0]])
                x_f = tf.tensordot(inputs, self.kernel_f_forward,axes=[[2],[0]])
                x_c = tf.tensordot(inputs, self.kernel_c_forward,axes=[[2],[0]])
                x_o = tf.tensordot(inputs, self.kernel_o_forward,axes=[[2],[0]])
                x_i = K.bias_add(x_i, self.bias_i_forward)
                x_f = K.bias_add(x_f, self.bias_f_forward)
                x_c = K.bias_add(x_c, self.bias_c_forward)
                x_o = K.bias_add(x_o, self.bias_o_forward)
                i = activations.hard_sigmoid(x_i + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_i_forward,axes=[[2],[0]]))
                f = activations.hard_sigmoid(x_f + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_f_forward,axes=[[2],[0]]))
                c = f * c_tm1 + i * activations.tanh(x_c + tf.tensordot(h_tm1,
                                                                self.recurrent_kernel_c_forward,axes=[[2],[0]]))
                o = activations.hard_sigmoid(x_o + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_o_forward,axes=[[2],[0]]))
                h = o * activations.tanh(c)

                return h, [h, c]

            def step_backward(inputs, states):
                h_tm1 = states[0]  # previous memory state
                c_tm1 = states[1]  # previous carry state

                x_i = tf.tensordot(inputs, self.kernel_i_backward,axes=[[2],[0]])
                x_f = tf.tensordot(inputs, self.kernel_f_backward,axes=[[2],[0]])
                x_c = tf.tensordot(inputs, self.kernel_c_backward,axes=[[2],[0]])
                x_o = tf.tensordot(inputs, self.kernel_o_backward,axes=[[2],[0]])
                x_i = K.bias_add(x_i, self.bias_i_backward)
                x_f = K.bias_add(x_f, self.bias_f_backward)
                x_c = K.bias_add(x_c, self.bias_c_backward)
                x_o = K.bias_add(x_o, self.bias_o_backward)
                i = activations.hard_sigmoid(x_i + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_i_backward,axes=[[2],[0]]))
                f = activations.hard_sigmoid(x_f + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_f_backward,axes=[[2],[0]]))
                c = f * c_tm1 + i * activations.tanh(x_c + tf.tensordot(h_tm1,
                                                                self.recurrent_kernel_c_backward,axes=[[2],[0]]))
                o = activations.hard_sigmoid(x_o + tf.tensordot(h_tm1,
                                                          self.recurrent_kernel_o_backward,axes=[[2],[0]]))
                h = o * activations.tanh(c)

                return h, [h, c]

            # Make all-zero initial state. 
            # Directly obtaining the first input dimension is not allowed, so this is the work-around.
            initial_state = tf.tensordot(K.max(proj_input*0,axis=2),K.zeros((self.proj_dim, self.rec_hid_dim)), axes = [[2],[0]])
            proj_input_permute = K.permute_dimensions(proj_input,(0,2,1,3))
            proj_input_permute_backward = K.reverse(proj_input_permute, 1)
            _,all_rnn_out_forward,_ = K.rnn(step_forward,proj_input_permute,[initial_state,initial_state])
            _,all_rnn_out_backward,_ = K.rnn(step_backward,proj_input_permute_backward,[initial_state,initial_state])
            all_rnn_out = K.concatenate([all_rnn_out_forward,all_rnn_out_backward],axis=-1)
            raw_att_scores = tf.tensordot(K.permute_dimensions(all_rnn_out,(0,2,1,3)), 
                                                self.att_scorer, axes=[[3],[0]])


        elif self.context == 'para':
            raw_att_scores = K.sum(tf.tensordot(proj_input, self.att_scorer, axes=[[3],[2]]), axis = [1, 2]) # (sample,c,w)
        
        if self.hard: # Hard attention
            rep_att_score = K.repeat_elements(K.expand_dims(raw_att_scores),rep=self.wd,axis=-1)
            top = tf.nn.top_k(K.permute_dimensions(rep_att_score,(0,1,3,2)),k=self.k).indices
            permute_X = K.permute_dimensions(X,(0,1,3,2))
            reduced_X = K.permute_dimensions(tf.batch_gather(permute_X, top),(0,1,3,2))
            new_att_scores = K.softmax(tf.nn.top_k(raw_att_scores,k=self.k).values,axis=2)
            result = K.batch_dot(new_att_scores,reduced_X,axes=[2,2])
        else:
            att_scores = K.softmax(raw_att_scores, axis=2)
            result = K.batch_dot(att_scores,X,axes=[2,2]) # (sample,c,d)
        if self.return_attention:
            return [result, raw_att_scores]
        else:
            return result
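
Per the shape comments above, the soft branch pools the word axis with the
attention weights: K.batch_dot(att_scores, X, axes=[2, 2]) turns (sample, c, w)
scores and (sample, c, w, d) inputs into (sample, c, d). A numpy rendering of
that contraction (illustrative only):

import numpy as np

att = np.random.rand(2, 3, 7)   # (sample, clause, word) weights
X = np.random.rand(2, 3, 7, 5)  # (sample, clause, word, dim)
pooled = np.einsum('bcw,bcwd->bcd', att, X)  # (sample, clause, dim)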
Example #53
    def local_conv3d(self,
                     inputs,
                     kernel,
                     kernel_size,
                     strides,
                     output_shape,
                     data_format=None):
        """Apply 3D conv with un-shared weights.
        # Arguments
            inputs: 4D tensor with shape:
                    (batch_size, filters, new_rows, new_cols)
                    if data_format='channels_first'
                    or 4D tensor with shape:
                    (batch_size, new_rows, new_cols, filters)
                    if data_format='channels_last'.
            kernel: the unshared weight for convolution,
                    with shape (output_items, feature_dim, filters)
            kernel_size: a tuple of 2 integers, specifying the
                        width and height of the 3D convolution window.
            strides: a tuple of 2 integers, specifying the strides
                    of the convolution along the width and height.
            output_shape: a tuple with (output_row, output_col)
            data_format: the data format, channels_first or channels_last
        # Returns
            A 4d tensor with shape:
            (batch_size, filters, new_rows, new_cols)
            if data_format='channels_first'
            or 4D tensor with shape:
            (batch_size, new_rows, new_cols, filters)
            if data_format='channels_last'.
        # Raises
            ValueError: if `data_format` is neither
                        `channels_last` or `channels_first`.
        """
        if data_format is None:
            data_format = K.image_data_format()
        if data_format not in {'channels_first', 'channels_last'}:
            raise ValueError('Unknown data_format: ' + str(data_format))

        stride_row, stride_col, stride_z = strides
        output_row, output_col, output_z = output_shape
        kernel_shape = K.int_shape(kernel)
        _, feature_dim, filters = kernel_shape

        xs = []
        for i in range(output_row):
            for j in range(output_col):
                for k in range(output_z):
                    slice_row = slice(i * stride_row,
                                      i * stride_row + kernel_size[0])
                    slice_col = slice(j * stride_col,
                                      j * stride_col + kernel_size[1])
                    slice_z = slice(k * stride_z,
                                    k * stride_z + kernel_size[2])
                    if data_format == 'channels_first':
                        xs.append(
                            K.reshape(
                                inputs[:, :, slice_row, slice_col, slice_z],
                                (1, -1, feature_dim)))
                    else:
                        xs.append(
                            K.reshape(
                                inputs[:, slice_row, slice_col, slice_z, :],
                                (1, -1, feature_dim)))

        x_aggregate = K.concatenate(xs, axis=0)
        output = K.batch_dot(x_aggregate, kernel)
        output = K.reshape(output,
                           (output_row, output_col, output_z, -1, filters))

        if data_format == 'channels_first':
            output = K.permute_dimensions(output, (3, 4, 0, 1, 2))
        else:
            output = K.permute_dimensions(output, (3, 0, 1, 2, 4))
        return output
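
Shape bookkeeping for the routine above with illustrative numbers (assuming
data_format='channels_last'):

# inputs:       (batch, 10, 10, 10, 4)
# kernel_size:  (3, 3, 3), strides: (1, 1, 1)  ->  output_shape: (8, 8, 8)
# feature_dim:  3 * 3 * 3 * 4 = 108
# kernel:       (8 * 8 * 8, 108, filters)
# output:       (batch, 8, 8, 8, filters)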
Example #54
0
    def __init__(self,
                 model,
                 bounds,
                 channel_axis=3,
                 preprocessing=(0, 1),
                 predicts='probabilities'):

        super(KerasModel, self).__init__(bounds=bounds,
                                         channel_axis=channel_axis,
                                         preprocessing=preprocessing)

        from keras import backend as K
        import keras
        from pkg_resources import parse_version

        assert parse_version(keras.__version__) >= parse_version(
            '2.0.7'), 'Keras version needs to be 2.0.7 or newer'

        if predicts == 'probs':
            predicts = 'probabilities'
        assert predicts in ['probabilities', 'logits']

        inputs = model.input
        labels = K.placeholder(shape=(None, ))
        predictions = model.output

        shape = K.int_shape(predictions)
        _, num_classes = shape
        assert num_classes is not None

        self._num_classes = num_classes

        if predicts == 'probabilities':
            if K.backend() == 'tensorflow':
                predictions, = predictions.op.inputs
                loss = K.sparse_categorical_crossentropy(labels,
                                                         predictions,
                                                         from_logits=True)
            else:  # pragma: no cover
                logging.warning(
                    'relying on numerically unstable conversion from probabilities to softmax'
                )
                loss = K.sparse_categorical_crossentropy(labels,
                                                         predictions,
                                                         from_logits=False)

                # transform the probability predictions into logits, so that
                # the rest of this code can assume predictions to be logits
                predictions = self._to_logits(predictions)
        elif predicts == 'logits':
            loss = K.sparse_categorical_crossentropy(labels,
                                                     predictions,
                                                     from_logits=True)

        loss = K.sum(loss, axis=0)
        gradient, = K.gradients(loss, [inputs])

        backward_grad_logits = K.placeholder(shape=predictions.shape)
        backward_loss = K.sum(K.batch_dot(predictions,
                                          backward_grad_logits,
                                          axes=-1),
                              axis=0)
        backward_grad_inputs, = K.gradients(backward_loss, [inputs])

        self._loss_fn = K.function([inputs, labels], [loss])
        self._forward_fn = K.function([inputs], [predictions])
        self._gradient_fn = K.function([inputs, labels], [gradient])
        self._backward_fn = K.function([backward_grad_logits, inputs],
                                       [backward_grad_inputs])
        self._forward_and_gradient_fn = K.function([inputs, labels],
                                                   [predictions, gradient])
Example #55
def L2X(datatype, train=True):
    # the whole thing is equation (5)
    x_train, y_train, x_val, y_val, datatype_val, input_shape = create_data(
        datatype, n=int(1e6))

    st1 = time.time()
    st2 = st1
    print(input_shape)
    activation = 'relu'
    # P(S|X) we train the model on this, for capturing the important features.
    model_input = Input(shape=(input_shape, ), dtype='float32')

    net = Dense(100,
                activation=activation,
                name='s/dense1',
                kernel_regularizer=regularizers.l2(1e-3))(model_input)
    net = Dense(100,
                activation=activation,
                name='s/dense2',
                kernel_regularizer=regularizers.l2(1e-3))(net)

    # A tensor of shape, [batch_size, max_sents, 100]

    mid_dim = input_shape * num_groups

    logits = Dense(mid_dim)(net)
    # [BATCH_SIZE, max_sents, 1]

    k = ks[datatype]
    tau = 0.1

    samples = Sample_Concrete(tau, k, input_shape, num_groups,
                              name='sample')(logits)

    # samples = Reshape((num_groups, input_shape))(samples)
    samples = Reshape((input_shape, num_groups))(samples)
    samples = Permute((2, 1))(samples)

    # samples is a K*D x 1 vector; reshape it into a K x D matrix so that
    # (K x D) * (D x 1) = K x 1 becomes the new_model_input
    #   1) one neural net that gives
    #   2) separate neural net with one node as input.

    # q(X_S) variational family
    # new_model_input = Multiply()([model_input, samples])
    # new_model_input =  Dot(samples, model_input)

    def matmul_output_shape(input_shapes):
        shape1 = list(input_shapes[0])
        shape2 = list(input_shapes[1])
        return tuple((shape1[0], shape1[1]))

    matmul_layer = Lambda(lambda x: K.batch_dot(x[0], x[1]),
                          output_shape=matmul_output_shape)
    new_model_input = matmul_layer([samples, model_input])  # bs, num_groups

    #### here we apply instance-wise feature selection again I(Xs;Y)
    net2 = Dense(100,
                 activation=activation,
                 name='g/dense1',
                 kernel_regularizer=regularizers.l2(1e-3))(new_model_input)
    net2 = Dense(100,
                 activation=activation,
                 name='g/dense2',
                 kernel_regularizer=regularizers.l2(1e-3))(net2)
    logits = Dense(num_groups)(net2)
    samples_grp = Sample_Concrete_Original(tau,
                                           num_important_groups,
                                           name='group_selection')(logits)
    new_model_input2 = Multiply()([new_model_input, samples_grp])

    #net = Dense(200, activation=activation, name = 'dense2',
    #	kernel_regularizer=regularizers.l2(1e-3))(new_model_input)
    #net = BatchNormalization()(net)
    net = Dense(32,
                activation=activation,
                name='dense1',
                kernel_regularizer=regularizers.l2(1e-3))(new_model_input2)
    net = BatchNormalization()(net)  # Add batchnorm for stability.
    net = Dense(16,
                activation=activation,
                name='dense2',
                kernel_regularizer=regularizers.l2(1e-3))(net)
    net = BatchNormalization()(net)

    preds = Dense(2,
                  activation='softmax',
                  name='dense4',
                  kernel_regularizer=regularizers.l2(1e-3))(net)

    #### Here is the other branch: I(X_G; Y)
    net3 = Dense(100,
                 activation=activation,
                 name='g2/dense1',
                 kernel_regularizer=regularizers.l2(1e-3))(new_model_input)
    net3 = Dense(100,
                 activation=activation,
                 name='g2/dense2',
                 kernel_regularizer=regularizers.l2(1e-3))(net3)
    preds2 = Dense(2,
                   activation='softmax',
                   name='g2/dense4',
                   kernel_regularizer=regularizers.l2(1e-3))(net3)

    model = Model(inputs=model_input, outputs=[preds, preds2])
    model.summary()

    if train:
        adam = optimizers.Adam(lr=1e-3)
        #### Here, adjust the relative weighting of the two losses.
        l1 = 1.0
        l2 = 1.0
        model.compile(
            loss=['categorical_crossentropy', 'categorical_crossentropy'],
            loss_weights=[l1, l2],
            optimizer=adam,
            metrics=['acc'])
        filepath = "models/{}/L2X.hdf5".format(datatype)
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='val_acc',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='max')
        callbacks_list = [checkpoint]
        model.fit(x_train, [y_train, y_train],
                  validation_data=(x_val, [y_val, y_val]),
                  callbacks=callbacks_list,
                  epochs=2,
                  batch_size=BATCH_SIZE)
        st2 = time.time()
    else:
        model.load_weights('models/{}/L2X.hdf5'.format(datatype), by_name=True)

    pred_model = Model(model_input, [samples, samples_grp])
    pred_model.compile(loss=None, optimizer='rmsprop', metrics=[None])

    # For now samples is a matrix instead of a vector

    scores, scores_grp = pred_model.predict(x_val,
                                            verbose=1,
                                            batch_size=BATCH_SIZE)

    # We need to write a new compute_median_rank to do analysis
    # median_ranks = compute_median_rank(scores, k = ks[datatype],
    #		datatype_val=datatype_val)
    median_ranks = compute_groups(scores)

    return (median_ranks, time.time() - st2, st2 - st1, scores, scores_grp,
            x_val, y_val)
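For reference, a minimal sketch of the concrete (Gumbel-softmax) relaxation that a layer like Sample_Concrete builds on; this is the generic trick, not the repository's exact implementation, and it ignores the top-k selection of features:

from keras import backend as K

def gumbel_softmax(logits, tau=0.1):
    # Perturb the logits with Gumbel noise and take a temperature-scaled
    # softmax; as tau -> 0 the output approaches a one-hot sample.
    uniform = K.random_uniform(K.shape(logits), K.epsilon(), 1.0 - K.epsilon())
    gumbel = -K.log(-K.log(uniform))
    return K.softmax((logits + gumbel) / tau)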
Example #56
0
def tangent_distance(signals,
                     protos,
                     subspaces,
                     squared=False,
                     epsilon=K.epsilon()):
    # Note: subspaces is always assumed to be transposed and must be orthogonal!
    # shape(signals): batch x proto_number x channels x dim1 x dim2 x ... x dimN
    # shape(protos): proto_number x dim1 x dim2 x ... x dimN
    # shape(subspaces): (optional [proto_number]) x prod(dim1 * dim2 * ... * dimN) x prod(projected_atom_shape)

    signal_shape, signal_int_shape = _int_and_mixed_shape(signals)
    proto_shape, proto_int_shape = _int_and_mixed_shape(protos)
    subspace_int_shape = K.int_shape(subspaces)

    # check if the shapes are correct
    _check_shapes(signal_int_shape, proto_int_shape)

    with K.name_scope('tangent_distance'):
        atom_axes = list(range(3, len(signal_int_shape)))
        # for sparse signals, we use the memory efficient implementation
        if signal_int_shape[1] == 1:
            signals = K.reshape(signals, [-1, np.prod(signal_shape[3:])])

            if len(atom_axes) > 1:
                protos = K.reshape(protos, [proto_shape[0], -1])

            if K.ndim(subspaces) == 2:
                # clean solution without map_fn if the matrix_scope is global
                with K.name_scope('projectors'):
                    projectors = K.eye(subspace_int_shape[-2]) - K.dot(
                        subspaces, K.transpose(subspaces))

                with K.name_scope('tangentspace_projections'):
                    projected_signals = K.dot(signals, projectors)
                    projected_protos = K.dot(protos, projectors)

                diss = euclidean_distance(projected_signals,
                                          projected_protos,
                                          squared=squared,
                                          epsilon=epsilon)

                diss = K.reshape(
                    diss, [signal_shape[0], signal_shape[2], proto_shape[0]])

                return K.permute_dimensions(diss, [0, 2, 1])

            else:
                # no solution without map_fn possible --> memory efficient but slow!
                with K.name_scope('projectors'):
                    projectors = K.eye(subspace_int_shape[-2]) - K.batch_dot(
                        subspaces, subspaces, [2, 2])

                with K.name_scope('tangentspace_projections'):
                    projected_protos = K.transpose(
                        K.batch_dot(projectors, protos, [1, 1]))

                with K.name_scope('euclidean_distance'):

                    def projected_norm(projector):
                        return K.sum(K.square(K.dot(signals, projector)),
                                     axis=1)

                    diss = K.transpose(K.map_fn(projected_norm, projectors)) \
                           - 2 * K.dot(signals, projected_protos) \
                           + K.sum(K.square(projected_protos), axis=0, keepdims=True)

                    if not squared:
                        if epsilon == 0:
                            diss = K.sqrt(diss)
                        else:
                            diss = K.sqrt(K.maximum(diss, epsilon))

                diss = K.reshape(
                    diss, [signal_shape[0], signal_shape[2], proto_shape[0]])

                return K.permute_dimensions(diss, [0, 2, 1])

        else:
            signals = K.permute_dimensions(signals, [0, 2, 1] + atom_axes)
            diff = signals - protos

            # global tangent space
            if K.ndim(subspaces) == 2:
                with K.name_scope('projectors'):
                    projectors = K.eye(subspace_int_shape[-2]) - K.dot(
                        subspaces, K.transpose(subspaces))

                with K.name_scope('tangentspace_projections'):
                    diff = K.reshape(diff, (signal_shape[0] * signal_shape[2],
                                            signal_shape[1], -1))
                    projected_diff = K.dot(diff, projectors)
                    projected_diff = K.reshape(
                        projected_diff,
                        (signal_shape[0], signal_shape[2], signal_shape[1]) +
                        signal_shape[3:])

                diss = p_norm(projected_diff,
                              order_p=2,
                              axis=atom_axes,
                              squared=squared,
                              keepdims=False,
                              epsilon=epsilon)
                return K.permute_dimensions(diss, [0, 2, 1])

            # local tangent spaces
            else:
                with K.name_scope('projectors'):
                    projectors = K.eye(subspace_int_shape[-2]) - K.batch_dot(
                        subspaces, subspaces, [2, 2])

                with K.name_scope('tangentspace_projections'):
                    diff = K.reshape(diff, (signal_shape[0] * signal_shape[2],
                                            signal_shape[1], -1))
                    diff = K.permute_dimensions(diff, [1, 0, 2])
                    projected_diff = K.batch_dot(diff, projectors)
                    projected_diff = K.reshape(
                        projected_diff,
                        (signal_shape[1], signal_shape[0], signal_shape[2]) +
                        signal_shape[3:])

                diss = p_norm(projected_diff,
                              order_p=2,
                              axis=atom_axes,
                              squared=squared,
                              keepdims=False,
                              epsilon=epsilon)
                return K.permute_dimensions(diss, [1, 0, 2])
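A short sketch of building an orthogonal global subspaces tensor for tangent_distance; the QR factorization is one way to meet the orthogonality requirement, and the sizes here are made up:

import numpy as np
from keras import backend as K

flat_dim, tangent_dim = 64, 12     # prod(dim1 * ... * dimN) and projected size (assumed)
basis, _ = np.linalg.qr(np.random.randn(flat_dim, tangent_dim))
subspaces = K.constant(basis)      # orthonormal columns
# K.ndim(subspaces) == 2 selects the global-tangent-space branch, which builds
# the projector I - U U^T via K.dot(subspaces, K.transpose(subspaces)).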
Example #57
0
    def call(self, inputs):

        #Ordinary Conv2D Convolution kernel
        outputs = K.conv2d(inputs,
                           self.kernel,
                           strides=self.strides,
                           padding=self.padding,
                           data_format='channels_last',
                           dilation_rate=self.dilation_rate)

        if self.use_bias:
            outputs = K.bias_add(outputs,
                                 self.bias,
                                 data_format='channels_last')

        if self.activation is not None:
            outputs = self.activation(outputs)

        #Add second part of semi-convolutional operator
        shape = K.shape(outputs)
        shape = [shape[i] for i in range(4)]
        batch_size, x_dim, y_dim, c1 = shape

        #Create tensors containing x/y pixel locations
        xx_ones = K.ones([batch_size, x_dim], dtype='int32')
        xx_ones = K.expand_dims(xx_ones, -1)
        xx_range = K.tile(K.expand_dims(K.arange(x_dim), 0), [batch_size, 1])
        xx_range = K.expand_dims(xx_range, 1)
        xx_channel = K.batch_dot(xx_ones, xx_range)
        xx_channel = K.expand_dims(xx_channel, -1)
        xx_channel = K.cast(xx_channel, 'float32')
        if self.normalized_position:
            xx_channel = xx_channel / (K.cast(x_dim, 'float32') - 1)
            xx_channel = xx_channel * 2 - 1

        yy_ones = K.ones([batch_size, y_dim], dtype='int32')
        yy_ones = K.expand_dims(yy_ones, 1)
        yy_range = K.tile(K.expand_dims(K.arange(y_dim), 0), [batch_size, 1])
        yy_range = K.expand_dims(yy_range, -1)
        yy_channel = K.batch_dot(yy_range, yy_ones)
        yy_channel = K.expand_dims(yy_channel, -1)
        yy_channel = K.cast(yy_channel, 'float32')
        if self.normalized_position:
            yy_channel = yy_channel / (K.cast(y_dim, 'float32') - 1)  # normalize by the y extent
            yy_channel = yy_channel * 2 - 1

        #Concat global x and y location
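        # Note: xx_channel is (batch, x_dim, x_dim, 1) and yy_channel is
        # (batch, y_dim, y_dim, 1), so the concatenation below implicitly
        # assumes square feature maps (x_dim == y_dim).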
        semi_tensor = K.concatenate([xx_channel, yy_channel], axis=-1)

        #Apply Lambda function
        if self.function is not None:
            semi_tensor = self.function(semi_tensor, self.normalized_position,
                                        **self.arguments)

        c2 = K.shape(semi_tensor)[-1]

        #Pad with "zero" channels
        semi_tensor = K.concatenate(
            [semi_tensor,
             K.zeros([batch_size, x_dim, y_dim, c1 - c2])],
            axis=-1)

        #Sum the convolutional output with the semi_tensor
        joint_outputs = outputs + semi_tensor
        return joint_outputs  #, semi_tensor, outputs
Example #58
0
def kl_dist(vects):
    # Computes sum_i q_i * log(d_i): the cross term of KL(q || d), up to sign
    # and the entropy of q.
    qry_vec, doc_vec = vects
    qry_vec = K.clip(qry_vec, K.epsilon(), 1)
    doc_vec = K.clip(doc_vec, K.epsilon(), 1)
    dist = K.batch_dot(qry_vec, K.log(doc_vec), 1)
    return dist
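A hedged sketch of using kl_dist inside a model via a Lambda layer; the vocabulary size and layer wiring are assumed:

from keras.layers import Input, Lambda
from keras.models import Model

vocab_size = 1000                        # assumed
qry = Input(shape=(vocab_size,))         # query term distribution
doc = Input(shape=(vocab_size,))         # document term distribution
score = Lambda(kl_dist)([qry, doc])      # shape (batch_size, 1)
ranker = Model(inputs=[qry, doc], outputs=score)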
Example #59
0
    def call(self, inputs):  # the function that actually runs the layer

        init_states = [
            K.zeros((K.shape(inputs)[0], K.shape(inputs)[-1])),
            K.zeros((K.shape(inputs)[0], K.shape(inputs)[-1]))
        ]  # initial states (all zeros)
        #init_states = [inputs[:,0], inputs[:,0]]
        #print('inputs',K.shape(inputs)[0])
        outputs = K.rnn(self.step_do, inputs, init_states,
                        unroll=False)  # iterate step_do over the timesteps
        #print('outputs[1]',outputs.shape)

        print('outputs[0].shape', outputs[0].shape)

        query1 = K.dot(outputs[1], self.query_kernel1)

        key1 = K.dot(outputs[1], self.key_kernel1)

        value1 = K.dot(outputs[1], self.value_kernel1)

        attention_prob1 = K.batch_dot(query1, key1, axes=[2, 2]) / np.sqrt(
            self.units)
        attention_prob1 = K.softmax(attention_prob1)
        att_out1 = K.batch_dot(attention_prob1, value1, axes=[2, 1])

        query2 = K.dot(outputs[1], self.query_kernel2)

        key2 = K.dot(outputs[1], self.key_kernel2)

        value2 = K.dot(outputs[1], self.value_kernel2)

        attention_prob2 = K.batch_dot(query2, key2, axes=[2, 2]) / np.sqrt(
            self.units)
        attention_prob2 = K.softmax(attention_prob2)
        att_out2 = K.batch_dot(attention_prob2, value2, axes=[2, 1])

        query3 = K.dot(outputs[1], self.query_kernel3)

        key3 = K.dot(outputs[1], self.key_kernel3)

        value3 = K.dot(outputs[1], self.value_kernel3)

        attention_prob3 = K.batch_dot(query3, key3, axes=[2, 2]) / np.sqrt(
            self.units)
        attention_prob3 = K.softmax(attention_prob3)
        att_out3 = K.batch_dot(attention_prob3, value3, axes=[2, 1])

        query4 = K.dot(outputs[1], self.query_kernel4)

        key4 = K.dot(outputs[1], self.key_kernel4)

        value4 = K.dot(outputs[1], self.value_kernel4)

        attention_prob4 = K.batch_dot(query4, key4, axes=[2, 2]) / np.sqrt(
            self.units)
        attention_prob4 = K.softmax(attention_prob4)
        att_out4 = K.batch_dot(attention_prob4, value4, axes=[2, 1])

        att_out = K.concatenate([att_out1, att_out2, att_out3, att_out4],
                                axis=-1)
        out = K.dot(att_out, self.switch_kernel)
        return out[:, -1]
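The four head computations above are identical up to their kernels; a hedged refactoring sketch of a single head (the kernel arguments are assumed to be the self.query_kernel*/key_kernel*/value_kernel* weights from above):

import numpy as np
from keras import backend as K

def attention_head(seq, w_query, w_key, w_value, units):
    # Scaled dot-product self-attention over a (batch, timesteps, units) tensor.
    q = K.dot(seq, w_query)
    k = K.dot(seq, w_key)
    v = K.dot(seq, w_value)
    prob = K.softmax(K.batch_dot(q, k, axes=[2, 2]) / np.sqrt(units))
    return K.batch_dot(prob, v, axes=[2, 1])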
Example #60
0
    def call(self, X, mask=None):
        assert isinstance(X, list) and len(X) >= 2, \
            "Bad input: expected a list of [input, encoder, decoder]"

        if (len(X) == 3):
            x_T, e_T, d_T = X
        elif (len(X) == 2):
            x_T, e_T = X
            d_T = e_T
        # (batch_size ,sequence_len, feature_dim) -> (batch_size ,feature_dim,sequence_len)
        x = K.permute_dimensions(x_T, (0, 2, 1))
        # print("SHAPE!!!!", K.eval(x.shape))
        # x_T = theano.printing.Print('x_T',attrs=['shape'])(x_T)
        if (K.backend() == "tensorflow"):
            assert self.seq_len is not None, 'Must set Ptr_Layer(seq_len=?) if using Tensorflow'
            seq_len = self.seq_len
        else:
            seq_len = K.shape(e_T)[1]
        # Shape key:
        # x_T:  #(batch_size ,sequence_len, feature_dim)
        # e_T:  #(batch_size ,sequence_len, recurrent_dim)
        # d_T:  #(batch_size ,sequence_len, recurrent_dim)

        # (batch_size ,sequence_len, recurrent_dim) * (recurrent_dim,att_dim) -> #(batch_size ,sequence_len,att_dim)
        _e_T, _d_T = K.dot(e_T, K.transpose(self.W1)), K.dot(
            d_T, K.transpose(self.W2))  # (batch_size ,sequence_len, att_dim)
        _e, _d = K.permute_dimensions(_e_T, (0, 2, 1)), K.permute_dimensions(
            _d_T, (0, 2, 1))  # (batch_size ,att_dim, sequence_len)

        # _e = theano.printing.Print('_e', attrs=['shape'])(_e)
        # _d = theano.printing.Print('_d', attrs=['shape'])(_d)

        def Tmap(fn, arrays, dtype='float32'):
            # assumes all arrays have the same leading dim
            indices = K.arange(K.shape(arrays[0])[0])
            out = K.map_fn(lambda ii: fn(*[array[ii] for array in arrays]),
                           indices,
                           dtype=dtype)
            return out

        if (self.implementation == 'ptr_net'):
            print("PTR_NET")

            E_T = K.repeat_elements(
                K.expand_dims(_e_T, axis=1), seq_len,
                axis=1)  # (batch_size, sequence_len, sequence_len, att_dim)
            D_T = K.repeat_elements(
                K.expand_dims(_d_T, axis=1), seq_len,
                axis=1)  # (batch_size, sequence_len, sequence_len, att_dim)

            D = K.permute_dimensions(
                D_T, (0, 2, 1,
                      3))  # (batch_size ,sequence_len, sequence_len, att_dim)

            u = K.squeeze(K.dot(K.tanh(E_T + D), self.v),
                          axis=-1)  # (batch_size ,sequence_len, sequence_len)
            u = K.permute_dimensions(u, (0, 2, 1))
            # axis=2 is row axis therefore u*x has columns that are linear combos of x
            u = softmax(u, axis=2)  # (batch_size ,sequence_len, sequence_len)
        elif (self.implementation == 'ptr_net_scan'):

            def _ptr_net_u(_e_T, _d_T):
                __E_T = K.repeat_elements(
                    K.expand_dims(_e_T, axis=0), seq_len,
                    axis=0)  # (sequence_len, sequence_len, att_dim)
                __D_T = K.repeat_elements(
                    K.expand_dims(_d_T, axis=0), seq_len,
                    axis=0)  # (sequence_len, sequence_len, att_dim)

                __D = K.permute_dimensions(
                    __D_T, (1, 0, 2))  # (sequence_len, sequence_len, att_dim)

                u = K.dot(K.tanh(__E_T + __D),
                          self.v)  # (sequence_len, sequence_len)
                u = K.squeeze(u, axis=-1)
                u = K.permute_dimensions(u, (1, 0))
                u = softmax(u, axis=1)  # (sequence_len, sequence_len)

                return u

            assert K.backend(
            ) == 'tensorflow', 'ptr_net_scan only works with tensorflow backend'
            import tensorflow as tf
            u = tf.map_fn(lambda x: _ptr_net_u(x[0], x[1]), (_e_T, _d_T),
                          dtype=tf.float32)

        elif (self.implementation == 'custom'):

            # only valid if att_dim == sequence_len
            u = _e + _d_T  # (batch_size, att_dim, att_dim)
            u = softmax(u, axis=2)  # (batch_size, att_dim, att_dim)
        elif (self.implementation == 'custom_T'):
            u = _e_T + _d  # (batch_size, att_dim, att_dim)
            u = softmax(u, axis=2)  # (batch_size, att_dim, att_dim)
        else:
            raise ValueError("implementation not recognized: %r" %
                             self.implementation)

        self.add_loss(giniSparsity(u, self.sparsity_coeff))

        soft_sorted_x = K.batch_dot(u, x, axes=[1, 2])

        # x_T = K.permute_dimensions(soft_sorted_x, (0, 2, 1))
        return soft_sorted_x
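A quick numpy sanity check (shapes taken from the comments above) that the final batch_dot reorders timesteps when u is a hard permutation matrix:

import numpy as np

u = np.eye(3)[[2, 0, 1]][None]                    # (1, seq_len, seq_len), a permutation
x = np.arange(6, dtype=float).reshape(1, 2, 3)    # (batch_size, feature_dim, seq_len)
# Mirrors K.batch_dot(u, x, axes=[1, 2]): contract u's axis 1 with x's axis 2.
soft_sorted = np.einsum('bij,bfi->bjf', u, x)     # (batch_size, seq_len, feature_dim)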