Example no. 1
        def teacher_forced(h, states):
            # switching from (batch_size, previous_layer_input|true_input, output_dim)
            #    to ( previous_layer_input|true_input, batch_size, output_dim)
            axes = [1, 0] + list(range(2, K.ndim(h)))
            h = K.permute_dimensions(h, axes)

            prev_layer_input = h[0:1, :, :]
            true_input = h[1:, :, :self.units]

            # this should correspond to the true input
            prev_sampled_output = true_input

            if self.implementation == 0:
                x_z = prev_layer_input[0, :, :self.units]
                x_r = prev_layer_input[0, :, self.units: 2 * self.units]
                x_h = prev_layer_input[0, :, 2 * self.units:]
            else:
                raise ValueError('Implementation type ' + str(self.implementation) + ' is invalid')

            z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0],
                                                      self.recurrent_kernel_z))
            r = self.recurrent_activation(x_r + K.dot(h_tm1 * rec_dp_mask[1],
                                                      self.recurrent_kernel_r))

            hh = self.activation(x_h +
                                 K.dot(r * h_tm1 * rec_dp_mask[2],
                                       self.recurrent_kernel_h) +
                                 K.dot(r * prev_sampled_output, self.recurrent_kernel_y))

            output = z * h_tm1 + (1. - z) * hh

            return K.stack([output, output])
Example no. 2
    def call(self, inputs, **kwargs):
        assert isinstance(inputs, list) and len(inputs) == 3
        first, second, features = inputs[0], inputs[1], inputs[2]
        if not self.from_logits:
            first = kb.clip(first, 1e-10, 1.0)
            second = kb.clip(second, 1e-10, 1.0)
            first_, second_ = kb.log(first), kb.log(second)
        else:
            first_, second_ = first, second
        # embedded_features.shape = (M, T, 1)
        if self.use_intermediate_layer:
            features = kb.dot(features, self.first_kernel)
            features = kb.bias_add(features, self.first_bias, data_format="channels_last")
            features = self.intermediate_activation(features)
        embedded_features = kb.dot(features, self.features_kernel)
        embedded_features = kb.bias_add(
            embedded_features, self.features_bias, data_format="channels_last")
        if self.use_dimension_bias:
            tiling_shape = [1] * (kb.ndim(first)-1) + [kb.shape(first)[-1]]
            embedded_features = kb.tile(embedded_features, tiling_shape)
            embedded_features = kb.bias_add(
                embedded_features, self.dimensions_bias, data_format="channels_last")
        sigma = kb.sigmoid(embedded_features)

        result = weighted_sum(first_, second_, sigma,
                              self.first_threshold, self.second_threshold)
        probs = kb.softmax(result)
        if self.return_logits:
            return [probs, result]
        return probs
Example no. 3
        def free_running(h, states):

            prev_generated_output = initial_states[0][1:, :, :]
            prev_sampled_output = prev_generated_output

            # switching from (batch_size, previous_layer_input|true_input, output_dim)
            #    to ( previous_layer_input|true_input, batch_size, output_dim)
            axes = [1, 0] + list(range(2, K.ndim(h)))
            h = K.permute_dimensions(h, axes)

            prev_layer_input = h[0:1, :, :]

            if self.implementation == 0:
                x_z = prev_layer_input[0, :, :self.units]
                x_r = prev_layer_input[0, :, self.units: 2 * self.units]
                x_h = prev_layer_input[0, :, 2 * self.units:]

            z = self.recurrent_activation(x_z + K.dot(h_tm1 * rec_dp_mask[0],
                                                      self.recurrent_kernel_z))
            r = self.recurrent_activation(x_r + K.dot(h_tm1 * rec_dp_mask[1],
                                                      self.recurrent_kernel_r))

            hh = self.activation(x_h +
                                 K.dot(r * h_tm1 * rec_dp_mask[2],
                                       self.recurrent_kernel_h) +
                                 K.dot(r * prev_sampled_output, self.recurrent_kernel_y))

            output = z * h_tm1 + (1. - z) * hh

            final_output = self.output_sampling(output, random_cutoff_vec)

            return K.stack([output, final_output])
Example no. 4
 def get_variational_regularization(self, X):
     mean = self.activation(K.dot(X, self.W_mean) + self.b_mean)
     logsigma = self.activation(K.dot(X, self.W_logsigma) + self.b_logsigma)
     return GaussianKL(mean, logsigma,
                       regularizer_scale=self.regularizer_scale,
                       prior_mean=self.prior_mean,
                       prior_logsigma=self.prior_logsigma)
Example no. 5
 def call(self, inputs, **kwargs):
     gate = kb.dot(inputs, self.gate_kernel)
     gate = kb.bias_add(gate, self.gate_bias, data_format="channels_last")
     gate = self.activation(gate)
     new_value = kb.dot(inputs, self.dense_kernel)
     new_value = kb.bias_add(new_value, self.dense_bias, data_format="channels_last")
     return gate * new_value + (1.0 - gate) * inputs
 def step(self, x, states):
     # states only contains the previous output.
     assert len(states) == 1
     prev_output = states[0]
     h = K.dot(x, self.W) + self.b
     output = self.activation(h + K.dot(prev_output, self.U))
     return output, [output]
    def step(self, x, states):  
        h = states[0]
        # states[1] necessary?
        
        # comes from the constants
        X_static = states[-2]
        # equals K.dot(static_x, self._W1) + self._b2 with X.shape=[bs, L, static_input_dim]
        total_x_static_prod = states[-1]

        # expand dims to add the vector which is only valid for this time step
        # to total_x_prod which is valid for all time steps
        hw = K.expand_dims(K.dot(h, self._W2), 1)
        additive_atn = total_x_static_prod + hw
        attention = K.softmax(K.dot(additive_atn, self._V), axis=1)
        static_x_weighted = K.sum(attention * X_static, [1])
        
        x = K.dot(K.concatenate([x, static_x_weighted], 1), self._W3) + self._b3

        h, new_states = self.layer.cell.call(x, states[:-2])
        
        # append attention to the states to "smuggle" it out of the RNN wrapper
        attention = K.squeeze(attention, -1)
        h = K.concatenate([h, attention])

        return h, new_states
Example no. 8
 def call(self, x):
     # If only Q_seq, K_seq, V_seq are passed in, no mask is applied.
     # If Q_seq, K_seq, V_seq, Q_len, V_len are all passed in, the padded part is masked.
     if len(x) == 3:
         Q_seq,K_seq,V_seq = x
         Q_len,V_len = None,None
     elif len(x) == 5:
         Q_seq,K_seq,V_seq,Q_len,V_len = x
     # Apply linear transformations to Q, K and V.
     Q_seq = K.dot(Q_seq, self.WQ)
     Q_seq = K.reshape(Q_seq, (-1, K.shape(Q_seq)[1], self.nb_head, self.size_per_head))
     Q_seq = K.permute_dimensions(Q_seq, (0,2,1,3))
     K_seq = K.dot(K_seq, self.WK)
     K_seq = K.reshape(K_seq, (-1, K.shape(K_seq)[1], self.nb_head, self.size_per_head))
     K_seq = K.permute_dimensions(K_seq, (0,2,1,3))
     V_seq = K.dot(V_seq, self.WV)
     V_seq = K.reshape(V_seq, (-1, K.shape(V_seq)[1], self.nb_head, self.size_per_head))
     V_seq = K.permute_dimensions(V_seq, (0,2,1,3))
     # Compute the dot products, then mask, then softmax.
     A = K.batch_dot(Q_seq, K_seq, axes=[3,3])
     A = K.permute_dimensions(A, (0,3,2,1))
     A = self.Mask(A, V_len, 'add')
     A = K.permute_dimensions(A, (0,3,2,1))    
     A = K.softmax(A)
     # Compute the output and mask it.
     O_seq = K.batch_dot(A, V_seq, axes=[3,2])
     O_seq = K.permute_dimensions(O_seq, (0,2,1,3))
     O_seq = K.reshape(O_seq, (-1, K.shape(O_seq)[1], self.output_dim))
     O_seq = self.Mask(O_seq, Q_len, 'mul')
     return O_seq
    def call(self, x):
        assert(K.backend() == 'tensorflow')
        temp = K.permute_dimensions(x, (0, 2, 1))
        for i in range(0, self.attention_depth):
            temp = K.sigmoid(K.dot(temp, self.Ws[i]) + self.bs[i])
        temp = K.permute_dimensions(temp, (0, 2, 1))
        estimated_weight = K.squeeze(K.dot(temp, K.expand_dims(self.Wf, -1)), -1)
        biased_weight = estimated_weight + self.bias
        non_linear_weight = K.tanh(biased_weight)

        # For each hidded state calculate how much should it contribute
        # to the context vector. This is the main part of attention.
        # In order to convert weights to "probabilities" use a sigmoid
        # based function: exp(x) / sum(exp(xi)).
        prob = K.exp(non_linear_weight)
        # Compute the total sum for each batch.
        total_sum = K.sum(prob, axis=1, keepdims=True)
        prob /= K.cast(total_sum, K.floatx())

        # Enable this if you want access to internal probabilities.
        # Should only be used for testing that Attention works as expected.
        # return prob

        # Multiply each hidden value by the corresponding probability.
        prob = K.expand_dims(prob, -1)
        new_hidden_values = x * prob
        return K.sum(new_hidden_values, axis=1)
Example no. 10
    def step(self, x, states):
        r_tm1, V_tm1, s_tm1, time = states[:4]
        h_tm1 = states[4:]

        # controller update on the concatenated input and previous read vector
        op_t, h_t = _update_controller(self, T.concatenate([x, r_tm1], axis=-1),
                                       h_tm1)

        d_t = K.sigmoid(K.dot(op_t, self.W_d) + self.b_d)
        u_t = K.sigmoid(K.dot(op_t, self.W_u) + self.b_u)
        v_t = K.tanh(K.dot(op_t, self.W_v) + self.b_v)
        o_t = K.tanh(K.dot(op_t, self.W_o) + self.b_o)

        time = time + 1
        V_t, s_t, r_t = _update_neural_stack(self, V_tm1, s_tm1, d_t[:, 0],
                                             u_t[:, 0], v_t, time[0],
                                             stack=self.stack)

        return o_t, [r_t, V_t, s_t, time] + h_t
Example no. 11
 def get_variational_regularization(self, X):
     X = K.reshape(X, (-1, self.input_shape[-1]))
     mean = self.activation(K.dot(X, self.W_mean) + self.b_mean)
     logsigma = self.activation(K.dot(X, self.W_logsigma) + self.b_logsigma)
     return GaussianKL(mean, logsigma,
                       regularizer_scale=self.regularizer_scale,
                       prior_mean=self.prior_mean,
                       prior_logsigma=self.prior_logsigma)
Example no. 12
 def call(self, inputs, states, constants):
     [prev_output] = states
     [constant] = constants
     h_input = K.dot(inputs, self.input_kernel)
     h_state = K.dot(prev_output, self.recurrent_kernel)
     h_const = K.dot(constant, self.constant_kernel)
     output = h_input + h_state + h_const
     return output, [output]
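A hedged sketch of how a cell with the call signature above could be wired into keras.layers.RNN with a constant input; the class name, weight shapes, and toy data are assumptions for illustration, and only the call() body mirrors the example.

import numpy as np
from keras import backend as K
from keras.layers import Input, Layer, RNN
from keras.models import Model

class MinimalCellWithConstant(Layer):
    """Hypothetical cell around the call() above; names and shapes are assumed."""

    def __init__(self, units, **kwargs):
        self.units = units
        self.state_size = units
        super(MinimalCellWithConstant, self).__init__(**kwargs)

    def build(self, input_shape):
        # With constants, Keras passes [step_input_shape, constant_shape].
        input_dim = input_shape[0][-1]
        constant_dim = input_shape[1][-1]
        self.input_kernel = self.add_weight(shape=(input_dim, self.units),
                                            initializer='uniform',
                                            name='input_kernel')
        self.recurrent_kernel = self.add_weight(shape=(self.units, self.units),
                                                initializer='uniform',
                                                name='recurrent_kernel')
        self.constant_kernel = self.add_weight(shape=(constant_dim, self.units),
                                               initializer='uniform',
                                               name='constant_kernel')
        self.built = True

    def call(self, inputs, states, constants):
        [prev_output] = states
        [constant] = constants
        h_input = K.dot(inputs, self.input_kernel)
        h_state = K.dot(prev_output, self.recurrent_kernel)
        h_const = K.dot(constant, self.constant_kernel)
        output = h_input + h_state + h_const
        return output, [output]

seq = Input((5, 8))        # (timesteps, features)
const = Input((3,))        # per-sample constant, broadcast to every step
out = RNN(MinimalCellWithConstant(16))(seq, constants=const)
model = Model([seq, const], out)
print(model.predict([np.zeros((2, 5, 8)), np.zeros((2, 3))]).shape)  # (2, 16)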
Example no. 13
 def get_output(self, train=False):
     X = self.get_input(train)
     if self.pretrain or self.output_reconstruction:
         output = self.reconstruction_activation(K.dot(self.activation(K.dot(X, self.W)), K.transpose(self.W)))
         return output
     else:
         output = self.activation(K.dot(X, self.W))
         return output
Example no. 14
  def call(self, x, mask=None):
    # input shape: (nb_samples, time (padded with zeros), input_dim)
    # note that the .build() method of subclasses MUST define
    # self.input_spec with a complete input shape.
    input_shape = self.input_spec[0].shape
    if K._BACKEND == 'tensorflow':
        if not input_shape[1]:
            raise Exception('When using TensorFlow, you should define '
                            'explicitly the number of timesteps of '
                            'your sequences.\n'
                            'If your first layer is an Embedding, '
                            'make sure to pass it an "input_length" '
                            'argument. Otherwise, make sure '
                            'the first layer has '
                            'an "input_shape" or "batch_input_shape" '
                            'argument, including the time axis. '
                            'Found input shape at layer ' + self.name +
                            ': ' + str(input_shape))
    if self.stateful:
        initial_states = self.states
    else:
        initial_states = self.get_initial_states(x)
    constants = self.get_constants(x)
    preprocessed_input = self.preprocess_input(x)

    last_output, outputs_0, states = K.rnn(self.step, preprocessed_input,
                                           initial_states,
                                           go_backwards=self.go_backwards,
                                           mask=mask,
                                           constants=constants,
                                           unroll=self.unroll,
                                           input_length=input_shape[1])
    timer = K.zeros((2, self.output_length, 2))
    last_output, outputs, states = K.rnn(self.dream, timer,
                                         states, go_backwards=self.go_backwards,
                                         mask=mask,
                                         constants=constants,
                                         input_length=self.output_length,
                                         unroll=self.unroll)

    last_output = K.dot(last_output, self.V) + self.ext_b
    outputs = K.concatenate([outputs_0, outputs], axis=1)
    outputs = K.dot(K.reshape(outputs, (-1, self.output_dim)), self.V) + self.ext_b

    ishape = K.shape(x)
    if K._BACKEND == "tensorflow":
      ishape = x.get_shape().as_list()
    outputs = K.reshape(outputs, (-1, ishape[1]+self.output_length, ishape[2]))

    if self.stateful:
      self.updates = []
      for i in range(len(states)):
        self.updates.append((self.states[i], states[i]))

    if self.return_sequences:
      return outputs
    else:
      return last_output
Example no. 15
    def call(self, x, mask=None):
        N_DECISION = (2 ** (self.n_depth)) - 1  # Number of decision nodes
        N_LEAF  = 2 ** (self.n_depth + 1)  # Number of leaf nodes

        flat_decision_p_e = []
        leaf_p_e = []
        for w_d, w_l in zip(self.w_d_ensemble, self.w_l_ensemble):

            decision_p = K.sigmoid((K.dot(x, w_d)))
            leaf_p = K.softmax(w_l)

            decision_p_comp = 1 - decision_p

            decision_p_pack = K.concatenate([decision_p, decision_p_comp])

            flat_decision_p_e.append(decision_p_pack)
            leaf_p_e.append(leaf_p)

        #Construct tiling pattern for decision probability matrix
        #Could be done in TF, but I think it's better statically
        tiling_pattern = np.zeros((N_LEAF, self.n_depth), dtype=np.int32)
        comp_offset = N_DECISION
        dec_idx = 0
        for n in range(self.n_depth):
            j = 0
            for depth_idx in range(2**n):
                repeat_times = 2 ** (self.n_depth - n)
                for _ in range(repeat_times):
                    tiling_pattern[j][n] = dec_idx
                    j = j + 1

                for _ in range(repeat_times):
                    tiling_pattern[j][n] = comp_offset + dec_idx
                    j = j + 1

                dec_idx = dec_idx + 1

        flat_pattern = tiling_pattern.flatten()

        # iterate over each tree
        tree_ret = None
        for flat_decision_p, leaf_p in zip(flat_decision_p_e, leaf_p_e):
            flat_mu = tf.transpose(tf.gather(tf.transpose(flat_decision_p), flat_pattern))
            
            batch_size = tf.shape(flat_decision_p)[0]
            shape = tf.stack([batch_size, N_LEAF, self.n_depth])

            mu = K.reshape(flat_mu, shape)
            leaf_prob = K.prod(mu, [2])
            prob_label = K.dot(leaf_prob, leaf_p)

            if tree_ret is None:
              tree_ret = prob_label
            else:
              tree_ret = tree_ret + prob_label

        return tree_ret/self.n_trees
Example no. 16
  def dream(self, x, states):
    prev_st = states[0]
    prev_x = tf.stop_gradient(K.dot(prev_st, self.V) + self.ext_b)
    B_U = states[1]
    B_W = states[2]
    h = K.dot(prev_x * B_W, self.W) + self.b

    output = self.activation(h + K.dot(prev_st * B_U, self.U))
    return output, [output]
Example no. 17
 def call(self, x, mask=None):
     # x[0]: (batch_size, input_length, input_dim)
     # x[1]: (batch_size, 1) indices of prepositions
     # Optional: x[2]: (batch_size, input_length - 2)
     assert isinstance(x, list) or isinstance(x, tuple)
     encoded_sentence = x[0]
     prep_indices = K.squeeze(x[1], axis=-1)  #(batch_size,)
     batch_indices = K.arange(K.shape(encoded_sentence)[0])  # (batch_size,)
     if self.with_attachment_probs:
         # We're essentially doing K.argmax(x[2]) here, but argmax is not differentiable!
         head_probs = x[2]
         head_probs_padding = K.zeros_like(x[2])[:, :2]  # (batch_size, 2)
         # (batch_size, input_length)
         padded_head_probs = K.concatenate([head_probs, head_probs_padding])
         # (batch_size, 1)
         max_head_probs = K.expand_dims(K.max(padded_head_probs, axis=1))
         # (batch_size, input_length, 1)
         max_head_prob_indices = K.expand_dims(K.equal(padded_head_probs, max_head_probs))
         # (batch_size, input_length, input_dim)
         masked_head_encoding = K.switch(max_head_prob_indices, encoded_sentence, K.zeros_like(encoded_sentence))
         # (batch_size, input_dim)
         head_encoding = K.sum(masked_head_encoding, axis=1)
     else:
         head_indices = prep_indices - 1  # (batch_size,)
         head_encoding = encoded_sentence[batch_indices, head_indices, :]  # (batch_size, input_dim)
     prep_encoding = encoded_sentence[batch_indices, prep_indices, :]  # (batch_size, input_dim)
     child_encoding = encoded_sentence[batch_indices, prep_indices+1, :]  # (batch_size, input_dim)
     '''
     prep_indices = x[1]
     sentence_mask = mask[0]
     if sentence_mask is not None:
         if K.ndim(sentence_mask) > 2:
             # This means this layer came after a Bidirectional layer. Keras has this bug which
             # concatenates input masks instead of output masks.
             # TODO: Fix Bidirectional instead.
             sentence_mask = K.any(sentence_mask, axis=(-2, -1))
     head_encoding, prep_encoding, child_encoding = self.get_split_averages(encoded_sentence, sentence_mask,
                                                                            prep_indices)
     '''
     head_projection = K.dot(head_encoding, self.proj_head)  # (batch_size, proj_dim)
     prep_projection = K.dot(prep_encoding, self.proj_prep)  # (batch_size, proj_dim)
     child_projection = K.dot(child_encoding, self.proj_child)  # (batch_size, proj_dim)
     #(batch_size, proj_dim)
     if self.composition_type == 'HPCT':
         composed_projection = K.tanh(head_projection + prep_projection + child_projection)
     elif self.composition_type == 'HPC':
         prep_child_projection = K.tanh(prep_projection + child_projection)  # (batch_size, proj_dim)
         composed_projection = K.tanh(head_projection + prep_child_projection)
     else:
         # Composition type is HC
         composed_projection = K.tanh(head_projection + child_projection)
     for hidden_layer in self.hidden_layers:
         composed_projection = K.tanh(K.dot(composed_projection, hidden_layer))  # (batch_size, proj_dim)
     # (batch_size, num_classes)
     class_scores = K.dot(composed_projection, self.scorer)
     label_probabilities = K.softmax(class_scores)
     return label_probabilities
Example no. 18
    def call(self, argument, mask=None):
        """Execute this layer on input tensors.

    Parameters
    ----------
    argument: list
      List of two tensors (X, Xp). X should be of shape (n_test, n_feat) and
      Xp should be of shape (n_support, n_feat) where n_test is the size of
      the test set, n_support that of the support set, and n_feat is the number
      of per-atom features.

    Returns
    -------
    list
      Returns two tensors of same shape as input. Namely the output shape will
      be [(n_test, n_feat), (n_support, n_feat)]
    """
        x, xp = argument

        # Get initializations
        p = self.p_init
        q = self.q_init
        # Rename support
        z = xp
        states = self.support_states_init
        x_states = self.test_states_init

        for d in range(self.max_depth):
            # Process support xp using attention
            e = cos(z + q, xp)
            a = K.softmax(e)
            # Get linear combination of support set
            r = K.dot(a, xp)

            # Not sure if it helps to place the update here or later yet.  Will
            # decide
            # z = r

            # Process test x using attention
            x_e = cos(x + p, z)
            x_a = K.softmax(x_e)
            s = K.dot(x_a, z)

            # Generate new support attention states
            qr = K.concatenate([q, r], axis=1)
            q, states = self.support_lstm([qr] + states)

            # Generate new test attention states
            ps = K.concatenate([p, s], axis=1)
            p, x_states = self.test_lstm([ps] + x_states)

            # Redefine
            z = r

        # return [x+p, z+q]
        return [x + p, xp + q]
Example no. 19
  def dream(self, x, states):
    prev_st = states[0]
    controls = x[:, :self.control_dim]
    prev_x = K.concatenate([controls, tf.stop_gradient(K.dot(prev_st, self.V) + self.ext_b)], axis=1)
    B_U = states[1]
    B_W = states[2]
    h = K.dot(prev_x * B_W, self.W) + self.b

    output = self.activation(h + K.dot(prev_st * B_U, self.U))
    return output, [output]
Example no. 20
 def __call__(self, x):
     regularization = 0
     dimorder = self.axis + list(set(range(K.ndim(x))) - set(self.axis))
     x = K.permute_dimensions(x, dimorder)
     x = x.reshape((x.shape[0], -1))
     x -= K.mean(x, axis=1, keepdims=True)
     if self.division_idx is not None:
         regularization += .5*K.sum(K.square(K.dot(x[:self.division_idx], x[self.division_idx:].T)/x.shape[1]))
     else:
         regularization += .5*K.sum(K.square(K.dot(x, x.T)/x.shape[1]))
     return regularization
Example no. 21
 def __call__(self, x):
     xshape = K.int_shape(x)
     if self.axis == 'last':
         x = K.reshape(x, (-1, xshape[-1]))
         x /= K.sqrt(K.sum(K.square(x), axis=0, keepdims=True))
         xx = K.dot(K.transpose(x), x)
         return self.gamma * K.sum(K.log(1.0 + K.exp(self.lam * (xx - 1.0))) * (1.0 - K.eye(xshape[-1])))
     elif self.axis == 'first':
         x = K.reshape(x, (xshape[0], -1))
         x /= K.sqrt(K.sum(K.square(x), axis=1, keepdims=True))
         xx = K.dot(x, K.transpose(x))
         return self.gamma * K.sum(K.log(1.0 + K.exp(self.lam * (xx - 1.0))) * (1.0 - K.eye(xshape[0])))
Example no. 22
  def step(self, x, states):
    prev_output = states[0]
    B_U = states[1]
    B_W = states[2]

    if self.consume_less == 'cpu':
        h = x
    else:
        h = K.dot(x * B_W, self.W) + self.b

    output = self.activation(h + K.dot(prev_output * B_U, self.U))
    return output, [output]
 def call(self, x, mask=None):
     x_cont, x_ques, ques_len = x
     input_shape_ = x_cont.shape.as_list()
     x_cont_ = tf.nn.relu(K.dot(x_cont, self.WC))
     x_ques_ = tf.nn.relu(K.dot(x_ques, self.WQ))
     logits = tf.matmul(x_cont_, x_ques_, transpose_b=True) / (self.filters ** 0.5)
     logits = self.mask_logits(logits, ques_len, clen=input_shape_[1])
     logits = tf.nn.softmax(logits)
     C = tf.matmul(logits, x_ques)
     res = tf.concat([x_cont, C], axis=2)
     gate = tf.nn.sigmoid(K.dot(res, self.V))
     return gate
Example no. 24
    def free_energy(self, x):
        """
        Compute free energy for Bernoulli RBM, given visible units.

        The marginal probability p(x) = sum_h 1/Z exp(-E(x, h)) can be re-arranged to the form 
        p(x) = 1/Z exp(-F(x)), where the free energy F(x) = -sum_j=1^H log(1 + exp(x^T W[:,j] + bh_j)) - bx^T x, 
        in case of the Bernoulli RBM energy function.
        """
        wx_b = K.dot(x, self.W) + self.bh
        hidden_term = K.sum(K.log(1 + K.exp(wx_b)), axis=1)
        vbias_term = K.dot(x, self.bx)
        return -hidden_term - vbias_term
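For reference, a hedged NumPy transcription of the same free energy, with the layer's W, bh and bx passed in explicitly (the function name is assumed):

import numpy as np

def free_energy_np(x, W, bh, bx):
    # F(x) = -sum_j log(1 + exp(x W[:, j] + bh_j)) - x . bx
    wx_b = x @ W + bh                                     # (batch, hidden)
    hidden_term = np.sum(np.log(1.0 + np.exp(wx_b)), axis=1)
    vbias_term = x @ bx                                   # (batch,)
    return -hidden_term - vbias_term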
Example no. 25
 def call(self, x):
     y = K.dot(x, self.W_carry)
     if self.bias:
         y += self.b_carry
     transform_weight = activations.sigmoid(y)
     y = K.dot(x, self.W)
     if self.bias:
         y += self.b
     act = self.activation(y)
     act *= transform_weight
     output = act + (1 - transform_weight) * x
     return output
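A hedged NumPy sketch of the same highway gating, with tanh standing in for self.activation and the kernels and biases passed in explicitly:

import numpy as np

def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))

def highway_np(x, W_carry, b_carry, W, b):
    transform_weight = sigmoid(x @ W_carry + b_carry)   # carry/transform gate
    act = np.tanh(x @ W + b)                            # tanh stands in for self.activation
    return act * transform_weight + (1.0 - transform_weight) * x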
Example no. 26
 def __loss(y_true, y_pred):
     kernel_cs_forward, kernel_cs_backward = [], []
     for (forward, backward) in layers:
         kernel_c_forward = forward.cell.trainable_weights[1][:, rnn_units * 2:rnn_units * 3]
         kernel_c_backward = backward.cell.trainable_weights[1][:, rnn_units * 2:rnn_units * 3]
         kernel_cs_forward.append(K.reshape(kernel_c_forward, (rnn_units * rnn_units,)))
         kernel_cs_backward.append(K.reshape(kernel_c_backward, (rnn_units * rnn_units,)))
     phi_forward = K.stack(kernel_cs_forward)
     phi_backward = K.stack(kernel_cs_backward)
     loss_sim_forward = K.sum(K.square(K.dot(phi_forward, K.transpose(phi_forward)) - K.eye(len(layers))))
     loss_sim_backward = K.sum(K.square(K.dot(phi_backward, K.transpose(phi_backward)) - K.eye(len(layers))))
     loss_cat = keras.losses.categorical_crossentropy(y_true, y_pred)
     return loss_cat + lmbd * (loss_sim_forward + loss_sim_backward)
    def call(self, x, mask=None):
        e = K.dot(x, self.W)
        if self.bias:
            e += self.b
        e = K.tanh(e)
        e = K.reshape(K.dot(e, self.U), (-1, self.timesteps))
        a = K.exp(e)
        if mask is not None:
            a *= K.cast(mask, K.floatx())
        a_weights = a / K.cast(K.sum(a, axis=-1, keepdims=True) + K.epsilon(), K.floatx())
        weighted_output = x * K.expand_dims(a_weights, axis=-1)

        return [K.mean(weighted_output, axis=1), a_weights]
Example no. 28
 def step(self, x, states):
     assert len(states) == 5, len(states)
     states = list(states)
     y_tm1 = states.pop(2)
     v = self.activation(K.dot(x, self.W_x) + self.b_x)
     y_tm1 += v
     output_dim = self.output_dim
     self.output_dim = self.hidden_dim
     h_t, new_states = super(LSTMDecoder, self).step(y_tm1, states)
     self.output_dim = output_dim
     y_t = self.activation(K.dot(h_t, self.W_y) + self.b_y)
     new_states += [y_t]
     return y_t, new_states
Example no. 29
    def step(self, x, states):
        h, [h, c] = self.layer.step(x, states)
        attention = states[4]

        m = self.attn_activation(K.dot(h, self.U_a) * attention + self.b_a)
        s = K.sigmoid(K.dot(m, self.U_s) + self.b_s)

        if self.single_attention_param:
            h = h * K.repeat_elements(s, self.layer.output_dim, axis=1)
        else:
            h = h * s

        return h, [h, c]
Example no. 30
def dot_product(x, kernel):
    """
    Wrapper for dot product operation, in order to be compatible with both
    Theano and Tensorflow
    Args:
        x (): input
        kernel (): weights
    Returns:
    """
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
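A minimal usage sketch for dot_product, assuming the TensorFlow backend and arbitrary toy shapes:

from keras import backend as K
import numpy as np

x = K.constant(np.random.rand(2, 5, 8))   # (batch, timesteps, features)
kernel = K.constant(np.random.rand(8))    # (features,)
scores = dot_product(x, kernel)           # contracts the last axis
print(K.eval(scores).shape)               # (2, 5)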
Example no. 31
### INITIALIZING CONSTANTS
n_input = 272
tau = 0.1
lambda_step = 0.1
soft_thr = 0.1
conv_size = 32
filter_size = 3

### PREPARING THE MODEL (An image of the model map has been attached)

# Defining the input and output
inp = Input((n_input,))
inp_labels = Input((1089, ))

# Defining the input for the first ISTA block
x0 = Lambda(lambda x: K.dot(x, K.constant(phi_inv)))(inp)
phi_tb = Lambda(lambda x: K.dot(x, K.constant(np.transpose(phi))))(inp)

# ISTA block #1
conv1_x1 = Lambda(lambda x: x - lambda_step * K.dot(x, K.constant(ptp)) + lambda_step * phi_tb, name='conv1_x1')(x0)
conv1_x2 = Reshape((33, 33, 1), name='conv1_x2')(conv1_x1)
conv1_x3 = Conv2D(conv_size, [filter_size, filter_size], padding='SAME', use_bias=False, name='conv1_x3')(conv1_x2)
conv1_sl1 = Conv2D(conv_size, [filter_size, filter_size], padding='SAME', use_bias=False, activation='relu', name='conv1_sl1')
conv1_x4 = conv1_sl1(conv1_x3)
conv1_sl2 = Conv2D(conv_size, [filter_size, filter_size], padding='SAME', use_bias=False, name='conv1_sl2')
conv1_x44 = conv1_sl2(conv1_x4)
conv1_x5 = Multiply(name='conv1_x5')([Lambda(lambda x: K.sign(x))(conv1_x44), Lambda(lambda x: relu(x - soft_thr))(Lambda(lambda x: K.abs(x))(conv1_x44))])
conv1_sl3 = Conv2D(conv_size, [filter_size, filter_size], padding='SAME', use_bias=False, activation='relu', name='conv1_sl3')
conv1_x6 = conv1_sl3(conv1_x5)
conv1_sl4 = Conv2D(conv_size, [filter_size, filter_size], padding='SAME', use_bias=False, name='conv1_sl4')
conv1_x66 = conv1_sl4(conv1_x6)
Example no. 32
    def step(self, inputs, states):
        h_tm1 = states[0]
        c_tm1 = states[1]
        dp_mask = states[2]
        rec_dp_mask = states[3]

        if self.implementation == 2:
            m1 = K.dot(inputs, self.multiplicative_kernel_x)
            m2 = K.dot(h_tm1, self.multiplicative_kernel_h)
            m = m1 * m2
            z = K.dot(inputs * dp_mask[0], self.kernel)
            z += K.dot(m * rec_dp_mask[0], self.recurrent_kernel)
            if self.use_bias:
                z = K.bias_add(z, self.bias)

            z0 = z[:, :self.units]
            z1 = z[:, self.units:2 * self.units]
            z2 = z[:, 2 * self.units:3 * self.units]
            z3 = z[:, 3 * self.units:4 * self.units]

            i = self.recurrent_activation(z0)
            f = self.recurrent_activation(z1)
            c = f * c_tm1 + i * self.activation(z2)
            o = self.recurrent_activation(z3)
        else:
            if self.implementation == 0:
                inp = inputs[:, 4 * self.units:]
                m1 = K.dot(inp, self.multiplicative_kernel_x)
                m2 = K.dot(h_tm1, self.multiplicative_kernel_h)
                m = m1 * m2

                x_i = inputs[:, :self.units]
                x_f = inputs[:, self.units:2 * self.units]
                x_c = inputs[:, 2 * self.units:3 * self.units]
                x_o = inputs[:, 3 * self.units:4 * self.units]
            elif self.implementation == 1:
                m1 = K.dot(inputs, self.multiplicative_kernel_x)
                m2 = K.dot(h_tm1, self.multiplicative_kernel_h)
                m = m1 * m2
                x_i = K.dot(inputs * dp_mask[0], self.kernel_i) + self.bias_i
                x_f = K.dot(inputs * dp_mask[1], self.kernel_f) + self.bias_f
                x_c = K.dot(inputs * dp_mask[2], self.kernel_c) + self.bias_c
                x_o = K.dot(inputs * dp_mask[3], self.kernel_o) + self.bias_o
            else:
                raise ValueError('Unknown `implementation` mode.')

            i = self.recurrent_activation(
                x_i + K.dot(m * rec_dp_mask[0], self.recurrent_kernel_i))
            f = self.recurrent_activation(
                x_f + K.dot(m * rec_dp_mask[1], self.recurrent_kernel_f))
            c = f * c_tm1 + i * self.activation(
                x_c + K.dot(m * rec_dp_mask[2], self.recurrent_kernel_c))
            o = self.recurrent_activation(
                x_o + K.dot(m * rec_dp_mask[3], self.recurrent_kernel_o))
        h = o * self.activation(c)
        if 0 < self.dropout + self.recurrent_dropout:
            h._uses_learning_phase = True
        return h, [h, c]
Example no. 33
def gram_matrix(x):
    assert K.ndim(x) == 3
    features = K.batch_flatten(x)
    gram = K.dot(features, K.transpose(features))
    return gram
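A minimal usage sketch, assuming a single channels-first feature map so that each row of K.batch_flatten(x) is one channel:

from keras import backend as K
import numpy as np

feature_map = K.constant(np.random.rand(64, 32, 32))  # (channels, height, width)
gram = gram_matrix(feature_map)                        # channel-by-channel correlations
print(K.eval(gram).shape)                              # (64, 64)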
Example no. 34
    def _step(self,
              x_tm1,
              h_tm1, c_tm1, v,
              u_i, u_f, u_o, u_c, w_i, w_f, w_c, w_o, w_x, v_i, v_f, v_c, v_o, b_i, b_f, b_c, b_o, b_x):

        #Inputs = output from previous time step, vector from encoder
        xi_t = K.dot(x_tm1, w_i) + K.dot(v, v_i) + b_i
        xf_t = K.dot(x_tm1, w_f) + K.dot(v, v_f) + b_f
        xc_t = K.dot(x_tm1, w_c) + K.dot(v, v_c) + b_c
        xo_t = K.dot(x_tm1, w_o) + K.dot(v, v_o) + b_o

        i_t = self.inner_activation(xi_t + K.dot(h_tm1, u_i))
        f_t = self.inner_activation(xf_t + K.dot(h_tm1, u_f))
        c_t = f_t * c_tm1 + i_t * self.activation(xc_t + K.dot(h_tm1, u_c))
        o_t = self.inner_activation(xo_t + K.dot(h_tm1, u_o))
        h_t = o_t * self.activation(c_t)

        x_t = K.dot(h_t, w_x) + b_x
        return x_t, h_t, c_t
Example no. 35
 def gram_matrix(x):
     features = backend.batch_flatten(
         backend.permute_dimensions(x, (2, 0, 1)))
     gram = backend.dot(features, backend.transpose(features))
     return gram
Example no. 36
    def call(self, inputs, states, training=None):
        h_tm1 = states[0]  # previous memory
        x = inputs[:, :self.x_dim]  # input features; used below
        wei = inputs[:, self.x_dim:self.x_dim + self.weight_dim]

        if 0 < self.dropout < 1 and self._dropout_mask is None:
            self._dropout_mask = _generate_dropout_mask(K.ones_like(x),
                                                        self.dropout,
                                                        training=training,
                                                        count=4)
        if (0 < self.recurrent_dropout < 1
                and self._recurrent_dropout_mask is None):
            self._recurrent_dropout_mask = _generate_dropout_mask(
                K.ones_like(h_tm1),
                self.recurrent_dropout,
                training=training,
                count=4)

        # dropout matrices for input units
        dp_mask = self._dropout_mask
        # dropout matrices for recurrent units
        rec_dp_mask = self._recurrent_dropout_mask

        self.implementation = 2
        if self.implementation == 1:
            pass
        else:
            if 0. < self.dropout < 1.:
                x *= dp_mask[0]

            # inputs projected by all gate matrices at once
            matrix_x = K.dot(x, self.kernel)
            matrix_w = K.dot(wei, self.kernel_wei)
            if self.use_bias:
                # biases: bias_z_i, bias_r_i, bias_h_i
                matrix_x = K.bias_add(matrix_x,
                                      self.input_bias[:self.units * 3])
            x_z = matrix_x[:, :self.units]
            x_r = matrix_x[:, self.units:2 * self.units]
            x_h = matrix_x[:, 2 * self.units:3 * self.units]
            x_w = matrix_w

            if 0. < self.recurrent_dropout < 1.:
                h_tm1 *= rec_dp_mask[0]

            if self.reset_after:
                # hidden state projected by all gate matrices at once
                matrix_inner = K.dot(h_tm1, self.recurrent_kernel)
                if self.use_bias:
                    matrix_inner = K.bias_add(matrix_inner,
                                              self.recurrent_bias)
            else:
                # hidden state projected separately for update/reset and new
                matrix_inner = K.dot(h_tm1,
                                     self.recurrent_kernel[:, :2 * self.units])

            recurrent_z = matrix_inner[:, :self.units]
            recurrent_r = matrix_inner[:, self.units:2 * self.units]

            z = self.recurrent_activation(x_z + recurrent_z)
            r = self.recurrent_activation(x_r + recurrent_r)

            if self.reset_after:
                recurrent_h = r * matrix_inner[:,
                                               2 * self.units:3 * self.units]
                recurrent_w = matrix_inner[:, 2 * self.units:3 * self.units]
            else:
                recurrent_h = K.dot(
                    r * h_tm1,
                    self.recurrent_kernel[:, 2 * self.units:3 * self.units])
                recurrent_w = K.dot(h_tm1,
                                    self.recurrent_kernel[:, 3 * self.units:])

            w = self.recurrent_activation(x_w + recurrent_w)
            #w = self.recurrent_activation(x_w)

            #x_h = x_h * w

            hh = self.activation(x_h + recurrent_h)

        # previous and candidate state mixed by update gate
        h = (1 - w * z) * h_tm1 + (w * z) * hh

        if 0 < self.dropout + self.recurrent_dropout:
            if training is None:
                h._uses_learning_phase = True

        return h, [h]
Example no. 37
 def reshape(x, states):
     h = K.dot(x, self.W_h) + self.b_h
     return h, []
Example no. 38
def dot_product(x, kernel):
    if K.backend() == 'tensorflow':
        return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
    else:
        return K.dot(x, kernel)
Example no. 39
 def _mlp(self, input_, weights, bias):
     act = input_
     for w, b in zip(weights, bias):
         output = K.dot(act, w) + b
         act = self.activation(output)
     return output
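A hedged NumPy version of the same loop, with tanh standing in for self.activation; like the method above, it returns the last layer's pre-activation output:

import numpy as np

def mlp_np(x, weights, biases):
    act = x
    output = x
    for w, b in zip(weights, biases):
        output = act @ w + b      # affine output of this layer
        act = np.tanh(output)     # fed into the next layer
    return output                 # pre-activation of the last layer, as above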
Example no. 40
    def attention(self,
                  pre_q,
                  pre_v,
                  pre_k,
                  out_seq_len,
                  d_model,
                  attn_mask=None,
                  training=None):
        """
        Calculates the output of the attention once the affine transformations
        of the inputs are done. Here are the shapes of the arguments:
        :param pre_q: (batch_size, q_seq_len, num_heads, d_model // num_heads)
        :param pre_v: (batch_size, v_seq_len, num_heads, d_model // num_heads)
        :param pre_k: (batch_size, k_seq_len, num_heads, d_model // num_heads)
        :param out_seq_len: the length of the output sequence
        :param d_model: dimensionality of the model (by the paper)
        :param training: Passed by Keras. Should not be defined manually.
          Optional scalar tensor indicating if we're in training
          or inference phase.
        """
        # shaping Q and V into (batch_size, num_heads, seq_len, d_model//heads)
        q = K.permute_dimensions(pre_q, [0, 2, 1, 3])
        v = K.permute_dimensions(pre_v, [0, 2, 1, 3])

        if self.compression_window_size is None:
            k_transposed = K.permute_dimensions(pre_k, [0, 2, 3, 1])
        else:
            # Memory-compressed attention described in paper
            # "Generating Wikipedia by Summarizing Long Sequences"
            # (https://arxiv.org/pdf/1801.10198.pdf)
            # It compresses keys and values using 1D-convolution which reduces
            # the size of Q * K_transposed from roughly seq_len^2
            # to convoluted_seq_len^2. If we use strided convolution with
            # window size = 3 and stride = 3, memory requirements of such
            # memory-compressed attention will be 9 times smaller than
            # that of the original version.
            if self.use_masking:
                raise NotImplementedError(
                    "Masked memory-compressed attention has not "
                    "been implemented yet")
            k = K.permute_dimensions(pre_k, [0, 2, 1, 3])
            k, v = [
                K.reshape(
                    # Step 3: Return the result to its original dimensions
                    # (batch_size, num_heads, seq_len, d_model//heads)
                    K.bias_add(
                        # Step 3: ... and add bias
                        K.conv1d(
                            # Step 2: we "compress" K and V using strided conv
                            K.reshape(
                                # Step 1: we reshape K and V to
                                # (batch * num_heads,  seq_len, d_model//heads)
                                item,
                                (-1, K.int_shape(item)[-2],
                                 d_model // self.num_heads)),
                            kernel,
                            strides=self.compression_window_size,
                            padding='valid',
                            data_format='channels_last'),
                        bias,
                        data_format='channels_last'),
                    # new shape
                    K.concatenate([
                        K.shape(item)[0],
                        K.shape(item)[1],  # shape: (batch_size, num_heads)
                        [-1, d_model // self.num_heads]
                    ]))  # shape: (seq_len, n_model//num_heads)
                for item, kernel, bias in ((k, self.k_conv_kernel,
                                            self.k_conv_bias),
                                           (v, self.v_conv_kernel,
                                            self.v_conv_bias))
            ]
            k_transposed = K.permute_dimensions(k, [0, 1, 3, 2])
        # shaping K into (batch_size, num_heads, d_model//heads, seq_len)
        # for further matrix multiplication
        sqrt_d = K.sqrt(K.cast(d_model, dtype=K.floatx()) // self.num_heads)
        q_shape = K.shape(q)
        k_t_shape = K.shape(k_transposed)
        v_shape = K.shape(v)

        #q_shape = K.int_shape(q)
        #k_t_shape = K.int_shape(k_transposed)
        #v_shape = K.int_shape(v)

        # before performing batch_dot all tensors are being converted to 3D
        # shape (batch_size * num_heads, tar_seq_len, d_model//num_heads) to make sure batch_dot
        # performs identically on all backends
        attention_heads = K.reshape(
            K.batch_dot(
                self.apply_dropout_if_needed(
                    K.softmax(
                        # mask the attention for the prediction process
                        #self.mask_attention_if_needed(
                        self.mask_attention(
                            # core scaled dot product
                            K.
                            batch_dot(  # (batch_size * num_heads, tar_seq_len, src_seq_len)
                                K.reshape(
                                    q, (-1, q_shape[-2], q_shape[-1])
                                ),  # q_shape: (batch_size*num_heads, q_seq_len, d_model//heads)
                                K.reshape(
                                    k_transposed,  # k_transposed: (batch_size*num_heads, d_model//heads, k_seq_len)
                                    (-1, k_t_shape[-2], k_t_shape[-1]))) /
                            sqrt_d,
                            attn_mask)),
                    training=training),
                K.reshape(v, (-1, v_shape[-2], v_shape[-1]))
            ),  # shape: (batch_size * num_heads, v_seq_len, d_model//heads)
            (-1, self.num_heads, q_shape[-2], q_shape[-1]))
        # shape: (batch_size * seq_length, d_model)
        attention_heads_merged = K.reshape(
            # shape: (batch_size, q_seq_length, num_heads, d_model // num_heads)
            K.permute_dimensions(attention_heads, [0, 2, 1, 3]),
            (-1, d_model))
        # shape: (batch_size, out_seq_len, d_model). Generally, out_seq_len should be q_seq_len
        attention_out = K.reshape(
            K.dot(attention_heads_merged, self.output_weights),
            (-1, out_seq_len, d_model))
        return attention_out
Example no. 41
 def call(self, inputs, **kwargs):
     inputs = K.l2_normalize(inputs, -1)  # input_l2norm
     output = K.dot(inputs, self.kernel)  # cos = input_l2norm * W_l2norm
     return output
Example no. 42
 def call(self, x, mask=None):
     X = x[:, :, 0] * x[:, :, 1]
     Y = K.abs(x[:, :, 0] - x[:, :, 1])
     z = K.dot(X, self.W_p) + K.dot(Y, self.W_m)
     return K.tanh(z) #+ self.b)
def gram_matrix(x):
    features = k.batch_flatten(k.permute_dimensions(x, (2, 0, 1)))
    gram = k.dot(features, k.transpose(features))
    return gram
Example no. 44
    def _build_gram_matrix(self, x):
        features = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
        gram_matrix = K.dot(features, K.transpose(features))

        return gram_matrix
Example no. 45
 def call(self, x, mask=None):
     output = K.dot(x, self.W)
     if self.bias:
         output += self.b
     return self.activation(output)
Example no. 46
 def compute_similarity(self, tensor_1, tensor_2):
     dot_product = K.sum(K.dot(tensor_1, self.weight_matrix) * tensor_2,
                         axis=-1)
     return self.activation(dot_product + self.bias)
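A hedged NumPy sketch of the bilinear similarity above, with tanh standing in for self.activation; the shapes and names are assumed:

import numpy as np

batch, dim = 4, 8
tensor_1 = np.random.rand(batch, dim)
tensor_2 = np.random.rand(batch, dim)
weight_matrix = np.random.rand(dim, dim)
bias = 0.0

dot_product = np.sum((tensor_1 @ weight_matrix) * tensor_2, axis=-1)  # (batch,)
similarity = np.tanh(dot_product + bias)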
Example no. 47
 def a(x, states):
     output = K.dot(x, w_a) + b_a
     return output, []
Example no. 48
 def call(self, x, mask=None):
     return K.dot(x, self.W)
Example no. 49
def gram_matrix(features):
    return K.dot(features, K.transpose(features))
Example no. 50
 def call(self, x):
     return [K.dot(x, self.kernel), K.dot(x, self.kernel)]
Example no. 51
    def _step(self,
              x_tm1,
              h_tm1, c_tm1, H,
              u_i, u_f, u_o, u_c, w_i, w_f, w_c, w_o, w_x, w_a, v_i, v_f, v_c, v_o, b_i, b_f, b_c, b_o, b_x, b_a):

        s_tm1 = K.repeat(c_tm1, self.input_length)
        e = H + s_tm1
        def a(x, states):
            output = K.dot(x, w_a) + b_a
            return output, []
        _, energy, _ = K.rnn(a, e, [], masking=False)
        energy = activations.get('linear')(energy)
        energy = K.permute_dimensions(energy, (2, 0, 1))
        energy = energy[0]
        alpha = K.softmax(energy)
        alpha = K.repeat(alpha, self.input_dim)
        alpha = K.permute_dimensions(alpha, (0, 2 , 1))
        weighted_H = H * alpha
        
        v = K.sum(weighted_H, axis=1)

        xi_t = K.dot(x_tm1, w_i) + K.dot(v, v_i) + b_i
        xf_t = K.dot(x_tm1, w_f) + K.dot(v, v_f) + b_f
        xc_t = K.dot(x_tm1, w_c) + K.dot(v, v_c) + b_c
        xo_t = K.dot(x_tm1, w_o) + K.dot(v, v_o) + b_o

        i_t = self.inner_activation(xi_t + K.dot(h_tm1, u_i))
        f_t = self.inner_activation(xf_t + K.dot(h_tm1, u_f))
        c_t = f_t * c_tm1 + i_t * self.activation(xc_t + K.dot(h_tm1, u_c))
        o_t = self.inner_activation(xo_t + K.dot(h_tm1, u_o))
        h_t = o_t * self.activation(c_t)

        x_t = K.dot(h_t, w_x) + b_x
        return x_t, h_t, c_t
Example no. 52
def gram_matrix(x):
    print(K.ndim(x), x.shape)
    features = K.batch_flatten(K.permute_dimensions(x, (0, 3, 1, 2)))
    gram = K.dot(features, K.transpose(features))
    return gram
Example no. 53
    def step(self, x, states):
        (
            h_p,
            h_v,  # 0:parent, 1:traversal 
            x_type,  # 2:treetype(ins/sub,left/right); ints of size (B,). \in {0,1,2,3}
            B_U,
            B_W) = states  # 3:Udropoutmask, 4:Wdropoutmask

        #### matrix x has all 4 x computations in it
        ## per move
        this_Wx = self.W_x[x_type]  ## B, I, 4*O
        matrix_x = K.batch_dot(x * B_W[0], this_Wx) + self.b_x
        x_zp = matrix_x[:, :self.output_dim]
        x_rp = matrix_x[:, self.output_dim:2 * self.output_dim]
        x_rv = matrix_x[:, 2 * self.output_dim:3 * self.output_dim]
        x_ih = matrix_x[:, 3 * self.output_dim:]

        #### matrix p has zp, rp; matrix v has zv, rv
        matrix_p = K.dot(h_p * B_U[0], self.U_p[:, :2 * self.output_dim])

        # zp is for the parent unit update (resulting in child unit)
        inner_zp = matrix_p[:, :self.output_dim]
        z_p = self.inner_activation(x_zp + inner_zp)

        # rp is for gating to the intermediate unit of parent
        inner_rp = matrix_p[:, self.output_dim:2 * self.output_dim]
        r_p = self.inner_activation(x_rp + inner_rp)

        matrix_v = K.dot(h_v * B_U[0], self.U_v[:, :2 * self.output_dim])
        # rv is for the intermediate gate on the traversal unit
        # this gets reused for both the parent's and its own intermediate
        inner_rv = matrix_v[:, self.output_dim:2 * self.output_dim]
        r_v = self.inner_activation(x_rv + inner_rv)

        # the actual recurrence calculations
        # h_p * U and h_v * U ; as gated by their r gates
        inner_hp = K.dot(r_p * h_p * B_U[0], self.U_p[:, 2 * self.output_dim:])
        inner_hv = K.dot(r_v * h_v * B_U[0], self.U_v[:, 2 * self.output_dim:])
        # h_c_tilde is the intermediate state
        h_c_tilde = self.activation(x_ih + inner_hp + inner_hv)
        # h_c is the new child state
        h_c = z_p * h_c_tilde + (1 - z_p) * h_p

        matrix_c = K.dot(h_c * B_U[0], self.U_c) + self.b_c

        hc_zv = matrix_c[:, :self.output_dim]
        hc_rv = matrix_c[:, self.output_dim:2 * self.output_dim]
        hc_ih = matrix_c[:, 2 * self.output_dim:]

        ### zv -> gate h_v  and h_v_tilde
        ### rv -> gate h_v's contribution to h_v_tilde
        ### ih -> h_c's contribution to h_v_tilde

        # zv is for the traversal unit update.
        inner_zv = matrix_v[:, :self.output_dim]
        z_v = self.inner_activation(hc_zv + inner_zv)
        ## r_v is calculated with h_c rather than x
        r_v = self.inner_activation(hc_rv + inner_rv)

        inner_hvplus = K.dot(r_v * h_v * B_U[0],
                             self.U_v[:, 2 * self.output_dim:])
        h_vplus_tilde = self.activation(hc_ih + inner_hvplus)
        h_vplus = z_v * h_v + (1 - z_v) * h_vplus_tilde

        return h_c, h_vplus
Example no. 54
def train_model(base_model: keras.Model,
                is_causal: bool,
                tasks_meta_data: List[TaskMetadata],
                pretrain_generator,
                finetune_generator,
                pretrain_epochs: int = 1,
                pretrain_optimizer='adam',
                pretrain_steps: int = 1000000,
                pretrain_callbacks=None,
                finetune_epochs: int = 1,
                finetune_optimizer='adam',
                finetune_steps: int = 10000,
                finetune_callbacks=None,
                verbose: int = 0,
                TPUStrategy=None):
    token_input = base_model.inputs[0]
    segment_input = base_model.inputs[1]
    position_input = base_model.inputs[2]
    uses_attn_mask = len(base_model.inputs) == 4
    max_len = K.int_shape(base_model.inputs[0])[1]
    if uses_attn_mask:
        attention_mask_input = base_model.inputs[3]
    all_logits = []
    all_tasks = {task.name: task for task in tasks_meta_data}
    task_nodes = {}
    sent_level_mask_inputs = []
    assert len(all_tasks) == len(tasks_meta_data)
    for task in all_tasks.values():
        task_loss_weight = Input(batch_shape=(None, 1),
                                 dtype='float32',
                                 name=task.name + '_loss_weight')
        if task.is_token_level:
            if task.name == 'lm':
                decoder = Lambda(lambda x: K.dot(
                    x,
                    K.transpose(
                        base_model.get_layer('TokenEmbedding').weights[0])),
                                 name='lm_logits')
            else:
                decoder = Dense(units=task.num_classes,
                                name=task.name + '_logits')
            logits = TimeDistributed(decoder,
                                     name=task.name +
                                     '_logits_time_distributed')(Dropout(
                                         task.dropout)(base_model.outputs[0]))
            task_target = Input(batch_shape=(
                None,
                max_len,
            ),
                                dtype='int32',
                                name=task.name + '_target_input')
            task_mask = Input(batch_shape=(None, max_len),
                              dtype='int8' if TPUStrategy is None else 'int32',
                              name=task.name + '_mask_input')
            task_loss = Lambda(
                lambda x: x[0] * masked_classification_loss(x[1], x[2], x[3]),
                name=task.name + '_loss')(
                    [task_loss_weight, task_target, logits, task_mask])
        else:
            task_mask = Input(batch_shape=(None, 1),
                              dtype='int32',
                              name=task.name + '_mask_input')
            decoder_input = sparse_gather(base_model.outputs[0], task_mask,
                                          task.name)
            logits = Dense(units=task.num_classes, name=task.name + '_logits')(
                Dropout(task.dropout)(decoder_input))
            task_target = Input(batch_shape=(None, 1),
                                dtype='int32',
                                name=task.name + '_target_input')
            task_loss = Lambda(
                lambda x: x[0] * classification_loss(x[1], x[2]),
                name=task.name + '_loss')(
                    [task_loss_weight, task_target, logits])
            sent_level_mask_inputs.append(task_mask)
        task_nodes[task.name] = {
            'target': task_target,
            'mask': task_mask,
            'loss_weight': task_loss_weight,
            'loss': task_loss,
        }
        all_logits.append(logits)

    def get_generator(sentence_generator: Generator[SentenceBatch, None, None],
                      is_pretrain: bool):
        for i, batch in enumerate(sentence_generator):
            batch_size, seq_len = batch.tokens.shape
            x = [
                batch.tokens, batch.segments,
                generate_pos_ids(batch_size, max_len)
            ]
            y = []
            if uses_attn_mask:
                x.append(create_attention_mask(batch.padding_mask, is_causal))
            for task_name in task_nodes.keys():
                if is_pretrain:
                    cond = all_tasks[
                        task_name].weight_scheduler.active_in_pretrain
                else:
                    cond = all_tasks[
                        task_name].weight_scheduler.active_in_finetune
                if cond:
                    if task_name in batch.sentence_classification:
                        task_data_batch = batch.sentence_classification[
                            task_name]
                    else:
                        task_data_batch = batch.token_classification[task_name]
                    x.append(task_data_batch.target)
                    if all_tasks[task_name].is_token_level:
                        x.append(task_data_batch.target_mask)
                    else:
                        x.append(
                            (task_data_batch.target_mask +
                             np.arange(batch_size) * seq_len).astype(np.int32))
                    x.append(
                        np.repeat(
                            np.array([
                                all_tasks[task_name].weight_scheduler.get(
                                    is_pretrain, i)
                            ]), batch_size, 0))
                    y.append(np.repeat(np.array([0.0]), batch_size, 0))
            yield x, y

    def train_step(is_pretrain: bool):
        _inputs = [token_input, segment_input, position_input]
        _outputs = []
        if uses_attn_mask:
            _inputs.append(attention_mask_input)
        for task_name in task_nodes.keys():
            if is_pretrain:
                cond = all_tasks[task_name].weight_scheduler.active_in_pretrain
            else:
                cond = all_tasks[task_name].weight_scheduler.active_in_finetune
            if cond:
                _inputs.append(task_nodes[task_name]['target'])
                _inputs.append(task_nodes[task_name]['mask'])
                _inputs.append(task_nodes[task_name]['loss_weight'])
                _outputs.append(task_nodes[task_name]['loss'])
        _generator = get_generator(
            pretrain_generator if is_pretrain else finetune_generator,
            is_pretrain)
        _model = keras.Model(inputs=_inputs, outputs=_outputs)
        if TPUStrategy is not None:
            '''
            Create TPUStrategy like this:
            tpu_address = 'grpc://' + os.environ['COLAB_TPU_ADDR']
            TPUStrategy = tf.contrib.tpu.TPUDistributionStrategy(
                tf.contrib.cluster_resolver.TPUClusterResolver(tpu=tpu_address)
            )
            '''
            _model = tf.contrib.tpu.keras_to_tpu_model(_model,
                                                       strategy=TPUStrategy)
        _model.compile(
            pretrain_optimizer if is_pretrain else finetune_optimizer,
            loss=pass_through_loss)
        _model.fit_generator(
            _generator,
            steps_per_epoch=pretrain_steps if is_pretrain else finetune_steps,
            verbose=verbose,
            callbacks=pretrain_callbacks
            if is_pretrain else finetune_callbacks,
            shuffle=False,
            epochs=pretrain_epochs if is_pretrain else finetune_epochs)

    if pretrain_generator is not None:
        train_step(True)
    if finetune_generator is not None:
        train_step(False)

    ret_model = keras.Model(inputs=base_model.inputs + sent_level_mask_inputs,
                            outputs=all_logits)
    if TPUStrategy is not None:
        ret_model = tf.contrib.tpu.keras_to_tpu_model(ret_model,
                                                      strategy=TPUStrategy)
        # Compile the TPU model before its first prediction; you can compile again for training afterwards.
        ret_model.compile(finetune_optimizer, loss=pass_through_loss)
    return ret_model
Example no. 55
def get_Gram_matrix(F):
    G = K.dot(F, K.transpose(F))
    return G
Example no. 56
def dot_product(x, kernel):
    return K.squeeze(K.dot(x, K.expand_dims(kernel)), axis=-1)
Example no. 57
    def step(self, x, states):

        ytm, stm = states

        # repeat the hidden state to the length of the sequence
        _stm = K.repeat(stm, self.timesteps)

        # now multiply the weight matrix with the repeated hidden state
        _Wxstm = K.dot(_stm, self.W_a)

        # calculate the attention probabilities
        # this captures how much each other timestep contributed to this one.
        et = K.dot(activations.tanh(_Wxstm + self._uxpb),
                   K.expand_dims(self.V_a))
        at = K.exp(et)
        at_sum = K.sum(at, axis=1)
        at_sum_repeated = K.repeat(at_sum, self.timesteps)
        at /= at_sum_repeated  # vector of size (batchsize, timesteps, 1)

        # calculate the context vector
        context = K.squeeze(K.batch_dot(at, self.x_seq, axes=1), axis=1)
        # ~~~> calculate new hidden state
        # first calculate the "r" gate:

        rt = activations.sigmoid(
            K.dot(ytm, self.W_r) + K.dot(stm, self.U_r) +
            K.dot(context, self.C_r) + self.b_r)

        # now calculate the "z" gate
        zt = activations.sigmoid(
            K.dot(ytm, self.W_z) + K.dot(stm, self.U_z) +
            K.dot(context, self.C_z) + self.b_z)

        # calculate the proposal hidden state:
        s_tp = activations.tanh(
            K.dot(ytm, self.W_p) + K.dot((rt * stm), self.U_p) +
            K.dot(context, self.C_p) + self.b_p)

        # new hidden state:
        st = (1 - zt) * stm + zt * s_tp

        yt = activations.softmax(
            K.dot(ytm, self.W_o) + K.dot(stm, self.U_o) +
            K.dot(context, self.C_o) + self.b_o)

        if self.return_probabilities:
            return at, [yt, st]
        else:
            return yt, [yt, st]
Example no. 58
 def gram_matrix(x):
     flatten = K.batch_flatten(K.permute_dimensions(x, (2, 0, 1)))
     gram_m = K.dot(flatten, K.transpose(flatten))
     return gram_m
Example no. 59
 def get_constants(self, enc_output, constants):
     constants.append(K.dot(enc_output,self.W1))
     constants.append(enc_output)
     return constants
Example no. 60
 def loss2(self, y_true, y_pred):
     sigma = K.cast_to_floatx(np.diag(np.full((2 * self.F, ), 0.005)))
     return (0.5) * K.dot(
         K.dot((y_true - y_pred), tf.matrix_inverse(sigma)),
         tf.transpose(y_true - y_pred))
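A hedged NumPy check of the quadratic form computed by loss2, 0.5 * (y_true - y_pred) inv(sigma) (y_true - y_pred)^T with sigma = 0.005 * I; F and the toy vectors are assumed:

import numpy as np

F = 3
sigma = np.diag(np.full(2 * F, 0.005))
y_true = np.random.rand(1, 2 * F)
y_pred = np.random.rand(1, 2 * F)
diff = y_true - y_pred
loss = 0.5 * diff @ np.linalg.inv(sigma) @ diff.T   # a (1, 1) matrix, as in the Keras version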