def __call__(self, is_train, scope=None):
    activation = get_keras_activation(self.activation)
    recurrent_initializer = get_keras_initialization(self.recurrent_initializer)
    kernel_initializer = get_keras_initialization(self.kernel_initializer)
    candidate_initializer = get_keras_initialization(self.candidate_initializer)
    return GRUCell(self.num_units, tf.constant_initializer(self.bais_init),
                   kernel_initializer, recurrent_initializer,
                   candidate_initializer, activation)

def __call__(self, is_train, scope=None):
    activation = get_keras_activation(self.activation)
    recurrent_activation = get_keras_activation(self.recurrent_activation)
    kernel_initializer = get_keras_initialization(self.kernel_initializer)
    recurrent_initializer = get_keras_initialization(self.recurrent_initializer)
    if activation is None or kernel_initializer is None \
            or recurrent_initializer is None or recurrent_activation is None:
        raise ValueError()
    cell = InitializedLSTMCell(self.num_units, kernel_initializer, recurrent_initializer,
                               activation, recurrent_activation, self.forget_bias,
                               self.keep_recurrent_probs, is_train, scope)
    return cell

def apply(self, is_train, x, memories, answer: List[Tensor], x_mask=None, memory_mask=None):
    with tf.variable_scope("map_context"):
        memories = self.context_mapper.apply(is_train, memories, memory_mask)
    with tf.variable_scope("encode_context"):
        encoded = self.context_encoder.apply(is_train, memories, memory_mask)
    with tf.variable_scope("merge"):
        x = self.merge.apply(is_train, x, encoded, x_mask)
    with tf.variable_scope("predict"):
        m1, m2 = self.bounds_predictor.apply(is_train, x, x_mask)

    init = get_keras_initialization(self.init)
    with tf.variable_scope("logits1"):
        l1 = fully_connected(m1, 1, activation_fn=None, weights_initializer=init)
        l1 = tf.squeeze(l1, squeeze_dims=[2])
    with tf.variable_scope("logits2"):
        l2 = fully_connected(m2, 1, activation_fn=None, weights_initializer=init)
        l2 = tf.squeeze(l2, squeeze_dims=[2])
    with tf.variable_scope("predict_span"):
        return self.span_predictor.predict(answer, l1, l2, x_mask)

def apply(self, is_train, x, mask=None):
    if self.key_mapper is not None:
        with tf.variable_scope("map_keys"):
            keys = self.key_mapper.apply(is_train, x, mask)
    else:
        keys = x

    weights = tf.get_variable("weights", (keys.shape.as_list()[-1], self.n_encodings), dtype=tf.float32,
                              initializer=get_keras_initialization(self.init))
    dist = tf.tensordot(keys, weights, axes=[[2], [0]])  # (batch, x_words, n_encoding)
    if self.bias:
        dist += tf.get_variable("bias", (1, 1, self.n_encodings),
                                dtype=tf.float32, initializer=tf.zeros_initializer())
    if mask is not None:
        bool_mask = tf.expand_dims(tf.cast(tf.sequence_mask(mask, tf.shape(x)[1]), tf.float32), 2)
        dist = bool_mask * dist + (1 - bool_mask) * VERY_NEGATIVE_NUMBER
    dist = tf.nn.softmax(dist, dim=1)
    out = tf.einsum("ajk,ajn->ank", x, dist)  # (batch, n_encoding, feature)
    if self.post_process is not None:
        with tf.variable_scope("post_process"):
            out = self.post_process.apply(is_train, out)
    return out

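# A minimal NumPy sketch (illustrative only, not part of the layer above) of the pooling idea:
# a learned (dim, n_encodings) matrix scores every word, masked positions get a very negative
# score, a softmax over the word axis yields n_encodings attention distributions, and the word
# vectors are pooled under each distribution. Function and variable names here are hypothetical.
import numpy as np

def multi_encoding_pool(x, lengths, weights, very_negative=-1e29):
    # x: (batch, words, dim), lengths: (batch,), weights: (dim, n_encodings)
    batch, n_words, _ = x.shape
    scores = np.einsum("bwd,de->bwe", x, weights)            # (batch, words, n_encodings)
    mask = np.arange(n_words)[None, :] < lengths[:, None]    # (batch, words)
    scores = np.where(mask[:, :, None], scores, very_negative)
    scores -= scores.max(axis=1, keepdims=True)              # softmax over the word axis
    probs = np.exp(scores)
    probs /= probs.sum(axis=1, keepdims=True)
    return np.einsum("bwd,bwe->bed", x, probs)               # (batch, n_encodings, dim)

pooled = multi_encoding_pool(np.random.randn(2, 5, 4), np.array([5, 3]), np.random.randn(4, 3))
assert pooled.shape == (2, 3, 4)
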
def _distance_logits(self, x1, x2):
    init = get_keras_initialization(self.init)
    project1 = tf.get_variable("project1", (x1.shape.as_list()[-1], self.project_size), initializer=init)
    x1 = tf.tensordot(x1, project1, [[2], [0]])

    if self.share_project:
        if x2.shape.as_list()[-1] != x1.shape.as_list()[-1]:
            raise ValueError()
        project2 = project1
    else:
        project2 = tf.get_variable("project2", (x2.shape.as_list()[-1], self.project_size), initializer=init)
    x2 = tf.tensordot(x2, project2, [[2], [0]])

    if self.project_bias:
        x1 += tf.get_variable("bias1", (1, 1, self.project_size), initializer=tf.zeros_initializer())
        x2 += tf.get_variable("bias2", (1, 1, self.project_size), initializer=tf.zeros_initializer())

    dots = tf.matmul(x1, x2, transpose_b=True)
    if self.scale:
        dots /= tf.sqrt(tf.cast(self.project_size, tf.float32))
    return dots

def _distance_logits(self, x, keys):
    init = get_keras_initialization(self.init)

    key_w = tf.get_variable("key_w", shape=keys.shape.as_list()[-1], initializer=init, dtype=tf.float32)
    key_logits = tf.tensordot(keys, key_w, axes=[[2], [0]])  # (batch, key_len)

    x_w = tf.get_variable("input_w", shape=x.shape.as_list()[-1], initializer=init, dtype=tf.float32)
    x_logits = tf.tensordot(x, x_w, axes=[[2], [0]])  # (batch, x_len)

    dot_w = tf.get_variable("dot_w", shape=x.shape.as_list()[-1], initializer=init, dtype=tf.float32)
    # Compute x * dot_w first, then batch-multiply with the keys
    x_dots = x * tf.expand_dims(tf.expand_dims(dot_w, 0), 0)
    dot_logits = tf.matmul(x_dots, keys, transpose_b=True)

    return dot_logits + tf.expand_dims(key_logits, 1) + tf.expand_dims(x_logits, 2)

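# Illustrative NumPy check (an assumption about what the factorization above is doing, with
# hypothetical names): scoring every (x_i, key_j) pair with w . [x_i; key_j; x_i * key_j]
# equals the sum of the three cheaper terms computed in `_distance_logits`, so the
# (batch, x_len, key_len, dim) pair tensor never has to be materialized.
import numpy as np

rng = np.random.RandomState(0)
x = rng.randn(2, 4, 3)       # (batch, x_len, dim)
keys = rng.randn(2, 5, 3)    # (batch, key_len, dim)
x_w, key_w, dot_w = rng.randn(3), rng.randn(3), rng.randn(3)

# Naive version: score every pair explicitly
naive = np.empty((2, 4, 5))
for b in range(2):
    for i in range(4):
        for j in range(5):
            naive[b, i, j] = x[b, i] @ x_w + keys[b, j] @ key_w + (x[b, i] * keys[b, j]) @ dot_w

# Factored version, mirroring the layer: two rank-1 terms plus one matmul
factored = (np.einsum("bxd,d->bx", x, x_w)[:, :, None]
            + np.einsum("bkd,d->bk", keys, key_w)[:, None, :]
            + np.einsum("bxd,bkd->bxk", x * dot_w, keys))
assert np.allclose(naive, factored)
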
def apply(self, is_train, context_embed, answer, context_mask=None):
    init_fn = get_keras_initialization(self.init)

    with tf.variable_scope("bounds_encoding"):
        m1, m2, m3 = self.predictor.apply(is_train, context_embed, context_mask)

    with tf.variable_scope("start_pred"):
        logits1 = fully_connected(m1, 1, activation_fn=None, weights_initializer=init_fn)
        logits1 = tf.squeeze(logits1, squeeze_dims=[2])

    with tf.variable_scope("end_pred"):
        logits2 = fully_connected(m2, 1, activation_fn=None, weights_initializer=init_fn)
        logits2 = tf.squeeze(logits2, squeeze_dims=[2])

    with tf.variable_scope("yes_no_pred"):
        logits3 = self.sequence_reducer.apply(None, m3)
        logits3 = fully_connected(logits3, 3, activation_fn=None, weights_initializer=init_fn)

    with tf.variable_scope("predict_span"):
        return self.span_predictor.predict(answer, logits1, logits2, logits3, context_mask)

def _distance_logits(self, x, keys):
    init = get_keras_initialization(self.init)

    key_w = tf.get_variable("key_w", shape=keys.shape.as_list()[-1], initializer=init, dtype=tf.float32)
    key_logits = tf.tensordot(keys, key_w, axes=[[2], [0]])  # (batch, key_len)

    x_w = tf.get_variable("x_w", shape=x.shape.as_list()[-1], initializer=init, dtype=tf.float32)
    x_logits = tf.tensordot(x, x_w, axes=[[2], [0]])  # (batch, x_len)

    # Broadcasting will expand the arrays to (batch, x_len, key_len)
    return tf.expand_dims(x_logits, axis=2) + tf.expand_dims(key_logits, axis=1)

def apply(self, is_train, x, mask=None):
    if self.key_mapper is not None:
        with tf.variable_scope("map_keys"):
            keys = self.key_mapper.apply(is_train, x, mask)
    else:
        keys = x

    weights = tf.get_variable("weights", keys.shape.as_list()[-1], dtype=tf.float32,
                              initializer=get_keras_initialization(self.init))
    dist = tf.tensordot(keys, weights, axes=[[2], [0]])  # (batch, x_words)
    dist = exp_mask(dist, mask)
    dist = tf.nn.softmax(dist)
    out = tf.einsum("ajk,aj->ak", x, dist)  # (batch, x_dim)
    if self.post_process is not None:
        with tf.variable_scope("post_process"):
            out = self.post_process.apply(is_train, out)
    return out

def _distance_logits(self, x, keys):
    init = get_keras_initialization(self.init)

    key_w = tf.get_variable("key_w", shape=(keys.shape.as_list()[-1], self.projected_size),
                            initializer=init, dtype=tf.float32)
    key_logits = tf.tensordot(keys, key_w, axes=[[2], [0]])  # (batch, key_len, projected_size)

    if self.shared_project:
        x_w = key_w
    else:
        x_w = tf.get_variable("x_w", shape=(x.shape.as_list()[-1], self.projected_size),
                              initializer=init, dtype=tf.float32)
    x_logits = tf.tensordot(x, x_w, axes=[[2], [0]])  # (batch, x_len, projected_size)

    summed = tf.expand_dims(x_logits, axis=2) + tf.expand_dims(key_logits, axis=1)  # (batch, x_len, key_len, projected_size)
    summed = get_keras_activation(self.activation)(summed)

    combine_w = tf.get_variable("combine_w", shape=self.projected_size, initializer=init, dtype=tf.float32)
    return tf.tensordot(summed, combine_w, axes=[[3], [0]])  # (batch, x_len, key_len)

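# Small NumPy sketch (illustrative shapes and names, not the layer itself) of the additive
# scoring above: project x and keys to `projected_size`, sum every (x_i, key_j) pair via
# broadcasting, apply the nonlinearity, then reduce each pair with a learned combine vector.
import numpy as np

rng = np.random.RandomState(0)
x, keys = rng.randn(2, 4, 3), rng.randn(2, 5, 3)   # (batch, x_len, dim), (batch, key_len, dim)
x_w, key_w = rng.randn(3, 6), rng.randn(3, 6)      # projections to projected_size = 6
combine_w = rng.randn(6)

x_proj = x @ x_w                                   # (batch, x_len, 6)
key_proj = keys @ key_w                            # (batch, key_len, 6)
summed = np.tanh(x_proj[:, :, None, :] + key_proj[:, None, :, :])  # (batch, x_len, key_len, 6)
logits = summed @ combine_w                        # (batch, x_len, key_len)
assert logits.shape == (2, 4, 5)
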
def apply(self, is_train, context_embed, answer, context_mask=None):
    init_fn = get_keras_initialization(self.init)
    m1, m2 = self.predictor.apply(is_train, context_embed, context_mask)

    if m1.shape.as_list()[-1] != 1:
        with tf.variable_scope("start_pred"):
            start_logits = fully_connected(m1, 1, activation_fn=None, weights_initializer=init_fn)
    else:
        start_logits = m1
    start_logits = tf.squeeze(start_logits, squeeze_dims=[2])

    if m2.shape.as_list()[-1] != 1:
        with tf.variable_scope("end_pred"):
            end_logits = fully_connected(m2, 1, activation_fn=None, weights_initializer=init_fn)
    else:
        end_logits = m2
    end_logits = tf.squeeze(end_logits, squeeze_dims=[2])

    masked_start_logits = exp_mask(start_logits, context_mask)
    masked_end_logits = exp_mask(end_logits, context_mask)

    start_atten = tf.einsum("ajk,aj->ak", m1, tf.nn.softmax(masked_start_logits))
    end_atten = tf.einsum("ajk,aj->ak", m2, tf.nn.softmax(masked_end_logits))

    with tf.variable_scope("encode_context"):
        enc = self.encoder.apply(is_train, context_embed, context_mask)
    if len(enc.shape) == 3:
        _, encodings, fe = enc.shape.as_list()
        enc = tf.reshape(enc, (-1, encodings * fe))

    with tf.variable_scope("confidence"):
        conf = [start_atten, end_atten, enc]
        none_logit = self.confidence_predictor.apply(is_train, tf.concat(conf, axis=1))
    with tf.variable_scope("confidence_logits"):
        none_logit = fully_connected(none_logit, 1, activation_fn=None, weights_initializer=init_fn)
        none_logit = tf.squeeze(none_logit, axis=1)

    batch_dim = tf.shape(start_logits)[0]

    # (batch, (l * l)) logits for each (start, end) pair
    all_logits = tf.reshape(tf.expand_dims(masked_start_logits, 1) +
                            tf.expand_dims(masked_end_logits, 2),
                            (batch_dim, -1))

    # (batch, (l * l) + 1) logits including the none option
    all_logits = tf.concat([all_logits, tf.expand_dims(none_logit, 1)], axis=1)
    log_norms = tf.reduce_logsumexp(all_logits, axis=1)

    # Now build a "correctness" mask in the same format
    correct_mask = tf.logical_and(tf.expand_dims(answer[0], 1), tf.expand_dims(answer[1], 2))
    correct_mask = tf.reshape(correct_mask, (batch_dim, -1))
    correct_mask = tf.concat([correct_mask,
                              tf.logical_not(tf.reduce_any(answer[0], axis=1, keep_dims=True))], axis=1)

    # Note we are happily allowing the model to place weight on "backwards" spans, and also giving
    # it points for predicting spans whose start and end come from different answer spans. It would be
    # easy to fix by masking out part of the `all_logits` matrix and building a more accurate
    # correct_mask, but in general I left it this way to be consistent with the independent-bounds
    # models, which do the same. Some early tests found that properly masking things did not make much
    # difference (or even hurt), but it could still be an avenue for improvement.
    log_correct = tf.reduce_logsumexp(
        all_logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(correct_mask, tf.float32)), axis=1)
    loss = tf.reduce_mean(-(log_correct - log_norms))
    probs = tf.nn.softmax(all_logits)
    tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
    return ConfidencePrediction(probs[:, :-1], masked_start_logits, masked_end_logits,
                                probs[:, -1], none_logit, context_mask)

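# Hedged NumPy sketch of the objective above (shapes, names, and the single-example setup are
# illustrative): every (start, end) pair is scored by summing a start logit and an end logit,
# one extra "no-answer" logit is appended, and the loss maximizes the probability mass on the
# correct cells (or on the no-answer cell when no span is correct) under a single softmax over
# all l * l + 1 options.
import numpy as np

def shared_norm_loss(start_logits, end_logits, start_labels, end_labels, none_logit=0.0):
    # start_logits, end_logits: (l,) floats; start_labels, end_labels: (l,) booleans
    pair_logits = start_logits[None, :] + end_logits[:, None]    # (l, l) grid of span scores
    all_logits = np.append(pair_logits.reshape(-1), none_logit)  # (l * l + 1,)
    correct = np.append((start_labels[None, :] & end_labels[:, None]).reshape(-1),
                        not start_labels.any())                  # no-answer cell correct iff no span is
    log_norm = np.logaddexp.reduce(all_logits)
    log_correct = np.logaddexp.reduce(np.where(correct, all_logits, -1e29))
    return -(log_correct - log_norm)

loss = shared_norm_loss(np.zeros(3), np.zeros(3),
                        np.array([False, True, False]), np.array([False, True, False]))
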
def apply(self, is_train, context_embed, answer, context_mask=None):
    init_fn = get_keras_initialization(self.init)
    bool_mask = tf.sequence_mask(context_mask, tf.shape(context_embed)[1])

    with tf.variable_scope("predict"):
        m1, m2 = self.mapper.apply(is_train, context_embed, context_mask)

    if self.pre_process is not None:
        with tf.variable_scope("pre-process1"):
            m1 = self.pre_process.apply(is_train, m1, context_mask)
        with tf.variable_scope("pre-process2"):
            m2 = self.pre_process.apply(is_train, m2, context_mask)

    span_vector_lst = []
    mask_lst = []
    with tf.variable_scope("merge"):
        span_vector_lst.append(self.merge.apply(is_train, m1, m2))
    mask_lst.append(bool_mask)
    for i in range(1, self.bound):
        with tf.variable_scope("merge", reuse=True):
            span_vector_lst.append(self.merge.apply(is_train, m1[:, :-i], m2[:, i:]))
        mask_lst.append(bool_mask[:, i:])

    mask = tf.concat(mask_lst, axis=1)
    span_vectors = tf.concat(span_vector_lst, axis=1)  # flattened per-span vectors

    if self.post_process is not None:
        with tf.variable_scope("post-process"):
            span_vectors = self.post_process.apply(is_train, span_vectors)

    with tf.variable_scope("compute_logits"):
        logits = fully_connected(span_vectors, 1, activation_fn=None, weights_initializer=init_fn)
    logits = tf.squeeze(logits, squeeze_dims=[2])
    logits = logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(mask, tf.float32))

    l = tf.shape(context_embed)[1]

    if len(answer) == 1:
        answer = answer[0]
        if answer.dtype == tf.int32:
            if self.f1_weight == 0:
                answer_ix = to_packed_coordinates(answer, l, self.bound)
                loss = tf.reduce_mean(
                    tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=answer_ix))
            else:
                f1_mask = packed_span_f1_mask(answer, l, self.bound)
                if self.f1_weight < 1:
                    f1_mask *= self.f1_weight
                    f1_mask += (1 - self.f1_weight) * tf.one_hot(
                        to_packed_coordinates(answer, l, self.bound), l)

                # TODO can we stay in log space? (it is actually tricky since f1_mask can have zeros...)
                probs = tf.nn.softmax(logits)
                loss = -tf.reduce_mean(tf.log(tf.reduce_sum(probs * f1_mask, axis=1)))
        else:
            log_norm = tf.reduce_logsumexp(logits, axis=1)
            if self.aggregate == "sum":
                log_score = tf.reduce_logsumexp(
                    logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer, tf.float32)), axis=1)
            elif self.aggregate == "max":
                log_score = tf.reduce_max(
                    logits + VERY_NEGATIVE_NUMBER * (1 - tf.cast(answer, tf.float32)), axis=1)
            else:
                raise NotImplementedError()
            loss = tf.reduce_mean(-(log_score - log_norm))
    else:
        raise NotImplementedError()

    tf.add_to_collection(tf.GraphKeys.LOSSES, loss)
    return PackedSpanPrediction(logits, l, self.bound)

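# Hedged sketch of the packed span layout (an assumption consistent with the concat order
# above, not necessarily the real `to_packed_coordinates`): spans are laid out length by
# length, first the l spans with end == start, then the l - 1 spans with end == start + 1,
# and so on up to `bound`, so each in-bound span maps to one index in the flattened logits.
def packed_index(start, end, l, bound):
    d = end - start
    if d < 0 or d >= bound:
        raise ValueError("span longer than the bound cannot be represented")
    offset = sum(l - j for j in range(d))  # slots taken by all shorter span lengths
    return offset + start

# e.g. with l=5, bound=3: indices 0..4 are one-word spans, 5..8 two-word spans, 9..11 three-word spans
assert packed_index(0, 0, 5, 3) == 0
assert packed_index(2, 3, 5, 3) == 7
assert packed_index(1, 3, 5, 3) == 10
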
def _apply_transposed(self, is_train, x):
    w_init = get_keras_initialization(self.w_init)
    r_init = None if self.recurrent_init is None else get_keras_initialization(self.recurrent_init)
    x_size = x.shape.as_list()[-1]
    if x_size is None:
        raise ValueError("Last dimension must be defined (have shape %s)" % str(x.shape))

    if self._kind == "GRU":
        cell = cudnn_rnn_ops.CudnnGRU(self.n_layers, self.n_units, x_size, input_mode="linear_input")
    elif self._kind == "LSTM":
        cell = cudnn_rnn_ops.CudnnLSTM(self.n_layers, self.n_units, x_size, input_mode="linear_input")
    else:
        raise ValueError()

    n_params = cell.params_size().eval()
    weights, biases = cell.params_to_canonical(tf.zeros([n_params]))

    def init(shape, dtype=None, partition_info=None):
        # This is a bit hacky, since the API for these models is awkward. We have to compute the shape
        # of the weights / biases by calling `cell.params_to_canonical` with an unused tensor, and then
        # use .eval() to actually get the shapes. Then we can apply the user-requested initializers.
        if self._kind == "LSTM":
            is_recurrent = [False, False, False, False, True, True, True, True]
            is_forget_bias = [False, True, False, False, False, True, False, False]
        else:
            is_recurrent = [False, False, False, True, True, True]
            is_forget_bias = [False] * 6

        init_biases = [tf.constant(self.lstm_bias / 2.0, tf.float32, (self.n_units,)) if z else tf.zeros(self.n_units)
                       for z in is_forget_bias]
        init_weights = []
        for w, r in zip(weights, is_recurrent):
            if r and r_init is not None:
                init_weights.append(tf.reshape(r_init((self.n_units, self.n_units), w.dtype), tf.shape(w)))
            else:
                init_weights.append(w_init(tf.shape(w).eval(), w.dtype))
        out = cell.canonical_to_params(init_weights, init_biases)
        out.set_shape((n_params,))
        return out

    parameters = tf.get_variable("gru_parameters", n_params, tf.float32, initializer=init)

    if self.keep_recurrent < 1:
        # Not super well tested: try to figure out which indices in `parameters` are recurrent weights
        # and drop them. This implements drop-connect for the recurrent weights.
        is_recurrent = weights[:len(weights) // 2] + [tf.ones_like(w) for w in weights[len(weights) // 2:]]
        recurrent_mask = cell.canonical_to_params(is_recurrent, biases)  # ones at recurrent weights
        recurrent_mask = 1 - recurrent_mask * (1 - self.keep_recurrent)  # 1 for non-recurrent params, keep_recurrent elsewhere
        parameters = tf.cond(is_train,
                             lambda: tf.floor(tf.random_uniform((n_params,)) + recurrent_mask) * parameters,
                             lambda: parameters)

    if self._kind == "LSTM":
        if self.learn_initial_states:
            raise NotImplementedError()
        else:
            initial_state_h = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
            initial_state_c = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
            out = cell(x, initial_state_h, initial_state_c, parameters, True)
    else:
        if self.learn_initial_states:
            initial_state = tf.get_variable("initial_state", self.n_units, tf.float32, tf.zeros_initializer())
            initial_state = tf.tile(tf.expand_dims(tf.expand_dims(initial_state, 0), 0),
                                    [self.n_layers, tf.shape(x)[1], 1])
        else:
            initial_state = tf.zeros((self.n_layers, tf.shape(x)[1], self.n_units), tf.float32)
        out = cell(x, initial_state, parameters, True)
    return out

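# Small NumPy sketch (illustrative only) of the drop-connect trick above: floor(uniform + keep)
# is 1 with probability `keep`, so entries where the mask is 1.0 (non-recurrent parameters)
# are always kept, while recurrent parameters survive with probability `keep_recurrent`.
import numpy as np

keep_recurrent = 0.8
is_recurrent = np.array([0., 0., 1., 1.])            # 1 where the flat parameter is a recurrent weight
keep_mask = 1 - is_recurrent * (1 - keep_recurrent)  # 1.0 for ordinary params, 0.8 for recurrent ones
samples = np.floor(np.random.uniform(size=(10000, 4)) + keep_mask)
print(samples.mean(axis=0))                          # roughly [1.0, 1.0, 0.8, 0.8]
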
def apply(self, is_train, x, mask=None):
    batch_size = tf.shape(x)[0]
    x_word_dim = tf.shape(x)[1]
    x_feature_dim = x.shape.as_list()[-1]

    project_size = self.project_size
    if project_size is None:
        project_size = x_feature_dim // self.n_heads
        if x_feature_dim % self.n_heads != 0:
            raise ValueError()
    mem_size = self.memory_size
    if mem_size is None:
        mem_size = project_size

    init = get_keras_initialization(self.init)
    query_proj = tf.get_variable("query_proj", (x_feature_dim, self.n_heads, project_size), initializer=init)
    if self.shared_project:
        key_proj = query_proj
    else:
        key_proj = tf.get_variable("key_proj", (x_feature_dim, self.n_heads, project_size), initializer=init)
    mem_proj = tf.get_variable("mem_proj", (x_feature_dim, self.n_heads, mem_size), initializer=init)

    queries = tf.tensordot(x, query_proj, [[2], [0]])  # (batch, word, n_head, project_size)
    keys = tf.tensordot(x, key_proj, [[2], [0]])  # (batch, key, n_head, project_size)

    if self.project_bias:
        queries += tf.get_variable("query_bias", (1, 1, self.n_heads, project_size), initializer=tf.zeros_initializer())
        keys += tf.get_variable("key_bias", (1, 1, self.n_heads, project_size), initializer=tf.zeros_initializer())

    # dist_matrix = tf.matmul(queries, keys, transpose_b=True)
    dist_matrix = tf.einsum("bwhd,bkhd->bwkh", queries, keys)  # dots of (batch, word, key, head)

    if self.scale:
        dist_matrix /= tf.sqrt(float(project_size))

    if self.bilinear_comp:
        query_bias_proj = tf.get_variable("query_bias_proj", (x_feature_dim, self.n_heads), initializer=init)
        key_bias_proj = tf.get_variable("key_bias_proj", (x_feature_dim, self.n_heads), initializer=init)
        dist_matrix += tf.expand_dims(tf.tensordot(x, query_bias_proj, [[2], [0]]), 2)
        dist_matrix += tf.expand_dims(tf.tensordot(x, key_bias_proj, [[2], [0]]), 1)

    joint_mask = compute_attention_mask(mask, mask, x_word_dim, x_word_dim)
    if joint_mask is not None:
        dist_matrix += tf.expand_dims(VERY_NEGATIVE_NUMBER * (1 - tf.cast(joint_mask, dist_matrix.dtype)), 2)
    dist_matrix += tf.expand_dims(tf.expand_dims(tf.eye(x_word_dim) * VERY_NEGATIVE_NUMBER, 0), 2)

    if self.bias:
        bias = tf.get_variable("bias", (1, 1, self.n_heads, 1), initializer=tf.zeros_initializer())
        dist_matrix += bias

    select_probs = tf.nn.softmax(dist_matrix)  # for each (batch, word, head), a probability over the keys

    memories = tf.tensordot(x, mem_proj, [[2], [0]])  # (batch, memory, head, mem_size)
    response = tf.einsum("bwhk,bkhd->bwhd", select_probs, memories)  # (batch, word, head, mem_size)
    response = tf.reshape(response, (batch_size, x_word_dim, self.n_heads * mem_size))  # concat the heads

    if self.merge is not None:
        with tf.variable_scope("merge"):
            response = self.merge.apply(is_train, x, response)
    return response

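# A compact NumPy toy of per-head self-attention (illustrative only; the axis order is chosen
# for clarity and may not match the layer above exactly): project to per-head queries, keys,
# and memories, take scaled dot products, mask attention to the same position, softmax over
# the key axis, then concatenate the per-head responses.
import numpy as np

rng = np.random.RandomState(0)
batch, words, dim, heads, proj = 2, 5, 8, 2, 4
x = rng.randn(batch, words, dim)
q_w, k_w, m_w = rng.randn(dim, heads, proj), rng.randn(dim, heads, proj), rng.randn(dim, heads, proj)

q = np.einsum("bwd,dhp->bwhp", x, q_w)
k = np.einsum("bwd,dhp->bwhp", x, k_w)
m = np.einsum("bwd,dhp->bwhp", x, m_w)

scores = np.einsum("bwhp,bkhp->bwhk", q, k) / np.sqrt(proj)  # (batch, word, head, key)
scores -= np.eye(words)[None, :, None, :] * 1e29             # block attending to your own position
probs = np.exp(scores - scores.max(-1, keepdims=True))
probs /= probs.sum(-1, keepdims=True)                        # softmax over the keys
response = np.einsum("bwhk,bkhp->bwhp", probs, m)            # (batch, word, head, proj)
response = response.reshape(batch, words, heads * proj)      # concatenate the heads
assert response.shape == (2, 5, 8)
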