Exemple #1
0
    def transduce(self, sent: ExpressionSequence) -> ExpressionSequence:
        """Optionally combine positional encodings with ``sent``, then apply every module.

        Args:
          sent: the input expression sequence.

        Returns:
          The sequence produced by the last entry of ``self.modules`` (or the
          position-encoded input when ``self.modules`` is empty).

        Raises:
          ValueError: if ``self.pos_encoding_type`` is set but is neither
            ``"trigonometric"`` nor ``"embedding"``.
        """
        if self.pos_encoding_type:
            if self.pos_encoding_type == "trigonometric":
                seq_len = len(sent)
                block = self.position_encoding_block
                if block is None or block.shape[2] < seq_len:
                    # Cached table is missing or too short: rebuild it with 20%
                    # headroom so slightly longer inputs do not trigger a rebuild.
                    self.initialize_position_encoding(
                        int(seq_len * 1.2),
                        self.input_dim if self.pos_encoding_combine == "add"
                        else self.pos_encoding_size)
                encoding = dy.inputTensor(
                    self.position_encoding_block[0, :, :seq_len])
            elif self.pos_encoding_type == "embedding":
                encoding = self.positional_embedder.embed_sent(
                    len(sent)).as_tensor()
            else:
                # Validate here: checking after the combine step (as before) was
                # unreachable and left `encoding` unbound for unknown types.
                raise ValueError(
                    f"unknown encoding type {self.pos_encoding_type}")

            if self.pos_encoding_combine == "add":
                combined = sent.as_tensor() + encoding
            else:  # concat
                combined = dy.concatenate([sent.as_tensor(), encoding])
            sent = ExpressionSequence(expr_tensor=combined, mask=sent.mask)

        for module in self.modules:
            sent = module.transduce(sent)
        self._final_states = [transducers.FinalTransducerState(sent[-1])]
        return sent
Exemple #2
0
    def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
        """
        Run the input through every layer in ``self.builder_layers``, subsampling
        the outputs between consecutive layers.

        Between layers the outputs are either strided (``"skip"``) or split into
        ``reduce_factor`` interleaved sequences that are all handed to the next
        layer (``"concat"``). The outputs of the last layer are returned as is.

        Args:
          es: an ExpressionSequence

        Returns:
          the outputs of the last layer, with the subsampled mask attached.
        """
        layer_inputs = [es]
        last_layer = len(self.builder_layers) - 1

        for layer_i, fb in enumerate(self.builder_layers):
            reduce_factor = self._reduce_factor_for_layer(layer_i)
            head = layer_inputs[0]

            mask_out = (None if head.mask is None else
                        head.mask.lin_subsampled(reduce_factor))

            if (self.downsampling_method == "concat"
                    and len(head) % reduce_factor != 0):
                raise ValueError(
                    f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                    f"but got sequence length={len(head)} for reduce_factor={reduce_factor}. "
                    f"Set Batcher's pad_src_to_multiple argument accordingly.")

            fs = fb.transduce(layer_inputs)

            if layer_i == last_layer:
                # Final layer: its outputs are the result.
                ret_es = ExpressionSequence(expr_list=list(fs), mask=mask_out)
            elif self.downsampling_method == "skip":
                layer_inputs = [
                    ExpressionSequence(expr_list=fs[::reduce_factor],
                                       mask=mask_out)
                ]
            elif self.downsampling_method == "concat":
                # Bucket `offset` collects positions offset, offset + reduce_factor, ...
                buckets = []
                for start in range(0, len(head), reduce_factor):
                    if not buckets:
                        buckets = [[] for _ in range(reduce_factor)]
                    for offset in range(reduce_factor):
                        buckets[offset].append(fs[start + offset])
                layer_inputs = [
                    ExpressionSequence(expr_list=buckets[offset],
                                       mask=mask_out)
                    for offset in range(reduce_factor)
                ]
            else:
                raise RuntimeError(
                    f"unknown downsampling_method {self.downsampling_method}"
                )

        final_states = []
        for fb in self.builder_layers:
            final_states.append(
                FinalTransducerState(fb.get_final_states()[0].main_expr(),
                                     fb.get_final_states()[0].cell_expr()))
        self._final_states = final_states

        return ret_es
Exemple #3
0
  def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
    """
    Run the input through every (forward, backward) layer pair in
    ``self.builder_layers``, subsampling between consecutive layers, and return
    the per-position concatenation of the last layer's forward and backward outputs.

    Args:
      es: an ExpressionSequence

    Returns:
      the concatenated outputs of the last layer, with the subsampled mask attached.
    """
    layer_inputs = [es]
    last_layer = len(self.builder_layers) - 1

    for layer_i, (fb, bb) in enumerate(self.builder_layers):
      reduce_factor = self._reduce_factor_for_layer(layer_i)
      head = layer_inputs[0]

      mask_out = None if head.mask is None else head.mask.lin_subsampled(reduce_factor)

      if self.downsampling_method == "concat" and len(head) % reduce_factor != 0:
        raise ValueError(f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                         f"but got sequence length={len(head)} for reduce_factor={reduce_factor}. "
                         f"Set Batcher's pad_src_to_multiple argument accordingly.")

      fs = fb.transduce(layer_inputs)
      # The backward layer reads every input sequence in reverse.
      bs = bb.transduce([ReversedExpressionSequence(item) for item in layer_inputs])

      if layer_i == last_layer:
        # Final layer: pair each forward output with the re-reversed backward output.
        joined = [dy.concatenate([f, b]) for f, b in zip(fs, ReversedExpressionSequence(bs))]
        ret_es = ExpressionSequence(expr_list=joined, mask=mask_out)
      elif self.downsampling_method == "skip":
        layer_inputs = [ExpressionSequence(expr_list=fs[::reduce_factor], mask=mask_out),
                        ExpressionSequence(expr_list=bs[::reduce_factor][::-1], mask=mask_out)]
      elif self.downsampling_method == "concat":
        in_len = len(head)
        fwd_buckets = []
        bwd_buckets = []
        for start in range(0, in_len, reduce_factor):
          if start == 0:
            fwd_buckets = [[] for _ in range(reduce_factor)]
            bwd_buckets = [[] for _ in range(reduce_factor)]
          for offset in range(reduce_factor):
            fwd_buckets[offset].append(fs[start + offset])
            # Backward outputs are indexed from the end of the sequence.
            bwd_buckets[offset].append(bs[in_len - reduce_factor + offset - start])
        fwd_inputs = [ExpressionSequence(expr_list=fwd_buckets[offset], mask=mask_out)
                      for offset in range(reduce_factor)]
        bwd_inputs = [ExpressionSequence(expr_list=bwd_buckets[offset], mask=mask_out)
                      for offset in range(reduce_factor)]
        layer_inputs = fwd_inputs + bwd_inputs
      else:
        raise RuntimeError(f"unknown downsampling_method {self.downsampling_method}")

    final_states = []
    for (fb, bb) in self.builder_layers:
      main = dy.concatenate([fb.get_final_states()[0].main_expr(),
                             bb.get_final_states()[0].main_expr()])
      cell = dy.concatenate([fb.get_final_states()[0].cell_expr(),
                             bb.get_final_states()[0].cell_expr()])
      final_states.append(FinalTransducerState(main, cell))
    self._final_states = final_states

    return ret_es
Exemple #4
0
  def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    """
    Encode the input with three padded conv+ReLU layers and return one
    L2-normalised vector per batch element.

    The first two conv layers are followed by max-pooling; the third is reduced
    with a max over dimension 1. The shape comments below are the original
    author's examples for one particular configuration.

    Args:
      src: the input expression sequence; only its tensor form is used.

    Returns:
      an ExpressionSequence wrapping a tensor reshaped to
      ``(self.num_filters[2],)`` per batch element (no mask attached).
    """
    src = src.as_tensor()
    batch_size = src.dim()[1]

    pool_ksize = (1, 4)
    pool_stride = (1, 2)

    # convolution and pooling layers
    # e.g. src dim is ((40, 1000), 128)
    src = padding(src, self.filter_width[0]+3)
    l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filters1),
                              stride=[self.stride[0], self.stride[0]],
                              is_valid=True))  # e.g. ((1, 1000, 64), 128)
    pool1 = dy.maxpooling2d(l1, pool_ksize, pool_stride, is_valid=True)  # e.g. ((1, 499, 64), 128)

    pool1 = padding(pool1, self.filter_width[1]+3)
    l2 = dy.rectify(dy.conv2d(pool1, dy.parameter(self.filters2),
                              stride=[self.stride[1], self.stride[1]],
                              is_valid=True))  # e.g. ((1, 499, 512), 128)
    pool2 = dy.maxpooling2d(l2, pool_ksize, pool_stride, is_valid=True)  # e.g. ((1, 248, 512), 128)

    pool2 = padding(pool2, self.filter_width[2])
    l3 = dy.rectify(dy.conv2d(pool2, dy.parameter(self.filters3),
                              stride=[self.stride[2], self.stride[2]],
                              is_valid=True))  # e.g. ((1, 248, 1024), 128)
    pool3 = dy.max_dim(l3, d=1)

    # The small constant keeps the division finite when the pooled vector is all zeros.
    my_norm = dy.l2_norm(pool3) + 1e-6
    output = dy.cdiv(pool3, my_norm)
    output = dy.reshape(output, (self.num_filters[2],), batch_size=batch_size)

    return ExpressionSequence(expr_tensor=output)
Exemple #5
0
    def transduce(self, embed_sent: ExpressionSequence) -> ExpressionSequence:
        """
        Apply a stack of convolutions (input layer, ``self.builder_layers``, output
        layer) to the zero-padded input, with ``self.non_linearity`` after every
        layer except the last.

        Args:
          embed_sent: the embedded input sequence; only its tensor form is used.

        Returns:
          an ExpressionSequence wrapping the output of the last convolution
          (no mask attached). Also stores ``self._final_states``.
        """
        src = embed_sent.as_tensor()

        sent_len = src.dim()[0][1]
        batch_size = src.dim()[1]
        # Floor division: the result is used as a tensor dimension and must be an int.
        pad_size = (self.window_receptor -
                    1) // 2  # TODO adapt it also for even window size

        # Zero-pad both ends along the time axis (d=1).
        src = dy.concatenate([
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size), src,
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)
        ],
                             d=1)
        padded_sent_len = sent_len + 2 * pad_size

        # Compare by value (==); 'linear' and any unrecognised name leave the
        # activations untouched, exactly as the previous if/elif chain did.
        activations = {
            'tanh': dy.tanh,
            'relu': dy.rectify,
            'sigmoid': dy.logistic,
        }
        activate = activations.get(self.non_linearity)

        conv1 = dy.parameter(self.pConv1)
        bias1 = dy.parameter(self.pBias1)
        src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1),
                             batch_size=batch_size)
        cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

        hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1),
                                  batch_size=batch_size)
        if activate is not None:
            hidden_layer = activate(hidden_layer)

        for conv_hid, bias_hid in self.builder_layers:
            hidden_layer = dy.conv2d_bias(hidden_layer,
                                          dy.parameter(conv_hid),
                                          dy.parameter(bias_hid),
                                          stride=[1, 1])
            hidden_layer = dy.reshape(hidden_layer,
                                      (self.internal_dim, sent_len, 1),
                                      batch_size=batch_size)
            if activate is not None:
                hidden_layer = activate(hidden_layer)

        last_conv = dy.parameter(self.last_conv)
        last_bias = dy.parameter(self.last_bias)
        output = dy.conv2d_bias(hidden_layer,
                                last_conv,
                                last_bias,
                                stride=[1, 1])
        # NOTE(review): this reshapes to (time, output_dim) while the input tensor
        # is indexed as (dim, time) above - confirm the intended layout.
        output = dy.reshape(output, (sent_len, self.output_dim),
                            batch_size=batch_size)
        output_seq = ExpressionSequence(expr_tensor=output)
        self._final_states = [FinalTransducerState(output_seq[-1])]
        return output_seq
Exemple #6
0
 def transduce(self, seq: ExpressionSequence) -> ExpressionSequence:
     """Add the child transducer's output to its input, then optionally layer-normalise.

     Args:
       seq: the input expression sequence.

     Returns:
       an ExpressionSequence wrapping the summed (and possibly normalised)
       tensor; no mask is attached.
     """
     child_out = self.child.transduce(seq).as_tensor()
     summed = child_out + seq.as_tensor()
     if not self.layer_norm:
         return ExpressionSequence(expr_tensor=summed)

     dims = summed.dim()
     shape, batch = dims[0], dims[1]
     # Fold the second tensor dimension into the batch so that layer norm is
     # applied to each column separately, then restore the original shape.
     flat = dy.reshape(summed, (shape[0], ), batch_size=shape[1] * batch)
     normed = dy.layer_norm(flat, self.ln_g, self.ln_b)
     restored = dy.reshape(normed, shape, batch_size=batch)
     return ExpressionSequence(expr_tensor=restored)
Exemple #7
0
    def transduce(self, expr_seq: ExpressionSequence) -> ExpressionSequence:
        """
        Apply multi-head scaled dot-product self-attention to the sequence.

        Args:
          expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
        Returns:
          expression sequence
        """
        w_query = dy.parameter(self.pWq)
        w_key = dy.parameter(self.pWk)
        w_value = dy.parameter(self.pWv)
        w_out = dy.parameter(self.pWo)
        b_query = dy.parameter(self.pbq)
        b_key = dy.parameter(self.pbk)
        b_value = dy.parameter(self.pbv)
        b_out = dy.parameter(self.pbo)

        # Work on a [(length, model_size) x batch] tensor.
        inputs = expr_seq.as_transposed_tensor()
        in_dims = inputs.dim()
        seq_len = in_dims[0][0]
        n_batch = in_dims[1]

        # Query, key and value projections.
        # TODO: do we need bias broadcasting in DyNet?
        queries = b_query + inputs * w_query
        keys = b_key + inputs * w_key
        values = b_value + inputs * w_value

        def split_heads(projected):
            # [(length, head_dim) x batch * num_heads]
            return dy.reshape(projected, (seq_len, self.head_dim),
                              batch_size=n_batch * self.num_heads)

        queries = split_heads(queries)
        keys = split_heads(keys)
        values = split_heads(values)

        # Scaled dot product [(length, length) x batch * num_heads];
        # rows are queries, columns are keys.
        scores = (queries * dy.transpose(keys)) / sqrt(self.head_dim)
        in_mask = expr_seq.mask
        if in_mask is not None:
            per_head_mask = np.repeat(in_mask.np_arr, self.num_heads,
                                      axis=0).transpose()
            scores = scores + dy.inputTensor(per_head_mask,
                                             batched=True) * -1e10
        probs = dy.softmax(scores, d=1)

        # Weighted sum of values, merged back to [(length, model_size) x batch].
        merged = dy.reshape(probs * values, (seq_len, self.input_dim),
                            batch_size=n_batch)
        # Output projection.
        projected_out = b_out + merged * w_out

        result = ExpressionSequence(expr_transposed_tensor=projected_out,
                                    mask=in_mask)
        self._final_states = [FinalTransducerState(result[-1], None)]
        return result
Exemple #8
0
 def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
     """Encode the input with one conv layer, stacked recurrent highway layers and attention pooling.

     Args:
       src: the input expression sequence; only its tensor form is used.

     Returns:
       an ExpressionSequence wrapping the attention-weighted combination of
       the last layer's states (no mask attached).
     """
     frames = src.as_tensor()
     # convolutional layer
     frames = padding(frames,
                      frames.dim()[0][0],
                      frames.dim()[0][1], self.filter_width, self.stride,
                      frames.dim()[1])
     conv_out = dy.rectify(
         dy.conv2d(frames,
                   dy.parameter(self.filter_conv),
                   stride=[self.stride, self.stride],
                   is_valid=True))
     conv_dims = conv_out.dim()
     timestep = conv_dims[0][1]
     features = conv_dims[0][2]
     batch_size = conv_dims[1]
     # Reshape to (timestep, features) per batch element and split into one
     # expression per time step.
     as_matrix = dy.reshape(conv_out, (timestep, features),
                            batch_size=batch_size)
     layer_in = [dy.pick(as_matrix, t) for t in range(timestep)]

     for layer in range(self.rhn_num_hidden_layers):
         layer_out = []
         # Every layer starts from its own initial state parameter.
         state = dy.parameter(self.init[layer])
         # recurrent highway network
         for t in range(timestep):
             for step in range(self.rhn_microsteps):
                 weights = self.recur[layer][step]
                 H = dy.affine_transform([
                     dy.parameter(weights[1]),
                     dy.parameter(weights[0]), state
                 ])
                 T = dy.affine_transform([
                     dy.parameter(weights[3]),
                     dy.parameter(weights[2]), state
                 ])
                 if step == 0:
                     # Only the first micro-step sees the input at time t.
                     H += dy.parameter(self.linear[layer][0]) * layer_in[t]
                     T += dy.parameter(self.linear[layer][1]) * layer_in[t]
                 H = dy.tanh(H)
                 T = dy.logistic(T)
                 # Highway update: gate T mixes the old state with candidate H.
                 state = dy.cmult(1 - T, state) + dy.cmult(T, H)
             layer_out.append(state)
         if self.residual and layer > 0:
             layer_out = [sum(pair) for pair in zip(layer_out, layer_in)]
         layer_in = layer_out

     # Attention-weighted average of the activations.
     states = dy.concatenate_cols(layer_in)
     att_vector = dy.transpose(dy.parameter(self.attention[0][1]))
     att_hidden = dy.tanh(dy.parameter(self.attention[0][0]) * states)
     scores = att_vector * att_hidden
     score_dims = scores.dim()
     scores = dy.reshape(scores, (score_dims[0][1], ),
                         batch_size=score_dims[1])
     attn_out = states * dy.softmax(scores)
     return ExpressionSequence(expr_tensor=attn_out)
Exemple #9
0
 def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
   """
   Combine learned positional embeddings with the input sequence.

   Args:
     src: the input expression sequence.

   Returns:
     an ExpressionSequence holding the sum (``op == 'sum'``) or the
     concatenation (``op == 'concat'``) of the positional embeddings and the
     input, carrying the input's mask. Also stores ``self._final_states``.

   Raises:
     ValueError: if ``self.op`` is neither ``'sum'`` nor ``'concat'``.
   """
   # Validate first so that a bad configuration fails before any expression is
   # built. The message now reads self.op; the bare name `op` was undefined.
   if self.op not in ('sum', 'concat'):
     raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')

   sent_len = len(src)
   # Take the first sent_len columns of the embedding matrix.
   embeddings = dy.strided_select(dy.parameter(self.embedder), [1,1], [0,0], [self.input_dim, sent_len])
   if self.op == 'sum':
     output = embeddings + src.as_tensor()
   else:  # 'concat'
     output = dy.concatenate([embeddings, src.as_tensor()])
   output_seq = ExpressionSequence(expr_tensor=output, mask=src.mask)
   self._final_states = [FinalTransducerState(output_seq[-1])]
   return output_seq
Exemple #10
0
    def transduce(self, x: ExpressionSequence) -> ExpressionSequence:
        """
        Apply self-attention followed by the feed-forward sub-layer.

        Args:
          x: the input expression sequence.

        Returns:
          an ExpressionSequence of the (possibly downsampled) outputs with the
          correspondingly subsampled mask. The flat output is also stored in
          ``self._recent_output``.
        """
        seq_len = len(x)
        batch_size = x[0].dim()[1]

        att_mask = None
        if self.diagonal_mask_width is not None:
            # Start fully masked (ones) and clear a square window around each
            # diagonal position. The former inner `is None` branch could never
            # run inside this `is not None` guard and has been dropped.
            att_mask = np.ones((seq_len, seq_len))
            half_width = self.diagonal_mask_width // 2
            for i in range(seq_len):
                from_i = max(0, i - half_width)
                to_i = min(seq_len, i + half_width + 1)
                att_mask[from_i:to_i, from_i:to_i] = 0.0

        mid = self.self_attn(x=x,
                             att_mask=att_mask,
                             batch_mask=x.mask.np_arr if x.mask else None,
                             p=self.dropout)
        if self.downsample_factor > 1:
            seq_len = int(math.ceil(seq_len / float(self.downsample_factor)))
        hidden_dim = mid.dim()[0][0]
        out_mask = x.mask
        if self.downsample_factor > 1 and out_mask is not None:
            out_mask = out_mask.lin_subsampled(
                reduce_factor=self.downsample_factor)
        if self.ff_lstm:
            # The LSTM feed-forward needs a (hidden_dim, time) x batch sequence;
            # flatten its output back to (hidden_dim,) x (time * batch) afterwards.
            mid_re = dy.reshape(mid, (hidden_dim, seq_len),
                                batch_size=batch_size)
            out = self.feed_forward.transduce(
                ExpressionSequence(expr_tensor=mid_re, mask=out_mask))
            out = dy.reshape(out.as_tensor(), (hidden_dim, ),
                             batch_size=seq_len * batch_size)
        else:
            out = self.feed_forward.transduce(mid, p=self.dropout)

        self._recent_output = out
        return ExpressionSequence(expr_tensor=dy.reshape(
            out, (out.dim()[0][0], seq_len), batch_size=batch_size),
                                  mask=out_mask)
Exemple #11
0
 def exprseq_pooling(self, exprseq):
     """Reduce an expression sequence to a single vector by taking a maximum.

     Args:
       exprseq: the sequence to pool. When it has a mask, the mask's
         ``add_to_tensor_expr`` is called with ``-1e10`` before pooling
         (presumably so masked positions cannot win the max - confirm against
         the mask class).

     Returns:
       ``dy.max_dim(..., d=1)`` of the tensor when it has more than one
       dimension, the tensor itself when it has one, or ``dy.emax`` over the
       expression list when no tensor is available.
     """
     # Sequences without a mask are pooled as they are; dereferencing a missing
     # mask used to raise AttributeError and made the expr_list fallback dead.
     if exprseq.mask is not None:
         exprseq = ExpressionSequence(
             expr_tensor=exprseq.mask.add_to_tensor_expr(
                 exprseq.as_tensor(), -1e10),
             mask=exprseq.mask)
     if exprseq.expr_tensor is not None:
         if len(exprseq.expr_tensor.dim()[0]) > 1:
             return dy.max_dim(exprseq.expr_tensor, d=1)
         return exprseq.expr_tensor
     return dy.emax(exprseq.expr_list)
0
  def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    """
    Apply an affine projection to the input and scale the result to unit L2 norm.

    Args:
      src: the input expression sequence; only its tensor form is used.

    Returns:
      an ExpressionSequence wrapping the normalised projection (no mask attached).
    """
    tensor = src.as_tensor()

    dims = tensor.dim()
    n_rows = dims[0][0]
    n_batch = dims[1]

    weight = dy.parameter(self.pW)
    bias = dy.parameter(self.pb)

    # Treat the input as a single column per batch element.
    column = dy.reshape(tensor, (n_rows, 1), batch_size=n_batch)
    projected = (weight * column) + bias
    norm = dy.sqrt(dy.squared_norm(projected))
    return ExpressionSequence(expr_tensor=dy.cdiv(projected, norm))
Exemple #13
0
 def transduce(self,
               embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
     """Sample a segmentation, compose each segment and run the final transducer.

     Args:
       embed_sent: the embedded input sequence.

     Returns:
       the result of ``self.final_transducer.transduce`` on the padded,
       composed segments.

     Side effects (performed in ``finally``, so also on failure): stores
     ``self.compose_output`` and ``self.segment_actions``, plus
     ``self.seg_size_unpadded`` when ``self.length_prior`` is set, and may
     report the segment actions.
     """
     batch_size = embed_sent[0].dim()[1]
     actions = self.sample_segmentation(embed_sent, batch_size)
     embeddings = dy.concatenate(embed_sent.expr_list, d=1)
     embeddings.value()

     composed_words = []
     for batch_idx in range(batch_size):
         sequence = dy.pick_batch_elem(embeddings, batch_idx)
         # Each action is the (inclusive) index of a segment's last element.
         seg_start = 0
         for seg_idx, seg_last in enumerate(actions[batch_idx]):
             seg_end = seg_last + 1
             char_sequence = ([] if self.no_char_embed else
                              dy.pick_range(sequence, seg_start, seg_end, 1))
             composed_words.append(
                 (char_sequence, batch_idx, seg_idx, seg_start, seg_end))
             seg_start = seg_end
     outputs = self.segment_composer.compose(composed_words, batch_size)

     # Padding + return
     try:
         if self.length_prior:
             seg_size_unpadded = [
                 len(outputs[batch_idx]) for batch_idx in range(batch_size)
             ]
         sampled_sentence, segment_mask = self.pad(outputs)
         padded_batch = dy.concatenate_to_batch(sampled_sentence)
         expr_seq = ExpressionSequence(expr_tensor=padded_batch,
                                       mask=segment_mask)
         return self.final_transducer.transduce(expr_seq)
     finally:
         if self.length_prior:
             self.seg_size_unpadded = seg_size_unpadded
         self.compose_output = outputs
         self.segment_actions = actions
         # Support only AccuracyEvalTask
         if (not self.train and self.is_reporting()
                 and len(actions) == 1):
             self.report_sent_info({"segment_actions": actions})
 def transduce(self, embed):
   """Run ``embed`` through the sequence transducer and return the main
   expression of its last final state."""
   wrapped = ExpressionSequence(expr_tensor=embed)
   self.seq_transducer.transduce(wrapped)
   final_states = self.seq_transducer.get_final_states()
   return final_states[-1].main_expr()
Exemple #15
0
    def __call__(self, x: dy.Expression, att_mask: np.ndarray,
                 batch_mask: np.ndarray, p: numbers.Real):
        """
        Apply multi-head self-attention with a residual connection and layer norm.

        Args:
          x: expression of dimensions (input_dim, time) x batch.
            NOTE(review): annotated as dy.Expression, but the body uses
            ``x.mask`` and ``x.as_tensor()`` - confirm the intended type.
          att_mask: numpy array of dimensions (time, time); pre-transposed
          batch_mask: numpy array of dimensions (batch, time)
          p: dropout prob

        Returns:
          the result of ``self.layer_norm.transform`` applied to the attention
          output plus the residual.

        Raises:
          ValueError: if downsampling is enabled and the sequence length is not a
            multiple of ``self.downsample_factor``.
        """
        sent_len = x.dim()[0][1]
        batch_size = x[0].dim()[1]

        if self.downsample_factor > 1:
            if sent_len % self.downsample_factor != 0:
                raise ValueError(
                    "For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                    "Configure batcher accordingly.")
            if batch_mask is not None:
                batch_mask = batch_mask[:, ::self.downsample_factor]
            sent_len_out = sent_len // self.downsample_factor
            sent_len = sent_len_out
            out_mask = x.mask
            if out_mask is not None:
                out_mask = out_mask.lin_subsampled(
                    reduce_factor=self.downsample_factor)

            # Stack `downsample_factor` neighbouring frames into one. The new time
            # dimension is the integer sent_len_out; true division produced a
            # float here, which is not a valid tensor dimension.
            x = ExpressionSequence(expr_tensor=dy.reshape(
                x.as_tensor(),
                (x.dim()[0][0] * self.downsample_factor, sent_len_out),
                batch_size=batch_size),
                                   mask=out_mask)
        else:
            sent_len_out = sent_len
        residual = SAAMTimeDistributed()(x)
        if self.model_dim != self.input_dim * self.downsample_factor:
            # Dimensions differ, so project the residual before it is added back.
            residual = self.res_shortcut.transform(residual)

        heads_dim = self.head_count * self.dim_per_head

        # Concatenate all the words together for doing vectorized affine transform
        if self.kq_pos_encoding_type is None:
            # One joint projection, sliced into key / value / query thirds.
            kvq_lin = self.linear_kvq.transform(SAAMTimeDistributed()(x))
            key_up = self.shape_projection(
                dy.pick_range(kvq_lin, 0, heads_dim), batch_size)
            value_up = self.shape_projection(
                dy.pick_range(kvq_lin, heads_dim, 2 * heads_dim), batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kvq_lin, 2 * heads_dim, 3 * heads_dim),
                batch_size)
        else:
            assert self.kq_pos_encoding_type == "embedding"
            # Keys and queries see the positional embedding; values do not.
            encoding = self.kq_positional_embedder.embed_sent(
                sent_len).as_tensor()
            kq_lin = self.linear_kq.transform(SAAMTimeDistributed()(
                ExpressionSequence(
                    expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
            key_up = self.shape_projection(
                dy.pick_range(kq_lin, 0, heads_dim), batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kq_lin, heads_dim, 2 * heads_dim), batch_size)
            v_lin = self.linear_v.transform(SAAMTimeDistributed()(x))
            value_up = self.shape_projection(v_lin, batch_size)

        if self.cross_pos_encoding_type:
            assert self.cross_pos_encoding_type == "embedding"
            emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0,
                                 sent_len)
            emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0,
                                 sent_len)
            key_up = dy.reshape(key_up,
                                (sent_len, self.dim_per_head, self.head_count),
                                batch_size=batch_size)
            key_up = dy.concatenate_cols(
                [dy.cmult(key_up, emb1),
                 dy.cmult(key_up, emb2)])
            key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2),
                                batch_size=self.head_count * batch_size)
            query_up = dy.reshape(
                query_up, (sent_len, self.dim_per_head, self.head_count),
                batch_size=batch_size)
            query_up = dy.concatenate_cols(
                [dy.cmult(query_up, emb2),
                 dy.cmult(query_up, -emb1)])
            query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2),
                                  batch_size=self.head_count * batch_size)

        scaled = query_up * dy.transpose(
            key_up / math.sqrt(self.dim_per_head)
        )  # scale before the matrix multiplication to save memory

        # Apply Mask here
        if not self.ignore_masks:
            if att_mask is not None:
                att_mask_inp = att_mask * -100.0
                if self.downsample_factor > 1:
                    att_mask_inp = att_mask_inp[::self.downsample_factor, ::
                                                self.downsample_factor]
                scaled += dy.inputTensor(att_mask_inp)
            if batch_mask is not None:
                # reshape (batch, time) -> (time, head_count*batch), then *-100
                inp = np.resize(np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                                (sent_len, self.head_count, batch_size)),
                                (1, sent_len, self.head_count * batch_size)) \
                      * -100
                mask_expr = dy.inputTensor(inp, batched=True)
                scaled += mask_expr
            if self.diag_gauss_mask:
                # Entry (i, j, head) is -(i - j)^2 / 2, built without Python loops.
                positions = np.arange(sent_len)
                neg_half_sq_dist = -(
                    (positions[:, np.newaxis] - positions[np.newaxis, :])**
                    2) / 2.0
                diag_growing = np.repeat(neg_half_sq_dist[:, :, np.newaxis],
                                         self.head_count,
                                         axis=2)
                e_diag_gauss_mask = dy.inputTensor(diag_growing)
                e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
                if self.square_mask_std:
                    e_sigma = dy.square(e_sigma)
                e_sigma_sq_inv = dy.cdiv(
                    dy.ones(e_sigma.dim()[0], batch_size=batch_size),
                    dy.square(e_sigma))
                e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask,
                                                   e_sigma_sq_inv)
                scaled += dy.reshape(e_diag_gauss_mask_final,
                                     (sent_len, sent_len),
                                     batch_size=batch_size * self.head_count)

        # Computing Softmax here.
        attn = dy.softmax(scaled, d=1)
        if LOG_ATTENTION:
            yaml_logger.info({
                "key": "selfatt_mat_ax0",
                "value": np.average(attn.value(), axis=0).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1",
                "value": np.average(attn.value(), axis=1).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax0_ent",
                "value": entropy(attn.value()).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1_ent",
                "value": entropy(attn.value().transpose()).dumps(),
                "desc": self.desc
            })

        # NOTE(review): this unconditionally overrides any configured value, so
        # the branch below always runs and zeroes every head except head 0.
        # It looks like a debugging leftover - confirm before relying on it.
        self.select_att_head = 0
        if self.select_att_head is not None:
            attn = dy.reshape(attn, (sent_len, sent_len, self.head_count),
                              batch_size=batch_size)
            sel_mask = np.zeros((1, 1, self.head_count))
            sel_mask[0, 0, self.select_att_head] = 1.0
            attn = dy.cmult(attn, dy.inputTensor(sel_mask))
            attn = dy.reshape(attn, (sent_len, sent_len),
                              batch_size=self.head_count * batch_size)

        # Applying dropout to attention
        if p > 0.0:
            drop_attn = dy.dropout(attn, p)
        else:
            drop_attn = attn

        # Computing weighted attention score
        attn_prod = drop_attn * value_up

        # Reshaping the attn_prod to input query dimensions
        out = dy.reshape(attn_prod,
                         (sent_len_out, self.dim_per_head * self.head_count),
                         batch_size=batch_size)
        out = dy.transpose(out)
        out = dy.reshape(out, (self.model_dim, ),
                         batch_size=batch_size * sent_len_out)

        if self.plot_attention:
            from sklearn.metrics.pairwise import cosine_similarity
            assert batch_size == 1
            mats = []
            for i in range(attn.dim()[1]):
                mats.append(dy.pick_batch_elem(attn, i).npvalue())
                self.plot_att_mat(
                    mats[-1], "{}.sent_{}.head_{}.png".format(
                        self.plot_attention, self.plot_attention_counter, i),
                    300)
            avg_mat = np.average(mats, axis=0)
            self.plot_att_mat(
                avg_mat,
                "{}.sent_{}.head_avg.png".format(self.plot_attention,
                                                 self.plot_attention_counter),
                300)
            cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
            self.plot_att_mat(
                cosim_before, "{}.sent_{}.cosim_before.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            cosim_after = cosine_similarity(out.npvalue().T)
            self.plot_att_mat(
                cosim_after, "{}.sent_{}.cosim_after.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            self.plot_attention_counter += 1

        # Adding dropout and layer normalization
        if p > 0.0:
            res = dy.dropout(out, p) + residual
        else:
            res = out + residual
        ret = self.layer_norm.transform(res)
        return ret