Code Example #1
  def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    src = src.as_tensor()

    src_height = src.dim()[0][0]
    src_width = src.dim()[0][1]
    # src_channels = 1
    batch_size = src.dim()[1]

    # convolution and pooling layers
    # src dim is ((40, 1000), 128)
    src = padding(src, self.filter_width[0]+3)
    l1 = dy.rectify(dy.conv2d(src, dy.parameter(self.filters1), stride = [self.stride[0], self.stride[0]], is_valid = True)) # ((1, 1000, 64), 128)
    pool1 = dy.maxpooling2d(l1, (1, 4), (1,2), is_valid = True) #((1, 499, 64), 128)

    pool1 = padding(pool1, self.filter_width[1]+3)
    l2 = dy.rectify(dy.conv2d(pool1, dy.parameter(self.filters2), stride = [self.stride[1], self.stride[1]], is_valid = True))# ((1, 499, 512), 128)
    pool2 = dy.maxpooling2d(l2, (1, 4), (1,2), is_valid = True)#((1, 248, 512), 128)

    pool2 = padding(pool2, self.filter_width[2])
    l3 = dy.rectify(dy.conv2d(pool2, dy.parameter(self.filters3), stride = [self.stride[2], self.stride[2]], is_valid = True))# ((1, 248, 1024), 128)
    pool3 = dy.max_dim(l3, d = 1)

    my_norm = dy.l2_norm(pool3) + 1e-6
    output = dy.cdiv(pool3,my_norm)
    output = dy.reshape(output, (self.num_filters[2],), batch_size = batch_size)

    return ExpressionSequence(expr_tensor=output)
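
The padding() helper called above is not shown in this snippet. Below is a minimal sketch of one plausible implementation, assuming it simply appends zero frames along the time axis (dimension 1) so that the following conv2d calls with is_valid=True keep enough frames; the actual helper in the source project may differ (Code Example #8 calls a variant with a different signature).

import dynet as dy

def padding(src, pad_width):
    # Hypothetical helper (assumption, not from the source project):
    # append `pad_width` zero frames along the time axis (dimension 1)
    # of a batched tensor with at least two dimensions.
    dims, batch_size = src.dim()
    pad_dims = list(dims)
    pad_dims[1] = pad_width
    return dy.concatenate(
        [src, dy.zeroes(tuple(pad_dims), batch_size=batch_size)], d=1)
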
Code Example #2
    def transduce(self, sent: ExpressionSequence) -> ExpressionSequence:
        if self.pos_encoding_type == "trigonometric":
            if self.position_encoding_block is None or self.position_encoding_block.shape[
                    2] < len(sent):
                self.initialize_position_encoding(
                    int(len(sent) * 1.2),
                    self.input_dim if self.pos_encoding_combine == "add" else
                    self.pos_encoding_size)
            encoding = dy.inputTensor(
                self.position_encoding_block[0, :, :len(sent)])
        elif self.pos_encoding_type == "embedding":
            encoding = self.positional_embedder.embed_sent(
                len(sent)).as_tensor()
        elif self.pos_encoding_type:
            raise ValueError(f"unknown encoding type {self.pos_encoding_type}")
        if self.pos_encoding_type:
            if self.pos_encoding_combine == "add":
                sent = ExpressionSequence(expr_tensor=sent.as_tensor() +
                                          encoding,
                                          mask=sent.mask)
            else:  # concat
                sent = ExpressionSequence(expr_tensor=dy.concatenate(
                    [sent.as_tensor(), encoding]),
                                          mask=sent.mask)
        for module in self.modules:
            enc_sent = module.transduce(sent)
            sent = enc_sent
        self._final_states = [transducers.FinalTransducerState(sent[-1])]
        return sent
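
The trigonometric branch above relies on initialize_position_encoding() to fill self.position_encoding_block. A sketch of what such a block typically looks like: the standard sinusoidal encoding, shaped (1, dim, length) so that block[0, :, :len(sent)] can be sliced as in the code. The function name and exact frequency layout here are assumptions; the real implementation may differ.

import numpy as np

def make_trigonometric_encoding_block(length, dim):
    # Standard sinusoidal position encoding (assumes an even dim).
    num_timescales = dim // 2
    log_timescale_increment = np.log(10000.0) / max(num_timescales - 1, 1)
    inv_timescales = np.exp(np.arange(num_timescales) * -log_timescale_increment)
    scaled_time = np.arange(length)[:, np.newaxis] * inv_timescales[np.newaxis, :]
    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)  # (length, dim)
    return signal.T[np.newaxis, :, :]  # (1, dim, length)
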
Code Example #3
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        if expr_seq.dim()[1] > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}"
            )
        lattice = self.cur_src[0]
        Wx_iog = dy.parameter(self.p_Wx_iog)
        Wh_iog = dy.parameter(self.p_Wh_iog)
        b_iog = dy.parameter(self.p_b_iog)
        Wx_f = dy.parameter(self.p_Wx_f)
        Wh_f = dy.parameter(self.p_Wh_f)
        b_f = dy.parameter(self.p_b_f)
        h = []
        c = []

        batch_size = expr_seq.dim()[1]
        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        for node_i in range(lattice.sent_len()):
            cur_node = lattice.nodes[node_i]
            val = expr_seq[node_i]
            if self.dropout_rate > 0.0 and self.train:
                val = dy.cmult(val, self.dropout_mask_x)
            i_ft_list = []
            if len(cur_node.nodes_prev) == 0:
                tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
            else:
                h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
                tmp_iog = dy.affine_transform(
                    [b_iog, Wx_iog, val, Wh_iog, h_tilde])
                for pred in cur_node.nodes_prev:
                    i_ft_list.append(
                        dy.logistic(
                            dy.affine_transform(
                                [b_f, Wx_f, val, Wh_f, h[pred]])))
            i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
            i_aot = dy.pick_range(tmp_iog, self.hidden_dim,
                                  self.hidden_dim * 2)
            i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2,
                                  self.hidden_dim * 3)

            i_it = dy.logistic(i_ait)
            i_ot = dy.logistic(i_aot)
            i_gt = dy.tanh(i_agt)
            if len(cur_node.nodes_prev) == 0:
                c.append(dy.cmult(i_it, i_gt))
            else:
                fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
                for i in range(1, len(cur_node.nodes_prev)):
                    fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
                c.append(fc + dy.cmult(i_it, i_gt))
            h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
            if self.dropout_rate > 0.0 and self.train:
                h_t = dy.cmult(h_t, self.dropout_mask_h)
            h.append(h_t)
        self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
        return expression_seqs.ExpressionSequence(expr_list=h)
Code Example #4
File: convolution.py, Project: seeledu/xnmt-devel
    def transduce(self, embed_sent: ExpressionSequence) -> ExpressionSequence:
        src = embed_sent.as_tensor()

        sent_len = src.dim()[0][1]
        batch_size = src.dim()[1]
        pad_size = (self.window_receptor -
                    1) // 2  #TODO adapt it also for even window size

        src = dy.concatenate([
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size), src,
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)
        ],
                             d=1)
        padded_sent_len = sent_len + 2 * pad_size

        conv1 = dy.parameter(self.pConv1)
        bias1 = dy.parameter(self.pBias1)
        src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1),
                             batch_size=batch_size)
        cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

        hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1),
                                  batch_size=batch_size)
        if self.non_linearity == 'linear':
            hidden_layer = hidden_layer
        elif self.non_linearity == 'tanh':
            hidden_layer = dy.tanh(hidden_layer)
        elif self.non_linearity == 'relu':
            hidden_layer = dy.rectify(hidden_layer)
        elif self.non_linearity == 'sigmoid':
            hidden_layer = dy.logistic(hidden_layer)

        for conv_hid, bias_hid in self.builder_layers:
            hidden_layer = dy.conv2d_bias(hidden_layer,
                                          dy.parameter(conv_hid),
                                          dy.parameter(bias_hid),
                                          stride=[1, 1])
            hidden_layer = dy.reshape(hidden_layer,
                                      (self.internal_dim, sent_len, 1),
                                      batch_size=batch_size)
            if self.non_linearity == 'linear':
                hidden_layer = hidden_layer
            elif self.non_linearity == 'tanh':
                hidden_layer = dy.tanh(hidden_layer)
            elif self.non_linearity == 'relu':
                hidden_layer = dy.rectify(hidden_layer)
            elif self.non_linearity == 'sigmoid':
                hidden_layer = dy.logistic(hidden_layer)
        last_conv = dy.parameter(self.last_conv)
        last_bias = dy.parameter(self.last_bias)
        output = dy.conv2d_bias(hidden_layer,
                                last_conv,
                                last_bias,
                                stride=[1, 1])
        output = dy.reshape(output, (sent_len, self.output_dim),
                            batch_size=batch_size)
        output_seq = ExpressionSequence(expr_tensor=output)
        self._final_states = [FinalTransducerState(output_seq[-1])]
        return output_seq
Code Example #5
File: pyramidal.py, Project: seeledu/xnmt-devel
    def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
        """
    returns the list of output Expressions obtained by adding the given inputs
    to the current state, one by one.

    Args:
      es: an ExpressionSequence
    """
        es_list = [es]

        for layer_i, fb in enumerate(self.builder_layers):
            reduce_factor = self._reduce_factor_for_layer(layer_i)

            if es_list[0].mask is None: mask_out = None
            else: mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

            if self.downsampling_method == "concat" and len(
                    es_list[0]) % reduce_factor != 0:
                raise ValueError(
                    f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                    f"but got sequence length={len(es_list[0])} for reduce_factor={reduce_factor}. "
                    f"Set Batcher's pad_src_to_multiple argument accordingly.")
            fs = fb.transduce(es_list)
            if layer_i < len(self.builder_layers) - 1:
                if self.downsampling_method == "skip":
                    es_list = [
                        ExpressionSequence(expr_list=fs[::reduce_factor],
                                           mask=mask_out)
                    ]
                elif self.downsampling_method == "concat":
                    es_len = len(es_list[0])
                    es_list_fwd = []
                    for i in range(0, es_len, reduce_factor):
                        for j in range(reduce_factor):
                            if i == 0:
                                es_list_fwd.append([])
                            es_list_fwd[j].append(fs[i + j])
                    es_list = [
                        ExpressionSequence(expr_list=es_list_fwd[j],
                                           mask=mask_out)
                        for j in range(reduce_factor)
                    ]
                else:
                    raise RuntimeError(
                        f"unknown downsampling_method {self.downsampling_method}"
                    )
            else:
                # concat final outputs
                ret_es = ExpressionSequence(expr_list=[f for f in fs],
                                            mask=mask_out)

        self._final_states = [
            FinalTransducerState(fb.get_final_states()[0].main_expr(),
                                 fb.get_final_states()[0].cell_expr())
            for fb in self.builder_layers
        ]

        return ret_es
Code Example #6
File: self_attention.py, Project: seeledu/xnmt-devel
    def transduce(self, expr_seq: ExpressionSequence) -> ExpressionSequence:
        """
    transduce the sequence

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """

        Wq, Wk, Wv, Wo = [
            dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)
        ]
        bq, bk, bv, bo = [
            dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)
        ]

        # Start with a [(length, model_size) x batch] tensor
        x = expr_seq.as_transposed_tensor()
        x_len = x.dim()[0][0]
        x_batch = x.dim()[1]
        # Get the query key and value vectors
        # TODO: do we need bias broadcasting in DyNet?
        # q = dy.affine_transform([bq, x, Wq])
        # k = dy.affine_transform([bk, x, Wk])
        # v = dy.affine_transform([bv, x, Wv])
        q = bq + x * Wq
        k = bk + x * Wk
        v = bv + x * Wv

        # Split to batches [(length, head_dim) x batch * num_heads] tensor
        q, k, v = [
            dy.reshape(x, (x_len, self.head_dim),
                       batch_size=x_batch * self.num_heads) for x in (q, k, v)
        ]

        # Do scaled dot product [(length, length) x batch * num_heads], rows are queries, columns are keys
        attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
        if expr_seq.mask is not None:
            mask = dy.inputTensor(np.repeat(
                expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(),
                                  batched=True) * -1e10
            attn_score = attn_score + mask
        attn_prob = dy.softmax(attn_score, d=1)
        # Reduce using attention and resize to match [(length, model_size) x batch]
        o = dy.reshape(attn_prob * v, (x_len, self.input_dim),
                       batch_size=x_batch)
        # Final transformation
        # o = dy.affine_transform([bo, attn_prob * v, Wo])
        o = bo + o * Wo

        expr_seq = ExpressionSequence(expr_transposed_tensor=o,
                                      mask=expr_seq.mask)

        self._final_states = [FinalTransducerState(expr_seq[-1], None)]

        return expr_seq
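
For reference, per attention head the code above computes the standard scaled dot-product attention, with the batch mask added as a large negative bias before the softmax. This is only a compact restatement of the q/k/v lines, not additional functionality:

    attn_score = q * transpose(k) / sqrt(head_dim)
    attn_prob  = softmax(attn_score + mask)    # softmax over the key axis (d=1)
    o          = attn_prob * v                 # then projected as bo + o * Wo
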
Code Example #7
File: residual.py, Project: gmwe/xnmt
 def transduce(self, seq: ExpressionSequence) -> ExpressionSequence:
     seq_tensor = self.child.transduce(seq).as_tensor() + seq.as_tensor()
     if self.layer_norm:
         d = seq_tensor.dim()
         seq_tensor = dy.reshape(seq_tensor, (d[0][0], ),
                                 batch_size=d[0][1] * d[1])
         seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
         seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
     return ExpressionSequence(expr_tensor=seq_tensor)
Code Example #8
 def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
     src = src.as_tensor()
     # convolutional layer
     src = padding(src,
                   src.dim()[0][0],
                   src.dim()[0][1], self.filter_width, self.stride,
                   src.dim()[1])
     l1 = dy.rectify(
         dy.conv2d(src,
                   dy.parameter(self.filter_conv),
                   stride=[self.stride, self.stride],
                   is_valid=True))
     timestep = l1.dim()[0][1]
     features = l1.dim()[0][2]
     batch_size = l1.dim()[1]
     # transpose l1 to be (timestep, dim), but keep the batch_size.
     rhn_in = dy.reshape(l1, (timestep, features), batch_size=batch_size)
     rhn_in = [dy.pick(rhn_in, i) for i in range(timestep)]
     for l in range(self.rhn_num_hidden_layers):
         rhn_out = []
         # initialize the first state vector from the learned init parameter, keep the same batch size.
         prev_state = dy.parameter(self.init[l])
         # begin recurrent high way network
         for t in range(timestep):
             for m in range(0, self.rhn_microsteps):
                 H = dy.affine_transform([
                     dy.parameter(self.recur[l][m][1]),
                     dy.parameter(self.recur[l][m][0]), prev_state
                 ])
                 T = dy.affine_transform([
                     dy.parameter(self.recur[l][m][3]),
                     dy.parameter(self.recur[l][m][2]), prev_state
                 ])
                 if m == 0:
                     H += dy.parameter(self.linear[l][0]) * rhn_in[t]
                     T += dy.parameter(self.linear[l][1]) * rhn_in[t]
                 H = dy.tanh(H)
                 T = dy.logistic(T)
                 prev_state = dy.cmult(1 - T, prev_state) + dy.cmult(
                     T, H)  # ((1024, ), batch_size)
             rhn_out.append(prev_state)
         if self.residual and l > 0:
             rhn_out = [sum(x) for x in zip(rhn_out, rhn_in)]
         rhn_in = rhn_out
     # Compute the attention-weighted average of the activations
     rhn_in = dy.concatenate_cols(rhn_in)
     scores = dy.transpose(dy.parameter(self.attention[0][1])) * dy.tanh(
         dy.parameter(self.attention[0][0]) *
         rhn_in)  # ((1,510), batch_size)
     scores = dy.reshape(scores, (scores.dim()[0][1], ),
                         batch_size=scores.dim()[1])
     attn_out = rhn_in * dy.softmax(
         scores
     )  # rhn_in is ((1024, 510), batch_size); softmax is ((510,), batch_size)
     return ExpressionSequence(expr_tensor=attn_out)
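
In compact form, each microstep of the loop above performs the usual Recurrent Highway Network update. This merely restates the affine_transform calls, where recur[l][m][0..3] hold the recurrent weights and biases and linear[l][0..1] the input projections applied only at microstep m == 0:

    H = tanh(R_H * s + b_H + W_H * x_t)       # x_t term only when m == 0
    T = logistic(R_T * s + b_T + W_T * x_t)   # x_t term only when m == 0
    s = cmult(1 - T, s) + cmult(T, H)         # elementwise highway update
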
Code Example #9
  def transduce(self, es: ExpressionSequence) -> ExpressionSequence:
    """
    returns the list of output Expressions obtained by adding the given inputs
    to the current state, one by one, to both the forward and backward RNNs,
    and concatenating.

    Args:
      es: an ExpressionSequence
    """
    es_list = [es]

    for layer_i, (fb, bb) in enumerate(self.builder_layers):
      reduce_factor = self._reduce_factor_for_layer(layer_i)

      if es_list[0].mask is None: mask_out = None
      else: mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

      if self.downsampling_method=="concat" and len(es_list[0]) % reduce_factor != 0:
        raise ValueError(f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                         f"but got sequence length={len(es_list[0])} for reduce_factor={reduce_factor}. "
                         f"Set Batcher's pad_src_to_multiple argument accordingly.")
      fs = fb.transduce(es_list)
      bs = bb.transduce([ReversedExpressionSequence(es_item) for es_item in es_list])
      if layer_i < len(self.builder_layers) - 1:
        if self.downsampling_method=="skip":
          es_list = [ExpressionSequence(expr_list=fs[::reduce_factor], mask=mask_out),
                     ExpressionSequence(expr_list=bs[::reduce_factor][::-1], mask=mask_out)]
        elif self.downsampling_method=="concat":
          es_len = len(es_list[0])
          es_list_fwd = []
          es_list_bwd = []
          for i in range(0, es_len, reduce_factor):
            for j in range(reduce_factor):
              if i==0:
                es_list_fwd.append([])
                es_list_bwd.append([])
              es_list_fwd[j].append(fs[i+j])
              es_list_bwd[j].append(bs[len(es_list[0])-reduce_factor+j-i])
          es_list = [ExpressionSequence(expr_list=es_list_fwd[j], mask=mask_out) for j in range(reduce_factor)] + \
                    [ExpressionSequence(expr_list=es_list_bwd[j], mask=mask_out) for j in range(reduce_factor)]
        else:
          raise RuntimeError(f"unknown downsampling_method {self.downsampling_method}")
      else:
        # concat final outputs
        ret_es = ExpressionSequence(
          expr_list=[dy.concatenate([f, b]) for f, b in zip(fs, ReversedExpressionSequence(bs))], mask=mask_out)

    self._final_states = [FinalTransducerState(dy.concatenate([fb.get_final_states()[0].main_expr(),
                                                               bb.get_final_states()[0].main_expr()]),
                                               dy.concatenate([fb.get_final_states()[0].cell_expr(),
                                                               bb.get_final_states()[0].cell_expr()])) \
                          for (fb, bb) in self.builder_layers]

    return ret_es
Code Example #10
File: positional.py, Project: gmwe/xnmt
 def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
   sent_len = len(src)
   embeddings = dy.strided_select(dy.parameter(self.embedder), [1,1], [0,0], [self.input_dim, sent_len])
   if self.op == 'sum':
     output = embeddings + src.as_tensor()
   elif self.op == 'concat':
     output = dy.concatenate([embeddings, src.as_tensor()])
   else:
     raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
   output_seq = ExpressionSequence(expr_tensor=output, mask=src.mask)
   self._final_states = [FinalTransducerState(output_seq[-1])]
   return output_seq
Code Example #11
File: fixed_size_att.py, Project: msperber/misc
 def transduce(self, x: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
   x_T = x.as_transposed_tensor()
   scores = x_T * dy.parameter(self.W)
   if x.mask is not None:
     scores = x.mask.add_to_tensor_expr(scores, multiplicator=-100.0, time_first=True)
   if self.pos_enc_max:
     seq_len = x_T.dim()[0][0]
     pos_enc = self.pos_enc[:seq_len,:]
     scores = dy.cmult(scores, dy.inputTensor(pos_enc))
   attention = dy.softmax(scores)
   output_expr = x.as_tensor() * attention
   return expression_seqs.ExpressionSequence(expr_tensor=output_expr, mask=None)
Code Example #12
File: retrievers.py, Project: seeledu/xnmt-devel
 def exprseq_pooling(self, exprseq):
     # Reduce to vector
     exprseq = ExpressionSequence(
         expr_tensor=exprseq.mask.add_to_tensor_expr(
             exprseq.as_tensor(), -1e10),
         mask=exprseq.mask)
     if exprseq.expr_tensor is not None:
         if len(exprseq.expr_tensor.dim()[0]) > 1:
             return dy.max_dim(exprseq.expr_tensor, d=1)
         else:
             return exprseq.expr_tensor
     else:
         return dy.emax(exprseq.expr_list)
Code Example #13
  def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    src = src.as_tensor()

    src_height = src.dim()[0][0]
    src_width = 1
    batch_size = src.dim()[1]

    W = dy.parameter(self.pW)
    b = dy.parameter(self.pb)

    src = dy.reshape(src, (src_height, src_width), batch_size=batch_size) # ((276, 80, 3), 1)
    # convolution and pooling layers
    l1 = (W*src)+b
    output = dy.cdiv(l1,dy.sqrt(dy.squared_norm(l1)))
    return ExpressionSequence(expr_tensor=output)
Code Example #14
    def __call__(self, x: ExpressionSequence) -> tt.Tensor:
        """
    Move the time-dimension of an input expression into the batch dimension via a reshape.

    Args:
      x: expression of dimensions ((hidden, timesteps), batch_size)

    Returns:
      expression of dimensions ((hidden,), timesteps*batch_size)
    """
        batch_size = x[0].dim()[1]
        model_dim = x[0].dim()[0][0]
        seq_len = x.sent_len()
        total_words = seq_len * batch_size
        input_tensor = x.as_tensor()
        return dy.reshape(input_tensor, (model_dim, ), batch_size=total_words)
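
A minimal shape check of the reshape this wrapper performs, using a plain DyNet tensor with toy sizes in place of an ExpressionSequence (illustration only, not part of the source project):

import dynet as dy

dy.renew_cg()
x = dy.zeroes((4, 3), batch_size=2)             # ((hidden=4, timesteps=3), batch=2)
merged = dy.reshape(x, (4,), batch_size=3 * 2)  # time folded into the batch dimension
assert merged.dim() == ((4,), 6)
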
Code Example #15
File: base.py, Project: yzhen-li/xnmt
 def transduce(
     self, src: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     src_tensor = src.as_tensor()
     out_mask = src.mask
     if self.downsample_by > 1:
         assert len(src_tensor.dim()[0])==2, \
           f"Downsampling only supported for tensors of order two. Found dims {src_tensor.dim()}"
         (hidden_dim, seq_len), batch_size = src_tensor.dim()
         if seq_len % self.downsample_by != 0:
             raise ValueError(
                 "For downsampling, sequence lengths must be multiples of the total reduce factor. "
                 "Configure batcher accordingly.")
         src_tensor = dy.reshape(src_tensor,
                                 (hidden_dim * self.downsample_by,
                                  seq_len // self.downsample_by),
                                 batch_size=batch_size)
         if out_mask:
             out_mask = out_mask.lin_subsampled(
                 reduce_factor=self.downsample_by)
     output = self.transform.transform(src_tensor)
     if self.downsample_by == 1:
          if len(output.dim()[0]) != len(src_tensor.dim()[0]):  # can happen with seq length 1
             output = dy.reshape(output,
                                 src_tensor.dim()[0],
                                 batch_size=src_tensor.dim()[1])
     output_seq = expression_seqs.ExpressionSequence(expr_tensor=output,
                                                     mask=out_mask)
     self._final_states = [FinalTransducerState(output_seq[-1])]
     return output_seq
Code Example #16
    def transduce(
        self, seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:

        if self.train and self.dropout > 0.0:
            seq_tensor = dy.dropout(
                self.child.transduce(seq).as_tensor(),
                self.dropout) + seq.as_tensor()
        else:
            seq_tensor = self.child.transduce(
                seq).as_tensor() + seq.as_tensor()
        if self.layer_norm:
            d = seq_tensor.dim()
            seq_tensor = dy.reshape(seq_tensor, (d[0][0], ),
                                    batch_size=d[0][1] * d[1])
            seq_tensor = dy.layer_norm(seq_tensor, self.ln_g, self.ln_b)
            seq_tensor = dy.reshape(seq_tensor, d[0], batch_size=d[1])
        return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
Code Example #17
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
    transduce the sequence

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """

        # Start with a [(length, model_size) x batch] tensor
        # B x T x H -> B x H x T
        x = expr_seq.as_tensor()
        x_len = x.size()[1]
        x_batch = x.size()[0]
        # Get the query key and value vectors
        q = self.lin_q(x).transpose(1, 2).contiguous()
        k = self.lin_k(x).transpose(1, 2).contiguous()
        v = self.lin_v(x).transpose(1, 2).contiguous()
        # q = bq + x * Wq
        # k = bk + x * Wk
        # v = bv + x * Wv

        # Split to batches [(length, head_dim) x batch * num_heads] tensor
        q, k, v = [
            temp.view((x_batch * self.num_heads, self.head_dim, x_len))
            for temp in (q, k, v)
        ]

        # Do scaled dot product [batch*num_heads, length, length], rows are keys, columns are queries
        attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
        if expr_seq.mask is not None:
            mask = torch.Tensor(
                np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0) *
                -1e10).to(xnmt.device)
            attn_score = attn_score + mask.unsqueeze(2)
        attn_prob = torch.nn.Softmax(dim=1)(attn_score)
        # attn_prob = dy.softmax(attn_score, d=1)
        if self.train and self.dropout > 0.0:
            attn_prob = tt.dropout(attn_prob, self.dropout)
        # Reduce using attention and resize to match [(length, model_size) x batch]
        o = torch.matmul(v, attn_prob).view(x_batch, self.input_dim,
                                            x_len).transpose(1, 2)
        # Final transformation
        o = self.lin_o(o)
        # o = bo + o * Wo

        expr_seq = expression_seqs.ExpressionSequence(expr_tensor=o,
                                                      mask=expr_seq.mask)

        self._final_states = [
            transducers.FinalTransducerState(expr_seq[-1], None)
        ]

        return expr_seq
Code Example #18
  def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    mask = es.mask
    sent_len = len(es)
    es_expr = es.as_transposed_tensor()
    batch_size = es_expr.dim()[1]

    es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size)

    h_out = {}
    for direction in ["fwd", "bwd"]:
      # input convolutions
      gates_xt_bias = dy.conv2d_bias(es_chn, dy.parameter(self.params["x2all_" + direction]),
                                     dy.parameter(self.params["b_" + direction]), stride=(1, 1), is_valid=False)
      gates_xt_bias_list = [dy.pick_range(gates_xt_bias, i, i + 1) for i in range(sent_len)]

      h = []
      c = []
      for input_pos in range(sent_len):
        directional_pos = input_pos if direction == "fwd" else sent_len - input_pos - 1
        gates_t = gates_xt_bias_list[directional_pos]
        if input_pos > 0:
          # recurrent convolutions
          gates_h_t = dy.conv2d(h[-1], dy.parameter(self.params["h2all_" + direction]), stride=(1, 1), is_valid=False)
          gates_t += gates_h_t

        # standard LSTM logic
        if len(c) == 0:
          c_tm1 = dy.zeros((self.freq_dim * self.num_filters,), batch_size=batch_size)
        else:
          c_tm1 = c[-1]
        gates_t_reshaped = dy.reshape(gates_t, (4 * self.freq_dim * self.num_filters,), batch_size=batch_size)
        c_t = dy.reshape(dy.vanilla_lstm_c(c_tm1, gates_t_reshaped), (self.freq_dim * self.num_filters,),
                         batch_size=batch_size)
        h_t = dy.vanilla_lstm_h(c_t, gates_t_reshaped)
        h_t = dy.reshape(h_t, (1, self.freq_dim, self.num_filters,), batch_size=batch_size)

        if mask is None or np.isclose(np.sum(mask.np_arr[:, input_pos:input_pos + 1]), 0.0):
          c.append(c_t)
          h.append(h_t)
        else:
          c.append(
            mask.cmult_by_timestep_expr(c_t, input_pos, True) + mask.cmult_by_timestep_expr(c[-1], input_pos, False))
          h.append(
            mask.cmult_by_timestep_expr(h_t, input_pos, True) + mask.cmult_by_timestep_expr(h[-1], input_pos, False))

      h_out[direction] = h
    ret_expr = []
    for state_i in range(len(h_out["fwd"])):
      state_fwd = h_out["fwd"][state_i]
      state_bwd = h_out["bwd"][-1 - state_i]
      output_dim = (state_fwd.dim()[0][1] * state_fwd.dim()[0][2],)
      fwd_reshape = dy.reshape(state_fwd, output_dim, batch_size=batch_size)
      bwd_reshape = dy.reshape(state_bwd, output_dim, batch_size=batch_size)
      ret_expr.append(dy.concatenate([fwd_reshape, bwd_reshape], d=0 if self.reshape_output else 2))
    return expression_seqs.ExpressionSequence(expr_list=ret_expr, mask=mask)

  # TODO: implement get_final_states()
Code Example #19
File: residual.py, Project: yzhen-li/xnmt
    def transduce(
        self, seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:

        if self.train and self.dropout > 0.0:
            seq_tensor = tt.dropout(
                self.child.transduce(seq).as_tensor(),
                self.dropout) + seq.as_tensor()
        else:
            seq_tensor = self.child.transduce(
                seq).as_tensor() + seq.as_tensor()
        if self.layer_norm:
            batch_size = tt.batch_size(seq_tensor)
            merged_seq_tensor = tt.merge_time_batch_dims(seq_tensor)
            transformed_seq_tensor = self.layer_norm_component.transform(
                merged_seq_tensor)
            seq_tensor = tt.unmerge_time_batch_dims(transformed_seq_tensor,
                                                    batch_size)
        return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
Code Example #20
    def transduce(self, x: ExpressionSequence) -> ExpressionSequence:
        seq_len = len(x)
        batch_size = x[0].dim()[1]

        att_mask = None
        if self.diagonal_mask_width is not None:
            att_mask = np.ones((seq_len, seq_len))
            for i in range(seq_len):
                from_i = max(0, i - self.diagonal_mask_width // 2)
                to_i = min(seq_len, i + self.diagonal_mask_width // 2 + 1)
                att_mask[from_i:to_i, from_i:to_i] = 0.0

        mid = self.self_attn(x=x,
                             att_mask=att_mask,
                             batch_mask=x.mask.np_arr if x.mask else None,
                             p=self.dropout)
        if self.downsample_factor > 1:
            seq_len = int(math.ceil(seq_len / float(self.downsample_factor)))
        hidden_dim = mid.dim()[0][0]
        out_mask = x.mask
        if self.downsample_factor > 1 and out_mask is not None:
            out_mask = out_mask.lin_subsampled(
                reduce_factor=self.downsample_factor)
        if self.ff_lstm:
            mid_re = dy.reshape(mid, (hidden_dim, seq_len),
                                batch_size=batch_size)
            out = self.feed_forward.transduce(
                ExpressionSequence(expr_tensor=mid_re, mask=out_mask))
            out = dy.reshape(out.as_tensor(), (hidden_dim, ),
                             batch_size=seq_len * batch_size)
        else:
            out = self.feed_forward.transduce(mid, p=self.dropout)

        self._recent_output = out
        return ExpressionSequence(expr_tensor=dy.reshape(
            out, (out.dim()[0][0], seq_len), batch_size=batch_size),
                                  mask=out_mask)
Code Example #21
File: positional.py, Project: yzhen-li/xnmt
 def transduce(
     self, src: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     sent_len = src.sent_len()
     batch_size = tt.batch_size(src[0])
     embeddings = self.embeddings(
         torch.tensor([list(range(sent_len))] * batch_size).to(xnmt.device))
     # embeddings = dy.strided_select(dy.parameter(self.embedder), [1,1], [0,0], [self.input_dim, sent_len])
     if self.op == 'sum':
         output = embeddings + src.as_tensor()
     elif self.op == 'concat':
         output = tt.concatenate([embeddings, src.as_tensor()])
     else:
         raise ValueError(
             f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")'
         )
     if self.train and self.dropout > 0.0:
         output = tt.dropout(output, self.dropout)
     output_seq = expression_seqs.ExpressionSequence(expr_tensor=output,
                                                     mask=src.mask)
     self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
     return output_seq
Code Example #22
File: convolution.py, Project: yzhen-li/xnmt
 def transduce(
     self, x: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     expr = x.as_transposed_tensor()
     batch_size, hidden_dim, seq_len = expr.size()
     expr = expr.view((batch_size, self.in_channels,
                       hidden_dim // self.in_channels, seq_len))
     expr = self.cnn_layer(expr)
     if self.use_pooling:
         expr = self.pooling_layer(expr)
     expr = self.activation_fct(expr)
     batch_size, out_chn, out_h, seq_len = expr.size()
     expr = expr.view((batch_size, out_chn * out_h, seq_len))
     output_seq = expression_seqs.ExpressionSequence(
         expr_transposed_tensor=expr,
         mask=x.mask.lin_subsampled(trg_len=seq_len) if x.mask else None)
     self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
     return output_seq
Code Example #23
 def transduce(self,
               embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
     batch_size = embed_sent[0].dim()[1]
     actions = self.sample_segmentation(embed_sent, batch_size)
     embeddings = dy.concatenate(embed_sent.expr_list, d=1)
     embeddings.value()
     #
     composed_words = []
     for i in range(batch_size):
         sequence = dy.pick_batch_elem(embeddings, i)
         # For each sampled segmentations
         lower_bound = 0
         for j, upper_bound in enumerate(actions[i]):
             if self.no_char_embed:
                 char_sequence = []
             else:
                 char_sequence = dy.pick_range(sequence, lower_bound,
                                               upper_bound + 1, 1)
             composed_words.append(
                 (char_sequence, i, j, lower_bound, upper_bound + 1))
             lower_bound = upper_bound + 1
     outputs = self.segment_composer.compose(composed_words, batch_size)
     # Padding + return
     try:
         if self.length_prior:
             seg_size_unpadded = [
                 len(outputs[i]) for i in range(batch_size)
             ]
         sampled_sentence, segment_mask = self.pad(outputs)
         expr_seq = ExpressionSequence(
             expr_tensor=dy.concatenate_to_batch(sampled_sentence),
             mask=segment_mask)
         return self.final_transducer.transduce(expr_seq)
     finally:
         if self.length_prior:
             self.seg_size_unpadded = seg_size_unpadded
         self.compose_output = outputs
         self.segment_actions = actions
         if not self.train and self.is_reporting():
             if len(actions) == 1:  # Support only AccuracyEvalTask
                 self.report_sent_info({"segment_actions": actions})
Code Example #24
File: base.py, Project: yzhen-li/xnmt
 def transduce(
     self, src: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     src_tensor = src.as_tensor()
     out_mask = src.mask
     if self.downsample_by > 1:
         assert src_tensor.dim()==3, \
           f"Downsampling only supported for tensors of order two (+ batch). Found dims {src_tensor.size()}"
         batch_size, seq_len, hidden_dim = src_tensor.size()
         if seq_len % self.downsample_by != 0:
             raise ValueError(
                 "For downsampling, sequence lengths must be multiples of the total reduce factor. "
                 "Configure batcher accordingly.")
         src_tensor = src_tensor.view(
             (batch_size, seq_len // self.downsample_by,
              hidden_dim * self.downsample_by))
         if out_mask:
             out_mask = out_mask.lin_subsampled(
                 reduce_factor=self.downsample_by)
     output = self.transform.transform(src_tensor)
     output_seq = expression_seqs.ExpressionSequence(expr_tensor=output,
                                                     mask=out_mask)
     self._final_states = [FinalTransducerState(output_seq[-1])]
     return output_seq
Code Example #25
File: norms.py, Project: yzhen-li/xnmt
 def transduce(
     self, es: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     output = self.transform(es.as_tensor(), es.mask)
     return expression_seqs.ExpressionSequence(expr_tensor=output,
                                               mask=es.mask)
Code Example #26
File: lattice.py, Project: yzhen-li/xnmt
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        if expr_seq.batch_size() > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.batch_size()}"
            )
        lattice = self.cur_src[0]
        Wx_iog = dy.parameter(self.p_Wx_iog)
        Wh_iog = dy.parameter(self.p_Wh_iog)
        b_iog = dy.parameter(self.p_b_iog)
        Wx_f = dy.parameter(self.p_Wx_f)
        Wh_f = dy.parameter(self.p_Wh_f)
        b_f = dy.parameter(self.p_b_f)
        h = {}
        c = {}
        h_list = []

        batch_size = expr_seq.batch_size()
        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        for i, cur_node_id in enumerate(lattice.nodes):
            prev_node = lattice.graph.predecessors(cur_node_id)
            val = expr_seq[i]
            if self.dropout_rate > 0.0 and self.train:
                val = dy.cmult(val, self.dropout_mask_x)
            i_ft_list = []
            if len(prev_node) == 0:
                tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
            else:
                h_tilde = sum(h[pred] for pred in prev_node)
                tmp_iog = dy.affine_transform(
                    [b_iog, Wx_iog, val, Wh_iog, h_tilde])
                for pred in prev_node:
                    i_ft_list.append(
                        dy.logistic(
                            dy.affine_transform(
                                [b_f, Wx_f, val, Wh_f, h[pred]])))
            i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
            i_aot = dy.pick_range(tmp_iog, self.hidden_dim,
                                  self.hidden_dim * 2)
            i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2,
                                  self.hidden_dim * 3)

            i_it = dy.logistic(i_ait)
            i_ot = dy.logistic(i_aot)
            i_gt = dy.tanh(i_agt)
            if len(prev_node) == 0:
                c[cur_node_id] = dy.cmult(i_it, i_gt)
            else:
                fc = dy.cmult(i_ft_list[0], c[prev_node[0]])
                for i in range(1, len(prev_node)):
                    fc += dy.cmult(i_ft_list[i], c[prev_node[i]])
                c[cur_node_id] = fc + dy.cmult(i_it, i_gt)
            h_t = dy.cmult(i_ot, dy.tanh(c[cur_node_id]))
            if self.dropout_rate > 0.0 and self.train:
                h_t = dy.cmult(h_t, self.dropout_mask_h)
            h[cur_node_id] = h_t
            h_list.append(h_t)
        self._final_states = [
            transducers.FinalTransducerState(h_list[-1], h_list[-1])
        ]
        return expression_seqs.ExpressionSequence(expr_list=h_list)
Code Example #27
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence (will be accessed via tensor_expr)
    Return:
      expression sequence
    """

        if isinstance(expr_seq, list):
            mask_out = expr_seq[0].mask
            seq_len = len(expr_seq[0])
            batch_size = expr_seq[0].dim()[1]
            tensors = [e.as_tensor() for e in expr_seq]
            input_tensor = dy.reshape(dy.concatenate(tensors),
                                      (seq_len, 1, self.input_dim),
                                      batch_size=batch_size)
        else:
            mask_out = expr_seq.mask
            seq_len = len(expr_seq)
            batch_size = expr_seq.dim()[1]
            input_tensor = dy.reshape(dy.transpose(expr_seq.as_tensor()),
                                      (seq_len, 1, self.input_dim),
                                      batch_size=batch_size)

        if self.dropout > 0.0 and self.train:
            input_tensor = dy.dropout(input_tensor, self.dropout)

        proj_inp = dy.conv2d_bias(input_tensor,
                                  dy.parameter(self.p_f),
                                  dy.parameter(self.p_b),
                                  stride=(self.stride, 1),
                                  is_valid=False)
        reduced_seq_len = proj_inp.dim()[0][0]
        proj_inp = dy.transpose(
            dy.reshape(proj_inp, (reduced_seq_len, self.hidden_dim * 3),
                       batch_size=batch_size))
        # proj_inp dims: (hidden, 1, seq_len), batch_size
        if self.stride > 1 and mask_out is not None:
            mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

        h = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
        c = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
        for t in range(reduced_seq_len):
            f_t = dy.logistic(
                dy.strided_select(proj_inp, [], [0, t],
                                  [self.hidden_dim, t + 1]))
            o_t = dy.logistic(
                dy.strided_select(proj_inp, [], [self.hidden_dim, t],
                                  [self.hidden_dim * 2, t + 1]))
            z_t = dy.tanh(
                dy.strided_select(proj_inp, [], [self.hidden_dim * 2, t],
                                  [self.hidden_dim * 3, t + 1]))

            if self.dropout > 0.0 and self.train:
                retention_rate = 1.0 - self.dropout
                dropout_mask = dy.random_bernoulli((self.hidden_dim, 1),
                                                   retention_rate,
                                                   batch_size=batch_size)
                f_t = 1.0 - dy.cmult(
                    dropout_mask, 1.0 - f_t
                )  # TODO: would be easy to make a zoneout dynet operation to save memory

            i_t = 1.0 - f_t

            if t == 0:
                c_t = dy.cmult(i_t, z_t)
            else:
                c_t = dy.cmult(f_t, c[-1]) + dy.cmult(i_t, z_t)
            h_t = dy.cmult(
                o_t, c_t)  # note: LSTM would use dy.tanh(c_t) instead of c_t
            if mask_out is None or np.isclose(
                    np.sum(mask_out.np_arr[:, t:t + 1]), 0.0):
                c.append(c_t)
                h.append(h_t)
            else:
                c.append(
                    mask_out.cmult_by_timestep_expr(c_t, t, True) +
                    mask_out.cmult_by_timestep_expr(c[-1], t, False))
                h.append(
                    mask_out.cmult_by_timestep_expr(h_t, t, True) +
                    mask_out.cmult_by_timestep_expr(h[-1], t, False))

        self._final_states = [transducers.FinalTransducerState(dy.reshape(h[-1], (self.hidden_dim,), batch_size=batch_size), \
                                                               dy.reshape(c[-1], (self.hidden_dim,),
                                                                          batch_size=batch_size))]
        return expression_seqs.ExpressionSequence(expr_list=h[1:],
                                                  mask=mask_out)
Code Example #28
    def __call__(self, x: ExpressionSequence, att_mask: np.ndarray,
                 batch_mask: np.ndarray, p: numbers.Real):
        """
    x: expression of dimensions (input_dim, time) x batch
    att_mask: numpy array of dimensions (time, time); pre-transposed
    batch_mask: numpy array of dimensions (batch, time)
    p: dropout prob
    """
        sent_len = x.dim()[0][1]
        batch_size = x[0].dim()[1]

        if self.downsample_factor > 1:
            if sent_len % self.downsample_factor != 0:
                raise ValueError(
                    "For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                    "Configure batcher accordingly.")
            if batch_mask is not None:
                batch_mask = batch_mask[:, ::self.downsample_factor]
            sent_len_out = sent_len // self.downsample_factor
            sent_len = sent_len_out
            out_mask = x.mask
            if self.downsample_factor > 1 and out_mask is not None:
                out_mask = out_mask.lin_subsampled(
                    reduce_factor=self.downsample_factor)

            x = ExpressionSequence(expr_tensor=dy.reshape(
                x.as_tensor(), (x.dim()[0][0] * self.downsample_factor,
                                x.dim()[0][1] // self.downsample_factor),
                batch_size=batch_size),
                                   mask=out_mask)
            residual = SAAMTimeDistributed()(x)
        else:
            residual = SAAMTimeDistributed()(x)
            sent_len_out = sent_len
        if self.model_dim != self.input_dim * self.downsample_factor:
            residual = self.res_shortcut.transform(residual)

        # Concatenate all the words together for doing vectorized affine transform
        if self.kq_pos_encoding_type is None:
            kvq_lin = self.linear_kvq.transform(SAAMTimeDistributed()(x))
            key_up = self.shape_projection(
                dy.pick_range(kvq_lin, 0, self.head_count * self.dim_per_head),
                batch_size)
            value_up = self.shape_projection(
                dy.pick_range(kvq_lin, self.head_count * self.dim_per_head,
                              2 * self.head_count * self.dim_per_head),
                batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kvq_lin, 2 * self.head_count * self.dim_per_head,
                              3 * self.head_count * self.dim_per_head),
                batch_size)
        else:
            assert self.kq_pos_encoding_type == "embedding"
            encoding = self.kq_positional_embedder.embed_sent(
                sent_len).as_tensor()
            kq_lin = self.linear_kq.transform(SAAMTimeDistributed()(
                ExpressionSequence(
                    expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
            key_up = self.shape_projection(
                dy.pick_range(kq_lin, 0, self.head_count * self.dim_per_head),
                batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kq_lin, self.head_count * self.dim_per_head,
                              2 * self.head_count * self.dim_per_head),
                batch_size)
            v_lin = self.linear_v.transform(SAAMTimeDistributed()(x))
            value_up = self.shape_projection(v_lin, batch_size)

        if self.cross_pos_encoding_type:
            assert self.cross_pos_encoding_type == "embedding"
            emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0,
                                 sent_len)
            emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0,
                                 sent_len)
            key_up = dy.reshape(key_up,
                                (sent_len, self.dim_per_head, self.head_count),
                                batch_size=batch_size)
            key_up = dy.concatenate_cols(
                [dy.cmult(key_up, emb1),
                 dy.cmult(key_up, emb2)])
            key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2),
                                batch_size=self.head_count * batch_size)
            query_up = dy.reshape(
                query_up, (sent_len, self.dim_per_head, self.head_count),
                batch_size=batch_size)
            query_up = dy.concatenate_cols(
                [dy.cmult(query_up, emb2),
                 dy.cmult(query_up, -emb1)])
            query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2),
                                  batch_size=self.head_count * batch_size)

        scaled = query_up * dy.transpose(
            key_up / math.sqrt(self.dim_per_head)
        )  # scale before the matrix multiplication to save memory

        # Apply Mask here
        if not self.ignore_masks:
            if att_mask is not None:
                att_mask_inp = att_mask * -100.0
                if self.downsample_factor > 1:
                    att_mask_inp = att_mask_inp[::self.downsample_factor, ::
                                                self.downsample_factor]
                scaled += dy.inputTensor(att_mask_inp)
            if batch_mask is not None:
                # reshape (batch, time) -> (time, head_count*batch), then *-100
                inp = np.resize(np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                                (sent_len, self.head_count, batch_size)),
                                (1, sent_len, self.head_count * batch_size)) \
                      * -100
                mask_expr = dy.inputTensor(inp, batched=True)
                scaled += mask_expr
            if self.diag_gauss_mask:
                diag_growing = np.zeros((sent_len, sent_len, self.head_count))
                for i in range(sent_len):
                    for j in range(sent_len):
                        diag_growing[i, j, :] = -(i - j)**2 / 2.0
                e_diag_gauss_mask = dy.inputTensor(diag_growing)
                e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
                if self.square_mask_std:
                    e_sigma = dy.square(e_sigma)
                e_sigma_sq_inv = dy.cdiv(
                    dy.ones(e_sigma.dim()[0], batch_size=batch_size),
                    dy.square(e_sigma))
                e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask,
                                                   e_sigma_sq_inv)
                scaled += dy.reshape(e_diag_gauss_mask_final,
                                     (sent_len, sent_len),
                                     batch_size=batch_size * self.head_count)

        # Computing Softmax here.
        attn = dy.softmax(scaled, d=1)
        if LOG_ATTENTION:
            yaml_logger.info({
                "key": "selfatt_mat_ax0",
                "value": np.average(attn.value(), axis=0).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1",
                "value": np.average(attn.value(), axis=1).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax0_ent",
                "value": entropy(attn.value()).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1_ent",
                "value": entropy(attn.value().transpose()).dumps(),
                "desc": self.desc
            })

        self.select_att_head = 0
        if self.select_att_head is not None:
            attn = dy.reshape(attn, (sent_len, sent_len, self.head_count),
                              batch_size=batch_size)
            sel_mask = np.zeros((1, 1, self.head_count))
            sel_mask[0, 0, self.select_att_head] = 1.0
            attn = dy.cmult(attn, dy.inputTensor(sel_mask))
            attn = dy.reshape(attn, (sent_len, sent_len),
                              batch_size=self.head_count * batch_size)

        # Applying dropout to attention
        if p > 0.0:
            drop_attn = dy.dropout(attn, p)
        else:
            drop_attn = attn

        # Computing weighted attention score
        attn_prod = drop_attn * value_up

        # Reshaping the attn_prod to input query dimensions
        out = dy.reshape(attn_prod,
                         (sent_len_out, self.dim_per_head * self.head_count),
                         batch_size=batch_size)
        out = dy.transpose(out)
        out = dy.reshape(out, (self.model_dim, ),
                         batch_size=batch_size * sent_len_out)
        #     out = dy.reshape_transpose_reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), (self.model_dim,), pre_batch_size=batch_size, post_batch_size=batch_size*sent_len_out)

        if self.plot_attention:
            from sklearn.metrics.pairwise import cosine_similarity
            assert batch_size == 1
            mats = []
            for i in range(attn.dim()[1]):
                mats.append(dy.pick_batch_elem(attn, i).npvalue())
                self.plot_att_mat(
                    mats[-1], "{}.sent_{}.head_{}.png".format(
                        self.plot_attention, self.plot_attention_counter, i),
                    300)
            avg_mat = np.average(mats, axis=0)
            self.plot_att_mat(
                avg_mat,
                "{}.sent_{}.head_avg.png".format(self.plot_attention,
                                                 self.plot_attention_counter),
                300)
            cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
            self.plot_att_mat(
                cosim_before, "{}.sent_{}.cosim_before.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            cosim_after = cosine_similarity(out.npvalue().T)
            self.plot_att_mat(
                cosim_after, "{}.sent_{}.cosim_after.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            self.plot_attention_counter += 1

        # Adding dropout and layer normalization
        if p > 0.0:
            res = dy.dropout(out, p) + residual
        else:
            res = out + residual
        ret = self.layer_norm.transform(res)
        return ret
Code Example #29
 def transduce(self, embed):
   self.seq_transducer.transduce(ExpressionSequence(expr_tensor=embed))
   return self.seq_transducer.get_final_states()[-1].main_expr()