Example 1
    def transduce(self, sent: ExpressionSequence) -> ExpressionSequence:
        if self.pos_encoding_type == "trigonometric":
            if self.position_encoding_block is None or self.position_encoding_block.shape[
                    2] < len(sent):
                self.initialize_position_encoding(
                    int(len(sent) * 1.2),
                    self.input_dim if self.pos_encoding_combine == "add" else
                    self.pos_encoding_size)
            encoding = dy.inputTensor(
                self.position_encoding_block[0, :, :len(sent)])
        elif self.pos_encoding_type == "embedding":
            encoding = self.positional_embedder.embed_sent(
                len(sent)).as_tensor()
        elif self.pos_encoding_type:
            raise ValueError(f"unknown encoding type {self.pos_encoding_type}")
        if self.pos_encoding_type:
            if self.pos_encoding_combine == "add":
                sent = ExpressionSequence(expr_tensor=sent.as_tensor() +
                                          encoding,
                                          mask=sent.mask)
            else:  # concat
                sent = ExpressionSequence(expr_tensor=dy.concatenate(
                    [sent.as_tensor(), encoding]),
                                          mask=sent.mask)
        for module in self.modules:
            sent = module.transduce(sent)
        self._final_states = [transducers.FinalTransducerState(sent[-1])]
        return sent
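The trigonometric branch indexes a precomputed (1, dim, max_len) block. As a point of reference, here is a minimal NumPy sketch of the standard sinusoidal formulation such a block presumably corresponds to (illustrative only, not the xnmt implementation; assumes an even dim):

import numpy as np

def sinusoidal_position_block(max_len, dim):
    # returns a (1, dim, max_len) block of sin/cos position encodings (dim assumed even)
    positions = np.arange(max_len)[:, None]                              # (max_len, 1)
    div_term = np.exp(np.arange(0, dim, 2) * -(np.log(10000.0) / dim))   # (dim/2,)
    block = np.zeros((max_len, dim))
    block[:, 0::2] = np.sin(positions * div_term)
    block[:, 1::2] = np.cos(positions * div_term)
    return block.T[None, :, :]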
Example 2
    def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence'):
        """
    transduce the sequence

    Args:
      expr_seq: expression sequence
    Returns:
      expression sequence
    """
        batch_size = expr_seq[0].dim()[1]
        seq_len = len(expr_seq)

        output_exps = []
        for pos_i in range(seq_len):
            input_i = expr_seq[pos_i]
            affine = self.linear_layer(input_i)
            # affine = dy.affine_transform([dy.parameter(self.p_b), dy.parameter(self.p_W), input_i])
            if self.train and self.dropout_rate:
                affine = dy.dropout(affine, self.dropout_rate)
            if self.gumbel:
                affine = affine + dy.random_gumbel(dim=affine.dim()[0],
                                                   batch_size=batch_size)
            softmax_out = dy.softmax(affine)
            # embedded = self.emb_layer(softmax_out)
            embedded = dy.parameter(self.p_E) * softmax_out
            if self.residual:
                embedded = embedded + input_i
            output_exps.append(embedded)

        self._final_states = [
            transducers.FinalTransducerState(main_expr=embedded)
        ]

        return expression_seqs.ExpressionSequence(expr_list=output_exps,
                                                  mask=expr_seq.mask)
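The gumbel branch perturbs the affine output with Gumbel(0, 1) noise before the softmax. A minimal NumPy sketch of that perturbation (illustrative only, not the xnmt/DyNet API):

import numpy as np

def gumbel_softmax_sample(logits, rng=np.random.default_rng()):
    # add Gumbel(0, 1) noise (-log(-log(u))) and take a softmax over the perturbed logits
    u = rng.uniform(low=1e-9, high=1.0, size=logits.shape)
    z = logits - np.log(-np.log(u))
    z = z - z.max()  # numerical stability
    e = np.exp(z)
    return e / e.sum()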
Example 3
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        if expr_seq.dim()[1] > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}"
            )
        lattice = self.cur_src[0]
        Wx_iog = dy.parameter(self.p_Wx_iog)
        Wh_iog = dy.parameter(self.p_Wh_iog)
        b_iog = dy.parameter(self.p_b_iog)
        Wx_f = dy.parameter(self.p_Wx_f)
        Wh_f = dy.parameter(self.p_Wh_f)
        b_f = dy.parameter(self.p_b_f)
        h = []
        c = []

        batch_size = expr_seq.dim()[1]
        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        for node_i in range(lattice.sent_len()):
            cur_node = lattice.nodes[node_i]
            val = expr_seq[node_i]
            if self.dropout_rate > 0.0 and self.train:
                val = dy.cmult(val, self.dropout_mask_x)
            i_ft_list = []
            if len(cur_node.nodes_prev) == 0:
                tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
            else:
                h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
                tmp_iog = dy.affine_transform(
                    [b_iog, Wx_iog, val, Wh_iog, h_tilde])
                for pred in cur_node.nodes_prev:
                    i_ft_list.append(
                        dy.logistic(
                            dy.affine_transform(
                                [b_f, Wx_f, val, Wh_f, h[pred]])))
            i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
            i_aot = dy.pick_range(tmp_iog, self.hidden_dim,
                                  self.hidden_dim * 2)
            i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2,
                                  self.hidden_dim * 3)

            i_it = dy.logistic(i_ait)
            i_ot = dy.logistic(i_aot)
            i_gt = dy.tanh(i_agt)
            if len(cur_node.nodes_prev) == 0:
                c.append(dy.cmult(i_it, i_gt))
            else:
                fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
                for i in range(1, len(cur_node.nodes_prev)):
                    fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
                c.append(fc + dy.cmult(i_it, i_gt))
            h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
            if self.dropout_rate > 0.0 and self.train:
                h_t = dy.cmult(h_t, self.dropout_mask_h)
            h.append(h_t)
        self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
        return expression_seqs.ExpressionSequence(expr_list=h)
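Per node, the loop computes input/output/update gates from the node input and the sum of predecessor hidden states, plus one forget gate per predecessor whose gated cell states are summed. A minimal NumPy sketch of a single node update (illustrative only):

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

def lattice_node_update(x, pred_h, pred_c, Wx_iog, Wh_iog, b_iog, Wx_f, Wh_f, b_f, hidden_dim):
    # i/o/g pre-activations from the node input and the sum of predecessor hidden states
    if pred_h:
        iog = b_iog + Wx_iog @ x + Wh_iog @ sum(pred_h)
    else:
        iog = b_iog + Wx_iog @ x
    i = sigmoid(iog[:hidden_dim])
    o = sigmoid(iog[hidden_dim:2 * hidden_dim])
    g = np.tanh(iog[2 * hidden_dim:])
    # one forget gate per predecessor; gated predecessor cells are summed (child-sum style)
    c = i * g
    for h_p, c_p in zip(pred_h, pred_c):
        f_p = sigmoid(b_f + Wx_f @ x + Wh_f @ h_p)
        c = c + f_p * c_p
    h = o * np.tanh(c)
    return h, c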
Example 4
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
    transduce the sequence

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """

        # Start with a [(length, model_size) x batch] tensor
        # B x T x H -> B x H x T
        x = expr_seq.as_tensor()
        x_len = x.size()[1]
        x_batch = x.size()[0]
        # Get the query key and value vectors
        q = self.lin_q(x).transpose(1, 2).contiguous()
        k = self.lin_k(x).transpose(1, 2).contiguous()
        v = self.lin_v(x).transpose(1, 2).contiguous()
        # q = bq + x * Wq
        # k = bk + x * Wk
        # v = bv + x * Wv

        # Split to batches [(length, head_dim) x batch * num_heads] tensor
        q, k, v = [
            temp.view((x_batch * self.num_heads, self.head_dim, x_len))
            for temp in (q, k, v)
        ]

        # Do scaled dot product [batch*num_heads, length, length], rows are keys, columns are queries
        attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
        if expr_seq.mask is not None:
            mask = torch.Tensor(
                np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0) *
                -1e10).to(xnmt.device)
            attn_score = attn_score + mask.unsqueeze(2)
        attn_prob = torch.nn.Softmax(dim=1)(attn_score)
        # attn_prob = dy.softmax(attn_score, d=1)
        if self.train and self.dropout > 0.0:
            attn_prob = tt.dropout(attn_prob, self.dropout)
        # Reduce using attention and resize to match [(length, model_size) x batch]
        o = torch.matmul(v, attn_prob).view(x_batch, self.input_dim,
                                            x_len).transpose(1, 2)
        # Final transformation
        o = self.lin_o(o)
        # o = bo + o * Wo

        expr_seq = expression_seqs.ExpressionSequence(expr_tensor=o,
                                                      mask=expr_seq.mask)

        self._final_states = [
            transducers.FinalTransducerState(expr_seq[-1], None)
        ]

        return expr_seq
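Per head, the computation above is scaled dot-product attention with padded keys pushed to -1e10 before the softmax. A minimal NumPy sketch in the conventional orientation (rows are queries), illustrative only:

import numpy as np

def scaled_dot_product_attention(q, k, v, key_pad_mask=None):
    # q, k, v: (seq_len, head_dim); key_pad_mask: (seq_len,), 1.0 for padded key positions
    scores = q @ k.T / np.sqrt(q.shape[-1])
    if key_pad_mask is not None:
        scores = scores + key_pad_mask[None, :] * -1e10
    scores = scores - scores.max(axis=-1, keepdims=True)
    probs = np.exp(scores)
    probs = probs / probs.sum(axis=-1, keepdims=True)
    return probs @ v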
Example 5
  def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
    """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """
    if isinstance(expr_seq, expression_seqs.ExpressionSequence):
      expr_seq = [expr_seq]
    batch_size = expr_seq[0][0].dim()[1]
    seq_len = len(expr_seq[0])

    if self.dropout_rate > 0.0 and self.train:
      self.set_dropout_masks(batch_size=batch_size)

    cur_input = expr_seq
    self._final_states = []
    for layer_i in range(self.num_layers):
      h = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
      c = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
      for pos_i in range(seq_len):
        x_t = [cur_input[j][pos_i] for j in range(len(cur_input))]
        if isinstance(x_t, dy.Expression):
          x_t = [x_t]
        elif type(x_t) != list:
          x_t = list(x_t)
        if sum([x_t_i.dim()[0][0] for x_t_i in x_t]) != self.total_input_dim:
          found_dim = sum([x_t_i.dim()[0][0] for x_t_i in x_t])
          raise ValueError(f"VanillaLSTMGates: x_t has inconsistent dimension {found_dim}, expecting {self.total_input_dim}")
        if self.dropout_rate > 0.0 and self.train:
          # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
          gates_t = dy.vanilla_lstm_gates_dropout_concat(x_t,
                                                         h[-1],
                                                         self.Wx[layer_i],
                                                         self.Wh[layer_i],
                                                         self.b[layer_i],
                                                         self.dropout_mask_x[layer_i],
                                                         self.dropout_mask_h[layer_i],
                                                         self.weightnoise_std if self.train else 0.0)
        else:
          gates_t = dy.vanilla_lstm_gates_concat(x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i], self.weightnoise_std if self.train else 0.0)
        c_t = dy.vanilla_lstm_c(c[-1], gates_t)
        h_t = dy.vanilla_lstm_h(c_t, gates_t)
        if expr_seq[0].mask is None or np.isclose(np.sum(expr_seq[0].mask.np_arr[:,pos_i:pos_i+1]), 0.0):
          c.append(c_t)
          h.append(h_t)
        else:
          c.append(expr_seq[0].mask.cmult_by_timestep_expr(c_t,pos_i,True) + expr_seq[0].mask.cmult_by_timestep_expr(c[-1],pos_i,False))
          h.append(expr_seq[0].mask.cmult_by_timestep_expr(h_t,pos_i,True) + expr_seq[0].mask.cmult_by_timestep_expr(h[-1],pos_i,False))
      self._final_states.append(transducers.FinalTransducerState(h[-1], c[-1]))
      cur_input = [h[1:]]

    return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=expr_seq[0].mask)
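The masked-timestep branch amounts to a per-example interpolation: padded positions carry the previous state forward, valid positions take the new one. A minimal NumPy sketch of that copy trick (illustrative only, not the xnmt Mask API):

import numpy as np

def masked_state_update(new_state, prev_state, mask_col):
    # new_state, prev_state: (hidden_dim, batch); mask_col: (batch,), 1.0 = padded timestep
    keep_new = (1.0 - mask_col)[None, :]
    keep_old = mask_col[None, :]
    return keep_new * new_state + keep_old * prev_state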
Example 6
    def transduce(
        self, expr_seq: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """
        if isinstance(expr_seq, expression_seqs.ExpressionSequence):
            expr_seq = [expr_seq]
        concat_inputs = len(expr_seq) >= 2
        batch_size = tt.batch_size(expr_seq[0][0])
        seq_len = expr_seq[0].sent_len()
        mask = expr_seq[0].mask

        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        cur_input = expr_seq
        self._final_states = []
        for layer_i in range(self.num_layers):
            h = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
            c = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
            for pos_i in range(seq_len):
                if concat_inputs and layer_i == 0:
                    x_t = tt.concatenate(
                        [cur_input[i][pos_i] for i in range(len(cur_input))])
                else:
                    x_t = cur_input[0][pos_i]
                h_tm1 = h[-1]
                if self.dropout_rate > 0.0 and self.train:
                    # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                    x_t = torch.mul(x_t, self.dropout_mask_x[layer_i])
                    h_tm1 = torch.mul(h_tm1, self.dropout_mask_h[layer_i])
                h_t, c_t = self.layers[layer_i](x_t, (h_tm1, c[-1]))
                if mask is None or np.isclose(
                        np.sum(mask.np_arr[:, pos_i:pos_i + 1]), 0.0):
                    c.append(c_t)
                    h.append(h_t)
                else:
                    c.append(
                        mask.cmult_by_timestep_expr(c_t, pos_i, True) +
                        mask.cmult_by_timestep_expr(c[-1], pos_i, False))
                    h.append(
                        mask.cmult_by_timestep_expr(h_t, pos_i, True) +
                        mask.cmult_by_timestep_expr(h[-1], pos_i, False))
            self._final_states.append(
                transducers.FinalTransducerState(h[-1], c[-1]))
            cur_input = [h[1:]]

        return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask)
Example 7
  def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    """
    returns the list of output Expressions obtained by adding the given inputs
    to the current state, one by one, to both the forward and backward RNNs,
    and concatenating.

    Args:
      es: an ExpressionSequence
    """
    es_list = [es]

    for layer_i, (fb, bb) in enumerate(self.builder_layers):
      reduce_factor = self._reduce_factor_for_layer(layer_i)

      if es_list[0].mask is None: mask_out = None
      else: mask_out = es_list[0].mask.lin_subsampled(reduce_factor)

      if self.downsampling_method=="concat" and es_list[0].sent_len() % reduce_factor != 0:
        raise ValueError(f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                         f"but got sequence length={es_list[0].sent_len()} for reduce_factor={reduce_factor}. "
                         f"Set Batcher's pad_src_to_multiple argument accordingly.")
      fs = fb.transduce(es_list)
      bs = bb.transduce([expression_seqs.ReversedExpressionSequence(es_item) for es_item in es_list])
      if layer_i < len(self.builder_layers) - 1:
        if self.downsampling_method=="skip":
          es_list = [expression_seqs.ExpressionSequence(expr_list=fs[::reduce_factor], mask=mask_out),
                     expression_seqs.ExpressionSequence(expr_list=bs[::reduce_factor][::-1], mask=mask_out)]
        elif self.downsampling_method=="concat":
          es_len = es_list[0].sent_len()
          es_list_fwd = []
          es_list_bwd = []
          for i in range(0, es_len, reduce_factor):
            for j in range(reduce_factor):
              if i==0:
                es_list_fwd.append([])
                es_list_bwd.append([])
              es_list_fwd[j].append(fs[i+j])
              es_list_bwd[j].append(bs[es_list[0].sent_len()-reduce_factor+j-i])
          es_list = [expression_seqs.ExpressionSequence(expr_list=es_list_fwd[j], mask=mask_out) for j in range(reduce_factor)] + \
                    [expression_seqs.ExpressionSequence(expr_list=es_list_bwd[j], mask=mask_out) for j in range(reduce_factor)]
        else:
          raise RuntimeError(f"unknown downsampling_method {self.downsampling_method}")
      else:
        # concat final outputs
        ret_es = expression_seqs.ExpressionSequence(
          expr_list=[tt.concatenate([f, b]) for f, b in zip(fs, expression_seqs.ReversedExpressionSequence(bs))], mask=mask_out)

    self._final_states = [transducers.FinalTransducerState(tt.concatenate([fb.get_final_states()[0].main_expr(),
                                                                           bb.get_final_states()[0].main_expr()]),
                                                           tt.concatenate([fb.get_final_states()[0].cell_expr(),
                                                                           bb.get_final_states()[0].cell_expr()])) \
                          for (fb, bb) in self.builder_layers]
    return ret_es
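The two downsampling modes differ in how states are handed to the next layer: 'skip' keeps every reduce_factor-th state, while 'concat' groups reduce_factor consecutive states so the next layer sees their concatenation. A small plain-Python sketch of the effect (illustrative only):

def downsample_skip(states, reduce_factor):
    # keep every reduce_factor-th state
    return states[::reduce_factor]

def downsample_concat(states, reduce_factor):
    # group reduce_factor consecutive states; the next layer concatenates each group
    assert len(states) % reduce_factor == 0
    return [states[i:i + reduce_factor] for i in range(0, len(states), reduce_factor)]

# downsample_skip(list(range(8)), 2)   -> [0, 2, 4, 6]
# downsample_concat(list(range(8)), 2) -> [[0, 1], [2, 3], [4, 5], [6, 7]]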
Example 8
 def transduce(self, src: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
   sent_len = len(src)
   embeddings = dy.strided_select(dy.parameter(self.embedder), [1,1], [0,0], [self.input_dim, sent_len])
   if self.op == 'sum':
     output = embeddings + src.as_tensor()
   elif self.op == 'concat':
     output = dy.concatenate([embeddings, src.as_tensor()])
   else:
     raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
   if self.train and self.dropout > 0.0:
     output = dy.dropout(output, self.dropout)
   output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=src.mask)
   self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
   return output_seq
Example 9
  def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    """
    transduce the sequence

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """

    Wq, Wk, Wv, Wo = [dy.parameter(x) for x in (self.pWq, self.pWk, self.pWv, self.pWo)]
    bq, bk, bv, bo = [dy.parameter(x) for x in (self.pbq, self.pbk, self.pbv, self.pbo)]

    # Start with a [(length, model_size) x batch] tensor
    x = expr_seq.as_transposed_tensor()
    x_len = x.dim()[0][0]
    x_batch = x.dim()[1]
    # Get the query key and value vectors
    # TODO: do we need bias broadcasting in DyNet?
    # q = dy.affine_transform([bq, x, Wq])
    # k = dy.affine_transform([bk, x, Wk])
    # v = dy.affine_transform([bv, x, Wv])
    q = bq + x * Wq
    k = bk + x * Wk
    v = bv + x * Wv
    
    # Split to batches [(length, head_dim) x batch * num_heads] tensor
    q, k, v = [dy.reshape(x, (x_len, self.head_dim), batch_size=x_batch * self.num_heads) for x in (q,k,v)]

    # Do scaled dot product [(length, length) x batch * num_heads], rows are queries, columns are keys
    attn_score = q * dy.transpose(k) / sqrt(self.head_dim)
    if expr_seq.mask is not None:
      mask = dy.inputTensor(np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0).transpose(), batched=True) * -1e10
      attn_score = attn_score + mask
    attn_prob = dy.softmax(attn_score, d=1)
    if self.train and self.dropout > 0.0:
      attn_prob = dy.dropout(attn_prob, self.dropout)
    # Reduce using attention and resize to match [(length, model_size) x batch]
    o = dy.reshape(attn_prob * v, (x_len, self.input_dim), batch_size=x_batch)
    # Final transformation
    # o = dy.affine_transform([bo, attn_prob * v, Wo])
    o = bo + o * Wo

    expr_seq = expression_seqs.ExpressionSequence(expr_transposed_tensor=o, mask=expr_seq.mask)

    self._final_states = [transducers.FinalTransducerState(expr_seq[-1], None)]

    return expr_seq
Example 10
    def transduce(
        self, es: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':

        batch_size = tt.batch_size(es.as_tensor())
        if es.mask:
            seq_lengths = es.mask.seq_lengths()
        else:
            seq_lengths = [es.sent_len()] * batch_size

        # Sort the input and lengths as the descending order
        seq_lengths = torch.LongTensor(seq_lengths).to(xnmt.device)
        lengths, perm_index = seq_lengths.sort(0, descending=True)
        sorted_input = es.as_tensor()[perm_index]

        perm_index_rev = [-1] * len(lengths)
        for i in range(len(lengths)):
            perm_index_rev[perm_index[i]] = i
        perm_index_rev = torch.LongTensor(perm_index_rev).to(xnmt.device)

        packed_input = nn.utils.rnn.pack_padded_sequence(sorted_input,
                                                         list(lengths.data),
                                                         batch_first=True)
        state_size = self.num_dir * self.num_layers, batch_size, self.hidden_dim // self.num_dir
        h0 = sorted_input.new_zeros(*state_size)
        c0 = sorted_input.new_zeros(*state_size)
        output, (final_hiddens,
                 final_cells) = self.lstm(packed_input, (h0, c0))
        output = nn.utils.rnn.pad_packed_sequence(
            output, batch_first=True, total_length=es.sent_len())[0]

        # restore the sorting
        decoded = output[perm_index_rev]

        self._final_states = []
        for layer_i in range(self.num_layers):
            final_hidden = final_hiddens.view(
                self.num_layers, self.num_dir, batch_size,
                -1)[layer_i].transpose(0, 1).contiguous().view(batch_size, -1)
            final_hidden = final_hidden[perm_index_rev]
            self._final_states.append(
                transducers.FinalTransducerState(final_hidden))

        ret = expression_seqs.ExpressionSequence(expr_tensor=decoded,
                                                 mask=es.mask)
        return ret
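The perm_index / perm_index_rev bookkeeping above is just a permutation and its inverse, used to sort by length for pack_padded_sequence and to restore the original batch order afterwards. A tiny NumPy sketch:

import numpy as np

lengths = np.array([3, 5, 2, 4])
perm_index = np.argsort(-lengths)               # descending lengths: [1, 3, 0, 2]
perm_index_rev = np.empty_like(perm_index)
perm_index_rev[perm_index] = np.arange(len(lengths))
# for any batch-first array x: x[perm_index][perm_index_rev] recovers the original order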
Example 11
 def transduce(
     self, x: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     expr = x.as_transposed_tensor()
     batch_size, hidden_dim, seq_len = expr.size()
     expr = expr.view((batch_size, self.in_channels,
                       hidden_dim // self.in_channels, seq_len))
     expr = self.cnn_layer(expr)
     if self.use_pooling:
         expr = self.pooling_layer(expr)
     expr = self.activation_fct(expr)
     batch_size, out_chn, out_h, seq_len = expr.size()
     expr = expr.view((batch_size, out_chn * out_h, seq_len))
     output_seq = expression_seqs.ExpressionSequence(
         expr_transposed_tensor=expr,
         mask=x.mask.lin_subsampled(trg_len=seq_len) if x.mask else None)
     self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
     return output_seq
Example 12
 def calc_context(self, src_encoding):
   # Generating h_t based on RNN(h_{t-1}, embed(e_{t-1}))
   if self.prev_written_word is None:
     final_transducer_state = [transducers_base.FinalTransducerState(h, c) \
                               for h, c in zip(self.encoder_state.h(), self.encoder_state.c())]
     context_state = self.model.decoder.initial_state(final_transducer_state,
                                                      vocabs.Vocab.SS)
   else:
     context_state = self.model.decoder.add_input(self.context_state, self.prev_written_word)
   # Reset attender if there is a read action
   reset_attender = self.reset_attender
   if reset_attender:
     self.model.attender.init_sent(expr_seq.ExpressionSequence(expr_list=src_encoding))
     reset_attender = False
   # Calc context for decoding
   context_state.context = self.model.attender.calc_context(context_state.rnn_state.output())
   return SimultaneousState(self.model, self.encoder_state, context_state,
                            self.output_embed, self.has_been_read, self.has_been_written,
                            self.prev_written_word,
                            reset_attender)
Example 13
    def transduce(self, es):
        mask = es.mask
        # first layer
        forward_es = self.forward_layers[0].transduce(es)
        rev_backward_es = self.backward_layers[0].transduce(
            expression_seqs.ReversedExpressionSequence(es))

        # TODO: concat input of each layer to its output; or, maybe just add standard residual connections
        for layer_i in range(1, len(self.forward_layers)):
            new_forward_es = self.forward_layers[layer_i].transduce([
                forward_es,
                expression_seqs.ReversedExpressionSequence(rev_backward_es)
            ])
            mask_out = mask
            if mask_out is not None and new_forward_es.mask.np_arr.shape != mask_out.np_arr.shape:
                mask_out = mask_out.lin_subsampled(trg_len=len(new_forward_es))
            rev_backward_es = expression_seqs.ExpressionSequence(
                self.backward_layers[layer_i].transduce([
                    expression_seqs.ReversedExpressionSequence(forward_es),
                    rev_backward_es
                ]).as_list(),
                mask=mask_out)
            forward_es = new_forward_es

        self._final_states = [
            transducers.FinalTransducerState(
                dy.concatenate([
                    self.forward_layers[layer_i].get_final_states()[0].main_expr(),
                    self.backward_layers[layer_i].get_final_states()[0].main_expr()
                ]),
                dy.concatenate([
                    self.forward_layers[layer_i].get_final_states()[0].cell_expr(),
                    self.backward_layers[layer_i].get_final_states()[0].cell_expr()
                ]))
            for layer_i in range(len(self.forward_layers))
        ]
        mask_out = mask
        if mask_out is not None and forward_es.mask.np_arr.shape != mask_out.np_arr.shape:
            mask_out = mask_out.lin_subsampled(trg_len=len(forward_es))
        return expression_seqs.ExpressionSequence(expr_list=[
            dy.concatenate([forward_es[i], rev_backward_es[-i - 1]])
            for i in range(len(forward_es))
        ],
                                                  mask=mask_out)
Example 14
  def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:

    for layer_i, (fb, bb) in enumerate(self.lstm_layers):
      fs = fb.transduce(es)
      bs = bb.transduce(expression_seqs.ReversedExpressionSequence(es))
      interleaved = []

      if es.mask is None: mask = None
      else:
        mask = es.mask.lin_subsampled(0.5) # upsample the mask to encompass interleaved fwd / bwd expressions

      for pos in range(len(fs)):
        interleaved.append(fs[pos])
        interleaved.append(bs[-pos-1])
      
      projected = expression_seqs.ExpressionSequence(expr_list=interleaved, mask=mask)
      projected = self.nin_layers[layer_i].transduce(projected)
      assert math.ceil(len(es) / float(self.stride))==len(projected), \
        f"mismatched len(es)=={len(es)}, stride=={self.stride}, len(projected)=={len(projected)}"
      es = projected

    self._final_states = [transducers.FinalTransducerState(projected[-1])]
    return projected
Example 15
    def write(self, src_encoding, word, policy_action):
        # Reset attender if there is a read action
        reset_attender = self.reset_attender
        if reset_attender:
            encodings = src_encoding[:self.has_been_read]
            self.model.attender.init_sent(
                expr_seq.ExpressionSequence(expr_list=encodings))
            reset_attender = False

        # Generating h_t based on RNN(h_{t-1}, embed(e_{t-1}))
        if self.decoder_state is None or word is None:
            dim = src_encoding[0].dim()
            fin_tran_state = [
                transducers_base.FinalTransducerState(dy.zeros(*dim),
                                                      dy.zeros(*dim))
            ]
            decoder_state = self.model.decoder.initial_state(
                fin_tran_state, vocabs.Vocab.SS)
        else:
            decoder_state = self.model.decoder.add_input(
                self.decoder_state, word)
        decoder_state.attention = self.model.attender.calc_attention(
            decoder_state.as_vector())
        decoder_state.context = self.model.attender.calc_context(
            decoder_state.as_vector(), decoder_state.attention)

        # Calc context for decoding
        return SimultaneousState(self.model,
                                 self.encoder_state,
                                 decoder_state,
                                 has_been_read=self.has_been_read,
                                 has_been_written=self.has_been_written + 1,
                                 written_word=word,
                                 policy_action=policy_action,
                                 reset_attender=reset_attender,
                                 parent=self)
Example 16
    def transduce(
        self, xs: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        batch_size = xs[0][0].dim()[1]
        h_bot = []
        h_mid = []
        h_top = []
        z_bot = []
        z_mid = []
        z_top = []

        self.top_layer.h = None
        self.top_layer.c = None
        self.top_layer.z = None
        self.mid_layer.h = None
        self.mid_layer.c = None
        self.mid_layer.z = None
        self.bottom_layer.h = None
        self.bottom_layer.c = None
        self.bottom_layer.z = None

        #?? checkme. want to init z to ones? (cherry paper)
        z_one = dy.ones(1, batch_size=batch_size)
        h_bot.append(
            dy.zeroes(dim=(self.hidden_dim, ),
                      batch_size=batch_size))  #indices for timesteps are +1
        h_mid.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))
        h_top.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))

        for i, x_t in enumerate(xs):
            h_t_bot, z_t_bot = self.bottom_layer.transduce(
                h_below=x_t, h_above=h_mid[i], z_below=z_one
            )  #uses h_t_top from layer above@previous time step, h_t_bot and z_t_bot from previous time step (saved in hmlstmcell)
            h_t_mid, z_t_mid = self.mid_layer.transduce(
                h_below=h_t_bot, h_above=h_top[i], z_below=z_t_bot
            )  #uses h_t_top from layer above@previous time step, h_t_bot and z_t_bot from previous time step (saved in hmlstmcell)
            h_t_top, z_t_top = self.top_layer.transduce(
                h_below=h_t_mid, h_above=None, z_below=z_t_mid
            )  #uses z_t_bot and h_t_bot from previous layer call, h_t_top and z_t_top from previous time step (saved in hmlstmcell)

            h_bot.append(h_t_bot)
            z_bot.append(z_t_bot)
            h_mid.append(h_t_mid)
            z_mid.append(z_t_mid)
            h_top.append(h_t_top)
            z_top.append(z_t_top)

#        #gated output module
#
#        #sigmoid
#        W_layer = dy.parameters(dim=(len(self.modules), hidden_dim)) #needs to be moved to init? num layers by hidden_dim
#        h_cat   = dy.transpose(dy.concatenate([h_bot, h_mid, h_top]))
#        dotted  = dy.dot_product(e1, e2)
#        gates   = dy.logistic(dotted)
#        #relu
#
#        om = dy.relu()

        # final state is the last hidden state from the top layer
        self._final_states = [transducers.FinalTransducerState(h_top[-1])]
        fin_xs = expression_seqs.ExpressionSequence(expr_list=h_top[1:])
        return fin_xs  #removes the init zeros to make it same length as seq
Example 17
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        if expr_seq.batch_size() > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.batch_size()}"
            )
        lattice = self.cur_src[0]
        Wx_iog = dy.parameter(self.p_Wx_iog)
        Wh_iog = dy.parameter(self.p_Wh_iog)
        b_iog = dy.parameter(self.p_b_iog)
        Wx_f = dy.parameter(self.p_Wx_f)
        Wh_f = dy.parameter(self.p_Wh_f)
        b_f = dy.parameter(self.p_b_f)
        h = {}
        c = {}
        h_list = []

        batch_size = expr_seq.batch_size()
        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        for i, cur_node_id in enumerate(lattice.nodes):
            prev_node = lattice.graph.predecessors(cur_node_id)
            val = expr_seq[i]
            if self.dropout_rate > 0.0 and self.train:
                val = dy.cmult(val, self.dropout_mask_x)
            i_ft_list = []
            if len(prev_node) == 0:
                tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
            else:
                h_tilde = sum(h[pred] for pred in prev_node)
                tmp_iog = dy.affine_transform(
                    [b_iog, Wx_iog, val, Wh_iog, h_tilde])
                for pred in prev_node:
                    i_ft_list.append(
                        dy.logistic(
                            dy.affine_transform(
                                [b_f, Wx_f, val, Wh_f, h[pred]])))
            i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
            i_aot = dy.pick_range(tmp_iog, self.hidden_dim,
                                  self.hidden_dim * 2)
            i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2,
                                  self.hidden_dim * 3)

            i_it = dy.logistic(i_ait)
            i_ot = dy.logistic(i_aot)
            i_gt = dy.tanh(i_agt)
            if len(prev_node) == 0:
                c[cur_node_id] = dy.cmult(i_it, i_gt)
            else:
                fc = dy.cmult(i_ft_list[0], c[prev_node[0]])
                for pred_i in range(1, len(prev_node)):
                    fc += dy.cmult(i_ft_list[pred_i], c[prev_node[pred_i]])
                c[cur_node_id] = fc + dy.cmult(i_it, i_gt)
            h_t = dy.cmult(i_ot, dy.tanh(c[cur_node_id]))
            if self.dropout_rate > 0.0 and self.train:
                h_t = dy.cmult(h_t, self.dropout_mask_h)
            h[cur_node_id] = h_t
            h_list.append(h_t)
        self._final_states = [
            transducers.FinalTransducerState(h_list[-1], h_list[-1])
        ]
        return expression_seqs.ExpressionSequence(expr_list=h_list)
Example 18
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence (will be accessed via tensor_expr)
    Return:
      expression sequence
    """

        if isinstance(expr_seq, list):
            mask_out = expr_seq[0].mask
            seq_len = len(expr_seq[0])
            batch_size = expr_seq[0].dim()[1]
            tensors = [e.as_tensor() for e in expr_seq]
            input_tensor = dy.reshape(dy.concatenate(tensors),
                                      (seq_len, 1, self.input_dim),
                                      batch_size=batch_size)
        else:
            mask_out = expr_seq.mask
            seq_len = len(expr_seq)
            batch_size = expr_seq.dim()[1]
            input_tensor = dy.reshape(dy.transpose(expr_seq.as_tensor()),
                                      (seq_len, 1, self.input_dim),
                                      batch_size=batch_size)

        if self.dropout > 0.0 and self.train:
            input_tensor = dy.dropout(input_tensor, self.dropout)

        proj_inp = dy.conv2d_bias(input_tensor,
                                  dy.parameter(self.p_f),
                                  dy.parameter(self.p_b),
                                  stride=(self.stride, 1),
                                  is_valid=False)
        reduced_seq_len = proj_inp.dim()[0][0]
        proj_inp = dy.transpose(
            dy.reshape(proj_inp, (reduced_seq_len, self.hidden_dim * 3),
                       batch_size=batch_size))
        # proj_inp dims: ((hidden_dim * 3, reduced_seq_len), batch_size)
        if self.stride > 1 and mask_out is not None:
            mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

        h = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
        c = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
        for t in range(reduced_seq_len):
            f_t = dy.logistic(
                dy.strided_select(proj_inp, [], [0, t],
                                  [self.hidden_dim, t + 1]))
            o_t = dy.logistic(
                dy.strided_select(proj_inp, [], [self.hidden_dim, t],
                                  [self.hidden_dim * 2, t + 1]))
            z_t = dy.tanh(
                dy.strided_select(proj_inp, [], [self.hidden_dim * 2, t],
                                  [self.hidden_dim * 3, t + 1]))

            if self.dropout > 0.0 and self.train:
                retention_rate = 1.0 - self.dropout
                dropout_mask = dy.random_bernoulli((self.hidden_dim, 1),
                                                   retention_rate,
                                                   batch_size=batch_size)
                f_t = 1.0 - dy.cmult(
                    dropout_mask, 1.0 - f_t
                )  # TODO: would be easy to make a zoneout dynet operation to save memory

            i_t = 1.0 - f_t

            if t == 0:
                c_t = dy.cmult(i_t, z_t)
            else:
                c_t = dy.cmult(f_t, c[-1]) + dy.cmult(i_t, z_t)
            h_t = dy.cmult(
                o_t, c_t)  # note: LSTM would use dy.tanh(c_t) instead of c_t
            if mask_out is None or np.isclose(
                    np.sum(mask_out.np_arr[:, t:t + 1]), 0.0):
                c.append(c_t)
                h.append(h_t)
            else:
                c.append(
                    mask_out.cmult_by_timestep_expr(c_t, t, True) +
                    mask_out.cmult_by_timestep_expr(c[-1], t, False))
                h.append(
                    mask_out.cmult_by_timestep_expr(h_t, t, True) +
                    mask_out.cmult_by_timestep_expr(h[-1], t, False))

        self._final_states = [transducers.FinalTransducerState(dy.reshape(h[-1], (self.hidden_dim,), batch_size=batch_size), \
                                                               dy.reshape(c[-1], (self.hidden_dim,),
                                                                          batch_size=batch_size))]
        return expression_seqs.ExpressionSequence(expr_list=h[1:],
                                                  mask=mask_out)
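After the convolution, the recurrence above is the fo-pooling of a quasi-recurrent (QRNN-style) layer with the input gate tied to 1 - f_t and no tanh on the output. A minimal NumPy sketch of one step (the zoneout-style dropout on f_t is omitted):

import numpy as np

def fo_pool_step(f_t, o_t, z_t, c_prev):
    # c_t = f * c_prev + (1 - f) * z; h_t = o * c_t (no tanh, matching the code above)
    c_t = f_t * c_prev + (1.0 - f_t) * z_t
    h_t = o_t * c_t
    return h_t, c_t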
Example 19
 def transduce(self, x):
     # some preparations
     output_states = []
     current_state = self._encode_src(x, apply_emb=False)
     if self.mode_transduce == "split":
         first_state = SymmetricDecoderState(
             rnn_state=current_state.rnn_state,
             context=current_state.context)
     batch_size = x.dim()[1]
     done = [False] * batch_size
     out_mask = batchers.Mask(np_arr=np.zeros((batch_size,
                                               self.max_dec_len)))
     out_mask.np_arr.flags.writeable = True
     # teacher / split mode: unfold guided by reference targets
     #  -> feed everything up unto (except) the last token back into the LSTM
     # other modes: unfold until EOS is output or max len is reached
     max_dec_len = self.cur_src.batches[1].sent_len(
     ) if self.mode_transduce in ["teacher", "split"] else self.max_dec_len
     atts_list = []
     generated_word_ids = []
     for pos in range(max_dec_len):
         if self.train and self.mode_transduce in ["teacher", "split"]:
             # unroll RNN guided by reference
             prev_ref_action, ref_action = None, None
             if pos > 0:
                 prev_ref_action = self._batch_ref_action(pos - 1)
             if self.transducer_loss:
                 ref_action = self._batch_ref_action(pos)
             step_loss = self.calc_loss_one_step(
                 dec_state=current_state,
                 batch_size=batch_size,
                 mode=self.mode_transduce,
                 ref_action=ref_action,
                 prev_ref_action=prev_ref_action)
             self.transducer_losses.append(step_loss)
         else:  # inference
             # unroll RNN guided by model predictions
             if self.mode_transduce in ["teacher", "split"]:
                 prev_ref_action = self._batch_max_action(
                     batch_size, current_state, pos)
             else:
                 prev_ref_action = None
             out_scores = self.generate_one_step(
                 dec_state=current_state,
                 mask=out_mask,
                 cur_step=pos,
                 batch_size=batch_size,
                 mode=self.mode_transduce,
                 prev_ref_action=prev_ref_action)
             word_id = np.argmax(out_scores.npvalue(), axis=0)
             word_id = word_id.reshape((word_id.size, ))
             generated_word_ids.append(word_id[0])
             for batch_i in range(batch_size):
                 if self._terminate_rnn(batch_i=batch_i,
                                        pos=pos,
                                        batched_word_id=word_id):
                     done[batch_i] = True
                     out_mask.np_arr[batch_i, pos + 1:] = 1.0
             if pos > 0 and all(done):
                 atts_list.append(self.attender.get_last_attention())
                 output_states.append(current_state.rnn_state.h()[-1])
                 break
         output_states.append(current_state.rnn_state.h()[-1])
         atts_list.append(self.attender.get_last_attention())
     if self.mode_transduce == "split":
         # split mode: use attentions to compute context, then run RNNs over these context inputs
         if self.split_regularizer:
             assert len(atts_list) == len(
                 self._chosen_rnn_inputs
             ), f"{len(atts_list)} != {len(self._chosen_rnn_inputs)}"
         split_output_states = []
         split_rnn_state = first_state.rnn_state
         for pos, att in enumerate(atts_list):
             lstm_input_context = self.attender.curr_sent.as_tensor(
             ) * att  # TODO: better reuse the already computed context vecs
             lstm_input_context = dy.reshape(
                 lstm_input_context, (lstm_input_context.dim()[0][0], ),
                 batch_size=batch_size)
             if self.split_dual:
                 lstm_input_label = self._chosen_rnn_inputs[pos]
                 if self.split_dual[0] > 0.0 and self.train:
                     lstm_input_context = dy.dropout_batch(
                         lstm_input_context, self.split_dual[0])
                 if self.split_dual[1] > 0.0 and self.train:
                     lstm_input_label = dy.dropout_batch(
                         lstm_input_label, self.split_dual[1])
                 if self.split_context_transform:
                     lstm_input_context = self.split_context_transform.transform(
                         lstm_input_context)
                 lstm_input_context = self.split_dual_proj.transform(
                     dy.concatenate([lstm_input_context, lstm_input_label]))
             if self.split_regularizer and pos < len(
                     self._chosen_rnn_inputs):
                 # _chosen_rnn_inputs does not contain first (empty) input, so this is in fact like comparing to pos-1:
                 penalty = dy.squared_norm(lstm_input_context -
                                           self._chosen_rnn_inputs[pos])
                 if self.split_regularizer != 1:
                     penalty = self.split_regularizer * penalty
                 self.split_reg_penalty_expr = penalty
             split_rnn_state = split_rnn_state.add_input(lstm_input_context)
             split_output_states.append(split_rnn_state.h()[-1])
         assert len(output_states) == len(split_output_states)
         output_states = split_output_states
     out_mask.np_arr = out_mask.np_arr[:, :len(output_states)]
     self._final_states = []
     if self.compute_report:
         # for symmetric reporter (this can only be run at inference time)
         assert batch_size == 1
         atts_matrix = np.asarray([att.npvalue() for att in atts_list
                                   ]).reshape(len(atts_list),
                                              atts_list[0].dim()[0][0]).T
         self.report_sent_info({
             "symm_att":
             atts_matrix,
             "symm_out":
             sent.SimpleSentence(
                 words=generated_word_ids,
                 idx=self.cur_src.batches[0][0].idx,
                 vocab=self.cur_src.batches[1][0].vocab,
                 output_procs=self.cur_src.batches[1][0].output_procs),
             "symm_ref":
             self.cur_src.batches[1][0] if isinstance(
                 self.cur_src, batchers.CompoundBatch) else None
         })
     # prepare final outputs
     for layer_i in range(len(current_state.rnn_state.h())):
         self._final_states.append(
             transducers.FinalTransducerState(
                 main_expr=current_state.rnn_state.h()[layer_i],
                 cell_expr=current_state.rnn_state._c[layer_i]))
     out_mask.np_arr.flags.writeable = False
     return expression_seqs.ExpressionSequence(expr_list=output_states,
                                               mask=out_mask)
Example 20
    def transduce(
        self, embed_sent: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        src = embed_sent.as_tensor()

        sent_len = src.dim()[0][1]
        batch_size = src.dim()[1]
        pad_size = (self.window_receptor -
                    1) // 2  # TODO: adapt it also for even window sizes

        src = dy.concatenate([
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size), src,
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)
        ],
                             d=1)
        padded_sent_len = sent_len + 2 * pad_size

        conv1 = dy.parameter(self.pConv1)
        bias1 = dy.parameter(self.pBias1)
        src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1),
                             batch_size=batch_size)
        cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

        hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1),
                                  batch_size=batch_size)
        if self.non_linearity == 'linear':
            hidden_layer = hidden_layer
        elif self.non_linearity == 'tanh':
            hidden_layer = dy.tanh(hidden_layer)
        elif self.non_linearity == 'relu':
            hidden_layer = dy.rectify(hidden_layer)
        elif self.non_linearity == 'sigmoid':
            hidden_layer = dy.logistic(hidden_layer)

        for conv_hid, bias_hid in self.builder_layers:
            hidden_layer = dy.conv2d_bias(hidden_layer,
                                          dy.parameter(conv_hid),
                                          dy.parameter(bias_hid),
                                          stride=[1, 1])
            hidden_layer = dy.reshape(hidden_layer,
                                      (self.internal_dim, sent_len, 1),
                                      batch_size=batch_size)
            if self.non_linearity == 'linear':
                hidden_layer = hidden_layer
            elif self.non_linearity == 'tanh':
                hidden_layer = dy.tanh(hidden_layer)
            elif self.non_linearity == 'relu':
                hidden_layer = dy.rectify(hidden_layer)
            elif self.non_linearity == 'sigmoid':
                hidden_layer = dy.logistic(hidden_layer)
        last_conv = dy.parameter(self.last_conv)
        last_bias = dy.parameter(self.last_bias)
        output = dy.conv2d_bias(hidden_layer,
                                last_conv,
                                last_bias,
                                stride=[1, 1])
        output = dy.reshape(output, (sent_len, self.output_dim),
                            batch_size=batch_size)
        output_seq = expression_seqs.ExpressionSequence(expr_tensor=output)
        self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
        return output_seq
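The pad_size arithmetic is the usual 'same' padding for an odd convolution window, so the output length equals the input length. A quick plain-Python check:

def same_pad(window_receptor):
    # per-side padding so a width-`window_receptor` convolution preserves length (odd windows only)
    assert window_receptor % 2 == 1, "even windows need asymmetric padding"
    return (window_receptor - 1) // 2

# sent_len=10, window=5: padded length = 10 + 2*same_pad(5) = 14; output length = 14 - 5 + 1 = 10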