Esempio n. 1
0
    def transduce(
        self, es: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        """Run the stacked forward/backward layers over ``es`` and join both directions.

        The first layer pair reads ``es`` directly (the backward layer through a
        ``ReversedExpressionSequence``). Each further layer pair receives the
        previous forward output together with the previous backward output, both
        presented in that layer's own reading order.

        Side effect: ``self._final_states`` is replaced by one
        ``FinalTransducerState`` per layer, built from the concatenated
        forward/backward main and cell expressions.

        Args:
          es: input expression sequence; its mask is reused for the output.

        Returns:
          Expression sequence whose i-th element concatenates the forward output
          at index i with the backward output taken from index ``-i - 1`` of the
          (reversed-order) backward sequence.
        """
        src_mask = es.mask

        # Layer 0 consumes the raw input.
        fwd_seq = self.forward_layers[0].transduce(es)
        bwd_seq_rev = self.backward_layers[0].transduce(
            expression_seqs.ReversedExpressionSequence(es))

        # Higher layers consume both directions of the layer below.
        for depth in range(1, len(self.forward_layers)):
            fwd_inputs = [
                fwd_seq,
                expression_seqs.ReversedExpressionSequence(bwd_seq_rev),
            ]
            next_fwd_seq = self.forward_layers[depth].transduce(fwd_inputs)
            bwd_inputs = [
                expression_seqs.ReversedExpressionSequence(fwd_seq),
                bwd_seq_rev,
            ]
            bwd_outputs = self.backward_layers[depth].transduce(bwd_inputs)
            bwd_seq_rev = expression_seqs.ExpressionSequence(
                bwd_outputs.as_list(), mask=src_mask)
            fwd_seq = next_fwd_seq

        # One final state per layer: forward and backward halves side by side.
        layer_states = []
        for depth in range(len(self.forward_layers)):
            fwd_final = self.forward_layers[depth].get_final_states()[0]
            bwd_final = self.backward_layers[depth].get_final_states()[0]
            layer_states.append(
                transducers.FinalTransducerState(
                    dy.concatenate(
                        [fwd_final.main_expr(), bwd_final.main_expr()]),
                    dy.concatenate(
                        [fwd_final.cell_expr(), bwd_final.cell_expr()])))
        self._final_states = layer_states

        joined = []
        for i in range(len(fwd_seq)):
            joined.append(dy.concatenate([fwd_seq[i], bwd_seq_rev[-i - 1]]))
        return expression_seqs.ExpressionSequence(expr_list=joined,
                                                  mask=src_mask)
Esempio n. 2
0
  def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    """
    Feed ``es`` through every (forward, backward) builder pair and return the
    concatenated outputs of the last pair.

    Between layers the outputs are downsampled by the layer's reduce factor:
    "skip" keeps every ``reduce_factor``-th state, "concat" hands the
    ``reduce_factor`` interleaved sub-sequences to the next layer as separate
    inputs. One ``FinalTransducerState`` per layer is stored in
    ``self._final_states``.

    Args:
      es: an ExpressionSequence

    Returns:
      ExpressionSequence of concatenated forward/backward states of the last layer.

    Raises:
      ValueError: in "concat" mode, if the current sequence length is not a multiple of the reduce factor.
      RuntimeError: if ``self.downsampling_method`` is neither "skip" nor "concat".
    """
    layer_inputs = [es]
    last_layer_i = len(self.builder_layers) - 1

    for layer_i, (fwd_builder, bwd_builder) in enumerate(self.builder_layers):
      reduce_factor = self._reduce_factor_for_layer(layer_i)

      in_mask = layer_inputs[0].mask
      mask_out = None if in_mask is None else in_mask.lin_subsampled(reduce_factor)

      if self.downsampling_method=="concat" and layer_inputs[0].sent_len() % reduce_factor != 0:
        raise ValueError(f"For 'concat' subsampling, sequence lengths must be multiples of the total reduce factor, "
                         f"but got sequence length={layer_inputs[0].sent_len()} for reduce_factor={reduce_factor}. "
                         f"Set Batcher's pad_src_to_multiple argument accordingly.")

      fs = fwd_builder.transduce(layer_inputs)
      bs = bwd_builder.transduce([expression_seqs.ReversedExpressionSequence(seq) for seq in layer_inputs])

      if layer_i >= last_layer_i:
        # Last layer: pair each forward state with the backward state read in reverse.
        paired = zip(fs, expression_seqs.ReversedExpressionSequence(bs))
        ret_es = expression_seqs.ExpressionSequence(
          expr_list=[tt.concatenate([f, b]) for f, b in paired], mask=mask_out)
      elif self.downsampling_method=="skip":
        kept_fwd = fs[::reduce_factor]
        kept_bwd = bs[::reduce_factor][::-1]
        layer_inputs = [expression_seqs.ExpressionSequence(expr_list=kept_fwd, mask=mask_out),
                        expression_seqs.ExpressionSequence(expr_list=kept_bwd, mask=mask_out)]
      elif self.downsampling_method=="concat":
        seq_len = layer_inputs[0].sent_len()
        fwd_groups = []
        bwd_groups = []
        for start in range(0, seq_len, reduce_factor):
          for offset in range(reduce_factor):
            if start==0:
              # The first window creates one bucket per offset within the window.
              fwd_groups.append([])
              bwd_groups.append([])
            fwd_groups[offset].append(fs[start+offset])
            bwd_groups[offset].append(bs[seq_len-reduce_factor+offset-start])
        fwd_seqs = [expression_seqs.ExpressionSequence(expr_list=fwd_groups[j], mask=mask_out)
                    for j in range(reduce_factor)]
        bwd_seqs = [expression_seqs.ExpressionSequence(expr_list=bwd_groups[j], mask=mask_out)
                    for j in range(reduce_factor)]
        layer_inputs = fwd_seqs + bwd_seqs
      else:
        raise RuntimeError(f"unknown downsampling_method {self.downsampling_method}")

    final_states = []
    for (fwd_builder, bwd_builder) in self.builder_layers:
      fwd_final = fwd_builder.get_final_states()[0]
      bwd_final = bwd_builder.get_final_states()[0]
      final_states.append(
        transducers.FinalTransducerState(tt.concatenate([fwd_final.main_expr(), bwd_final.main_expr()]),
                                         tt.concatenate([fwd_final.cell_expr(), bwd_final.cell_expr()])))
    self._final_states = final_states
    return ret_es
Esempio n. 3
0
 def transduce(
     self, src: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     """Apply ``self.transform`` to the sequence tensor, optionally downsampling first.

     With ``self.downsample_by > 1`` the tensor is reshaped so that
     ``downsample_by`` consecutive columns are folded into the first dimension,
     and the mask (if any) is subsampled by the same factor. The last element
     of the output is stored in ``self._final_states``.

     Args:
       src: input expression sequence.

     Returns:
       Expression sequence wrapping the transformed tensor.

     Raises:
       ValueError: if downsampling is requested and the sequence length is not
         a multiple of ``self.downsample_by``.
     """
     src_tensor = src.as_tensor()
     out_mask = src.mask
     if self.downsample_by > 1:
         assert len(src_tensor.dim()[0])==2, \
           f"Downsampling only supported for tensors of order two. Found dims {src_tensor.dim()}"
         (hidden_dim, seq_len), batch_size = src_tensor.dim()
         if seq_len % self.downsample_by != 0:
             raise ValueError(
                 "For downsampling, sequence lengths must be multiples of the total reduce factor. "
                 "Configure batcher accordingly.")
         src_tensor = dy.reshape(src_tensor,
                                 (hidden_dim * self.downsample_by,
                                  seq_len // self.downsample_by),
                                 batch_size=batch_size)
         if out_mask is not None:
             out_mask = out_mask.lin_subsampled(
                 reduce_factor=self.downsample_by)
     output = self.transform.transform(src_tensor)
     if self.downsample_by == 1:
         src_shape, src_batch_size = src_tensor.dim()
         # Compare tensor ranks. The previous check compared an int against the
         # full dim tuple, which is always unequal, so the reshape ran on every
         # call. A rank mismatch can happen with seq length 1.
         if len(output.dim()[0]) != len(src_shape):
             output = dy.reshape(output,
                                 src_shape,
                                 batch_size=src_batch_size)
     output_seq = expression_seqs.ExpressionSequence(expr_tensor=output,
                                                     mask=out_mask)
     self._final_states = [FinalTransducerState(output_seq[-1])]
     return output_seq
Esempio n. 4
0
    def embed_sent(self, x) -> expression_seqs.ExpressionSequence:
        """Embed every word of a sentence (or of a batch of sentences).

        Unbatched input is embedded word by word. For batched input, the words
        at the same position of all sentences are marked as a batch and
        embedded together, one position at a time.

        Args:
          x: This will generally be a list of word IDs, but could also be a list of strings or some other format.
             It could also be batched, in which case it will be a (possibly masked) :class:`xnmt.batcher.Batch` object

        Returns:
          An expression sequence representing vectors of each word in the input.
          It carries ``x.mask`` for batched input and no mask otherwise.
        """
        if batchers.is_batched(x):
            # minibatch mode: all sentences must share the batch's length
            n_words = x.sent_len()
            for sentence in x:
                assert sentence.sent_len() == n_words
            vectors = []
            for pos in range(n_words):
                column = batchers.mark_as_batch(
                    [sentence[pos] for sentence in x])
                vectors.append(self.embed(column))
            out_mask = x.mask
        else:
            # single mode
            vectors = [self.embed(token) for token in x]
            out_mask = None

        return expression_seqs.ExpressionSequence(expr_list=vectors,
                                                  mask=out_mask)
Esempio n. 5
0
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """Run the LSTM over the nodes of the current source lattice.

        Nodes are visited in index order. A node without predecessors is
        computed from its input alone. Otherwise the input/output/candidate
        gates see the sum of all predecessor hidden states, and each
        predecessor gets its own forget gate applied to its cell state.
        The hidden and cell state of the last node are stored in
        ``self._final_states``.

        Args:
          expr_seq: one input expression per lattice node; batch size must be 1.

        Returns:
          Expression sequence of per-node hidden states (no mask).

        Raises:
          ValueError: if the batch size of ``expr_seq`` is larger than 1.
        """
        n_batch = expr_seq.dim()[1]
        if n_batch > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {n_batch}"
            )
        lattice = self.cur_src[0]
        w_x_iog = dy.parameter(self.p_Wx_iog)
        w_h_iog = dy.parameter(self.p_Wh_iog)
        bias_iog = dy.parameter(self.p_b_iog)
        w_x_f = dy.parameter(self.p_Wx_f)
        w_h_f = dy.parameter(self.p_Wh_f)
        bias_f = dy.parameter(self.p_b_f)

        use_dropout = self.dropout_rate > 0.0 and self.train
        if use_dropout:
            self.set_dropout_masks(batch_size=n_batch)

        hidden_states = []
        cell_states = []
        dim = self.hidden_dim
        for node_i in range(lattice.sent_len()):
            preds = lattice.nodes[node_i].nodes_prev
            has_preds = len(preds) != 0
            x_in = expr_seq[node_i]
            if use_dropout:
                x_in = dy.cmult(x_in, self.dropout_mask_x)

            forget_gates = []
            if has_preds:
                summed_h = sum(hidden_states[p] for p in preds)
                gates = dy.affine_transform(
                    [bias_iog, w_x_iog, x_in, w_h_iog, summed_h])
                # one forget gate per incoming edge
                forget_gates = [
                    dy.logistic(
                        dy.affine_transform(
                            [bias_f, w_x_f, x_in, w_h_f, hidden_states[p]]))
                    for p in preds
                ]
            else:
                gates = dy.affine_transform([bias_iog, w_x_iog, x_in])

            # gates is laid out as [input | output | candidate]
            in_gate = dy.logistic(dy.pick_range(gates, 0, dim))
            out_gate = dy.logistic(dy.pick_range(gates, dim, dim * 2))
            candidate = dy.tanh(dy.pick_range(gates, dim * 2, dim * 3))

            if has_preds:
                carried = dy.cmult(forget_gates[0], cell_states[preds[0]])
                for k in range(1, len(preds)):
                    carried = carried + dy.cmult(forget_gates[k],
                                                 cell_states[preds[k]])
                cell_states.append(carried + dy.cmult(in_gate, candidate))
            else:
                cell_states.append(dy.cmult(in_gate, candidate))

            new_h = dy.cmult(out_gate, dy.tanh(cell_states[-1]))
            if use_dropout:
                new_h = dy.cmult(new_h, self.dropout_mask_h)
            hidden_states.append(new_h)

        self._final_states = [
            transducers.FinalTransducerState(hidden_states[-1],
                                             cell_states[-1])
        ]
        return expression_seqs.ExpressionSequence(expr_list=hidden_states)
Esempio n. 6
0
 def embed_sent(
         self,
         sent_len: numbers.Integral) -> expression_seqs.ExpressionSequence:
     """Slice the embedding parameter down to ``sent_len`` positions.

     Selects, with stride 1, the range ``[0, self.emb_dim)`` along the first
     dimension and ``[0, sent_len)`` along the second dimension of
     ``self.embeddings``.

     Args:
       sent_len: number of positions to select along the second dimension.

     Returns:
       An unmasked expression sequence wrapping the selected tensor.
     """
     table = dy.parameter(self.embeddings)
     strides = [1, 1]
     range_from = [0, 0]
     range_to = [self.emb_dim, sent_len]
     selected = dy.strided_select(table, strides, range_from, range_to)
     return expression_seqs.ExpressionSequence(expr_tensor=selected,
                                               mask=None)
Esempio n. 7
0
    def embed_sent(self, x):
        """Embed the speech and factor streams of ``x`` and concatenate them.

        ``x.batches[0]`` is embedded with ``embed_speech_sent``. ``x.batches[1]``
        is embedded with ``embed_factor_sent``, which is also given the speech
        sentence length. Both results are concatenated with ``dy.concatenate``
        (default dimension) and wrapped with the speech mask.

        Args:
          x: object exposing ``batches``; index 0 is the speech input and
             index 1 the factor input.

        Returns:
          Expression sequence over the concatenated tensor, masked like the
          speech input.
        """
        speech_x, factor_x = x.batches[0], x.batches[1]

        # NOTE(review): earlier revisions carried (disabled) checks for a length
        # mismatch between the speech and factor streams; the factor embedding
        # is now always requested for the speech length, whatever the factor
        # stream's own length is.
        speech_seq = self.embed_speech_sent(speech_x)
        factor_seq = self.embed_factor_sent(factor_x, speech_x.sent_len())

        stacked = dy.concatenate(
            [speech_seq.as_tensor(), factor_seq.as_tensor()])
        return expression_seqs.ExpressionSequence(expr_tensor=stacked,
                                                  mask=speech_x.mask)
Esempio n. 8
0
 def embed_sent(self,
                x: sent.Sentence) -> expression_seqs.ExpressionSequence:
     """Wrap a sentence or batch as an expression sequence.

     Inputs whose (first) sentence exposes ``get_array`` are wrapped lazily in
     a ``LazyNumpyExpressionSequence``. All other inputs are embedded word by
     word (single sentence) or position by position (batch).

     Args:
       x: a single sentence or a batch of sentences.

     Returns:
       Expression sequence for the input. Batched inputs carry ``x.mask``.
       Unbatched inputs carry their own ``mask`` attribute if they have one,
       else no mask.
     """
     # TODO refactor: seems a bit too many special cases that need to be distinguished
     batched = batchers.is_batched(x)
     first_sent = x[0] if batched else x
     if hasattr(first_sent, "get_array"):
         if batched:
             return expression_seqs.LazyNumpyExpressionSequence(
                 lazy_data=batchers.mark_as_batch(list(x)),
                 mask=x.mask)
         return expression_seqs.LazyNumpyExpressionSequence(
             lazy_data=x.get_array())

     if batched:
         embeddings = [
             self.embed(
                 batchers.mark_as_batch(
                     [single_sent[word_i] for single_sent in x]))
             for word_i in range(x.sent_len())
         ]
         mask = x.mask
     else:
         embeddings = [self.embed(word) for word in x]
         # A single sentence is not guaranteed to expose ``mask``; fall back to
         # None instead of raising AttributeError.
         mask = getattr(x, "mask", None)
     return expression_seqs.ExpressionSequence(expr_list=embeddings,
                                               mask=mask)
Esempio n. 9
0
    def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence'):
        """
        Map every position through linear layer -> softmax -> ``self.p_E``.

        For each position the input goes through ``self.linear_layer``, is
        optionally dropped out (training only) and perturbed with Gumbel
        noise, normalized with a softmax and multiplied by the parameter
        ``self.p_E``. With ``self.residual`` set, the input is added back.
        The output of the last position is stored as the final state.

        Args:
          expr_seq: expression sequence
        Returns:
          expression sequence carrying the mask of ``expr_seq``
        """
        batch_size = expr_seq[0].dim()[1]

        outputs = []
        for pos in range(len(expr_seq)):
            source = expr_seq[pos]
            logits = self.linear_layer(source)
            if self.train and self.dropout_rate:
                logits = dy.dropout(logits, self.dropout_rate)
            if self.gumbel:
                noise = dy.random_gumbel(dim=logits.dim()[0],
                                         batch_size=batch_size)
                logits = logits + noise
            probs = dy.softmax(logits)
            result = dy.parameter(self.p_E) * probs
            if self.residual:
                result = result + source
            outputs.append(result)

        last_output = outputs[-1]
        self._final_states = [
            transducers.FinalTransducerState(main_expr=last_output)
        ]

        return expression_seqs.ExpressionSequence(expr_list=outputs,
                                                  mask=expr_seq.mask)
Esempio n. 10
0
 def embed_sent(self, x: Any):
     """Embed ``x`` with every sub-embedder and sum the results position-wise.

     Args:
       x: input passed unchanged to each embedder's ``embed_sent``.

     Returns:
       Expression sequence of the per-position sums, using the mask of the
       first embedder's output.
     """
     per_embedder = [embedder.embed_sent(x) for embedder in self.embedders]
     n_positions = len(per_embedder[0])
     summed = [
         dy.esum([seq[pos] for seq in per_embedder])
         for pos in range(n_positions)
     ]
     return expression_seqs.ExpressionSequence(expr_list=summed,
                                               mask=per_embedder[0].mask)
Esempio n. 11
0
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
        Apply multi-head self-attention to the sequence.

        Queries, keys and values come from ``lin_q`` / ``lin_k`` / ``lin_v``,
        are split into ``num_heads`` heads and combined by scaled dot-product
        attention (softmax over dimension 1 of the score tensor). The heads
        are merged again and passed through ``lin_o``. The last element of
        the result is stored as the final transducer state.

        Args:
          expr_seq: expression sequence
        Returns:
          expression sequence with the mask of the input
        """
        # assumes x is (batch, time, hidden), per the original author's note — TODO confirm
        x = expr_seq.as_tensor()
        n_batch, n_steps = x.size()[0], x.size()[1]
        head_shape = (n_batch * self.num_heads, self.head_dim, n_steps)

        def split_heads(linear):
            # project, swap the last two axes, then view as one slice per head
            return linear(x).transpose(1, 2).contiguous().view(head_shape)

        q = split_heads(self.lin_q)
        k = split_heads(self.lin_k)
        v = split_heads(self.lin_v)

        # Scaled dot product; rows are keys, columns are queries
        attn_score = torch.matmul(k.transpose(1, 2), q) / sqrt(self.head_dim)
        src_mask = expr_seq.mask
        if src_mask is not None:
            # repeat the mask once per head and turn it into a large negative bias
            per_head = np.repeat(src_mask.np_arr, self.num_heads, axis=0)
            penalty = torch.Tensor(per_head * -1e10).to(xnmt.device)
            attn_score = attn_score + penalty.unsqueeze(2)
        normalize = torch.nn.Softmax(dim=1)
        attn_prob = normalize(attn_score)
        if self.train and self.dropout > 0.0:
            attn_prob = tt.dropout(attn_prob, self.dropout)

        # Reduce using attention, merge the heads and restore the input layout
        merged = torch.matmul(v, attn_prob).view(n_batch, self.input_dim,
                                                 n_steps)
        # Final transformation
        o = self.lin_o(merged.transpose(1, 2))

        out_seq = expression_seqs.ExpressionSequence(expr_tensor=o,
                                                     mask=src_mask)
        self._final_states = [
            transducers.FinalTransducerState(out_seq[-1], None)
        ]
        return out_seq
Esempio n. 12
0
 def calculate_baseline(
     self, input_states: expr_seq.ExpressionSequence
 ) -> expr_seq.ExpressionSequence:
     """Transform each input state after wrapping it in ``dy.nobackprop``.

     Args:
       input_states: sequence of states to compute baselines for.

     Returns:
       Expression sequence of the transformed states, with the input's mask.
     """
     baselines = [
         self.transform.transform(dy.nobackprop(state))
         for state in input_states
     ]
     return expr_seq.ExpressionSequence(expr_list=baselines,
                                        mask=input_states.mask)
Esempio n. 13
0
  def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    """
    Run a forward and a backward convolutional LSTM over ``es`` and concatenate their states.

    The input is reshaped to ``(sent_len, freq_dim, chn_dim)``. For each
    direction, the input-to-gate convolution is computed once for the whole
    sequence and the recurrent convolution once per step. Masked time steps
    carry over the previous h / c (zeros if there is no previous step).

    Args:
      es: input expression sequence (optionally masked)
    Returns:
      expression sequence with one concatenated fwd/bwd state per input position, carrying the input mask
    """
    mask = es.mask
    sent_len = len(es)
    es_expr = es.as_transposed_tensor()
    batch_size = es_expr.dim()[1]

    es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size)

    h_out = {}
    for direction in ["fwd", "bwd"]:
      # input convolutions
      gates_xt_bias = dy.conv2d_bias(es_chn, dy.parameter(self.params["x2all_" + direction]),
                                     dy.parameter(self.params["b_" + direction]), stride=(1, 1), is_valid=False)
      gates_xt_bias_list = [dy.pick_range(gates_xt_bias, i, i + 1) for i in range(sent_len)]

      h = []
      c = []
      for input_pos in range(sent_len):
        # Position in the source that this step consumes (counts down for "bwd").
        directional_pos = input_pos if direction == "fwd" else sent_len - input_pos - 1
        gates_t = gates_xt_bias_list[directional_pos]
        if input_pos > 0:
          # recurrent convolutions
          gates_h_t = dy.conv2d(h[-1], dy.parameter(self.params["h2all_" + direction]), stride=(1, 1), is_valid=False)
          gates_t = gates_t + gates_h_t

        # standard LSTM logic
        if len(c) == 0:
          c_tm1 = dy.zeros((self.freq_dim * self.num_filters,), batch_size=batch_size)
        else:
          c_tm1 = c[-1]
        gates_t_reshaped = dy.reshape(gates_t, (4 * self.freq_dim * self.num_filters,), batch_size=batch_size)
        c_t = dy.reshape(dy.vanilla_lstm_c(c_tm1, gates_t_reshaped), (self.freq_dim * self.num_filters,),
                         batch_size=batch_size)
        h_t = dy.vanilla_lstm_h(c_t, gates_t_reshaped)
        h_t = dy.reshape(h_t, (1, self.freq_dim, self.num_filters,), batch_size=batch_size)

        # The mask must be looked up at the source position being processed
        # (directional_pos). Using input_pos applied the mask of the wrong
        # time step in the backward pass.
        if mask is None or np.isclose(np.sum(mask.np_arr[:, directional_pos:directional_pos + 1]), 0.0):
          c.append(c_t)
          h.append(h_t)
        else:
          # The first processed step may already be masked (e.g. trailing
          # padding seen first by the backward pass): fall back to zero states.
          if len(h) == 0:
            h_tm1 = dy.zeros((1, self.freq_dim, self.num_filters,), batch_size=batch_size)
          else:
            h_tm1 = h[-1]
          c.append(
            mask.cmult_by_timestep_expr(c_t, directional_pos, True)
            + mask.cmult_by_timestep_expr(c_tm1, directional_pos, False))
          h.append(
            mask.cmult_by_timestep_expr(h_t, directional_pos, True)
            + mask.cmult_by_timestep_expr(h_tm1, directional_pos, False))

      h_out[direction] = h
    ret_expr = []
    for state_i in range(len(h_out["fwd"])):
      state_fwd = h_out["fwd"][state_i]
      # the backward states were produced last-to-first, so index from the end
      state_bwd = h_out["bwd"][-1 - state_i]
      output_dim = (state_fwd.dim()[0][1] * state_fwd.dim()[0][2],)
      fwd_reshape = dy.reshape(state_fwd, output_dim, batch_size=batch_size)
      bwd_reshape = dy.reshape(state_bwd, output_dim, batch_size=batch_size)
      ret_expr.append(dy.concatenate([fwd_reshape, bwd_reshape], d=0 if self.reshape_output else 2))
    return expression_seqs.ExpressionSequence(expr_list=ret_expr, mask=mask)

  # TODO: implement get_final_states()
Esempio n. 14
0
 def compose(
         self, embeds: Union[dy.Expression,
                             List[dy.Expression]]) -> dy.Expression:
     """Compose embeddings with the sequence transducer and return its last final state.

     Args:
       embeds: either a list of expressions, used as the steps of the
         sequence, or a single expression that is split into one step per
         batch element.

     Returns:
       Main expression of the transducer's last final state.
     """
     if type(embeds) is list:
         steps = embeds
     else:
         n_elems = embeds.dim()[1]
         steps = [dy.pick_batch_elem(embeds, idx) for idx in range(n_elems)]
     sequence = expr_seq.ExpressionSequence(expr_list=steps)
     # Only the final state is needed; the transduced sequence is discarded.
     self.seq_transducer.transduce(sequence)
     final_states = self.seq_transducer.get_final_states()
     return final_states[-1].main_expr()
Esempio n. 15
0
    def embed_sent(self, x: Any) -> expression_seqs.ExpressionSequence:
        """Embed a full sentence (or batch of sentences) worth of words.

        Three cases are handled:
          * unbatched input: every word is embedded on its own;
          * batched input on a plain ``LookupEmbedder``: words at the same
            position are embedded together as a batch;
          * batched input on any other embedder: every unpadded word of every
            sentence is embedded individually, and the per-sentence results
            are padded with ``batchers.pad_embedding``.

        Args:
          x: This will generally be a list of word IDs, but could also be a list of strings or some other format.
             It could also be batched, in which case it will be a (possibly masked) :class:`xnmt.batcher.Batch` object

        Returns:
          An expression sequence representing vectors of each word in the input.
        """
        # single mode
        if not batchers.is_batched(x):
            return expression_seqs.ExpressionSequence(
                expr_list=[self.embed(word) for word in x])

        # minibatch mode, position-wise lookup
        if type(self) is LookupEmbedder:
            columns = []
            for word_i in range(x.sent_len()):
                column = batchers.mark_as_batch(
                    [single_sent[word_i] for single_sent in x])
                columns.append(self.embed(column))
            return expression_seqs.ExpressionSequence(expr_list=columns,
                                                      mask=x.mask)

        # minibatch mode, word-by-word embedding
        assert type(
            x[0]
        ) == sent.SegmentedSentence, "Need to use CharFromWordTextReader for non standard embeddings."
        per_sentence = []
        pending = []
        for sentence in x:
            word_vectors = []
            for i in range(sentence.len_unpadded()):
                vector = self.embed(sentence.words[i])
                word_vectors.append(vector)
                pending.append(vector)
            per_sentence.append(word_vectors)
        # Useful when using dy.autobatch
        dy.forward(pending)
        pending.clear()
        # Pad the results
        return batchers.pad_embedding(per_sentence)
Esempio n. 16
0
  def transduce(self, expr_seq: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
    """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Also stores one ``FinalTransducerState`` (last h / c) per layer in ``self._final_states``.

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence of the last layer's hidden states, with the mask of the first input sequence
    Raises:
      ValueError: if the summed input dimension at a step differs from ``self.total_input_dim``
    """
    if isinstance(expr_seq, expression_seqs.ExpressionSequence):
      expr_seq = [expr_seq]
    batch_size = expr_seq[0][0].dim()[1]
    seq_len = len(expr_seq[0])
    mask = expr_seq[0].mask

    use_dropout = self.dropout_rate > 0.0 and self.train
    if use_dropout:
      self.set_dropout_masks(batch_size=batch_size)
    weightnoise_std = self.weightnoise_std if self.train else 0.0

    cur_input = expr_seq
    self._final_states = []
    for layer_i in range(self.num_layers):
      # index 0 holds the zero initial state; real outputs start at index 1
      h = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
      c = [dy.zeroes(dim=(self.hidden_dim,), batch_size=batch_size)]
      for pos_i in range(seq_len):
        # One expression per input stream. This is always a list, so the
        # former "wrap in a list" branches could never run and were removed.
        x_t = [stream[pos_i] for stream in cur_input]
        found_dim = sum(x_t_i.dim()[0][0] for x_t_i in x_t)
        if found_dim != self.total_input_dim:
          raise ValueError(f"VanillaLSTMGates: x_t has inconsistent dimension {found_dim}, expecting {self.total_input_dim}")
        if use_dropout:
          # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
          gates_t = dy.vanilla_lstm_gates_dropout_concat(x_t,
                                                         h[-1],
                                                         self.Wx[layer_i],
                                                         self.Wh[layer_i],
                                                         self.b[layer_i],
                                                         self.dropout_mask_x[layer_i],
                                                         self.dropout_mask_h[layer_i],
                                                         weightnoise_std)
        else:
          gates_t = dy.vanilla_lstm_gates_concat(x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i], self.b[layer_i],
                                                 weightnoise_std)
        c_t = dy.vanilla_lstm_c(c[-1], gates_t)
        h_t = dy.vanilla_lstm_h(c_t, gates_t)
        if mask is None or np.isclose(np.sum(mask.np_arr[:,pos_i:pos_i+1]), 0.0):
          c.append(c_t)
          h.append(h_t)
        else:
          # blend new and previous state per batch element according to the mask
          c.append(mask.cmult_by_timestep_expr(c_t,pos_i,True) + mask.cmult_by_timestep_expr(c[-1],pos_i,False))
          h.append(mask.cmult_by_timestep_expr(h_t,pos_i,True) + mask.cmult_by_timestep_expr(h[-1],pos_i,False))
      self._final_states.append(transducers.FinalTransducerState(h[-1], c[-1]))
      cur_input = [h[1:]]

    return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask)
Esempio n. 17
0
    def transduce(
        self, expr_seq: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        """
        Run the stacked LSTM over the sequence; masked time steps keep the previous h / c.

        The last h / c of every layer are collected in ``self._final_states``.

        Args:
          expr_seq: expression sequence or list of expression sequences (several sequences are concatenated
                    per time step before entering the first layer)
        Returns:
          expression sequence of the last layer's hidden states, with the mask of the first input sequence
        """
        if isinstance(expr_seq, expression_seqs.ExpressionSequence):
            expr_seq = [expr_seq]
        first_seq = expr_seq[0]
        concat_inputs = len(expr_seq) >= 2
        batch_size = tt.batch_size(first_seq[0])
        seq_len = first_seq.sent_len()
        mask = first_seq.mask

        apply_dropout = self.dropout_rate > 0.0 and self.train
        if apply_dropout:
            self.set_dropout_masks(batch_size=batch_size)

        layer_input = expr_seq
        self._final_states = []
        for layer_i in range(self.num_layers):
            # only the first layer can see several input streams
            merge_streams = concat_inputs and layer_i == 0
            # index 0 holds the zero initial state; outputs start at index 1
            h = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
            c = [tt.zeroes(hidden_dim=self.hidden_dim, batch_size=batch_size)]
            for pos_i in range(seq_len):
                if merge_streams:
                    x_t = tt.concatenate(
                        [stream[pos_i] for stream in layer_input])
                else:
                    x_t = layer_input[0][pos_i]
                prev_h, prev_c = h[-1], c[-1]
                lstm_h_in = prev_h
                if apply_dropout:
                    # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                    x_t = torch.mul(x_t, self.dropout_mask_x[layer_i])
                    lstm_h_in = torch.mul(prev_h,
                                          self.dropout_mask_h[layer_i])
                h_t, c_t = self.layers[layer_i](x_t, (lstm_h_in, prev_c))

                step_unmasked = mask is None or np.isclose(
                    np.sum(mask.np_arr[:, pos_i:pos_i + 1]), 0.0)
                if not step_unmasked:
                    # blend new and previous state according to the mask
                    c_t = (mask.cmult_by_timestep_expr(c_t, pos_i, True) +
                           mask.cmult_by_timestep_expr(prev_c, pos_i, False))
                    h_t = (mask.cmult_by_timestep_expr(h_t, pos_i, True) +
                           mask.cmult_by_timestep_expr(prev_h, pos_i, False))
                c.append(c_t)
                h.append(h_t)
            self._final_states.append(
                transducers.FinalTransducerState(h[-1], c[-1]))
            layer_input = [h[1:]]

        return expression_seqs.ExpressionSequence(expr_list=h[1:], mask=mask)
Esempio n. 18
0
 def transduce(self, x: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
   """Pool the sequence with softmax attention weights derived from ``self.W``.

   Scores are the transposed input times ``self.W``. If the input is masked,
   the mask is added to the scores with multiplicator -100. If
   ``self.pos_enc_max`` is set, the scores are multiplied element-wise by the
   leading rows of ``self.pos_enc``. The softmax of the scores then weights
   the (untransposed) input tensor.

   Args:
     x: input expression sequence.

   Returns:
     Unmasked expression sequence wrapping the attention-weighted result.
   """
   transposed = x.as_transposed_tensor()
   scores = transposed * dy.parameter(self.W)
   src_mask = x.mask
   if src_mask is not None:
     scores = src_mask.add_to_tensor_expr(scores, multiplicator=-100.0, time_first=True)
   if self.pos_enc_max:
     n_steps = transposed.dim()[0][0]
     pos_weights = dy.inputTensor(self.pos_enc[:n_steps,:])
     scores = dy.cmult(scores, pos_weights)
   weights = dy.softmax(scores)
   pooled = x.as_tensor() * weights
   return expression_seqs.ExpressionSequence(expr_tensor=pooled, mask=None)
Esempio n. 19
0
    def transduce(self, es):
        """Run the stacked forward/backward layers over ``es`` and join both directions.

        Works like a plain stacked bidirectional transducer, except that the
        input mask is re-sampled (``lin_subsampled``) whenever a layer's
        forward output mask has a different shape than the input mask.

        Side effect: ``self._final_states`` gets one ``FinalTransducerState``
        per layer with concatenated forward/backward main and cell expressions.

        Args:
          es: input expression sequence.

        Returns:
          Expression sequence whose i-th element concatenates the forward
          output at index i with the backward output at index ``-i - 1``.
        """
        src_mask = es.mask

        def mask_matching(reference_es):
            # Input mask, sub-sampled to the reference length if shapes differ.
            if (src_mask is not None and
                    reference_es.mask.np_arr.shape != src_mask.np_arr.shape):
                return src_mask.lin_subsampled(trg_len=len(reference_es))
            return src_mask

        # first layer
        fwd_seq = self.forward_layers[0].transduce(es)
        bwd_seq_rev = self.backward_layers[0].transduce(
            expression_seqs.ReversedExpressionSequence(es))

        # TODO: concat input of each layer to its output; or, maybe just add standard residual connections
        for depth in range(1, len(self.forward_layers)):
            fwd_inputs = [
                fwd_seq,
                expression_seqs.ReversedExpressionSequence(bwd_seq_rev),
            ]
            next_fwd_seq = self.forward_layers[depth].transduce(fwd_inputs)
            layer_mask = mask_matching(next_fwd_seq)
            bwd_inputs = [
                expression_seqs.ReversedExpressionSequence(fwd_seq),
                bwd_seq_rev,
            ]
            bwd_outputs = self.backward_layers[depth].transduce(bwd_inputs)
            bwd_seq_rev = expression_seqs.ExpressionSequence(
                bwd_outputs.as_list(), mask=layer_mask)
            fwd_seq = next_fwd_seq

        layer_states = []
        for depth in range(len(self.forward_layers)):
            fwd_final = self.forward_layers[depth].get_final_states()[0]
            bwd_final = self.backward_layers[depth].get_final_states()[0]
            layer_states.append(
                transducers.FinalTransducerState(
                    dy.concatenate(
                        [fwd_final.main_expr(), bwd_final.main_expr()]),
                    dy.concatenate(
                        [fwd_final.cell_expr(), bwd_final.cell_expr()])))
        self._final_states = layer_states

        out_mask = mask_matching(fwd_seq)
        joined = []
        for i in range(len(fwd_seq)):
            joined.append(dy.concatenate([fwd_seq[i], bwd_seq_rev[-i - 1]]))
        return expression_seqs.ExpressionSequence(expr_list=joined,
                                                  mask=out_mask)
Esempio n. 20
0
 def exprseq_pooling(self, exprseq):
     """Reduce an expression sequence to a single expression by max-pooling.

     If the sequence has a mask, it is first added to the sequence tensor
     with multiplicator -1e10 (presumably so masked positions cannot win the
     max — confirm against ``Mask.add_to_tensor_expr``). Unmasked sequences
     are pooled as they are instead of raising AttributeError.

     Args:
       exprseq: expression sequence to pool; its mask may be None.

     Returns:
       ``dy.max_dim`` over dimension 1 if the sequence tensor has more than
       one dimension, the tensor itself if it has one dimension, or the
       element-wise max over the expression list if there is no tensor.
     """
     # Reduce to vector
     mask = exprseq.mask
     if mask is not None:
         exprseq = expression_seqs.ExpressionSequence(
             expr_tensor=mask.add_to_tensor_expr(exprseq.as_tensor(), -1e10),
             mask=mask)
     if exprseq.expr_tensor is not None:
         if len(exprseq.expr_tensor.dim()[0]) > 1:
             return dy.max_dim(exprseq.expr_tensor, d=1)
         else:
             return exprseq.expr_tensor
     else:
         return dy.emax(exprseq.expr_list)
Esempio n. 21
0
 def transduce(self, src: 'expression_seqs.ExpressionSequence') -> 'expression_seqs.ExpressionSequence':
   """
   Combine the input sequence with the learned position embeddings.

   Args:
     src: input expression sequence
   Returns:
     expression sequence holding ``embeddings + src`` (op ``'sum'``) or the
     concatenation of both (op ``'concat'``), with dropout applied in training
     mode; it carries the input's mask.
   Raises:
     ValueError: if ``self.op`` is neither ``'sum'`` nor ``'concat'``.
   """
   # Validate up front so that a misconfigured op fails with the intended
   # ValueError before any computation-graph nodes are created.
   if self.op not in ('sum', 'concat'):
     raise ValueError(f'Illegal op {self.op} in PositionalTransducer (options are "sum"/"concat")')
   sent_len = len(src)
   # Stride-1 selection from [0, 0] up to [input_dim, sent_len] of the embedder parameter.
   embeddings = dy.strided_select(dy.parameter(self.embedder), [1,1], [0,0], [self.input_dim, sent_len])
   if self.op == 'sum':
     output = embeddings + src.as_tensor()
   else:
     output = dy.concatenate([embeddings, src.as_tensor()])
   if self.train and self.dropout > 0.0:
     output = dy.dropout(output, self.dropout)
   output_seq = expression_seqs.ExpressionSequence(expr_tensor=output, mask=src.mask)
   self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
   return output_seq
Esempio n. 22
0
    def transduce(
        self, expr_sequence: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """Run the stacked forward/backward layer pairs over ``expr_sequence``.

        The first forward layer reads the input as given and the first backward
        layer reads it reversed. Each later layer pair is fed the per-position
        concatenation of the previous forward and backward outputs (in forward
        order for the forward layer, in reversed order for the backward layer).

        Args:
            expr_sequence: the input expression sequence.

        Returns:
            A sequence whose element ``i`` concatenates the top forward output
            at ``i`` with the top backward output at ``-i - 1``. As a side
            effect ``self._final_states`` receives one state per layer pair.
        """
        fwd_layers = self.forward_layers
        bwd_layers = self.backward_layers

        # Bottom layer: both directions read the raw input.
        fwd_out = fwd_layers[0].transduce(expr_sequence)
        bwd_out_rev = bwd_layers[0].transduce(
            expression_seqs.ReversedExpressionSequence(expr_sequence))

        for depth in range(1, len(fwd_layers)):
            fwd_items = fwd_out.as_list()
            bwd_items_rev = bwd_out_rev.as_list()
            # Input for the forward layer: positions in forward order.
            fwd_input = expression_seqs.ExpressionSequence(expr_list=[
                dy.concatenate([f, b])
                for f, b in zip(fwd_items, reversed(bwd_items_rev))
            ])
            # Input for the backward layer: same pairs, in reversed order.
            bwd_input = expression_seqs.ExpressionSequence(expr_list=[
                dy.concatenate([f, b])
                for f, b in zip(reversed(fwd_items), bwd_items_rev)
            ])
            fwd_out = fwd_layers[depth].transduce(fwd_input)
            bwd_out_rev = bwd_layers[depth].transduce(bwd_input)

        # One final state per layer: forward and backward halves concatenated.
        final_states = []
        for depth in range(len(fwd_layers)):
            fwd_final = fwd_layers[depth].get_final_states()[0]
            bwd_final = bwd_layers[depth].get_final_states()[0]
            main = dy.concatenate(
                [fwd_final.main_expr(), bwd_final.main_expr()])
            cell = dy.concatenate(
                [fwd_final.cell_expr(), bwd_final.cell_expr()])
            final_states.append(transducers.FinalTransducerState(main, cell))
        self._final_states = final_states

        merged = [
            dy.concatenate([fwd_out[pos], bwd_out_rev[-pos - 1]])
            for pos in range(len(fwd_out))
        ]
        return expression_seqs.ExpressionSequence(expr_list=merged)
Esempio n. 23
0
  def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    """
    Apply multi-head scaled dot-product self-attention to the sequence.

    Args:
      expr_seq: expression sequence to attend over; its mask (if any) is added
                to the attention scores scaled by -1e10
    Returns:
      expression sequence built from the attended, output-projected tensor,
      carrying the input's mask
    """
    Wq = dy.parameter(self.pWq)
    Wk = dy.parameter(self.pWk)
    Wv = dy.parameter(self.pWv)
    Wo = dy.parameter(self.pWo)
    bq = dy.parameter(self.pbq)
    bk = dy.parameter(self.pbk)
    bv = dy.parameter(self.pbv)
    bo = dy.parameter(self.pbo)

    # Input as a [(length, model_size) x batch] tensor
    inputs = expr_seq.as_transposed_tensor()
    dims = inputs.dim()
    seq_len = dims[0][0]
    n_batch = dims[1]

    # Query, key and value projections
    # TODO: do we need bias broadcasting in DyNet? (affine_transform would be the alternative)
    projected = [bias + inputs * weight for bias, weight in ((bq, Wq), (bk, Wk), (bv, Wv))]

    # Fold the heads into the batch dimension: [(length, head_dim) x batch * num_heads]
    queries, keys, values = [
      dy.reshape(proj, (seq_len, self.head_dim), batch_size=n_batch * self.num_heads)
      for proj in projected
    ]

    # Scaled dot product [(length, length) x batch * num_heads]; rows are queries, columns are keys
    scores = queries * dy.transpose(keys) / sqrt(self.head_dim)
    if expr_seq.mask is not None:
      # Repeat each batch row of the mask once per head so it lines up with the folded batch
      penalty = np.repeat(expr_seq.mask.np_arr, self.num_heads, axis=0).transpose()
      scores = scores + dy.inputTensor(penalty, batched=True) * -1e10
    weights = dy.softmax(scores, d=1)
    if self.train and self.dropout > 0.0:
      weights = dy.dropout(weights, self.dropout)

    # Weighted sum of values, unfolded back to [(length, model_size) x batch]
    attended = dy.reshape(weights * values, (seq_len, self.input_dim), batch_size=n_batch)
    # Output projection
    output = bo + attended * Wo

    result = expression_seqs.ExpressionSequence(expr_transposed_tensor=output, mask=expr_seq.mask)
    self._final_states = [transducers.FinalTransducerState(result[-1], None)]
    return result
Esempio n. 24
0
def pad_embedding(embeddings) -> expression_seqs.ExpressionSequence:
    """Pad per-sentence embedding lists to a common length and batch them.

    Shorter sentences are extended at the end with a zero vector whose size is
    taken from the first embedding of the first sentence, and the padded
    positions are flagged with 1 in the mask array.

    Args:
        embeddings: a sequence of lists of expressions, one list per sentence.

    Returns:
        An expression sequence over the batched, column-concatenated
        sentences. Its mask is ``None`` when no sentence needed padding.
    """
    longest = max(len(sent) for sent in embeddings)
    filler = dy.zeros(embeddings[0][0].dim()[0][0])
    mask_arr = np.zeros((len(embeddings), longest), dtype=int)

    per_sentence = []
    for row, sent in enumerate(embeddings):
        shortfall = longest - len(sent)
        if shortfall > 0:
            # Build a new list so the caller's list is left untouched.
            sent = sent + [filler] * shortfall
            mask_arr[row, -shortfall:] = 1
        per_sentence.append(dy.concatenate_cols(sent))

    # A non-zero entry exists exactly when at least one sentence was padded.
    mask = Mask(mask_arr) if mask_arr.any() else None
    batched = dy.concatenate_to_batch(per_sentence)
    return expression_seqs.ExpressionSequence(expr_tensor=batched, mask=mask)
Esempio n. 25
0
 def embed_speech_sent(self, x):
     """Wrap a speech input in a lazily evaluated numpy expression sequence.

     Args:
       x: a single sentence or a batch of sentences. The sentence (or the
          first sentence of the batch) must provide ``get_array``.
     Returns:
       A ``LazyNumpyExpressionSequence`` over ``x.get_array()`` for a single
       sentence, or over the sentences marked as a batch (with ``x.mask``)
       for a batch.
     Raises:
       ValueError: if the (first) sentence has no ``get_array`` attribute.
     """
     # TODO refactor: seems a bit too many special cases that need to be distinguished
     batched = batchers.is_batched(x)
     first_sent = x[0] if batched else x
     if not hasattr(first_sent, "get_array"):
         # Only array-backed inputs are supported here.
         raise ValueError("!! Expected to use above")
     if not batched:
         return expression_seqs.LazyNumpyExpressionSequence(
             lazy_data=x.get_array())
     return expression_seqs.LazyNumpyExpressionSequence(
         lazy_data=batchers.mark_as_batch(list(x)),
         mask=x.mask)
Esempio n. 26
0
    def transduce(
        self, es: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        """Run ``self.lstm`` over a padded batch using packed sequences.

        The batch is sorted by descending length before packing, run through
        the LSTM from all-zero initial states, padded back to
        ``es.sent_len()`` and finally restored to the input order.

        Args:
            es: input sequence. With a mask, per-item lengths come from
                ``mask.seq_lengths()``; otherwise every item is assumed to
                have length ``es.sent_len()``.

        Returns:
            The LSTM outputs in the original batch order, carrying
            ``es.mask``. As a side effect ``self._final_states`` is rebuilt
            with one final hidden state per layer.
        """
        inputs = es.as_tensor()
        batch_size = tt.batch_size(inputs)
        seq_lengths = (es.mask.seq_lengths() if es.mask
                       else [es.sent_len()] * batch_size)

        # Sort the batch by descending length for packing.
        seq_lengths = torch.LongTensor(seq_lengths).to(xnmt.device)
        lengths, perm_index = seq_lengths.sort(0, descending=True)
        sorted_input = inputs[perm_index]

        # Inverse permutation: where each original row ended up after sorting.
        inverse = [-1] * len(lengths)
        for rank, source_row in enumerate(perm_index):
            inverse[source_row] = rank
        perm_index_rev = torch.LongTensor(inverse).to(xnmt.device)

        packed_input = nn.utils.rnn.pack_padded_sequence(
            sorted_input, list(lengths.data), batch_first=True)
        state_size = (self.num_dir * self.num_layers, batch_size,
                      self.hidden_dim // self.num_dir)
        h0 = sorted_input.new_zeros(*state_size)
        c0 = sorted_input.new_zeros(*state_size)
        packed_output, (final_hiddens, _final_cells) = self.lstm(
            packed_input, (h0, c0))
        padded_output, _ = nn.utils.rnn.pad_packed_sequence(
            packed_output, batch_first=True, total_length=es.sent_len())

        # Undo the length sort.
        decoded = padded_output[perm_index_rev]

        self._final_states = []
        for layer_i in range(self.num_layers):
            # Select this layer, then merge its directions per batch item.
            per_layer = final_hiddens.view(self.num_layers, self.num_dir,
                                           batch_size, -1)[layer_i]
            merged = per_layer.transpose(0, 1).contiguous().view(
                batch_size, -1)
            self._final_states.append(
                transducers.FinalTransducerState(merged[perm_index_rev]))

        return expression_seqs.ExpressionSequence(expr_tensor=decoded,
                                                  mask=es.mask)
Esempio n. 27
0
    def transduce(
        self, seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """Add the child transducer's output to its input (residual link).

        In training mode with a positive dropout rate, dropout is applied to
        the child's output before the addition. If ``self.layer_norm`` is set,
        the sum is layer-normalized with ``self.ln_g`` / ``self.ln_b``.

        Args:
            seq: the input expression sequence.

        Returns:
            A new expression sequence over the resulting tensor; no mask is
            attached to it.
        """
        use_dropout = self.train and self.dropout > 0.0
        child_out = self.child.transduce(seq).as_tensor()
        if use_dropout:
            child_out = dy.dropout(child_out, self.dropout)
        seq_tensor = child_out + seq.as_tensor()

        if self.layer_norm:
            dims = seq_tensor.dim()
            shape, n_batch = dims[0], dims[1]
            # Fold the second tensor dimension into the batch so that every
            # column is normalized on its own, then restore the layout.
            flat = dy.reshape(seq_tensor, (shape[0], ),
                              batch_size=shape[1] * n_batch)
            normed = dy.layer_norm(flat, self.ln_g, self.ln_b)
            seq_tensor = dy.reshape(normed, shape, batch_size=n_batch)
        return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)
Esempio n. 28
0
 def transduce(
     self, x: expression_seqs.ExpressionSequence
 ) -> expression_seqs.ExpressionSequence:
     """Apply the CNN layer (plus optional pooling and activation) to ``x``.

     The transposed input tensor is unpacked as (batch, hidden, time), its
     hidden axis is split into ``self.in_channels`` channels for the
     convolution, and the channel and height axes of the result are merged
     back into one.

     Args:
       x: the input expression sequence.
     Returns:
       The transformed sequence. When the input has a mask, it is linearly
       subsampled to the output length. ``self._final_states`` is set to a
       single state built from the last output element.
     """
     features = x.as_transposed_tensor()
     n_batch, hidden_dim, n_steps = features.size()
     features = features.view(
         (n_batch, self.in_channels, hidden_dim // self.in_channels, n_steps))

     features = self.cnn_layer(features)
     if self.use_pooling:
         features = self.pooling_layer(features)
     features = self.activation_fct(features)

     n_batch, out_chn, out_h, n_steps = features.size()
     features = features.view((n_batch, out_chn * out_h, n_steps))

     # The layers may have shortened the time axis; resize the mask to match.
     if x.mask:
         out_mask = x.mask.lin_subsampled(trg_len=n_steps)
     else:
         out_mask = None
     output_seq = expression_seqs.ExpressionSequence(
         expr_transposed_tensor=features, mask=out_mask)
     self._final_states = [transducers.FinalTransducerState(output_seq[-1])]
     return output_seq
Esempio n. 29
0
 def embed_factor_sent(self, x, speech_len):
     """Embed a sentence, or a batch of sentences, with ``embed_factor``.

     Args:
       x: a single sentence (iterated word by word) or a batch of sentences
          that all report the same ``sent_len()``.
       speech_len: number of positions to embed in batch mode. It is used
                   instead of the sentences' own length and is ignored for a
                   single sentence.
     Returns:
       An expression sequence over the embeddings, carrying ``x.mask`` in
       batch mode and no mask otherwise.
     """
     is_batch = batchers.is_batched(x)
     if is_batch:
         # Minibatch mode: all sentences must agree on their length.
         expected_len = x.sent_len()
         for single_sent in x:
             assert single_sent.sent_len() == expected_len
         # Positions run up to speech_len, not up to the sentence length.
         embeddings = [
             self.embed_factor(
                 batchers.mark_as_batch([sent[pos] for sent in x]))
             for pos in range(speech_len)
         ]
     else:
         # Single mode: one embedding per word.
         embeddings = [self.embed_factor(word) for word in x]
     return expression_seqs.ExpressionSequence(
         expr_list=embeddings, mask=x.mask if is_batch else None)
Esempio n. 30
0
    def transduce(
        self, seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """Add the child transducer's output to its input (residual link).

        In training mode with a positive dropout rate, dropout is applied to
        the child's output before the addition. If ``self.layer_norm`` is set,
        the sum is passed through ``self.layer_norm_component`` with the time
        and batch dimensions merged for the duration of the transform.

        Args:
            seq: the input expression sequence.

        Returns:
            A new expression sequence over the resulting tensor; no mask is
            attached to it.
        """
        use_dropout = self.train and self.dropout > 0.0
        child_out = self.child.transduce(seq).as_tensor()
        if use_dropout:
            child_out = tt.dropout(child_out, self.dropout)
        seq_tensor = child_out + seq.as_tensor()

        if self.layer_norm:
            n_batch = tt.batch_size(seq_tensor)
            flat = tt.merge_time_batch_dims(seq_tensor)
            normed = self.layer_norm_component.transform(flat)
            seq_tensor = tt.unmerge_time_batch_dims(normed, n_batch)
        return expression_seqs.ExpressionSequence(expr_tensor=seq_tensor)