Example #1
 def transduce(
     self, xs: 'expression_seqs.ExpressionSequence'
 ) -> 'expression_seqs.ExpressionSequence':
     Wx = dy.parameter(self.p_Wx)
     Wh = dy.parameter(self.p_Wh)
     b = dy.parameter(self.p_b)
     h = []
     c = []
     for i, x_t in enumerate(xs):
         if i == 0:
             tmp = dy.affine_transform([b, Wx, x_t])
         else:
             tmp = dy.affine_transform([b, Wx, x_t, Wh, h[-1]])
         i_ait = dy.pick_range(tmp, 0, self.hidden_dim)
         i_aft = dy.pick_range(tmp, self.hidden_dim, self.hidden_dim * 2)
         i_aot = dy.pick_range(tmp, self.hidden_dim * 2,
                               self.hidden_dim * 3)
         i_agt = dy.pick_range(tmp, self.hidden_dim * 3,
                               self.hidden_dim * 4)
         i_it = dy.logistic(i_ait)
         i_ft = dy.logistic(i_aft + 1.0)
         i_ot = dy.logistic(i_aot)
         i_gt = dy.tanh(i_agt)
         if i == 0:
             c.append(dy.cmult(i_it, i_gt))
         else:
             c.append(dy.cmult(i_ft, c[-1]) + dy.cmult(i_it, i_gt))
         h.append(dy.cmult(i_ot, dy.tanh(c[-1])))
     return expression_seqs.ExpressionSequence(expr_list=h)
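
The example above packs all four gate pre-activations into one affine transform and slices them apart with dy.pick_range. Below is a minimal standalone sketch of a single such step, assuming DyNet is installed; the dimensions and inputs are illustrative and not taken from the example:

import dynet as dy
import numpy as np

hidden_dim, input_dim = 4, 3
pc = dy.ParameterCollection()
p_Wx = pc.add_parameters((hidden_dim * 4, input_dim))   # input weights for i/f/o/g, stacked
p_Wh = pc.add_parameters((hidden_dim * 4, hidden_dim))  # recurrent weights for i/f/o/g, stacked
p_b = pc.add_parameters((hidden_dim * 4,))

dy.renew_cg()
Wx, Wh, b = dy.parameter(p_Wx), dy.parameter(p_Wh), dy.parameter(p_b)
x_t = dy.inputTensor(np.random.rand(input_dim))
h_prev = dy.inputTensor(np.zeros(hidden_dim))
c_prev = dy.inputTensor(np.zeros(hidden_dim))

# one affine transform yields all gate pre-activations; slice them apart
tmp = dy.affine_transform([b, Wx, x_t, Wh, h_prev])
i_t = dy.logistic(dy.pick_range(tmp, 0, hidden_dim))
f_t = dy.logistic(dy.pick_range(tmp, hidden_dim, hidden_dim * 2) + 1.0)  # forget-gate bias of +1, as above
o_t = dy.logistic(dy.pick_range(tmp, hidden_dim * 2, hidden_dim * 3))
g_t = dy.tanh(dy.pick_range(tmp, hidden_dim * 3, hidden_dim * 4))
c_t = dy.cmult(f_t, c_prev) + dy.cmult(i_t, g_t)
h_t = dy.cmult(o_t, dy.tanh(c_t))
print(h_t.npvalue())
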
Example #2
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        if expr_seq.dim()[1] > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}"
            )
        lattice = self.cur_src[0]
        Wx_iog = dy.parameter(self.p_Wx_iog)
        Wh_iog = dy.parameter(self.p_Wh_iog)
        b_iog = dy.parameter(self.p_b_iog)
        Wx_f = dy.parameter(self.p_Wx_f)
        Wh_f = dy.parameter(self.p_Wh_f)
        b_f = dy.parameter(self.p_b_f)
        h = []
        c = []

        batch_size = expr_seq.dim()[1]
        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        for node_i in range(lattice.sent_len()):
            cur_node = lattice.nodes[node_i]
            val = expr_seq[node_i]
            if self.dropout_rate > 0.0 and self.train:
                val = dy.cmult(val, self.dropout_mask_x)
            i_ft_list = []
            if len(cur_node.nodes_prev) == 0:
                tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
            else:
                h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
                tmp_iog = dy.affine_transform(
                    [b_iog, Wx_iog, val, Wh_iog, h_tilde])
                for pred in cur_node.nodes_prev:
                    i_ft_list.append(
                        dy.logistic(
                            dy.affine_transform(
                                [b_f, Wx_f, val, Wh_f, h[pred]])))
            i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
            i_aot = dy.pick_range(tmp_iog, self.hidden_dim,
                                  self.hidden_dim * 2)
            i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2,
                                  self.hidden_dim * 3)

            i_it = dy.logistic(i_ait)
            i_ot = dy.logistic(i_aot)
            i_gt = dy.tanh(i_agt)
            if len(cur_node.nodes_prev) == 0:
                c.append(dy.cmult(i_it, i_gt))
            else:
                fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
                for i in range(1, len(cur_node.nodes_prev)):
                    fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
                c.append(fc + dy.cmult(i_it, i_gt))
            h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
            if self.dropout_rate > 0.0 and self.train:
                h_t = dy.cmult(h_t, self.dropout_mask_h)
            h.append(h_t)
        self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
        return expression_seqs.ExpressionSequence(expr_list=h)
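
For reference, the cell update implemented above generalizes the chain LSTM by keeping a separate forget gate for every predecessor node in the lattice and summing their contributions (the sum is empty at source nodes):

c_t = \sum_{p \in \mathrm{pred}(t)} f_{t,p} \odot c_p + i_t \odot g_t, \qquad h_t = o_t \odot \tanh(c_t)
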
Example #3
 def transduce(self,
               embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
     batch_size = embed_sent[0].dim()[1]
     actions = self.sample_segmentation(embed_sent, batch_size)
     sample_size = len(actions)
     embeddings = dy.concatenate(embed_sent.expr_list, d=1)
     embeddings.value()  # forces evaluation of the concatenated embeddings
     composed_words = []
     for i in range(batch_size):
         sequence = dy.pick_batch_elem(embeddings, i)
         # For each sampled segmentation
         for j, sample in enumerate(actions):
             lower_bound = 0
             # Read every 'segment' decision
             for k, upper_bound in enumerate(sample[i]):
                 char_sequence = dy.pick_range(sequence, lower_bound,
                                               upper_bound + 1, 1)
                 composed_words.append(
                     (char_sequence, j, i, k, lower_bound, upper_bound + 1))
                 #self.segment_composer.set_word_boundary(lower_bound, upper_bound, self.src_sent[i])
                 #composed = self.segment_composer.transduce(char_sequence)
                 #outputs[j][i].append(composed)
                 lower_bound = upper_bound + 1
     outputs = self.segment_composer.compose(composed_words, sample_size,
                                             batch_size)
     # Padding + return
     try:
         if self.length_prior:
             seg_size_unpadded = [[
                 len(outputs[i][j]) for j in range(batch_size)
             ] for i in range(sample_size)]
         enc_outputs = []
         for batched_sampled_sentence in outputs:
             sampled_sentence, segment_mask = self.pad(
                 batched_sampled_sentence)
             expr_seq = ExpressionSequence(
                 expr_tensor=dy.concatenate_to_batch(sampled_sentence),
                 mask=segment_mask)
             sent_context = self.final_transducer.transduce(expr_seq)
             self.final_states.append(
                 self.final_transducer.get_final_states())
             enc_outputs.append(sent_context)
         return CompoundSeqExpression(enc_outputs)
     finally:
         if self.length_prior:
             self.seg_size_unpadded = seg_size_unpadded
         self.compose_output = outputs
         self.segment_actions = actions
         if not self.train and self.compute_report:
             self.add_sent_for_report({"segment_actions": actions})
Example #4
    def transduce(
        self, embed_sent: expr_seq.ExpressionSequence
    ) -> List[expr_seq.ExpressionSequence]:
        self.create_trajectories(embed_sent, force_oracle=False)
        actions = [np.nonzero(a.content)[0] for a in self.actions]
        actions = [[
            a for a in actions[i] if a < self.src_sents[i].len_unpadded()
        ] for i in range(len(actions))]

        # Create sentence embedding
        outputs = []
        embeddings = dy.concatenate(embed_sent.expr_list, d=1)
        for i in range(self.src_sents.batch_size()):
            sequence = dy.pick_batch_elem(embeddings, i)
            src = self.src_sents[i]
            lower_bound = 0
            output = []
            for j, upper_bound in enumerate(actions[i]):
                # build the character slice only when character embeddings are used
                char_sequence = None if self.no_char_embed else dy.pick_range(
                    sequence, lower_bound, upper_bound + 1, 1)
                output.append(
                    self.segment_composer.compose_single(
                        char_sequence, src, lower_bound, upper_bound + 1))
                lower_bound = upper_bound + 1
            outputs.append(output)

        outputs = pad_output()

        return self.final_transducer.transduce(outputs)
Example #5
  def transduce(self, es: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    mask = es.mask
    sent_len = len(es)
    es_expr = es.as_transposed_tensor()
    batch_size = es_expr.dim()[1]

    es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size)

    h_out = {}
    for direction in ["fwd", "bwd"]:
      # input convolutions
      gates_xt_bias = dy.conv2d_bias(es_chn, dy.parameter(self.params["x2all_" + direction]),
                                     dy.parameter(self.params["b_" + direction]), stride=(1, 1), is_valid=False)
      gates_xt_bias_list = [dy.pick_range(gates_xt_bias, i, i + 1) for i in range(sent_len)]

      h = []
      c = []
      for input_pos in range(sent_len):
        directional_pos = input_pos if direction == "fwd" else sent_len - input_pos - 1
        gates_t = gates_xt_bias_list[directional_pos]
        if input_pos > 0:
          # recurrent convolutions
          gates_h_t = dy.conv2d(h[-1], dy.parameter(self.params["h2all_" + direction]), stride=(1, 1), is_valid=False)
          gates_t += gates_h_t

        # standard LSTM logic
        if len(c) == 0:
          c_tm1 = dy.zeros((self.freq_dim * self.num_filters,), batch_size=batch_size)
        else:
          c_tm1 = c[-1]
        gates_t_reshaped = dy.reshape(gates_t, (4 * self.freq_dim * self.num_filters,), batch_size=batch_size)
        c_t = dy.reshape(dy.vanilla_lstm_c(c_tm1, gates_t_reshaped), (self.freq_dim * self.num_filters,),
                         batch_size=batch_size)
        h_t = dy.vanilla_lstm_h(c_t, gates_t_reshaped)
        h_t = dy.reshape(h_t, (1, self.freq_dim, self.num_filters,), batch_size=batch_size)

        if mask is None or np.isclose(np.sum(mask.np_arr[:, input_pos:input_pos + 1]), 0.0):
          c.append(c_t)
          h.append(h_t)
        else:
          c.append(
            mask.cmult_by_timestep_expr(c_t, input_pos, True) + mask.cmult_by_timestep_expr(c[-1], input_pos, False))
          h.append(
            mask.cmult_by_timestep_expr(h_t, input_pos, True) + mask.cmult_by_timestep_expr(h[-1], input_pos, False))

      h_out[direction] = h
    ret_expr = []
    for state_i in range(len(h_out["fwd"])):
      state_fwd = h_out["fwd"][state_i]
      state_bwd = h_out["bwd"][-1 - state_i]
      output_dim = (state_fwd.dim()[0][1] * state_fwd.dim()[0][2],)
      fwd_reshape = dy.reshape(state_fwd, output_dim, batch_size=batch_size)
      bwd_reshape = dy.reshape(state_bwd, output_dim, batch_size=batch_size)
      ret_expr.append(dy.concatenate([fwd_reshape, bwd_reshape], d=0 if self.reshape_output else 2))
    return expression_seqs.ExpressionSequence(expr_list=ret_expr, mask=mask)

  # TODO: implement get_final_states()
Example #6
 def transduce(self,
               embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
     batch_size = embed_sent[0].dim()[1]
     actions = self.sample_segmentation(embed_sent, batch_size)
     embeddings = dy.concatenate(embed_sent.expr_list, d=1)
     embeddings.value()  # forces evaluation of the concatenated embeddings
     composed_words = []
     for i in range(batch_size):
         sequence = dy.pick_batch_elem(embeddings, i)
         # Read the sampled segmentation decisions for this batch element
         lower_bound = 0
         for j, upper_bound in enumerate(actions[i]):
             if self.no_char_embed:
                 char_sequence = []
             else:
                 char_sequence = dy.pick_range(sequence, lower_bound,
                                               upper_bound + 1, 1)
             composed_words.append(
                 (char_sequence, i, j, lower_bound, upper_bound + 1))
             lower_bound = upper_bound + 1
     outputs = self.segment_composer.compose(composed_words, batch_size)
     # Padding + return
     try:
         if self.length_prior:
             seg_size_unpadded = [
                 len(outputs[i]) for i in range(batch_size)
             ]
         sampled_sentence, segment_mask = self.pad(outputs)
         expr_seq = ExpressionSequence(
             expr_tensor=dy.concatenate_to_batch(sampled_sentence),
             mask=segment_mask)
         return self.final_transducer.transduce(expr_seq)
     finally:
         if self.length_prior:
             self.seg_size_unpadded = seg_size_unpadded
         self.compose_output = outputs
         self.segment_actions = actions
         if not self.train and self.is_reporting():
             if len(actions) == 1:  # Support only AccuracyEvalTask
                 self.report_sent_info({"segment_actions": actions})
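
A minimal sketch of how dy.pick_range with the trailing dimension argument slices one segment of character embeddings out of an (emb_dim, sent_len) matrix, as done above; shapes are illustrative and not tied to the example:

import dynet as dy
import numpy as np

emb_dim, sent_len = 4, 7
dy.renew_cg()
# embeddings of one sentence: characters laid out along dimension 1
sequence = dy.inputTensor(np.random.rand(emb_dim, sent_len))
lower_bound, upper_bound = 2, 4  # one segment covering character positions 2..4
char_sequence = dy.pick_range(sequence, lower_bound, upper_bound + 1, 1)
print(char_sequence.dim())  # ((4, 3), 1)
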
Example #7
    def transduce(self, h_below: 'expression_seqs.ExpressionSequence', h_above,
                  z_below) -> 'expression_seqs.ExpressionSequence':
        if self.c is None:
            self.c = dy.zeroes(dim=(self.hidden_dim,))  # ?? does (hidden,) take care of batch_size?
        if self.h is None:
            self.h = dy.zeroes(dim=(self.hidden_dim,))
        if self.z is None:
            self.z = dy.ones(dim=(1,))

        W_1l_r = dy.parameter(self.p_W_1l_r)
        bias = dy.parameter(self.p_bias)
        h = self.h  # previous hidden state (an expression, not a parameter)

        s_recur = W_1l_r * h  #matrix multiply is *, element-wise is dy.cmult. CURRERROR: stale expression
        if not self.last_layer:
            W_2l_td = dy.parameter(self.p_W_2l_td)
            W_0l_bu = dy.parameter(self.p_W_0l_bu)
            s_bottomup = W_0l_bu * h_below  #?? this is becoming (2049,). does it need to be (2049,1) to do scalar * matrix?
            s_topdown = W_2l_td * h_above
        else:
            s_topdown = dy.zeroes(
                s_recur.dim()[0][0],
            )  #?? this gets the shape e.g. ((5, 1), 1). do i actually want batch_size as well?
            s_bottomup = W_1l_r * h
        s_bottomup = dy.cmult(
            z_below, s_bottomup
        )  #to handle batched scalar * matrix -> e.g. (1x10, 2049x10)
        s_topdown = dy.cmult(
            self.z, s_topdown
        )  #will be zeros if last_layer. is this right, or should z=1 in this case ??

        fslice = s_recur + s_topdown + s_bottomup + bias  #?? checkme. bias has same shape as s_recur et al? [4*hidden+1, batch_size]?

        i_ft = dy.pick_range(fslice, 0, self.hidden_dim)
        i_it = dy.pick_range(fslice, self.hidden_dim, self.hidden_dim * 2)
        i_ot = dy.pick_range(fslice, self.hidden_dim * 2, self.hidden_dim * 3)
        i_gt = dy.pick_range(fslice, self.hidden_dim * 3, self.hidden_dim * 4)
        f_t = dy.logistic(
            i_ft + 1.0
        )  #+1.0 bc a paper said it was better to init that way (matthias)
        i_t = dy.logistic(i_it)
        o_t = dy.logistic(i_ot)
        g_t = dy.tanh(i_gt)

        #z * normal_update + (1-z)*copy: ie, when z_below is 0, z_new = z (copied prev timestamp). when z_below is 1, z_new = dy.round etc

        #hier = True
        #        z_tmp = dy.pick_range(fslice, self.hidden_dim*4,self.hidden_dim*4+1)
        #        z_tilde = dy.logistic(z_tmp)  #original: hard sigmoid + slope annealing (a)
        #        z_new = dy.cmult(1-z_below, self.z) + dy.cmult(z_below, dy.round(z_tilde, gradient_mode="straight_through_gradient"))

        #hier = False
        z_tmp = dy.pick_range(fslice, self.hidden_dim * 4,
                              self.hidden_dim * 4 + 1)
        z_tilde = dy.logistic(
            z_tmp)  #original: hard sigmoid + slope annealing (a)
        z_new = dy.round(
            z_tilde, gradient_mode="straight_through_gradient"
        )  #use straight-through estimator for gradient: step fn forward, hard sigmoid backward

        #z = z_l,t-1
        #z_below = z_l-1,t

        #        if self.z.value() == 1: #FLUSH
        #            c_new = dy.cmult(i_t, g_t)
        #            h_new = dy.cmult(o_t, dy.tanh(c_new))
        #        elif z_below.value() == 0: #COPY

        # if flush removed, only copy or normal update
        # when z_below is 0, c_new and h_new are self.c and self.h. when z_below is 1, c_new, h_new = normal update
        c_new = dy.cmult((1 - z_below), self.c) + dy.cmult(
            z_below, (dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)))
        h_new = dy.cmult((1 - z_below), self.h) + dy.cmult(
            z_below, dy.cmult(o_t, dy.tanh(c_new)))

        #        if z_below.value() == 0: #COPY
        #            c_new = self.c
        #            h_new = self.h
        #        else: #UPDATE
        #            c_new = dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)
        #            h_new = dy.cmult(o_t, dy.tanh(c_new))

        self.c = c_new
        self.h = h_new
        self.z = z_new

        return h_new, z_new
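
The boundary variable z above is binarized with a straight-through estimator: dy.round returns a hard 0/1 value in the forward pass, while gradient_mode="straight_through_gradient" lets gradients flow back as if the rounding were the identity. A minimal sketch using the same call as the code above:

import dynet as dy

dy.renew_cg()
z_pre = dy.inputTensor([0.3])   # some pre-activation for the boundary unit
z_tilde = dy.logistic(z_pre)    # soft boundary probability, ~0.57
z_hard = dy.round(z_tilde, gradient_mode="straight_through_gradient")
print(z_hard.npvalue())         # [1.]; gradients still reach z_pre
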
Example #8
    def __call__(self, x: dy.Expression, att_mask: np.ndarray,
                 batch_mask: np.ndarray, p: numbers.Real):
        """
    x: expression of dimensions (input_dim, time) x batch
    att_mask: numpy array of dimensions (time, time); pre-transposed
    batch_mask: numpy array of dimensions (batch, time)
    p: dropout prob
    """
        sent_len = x.dim()[0][1]
        batch_size = x[0].dim()[1]

        if self.downsample_factor > 1:
            if sent_len % self.downsample_factor != 0:
                raise ValueError(
                    "For 'reshape' downsampling, sequence lengths must be multiples of the downsampling factor. "
                    "Configure batcher accordingly.")
            if batch_mask is not None:
                batch_mask = batch_mask[:, ::self.downsample_factor]
            sent_len_out = sent_len // self.downsample_factor
            sent_len = sent_len_out
            out_mask = x.mask
            if self.downsample_factor > 1 and out_mask is not None:
                out_mask = out_mask.lin_subsampled(
                    reduce_factor=self.downsample_factor)

            x = ExpressionSequence(expr_tensor=dy.reshape(
                x.as_tensor(), (x.dim()[0][0] * self.downsample_factor,
                                x.dim()[0][1] // self.downsample_factor),
                batch_size=batch_size),
                                   mask=out_mask)
            residual = SAAMTimeDistributed()(x)
        else:
            residual = SAAMTimeDistributed()(x)
            sent_len_out = sent_len
        if self.model_dim != self.input_dim * self.downsample_factor:
            residual = self.res_shortcut.transform(residual)

        # Concatenate all the words together for doing vectorized affine transform
        if self.kq_pos_encoding_type is None:
            kvq_lin = self.linear_kvq.transform(SAAMTimeDistributed()(x))
            key_up = self.shape_projection(
                dy.pick_range(kvq_lin, 0, self.head_count * self.dim_per_head),
                batch_size)
            value_up = self.shape_projection(
                dy.pick_range(kvq_lin, self.head_count * self.dim_per_head,
                              2 * self.head_count * self.dim_per_head),
                batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kvq_lin, 2 * self.head_count * self.dim_per_head,
                              3 * self.head_count * self.dim_per_head),
                batch_size)
        else:
            assert self.kq_pos_encoding_type == "embedding"
            encoding = self.kq_positional_embedder.embed_sent(
                sent_len).as_tensor()
            kq_lin = self.linear_kq.transform(SAAMTimeDistributed()(
                ExpressionSequence(
                    expr_tensor=dy.concatenate([x.as_tensor(), encoding]))))
            key_up = self.shape_projection(
                dy.pick_range(kq_lin, 0, self.head_count * self.dim_per_head),
                batch_size)
            query_up = self.shape_projection(
                dy.pick_range(kq_lin, self.head_count * self.dim_per_head,
                              2 * self.head_count * self.dim_per_head),
                batch_size)
            v_lin = self.linear_v.transform(SAAMTimeDistributed()(x))
            value_up = self.shape_projection(v_lin, batch_size)

        if self.cross_pos_encoding_type:
            assert self.cross_pos_encoding_type == "embedding"
            emb1 = dy.pick_range(dy.parameter(self.cross_pos_emb_p1), 0,
                                 sent_len)
            emb2 = dy.pick_range(dy.parameter(self.cross_pos_emb_p2), 0,
                                 sent_len)
            key_up = dy.reshape(key_up,
                                (sent_len, self.dim_per_head, self.head_count),
                                batch_size=batch_size)
            key_up = dy.concatenate_cols(
                [dy.cmult(key_up, emb1),
                 dy.cmult(key_up, emb2)])
            key_up = dy.reshape(key_up, (sent_len, self.dim_per_head * 2),
                                batch_size=self.head_count * batch_size)
            query_up = dy.reshape(
                query_up, (sent_len, self.dim_per_head, self.head_count),
                batch_size=batch_size)
            query_up = dy.concatenate_cols(
                [dy.cmult(query_up, emb2),
                 dy.cmult(query_up, -emb1)])
            query_up = dy.reshape(query_up, (sent_len, self.dim_per_head * 2),
                                  batch_size=self.head_count * batch_size)

        scaled = query_up * dy.transpose(
            key_up / math.sqrt(self.dim_per_head)
        )  # scale before the matrix multiplication to save memory

        # Apply Mask here
        if not self.ignore_masks:
            if att_mask is not None:
                att_mask_inp = att_mask * -100.0
                if self.downsample_factor > 1:
                    att_mask_inp = att_mask_inp[::self.downsample_factor, ::
                                                self.downsample_factor]
                scaled += dy.inputTensor(att_mask_inp)
            if batch_mask is not None:
                # reshape (batch, time) -> (time, head_count*batch), then *-100
                inp = np.resize(np.broadcast_to(batch_mask.T[:, np.newaxis, :],
                                                (sent_len, self.head_count, batch_size)),
                                (1, sent_len, self.head_count * batch_size)) \
                      * -100
                mask_expr = dy.inputTensor(inp, batched=True)
                scaled += mask_expr
            if self.diag_gauss_mask:
                diag_growing = np.zeros((sent_len, sent_len, self.head_count))
                for i in range(sent_len):
                    for j in range(sent_len):
                        diag_growing[i, j, :] = -(i - j)**2 / 2.0
                e_diag_gauss_mask = dy.inputTensor(diag_growing)
                e_sigma = dy.parameter(self.diag_gauss_mask_sigma)
                if self.square_mask_std:
                    e_sigma = dy.square(e_sigma)
                e_sigma_sq_inv = dy.cdiv(
                    dy.ones(e_sigma.dim()[0], batch_size=batch_size),
                    dy.square(e_sigma))
                e_diag_gauss_mask_final = dy.cmult(e_diag_gauss_mask,
                                                   e_sigma_sq_inv)
                scaled += dy.reshape(e_diag_gauss_mask_final,
                                     (sent_len, sent_len),
                                     batch_size=batch_size * self.head_count)

        # Computing Softmax here.
        attn = dy.softmax(scaled, d=1)
        if LOG_ATTENTION:
            yaml_logger.info({
                "key": "selfatt_mat_ax0",
                "value": np.average(attn.value(), axis=0).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1",
                "value": np.average(attn.value(), axis=1).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax0_ent",
                "value": entropy(attn.value()).dumps(),
                "desc": self.desc
            })
            yaml_logger.info({
                "key": "selfatt_mat_ax1_ent",
                "value": entropy(attn.value().transpose()).dumps(),
                "desc": self.desc
            })

        self.select_att_head = 0
        if self.select_att_head is not None:
            attn = dy.reshape(attn, (sent_len, sent_len, self.head_count),
                              batch_size=batch_size)
            sel_mask = np.zeros((1, 1, self.head_count))
            sel_mask[0, 0, self.select_att_head] = 1.0
            attn = dy.cmult(attn, dy.inputTensor(sel_mask))
            attn = dy.reshape(attn, (sent_len, sent_len),
                              batch_size=self.head_count * batch_size)

        # Applying dropout to attention
        if p > 0.0:
            drop_attn = dy.dropout(attn, p)
        else:
            drop_attn = attn

        # Computing weighted attention score
        attn_prod = drop_attn * value_up

        # Reshaping the attn_prod to input query dimensions
        out = dy.reshape(attn_prod,
                         (sent_len_out, self.dim_per_head * self.head_count),
                         batch_size=batch_size)
        out = dy.transpose(out)
        out = dy.reshape(out, (self.model_dim, ),
                         batch_size=batch_size * sent_len_out)
        #     out = dy.reshape_transpose_reshape(attn_prod, (sent_len_out, self.dim_per_head * self.head_count), (self.model_dim,), pre_batch_size=batch_size, post_batch_size=batch_size*sent_len_out)

        if self.plot_attention:
            from sklearn.metrics.pairwise import cosine_similarity
            assert batch_size == 1
            mats = []
            for i in range(attn.dim()[1]):
                mats.append(dy.pick_batch_elem(attn, i).npvalue())
                self.plot_att_mat(
                    mats[-1], "{}.sent_{}.head_{}.png".format(
                        self.plot_attention, self.plot_attention_counter, i),
                    300)
            avg_mat = np.average(mats, axis=0)
            self.plot_att_mat(
                avg_mat,
                "{}.sent_{}.head_avg.png".format(self.plot_attention,
                                                 self.plot_attention_counter),
                300)
            cosim_before = cosine_similarity(x.as_tensor().npvalue().T)
            self.plot_att_mat(
                cosim_before, "{}.sent_{}.cosim_before.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            cosim_after = cosine_similarity(out.npvalue().T)
            self.plot_att_mat(
                cosim_after, "{}.sent_{}.cosim_after.png".format(
                    self.plot_attention, self.plot_attention_counter), 600)
            self.plot_attention_counter += 1

        # Adding dropout and layer normalization
        if p > 0.0:
            res = dy.dropout(out, p) + residual
        else:
            res = out + residual
        ret = self.layer_norm.transform(res)
        return ret
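
Stripped of masking, multi-head reshaping, and positional terms, the core of the routine above is scaled dot-product attention. A minimal standalone sketch (single head, batch size 1, illustrative shapes):

import math
import dynet as dy
import numpy as np

sent_len, dim_per_head = 5, 8
dy.renew_cg()
Q = dy.inputTensor(np.random.rand(sent_len, dim_per_head))
K = dy.inputTensor(np.random.rand(sent_len, dim_per_head))
V = dy.inputTensor(np.random.rand(sent_len, dim_per_head))

# scale the keys before the matrix product, as the code above does, to save memory
scaled = Q * dy.transpose(K / math.sqrt(dim_per_head))  # (sent_len, sent_len) attention logits
attn = dy.softmax(scaled, d=1)                          # normalize over the key positions
out = attn * V                                          # (sent_len, dim_per_head) weighted values
print(out.dim())
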
Example #9
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        if expr_seq.batch_size() > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.batch_size()}"
            )
        lattice = self.cur_src[0]
        Wx_iog = dy.parameter(self.p_Wx_iog)
        Wh_iog = dy.parameter(self.p_Wh_iog)
        b_iog = dy.parameter(self.p_b_iog)
        Wx_f = dy.parameter(self.p_Wx_f)
        Wh_f = dy.parameter(self.p_Wh_f)
        b_f = dy.parameter(self.p_b_f)
        h = {}
        c = {}
        h_list = []

        batch_size = expr_seq.batch_size()
        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        for i, cur_node_id in enumerate(lattice.nodes):
            prev_node = lattice.graph.predecessors(cur_node_id)
            val = expr_seq[i]
            if self.dropout_rate > 0.0 and self.train:
                val = dy.cmult(val, self.dropout_mask_x)
            i_ft_list = []
            if len(prev_node) == 0:
                tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
            else:
                h_tilde = sum(h[pred] for pred in prev_node)
                tmp_iog = dy.affine_transform(
                    [b_iog, Wx_iog, val, Wh_iog, h_tilde])
                for pred in prev_node:
                    i_ft_list.append(
                        dy.logistic(
                            dy.affine_transform(
                                [b_f, Wx_f, val, Wh_f, h[pred]])))
            i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
            i_aot = dy.pick_range(tmp_iog, self.hidden_dim,
                                  self.hidden_dim * 2)
            i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2,
                                  self.hidden_dim * 3)

            i_it = dy.logistic(i_ait)
            i_ot = dy.logistic(i_aot)
            i_gt = dy.tanh(i_agt)
            if len(prev_node) == 0:
                c[cur_node_id] = dy.cmult(i_it, i_gt)
            else:
                fc = dy.cmult(i_ft_list[0], c[prev_node[0]])
                for i in range(1, len(prev_node)):
                    fc += dy.cmult(i_ft_list[i], c[prev_node[i]])
                c[cur_node_id] = fc + dy.cmult(i_it, i_gt)
            h_t = dy.cmult(i_ot, dy.tanh(c[cur_node_id]))
            if self.dropout_rate > 0.0 and self.train:
                h_t = dy.cmult(h_t, self.dropout_mask_h)
            h[cur_node_id] = h_t
            h_list.append(h_t)
        self._final_states = [
            transducers.FinalTransducerState(h_list[-1], h_list[-1])
        ]
        return expression_seqs.ExpressionSequence(expr_list=h_list)
Example #10
File: lstm.py  Project: anhad13/xnmt
    def __call__(self, es, transitions):
        mask = es.mask
        #import pdb;pdb.set_trace()
        transitions = [t + [0, 1] for t in transitions]
        transitions = np.array(transitions)
        maxlen = max(len(r) for r in transitions)
        Wl = dy.parameter(self.p_Wl)
        Wr = dy.parameter(self.p_Wr)
        b = dy.parameter(self.p_b)
        batch_size = len(transitions)
        ha = []
        c = []
        self.hfinals = []
        hfinal_state = None
        cfinal_state = None
        self.cfinals = []
        for i in range(batch_size):
            hstack = []
            cstack = []
            htmp = []
            count = 0
            for j in range(len(transitions[i])):
                if transitions[i][j] == 0:
                    #print("Shift")
                    #shift onto stack
                    e1 = dy.reshape(es[count],
                                    (batch_size, self.hidden_dim))[i]
                    count += 1
                    hstack.append(e1)
                    cstack.append(e1)
                elif transitions[i][j] == 1:
                    #reduce
                    #print("Reduce")
                    h1 = hstack.pop()
                    h2 = hstack.pop()
                    c1 = cstack.pop()
                    c2 = cstack.pop()
                    tmp = dy.affine_transform([b, Wl, h1, Wr, h2])
                    i_gate = dy.pick_range(tmp, 0, self.hidden_dim)
                    fl_gate = dy.pick_range(tmp, self.hidden_dim,
                                            self.hidden_dim * 2)
                    fr_gate = dy.pick_range(tmp, self.hidden_dim * 2,
                                            self.hidden_dim * 3)
                    o_gate = dy.pick_range(tmp, self.hidden_dim * 3,
                                           self.hidden_dim * 4)
                    cell_inp = dy.pick_range(tmp, self.hidden_dim * 4,
                                             self.hidden_dim * 5)
                    i_gate = dy.logistic(i_gate)  # input gate: sigmoid
                    cell_inp = dy.tanh(cell_inp)  # candidate cell value: tanh
                    fl_gate = dy.logistic(fl_gate)
                    fr_gate = dy.logistic(fr_gate)
                    o_gate = dy.logistic(o_gate)
                    c_t = dy.cmult(fl_gate, c1) + dy.cmult(
                        fr_gate, c2) + dy.cmult(i_gate, cell_inp)
                    h_t = dy.cmult(o_gate, dy.tanh(c_t))
                    cstack.append(c_t)
                    hstack.append(h_t)
                    htmp.append(h_t)
                    hfinal_state = h_t
                    cfinal_state = c_t
                else:
                    htmp.append(dy.zeros(self.hidden_dim))
            self.hfinals.append(h_t)
            self.cfinals.append(c_t)
            ha.append(htmp)

        self._final_states = [
            FinalTransducerState(dy.concatenate_to_batch(self.hfinals),
                                 dy.concatenate_to_batch(self.cfinals))
        ]
        ha = list(zip_longest(*ha))
        hh = []
        for x in ha:
            hh.append(list(x))
        k = [
            dy.reshape(dy.concatenate(xx), (xx[0].dim()[0][0], len(xx)))
            for xx in hh
        ]
        return ExpressionSequence(expr_list=k)
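
For reference, the reduce step above implements a binary Tree-LSTM composition with one forget gate per child:

c = f_l \odot c_l + f_r \odot c_r + i \odot g, \qquad h = o \odot \tanh(c)

where i, f_l, f_r, o are sigmoid gates and g is the tanh cell input, all computed from a single affine transform over the two children's hidden states h_l and h_r.
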