Example 1
 def set_dropout_masks(self, batch_size=1):
   if self.dropout_rate > 0.0 and self.train:
     retention_rate = 1.0 - self.dropout_rate
     scale = 1.0 / retention_rate  # inverted dropout: pre-scale the surviving units
     # One mask per layer, sampled once and reused at every timestep (variational dropout).
     self.dropout_mask_x = [dy.random_bernoulli((self.input_dim,), retention_rate, scale, batch_size=batch_size)]
     self.dropout_mask_x += [dy.random_bernoulli((self.hidden_dim,), retention_rate, scale, batch_size=batch_size) for _ in range(1, self.num_layers)]
     self.dropout_mask_h = [dy.random_bernoulli((self.hidden_dim,), retention_rate, scale, batch_size=batch_size) for _ in range(self.num_layers)]
Example 2
 def set_dropout_masks(self, batch_size: numbers.Integral = 1) -> None:
     if self.dropout_rate > 0.0 and self.train:
         retention_rate = 1.0 - self.dropout_rate
         scale = 1.0 / retention_rate
         self.dropout_mask_x = dy.random_bernoulli((self.input_dim, ),
                                                   retention_rate,
                                                   scale,
                                                   batch_size=batch_size)
         self.dropout_mask_h = dy.random_bernoulli((self.hidden_dim, ),
                                                   retention_rate,
                                                   scale,
                                                   batch_size=batch_size)
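Examples 1 and 2 only sample the masks; they are applied later by element-wise multiplication against inputs and hidden states, as Examples 5 and 6 below do with cmult. A minimal self-contained sketch of the same pattern (dimensions and values here are hypothetical, assuming a working DyNet install):

    import dynet as dy

    # Minimal sketch: sample one inverted-dropout mask, then reuse it at
    # every timestep by element-wise multiplication.
    dy.renew_cg()
    hidden_dim, dropout_rate = 8, 0.3
    retention = 1.0 - dropout_rate
    mask = dy.random_bernoulli((hidden_dim,), retention, scale=1.0 / retention)

    h = dy.inputVector([0.5] * hidden_dim)  # stand-in for a recurrent state
    for _ in range(3):                      # the same mask is applied at each step
        h = dy.tanh(dy.cmult(mask, h))
    print(h.npvalue())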
Example 3
 def _fast_sample(self, prob, temperature=1):
     # Cheap stochastic sampling: each of the 256 probabilities is multiplied
     # by a random factor of either (1 - temperature/2) or 1, then the argmax
     # of the perturbed vector is returned.
     temperature = temperature / 2.0
     bern = dy.random_bernoulli(256, 0.5,
                                scale=temperature) + (1.0 - temperature)
     prob = dy.cmult(prob, bern)
     return prob.npvalue().argmax()
Example 4
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence (will be accessed via tensor_expr)
    Return:
      expression sequence
    """

        if isinstance(expr_seq, list):
            mask_out = expr_seq[0].mask
            seq_len = len(expr_seq[0])
            batch_size = expr_seq[0].dim()[1]
            tensors = [e.as_tensor() for e in expr_seq]
            input_tensor = dy.reshape(dy.concatenate(tensors),
                                      (seq_len, 1, self.input_dim),
                                      batch_size=batch_size)
        else:
            mask_out = expr_seq.mask
            seq_len = len(expr_seq)
            batch_size = expr_seq.dim()[1]
            input_tensor = dy.reshape(dy.transpose(expr_seq.as_tensor()),
                                      (seq_len, 1, self.input_dim),
                                      batch_size=batch_size)

        if self.dropout > 0.0 and self.train:
            input_tensor = dy.dropout(input_tensor, self.dropout)

        proj_inp = dy.conv2d_bias(input_tensor,
                                  dy.parameter(self.p_f),
                                  dy.parameter(self.p_b),
                                  stride=(self.stride, 1),
                                  is_valid=False)
        reduced_seq_len = proj_inp.dim()[0][0]
        proj_inp = dy.transpose(
            dy.reshape(proj_inp, (reduced_seq_len, self.hidden_dim * 3),
                       batch_size=batch_size))
        # proj_inp dims: (hidden, 1, seq_len), batch_size
        if self.stride > 1 and mask_out is not None:
            mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

        h = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
        c = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
        for t in range(reduced_seq_len):
            f_t = dy.logistic(
                dy.strided_select(proj_inp, [], [0, t],
                                  [self.hidden_dim, t + 1]))
            o_t = dy.logistic(
                dy.strided_select(proj_inp, [], [self.hidden_dim, t],
                                  [self.hidden_dim * 2, t + 1]))
            z_t = dy.tanh(
                dy.strided_select(proj_inp, [], [self.hidden_dim * 2, t],
                                  [self.hidden_dim * 3, t + 1]))

            if self.dropout > 0.0 and self.train:
                retention_rate = 1.0 - self.dropout
                dropout_mask = dy.random_bernoulli((self.hidden_dim, 1),
                                                   retention_rate,
                                                   batch_size=batch_size)
                f_t = 1.0 - dy.cmult(
                    dropout_mask, 1.0 - f_t
                )  # TODO: would be easy to make a zoneout dynet operation to save memory

            i_t = 1.0 - f_t

            if t == 0:
                c_t = dy.cmult(i_t, z_t)
            else:
                c_t = dy.cmult(f_t, c[-1]) + dy.cmult(i_t, z_t)
            h_t = dy.cmult(
                o_t, c_t)  # note: LSTM would use dy.tanh(c_t) instead of c_t
            if mask_out is None or np.isclose(
                    np.sum(mask_out.np_arr[:, t:t + 1]), 0.0):
                c.append(c_t)
                h.append(h_t)
            else:
                c.append(
                    mask_out.cmult_by_timestep_expr(c_t, t, True) +
                    mask_out.cmult_by_timestep_expr(c[-1], t, False))
                h.append(
                    mask_out.cmult_by_timestep_expr(h_t, t, True) +
                    mask_out.cmult_by_timestep_expr(h[-1], t, False))

        self._final_states = [transducers.FinalTransducerState(
            dy.reshape(h[-1], (self.hidden_dim,), batch_size=batch_size),
            dy.reshape(c[-1], (self.hidden_dim,), batch_size=batch_size))]
        return expression_seqs.ExpressionSequence(expr_list=h[1:],
                                                  mask=mask_out)
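The dropout branch in the loop above implements zoneout on the forget gate: where the Bernoulli mask is 0, f_t is forced to 1, so the cell simply carries its previous state forward. A minimal sketch isolating that trick (dimensions and gate values are hypothetical):

    import dynet as dy

    # Zoneout in isolation: masked-out entries (mask = 0) force f_t to 1,
    # carrying the old cell state through; entries with mask = 1 keep the
    # computed gate value.
    dy.renew_cg()
    zoneout_rate = 0.25
    f_t = dy.logistic(dy.inputVector([0.2, -1.0, 0.7, 0.0]))
    mask = dy.random_bernoulli((4,), 1.0 - zoneout_rate)
    f_t = 1.0 - dy.cmult(mask, 1.0 - f_t)
    print(f_t.npvalue())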
Example 5
    def transduce(self, es):
        ret = []
        # In DyNet >= 2.1, Parameters are used directly as expressions; the
        # explicit dy.parameter() wrapping of older versions is unnecessary.
        wix, wih, bi = self.wix, self.wih, self.bi
        wfx, wfh, bf = self.wfx, self.wfh, self.bf
        wcx, wch, bc = self.wcx, self.wch, self.bc
        wox, woh, bo = self.wox, self.woh, self.bo
        wrx, wrh, whx, br = self.wrx, self.wrh, self.whx, self.br
        prev_c, prev_h = self.initc, self.inith

        if self.dropout_x > 0.:
            retention_x = 1. - self.dropout_x
            scale_x = 1. / retention_x
            # One independent input mask per gate, reused across all timesteps.
            mask_x_i, mask_x_f, mask_x_c, mask_x_o, mask_x_r = [
                random_bernoulli(self._input_dim, p=retention_x, scale=scale_x)
                for _ in range(5)
            ]
        if self.dropout_h > 0.:
            retention_h = 1. - self.dropout_h
            scale_h = 1. / retention_h
            # Likewise, one independent recurrent mask per gate.
            mask_h_i, mask_h_f, mask_h_c, mask_h_o, mask_h_r = [
                random_bernoulli(self._hidden_dim, p=retention_h, scale=scale_h)
                for _ in range(5)
            ]

        for x in es:
            ait = affine_transform([
                bi, wix,
                cmult(mask_x_i, x) if self.dropout_x > 0. else x, wih,
                cmult(mask_h_i, prev_h) if self.dropout_h > 0. else prev_h
            ])
            it = logistic(ait)

            aft = affine_transform([
                bf, wfx,
                cmult(mask_x_f, x) if self.dropout_x > 0. else x, wfh,
                cmult(mask_h_f, prev_h) if self.dropout_h > 0. else prev_h
            ])
            ft = logistic(aft)

            atct = affine_transform([
                bc, wcx,
                cmult(mask_x_c, x) if self.dropout_x > 0. else x, wch,
                cmult(mask_h_c, prev_h) if self.dropout_h > 0. else prev_h
            ])
            tct = tanh(atct)
            # (Example 6 below uses the coupled-gate alternative:
            #  ct = prev_c + cmult(tct - prev_c, it))
            ct = cmult(ft, prev_c) + cmult(it, tct)

            aot = affine_transform([
                bo, wox,
                cmult(mask_x_o, x) if self.dropout_x > 0. else x, woh,
                cmult(mask_h_o, prev_h) if self.dropout_h > 0. else prev_h
            ])
            ot = logistic(aot)

            h = cmult(tanh(ct), ot)

            art = affine_transform([
                br, wrx,
                cmult(mask_x_r, x) if self.dropout_x > 0. else x, wrh,
                cmult(mask_h_r, prev_h) if self.dropout_h > 0. else prev_h
            ])
            rt = logistic(art)
            highway_h = cmult(rt, h) + cmult(1. - rt, whx * x)

            ret.append(highway_h)
            prev_c = ct
            prev_h = highway_h

        return ret
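The final gate rt above is a highway connection: it interpolates, per dimension, between the LSTM output h and the linear projection whx * x of the raw input. A minimal sketch of that connection alone (shapes and values are hypothetical):

    import dynet as dy

    # Highway mixing in isolation: rt chooses, element-wise, between the
    # transformed output and the projected input.
    dy.renew_cg()
    m = dy.ParameterCollection()
    whx = m.add_parameters((4, 4))  # stands in for self.whx above
    x = dy.inputVector([1.0, 2.0, 3.0, 4.0])
    h = dy.tanh(whx * x)            # stand-in for the LSTM output
    rt = dy.logistic(dy.inputVector([0.0, 1.0, -1.0, 2.0]))
    highway_h = dy.cmult(rt, h) + dy.cmult(1.0 - rt, whx * x)
    print(highway_h.npvalue())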
Example 6
    def transduce(self, es):
        ret = []
        # As in Example 5, Parameters are used directly (DyNet >= 2.1).
        wix, wih, bi = self.wix, self.wih, self.bi
        wcx, wch, bc = self.wcx, self.wch, self.bc
        wox, woh, bo = self.wox, self.woh, self.bo
        prev_c, prev_h = self.initc, self.inith

        if self.dropout_x > 0.:
            retention_x = 1. - self.dropout_x
            scale_x = 1. / retention_x
            # One independent input mask per gate, reused across all timesteps.
            mask_x_i, mask_x_c, mask_x_o = [
                random_bernoulli(self._input_dim, p=retention_x, scale=scale_x)
                for _ in range(3)
            ]
        if self.dropout_h > 0.:
            retention_h = 1. - self.dropout_h
            scale_h = 1. / retention_h
            # Likewise, one independent recurrent mask per gate.
            mask_h_i, mask_h_c, mask_h_o = [
                random_bernoulli(self._hidden_dim, p=retention_h, scale=scale_h)
                for _ in range(3)
            ]

        for x in es:
            ait = affine_transform([
                bi, wix,
                cmult(mask_x_i, x) if self.dropout_x > 0. else x, wih,
                cmult(mask_h_i, prev_h) if self.dropout_h > 0. else prev_h
            ])
            it = logistic(ait)
            # forget gate is coupled to the input gate, ft = 1 - it,
            # and is folded into the cell update below

            atct = affine_transform([
                bc, wcx,
                cmult(mask_x_c, x) if self.dropout_x > 0. else x, wch,
                cmult(mask_h_c, prev_h) if self.dropout_h > 0. else prev_h
            ])
            tct = tanh(atct)
            ct = prev_c + cmult(tct - prev_c, it)  # == cmult(1 - it, prev_c) + cmult(it, tct)

            aot = affine_transform([
                bo, wox,
                cmult(mask_x_o, x) if self.dropout_x > 0. else x, woh,
                cmult(mask_h_o, prev_h) if self.dropout_h > 0. else prev_h
            ])
            ot = logistic(aot)
            h = cmult(tanh(ct), ot)

            ret.append(h)
            prev_c = ct
            prev_h = h
        return ret
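Here the coupled forget gate ft = 1 - it is folded into an interpolation: ct = prev_c + it * (tct - prev_c) is algebraically the same as Example 5's ct = cmult(ft, prev_c) + cmult(it, tct) with ft tied to 1 - it. A quick numeric check of that identity (test values are arbitrary):

    import dynet as dy

    # Verify the coupled-gate identity numerically.
    dy.renew_cg()
    it = dy.inputVector([0.2, 0.9])
    prev_c = dy.inputVector([1.0, -1.0])
    tct = dy.inputVector([0.5, 0.5])
    a = prev_c + dy.cmult(tct - prev_c, it)
    b = dy.cmult(1.0 - it, prev_c) + dy.cmult(it, tct)
    print(a.npvalue(), b.npvalue())  # identical up to float rounding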