Example #1
    def __call__(self, expr_seq):
        """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """
        if isinstance(expr_seq, ExpressionSequence):
            expr_seq = [expr_seq]
        batch_size = expr_seq[0][0].dim()[1]
        seq_len = len(expr_seq[0])

        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        cur_input = expr_seq
        self._final_states = []
        for layer_i in range(self.num_layers):
            h = [dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size)]
            c = [dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size)]
            for pos_i in range(seq_len):
                x_t = [cur_input[j][pos_i] for j in range(len(cur_input))]
                if isinstance(x_t, dy.Expression):
                    x_t = [x_t]
                elif type(x_t) != list:
                    x_t = list(x_t)
                if self.dropout_rate > 0.0 and self.train:
                    # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                    gates_t = dy.vanilla_lstm_gates_dropout_concat(
                        x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i],
                        self.b[layer_i], self.dropout_mask_x[layer_i],
                        self.dropout_mask_h[layer_i],
                        self.weightnoise_std if self.train else 0.0)
                else:
                    gates_t = dy.vanilla_lstm_gates_concat(
                        x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i],
                        self.b[layer_i],
                        self.weightnoise_std if self.train else 0.0)
                c_t = dy.vanilla_lstm_c(c[-1], gates_t)
                h_t = dy.vanilla_lstm_h(c_t, gates_t)
                if expr_seq[0].mask is None or np.isclose(
                        np.sum(expr_seq[0].mask.np_arr[:, pos_i:pos_i + 1]),
                        0.0):
                    c.append(c_t)
                    h.append(h_t)
                else:
                    c.append(expr_seq[0].mask.cmult_by_timestep_expr(
                        c_t, pos_i, True) +
                             expr_seq[0].mask.cmult_by_timestep_expr(
                                 c[-1], pos_i, False))
                    h.append(expr_seq[0].mask.cmult_by_timestep_expr(
                        h_t, pos_i, True) +
                             expr_seq[0].mask.cmult_by_timestep_expr(
                                 h[-1], pos_i, False))
            self._final_states.append(FinalTransducerState(h[-1], c[-1]))
            cur_input = [h[1:]]

        return ExpressionSequence(expr_list=h[1:], mask=expr_seq[0].mask)
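The masked update at the end of the loop is the copy-through described in the docstring. Assuming the mask is 1 at padded positions, each timestep computes, per batch element,

$$c_t = (1 - m_t) \odot \tilde{c}_t + m_t \odot c_{t-1}, \qquad h_t = (1 - m_t) \odot \tilde{h}_t + m_t \odot h_{t-1},$$

so masked timesteps simply carry the previous state forward.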
Example #2
    def translate(self, x, beam_size=1):
        """Translate a source sentence
        
        Translate a single source sentence by decoding using beam search

        Arguments:
            x (list): Source sentence (list of indices)
        
        Keyword Arguments:
            beam_size (int): Size of the beam for beam search. A value of 1 means greedy decoding (default: 1)
        
        Returns:
            list: generated translation (list of indices)
        """
        dy.renew_cg()
        input_len = len(x)
        encodings = self.encode([x], test=True)
        # Decode
        # Add parameters to the graph
        Wp, bp = self.Wp_p.expr(), self.bp_p.expr()
        Wo, bo = self.Wo_p.expr(), self.bo_p.expr()
        D, b = dy.transpose(dy.parameter(self.MT_p)), self.b_p.expr()
        # Initialize decoder with last encoding
        last_enc = dy.select_cols(encodings, [encodings.dim()[0][-1] - 1])
        init_state = dy.affine_transform([bp, Wp, last_enc])
        ds = self.dec.initial_state([init_state, dy.zeroes((self.dh, ))])
        # Initialize context
        context = dy.zeroes((self.enc_dim, ))
        # Initialize beam
        beam = [(ds, context, [self.trg_sos], 0.0)]
        # Loop
        for i in range(int(min(self.max_len, input_len * 1.5))):
            new_beam = []
            for ds, pc, pw, logprob in beam:
                embs = dy.lookup(self.MT_p, pw[-1])
                # Run LSTM
                ds = ds.add_input(dy.concatenate([embs, pc]))
                h = ds.output()
                # Compute next context
                context, _ = self.attend(encodings, h)
                # Compute output with residual connections
                output = dy.affine_transform(
                    [bo, Wo, dy.concatenate([h, context, embs])])
                # Score
                s = dy.affine_transform([b, D, output])
                # Probabilities
                p = dy.softmax(s).npvalue().flatten()
                # Careful of float error
                p = p / p.sum()
                kbest = np.argsort(p)
                for nw in kbest[-beam_size:]:
                    new_beam.append(
                        (ds, context, pw + [nw], logprob + np.log(p[nw])))

            beam = sorted(new_beam, key=lambda x: x[-1])[-beam_size:]

            if beam[-1][2][-1] == self.trg_eos:
                break

        return beam[-1][2]
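A hypothetical usage sketch (the model object and the vocabulary mappings below are assumptions for illustration, not part of the example):

    src = [vocab_src[w] for w in "hello world </s>".split()]  # hypothetical source-side vocab
    hyp_greedy = model.translate(src)             # beam_size=1 is greedy decoding
    hyp_beam = model.translate(src, beam_size=5)  # wider beam: slower, usually better
    words = [vocab_trg_inv[i] for i in hyp_beam]  # hypothetical index-to-token mapping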
Example #3
    def transduce(self, embed_sent):
        src = embed_sent.as_tensor()

        sent_len = src.dim()[0][1]
        src_width = 1
        batch_size = src.dim()[1]
        pad_size = (self.window_receptor -
                    1) // 2  # TODO: adapt for even window sizes

        src = dy.concatenate([
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size), src,
            dy.zeroes((self.input_dim, pad_size), batch_size=batch_size)
        ],
                             d=1)
        padded_sent_len = sent_len + 2 * pad_size

        conv1 = dy.parameter(self.pConv1)
        bias1 = dy.parameter(self.pBias1)
        src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1),
                             batch_size=batch_size)
        cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])

        hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1),
                                  batch_size=batch_size)
        if self.non_linearity == 'linear':
            hidden_layer = hidden_layer
        elif self.non_linearity == 'tanh':
            hidden_layer = dy.tanh(hidden_layer)
        elif self.non_linearity == 'relu':
            hidden_layer = dy.rectify(hidden_layer)
        elif self.non_linearity == 'sigmoid':
            hidden_layer = dy.logistic(hidden_layer)

        for conv_hid, bias_hid in self.builder_layers:
            hidden_layer = dy.conv2d_bias(hidden_layer,
                                          dy.parameter(conv_hid),
                                          dy.parameter(bias_hid),
                                          stride=[1, 1])
            hidden_layer = dy.reshape(hidden_layer,
                                      (self.internal_dim, sent_len, 1),
                                      batch_size=batch_size)
            if self.non_linearity == 'linear':
                hidden_layer = hidden_layer
            elif self.non_linearity == 'tanh':
                hidden_layer = dy.tanh(hidden_layer)
            elif self.non_linearity == 'relu':
                hidden_layer = dy.rectify(hidden_layer)
            elif self.non_linearity == 'sigmoid':
                hidden_layer = dy.logistic(hidden_layer)
        last_conv = dy.parameter(self.last_conv)
        last_bias = dy.parameter(self.last_bias)
        output = dy.conv2d_bias(hidden_layer,
                                last_conv,
                                last_bias,
                                stride=[1, 1])
        output = dy.reshape(output, (sent_len, self.output_dim),
                            batch_size=batch_size)
        output_seq = ExpressionSequence(expr_tensor=output)
        self._final_states = [FinalTransducerState(output_seq[-1])]
        return output_seq
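The pad size above follows from the 'valid' convolution length formula: with window width $w$ (= window_receptor) and stride 1, the output length is $L_{out} = L_{in} + 2p - w + 1$, so $p = (w - 1)/2$ preserves the sequence length for odd $w$ (hence the TODO about even window sizes).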
Example #4
    def __call__(self, x, y):
        x = concatenate([x, zeroes((1, x.dim()[0][1],)) + 1.])
        y = concatenate([y, zeroes((1, y.dim()[0][1],)) + 1.])

        if self.spec[1] == 1:
            return self.U[0](x, y)
        else:
            return concatenate([u(x, y) for u in self.U], 2)
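Appending a constant row of ones to x and y is the usual bias trick for biaffine scoring: assuming each u in self.U computes a bilinear form with matrix $U_k$, the augmented product expands to

$$\begin{bmatrix} x \\ 1 \end{bmatrix}^{\top} U_k \begin{bmatrix} y \\ 1 \end{bmatrix} = x^{\top} A_k\, y + b_k^{\top} y + c_k^{\top} x + d_k,$$

i.e. bilinear, linear, and constant terms without separate bias parameters.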
Example #5
 def __init__(self, network, prev=None, c=None, h=None):
   self._network = network
   if c is None:
     c = [dy.zeroes(dim=(network.hidden_dim,)) for _ in range(network.num_layers)]
   if h is None:
     h = [dy.zeroes(dim=(network.hidden_dim,)) for _ in range(network.num_layers)]
   self._c = tuple(c)
   self._h = tuple(h)
   self._prev = prev
Example #6
 def __call__(self, x, y):
     x = concatenate([x, zeroes((
         1,
         x.dim()[0][1],
     )) + 1.])
     y = concatenate([y, zeroes((
         1,
         y.dim()[0][1],
     )) + 1.])
     return concatenate([u(x, y) for u in self.U], 2)
Example #7
 def __init__(self,
              network: 'UniLSTMSeqTransducer',
              prev: Optional['UniLSTMState'] = None,
              c: Sequence[dy.Expression] = None,
              h: Sequence[dy.Expression] = None) -> None:
   self._network = network
   if c is None:
     c = [dy.zeroes(dim=(network.hidden_dim,)) for _ in range(network.num_layers)]
   if h is None:
     h = [dy.zeroes(dim=(network.hidden_dim,)) for _ in range(network.num_layers)]
   self._c = tuple(c)
   self._h = tuple(h)
   self._prev = prev
Example #8
 def decode_loss(self, encodings, trg, test=False):
     """Compute the negative conditional log likelihood of the target sentence given the encoding of the source sentence
     
     Arguments:
         encodings (dynet.Expression): Source sentence encodings obtained with self.encode
         trg (list): List of target sentences
     
     Keyword Arguments:
          test (bool): Switch used for things like dropout where the behaviour is different at test time (default: False)
     
     Returns:
         dynet.Expression: Expression of the loss averaged on the minibatch
     """
     y, masksy = self.prepare_batch(trg, self.trg_eos)
     slen, bsize = y.shape
     # Add parameters to the graph
     Wp, bp = self.Wp_p.expr(), self.bp_p.expr()
     Wo, bo = self.Wo_p.expr(), self.bo_p.expr()
     D, b = dy.transpose(dy.parameter(self.MT_p)), self.b_p.expr()
     # Initialize decoder with last encoding
     last_enc = dy.select_cols(encodings, [encodings.dim()[0][-1] - 1])
     init_state = dy.affine_transform([bp, Wp, last_enc])
     ds = self.dec.initial_state(
         [init_state, dy.zeroes((self.dh, ), batch_size=bsize)])
     # Initialize context
     context = dy.zeroes((self.enc_dim, ), batch_size=bsize)
     # Start decoding
     errs = []
     for cw, nw, mask in zip(y, y[1:], masksy[1:]):
         embs = dy.lookup_batch(self.MT_p, cw)
         # Run LSTM
         ds = ds.add_input(dy.concatenate([embs, context]))
         h = ds.output()
         # Compute next context
         context, _ = self.attend(encodings, h)
         # Compute output with residual connections
         output = dy.affine_transform(
             [bo, Wo, dy.concatenate([h, context, embs])])
         if not test:
             output = dy.dropout(output, self.dr)
         # Score
         s = dy.affine_transform([b, D, output])
         masksy_e = dy.inputTensor(mask, batched=True)
         # Loss
         err = dy.cmult(dy.pickneglogsoftmax_batch(s, nw), masksy_e)
         errs.append(err)
     # Add all losses together
     err = dy.sum_batches(dy.esum(errs)) / float(bsize)
     return err
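The returned expression is the masked negative log-likelihood averaged over the minibatch:

$$\mathcal{L} = \frac{1}{B} \sum_{b=1}^{B} \sum_{t} m_{t,b} \left( -\log p(y_{t+1,b} \mid y_{\le t,b}, x_b) \right),$$

where $m_{t,b}$ zeroes out losses at padded positions.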
Example #9
  def transduce(self, es):
    es_expr = es.as_tensor()

    # e.g. es_expr.dim() ==((276, 240), 1)
    sent_len = es_expr.dim()[0][0]
    batch_size = es_expr.dim()[1]
    
    # convolutions won't work if sent length is too short; pad if necessary
    pad_size = 0
    while math.ceil(float(sent_len + pad_size - self.filter_size_time + 1) / float(self.stride[0])) < self.filter_size_time:
      pad_size += 1
    if pad_size > 0:
      es_expr = dy.concatenate([es_expr, dy.zeroes((pad_size, self.freq_dim * self.chn_dim), batch_size=batch_size)])
      sent_len += pad_size

    # convolution layers    
    es_chn = dy.reshape(es_expr, (sent_len, self.freq_dim, self.chn_dim), batch_size=batch_size) # ((276, 80, 3), 1)
    cnn_layer1 = dy.conv2d(es_chn, dy.parameter(self.filters1), stride=self.stride, is_valid=True) # ((137, 39, 32), 1)
    cnn_layer2 = dy.conv2d(cnn_layer1, dy.parameter(self.filters2), stride=self.stride, is_valid=True) # ((68, 19, 32), 1)
    cnn_out = dy.reshape(cnn_layer2, (cnn_layer2.dim()[0][0], cnn_layer2.dim()[0][1]*cnn_layer2.dim()[0][2]), batch_size=batch_size) # ((68, 608), 1)
    es_list = [cnn_out[i] for i in range(cnn_out.dim()[0][0])]
    
    # RNN layers
    for (fb, bb) in self.builder_layers:
      fs = fb.initial_state().transduce(es_list)
      bs = bb.initial_state().transduce(reversed(es_list))
      es_list = [dy.concatenate([f, b]) for f, b in zip(fs, reversed(bs))]
    return es_list
Example #10
def softmax(x):
    """
    Compute the softmax function in tensorflow.

    You might find the tensorflow functions tf.exp, tf.reduce_max,
    tf.reduce_sum, tf.expand_dims useful. (Many solutions are possible, so you may
    not need to use all of these functions). Recall also that many common
    tensorflow operations are sugared (e.g. x * y does a tensor multiplication
    if x and y are both tensors). Make sure to implement the numerical stability
    fixes as in the previous homework!

    Args:
        x:   tf.Tensor with shape (n_samples, n_features). Note feature vectors are
                  represented by row-vectors. (For simplicity, no need to handle 1-d
                  input as in the previous homework)
    Returns:
        out: tf.Tensor with shape (n_sample, n_features). You need to construct this
                  tensor in this problem.
    """

    ### YOUR CODE HERE
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    sum_exp = dy.colwise_add(dy.zeroes(x.dim()[0]), dy.sum_cols(x_exp))

    out = dy.cdiv(x_exp, sum_exp)
    ### END YOUR CODE

    return out
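A minimal sanity check for the implementation above (a sketch; it assumes DyNet rather than TensorFlow, matching the dy.* calls used in the solution):

    import dynet as dy
    import numpy as np

    dy.renew_cg()
    x = dy.inputTensor(np.random.randn(4, 3))  # (n_samples, n_features)
    p = softmax(x).npvalue()
    assert np.allclose(p.sum(axis=1), 1.0)     # every row is a distribution
    assert (p > 0).all()                       # max-subtraction keeps exp() stable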
Example #11
 def evaluate(self, inputs, train=False):
     """
     Apply all MLP layers to concatenated input
     :param inputs: (key, vector) per feature type
     :param train: are we training now?
     :return: output vector of size self.output_dim
     """
     input_keys, inputs = list(map(list, zip(*list(inputs))))
     if self.input_keys:
         assert input_keys == self.input_keys, "Got:     %s\nBut expected input keys: %s" % (
             self.input_keys_str(
                 self.input_keys), self.input_keys_str(input_keys))
     else:
         self.input_keys = input_keys
     if self.gated:
         gates = self.params.get("gates")
         if gates is None:  # FIXME attention weights should not be just parameters, but based on biaffine product?
             gates = self.params["gates"] = self.model.add_parameters(
                 (len(inputs), self.gated), init=dy.UniformInitializer(1))
         input_dims = [i.dim()[0][0] for i in inputs]
         max_dim = max(input_dims)
         x = dy.concatenate_cols([
             dy.concatenate([i, dy.zeroes(max_dim - d)
                             ])  # Pad with zeros to get uniform dim
             if d < max_dim else i for i, d in zip(inputs, input_dims)
         ]) * gates
         # Possibly multiple "attention heads" -- concatenate outputs to one vector
         inputs = [dy.reshape(x, (x.dim()[0][0] * x.dim()[0][1], ))]
     x = dy.concatenate(inputs)
      assert len(x.dim()[0]) == 1, "Input should be a vector, but has dimension " + str(x.dim()[0])
     dim = x.dim()[0][0]
     if self.input_dim:
         assert dim == self.input_dim, "Input dim mismatch: %d != %d" % (
             dim, self.input_dim)
     else:
         self.init_params(dim)
     self.config.print(self, level=4)
     if self.total_layers:
         if self.weights is None:
             self.weights = [[
                 self.params[prefix + str(i)] for prefix in ("W", "b")
             ] for i in range(self.total_layers)]
              if self.weights[0][0].dim()[0][1] < dim:  # number of columns in W0
                 self.weights[0][0] = dy.concatenate_cols(
                     [self.weights[0][0], self.params["W0+"]])
         for i, (W, b) in enumerate(self.weights):
             self.config.print(lambda: x.npvalue().tolist(), level=4)
             try:
                 if train and self.dropout:
                     x = dy.dropout(x, self.dropout)
                 x = self.activation()(W * x + b)
             except ValueError as e:
                 raise ValueError("Error in evaluating layer %d of %d" %
                                  (i + 1, self.total_layers)) from e
     self.config.print(lambda: x.npvalue().tolist(), level=4)
     return x
Example #12
def padding(src, min_size):
  """ do padding for the sequence input along the time step (for example speech), so that so that the output of convolutional layer has the same size(time) of the input.

      note that for padding image(two dimensional padding), please refer to dyne.conv2d(..., is_valid = False)
  """
  # pad before put into convolutional layer
  src_dim = src.dim()
  if src_dim[0][1] >= min_size:
    return src
  pad_size = min_size - src_dim[0][1]
  channels = src_dim[0][2] if len(src_dim[0]) >= 3 else 1
  if pad_size == 1:
    return dy.concatenate([src, dy.zeroes((src_dim[0][0], 1, channels))], d=1)
  else:
    left_border = int(pad_size) // 2
    right_border = (int(pad_size) + 1) // 2
    return dy.concatenate([dy.zeroes((src_dim[0][0], left_border, channels)), src, dy.zeroes((src_dim[0][0], right_border, channels))], d=1) # do concatenate along cols
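A hypothetical call with a three-dimensional speech-style input (shapes are made up for illustration):

    import dynet as dy
    import numpy as np

    dy.renew_cg()
    feats = dy.inputTensor(np.random.randn(40, 7, 3))  # (freq, time, channels)
    padded = padding(feats, min_size=11)  # pad_size = 4: two zero-frames on each side
    print(padded.dim())                   # ((40, 11, 3), 1)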
Example #13
 def pad(self, expr, pad_size):
     assert pad_size >= 0
     if pad_size == 0:
         return expr
     return dy.concatenate([
         expr,
         dy.zeroes((pad_size, self.freq_dim * self.chn_dim),
                   batch_size=expr.dim()[1])
     ])  # TODO: replicate last frame instead of padding zeros
Example #14
def softmax(x):
    ### YOUR CODE HERE
    x_max = dy.max_dim(x, 1)
    x_sub = dy.colwise_add(x, -x_max)
    x_exp = dy.exp(x_sub)
    x_sum = dy.sum_cols(x_exp)
    x_tmp = dy.zeroes(x.dim()[0])
    x_tmp = dy.colwise_add(x_tmp, x_sum)
    out = dy.cdiv(x_exp, x_tmp)
    ### END YOUR CODE
    return out
Example #15
    def beam_decode(self, encodings, input_len=10, beam_size=1):
        # Add parameters to the graph
        self.dec.init(encodings, [[self.trg_sos]],
                      self.usr.user_vector,
                      test=self.test,
                      update=self.update)
        # Initialize context
        context = dy.zeroes((self.enc.dim, ))
        # Process user token if necessary
        if self.user_token:
            _, _, _ = self.dec.next(self.usr.user_vector,
                                    context,
                                    test=self.test)
        # Get conditional log probability of lengths
        llp = np.log(self.lex.p_L[input_len])
        # Initialize beam
        beams = [beam.Beam(self.dec.ds, context, [self.trg_sos], llp[0])]
        # Loop
        for i in range(int(min(self.max_len, input_len * 1.5))):
            new_beam = []
            for b in beams:
                if b.words[-1] == self.trg_eos:
                    new_beam.append(
                        beam.Beam(b.state, b.context, b.words, b.logprob,
                                  b.align))
                    continue
                h, e, b.state = self.dec.next([b.words[-1]],
                                              b.context,
                                              state=b.state)
                # Compute next context
                b.context, att = self.attend(encodings, h)
                # Score
                s = self.dec.s(h, b.context, e, test=self.test)
                # Probabilities
                p = dy.softmax(s).npvalue()
                # Careful for floating errors
                p = p.flatten() / p.sum()
                # Store alignment for e.g. unk replacement
                align = np.argmax(att.npvalue())
                kbest = np.argsort(p)
                for nw in kbest[-beam_size:]:
                    new_beam.append(
                        beam.Beam(
                            b.state, b.context, b.words + [nw],
                            b.logprob + np.log(p[nw]) + llp[i + 1] - llp[i],
                            b.align + [align]))
            # Only keep the best
            beams = sorted(new_beam, key=lambda b: b.logprob)[-beam_size:]
            if beams[-1].words[-1] == self.trg_eos:
                break

        return beams[-1]
Example #16
def run_IRNN(x):
    """
    Runs the IRNN over the input and returns the last layer before softmax
    """
    bsize, d = x.shape
    Wh, Wx, bh = Wh_p.expr(), Wx_p.expr(), bh_p.expr()  # Load parameters into the computation graph
    A, b = A_p.expr(), b_p.expr()
    x_list = [dy.inputTensor(x_t, batched=True) for x_t in x.T]  # One batched input per time step
    h = dy.zeroes((dh,), batch_size=bsize)  # Initialize hidden state
    for x_t in x_list:  # Iterate over time steps
        a = Wh * h + Wx * x_t + bh  # Affine transform
        h = dy.rectify(a)  # ReLU non-linearity (the IRNN recurrence)
    return A * h + b
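A usage sketch, assuming the module-level parameters Wh_p, Wx_p, bh_p, A_p, b_p and the hidden size dh were created beforehand on a dy.ParameterCollection (the scalar-per-timestep input matches how x.T is consumed above):

    dy.renew_cg()
    x = np.random.rand(16, 784)  # (batch, time), e.g. pixel-by-pixel image rows
    labels = np.random.randint(0, 10, size=16).tolist()
    logits = run_IRNN(x)  # final pre-softmax layer, batched over 16 examples
    loss = dy.mean_batches(dy.pickneglogsoftmax_batch(logits, labels))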
Example #17
 def init(self, x, usr, test=True, update=True):
     bs = len(x[0])
     if not test:
         self.lstm.set_dropout(self.dr)
     else:
         self.lstm.disable_dropout()
     # Add encoder to computation graph
     self.Th = self.Th_p.expr(update)
     init_state = self.Th * usr
     init_state = [init_state, dy.zeroes((self.dh,), batch_size=bs)]
     self.es = self.lstm.initial_state(init_state, update=update)
     if not test:
         self.lstm.set_dropout_masks(bs)
Example #18
def encode_sequence(sequence, rnns, embedder, dropout_amount=0.):
    """ Encodes a sequence given RNN cells and an embedding function.

    Inputs:
        sequence (list of str): The sequence to encode.
        rnns (list of dy._RNNBuilder): The RNNs to use.
        embedder (dict str->dy.Expression): Function that embeds strings to
            word vectors.
        dropout_amount (float, optional): The amount of dropout to apply.

    Returns:
        (list of dy.Expression, list of dy.Expression), list of dy.Expression,
        where the first pair is the (final cell memories, final cell states) of
        all layers, and the second list is a list of the final layer's cell
        state for all tokens in the sequence.
    """
    layer_states = []
    for rnn in rnns:
        hidden_size = rnn.spec[2]
        layer_states.append(rnn.initial_state([dy.zeroes((hidden_size, 1)),
                                               dy.zeroes((hidden_size, 1))]))

    outputs = []

    for token in sequence:
        rnn_input = embedder(token)

        (cell_states, hidden_states), output, layer_states = \
            forward_one_multilayer(rnn_input,
                                   layer_states,
                                   dropout_amount)

        outputs.append(output)

    return (cell_states, hidden_states), outputs
Example #19
 def evaluate(self, inputs, train=False):
     """
     Apply all MLP layers to concatenated input
     :param inputs: (key, vector) per feature type
     :param train: are we training now?
     :return: output vector of size self.output_dim
     """
     input_keys, inputs = list(map(list, zip(*list(inputs))))
     if self.input_keys:
         assert input_keys == self.input_keys, "Got:     %s\nBut expected input keys: %s" % (
             self.input_keys_str(self.input_keys), self.input_keys_str(input_keys))
     else:
         self.input_keys = input_keys
     if self.gated:
         gates = self.params.get("gates")
         if gates is None:  # FIXME attention weights should not be just parameters, but based on biaffine product?
             gates = self.params["gates"] = self.model.add_parameters((len(inputs), self.gated),
                                                                      init=dy.UniformInitializer(1))
         input_dims = [i.dim()[0][0] for i in inputs]
         max_dim = max(input_dims)
         x = dy.concatenate_cols([dy.concatenate([i, dy.zeroes(max_dim - d)])  # Pad with zeros to get uniform dim
                                  if d < max_dim else i for i, d in zip(inputs, input_dims)]) * gates
         # Possibly multiple "attention heads" -- concatenate outputs to one vector
         inputs = [dy.reshape(x, (x.dim()[0][0] * x.dim()[0][1],))]
     x = dy.concatenate(inputs)
     assert len(x.dim()[0]) == 1, "Input should be a vector, but has dimension " + str(x.dim()[0])
     dim = x.dim()[0][0]
     if self.input_dim:
         assert dim == self.input_dim, "Input dim mismatch: %d != %d" % (dim, self.input_dim)
     else:
         self.init_params(dim)
     self.config.print(self, level=4)
     if self.total_layers:
         if self.weights is None:
             self.weights = [[self.params[prefix + str(i)] for prefix in ("W", "b")]
                             for i in range(self.total_layers)]
             if self.weights[0][0].dim()[0][1] < dim:  # number of columns in W0
                 self.weights[0][0] = dy.concatenate_cols([self.weights[0][0], self.params["W0+"]])
         for i, (W, b) in enumerate(self.weights):
             self.config.print(lambda: x.npvalue().tolist(), level=4)
             try:
                 if train and self.dropout:
                     x = dy.dropout(x, self.dropout)
                 x = self.activation()(W * x + b)
             except ValueError as e:
                 raise ValueError("Error in evaluating layer %d of %d" % (i + 1, self.total_layers)) from e
     self.config.print(lambda: x.npvalue().tolist(), level=4)
     return x
Example #20
def zeroes(hidden_dim: numbers.Integral,
           batch_size: numbers.Integral = 1) -> Tensor:
    """
  Create a possibly batched zero vector.

  Args:
    hidden_dim: vector size
    batch_size: batch size

  Returns:
    DyNet expression of size ((hidden_dim,),batch_size) or PyTorch tensor of size (batch_size,hidden_dim)
  """
    if xnmt.backend_dynet:
        return dy.zeroes((hidden_dim, ), batch_size=batch_size)
    else:
        return torch.zeros(size=(
            batch_size,
            hidden_dim,
        ), device=xnmt.device)
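A quick illustration of the two layouts (a sketch; which branch runs depends on the backend selected at import time):

    v = zeroes(hidden_dim=512, batch_size=4)
    # DyNet backend:   v.dim() == ((512,), 4)           -- batch kept as a separate dimension
    # PyTorch backend: v.shape == torch.Size([4, 512])  -- batch-first tensor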
Example #21
def compute_loss(gold_seq,
                 scores,
                 index_to_token_maps,
                 gold_tok_to_id,
                 noise=0.00000001):
    """ Computes the loss of a gold sequence given scores.

    Inputs:
        gold_seq (list of str): A sequence of gold tokens.
        scores (list of dy.Expression): Expressions representing the scores of
            potential output tokens for each token in gold_seq.
        index_to_token_maps (list of dict str->list of int): Maps from index in the
            sequence to a dictionary mapping from a string to a set of integers.
        gold_tok_to_id (lambda (str, str)->list of int): Maps from the gold token
            and some lookup function to the indices in the probability distribution
            where the gold token occurs.
        noise (float, optional): The amount of noise to add to the loss.

    Returns:
        dy.Expression representing the sum of losses over the sequence.
    """
    assert len(gold_seq) == len(scores)
    assert len(index_to_token_maps) == len(scores)

    losses = []
    for i, gold_tok in enumerate(gold_seq):
        score = scores[i]
        token_map = index_to_token_maps[i]

        gold_indices = gold_tok_to_id(gold_tok, token_map)
        assert len(gold_indices) > 0
        if len(gold_indices) == 1:
            losses.append(dy.pickneglogsoftmax(score, gold_indices[0]))
        else:
            prob_of_tok = dy.zeroes(1)
            probdist = dy.softmax(score)
            for index in gold_indices:
                prob_of_tok += probdist[index]
            prob_of_tok += noise
            losses.append(-dy.log(prob_of_tok))

    return dy.esum(losses)
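For illustration, gold_tok_to_id could be as simple as the following hypothetical lookup (an assumption, not the repository's actual definition):

    # Hypothetical: return every index at which the gold token may be produced,
    # as a sorted list so that gold_indices[0] is well-defined.
    def gold_tok_to_id(gold_tok, token_map):
        return sorted(token_map.get(gold_tok, []))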
Example #22
    def decode_loss(self, encodings, trg):
        """Compute the negative conditional log likelihood of the target sentence
        given the encoding of the source sentence

        :param encodings: Source sentence encodings obtained with self.encode
        :param trg: List of target sentences

        :returns: Expression of the loss averaged on the minibatch
        """
        y, masksy = self.prepare_batch(trg, self.trg_eos)
        slen, bsize = y.shape
        # Init decoder
        self.dec.init(encodings,
                      y,
                      self.usr.user_vector,
                      test=self.test,
                      update=self.update)
        # Initialize context
        context = dy.zeroes((self.enc.dim, ), batch_size=bsize)
        # Process user token if necessary
        if self.user_token:
            _, _, _ = self.dec.next(self.usr.user_vector,
                                    context,
                                    test=self.test)
        # Start decoding
        errs = []
        for cw, nw, mask in zip(y, y[1:], masksy[1:]):
            # Run LSTM
            h, e, _ = self.dec.next(cw, context, test=self.test)
            # Compute next context
            context, _ = self.attend(encodings, h)
            # Score
            s = self.dec.s(h, context, e, test=self.test)
            masksy_e = dy.inputTensor(mask, batched=True)
            # Loss
            loss = self.cross_entropy_loss(s, nw, cw)
            loss = dy.cmult(loss, masksy_e)
            errs.append(loss)
        # Add all losses together
        err = dy.mean_batches(dy.esum(errs))
        return err
Example #23
    def init(self, H, y, test=True, update=True):
        bs = len(y[0])
        if not test:
            self.lstm.set_dropout(self.dr)
        else:
            self.lstm.disable_dropout()
        # Initialize first state of the decoder with the last state of the encoder
        self.Wp = self.Wp_p.expr(update)
        self.bp = self.bp_p.expr(update)
        last_enc = dy.pick(H, index=H.dim()[0][-1] - 1, dim=1)
        init_state = dy.affine_transform([self.bp, self.Wp, last_enc])
        init_state = [init_state, dy.zeroes((self.dh, ), batch_size=bs)]
        self.ds = self.lstm.initial_state(init_state, update=update)
        # Initialize dropout masks
        if not test:
            self.lstm.set_dropout_masks(bs)

        self.Wo = self.Wo_p.expr(update)
        self.bo = self.bo_p.expr(update)

        self.E = self.E_p.expr(update)
        self.b = self.b_p.expr(False)
Example #24
    def search(self, translation_model):
        beams = [self.Beam(translation_model.decoder.current_state, [translation_model.tgt_vocab.sos],
                           0.0, 0.0, 0.0, 1)]
        next_candidate_id = 2
        num_pruned = 0
        fan_outs = []
        target_vocabulary_size = len(translation_model.tgt_vocab)
        for i in range(self.max_output_len):
            probabilities = []
            next_states = []
            for beam in beams:
                # if already at end of sentence, no work to be done
                if beam.words[-1] == translation_model.tgt_vocab.eos:
                    probabilities.append(dy.zeroes((target_vocabulary_size,)) + 1)
                    next_states.append(None)
                    continue

                # calculate decoding scores
                scores = translation_model.decode([beam.words[-1]], beam.state)
                # then, keep track of next decoder state
                next_states.append(translation_model.decoder.current_state)
                probabilities.append(dy.log_softmax(scores))

            # run forward pass
            probabilities = dy.concatenate_to_batch(probabilities).npvalue().T.reshape(-1, target_vocabulary_size)

            new_beams = []
            for prob, beam, next_state in zip(probabilities, beams, next_states):
                if beam.words[-1] == translation_model.tgt_vocab.eos:
                    # if we're already at the end of the sentence, keep it as is
                    new_beams.append(beam)
                else:
                    # otherwise, find the k best candidate words
                    k_best = np.argsort(prob)  # best is last
                    for next_word in k_best[-self.beam_size:]:
                        next_word_prob = prob[next_word]
                        new_prob = beam.log_prob + next_word_prob
                        if self.length_norm_alpha:
                            len_norm = (5 + len(beam.words) + 1)**self.length_norm_alpha / (5 + 1) ** self.length_norm_alpha
                        else:
                            len_norm = 1
                        new_beams.append(
                            self.Beam(next_state, beam.words + [next_word], new_prob, next_word_prob,
                                      new_prob / len_norm, next_candidate_id))
                    next_candidate_id += 1

            # Only keep the k best
            beams = sorted(new_beams, key=lambda beam: beam.len_norm_score)[-self.beam_size:]
            # if highest scoring candidate is a complete sentence, exit
            if beams[-1].words[-1] == translation_model.tgt_vocab.eos:
                break

            best_score = beams[-1].len_norm_score
            if self.pruning_strategy.relative:
                beams = [beam for beam in beams if beam.len_norm_score - best_score > self.pruning_strategy.relative]
            if self.pruning_strategy.absolute:
                beams = [beam for beam in beams if
                         logsumexp(a=[best_score, beam.len_norm_score], b=[1, -1]) < self.pruning_strategy.absolute]
            if self.pruning_strategy.local:
                best_word_score = max(beam.last_word_prob for beam in beams)
                beams = [beam for beam in beams if beam.last_word_prob - best_word_score > self.pruning_strategy.local]
            if self.pruning_strategy.candidate:
                pruned_beams = []
                candidate_counts = defaultdict(lambda: 0)
                for beam in reversed(beams):
                    if candidate_counts[beam.candidate_id] < self.pruning_strategy.candidate:
                        pruned_beams.insert(0, beam)
                        candidate_counts[beam.candidate_id] += 1
                beams = pruned_beams
            num_pruned += self.beam_size - len(beams)
            fan_out = 0
            for beam in beams:
                if beam.words[-1] != translation_model.tgt_vocab.eos:
                    fan_out += 1
            fan_outs.append(fan_out)

        total_fan_out = sum(fan_outs)
        avg_fan_out = total_fan_out / len(fan_outs) if len(fan_outs) != 0 else 0
        return beams[-1].words, avg_fan_out, total_fan_out, num_pruned
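The len_norm expression matches the GNMT length penalty (Wu et al., 2016), applied to the candidate including the word being added:

$$lp(Y) = \frac{(5 + |Y|)^{\alpha}}{(5 + 1)^{\alpha}},$$

and beams are ranked by the normalized score $\log P(Y) / lp(Y)$ (len_norm_score).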
Example #25
 def init(self, initial_state, batch_size=1):
     self.current_state = self.lstm.initial_state([
         initial_state,
         dy.zeroes((self.hidden_dim, ), batch_size=batch_size)
     ])
Example #26
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence (will be accessed via tensor_expr)
    Return:
      expression sequence
    """

        if isinstance(expr_seq, list):
            mask_out = expr_seq[0].mask
            seq_len = len(expr_seq[0])
            batch_size = expr_seq[0].dim()[1]
            tensors = [e.as_tensor() for e in expr_seq]
            input_tensor = dy.reshape(dy.concatenate(tensors),
                                      (seq_len, 1, self.input_dim),
                                      batch_size=batch_size)
        else:
            mask_out = expr_seq.mask
            seq_len = len(expr_seq)
            batch_size = expr_seq.dim()[1]
            input_tensor = dy.reshape(dy.transpose(expr_seq.as_tensor()),
                                      (seq_len, 1, self.input_dim),
                                      batch_size=batch_size)

        if self.dropout > 0.0 and self.train:
            input_tensor = dy.dropout(input_tensor, self.dropout)

        proj_inp = dy.conv2d_bias(input_tensor,
                                  dy.parameter(self.p_f),
                                  dy.parameter(self.p_b),
                                  stride=(self.stride, 1),
                                  is_valid=False)
        reduced_seq_len = proj_inp.dim()[0][0]
        proj_inp = dy.transpose(
            dy.reshape(proj_inp, (reduced_seq_len, self.hidden_dim * 3),
                       batch_size=batch_size))
        # proj_inp dims: (hidden, 1, seq_len), batch_size
        if self.stride > 1 and mask_out is not None:
            mask_out = mask_out.lin_subsampled(trg_len=reduced_seq_len)

        h = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
        c = [dy.zeroes(dim=(self.hidden_dim, 1), batch_size=batch_size)]
        for t in range(reduced_seq_len):
            f_t = dy.logistic(
                dy.strided_select(proj_inp, [], [0, t],
                                  [self.hidden_dim, t + 1]))
            o_t = dy.logistic(
                dy.strided_select(proj_inp, [], [self.hidden_dim, t],
                                  [self.hidden_dim * 2, t + 1]))
            z_t = dy.tanh(
                dy.strided_select(proj_inp, [], [self.hidden_dim * 2, t],
                                  [self.hidden_dim * 3, t + 1]))

            if self.dropout > 0.0 and self.train:
                retention_rate = 1.0 - self.dropout
                dropout_mask = dy.random_bernoulli((self.hidden_dim, 1),
                                                   retention_rate,
                                                   batch_size=batch_size)
                f_t = 1.0 - dy.cmult(
                    dropout_mask, 1.0 - f_t
                )  # TODO: would be easy to make a zoneout dynet operation to save memory

            i_t = 1.0 - f_t

            if t == 0:
                c_t = dy.cmult(i_t, z_t)
            else:
                c_t = dy.cmult(f_t, c[-1]) + dy.cmult(i_t, z_t)
            h_t = dy.cmult(
                o_t, c_t)  # note: LSTM would use dy.tanh(c_t) instead of c_t
            if mask_out is None or np.isclose(
                    np.sum(mask_out.np_arr[:, t:t + 1]), 0.0):
                c.append(c_t)
                h.append(h_t)
            else:
                c.append(
                    mask_out.cmult_by_timestep_expr(c_t, t, True) +
                    mask_out.cmult_by_timestep_expr(c[-1], t, False))
                h.append(
                    mask_out.cmult_by_timestep_expr(h_t, t, True) +
                    mask_out.cmult_by_timestep_expr(h[-1], t, False))

        self._final_states = [transducers.FinalTransducerState(dy.reshape(h[-1], (self.hidden_dim,), batch_size=batch_size), \
                                                               dy.reshape(c[-1], (self.hidden_dim,),
                                                                          batch_size=batch_size))]
        return expression_seqs.ExpressionSequence(expr_list=h[1:],
                                                  mask=mask_out)
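The forget-gate dropout above is the zoneout-style trick the inline TODO refers to: with retention rate $1 - p$ and mask $m \sim \mathrm{Bernoulli}(1 - p)$,

$$f_t \leftarrow 1 - m \odot (1 - f_t),$$

so dropped units ($m = 0$) force $f_t = 1$, and since $i_t = 1 - f_t$, the cell copies its previous state at those units.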
Example #27
    def transduce(
        self, xs: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        batch_size = xs[0][0].dim()[1]
        h_bot = []
        h_mid = []
        h_top = []
        z_bot = []
        z_mid = []
        z_top = []

        self.top_layer.h = None
        self.top_layer.c = None
        self.top_layer.z = None
        self.mid_layer.h = None
        self.mid_layer.c = None
        self.mid_layer.z = None
        self.bottom_layer.h = None
        self.bottom_layer.c = None
        self.bottom_layer.z = None

        #?? checkme. want to init z to ones? (cherry paper)
        z_one = dy.ones(1, batch_size=batch_size)
        h_bot.append(
            dy.zeroes(dim=(self.hidden_dim, ),
                      batch_size=batch_size))  #indices for timesteps are +1
        h_mid.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))
        h_top.append(dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size))

        for i, x_t in enumerate(xs):
            h_t_bot, z_t_bot = self.bottom_layer.transduce(
                h_below=x_t, h_above=h_mid[i], z_below=z_one
            )  #uses h_t_top from layer above@previous time step, h_t_bot and z_t_bot from previous time step (saved in hmlstmcell)
            h_t_mid, z_t_mid = self.mid_layer.transduce(
                h_below=h_t_bot, h_above=h_top[i], z_below=z_t_bot
            )  #uses h_t_top from layer above@previous time step, h_t_bot and z_t_bot from previous time step (saved in hmlstmcell)
            h_t_top, z_t_top = self.top_layer.transduce(
                h_below=h_t_mid, h_above=None, z_below=z_t_mid
            )  #uses z_t_bot and h_t_bot from previous layer call, h_t_top and z_t_top from previous time step (saved in hmlstmcell)

            h_bot.append(h_t_bot)
            z_bot.append(z_t_bot)
            h_mid.append(h_t_mid)
            z_mid.append(z_t_mid)
            h_top.append(h_t_top)
            z_top.append(z_t_top)

        # gated output module (sketch, left unimplemented):
        # #sigmoid
        # W_layer = dy.parameters(dim=(len(self.modules), hidden_dim)) #needs to be moved to init? num layers by hidden_dim
        # h_cat   = dy.transpose(dy.concatenate([h_bot, h_mid, h_top]))
        # dotted  = dy.dot_product(e1, e2)
        # gates   = dy.logistic(dotted)
        # #relu
        # om = dy.relu()

        # final state is the last hidden state from the top layer
        self._final_states = [transducers.FinalTransducerState(h_top[-1])]
        fin_xs = expression_seqs.ExpressionSequence(expr_list=h_top[1:])
        return fin_xs  # drop the initial zeros so the output has the same length as the input sequence
Example #28
    def __call__(self,
                 final_encoder_state,
                 encoder_states,
                 max_generation_length,
                 snippets=None,
                 gold_sequence=None,
                 input_sequence=None,
                 dropout_amount=0.):
        """ Generates a sequence. """
        index = 0

        context_vector_size = self.token_predictor.attention_module.value_size

        # Decoder states: just the initialized decoder.
        # Current input to decoder: phi(start_token) ; zeros the size of the
        # context vector
        predictions = []
        sequence = []
        probability = 1.

        decoder_states = self._initialize_decoder_lstm(final_encoder_state)
        decoder_input = dy.concatenate(
            [self.start_token_embedding,
             dy.zeroes((context_vector_size, ))])

        continue_generating = True

        while continue_generating:
            if len(sequence) == 0 or sequence[-1] != EOS_TOK:
                _, decoder_state, decoder_states = du.forward_one_multilayer(
                    decoder_input, decoder_states, dropout_amount)
                prediction_input = PredictionInput(
                    decoder_state=decoder_state,
                    input_hidden_states=encoder_states,
                    snippets=snippets,
                    input_sequence=input_sequence)
                prediction = self.token_predictor(
                    prediction_input, dropout_amount=dropout_amount)

                predictions.append(prediction)

                if gold_sequence:
                    decoder_input = dy.concatenate([
                        self.output_embedder.bow_snippets(
                            gold_sequence[index], snippets),
                        prediction.attention_results.vector
                    ])
                    sequence.append(gold_sequence[index])

                    if index >= len(gold_sequence) - 1:
                        continue_generating = False
                else:
                    probabilities = np.transpose(
                        dy.softmax(prediction.scores).npvalue()).tolist()[0]
                    distribution_map = prediction.aligned_tokens

                    # Get a new probabilities and distribution_map consolidating
                    # duplicates
                    distribution_map, probabilities = flatten_distribution(
                        distribution_map, probabilities)

                    # Modify the probability distribution so that the UNK token can
                    # never be produced
                    probabilities[distribution_map.index(UNK_TOK)] = 0.
                    argmax_index = int(np.argmax(probabilities))

                    argmax_token = distribution_map[argmax_index]
                    sequence.append(argmax_token)

                    decoder_input = dy.concatenate([
                        self.output_embedder.bow_snippets(
                            argmax_token, snippets),
                        prediction.attention_results.vector
                    ])
                    probability *= probabilities[argmax_index]

                    continue_generating = False
                    if index < max_generation_length and argmax_token != EOS_TOK:
                        continue_generating = True

            index += 1

        return SQLPrediction(predictions, sequence, probability)
Example #29
    def transduce(
        self, expr_seq: 'expression_seqs.ExpressionSequence'
    ) -> 'expression_seqs.ExpressionSequence':
        """
    transduce the sequence, applying masks if given (masked timesteps simply copy previous h / c)

    Args:
      expr_seq: expression sequence or list of expression sequences (where each inner list will be concatenated)
    Returns:
      expression sequence
    """
        if isinstance(expr_seq, expression_seqs.ExpressionSequence):
            expr_seq = [expr_seq]
        batch_size = expr_seq[0].batch_size()
        seq_len = expr_seq[0].sent_len()

        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        cur_input = expr_seq
        self._final_states = []
        for layer_i in range(self.num_layers):
            h = [dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size)]
            c = [dy.zeroes(dim=(self.hidden_dim, ), batch_size=batch_size)]
            for pos_i in range(seq_len):
                x_t = [cur_input[j][pos_i] for j in range(len(cur_input))]
                if isinstance(x_t, dy.Expression):
                    x_t = [x_t]
                elif type(x_t) != list:
                    x_t = list(x_t)
                if (layer_i == 0 and sum([x_t_i.dim()[0][0] for x_t_i in x_t]) != self.total_input_dim) \
                        or (layer_i>0 and sum([x_t_i.dim()[0][0] for x_t_i in x_t]) != self.hidden_dim):
                    found_dim = sum([x_t_i.dim()[0][0] for x_t_i in x_t])
                    raise ValueError(
                        f"VanillaLSTMGates: x_t has inconsistent dimension {found_dim}, "
                        f"expecting {self.total_input_dim if layer_i==0 else self.hidden_dim}"
                    )
                if self.dropout_rate > 0.0 and self.train:
                    # apply dropout according to https://arxiv.org/abs/1512.05287 (tied weights)
                    gates_t = dy.vanilla_lstm_gates_dropout_concat(
                        x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i],
                        self.b[layer_i], self.dropout_mask_x[layer_i],
                        self.dropout_mask_h[layer_i],
                        self.weightnoise_std if self.train else 0.0)
                else:
                    gates_t = dy.vanilla_lstm_gates_concat(
                        x_t, h[-1], self.Wx[layer_i], self.Wh[layer_i],
                        self.b[layer_i],
                        self.weightnoise_std if self.train else 0.0)
                c_t = dy.vanilla_lstm_c(c[-1], gates_t)
                h_t = dy.vanilla_lstm_h(c_t, gates_t)
                if expr_seq[0].mask is None or np.isclose(
                        np.sum(expr_seq[0].mask.np_arr[:, pos_i:pos_i + 1]),
                        0.0):
                    c.append(c_t)
                    h.append(h_t)
                else:
                    c.append(expr_seq[0].mask.cmult_by_timestep_expr(
                        c_t, pos_i, True) +
                             expr_seq[0].mask.cmult_by_timestep_expr(
                                 c[-1], pos_i, False))
                    h.append(expr_seq[0].mask.cmult_by_timestep_expr(
                        h_t, pos_i, True) +
                             expr_seq[0].mask.cmult_by_timestep_expr(
                                 h[-1], pos_i, False))
            self._final_states.append(
                transducers.FinalTransducerState(h[-1], c[-1]))
            cur_input = [h[1:]]

        return expression_seqs.ExpressionSequence(expr_list=h[1:],
                                                  mask=expr_seq[0].mask)
Example #30
    def transduce(self, h_below: 'expression_seqs.ExpressionSequence', h_above,
                  z_below) -> 'expression_seqs.ExpressionSequence':
        if self.c is None:
            self.c = dy.zeroes(dim=(self.hidden_dim, ))  #?? does (hidden,) take care of batch_size?
        if self.h is None:
            self.h = dy.zeroes(dim=(self.hidden_dim, ))
        if self.z is None:
            self.z = dy.ones(dim=(1, ))

        W_1l_r = dy.parameter(self.p_W_1l_r)
        bias = dy.parameter(self.p_bias)
        h = self.h  # self.h is already an expression; dy.parameter() on it would be a type error

        s_recur = W_1l_r * h  #matrix multiply is *, element-wise is dy.cmult. CURRERROR: stale expression
        if not self.last_layer:
            W_2l_td = dy.parameter(self.p_W_2l_td)
            W_0l_bu = dy.parameter(self.p_W_0l_bu)
            s_bottomup = W_0l_bu * h_below  #?? this is becoming (2049,). does it need to be (2049,1) to do scalar * matrix?
            s_topdown = W_2l_td * h_above
        else:
            s_topdown = dy.zeroes(
                s_recur.dim()[0][0],
            )  #?? this gets the shape e.g. ((5, 1), 1). do i actually want batch_size as well?
            s_bottomup = W_1l_r * h
        s_bottomup = dy.cmult(
            z_below, s_bottomup
        )  #to handle batched scalar * matrix -> e.g. (1x10, 2049x10)
        s_topdown = dy.cmult(
            self.z, s_topdown
        )  #will be zeros if last_layer. is this right, or should z=1 in this case ??

        fslice = s_recur + s_topdown + s_bottomup + bias  #?? checkme. bias has same shape as s_recur et al? [4*hidden+1, batch_size]?

        i_ft = dy.pick_range(fslice, 0, self.hidden_dim)
        i_it = dy.pick_range(fslice, self.hidden_dim, self.hidden_dim * 2)
        i_ot = dy.pick_range(fslice, self.hidden_dim * 2, self.hidden_dim * 3)
        i_gt = dy.pick_range(fslice, self.hidden_dim * 3, self.hidden_dim * 4)
        f_t = dy.logistic(
            i_ft + 1.0
        )  #+1.0 bc a paper said it was better to init that way (matthias)
        i_t = dy.logistic(i_it)
        o_t = dy.logistic(i_ot)
        g_t = dy.tanh(i_gt)

        #z * normal_update + (1-z)*copy: ie, when z_below is 0, z_new = z (copied prev timestamp). when z_below is 1, z_new = dy.round etc

        #hier = True
        #        z_tmp = dy.pick_range(fslice, self.hidden_dim*4,self.hidden_dim*4+1)
        #        z_tilde = dy.logistic(z_tmp)  #original: hard sigmoid + slope annealing (a)
        #        z_new = dy.cmult(1-z_below, self.z) + dy.cmult(z_below, dy.round(z_tilde, gradient_mode="straight_through_gradient"))

        #hier = False
        z_tmp = dy.pick_range(fslice, self.hidden_dim * 4,
                              self.hidden_dim * 4 + 1)
        z_tilde = dy.logistic(
            z_tmp)  #original: hard sigmoid + slope annealing (a)
        z_new = dy.round(
            z_tilde, gradient_mode="straight_through_gradient"
        )  #use straight-through estimator for gradient: step fn forward, hard sigmoid backward

        #z = z_l,t-1
        #z_below = z_l-1,t

        #        if self.z.value() == 1: #FLUSH
        #            c_new = dy.cmult(i_t, g_t)
        #            h_new = dy.cmult(o_t, dy.tanh(c_new))
        #        elif z_below.value() == 0: #COPY

        # if flush removed, only copy or normal update
        # when z_below is 0, c_new and h_new are self.c and self.h. when z_below is 1, c_new, h_new = normal update
        c_new = dy.cmult((1 - z_below), self.c) + dy.cmult(
            z_below, (dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)))
        h_new = dy.cmult((1 - z_below), self.h) + dy.cmult(
            z_below, dy.cmult(o_t, dy.tanh(c_new)))

        #        if z_below.value() == 0: #COPY
        #            c_new = self.c
        #            h_new = self.h
        #        else: #UPDATE
        #            c_new = dy.cmult(f_t, self.c) + dy.cmult(i_t, g_t)
        #            h_new = dy.cmult(o_t, dy.tanh(c_new))

        self.c = c_new
        self.h = h_new
        self.z = z_new

        return h_new, z_new
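In equation form, the copy-or-update rule implemented above (with the original HM-LSTM flush branch removed, as the comments note) is

$$c_t = (1 - z^{l-1}_t) \odot c_{t-1} + z^{l-1}_t \odot \big(f_t \odot c_{t-1} + i_t \odot g_t\big), \qquad h_t = (1 - z^{l-1}_t) \odot h_{t-1} + z^{l-1}_t \odot \big(o_t \odot \tanh(c_t)\big).$$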
Example #31
 def __init__(self, embedding_size, batch_size):
     self.vector_zero = dynet.zeroes((embedding_size, ), batch_size)
     self.reading_depth = dynet.inputTensor([1.0] * batch_size, True)
     self.elements = []