Example #1
    def word_repr(self, char_seq):
        # obtain the word representation when given its character sequence
        wlen = len(char_seq)
        if 'rgW%d'%wlen not in self.param_exprs:
            self.param_exprs['rgW%d'%wlen] = dy.parameter(self.params['reset_gate_W'][wlen-1])
            self.param_exprs['rgb%d'%wlen] = dy.parameter(self.params['reset_gate_b'][wlen-1])
            self.param_exprs['cW%d'%wlen] = dy.parameter(self.params['com_W'][wlen-1])
            self.param_exprs['cb%d'%wlen] = dy.parameter(self.params['com_b'][wlen-1])
            self.param_exprs['ugW%d'%wlen] = dy.parameter(self.params['update_gate_W'][wlen-1])
            self.param_exprs['ugb%d'%wlen] = dy.parameter(self.params['update_gate_b'][wlen-1])
          
        chars = dy.concatenate(char_seq)
        reset_gate = dy.logistic(self.param_exprs['rgW%d'%wlen] * chars + self.param_exprs['rgb%d'%wlen])
        comb = dy.concatenate([dy.tanh(self.param_exprs['cW%d'%wlen] * dy.cmult(reset_gate,chars) + self.param_exprs['cb%d'%wlen]),chars])
        update_logits = self.param_exprs['ugW%d'%wlen] * comb + self.param_exprs['ugb%d'%wlen]
        
        update_gate = dy.transpose(dy.concatenate_cols([dy.softmax(dy.pickrange(update_logits,i*(wlen+1),(i+1)*(wlen+1))) for i in range(self.options['ndims'])]))
        
        # The following implementation of the softmax function is not safe, but faster...
        #exp_update_logits = dy.exp(dy.reshape(update_logits,(self.options['ndims'],wlen+1)))
        #update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] *(wlen+1)))
        #assert (not np.isnan(update_gate.npvalue()).any())

        word = dy.sum_cols(dy.cmult(update_gate,dy.reshape(comb,(self.options['ndims'],wlen+1))))
        return word
Example #2
 def word_assoc_score(self, source_idx, target_idx, relation):
     """
     NOTE THAT DROPOUT IS BEING APPLIED HERE
     :param source_idx: embedding index of source atom
     :param target_idx: embedding index of target atom
     :param relation: relation type
     :return: score
     """
     # prepare
     s = self.embeddings[source_idx]
     if self.no_assoc:
         A = dy.const_parameter(self.word_assoc_weights[relation])
     else:
         A = dy.parameter(self.word_assoc_weights[relation])
     A = dy.dropout(A, self.dropout)  # dy.dropout returns a new expression, so assign it back
     t = self.embeddings[target_idx]
     
     # compute
     if self.mode == BILINEAR_MODE:
         return dy.transpose(s) * A * t
     elif self.mode == DIAG_RANK1_MODE:
         diag_A = dyagonalize(A[0])
         rank1_BC = A[1] * dy.transpose(A[2])
         ABC = diag_A + rank1_BC
         return dy.transpose(s) * ABC * t
     elif self.mode == TRANSLATIONAL_EMBED_MODE:
         return -dy.l2_norm(s - t + A)
     elif self.mode == DISTMULT:
         return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
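A minimal, self-contained sketch of the DISTMULT branch above, with made-up vectors standing in for the source embedding, relation parameter and target embedding:

import dynet as dy

# Hedged illustration of the DistMult score: elementwise product of source
# embedding, relation vector and target embedding, summed to a scalar.
dy.renew_cg()
s = dy.inputVector([1.0, 0.0, 2.0])   # source embedding (hypothetical)
A = dy.inputVector([0.5, 1.0, 1.0])   # relation vector (hypothetical)
t = dy.inputVector([2.0, 3.0, 1.0])   # target embedding (hypothetical)
score = dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
print(score.npvalue())  # 1*0.5*2 + 0*1*3 + 2*1*1 = 3.0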
Example #3
 def predict_chunks_by_tokens(self, w_t, chunk_batch):
     ender = [self.lattice_vocab.chunk_end.i] * self.BATCH_SIZE
     lps = []
     state = self.lattice_rnn.initial_state(dropout=self.DROPOUT)
     cs = [[self.lattice_vocab.chunk_start.i] * self.BATCH_SIZE
           ] + chunk_batch
     cum_lp = dynet.scalarInput(0.0, device=self.args.param_device)
     for i, (cc, nc) in enumerate(zip(cs, cs[1:])):
         if self.args.concat_context_vector:
             x_t = dynet.pick_batch(self.vocab_R, cc)
             state.add_input(x_t)
         else:
             if i == 0:
                 state.add_input(self.project_main_to_lattice_init_R * w_t)
             else:
                 x_t = dynet.pick_batch(self.vocab_R, cc)
                 state.add_input(x_t)
         y_t = state.output()
         y_t = dynet.to_device(y_t, self.args.param_device)
         if self.DROPOUT:
             y_t = dynet.cmult(y_t, self.dropout_mask_lattice_y_t)
         if self.args.concat_context_vector:
             y_t = dynet.concatenate([y_t, w_t])
         r_t = dynet.affine_transform([
             self.vocab_bias, self.vocab_R,
             dynet.tanh(
                 dynet.affine_transform(
                     [self.lattice_bias, self.lattice_R, y_t]))
         ])
         if i > 0:
             lps.append(cum_lp + -dynet.pickneglogsoftmax_batch(r_t, ender))
         cum_lp = cum_lp + -dynet.pickneglogsoftmax_batch(r_t, nc)
     lps.append(cum_lp)
     return lps
Example #4
    def calc_loss(self, src, trg, loss_calculator):
        self.start_sent(src)
        initial_states = self._encode_src(src)
        # Calculate losses from multiple initial states
        losses = []
        for initial_state in initial_states:
            model_loss = FactoredLossExpr()
            model_loss.add_factored_loss_expr(
                loss_calculator.calc_loss(self, initial_state, src, trg))

            if self.global_fertility != 0:
                masked_attn = self.attender.attention_vecs
                if trg.mask is not None:
                    trg_mask = 1 - (trg.mask.np_arr.transpose())
                    masked_attn = [
                        dy.cmult(attn, dy.inputTensor(mask, batched=True))
                        for attn, mask in zip(masked_attn, trg_mask)
                    ]
                model_loss.add_loss("fertility",
                                    self._global_fertility(masked_attn))
            losses.append(model_loss)
        try:
            total_loss = FactoredLossExpr()
            for factored_loss in losses:
                total_loss.add_factored_loss_expr(factored_loss)
            return total_loss
        finally:
            self.losses = losses
Example #5
def aggregate_masked_loss(x: Tensor,
                          mask: 'xnmt.batchers.Mask' = None) -> Tensor:
    """
  Aggregate loss values for unmasked entries.

  Args:
    x: Batched sequence of losses.
    mask: An optional mask for the case of outputs of unequal lengths.

  Returns:
    Batched sequence of losses, with masked ones zeroed out.
  """
    if xnmt.backend_dynet:
        if mask:
            x = dy.cmult(x, dy.inputTensor(1.0 - mask.np_arr.T, batched=True))
        return dy.sum_elems(x)
    else:
        if mask:
            x = torch.mul(
                x,
                torch.as_tensor(1.0 - mask.np_arr,
                                dtype=x.dtype,
                                device=xnmt.device))
        return torch.sum(x, dim=tuple(range(1, len(
            x.size()))))  # sum over all but batch elems
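A hedged, self-contained check of the DyNet branch above (all names and numbers are illustrative): two sequences of lengths 3 and 2 padded to length 3, where the mask marks padded positions with 1, so 1 - mask keeps only the real tokens.

import dynet as dy
import numpy as np

dy.renew_cg()
# per-step losses of 1.0 for two sequences padded to length 3: dims ((3,), batch 2)
per_step = dy.inputTensor(np.ones((3, 2)), batched=True)
mask_np = np.array([[0., 0., 0.],
                    [0., 0., 1.]])   # batch x time, 1 marks padding
masked = dy.cmult(per_step, dy.inputTensor(1.0 - mask_np.T, batched=True))
print(dy.sum_elems(masked).npvalue())  # one aggregated loss per batch element: 3.0 and 2.0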
Example #6
    def word_repr(self, char_seq, cembs):
        # obtain the word representation when given its character sequence

        wlen = len(char_seq)
        if 'rgW%d' % wlen not in self.param_exprs:
            self.param_exprs['rgW%d' % wlen] = dy.parameter(
                self.params['reset_gate_W'][wlen - 1])
            self.param_exprs['rgb%d' % wlen] = dy.parameter(
                self.params['reset_gate_b'][wlen - 1])
            self.param_exprs['cW%d' % wlen] = dy.parameter(
                self.params['com_W'][wlen - 1])
            self.param_exprs['cb%d' % wlen] = dy.parameter(
                self.params['com_b'][wlen - 1])

        chars = dy.concatenate(cembs)
        reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                                 self.param_exprs['rgb%d' % wlen])
        word = dy.tanh(self.param_exprs['cW%d' % wlen] *
                       dy.cmult(reset_gate, chars) +
                       self.param_exprs['cb%d' % wlen])
        if self.known_words is not None and tuple(
                char_seq) in self.known_words:
            return (word + dy.lookup(self.params['word_embed'],
                                     self.known_words[tuple(char_seq)])) / 2.
        return word
Example #7
 def _fast_sample(self, prob, temperature=1):
     temperature = temperature / 2
     bern = dy.random_bernoulli(256, 0.5,
                                scale=temperature) + (1.0 - temperature)
     prob = dy.cmult(prob, bern)
     # print prob.npvalue().argmax()
     return prob.npvalue().argmax()
Example #8
 def reparameterize(self, mu, logvar):
     if self.training:
         std = dy.exp(logvar * 0.5)
         eps = dy.random_normal(dim=std.dim()[0], mean=0.0, stddev=1.0)
         return dy.cmult(eps, std) + mu
     else:
         return mu
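A hedged usage sketch of the reparameterization trick above, with made-up inputs: z = mu + exp(0.5 * logvar) * eps with eps ~ N(0, I), so gradients flow through mu and logvar but not through the random draw.

import dynet as dy

dy.renew_cg()
mu = dy.inputVector([0.0, 1.0])      # hypothetical posterior mean
logvar = dy.inputVector([0.0, 0.0])  # log-variance 0, i.e. std = 1
eps = dy.random_normal(dim=2, mean=0.0, stddev=1.0)
z = dy.cmult(dy.exp(logvar * 0.5), eps) + mu
print(z.npvalue())                   # one 2-dim sample centered at [0, 1]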
Example #9
  def calc_loss(self, src, trg, loss_calculator):
    self.start_sent(src)
    embeddings = self.src_embedder.embed_sent(src)
    encodings = self.encoder(embeddings)
    self.attender.init_sent(encodings)
    # Initialize the hidden state from the encoder
    ss = mark_as_batch([Vocab.SS] * len(src)) if is_batched(src) else Vocab.SS
    initial_state = self.decoder.initial_state(self.encoder.get_final_states(), self.trg_embedder.embed(ss))
    # Compose losses
    model_loss = LossBuilder()
    model_loss.add_loss("mle", loss_calculator(self, initial_state, src, trg))

    if self.calc_global_fertility or self.calc_attention_entropy:
      # philip30: I assume that attention_vecs is already masked on the source side.
      # Now apply the mask to the target.
      masked_attn = self.attender.attention_vecs
      if trg.mask is not None:
        trg_mask = trg.mask.get_active_one_mask().transpose()
        masked_attn = [dy.cmult(attn, dy.inputTensor(mask, batched=True)) for attn, mask in zip(masked_attn, trg_mask)]

    if self.calc_global_fertility:
      model_loss.add_loss("fertility", self.global_fertility(masked_attn))
    if self.calc_attention_entropy:
      model_loss.add_loss("H(attn)", self.attention_entropy(masked_attn))

    return model_loss
Example #10
 def embed(self, x):
   if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
     batch_size = len(x) if xnmt.batcher.is_batched(x) else 1
     self.word_id_mask = [set(np.random.choice(self.vocab_size, int(self.vocab_size * self.word_dropout), replace=False)) for _ in range(batch_size)]
   # single mode
   if not xnmt.batcher.is_batched(x):
     if self.train and self.word_id_mask and x in self.word_id_mask[0]:
       ret = dy.zeros((self.emb_dim,))
     else:
       ret = self.embeddings[x]
       if self.fix_norm != None:
         ret = dy.cdiv(ret, dy.l2_norm(ret))
         if self.fix_norm != 1:
           ret *= self.fix_norm
   # minibatch mode
   else:
     ret = self.embeddings.batch(x)
     if self.fix_norm != None:
       ret = dy.cdiv(ret, dy.l2_norm(ret))
       if self.fix_norm != 1:
         ret *= self.fix_norm
     if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(len(x))):
       dropout_mask = dy.inputTensor(np.transpose([[0.0]*self.emb_dim if x[i] in self.word_id_mask[i] else [1.0]*self.emb_dim for i in range(len(x))]), batched=True)
       ret = dy.cmult(ret, dropout_mask)
   if self.train and self.weight_noise > 0.0:
     ret = dy.noise(ret, self.weight_noise)
   return ret
Example #11
 def sample_one(
     self,
     translator: 'xnmt.models.translators.AutoRegressiveTranslator',
     initial_state: decoders.AutoRegressiveDecoderState,
     forced_trg_ids: Optional[Sequence[numbers.Integral]] = None
 ) -> SearchOutput:
     # Search variables
     current_words = None
     current_state = initial_state
     done = None
     # Outputs
     logsofts = []
     samples = []
     states = []
     attentions = []
     masks = []
     # Sample to the max length
     for length in range(self.max_len):
         translator_output = translator.generate_one_step(
             current_words, current_state)
         if forced_trg_ids is None:
             sample = translator_output.logsoftmax.tensor_value(
             ).categorical_sample_log_prob().as_numpy()
             if len(sample.shape) == 2:
                 sample = sample[0]
         else:
             sample = [
                 forced_trg[length]
                 if forced_trg.sent_len() > length else Vocab.ES
                 for forced_trg in forced_trg_ids
             ]
         logsoft = dy.pick_batch(translator_output.logsoftmax, sample)
         if done is not None:
             sample = [
                 sample[i] if not done[i] else Vocab.ES
                 for i in range(len(done))
             ]
             # masking for logsoftmax
             mask = [1 if not done[i] else 0 for i in range(len(done))]
             logsoft = dy.cmult(logsoft, dy.inputTensor(mask, batched=True))
             masks.append(mask)
         # Appending output
         logsofts.append(logsoft)
         samples.append(sample)
         states.append(translator.get_nobp_state(translator_output.state))
         attentions.append(translator_output.attention)
         # Next time step
         current_words = sample
         current_state = translator_output.state
         # Check done
         done = [x == Vocab.ES for x in sample]
         # Check if we are done.
         if all(done):
             break
     # Packing output
     scores = dy.esum(logsofts).npvalue()
     masks.insert(0, [1 for _ in range(len(done))])
     samples = np.stack(samples, axis=1)
     return SearchOutput(samples, attentions, scores, logsofts, states,
                         masks)
Example #12
 def generate_output(self,
                     translator,
                     initial_state,
                     src_length=None,
                     forced_trg_ids=None):
     # Output variables
     score = []
     word_ids = []
     attentions = []
     logsoftmaxes = []
     states = []
     masks = []
     # Search Variables
     done = None
     current_state = initial_state
     for length in range(self.max_len):
         prev_word = word_ids[length - 1] if length > 0 else None
         current_output = translator.generate_one_step(
             prev_word, current_state)
         current_state = current_output.state
         if forced_trg_ids is None:
             word_id = np.argmax(current_output.logsoftmax.npvalue(),
                                 axis=0)
             if len(word_id.shape) == 2:
                 word_id = word_id[0]
         else:
             if xnmt.batcher.is_batched(forced_trg_ids):
                 word_id = [
                     forced_trg_ids[i][length]
                     for i in range(len(forced_trg_ids))
                 ]
             else:
                 word_id = [forced_trg_ids[length]]
         logsoft = dy.pick_batch(current_output.logsoftmax, word_id)
         if done is not None:
             word_id = [
                 word_id[i] if not done[i] else Vocab.ES
                 for i in range(len(done))
             ]
             # masking for logsoftmax
             mask = [1 if not done[i] else 0 for i in range(len(done))]
             logsoft = dy.cmult(logsoft, dy.inputTensor(mask, batched=True))
             masks.append(mask)
         # Packing outputs
         score.append(logsoft.npvalue())
         word_ids.append(word_id)
         attentions.append(current_output.attention)
         logsoftmaxes.append(
             dy.pick_batch(current_output.logsoftmax, word_id))
         states.append(translator.get_nobp_state(current_state))
         # Check if we are done.
         done = [x == Vocab.ES for x in word_id]
         if all(done):
             break
     masks.insert(0, [1 for _ in range(len(done))])
     words = np.stack(word_ids, axis=1)
     score = np.sum(score, axis=0)
     return [
         SearchOutput(words, attentions, score, logsoftmaxes, states, masks)
     ]
Example #13
def reparameterize(mu, logvar):
    # Get z by reparameterization.
    d = mu.dim()[0][0]
    eps = dy.random_normal(d)
    std = dy.exp(logvar * 0.5)

    return mu + dy.cmult(std, eps)
Example #14
    def calc_nll(self, src: Union[batchers.Batch, sent.Sentence],
                 trg: Union[batchers.Batch, sent.Sentence]):
        batch_size, encodings, outputs, seq_len = self._encode_src(src)

        if trg.sent_len() != seq_len:
            if self.auto_cut_pad:
                trg = self._cut_or_pad_targets(seq_len, trg)
            else:
                raise ValueError(
                    f"src/trg length do not match: {seq_len} != {len(trg[0])}")

        ref_action = np.asarray([trg_sent.words for trg_sent in trg]).reshape(
            (seq_len * batch_size, ))
        loss_expr_perstep = self.scorer.calc_loss(
            outputs, batchers.mark_as_batch(ref_action))
        # loss_expr_perstep = dy.pickneglogsoftmax_batch(outputs, ref_action)
        loss_expr_perstep = dy.reshape(loss_expr_perstep, (seq_len, ),
                                       batch_size=batch_size)
        if trg.mask:
            loss_expr_perstep = dy.cmult(
                loss_expr_perstep,
                dy.inputTensor(1.0 - trg.mask.np_arr.T, batched=True))
        loss_expr = dy.sum_elems(loss_expr_perstep)
        units = [t.len_unpadded() for t in trg]
        return LossExpr(loss_expr, units)
Example #15
  def calc_loss(self, src, trg, loss_calculator):
    """
    :param src: source sequence (unbatched, or batched + padded)
    :param trg: target sequence (unbatched, or batched + padded); losses will be accumulated only if trg_mask[batch,pos]==0, or no mask is set
    :param loss_calculator:
    :returns: (possibly batched) loss expression
    """
    self.start_sent(src)
    embeddings = self.src_embedder.embed_sent(src)
    encodings = self.encoder(embeddings)
    self.attender.init_sent(encodings)
    # Initialize the hidden state from the encoder
    ss = mark_as_batch([Vocab.SS] * len(src)) if is_batched(src) else Vocab.SS
    dec_state = self.decoder.initial_state(self.encoder.get_final_states(), self.trg_embedder.embed(ss))
    # Compose losses
    model_loss = LossBuilder()
    model_loss.add_loss("mle", loss_calculator(self, dec_state, src, trg))

    if self.calc_global_fertility or self.calc_attention_entropy:
      # philip30: I assume that attention_vecs is already masked on the source side.
      # Now apply the mask to the target.
      masked_attn = self.attender.attention_vecs
      if trg.mask is not None:
        trg_mask = trg.mask.get_active_one_mask().transpose()
        masked_attn = [dy.cmult(attn, dy.inputTensor(mask, batched=True)) for attn, mask in zip(masked_attn, trg_mask)]

    if self.calc_global_fertility:
      model_loss.add_loss("fertility", self.global_fertility(masked_attn))
    if self.calc_attention_entropy:
      model_loss.add_loss("H(attn)", self.attention_entropy(masked_attn))

    return model_loss
Example #16
 def _attend(self, query, mask=None):
     query = unsqueeze(query, 0) # ((1, H), B)
     # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
     attn_scores = dy.transpose(query * self.context)
     if mask is not None:
         attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
     return dy.softmax(attn_scores)
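Several attention examples in this listing pass the mask as a pair (keep, pad). A hedged sketch of how such a pair could be built from sequence lengths; the shapes and names here are assumptions, not taken from the original code:

import dynet as dy
import numpy as np

T, B = 5, 2          # max source length and batch size (hypothetical)
lengths = [5, 3]     # true lengths per batch element (hypothetical)
keep = np.zeros((T, 1, B), dtype=np.float32)
for b, n in enumerate(lengths):
    keep[:n, 0, b] = 1.0   # 1 for real positions, 0 for padding
dy.renew_cg()
mask = (dy.inputTensor(keep, batched=True),        # multiplied into the scores
        dy.inputTensor(1.0 - keep, batched=True))  # routes -1e9 to padded positions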
Example #17
  def attention_entropy(self, a):
    entropy = []
    for a_i in a:
      a_i += EPSILON
      entropy.append(dy.cmult(a_i, dy.log(a_i)))

    return -dy.sum_elems(dy.esum(entropy))
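A hedged sanity check of the entropy computation above, assuming EPSILON is a small constant such as 1e-10: a uniform attention vector over four positions gives an entropy of log 4 ≈ 1.386.

import dynet as dy

EPSILON = 1e-10  # assumed value; the original constant is defined elsewhere

dy.renew_cg()
a = [dy.inputVector([0.25, 0.25, 0.25, 0.25])]  # one decoding step, four source positions
entropy = [dy.cmult(a_i + EPSILON, dy.log(a_i + EPSILON)) for a_i in a]
print(-dy.sum_elems(dy.esum(entropy)).npvalue())  # about 1.386 = log(4)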
Example #18
 def _upsample(self, mgc, start, stop):
     mgc_index = start // len(self.upsample_w_s)
     ups_index = start % len(self.upsample_w_s)
     upsampled = []
     mgc_vect = dy.inputVector(mgc[mgc_index])
     for x in range(stop - start):
         sigm = dy.logistic(self.upsample_w_s[ups_index].expr(update=True) *
                            mgc_vect +
                            self.upsample_b_s[ups_index].expr(update=True))
         tnh = dy.tanh(self.upsample_w_t[ups_index].expr(update=True) *
                       mgc_vect +
                       self.upsample_b_t[ups_index].expr(update=True))
         r = dy.cmult(sigm, tnh)
         upsampled.append(r)
         ups_index += 1
         if ups_index == len(self.upsample_w_s):
             ups_index = 0
             mgc_index += 1
             if mgc_index == len(
                     mgc
             ):  # last frame is sometimes not processed, but it should have similar parameters
                 mgc_index -= 1
             else:
                 mgc_vect = dy.inputVector(mgc[mgc_index])
     return upsampled
Example #19
def cross_entropy_loss(y, yhat):
    """
    Compute the cross entropy loss in tensorflow.
    The loss should be summed over the current minibatch.

    y is a one-hot tensor of shape (n_samples, n_classes) and yhat is a tensor
    of shape (n_samples, n_classes). y should be of dtype tf.int32, and yhat should
    be of dtype tf.float32.

    The functions tf.to_float, tf.reduce_sum, and tf.log might prove useful. (Many
    solutions are possible, so you may not need to use all of these functions).

    Note: You are NOT allowed to use the tensorflow built-in cross-entropy
                functions.

    Args:
        y:    tf.Tensor with shape (n_samples, n_classes). One-hot encoded.
        yhat: tf.Tensor with shape (n_samples, n_classes). Each row encodes a
                    probability distribution and should sum to 1.
    Returns:
        out:  tf.Tensor with shape (1,) (Scalar output). You need to construct this
                    tensor in the problem.
    """

    ### YOUR CODE HERE
    #out = (dy.sum_elems(out) / y.value().shape[0]).npvalue().reshape([])
    out = dy.sum_elems(-dy.cmult(y, dy.log(yhat)))
    ### END YOUR CODE

    return out
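A hedged numeric check of the formula implemented above: for one-hot y, the summed cross entropy is just the negative log probability assigned to each true class.

import dynet as dy
import numpy as np

dy.renew_cg()
y = dy.inputTensor(np.array([[1.0, 0.0, 0.0],
                             [0.0, 1.0, 0.0]]))     # two one-hot rows
yhat = dy.inputTensor(np.array([[0.5, 0.3, 0.2],
                                [0.1, 0.8, 0.1]]))  # two probability rows
out = dy.sum_elems(-dy.cmult(y, dy.log(yhat)))
print(out.npvalue())  # -(log 0.5 + log 0.8), about 0.916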
Example #20
 def cross_entropy_structbag(self, P, Q):
     """
     P (K x m) represents a distribution over STRUCTURED labels where each
     label is a BAG of K INDEPENDENT symbols taking values in {1 ... m}.
     That is, z = (z1 ... zK) is assigned probability P1(z1) * ... * PK(zK).
     (Similarly for Q.) By the independence, H(P, Q) = sum_k H(Pk, Qk).
     """
     return -dy.sum_dim(dy.cmult(P, self.log2(Q)), [0, 1])
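A hedged check of the factorization stated in the docstring, assuming self.log2(Q) is simply an elementwise base-2 logarithm: with K = 2 independent symbols over m = 2 values, H(P, Q) equals the sum of the two per-symbol cross entropies.

import dynet as dy
import numpy as np

dy.renew_cg()
P = dy.inputTensor(np.array([[1.0, 0.0],       # P1
                             [0.5, 0.5]]))     # P2
Q = dy.inputTensor(np.array([[0.5, 0.5],       # Q1
                             [0.25, 0.75]]))   # Q2
log2Q = dy.log(Q) * (1.0 / np.log(2.0))        # assumed meaning of self.log2
H = -dy.sum_dim(dy.cmult(P, log2Q), [0, 1])
print(H.npvalue())  # H(P1,Q1) + H(P2,Q2) = 1.0 + ~1.21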
Example #21
def attend_vector(encoder_outputs,state_factor_vector):
    encoderOutputLength=state_factor_vector.npvalue().shape[0]
    hiddenSize=encoder_outputs[0].npvalue().shape[0]
    
    factor_Products=[dy.cmult(dy.concatenate([state_factor_vector[l]]*hiddenSize),encoder_outputs[l]) for l in range(encoderOutputLength)]
   
    factor_Products=dy.esum(factor_Products)
    return factor_Products
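A hedged toy run of the weighted-sum idea above, with made-up encoder outputs and attention factors: each encoder state is scaled by its factor (broadcast over the hidden dimension by repetition) and the results are summed into a context vector.

import dynet as dy

dy.renew_cg()
encoder_outputs = [dy.inputVector([1.0, 2.0]), dy.inputVector([3.0, 4.0])]
factors = dy.inputVector([0.25, 0.75])
context = dy.esum([dy.cmult(dy.concatenate([factors[l]] * 2), encoder_outputs[l])
                   for l in range(2)])
print(context.npvalue())  # 0.25*[1,2] + 0.75*[3,4] = [2.5, 3.5]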
Example #22
 def _attend(self, query, mask=None):
     query = unsqueeze(query, 0)  # ((1, H), B)
     # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
     attn_scores = dy.transpose(query * self.context)
     if mask is not None:
         attn_scores = dy.cmult(attn_scores,
                                mask[0]) + (mask[1] * dy.scalarInput(-1e9))
     return dy.softmax(attn_scores)
Example #23
    def embed_sentence(self, ws, pwords, ts, chars, is_train):
        cembed = [dy.lookup_batch(self.clookup, c) for c in chars]
        char_fwd, char_bckd = self.char_lstm.builder_layers[0][0].initial_state().transduce(cembed)[-1], \
                              self.char_lstm.builder_layers[0][1].initial_state().transduce(reversed(cembed))[-1]
        crnn = dy.reshape(dy.concatenate_cols([char_fwd, char_bckd]), (self.options.we, ws.shape[0] * ws.shape[1]))
        cnn_reps = [list() for _ in range(len(ws))]
        for i in range(ws.shape[0]):
            cnn_reps[i] = dy.pick_batch(crnn, [i * ws.shape[1] + j for j in range(ws.shape[1])], 1)

        wembed = [dy.lookup_batch(self.wlookup, ws[i]) + dy.lookup_batch(self.elookup, pwords[i]) + cnn_reps[i] for i in range(len(ws))]
        posembed = [dy.lookup_batch(self.tlookup, ts[i]) for i in range(len(ts))]
        if (not is_train) or self.options.dropout == 0:
            return [dy.concatenate([wembed[i], posembed[i]]) for i in range(len(ts))]
        else:
            emb_masks = self.generate_emb_mask(ws.shape[0], ws.shape[1])
            return [dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)]) for w, pos, (wm, posm) in
                      zip(wembed, posembed, emb_masks)]
Example #24
    def _encodings_to_label_log_probabilities(self, encodings, lmbd=None):
        label_scores = self.f_label(dy.concatenate_to_batch(encodings))
        label_scores_reshaped = dy.reshape(label_scores, (self.label_vocab.size, len(encodings)))

        if lmbd is not None:
            label_scores_reshaped = dy.cmult(label_scores_reshaped, lmbd)

        return dy.log_softmax(label_scores_reshaped)
Example #25
 def transduce(self, inputs, train):
     xs = inputs[:self.max_length]
     if not xs:
         return []
     for i in range(self.lstm_layers):
         for n, d in ("f", 1), ("b", -1):
             Wr, br, Wh = [self.params["%s%d%s" % (p, i, n)] for p in ("Wr", "br", "Wh")]
             hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
             hs = [hs_[0]]
             for t in range(1, len(hs_)):
                 r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                 hs.append(dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
             xs = hs
             if train:
                 x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
                 xs = [dy.pick(x, i, 1) for i in range(len(xs))]
     return xs
Example #26
    def run_lstm(self, word_inputs, tag_inputs, isTrain=True):
        batch_size = word_inputs.shape[1]
        seq_len = word_inputs.shape[0]

        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) +
            dy.lookup_batch(self.pret_word_embs, w, update=False)
            for w in word_inputs
        ]
        tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            emb_inputs = [
                dy.concatenate([dy.cmult(w, wm),
                                dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
            ]
        else:
            emb_inputs = [
                dy.concatenate([w, pos])
                for w, pos in zip(word_embs, tag_embs)
            ]

        common_top_input, c_fs, c_bs = biLSTM(
            self.cLSTM_builders, emb_inputs, batch_size,
            self.dropout_clstm_input if isTrain else 0.,
            self.dropout_clstm_hidden if isTrain else 0.)
        common_top_recur = dy.concatenate_cols(common_top_input)

        private_top_input, p_fs, p_bs = biLSTM(
            self.pLSTM_builders, emb_inputs, batch_size,
            self.dropout_plstm_input if isTrain else 0.,
            self.dropout_plstm_hidden if isTrain else 0.)
        private_top_recur = dy.concatenate_cols(private_top_input)

        if isTrain:
            common_top_recur = dy.dropout_dim(common_top_recur, 1,
                                              self.dropout_mlp)
            private_top_recur = dy.dropout_dim(private_top_recur, 1,
                                               self.dropout_mlp)

        return common_top_recur, private_top_recur, p_fs, p_bs
Example #27
    def transduce(
        self, expr_seq: expression_seqs.ExpressionSequence
    ) -> expression_seqs.ExpressionSequence:
        if expr_seq.dim()[1] > 1:
            raise ValueError(
                f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}"
            )
        lattice = self.cur_src[0]
        Wx_iog = dy.parameter(self.p_Wx_iog)
        Wh_iog = dy.parameter(self.p_Wh_iog)
        b_iog = dy.parameter(self.p_b_iog)
        Wx_f = dy.parameter(self.p_Wx_f)
        Wh_f = dy.parameter(self.p_Wh_f)
        b_f = dy.parameter(self.p_b_f)
        h = []
        c = []

        batch_size = expr_seq.dim()[1]
        if self.dropout_rate > 0.0 and self.train:
            self.set_dropout_masks(batch_size=batch_size)

        for node_i in range(lattice.sent_len()):
            cur_node = lattice.nodes[node_i]
            val = expr_seq[node_i]
            if self.dropout_rate > 0.0 and self.train:
                val = dy.cmult(val, self.dropout_mask_x)
            i_ft_list = []
            if len(cur_node.nodes_prev) == 0:
                tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
            else:
                h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
                tmp_iog = dy.affine_transform(
                    [b_iog, Wx_iog, val, Wh_iog, h_tilde])
                for pred in cur_node.nodes_prev:
                    i_ft_list.append(
                        dy.logistic(
                            dy.affine_transform(
                                [b_f, Wx_f, val, Wh_f, h[pred]])))
            i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
            i_aot = dy.pick_range(tmp_iog, self.hidden_dim,
                                  self.hidden_dim * 2)
            i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2,
                                  self.hidden_dim * 3)

            i_it = dy.logistic(i_ait)
            i_ot = dy.logistic(i_aot)
            i_gt = dy.tanh(i_agt)
            if len(cur_node.nodes_prev) == 0:
                c.append(dy.cmult(i_it, i_gt))
            else:
                fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
                for i in range(1, len(cur_node.nodes_prev)):
                    fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
                c.append(fc + dy.cmult(i_it, i_gt))
            h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
            if self.dropout_rate > 0.0 and self.train:
                h_t = dy.cmult(h_t, self.dropout_mask_h)
            h.append(h_t)
        self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
        return expression_seqs.ExpressionSequence(expr_list=h)
Example #28
    def word_repr(self, char_seq):
        # obtain the word representation when given its character sequence
        wlen = len(char_seq)
        if 'rgW%d' % wlen not in self.param_exprs:
            self.param_exprs['rgW%d' % wlen] = dy.parameter(
                self.params['reset_gate_W'][wlen - 1])
            self.param_exprs['rgb%d' % wlen] = dy.parameter(
                self.params['reset_gate_b'][wlen - 1])
            self.param_exprs['cW%d' % wlen] = dy.parameter(
                self.params['com_W'][wlen - 1])
            self.param_exprs['cb%d' % wlen] = dy.parameter(
                self.params['com_b'][wlen - 1])
            self.param_exprs['ugW%d' % wlen] = dy.parameter(
                self.params['update_gate_W'][wlen - 1])
            self.param_exprs['ugb%d' % wlen] = dy.parameter(
                self.params['update_gate_b'][wlen - 1])

        chars = dy.concatenate(char_seq)
        reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                                 self.param_exprs['rgb%d' % wlen])
        comb = dy.concatenate([
            dy.tanh(self.param_exprs['cW%d' % wlen] *
                    dy.cmult(reset_gate, chars) +
                    self.param_exprs['cb%d' % wlen]), chars
        ])
        update_logits = self.param_exprs[
            'ugW%d' % wlen] * comb + self.param_exprs['ugb%d' % wlen]

        update_gate = dy.transpose(
            dy.concatenate_cols([
                dy.softmax(
                    dy.pickrange(update_logits, i * (wlen + 1),
                                 (i + 1) * (wlen + 1)))
                for i in range(self.options['ndims'])
            ]))

        # The following implementation of the softmax function is not safe, but faster...
        #exp_update_logits = dy.exp(dy.reshape(update_logits,(self.options['ndims'],wlen+1)))
        #update_gate = dy.cdiv(exp_update_logits, dy.concatenate_cols([dy.sum_cols(exp_update_logits)] *(wlen+1)))
        #assert (not np.isnan(update_gate.npvalue()).any())

        word = dy.sum_cols(
            dy.cmult(update_gate,
                     dy.reshape(comb, (self.options['ndims'], wlen + 1))))
        return word
Example #29
    def cosine_proximity(self, pred, gold):
        def l2_normalize(x):
            square_sum = dynet.sqrt(dynet.bmax(dynet.sum_elems(dynet.square(x)), np.finfo(float).eps * dynet.ones((1))[0]))
            return dynet.cdiv(x, square_sum)

        y_true = l2_normalize(pred)
        y_pred = l2_normalize(gold)

        return -dynet.sum_elems(dynet.cmult(y_true, y_pred))
Example #30
    def step(self, x, hx, cx):
        if not self.test:
            if self.dropout_x > 0:
                x = dy.cmult(self.dropout_mask_x, x)
            if self.dropout_h > 0:
                hx = dy.cmult(self.dropout_mask_h, hx)

        gates = dy.affine_transform(
            [self.bias, self.weight_ih, x, self.weight_hh, hx])
        i = dy.pickrange(gates, 0, self.n_hidden)
        f = dy.pickrange(gates, self.n_hidden, self.n_hidden * 2)
        g = dy.pickrange(gates, self.n_hidden * 2, self.n_hidden * 3)
        o = dy.pickrange(gates, self.n_hidden * 3, self.n_hidden * 4)

        i, f, g, o = dy.logistic(i), dy.logistic(f), dy.tanh(g), dy.logistic(o)
        cy = dy.cmult(f, cx) + dy.cmult(i, g)
        hy = dy.cmult(o, dy.tanh(cy))
        return hy, cy
Example #31
 def __cosine_loss(self, pred, gold):
     sn1 = dy.l2_norm(pred)
     sn2 = dy.l2_norm(gold)
     mult = dy.cmult(sn1, sn2)
     dot = dy.dot_product(pred, gold)
     div = dy.cdiv(dot, mult)
     vec_y = dy.scalarInput(2)
     res = dy.cdiv(1 - div, vec_y)
     return res
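A hedged check of the cosine loss above with two identical made-up vectors: (1 - cosine similarity) / 2 is 0 for vectors pointing in the same direction and 1 for opposite ones.

import dynet as dy

dy.renew_cg()
pred = dy.inputVector([1.0, 2.0, 3.0])
gold = dy.inputVector([1.0, 2.0, 3.0])
cos = dy.cdiv(dy.dot_product(pred, gold), dy.cmult(dy.l2_norm(pred), dy.l2_norm(gold)))
loss = dy.cdiv(1 - cos, dy.scalarInput(2))
print(loss.npvalue())  # about 0.0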
Example #32
    def __call__(self, *args):
        U = [dy.parameter(U_) for U_ in self.U]

        out = U[0] * args[0]
        for x, u in zip(args[1:], U[1:]):
            out = dy.cmult(out, u * x)

        out = dy.sum_cols(dy.transpose(out))
        return out
Example #33
 def gate_vecs(self, ht1, xt):
     b = self.expressions["b"]
     W = self.expressions["W"]
     gate_vecs = {}
     for g, activation in zip(self.gate_names, self.gate_activations):
         hin = ht1 if not g == "htilde" else dy.cmult(gate_vecs["r"], ht1)
         # use the (possibly reset-gated) hidden state hin, not ht1, for the recurrent term
         gate_vecs[g] = activation(
             dy.affine_transform([b[g], W["x"][g], xt, W["h"][g], hin]))
     return gate_vecs
Example #34
def attend(encoder_outputs,state_factor_matrix):
    miniBatchLength=state_factor_matrix.npvalue().shape[1]
    encoderOutputLength=state_factor_matrix.npvalue().shape[0]
    hiddenSize=encoder_outputs[0].npvalue().shape[0]

    factor_Products = dy.esum([dy.cmult(encoder_outputs[l], dy.concatenate([state_factor_matrix[l]] * hiddenSize))
                               for l in range(encoderOutputLength)])
    
    return factor_Products
Example #35
 def _attend(self, query, mask=None):
     # query ((H), B)
     # mask  ((T, 1), B)
     projected_state = self.decoder * query  # ((H,), B)
     non_lin = dy.tanh(dy.colwise_add(self.context_proj, projected_state))  # ((H, T), B)
     attn_scores = dy.transpose(self.v * non_lin)  # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
     if mask is not None:
         attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
     return dy.softmax(attn_scores)  # ((T, 1), B)
Example #36
def dyagonalize(col):
    """
    A convoluted way to make a dynet vector into a dynet matrix where it's the diagonal
    God I hope there's a better way.
    :param col: column vector in dynet format
    """
    col_dim = col.dim()[0][0]
    nump_eye = np.eye(col_dim)
    return dy.cmult(col, dy.inputTensor(nump_eye))
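A hedged usage sketch of the idea behind dyagonalize, with a made-up vector: elementwise multiplication of a column vector against an identity matrix broadcasts the vector down the diagonal.

import dynet as dy
import numpy as np

dy.renew_cg()
col = dy.inputVector([1.0, 2.0, 3.0])
diag = dy.cmult(col, dy.inputTensor(np.eye(3)))
print(diag.npvalue())
# [[1. 0. 0.]
#  [0. 2. 0.]
#  [0. 0. 3.]]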
Example #37
 def mask_embeddings(self, embeddings, mask):
   """
   We convert the embeddings of masked input sequence to zero vector
   """
   (embed_dim, _), _ = embeddings.dim()
   temp_mask = np.repeat(1. - mask[:, None, :], embed_dim, axis=1)
   temp_mask = dy.inputTensor(np.moveaxis(temp_mask, [1, 0, 2], [0, 2, 1]), batched=True)
   embeddings = dy.cmult(embeddings, temp_mask)
   return embeddings
Example #38
def dyagonalize(col):
    """
    A convoluted way to make a dynet vector into a dynet matrix where it's the diagonal
    God I hope there's a better way.
    :param col: column vector in dynet format
    """
    col_dim = col.dim()[0][0]
    nump_eye = np.eye(col_dim)
    return dy.cmult(col, dy.inputTensor(nump_eye))
Example #39
def dot_product_attention(query, key, value, mask=None, dropout=None):
    """Input Shape: ((D, T, H), B)"""
    scores = batch_matmul(transpose(key, 0, 1), query)
    if mask is not None:
        scores = dy.cmult(scores, mask[0]) + (mask[1] * -1e9)

    weights = folded_softmax(scores)

    if dropout is not None:
        weights = dy.dropout(weights, dropout)

    return batch_matmul(value, weights)
Example #40
 def expr_for_tree(self, tree):
     if tree.isleaf():
         return self.E[self.w2i.get(tree.label,0)]
     if len(tree.children) == 1:
         assert(tree.children[0].isleaf())
         emb = self.expr_for_tree(tree.children[0])
         Wi,Wo,Wu   = [dy.parameter(w) for w in self.WS]
         bi,bo,bu,_ = [dy.parameter(b) for b in self.BS]
         i = dy.logistic(Wi*emb + bi)
         o = dy.logistic(Wo*emb + bo)
         u = dy.tanh(    Wu*emb + bu)
         c = dy.cmult(i,u)
         expr = dy.cmult(o,dy.tanh(c))
         return expr
     assert(len(tree.children) == 2),tree.children[0]
     e1 = self.expr_for_tree(tree.children[0])
     e2 = self.expr_for_tree(tree.children[1])
     Ui,Uo,Uu = [dy.parameter(u) for u in self.US]
     Uf1,Uf2 = [dy.parameter(u) for u in self.UFS]
     bi,bo,bu,bf = [dy.parameter(b) for b in self.BS]
     e = dy.concatenate([e1,e2])
     i = dy.logistic(Ui*e + bi)
     o = dy.logistic(Uo*e + bo)
     f1 = dy.logistic(Uf1*e1 + bf)
     f2 = dy.logistic(Uf2*e2 + bf)
     u = dy.tanh(    Uu*e + bu)
     c = dy.cmult(i,u) + dy.cmult(f1,e1) + dy.cmult(f2,e2)
     h = dy.cmult(o,dy.tanh(c))
     expr = h
     return expr
Example #41
    def learn(self, batch_size):
        if self.prioritized:
            if not self.memory.is_full(): return -np.inf
            indices, exps, weights = self.memory.sample(batch_size, self.beta)
        else:
            exps = self.memory.sample(batch_size)
        obss, actions, rewards, obs_nexts, dones = self._process(exps)

        dy.renew_cg()
        target_network = self.target_network if self.use_double_dqn else self.network
        if self.dueling:
            target_values, v = target_network(obs_nexts, batched=True)
            target_values = target_values.npvalue() + v.npvalue()
        else:
            target_values = target_network(obs_nexts, batched=True)
            target_values = target_values.npvalue()
        target_values = np.max(target_values, axis=0)
        target_values = rewards + self.reward_decay * (target_values * (1 - dones))

        dy.renew_cg()
        if self.dueling:
            all_values_expr, v = self.network(obss, batched=True)
        else:
            all_values_expr = self.network(obss, batched=True)
        picked_values = dy.pick_batch(all_values_expr, actions)
        diff = (picked_values + v if self.dueling else picked_values) - dy.inputTensor(target_values, batched=True)
        if self.prioritized:
            self.memory.update(indices, np.transpose(np.abs(diff.npvalue())))
        losses = dy.pow(diff, dy.constant(1, 2))
        if self.prioritized:
            losses = dy.cmult(losses, dy.inputTensor(weights, batched=True))
        loss = dy.sum_batches(losses)
        loss_value = loss.npvalue()
        loss.backward()
        self.trainer.update()

        self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epsilon_lower)
        if self.prioritized:
            self.beta = min(self.beta + self.beta_increase, 1.)

        self.learn_step += 1
        if self.use_double_dqn and self.learn_step % self.n_replace_target == 0:
            self.target_network.update(self.network)
        return loss_value
Example #42
 def transitions(self):
     if self.mask is not None:
         return dy.cmult(self.transitions_p, dy.inputTensor(self.mask)) + dy.inputTensor(self.inv_mask)
     return self.transitions_p
Example #43
 def highway(input_, train):
     for func, weight, bias in zip(funcs, weights, biases):
         proj = dy.rectify(func(input_, train))
         transform = dy.logistic(dy.affine_transform([bias, weight, input_]))
         input_ = dy.cmult(transform, proj) + dy.cmult(input_, 1 - transform)
     return input_
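A minimal self-contained sketch of the same highway gating, y = t * relu(W_h x + b_h) + (1 - t) * x with t = sigmoid(W_t x + b_t); the parameter names below are illustrative rather than taken from the surrounding code.

import dynet as dy

pc = dy.ParameterCollection()
d = 4
p_Wh, p_bh = pc.add_parameters((d, d)), pc.add_parameters((d,))
p_Wt, p_bt = pc.add_parameters((d, d)), pc.add_parameters((d,))

dy.renew_cg()
Wh, bh = dy.parameter(p_Wh), dy.parameter(p_bh)
Wt, bt = dy.parameter(p_Wt), dy.parameter(p_bt)
x = dy.inputVector([0.1, -0.2, 0.3, 0.4])
proj = dy.rectify(dy.affine_transform([bh, Wh, x]))
transform = dy.logistic(dy.affine_transform([bt, Wt, x]))
y = dy.cmult(transform, proj) + dy.cmult(x, 1 - transform)
print(y.npvalue())  # gated mix of the transformed and untransformed input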
Example #44
 def __call__(self, in_expr):
     return self.act(dy.cmult(in_expr))
Example #45
    def __init__(self, vocab, w2i, pos, rels, options):
        if isinstance(options, dict):
            options = _dict_to_obj(options, 'Values')

        self.model = ParameterCollection()
        random.seed(1)
        self.trainer = AdamTrainer(self.model)

        self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify,
                            'tanh3': (lambda x: tanh(cmult(cmult(x, x), x)))}
        self.activation = self.activations[options.activation]

        self.blstm_flag = options.blstmFlag
        self.labels_flag = options.labelsFlag
        self.costaug_flag = options.costaugFlag
        self.bibi_flag = options.bibiFlag

        self.ldims = options.lstm_dims
        self.wdims = options.wembedding_dims
        self.pdims = options.pembedding_dims
        self.rdims = options.rembedding_dims
        self.layers = options.lstm_layers
        self.words_count = vocab
        self.vocab = {word: ind + 3 for word, ind in list(w2i.items())}
        self.pos = {word: ind + 3 for ind, word in enumerate(pos)}
        self.rels = {word: ind for ind, word in enumerate(rels)}
        self.irels = rels

        if self.bibi_flag:
            self.builders = [LSTMBuilder(1, self.wdims + self.pdims, self.ldims, self.model),
                             LSTMBuilder(1, self.wdims + self.pdims, self.ldims, self.model)]
            self.bbuilders = [LSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
                              LSTMBuilder(1, self.ldims * 2, self.ldims, self.model)]
        elif self.layers > 0:
            self.builders = \
                [LSTMBuilder(self.layers, self.wdims + self.pdims, self.ldims, self.model),
                 LSTMBuilder(self.layers, self.wdims + self.pdims, self.ldims, self.model)]
        else:
            self.builders = [SimpleRNNBuilder(1, self.wdims + self.pdims, self.ldims, self.model),
                             SimpleRNNBuilder(1, self.wdims + self.pdims, self.ldims, self.model)]

        self.hidden_units = options.hidden_units
        self.hidden2_units = options.hidden2_units

        self.vocab['*PAD*'] = 1
        self.pos['*PAD*'] = 1

        self.vocab['*INITIAL*'] = 2
        self.pos['*INITIAL*'] = 2

        self.wlookup = self.model.add_lookup_parameters((len(vocab) + 3, self.wdims))
        self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
        self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))

        self.hid_layer_foh = self.model.add_parameters((self.hidden_units, self.ldims * 2))
        self.hid_layer_fom = self.model.add_parameters((self.hidden_units, self.ldims * 2))
        self.hid_bias = self.model.add_parameters((self.hidden_units))

        self.hid2_layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
        self.hid2_bias = self.model.add_parameters((self.hidden2_units))

        self.out_layer = self.model.add_parameters(
            (1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))

        if self.labels_flag:
            self.rhid_layer_foh = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
            self.rhid_layer_fom = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
            self.rhid_bias = self.model.add_parameters((self.hidden_units))
            self.rhid2_layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
            self.rhid2_bias = self.model.add_parameters((self.hidden2_units))
            self.rout_layer = self.model.add_parameters(
                (len(self.irels),
                 self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
            self.rout_bias = self.model.add_parameters((len(self.irels)))