def word_assoc_score(self, source_idx, target_idx, relation):
    """
    NOTE THAT DROPOUT IS BEING APPLIED HERE
    :param source_idx: embedding index of source atom
    :param target_idx: embedding index of target atom
    :param relation: relation type
    :return: score
    """
    # prepare
    s = self.embeddings[source_idx]
    if self.no_assoc:
        A = dy.const_parameter(self.word_assoc_weights[relation])
    else:
        A = dy.parameter(self.word_assoc_weights[relation])
    A = dy.dropout(A, self.dropout)  # dy.dropout is not in-place; keep the result
    t = self.embeddings[target_idx]

    # compute
    if self.mode == BILINEAR_MODE:
        return dy.transpose(s) * A * t
    elif self.mode == DIAG_RANK1_MODE:
        diag_A = dyagonalize(A[0])
        rank1_BC = A[1] * dy.transpose(A[2])
        ABC = diag_A + rank1_BC
        return dy.transpose(s) * ABC * t
    elif self.mode == TRANSLATIONAL_EMBED_MODE:
        return -dy.l2_norm(s - t + A)
    elif self.mode == DISTMULT:
        return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))
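# Hedged usage sketch of the BILINEAR_MODE branch above, with toy tensors in
# place of the model's embedding lookups (all values here are made up). It
# also shows why the dropout result must be reassigned: dy.dropout returns a
# new expression rather than modifying A in place.
import dynet as dy

dy.renew_cg()
s = dy.inputVector([1.0, 0.0])
t = dy.inputVector([0.5, 0.5])
A = dy.inputTensor([[1.0, 2.0], [3.0, 4.0]])
A = dy.dropout(A, 0.1)            # must reassign; dropout is not in-place
score = dy.transpose(s) * A * t   # bilinear score s^T A t, a scalar
print(score.value())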
def predict_chunks_by_tokens(self, w_t, chunk_batch):
    ender = [self.lattice_vocab.chunk_end.i] * self.BATCH_SIZE
    lps = []
    state = self.lattice_rnn.initial_state(dropout=self.DROPOUT)
    cs = [[self.lattice_vocab.chunk_start.i] * self.BATCH_SIZE] + chunk_batch
    cum_lp = dynet.scalarInput(0.0, device=self.args.param_device)
    for i, (cc, nc) in enumerate(zip(cs, cs[1:])):
        if self.args.concat_context_vector:
            x_t = dynet.pick_batch(self.vocab_R, cc)
            state.add_input(x_t)
        else:
            if i == 0:
                state.add_input(self.project_main_to_lattice_init_R * w_t)
            else:
                x_t = dynet.pick_batch(self.vocab_R, cc)
                state.add_input(x_t)
        y_t = state.output()
        y_t = dynet.to_device(y_t, self.args.param_device)
        if self.DROPOUT:
            y_t = dynet.cmult(y_t, self.dropout_mask_lattice_y_t)
        if self.args.concat_context_vector:
            y_t = dynet.concatenate([y_t, w_t])
        r_t = dynet.affine_transform([
            self.vocab_bias, self.vocab_R,
            dynet.tanh(
                dynet.affine_transform([self.lattice_bias, self.lattice_R, y_t]))
        ])
        if i > 0:
            # log-prob of ending the chunk at this position
            lps.append(cum_lp - dynet.pickneglogsoftmax_batch(r_t, ender))
        # log-prob of continuing with the next token
        cum_lp = cum_lp - dynet.pickneglogsoftmax_batch(r_t, nc)
    lps.append(cum_lp)
    return lps
def calc_loss(self, src, trg, loss_calculator):
    self.start_sent(src)
    initial_states = self._encode_src(src)
    # Calculate losses from multiple initial states
    losses = []
    for initial_state in initial_states:
        model_loss = FactoredLossExpr()
        model_loss.add_factored_loss_expr(
            loss_calculator.calc_loss(self, initial_state, src, trg))
        if self.global_fertility != 0:
            masked_attn = self.attender.attention_vecs
            if trg.mask is not None:
                trg_mask = 1 - trg.mask.np_arr.transpose()
                masked_attn = [
                    dy.cmult(attn, dy.inputTensor(mask, batched=True))
                    for attn, mask in zip(masked_attn, trg_mask)
                ]
            model_loss.add_loss("fertility", self._global_fertility(masked_attn))
        losses.append(model_loss)
    total_loss = FactoredLossExpr()
    for loss in losses:
        total_loss.add_factored_loss_expr(loss)
    self.losses = losses
    return total_loss
def aggregate_masked_loss(x: Tensor, mask: 'xnmt.batchers.Mask' = None) -> Tensor:
    """
    Aggregate loss values for unmasked entries.

    Args:
      x: Batched sequence of losses.
      mask: An optional mask for the case of outputs of unequal lengths.

    Returns:
      Batched sequence of losses, with masked ones zeroed out.
    """
    if xnmt.backend_dynet:
        if mask:
            x = dy.cmult(x, dy.inputTensor(1.0 - mask.np_arr.T, batched=True))
        return dy.sum_elems(x)
    else:
        if mask:
            x = torch.mul(
                x,
                torch.as_tensor(1.0 - mask.np_arr, dtype=x.dtype, device=xnmt.device))
        # sum over all dims except the batch dim
        return torch.sum(x, dim=tuple(range(1, len(x.size()))))
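# Minimal toy check of the DyNet branch above (values assumed): positions
# flagged 1 in the mask are zeroed out before the losses are summed.
import numpy as np
import dynet as dy

dy.renew_cg()
losses = dy.inputVector([0.7, 1.2, 0.4])   # per-position losses
mask = np.array([0.0, 0.0, 1.0])           # 1 marks a padded position
kept = dy.cmult(losses, dy.inputTensor(1.0 - mask))
print(dy.sum_elems(kept).value())          # 1.9 = 0.7 + 1.2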
def word_repr(self, char_seq, cembs):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])

    chars = dy.concatenate(cembs)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                             self.param_exprs['rgb%d' % wlen])
    word = dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) +
                   self.param_exprs['cb%d' % wlen])
    if self.known_words is not None and tuple(char_seq) in self.known_words:
        return (word + dy.lookup(self.params['word_embed'],
                                 self.known_words[tuple(char_seq)])) / 2.
    return word
def _fast_sample(self, prob, temperature=1):
    # Perturb the distribution with scaled Bernoulli noise, then take the argmax.
    temperature = temperature / 2
    bern = dy.random_bernoulli(256, 0.5, scale=temperature) + (1.0 - temperature)
    prob = dy.cmult(prob, bern)
    return prob.npvalue().argmax()
def reparameterize(self, mu, logvar):
    if self.training:
        # Sample z = mu + sigma * eps with eps ~ N(0, 1), keeping gradients.
        std = dy.exp(logvar * 0.5)
        eps = dy.random_normal(dim=std.dim()[0], mean=0.0, stddev=1.0)
        return dy.cmult(eps, std) + mu
    else:
        return mu
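# Hedged standalone sketch of the same reparameterization trick (toy values):
# sample z ~ N(mu, sigma^2) while keeping gradients w.r.t. mu and logvar.
import dynet as dy

dy.renew_cg()
mu = dy.inputVector([0.0, 1.0])
logvar = dy.inputVector([0.0, 0.0])   # sigma = exp(0) = 1
std = dy.exp(logvar * 0.5)
eps = dy.random_normal(2)             # dim matches mu
z = dy.cmult(eps, std) + mu
print(z.npvalue())                    # a sample around [0, 1]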
def calc_loss(self, src, trg, loss_calculator):
    self.start_sent(src)
    embeddings = self.src_embedder.embed_sent(src)
    encodings = self.encoder(embeddings)
    self.attender.init_sent(encodings)
    # Initialize the hidden state from the encoder
    ss = mark_as_batch([Vocab.SS] * len(src)) if is_batched(src) else Vocab.SS
    initial_state = self.decoder.initial_state(self.encoder.get_final_states(),
                                               self.trg_embedder.embed(ss))
    # Compose losses
    model_loss = LossBuilder()
    model_loss.add_loss("mle", loss_calculator(self, initial_state, src, trg))

    if self.calc_global_fertility or self.calc_attention_entropy:
        # philip30: I assume attention_vecs is already masked on the source side.
        # Now applying the mask to the target.
        masked_attn = self.attender.attention_vecs
        if trg.mask is not None:
            trg_mask = trg.mask.get_active_one_mask().transpose()
            masked_attn = [dy.cmult(attn, dy.inputTensor(mask, batched=True))
                           for attn, mask in zip(masked_attn, trg_mask)]

        if self.calc_global_fertility:
            model_loss.add_loss("fertility", self.global_fertility(masked_attn))
        if self.calc_attention_entropy:
            model_loss.add_loss("H(attn)", self.attention_entropy(masked_attn))

    return model_loss
def embed(self, x):
    if self.train and self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = len(x) if xnmt.batcher.is_batched(x) else 1
        # Sample, per batch element, the set of word ids to drop.
        self.word_id_mask = [
            set(np.random.choice(self.vocab_size,
                                 int(self.vocab_size * self.word_dropout),
                                 replace=False))
            for _ in range(batch_size)
        ]
    # single mode
    if not xnmt.batcher.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = self.embeddings[x]
            if self.fix_norm is not None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = self.embeddings.batch(x)
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(
                x[i] in self.word_id_mask[i] for i in range(len(x))):
            dropout_mask = dy.inputTensor(
                np.transpose([[0.0] * self.emb_dim if x[i] in self.word_id_mask[i]
                              else [1.0] * self.emb_dim for i in range(len(x))]),
                batched=True)
            ret = dy.cmult(ret, dropout_mask)
    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def sample_one(self,
               translator: 'xnmt.models.translators.AutoRegressiveTranslator',
               initial_state: decoders.AutoRegressiveDecoderState,
               forced_trg_ids: Optional[Sequence[numbers.Integral]] = None) -> SearchOutput:
    # Search variables
    current_words = None
    current_state = initial_state
    done = None
    # Outputs
    logsofts = []
    samples = []
    states = []
    attentions = []
    masks = []
    # Sample to the max length
    for length in range(self.max_len):
        translator_output = translator.generate_one_step(current_words, current_state)
        if forced_trg_ids is None:
            sample = translator_output.logsoftmax.tensor_value().categorical_sample_log_prob().as_numpy()
            if len(sample.shape) == 2:
                sample = sample[0]
        else:
            sample = [forced_trg[length] if forced_trg.sent_len() > length else Vocab.ES
                      for forced_trg in forced_trg_ids]
        logsoft = dy.pick_batch(translator_output.logsoftmax, sample)
        if done is not None:
            sample = [sample[i] if not done[i] else Vocab.ES for i in range(len(done))]
            # masking for logsoftmax
            mask = [1 if not done[i] else 0 for i in range(len(done))]
            logsoft = dy.cmult(logsoft, dy.inputTensor(mask, batched=True))
            masks.append(mask)
        # Appending output
        logsofts.append(logsoft)
        samples.append(sample)
        states.append(translator.get_nobp_state(translator_output.state))
        attentions.append(translator_output.attention)
        # Next time step
        current_words = sample
        current_state = translator_output.state
        # Check if we are done.
        done = [x == Vocab.ES for x in sample]
        if all(done):
            break
    # Packing output
    scores = dy.esum(logsofts).npvalue()
    masks.insert(0, [1 for _ in range(len(done))])
    samples = np.stack(samples, axis=1)
    return SearchOutput(samples, attentions, scores, logsofts, states, masks)
def generate_output(self, translator, initial_state, src_length=None, forced_trg_ids=None):
    # Output variables
    score = []
    word_ids = []
    attentions = []
    logsoftmaxes = []
    states = []
    masks = []
    # Search variables
    done = None
    current_state = initial_state
    for length in range(self.max_len):
        prev_word = word_ids[length - 1] if length > 0 else None
        current_output = translator.generate_one_step(prev_word, current_state)
        current_state = current_output.state
        if forced_trg_ids is None:
            word_id = np.argmax(current_output.logsoftmax.npvalue(), axis=0)
            if len(word_id.shape) == 2:
                word_id = word_id[0]
        else:
            if xnmt.batcher.is_batched(forced_trg_ids):
                word_id = [forced_trg_ids[i][length] for i in range(len(forced_trg_ids))]
            else:
                word_id = [forced_trg_ids[length]]
        logsoft = dy.pick_batch(current_output.logsoftmax, word_id)
        if done is not None:
            word_id = [word_id[i] if not done[i] else Vocab.ES for i in range(len(done))]
            # masking for logsoftmax
            mask = [1 if not done[i] else 0 for i in range(len(done))]
            logsoft = dy.cmult(logsoft, dy.inputTensor(mask, batched=True))
            masks.append(mask)
        # Packing outputs
        score.append(logsoft.npvalue())
        word_ids.append(word_id)
        attentions.append(current_output.attention)
        logsoftmaxes.append(dy.pick_batch(current_output.logsoftmax, word_id))
        states.append(translator.get_nobp_state(current_state))
        # Check if we are done.
        done = [x == Vocab.ES for x in word_id]
        if all(done):
            break
    masks.insert(0, [1 for _ in range(len(done))])
    words = np.stack(word_ids, axis=1)
    score = np.sum(score, axis=0)
    return [SearchOutput(words, attentions, score, logsoftmaxes, states, masks)]
def reparameterize(mu, logvar):
    # Get z by reparameterization: z = mu + sigma * eps, eps ~ N(0, 1).
    d = mu.dim()[0][0]
    eps = dy.random_normal(d)
    std = dy.exp(logvar * 0.5)
    return mu + dy.cmult(std, eps)
def calc_nll(self, src: Union[batchers.Batch, sent.Sentence],
             trg: Union[batchers.Batch, sent.Sentence]):
    batch_size, encodings, outputs, seq_len = self._encode_src(src)
    if trg.sent_len() != seq_len:
        if self.auto_cut_pad:
            trg = self._cut_or_pad_targets(seq_len, trg)
        else:
            raise ValueError(
                f"src/trg length do not match: {seq_len} != {trg.sent_len()}")
    ref_action = np.asarray([trg_sent.words for trg_sent in trg]).reshape(
        (seq_len * batch_size,))
    loss_expr_perstep = self.scorer.calc_loss(outputs, batchers.mark_as_batch(ref_action))
    # loss_expr_perstep = dy.pickneglogsoftmax_batch(outputs, ref_action)
    loss_expr_perstep = dy.reshape(loss_expr_perstep, (seq_len,), batch_size=batch_size)
    if trg.mask:
        loss_expr_perstep = dy.cmult(
            loss_expr_perstep,
            dy.inputTensor(1.0 - trg.mask.np_arr.T, batched=True))
    loss_expr = dy.sum_elems(loss_expr_perstep)
    units = [t.len_unpadded() for t in trg]
    return LossExpr(loss_expr, units)
def calc_loss(self, src, trg, loss_calculator):
    """
    :param src: source sequence (unbatched, or batched + padded)
    :param trg: target sequence (unbatched, or batched + padded); losses will be
                accumulated only if trg_mask[batch,pos]==0, or no mask is set
    :param loss_calculator:
    :returns: (possibly batched) loss expression
    """
    self.start_sent(src)
    embeddings = self.src_embedder.embed_sent(src)
    encodings = self.encoder(embeddings)
    self.attender.init_sent(encodings)
    # Initialize the hidden state from the encoder
    ss = mark_as_batch([Vocab.SS] * len(src)) if is_batched(src) else Vocab.SS
    dec_state = self.decoder.initial_state(self.encoder.get_final_states(),
                                           self.trg_embedder.embed(ss))
    # Compose losses
    model_loss = LossBuilder()
    model_loss.add_loss("mle", loss_calculator(self, dec_state, src, trg))

    if self.calc_global_fertility or self.calc_attention_entropy:
        # philip30: I assume attention_vecs is already masked on the source side.
        # Now applying the mask to the target.
        masked_attn = self.attender.attention_vecs
        if trg.mask is not None:
            trg_mask = trg.mask.get_active_one_mask().transpose()
            masked_attn = [dy.cmult(attn, dy.inputTensor(mask, batched=True))
                           for attn, mask in zip(masked_attn, trg_mask)]

        if self.calc_global_fertility:
            model_loss.add_loss("fertility", self.global_fertility(masked_attn))
        if self.calc_attention_entropy:
            model_loss.add_loss("H(attn)", self.attention_entropy(masked_attn))

    return model_loss
def _attend(self, query, mask=None):
    query = unsqueeze(query, 0)  # ((1, H), B)
    # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    attn_scores = dy.transpose(query * self.context)
    if mask is not None:
        # Keep valid positions; push padded ones to -1e9 before the softmax.
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)
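# Toy illustration (values assumed) of the masking idiom above: valid
# positions keep their scores while padded ones are pushed to -1e9, so the
# softmax gives them ~0 weight.
import dynet as dy

dy.renew_cg()
scores = dy.inputVector([2.0, 1.0, 0.5])
keep = dy.inputVector([1.0, 1.0, 0.0])   # like mask[0]: 1 for valid positions
pad = dy.inputVector([0.0, 0.0, 1.0])    # like mask[1]: 1 for padded positions
masked = dy.cmult(scores, keep) + pad * dy.scalarInput(-1e9)
print(dy.softmax(masked).npvalue())      # last weight is ~0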
def attention_entropy(self, a):
    entropy = []
    for a_i in a:
        a_i += EPSILON  # avoid log(0)
        entropy.append(dy.cmult(a_i, dy.log(a_i)))
    return -dy.sum_elems(dy.esum(entropy))
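# Quick sanity check (toy values): the entropy of a uniform 4-way attention
# distribution should come out to log(4) ~= 1.386.
import dynet as dy

EPSILON = 1e-10
dy.renew_cg()
a_i = dy.inputVector([0.25, 0.25, 0.25, 0.25]) + EPSILON
entropy = -dy.sum_elems(dy.cmult(a_i, dy.log(a_i)))
print(entropy.value())   # ~1.386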
def _upsample(self, mgc, start, stop):
    mgc_index = start // len(self.upsample_w_s)
    ups_index = start % len(self.upsample_w_s)
    upsampled = []
    mgc_vect = dy.inputVector(mgc[mgc_index])
    for x in range(stop - start):
        sigm = dy.logistic(self.upsample_w_s[ups_index].expr(update=True) * mgc_vect +
                           self.upsample_b_s[ups_index].expr(update=True))
        tnh = dy.tanh(self.upsample_w_t[ups_index].expr(update=True) * mgc_vect +
                      self.upsample_b_t[ups_index].expr(update=True))
        r = dy.cmult(sigm, tnh)
        upsampled.append(r)
        ups_index += 1
        if ups_index == len(self.upsample_w_s):
            ups_index = 0
            mgc_index += 1
            if mgc_index == len(mgc):
                # The last frame is sometimes not processed, but it should have
                # similar parameters, so reuse the previous frame.
                mgc_index -= 1
            else:
                mgc_vect = dy.inputVector(mgc[mgc_index])
    return upsampled
def cross_entropy_loss(y, yhat):
    """
    Compute the cross-entropy loss, summed over the current minibatch.

    Args:
      y: one-hot expression of shape (n_samples, n_classes).
      yhat: expression of shape (n_samples, n_classes); each row encodes a
            probability distribution and should sum to 1.
    Returns:
      out: scalar expression holding the summed cross-entropy.
    """
    ### YOUR CODE HERE
    out = dy.sum_elems(-dy.cmult(y, dy.log(yhat)))
    ### END YOUR CODE
    return out
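# Toy check (values assumed): with a one-hot target, the sum reduces to
# -log of the probability assigned to the true class, here -log(0.7) ~= 0.357.
import dynet as dy

dy.renew_cg()
y = dy.inputVector([0.0, 1.0, 0.0])      # one-hot target
yhat = dy.inputVector([0.2, 0.7, 0.1])   # predicted distribution
print(cross_entropy_loss(y, yhat).value())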
def cross_entropy_structbag(self, P, Q):
    """
    P (K x m) represents a distribution over STRUCTURED labels where each label
    is a BAG of K INDEPENDENT symbols taking values in {1 ... m}. That is,
    z = (z1 ... zK) is assigned probability P1(z1) * ... * PK(zK). (Similarly
    for Q.) By independence, H(P, Q) = sum_k H(Pk, Qk).
    """
    return -dy.sum_dim(dy.cmult(P, self.log2(Q)), [0, 1])
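# Hedged toy check, assuming self.log2(x) = log(x) / log(2): cross-entropy of
# two bags of K=2 independent 3-way distributions, summed over both symbols
# as in cross_entropy_structbag above.
import math
import dynet as dy

dy.renew_cg()
P = dy.inputTensor([[0.5, 0.3, 0.2], [0.1, 0.6, 0.3]])   # K x m rows P1, P2
Q = dy.inputTensor([[0.4, 0.4, 0.2], [0.2, 0.5, 0.3]])
log2_Q = dy.log(Q) * (1.0 / math.log(2.0))
H = -dy.sum_dim(dy.cmult(P, log2_Q), [0, 1])
print(H.value())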
def attend_vector(encoder_outputs, state_factor_vector):
    encoder_output_length = state_factor_vector.npvalue().shape[0]
    hidden_size = encoder_outputs[0].npvalue().shape[0]
    # Weight each encoder output by its (broadcast) attention factor, then sum.
    factor_products = [
        dy.cmult(dy.concatenate([state_factor_vector[l]] * hidden_size),
                 encoder_outputs[l])
        for l in range(encoder_output_length)
    ]
    return dy.esum(factor_products)
def embed_sentence(self, ws, pwords, ts, chars, is_train):
    cembed = [dy.lookup_batch(self.clookup, c) for c in chars]
    char_fwd = self.char_lstm.builder_layers[0][0].initial_state().transduce(cembed)[-1]
    char_bckd = self.char_lstm.builder_layers[0][1].initial_state().transduce(reversed(cembed))[-1]
    crnn = dy.reshape(dy.concatenate_cols([char_fwd, char_bckd]),
                      (self.options.we, ws.shape[0] * ws.shape[1]))
    cnn_reps = [list() for _ in range(len(ws))]
    for i in range(ws.shape[0]):
        cnn_reps[i] = dy.pick_batch(
            crnn, [i * ws.shape[1] + j for j in range(ws.shape[1])], 1)
    wembed = [dy.lookup_batch(self.wlookup, ws[i]) +
              dy.lookup_batch(self.elookup, pwords[i]) +
              cnn_reps[i] for i in range(len(ws))]
    posembed = [dy.lookup_batch(self.tlookup, ts[i]) for i in range(len(ts))]
    if (not is_train) or self.options.dropout == 0:
        return [dy.concatenate([wembed[i], posembed[i]]) for i in range(len(ts))]
    else:
        emb_masks = self.generate_emb_mask(ws.shape[0], ws.shape[1])
        return [dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
                for w, pos, (wm, posm) in zip(wembed, posembed, emb_masks)]
def _encodings_to_label_log_probabilities(self, encodings, lmbd=None):
    label_scores = self.f_label(dy.concatenate_to_batch(encodings))
    label_scores_reshaped = dy.reshape(label_scores,
                                       (self.label_vocab.size, len(encodings)))
    if lmbd is not None:
        label_scores_reshaped = dy.cmult(label_scores_reshaped, lmbd)
    return dy.log_softmax(label_scores_reshaped)
def transduce(self, inputs, train):
    xs = inputs[:self.max_length]
    if not xs:
        return []
    for i in range(self.lstm_layers):
        for n, d in ("f", 1), ("b", -1):
            Wr, br, Wh = [self.params["%s%d%s" % (p, i, n)] for p in ("Wr", "br", "Wh")]
            hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
            hs = [hs_[0]]
            for t in range(1, len(hs_)):
                r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                hs.append(dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
            xs = hs
    if train:
        x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
        xs = [dy.pick(x, i, 1) for i in range(len(xs))]
    return xs
def run_lstm(self, word_inputs, tag_inputs, isTrain=True):
    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    word_embs = [
        dy.lookup_batch(self.word_embs,
                        np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) +
        dy.lookup_batch(self.pret_word_embs, w, update=False)
        for w in word_inputs
    ]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]
    if isTrain:
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
                      for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)]
    else:
        emb_inputs = [dy.concatenate([w, pos])
                      for w, pos in zip(word_embs, tag_embs)]
    common_top_input, c_fs, c_bs = biLSTM(
        self.cLSTM_builders, emb_inputs, batch_size,
        self.dropout_clstm_input if isTrain else 0.,
        self.dropout_clstm_hidden if isTrain else 0.)
    common_top_recur = dy.concatenate_cols(common_top_input)
    private_top_input, p_fs, p_bs = biLSTM(
        self.pLSTM_builders, emb_inputs, batch_size,
        self.dropout_plstm_input if isTrain else 0.,
        self.dropout_plstm_hidden if isTrain else 0.)
    private_top_recur = dy.concatenate_cols(private_top_input)
    if isTrain:
        common_top_recur = dy.dropout_dim(common_top_recur, 1, self.dropout_mlp)
        private_top_recur = dy.dropout_dim(private_top_recur, 1, self.dropout_mlp)
    return common_top_recur, private_top_recur, p_fs, p_bs
def transduce(self, expr_seq: expression_seqs.ExpressionSequence) -> expression_seqs.ExpressionSequence:
    if expr_seq.dim()[1] > 1:
        raise ValueError(
            f"LatticeLSTMTransducer requires batch size 1, got {expr_seq.dim()[1]}")
    lattice = self.cur_src[0]
    Wx_iog = dy.parameter(self.p_Wx_iog)
    Wh_iog = dy.parameter(self.p_Wh_iog)
    b_iog = dy.parameter(self.p_b_iog)
    Wx_f = dy.parameter(self.p_Wx_f)
    Wh_f = dy.parameter(self.p_Wh_f)
    b_f = dy.parameter(self.p_b_f)
    h = []
    c = []

    batch_size = expr_seq.dim()[1]
    if self.dropout_rate > 0.0 and self.train:
        self.set_dropout_masks(batch_size=batch_size)

    for node_i in range(lattice.sent_len()):
        cur_node = lattice.nodes[node_i]
        val = expr_seq[node_i]
        if self.dropout_rate > 0.0 and self.train:
            val = dy.cmult(val, self.dropout_mask_x)
        i_ft_list = []
        if len(cur_node.nodes_prev) == 0:
            tmp_iog = dy.affine_transform([b_iog, Wx_iog, val])
        else:
            h_tilde = sum(h[pred] for pred in cur_node.nodes_prev)
            tmp_iog = dy.affine_transform([b_iog, Wx_iog, val, Wh_iog, h_tilde])
            # one forget gate per predecessor node
            for pred in cur_node.nodes_prev:
                i_ft_list.append(
                    dy.logistic(dy.affine_transform([b_f, Wx_f, val, Wh_f, h[pred]])))
        i_ait = dy.pick_range(tmp_iog, 0, self.hidden_dim)
        i_aot = dy.pick_range(tmp_iog, self.hidden_dim, self.hidden_dim * 2)
        i_agt = dy.pick_range(tmp_iog, self.hidden_dim * 2, self.hidden_dim * 3)

        i_it = dy.logistic(i_ait)
        i_ot = dy.logistic(i_aot)
        i_gt = dy.tanh(i_agt)
        if len(cur_node.nodes_prev) == 0:
            c.append(dy.cmult(i_it, i_gt))
        else:
            fc = dy.cmult(i_ft_list[0], c[cur_node.nodes_prev[0]])
            for i in range(1, len(cur_node.nodes_prev)):
                fc += dy.cmult(i_ft_list[i], c[cur_node.nodes_prev[i]])
            c.append(fc + dy.cmult(i_it, i_gt))
        h_t = dy.cmult(i_ot, dy.tanh(c[-1]))
        if self.dropout_rate > 0.0 and self.train:
            h_t = dy.cmult(h_t, self.dropout_mask_h)
        h.append(h_t)
    self._final_states = [transducers.FinalTransducerState(h[-1], c[-1])]
    return expression_seqs.ExpressionSequence(expr_list=h)
def word_repr(self, char_seq):
    # obtain the word representation when given its character sequence
    wlen = len(char_seq)
    if 'rgW%d' % wlen not in self.param_exprs:
        self.param_exprs['rgW%d' % wlen] = dy.parameter(self.params['reset_gate_W'][wlen - 1])
        self.param_exprs['rgb%d' % wlen] = dy.parameter(self.params['reset_gate_b'][wlen - 1])
        self.param_exprs['cW%d' % wlen] = dy.parameter(self.params['com_W'][wlen - 1])
        self.param_exprs['cb%d' % wlen] = dy.parameter(self.params['com_b'][wlen - 1])
        self.param_exprs['ugW%d' % wlen] = dy.parameter(self.params['update_gate_W'][wlen - 1])
        self.param_exprs['ugb%d' % wlen] = dy.parameter(self.params['update_gate_b'][wlen - 1])

    chars = dy.concatenate(char_seq)
    reset_gate = dy.logistic(self.param_exprs['rgW%d' % wlen] * chars +
                             self.param_exprs['rgb%d' % wlen])
    comb = dy.concatenate([
        dy.tanh(self.param_exprs['cW%d' % wlen] * dy.cmult(reset_gate, chars) +
                self.param_exprs['cb%d' % wlen]),
        chars
    ])
    update_logits = (self.param_exprs['ugW%d' % wlen] * comb +
                     self.param_exprs['ugb%d' % wlen])
    update_gate = dy.transpose(dy.concatenate_cols([
        dy.softmax(dy.pickrange(update_logits, i * (wlen + 1), (i + 1) * (wlen + 1)))
        for i in range(self.options['ndims'])
    ]))
    # The following implementation of the softmax function is not safe, but faster:
    # exp_update_logits = dy.exp(dy.reshape(update_logits, (self.options['ndims'], wlen + 1)))
    # update_gate = dy.cdiv(exp_update_logits,
    #                       dy.concatenate_cols([dy.sum_cols(exp_update_logits)] * (wlen + 1)))
    # assert (not np.isnan(update_gate.npvalue()).any())

    word = dy.sum_cols(dy.cmult(update_gate,
                                dy.reshape(comb, (self.options['ndims'], wlen + 1))))
    return word
def cosine_proximity(self, pred, gold):
    def l2_normalize(x):
        norm = dynet.sqrt(dynet.bmax(dynet.sum_elems(dynet.square(x)),
                                     np.finfo(float).eps * dynet.ones((1))[0]))
        return dynet.cdiv(x, norm)

    y_pred = l2_normalize(pred)
    y_true = l2_normalize(gold)
    # negative cosine similarity: minimizing this maximizes the cosine
    return -dynet.sum_elems(dynet.cmult(y_true, y_pred))
def step(self, x, hx, cx):
    if not self.test:
        # Apply the fixed per-sequence (variational) dropout masks.
        if self.dropout_x > 0:
            x = dy.cmult(self.dropout_mask_x, x)
        if self.dropout_h > 0:
            hx = dy.cmult(self.dropout_mask_h, hx)
    gates = dy.affine_transform([self.bias, self.weight_ih, x, self.weight_hh, hx])
    i = dy.pickrange(gates, 0, self.n_hidden)
    f = dy.pickrange(gates, self.n_hidden, self.n_hidden * 2)
    g = dy.pickrange(gates, self.n_hidden * 2, self.n_hidden * 3)
    o = dy.pickrange(gates, self.n_hidden * 3, self.n_hidden * 4)
    i, f, g, o = dy.logistic(i), dy.logistic(f), dy.tanh(g), dy.logistic(o)
    cy = dy.cmult(f, cx) + dy.cmult(i, g)
    hy = dy.cmult(o, dy.tanh(cy))
    return hy, cy
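# Sketch (sizes assumed) of the kind of fixed per-sequence dropout masks the
# step above expects: sampled once per sequence with inverted-dropout scaling,
# then reused at every timestep.
import dynet as dy

dy.renew_cg()
p = 0.3
n_input, n_hidden = 4, 5
dropout_mask_x = dy.random_bernoulli(n_input, 1 - p, scale=1 / (1 - p))
dropout_mask_h = dy.random_bernoulli(n_hidden, 1 - p, scale=1 / (1 - p))
x = dy.cmult(dropout_mask_x, dy.inputVector([0.1] * n_input))
print(x.npvalue())   # surviving entries are scaled by 1 / (1 - p)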
def __cosine_loss(self, pred, gold):
    sn1 = dy.l2_norm(pred)
    sn2 = dy.l2_norm(gold)
    mult = dy.cmult(sn1, sn2)
    dot = dy.dot_product(pred, gold)
    div = dy.cdiv(dot, mult)          # cosine similarity
    vec_y = dy.scalarInput(2)
    res = dy.cdiv(1 - div, vec_y)     # cosine distance scaled to [0, 1]
    return res
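# Toy check (values assumed): parallel vectors have cosine similarity 1, so
# the (1 - cos) / 2 distance computed above is 0.
import dynet as dy

dy.renew_cg()
pred = dy.inputVector([1.0, 2.0])
gold = dy.inputVector([2.0, 4.0])
cos = dy.cdiv(dy.dot_product(pred, gold),
              dy.cmult(dy.l2_norm(pred), dy.l2_norm(gold)))
print(dy.cdiv(1 - cos, dy.scalarInput(2)).value())   # ~0.0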
def __call__(self, *args):
    U = [dy.parameter(U_) for U_ in self.U]
    out = U[0] * args[0]
    for x, u in zip(args[1:], U[1:]):
        out = dy.cmult(out, u * x)
    out = dy.sum_cols(dy.transpose(out))
    return out
def gate_vecs(self, ht1, xt):
    b = self.expressions["b"]
    W = self.expressions["W"]
    gate_vecs = {}
    for g, activation in zip(self.gate_names, self.gate_activations):
        # The candidate gate ("htilde") uses the reset-gated hidden state.
        hin = ht1 if not g == "htilde" else dy.cmult(gate_vecs["r"], ht1)
        gate_vecs[g] = activation(
            dy.affine_transform([b[g], W["x"][g], xt, W["h"][g], hin]))
    return gate_vecs
def attend(encoder_outputs, state_factor_matrix):
    encoder_output_length = state_factor_matrix.npvalue().shape[0]
    hidden_size = encoder_outputs[0].npvalue().shape[0]
    # Weight each encoder output by its (broadcast) attention factor, then sum.
    factor_products = dy.esum([
        dy.cmult(encoder_outputs[l],
                 dy.concatenate([state_factor_matrix[l]] * hidden_size))
        for l in range(encoder_output_length)
    ])
    return factor_products
def _attend(self, query, mask=None):
    # query ((H), B)
    # mask  ((T, 1), B)
    projected_state = self.decoder * query  # ((H,), B)
    non_lin = dy.tanh(dy.colwise_add(self.context_proj, projected_state))  # ((H, T), B)
    # ((1, H), B) * ((H, T), B) -> ((1, T), B) -> ((T, 1), B)
    attn_scores = dy.transpose(self.v * non_lin)
    if mask is not None:
        attn_scores = dy.cmult(attn_scores, mask[0]) + (mask[1] * dy.scalarInput(-1e9))
    return dy.softmax(attn_scores)  # ((T, 1), B)
def dyagonalize(col):
    """
    A convoluted way to make a dynet vector into a dynet matrix where it's the
    diagonal. God I hope there's a better way.
    :param col: column vector in dynet format
    """
    col_dim = col.dim()[0][0]
    nump_eye = np.eye(col_dim)
    return dy.cmult(col, dy.inputTensor(nump_eye))
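# Quick check of dyagonalize (toy vector): broadcasting the column against an
# identity matrix keeps exactly the diagonal entries.
import numpy as np
import dynet as dy

dy.renew_cg()
col = dy.inputVector([1.0, 2.0, 3.0])
print(dyagonalize(col).npvalue())   # diag(1, 2, 3)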
def mask_embeddings(self, embeddings, mask):
    """
    Convert the embeddings of masked input positions to zero vectors.
    """
    (embed_dim, _), _ = embeddings.dim()
    temp_mask = np.repeat(1. - mask[:, None, :], embed_dim, axis=1)
    temp_mask = dy.inputTensor(np.moveaxis(temp_mask, [1, 0, 2], [0, 2, 1]), batched=True)
    embeddings = dy.cmult(embeddings, temp_mask)
    return embeddings
def dot_product_attention(query, key, value, mask=None, dropout=None):
    """Input Shape: ((D, T, H), B)"""
    scores = batch_matmul(transpose(key, 0, 1), query)
    if mask is not None:
        scores = dy.cmult(scores, mask[0]) + (mask[1] * -1e9)
    weights = folded_softmax(scores)
    if dropout is not None:
        weights = dy.dropout(weights, dropout)
    return batch_matmul(value, weights)
def expr_for_tree(self, tree):
    if tree.isleaf():
        return self.E[self.w2i.get(tree.label, 0)]
    if len(tree.children) == 1:
        # unary node: input/output/update gates, no forget gate
        assert tree.children[0].isleaf()
        emb = self.expr_for_tree(tree.children[0])
        Wi, Wo, Wu = [dy.parameter(w) for w in self.WS]
        bi, bo, bu, _ = [dy.parameter(b) for b in self.BS]
        i = dy.logistic(Wi * emb + bi)
        o = dy.logistic(Wo * emb + bo)
        u = dy.tanh(Wu * emb + bu)
        c = dy.cmult(i, u)
        return dy.cmult(o, dy.tanh(c))
    # binary node: one forget gate per child
    assert len(tree.children) == 2, tree.children[0]
    e1 = self.expr_for_tree(tree.children[0])
    e2 = self.expr_for_tree(tree.children[1])
    Ui, Uo, Uu = [dy.parameter(u) for u in self.US]
    Uf1, Uf2 = [dy.parameter(u) for u in self.UFS]
    bi, bo, bu, bf = [dy.parameter(b) for b in self.BS]
    e = dy.concatenate([e1, e2])
    i = dy.logistic(Ui * e + bi)
    o = dy.logistic(Uo * e + bo)
    f1 = dy.logistic(Uf1 * e1 + bf)
    f2 = dy.logistic(Uf2 * e2 + bf)
    u = dy.tanh(Uu * e + bu)
    c = dy.cmult(i, u) + dy.cmult(f1, e1) + dy.cmult(f2, e2)
    h = dy.cmult(o, dy.tanh(c))
    return h
def learn(self, batch_size):
    if self.prioritized:
        if not self.memory.is_full():
            return -np.inf
        indices, exps, weights = self.memory.sample(batch_size, self.beta)
    else:
        exps = self.memory.sample(batch_size)
    obss, actions, rewards, obs_nexts, dones = self._process(exps)

    # Compute bootstrapped target values with the (target) network.
    dy.renew_cg()
    target_network = self.target_network if self.use_double_dqn else self.network
    if self.dueling:
        target_values, v = target_network(obs_nexts, batched=True)
        target_values = target_values.npvalue() + v.npvalue()
    else:
        target_values = target_network(obs_nexts, batched=True)
        target_values = target_values.npvalue()
    target_values = np.max(target_values, axis=0)
    target_values = rewards + self.reward_decay * (target_values * (1 - dones))

    # Compute the TD errors and the (optionally importance-weighted) loss.
    dy.renew_cg()
    if self.dueling:
        all_values_expr, v = self.network(obss, batched=True)
    else:
        all_values_expr = self.network(obss, batched=True)
    picked_values = dy.pick_batch(all_values_expr, actions)
    diff = ((picked_values + v if self.dueling else picked_values) -
            dy.inputTensor(target_values, batched=True))
    if self.prioritized:
        self.memory.update(indices, np.transpose(np.abs(diff.npvalue())))
    losses = dy.pow(diff, dy.constant(1, 2))
    if self.prioritized:
        losses = dy.cmult(losses, dy.inputTensor(weights, batched=True))
    loss = dy.sum_batches(losses)
    loss_value = loss.npvalue()
    loss.backward()
    self.trainer.update()

    self.epsilon = max(self.epsilon - self.epsilon_decrease, self.epsilon_lower)
    if self.prioritized:
        self.beta = min(self.beta + self.beta_increase, 1.)
    self.learn_step += 1
    if self.use_double_dqn and self.learn_step % self.n_replace_target == 0:
        self.target_network.update(self.network)
    return loss_value
def transitions(self):
    if self.mask is not None:
        return (dy.cmult(self.transitions_p, dy.inputTensor(self.mask)) +
                dy.inputTensor(self.inv_mask))
    return self.transitions_p
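# Hedged toy illustration of the masking above: a 0/1 mask zeroes disallowed
# transitions and inv_mask re-inserts a large negative constant for them
# (the exact inv_mask semantics are assumed here).
import numpy as np
import dynet as dy

dy.renew_cg()
transitions_p = dy.inputTensor([[0.5, 0.2], [0.1, 0.9]])
mask = np.array([[1.0, 0.0], [1.0, 1.0]])    # 0 blocks a transition
inv_mask = (1.0 - mask) * -1e4               # large negative for blocked ones
out = dy.cmult(transitions_p, dy.inputTensor(mask)) + dy.inputTensor(inv_mask)
print(out.npvalue())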
def highway(input_, train):
    for func, weight, bias in zip(funcs, weights, biases):
        proj = dy.rectify(func(input_, train))
        transform = dy.logistic(dy.affine_transform([bias, weight, input_]))
        input_ = dy.cmult(transform, proj) + dy.cmult(input_, 1 - transform)
    return input_
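# Standalone sketch of one highway step with hypothetical parameters:
# y = t * relu(W_h x + b_h) + (1 - t) * x, where t is a learned gate.
import dynet as dy

m = dy.ParameterCollection()
p_Wh, p_bh = m.add_parameters((4, 4)), m.add_parameters((4,))
p_Wt, p_bt = m.add_parameters((4, 4)), m.add_parameters((4,))

dy.renew_cg()
W_h, b_h = dy.parameter(p_Wh), dy.parameter(p_bh)
W_t, b_t = dy.parameter(p_Wt), dy.parameter(p_bt)
x = dy.inputVector([1.0, -0.5, 0.2, 0.0])
proj = dy.rectify(dy.affine_transform([b_h, W_h, x]))
gate = dy.logistic(dy.affine_transform([b_t, W_t, x]))
y = dy.cmult(gate, proj) + dy.cmult(x, 1 - gate)
print(y.npvalue())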
def __call__(self, in_expr):
    # dy.cmult is elementwise multiplication and takes two operands, so the
    # stored activation is applied to the input directly here.
    return self.act(in_expr)
def __init__(self, vocab, w2i, pos, rels, options):
    if isinstance(options, dict):
        options = _dict_to_obj(options, 'Values')
    self.model = ParameterCollection()
    random.seed(1)
    self.trainer = AdamTrainer(self.model)

    self.activations = {'tanh': tanh, 'sigmoid': logistic, 'relu': rectify,
                        'tanh3': (lambda x: tanh(cmult(cmult(x, x), x)))}
    self.activation = self.activations[options.activation]

    self.blstm_flag = options.blstmFlag
    self.labels_flag = options.labelsFlag
    self.costaug_flag = options.costaugFlag
    self.bibi_flag = options.bibiFlag

    self.ldims = options.lstm_dims
    self.wdims = options.wembedding_dims
    self.pdims = options.pembedding_dims
    self.rdims = options.rembedding_dims
    self.layers = options.lstm_layers
    self.words_count = vocab
    self.vocab = {word: ind + 3 for word, ind in list(w2i.items())}
    self.pos = {word: ind + 3 for ind, word in enumerate(pos)}
    self.rels = {word: ind for ind, word in enumerate(rels)}
    self.irels = rels

    if self.bibi_flag:
        self.builders = [LSTMBuilder(1, self.wdims + self.pdims, self.ldims, self.model),
                         LSTMBuilder(1, self.wdims + self.pdims, self.ldims, self.model)]
        self.bbuilders = [LSTMBuilder(1, self.ldims * 2, self.ldims, self.model),
                          LSTMBuilder(1, self.ldims * 2, self.ldims, self.model)]
    elif self.layers > 0:
        self.builders = [LSTMBuilder(self.layers, self.wdims + self.pdims, self.ldims, self.model),
                         LSTMBuilder(self.layers, self.wdims + self.pdims, self.ldims, self.model)]
    else:
        self.builders = [SimpleRNNBuilder(1, self.wdims + self.pdims, self.ldims, self.model),
                         SimpleRNNBuilder(1, self.wdims + self.pdims, self.ldims, self.model)]

    self.hidden_units = options.hidden_units
    self.hidden2_units = options.hidden2_units

    self.vocab['*PAD*'] = 1
    self.pos['*PAD*'] = 1
    self.vocab['*INITIAL*'] = 2
    self.pos['*INITIAL*'] = 2

    self.wlookup = self.model.add_lookup_parameters((len(vocab) + 3, self.wdims))
    self.plookup = self.model.add_lookup_parameters((len(pos) + 3, self.pdims))
    self.rlookup = self.model.add_lookup_parameters((len(rels), self.rdims))

    self.hid_layer_foh = self.model.add_parameters((self.hidden_units, self.ldims * 2))
    self.hid_layer_fom = self.model.add_parameters((self.hidden_units, self.ldims * 2))
    self.hid_bias = self.model.add_parameters((self.hidden_units))
    self.hid2_layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
    self.hid2_bias = self.model.add_parameters((self.hidden2_units))
    self.out_layer = self.model.add_parameters(
        (1, self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))

    if self.labels_flag:
        self.rhid_layer_foh = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
        self.rhid_layer_fom = self.model.add_parameters((self.hidden_units, 2 * self.ldims))
        self.rhid_bias = self.model.add_parameters((self.hidden_units))
        self.rhid2_layer = self.model.add_parameters((self.hidden2_units, self.hidden_units))
        self.rhid2_bias = self.model.add_parameters((self.hidden2_units))
        self.rout_layer = self.model.add_parameters(
            (len(self.irels), self.hidden2_units if self.hidden2_units > 0 else self.hidden_units))
        self.rout_bias = self.model.add_parameters((len(self.irels)))