def _embed_word(self, segmented_word, is_batched):
    """Embed one segmented word as a sparse bag-of-ngrams pushed through a transform.

    Ngram statistics are taken from the word form if a word vocab is configured,
    otherwise from its characters. Ngrams unknown to ``self.ngram_vocab`` are
    dropped; if nothing survives, a single UNK ngram with count 1 is used.

    Fix: the original popped unknown keys out of the dict returned by
    ``to_ngram_stats`` in place, mutating what may be shared/cached state;
    we now build a filtered copy instead.
    """
    if self.word_vocab is not None:
        ngram_stats = self.to_ngram_stats(segmented_word.word)
    elif self.char_vocab is not None:
        ngram_stats = self.to_ngram_stats(segmented_word.chars)
    else:
        raise ValueError(
            "Either word vocab or char vocab should not be None")
    # Keep only ngrams present in the ngram vocabulary (no in-place mutation).
    known_stats = {
        ngram: count
        for ngram, count in ngram_stats.items()
        if ngram in self.ngram_vocab.w2i
    }
    if known_stats:
        ngrams = [self.ngram_vocab.convert(ngram) for ngram in known_stats]
        counts = list(known_stats.values())
    else:
        # Back off to a single UNK entry so the sparse tensor is never empty.
        ngrams = [self.ngram_vocab.UNK]
        counts = [1]
    input_tensor = dy.sparse_inputTensor([ngrams], counts,
                                         (self.ngram_vocab.vocab_size(), ))
    # Note: If one wants to use CHARAGRAM embeddings, use NonLinear with Relu.
    return self.transform.transform(input_tensor)
def transduce(self, inputs):
    """Build a sparse ngram-count vector from ``self.word_vect`` and return
    its ReLU-activated embedding, or ``None`` when there are no ngrams."""
    if not self.word_vect:
        return None
    indices = [self.convert(gram) for gram in self.word_vect]
    weights = list(self.word_vect.values())
    sparse_vec = dy.sparse_inputTensor([indices], weights, (self.dict_entry,))
    return dy.rectify(self.embedding.transform(sparse_vec))
def _choose_rnn_input(self, dec_state, batch_size, prev_ref_action, mode):
    """Pick the vector fed to the decoder RNN for the next step.

    Modes:
      - "context": feed the attention context vector directly.
      - "expected": expectation of the embedding matrix under the output
        distribution (soft / differentiable feedback).
      - "argmax" / "argmax_st": one-hot argmax of the output distribution;
        "argmax_st" uses the straight-through gradient estimator.
      - "teacher" / "split": reference embedding, optionally replaced by a
        sampled token (scheduled sampling) with prob ``self.sampling_prob``
        during training.

    Returns None when no output distribution or reference action exists yet.
    Non-None results are recorded in ``self._chosen_rnn_inputs``.
    """
    hidden_size = dec_state.context.dim()[0][0]
    vocab_size = self.trg_embedder.vocab_size
    if mode == "context":
        context_vec = dy.reshape(dec_state.context, (hidden_size, ),
                                 batch_size=batch_size)
        ret = context_vec
    elif dec_state.out_prob is None and prev_ref_action is None:
        # Nothing to condition on (e.g. very first step): caller handles None.
        ret = None
    elif mode == "expected":
        # (1 x vocab) distribution times (vocab x hidden) embedding matrix.
        ret = dy.reshape(dec_state.out_prob, (1, vocab_size),
                         batch_size=batch_size) * dy.parameter(
                             self.trg_embedder.embeddings)
        ret = dy.reshape(ret, (hidden_size, ), batch_size=batch_size)
    elif mode in ["argmax", "argmax_st"]:
        # "argmax" blocks gradients; "argmax_st" passes them straight through.
        gradient_mode = "zero_gradient" if mode == "argmax" else "straight_through_gradient"
        argmax = dy.reshape(dy.argmax(dec_state.out_prob,
                                      gradient_mode=gradient_mode),
                            (1, vocab_size),
                            batch_size=batch_size)
        ret = argmax * dy.parameter(self.trg_embedder.embeddings)
        ret = dy.reshape(ret, (hidden_size, ), batch_size=batch_size)
    elif mode in ["teacher", "split"]:
        # Scheduled sampling: with prob sampling_prob, replace the reference
        # token by a draw from the model's own output distribution.
        do_sample = self.train and dec_state.out_prob and self.sampling_prob > 0.0 and random.random(
        ) < self.sampling_prob
        if not do_sample:
            ret = self.trg_embedder.embed(prev_ref_action)
        else:  # do sample
            sampled_vals = []
            npval = dec_state.out_prob.npvalue()
            for bi in range(batch_size):
                # npvalue() drops the batch axis when batch_size == 1.
                npval_bi = npval[:, bi] if batch_size > 1 else npval
                # Renormalize to guard against float drift before sampling.
                sampled_vals.append(
                    np.random.choice(vocab_size,
                                     p=npval_bi / np.sum(npval_bi)))
            # Build a batched one-hot tensor from the sampled indices.
            idxs = ([], [])
            for batch_i in range(batch_size):
                idxs[0].append(sampled_vals[batch_i])
                idxs[1].append(batch_i)
            argmax = dy.sparse_inputTensor(
                idxs,
                values=np.ones(batch_size),
                shape=(vocab_size, batch_size),
                batched=True,
            )
            argmax = dy.reshape(argmax, (1, vocab_size),
                                batch_size=batch_size)
            ret = argmax * dy.parameter(self.trg_embedder.embeddings)
            ret = dy.reshape(ret, (hidden_size, ), batch_size=batch_size)
    else:
        raise ValueError(f"unknown value for mode: {mode}")
    if ret is not None:
        self._chosen_rnn_inputs.append(ret)
    return ret
def on_start_sent(self, src):
    """Precompute the (no-backprop) sparse lexicon probability tensor for the
    incoming source batch; resets per-sentence cached state."""
    self.coeff = None
    self.dict_prob = None
    n_batch = src.batch_size()
    n_col = src.sent_len()
    triples = []
    probs = []
    # One pass over every source token's lexicon entry, collecting
    # (trg_word, column, batch) coordinates and their probabilities.
    for b in range(n_batch):
        for c in range(n_col):
            lex_entry = self.lexicon[src[b][c]]
            triples.extend((t, c, b) for t in lex_entry.keys())
            probs.extend(lex_entry.values())
    coords = tuple(map(list, zip(*triples)))
    shape = len(self.trg_vocab), n_col, n_batch
    self.lexicon_prob = dy.nobackprop(
        dy.sparse_inputTensor(coords, probs, shape, batched=True))
def test_sparse_inputTensor(self):
    """Check batched sparse_inputTensor: dims, set entries, and a zero entry."""
    dy.renew_cg()
    dense = self.input_vals.reshape((3, 3, 3, 3))
    vals = [dense[0, 0, 0, 0], dense[0, 1, 2, 0]]
    coords = ([0, 0], [0, 1], [0, 2], [0, 0])
    expr = dy.sparse_inputTensor(coords, vals, (3, 3, 3, 3), batched=True)
    dims, batch = expr.dim()
    self.assertEqual(dims, (3, 3, 3), msg="Dimension mismatch")
    self.assertEqual(batch, 3, msg="Dimension mismatch")
    out = expr.npvalue()
    self.assertTrue(np.allclose(out[0, 0, 0, 0], vals[0]),
                    msg="Expression value different from initial value")
    self.assertTrue(np.allclose(out[0, 1, 2, 0], vals[1]),
                    msg="Expression value different from initial value")
    # An index that was never set must read back as zero.
    self.assertTrue(np.allclose(out[1, 1, 1, 1], 0),
                    msg="Expression value different from initial value")
def transduce(self, inputs):
    """Embed the current word batch as batched sparse bag-of-ngram vectors,
    then apply the ngram projection with a ReLU.

    Fix: removed the unused ``keys`` list and the dead pre-initialization of
    ``values``; the word-vector append loop is now a comprehension, and
    ``batch_size`` is reused instead of recomputing ``len(self.words)``.
    """
    batch_size = len(self.words)
    # Sparse ngram-count mapping for each word in the batch.
    word_vects = [self.to_word_vector(word) for word in self.words]
    # (ngram_id, batch_index) coordinate pairs, transposed into index rows.
    idxs = [(x, i) for i in range(batch_size) for x in word_vects[i].keys()]
    idxs = tuple(map(list, zip(*idxs)))
    values = [x for i in range(batch_size) for x in word_vects[i].values()]
    ngram_vocab_vect = dy.sparse_inputTensor(
        idxs, values, (self.dict_entry, batch_size), batched=True)
    return dy.rectify(self.word_ngram(ngram_vocab_vect))
def on_start_sent(self, src):
    """Build the no-backprop sparse lexicon probability tensor of shape
    (trg_vocab, sent_len, batch) for the incoming source batch."""
    batch_size = len(src)
    col_size = len(src[0])
    triples = []
    probs = []
    # Collect (trg_word, column, batch) coordinates and matching
    # probabilities in a single sweep over the batch.
    for i in range(batch_size):
        for j in range(col_size):
            lex_entry = self.lexicon[src[i][j]]
            triples.extend((x, j, i) for x in lex_entry.keys())
            probs.extend(lex_entry.values())
    coords = tuple(map(list, zip(*triples)))
    shape = (len(self.trg_vocab), col_size, batch_size)
    self.lexicon_prob = dy.nobackprop(
        dy.sparse_inputTensor(coords, probs, shape, batched=True))
def test_sparse_inputTensor(self):
    """Verify batched sparse tensor construction against the dense source."""
    dy.renew_cg()
    full = self.input_vals.reshape((3, 3, 3, 3))
    expected = [full[0, 0, 0, 0], full[0, 1, 2, 0]]
    indices = ([0, 0], [0, 1], [0, 2], [0, 0])
    sparse = dy.sparse_inputTensor(indices, expected, (3, 3, 3, 3),
                                   batched=True)
    shape, batch = sparse.dim()
    self.assertEqual(shape, (3, 3, 3), msg="Dimension mismatch")
    self.assertEqual(batch, 3, msg="Dimension mismatch")
    result = sparse.npvalue()
    # Set entries round-trip; the untouched entry is zero.
    checks = (((0, 0, 0, 0), expected[0]),
              ((0, 1, 2, 0), expected[1]),
              ((1, 1, 1, 1), 0))
    for pos, want in checks:
        self.assertTrue(np.allclose(result[pos], want),
                        msg="Expression value different from initial value")
def calc_nll(self, src, trg):
    """Compute a multi-label (bag-of-words) loss for the target side.

    Encodes ``src``, pools the encodings into one fixed-size vector per
    sentence according to ``self.mode``, squashes to per-word scores with a
    sigmoid, and compares against a sparse 0/1 target tensor of the words
    occurring in ``trg`` (excluding sentence-start/end symbols) via binary
    log loss.

    Modes: "avg_mlp" (mask-aware mean over time + MLP), "final_mlp" (final
    encoder state + MLP), "lin_sum_sig" (per-step linear transforms, masked,
    mask-aware mean, then sigmoid).
    """
    event_trigger.start_sent(src)
    embeddings = self.src_embedder.embed_sent(src)
    encodings = self.encoder.transduce(embeddings)
    if not batchers.is_batched(trg):
        trg = batchers.mark_as_batch([trg])
    if self.mode in ["avg_mlp", "final_mlp"]:
        if self.mode=="avg_mlp":
            if encodings.mask:
                # Masked mean: sum over time, divide by per-sentence count
                # of unmasked positions (mask np_arr is 1 at padded steps).
                encoding_fixed_size = dy.cdiv(dy.sum_dim(encodings.as_tensor(), [1]),
                                              dy.inputTensor(np.sum(1.0 - encodings.mask.np_arr, axis=1),
                                                             batched=True))
            else:
                # No mask: plain mean over the time dimension.
                encoding_fixed_size = dy.sum_dim(encodings.as_tensor(), [1]) / encodings.dim()[0][1]
        elif self.mode=="final_mlp":
            encoding_fixed_size = self.encoder.get_final_states()[-1].main_expr()
        scores = dy.logistic(self.output_layer.transform(encoding_fixed_size))
    elif self.mode=="lin_sum_sig":
        enc_lin = []
        for step_i, enc_i in enumerate(encodings):
            step_linear = self.output_layer.transform(enc_i)
            # Zero out contributions from padded positions at this step.
            if encodings.mask and np.sum(encodings.mask.np_arr[:,step_i])>0:
                step_linear = dy.cmult(step_linear,
                                       dy.inputTensor(1.0 - encodings.mask.np_arr[:,step_i],
                                                      batched=True))
            enc_lin.append(step_linear)
        if encodings.mask:
            encoding_fixed_size = dy.cdiv(dy.esum(enc_lin),
                                          dy.inputTensor(np.sum(1.0 - encodings.mask.np_arr, axis=1),
                                                         batched=True))
        else:
            encoding_fixed_size = dy.esum(enc_lin) / encodings.dim()[0][1]
        scores = dy.logistic(encoding_fixed_size)
    else:
        raise ValueError(f"unknown mode '{self.mode}'")
    # Sparse multi-hot target: one 1 per distinct target word per sentence,
    # skipping the sentence-start and sentence-end symbols.
    idxs = ([], [])
    for batch_i in range(trg.batch_size()):
        for word in set(trg[batch_i]):
            if word not in {vocabs.Vocab.ES, vocabs.Vocab.SS}:
                idxs[0].append(word)
                idxs[1].append(batch_i)
    trg_scores = dy.sparse_inputTensor(idxs,
                                       values = np.ones(len(idxs[0])),
                                       shape=scores.dim()[0] + (scores.dim()[1],),
                                       batched=True, )
    loss_expr = dy.binary_log_loss(scores, trg_scores)
    return loss_expr
def calc_loss(sent, epsilon=0.0):
    """Compute the three VAE training losses for one (tweet, tags) pair.

    ``sent`` is a pair (src word ids, tag ids). ``epsilon`` controls two
    annealing behaviors: (a) zeroing one of the two encoder branches so the
    model learns to work from a single modality, and (b) scheduled sampling
    in the decoder instead of pure teacher forcing.

    Returns (kl_loss, softmax_loss, crossentropy_loss):
      - kl_loss: KL divergence of the approximate posterior vs. N(0, I)
      - softmax_loss: reconstruction loss of the tweet text
      - crossentropy_loss: multi-label tag prediction loss from z
    """
    #dy.renew_cg()
    # Transduce all batch elements with an LSTM
    src = sent[0]
    tags = sent[1]
    # initialize the LSTM
    init_state_src = lstm_encode.initial_state()
    # get the output of the first LSTM (final hidden state of the encoder)
    src_output = init_state_src.add_inputs([embed[x] for x in src])[-1].output()
    # Now compute mean and standard deviation of source hidden state.
    W_mu_tweet = dy.parameter(W_mu_tweet_p)
    V_mu_tweet = dy.parameter(V_mu_tweet_p)
    b_mu_tweet = dy.parameter(b_mu_tweet_p)
    W_sig_tweet = dy.parameter(W_sig_tweet_p)
    V_sig_tweet = dy.parameter(V_sig_tweet_p)
    b_sig_tweet = dy.parameter(b_sig_tweet_p)
    # Compute tweet encoding (mean and log-variance heads, with dropout)
    mu_tweet = dy.dropout(mlp(src_output, W_mu_tweet, V_mu_tweet, b_mu_tweet),
                          DROPOUT)
    log_var_tweet = dy.dropout(
        mlp(src_output, W_sig_tweet, V_sig_tweet, b_sig_tweet), DROPOUT)
    W_mu_tag = dy.parameter(W_mu_tag_p)
    V_mu_tag = dy.parameter(V_mu_tag_p)
    b_mu_tag = dy.parameter(b_mu_tag_p)
    W_sig_tag = dy.parameter(W_sig_tag_p)
    V_sig_tag = dy.parameter(V_sig_tag_p)
    b_sig_tag = dy.parameter(b_sig_tag_p)
    # Compute tag encoding from a sparse multi-hot tag vector
    tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags), )),
                                        (NUM_TAGS, ))
    mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag),
                        DROPOUT)
    log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag),
                             DROPOUT)
    # Combine encodings for mean and diagonal covariance
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)
    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)
    # Slowly phase out getting both inputs: with prob epsilon, zero out one
    # randomly chosen branch (tweet or tag) so the model learns to encode
    # from a single modality.
    if random.random() < epsilon:
        mask = dy.zeros(HIDDEN_DIM)
    else:
        mask = dy.ones(HIDDEN_DIM)
    if random.random() < 0.5:
        mu_tweet = dy.cmult(mu_tweet, mask)
        log_var_tweet = dy.cmult(log_var_tweet, mask)
    else:
        mu_tag = dy.cmult(mu_tag, mask)
        log_var_tag = dy.cmult(log_var_tag, mask)
    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform(
        [b_sig, W_sig, dy.concatenate([log_var_tweet, log_var_tag])])
    # KL-Divergence loss computation: -0.5 * sum(1 + log_var - mu^2 - var)
    kl_loss = -0.5 * dy.sum_elems(1 + log_var - dy.pow(mu, dy.inputVector([2]))
                                  - dy.exp(log_var))
    z = reparameterize(mu, log_var)
    # now step through the output sentence, decoding the tweet from z
    all_losses = []
    current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    prev_word = src[0]
    W_sm = dy.parameter(W_tweet_softmax_p)
    b_sm = dy.parameter(b_tweet_softmax_p)
    for next_word in src[1:]:
        # feed the current state into the decoder LSTM
        current_state = current_state.add_input(embed[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        all_losses.append(dy.pickneglogsoftmax(s, next_word))
        # Slowly phase out teacher forcing (this may be slow??): with prob
        # epsilon, feed a sampled word instead of the reference word.
        if random.random() < epsilon:
            p = dy.softmax(s).npvalue()
            # Renormalize to protect np.random.choice from float drift.
            prev_word = np.random.choice(VOCAB_SIZE, p=p / p.sum())
        else:
            prev_word = next_word
    softmax_loss = dy.esum(all_losses)
    # Predict the tag multi-hot vector back from z (auxiliary loss).
    W_hidden = dy.parameter(W_hidden_p)
    b_hidden = dy.parameter(b_hidden_p)
    W_out = dy.parameter(W_tag_output_p)
    b_out = dy.parameter(b_tag_output_p)
    h = dy.dropout(dy.tanh(b_hidden + W_hidden * z), DROPOUT)
    o = dy.logistic(b_out + W_out * h)
    crossentropy_loss = dy.binary_log_loss(o, tags_tensor)
    return kl_loss, softmax_loss, crossentropy_loss
def hallucinate_tweet(given_tags):
    """Sample a 20-word tweet (list of word ids) conditioned only on tags.

    Encodes ``given_tags`` through the tag branch of the VAE, zeroes out the
    tweet branch, samples a latent ``z``, then greedily samples words from
    the decoder for a fixed 20 steps.

    Fix: removed the large blocks of commented-out encoder code, the unused
    ``kl_loss`` and ``all_losses`` locals (no losses are returned here), and
    the unused loop index.

    NOTE(review): dropout (``DROPOUT``) is still applied to the tag encoding
    here even though this is a generation path — possibly intentional for
    diversity, but worth confirming.
    """
    dy.renew_cg()
    tags = given_tags
    W_mu_tag = dy.parameter(W_mu_tag_p)
    V_mu_tag = dy.parameter(V_mu_tag_p)
    b_mu_tag = dy.parameter(b_mu_tag_p)
    W_sig_tag = dy.parameter(W_sig_tag_p)
    V_sig_tag = dy.parameter(V_sig_tag_p)
    b_sig_tag = dy.parameter(b_sig_tag_p)
    # Compute tag encoding from a sparse multi-hot tag vector.
    tags_tensor = dy.sparse_inputTensor([tags], np.ones((len(tags), )),
                                        (NUM_TAGS, ))
    mu_tag = dy.dropout(mlp(tags_tensor, W_mu_tag, V_mu_tag, b_mu_tag),
                        DROPOUT)
    log_var_tag = dy.dropout(mlp(tags_tensor, W_sig_tag, V_sig_tag, b_sig_tag),
                             DROPOUT)
    # Combine encodings for mean and diagonal covariance; the tweet branch
    # is zeroed since we generate from tags alone.
    W_mu = dy.parameter(W_mu_p)
    b_mu = dy.parameter(b_mu_p)
    W_sig = dy.parameter(W_sig_p)
    b_sig = dy.parameter(b_sig_p)
    mu_tweet = dy.zeros(HIDDEN_DIM)
    log_var_tweet = dy.zeros(HIDDEN_DIM)
    mu = dy.affine_transform([b_mu, W_mu, dy.concatenate([mu_tweet, mu_tag])])
    log_var = dy.affine_transform(
        [b_sig, W_sig, dy.concatenate([log_var_tweet, log_var_tag])])
    z = reparameterize(mu, log_var)
    # Step through the decoder, sampling one word per step.
    current_state = lstm_decode.initial_state().set_s([z, dy.tanh(z)])
    prev_word = vocab[START]
    W_sm = dy.parameter(W_tweet_softmax_p)
    b_sm = dy.parameter(b_tweet_softmax_p)
    gen_tweet = []
    for _ in range(20):
        current_state = current_state.add_input(embed[prev_word])
        output_embedding = current_state.output()
        s = dy.affine_transform([b_sm, W_sm, output_embedding])
        p = dy.softmax(s).npvalue()
        # Renormalize to protect np.random.choice from float drift.
        next_word = np.random.choice(VOCAB_SIZE, p=p / p.sum())
        gen_tweet.append(next_word)
        prev_word = next_word
    return gen_tweet
def transduce(self, inputs):
    """Project the sparse ngram-count vector in ``self.word_vect`` through
    the ngram transform and apply a ReLU."""
    sparse_indices = list(self.word_vect)
    sparse_weights = [self.word_vect[k] for k in sparse_indices]
    ngram_vec = dy.sparse_inputTensor([sparse_indices], sparse_weights,
                                      (self.dict_entry,))
    return dy.rectify(self.word_ngram.transform(ngram_vec))