def __call__(self, H, is_train=True):
    """
    :param H: a list of hidden state expressions, one per time step
    :return: the attention-weighted hidden states and the raw attention scores
    """
    seq_len = len(H)
    if is_train:
        # in the training phase, perform dropout
        W1 = dy.dropout(self.W1, self.dropout_rate)
        W2 = dy.dropout(self.W2, self.dropout_rate)
    else:
        W1 = self.W1
        W2 = self.W2
    pool = dy.average(H)
    aspect_attentions = []
    Weights = []
    for t in range(seq_len):
        ht = H[t]
        scores = dy.tanh(dy.transpose(ht) * W1 * pool + self.bd)
        Weights.append(scores.value())
        ht_hat = dy.cmult(dy.softmax(scores), ht)
        aspect_attentions.append(ht_hat)
    # return the collected attention scores alongside the weighted states
    return aspect_attentions, Weights

def enable_dropout(self):
    self.fwdRNN.set_dropout(0.3)
    self.bwdRNN.set_dropout(0.3)
    self.cfwdRNN.set_dropout(0.3)
    self.cbwdRNN.set_dropout(0.3)
    self.w1 = dy.dropout(self.w1, 0.3)
    self.b1 = dy.dropout(self.b1, 0.3)

def init_sequence(self, test=False):
    self.test = test
    if not test:
        self.dropout_mask_x = dy.dropout(dy.ones((self.n_in,)), self.dropout_x)
        self.dropout_mask_h = dy.dropout(dy.ones((self.n_hidden,)), self.dropout_h)

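# Illustrative sketch (hypothetical helper, not from the original repo): masks such as
# dropout_mask_x / dropout_mask_h above are typically reused at every time step via
# element-wise multiplication, so the same coordinates stay dropped across the whole
# sequence (variational dropout). W, U and b are assumed weight expressions.
import dynet as dy

def rnn_step(x_t, h_tm1, W, U, b, mask_x=None, mask_h=None):
    if mask_x is not None:
        x_t = dy.cmult(x_t, mask_x)      # same input mask at every step
    if mask_h is not None:
        h_tm1 = dy.cmult(h_tm1, mask_h)  # same recurrent mask at every step
    return dy.tanh(W * x_t + U * h_tm1 + b)
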
def expr_for_tree(self, xt, tree, node, is_train):
    if is_train:
        # in the training phase, perform dropout
        W_dropout = dy.dropout(self.WP, self.dropout_rate)
        WR_dropout = dy.dropout(self.WR, self.dropout_rate)
        WC_dropout = dy.dropout(self.WC, self.dropout_rate)
    else:
        W_dropout = self.WP
        WR_dropout = self.WR
        WC_dropout = self.WC
    if node is None or node.is_leaf():
        Wx = W_dropout * xt
        h = dy.tanh(dy.affine_transform([self.bc, self.WC, xt]))
        return h
    # get child nodes
    children = tree.children(node.identifier)
    children_sum = dy.zeros((self.n_out))
    for i in range(len(children)):
        hc = self.expr_for_tree(xt=xt, tree=tree, node=children[i], is_train=is_train)
        rt = dy.logistic(self.WR * xt + self.UR * hc + self.br)
        children_sum = children_sum + dy.cmult(rt, hc)
    Wx = W_dropout * xt
    h = dy.tanh(Wx + self.bp + self.UP * children_sum)
    return h

def build_tagging_graph(self, sentence):
    dy.renew_cg()
    embeddings = [self.word_rep(w) for w in sentence]
    lstm_out = self.bi_lstm.transduce(embeddings)
    H = dy.parameter(self.lstm_to_tags_params)
    Hb = dy.parameter(self.lstm_to_tags_bias)
    O = dy.parameter(self.mlp_out)
    Ob = dy.parameter(self.mlp_out_bias)
    scores = []
    if options.bigram:
        for rep, word in zip(lstm_out, sentence):
            bi1 = dy.lookup(self.bigram_lookup, word[0], update=self.we_update)
            bi2 = dy.lookup(self.bigram_lookup, word[1], update=self.we_update)
            if self.dropout is not None:
                bi1 = dy.dropout(bi1, self.dropout)
                bi2 = dy.dropout(bi2, self.dropout)
            score_t = O * dy.tanh(H * dy.concatenate([bi1, rep, bi2]) + Hb) + Ob
            scores.append(score_t)
    else:
        for rep in lstm_out:
            score_t = O * dy.tanh(H * rep + Hb) + Ob
            scores.append(score_t)
    return scores

def build_tagging_graph(self, batch):
    self.initialize_paramerets()
    # get the word vectors
    batch_embs = self.word_rep(batch)
    # feed word vectors into the first biLSTM
    fw_exps = self.f_init.transduce(batch_embs)
    bw_exps = self.b_init.transduce(reversed(batch_embs))
    # biLSTM states
    bi_exps = [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))]
    # second biLSTM
    fw_exps = self.f2_init.transduce(bi_exps)
    bw_exps = self.b2_init.transduce(reversed(bi_exps))
    # biLSTM states
    bi_exps = dy.concatenate(
        [dy.concatenate([f, b]) for f, b in zip(fw_exps, reversed(bw_exps))], d=1)
    aT = self.meta.activation(self.aw * bi_exps + self.ab)
    alpha = self.av * aT
    attn = dy.softmax(alpha, 1)
    weighted_sum = dy.reshape(bi_exps * dy.transpose(attn), (self.meta.lstm_word_dim * 2,))
    if not self.eval:
        weighted_sum = dy.dropout(weighted_sum, 0.3)
    xh = self.meta.activation(self.w1 * weighted_sum + self.b1)
    if not self.eval:
        xh = dy.dropout(xh, 0.3)
    xo = self.w2 * xh + self.b2
    return xo

def apply(self, sent1, sent2):
    eL = dy.parameter(self.linear)
    sent1 = dy.inputTensor(self.embedding.all_embeds_from_ix(sent1)) * eL
    sent2 = dy.inputTensor(self.embedding.all_embeds_from_ix(sent2)) * eL
    out1, out2 = self.feed_F(sent1, sent2)
    e_out = out1 * dy.transpose(out2)
    prob_f_1 = dy.softmax(e_out)
    score = dy.transpose(e_out)
    prob_f_2 = dy.softmax(score)
    sent1_allign = dy.concatenate_cols([sent1, prob_f_1 * sent2])
    sent2_allign = dy.concatenate_cols([sent2, prob_f_2 * sent1])
    out_g_1, out_g_2 = self.feed_G(sent1_allign, sent2_allign)
    sent1_out_g = dy.sum_dim(out_g_1, [0])
    sent2_out_g = dy.sum_dim(out_g_2, [0])
    concat = dy.transpose(dy.concatenate([sent1_out_g, sent2_out_g]))
    h_step_1 = dy.parameter(self.h_step_1)
    sent_h = dy.rectify(dy.dropout(concat, 0.2) * h_step_1)
    h_step_2 = dy.parameter(self.h_step_2)
    sent_h = dy.rectify(dy.dropout(sent_h, 0.2) * h_step_2)
    final = dy.parameter(self.linear2)
    final = dy.transpose(sent_h * final)
    return final

def recurrence(self, xt, hmtm1, h_history_tm1, dropout_flag):
    """
    :param xt: input vector at time step t
    :param hmtm1: hidden memories of the previous n_steps steps
    :param h_history_tm1: previous hidden summary
    :param dropout_flag: whether to perform partial dropout
    :return: updated hidden memories and the hidden history summary
    """
    score = dy.concatenate([dy.dot_product(self.u, dy.tanh(
        self.W_h * hmtm1[i] + self.W_x * xt + self.W_htilde * h_history_tm1))
        for i in range(self.n_steps)])
    # normalize the attention score
    score = dy.softmax(score)
    # shape: (1, n_out), history of [h[t-n_steps-1], ..., h[t-2]]
    h_history_t = dy.reshape(dy.transpose(score) * hmtm1[:-1], d=(self.n_out,))
    htm1 = hmtm1[-1]
    h_tilde_t = htm1 + dy.rectify(h_history_t)
    if dropout_flag:
        # perform partial dropout, i.e., apply dropout only to the matrices W_x*
        rt = dy.logistic(dy.dropout(self.W_xr, self.dropout_rate) * xt + self.W_hr * h_tilde_t + self.br)
        zt = dy.logistic(dy.dropout(self.W_xz, self.dropout_rate) * xt + self.W_hz * h_tilde_t + self.bz)
        ht_hat = dy.tanh(dy.dropout(self.W_xh, self.dropout_rate) * xt
                         + self.W_hh * dy.cmult(rt, h_tilde_t) + self.bh)
        ht = dy.cmult(zt, h_tilde_t) + dy.cmult((1.0 - zt), ht_hat)
    else:
        rt = dy.logistic(self.W_xr * xt + self.W_hr * h_tilde_t + self.br)
        zt = dy.logistic(self.W_xz * xt + self.W_hz * h_tilde_t + self.bz)
        ht_hat = dy.tanh(self.W_xh * xt + self.W_hh * dy.cmult(rt, h_tilde_t) + self.bh)
        ht = dy.cmult(zt, h_tilde_t) + dy.cmult((1.0 - zt), ht_hat)
    hmt = dy.concatenate([hmtm1[1:], dy.reshape(ht, (1, self.n_out))])
    return hmt, h_history_t

def word_assoc_score(self, source_idx, target_idx, relation):
    """
    NOTE THAT DROPOUT IS BEING APPLIED HERE
    :param source_idx: embedding index of source atom
    :param target_idx: embedding index of target atom
    :param relation: relation type
    :return: score
    """
    # prepare
    s = self.embeddings[source_idx]
    if self.no_assoc:
        A = dy.const_parameter(self.word_assoc_weights[relation])
    else:
        A = dy.parameter(self.word_assoc_weights[relation])
    # dy.dropout returns a new expression, so the result must be reassigned
    A = dy.dropout(A, self.dropout)
    t = self.embeddings[target_idx]
    # compute
    if self.mode == BILINEAR_MODE:
        return dy.transpose(s) * A * t
    elif self.mode == DIAG_RANK1_MODE:
        diag_A = dyagonalize(A[0])
        rank1_BC = A[1] * dy.transpose(A[2])
        ABC = diag_A + rank1_BC
        return dy.transpose(s) * ABC * t
    elif self.mode == TRANSLATIONAL_EMBED_MODE:
        return -dy.l2_norm(s - t + A)
    elif self.mode == DISTMULT:
        return dy.sum_elems(dy.cmult(dy.cmult(s, A), t))

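# Illustrative note (not from the original repo): dy.dropout does not modify its
# argument in place, which is why the result is reassigned to `A` above. A minimal
# sketch of that behavior:
import dynet as dy

pc = dy.ParameterCollection()
p = pc.add_parameters((4, 4))
dy.renew_cg()
A = dy.parameter(p)
A = dy.dropout(A, 0.5)  # must reassign; calling dy.dropout(A, 0.5) alone leaves A unchanged
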
def calc_loss(self, enc_sen, y_adv, vec_drop, train):
    """
    The attacker's core functionality: an MLP with possibly multiple layers, and at least one.
    :param enc_sen: encoded sentence representation
    :param y_adv: gold label for the adversarial task
    :param vec_drop: dropout rate applied to the input vector
    :param train: whether we are in training mode
    :return: the adversarial loss and the predicted label
    """
    w = dy.parameter(self._params["adv_w0"])
    b = dy.parameter(self._params["adv_b0"])
    if train:
        drop = self._dropout
        out = dy.dropout(enc_sen, vec_drop)
    else:
        drop = 0
        out = enc_sen
    out = dy.tanh(dy.dropout(dy.affine_transform([b, w, out]), drop))
    if self._mlp_layers > 2:
        for i in range(self._mlp_layers - 2):
            w = dy.parameter(self._params["adv_w" + str(i + 1)])
            b = dy.parameter(self._params["adv_b" + str(i + 1)])
            out = dy.tanh(dy.dropout(dy.affine_transform([b, w, out]), drop))
    w = dy.parameter(self._params["adv_w" + str(self._mlp_layers - 1)])
    b = dy.parameter(self._params["adv_b" + str(self._mlp_layers - 1)])
    out = dy.affine_transform([b, w, out])
    task_probs = dy.softmax(out)
    adv_loss = dy.pickneglogsoftmax(out, y_adv)
    return adv_loss, np.argmax(task_probs.npvalue())

def __call__(self, H1, H2, H3, is_train=True):
    """
    :param H1, H2, H3: aligned lists of hidden state expressions
    :return: the fused hidden states
    """
    seq_len = len(H1)
    if is_train:
        # in the training phase, perform dropout
        W1 = dy.dropout(self.W1, self.dropout_rate)
        W2 = dy.dropout(self.W2, self.dropout_rate)
        W3 = dy.dropout(self.W3, self.dropout_rate)
    else:
        W1 = self.W1
        W2 = self.W2
        W3 = self.W3
    H = []
    for t in range(seq_len):
        ht_hat = dy.tanh(W1 * H1[t] + W2 * H2[t] + W3 * H3[t] + self.bd)
        H.append(ht_hat)
    return H

def adv_mlp(self, vec_sen, adv_ind, train, vec_drop):
    """
    Calculate the adversarial MLP over the sentence representation vector.
    More than a single adversarial MLP is supported.
    """
    if train:
        drop = self._dropout
        out = dy.dropout(vec_sen, vec_drop)
    else:
        drop = 0
        out = vec_sen
    for i in range(self._adv_depth):
        w = dy.parameter(self._params["adv_" + str(adv_ind) + "_w" + str(i + 1)])
        b = dy.parameter(self._params["adv_" + str(adv_ind) + "_b" + str(i + 1)])
        out = dy.tanh(dy.dropout(dy.affine_transform([b, w, out]), drop))
    w = dy.parameter(self._params["adv_" + str(adv_ind) + "_w" + str(self._adv_depth + 1)])
    b = dy.parameter(self._params["adv_" + str(adv_ind) + "_b" + str(self._adv_depth + 1)])
    out = dy.affine_transform([b, w, out])
    return out

def forward(self, features, dropout=False):
    # extract ids for words, POS tags and labels
    word_ids = [self.vocab.word2id(w) for w in features[:20]]
    pos_ids = [self.vocab.pos2id(p) for p in features[20:40]]
    label_ids = [self.vocab.label2id(l) for l in features[40:52]]
    # look up embeddings for the features
    word_embeds = [self.word_embedding[wid] for wid in word_ids]
    pos_embeds = [self.pos_embedding[pid] for pid in pos_ids]
    label_embeds = [self.label_embedding[lid] for lid in label_ids]
    # concatenate all feature embeddings
    embedding_layer = dynet.concatenate(word_embeds + pos_embeds + label_embeds)
    # compute the hidden layers
    hidden_1 = self.transfer(self.hidden_layer_1.expr() * embedding_layer + self.hidden_layer_bias_1.expr())
    if dropout:
        hidden_1 = dynet.dropout(hidden_1, self.properties.dropout)
    hidden_2 = self.transfer(self.hidden_layer_2.expr() * hidden_1 + self.hidden_layer_bias_2.expr())
    if dropout:
        hidden_2 = dynet.dropout(hidden_2, self.properties.dropout)
    # compute the output layer
    output = self.output_layer.expr() * hidden_2 + self.output_bias.expr()
    return output

def _build_computation_graph(self, words, train_mode=True):
    """Builds the computation graph."""
    dy.renew_cg()
    # turn parameters into expressions
    softmax_weight_exp = dy.parameter(self.softmax_weight)
    softmax_bias_exp = dy.parameter(self.softmax_bias)
    word_reps = [self._word_rep(word) for word in words]
    embs = dy.concatenate(word_reps, d=1)
    if self.pooling_method == "average":
        average_emb = dy.mean_dim(embs, d=1)
    elif self.pooling_method == "max":
        average_emb = dy.max_dim(embs, d=1)
    else:
        raise NotImplementedError
    average_emb = dy.reshape(average_emb, (self.word_embedding_size,))
    if self.average_dropout is not None:
        # dy.dropout returns a new expression, so the result must be reassigned
        average_emb = dy.dropout(average_emb, p=self.average_dropout)
    return softmax_weight_exp * average_emb + softmax_bias_exp

def evaluate_recurrent(self, fwd_bigrams, unigrams, test=False):
    fwd1 = self.fwd_lstm1.initial_state()
    back1 = self.back_lstm1.initial_state()
    fwd2 = self.fwd_lstm2.initial_state()
    back2 = self.back_lstm2.initial_state()

    fwd_input = []
    for i in range(len(unigrams)):
        bivec = dynet.lookup(self.bigram_embed, fwd_bigrams[i])
        univec = dynet.lookup(self.unigram_embed, unigrams[i])
        fwd_input.append(dynet.concatenate([bivec, univec]))

    back_input = []
    for i in range(len(unigrams)):
        bivec = dynet.lookup(self.bigram_embed, fwd_bigrams[i + 1])
        univec = dynet.lookup(self.unigram_embed, unigrams[i])
        back_input.append(dynet.concatenate([bivec, univec]))

    fwd1_out = []
    for vec in fwd_input:
        fwd1 = fwd1.add_input(vec)
        fwd1_out.append(fwd1.output())

    back1_out = []
    for vec in reversed(back_input):
        back1 = back1.add_input(vec)
        back1_out.append(back1.output())

    lstm2_input = []
    for (f, b) in zip(fwd1_out, reversed(back1_out)):
        lstm2_input.append(dynet.concatenate([f, b]))

    fwd2_out = []
    for vec in lstm2_input:
        if self.droprate > 0 and not test:
            vec = dynet.dropout(vec, self.droprate)
        fwd2 = fwd2.add_input(vec)
        fwd2_out.append(fwd2.output())

    back2_out = []
    for vec in reversed(lstm2_input):
        if self.droprate > 0 and not test:
            vec = dynet.dropout(vec, self.droprate)
        back2 = back2.add_input(vec)
        back2_out.append(back2.output())

    return fwd2_out, back2_out[::-1]

def out_layer(self, x, dropout):
    if dropout:
        W = dy.dropout(self._W2, 0.3)
        b = dy.dropout(self._b2, 0.3)
    else:
        W = self._W2
        b = self._b2
    return W * x + b

def set_dropouts(self, input_drop=0, recur_drop=0):
    self.input_drop = input_drop
    self.recur_drop = recur_drop
    self.input_drop_mask = dy.dropout(dy.ones(self.input_size), self.input_drop)
    self.recur_drop_mask = dy.dropout(dy.ones(self.recur_size), self.recur_drop)

def out_layer(self, x, dropout):
    if dropout:
        W = dy.dropout(dy.parameter(self._W2), 0.3)
        b = dy.dropout(dy.parameter(self._b2), 0.3)
    else:
        W = dy.parameter(self._W2)
        b = dy.parameter(self._b2)
    return W * x + b

def hid_2_layer(self, x, dropout):
    if dropout:
        W = dy.dropout(self._W12, 0.3)
        b = dy.dropout(self._b12, 0.3)
    else:
        W = self._W12
        b = self._b12
    return self.activation(W * x + b)

def hid_2_layer(self, x, dropout):
    if dropout:
        W = dy.dropout(dy.parameter(self._W12), 0.3)
        b = dy.dropout(dy.parameter(self._b12), 0.3)
    else:
        W = dy.parameter(self._W12)
        b = dy.parameter(self._b12)
    return self.activation(W * x + b)

def hid_layer(self, x, dropout):
    if dropout:
        W = dy.dropout(dy.parameter(self._W1), 0.3)
        b = dy.dropout(dy.parameter(self._b1), 0.3)
    else:
        W = dy.parameter(self._W1)
        b = dy.parameter(self._b1)
    return dy.rectify(W * x + b)

def evaluate_recurrent(self, word_inds, tag_inds, test=False):
    fwd1 = self.fwd_lstm1.initial_state()
    back1 = self.back_lstm1.initial_state()
    fwd2 = self.fwd_lstm2.initial_state()
    back2 = self.back_lstm2.initial_state()

    sentence = []
    for (w, t) in zip(word_inds, tag_inds):
        wordvec = dynet.lookup(self.word_embed, w)
        tagvec = dynet.lookup(self.tag_embed, t)
        sentence.append(dynet.concatenate([wordvec, tagvec]))

    fwd1_out = []
    for vec in sentence:
        fwd1 = fwd1.add_input(vec)
        fwd1_out.append(fwd1.output())

    back1_out = []
    for vec in reversed(sentence):
        back1 = back1.add_input(vec)
        back1_out.append(back1.output())

    lstm2_input = []
    for (f, b) in zip(fwd1_out, reversed(back1_out)):
        lstm2_input.append(dynet.concatenate([f, b]))

    fwd2_out = []
    for vec in lstm2_input:
        if self.droprate > 0 and not test:
            vec = dynet.dropout(vec, self.droprate)
        fwd2 = fwd2.add_input(vec)
        fwd2_out.append(fwd2.output())

    back2_out = []
    for vec in reversed(lstm2_input):
        if self.droprate > 0 and not test:
            vec = dynet.dropout(vec, self.droprate)
        back2 = back2.add_input(vec)
        back2_out.append(back2.output())

    fwd_out = [dynet.concatenate([f1, f2]) for (f1, f2) in zip(fwd1_out, fwd2_out)]
    back_out = [dynet.concatenate([b1, b2]) for (b1, b2) in zip(back1_out, back2_out)]

    return fwd_out, back_out[::-1]

def _calculate_train_score(self, sentence):
    """Same as _calculate_score, but applies dropout after the embedding and
    bi-LSTM layers; used for training."""
    embeddings = [self.lookup[w] for w in sentence]
    embeddings = [dy.dropout(e, self.dropout_rate) for e in embeddings]
    bi_lstm_output = self.bilstm.transduce(embeddings)
    bi_lstm_output = [dy.dropout(o, self.dropout_rate) for o in bi_lstm_output]
    return [self.w * o + self.b for o in bi_lstm_output]

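# Hedged sketch (assumption, not the original code): the evaluation-time
# _calculate_score referenced in the docstring above presumably runs the same
# pipeline without the two dropout passes.
def _calculate_score(self, sentence):
    embeddings = [self.lookup[w] for w in sentence]
    bi_lstm_output = self.bilstm.transduce(embeddings)
    return [self.w * o + self.b for o in bi_lstm_output]
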
def hid_layer(self, x, y, dropout):
    if dropout:
        W_h = dy.dropout(self._W1_h, 0.3)
        W_d = dy.dropout(self._W1_d, 0.3)
        b = dy.dropout(self._b1, 0.3)
    else:
        W_h = self._W1_h
        W_d = self._W1_d
        b = self._b1
    return self.activation(W_h * x + W_d * y + b)

def hid_layer(self, x, y, dropout):
    if dropout:
        W_h = dy.dropout(dy.parameter(self._W1_h), 0.3)
        W_d = dy.dropout(dy.parameter(self._W1_d), 0.3)
        b = dy.dropout(dy.parameter(self._b1), 0.3)
    else:
        W_h = dy.parameter(self._W1_h)
        W_d = dy.parameter(self._W1_d)
        b = dy.parameter(self._b1)
    return self.activation(W_h * x + W_d * y + b)

def __call__(self, sentence1, sentence2):
    W_1 = dy.parameter(self.W_1)
    # ReLU activation with dropout
    out1 = dy.rectify(dy.dropout(sentence1, self.drop_param) * W_1)
    out2 = dy.rectify(dy.dropout(sentence2, self.drop_param) * W_1)
    W_2 = dy.parameter(self.W_2)
    out1 = dy.rectify(dy.dropout(out1, self.drop_param) * W_2)
    out2 = dy.rectify(dy.dropout(out2, self.drop_param) * W_2)
    return out1, out2

def cal_scores(self, src_encodings, masks, train):
    src_len = len(src_encodings)
    batch_size = src_encodings[0].dim()[1]
    heads_LRlayer = []
    mods_LRlayer = []
    for encoding in src_encodings:
        heads_LRlayer.append(self.leaky_ReLu(self.b_head.expr() + self.W_head.expr() * encoding))
        mods_LRlayer.append(self.leaky_ReLu(self.b_mod.expr() + self.W_mod.expr() * encoding))

    heads_labels = []
    neg_inf = dy.constant(1, -float("inf"))
    # exclude the root at index 0 since the root does not have a head
    for row in range(1, src_len):
        scores_idx = []
        for col in range(src_len):
            dist = col - row
            mdist = self.dist_max
            dist_i = (min(dist, mdist - 1) + mdist if dist >= 0
                      else int(min(-1.0 * dist, mdist - 1)))
            dist_vec = dy.lookup_batch(self.dlookup, [dist_i] * batch_size)
            if train:
                input_vec = dy.concatenate([
                    dy.esum([
                        dy.dropout(heads_LRlayer[col], self.dropout),
                        dy.dropout(mods_LRlayer[row], self.dropout)
                    ]),
                    dist_vec
                ])
            else:
                input_vec = dy.concatenate([
                    dy.esum([heads_LRlayer[col], mods_LRlayer[row]]),
                    dist_vec
                ])
            score = self.scoreHeadModLabel(input_vec, train)
            mask = masks[row] and masks[col]
            join_scores = []
            for bdx in range(batch_size):
                if mask[bdx] == 1:
                    join_scores.append(dy.pick_batch_elem(score, bdx))
                else:
                    join_scores.append(dy.concatenate([neg_inf] * self.n_labels))
            scores_idx.append(dy.concatenate_to_batch(join_scores))
        heads_labels.append(dy.concatenate(scores_idx))
    return heads_labels

def forward(self, s1, s2, label=None):
    eL = dy.parameter(self.embeddingLinear)
    s1 = dy.inputTensor(s1) * eL
    s2 = dy.inputTensor(s2) * eL

    # F step
    Lf1 = dy.parameter(self.mlpF1)
    Fs1 = dy.rectify(dy.dropout(s1, 0.2) * Lf1)
    Fs2 = dy.rectify(dy.dropout(s2, 0.2) * Lf1)
    Lf2 = dy.parameter(self.mlpF2)
    Fs1 = dy.rectify(dy.dropout(Fs1, 0.2) * Lf2)
    Fs2 = dy.rectify(dy.dropout(Fs2, 0.2) * Lf2)

    # Attention scoring
    score1 = Fs1 * dy.transpose(Fs2)
    prob1 = dy.softmax(score1)
    score2 = dy.transpose(score1)
    prob2 = dy.softmax(score2)

    # Align pairs using attention
    s1Pairs = dy.concatenate_cols([s1, prob1 * s2])
    s2Pairs = dy.concatenate_cols([s2, prob2 * s1])

    # G step
    Lg1 = dy.parameter(self.mlpG1)
    Gs1 = dy.rectify(dy.dropout(s1Pairs, 0.2) * Lg1)
    Gs2 = dy.rectify(dy.dropout(s2Pairs, 0.2) * Lg1)
    Lg2 = dy.parameter(self.mlpG2)
    Gs1 = dy.rectify(dy.dropout(Gs1, 0.2) * Lg2)
    Gs2 = dy.rectify(dy.dropout(Gs2, 0.2) * Lg2)

    # Sum over the sequence dimension
    Ss1 = dy.sum_dim(Gs1, [0])
    Ss2 = dy.sum_dim(Gs2, [0])
    concatS12 = dy.transpose(dy.concatenate([Ss1, Ss2]))

    # H step
    Lh1 = dy.parameter(self.mlpH1)
    Hs = dy.rectify(dy.dropout(concatS12, 0.2) * Lh1)
    Lh2 = dy.parameter(self.mlpH2)
    Hs = dy.rectify(dy.dropout(Hs, 0.2) * Lh2)

    # Final layer
    final_layer = dy.parameter(self.final_layer)
    final = dy.transpose(Hs * final_layer)

    # label can be 0, so compare against None explicitly
    if label is not None:
        return dy.pickneglogsoftmax(final, label)
    out = dy.softmax(final)
    return np.argmax(out.npvalue())

def greedy_search(self, char_seq, truth=None, mu=0.):
    init_state = self.params['lstm'].initial_state().add_input(self.param_exprs['<bos>'])
    init_y = dy.tanh(self.param_exprs['pW'] * init_state.output() + self.param_exprs['pb'])
    init_score = dy.scalarInput(0.)
    init_sentence = Sentence(score=init_score.scalar_value(), score_expr=init_score,
                             LSTMState=init_state, y=init_y, prevState=None,
                             wlen=None, golden=True)

    if truth is not None:
        # apply dropout to the character embeddings only at training time
        cembs = [dy.dropout(dy.lookup(self.params['embed'], char), self.options['dropout_rate'])
                 for char in char_seq]
    else:
        cembs = [dy.lookup(self.params['embed'], char) for char in char_seq]

    start_agenda = init_sentence
    agenda = [start_agenda]

    # from left to right, character by character
    for idx, _ in enumerate(char_seq, 1):
        now = None
        # generate word candidate vectors: join segmentation prefix + word
        for wlen in range(1, min(idx, self.options['max_word_len']) + 1):
            word = self.word_repr(char_seq[idx - wlen:idx], cembs[idx - wlen:idx])
            sent = agenda[idx - wlen]

            if truth is not None:
                word = dy.dropout(word, self.options['dropout_rate'])

            word_score = dy.dot_product(word, self.param_exprs['U'])

            if truth is not None:
                golden = sent.golden and truth[idx - 1] == wlen
                margin = dy.scalarInput(mu * wlen if truth[idx - 1] != wlen else 0.)
                score = margin + sent.score_expr + dy.dot_product(sent.y, word) + word_score
            else:
                golden = False
                score = sent.score_expr + dy.dot_product(sent.y, word) + word_score

            good = (now is None or now.score < score.scalar_value())
            if golden or good:
                new_state = sent.LSTMState.add_input(word)
                new_y = dy.tanh(self.param_exprs['pW'] * new_state.output() + self.param_exprs['pb'])
                new_sent = Sentence(score=score.scalar_value(), score_expr=score,
                                    LSTMState=new_state, y=new_y, prevState=sent,
                                    wlen=wlen, golden=golden)
                if good:
                    now = new_sent
                if golden:
                    golden_sent = new_sent

        agenda.append(now)
        if truth is not None and truth[idx - 1] > 0 and (not now.golden):
            return now.score_expr - golden_sent.score_expr

    if truth is not None:
        return now.score_expr - golden_sent.score_expr

    return agenda

def __convolve__(self, embeddings, F, b, W1, bW1):
    sntlen = len(embeddings)
    emb = dy.concatenate_cols(embeddings)
    x = dy.conv2d_bias(emb, F, b, [1, 1], is_valid=False)
    x = dy.rectify(x)
    x = dy.maxpooling2d(x, [1, sntlen], [1, 1], is_valid=True)
    if self.DROPOUT > 0:
        # dy.dropout returns a new expression, so the result must be reassigned
        x = dy.dropout(x, self.DROPOUT)
    f = dy.reshape(x, (self.EMB_DIM * 1 * 100,))
    return W1 * f + bW1

def __call__(self, x, mask=None, train=False):
    """Input: ((H, T), B)"""
    x = self.ln1(x)
    y = self.self_attn(x, x, x, mask, train)
    y = dy.dropout(y, self.pdrop) if train else y
    x = x + y

    x = self.ln2(x)
    y = self.ffn(x, train)
    y = dy.dropout(y, self.pdrop) if train else y
    x = x + y
    return x

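# Illustrative sketch (hypothetical helper): blocks like the one above are usually
# stacked, threading the same `train` flag through every block so dy.dropout is
# active only during training.
def run_encoder(blocks, x, mask=None, train=False):
    for block in blocks:
        x = block(x, mask=mask, train=train)
    return x
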
def build_graph(self, features):
    # extract word, tag, and dependency ids
    word_ids = [self.vocab.word2id(word_feat) for word_feat in features[0:20]]
    tag_ids = [self.vocab.tag2id(tag_feat) for tag_feat in features[20:40]]
    dep_ids = [self.vocab.dep2id(dep_feat) for dep_feat in features[40:]]
    # look up word, tag, and dependency embeddings for the features
    word_embeds = [self.word_embedding[wid] for wid in word_ids]
    tag_embeds = [self.tag_embedding[tid] for tid in tag_ids]
    dep_embeds = [self.dep_embedding[did] for did in dep_ids]
    # concatenate all feature embeddings (recall that '+' for lists appends them)
    embedding_layer = dynet.concatenate(word_embeds + tag_embeds + dep_embeds)
    # first hidden layer, followed by dropout before the second hidden layer
    hidden1 = self.transfer(self.hidden_layer1 * embedding_layer + self.hidden_layer_bias1)
    dropout1 = dynet.dropout(hidden1, 0.1)
    hidden2 = self.transfer(self.hidden_layer2 * dropout1 + self.hidden_layer_bias2)
    # To run the network without dropout, remove the dropout1 line and use:
    # hidden2 = self.transfer(self.hidden_layer2 * hidden1 + self.hidden_layer_bias2)
    # output layer
    output = self.output_layer * hidden2 + self.output_bias
    # return the output as a dynet vector (expression)
    return output

def __call__(self, x, memory, src_mask, tgt_mask, train=False):
    """Input shape: ((H, T), B)"""
    x = self.ln1(x)
    y = self.self_attn(x, x, x, tgt_mask, train)
    y = dy.dropout(y, self.pdrop) if train else y
    x = x + y

    x = self.ln2(x)
    y = self.src_attn(x, memory, memory, src_mask)
    y = dy.dropout(y, self.pdrop) if train else y
    x = x + y

    x = self.ln3(x)
    y = self.ffn(x, train)
    y = dy.dropout(y, self.pdrop) if train else y
    x = x + y
    return x

def calc_score_of_history(words, dropout=0.0):
    # Look up the embeddings and concatenate them
    emb = dy.concatenate([W_emb[x] for x in words])
    # Create the hidden layer
    h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
    # CHANGE 2: perform dropout
    if dropout != 0.0:
        h = dy.dropout(h, dropout)
    # Calculate the score and return
    return dy.affine_transform([b_sm, W_sm, h])

def calc_score_of_histories(words, dropout=0.0):
    # Change from a list of histories to a list of words in each history position
    words = np.transpose(words)
    # Look up the embeddings and concatenate them
    emb = dy.concatenate([dy.lookup_batch(W_emb, x) for x in words])
    # Create the hidden layer
    h = dy.tanh(dy.affine_transform([b_h, W_h, emb]))
    # Perform dropout
    if dropout != 0.0:
        h = dy.dropout(h, dropout)
    # Calculate the score and return
    return dy.affine_transform([b_sm, W_sm, h])

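# Illustrative usage sketch (hypothetical variables `histories` and `next_words`):
# a non-zero dropout rate is passed only during training; the default of 0.0 keeps
# the batched scorer above deterministic at evaluation time.
train_scores = calc_score_of_histories(histories, dropout=0.2)
train_loss = dy.sum_batches(dy.pickneglogsoftmax_batch(train_scores, next_words))
test_scores = calc_score_of_histories(histories)  # no dropout at evaluation time
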
def dot_product_attention(query, key, value, mask=None, dropout=None):
    """Input shape: ((D, T, H), B)"""
    scores = batch_matmul(transpose(key, 0, 1), query)
    if mask is not None:
        scores = dy.cmult(scores, mask[0]) + (mask[1] * -1e9)
    weights = folded_softmax(scores)
    if dropout is not None:
        weights = dy.dropout(weights, dropout)
    return batch_matmul(value, weights)

def __call__(self, x, dropout=False):
    if args.conv:
        x = dy.reshape(x, (28, 28, 1))
        x = dy.conv2d_bias(x, self.F1, self.b1, [1, 1], is_valid=False)
        x = dy.rectify(dy.maxpooling2d(x, [2, 2], [2, 2]))
        x = dy.conv2d_bias(x, self.F2, self.b2, [1, 1], is_valid=False)
        x = dy.rectify(dy.maxpooling2d(x, [2, 2], [2, 2]))
        # the feature map is now 7x7x64
        x = dy.reshape(x, (7 * 7 * 64,))
    h = dy.rectify(self.W1 * x + self.hbias)
    if dropout:
        h = dy.dropout(h, DROPOUT_RATE)
    logits = self.W2 * h
    return logits

def evaluate(self, inputs, train=False):
    """
    Apply all MLP layers to the concatenated input.
    :param inputs: (key, vector) per feature type
    :param train: are we training now?
    :return: output vector of size self.output_dim
    """
    input_keys, inputs = list(map(list, zip(*list(inputs))))
    if self.input_keys:
        assert input_keys == self.input_keys, "Got: %s\nBut expected input keys: %s" % (
            self.input_keys_str(self.input_keys), self.input_keys_str(input_keys))
    else:
        self.input_keys = input_keys
    if self.gated:
        gates = self.params.get("gates")
        if gates is None:  # FIXME attention weights should not be just parameters, but based on biaffine product?
            gates = self.params["gates"] = self.model.add_parameters(
                (len(inputs), self.gated), init=dy.UniformInitializer(1))
        input_dims = [i.dim()[0][0] for i in inputs]
        max_dim = max(input_dims)
        x = dy.concatenate_cols(
            [dy.concatenate([i, dy.zeroes(max_dim - d)])  # pad with zeros to get uniform dim
             if d < max_dim else i
             for i, d in zip(inputs, input_dims)]) * gates
        # possibly multiple "attention heads" -- concatenate outputs to one vector
        inputs = [dy.reshape(x, (x.dim()[0][0] * x.dim()[0][1],))]
    x = dy.concatenate(inputs)
    assert len(x.dim()[0]) == 1, "Input should be a vector, but has dimension " + str(x.dim()[0])
    dim = x.dim()[0][0]
    if self.input_dim:
        assert dim == self.input_dim, "Input dim mismatch: %d != %d" % (dim, self.input_dim)
    else:
        self.init_params(dim)
    self.config.print(self, level=4)
    if self.total_layers:
        if self.weights is None:
            self.weights = [[self.params[prefix + str(i)] for prefix in ("W", "b")]
                            for i in range(self.total_layers)]
            if self.weights[0][0].dim()[0][1] < dim:  # number of columns in W0
                self.weights[0][0] = dy.concatenate_cols([self.weights[0][0], self.params["W0+"]])
        for i, (W, b) in enumerate(self.weights):
            self.config.print(lambda: x.npvalue().tolist(), level=4)
            try:
                if train and self.dropout:
                    x = dy.dropout(x, self.dropout)
                x = self.activation()(W * x + b)
            except ValueError as e:
                raise ValueError("Error in evaluating layer %d of %d" % (i + 1, self.total_layers)) from e
    self.config.print(lambda: x.npvalue().tolist(), level=4)
    return x

def __call__(self, inputs, dropout=False):
    x = dy.inputTensor(inputs)
    conv1 = dy.parameter(self.pConv1)
    b1 = dy.parameter(self.pB1)
    x = dy.conv2d_bias(x, conv1, b1, [1, 1], is_valid=False)
    x = dy.rectify(dy.maxpooling2d(x, [2, 2], [2, 2]))
    conv2 = dy.parameter(self.pConv2)
    b2 = dy.parameter(self.pB2)
    x = dy.conv2d_bias(x, conv2, b2, [1, 1], is_valid=False)
    x = dy.rectify(dy.maxpooling2d(x, [2, 2], [2, 2]))
    x = dy.reshape(x, (7 * 7 * 64, 1))
    w1 = dy.parameter(self.pW1)
    b3 = dy.parameter(self.pB3)
    h = dy.rectify(w1 * x + b3)
    if dropout:
        h = dy.dropout(h, DROPOUT_RATE)
    w2 = dy.parameter(self.pW2)
    output = w2 * h
    return output

def backward(self, char_seq, truth):
    self.renew_cg()
    cembs = [dy.dropout(dy.lookup(self.params['embed'], char), self.options['dropout_rate'])
             for char in char_seq]

    word_seq, word = [], []
    for char, label in zip(cembs, truth):
        word.append(char)
        if label > 0:
            word_seq.append(word)
            word = []

    score = self.truth_score(word_seq)
    score_plus_margin_loss = self.beam_search(cembs, truth, self.options['margin_loss_discount'])
    loss = score_plus_margin_loss - score

    res = loss.scalar_value()
    loss.backward()
    return res

def __call__(self, x, train=False):
    """Input: ((H, T), B). Output: ((H, T), B)."""
    x = self.act(self.expand(x))
    x = dy.dropout(x, self.pdrop) if train else x
    return self.contract(x)

def encode(input_, train):
    x = conv(input_)
    x = dy.dropout(x, pdrop) if train else x
    return x

def dropout(self, input_):
    if self.train:
        return dy.dropout(input_, self.pdrop)
    return input_

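# Illustrative usage sketch (hypothetical `model` object and expression `h`): because
# the helper above gates on self.train, flipping that flag is enough to disable
# dropout for evaluation without touching the forward pass.
model.train = True
h = model.dropout(h)   # dropout applied with rate model.pdrop
model.train = False
h = model.dropout(h)   # returned unchanged
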