def apply(self, sent1, sent2):
    """Score a sentence pair with an attend / compare / aggregate network.

    Both sentences are looked up through ``self.embedding.all_embeds_from_ix``
    and projected with ``self.linear``. ``self.feed_F`` produces the
    representations used for the attention scores, ``self.feed_G`` processes
    each sentence concatenated with its attention-weighted counterpart, and
    two dropout + ReLU layers followed by ``self.linear2`` produce the output.

    :param sent1: first sentence, in the form expected by ``all_embeds_from_ix``
    :param sent2: second sentence, same form
    :return: transposed output expression of the final linear layer
    """
    projection = dy.parameter(self.linear)
    sent1 = dy.inputTensor(self.embedding.all_embeds_from_ix(sent1)) * projection
    sent2 = dy.inputTensor(self.embedding.all_embeds_from_ix(sent2)) * projection

    # Attend: score the two sentences against each other, in both directions.
    attended1, attended2 = self.feed_F(sent1, sent2)
    raw_scores = attended1 * dy.transpose(attended2)
    attn_1_to_2 = dy.softmax(raw_scores)
    attn_2_to_1 = dy.softmax(dy.transpose(raw_scores))

    # Compare: pair every sentence with its soft-aligned counterpart.
    aligned1 = dy.concatenate_cols([sent1, attn_1_to_2 * sent2])
    aligned2 = dy.concatenate_cols([sent2, attn_2_to_1 * sent1])
    compared1, compared2 = self.feed_G(aligned1, aligned2)

    # Aggregate: sum along dimension 0 and classify the concatenation.
    summed = dy.concatenate([dy.sum_dim(compared1, [0]),
                             dy.sum_dim(compared2, [0])])
    hidden = dy.transpose(summed)
    for layer in (self.h_step_1, self.h_step_2):
        weights = dy.parameter(layer)
        hidden = dy.rectify(dy.dropout(hidden, 0.2) * weights)

    output_weights = dy.parameter(self.linear2)
    return dy.transpose(hidden * output_weights)
def transduce(self, src: ExpressionSequence) -> ExpressionSequence:
    """Encode ``src`` with three convolution layers into one normalised vector.

    The input tensor goes through two (pad, conv, ReLU, max-pool) stages and a
    third (pad, conv, ReLU) stage, is max-reduced along dimension 1, divided by
    its L2 norm and reshaped to ``(self.num_filters[2],)`` per batch element.

    The shape comments below are the original author's example values.

    :param src: input sequence; converted with ``as_tensor()``
    :return: an ``ExpressionSequence`` wrapping the normalised output tensor
    """
    src = src.as_tensor()
    batch_size = src.dim()[1]

    # convolution and pooling layers
    # src dim is ((40, 1000), 128)
    src = padding(src, self.filter_width[0] + 3)
    l1 = dy.rectify(
        dy.conv2d(src, dy.parameter(self.filters1),
                  stride=[self.stride[0], self.stride[0]],
                  is_valid=True))  # ((1, 1000, 64), 128)
    pool1 = dy.maxpooling2d(l1, (1, 4), (1, 2), is_valid=True)  # ((1, 499, 64), 128)

    pool1 = padding(pool1, self.filter_width[1] + 3)
    l2 = dy.rectify(
        dy.conv2d(pool1, dy.parameter(self.filters2),
                  stride=[self.stride[1], self.stride[1]],
                  is_valid=True))  # ((1, 499, 512), 128)
    pool2 = dy.maxpooling2d(l2, (1, 4), (1, 2), is_valid=True)  # ((1, 248, 512), 128)

    pool2 = padding(pool2, self.filter_width[2])
    l3 = dy.rectify(
        dy.conv2d(pool2, dy.parameter(self.filters3),
                  stride=[self.stride[2], self.stride[2]],
                  is_valid=True))  # ((1, 248, 1024), 128)
    pool3 = dy.max_dim(l3, d=1)

    # The small constant keeps the division defined when the norm is zero.
    my_norm = dy.l2_norm(pool3) + 1e-6
    output = dy.cdiv(pool3, my_norm)
    output = dy.reshape(output, (self.num_filters[2],), batch_size=batch_size)
    return ExpressionSequence(expr_tensor=output)
def pop(self, strength):
    """Reduce the strengths of the stored elements, last element first.

    Walks ``self.elements`` in reverse. Each element's strength is lowered by
    the (rectified) amount still to be removed and clamped at zero; the amount
    still to be removed is then decreased by the element's previous strength.

    :param strength: total strength to remove
    """
    remaining = strength
    for item in reversed(self.elements):
        previous = item.strength
        item.strength = dynet.rectify(previous - dynet.rectify(remaining))
        # May go negative; it is rectified again before the next use.
        remaining -= previous
def beam_train_max_margin_with_answer_guidence(self, init_state, gold_ans):
    """Run one max-margin training step using two beam searches.

    One search (``beam_predict_max_violation``) yields candidate end states,
    from which the state maximising ``score - reward`` is selected. The other
    (``beam_find_actions_with_answer_guidence``) yields reference states
    guided by ``gold_ans``. The loss is

        max(f(x, y') - f(x, y) + R(y) - R(y'), 0)

    where ``y'`` is the selected violating state and ``y`` a reference state.
    With ``self.only_one_best`` only the first reference state is used;
    otherwise the loss is averaged over all reference states with the
    maximal reward.

    :param init_state: initial search state
    :param gold_ans: gold answer used for rewards and for guiding the search
    :return: ``(loss_value, best_states)``; ``(0, [])`` without an update
        when the guided search finds no state
    """
    # perform two beam search; one for prediction and the other for state action suff
    # max reward y = argmax(r(y)) with the help of gold_ans
    # max y' = argmax f(x,y) - R(y')
    # loss = max(f(x,y') - f(x,y) + R(y) - R(y') , 0)
    end_state_list = self.beam_predict_max_violation(
        init_state, gold_ans)  # have to use this to make it work....
    reward_list = [x.reward(gold_ans) for x in end_state_list]
    violation_list = [
        s.path_score_expression.value() - reward
        for s, reward in zip(end_state_list, reward_list)
    ]

    # find the best scoring seq with minimal reward
    best_score_state_idx = violation_list.index(max(violation_list))
    best_score_state = end_state_list[best_score_state_idx]
    best_score_state_reward = reward_list[best_score_state_idx]

    best_states = self.beam_find_actions_with_answer_guidence(
        init_state, gold_ans)
    # Nothing to learn from if the guided search found no state. This guard
    # previously existed only on the only_one_best path; the other path
    # crashed in max() on an empty list.
    if not best_states:
        return 0, []

    if self.only_one_best:
        best_reward_state = best_states[0]
        best_reward_state_reward = best_reward_state.reward(gold_ans)
        loss = dt.rectify(
            best_score_state.path_score_expression
            - best_reward_state.path_score_expression
            + dt.scalarInput(best_reward_state_reward - best_score_state_reward))
    else:
        best_states_rewards = [s.reward(gold_ans) for s in best_states]
        max_reward = max(best_states_rewards)
        best_states = [
            s for s, r in zip(best_states, best_states_rewards)
            if r == max_reward
        ]
        loss = dt.average([
            dt.rectify(
                best_score_state.path_score_expression
                - best_reward_state.path_score_expression
                + dt.scalarInput(max_reward - best_score_state_reward))
            for best_reward_state in best_states
        ])

    loss_value = loss.value()
    loss.backward()
    self.neural_model.learner.update()
    return loss_value, best_states
def scorer(self, q_d_hists, q_idf, bm25_score, overlap_features, p):
    """Compute the relevance score of one query/document pair.

    Each histogram in ``q_d_hists`` is passed through the MLP
    (``W_1``/``b_1``, ``mlp_layers`` hidden layers, ``W_last``/``b_last``);
    the per-term outputs are combined with gating weights derived from
    ``q_idf``, concatenated with ``overlap_features`` and fed to the final
    ``W_scores``/``b_scores`` layer.

    :param q_d_hists: one histogram per query term
    :param q_idf: one IDF value per query term
    :param bm25_score: scalar BM25 score
    :param overlap_features: extra feature vector
    :param p: probability of keeping the BM25 unit active
    :return: the document score expression
    """
    idf_vec = dy.inputVector(q_idf)
    bm25_score = dy.scalarInput(bm25_score)
    overlap_features = dy.inputVector(overlap_features)

    # Pass each query term representation through the MLP
    term_scores = []
    for hist in q_d_hists:
        hidden = dy.reshape(dy.inputVector(hist), (1, len(hist)))
        hidden = dy.rectify(hidden * self.W_1 + self.b_1)
        for layer in range(self.mlp_layers):
            hidden = dy.rectify(hidden * self.W_n[layer] + self.b_n[layer])
        term_scores.append(hidden * self.W_last + self.b_last)

    # Term Gating
    gating_weights = idf_vec * self.w_g

    # NOTE(review): bm25_feature is computed (and consumes one random draw)
    # but is not part of the returned score -- confirm this is intended.
    bm25_feature = bm25_score * self.W_bm25 + self.b_bm25
    drop_out = dy.scalarInput(1)
    keep_scale = (np.random.rand(1) < p) / p  # p = probability of keeping a unit active
    drop_out.set(keep_scale)
    bm25_feature *= drop_out

    # basic MLPs output
    stacked_terms = dy.transpose(dy.concatenate(term_scores))
    drmm_score = stacked_terms * dy.reshape(gating_weights, (len(q_idf), 1))

    # extra features layer
    features = dy.transpose(dy.concatenate([drmm_score, overlap_features]))
    return features * self.W_scores + self.b_scores
def leaky_relu(x, negative_slope=0.01):
    """Leaky ReLU: ``x`` where positive, ``negative_slope * x`` elsewhere.

    Built from two rectifiers: ``rectify(x)`` keeps the positive part and
    ``rectify(-x) * -negative_slope`` contributes the scaled negative part.

    :type x: dn.Expression
    :param negative_slope: multiplier applied to negative inputs; the default
        reproduces the previously hard-coded value
    :type negative_slope: float
    :rtype: dn.Expression
    """
    positive = dn.rectify(x)
    negative = dn.rectify(-x) * -negative_slope
    return positive + negative
def transduce(self, embed_sent):
    """Run the stacked convolutional transducer over an embedded sentence.

    The sentence tensor is zero-padded on both sides along dimension 1, passed
    through the first convolution, every layer in ``self.builder_layers`` and
    the last convolution. The configured non-linearity is applied after every
    layer except the last. ``self._final_states`` is set from the last element
    of the output sequence.

    :param embed_sent: embedded sentence exposing ``as_tensor()``
    :return: an ``ExpressionSequence`` wrapping the output tensor
    """
    def activate(expr):
        # Strings are compared with ==; identity (`is`) against a literal is
        # not reliable. Unknown names (and 'linear') leave the input as is.
        if self.non_linearity == 'tanh':
            return dy.tanh(expr)
        if self.non_linearity == 'relu':
            return dy.rectify(expr)
        if self.non_linearity == 'sigmoid':
            return dy.logistic(expr)
        return expr

    src = embed_sent.as_tensor()
    sent_len = src.dim()[0][1]
    batch_size = src.dim()[1]

    # Integer division: dimensions must be ints, and `/` yields a float on
    # Python 3.
    pad_size = (self.window_receptor - 1) // 2  # TODO adapt it also for even window size
    src = dy.concatenate([
        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
        src,
        dy.zeroes((self.input_dim, pad_size), batch_size=batch_size),
    ], d=1)
    padded_sent_len = sent_len + 2 * pad_size

    conv1 = dy.parameter(self.pConv1)
    bias1 = dy.parameter(self.pBias1)
    src_chn = dy.reshape(src, (self.input_dim, padded_sent_len, 1),
                         batch_size=batch_size)
    cnn_layer1 = dy.conv2d_bias(src_chn, conv1, bias1, stride=[1, 1])
    hidden_layer = dy.reshape(cnn_layer1, (self.internal_dim, sent_len, 1),
                              batch_size=batch_size)
    hidden_layer = activate(hidden_layer)

    for conv_hid, bias_hid in self.builder_layers:
        hidden_layer = dy.conv2d_bias(hidden_layer, dy.parameter(conv_hid),
                                      dy.parameter(bias_hid), stride=[1, 1])
        hidden_layer = dy.reshape(hidden_layer,
                                  (self.internal_dim, sent_len, 1),
                                  batch_size=batch_size)
        hidden_layer = activate(hidden_layer)

    last_conv = dy.parameter(self.last_conv)
    last_bias = dy.parameter(self.last_bias)
    output = dy.conv2d_bias(hidden_layer, last_conv, last_bias, stride=[1, 1])
    output = dy.reshape(output, (sent_len, self.output_dim),
                        batch_size=batch_size)

    output_seq = ExpressionSequence(expr_tensor=output)
    self._final_states = [FinalTransducerState(output_seq[-1])]
    return output_seq
def __call__(self, a, b, c):
    """Combine three inputs through per-input MLPs and a multilinear form.

    Each input goes through its own MLP and a ReLU, gets a constant 1
    prepended, and the three resulting vectors are passed to
    ``self.multilinear``.
    """
    branches = ((self.a_mlp, a), (self.b_mlp, b), (self.c_mlp, c))
    # HOTFIX rectify here?
    hidden = [dy.rectify(mlp(value)) for mlp, value in branches]
    with_bias = [dy.concatenate([dy.scalarInput(1), h]) for h in hidden]
    return self.multilinear(*with_bias)
def do_one_batch(X_batch, Z_batch):
    """Compute the rectified similarity between a speech input and a VGG input.

    ``X_batch`` goes through three (conv, ReLU, k-max pooling) stages using the
    module-level parameters ``w_i``/``b_i``, ``w_h1``/``b_h1``, ``w_h2``/``b_h2``
    and the pooling sizes in ``D``, followed by a final 1-max pooling, a
    reshape to ``(J[3],)`` and ``dy.weight_norm``. ``Z_batch`` is mapped
    through ``w_embed``/``b_embed``. The return value is the ReLU of the
    product of the two embeddings.

    NOTE(review): the source was collapsed onto one line, so the extent of the
    ``else`` branch below is a reconstruction -- confirm against the original
    layout.

    :param X_batch: numpy array holding the speech features
    :param Z_batch: numpy array holding the VGG features
    :return: the rectified similarity expression
    """
    # Flatten the batch into 1-D vector for workaround
    batch_size = X_batch.shape[0]
    if DO_BATCH:
        X_batch_f = X_batch.flatten('F')
        Z_batch_f = Z_batch.flatten('F')
        x = dy.reshape(dy.inputVector(X_batch_f), (nmf, nframes), batch_size=batch_size)
        # NOTE(review): `(nvgg)` is a plain value, not a 1-tuple -- confirm
        # that dy.reshape accepts it here.
        z = dy.reshape(dy.inputVector(Z_batch_f), (nvgg), batch_size=batch_size)
        scnn.add_input([X_batch[i] for i in range(X_batch.shape[0])])
        vgg.add_input([Z_batch[i] for i in range(X_batch.shape[0])])
    else:
        x = dy.matInput(X_batch.shape[0], X_batch.shape[1])
        x.set(X_batch.flatten('F'))
        z = dy.vecInput(Z_batch.shape[0])
        z.set(Z_batch.flatten('F'))
        # Swap the two axes and add a leading singleton dimension.
        x = dy.reshape(dy.transpose(x, [1, 0]), (1, X_batch.shape[1], X_batch.shape[0]))
    print(x.npvalue().shape)

    # Three conv + ReLU + k-max pooling stages.
    a_h1 = dy.conv2d_bias(x, w_i, b_i, [1, 1], is_valid=False)
    h1 = dy.rectify(a_h1)
    h1_pool = dy.kmax_pooling(h1, D[1], d=1)
    a_h2 = dy.conv2d_bias(h1_pool, w_h1, b_h1, [1, 1], is_valid=False)
    h2 = dy.rectify(a_h2)
    h2_pool = dy.kmax_pooling(h2, D[2], d=1)
    a_h3 = dy.conv2d_bias(h2_pool, w_h2, b_h2, [1, 1], is_valid=False)
    h3 = dy.rectify(a_h3)
    h3_pool = dy.kmax_pooling(h3, D[3], d=1)
    # Keep only the single largest value along dimension 1.
    h4 = dy.kmax_pooling(h3_pool, 1, d=1)
    h4_re = dy.reshape(h4, (J[3], ))
    #print(h4_re.npvalue().shape)
    # Weight-normalise the speech embedding with a constant gain of 1.
    g = dy.scalarInput(1.)
    zem_sp = dy.weight_norm(h4_re, g)
    #print(zem_sp.npvalue().shape)
    zem_vgg = w_embed * z + b_embed
    #print(zem_vgg.npvalue().shape)
    # Similarity of the two embeddings, then clamped at zero.
    sa = dy.transpose(zem_sp) * zem_vgg
    s = dy.rectify(sa)
    if PRINT_EMBED:
        print('Vgg embedding vector:', zem_vgg.npvalue().shape)
        print(zem_vgg.value())
        print('Speech embedding vector:', zem_sp.npvalue().shape)
        print(zem_sp.value())
    if PRINT_SIM:
        print('Raw Similarity:', sa.npvalue())
        print(sa.value())
        print('Similarity:', s.npvalue())
        print(s.value())
    return s
def __call__(self, sentence1, sentence2):
    """Apply the same two-layer dropout + ReLU network to both sentences.

    Both layers share their weights (``self.W_1`` and ``self.W_2``) across
    the two inputs; dropout uses ``self.drop_param``.

    :return: tuple ``(out1, out2)`` of transformed sentences
    """
    hidden1, hidden2 = sentence1, sentence2
    for layer in (self.W_1, self.W_2):
        weights = dy.parameter(layer)
        # relu activation with dropout
        hidden1 = dy.rectify(dy.dropout(hidden1, self.drop_param) * weights)
        hidden2 = dy.rectify(dy.dropout(hidden2, self.drop_param) * weights)
    return hidden1, hidden2
def selu(x):
    """SELU built from rectifiers, using module-level alpha, scale, epsilon.

    x > 0: result is ``x * scale``;
    x < 0: result is ``(alpha * exp(x) - alpha) * scale``.

    :type x: dn.Expression
    :rtype: dn.Expression
    """
    pos_part = dn.rectify(x)
    # Approximately 1 where x > 0 and exactly 0 elsewhere; epsilon avoids 0/0.
    is_positive = dn.rectify(dn.cdiv(pos_part, pos_part + epsilon))
    neg_part = -dn.rectify(-x)
    # exp(0) = 1 on the positive side is cancelled by the indicator.
    neg_exp = dn.exp(neg_part) - is_positive
    neg_branch = neg_exp * alpha - alpha + is_positive * alpha
    return (pos_part + neg_branch) * scale
def forward(self, s1, s2, label=None):
    """Run the attend / compare / aggregate network on a sentence pair.

    :param s1: first sentence as an array accepted by ``dy.inputTensor``
    :param s2: second sentence, same form
    :param label: gold label index, or ``None`` for prediction
    :return: the negative log-softmax loss expression for ``label`` when a
        label is given, otherwise the index of the highest-scoring class
    """
    eL = dy.parameter(self.embeddingLinear)
    s1 = dy.inputTensor(s1) * eL
    s2 = dy.inputTensor(s2) * eL

    # F step
    Lf1 = dy.parameter(self.mlpF1)
    Fs1 = dy.rectify(dy.dropout(s1, 0.2) * Lf1)
    Fs2 = dy.rectify(dy.dropout(s2, 0.2) * Lf1)
    Lf2 = dy.parameter(self.mlpF2)
    Fs1 = dy.rectify(dy.dropout(Fs1, 0.2) * Lf2)
    Fs2 = dy.rectify(dy.dropout(Fs2, 0.2) * Lf2)

    # Attention scoring
    score1 = Fs1 * dy.transpose(Fs2)
    prob1 = dy.softmax(score1)
    score2 = dy.transpose(score1)
    prob2 = dy.softmax(score2)

    # Align pairs using attention
    s1Pairs = dy.concatenate_cols([s1, prob1 * s2])
    s2Pairs = dy.concatenate_cols([s2, prob2 * s1])

    # G step
    Lg1 = dy.parameter(self.mlpG1)
    Gs1 = dy.rectify(dy.dropout(s1Pairs, 0.2) * Lg1)
    Gs2 = dy.rectify(dy.dropout(s2Pairs, 0.2) * Lg1)
    Lg2 = dy.parameter(self.mlpG2)
    Gs1 = dy.rectify(dy.dropout(Gs1, 0.2) * Lg2)
    Gs2 = dy.rectify(dy.dropout(Gs2, 0.2) * Lg2)

    # Sum
    Ss1 = dy.sum_dim(Gs1, [0])
    Ss2 = dy.sum_dim(Gs2, [0])
    concatS12 = dy.transpose(dy.concatenate([Ss1, Ss2]))

    # H step
    Lh1 = dy.parameter(self.mlpH1)
    Hs = dy.rectify(dy.dropout(concatS12, 0.2) * Lh1)
    Lh2 = dy.parameter(self.mlpH2)
    Hs = dy.rectify(dy.dropout(Hs, 0.2) * Lh2)

    # Final layer
    final_layer = dy.parameter(self.final_layer)
    final = dy.transpose(Hs * final_layer)

    # Label can be 0, so test identity against None rather than truthiness.
    if label is not None:
        return dy.pickneglogsoftmax(final, label)
    out = dy.softmax(final)
    return np.argmax(out.npvalue())
def transform(sentence):
    """Apply the two-layer ReLU transform to ``sentence``.

    Uses the module-level parameters ``transform_w1``/``transform_b1`` and
    ``transform_w2``/``transform_b2``; each layer computes
    ``rectify(colwise_add(w * input, b))``.
    """
    layers = [
        (dy.parameter(transform_w1), dy.parameter(transform_b1)),
        (dy.parameter(transform_w2), dy.parameter(transform_b2)),
    ]
    hidden = sentence
    for weight, bias in layers:
        hidden = dy.rectify(dy.colwise_add(weight * hidden, bias))
    return hidden
def cal_scores(self, src_encodings):
    """Compute arc scores and per-label scores from the encoded sentence.

    Four ReLU projections of the encodings are built (arc head, arc dependent,
    label head, label dependent). Arc scores use ``U_arc_1``/``u_arc_2``;
    each label score is the sum of a bilinear term, two linear terms and a
    bias.

    :param src_encodings: list of encoding expressions, one per position
    :return: ``(s_arc, s_label)`` where ``s_label`` is a list with one score
        expression per label parameter set
    """
    src_len = len(src_encodings)
    src_encodings = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size

    def project(weight, bias):
        # ReLU(W * encodings + b)
        return dy.rectify(dy.affine_transform(
            [dy.parameter(bias), dy.parameter(weight), src_encodings]))

    h_arc_head = project(self.W_arc_hidden_to_head,
                         self.b_arc_hidden_to_head)  # n_arc_ml_units, src_len, bs
    h_arc_dep = project(self.W_arc_hidden_to_dep, self.b_arc_hidden_to_dep)
    h_label_head = project(self.W_label_hidden_to_head,
                           self.b_label_hidden_to_head)
    h_label_dep = project(self.W_label_hidden_to_dep,
                          self.b_label_hidden_to_dep)

    arc_head_t = dy.transpose(h_arc_head)
    label_head_t = dy.transpose(h_label_head)

    arc_bilinear = dy.parameter(self.U_arc_1)
    arc_linear = dy.parameter(self.u_arc_2)
    s_arc = arc_head_t * dy.colwise_add(arc_bilinear * h_arc_dep, arc_linear)

    label_params = zip(self.U_label_1, self.u_label_2_1,
                       self.u_label_2_2, self.b_label)
    s_label = []
    for bilinear_p, head_p, dep_p, bias_p in label_params:
        bilinear = dy.parameter(bilinear_p)
        head_vec = dy.parameter(head_p)
        dep_vec = dy.parameter(dep_p)
        bias = dy.parameter(bias_p)
        bilinear_term = label_head_t * bilinear * h_label_dep
        head_term = label_head_t * head_vec * dy.ones((1, src_len))
        dep_term = dy.ones((src_len, 1)) * dep_vec * h_label_dep
        s_label.append(bilinear_term + head_term + dep_term + bias)
    return s_arc, s_label
def __call__(self, x, dropout=False):
    """Compute class logits for one input.

    When ``args.conv`` is set, the input is reshaped to 28x28x1 and passed
    through two (conv, 2x2 max-pool, ReLU) stages before the dense layers.

    :param x: input expression
    :param dropout: apply dropout with ``DROPOUT_RATE`` to the hidden layer
    :return: the logits expression
    """
    if args.conv:
        x = dy.reshape(x, (28, 28, 1))
        for kernel, bias in ((self.F1, self.b1), (self.F2, self.b2)):
            x = dy.conv2d_bias(x, kernel, bias, [1, 1], is_valid=False)
            x = dy.rectify(dy.maxpooling2d(x, [2, 2], [2, 2]))
        # 7x7x64
        x = dy.reshape(x, (7 * 7 * 64, ))

    h = dy.rectify(self.W1 * x + self.hbias)
    if dropout:
        h = dy.dropout(h, DROPOUT_RATE)
    return self.W2 * h
def __call__(self, x, dropout=False):
    """Return the logits for input ``x``.

    With ``args.conv`` enabled the input first goes through the convolutional
    front end (reshape to 28x28x1, then conv / max-pool / ReLU twice, then
    flatten). The dense part is a ReLU hidden layer, optional dropout, and a
    linear output layer.
    """
    if args.conv:
        image = dy.reshape(x, (28, 28, 1))

        conv_a = dy.conv2d_bias(image, self.F1, self.b1, [1, 1], is_valid=False)
        stage_a = dy.rectify(dy.maxpooling2d(conv_a, [2, 2], [2, 2]))

        conv_b = dy.conv2d_bias(stage_a, self.F2, self.b2, [1, 1], is_valid=False)
        stage_b = dy.rectify(dy.maxpooling2d(conv_b, [2, 2], [2, 2]))

        # 7x7x64
        x = dy.reshape(stage_b, (7 * 7 * 64,))

    hidden = dy.rectify(self.W1 * x + self.hbias)
    hidden = dy.dropout(hidden, DROPOUT_RATE) if dropout else hidden
    logits = self.W2 * hidden
    return logits
def set_E_matrix(sen1, sen2, model_params):
    """Project both sentences with the F network and score them pairwise.

    :param sen1: first sentence expression
    :param sen2: second sentence expression
    :param model_params: mapping with keys 'F_w1', 'F_b1', 'F_w2', 'F_b2'
    :return: ``(E_matrix, F_sen1, F_sen2)`` where
        ``E_matrix = transpose(F_sen1) * F_sen2``
    """
    F_w1 = model_params['F_w1']
    F_b1 = model_params['F_b1']
    F_w2 = model_params['F_w2']
    F_b2 = model_params['F_b2']

    def feed_forward(sentence):
        # Input dropout is disabled here:
        # sentence = dy.dropout(sentence, DROPOUT_RATE)
        first = dy.rectify(dy.colwise_add(F_w1 * sentence, F_b1))
        return dy.rectify(F_w2 * first + F_b2)

    F_sen1 = feed_forward(sen1)
    F_sen2 = feed_forward(sen2)
    return dy.transpose(F_sen1) * F_sen2, F_sen1, F_sen2
def get_v1_v2(alpha, beta, sen1, sen2, model_params):
    """Run the G network on each sentence joined with its aligned counterpart.

    ``sen1`` is concatenated with ``beta`` and ``sen2`` with ``alpha`` along
    dimension 0 before being fed through the shared two-layer ReLU network.

    :param model_params: mapping with keys 'G_w1', 'G_b1', 'G_w2', 'G_b2'
    :return: ``(v1, v2)``
    """
    G_w1 = model_params['G_w1']
    G_b1 = model_params['G_b1']
    G_w2 = model_params['G_w2']
    G_b2 = model_params['G_b2']

    def compare(sentence, aligned):
        joined = dy.concatenate([sentence, aligned], d=0)
        # Dropout on the joined input is disabled:
        # joined = dy.dropout(joined, DROPOUT_RATE)
        first = dy.rectify(dy.colwise_add(G_w1 * joined, G_b1))
        return dy.rectify(G_w2 * first + G_b2)

    v1 = compare(sen1, beta)
    v2 = compare(sen2, alpha)
    return v1, v2
def recurrence(self, xt, hmtm1, h_history_tm1, dropout_flag):
    """
    One step of the recurrent unit with attention over earlier hidden states.

    :param xt: input vector at the time step t
    :param hmtm1: hidden memories in previous n_steps steps
    :param h_history_tm1: history summary from the previous step
    :param dropout_flag: make a decision for conducting partial dropout
    :return: tuple (hmt, h_history_t): the updated hidden memories and the
        history summary computed at this step
    """
    def input_matrix(matrix):
        # partial dropout, i.e., dropout only over the matrices W_x*
        if dropout_flag:
            return dy.dropout(matrix, self.dropout_rate)
        return matrix

    attention_logits = []
    for step in range(self.n_steps):
        mixed = (self.W_h * hmtm1[step] + self.W_x * xt
                 + self.W_htilde * h_history_tm1)
        attention_logits.append(dy.dot_product(self.u, dy.tanh(mixed)))
    # normalize the attention score
    score = dy.softmax(dy.concatenate(attention_logits))

    # shape: (1, n_out), history of [h[t-n_steps-1], ..., h[t-2]]
    h_history_t = dy.reshape(dy.transpose(score) * hmtm1[:-1],
                             d=(self.n_out,))
    htm1 = hmtm1[-1]
    #h_tilde_t = dy.concatenate([h_history_t, htm1])
    h_tilde_t = htm1 + dy.rectify(h_history_t)

    # Gated update; the gates read h_tilde_t in place of the previous state.
    rt = dy.logistic(input_matrix(self.W_xr) * xt
                     + self.W_hr * h_tilde_t + self.br)
    zt = dy.logistic(input_matrix(self.W_xz) * xt
                     + self.W_hz * h_tilde_t + self.bz)
    ht_hat = dy.tanh(input_matrix(self.W_xh) * xt
                     + self.W_hh * dy.cmult(rt, h_tilde_t) + self.bh)
    ht = dy.cmult(zt, h_tilde_t) + dy.cmult((1.0 - zt), ht_hat)

    # Drop the first memory row and append the new hidden state.
    hmt = dy.concatenate([hmtm1[1:], dy.reshape(ht, (1, self.n_out))])
    return hmt, h_history_t
def loss_cost_sensitive_margin_last(gold_tags, idx, beam_costs_prev, scores, beam_size):
    """Cost-weighted margin loss between the gold item and a beam-boundary item.

    The flattened scores are ranked in descending order. If the gold index is
    inside the next beam it is compared with the first item outside the beam;
    otherwise with the last item inside the beam. The hinge term (margin 1.0)
    is multiplied by the cost difference between the two items.

    :return: the loss expression
    """
    beam_size_prev, num_tags = scores.dim()[0]
    gold_idx = dynet_get_best_flat_idx(gold_tags, idx, beam_costs_prev)
    costs_flat = dynet_compute_costs_flat(gold_tags, idx, beam_costs_prev)
    scores_flat = dy.reshape(scores, (beam_size_prev * num_tags,))
    ranking = np.argsort(scores_flat.npvalue())[::-1]

    # the beam size for the last transition is one.
    if idx < len(gold_tags) - 1:
        next_beam_size = beam_size
    else:
        next_beam_size = 1

    gold_in_beam = gold_idx in ranking[:next_beam_size]
    # gold inside the beam -> first item outside the beam;
    # gold outside the beam -> last item inside the beam.
    boundary = next_beam_size if gold_in_beam else next_beam_size - 1
    comp_idx = ranking[boundary]

    # NOTE: this can be zero if comp_idx has the same cost as gold_idx (desirable?)
    cost_delta = costs_flat[comp_idx] - costs_flat[gold_idx]
    hinge = dy.rectify(scores_flat[comp_idx] - scores_flat[gold_idx] + 1.0)
    return cost_delta * hinge
def _vaswani_model_scores(m):
    """Compute tag scores for the current step and store them in ``m``.

    Combines the language-model hidden states in ``m["beam_lm_hs"]`` with
    column ``m["idx"]`` of ``m["aux_c2"]``, applies the output layer
    (``o_W``/``o_b``) and, when ``cfg["accumulate_scores"]`` is set, adds
    ``m["acc_scores"]``. The result is written to ``m["scores"]``.

    :param m: mutable model-state dict
    :return: the score expression
    """
    lm_projection = c2_Wlm * m["beam_lm_hs"]
    step_context = dy.pick(m["aux_c2"], m["idx"], 1)
    out_c2 = dy.rectify(dy.colwise_add(lm_projection, step_context))

    # if cfg["use_beam_bilstm"]:
    #     _, beam_size_prev = out_c2.dim()[0]
    #     beam_hs = [dy.pick(out_c2, i, 1) for i in xrange(beam_size_prev)]
    #     bf_init = b_fwd.initial_state()
    #     bb_init = b_bwd.initial_state()
    #     bf_hs = dy.concatenate_cols(bf_init.transduce(beam_hs))
    #     bb_hs = dy.concatenate_cols(bb_init.transduce(reversed(beam_hs))[::-1])
    #     out_c2 = dy.concatenate([bf_hs, bb_hs])
    # if cfg["use_beam_mlp"]:
    #     out_b = dy.max_dim(b_W1 * out_c2 + b_b1, 1)
    #     out_c2 = dy.colwise_add(out_c2, dy.rectify(b_W2 * out_b + b_b2))

    scores = dy.transpose(o_W * out_c2 + o_b)
    if cfg["accumulate_scores"]:
        scores = m["acc_scores"] + scores
    m["scores"] = scores
    return scores
def _vaswani_model_init(e):
    """Build the initial model state for one example.

    Token embeddings (optionally concatenated with POS-tag embeddings when
    ``cfg["use_postags"]`` is set) are run through the forward and backward
    RNNs; their outputs are combined by ``c1_Wf``/``c1_Wb`` with a ReLU and
    projected with ``c2_Wc``.

    :param e: example dict with ``"tk_words"`` and, when POS tags are used,
        ``"tk_postags"``
    :return: model-state dict with keys ``"aux_c2"``, ``"beam_lm_states"``,
        ``"beam_lm_hs"``, ``"idx"`` and, when ``cfg["accumulate_scores"]`` is
        set, ``"acc_scores"``
    """
    w_embs = [w2e[idx] for idx in e["tk_words"]]
    if cfg["use_postags"]:
        pos_embs = [pos2e[idx] for idx in e["tk_postags"]]
        # enumerate replaces the Python-2-only xrange and still indexes
        # pos_embs, so a missing POS tag raises instead of being dropped.
        i_embs = [
            dy.concatenate([w_emb, pos_embs[i]])
            for i, w_emb in enumerate(w_embs)
        ]
    else:
        i_embs = w_embs

    f_init = fwd.initial_state()
    b_init = bwd.initial_state()
    lm_init = lm.initial_state()

    f_hs = dy.concatenate_cols(f_init.transduce(i_embs))
    # The backward RNN reads the sentence reversed; its outputs are flipped
    # back so both directions line up position by position.
    b_hs = dy.concatenate_cols(b_init.transduce(reversed(i_embs))[::-1])

    out_c1 = dy.rectify(c1_Wf * f_hs + c1_Wb * b_hs)
    aux_c2 = c2_Wc * out_c1

    m = {
        "aux_c2": aux_c2,
        "beam_lm_states": [lm_init],
        "beam_lm_hs": dy.zeros((cfg["lm_h_dim"], 1)),
        "idx": 0
    }
    if cfg["accumulate_scores"]:
        m["acc_scores"] = dy.zeros((1, 1))
    return m
def get_constit_loss(fws, bws, goldspans):
    """Compute the constituent classification loss over candidate spans.

    Every span ``(i, j)`` with ``i <= j`` (optionally limited to
    ``ALLOWED_SPANLEN`` when ``USE_SPAN_CLIP`` is set) is classified as
    constituent / non-constituent from ``fws[i][j]`` and ``bws[i][j]``.

    :param fws: forward span representations, indexed ``[i][j]``
    :param bws: backward span representations, indexed ``[i][j]``
    :param goldspans: collection of gold ``(i, j)`` spans
    :return: ``(loss, number_of_spans_scored)``; ``(None, 0)`` when there are
        no gold spans
    :raises Exception: when ``USE_PTB_CONSTITS`` is not set
    """
    if not USE_PTB_CONSTITS:
        raise Exception("should not be using the constit loss now!", USE_PTB_CONSTITS)
    if len(goldspans) == 0:
        return None, 0

    losses = []
    for end in range(len(fws)):
        clip_here = USE_SPAN_CLIP and end > ALLOWED_SPANLEN
        first_start = max(0, end - ALLOWED_SPANLEN) if clip_here else 0
        for start in range(first_start, end + 1):
            span_repr = dy.concatenate([fws[start][end], bws[start][end]])
            span_logits = w_c * dy.rectify(w_fb * span_repr + b_fb) + b_c
            log_probs = dy.log_softmax(span_logits)
            # Class 1 when the span is a gold constituent, class 0 otherwise.
            gold_class = int((start, end) in goldspans)
            losses.append(pick(log_probs, gold_class))

    ptbconstitloss = dy.scalarInput(DELTA) * -esum(losses)
    return ptbconstitloss, len(losses)
def forward(self, state):
    """Map a two-sided state to the network output.

    The items on each side are looked up in ``self.l1_weights`` (by their
    position in ``actions_ids``) and summed. An empty second side is
    represented by the ``self.empty_state`` parameter instead.

    :param state: indexable whose elements 0 and 1 are the two sides
    :return: the transposed, rectified output expression
    """
    def embed(side):
        return [self.l1_weights[actions_ids.index(item)] for item in side]

    full_embs = embed(state[0])
    empty_embs = embed(state[1])

    full_sum = dy.esum(full_embs)
    if len(empty_embs) > 0:
        empty_sum = dy.esum(empty_embs)
    else:
        empty_sum = dy.parameter(self.empty_state)

    features = dy.reshape(dy.concatenate([full_sum, empty_sum]), (1, 2))
    hidden = dy.rectify(features * dy.parameter(self.l2_weights))
    return dy.transpose(hidden)
def encode_sentence(self, toks):
    """Embed the tokens and encode them with forward and backward RNNs.

    :param toks: sequence of tokens
    :return: ``(tok_embeddings, buffer)`` where ``buffer[i]`` concatenates the
        forward output and the backward output for position ``i``
    """
    tok_embeddings = [
        dy.rectify(self.W_input * self.get_tok_embedding(tok)) for tok in toks
    ]

    state_forward = self.forward_buffRNN.initial_state()
    state_backward = self.backward_buffRNN.initial_state()
    buffer_forward = []
    buffer_backward = []
    # Advance both RNNs in lockstep; the backward one reads from the end.
    for left_emb, right_emb in zip(tok_embeddings, reversed(tok_embeddings)):
        state_forward = state_forward.add_input(left_emb)
        buffer_forward.append(state_forward.output())
        state_backward = state_backward.add_input(right_emb)
        buffer_backward.append(state_backward.output())

    # Flip the backward outputs so both lists are in sentence order.
    buffer = [
        dy.concatenate([fwd_out, bwd_out])
        for fwd_out, bwd_out in zip(buffer_forward, reversed(buffer_backward))
    ]
    return tok_embeddings, buffer
def calc_scores(words):
    """Score a word-id sequence for the main and the auxiliary task.

    Renews the computation graph, embeds the words, applies one convolution
    with max reduction along dimension 1 and a ReLU, and feeds the result to
    two separate output layers.

    :param words: list of word ids; padded in place with 0 up to ``WIN_SIZE``
    :return: ``(scores_main, scores_aux)``
    """
    dy.renew_cg()
    conv_weights = dy.parameter(W_cnn)
    conv_bias = dy.parameter(b_cnn)
    main_weights = dy.parameter(W_sm)
    main_bias = dy.parameter(b_sm)
    aux_weights = dy.parameter(Waux_sm)
    aux_bias = dy.parameter(baux_sm)

    # The window size is how many words the convolution looks at per step, so
    # a sample with fewer words than WIN_SIZE is padded with index 0.
    if len(words) < WIN_SIZE:
        words += [0] * (WIN_SIZE - len(words))

    # Convolution + pooling layer
    embedded = dy.concatenate([W_emb[x] for x in words], d=1)
    convolved = dy.conv2d_bias(embedded, conv_weights, conv_bias,
                               stride=(1, 1), is_valid=False)
    pooled = dy.max_dim(convolved, d=1)  # max over dimension 1
    pooled = dy.reshape(pooled, (FILTER_SIZE,))
    pooled = dy.rectify(pooled)  # ReLU

    # get scores for either task
    scores_main = main_weights * pooled + main_bias
    scores_aux = aux_weights * pooled + aux_bias
    return scores_main, scores_aux
def _get_semantic_rep(self, y):
    """Compute the semantic representation of ``y`` and cache its value.

    The word embedding from ``self.embedding_provider`` goes through
    ``W_latin_embeddings``, a ReLU and ``W_latin_embeddings2``. The numeric
    value is stored in ``self.latin_semantic_rep[y]``.

    :return: the representation as an expression
    """
    inner = dy.parameter(self.W_latin_embeddings)
    outer = dy.parameter(self.W_latin_embeddings2)
    word_vector = self.embedding_provider.get_word_embedding(y)
    representation = outer * dy.rectify(inner * word_vector)
    self.latin_semantic_rep[y] = representation.npvalue()
    return representation
def lookup(self, token):
    """
    Performs forward propagation from the token yielding a char embedding

    Characters missing from ``self.charset`` are skipped. When no character
    is recognised, the token is reported on stderr and ``self.b`` is returned.

    Args:
        token (list): a list of chars
    Returns:
        a dynet expression. The char embedding.
    """
    chars = list(token)
    known = [self.E[self.charset.index(ch)] for ch in chars if ch in self.charset]  # ignores unk chars

    if not known:  # empty word, no char recognized
        print('problematic token', chars, file=sys.stderr, flush=True)
        return self.b

    forward_outputs = self.fwd_rnn.initial_state().transduce(known)
    backward_outputs = self.bwd_rnn.initial_state().transduce(reversed(known))

    # Join the last output of each direction and project it.
    summary = dy.concatenate([forward_outputs[-1], backward_outputs[-1]])
    return dy.rectify(self.O * summary + self.b)
def calc_predict_and_activations(wids, tag, words):
    """Print the prediction for one example with per-filter diagnostics.

    Runs the CNN classifier on ``wids`` and prints the tagged sentence, the
    output of ``display_activations`` for the per-filter argmax positions,
    the scores, the bias, and the per-class products of output weights and
    pooled features.

    :param wids: list of word ids; padded in place with 0 up to ``WIN_SIZE``
    :param tag: integer label, printed with the sentence
    :param words: list of word strings, used for display only
    """
    dy.renew_cg()
    if len(wids) < WIN_SIZE:
        wids += [0] * (WIN_SIZE - len(wids))

    embedded = dy.concatenate([dy.lookup(W_emb, x) for x in wids], d=1)
    convolved = dy.conv2d_bias(embedded, W_cnn, b_cnn, stride=(1, 1), is_valid=False)

    # Row index of the strongest response, per filter.
    filters = dy.reshape(convolved, (len(wids), FILTER_SIZE)).npvalue()
    activations = filters.argmax(axis=0)

    pooled = dy.max_dim(convolved, d=1)
    pooled = dy.reshape(pooled, (FILTER_SIZE,))
    pooled = dy.rectify(pooled)

    scores = (W_sm * pooled + b_sm).npvalue()
    print ('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    print (display_activations(words, activations))
    print ('scores=%s, predict: %d' % (scores, predict))

    features = pooled.npvalue()
    W = W_sm.npvalue()
    bias = b_sm.npvalue()
    print (' bias=%s' % bias)
    contributions = W * features

    templates = (
        ' very bad (%.4f): %s',
        ' bad (%.4f): %s',
        ' neutral (%.4f): %s',
        ' good (%.4f): %s',
        'very good (%.4f): %s',
    )
    for row, template in enumerate(templates):
        print (template % (scores[row], contributions[row]))
def __call__(self, htA, HO, transform_flag=True):
    """
    Summarise the vectors in ``HO`` with attention driven by ``htA``.

    :param htA: vector used both in the optional transformation and in the
        attention scoring
    :param HO: indexable sequence of vectors to summarise
    :param transform_flag: determine if the model needs selective transformation,
    :return: tuple of the attention-weighted summary, reshaped to
        (2 * self.dim_opi,), and the attention weights as a numpy value
    """
    seq_len = len(HO)
    transformed = []
    logits = []
    for pos in range(seq_len):
        hiO = HO[pos]
        if transform_flag:
            # residual-style update conditioned on htA
            hiO_hat = hiO + dy.rectify(self.W_A * htA + self.W_O * hiO + self.b)
        else:
            hiO_hat = hiO
        paired = dy.concatenate([htA, hiO_hat])
        logits.append(dy.tanh(dy.dot_product(self.W_concat, paired)))
        transformed.append(hiO_hat)

    # Stack the transformed vectors as rows.
    stacked = dy.concatenate(
        [dy.reshape(vec, d=(1, 2 * self.dim_opi)) for vec in transformed])
    # length: seq_len
    weights = dy.softmax(dy.concatenate(logits))
    weights_np = weights.npvalue()

    summary = dy.reshape(weights, (1, seq_len)) * stacked
    return dy.reshape(summary, (2 * self.dim_opi,)), weights_np
def calc_predict_and_activations(wids, tag, words):
    """Classify one example and print a breakdown of the decision.

    Printed, in order: the tag and sentence, the result of
    ``display_activations`` for the per-filter argmax positions, the scores
    with the predicted class, the output bias, and for each of the five
    classes its score with the element-wise product of output weights and
    pooled features.

    :param wids: list of word ids; padded in place with 0 up to ``WIN_SIZE``
    :param tag: integer label
    :param words: list of word strings for display
    """
    dy.renew_cg()
    missing = WIN_SIZE - len(wids)
    if missing > 0:
        wids += [0] * missing

    cnn_in = dy.concatenate([dy.lookup(W_emb, wid) for wid in wids], d=1)
    cnn_out = dy.conv2d_bias(cnn_in, W_cnn, b_cnn, stride=(1, 1), is_valid=False)

    response_map = (dy.reshape(cnn_out, (len(wids), FILTER_SIZE))).npvalue()
    activations = response_map.argmax(axis=0)

    pool_out = dy.rectify(
        dy.reshape(dy.max_dim(cnn_out, d=1), (FILTER_SIZE, )))

    scores = (W_sm * pool_out + b_sm).npvalue()
    print('%d ||| %s' % (tag, ' '.join(words)))
    predict = np.argmax(scores)
    print(display_activations(words, activations))
    print('scores=%s, predict: %d' % (scores, predict))

    features = pool_out.npvalue()
    W = W_sm.npvalue()
    bias = b_sm.npvalue()
    print(' bias=%s' % bias)
    contributions = W * features

    line_formats = [
        ' very bad (%.4f): %s',
        ' bad (%.4f): %s',
        ' neutral (%.4f): %s',
        ' good (%.4f): %s',
        'very good (%.4f): %s',
    ]
    for class_idx, line_format in enumerate(line_formats):
        print(line_format % (scores[class_idx], contributions[class_idx]))
def __call__(self, s1):
    """Score ``s1`` with a one-hidden-layer network.

    The hidden layer is ``rectify(b_nli + W_nli_1 * s1)``; the output layer is
    the affine map given by ``b_s`` and ``w_s``.
    """
    hidden_bias = dy.parameter(self.b_nli)
    hidden_weights = dy.parameter(self.W_nli_1)
    # Further inputs are currently disabled: W_nli_2, s2, W_nli_u, u, W_nli_v, v
    hidden = dy.rectify(
        dy.affine_transform([hidden_bias, hidden_weights, s1]))

    output_bias = dy.parameter(self.b_s)
    output_weights = dy.parameter(self.w_s)
    return dy.affine_transform([output_bias, output_weights, hidden])
def calc_scores(wids):
    """Return the class-score expression for a word-id sequence.

    Renews the computation graph, embeds the ids, applies one convolution with
    max reduction along dimension 1 and a ReLU, then the output layer.

    :param wids: list of word ids; padded in place with 0 up to ``WIN_SIZE``
    """
    dy.renew_cg()
    shortfall = WIN_SIZE - len(wids)
    if shortfall > 0:
        wids += [0] * shortfall

    embedded = dy.concatenate([dy.lookup(W_emb, wid) for wid in wids], d=1)
    convolved = dy.conv2d_bias(embedded, W_cnn, b_cnn,
                               stride=(1, 1), is_valid=False)
    pooled = dy.reshape(dy.max_dim(convolved, d=1), (FILTER_SIZE,))
    features = dy.rectify(pooled)
    return W_sm * features + b_sm
def __call__(self, inputs, dropout=False):
    """Compute the output scores for ``inputs``.

    Two (conv, 2x2 max-pool, ReLU) stages are followed by a reshape to
    ``(7*7*64, 1)``, a ReLU hidden layer with optional dropout, and a linear
    output layer (no softmax is applied).

    :param inputs: array accepted by ``dy.inputTensor``
    :param dropout: apply dropout with ``DROPOUT_RATE`` to the hidden layer
    :return: the output expression
    """
    x = dy.inputTensor(inputs)
    conv_stages = ((self.pConv1, self.pB1), (self.pConv2, self.pB2))
    for kernel_param, bias_param in conv_stages:
        kernel = dy.parameter(kernel_param)
        bias = dy.parameter(bias_param)
        x = dy.conv2d_bias(x, kernel, bias, [1, 1], is_valid=False)
        x = dy.rectify(dy.maxpooling2d(x, [2, 2], [2, 2]))

    flat = dy.reshape(x, (7*7*64, 1))
    hidden_weights = dy.parameter(self.pW1)
    hidden_bias = dy.parameter(self.pB3)
    hidden = dy.rectify(hidden_weights*flat+hidden_bias)
    if dropout:
        hidden = dy.dropout(hidden, DROPOUT_RATE)

    output_weights = dy.parameter(self.pW2)
    # output = dy.softmax(w2*h)
    return output_weights*hidden
def cal_scores(self, src_encodings):
    """Score arcs and labels for every pair of positions.

    The concatenated encodings are projected four times with ReLU layers
    (arc head / arc dependent / label head / label dependent). Arc scores
    come from ``U_arc_1`` and ``u_arc_2``; every label gets a score made of
    a bilinear term, a head term, a dependent term and a bias.

    :param src_encodings: list of encoding expressions
    :return: tuple ``(s_arc, s_label)``; ``s_label`` is a list of expressions
    """
    src_len = len(src_encodings)
    encoded = dy.concatenate_cols(src_encodings)  # src_ctx_dim, src_len, batch_size

    # Load every parameter into the graph first.
    arc_head_W = dy.parameter(self.W_arc_hidden_to_head)
    arc_head_b = dy.parameter(self.b_arc_hidden_to_head)
    arc_dep_W = dy.parameter(self.W_arc_hidden_to_dep)
    arc_dep_b = dy.parameter(self.b_arc_hidden_to_dep)
    label_head_W = dy.parameter(self.W_label_hidden_to_head)
    label_head_b = dy.parameter(self.b_label_hidden_to_head)
    label_dep_W = dy.parameter(self.W_label_hidden_to_dep)
    label_dep_b = dy.parameter(self.b_label_hidden_to_dep)
    U_arc_1 = dy.parameter(self.U_arc_1)
    u_arc_2 = dy.parameter(self.u_arc_2)
    label_sets = list(zip(
        [dy.parameter(p) for p in self.U_label_1],
        [dy.parameter(p) for p in self.u_label_2_1],
        [dy.parameter(p) for p in self.u_label_2_2],
        [dy.parameter(p) for p in self.b_label],
    ))

    # n_arc_ml_units, src_len, bs
    h_arc_head = dy.rectify(dy.affine_transform([arc_head_b, arc_head_W, encoded]))
    h_arc_dep = dy.rectify(dy.affine_transform([arc_dep_b, arc_dep_W, encoded]))
    h_label_head = dy.rectify(dy.affine_transform([label_head_b, label_head_W, encoded]))
    h_label_dep = dy.rectify(dy.affine_transform([label_dep_b, label_dep_W, encoded]))

    head_arc_T = dy.transpose(h_arc_head)
    head_label_T = dy.transpose(h_label_head)

    s_arc = head_arc_T * dy.colwise_add(U_arc_1 * h_arc_dep, u_arc_2)

    s_label = []
    for U_1, u_2_1, u_2_2, b in label_sets:
        pair_term = head_label_T * U_1 * h_label_dep
        head_only = head_label_T * u_2_1 * dy.ones((1, src_len))
        dep_only = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
        s_label.append(pair_term + head_only + dep_only + b)
    return s_arc, s_label
def highway(input_, train):
    """Apply the stacked highway layers to ``input_``.

    For every ``(func, weight, bias)`` triple from the enclosing ``funcs``,
    ``weights`` and ``biases``, the output is a gated mix of the rectified
    ``func(current, train)`` and the unchanged current value, with the gate
    given by ``logistic(bias + weight * current)``.

    :return: the output of the last layer
    """
    current = input_
    for func, weight, bias in zip(funcs, weights, biases):
        candidate = dy.rectify(func(current, train))
        gate = dy.logistic(dy.affine_transform([bias, weight, current]))
        current = dy.cmult(gate, candidate) + dy.cmult(current, 1 - gate)
    return current
def node_iteration(rel, g, node, opts, assoc_model, trainer, log_file, is_source):
    """
    Perform one iteration of trying to score a node's neighbors above negative samples.

    The node's true neighbours (targets when ``is_source``, sources otherwise)
    are scored with ``assoc_model.word_assoc_score``, as are uniformly drawn
    negative samples. With ``opts.nll`` the loss is a softmax negative
    log-likelihood of each true score against its negatives; otherwise it is
    a sum of hinge terms with margin 1. The loss is back-propagated and
    ``trainer.update()`` is called.

    :param rel: relation passed through to the association model
    :param g: graph queried through ``targets`` / ``sources``
    :param node: the node being trained on
    :param opts: options object; reads ``debug``, ``nll``, ``neg_samp``, ``v``
    :param assoc_model: model exposing ``word_assoc_score``
    :param trainer: trainer whose ``update()`` is called after backprop
    :param log_file: writable file object for score logging, or None
    :param is_source: whether ``node`` is the source side of the relation
    :return: the scalar loss value; 0.0 (without any update) when the node
        has no true neighbours
    """
    # true instances likelihood
    trues = targets(g, node) if is_source else sources(g, node)
    side = '->' if is_source else '<-'
    if len(trues) == 0:
        return 0.0

    if opts.debug:
        dy.renew_cg(immediate_compute = True, check_validity = True)
    else:
        dy.renew_cg()

    # compute association score as dynet expression (can't do this above due to staleness)
    true_scores = []
    for tr in trues:
        if is_source:
            j_assoc_score = assoc_model.word_assoc_score(node, tr, rel)
        else:
            j_assoc_score = assoc_model.word_assoc_score(tr, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tTRUE_{}\t{:.3e}\n'\
                         .format(node, side, tr, j_assoc_score.scalar_value()))
        true_scores.append(j_assoc_score)

    # false targets likelihood - negative sampling (uniform)
    # collect negative samples
    if opts.nll:
        # one score list per true instance, with the true score at index 0
        sample_scores = [[ts] for ts in true_scores]
    else:
        margins = []

    neg_samples = [np.random.choice(range(N)) for _ in range(opts.neg_samp * len(trues))]
    # remove source and true targets if applicable
    # NOTE(review): only one occurrence is removed per forbidden id and the
    # replacement draw is not re-checked -- confirm this is acceptable.
    for t in [node] + trues:
        if t in neg_samples:
            neg_samples.remove(t)
            neg_samples.append(np.random.choice(range(N)))

    for (i,ns) in enumerate(neg_samples):
        # compute association score as dynet expression
        if is_source:
            ns_assoc_score = assoc_model.word_assoc_score(node, ns, rel)
        else:
            ns_assoc_score = assoc_model.word_assoc_score(ns, node, rel)
        if log_file is not None:
            log_file.write('{} {}\tNEG_{}\t{:.3e}\n'\
                         .format(node, side, ns, ns_assoc_score.scalar_value()))
        # consecutive groups of opts.neg_samp negatives share one true instance
        corresponding_true = i // opts.neg_samp
        if opts.nll:
            sample_scores[corresponding_true].append(ns_assoc_score)
        else:
            # TODO maybe use dy.hinge()
            ctt_score = true_scores[corresponding_true]
            margin = ctt_score - ns_assoc_score
            margins.append(dy.rectify(dy.scalarInput(1.0) - margin))

    # compute overall loss
    if opts.nll:
        if len(sample_scores) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            # the true score sits at index 0 of every list
            dy_loss = dy.esum([dy.pickneglogsoftmax(dy.concatenate(scrs), 0) for scrs in sample_scores])
    else:
        if len(margins) == 0:
            dy_loss = dy.scalarInput(0.0)
        else:
            dy_loss = dy.esum(margins)
    sc_loss = dy_loss.scalar_value()
    if log_file is not None:
        log_file.write('{}\tLOSS\t{:.3e}\n'\
                     .format(node, sc_loss))

    # backprop and recompute score
    if opts.v > 1:
        timeprint('overall loss for relation {}, node {} as {} = {:.6f}'\
                  .format(rel, node, 'source' if is_source else 'target', sc_loss))

    dy_loss.backward()
    trainer.update()

    return sc_loss