def cal_scores(self, src_encodings, predict=False):
    """Score POS and XPOS labels for every token of a batch of sentences.

    Args:
        src_encodings: list (length = src_len) of batched DyNet expressions,
            one per source token; each presumably of dim (src_ctx_dim,)
            batched over batch_size — TODO confirm with caller.
        predict: if True, run in inference mode (no dropout).

    Returns:
        Tuple ``(pos_label, xpos_label)`` of label-score expressions.
    """
    # (src_ctx_dim, src_len) x batch_size
    # NOTE: the original also computed src_len/batch_size here; both were
    # unused, so they have been removed.
    src_encodings = dy.concatenate_cols(src_encodings)

    W_pos = dy.parameter(self.W_pos)
    b_pos = dy.parameter(self.b_pos)
    W_xpos = dy.parameter(self.W_xpos)
    b_xpos = dy.parameter(self.b_xpos)

    W_affine_pos = dy.parameter(self.W_affine_pos)
    b_affine_pos = dy.parameter(self.b_affine_pos)
    W_affine_xpos = dy.parameter(self.W_affine_xpos)
    b_affine_xpos = dy.parameter(self.b_affine_xpos)

    if predict:
        # (n_pos_mlp_units, src_len) x batch_size
        pos = self.leaky_ReLu(
            dy.affine_transform([b_pos, W_pos, src_encodings]))
        xpos = self.leaky_ReLu(
            dy.affine_transform([b_xpos, W_xpos, src_encodings]))
    else:
        # Drop whole time-steps (dim 1) so one mask is shared across features.
        src_encodings = dy.dropout_dim(src_encodings, 1, self.dropout)
        pos = dy.dropout_dim(
            self.leaky_ReLu(
                dy.affine_transform([b_pos, W_pos, src_encodings])),
            1, self.dropout)
        xpos = dy.dropout_dim(
            self.leaky_ReLu(
                dy.affine_transform([b_xpos, W_xpos, src_encodings])),
            1, self.dropout)

    pos_label = dy.affine_transform(
        [b_affine_pos, dy.transpose(W_affine_pos), pos])
    xpos_label = dy.affine_transform(
        [b_affine_xpos, dy.transpose(W_affine_xpos), xpos])
    return pos_label, xpos_label
def predict_sequence_batched(self, inputs, mask_array, wlen, predictFlag=False):
    """Build word embeddings from a character LSTM over a batch of words.

    Combines an attention-weighted sum of the masked hidden states with the
    final cell state of each word, then projects the concatenation linearly.

    Args:
        inputs: list (over character positions) of batched input expressions.
        mask_array: per-position masks (1 for real characters, 0 for padding).
        wlen: per-batch-element word lengths; 0 means an empty word.
        predictFlag: if True, run in inference mode (no dropout).

    Returns:
        Batched expression with the projected character-based word embedding.
    """
    batch_size = inputs[0].dim()[1]
    src_len = len(inputs)

    if not predictFlag:
        # Train mode: enable variational dropout inside the char LSTM.
        self.charlstm.set_dropouts(self.dropout, self.dropout)
        self.charlstm.set_dropout_masks(batch_size)

    char_fwd = self.charlstm.initial_state(batch_size)
    recur_states, cells = char_fwd.add_inputs(inputs, mask_array, predictFlag)

    # Zero out hidden states at padded character positions.
    hidden_states = []
    for idx in range(src_len):
        mask = dy.inputVector(mask_array[idx])
        mask_expr = dy.reshape(mask, (1, ), batch_size)
        hidden_states.append(recur_states[idx] * mask_expr)
    H = dy.concatenate_cols(hidden_states)

    if (predictFlag):
        a = dy.softmax(dy.transpose(self.W_atten.expr()) * H)
    else:
        #dropout attention connections(keep the same dim across the sequence)
        a = dy.softmax(
            dy.transpose(self.W_atten.expr()) *
            dy.dropout_dim(H, 1, self.dropout))

    # Pick each word's final cell state (zeros for empty words).
    cell_states = []
    for idx in range(batch_size):
        if (wlen[idx] > 0):
            cell = dy.pick_batch_elem(cells[wlen[idx] - 1], idx)
        else:
            cell = dy.zeros(self.ldims)
        cell_states.append(cell)
    C = dy.concatenate_to_batch(cell_states)

    # Attention-weighted combination of hidden states, then concat with cell.
    H_atten = H * dy.transpose(a)
    char_emb = dy.concatenate([H_atten, C])

    if predictFlag:
        proj_char_emb = dy.affine_transform(
            [self.b_linear.expr(), self.W_linear.expr(), char_emb])
    else:
        proj_char_emb = dy.affine_transform([
            self.b_linear.expr(),
            self.W_linear.expr(),
            dy.dropout(char_emb, self.dropout)
        ])
    return proj_char_emb
def run_lstm(self, word_inputs, tag_inputs, isTrain=True):
    """Embed words/tags and run shared ("common") and task ("private") BiLSTMs.

    Args:
        word_inputs: seq_len x batch_size matrix of word ids.
        tag_inputs: seq_len x batch_size matrix of POS-tag ids.
        isTrain: enables embedding/LSTM/MLP dropout when True.

    Returns:
        (common_top_recur, private_top_recur, p_fs, p_bs): the two stacked
        BiLSTM outputs (columns = time steps) and the private LSTM
        forward/backward states.
    """
    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]

    # Words unseen in training fall back to UNK for the trained embedding;
    # the fixed pretrained embedding is always added.
    word_embs = [
        dy.lookup_batch(
            self.word_embs,
            np.where(w < self._vocab.words_in_train, w, self._vocab.UNK)) +
        dy.lookup_batch(self.pret_word_embs, w, update=False)
        for w in word_inputs
    ]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

    if isTrain:
        # Independent dropout masks for the word and tag embedding channels.
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos]) for w, pos in zip(word_embs, tag_embs)
        ]

    common_top_input, c_fs, c_bs = biLSTM(
        self.cLSTM_builders, emb_inputs, batch_size,
        self.dropout_clstm_input if isTrain else 0.,
        self.dropout_clstm_hidden if isTrain else 0.)
    common_top_recur = dy.concatenate_cols(common_top_input)

    private_top_input, p_fs, p_bs = biLSTM(
        self.pLSTM_builders, emb_inputs, batch_size,
        self.dropout_plstm_input if isTrain else 0.,
        self.dropout_plstm_hidden if isTrain else 0.)
    private_top_recur = dy.concatenate_cols(private_top_input)

    if isTrain:
        # Shared dropout mask across the time dimension (dim 1).
        common_top_recur = dy.dropout_dim(common_top_recur, 1,
                                          self.dropout_mlp)
        private_top_recur = dy.dropout_dim(private_top_recur, 1,
                                           self.dropout_mlp)

    return common_top_recur, private_top_recur, p_fs, p_bs
def __call__(self, x, train=False):
    """Run the MLP on x; dropout is applied once, just before the output layer.

    `self.bias` guards every bias access so the layer also works when no
    bias parameters exist.
    """
    hidden = x
    n_hidden = len(self.W) - 1
    # Every layer except the last is followed by the activation.
    for i in range(n_hidden):
        pre_act = self.W[i] * hidden + (self.b[i] if self.bias else 0)
        hidden = self.act(pre_act)
    if train:
        # Matrices drop whole rows (shared mask over dim 1); vectors use
        # plain element-wise dropout.
        if len(hidden.dim()[0]) > 1:
            hidden = dy.dropout_dim(hidden, 1, self.dropout)
        else:
            hidden = dy.dropout(hidden, self.dropout)
    out_bias = self.b[-1] if self.bias else 0
    return self.W[-1] * hidden + out_bias
def __call__(self, x):
    """Pipe x through each layer, optionally applying dropout after every one.

    Iteration is deliberately zipped with ``self.outdim`` so the pass stops
    at the shorter of the two sequences, matching the configured depth.
    """
    out = x
    for layer, _dim in zip(self.layers, self.outdim):
        out = layer(out)
        if self.dropout <= 0.:
            continue
        if self.dropout_dim >= 0:
            # Structured dropout along the configured dimension.
            out = dropout_dim(out, self.dropout_dim, self.dropout)
        else:
            out = dropout(out, self.dropout)
    return out
def __call__(self, x, train=False):
    """Feed-forward pass; dropout is applied only before the final projection."""
    h = x
    last = len(self.W) - 1
    # Hidden layers: affine transform followed by the activation self.f.
    for idx in range(last):
        bias_term = self.b[idx] if self.bias else 0
        h = self.f(self.W[idx] * h + bias_term)
    if train:
        # Rank > 1: drop entire rows; rank 1: element-wise dropout.
        multi_dim = len(h.dim()[0]) > 1
        h = dy.dropout_dim(h, 1, self.p) if multi_dim else dy.dropout(h, self.p)
    return self.W[last] * h + (self.b[last] if self.bias else 0)
def encode(self, sentence):
    """Encode a sentence with the base encoder followed by the BiLSTM.

    During training the result additionally gets time-step dropout (dim 1).
    """
    encoded = self._bilstm.encode(
        self._base.encode(sentence), len(sentence))
    if not self._train_flag:
        return encoded
    return dy.dropout_dim(encoded, 1, self.dropout_rate)
def next(self, word_idx, context, train, cur_state=None):
    """Advance the decoder one step.

    Looks up the embedding for `word_idx`, concatenates it with `context`,
    and feeds it either to the supplied `cur_state` or to the internally
    tracked decoder state.

    Returns:
        (hidden, embs, next_state): the output vector, the (possibly
        dropped-out) embedding, and the advanced RNN state.
    """
    embs = dy.pick_batch(self.E, word_idx)
    if train:
        # Row-wise dropout over the embedding dimensions.
        embs = dy.dropout_dim(embs, 0, self.word_dropout)
    x = dy.concatenate([embs, context])
    if cur_state is not None:
        next_state = cur_state.add_input(x)
    else:
        # No explicit state given: mutate and reuse the shared decoder state.
        self.dec_state = self.dec_state.add_input(x)
        next_state = self.dec_state
    hidden = next_state.output()
    return hidden, embs, next_state
def next(self, w, c, test=True, state=None):
    """One decoder LSTM step for word ids `w` (`c` is accepted but unused here).

    Returns:
        (h, e, next_state): output vector, embedding, advanced state.
    """
    e = dy.pick_batch(self.E, w)
    if not test:
        e = dy.dropout_dim(e, 0, self.wdr)  # word-embedding dropout
    # Run LSTM: use the caller's state when provided, otherwise advance
    # (and keep) the internal one.
    if state is not None:
        next_state = state.add_input(e)
    else:
        self.ds = self.ds.add_input(e)
        next_state = self.ds
    return next_state.output(), e, next_state
def next(self, w, c, test=True, state=None):
    """One decoder step; `w` may be raw word ids or an already-built expression.

    The embedding (or the given expression) is concatenated with context `c`
    before being fed to the LSTM.

    Returns:
        (h, e, next_state): output vector, embedding, advanced state.
    """
    e = w if isinstance(w, dy.Expression) else dy.pick_batch(self.E, w)
    if not test:
        e = dy.dropout_dim(e, 0, self.wdr)  # word-embedding dropout
    x = dy.concatenate([e, c])
    # Run LSTM: explicit state if supplied, internal state otherwise.
    if state is not None:
        next_state = state.add_input(x)
    else:
        self.ds = self.ds.add_input(x)
        next_state = self.ds
    return next_state.output(), e, next_state
def transduce(self, inputs, train):
    """Multi-layer bidirectional RNN with a highway-style gate per time step.

    For each layer and each direction ("f" forward, "b" backward), the raw
    RNN outputs are mixed with a linear transform of the layer input through
    a learned logistic gate r.
    """
    xs = inputs[:self.max_length]  # truncate overly long sequences
    if not xs:
        return []
    for i in range(self.lstm_layers):
        for n, d in ("f", 1), ("b", -1):
            # Gate parameters for this layer/direction.
            Wr, br, Wh = [self.params["%s%d%s" % (p, i, n)]
                          for p in ("Wr", "br", "Wh")]
            # Run the RNN over the (possibly reversed) sequence.
            hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
            hs = [hs_[0]]
            for t in range(1, len(hs_)):
                # NOTE(review): xs[t] indexes the UN-reversed sequence even in
                # the backward pass (d == -1) — confirm this is intentional.
                r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                # Gate between RNN output and transformed input.
                hs.append(dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
            xs = hs
    if train:
        # One dropout mask shared across the time dimension.
        x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
        xs = [dy.pick(x, i, 1) for i in range(len(xs))]
    return xs
def transduce(self, inputs, train):
    """Multi-layer bidirectional gated RNN (parameters via dy.parameter).

    For each layer and direction ("f" forward, "b" backward), a logistic
    gate r mixes the RNN output with a linear transform of the layer input.
    """
    xs = inputs[:self.max_length]  # truncate overly long sequences
    if not xs:
        return []
    for i in range(self.lstm_layers):
        for n, d in ("f", 1), ("b", -1):
            # Load gate parameters into the computation graph.
            Wr, br, Wh = [
                dy.parameter(self.params["%s%d%s" % (p, i, n)])
                for p in ("Wr", "br", "Wh")
            ]
            hs_ = self.params["rnn%d%s" % (i, n)].initial_state().transduce(xs[::d])
            hs = [hs_[0]]
            for t in range(1, len(hs_)):
                # NOTE(review): xs[t] indexes the UN-reversed sequence even in
                # the backward pass (d == -1) — confirm this is intentional.
                r = dy.logistic(Wr * dy.concatenate([hs[t - 1], xs[t]]) + br)
                hs.append(
                    dy.cmult(r, hs_[t]) + dy.cmult(1 - r, Wh * xs[t]))
            xs = hs
    if train:
        # One dropout mask shared across the time dimension.
        x = dy.dropout_dim(dy.concatenate(xs, 1), 1, self.dropout)
        xs = [dy.pick(x, i, 1) for i in range(len(xs))]
    return xs
def run(self,
        word_inputs,
        tag_inputs,
        arc_targets=None,
        rel_targets=None,
        isTrain=True):
    """Biaffine dependency parser forward pass.

    Args:
        word_inputs, tag_inputs: seq_len x batch_size id matrices.
        arc_targets, rel_targets: gold heads/relations (same shape) or None.
        isTrain: training mode (dropout on, returns loss).

    Returns:
        Training: (arc_accuracy, rel_accuracy, overall_accuracy, loss).
        Eval with targets: (arc_accuracy, rel_accuracy, overall_accuracy, outputs).
        Pure inference: outputs — a list of (arc_pred, rel_pred) per sentence.
    """
    # inputs, targets: seq_len x batch_size
    def dynet_flatten_numpy(ndarray):
        # Column-major flatten, matching DyNet's batch layout.
        return np.reshape(ndarray, (-1, ), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    # 1 for real tokens (ids above ROOT), 0 for ROOT/padding.
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))
    if isTrain or arc_targets is not None:
        mask_1D = dynet_flatten_numpy(mask)
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

    # Words unseen in training map to UNK.
    word_embs = [
        dy.lookup_batch(
            self.word_embs,
            np.where(w < self._vocab.words_in_train, w, self._vocab.UNK),
            update=True)
        #+ dy.lookup_batch(self.pret_word_embs, w, update = False) # remove 1 line
        for w in word_inputs
    ]
    tag_embs = [
        dy.lookup_batch(self.tag_embs, pos, update=True)
        for pos in tag_inputs
    ]

    if isTrain:
        # Independent dropout masks for word and tag embedding channels.
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos])
            for w, pos in zip(word_embs, tag_embs)
        ]

    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    # Separate MLPs produce dependent- and head-role representations.
    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
        self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
        self.mlp_head_b)
    dep, head = leaky_relu(dy.affine_transform([
        b_dep, W_dep, top_recur
    ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if isTrain:
        dep, head = dy.dropout_dim(dep, 1,
                                   self.dropout_mlp), dy.dropout_dim(
                                       head, 1, self.dropout_mlp)

    # Split each representation into arc-scoring and relation-scoring halves.
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
                          seq_len, batch_size,
                          num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                 seq_len * batch_size)
    # (#head ) x (#dep x batch_size)
    arc_preds = arc_logits.npvalue().argmax(0)
    # seq_len x batch_size

    if isTrain or arc_targets is not None:
        arc_correct = np.equal(arc_preds, arc_targets).astype(
            np.float32) * mask
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        # Padding positions are masked out of the loss.
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

    W_rel = dy.parameter(self.rel_W)
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
                          seq_len, batch_size,
                          num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits,
                                 (seq_len, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)

    # Relation scores are conditioned on gold heads (train) or predicted heads.
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)

    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(
            np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, seq_len, seq_len, batch_size),
                'F'))
        # batch_size x #dep x #head x #nclasses

    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        # A token counts as correct only when both arc and relation match.
        correct = rel_correct * dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    # Decode each sentence independently (cannot be batched).
    outputs = []
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                       rel_probs):
        # parse sentences one by one
        msk[0] = 1.  # position 0 is ROOT; force it into the mask
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        # Drop the ROOT position from the output.
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
def run_parser(self,
               word_inputs,
               common_top_recur,
               private_top_recur,
               arc_targets=None,
               rel_targets=None,
               isTrain=True):
    """Biaffine parsing head over pre-computed shared + private encodings.

    Args:
        word_inputs: seq_len x batch_size id matrix (for masking only).
        common_top_recur, private_top_recur: encoder outputs, concatenated
            along the feature dimension to form the parser input.
        arc_targets, rel_targets: gold heads/relations or None.
        isTrain: training mode (dropout on, returns loss).

    Returns:
        Training: (arc_accuracy, rel_accuracy, overall_accuracy, loss).
        Eval with targets: (arc_accuracy, rel_accuracy, overall_accuracy, outputs).
        Pure inference: outputs — a list of (arc_pred, rel_pred) per sentence.
    """
    # inputs, targets: seq_len x batch_size
    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    # 1 for real tokens (ids above ROOT), 0 for ROOT/padding.
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))
    # Fuse the shared and task-specific encoder outputs.
    top_recur = dy.concatenate([common_top_recur, private_top_recur])
    if isTrain or arc_targets is not None:
        mask_1D = self.dynet_flatten_numpy(mask)
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

    # Separate MLPs produce dependent- and head-role representations.
    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
        self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
        self.mlp_head_b)
    dep = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur]))
    head = leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if isTrain:
        dep = dy.dropout_dim(dep, 1, self.dropout_mlp)
        head = dy.dropout_dim(head, 1, self.dropout_mlp)

    # Split into arc-scoring and relation-scoring halves.
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
                          seq_len, batch_size,
                          num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                 seq_len * batch_size)
    # (#head ) x (#dep x batch_size)
    arc_preds = arc_logits.npvalue().argmax(0)
    # seq_len x batch_size

    if isTrain or arc_targets is not None:
        arc_correct = np.equal(arc_preds, arc_targets).astype(
            np.float32) * mask
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = self.dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        # Padding positions are masked out of the loss.
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

    W_rel = dy.parameter(self.rel_W)
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
                          seq_len, batch_size,
                          num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits,
                                 (seq_len, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)

    # Relation scores conditioned on gold heads (train) or predicted heads.
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)

    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = self.dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(
            np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, seq_len, seq_len, batch_size),
                'F'))
        # batch_size x #dep x #head x #nclasses

    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        # Correct only when both arc and relation match.
        correct = rel_correct * self.dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    # Decode each sentence independently.
    outputs = []
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                       rel_probs):
        # parse sentences one by one
        msk[0] = 1.  # position 0 is ROOT; force it into the mask
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        # Drop the ROOT position from the output.
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
def run(self, word_inputs, tag_inputs, arc_targets=None, rel_targets=None):
    """Biaffine parser forward pass (uniparse-integrated variant).

    Inputs arrive batch-major and are transposed to seq-major internally.
    Training mode is inferred from the presence of ``arc_targets``.

    Returns:
        Training: (arc_predictions, rel_predictions, arc_logits,
        partial_rel_logits) — batch-major predictions plus the logits
        needed for an external loss.
        Inference: (arc_predictions, rel_predictions, None, None) with
        per-sentence prediction lists.
    """
    is_train = arc_targets is not None
    # @djam modification: transpose to seq_len x batch_size layout.
    word_inputs = word_inputs.T
    tag_inputs = tag_inputs.T
    if arc_targets is not None:
        arc_targets[:, 0] = 0  # ROOT's head is fixed to 0
        arc_targets = arc_targets.T
        targets_1D = dynet_flatten_numpy(arc_targets)

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    # 1 for real tokens (ids above ROOT), 0 for ROOT/padding.
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)

    # Words unseen in training map to UNK; pretrained embeddings optional.
    if self.pret_word_embs:
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w,
                         self._vocab.UNK)) +
            dy.lookup_batch(self.pret_word_embs, w, update=False)
            for w in word_inputs
        ]
    else:
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w,
                         self._vocab.UNK)) for w in word_inputs
        ]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

    if is_train:
        # Independent dropout masks for word and tag embedding channels.
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos])
            for w, pos in zip(word_embs, tag_embs)
        ]

    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if is_train else 0.,
               self.dropout_lstm_hidden if is_train else 0.))
    if is_train:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    # Separate MLPs produce dependent- and head-role representations.
    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(self.mlp_head_b)
    dep, head = leaky_relu(dy.affine_transform([b_dep, W_dep, top_recur])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if is_train:
        dep, head = dy.dropout_dim(dep, 1, self.dropout_mlp), dy.dropout_dim(head, 1, self.dropout_mlp)

    # Split into arc-scoring and relation-scoring halves.
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
                          seq_len, batch_size,
                          num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    arc_preds = arc_logits.npvalue().argmax(0)
    # Keep a trailing batch axis even when batch_size == 1.
    arc_preds = arc_preds if arc_preds.ndim == 2 else arc_preds[:, None]
    # seq_len x batch_size

    W_rel = dy.parameter(self.rel_W)
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
                          seq_len, batch_size,
                          num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits,
                                 (seq_len, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)

    # Relation scores conditioned on gold heads (train) or predicted heads.
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if is_train else dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)
    # @djam - restored shape
    partial_rel_logits = dy.reshape(partial_rel_logits,
                                    (self._vocab.rel_size, seq_len),
                                    batch_size)

    # Probabilities are computed in both modes here (unlike other variants).
    arc_probs = np.transpose(
        np.reshape(
            dy.softmax(arc_logits).npvalue(),
            (seq_len, seq_len, batch_size), 'F'))
    # #batch_size x #dep x #head
    rel_probs = np.transpose(
        np.reshape(
            dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
            (self._vocab.rel_size, seq_len, seq_len, batch_size), 'F'))
    # batch_size x #dep x #head x #nclasses

    # @djam contribution
    if is_train:
        # 'decode' with argmax over predicted arcs.
        arc_predictions = arc_preds.T
        # batch_size x dep
        _1 = np.repeat(range(batch_size), seq_len)  # batch indices
        _2 = np.tile(range(seq_len), batch_size)    # modifier indices
        _3 = arc_predictions.reshape(-1)            # predicted arcs
        rel_predictions = rel_probs[_1, _2, _3].argmax(-1)
        rel_predictions = rel_predictions.reshape(batch_size, seq_len)
        # batch_size x dep
        return arc_predictions, rel_predictions, arc_logits, partial_rel_logits
    else:
        # Decode each sentence with the uniparse MST/argmax decoders.
        arc_predictions, rel_predictions = [], []
        for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                           rel_probs):
            msk[0] = 1.  # position 0 is ROOT; force it into the mask
            sent_len = int(np.sum(msk))
            arc_pred = uniparse.arc_argmax(arc_prob, sent_len, msk)
            rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
            rel_pred = uniparse.rel_argmax(rel_prob, sent_len)
            arc_predictions.append(arc_pred[:sent_len])
            rel_predictions.append(rel_pred[:sent_len])
        return arc_predictions, rel_predictions, None, None
def run(self,
        word_inputs,
        lemma_inputs,
        tag_inputs,
        pred_golds,
        rel_targets=None,
        isTrain=True):
    """Semantic-role-labeling forward pass for a single predicate per sentence.

    Scores, for every token (argument candidate), the relation to the given
    predicate position via a bilinear layer.

    Args:
        word_inputs, lemma_inputs, tag_inputs: seq_len x batch_size id matrices.
        pred_golds: predicate positions; pred_golds[0] holds the batch's
            predicate indices — presumably constant per sentence, TODO confirm.
        rel_targets: gold relation ids (training only).
        isTrain: training mode.

    Returns:
        Training: (rel_accuracy, rel_loss). Inference: list of per-sentence
        relation predictions.
    """
    # inputs, targets: seq_len x batch_size
    def dynet_flatten_numpy(ndarray):
        # Column-major flatten, matching DyNet's batch layout.
        return np.reshape(ndarray, (-1,), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    # Mask real tokens; the padding marker depends on the vocab mode.
    marker = self._vocab.PAD if self._unified else self._vocab.DUMMY
    mask = np.greater(word_inputs, marker).astype(np.float32)
    num_tokens = int(np.sum(mask))

    word_embs = [
        dy.lookup_batch(
            self.word_embs,
            np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
        for w in word_inputs
    ]
    pre_embs = [
        dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs
    ]
    # Binary flag embedding: 1 at the predicate position, 0 elsewhere.
    # NOTE(review): np.int is removed in NumPy >= 1.24; should become int.
    flag_embs = [
        dy.lookup_batch(self.flag_embs,
                        np.array(w == i + 1, dtype=np.int))
        for i, w in enumerate(pred_golds)
    ]
    lemma_embs = [
        dy.lookup_batch(self.lemma_embs, lemma) for lemma in lemma_inputs
    ]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

    if isTrain:
        # The word mask wm is shared by all word-derived channels.
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([
                dy.cmult(word, wm),
                dy.cmult(pre, wm),
                dy.cmult(flag, wm),
                dy.cmult(lemma, wm),
                dy.cmult(pos, posm)
            ]) for word, pre, flag, lemma, pos, (wm, posm) in zip(
                word_embs, pre_embs, flag_embs, lemma_embs, tag_embs,
                emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([word, pre, flag, lemma, pos])
            for word, pre, flag, lemma, pos in zip(
                word_embs, pre_embs, flag_embs, lemma_embs, tag_embs)
        ]

    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    W_arg, b_arg = dy.parameter(self.mlp_arg_W), dy.parameter(self.mlp_arg_b)
    W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter(self.mlp_pred_b)
    arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur]))
    # pred_hidden = leaky_relu(dy.affine_transform([b_pred, W_pred, top_recur]))
    # Only the predicate column is fed through the predicate MLP.
    predicates_1D = pred_golds[0]
    pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1)
    pred_hidden = leaky_relu(
        dy.affine_transform([b_pred, W_pred, pred_recur]))
    if isTrain:
        arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp)
        # pred_hidden = dy.dropout_dim(pred_hidden, 1, self.dropout_mlp)
        # pred_hidden is a vector, so element-wise dropout is used.
        pred_hidden = dy.dropout(pred_hidden, self.dropout_mlp)

    W_rel = dy.parameter(self.rel_W)
    # Bilinear over (arguments x single predicate); y-dimension is 1.
    rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size,
                          seq_len, 1, batch_size,
                          num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (1 x rel_size x #arg) x batch_size
    flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (1 x rel_size) x (#arg x batch_size)
    # All picks are at index 0 since there is a single predicate slot.
    predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0])
    partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
    # (1 x rel_size) x (#arg x batch_size)

    if isTrain:
        mask_1D = dynet_flatten_numpy(mask)
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(
            np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        # Padding positions are masked out of the loss.
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
        return rel_accuracy, rel_loss

    rel_probs = np.transpose(
        np.reshape(
            dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
            (self._vocab.rel_size, 1, seq_len, batch_size), 'F'))
    # batch_size x #arg x 1 x #nclasses

    outputs = []
    # Decode each sentence independently.
    for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T,
                                        rel_probs):
        msk[0] = 1.  # position 0 is forced into the mask
        sent_len = int(np.sum(msk))
        # Single predicate slot: select column 0 for every argument.
        rel_prob = rel_prob[np.arange(len(pred_gold)), 0]
        rel_pred = rel_argmax(rel_prob)
        outputs.append(rel_pred[:sent_len])
    return outputs
def run(self,
        word_inputs,
        tag_inputs,
        arc_targets=None,
        rel_targets=None,
        isTrain=True):
    """Biaffine dependency parser forward pass (optional pretrained embeddings).

    Args:
        word_inputs, tag_inputs: seq_len x batch_size id matrices.
        arc_targets, rel_targets: gold heads/relations or None.
        isTrain: training mode (dropout on, returns loss).

    Returns:
        Training: (arc_accuracy, rel_accuracy, overall_accuracy, loss).
        Eval with targets: (arc_accuracy, rel_accuracy, overall_accuracy, outputs).
        Pure inference: outputs — a list of (arc_pred, rel_pred) per sentence.
    """
    # inputs, targets: seq_len x batch_size
    def dynet_flatten_numpy(ndarray):
        # Column-major flatten, matching DyNet's batch layout.
        return np.reshape(ndarray, (-1, ), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    # 1 for real tokens (ids above ROOT), 0 for ROOT/padding.
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))
    if isTrain or arc_targets is not None:
        mask_1D = dynet_flatten_numpy(mask)
        # batched here means that the last dim is treated as batch dimension, both in input and output
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)

    # TODO: pay attention to _words_in_train.
    # Sum of the two embeddings: [Expression of dim=((embedding_dim,), batch_size)] * seq_len
    if self.pre_train_emb:
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w,
                         self._vocab.UNK)) +
            dy.lookup_batch(self.pret_word_embs, w, update=False)
            for w in word_inputs
        ]  # sum of trained + pretrained embeddings, [Expression] * seq_len
    else:
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w,
                         self._vocab.UNK)) for w in word_inputs
        ]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]

    if isTrain:
        # Independent dropout masks for word and tag embedding channels.
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos])
            for w, pos in zip(word_embs, tag_embs)
        ]

    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    # Separate MLPs produce dependent- and head-role representations.
    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
        self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
        self.mlp_head_b)
    dep, head = leaky_relu(dy.affine_transform([
        b_dep, W_dep, top_recur
    ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if isTrain:
        dep, head = dy.dropout_dim(dep, 1,
                                   self.dropout_mlp), dy.dropout_dim(
                                       head, 1, self.dropout_mlp)
        # dim=1 dropout can zero out an entire row: dim=0 drops columns,
        # dim=1 drops rows; the third dimension is the batch.

    # Split into arc-scoring and relation-scoring halves.
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]

    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
                          seq_len, batch_size,
                          num_outputs=1, bias_x=True, bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                 seq_len * batch_size)
    # Flattened this way purely for computing the loss.
    # (#head ) x (#dep x batch_size)
    arc_preds = arc_logits.npvalue().argmax(0)
    # seq_len x batch_size

    if isTrain or arc_targets is not None:
        # Use the highest-scoring arcs for the loss; this does not mean they
        # are the final decode result, but they must be penalized.
        arc_correct = np.equal(arc_preds, arc_targets).astype(
            np.float32) * mask  # the token mask is still in effect here
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head

    W_rel = dy.parameter(self.rel_W)
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
                          seq_len, batch_size,
                          num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits,
                                 (seq_len, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)

    # Relation scores conditioned on gold heads (train) or predicted heads.
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if isTrain else dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)

    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(
            np.float32) * mask_1D  # shapes here require the 1-D mask
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens

    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, seq_len, seq_len, batch_size),
                'F'))
        # batch_size x #dep x #head x #nclasses

    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        # Correct only when both arc and relation match.
        correct = rel_correct * dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens

    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss

    outputs = []
    # Decoding cannot be batched: sentences are parsed one by one.
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                       rel_probs):
        msk[0] = 1.  # position 0 really is ROOT; force it into the mask
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        # Index 0 is ROOT, so it is skipped in the output.
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))

    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
def __call__(self, inputs, masks, truth, is_train=True, is_tree=True):
    """Score arcs and relations with a multi-layer GNN and either return
    training losses or decoded (head, rel) predictions.

    :param inputs: list of per-token encodings; each is ((H,), B) — H hidden
        size, B batch size (shape assumed from `concatenate_cols` usage).
    :param masks: dict with '2D' (L x L pairwise mask) and 'flat'
        (length L*B token mask) entries.
    :param truth: dict with 'head', 'flat_head' and 'rel' gold indices.
    :param is_train: when True, apply dropout and return losses.
    :param is_tree: when True (inference), decode with MST; else greedy argmax.
    :return: (losses, losses_list) when training, else dict with
        'head' and 'rel' predictions.
    """
    sent_len = len(inputs)
    batch_size = inputs[0].dim()[1]
    flat_len = sent_len * batch_size
    # H -> hidden size, L -> sentence length, B -> batch size
    # ((H, L), B)
    X = dy.concatenate_cols(inputs)
    if is_train:
        X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
    # M_H -> MLP hidden size
    # ((M_H, L), B)
    # head_mat = leaky_relu(self.head_MLP(X, is_train))
    head_mat = self.head_MLP(X, is_train)
    # ((M_H, L), B)
    dept_mat = self.dept_MLP(X, is_train)
    if is_train:
        # count of real (unpadded) tokens, used to normalize losses
        total_token = sum(masks['flat'].tolist())
        head_mat = dy.dropout_dim(head_mat, 1, self.cfg.MLP_DROP)
        dept_mat = dy.dropout_dim(dept_mat, 1, self.cfg.MLP_DROP)
    # A_H -> Arc hidden size, R_H -> Label hidden size, A_H + R_H = M_H
    head_arc = head_mat[:self.arc_size]  # ((A_H, L), B)
    dept_arc = dept_mat[:self.arc_size]  # ((A_H, L), B)
    head_rel = head_mat[self.arc_size:]  # ((R_H, L), B)
    dept_rel = dept_mat[self.arc_size:]  # ((R_H, L), B)
    # ((L, L), B)
    masks_2D = dy.inputTensor(masks['2D'], True)
    # (1, L*B)
    masks_flat = dy.inputTensor(masks['flat'], True)
    gnn_losses = []
    for k in range(self.cfg.GRAPH_LAYERS):
        # Graph Weights: biaffine attention, padded pairs pushed to -inf
        # ((L, L), B)
        arc_mat = self.arc_attn_mat[k](head_arc, dept_arc) - 1e9 * (1 - masks_2D)
        arc_prob = dy.softmax(arc_mat)
        # Layer-wise Loss
        if is_train:
            # ((L,), L*B)
            arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
            # ((1,), L*B)
            arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
            gnn_losses.append(arc_loss)
        # Aggregation Function
        # Fusion head and dept representation
        # ((A_H, L), B)
        HX = head_arc * arc_prob
        DX = dept_arc * dy.transpose(arc_prob)
        FX = HX + DX
        # Async Update Function
        # Head-first: update heads, recompute fusion, then update dependents
        # ((A_H, L), B)
        head_arc = self.head_gnn(FX, head_arc)
        FX_new = head_arc * arc_prob + DX
        dept_arc = self.dept_gnn(FX_new, dept_arc)
    # Final arc scores from the last attention layer
    # ((L, L), B)
    arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc) - 1e9 * (1 - masks_2D)
    # ((L,), L*B)
    arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
    # Predict Relation
    # (R_H, L*B)
    head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
    # ((R_H,), L*B)
    dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)
    if is_train:
        # ((1,), L*B)
        arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
        # (1,)
        arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
        # score relations against the *gold* heads while training
        # ((R_H,), L*B)
        truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
        # R -> Relation Set Size
        # ((R,), L*B)
        rel_mat = self.rel_attn(dept_rel, truth_rel)
    else:
        if is_tree:
            # MST Inference, Achieve Tree Edge.
            arc_probs = dy.softmax(arc_mat).npvalue()
            arc_probs = np.reshape(arc_probs,
                                   (sent_len, sent_len, batch_size), 'F')
            arc_probs = np.transpose(arc_probs)
            # Mask PAD
            arc_masks = [
                np.array(masks['flat'][i:i + sent_len])
                for i in range(0, flat_len, sent_len)
            ]
            arc_pred = []
            # Inference One By One (decoding cannot be batched).
            for msk, arc_prob in zip(arc_masks, arc_probs):
                msk[0] = 1
                seq_len = int(np.sum(msk))
                tmp_pred = MST_inference(arc_prob, seq_len, msk)
                tmp_pred[0] = 0
                arc_pred.extend(tmp_pred)
        else:
            # Greedy Inference (argmax)
            arc_pred = np.argmax(arc_mat.npvalue(), 0)
        # Pick Predicted Edge's <Head, Dept> pair.
        flat_pred = [
            j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)
        ]
        pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
        # Predict Relation against the *predicted* heads
        rel_mat = self.rel_attn(dept_rel, pred_rel)
    # mask ROOT relation(s) out of the label distribution
    rel_mask = dy.inputTensor(self.rel_mask)
    rel_mat = rel_mat - 1e9 * rel_mask
    if is_train:
        # Calculate Relation Classification Loss
        # ((1,), L*B)
        rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
        # (1,)
        rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token
        # Final Total Loss with Layer-wise
        losses = (rel_loss + arc_loss) * self.cfg.LAMBDA2
        if gnn_losses:
            losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
        losses_list = gnn_losses + [arc_loss, rel_loss]
        return losses, losses_list
    else:
        rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue()
        rel_pred = np.argmax(rel_mat, 0)
        pred = {}
        pred['head'], pred['rel'] = arc_pred, rel_pred
        return pred
def __call__(self, inputs, masks, truth, iters, is_train=True, is_tree=True):
    """GNN arc/relation parser with warm-up scheduling.

    Fix: removed a leftover debug probe that computed
    ``is_tree_computed(arc_mat.npvalue())``, printed it, and then called
    ``exit(0)`` — unconditionally terminating the whole process before any
    loss or prediction could be returned. Also dropped dead commented-out
    code.

    :param inputs: either a list of ((H,), B) token encodings or a single
        ((L, H), B)-style expression (shape assumed from the transpose —
        TODO confirm against callers).
    :param masks: dict with '2D' (L x L) and 'flat' (L*B) masks.
    :param truth: dict with 'head', 'flat_head', 'rel' gold indices.
    :param iters: current iteration, compared to ``self.warm_list`` to gate
        loss terms on/off.
    :param is_train: apply dropout and return losses when True.
    :param is_tree: use MST decoding at inference; else greedy argmax.
    :return: (losses, losses_list) when training, else {'head': ..., 'rel': ...}.
    """
    # Accept both a list of per-token expressions and a packed tensor.
    if type(inputs) == list:
        sent_len = len(inputs)
        batch_size = inputs[0].dim()[1]
        X = dy.concatenate_cols(inputs)
    else:
        sent_len = inputs.dim()[0][0]
        batch_size = inputs.dim()[1]
        X = dy.transpose(inputs, [1, 0])
    flat_len = sent_len * batch_size
    # H -> hidden size, L -> sentence length, B -> batch size
    # ((H, L), B)
    if is_train:
        X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
    # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
    # ((A_H, L), B)
    head_arc = self.head_arc_MLP(X, is_train)
    dept_arc = self.dept_arc_MLP(X, is_train)
    # ((R_H, L), B)
    head_rel = self.head_rel_MLP(X, is_train)
    dept_rel = self.dept_rel_MLP(X, is_train)
    if is_train:
        # number of real tokens, normalizes every loss term
        total_token = sum(masks['flat'].tolist())
        head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
        head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
        dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
        dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)
    # ((L, L), B): additive -inf mask for padded pairs
    masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True))
    # (1, L*B)
    masks_flat = dy.inputTensor(masks['flat'], True)
    gnn_losses = []
    # scaled-dot-product style normalization of the biaffine scores
    arc_norm = math.sqrt(self.arc_size)
    rel_norm = math.sqrt(self.rel_size)
    for k in range(self.cfg.GRAPH_LAYERS):
        # Graph Weights
        # ((L, L), B)
        arc_mat = self.arc_attn_mat[k](head_arc, dept_arc) / arc_norm - masks_2D
        arc_prob = dy.softmax(arc_mat)
        # Layer-wise Loss
        if is_train:
            arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
            # ((L,), L*B)
            arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)
            # ((1,), L*B)
            arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
            gnn_losses.append(arc_loss)
        # Aggregation Function: fuse head and dept representations
        # ((A_H, L), B)
        HX = head_arc * arc_prob
        DX = dept_arc * dy.transpose(arc_prob)
        FX = HX + DX
        # Async Update Function, head-first
        # ((A_H, L), B)
        head_arc = self.head_gnn(FX, head_arc)
        FX_new = head_arc * arc_prob + DX
        dept_arc = self.dept_gnn(FX_new, dept_arc)
        # Relation Aggregation Function: sync update with residual
        # ((R_H, L), B)
        HR = head_rel * arc_prob
        DR = dept_rel * dy.transpose(arc_prob)
        FX = HR + DR
        head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
        dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel
    # ((L, L), B): final arc scores
    arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc) / arc_norm - masks_2D
    # (debug probe removed here: is_tree_computed(...) / print / exit(0))
    # ((L,), L*B)
    arc_mat = dy.reshape(arc_mat, (sent_len,), flat_len)
    # Predict Relation
    # (R_H, L*B)
    head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
    # ((R_H,), L*B)
    dept_rel = dy.reshape(dept_rel, (self.rel_size,), flat_len)
    if is_train:
        # ((1,), L*B)
        arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
        # (1,)
        arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
        # relations are scored against the gold heads during training
        # ((R_H,), L*B)
        truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
        # R -> Relation Set Size
        # ((R,), L*B)
        rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
        rel_mat = self.rel_attn(dept_rel, truth_rel) / rel_norm - rel_mask
        # Calculate Relation Classification Loss
        # ((1,), L*B)
        rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
        # (1,)
        rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token
        # Final Total Loss with Layer-wise warm-up gating
        warm = [int(iters >= x) for x in self.warm_list]
        losses = rel_loss * self.cfg.LAMBDA2 * warm[-1] + arc_loss * self.cfg.LAMBDA2 * warm[-1]
        if gnn_losses:
            for i in range(self.cfg.GRAPH_LAYERS):
                gnn_losses[i] *= warm[i]
            losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
        losses_list = gnn_losses + [arc_loss, rel_loss]
        return losses, losses_list
    else:
        if is_tree:
            # MST Inference, Achieve Tree Edge.
            arc_probs = dy.softmax(arc_mat).npvalue()
            arc_probs = np.reshape(arc_probs,
                                   (sent_len, sent_len, batch_size), 'F')
            arc_probs = np.transpose(arc_probs)
            # Mask PAD
            arc_masks = [np.array(masks['flat'][i:i + sent_len])
                         for i in range(0, flat_len, sent_len)]
            arc_pred = []
            # Inference One By One.
            for msk, arc_prob in zip(arc_masks, arc_probs):
                msk[0] = 1
                seq_len = int(np.sum(msk))
                tmp_pred = MST_inference(arc_prob, seq_len, msk)
                tmp_pred[0] = 0
                arc_pred.extend(tmp_pred)
        else:
            # Greedy Inference (argmax)
            arc_pred = np.argmax(arc_mat.npvalue(), 0)
        # Pick Predicted Edge's <Head, Dept> pair.
        flat_pred = [j + (i // sent_len) * sent_len
                     for i, j in enumerate(arc_pred)]
        pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
        # Predict Relation (mask ROOT)
        rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
        rel_mat = self.rel_attn(dept_rel, pred_rel) / rel_norm - rel_mask
        rel_mat = dy.reshape(rel_mat, (self.rel_num,)).npvalue()
        rel_pred = np.argmax(rel_mat, 0)
        pred = {}
        pred['head'], pred['rel'] = arc_pred, rel_pred
        return pred
def single_training_call(self, inputs, masks, truth, iters, is_train=True, is_tree=True):
    """Run the GNN encoder stack only, returning the raw score matrices and
    bookkeeping values so a caller can compute losses/predictions itself.

    Mirrors the main ``__call__`` up to (and including) the final reshape of
    arc/relation scores, but performs no loss computation for the final layer
    and no decoding.

    :param inputs: list of ((H,), B) token encodings or a packed tensor
        (shape assumed from the transpose branch — TODO confirm).
    :param masks: dict with '2D' and 'flat' mask arrays.
    :param truth: gold indices dict (only 'head' is used, for layer losses).
    :param iters: unused here; kept for signature parity with ``__call__``.
    :param is_train: apply dropout and accumulate layer-wise losses.
    :param is_tree: unused here; kept for signature parity with ``__call__``.
    :return: (arc_mat, head_rel, dept_rel, masks_flat, total_token,
        gnn_losses, sent_len, batch_size, arc_norm, rel_norm, flat_len).
        ``total_token`` is None when not training.
    """
    # Accept both a list of per-token expressions and a packed tensor.
    if type(inputs) == list:
        sent_len = len(inputs)
        batch_size = inputs[0].dim()[1]
        X = dy.concatenate_cols(inputs)
    else:
        sent_len = inputs.dim()[0][0]
        batch_size = inputs.dim()[1]
        X = dy.transpose(inputs, [1, 0])
    flat_len = sent_len * batch_size
    # H -> hidden size, L -> sentence length, B -> batch size
    # ((H, L), B)
    #X = dy.concatenate_cols(inputs)
    if is_train:
        X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
    # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
    # ((A_H, L), B)
    head_arc = self.head_arc_MLP(X, is_train)
    dept_arc = self.dept_arc_MLP(X, is_train)
    # ((R_H, L), B)
    head_rel = self.head_rel_MLP(X, is_train)
    dept_rel = self.dept_rel_MLP(X, is_train)
    if is_train:
        # number of real tokens, used to normalize layer losses
        total_token = sum(masks['flat'].tolist())
        head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
        head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
        dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
        dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)
    else:
        # added by me: keep the return tuple well-defined at inference
        total_token = None
    # ((L, L), B): additive -inf mask for padded pairs
    masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True))
    # (1, L*B)
    masks_flat = dy.inputTensor(masks['flat'], True)
    gnn_losses = []
    # scaled-dot-product style normalizers for the biaffine scores
    arc_norm = math.sqrt(self.arc_size)
    rel_norm = math.sqrt(self.rel_size)
    for k in range(self.cfg.GRAPH_LAYERS):
        # Graph Weights
        # ((L, L), B)
        arc_mat = self.arc_attn_mat[k](head_arc, dept_arc) / arc_norm - masks_2D
        arc_prob = dy.softmax(arc_mat)
        # Layer-wise Loss
        if is_train:
            arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
            # ((L,), L*B)
            arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
            # ((1,), L*B)
            arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
            gnn_losses.append(arc_loss)
        # Aggregation Function: fuse head and dept representations
        # ((A_H, L), B)
        HX = head_arc * arc_prob
        DX = dept_arc * dy.transpose(arc_prob)
        FX = HX + DX
        # Async Update Function, head-first
        # ((A_H, L), B)
        head_arc = self.head_gnn(FX, head_arc)
        FX_new = head_arc * arc_prob + DX
        dept_arc = self.dept_gnn(FX_new, dept_arc)
        # Relation Aggregation Function: sync update with residual
        # ((R_H, L), B)
        HR = head_rel * arc_prob
        DR = dept_rel * dy.transpose(arc_prob)
        FX = HR + DR
        head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
        dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel
    # ((L, L), B): final arc scores
    arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc) / arc_norm - masks_2D
    # ((L,), L*B)
    arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
    # Predict Relation
    # (R_H, L*B)
    head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
    # ((R_H,), L*B)
    dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)
    return arc_mat, head_rel, dept_rel, masks_flat, total_token, gnn_losses, sent_len, batch_size, arc_norm, rel_norm, flat_len
def cal_scores(self, src_encodings, predict=False):
    """Compute biaffine arc scores and per-label scores for a sentence.

    :param src_encodings: list of per-token encoder outputs; concatenated to
        (src_ctx_dim, src_len, batch_size).
    :param predict: when True, skip all dropout (inference mode).
    :return: (s_arc, s_label) — s_arc is the (head x dep) arc score matrix,
        s_label is a list with one (head x dep) score matrix per label.
    """
    src_len = len(src_encodings)
    src_encodings = dy.concatenate_cols(
        src_encodings)  # src_ctx_dim, src_len, batch_size
    batch_size = src_encodings.dim()[1]
    # Load MLP parameters for the four projections:
    # arc-head, arc-dep, label-head, label-dep.
    W_arc_hidden_to_head = dy.parameter(self.W_arc_hidden_to_head)
    b_arc_hidden_to_head = dy.parameter(self.b_arc_hidden_to_head)
    W_arc_hidden_to_dep = dy.parameter(self.W_arc_hidden_to_dep)
    b_arc_hidden_to_dep = dy.parameter(self.b_arc_hidden_to_dep)
    W_label_hidden_to_head = dy.parameter(self.W_label_hidden_to_head)
    b_label_hidden_to_head = dy.parameter(self.b_label_hidden_to_head)
    W_label_hidden_to_dep = dy.parameter(self.W_label_hidden_to_dep)
    b_label_hidden_to_dep = dy.parameter(self.b_label_hidden_to_dep)
    # Biaffine parameters: one (U, u_1, u_2, b) set per label.
    U_arc_1 = dy.parameter(self.U_arc_1)
    u_arc_2 = dy.parameter(self.u_arc_2)
    U_label_1 = [dy.parameter(x) for x in self.U_label_1]
    u_label_2_1 = [dy.parameter(x) for x in self.u_label_2_1]
    u_label_2_2 = [dy.parameter(x) for x in self.u_label_2_2]
    b_label = [dy.parameter(x) for x in self.b_label]
    if predict:
        # Inference: plain MLP projections, no dropout.
        h_arc_head = self.leaky_ReLu(
            dy.affine_transform([
                b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings
            ]))  # n_arc_ml_units, src_len, bs
        h_arc_dep = self.leaky_ReLu(
            dy.affine_transform(
                [b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings]))
        h_label_head = self.leaky_ReLu(
            dy.affine_transform([
                b_label_hidden_to_head, W_label_hidden_to_head, src_encodings
            ]))
        h_label_dep = self.leaky_ReLu(
            dy.affine_transform([
                b_label_hidden_to_dep, W_label_hidden_to_dep, src_encodings
            ]))
    else:
        # Training: dropout whole time-steps (dim 1) on input and each
        # projection; arc and label MLPs use their own dropout rates.
        src_encodings = dy.dropout_dim(src_encodings, 1,
                                       self.arc_mlp_dropout)
        h_arc_head = dy.dropout_dim(
            self.leaky_ReLu(
                dy.affine_transform([
                    b_arc_hidden_to_head, W_arc_hidden_to_head, src_encodings
                ])), 1, self.arc_mlp_dropout)  # n_arc_ml_units, src_len, bs
        h_arc_dep = dy.dropout_dim(
            self.leaky_ReLu(
                dy.affine_transform([
                    b_arc_hidden_to_dep, W_arc_hidden_to_dep, src_encodings
                ])), 1, self.arc_mlp_dropout)
        h_label_head = dy.dropout_dim(
            self.leaky_ReLu(
                dy.affine_transform([
                    b_label_hidden_to_head, W_label_hidden_to_head,
                    src_encodings
                ])), 1, self.label_mlp_dropout)
        h_label_dep = dy.dropout_dim(
            self.leaky_ReLu(
                dy.affine_transform([
                    b_label_hidden_to_dep, W_label_hidden_to_dep,
                    src_encodings
                ])), 1, self.label_mlp_dropout)
    h_arc_head_transpose = dy.transpose(h_arc_head)
    h_label_head_transpose = dy.transpose(h_label_head)
    # Biaffine arc score: head^T (U dep + u)
    s_arc = h_arc_head_transpose * dy.colwise_add(U_arc_1 * h_arc_dep,
                                                  u_arc_2)
    s_label = []
    for U_1, u_2_1, u_2_2, b in zip(U_label_1, u_label_2_1, u_label_2_2,
                                    b_label):
        # e1: bilinear term; e2/e3: head-only and dep-only linear terms
        # broadcast across the sentence; b: scalar bias.
        e1 = h_label_head_transpose * U_1 * h_label_dep
        e2 = h_label_head_transpose * u_2_1 * dy.ones((1, src_len))
        e3 = dy.ones((src_len, 1)) * u_2_2 * h_label_dep
        s_label.append(e1 + e2 + e3 + b)
    return s_arc, s_label
def run(self, char_vocab, cased_word_inputs, word_inputs, tag_inputs,
        arc_targets=None, rel_targets=None, is_train=True):
    """
    Train or test a biaffine parser with an optional char-LSTM subword model.

    :param char_vocab: iterable of character-id sequences, one per word type
    :param cased_word_inputs: seq_len x batch_size
    :param word_inputs: seq_len x batch_size
    :param tag_inputs: seq_len x batch_size
    :param arc_targets: seq_len x batch_size
    :param rel_targets: seq_len x batch_size
    :param is_train: is training or test
    :return: (arc_acc%, rel_acc%, overall_acc%, loss) when training;
        (arc_acc%, rel_acc%, overall_acc%, outputs) when evaluating with
        targets; otherwise just the decoded outputs.
    """

    def flatten_numpy(ndarray):
        """
        Flatten nd-array to 1-d column vector (Fortran order, so columns —
        i.e. sentences — stay contiguous).
        :param ndarray:
        :return:
        """
        return np.reshape(ndarray, (-1, ), 'F')

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    mask = np.greater(word_inputs, self._vocab.ROOT).astype(np.float32)
    num_tokens = int(np.sum(mask))  # non padding, non root token number
    if is_train or arc_targets is not None:
        mask_1D = flatten_numpy(mask)
        # if batched=True, the last dimension is used as a batch dimension
        # if arr is a list of numpy ndarrays
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
    if self.char_lstm:
        # Subword model: char-LSTM + attention pooling per word type
        char_w = dy.parameter(self.char_w)

        def LSTM_attention(lstm, inputs, dropout_x=0., dropout_h=0.):
            # Attention over all hidden states, concatenated with the
            # final cell state.
            ss = LSTM(lstm, inputs, None, dropout_x, dropout_h)
            hs = [s.h()[0] for s in ss]
            return dy.concatenate([attention(hs, char_w), ss[-1].s()[0]])

        subword_embs = []
        for char_ids in char_vocab:
            char_inputs = [
                dy.lookup(self.char_embs, char) for char in char_ids
            ]
            subword_embs.append(
                LSTM_attention(
                    self.char_lstm, char_inputs,
                    self.dropout_lstm_input if is_train else 0.,
                    self.dropout_lstm_hidden if is_train else 0.))
        subword_embs = dy.concatenate_cols(subword_embs)
        # NOTE(review): the conditional below binds as
        # (lookup + subword + 0) if pret is None else pret-lookup,
        # i.e. with pretrained embeddings the trainable/subword parts are
        # dropped entirely — looks suspicious, confirm intent.
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
            + subword_embs *
            dy.inputTensor(one_hot(cw, len(char_vocab)).T, batched=True) + 0
            if self.pret_word_embs is None else dy.lookup_batch(
                self.pret_word_embs, w, update=False)
            for cw, w in zip(cased_word_inputs, word_inputs)
        ]
    else:
        # same conditional-expression binding caveat as above
        word_embs = [
            dy.lookup_batch(
                self.word_embs,
                np.where(w < self._vocab.words_in_train, w,
                         self._vocab.UNK)) + 0
            if self.pret_word_embs is None else dy.lookup_batch(
                self.pret_word_embs, w, update=False) for w in word_inputs
        ]
    tag_embs = [dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs]
    # Dropout: word/tag embeddings are masked jointly during training
    if is_train:
        emb_masks = self.generate_emb_mask(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos])
            for w, pos in zip(word_embs, tag_embs)
        ]
    # seq_len x batch_size
    top_recur = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if is_train else 0.,
               self.dropout_lstm_hidden if is_train else 0.))
    if is_train:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)
    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
        self.mlp_dep_b)
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
        self.mlp_head_b)
    dep, head = leaky_relu(dy.affine_transform([
        b_dep, W_dep, top_recur
    ])), leaky_relu(dy.affine_transform([b_head, W_head, top_recur]))
    if is_train:
        dep, head = dy.dropout_dim(dep, 1,
                                   self.dropout_mlp), dy.dropout_dim(
                                       head, 1, self.dropout_mlp)
    # Split the MLP output into arc and relation halves.
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]
    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
                          seq_len, batch_size, num_outputs=1, bias_x=True,
                          bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(arc_logits, (seq_len, ),
                                 seq_len * batch_size)
    # (#head ) x (#dep x batch_size)
    arc_preds = arc_logits.npvalue().argmax(0)
    if len(arc_preds.shape) == 1:
        # dynet did unnecessary jobs: npvalue() drops a batch dim of 1
        arc_preds = np.expand_dims(arc_preds, axis=1)
    # seq_len x batch_size
    if is_train or arc_targets is not None:
        arc_correct = np.equal(arc_preds, arc_targets).astype(
            np.float32) * mask
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
    if not is_train:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size),
                'F'))  # #batch_size x #dep x #head
    W_rel = dy.parameter(self.rel_W)
    # dep_rel = dy.concatenate([dep_rel, dy.inputTensor(np.ones((1, seq_len),dtype=np.float32))])
    # head_rel = dy.concatenate([head_rel, dy.inputTensor(np.ones((1, seq_len), dtype=np.float32))])
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
                          seq_len, batch_size,
                          num_outputs=self._vocab.rel_size, bias_x=True,
                          bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits,
                                 (seq_len, self._vocab.rel_size),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)
    # Pick relation scores at the gold heads (training) or predicted heads.
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if is_train else flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)
    if is_train or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(
            np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
    if not is_train:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (self._vocab.rel_size, seq_len, seq_len, batch_size),
                'F'))  # batch_size x #dep x #head x #nclasses
    if is_train or arc_targets is not None:
        loss = arc_loss + rel_loss
        # a token counts as fully correct only if both arc and rel match
        correct = rel_correct * flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens
    if is_train:
        return arc_accuracy * 100., rel_accuracy * 100., overall_accuracy * 100., loss
    outputs = []
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                       rel_probs):
        # parse sentences one by one (tree decoding cannot be batched)
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(rel_prob, sent_len)
        # index 0 is ROOT, so drop it from the output
        outputs.append((arc_pred[1:sent_len], rel_pred[1:sent_len]))
    if arc_targets is not None:
        return arc_accuracy * 100., rel_accuracy * 100., overall_accuracy * 100., outputs
    return outputs
def __call__(self, inputs, masks, truth, iters, is_train=True, is_tree=True):
    """GNN arc/relation parser with warm-up scheduling.

    Fixes over the previous revision:
    - removed all debug ``print`` statements and the ``input("pause")``
      call that blocked training waiting on stdin;
    - ``gnn_losses`` now stores the DyNet loss *Expression* instead of
      ``arc_loss.value()`` (a Python float): the later
      ``dy.esum(gnn_losses)`` requires Expressions, and storing floats also
      cut the layer-wise losses out of the gradient. This matches the
      sibling implementations in this file.

    :param inputs: list of ((H,), B) per-token encodings.
    :param masks: dict with '2D' (L x L) and 'flat' (L*B) masks.
    :param truth: dict with 'head', 'flat_head', 'rel' gold indices.
    :param iters: current iteration, gated against ``self.warm_list``.
    :param is_train: apply dropout and return losses when True.
    :param is_tree: use MST decoding at inference; else greedy argmax.
    :return: (losses, losses_list) when training, else {'head': ..., 'rel': ...}.
    """
    sent_len = len(inputs)
    batch_size = inputs[0].dim()[1]
    flat_len = sent_len * batch_size
    # H -> hidden size, L -> sentence length, B -> batch size
    # ((H, L), B)
    X = dy.concatenate_cols(inputs)
    if is_train:
        X = dy.dropout_dim(X, 1, self.cfg.MLP_DROP)
    # A_H -> ARC MLP hidden size, R_H -> REL MLP hidden size
    # ((A_H, L), B)
    head_arc = self.head_arc_MLP(X, is_train)
    dept_arc = self.dept_arc_MLP(X, is_train)
    # ((R_H, L), B)
    head_rel = self.head_rel_MLP(X, is_train)
    dept_rel = self.dept_rel_MLP(X, is_train)
    if is_train:
        # number of real tokens, normalizes every loss term
        total_token = sum(masks['flat'].tolist())
        head_arc = dy.dropout_dim(head_arc, 1, self.cfg.MLP_DROP)
        head_rel = dy.dropout_dim(head_rel, 1, self.cfg.MLP_DROP)
        dept_arc = dy.dropout_dim(dept_arc, 1, self.cfg.MLP_DROP)
        dept_rel = dy.dropout_dim(dept_rel, 1, self.cfg.MLP_DROP)
    # ((L, L), B): additive -inf mask for padded pairs
    masks_2D = 1e9 * (1 - dy.inputTensor(masks['2D'], True))
    # (1, L*B)
    masks_flat = dy.inputTensor(masks['flat'], True)
    gnn_losses = []
    # scaled-dot-product style normalizers for the biaffine scores
    arc_norm = math.sqrt(self.arc_size)
    rel_norm = math.sqrt(self.rel_size)
    for k in range(self.cfg.GRAPH_LAYERS):
        # Graph Weights
        # ((L, L), B)
        arc_mat = self.arc_attn_mat[k](head_arc, dept_arc) / arc_norm - masks_2D
        arc_prob = dy.softmax(arc_mat)
        # Layer-wise Loss
        if is_train:
            arc_prob = dy.dropout(arc_prob, self.cfg.ARC_DROP)
            # ((L,), L*B)
            arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
            # ((1,), L*B)
            arc_loss = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
            # (1,)
            arc_loss = dy.sum_batches(arc_loss * masks_flat) / total_token
            # keep the Expression (was arc_loss.value()) so dy.esum works
            # and gradients flow through the layer-wise losses
            gnn_losses.append(arc_loss)
        # Aggregation Function: fuse head and dept representations
        # ((A_H, L), B)
        HX = head_arc * arc_prob
        DX = dept_arc * dy.transpose(arc_prob)
        FX = HX + DX
        # Async Update Function, head-first
        # ((A_H, L), B)
        head_arc = self.head_gnn(FX, head_arc)
        FX_new = head_arc * arc_prob + DX
        dept_arc = self.dept_gnn(FX_new, dept_arc)
        # Relation Aggregation Function: sync update with residual
        # ((R_H, L), B)
        HR = head_rel * arc_prob
        DR = dept_rel * dy.transpose(arc_prob)
        FX = HR + DR
        head_rel = self.head_rel_gnn(FX, head_rel) + head_rel
        dept_rel = self.dept_rel_gnn(FX, dept_rel) + dept_rel
    # ((L, L), B): final arc scores
    arc_mat = self.arc_attn_mat[-1](head_arc, dept_arc) / arc_norm - masks_2D
    # ((L,), L*B)
    arc_mat = dy.reshape(arc_mat, (sent_len, ), flat_len)
    # Predict Relation
    # (R_H, L*B)
    head_rel = dy.reshape(head_rel, (self.rel_size, flat_len))
    # ((R_H,), L*B)
    dept_rel = dy.reshape(dept_rel, (self.rel_size, ), flat_len)
    if is_train:
        # ((1,), L*B)
        arc_losses = dy.pickneglogsoftmax_batch(arc_mat, truth['head'])
        # (1,)
        arc_loss = dy.sum_batches(arc_losses * masks_flat) / total_token
        # relations are scored against the gold heads during training
        # ((R_H,), L*B)
        truth_rel = dy.pick_batch(head_rel, truth['flat_head'], 1)
        # R -> Relation Set Size
        # ((R,), L*B)
        rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
        rel_mat = self.rel_attn(dept_rel, truth_rel) / rel_norm - rel_mask
        # Calculate Relation Classification Loss
        # ((1,), L*B)
        rel_losses = dy.pickneglogsoftmax_batch(rel_mat, truth['rel'])
        # (1,)
        rel_loss = dy.sum_batches(rel_losses * masks_flat) / total_token
        # Final Total Loss with Layer-wise warm-up gating
        warm = [int(iters >= x) for x in self.warm_list]
        losses = rel_loss * self.cfg.LAMBDA2 * \
            warm[-1] + arc_loss * self.cfg.LAMBDA2 * warm[-1]
        if gnn_losses:
            for i in range(self.cfg.GRAPH_LAYERS):
                gnn_losses[i] *= warm[i]
            losses += dy.esum(gnn_losses) * self.cfg.LAMBDA1
        losses_list = gnn_losses + [arc_loss, rel_loss]
        return losses, losses_list
    else:
        if is_tree:
            # MST Inference, Achieve Tree Edge.
            arc_probs = dy.softmax(arc_mat).npvalue()
            arc_probs = np.reshape(arc_probs,
                                   (sent_len, sent_len, batch_size), 'F')
            arc_probs = np.transpose(arc_probs)
            # Mask PAD
            arc_masks = [
                np.array(masks['flat'][i:i + sent_len])
                for i in range(0, flat_len, sent_len)
            ]
            arc_pred = []
            # Inference One By One.
            for msk, arc_prob in zip(arc_masks, arc_probs):
                msk[0] = 1
                seq_len = int(np.sum(msk))
                tmp_pred = MST_inference(arc_prob, seq_len, msk)
                tmp_pred[0] = 0
                arc_pred.extend(tmp_pred)
        else:
            # Greedy Inference (argmax)
            arc_pred = np.argmax(arc_mat.npvalue(), 0)
        # Pick Predicted Edge's <Head, Dept> pair.
        flat_pred = [
            j + (i // sent_len) * sent_len for i, j in enumerate(arc_pred)
        ]
        pred_rel = dy.pick_batch(head_rel, flat_pred, 1)
        # Predict Relation (mask ROOT)
        rel_mask = 1e9 * dy.inputTensor(self.rel_mask)
        rel_mat = self.rel_attn(dept_rel, pred_rel) / rel_norm - rel_mask
        rel_mat = dy.reshape(rel_mat, (self.rel_num, )).npvalue()
        rel_pred = np.argmax(rel_mat, 0)
        pred = {}
        pred['head'], pred['rel'] = arc_pred, rel_pred
        return pred
def run(self, word_inputs, lengths, tag_inputs, arc_targets=None,
        rel_targets=None, isTrain=True):
    """Train or evaluate the biaffine parser on one batch.

    :param word_inputs: seq_len x batch_size word-id matrix.
    :param lengths: per-sentence lengths used to build the padding mask.
    :param tag_inputs: seq_len x batch_size POS-tag ids.
    :param arc_targets: seq_len x batch_size gold heads (optional).
    :param rel_targets: seq_len x batch_size gold relations (optional).
    :param isTrain: training mode flag (enables dropout, returns loss).
    :return: (arc_acc, rel_acc, overall_acc, loss) when training;
        (arc_acc, rel_acc, overall_acc, outputs) when evaluating with
        targets; otherwise just the decoded outputs.
    """
    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    # position < length -> 1.0; position 0 (ROOT) excluded from counts
    mask = (np.broadcast_to(
        np.reshape(np.arange(seq_len), (seq_len, 1)),
        (seq_len, batch_size)) < lengths).astype(np.float32)
    mask[0] = 0.
    num_tokens = int(np.sum(mask))
    if isTrain or arc_targets is not None:
        mask_1D = self.dynet_flatten_numpy(mask)
        # batched here means that the last dim is treated as batch
        # dimension, both in input and output
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
    # TODO: mind the words-in-train cutoff (v_train)
    # sum of the two embeddings, [Expression of dim=((embedding_dim,),
    # batch_size)] * seq_len
    if self.e_ext is not None:
        word_embs = [
            dy.lookup_batch(
                self.e_form,
                np.where(w < self.v_train, w,
                         self.vocab_form.stoi["<unk>"])) +
            dy.lookup_batch(self.e_ext, w, update=False)
            for w in word_inputs
        ]
    # trainable embedding only, [Expression] * seq_len
    else:
        word_embs = [
            dy.lookup_batch(
                self.e_form,
                np.where(w < self.v_train, w,
                         self.vocab_form.stoi["<unk>"]))
            for w in word_inputs
        ]
    tag_embs = [dy.lookup_batch(self.e_tag, pos) for pos in tag_inputs]
    if isTrain:
        # joint word/tag embedding dropout masks
        emb_masks = self.generate_emb_msk(seq_len, batch_size)
        emb_inputs = [
            dy.concatenate([dy.cmult(w, wm), dy.cmult(pos, posm)])
            for w, pos, (wm, posm) in zip(word_embs, tag_embs, emb_masks)
        ]
    else:
        emb_inputs = [
            dy.concatenate([w, pos])
            for w, pos in zip(word_embs, tag_embs)
        ]
    top_recur = dy.concatenate_cols(
        biLSTM(self.lstm_builders, emb_inputs, batch_size,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        # drop some dim for lstm_output for all words, all sentences
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)
    dep = leaky_relu(
        dy.affine_transform([self.mlp_dep_b, self.mlp_dep_W, top_recur]))
    head = leaky_relu(
        dy.affine_transform([self.mlp_head_b, self.mlp_head_W, top_recur]))
    if isTrain:
        dep, head = dy.dropout_dim(dep, 1,
                                   self.dropout_mlp), dy.dropout_dim(
                                       head, 1, self.dropout_mlp)
        # dropping dim k means the whole dim k may be zeroed out
        # for matrix with batch, ((R, C), B)
        # drop dim 0 means drop some cols, drop dim 1 means drop some rows
        # drop 2 means drop some batches, and it only supports Tensors
        # with rank <= 3
    # split MLP output into arc and relation halves
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]
    arc_logits = bilinear(dep_arc, self.arc_W, head_arc, self.mlp_arc_size,
                          seq_len, batch_size, num_outputs=1, bias_x=True,
                          bias_y=False)
    # (#head x #dep) x batch_size
    flat_arc_logits = dy.reshape(
        arc_logits, (seq_len, ),
        seq_len * batch_size)  # flatten it to compute loss
    # (#head ) x (#dep x batch_size)
    arc_preds = np.reshape(arc_logits.npvalue().argmax(0),
                           (seq_len, batch_size))
    # seq_len x batch_size
    # here if an Expression's batch size is 1
    # npvalue() will drop the batch dimension
    # so add it back if needed
    if isTrain or arc_targets is not None:
        # train in a neg log likelihood fashion, but enforce the tree
        # constraint when testing
        arc_correct = np.equal(arc_preds, arc_targets).astype(
            np.float32) * mask
        # mask is used to filter <pad>'s out in summing loss
        arc_accuracy = np.sum(arc_correct) / num_tokens
        targets_1D = self.dynet_flatten_numpy(arc_targets)
        losses = dy.pickneglogsoftmax_batch(flat_arc_logits, targets_1D)
        arc_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
    if not isTrain:
        arc_probs = np.transpose(
            np.reshape(
                dy.softmax(flat_arc_logits).npvalue(),
                (seq_len, seq_len, batch_size), 'F'))
        # #batch_size x #dep x #head, transpose reverses all dims, and
        # since the layout has changed, it's totally fine
    rel_logits = bilinear(dep_rel, self.rel_W, head_rel, self.mlp_rel_size,
                          seq_len, batch_size,
                          num_outputs=len(self.vocab_deprel), bias_x=True,
                          bias_y=True)
    # (#head x rel_size x #dep) x batch_size
    flat_rel_logits = dy.reshape(rel_logits,
                                 (seq_len, len(self.vocab_deprel)),
                                 seq_len * batch_size)
    # (#head x rel_size) x (#dep x batch_size)
    # pick relation scores at gold heads (training) or predicted heads
    partial_rel_logits = dy.pick_batch(
        flat_rel_logits,
        targets_1D if isTrain else self.dynet_flatten_numpy(arc_preds))
    # (rel_size) x (#dep x batch_size)
    if isTrain or arc_targets is not None:
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = self.dynet_flatten_numpy(rel_targets)
        # flattened shapes here require the 1-D mask
        rel_correct = np.equal(rel_preds, targets_1D).astype(
            np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
    if not isTrain:
        rel_probs = np.transpose(
            np.reshape(
                dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
                (len(self.vocab_deprel), seq_len, seq_len, batch_size),
                'F'))
        # batch_size x #dep x #head x #nclasses
    if isTrain or arc_targets is not None:
        loss = arc_loss + rel_loss
        # fully correct token = correct arc AND correct relation
        correct = rel_correct * self.dynet_flatten_numpy(arc_correct)
        overall_accuracy = np.sum(correct) / num_tokens
    if isTrain:
        return arc_accuracy, rel_accuracy, overall_accuracy, loss
    outputs = []
    for msk, arc_prob, rel_prob in zip(np.transpose(mask), arc_probs,
                                       rel_probs):
        # parse sentences one by one (tree decoding cannot be batched)
        msk[0] = 1.
        sent_len = int(np.sum(msk))
        arc_pred = arc_argmax(arc_prob, sent_len, msk)
        rel_prob = rel_prob[np.arange(len(arc_pred)), arc_pred]
        rel_pred = rel_argmax(
            rel_prob, sent_len, self.vocab_deprel,
            "root" if "root" in self.vocab_deprel.stoi else "ROOT")
        outputs.append(
            (arc_pred[1:sent_len], rel_pred[1:sent_len]))  # w_0 is <root>
    assert (len(outputs) == batch_size)
    if arc_targets is not None:
        return arc_accuracy, rel_accuracy, overall_accuracy, outputs
    return outputs
def forward(self, words, extwords, tags, isTrain):
    """Compute biaffine arc and relation logits for one (unbatched) sentence.

    words / extwords / tags are parallel sequences of vocabulary ids of
    length seq_len; extwords index the frozen pretrained embedding table.
    isTrain toggles every dropout site.  Returns (arc_logits, rel_logits).
    """
    seq_len = len(words)
    # Trainable word embedding plus frozen pretrained embedding, summed
    # element-wise per token.
    dynamic_embs = [dy.lookup(self.word_embs, w) for w in words]
    static_embs = [
        dy.lookup(self.pret_word_embs, w, update=False) for w in extwords
    ]
    word_embs = [
        dynamic_emb + static_emb
        for dynamic_emb, static_emb in zip(dynamic_embs, static_embs)
    ]
    tag_embs = [dy.lookup(self.tag_embs, pos) for pos in tags]
    if isTrain:
        # Independent Bernoulli keep-masks per token for the word and tag
        # channels (token-level embedding dropout).
        word_masks = np.random.binomial(1, 1. - self.dropout_emb,
                                        seq_len).astype(np.float32)
        tag_masks = np.random.binomial(1, 1. - self.dropout_emb,
                                       seq_len).astype(np.float32)
        # Rescale surviving channels so the expected total magnitude is
        # preserved (word channel weighted 2x in the normalizer); the
        # epsilon avoids division by zero when both channels are dropped.
        scale = 3. / (2. * word_masks + tag_masks + 1e-12)
        word_masks *= scale
        tag_masks *= scale
        word_embs = [
            dy.cmult(word_emb, dy.inputVector([word_mask]))
            for word_emb, word_mask in zip(word_embs, word_masks)
        ]
        tag_embs = [
            dy.cmult(tag_emb, dy.inputVector([tag_mask]))
            for tag_emb, tag_mask in zip(tag_embs, tag_masks)
        ]
    emb_inputs = [
        dy.concatenate([word_emb, pos_emb])
        for word_emb, pos_emb in zip(word_embs, tag_embs)
    ]
    # BiLSTM over the sequence; result is (2 * lstm_hiddens) x seq_len.
    bilstm_out = dy.concatenate_cols(
        biLSTM(self.LSTM_builders, emb_inputs,
               self.dropout_lstm_input if isTrain else 0.,
               self.dropout_lstm_hidden if isTrain else 0.))
    if isTrain:
        # dropout_dim on dim 1 drops whole rows, i.e. the same hidden units
        # are zeroed at every timestep.
        bilstm_out = dy.dropout_dim(bilstm_out, 1, self.dropout_mlp)
    # Dep-side and head-side MLPs: (mlp_arc_size + mlp_rel_size) x seq_len.
    W_dep, b_dep = dy.parameter(self.mlp_dep_W), dy.parameter(
        self.mlp_dep_b)
    dep = leaky_relu(dy.affine_transform([b_dep, W_dep, bilstm_out]))
    W_head, b_head = dy.parameter(self.mlp_head_W), dy.parameter(
        self.mlp_head_b)
    head = leaky_relu(dy.affine_transform([b_head, W_head, bilstm_out]))
    if isTrain:
        dep, head = dy.dropout_dim(dep, 1, self.dropout_mlp), dy.dropout_dim(
            head, 1, self.dropout_mlp)
    # Split each MLP output into its arc half (mlp_arc_size x seq_len) and
    # relation half (mlp_rel_size x seq_len).
    dep_arc, dep_rel = dep[:self.mlp_arc_size], dep[self.mlp_arc_size:]
    head_arc, head_rel = head[:self.mlp_arc_size], head[self.mlp_arc_size:]
    # Arc scores, (#head x #dep).  NOTE(review): no batch_size is passed to
    # bilinear here (unlike other call sites in this file) — presumably this
    # overload defaults to batch size 1; confirm against the helper.
    W_arc = dy.parameter(self.arc_W)
    arc_logits = bilinear(dep_arc, W_arc, head_arc, self.mlp_arc_size,
                          seq_len, num_outputs=1, bias_x=True, bias_y=False)
    # Relation scores, (#head x rel_size x #dep).
    W_rel = dy.parameter(self.rel_W)
    rel_logits = bilinear(dep_rel, W_rel, head_rel, self.mlp_rel_size,
                          seq_len, num_outputs=self.rel_size, bias_x=True,
                          bias_y=True)
    return arc_logits, rel_logits
def dropout_dim_list(rep_list, dp_rate, dim=0): return [dy.dropout_dim(rep, dim, dp_rate) for rep in rep_list]
def run(self, word_inputs, lemma_inputs, tag_inputs, pred_golds,
        rel_targets=None, isTrain=True, syn_mask=None, seq_lens=None):
    """Score semantic-role labels for one batch of predicate frames.

    word_inputs / lemma_inputs / tag_inputs / pred_golds are numpy arrays of
    shape (seq_len, batch_size) holding vocabulary ids (pred_golds marks the
    predicate position of each frame).  rel_targets holds gold label ids and
    is required when isTrain is True.

    Returns (rel_accuracy, rel_loss) when isTrain is True, otherwise a list
    with one predicted label sequence per sentence in the batch.
    """

    def dynet_flatten_numpy(ndarray):
        # Column-major flatten so the layout matches DyNet's batched reshape.
        return np.reshape(ndarray, (-1, ), 'F')

    def find_lm_key(txt):
        # Map a token sequence to its key in the pre-computed LM data,
        # memoizing hits in self.lm_dict.  '<UNK>' tokens match any word.
        # (This replaces two identical copy-pasted search loops.)
        key = self.lm_dict.get(' '.join(txt), None)
        if key is None:
            for sidx, line in enumerate(self.lm_sentences):
                if len(line) != len(txt):
                    continue
                if all(line[mdx] == txt[mdx] or txt[mdx] == '<UNK>'
                       for mdx in range(len(line))):
                    key = str(sidx)
                    self.lm_dict[' '.join(txt)] = key
                    break
        assert key is not None
        return key

    batch_size = word_inputs.shape[1]
    seq_len = word_inputs.shape[0]
    # mask is 1.0 for real tokens, 0.0 for <PAD>; used to exclude padding
    # from loss and accuracy.
    mask = np.greater(word_inputs, self._vocab.PAD).astype(np.float32)
    num_tokens = int(np.sum(mask))

    # Words unseen during training are mapped to UNK before the trainable
    # lookup.
    word_embs = [
        dy.lookup_batch(
            self.word_embs,
            np.where(w < self._vocab.words_in_train, w, self._vocab.UNK))
        for w in word_inputs
    ]

    if self.use_lm:
        # Fetch pre-computed language-model embeddings for every sentence.
        lm_embs = np.zeros((batch_size, seq_len, self.lm_dims), dtype=float)
        for idx in range(batch_size):
            if self._unified:
                # Unified mode: position 0 is a virtual root token — skip it
                # when building the lookup text and offset the copy by one.
                txt = [
                    self._vocab.id2word(w) for w in word_inputs[1:, idx]
                    if self._vocab.id2word(w) != '<PAD>'
                ]
                key = find_lm_key(txt)
                lm_embs[idx, 1:1 + len(txt), :] = self.lm_data[key][...]
            else:
                txt = [
                    self._vocab.id2word(w) for w in word_inputs[:, idx]
                    if self._vocab.id2word(w) != '<PAD>'
                ]
                key = find_lm_key(txt)
                lm_embs[idx, :len(txt), :] = self.lm_data[key][...]
        # (batch, seq, dim) -> (seq, dim, batch): one batched DyNet
        # expression per timestep.
        lm_embs = lm_embs.transpose(1, 2, 0)
        lm_embs = [dy.inputTensor(e, batched=True) for e in list(lm_embs)]

    pre_embs = [
        dy.lookup_batch(self.pret_word_embs, w) for w in word_inputs
    ]
    # Predicate-indicator embedding: index 1 where the token is this frame's
    # predicate, 0 elsewhere.  BUGFIX: np.int was removed in NumPy 1.24;
    # the builtin int is the documented replacement.
    flag_embs = [
        dy.lookup_batch(self.flag_embs, np.array(w == i + 1, dtype=int))
        for i, w in enumerate(pred_golds)
    ]
    if self.use_lemma:
        lemma_embs = [
            dy.lookup_batch(self.lemma_embs, lemma)
            for lemma in lemma_inputs
        ]
    if self.use_pos:
        tag_embs = [
            dy.lookup_batch(self.tag_embs, pos) for pos in tag_inputs
        ]

    if self.use_lm:
        if isTrain:
            # (wm, posm): shared word-channel mask and POS-channel mask.
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            if self.use_lemma and self.use_pos:
                emb_inputs = [
                    dy.concatenate([
                        dy.cmult(word, wm), dy.cmult(pre, wm),
                        dy.cmult(flag, wm), dy.cmult(lemma, wm),
                        dy.cmult(lme, wm), dy.cmult(pos, posm)
                    ]) for word, pre, flag, lemma, pos, lme,
                    (wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                      lemma_embs, tag_embs, lm_embs,
                                      emb_masks)
                ]
            elif self.use_lemma:
                # BUGFIX: the original unpacked seven loop targets from a
                # six-sequence zip, raising ValueError at runtime.
                emb_inputs = [
                    dy.concatenate([
                        dy.cmult(word, wm), dy.cmult(pre, wm),
                        dy.cmult(flag, wm), dy.cmult(lemma, wm),
                        dy.cmult(lme, wm)
                    ]) for word, pre, flag, lemma, lme,
                    (wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                      lemma_embs, lm_embs, emb_masks)
                ]
            elif self.use_pos:
                emb_inputs = [
                    dy.concatenate([
                        dy.cmult(word, wm), dy.cmult(pre, wm),
                        dy.cmult(flag, wm), dy.cmult(lme, wm),
                        dy.cmult(pos, posm)
                    ]) for word, pre, flag, pos, lme,
                    (wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                      tag_embs, lm_embs, emb_masks)
                ]
            else:
                emb_inputs = [
                    dy.concatenate([
                        dy.cmult(word, wm), dy.cmult(pre, wm),
                        dy.cmult(flag, wm), dy.cmult(lme, wm)
                    ]) for word, pre, flag, lme, (wm, posm) in zip(
                        word_embs, pre_embs, flag_embs, lm_embs, emb_masks)
                ]
        else:
            if self.use_lemma and self.use_pos:
                emb_inputs = [
                    dy.concatenate([word, pre, flag, lemma, lme, pos])
                    for word, pre, flag, lemma, lme, pos in zip(
                        word_embs, pre_embs, flag_embs, lemma_embs, lm_embs,
                        tag_embs)
                ]
            elif self.use_lemma:
                # BUGFIX: the original zipped tag_embs (undefined when
                # use_pos is False) and dropped lemma_embs entirely; the
                # lemma-only eval path therefore crashed with NameError.
                emb_inputs = [
                    dy.concatenate([word, pre, flag, lemma, lme])
                    for word, pre, flag, lemma, lme in zip(
                        word_embs, pre_embs, flag_embs, lemma_embs, lm_embs)
                ]
            elif self.use_pos:
                # BUGFIX: symmetric to the branch above — the original used
                # lemma_embs (undefined when use_lemma is False) and omitted
                # tag_embs; the two branch bodies had been swapped.
                emb_inputs = [
                    dy.concatenate([word, pre, flag, lme, pos])
                    for word, pre, flag, lme, pos in zip(
                        word_embs, pre_embs, flag_embs, lm_embs, tag_embs)
                ]
            else:
                emb_inputs = [
                    dy.concatenate([word, pre, flag, lme])
                    for word, pre, flag, lme in zip(word_embs, pre_embs,
                                                    flag_embs, lm_embs)
                ]
    else:
        if isTrain:
            emb_masks = self.generate_emb_mask(seq_len, batch_size)
            if self.use_lemma and self.use_pos:
                emb_inputs = [
                    dy.concatenate([
                        dy.cmult(word, wm), dy.cmult(pre, wm),
                        dy.cmult(flag, wm), dy.cmult(lemma, wm),
                        dy.cmult(pos, posm)
                    ]) for word, pre, flag, lemma, pos,
                    (wm, posm) in zip(word_embs, pre_embs, flag_embs,
                                      lemma_embs, tag_embs, emb_masks)
                ]
            elif self.use_lemma:
                emb_inputs = [
                    dy.concatenate([
                        dy.cmult(word, wm), dy.cmult(pre, wm),
                        dy.cmult(flag, wm), dy.cmult(lemma, wm)
                    ]) for word, pre, flag, lemma, (wm, posm) in zip(
                        word_embs, pre_embs, flag_embs, lemma_embs,
                        emb_masks)
                ]
            elif self.use_pos:
                emb_inputs = [
                    dy.concatenate([
                        dy.cmult(word, wm), dy.cmult(pre, wm),
                        dy.cmult(flag, wm), dy.cmult(pos, posm)
                    ]) for word, pre, flag, pos, (wm, posm) in zip(
                        word_embs, pre_embs, flag_embs, tag_embs, emb_masks)
                ]
            else:
                emb_inputs = [
                    dy.concatenate([
                        dy.cmult(word, wm), dy.cmult(pre, wm),
                        dy.cmult(flag, wm)
                    ]) for word, pre, flag, (wm, posm) in zip(
                        word_embs, pre_embs, flag_embs, emb_masks)
                ]
        else:
            if self.use_lemma and self.use_pos:
                emb_inputs = [
                    dy.concatenate([word, pre, flag, lemma, pos])
                    for word, pre, flag, lemma, pos in zip(
                        word_embs, pre_embs, flag_embs, lemma_embs,
                        tag_embs)
                ]
            elif self.use_lemma:
                emb_inputs = [
                    dy.concatenate([word, pre, flag, lemma])
                    for word, pre, flag, lemma in zip(
                        word_embs, pre_embs, flag_embs, lemma_embs)
                ]
            elif self.use_pos:
                emb_inputs = [
                    dy.concatenate([word, pre, flag, pos])
                    for word, pre, flag, pos in zip(word_embs, pre_embs,
                                                    flag_embs, tag_embs)
                ]
            else:
                emb_inputs = [
                    dy.concatenate([word, pre, flag])
                    for word, pre, flag in zip(word_embs, pre_embs,
                                               flag_embs)
                ]

    if self.encoder_type == 'rnn':
        top_recur = dy.concatenate_cols(
            biLSTM(self.LSTM_builders, emb_inputs, batch_size,
                   self.dropout_lstm_input if isTrain else 0.,
                   self.dropout_lstm_hidden if isTrain else 0.))
    else:
        # Transformer path: scale embeddings and add sinusoidal positional
        # encodings before encoding.
        emb_inputs = dy.concatenate_cols(emb_inputs)
        emb_inputs = emb_inputs * math.sqrt(self.input_dims)
        emb_inputs = emb_inputs + dy.transpose(
            dy.inputTensor(self.pe[:seq_len]))
        emb_inputs = dy.transpose(emb_inputs)
        encoder_outputs = self.transformer(emb_inputs,
                                           src_len=seq_lens,
                                           train=isTrain)
        top_recur = dy.concatenate_cols(encoder_outputs.output)
    if isTrain:
        top_recur = dy.dropout_dim(top_recur, 1, self.dropout_mlp)

    W_arg, b_arg = self.mlp_arg_W.expr(), self.mlp_arg_b.expr()
    W_pred, b_pred = dy.parameter(self.mlp_pred_W), dy.parameter(
        self.mlp_pred_b)
    arg_hidden = leaky_relu(dy.affine_transform([b_arg, W_arg, top_recur]))
    # Only the (gold) predicate column feeds the predicate MLP; pred_golds
    # carries the same position on every row, so row 0 suffices.
    predicates_1D = pred_golds[0]
    pred_recur = dy.pick_batch(top_recur, predicates_1D, dim=1)
    pred_hidden = leaky_relu(
        dy.affine_transform([b_pred, W_pred, pred_recur]))
    if isTrain:
        arg_hidden = dy.dropout_dim(arg_hidden, 1, self.dropout_mlp)
        # pred_hidden is a single vector per batch element, so plain
        # dropout (not dropout_dim) applies here.
        pred_hidden = dy.dropout(pred_hidden, self.dropout_mlp)

    W_rel = dy.parameter(self.rel_W)
    if self.use_si_droput and syn_mask is not None:
        # Syntax-informed dropout: zero out whole argument columns.
        syn_mask = np.expand_dims(syn_mask, axis=0)  # (1, seq_len, batch)
        arg_hidden = dy.cmult(arg_hidden,
                              dy.inputTensor(syn_mask, batched=True))

    rel_logits = bilinear(arg_hidden, W_rel, pred_hidden, self.mlp_size,
                          seq_len, 1, batch_size,
                          num_outputs=self._vocab.rel_size,
                          bias_x=True, bias_y=True)
    # (1 x rel_size x #arg) x batch -> (1 x rel_size) x (#arg x batch).
    flat_rel_logits = dy.reshape(rel_logits, (1, self._vocab.rel_size),
                                 seq_len * batch_size)
    # There is only one predicate row, so every pick index is 0; pick_batch
    # takes integer indices, hence the explicit dtype.
    predicates_1D = np.zeros(dynet_flatten_numpy(pred_golds).shape[0],
                             dtype=int)
    partial_rel_logits = dy.pick_batch(flat_rel_logits, predicates_1D)
    # (rel_size) x (#arg x batch_size)

    if isTrain:
        mask_1D = dynet_flatten_numpy(mask)
        mask_1D_tensor = dy.inputTensor(mask_1D, batched=True)
        rel_preds = partial_rel_logits.npvalue().argmax(0)
        targets_1D = dynet_flatten_numpy(rel_targets)
        rel_correct = np.equal(rel_preds, targets_1D).astype(
            np.float32) * mask_1D
        rel_accuracy = np.sum(rel_correct) / num_tokens
        losses = dy.pickneglogsoftmax_batch(partial_rel_logits, targets_1D)
        rel_loss = dy.sum_batches(losses * mask_1D_tensor) / num_tokens
        return rel_accuracy, rel_loss

    # Decoding path: batch x 1 x #arg x rel_size probabilities.
    rel_probs = np.transpose(
        np.reshape(
            dy.softmax(dy.transpose(flat_rel_logits)).npvalue(),
            (self._vocab.rel_size, 1, seq_len, batch_size), 'F'))
    outputs = []
    for msk, pred_gold, rel_prob in zip(np.transpose(mask), pred_golds.T,
                                        rel_probs):
        msk[0] = 1.  # always count the root/first position
        sent_len = int(np.sum(msk))
        rel_prob = rel_prob[np.arange(len(pred_gold)), 0]
        rel_pred = rel_argmax(rel_prob)
        outputs.append(rel_pred[:sent_len])
    return outputs