def forward(self, words, labels=None):
    multitask = labels is not None
    if self.training:
        words = self.word_vocab.unkify(words)

    rnn = self.rnn_builder.initial_state()

    word_ids = [self.word_vocab.index_or_unk(word)
                for word in [START] + words + [STOP]]

    prev_embeddings = [self.embeddings[word_id] for word_id in word_ids[:-1]]

    lstm_outputs = rnn.transduce(prev_embeddings)

    logits = self.out(dy.concatenate_to_batch(lstm_outputs))
    nlls = dy.pickneglogsoftmax_batch(logits, word_ids[1:])
    word_nll = dy.sum_batches(nlls)

    if multitask:
        label_ids = [self.label_vocab.index(label) for label in labels]

        logits = self.f_label(dy.concatenate_to_batch(lstm_outputs[1:]))
        nlls = dy.pickneglogsoftmax_batch(logits, label_ids)
        label_nll = dy.sum_batches(nlls)

        # easy proxy to track progress on this task
        self.correct += np.sum(np.argmax(logits.npvalue(), axis=0) == label_ids)
        self.predicted += len(label_ids)

        nll = word_nll + label_nll
    else:
        nll = word_nll

    return nll
def __evaluate(self, lstm_output):
    length = len(lstm_output)

    # (i, j) -> (i * length + j,)
    # i = k / length, j = k % length
    # 1 1 2 2 3 3 4 4 ..
    heads = [
        dn.transpose(self.activation(self.head_dense_layer(lstm_output[i])))
        for i in range(length)
    ]
    mods = [
        self.activation(self.dep_dense_layer(lstm_output[i]))
        for i in range(length)
    ]
    head_part = dn.concatenate_to_batch(
        [heads[i // len(lstm_output)] for i in range(length * length)])
    # 1 2 3 4 .. 1 2 3 4 ...
    mod_part = dn.concatenate_to_batch([mods[i] for i in range(length)] * length)

    output = self.fusion_layer(head_part, mod_part)

    exprs = [[dn.pick_batch_elem(output, i * length + j) for j in range(length)]
             for i in range(length)]
    scores = output.npvalue()
    scores = scores.reshape((len(lstm_output), len(lstm_output)))

    return scores, exprs
def forward(self, words, spans=None):
    multitask = spans is not None
    if self.training:
        words = self.word_vocab.unkify(words)

    rnn = self.rnn_builder.initial_state()

    word_ids = [self.word_vocab.index_or_unk(word)
                for word in [START] + words + [STOP]]

    prev_embeddings = [self.embeddings[word_id] for word_id in word_ids[:-1]]

    lstm_outputs = rnn.transduce(prev_embeddings)

    logits = self.out(dy.concatenate_to_batch(lstm_outputs))
    nlls = dy.pickneglogsoftmax_batch(logits, word_ids[1:])
    word_nll = dy.sum_batches(nlls)

    if multitask:
        # predict label for each possible span (null for nonexistent spans)
        if self.predict_all_spans:
            gold_spans = {(left, right): self.label_vocab.index(label)
                          for left, right, label in spans}

            all_spans = [(left, left + length)
                         for length in range(1, len(words) + 1)
                         for left in range(0, len(words) + 1 - length)]

            label_ids = [gold_spans.get((left, right), self.label_vocab.size)  # last index is for null label
                         for left, right in all_spans]

            # 'lstm minus' features, same as those of the crf parser
            span_encodings = [lstm_outputs[right] - lstm_outputs[left]
                              for left, right in all_spans]
        # only predict labels for existing spans
        else:
            label_ids = [self.label_vocab.index(label) for _, _, label in spans]

            # 'lstm minus' features, same as those of the crf parser
            span_encodings = [lstm_outputs[right] - lstm_outputs[left]
                              for left, right, label in spans]

        logits = self.f_label(dy.concatenate_to_batch(span_encodings))
        nlls = dy.pickneglogsoftmax_batch(logits, label_ids)
        label_nll = dy.sum_batches(nlls)

        # easy proxy to track progress on this task
        self.correct += np.sum(np.argmax(logits.npvalue(), axis=0) == label_ids)
        self.predicted += len(label_ids)

        nll = word_nll + label_nll
    else:
        nll = word_nll

    return nll
def __call__(self, encoder_output, hsz, beam_width=1):
    h_i = self.get_state(encoder_output)
    context = encoder_output.output
    if beam_width > 1:
        # To vectorize, we need to expand along the batch dimension, K times
        context = [dy.concatenate_to_batch([c] * beam_width) for c in context]
        h_i = [dy.concatenate_to_batch([h] * beam_width) for h in h_i]
    _, batchsz = context[0].dim()
    init_zeros = dy.zeros((hsz,), batch_size=batchsz)
    return h_i, init_zeros, context
def transduce(self, inputs, masks, predict=False):
    if not self.init:
        print("No Initial state provided")
        return

    outputs = []
    batch_size = inputs[0].dim()[1]

    for idx, input_tensor in enumerate(inputs):
        recur_s = []
        cell_s = []
        out = []

        hidden = self.hidden_previous
        cell = self.cell_previous

        if not predict:
            input_tensor = dy.cmult(input_tensor, self.input_drop_mask)
            hidden = dy.cmult(hidden, self.recur_drop_mask)

        gates = dy.affine_transform([
            self.b.expr(),
            self.WXH.expr(),
            dy.concatenate([input_tensor, hidden])
        ])

        iga = dy.pickrange(gates, 0, self.recur_size)
        fga = dy.pickrange(gates, self.recur_size, 2 * self.recur_size)
        oga = dy.pickrange(gates, 2 * self.recur_size, 3 * self.recur_size)
        cga = dy.pickrange(gates, 3 * self.recur_size, 4 * self.recur_size)

        ig = dy.logistic(iga)
        fg = dy.logistic(fga)  # + self.forget_bias
        og = dy.logistic(oga)
        c_tilda = dy.tanh(cga)

        new_cell = dy.cmult(cell, fg) + dy.cmult(c_tilda, ig)
        new_hidden = dy.cmult(dy.tanh(new_cell), og)

        for jdx in range(batch_size):
            if masks[idx][jdx] == 1:
                h_t = dy.pick_batch_elem(new_hidden, jdx)
                recur_s.append(h_t)
                cell_s.append(dy.pick_batch_elem(new_cell, jdx))
                out.append(h_t)
            else:
                recur_s.append(dy.pick_batch_elem(hidden, jdx))
                cell_s.append(dy.pick_batch_elem(cell, jdx))
                out.append(dy.zeros(self.recur_size))

        new_cell = dy.concatenate_to_batch(cell_s)
        new_hidden = dy.concatenate_to_batch(recur_s)
        self.cell_previous = new_cell
        self.hidden_previous = new_hidden
        outputs.append(dy.concatenate_to_batch(out))

    return outputs
def __call__(self, x, z=None, mask=None):
    h = self.h
    if z is None:
        Q = self.W_Q(x)
        K = self.W_K(x)
        V = self.W_V(x)
    else:
        Q = self.W_Q(x)
        K = self.W_K(z)
        V = self.W_V(z)

    (n_units, n_querys), batch = Q.dim()
    (_, n_keys), _ = K.dim()

    batch_Q = dy.concatenate_to_batch(self.split_rows(Q, h))
    batch_K = dy.concatenate_to_batch(self.split_rows(K, h))
    batch_V = dy.concatenate_to_batch(self.split_rows(V, h))

    assert batch_Q.dim() == ((n_units // h, n_querys), batch * h)
    assert batch_K.dim() == ((n_units // h, n_keys), batch * h)
    assert batch_V.dim() == ((n_units // h, n_keys), batch * h)

    mask = np.concatenate([mask] * h, axis=0)
    mask = np.moveaxis(mask, [1, 0, 2], [0, 2, 1])
    mask = dy.inputTensor(mask, batched=True)

    batch_A = (dy.transpose(batch_Q) * batch_K) * self.scale_score
    batch_A = dy.cmult(batch_A, mask) + (1 - mask) * MIN_VALUE

    sent_len = batch_A.dim()[0][0]
    if sent_len == 1:
        batch_A = dy.softmax(batch_A)
    else:
        batch_A = dy.softmax(batch_A, d=1)

    batch_A = dy.cmult(batch_A, mask)
    assert batch_A.dim() == ((n_querys, n_keys), batch * h)

    if self.attn_dropout:
        if self.dropout != 0.0:
            batch_A = dy.dropout(batch_A, self.dropout)

    batch_C = dy.transpose(batch_A * dy.transpose(batch_V))
    assert batch_C.dim() == ((n_units // h, n_querys), batch * h)

    C = dy.concatenate(self.split_batch(batch_C, h), d=0)
    assert C.dim() == ((n_units, n_querys), batch)

    C = self.finishing_linear_layer(C)
    return C
def test_concatenate_to_batch(self):
    dy.renew_cg()
    x = dy.lookup_batch(self.p, [0, 1])
    y = dy.pick_batch_elem(x, 0)
    z = dy.pick_batch_elem(x, 1)
    w = dy.concatenate_to_batch([y, z])
    self.assertTrue(np.allclose(w.npvalue(), self.pval.T))
def produce_parse_forest(self, sentence, required_probability_mass):
    lstm_outputs = self._featurize_sentence(sentence, is_train=False)
    encodings = []
    spans = []
    for start in range(0, len(sentence)):
        for end in range(start + 1, len(sentence) + 1):
            spans.append((start, end))
            encodings.append(self._get_span_encoding(start, end, lstm_outputs))
    label_scores = self.f_label(dy.concatenate_to_batch(encodings))
    label_scores_reshaped = dy.reshape(label_scores,
                                       (self.label_vocab.size, len(encodings)))
    label_probabilities_np = dy.softmax(label_scores_reshaped).npvalue()
    span_to_labels = {}
    forest_prob_mass = 1
    for index, span in enumerate(spans):
        distribution = list(enumerate(label_probabilities_np[:, index]))
        distribution.sort(key=lambda x: -x[1])
        total_probability = 0
        labels = []
        while total_probability < required_probability_mass:
            # take the most probable remaining label (the list is sorted in
            # descending order of probability)
            (label_index, probability) = distribution.pop(0)
            labels.append(self.label_vocab.values[label_index])
            total_probability += probability
        forest_prob_mass *= total_probability
        span_to_labels[span] = labels
    return span_to_labels, forest_prob_mass
def embed(self, x):
    if self.word_dropout > 0.0 and self.word_id_mask is None:
        batch_size = len(x) if xnmt.batcher.is_batched(x) else 1
        self.word_id_mask = [set(np.random.choice(self.vocab_size,
                                                  int(self.vocab_size * self.word_dropout),
                                                  replace=False))
                             for _ in range(batch_size)]
    emb_e = dy.parameter(self.embeddings)
    # single mode
    if not xnmt.batcher.is_batched(x):
        if self.train and self.word_id_mask and x in self.word_id_mask[0]:
            ret = dy.zeros((self.emb_dim,))
        else:
            ret = dy.pick(emb_e, index=x)
            if self.fix_norm is not None:
                ret = dy.cdiv(ret, dy.l2_norm(ret))
                if self.fix_norm != 1:
                    ret *= self.fix_norm
    # minibatch mode
    else:
        ret = dy.concatenate_to_batch([dy.pick(emb_e, index=xi) for xi in x])
        if self.fix_norm is not None:
            ret = dy.cdiv(ret, dy.l2_norm(ret))
            if self.fix_norm != 1:
                ret *= self.fix_norm
        if self.train and self.word_id_mask and any(x[i] in self.word_id_mask[i] for i in range(len(x))):
            dropout_mask = dy.inputTensor(
                np.transpose([[0.0] * self.emb_dim if x[i] in self.word_id_mask[i]
                              else [1.0] * self.emb_dim
                              for i in range(len(x))]),
                batched=True)
            ret = dy.cmult(ret, dropout_mask)

    if self.train and self.weight_noise > 0.0:
        ret = dy.noise(ret, self.weight_noise)
    return ret
def beam_decode(self, encodings, input_len=10, beam_size=1):
    batch_size = 1
    self.__dec.init_params(encodings, batch_size, self.__train_flag)
    context = dy.zeros((self.__enc.output_dim, ))
    beams = [Beam(self.__dec.dec_state, context, [self.__trg_sos], 0.0)]

    for i in xrange(int(min(self.__max_len, input_len * 1.5))):
        new_beams = []
        p_list = []
        for b in beams:
            if b.words[-1] == self.__trg_eos:
                p_list.append(dy.ones((self.__trg_vsize, )))
                continue
            hidden, embs, b.state = self.__dec.next([b.words[-1]], b.context,
                                                    self.__train_flag, b.state)
            b.context, _ = self.attend(encodings, hidden)
            score = self.__dec.score(hidden, b.context, embs, self.__train_flag)
            p_list.append(dy.softmax(score))

        p_list = dy.concatenate_to_batch(p_list).npvalue().T.reshape(-1, self.__trg_vsize)
        for p, b in zip(p_list, beams):
            p = p.flatten() / p.sum()
            kbest = np.argsort(p)
            if b.words[-1] == self.__trg_eos:
                new_beams.append(Beam(b.state, b.context, b.words, b.log_prob))
            else:
                for next_word in kbest[-beam_size:]:
                    new_beams.append(Beam(b.state, b.context,
                                          b.words + [next_word],
                                          b.log_prob + np.log(p[next_word])))

        beams = sorted(new_beams, key=lambda b: b.log_prob)[-beam_size:]

        if beams[-1].words[-1] == self.__trg_eos:
            break

    return beams[-1].words
def predict_sequence_batched(self, inputs, mask_array, wlen, predictFlag=False):
    batch_size = inputs[0].dim()[1]
    src_len = len(inputs)

    if not predictFlag:
        self.charlstm.set_dropouts(self.dropout, self.dropout)
        self.charlstm.set_dropout_masks(batch_size)

    char_fwd = self.charlstm.initial_state(batch_size)
    recur_states, cells = char_fwd.add_inputs(inputs, mask_array, predictFlag)

    hidden_states = []
    for idx in range(src_len):
        mask = dy.inputVector(mask_array[idx])
        mask_expr = dy.reshape(mask, (1, ), batch_size)
        hidden_states.append(recur_states[idx] * mask_expr)

    H = dy.concatenate_cols(hidden_states)

    if predictFlag:
        a = dy.softmax(dy.transpose(self.W_atten.expr()) * H)
    else:
        # dropout attention connections (keep the same dim across the sequence)
        a = dy.softmax(dy.transpose(self.W_atten.expr()) *
                       dy.dropout_dim(H, 1, self.dropout))

    cell_states = []
    for idx in range(batch_size):
        if wlen[idx] > 0:
            cell = dy.pick_batch_elem(cells[wlen[idx] - 1], idx)
        else:
            cell = dy.zeros(self.ldims)
        cell_states.append(cell)

    C = dy.concatenate_to_batch(cell_states)

    H_atten = H * dy.transpose(a)
    char_emb = dy.concatenate([H_atten, C])

    if predictFlag:
        proj_char_emb = dy.affine_transform(
            [self.b_linear.expr(), self.W_linear.expr(), char_emb])
    else:
        proj_char_emb = dy.affine_transform([
            self.b_linear.expr(),
            self.W_linear.expr(),
            dy.dropout(char_emb, self.dropout)
        ])

    return proj_char_emb
def disc_ll(self):
    try:
        return dy.concatenate_to_batch(self.ll_buffer)
    finally:
        # Make sure that the state is not used again after the log likelihood is requested
        del self.ll_buffer
        del self.batch_size
        del self.counter
def _encodings_to_label_log_probabilities(self, encodings, lmbd=None):
    label_scores = self.f_label(dy.concatenate_to_batch(encodings))
    label_scores_reshaped = dy.reshape(label_scores,
                                       (self.label_vocab.size, len(encodings)))
    if lmbd is not None:
        label_scores_reshaped = dy.cmult(label_scores_reshaped, lmbd)
    return dy.log_softmax(label_scores_reshaped)
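A minimal, self-contained sketch of the batch-then-reshape trick used above (toy sizes and hypothetical parameter names, not the author's f_label): score N encodings as one DyNet batch, then fold the batch dimension into a (num_labels, N) matrix so a single column-wise softmax covers every item.

import dynet as dy
import numpy as np

pc = dy.ParameterCollection()
W = pc.add_parameters((5, 16))   # 5 labels, 16-dim encodings (toy sizes)
b = pc.add_parameters((5,))

dy.renew_cg()
encodings = [dy.random_normal((16,)) for _ in range(7)]        # 7 stand-in "span" vectors
scores = dy.affine_transform([dy.parameter(b), dy.parameter(W),
                              dy.concatenate_to_batch(encodings)])   # batched: ((5,), 7)
scores_matrix = dy.reshape(scores, (5, len(encodings)))        # batch folded into columns
probs = dy.softmax(scores_matrix).npvalue()                    # (5, 7), softmax per column
print(np.argmax(probs, axis=0))                                # best label per encoding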
def evaluate_all_states(self, states):
    input_tensors = dn.concatenate_to_batch(
        [state.get_input_tensor(self.k, self.empty) for state in states])
    action_outputs = self.action_classifier(input_tensors)
    relation_outputs = None
    if self.relation:
        relation_outputs = self.relation_classifier(input_tensors)
    return action_outputs, relation_outputs
def aggressive_annotation(self, sentence, sentence_number, span_to_gold_label,
                          low_conf_cutoff, seen):
    if len(span_to_gold_label) == 0:
        return []  # , []
    lstm_outputs = self._featurize_sentence(sentence, is_train=False)
    encodings = []
    spans = span_to_gold_label.keys()
    for (start, end) in spans:
        encodings.append(self._get_span_encoding(start, end, lstm_outputs))
    label_scores = self.f_label(dy.concatenate_to_batch(encodings))
    label_scores_reshaped = dy.reshape(label_scores,
                                       (self.label_vocab.size, len(encodings)))
    label_probabilities_np = dy.softmax(label_scores_reshaped).npvalue()
    low_confidence_labels = []
    # high_confidence_labels = []
    on_labels = []
    for index, (start, end) in list(enumerate(spans)):
        distribution = label_probabilities_np[:, index]
        entropy = stats.entropy(distribution)
        oracle_label = span_to_gold_label[(start, end)]
        annotation_request = dict(
            sentence_number=sentence_number,
            left=start,
            right=end,
            entropy=entropy,
            non_constituent_probability=distribution[0],
            label=oracle_label
        )
        if (start, end) in seen:
            del span_to_gold_label[(start, end)]
            continue
        if low_conf_cutoff < entropy and distribution[self.empty_label_index] < 0.5:
            # annotation_request['label'] = oracle_label
            low_confidence_labels.append(annotation_request)
        elif entropy < 10 ** -5 and distribution[self.empty_label_index] > 0.99:
            del span_to_gold_label[(start, end)]
        # if entropy > 10 ** -7:
        #     high_confidence_labels.append(annotation_request)
        if np.max(distribution) > distribution[self.empty_label_index]:
            on_labels.append(annotation_request)

    for index, label_a in enumerate(on_labels):
        span_a = (label_a['left'], label_a['right'])
        for label_b in on_labels[index + 1:]:
            span_b = (label_b['left'], label_b['right'])
            if check_overlap(span_a, span_b):
                label_a['entropy'] = 10
                low_confidence_labels.append(label_a)
                label_b['entropy'] = 10
                low_confidence_labels.append(label_b)

    return low_confidence_labels  # , high_confidence_labels
def _encodings_to_label_log_probabilities(self, encodings, lmbd=None, alpha=None):
    label_scores = self.f_label(dy.concatenate_to_batch(encodings))
    label_scores_reshaped = dy.reshape(label_scores,
                                       (self.label_vocab.size, len(encodings)))
    # if alpha is not None:
    #     temp = dy.abs(dy.reshape(alpha[0], (1, 1)))
    #     label_scores_reshaped = dy.cmult(dy.logistic(dy.cmult(label_scores_reshaped, temp) + alpha[1]), lmbd) + alpha[2]
    # 990.51641846]] [ 0.03124614 4.00097179 -9.43100834
    # label_scores_reshaped = dy.logistic(label_scores_reshaped * 0.03124614 + 4.00097179) * 990.51641846 - 9.43100834
    return dy.log_softmax(label_scores_reshaped)
def cal_scores(self, src_encodings, masks, train):
    src_len = len(src_encodings)
    batch_size = src_encodings[0].dim()[1]
    heads_LRlayer = []
    mods_LRlayer = []
    for encoding in src_encodings:
        heads_LRlayer.append(
            self.leaky_ReLu(self.b_head.expr() + self.W_head.expr() * encoding))
        mods_LRlayer.append(
            self.leaky_ReLu(self.b_mod.expr() + self.W_mod.expr() * encoding))

    heads_labels = []
    heads = []
    labels = []
    neg_inf = dy.constant(1, -float("inf"))

    # exclude root @ index=0 since roots do not have heads
    for row in range(1, src_len):
        scores_idx = []
        for col in range(src_len):
            dist = col - row
            mdist = self.dist_max
            dist_i = (min(dist, mdist - 1) + mdist if dist >= 0
                      else int(min(-1.0 * dist, mdist - 1)))
            dist_vec = dy.lookup_batch(self.dlookup, [dist_i] * batch_size)
            if train:
                input_vec = dy.concatenate([
                    dy.esum([
                        dy.dropout(heads_LRlayer[col], self.dropout),
                        dy.dropout(mods_LRlayer[row], self.dropout)
                    ]),
                    dist_vec
                ])
            else:
                input_vec = dy.concatenate([
                    dy.esum([heads_LRlayer[col], mods_LRlayer[row]]),
                    dist_vec
                ])
            score = self.scoreHeadModLabel(input_vec, train)
            mask = masks[row] and masks[col]
            join_scores = []
            for bdx in range(batch_size):
                if mask[bdx] == 1:
                    join_scores.append(dy.pick_batch_elem(score, bdx))
                else:
                    join_scores.append(dy.concatenate([neg_inf] * self.n_labels))
            scores_idx.append(dy.concatenate_to_batch(join_scores))
        heads_labels.append(dy.concatenate(scores_idx))

    return heads_labels
def transduce(self, embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
    batch_size = embed_sent[0].dim()[1]
    actions = self.sample_segmentation(embed_sent, batch_size)
    sample_size = len(actions)
    embeddings = dy.concatenate(embed_sent.expr_list, d=1)
    embeddings.value()

    composed_words = []
    for i in range(batch_size):
        sequence = dy.pick_batch_elem(embeddings, i)
        # For each sampled segmentations
        for j, sample in enumerate(actions):
            lower_bound = 0
            # Read every 'segment' decision
            for k, upper_bound in enumerate(sample[i]):
                char_sequence = dy.pick_range(sequence, lower_bound, upper_bound + 1, 1)
                composed_words.append(
                    (char_sequence, j, i, k, lower_bound, upper_bound + 1))
                #self.segment_composer.set_word_boundary(lower_bound, upper_bound, self.src_sent[i])
                #composed = self.segment_composer.transduce(char_sequence)
                #outputs[j][i].append(composed)
                lower_bound = upper_bound + 1

    outputs = self.segment_composer.compose(composed_words, sample_size, batch_size)

    # Padding + return
    try:
        if self.length_prior:
            seg_size_unpadded = [[len(outputs[i][j]) for j in range(batch_size)]
                                 for i in range(sample_size)]
        enc_outputs = []
        for batched_sampled_sentence in outputs:
            sampled_sentence, segment_mask = self.pad(batched_sampled_sentence)
            expr_seq = ExpressionSequence(
                expr_tensor=dy.concatenate_to_batch(sampled_sentence),
                mask=segment_mask)
            sent_context = self.final_transducer.transduce(expr_seq)
            self.final_states.append(self.final_transducer.get_final_states())
            enc_outputs.append(sent_context)
        return CompoundSeqExpression(enc_outputs)
    finally:
        if self.length_prior:
            self.seg_size_unpadded = seg_size_unpadded
        self.compose_output = outputs
        self.segment_actions = actions
        if not self.train and self.compute_report:
            self.add_sent_for_report({"segment_actions": actions})
def fit_partial(self, instances):
    random.shuffle(instances)
    self.iter += 1
    losses = []
    dy.renew_cg()
    total_loss, total_size = 0., 0
    prog = tqdm(desc="Epoch {}".format(self.iter), ncols=80,
                total=len(instances) + 1)
    for i, ins in enumerate(instances, 1):
        losses.extend(list(self.model.loss(*ins)))
        if i % self.batch_size == 0:
            loss = dy.sum_batches(dy.concatenate_to_batch(losses))
            total_loss += loss.value()
            total_size += len(losses)
            prog.set_postfix(loss=loss.value() / len(losses))
            loss.backward()
            self.opt.update()
            dy.renew_cg()
            losses = []
        prog.update()
    if losses:
        loss = dy.sum_batches(dy.concatenate_to_batch(losses))
        total_loss += loss.value()
        total_size += len(losses)
        self.loss = total_loss / total_size
        prog.set_postfix(loss=self.loss)
        loss.backward()
        self.opt.update()
        dy.renew_cg()
        prog.update()
    self.opt.learning_rate *= self.lr_decay
    prog.close()
def get_complete_raw_exprs(self, lstm_output):
    length = len(lstm_output)
    lstm_output_as_batch = dn.concatenate_to_batch(lstm_output)
    headfov = self.bilinear_layer.w1.expr() * lstm_output_as_batch
    modfov = self.bilinear_layer.w2.expr() * lstm_output_as_batch

    # (i, j) -> (i * length + j,)
    # i = k / length, j = k % length
    # 1 1 2 2 3 3 4 4 ..
    heads = [dn.pick_batch_elem(headfov, i) for i in range(length)]
    mods = [dn.pick_batch_elem(modfov, i) for i in range(length)]
    head_part = dn.concatenate_to_batch(
        [heads[i // len(lstm_output)] for i in range(length * length)])
    # 1 2 3 4 .. 1 2 3 4 ...
    mod_part = dn.concatenate_to_batch([mods[i] for i in range(length)] * length)

    hidden = self.activation(head_part + mod_part + self.bilinear_layer.bias.expr())
    struct_dropout = getattr(self.options, "struct_dropout", 0.0)
    if self.options.is_train and struct_dropout > 0:
        hidden = dn.dropout(hidden, struct_dropout)
    output = self.dense_layer(hidden)
    return output
def get_complete_raw_exprs(self, lstm_output):
    length = len(lstm_output)
    lstm_output_as_batch = dn.concatenate_to_batch(lstm_output)
    headfov = self.bilinear_layer.w1.expr() * lstm_output_as_batch
    modfov = self.bilinear_layer.w2.expr() * lstm_output_as_batch

    # (i, j) -> (i * length + j,)
    # i = k / length, j = k % length
    # 1 1 2 2 3 3 4 4 ..
    heads = [dn.pick_batch_elem(headfov, i) for i in range(length)]
    mods = [dn.pick_batch_elem(modfov, i) for i in range(length)]
    head_part = dn.concatenate_to_batch(
        [heads[i // len(lstm_output)] for i in range(length * length)])
    # 1 2 3 4 .. 1 2 3 4 ...
    mod_part = dn.concatenate_to_batch([mods[i] for i in range(length)] * length)

    output = self.dense_layer(
        self.activation(head_part + mod_part + self.bilinear_layer.bias.expr()))
    return output
def rnn_encode(rnn, input_, lengths):
    """Return the final output for each batch based on lengths.

    :param rnn: dy.RNNBuilder or dy.BiRNNBuilder
    :param input_: List[dy.Expression]
    :param lengths: List[int]

    Returns:
        dy.Expression
    """
    states = rnn_forward(rnn, input_)
    final_states = [dy.pick_batch_elem(states[l - 1], i) for i, l in enumerate(lengths)]
    return dy.concatenate_to_batch(final_states)
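A minimal usage sketch of the pattern rnn_encode relies on (toy vocabulary, dimensions, and padding chosen purely for illustration): transduce a padded batch, pull each sequence's output at its true length with dy.pick_batch_elem, and re-batch the results with dy.concatenate_to_batch.

import dynet as dy

pc = dy.ParameterCollection()
lookup = pc.add_lookup_parameters((10, 4))       # toy vocab of 10, embedding dim 4
lstm = dy.VanillaLSTMBuilder(1, 4, 8, pc)        # 1 layer, input 4 -> hidden 8

dy.renew_cg()
# Two sequences padded to length 3; their true lengths are 3 and 2.
steps = [[1, 2], [3, 4], [5, 0]]                 # time-major token ids, 0 = pad
lengths = [3, 2]
inputs = [dy.lookup_batch(lookup, ids) for ids in steps]
outputs = lstm.initial_state().transduce(inputs)  # list of batched ((8,), 2) expressions

final = dy.concatenate_to_batch(
    [dy.pick_batch_elem(outputs[l - 1], i) for i, l in enumerate(lengths)])
print(final.dim())                               # ((8,), 2): one final vector per sequence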
def return_spans_and_uncertainties(self, sentence, sentence_number, gold,
                                   use_oracle, low_conf_cutoff,
                                   pseudo_label_cutoff, seen):
    spans = [span for span in get_all_spans(gold).keys()
             if (span, sentence_number) not in seen]
    if len(spans) == 0:
        return []
    lstm_outputs = self._featurize_sentence(sentence, is_train=False)
    encodings = []
    for (start, end) in spans:
        encodings.append(self._get_span_encoding(start, end, lstm_outputs))
    label_scores = self.f_label(dy.concatenate_to_batch(encodings))
    label_scores_reshaped = dy.reshape(label_scores,
                                       (self.label_vocab.size, len(encodings)))
    label_probabilities_np = dy.softmax(label_scores_reshaped).npvalue()
    low_confidence_labels = []
    high_confidence_labels = []
    for index, (start, end) in enumerate(spans):
        distribution = label_probabilities_np[:, index]
        entropy = stats.entropy(distribution)
        oracle_label = gold.oracle_label(start, end)
        predicted_label_index = distribution.argmax()
        predicted_label = self.label_vocab.value(predicted_label_index)
        annotation_request = dict(
            sentence_number=sentence_number,
            left=start,
            right=end,
            entropy=entropy,
            non_constituent_probability=distribution[0]
        )
        if use_oracle:
            oracle_label_index = self.label_vocab.index(oracle_label)
            if oracle_label_index != predicted_label_index and \
                    distribution[oracle_label_index] > 0.01:
                annotation_request['label'] = oracle_label
                low_confidence_labels.append(annotation_request)
        elif max(distribution) > pseudo_label_cutoff and (
                distribution[self.empty_label_index] < 0.001
                or random.random() < 0.001):
            annotation_request['label'] = predicted_label
            high_confidence_labels.append(annotation_request)
        elif low_conf_cutoff < entropy:
            annotation_request['label'] = oracle_label
            low_confidence_labels.append(annotation_request)
    return low_confidence_labels, high_confidence_labels
def rnn_forward_with_state(rnn, input_, lengths=None, state=None, batched=True, backward=False):
    """Return the output of the final layers and the final state of the RNN.

    :param rnn: dy.RNNBuilder
    :param input_: List[dy.Expression]
    :param lengths: List[int]
    :param state: List[np.ndarray] The previous state (used in TBPTT)
    :param batched: bool Is the state batched?
    :param backward: bool Is this a backward rnn in a bRNN?

    Returns:
        List[dy.Expression] (Seq_len): The outputs
        List[dy.Expression] (2 * layers if lstm): The state
    """
    if state is not None:
        state = [dy.inputTensor(s, batched) for s in state]
    lstm_state = rnn.initial_state(state)
    if backward:
        states = lstm_state.add_inputs(reversed(input_))
        outputs = list(reversed([s.h()[-1] for s in states]))
        # When going backwards (we pad right) the final state of the rnn
        # is always the last one.
        final_state = states[-1].s()
        return outputs, final_state
    states = lstm_state.add_inputs(input_)
    outputs = [s.h()[-1] for s in states]
    if lengths is None:
        if backward:
            outputs = list(reversed(outputs))
        return outputs, states[-1].s()
    final_states = [states[l - 1].s() for l in lengths]
    final_state_by_batch = []
    for i, state in enumerate(final_states):
        batch_state = [dy.pick_batch_elem(s, i) for s in state]
        final_state_by_batch.append(batch_state)
    final_state = []
    for i in range(len(final_state_by_batch[0])):
        col = dy.concatenate_to_batch([final_state_by_batch[j][i]
                                       for j in range(len(final_state_by_batch))])
        final_state.append(col)
    if backward:
        outputs = list(reversed(outputs))
    return outputs, final_state
def forward(self, words):
    if self.training:
        words = self.word_vocab.unkify(words)

    rnn = self.rnn_builder.initial_state()

    word_ids = [self.word_vocab.index_or_unk(word)
                for word in [START] + words + [STOP]]

    prev_embeddings = [self.embeddings[word_id] for word_id in word_ids[:-1]]

    lstm_outputs = rnn.transduce(prev_embeddings)

    logits = self.out(dy.concatenate_to_batch(lstm_outputs))
    nlls = dy.pickneglogsoftmax_batch(logits, word_ids[1:])

    return dy.sum_batches(nlls)
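A minimal standalone sketch of the batched language-model loss used in the forward methods above (toy sizes, hypothetical parameter names): batching all timestep outputs with dy.concatenate_to_batch lets a single dy.pickneglogsoftmax_batch call score every next-word prediction instead of looping over timesteps.

import dynet as dy

pc = dy.ParameterCollection()
emb = pc.add_lookup_parameters((100, 32))        # toy vocab of 100, embedding dim 32
rnn = dy.VanillaLSTMBuilder(1, 32, 64, pc)
W_out = pc.add_parameters((100, 64))
b_out = pc.add_parameters((100,))

dy.renew_cg()
word_ids = [1, 7, 42, 3, 2]                      # e.g. <s> w1 w2 w3 </s>
prev_embeddings = [emb[i] for i in word_ids[:-1]]
lstm_outputs = rnn.initial_state().transduce(prev_embeddings)
logits = dy.affine_transform([dy.parameter(b_out), dy.parameter(W_out),
                              dy.concatenate_to_batch(lstm_outputs)])
nlls = dy.pickneglogsoftmax_batch(logits, word_ids[1:])  # one NLL per position
loss = dy.sum_batches(nlls)
print(loss.value())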
def pad_embedding(embeddings) -> expression_seqs.ExpressionSequence:
    max_col = max(len(xs) for xs in embeddings)
    p0 = dy.zeros(embeddings[0][0].dim()[0][0])
    masks = np.zeros((len(embeddings), max_col), dtype=int)
    modified = False
    ret = []
    for xs, mask in zip(embeddings, masks):
        deficit = max_col - len(xs)
        if deficit > 0:
            xs = xs + ([p0] * deficit)
            mask[-deficit:] = 1
            modified = True
        ret.append(dy.concatenate_cols(xs))
    mask = Mask(masks) if modified else None
    return expression_seqs.ExpressionSequence(
        expr_tensor=dy.concatenate_to_batch(ret), mask=mask)
def pad(self, outputs):
    # Padding
    max_col = max(len(xs) for xs in outputs)
    P0 = dy.vecInput(outputs[0][0].dim()[0][0])
    masks = numpy.zeros((len(outputs), max_col), dtype=int)
    ret = []
    modified = False
    for xs, mask in zip(outputs, masks):
        deficit = max_col - len(xs)
        if deficit > 0:
            xs.extend([P0 for _ in range(deficit)])
            mask[-deficit:] = 1
            modified = True
        ret.append(dy.concatenate_cols(xs))
    mask = Mask(masks) if modified else None
    return dy.concatenate_to_batch(ret), mask
def calc_output(self, sents, train_mode):
    cache = {}
    cf_init, cb_init = [b.initial_state() for b in self.char_lstms]
    wf_init, wb_init = [b.initial_state() for b in self.word_lstms]

    # get input/output for T1
    # get list of tokens
    xs = [['<SOS>'] + x.split() + ['<EOS>'] for (x, _) in sents]

    # fill the word embedding cache
    for x in xs:
        for w in x:
            if w not in cache:
                t = [dy.lookup(self.char_lookup, c) for c in w.encode()]
                fw = [x.output() for x in cf_init.add_inputs(t)]
                bw = [x.output() for x in cb_init.add_inputs(reversed(t))]

                wid = 0
                if w in self.word_to_idx:
                    wid = self.word_to_idx[w]

                if self.level == Level.HYBRID:
                    cache[w] = dy.lookup(self.word_lookup, wid) + fw[-1] + bw[-1]
                if self.level == Level.CHAR:
                    cache[w] = fw[-1] + bw[-1]
                if self.level == Level.WORD:
                    cache[w] = dy.lookup(self.word_lookup, wid)

    src_len = [len(x) for x in xs]
    max_src_len = np.max(src_len)
    num_words = 0

    # build the batch. Be careful!
    src_cws = []
    for i in range(max_src_len):
        src_cws.append(
            dy.concatenate_to_batch([
                dy.dropout(cache[x[i]], self.dropout_rate) if train_mode else cache[x[i]]
                for x in xs
            ]))

    fw = [x.output() for x in wf_init.add_inputs(src_cws)]
    bw = [x.output() for x in wb_init.add_inputs(reversed(src_cws))]

    return (fw, bw)
def transduce(self, embed_sent: ExpressionSequence) -> List[ExpressionSequence]:
    batch_size = embed_sent[0].dim()[1]
    actions = self.sample_segmentation(embed_sent, batch_size)
    embeddings = dy.concatenate(embed_sent.expr_list, d=1)
    embeddings.value()

    composed_words = []
    for i in range(batch_size):
        sequence = dy.pick_batch_elem(embeddings, i)
        # For each sampled segmentations
        lower_bound = 0
        for j, upper_bound in enumerate(actions[i]):
            if self.no_char_embed:
                char_sequence = []
            else:
                char_sequence = dy.pick_range(sequence, lower_bound, upper_bound + 1, 1)
            composed_words.append((char_sequence, i, j, lower_bound, upper_bound + 1))
            lower_bound = upper_bound + 1

    outputs = self.segment_composer.compose(composed_words, batch_size)

    # Padding + return
    try:
        if self.length_prior:
            seg_size_unpadded = [len(outputs[i]) for i in range(batch_size)]
        sampled_sentence, segment_mask = self.pad(outputs)
        expr_seq = ExpressionSequence(
            expr_tensor=dy.concatenate_to_batch(sampled_sentence),
            mask=segment_mask)
        return self.final_transducer.transduce(expr_seq)
    finally:
        if self.length_prior:
            self.seg_size_unpadded = seg_size_unpadded
        self.compose_output = outputs
        self.segment_actions = actions
        if not self.train and self.is_reporting():
            if len(actions) == 1:  # Support only AccuracyEvalTask
                self.report_sent_info({"segment_actions": actions})
def calc_loss(self, rewards):
    loss = FactoredLossExpr()
    ## Z-Normalization
    if self.z_normalization:
        reward_batches = dy.concatenate_to_batch(rewards)
        mean_batches = dy.mean_batches(reward_batches)
        std_batches = dy.std_batches(reward_batches)
        rewards = [dy.cdiv(reward - mean_batches, std_batches)
                   for reward in rewards]
    ## Calculate baseline
    if self.baseline is not None:
        pred_reward, baseline_loss = self.calc_baseline_loss(rewards)
        loss.add_loss("rl_baseline", baseline_loss)
    ## Calculate Confidence Penalty
    if self.confidence_penalty:
        loss.add_loss("rl_confpen",
                      self.confidence_penalty.calc_loss(self.policy_lls))
    ## Calculate Reinforce Loss
    reinf_loss = []
    # Loop through all action in one sequence
    for i, (policy, action_sample) in enumerate(zip(self.policy_lls, self.actions)):
        # Discount the reward if we use baseline
        if self.baseline is not None:
            rewards = [reward - pred_reward[i] for reward in rewards]
        # Main Reinforce calculation
        sample_loss = []
        for action, reward in zip(action_sample, rewards):
            ll = dy.pick_batch(policy, action)
            if self.valid_pos is not None:
                ll = dy.pick_batch_elems(ll, self.valid_pos[i])
                reward = dy.pick_batch_elems(reward, self.valid_pos[i])
            sample_loss.append(dy.sum_batches(ll * reward))
        # Take the average of the losses across multiple samples
        reinf_loss.append(dy.esum(sample_loss) / len(sample_loss))
    loss.add_loss("rl_reinf", self.weight * -dy.esum(reinf_loss))
    ## the composed losses
    return loss
def predict_one(self, src, encoder_outputs, **kwargs):
    K = int(kwargs.get('beam', 5))
    mxlen = int(kwargs.get('mxlen', 100))
    paths = [[Offsets.GO] for _ in range(K)]
    # Which beams are done?
    done = np.array([False] * K)
    scores = np.array([0.0] * K)
    hidden, output_i, context = self.arc_policy(encoder_outputs, self.hsz, beam_width=K)
    num_states = len(hidden)
    rnn_state = self.decoder_rnn.initial_state(hidden)
    self.attn_cache(context)
    src_mask = encoder_outputs.src_mask

    for i in range(mxlen):
        dst_last = np.array([path[-1] for path in paths]).reshape(1, K)
        embed_i = self.tgt_embeddings.encode(dst_last)[-1]
        embed_i = self.input_i(embed_i, output_i)
        rnn_state = rnn_state.add_input(embed_i)
        rnn_output_i = rnn_state.output()
        output_i = self.attn(rnn_output_i, src_mask)
        wll = self.prediction([output_i])[-1].npvalue()  # (V,) K
        V = wll.shape[0]

        if i > 0:
            # expanded_history = np.expand_dims(scores, -1)
            # done_mask = np.expand_dims((done == False).astype(np.uint8), -1)
            # sll = np.multiply(wll.T, done_mask) + expanded_history
            wll = wll.T
            expanded_history = np.expand_dims(scores, -1)
            done_mask = np.expand_dims((done == False).astype(np.uint8), -1)
            done_mask_inv = (done_mask != 1).astype(np.uint8)
            eos_mask = np.zeros((1, V)).astype(np.uint8)
            mask = ((done_mask & eos_mask) != 1).astype(np.uint8)
            masked_wll = np.multiply(done_mask, wll)
            negged_wll = masked_wll + (done_mask_inv * -1e4)
            removed_eos = np.multiply(mask, negged_wll)
            sll = removed_eos + expanded_history
        else:
            sll = wll.T

        flat_sll = sll.reshape(-1)

        bests = topk(K, flat_sll)
        best_idx_flat = np.array(list(bests.keys()))
        best_beams = best_idx_flat // V
        best_idx = best_idx_flat % V

        new_paths = []
        new_done = []

        hidden = rnn_state.s()
        new_hidden = [[] for _ in range(num_states)]
        for j, best_flat in enumerate(best_idx_flat):
            beam_id = best_beams[j]
            best_word = best_idx[j]
            if done[j]:
                new_paths.append(paths[beam_id] + [Offsets.EOS])
            else:
                new_paths.append(paths[beam_id] + [best_word])
            if best_word == Offsets.EOS:
                done[j] = True
            new_done.append(done[beam_id])
            scores[j] = bests[best_flat]
            # For each path, we need to pick that out and add it to the hiddens
            # This will be (c1, c2, ..., h1, h2, ...)
            for h_i, h in enumerate(hidden):
                new_hidden[h_i].append(dy.pick_batch_elem(h, beam_id))

        done = np.array(new_done)
        new_hidden = [dy.concatenate_to_batch(new_h) for new_h in new_hidden]
        paths = new_paths
        rnn_state = self.decoder_rnn.initial_state(new_hidden)

    paths = np.stack([p[1:] for p in paths])
    return paths, scores