def optimal_tree_construction(span_to_label, sentence, span_to_on_score):
    """Assemble the highest-scoring binary parse tree consistent with the
    labeled spans in ``span_to_label``.

    Parameters
    ----------
    span_to_label : dict mapping (left, right) span tuples to label tuples.
        The full-sentence span (0, len(sentence)) must be present, and no
        stored label may be the empty tuple.
    sentence : sequence of (tag, word) pairs, one per token.
    span_to_on_score : dict mapping labeled spans to the score earned for
        turning that span "on" (including it as a constituent).

    Returns the single root node (project type ``InternalParseNode`` /
    ``LeafParseNode``) of the best tree.

    NOTE(review): `check_overlap`, `LeafParseNode`, and `InternalParseNode`
    are defined elsewhere in the project; their exact semantics are assumed,
    not visible here.
    """
    # Mark every span that crosses some other candidate span; such spans
    # cannot be assumed compatible, so the split search below keeps looking
    # past them instead of stopping early.
    conflicting = set()
    for span_a in span_to_label:
        for span_b in span_to_label:
            if check_overlap(span_a, span_b):
                conflicting.add(span_a)

    # Memoization table for helper(): (left, right) -> (trees, score).
    cache = {}

    def helper(left, right):
        """Return (list_of_trees, score) for the best parse of [left, right)."""
        if (left, right) in cache:
            return cache[(left, right)]

        if (left, right) in span_to_label:
            label = span_to_label[(left, right)]
            assert label != ()
        else:
            # Only the root span is required to carry a label.
            assert left != 0 or right != len(sentence)
            label = ()

        # Base case: a single token becomes a leaf, optionally wrapped in a
        # unary constituent if this one-word span is labeled.
        if right - left == 1:
            tag, word = sentence[left]
            tree = LeafParseNode(left, tag, word)
            score = 0
            if label:
                tree = InternalParseNode(label, [tree])
                score += span_to_on_score[(left, right)]
            return [tree], score

        # Collect candidate split points, scanning from the widest left child
        # (split == right - 1) down. A labeled, non-conflicting left span is
        # trusted and stops the search; conflicting ones keep it going.
        split_options = []
        for split in range(right - 1, left, -1):
            if (left, split) in span_to_label:
                split_options.append(split)
                if (left, split) not in conflicting:
                    break
        # Fallback: if the scan ran all the way down, ensure the minimal
        # split (single-token left child) is available.
        # NOTE(review): this can append left + 1 twice when (left, left + 1)
        # is itself labeled; harmless duplicates, re-evaluation hits the cache.
        if split == left + 1:
            split_options.append(left + 1)
        assert len(split_options) > 0

        # Try each candidate split and keep the highest combined score.
        best_option_score = None
        best_option = None
        for split in split_options:
            left_trees, left_score = helper(left, split)
            right_trees, right_score = helper(split, right)
            children = left_trees + right_trees
            score = left_score + right_score
            if label:
                # Wrap the children in a labeled constituent for this span.
                children = [InternalParseNode(label, children)]
                score += span_to_on_score[(left, right)]
            if best_option_score is None or score > best_option_score:
                best_option_score = score
                best_option = children

        response = best_option, best_option_score
        cache[(left, right)] = response
        return response

    trees, _ = helper(0, len(sentence))
    # The root span must be labeled, which guarantees helper() wrapped the
    # whole sentence in exactly one constituent.
    assert (0, len(sentence)) in span_to_label
    assert len(trees) == 1, len(trees)
    return trees[0]
def aggressive_annotation(self, sentence, sentence_number, span_to_gold_label, low_conf_cutoff, seen):
    """Select spans of ``sentence`` worth sending to an annotator.

    Scores every candidate span in ``span_to_gold_label`` with the model,
    then returns annotation requests (dicts) for spans the model is unsure
    about. Mutates ``span_to_gold_label`` in place: spans already ``seen``
    and spans the model is confidently negative about are deleted.

    Parameters
    ----------
    sentence : token sequence accepted by ``self._featurize_sentence``.
    sentence_number : identifier recorded in each annotation request.
    span_to_gold_label : dict mapping (start, end) spans to oracle labels;
        pruned in place as described above.
    low_conf_cutoff : entropy threshold above which a span is considered
        low-confidence and requested for annotation.
    seen : collection of spans already annotated; these are skipped and
        removed.

    Returns a list of annotation-request dicts (the commented-out second
    return value for high-confidence labels has been disabled).

    NOTE(review): relies on DyNet (``dy``), ``scipy.stats`` and ``numpy``
    plus several model attributes (``f_label``, ``label_vocab``,
    ``empty_label_index``) defined elsewhere — semantics assumed from usage.
    """
    if len(span_to_gold_label) == 0:
        return []  # , []

    # Encode the sentence once, then batch-score all candidate spans.
    lstm_outputs = self._featurize_sentence(sentence, is_train=False)
    encodings = []
    spans = span_to_gold_label.keys()
    for (start, end) in spans:
        encodings.append(self._get_span_encoding(start, end, lstm_outputs))
    label_scores = self.f_label(dy.concatenate_to_batch(encodings))
    # Reshape to (num_labels, num_spans) so column i is span i's score vector.
    label_scores_reshaped = dy.reshape(label_scores, (self.label_vocab.size, len(encodings)))
    label_probabilities_np = dy.softmax(label_scores_reshaped).npvalue()

    low_confidence_labels = []
    # high_confidence_labels = []
    on_labels = []
    # Materialize the enumeration first: the loop body deletes keys from
    # span_to_gold_label, which would invalidate a live dict-view iterator.
    for index, (start, end) in list(enumerate(spans)):
        distribution = label_probabilities_np[:, index]
        entropy = stats.entropy(distribution)
        oracle_label = span_to_gold_label[(start, end)]
        annotation_request = dict(
            sentence_number=sentence_number,
            left=start,
            right=end,
            entropy=entropy,
            non_constituent_probability=distribution[0],
            label=oracle_label
        )
        # Already-annotated spans are dropped outright.
        if (start, end) in seen:
            del span_to_gold_label[(start, end)]
            continue
        if low_conf_cutoff < entropy and distribution[self.empty_label_index] < 0.5:
            # Uncertain and probably a real constituent: ask the annotator.
            # annotation_request['label'] = oracle_label
            low_confidence_labels.append(annotation_request)
        elif entropy < 10 ** -5 and distribution[self.empty_label_index] > 0.99:
            # Model is near-certain this span is not a constituent: prune it.
            del span_to_gold_label[(start, end)]
            # if entropy > 10 ** -7:
            #     high_confidence_labels.append(annotation_request)
        # Track spans the model would turn "on" (some real label beats the
        # empty label) so conflicting predictions can be flagged below.
        if np.max(distribution) > distribution[self.empty_label_index]:
            on_labels.append(annotation_request)

    # Any two predicted-on spans that overlap cannot both be right; force
    # both into the low-confidence pool with a sentinel entropy of 10.
    # NOTE(review): a span overlapping several others is appended once per
    # conflict, so duplicates can appear in the returned list.
    for index, label_a in enumerate(on_labels):
        span_a = (label_a['left'], label_a['right'])
        for label_b in on_labels[index + 1:]:
            span_b = (label_b['left'], label_b['right'])
            if check_overlap(span_a, span_b):
                label_a['entropy'] = 10
                low_confidence_labels.append(label_a)
                label_b['entropy'] = 10
                low_confidence_labels.append(label_b)
    return low_confidence_labels  # , high_confidence_labels
def test_must_check_if_there_is_an_overlap(self):
    """check_overlap must report True for a pair of intersecting spans."""
    span_a = (1, 10)
    span_b = (22, 7)
    self.assertTrue(question_a.check_overlap(span_a, span_b))
def test_must_check_if_there_is_no_overlap(self):
    """check_overlap must report False for a pair of disjoint spans."""
    span_a = (1, 10)
    span_b = (11, 22)
    self.assertFalse(question_a.check_overlap(span_a, span_b))