def _collapse_entities_in_doc(doc, entities_to_collapse: Iterable[Entity], entity_types_to_collapse: Union[set, frozenset]):
    """Build a copy of *doc* where the given entities are collapsed.

    Tokens and token features are copied first so the original document is
    never mutated; entity/sentence/ne borders are then recomputed to match
    the collapsed token sequence.

    Args:
        doc: source Document (only the "ne" extra is supported).
        entities_to_collapse: entities whose spans should be collapsed.
        entity_types_to_collapse: entity types eligible for collapsing.

    Returns:
        A tuple ``(collapsed_doc, collapsed_entities_mapping)`` where the
        mapping takes each entity of *entities_to_collapse* to its
        re-bordered counterpart in the new document.

    Raises:
        Exception: if *doc* carries extras other than "ne".
    """
    if set(doc.extras.keys()).difference({"ne"}):
        raise Exception("Currently support only ne extras")

    # Copy features so collapsing does not affect the default document.
    tokens_to_process = list(doc.tokens)
    token_features_to_process = {k: list(v) for k, v in doc.token_features.items()}

    borders_to_change = {
        'entities_to_collapse': build_borders_dict(entities_to_collapse),
        'sentences': build_borders_dict(doc.sentences)
    }

    # doc.entities presumably raises ValueError when the document has no
    # entities — in that case we simply skip them (TODO confirm contract).
    try:
        borders_to_change["entities"] = build_borders_dict(doc.entities)
    except ValueError:
        pass

    if "ne" in doc.extras:
        borders_to_change["ne"] = build_borders_dict(doc.extras["ne"])

    _collapse_entities_and_correct_features(entities_to_collapse, tokens_to_process,
                                            token_features_to_process,
                                            entity_types_to_collapse, borders_to_change)

    sentences_mapping = create_objects_with_new_borders(
        doc.sentences, borders_to_change['sentences'])
    collapsed_entities_mapping = create_objects_with_new_borders(
        entities_to_collapse, borders_to_change['entities_to_collapse'])

    # FIX: pre-bind doc_entities_mapping. Previously it was only assigned in
    # the 'entities' branch, so a document with relations but no entities hit
    # a NameError below — which the `except ValueError` did not catch.
    doc_entities_mapping = None
    if 'entities' in borders_to_change:
        doc_entities_mapping = create_objects_with_new_borders(
            doc.entities, borders_to_change['entities'])
        doc_entities = doc_entities_mapping.values()
    else:
        doc_entities = None

    if "ne" in doc.extras:
        ne_mapping = create_objects_with_new_borders(doc.extras["ne"],
                                                     borders_to_change["ne"])
        extras = {"ne": SortedSpansSet(ne_mapping.values())}
    else:
        extras = None

    doc_to_process = Document(doc.name, tokens_to_process,
                              sentences_mapping.values(), doc.paragraphs,
                              doc_entities,
                              token_features=token_features_to_process,
                              extras=extras)

    # Relations can only be remapped when entities were remapped; doc.relations
    # presumably raises ValueError when absent (TODO confirm), in which case
    # the document is returned without relations.
    if doc_entities_mapping is not None:
        try:
            relations = [
                Relation(doc_entities_mapping[r.first_entity],
                         doc_entities_mapping[r.second_entity], r.type)
                for r in doc.relations
            ]
            doc_to_process = doc_to_process.with_relations(relations)
        except ValueError:
            pass

    return doc_to_process, collapsed_entities_mapping
def make_document_from_json_file(file_path):
    """Deserialize a Document from a JSON file.

    The JSON dict may contain 'tokens', 'sentences', 'paragraphs',
    'entities', 'relations' and a fixed set of per-token feature arrays;
    every key is optional. Entities and relations are attached to the
    document only when their keys are present in the file.

    Args:
        file_path: path to the JSON file.

    Returns:
        The reconstructed Document.
    """
    d = load_json_file_as_dict(file_path)

    tokens = d.get('tokens', [])

    # Known per-token feature arrays; copy only those present in the file.
    token_features = {
        feature: d[feature]
        for feature in [
            'pos', 'entities_types', 'entities_depths', 'borders',
            'dt_labels', 'dt_head_distances', 'dt_depths',
            'dt_deltas_forward', 'dt_deltas_backward',
            'dt_breakups_forward', 'dt_breakups_backward'
        ]
        if feature in d
    }

    doc_entities = [
        Entity(id_, start_token, end_token, ent_type)
        for id_, start_token, end_token, ent_type in d.get('entities', [])
    ]
    doc_sentences = [
        Sentence(start_token, end_token)
        for start_token, end_token in d.get('sentences', [])
    ]
    doc_paragraphs = [
        Paragraph(start_sentence, end_sentence)
        for start_sentence, end_sentence in d.get('paragraphs', [])
    ]

    # FIX: O(1) id lookup instead of the previous linear scan over all
    # entities for every relation. As before, an unresolved endpoint id
    # yields None (entity ids are assumed unique within a document).
    entities_by_id = {entity.id: entity for entity in doc_entities}
    doc_relations = [
        Relation(entities_by_id.get(e1_id), entities_by_id.get(e2_id), rel_type)
        for e1_id, e2_id, rel_type in d.get('relations', [])
    ]

    doc = Document("", tokens, doc_sentences, doc_paragraphs,
                   token_features=token_features)
    # Attach entities/relations only when the keys exist in the file, even
    # if the corresponding lists are empty (preserves original behavior).
    if 'entities' in d:
        doc = doc.with_entities(doc_entities)
    if 'relations' in d:
        doc = doc.with_relations(doc_relations)
    return doc
class TestCandidatesExtraction(unittest.TestCase):
    """Tests for relation-candidate filters and extraction strategies.

    The fixture document has two sentences over 16 tokens and five entities;
    entities[3] (tokens 9-11) and entities[4] (tokens 10-11) overlap, which
    the intersection/distance tests rely on.
    """

    def setUp(self) -> None:
        tokens = [
            "I", "will", "do", "my", "homework", "today", ".", "It", "is",
            "very", "hard", "but", "i", "don't", "care", "."
        ]
        # Sentence boundaries are half-open token spans: [0, 7) and [7, 16).
        sentences = [Sentence(0, 7), Sentence(7, 16)]
        paragraphs = [Paragraph(0, 2)]
        entities = [
            Entity("_", 0, 1, "t1"),
            Entity("_", 3, 5, "t2"),
            Entity("_", 7, 8, "t1"),
            Entity("_", 9, 11, "t2"),
            Entity("_", 10, 11, "t4")  # overlaps entities[3]
        ]
        self.doc = Document("_", tokens, sentences, paragraphs, entities)
        # Gold relations used by the *_with_rels strategy tests.
        self.relations = {
            Relation(entities[2], entities[3], "t1"),
            Relation(entities[3], entities[4], "t2")
        }

    def test_DifferentEntitiesCandidateFilter(self):
        """Rejects a pair only when both members are the same entity."""
        f = DifferentEntitiesCandidateFilter()
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[1]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[2]))

    def test_InSameSentenceCandidateFilter(self):
        """Accepts pairs only when both entities lie in one sentence."""
        f = InSameSentenceCandidateFilter()
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[1]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[3]))
        # Cross-sentence pairs are rejected in either order.
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[1]))

    def test_MaxTokenDistanceCandidateFilter_intersecting_case(self):
        """With distance 0, only touching/overlapping entity pairs pass."""
        f = MaxTokenDistanceCandidateFilter(0)
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[1]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[4], self.doc.entities[2]))
        # entities[3] and entities[4] overlap, so they pass in both orders.
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[4]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[4], self.doc.entities[3]))

    def test_MaxTokenDistanceCandidateFilter_normal_case(self):
        """With distance 3, only pairs within 3 tokens of each other pass."""
        f = MaxTokenDistanceCandidateFilter(3)
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0],
                    self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[1], self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[3]))

    def test_RelArgTypesCandidateFilter(self):
        """Accepts a pair only when its (type, type) tuple is whitelisted.

        The check is order-sensitive: ("t2", "t4") is valid, ("t4", "t2")
        is not.
        """
        valid_types = {("t1", "t1"), ("t2", "t4")}
        f = RelArgTypesCandidateFilter(valid_types)
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[0]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[0]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[1], self.doc.entities[4]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[4]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[4], self.doc.entities[1]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[3]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[3]))

    def test_IntersectingCandidateFilter(self):
        """Accepts pairs whose token spans do NOT overlap (self-pairs and
        overlapping spans such as entities[3]/entities[4] are rejected)."""
        f = IntersectingCandidateFilter()
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[2]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[2], self.doc.entities[0]))
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[1], self.doc.entities[4]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[0]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[4]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[4], self.doc.entities[3]))

    def test_AndFilter(self):
        """A pair passes only when every wrapped filter accepts it."""
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t1"), ("t2", "t4")})
        ]
        f = AndFilter(filts)
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[0]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[0], self.doc.entities[2]))
        self.assertFalse(
            f.apply(self.doc, self.doc.entities[1], self.doc.entities[4]))
        self.assertFalse(
            f.apply(self.doc,
                    self.doc.entities[2], self.doc.entities[4]))
        # Only (3, 4) satisfies all three filters at once.
        self.assertTrue(
            f.apply(self.doc, self.doc.entities[3], self.doc.entities[4]))

    def test_DefaultPairExtractionStrategy_no_rels(self):
        """Without labels, the strategy returns every pair accepted by the
        composite filter, in document order."""
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t2"), ("t2", "t4")})
        ]
        expected_pairs = [(self.doc.entities[0], self.doc.entities[1]),
                          (self.doc.entities[2], self.doc.entities[3]),
                          (self.doc.entities[3], self.doc.entities[4])]
        strategy = DefaultPairExtractionStrategy(AndFilter(filts))
        actual_pairs = strategy.apply(self.doc, include_labels=False)
        self.assertEqual(actual_pairs, expected_pairs)

    def test_DefaultPairExtractionStrategy_with_rels(self):
        """include_labels=True does not change the extracted pairs."""
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t2"), ("t2", "t4")})
        ]
        expected_pairs = [(self.doc.entities[0], self.doc.entities[1]),
                          (self.doc.entities[2], self.doc.entities[3]),
                          (self.doc.entities[3], self.doc.entities[4])]
        strategy = DefaultPairExtractionStrategy(AndFilter(filts))
        actual_pairs = strategy.apply(self.doc.with_relations(self.relations),
                                      include_labels=True)
        self.assertEqual(actual_pairs, expected_pairs)

    def test_DefaultCandidateExtractionStrategy_no_rels(self):
        """Without relations every candidate gets a None label."""
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t2"), ("t2", "t4")})
        ]
        expected_candidates = [
            (self.doc.entities[0], self.doc.entities[1], None),
            (self.doc.entities[2], self.doc.entities[3], None),
            (self.doc.entities[3], self.doc.entities[4], None)
        ]
        strategy = DefaultCandidateExtractionStrategy(
            DefaultPairExtractionStrategy(AndFilter(filts)))
        actual_candidates = strategy.apply(self.doc, include_labels=False)
        self.assertEqual(actual_candidates, expected_candidates)

    def test_DefaultCandidateExtractionStrategy_with_rels(self):
        """With relations, each candidate carries the gold relation type
        (or None when the pair is unrelated)."""
        filts = [
            DifferentEntitiesCandidateFilter(),
            InSameSentenceCandidateFilter(),
            RelArgTypesCandidateFilter({("t1", "t2"), ("t2", "t4")})
        ]
        expected_candidates = [
            (self.doc.entities[0], self.doc.entities[1], None),
            (self.doc.entities[2], self.doc.entities[3], "t1"),
            (self.doc.entities[3], self.doc.entities[4], "t2")
        ]
        strategy = DefaultCandidateExtractionStrategy(
            DefaultPairExtractionStrategy(AndFilter(filts)))
        actual_candidates = strategy.apply(self.doc.with_relations(
            self.relations), include_labels=True)
        self.assertEqual(actual_candidates, expected_candidates)