Example #1
    def test_inner_entities_collapse(self):
        expected_tokens = [
            "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
            "$Geographical$", ".", "From", "Oct.", "30", "to", "Nov.", "7",
            ",", "1979", ",", "10", "people", "in", "the", "$Geographical$",
            "of", "$Geographical$", "had", "onset", "of", "bacteriologically",
            "confirmed", "cholera", "."
        ]
        expected_sentences = [Sentence(0, 7), Sentence(7, 30)]

        expected_entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 4, "Bacteria"),
            Entity("T3", 3, 4, "Bacteria"),
            Entity("T4", 5, 6, "Geographical"),
            Entity("T5", 17, 18, "Habitat"),
            Entity("T6", 17, 23, "Habitat"),
            Entity("T7", 20, 21, "Geographical"),
            Entity("T8", 22, 23, "Geographical"),
            Entity("T9", 28, 29, "Bacteria")
        ]

        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        expected_relations = [
            Relation(expected_entities[0], expected_entities[1], "Lives_in"),
            Relation(expected_entities[8], expected_entities[6], "Lives_in")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences,
                                expected_paragraphs, expected_entities,
                                expected_relations)

        actual_doc = EntitiesCollapser({"Geographical"}).transform(self.doc)
        self.assertEqual(expected_doc, actual_doc)
Example #2
    def test_one_entity(self):
        sentences = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5)]
        entities = [Entity('', 2, 4, 'test')]

        expected_sentences = [Sentence(0, 2), Sentence(2, 5)]
        got_sentences = adjust_sentences(sentences, entities)
        self.assertEqual(expected_sentences, got_sentences)
Example #3
    def _create_doc(self, doc_raw_tokens: List[List[str]], doc_idx) -> Document:
        tokens, sentences, entities, pos_tags = [], [], [], []

        sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
        sent_start = 0
        for raw_token in doc_raw_tokens:
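            # an empty raw token row marks a sentence boundary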
            if not raw_token:
                if sent_tokens:
                    tokens.extend(sent_tokens)
                    pos_tags.extend(sent_pos_tags)
                    sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
                    sent_start += len(sent_tokens)
                    entities.extend(self._decode_strategy.decode_labels(sentences[-1], sent_entities_labels))
                    sent_tokens, sent_pos_tags, sent_entities_labels = [], [], []
                continue

            token, pos_tag, _, ent_label = raw_token
            sent_tokens.append(token)
            sent_entities_labels.append(ent_label)
            sent_pos_tags.append(pos_tag)

        if sent_tokens:
            tokens.extend(sent_tokens)
            pos_tags.extend(sent_pos_tags)
            sentences.append(Sentence(sent_start, sent_start + len(sent_tokens)))
            entities.extend(self._decode_strategy.decode_labels(sentences[-1], sent_entities_labels))

        return Document(
            str(doc_idx), tokens, sentences, [Paragraph(0, len(sentences))], entities, token_features={"pos": pos_tags})
Example #4
    def test_leq(self):
        self.assertTrue(TokenSpan(1, 2) < TokenSpan(2, 3))
        self.assertTrue(TokenSpan(1, 2) < TokenSpan(2, 3))
        self.assertFalse(TokenSpan(2, 3) < TokenSpan(0, 1))

        self.assertRaises(Exception, lambda: TokenSpan(0, 1) < Sentence(2, 3))
        self.assertRaises(Exception, lambda: Sentence(0, 1) < TokenSpan(2, 3))
Example #5
    def setUp(self):
        tokens = [
            "Recurrence", "of", "Pelecypod-associated", "cholera", "in",
            "Sardinia", ".", "From", "Oct.", "30", "to", "Nov.", "7", ",",
            "1979", ",", "10", "people", "in", "the", "Sardinian", "province",
            "of", "Cagliari", "had", "onset", "of", "bacteriologically",
            "confirmed", "cholera", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 31)]

        entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 4, "Bacteria"),
            Entity("T3", 3, 4, "Bacteria"),
            Entity("T4", 5, 6, "Geographical"),
            Entity("T5", 17, 18, "Habitat"),
            Entity("T6", 17, 24, "Habitat"),
            Entity("T7", 20, 22, "Geographical"),
            Entity("T8", 23, 24, "Geographical"),
            Entity("T9", 29, 30, "Bacteria")
        ]

        paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        relations = [
            Relation(entities[0], entities[1], "Lives_in"),
            Relation(entities[8], entities[6], "Lives_in")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities,
                            relations)
Example #6
    def test_borders_extraction_3(self):
        tokens = ["bacteria", "spotted", ".", ".", "it's", "."]
        sentences = [Sentence(0, 3), Sentence(3, 4), Sentence(4, 6)]
        broken_doc = Document("", tokens, sentences, [Paragraph(0, 2)])

        borders = ["start", "in", "end", "start", "start", "end"]
        self.assertEqual(get_sentence_borders_feature(broken_doc), borders)
Example #7
    def test_3_entity_paragraphs(self):
        sentences = [
            Sentence(0, 5),
            Sentence(5, 10),
        ]
        paragraphs = [
            Paragraph(0, 1),
            Paragraph(1, 2),
        ]
        entities = [
            Entity('_', 0, 1, '1'),
            Entity('_', 1, 2, '1'),
            Entity('_', 5, 6, '2'),
        ]
        doc = Document('test', [], sentences, paragraphs, entities)

        max_distance = 3

        actual_samples = get_samples(doc, max_distance, False)

        expected_samples = [
            (Entity('_', 0, 1, '1'), Entity('_', 1, 2, '1'), None),
            (Entity('_', 0, 1, '1'), Entity('_', 5, 6, '2'), None),
            (Entity('_', 1, 2, '1'), Entity('_', 5, 6, '2'), None),
        ]

        self.assertEqual(expected_samples, actual_samples)
Example #8
    def setUp(self):
        sent_1_tokens = [
            "Human", "and", "tick", "spotted", "fever", "group", "Rickettsia",
            "isolates", "from", "Israel", ":", "a", "genotypic", "analysis",
            "."
        ]
        sent_1_head_distances = [
            3, -1, -2, 0, 2, 1, 1, 7, -1, -1, 4, 2, 1, 1, -11
        ]

        self.doc_with_1_sent = Document(
            "",
            sent_1_tokens, [Sentence(0, len(sent_1_tokens))],
            [Paragraph(0, 1)],
            token_features={"dt_head_distances": sent_1_head_distances})

        sent_2_tokens = [
            "The", "precise", "mechanisms", "that", "initiate", "bacterial",
            "uptake", "have", "not", "yet", "been", "elucidated", "."
        ]
        sent_2_head_distances = [2, 1, 9, 1, -2, 1, -2, 4, 3, 2, 1, 0, -1]

        self.doc_with_2_sent = Document(
            "",
            sent_1_tokens + sent_2_tokens, [
                Sentence(0, len(sent_1_tokens)),
                Sentence(len(sent_1_tokens),
                         len(sent_1_tokens) + len(sent_2_tokens))
            ], [Paragraph(0, 2)],
            token_features={
                "dt_head_distances":
                sent_1_head_distances + sent_2_head_distances
            })
Example #9
    def test_collapsement_of_same_spans(self):
        tokens = ["Elon", "Musk", "is", "CEO", "of", "Tesla", "."]
        sentences = [Sentence(0, 7)]
        entities = [
            Entity("_", 0, 2, "ELON"),
            Entity("_", 0, 2, "MUSK"),
            Entity("_", 5, 6, "COMP"),
            Entity("_", 5, 6, "ORG")
        ]

        input_doc = Document("_", tokens, sentences, [], entities)

        expected_tokens = ["$ELON$", "is", "CEO", "of", "$COMP$", "."]
        expected_sentences = [Sentence(0, 6)]
        expected_entities = [
            Entity("_", 0, 1, "ELON"),
            Entity("_", 0, 1, "MUSK"),
            Entity("_", 4, 5, "COMP"),
            Entity("_", 4, 5, "ORG")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences, [],
                                expected_entities)

        actual_doc = EntitiesCollapser({"ELON", "COMP"}).transform(input_doc)
        self.assertEqual(expected_doc, actual_doc)
Example #10
    def test_entities_with_nesting_collapse(self):
        expected_tokens = [
            "Recurrence", "of", "$Bacteria$", "in", "Sardinia", ".", "From",
            "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10", "people",
            "in", "the", "Sardinian", "province", "of", "Cagliari", "had",
            "onset", "of", "bacteriologically", "confirmed", "$Bacteria$", "."
        ]
        expected_sentences = [Sentence(0, 6), Sentence(6, 30)]

        expected_entities = [
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 3, "Bacteria"),
            Entity("T3", 2, 3, "Bacteria"),
            Entity("T4", 4, 5, "Geographical"),
            Entity("T5", 16, 17, "Habitat"),
            Entity("T6", 16, 23, "Habitat"),
            Entity("T7", 19, 21, "Geographical"),
            Entity("T8", 22, 23, "Geographical"),
            Entity("T9", 28, 29, "Bacteria")
        ]

        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]
        expected_relations = [
            Relation(expected_entities[0], expected_entities[1], "Lives_in"),
            Relation(expected_entities[8], expected_entities[6], "Lives_in")
        ]

        expected_doc = Document("_", expected_tokens, expected_sentences,
                                expected_paragraphs, expected_entities,
                                expected_relations)

        actual_doc = EntitiesCollapser({"Bacteria"}).transform(self.doc)
        self.assertEqual(expected_doc, actual_doc)
Example #11
def get_sentence_relations(relations: Iterable[Relation], sent: Sentence):
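    """Return the set of relations whose entities both lie within the given sentence."""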
    ret = set()

    for rel in relations:
        if sent.contains(rel.first_entity) and sent.contains(
                rel.second_entity):
            ret.add(rel)

    return ret
Example #12
    def test_collapsing_with_ne(self):
        input_doc = self.doc.with_additional_extras({"ne": self.doc.entities})
        input_doc = input_doc.without_relations().without_entities()

        entities = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 4, "same"),
            Entity("_", 3, 4, "include"),
            Entity("_", 5, 6, "same"),
            Entity("_", 15, 19, "intersect"),
            Entity("_", 17, 20, "include"),
            Entity("_", 22, 25, "intersect")
        ])

        input_doc = input_doc.with_entities(entities)

        expected_tokens = [
            "Recurrence", "of", "$Bacteria$", "in", "$Geographical$", ".",
            "From", "Oct.", "30", "to", "Nov.", "7", ",", "1979", ",", "10",
            "$Habitat$", "had", "onset", "of", "bacteriologically",
            "confirmed", "$Bacteria$", "."
        ]
        expected_sentences = [Sentence(0, 6), Sentence(6, 24)]
        expected_paragraphs = [Paragraph(0, 1), Paragraph(1, 2)]

        expected_nes = SortedSpansSet([
            Entity("T1", 2, 3, "Habitat"),
            Entity("T2", 2, 3, "Bacteria"),
            Entity("T3", 2, 3, "Bacteria"),
            Entity("T4", 4, 5, "Geographical"),
            Entity("T5", 16, 17, "Habitat"),
            Entity("T6", 16, 17, "Habitat"),
            Entity("T7", 16, 17, "Geographical"),
            Entity("T8", 16, 17, "Geographical"),
            Entity("T9", 22, 23, "Bacteria")
        ])

        expected_entities = SortedSpansSet([
            Entity("_", 0, 1, "left"),
            Entity("_", 2, 3, "same"),
            Entity("_", 2, 3, "include"),
            Entity("_", 4, 5, "same"),
            Entity("_", 14, 17, "intersect"),
            Entity("_", 16, 17, "include"),
            Entity("_", 16, 18, "intersect")
        ])

        expected_doc = Document("_",
                                expected_tokens,
                                expected_sentences,
                                expected_paragraphs,
                                expected_entities,
                                extras={"ne": expected_nes})

        actual_doc = EntitiesCollapser({"Habitat", "Bacteria", "Geographical"},
                                       True).transform(input_doc)
        self.assertEqual(expected_doc, actual_doc)
Example #13
    def test_two_entities_separated(self):
        sentences = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5), Sentence(5, 8), Sentence(8, 10), Sentence(10, 15)]
        entities = [Entity('', 2, 4, 'test'), Entity('', 9, 11, 'test')]

        expected_sentences = [Sentence(0, 2), Sentence(2, 5), Sentence(5, 8), Sentence(8, 15)]
        got_sentences = adjust_sentences(sentences, entities)
        self.assertEqual(expected_sentences, got_sentences)
Example #14
    def test_contained_entities(self):
        sentences = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5), Sentence(5, 8), Sentence(8, 10), Sentence(10, 15)]
        entities = [Entity('', 2, 6, 'test'), Entity('', 6, 7, 'test'), Entity('', 7, 8, 'test')]

        expected_sentences = [Sentence(0, 2), Sentence(2, 8), Sentence(8, 10), Sentence(10, 15)]
        got_sentences = adjust_sentences(sentences, entities)
        self.assertEqual(expected_sentences, got_sentences)
Example #15
    def test_multi_sentence(self):
        sentences = [Sentence(0, 2), Sentence(2, 3), Sentence(3, 5), Sentence(5, 8), Sentence(8, 10), Sentence(10, 15)]
        entities = [Entity('', 2, 9, 'test')]

        expected_sentences = [Sentence(0, 2), Sentence(2, 10), Sentence(10, 15)]
        got_sentences = adjust_sentences(sentences, entities)
        self.assertEqual(expected_sentences, got_sentences)
Example #16
    def test_2_chains_2_pron(self):
        sentences = [Sentence(0, 10)]
        paragraphs = [Paragraph(0, 1)]
        entities = [
            Entity('_', 0, 1, 'noun'),
            Entity('_', 1, 2, 'pron'),
            Entity('_', 2, 3, 'pron'),
            Entity('_', 3, 4, 'noun'),
            Entity('_', 5, 6, 'noun'),
        ]
        rels = {
            Relation(Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'),
                     '1'),
            Relation(Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'),
                     '1'),
            Relation(Entity('_', 3, 4, 'noun'), Entity('_', 5, 6, 'noun'),
                     '1'),
        }
        doc = Document('test', [], sentences, paragraphs, entities, rels)

        max_distance = 3

        actual_samples = get_pron_samples(doc, max_distance, True)
        expected_samples = [
            (Entity('_', 0, 1, 'noun'), Entity('_', 1, 2, 'pron'), None),
            (Entity('_', 1, 2, 'pron'), Entity('_', 3, 4, 'noun'), '1'),
            (Entity('_', 0, 1, 'noun'), Entity('_', 2, 3, 'pron'), '1'),
            (Entity('_', 2, 3, 'pron'), Entity('_', 3, 4, 'noun'), None),
        ]
        self.assertEqual(actual_samples, expected_samples)
Example #17
    def test_bio2_strategy_encoding(self):
        strategy = BIO2LabellingStrategy()
        sent = Sentence(110, 120)
        ents = [
            Entity("_", 110, 112, "T1"),
            Entity("_", 112, 113, "T2"),
            Entity("_", 114, 115, "T3"),
            Entity("_", 115, 118, "T3"),
            Entity("_", 119, 120, "T1")
        ]

        expected_possible_categories = {
            "O", "I-T1", "I-T2", "I-T3", "B-T1", "B-T2", "B-T3"
        }
        actual_possible_categories = strategy.get_possible_categories(
            self.ent_types)
        self.assertEqual(expected_possible_categories,
                         actual_possible_categories)

        expected_encoding = [
            "I-T1", "I-T1", "I-T2", "O", "I-T3", "B-T3", "I-T3", "I-T3", "O",
            "I-T1"
        ]
        actual_encoding = strategy.encode_labels(sent, ents)
        self.assertEqual(expected_encoding, actual_encoding)
Example #18
    def setUp(self):
        self.sents = SortedSpansSet([
            Sentence(6, 9),
            Sentence(0, 10),
            Sentence(4, 12),
            Sentence(6, 9),
            Sentence(6, 12)
        ])
        self.ents = SortedSpansSet([
            Entity('', 0, 5, ''),
            Entity('', 0, 5, ''),
            Entity('1', 2, 6, ''),
            Entity('2', 2, 7, ''),
            Entity('', 7, 9, ''),
            Entity('', 7, 9, '')
        ])
Example #19
    def test_eq(self):
        self.assertEqual(Entity("_", 1, 2, "T"), Entity("_", 1, 2, "T"))
        self.assertNotEqual(Entity("_", 1, 2, "T"), Entity("_", 1, 2, "P"))
        self.assertNotEqual(Entity("__", 1, 2, "T"), Entity("_", 1, 2, "T"))
        self.assertNotEqual(Entity("_", 1, 2, "T"), Entity("_", 1, 3, "T"))

        self.assertNotEqual(Entity("_", 1, 2, "T"), Sentence(1, 2))
Example #20
    def segment(self, text):
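        # split the text into line-based sentences, tokenize each line and
        # keep document-level token indices plus character offsets (raw_tokens)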
        tokens = []
        sentences = []
        raw_tokens = []

        token_sent_start = 0
        for line_match in self.lines_regexp.finditer(text):
            raw_sent_start, raw_sent_end = line_match.span()
            sent_text = line_match.group()

            token_matches = list(self.tokens_regexp.finditer(sent_text))
            if not token_matches:
                continue

            sentences.append(
                Sentence(token_sent_start,
                         token_sent_start + len(token_matches)))
            token_sent_start += len(token_matches)

            for token_match in token_matches:
                token_start, token_end = token_match.span()
                token_text = token_match.group()

                raw_tokens.append(
                    (raw_sent_start + token_start, raw_sent_start + token_end))
                tokens.append(token_text)

        return tokens, sentences, raw_tokens
Example #21
    def segment(self, text):
        sent = ud.Sentence()
        tokenizer = self.model.newTokenizer('ranges')
        tokenizer.setText(text)

        sentences = []
        tokens = []
        raw_tokens = []
        sent_start = 0

        while tokenizer.nextSentence(sent):
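            # sent.words[0] is UDPipe's technical root node, so it is skipped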
            words = sent.words[1:]
            sent_raw_tokens = [(word.getTokenRangeStart(),
                                word.getTokenRangeEnd()) for word in words]

            sentences.append(
                Sentence(sent_start, sent_start + len(sent_raw_tokens)))
            tokens += [
                text[raw_token[0]:raw_token[1]]
                for raw_token in sent_raw_tokens
            ]

            raw_tokens += sent_raw_tokens
            sent_start += len(sent_raw_tokens)

        return tokens, sentences, raw_tokens
Example #22
def convert_from_digger_to_derek(diggerdoc: DiggerDoc,
                                 doc_name: str) -> Document:
    tokens = []
    token_features = {
        "pos": [],
        "dt_labels": [],
        "dt_head_distances": [],
        "lemmas": [],
        "feats": []
    }

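    # copy per-token attributes; the head distance is the signed offset from
    # a token to its dependency head (0 for the syntactic root)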
    for i, token in enumerate(diggerdoc.tokens):
        tokens.append(token.doc_text)
        token_features["pos"].append(token.pos.upos)
        token_features["dt_labels"].append(token.deprel)
        token_features["dt_head_distances"].append(
            token.head_index - i if token.head_index != -1 else 0)
        token_features["lemmas"].append(token.lemma)
        token_features["feats"].append(token.pos.feats)

    sentences = list(
        Sentence(sent.start, sent.end)
        for sent in diggerdoc.sentences_boundaries)
    # here we assume all doc sentences to be in 1 paragraph
    paragraphs = [Paragraph(0, len(sentences))]

    return Document(doc_name,
                    tokens,
                    sentences,
                    paragraphs,
                    token_features=token_features)
Example #23
    def _process_tokens_and_sentences(raw_text, raw_tokens, raw_sentences):
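        # map character-offset tokens to token indices; a sentence is closed
        # once the end offset of the current raw sentence has been reached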
        cur_sent_idx = 0
        sent_start = 0
        tokens, sentences = [], []

        for token in raw_tokens:
            tokens.append(raw_text[token[0]:token[1]])
            if raw_sentences[cur_sent_idx][1] <= token[1]:
                sentences.append(Sentence(sent_start, len(tokens)))
                sent_start = len(tokens)
                cur_sent_idx += 1

        if sent_start < len(tokens):
            sentences.append(Sentence(sent_start, len(tokens)))

        return tokens, sentences
Example #24
def _fix_sentences_and_tokens(subtokens_ranges, tokens, sentences, raw_tokens):
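    # rebuild tokens, raw token offsets and sentence boundaries after the
    # original tokens have been split into subtokens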
    fixed_tokens = []
    fixed_raw_tokens = []
    fixed_sentences = []

    new_sentence_start = 0
    for sent in sentences:
        new_sentence_length = 0

        for i in range(sent.start_token, sent.end_token):
            new_token_ranges = subtokens_ranges[i]
            token = tokens[i]
            raw_token = raw_tokens[i]

            for rng in new_token_ranges:
                fixed_tokens.append(token[rng[0]:rng[1]])
                fixed_raw_tokens.append(
                    (raw_token[0] + rng[0], raw_token[0] + rng[1]))
                new_sentence_length += 1

        fixed_sentences.append(
            Sentence(new_sentence_start,
                     new_sentence_start + new_sentence_length))
        new_sentence_start += new_sentence_length

    return fixed_tokens, fixed_sentences, fixed_raw_tokens
Example #25
    def setUp(self):
        doc_tokens = [
            "Human", "and", "tick", "spotted", "fever", "group", "Rickettsia",
            "isolates", "from", "Israel", ":", "a", "genotypic", "analysis",
            "."
        ] + [
            "The", "precise", "mechanisms", "that", "initiate", "bacterial",
            "uptake", "have", "not", "yet", "been", "elucidated", "."
        ]

        doc_sentences = [Sentence(0, 15), Sentence(15, 28)]
        doc_paragraphs = [Paragraph(0, 2)]

        doc_head_distances = [
            3, -1, -2, 0, 2, 1, 1, 7, -1, -1, 4, 2, 1, 1, -11
        ] + [2, 1, 9, 1, -2, 1, -2, 4, 3, 2, 1, 0, -1]

        doc_dt_labels = ["test"] * len(doc_tokens)
        doc_token_features = {
            "dt_head_distances": doc_head_distances,
            "dt_labels": doc_dt_labels
        }

        self.entity_with_one_token_no_root = (6, 7, 0)
        self.entity_with_several_tokens_no_root = (12, 14, 0)
        self.entity_with_one_token_root = (3, 4, 0)
        self.entity_with_several_tokens_root = (22, 27, 1)

        doc_entities = [
            self.entity_with_one_token_no_root,
            self.entity_with_several_tokens_no_root,
            self.entity_with_one_token_root,
            self.entity_with_several_tokens_root
        ]
        doc_entities = [
            Entity("", start, end, "") for start, end, _ in doc_entities
        ]

        self.doc = Document("",
                            doc_tokens,
                            doc_sentences,
                            doc_paragraphs,
                            doc_entities,
                            token_features=doc_token_features)
Example #26
    def _get_docs(self, raw_docs: Dict[str, List[dict]],
                  groups: Dict[str, list]) -> Dict[str, Document]:
        docs = {}
        for doc_id, raw_tokens in raw_docs.items():
            tokens = []
            token_features = {}
            sentences = []
            sent_start = 0
            shift2idx = {}

            for i, raw_token in enumerate(raw_tokens):
                tokens.append(raw_token['token'])
                token_features.setdefault('lemma',
                                          []).append(raw_token['lemma'])
                token_features.setdefault('gram', []).append(raw_token['gram'])
                if "speech" in raw_token:
                    token_features.setdefault("speech",
                                              []).append(raw_token['speech'])
                    token_features.setdefault("said",
                                              []).append(raw_token['said'])
                    token_features.setdefault("author_comment", []).append(
                        raw_token['author_comment'])
                    token_features.setdefault("speech_verb", []).append(
                        raw_token['speech_verb'])
                shift2idx[raw_token['shift']] = i

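                # the 'SENT' gram tag marks a sentence-final token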
                if raw_token['gram'] == 'SENT':
                    sentences.append(Sentence(sent_start, i + 1))
                    sent_start = i + 1
            if sentences[-1].end_token != len(tokens):
                sentences.append(Sentence(sent_start, len(tokens)))
            entities = self._get_entities(groups, shift2idx, doc_id)
            sentences = adjust_sentences(sentences, entities)

            doc = Document(doc_id,
                           tokens,
                           sentences, [Paragraph(0, len(sentences))],
                           entities,
                           token_features=token_features)
            docs[doc_id] = doc

        return docs
Example #27
    def test_normal_tokens(self):
        tokens = ["Bacteria", "lives", "in", "Habitat", "-"]
        sentences = [Sentence(0, 5)]
        raw_tokens = [(0, 8), (9, 14), (15, 17), (18, 25), (26, 27)]
        expected_tokens = tokens
        expected_sentences = sentences
        expected_raw_tokens = raw_tokens

        self.assertEqual(
            fix_joined_tokens(tokens, sentences, raw_tokens, {"/", "-"}),
            (expected_tokens, expected_sentences, expected_raw_tokens))
Example #28
    def setUp(self) -> None:
        tokens = [
            "I", "will", "do", "my", "homework", "today", ".", "It", "is",
            "very", "hard", "but", "i", "don't", "care", "."
        ]
        sentences = [Sentence(0, 7), Sentence(7, 16)]
        paragraphs = [Paragraph(0, 2)]
        entities = [
            Entity("_", 0, 1, "t1"),
            Entity("_", 3, 5, "t2"),
            Entity("_", 7, 8, "t1"),
            Entity("_", 9, 11, "t2"),
            Entity("_", 10, 11, "t4")
        ]

        self.doc = Document("_", tokens, sentences, paragraphs, entities)
        self.relations = {
            Relation(entities[2], entities[3], "t1"),
            Relation(entities[3], entities[4], "t2")
        }
Example #29
    def setUp(self) -> None:
        tokens = ['Planning', 'of', 'work', 'of', 'Elon', "by", "Elon", "in", "LA", "in", "USA", "."]
        sents = [Sentence(0, 12)]
        ents = [Entity("_", 4, 5, "PER"), Entity("_", 6, 7, "PER"), Entity("_", 8, 9, "ORG"), Entity("_", 10, 11, "ORG")]
        nes = SortedSpansSet([
                Entity("gen", 0, 1, "STUFF"),
                Entity("gen", 4, 5, "PERORG"), Entity("gen", 6, 7, "PERORG"),
                Entity("gen", 8, 9, "PERORG"), Entity("gen", 10, 11, "PERORG")
        ])

        self.doc = Document('', tokens, sents, [], ents, extras={"ne": nes})
Example #30
    def test_fully_augmented(self):
        tokens = [
            "Elon", "Musk", "must", "donate", "Tesla", "to", "our",
            "subscribers", ".", "It", "is", "important", "!"
        ]
        sentences = [Sentence(0, 9), Sentence(9, 13)]
        entities = [
            Entity("_", 0, 2, "CEO"),
            Entity("_", 3, 4, "donate"),
            Entity("_", 4, 5, "Tesla"),
            Entity("_", 7, 8, "subscribers"),
            Entity("_", 9, 10, "It"),
            Entity("_", 11, 12, "important")
        ]

        nes = SortedSpansSet(
            [Entity("_", 4, 5, "Tesla"),
             Entity("_", 11, 12, "important")])

        token_features = {
            "tokens":
            list(tokens),
            "pos": [
                "NNP", "NNP", "VB", "VB", "NNP", "TO", "NNPS", "NNS", "DOT",
                "NNP", "VB", "RB", "DOT"
            ]
        }

        expected_doc = Document("_",
                                tokens,
                                sentences, [],
                                entities,
                                token_features=token_features,
                                extras={"ne": nes})
        to_augment = [
            "CEO", "donate", "Tesla", "subscribers", "It", "important"
        ]
        actual_doc = EntitiesUnquoteAugmentor(1.0,
                                              to_augment).transform(self.doc)

        self.assertEqual(expected_doc, actual_doc)