def test_tag_sentence_splitter():
    """TagSentenceSplitter splits on the configured tag and reports offsets/tokens."""
    splitter = TagSentenceSplitter(tag="#!")
    parts = splitter.split("I love Berlin#!Me too")
    assert len(parts) == 2
    assert parts[0].start_pos == 0
    assert len(parts[0].tokens) == 3
    # second sentence starts after "I love Berlin" (13 chars) plus the 2-char tag
    assert parts[1].start_pos == 15
    assert len(parts[1].tokens) == 2

    # with a no-op tokenizer each sentence yields a single token
    splitter = TagSentenceSplitter(tag="#!", tokenizer=TokenizerWrapper(no_op_tokenizer))
    parts = splitter.split("I love Berlin#!Me too")
    assert len(parts) == 2
    assert parts[0].start_pos == 0
    assert len(parts[0].tokens) == 1
    assert parts[1].start_pos == 15
    assert len(parts[1].tokens) == 1

    # no tag present -> the whole text is one sentence
    parts = splitter.split("I love Berlin Me too")
    assert len(parts) == 1

    # consecutive tags must not produce an empty sentence in between
    parts = splitter.split("I love Berlin#!#!Me too")
    assert len(parts) == 2

    # a whitespace-only segment between tags is dropped as well
    parts = splitter.split("I love Berl#! #!inMe too")
    assert len(parts) == 2
def test_conll_writer_whitespace_after():
    """The CoNLL writer marks missing whitespace-after ("-") at hyphen splits and sentence ends."""
    text = f"A sentence with cardio-dependent. {SENTENCE_TAG}Clark et al. reported that"
    dataset = InternalBioNerDataset(
        documents={"1": text},
        entities_per_document={"1": []},
    )

    # third column: "+" = whitespace follows the token, "-" = it does not
    expected_lines = [
        "A O +",
        "sentence O +",
        "with O +",
        "cardio O -",
        "dependent. O +",
        "Clark O +",
        "et O +",
        "al. O +",
        "reported O +",
        "that O -",
    ]

    splitter = TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=TokenizerWrapper(simple_tokenizer))
    assert_conll_writer_output(dataset, expected_lines, splitter)