def test_no_sentence_splitter():
    """NoSentenceSplitter must always produce a single sentence at offset 0."""

    def _check_single_sentence(splitter, expected_token_count):
        # Regardless of tokenizer, splitting never breaks the text apart:
        # exactly one sentence, anchored at the start of the input.
        result = splitter.split("I love Berlin")
        assert len(result) == 1
        assert result[0].start_pos == 0
        assert len(result[0].tokens) == expected_token_count

    # Default construction: the text tokenizes into three tokens here.
    _check_single_sentence(NoSentenceSplitter(), 3)

    # With a no-op tokenizer the whole text remains a single token.
    _check_single_sentence(NoSentenceSplitter(TokenizerWrapper(no_op_tokenizer)), 1)
def assert_conll_writer_output(
    dataset: InternalBioNerDataset,
    expected_output: List[str],
    sentence_splitter: SentenceSplitter = None,
):
    """Serialize *dataset* to a temporary CoNLL file and assert its contents.

    Args:
        dataset: The dataset to write out via ``CoNLLWriter``.
        expected_output: Expected stripped, non-empty lines of the file.
        sentence_splitter: Splitter handed to the writer. Implicitly Optional:
            ``None`` (the default) selects a ``NoSentenceSplitter`` over a
            ``SpaceTokenizer``. (NOTE(review): annotation should ideally be
            ``Optional[SentenceSplitter]`` — left as-is, imports not in view.)

    The temporary file is always closed and removed, even when writing fails.
    """
    fd, outfile_path = tempfile.mkstemp()
    try:
        # Explicit `is None` check: a falsy-but-valid splitter object must not
        # be silently replaced by the default (the old truthiness test would).
        if sentence_splitter is None:
            sentence_splitter = NoSentenceSplitter(tokenizer=SpaceTokenizer())
        writer = CoNLLWriter(sentence_splitter=sentence_splitter)
        writer.write_to_conll(dataset, Path(outfile_path))
        with open(outfile_path) as f:
            # Iterate the file directly (no readlines()); keep non-blank lines.
            contents = [line.strip() for line in f if line.strip()]
    finally:
        os.close(fd)
        os.remove(outfile_path)
    assert contents == expected_output