Example 1
# flair imports (module paths vary across flair versions; in recent
# releases these live in flair.splitter and flair.tokenization):
from flair.splitter import NoSentenceSplitter
from flair.tokenization import TokenizerWrapper


def test_no_sentence_splitter():
    # With the default tokenizer, the whole text stays one "sentence"
    # but is still split into three tokens.
    no_splitter = NoSentenceSplitter()
    sentences = no_splitter.split("I love Berlin")
    assert len(sentences) == 1
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 3

    # Wrapping a custom tokenizer function: the no-op tokenizer keeps
    # the entire text as a single token.
    no_splitter = NoSentenceSplitter(TokenizerWrapper(no_op_tokenizer))
    sentences = no_splitter.split("I love Berlin")
    assert len(sentences) == 1
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
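
For context, a minimal sketch of the no_op_tokenizer helper the test references. This definition is an assumption (the real helper is defined elsewhere in the test module, and some flair versions expect the wrapped callable to return Token objects rather than strings):

from typing import List


def no_op_tokenizer(text: str) -> List[str]:
    # Return the whole input as a single token, which is why the
    # second block of assertions above expects exactly one token.
    return [text]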
Example 2
import os
import tempfile
from pathlib import Path
from typing import List, Optional

# flair imports (module paths vary across flair versions; in recent
# releases these live under flair.datasets.biomedical, flair.splitter
# and flair.tokenization):
from flair.datasets.biomedical import CoNLLWriter, InternalBioNerDataset
from flair.splitter import NoSentenceSplitter, SentenceSplitter
from flair.tokenization import SpaceTokenizer


def assert_conll_writer_output(
    dataset: InternalBioNerDataset,
    expected_output: List[str],
    sentence_splitter: Optional[SentenceSplitter] = None,
):
    fd, outfile_path = tempfile.mkstemp()
    try:
        # Default to a splitter that keeps each document as a single
        # sentence, tokenized on whitespace.
        if sentence_splitter is None:
            sentence_splitter = NoSentenceSplitter(tokenizer=SpaceTokenizer())

        writer = CoNLLWriter(sentence_splitter=sentence_splitter)
        writer.write_to_conll(dataset, Path(outfile_path))

        # Collect every non-empty line of the written CoNLL file.
        with open(outfile_path) as f:
            contents = [line.strip() for line in f if line.strip()]
    finally:
        # Always close the descriptor and delete the temporary file,
        # even if writing or reading fails.
        os.close(fd)
        os.remove(outfile_path)

    assert contents == expected_output
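
A hypothetical invocation for context. The InternalBioNerDataset/Entity arguments and the expected CoNLL lines below are illustrative assumptions; the exact column layout written by CoNLLWriter depends on the flair version:

from flair.datasets.biomedical import Entity

text = "I love Berlin"
dataset = InternalBioNerDataset(
    documents={"doc1": text},
    entities_per_document={
        "doc1": [
            Entity((text.find("Berlin"), text.find("Berlin") + len("Berlin")), "LOC")
        ]
    },
)

# Illustrative expected lines: one token per line with its BIO tag.
assert_conll_writer_output(dataset, ["I O", "love O", "Berlin B-LOC"])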