Code example #1: splitting text into sentences on a custom tag with TagSentenceSplitter
from typing import List

# Recent Flair releases expose the splitters via flair.splitter; older
# releases exported TagSentenceSplitter from flair.tokenization instead.
from flair.splitter import TagSentenceSplitter
from flair.tokenization import TokenizerWrapper


def no_op_tokenizer(text: str) -> List[str]:
    # Test helper: treat the whole text as a single token (recent Flair
    # tokenizer functions return a list of token strings).
    return [text]


def test_tag_sentence_splitter():
    tag_splitter = TagSentenceSplitter(tag="#!")

    # The tag splits the text into two sentences; start_pos is each
    # sentence's character offset in the original string.
    sentences = tag_splitter.split("I love Berlin#!Me too")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 3
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 2

    # With the no-op tokenizer, each sentence becomes a single token.
    tag_splitter = TagSentenceSplitter(tag="#!", tokenizer=TokenizerWrapper(no_op_tokenizer))
    sentences = tag_splitter.split("I love Berlin#!Me too")
    assert len(sentences) == 2
    assert sentences[0].start_pos == 0
    assert len(sentences[0].tokens) == 1
    assert sentences[1].start_pos == 15
    assert len(sentences[1].tokens) == 1

    # No tag present: the whole text is one sentence.
    sentences = tag_splitter.split("I love Berlin Me too")
    assert len(sentences) == 1

    # Consecutive tags produce an empty segment, which is dropped.
    sentences = tag_splitter.split("I love Berlin#!#!Me too")
    assert len(sentences) == 2

    # Whitespace-only segments between tags are dropped as well.
    sentences = tag_splitter.split("I love Berl#! #!inMe too")
    assert len(sentences) == 2
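
For reference, the offset assertions above (start_pos == 15, empty segments dropped) follow from a split-and-track-offsets scheme. The sketch below illustrates that logic only; split_on_tag is a hypothetical helper, not Flair's implementation.

def split_on_tag(text: str, tag: str) -> list:
    # Split on the tag, remember each segment's character offset in the
    # original text, and drop empty or whitespace-only segments.
    segments = []
    offset = 0
    for part in text.split(tag):
        if part.strip():
            segments.append((offset, part))
        offset += len(part) + len(tag)
    return segments

# split_on_tag("I love Berlin#!Me too", "#!") -> [(0, 'I love Berlin'), (15, 'Me too')]
# split_on_tag("I love Berlin#!#!Me too", "#!") -> two segments; the empty one is dropped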
Code example #2: whitespace-after flags in the CoNLL writer output
# InternalBioNerDataset, TagSentenceSplitter and TokenizerWrapper are part of
# Flair; SENTENCE_TAG, simple_tokenizer and assert_conll_writer_output come
# from the surrounding biomedical dataset test module (sketched below).
from flair.datasets.biomedical import InternalBioNerDataset
from flair.splitter import TagSentenceSplitter
from flair.tokenization import TokenizerWrapper


def test_conll_writer_whitespace_after():
    # SENTENCE_TAG marks the sentence boundary inside the raw document text.
    text = f"A sentence with cardio-dependent. {SENTENCE_TAG}Clark et al. reported that"
    dataset = InternalBioNerDataset(
        documents={"1": text},
        entities_per_document={"1": []},
    )

    # Expected CoNLL lines: "<token> <tag> <whitespace-after flag>".
    # "cardio" and "that" get "-" because no whitespace follows them in the text.
    assert_conll_writer_output(
        dataset,
        [
            "A O +",
            "sentence O +",
            "with O +",
            "cardio O -",
            "dependent. O +",
            "Clark O +",
            "et O +",
            "al. O +",
            "reported O +",
            "that O -",
        ],
        TagSentenceSplitter(tag=SENTENCE_TAG, tokenizer=TokenizerWrapper(simple_tokenizer)),
    )
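
SENTENCE_TAG, simple_tokenizer and assert_conll_writer_output are helpers defined alongside this test in Flair's biomedical dataset test module rather than public API. The sketches below show plausible versions, under the assumption that CoNLLWriter (from flair.datasets.biomedical) exposes a write_to_conll(dataset, output_path) method; treat both as illustrations, not Flair's actual code.

import re
import tempfile
from pathlib import Path

from flair.datasets.biomedical import CoNLLWriter


def simple_tokenizer(text: str) -> list:
    # Hypothetical: split on spaces and hyphens, which matches the expected
    # tokens above ("cardio-dependent." -> "cardio", "dependent.").
    return [token for token in re.split(r"[ -]", text) if token]


def assert_conll_writer_output(dataset, expected_output, sentence_splitter):
    # Write the dataset to a temporary CoNLL file and compare its non-empty
    # lines against the expected "<token> <tag> <whitespace-after>" rows.
    writer = CoNLLWriter(sentence_splitter=sentence_splitter)
    with tempfile.TemporaryDirectory() as tmp_dir:
        out_path = Path(tmp_dir) / "out.conll"
        writer.write_to_conll(dataset, out_path)
        contents = [line.strip() for line in out_path.read_text().splitlines() if line.strip()]
    assert contents == expected_output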