# Example #1
def test_en_sbd_serialization_projective(combined_all_model_fixture):
    """Test that before and after serialization, the sentence boundaries are
    the same.

    Parses ``text`` with a hand-built transition sequence (the 'B-ROOT'
    action inserts the sentence break), round-trips the doc through
    ``to_bytes``/``from_bytes``, and checks that the parse flag, the raw
    bytes, and the sentence texts all survive serialization.
    """

    text = "I bought a couch from IKEA It wasn't very comfortable."
    transition = [
        'L-nsubj', 'S', 'L-det', 'R-dobj', 'D', 'R-prep', 'R-pobj', 'B-ROOT',
        'L-nsubj', 'R-neg', 'D', 'S', 'L-advmod', 'R-acomp', 'D', 'R-punct'
    ]

    doc = combined_all_model_fixture.tokenizer(text)
    # BUG FIX: the original referenced the undefined name
    # `en_with_combined_rule_tokenizer_fixture` here and below, which would
    # raise NameError; use the fixture this test actually receives.
    apply_transition_sequence(combined_all_model_fixture.parser, doc,
                              transition)
    doc_serialized = Doc(combined_all_model_fixture.vocab).from_bytes(
        doc.to_bytes())
    # Plain truthiness instead of `== True` (PEP 8 / flake8 E712).
    assert doc.is_parsed
    assert doc_serialized.is_parsed
    assert doc.to_bytes() == doc_serialized.to_bytes()
    assert [s.text
            for s in doc.sents] == [s.text for s in doc_serialized.sents]
# Example #2
def test_en_sentence_breaks(combined_all_model_fixture, en_parser):
    """Check that applying a transition sequence with a 'B-ROOT' break action
    yields exactly two sentences and the expected head attachments.

    Builds a doc with gold heads/deps, runs the parser transitions, then
    verifies every non-space token got a dependency label and each token's
    head index matches the expected parse.
    """
    sentence = "This is a sentence . This is another one ."
    head_offsets = [1, 0, 1, -2, -3, 1, 0, 1, -2, -3]
    dep_labels = [
        'nsubj', 'ROOT', 'det', 'attr', 'punct', 'nsubj', 'ROOT', 'det',
        'attr', 'punct'
    ]
    actions = [
        'L-nsubj', 'S', 'L-det', 'R-attr', 'D', 'R-punct', 'B-ROOT', 'L-nsubj',
        'S', 'L-attr', 'R-attr', 'D', 'R-punct'
    ]

    tokenized = combined_all_model_fixture(sentence)
    words = [tok.text for tok in tokenized]
    doc = get_doc(tokenized.vocab, words, heads=head_offsets, deps=dep_labels)
    apply_transition_sequence(en_parser, doc, actions)

    # The 'B-ROOT' action should have split the doc into two sentences.
    assert len(list(doc.sents)) == 2
    # Every real token must carry a dependency label (dep == 0 means unset).
    for tok in doc:
        assert tok.dep != 0 or tok.is_space
    expected_heads = [1, 1, 3, 1, 1, 6, 6, 8, 6, 6]
    assert [tok.head.i for tok in doc] == expected_heads