# Shared imports assumed for the snippets below (Fonduer test helpers, pytest, logging).
import logging

import pytest

from fonduer.candidates import MentionNgrams
from fonduer.candidates.matchers import (
    DateMatcher,
    LocationMatcher,
    MiscMatcher,
    NumberMatcher,
    OrganizationMatcher,
    PersonMatcher,
)
from fonduer.parser.lingual_parser.spacy_parser import (
    SpacyParser,
    TokenPreservingTokenizer,
    set_custom_boundary,
)
from fonduer.parser.models import Document, Sentence


def test_split_sentences_by_char_limit():
    """Unit test of splitting sentences by char limit."""
    lingual_parser = SpacyParser("en")

    text = "This is a text. This is another text."
    all_sentences = [
        Sentence(**parts) for parts in lingual_parser.split_sentences(text)
    ]
    assert len(all_sentences) == 2
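    # "This is a text." is 15 characters long and "This is another text." is 21.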
    assert [len(sentence.text) for sentence in all_sentences] == [15, 21]

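    # Swap out spaCy's sentencizer for a custom boundary component, which is
    # meant to preserve the sentence splits computed above.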
    lingual_parser.model.remove_pipe("sentencizer")
    lingual_parser.model.add_pipe(set_custom_boundary,
                                  before="parser",
                                  name="sentence_boundary_detector")

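    # _split_sentences_by_char_limit groups consecutive sentences into batches
    # whose combined length stays within the limit: with a limit of 20 the two
    # sentences (15 + 21 chars) cannot share a batch; with 100 they can.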
    sentence_batches = lingual_parser._split_sentences_by_char_limit(
        all_sentences, 20)
    assert len(sentence_batches) == 2
    sentence_batches = lingual_parser._split_sentences_by_char_limit(
        all_sentences, 100)
    assert len(sentence_batches) == 1

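    # Re-run the pipeline over the merged batch with a tokenizer that preserves
    # the original tokens; the custom boundary component should then reproduce
    # the original sentence splits (checked below).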
    sentence_batch = sentence_batches[0]
    custom_tokenizer = TokenPreservingTokenizer(lingual_parser.model.vocab)
    doc = custom_tokenizer(sentence_batch)
    doc.user_data = sentence_batch
    for name, proc in lingual_parser.model.pipeline:  # iterate over components in order
        doc = proc(doc)
    assert doc.is_parsed

    # See if the number of parsed spaCy sentences matches that of input sentences
    assert len(list(doc.sents)) == len(sentence_batch)
Example #2
def doc_setup():
    """Set up a document with a single sentence."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple"
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    return doc
Example #3
def test_spacy_split_sentences():
    lingual_parser = SpacyParser("en")
    tokenize_and_split_sentences = lingual_parser.split_sentences
    text = "This is a text. This is another text."

    iterator = tokenize_and_split_sentences(text)
    assert len(list(iterator)) == 2
Example #4
def doc_setup():
    """Set up document."""
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = "This is apple. That is orange. Where is banaba? I like Apple."
    lingual_parser = SpacyParser("en")
    # Split sentences
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Enrich sentences with NLP attributes (exhaust the iterator so every sentence is processed).
    for _ in lingual_parser.enrich_sentences_with_NLP(doc.sentences):
        pass

    # Pick one sentence and add visual information
    # so that all the words get aligned horizontally.
    sentence: Sentence = doc.sentences[0]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 2nd sentence is horizontally aligned with the 1st.
    sentence: Sentence = doc.sentences[1]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [40, 50, 60, 70]
    sentence.right = [50, 60, 70, 80]

    # Assume the 3rd sentence is vertically aligned with the 1st.
    sentence: Sentence = doc.sentences[2]
    sentence.page = [1, 1, 1, 1]
    sentence.top = [10, 10, 10, 10]
    sentence.bottom = [20, 20, 20, 20]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    # Assume the 4th sentence is on the 2nd page.
    sentence: Sentence = doc.sentences[3]
    sentence.page = [2, 2, 2, 2]
    sentence.top = [0, 0, 0, 0]
    sentence.bottom = [10, 10, 10, 10]
    sentence.left = [0, 10, 20, 30]
    sentence.right = [10, 20, 30, 40]

    return doc


def test_spacy_split_sentences(caplog):
    caplog.set_level(logging.INFO)

    lingual_parser = SpacyParser("en")
    tokenize_and_split_sentences = lingual_parser.split_sentences
    text = "This is a text. This is another text."

    iterator = tokenize_and_split_sentences(text)
    assert len(list(iterator)) == 2


def test_spacy_support():
    """Unit test of spacy support."""
    # Supported language
    lingual_parser = SpacyParser("en")
    assert lingual_parser.has_tokenizer_support()
    assert lingual_parser.has_NLP_support()

    # Alpha-supported language
    lingual_parser = SpacyParser("ja")
    assert lingual_parser.has_tokenizer_support()
    assert not lingual_parser.has_NLP_support()

    # Unsupported language
    lingual_parser = SpacyParser("non-supported-lang")
    assert not lingual_parser.has_tokenizer_support()
    assert not lingual_parser.has_NLP_support()

    # Language not specified
    with pytest.raises(TypeError):
        lingual_parser = SpacyParser()
Example #7
def test_ner_matchers():
    """Test different ner type matchers."""
    # Set up a document
    doc = Document(id=1, name="test", stable_id="1::document:0:0")
    doc.text = " ".join([
        "Tim Cook was born in USA in 1960.",
        "He is the CEO of Apple.",
        "He sold 100 million of iPhone.",
    ])
    lingual_parser = SpacyParser("en")
    for parts in lingual_parser.split_sentences(doc.text):
        parts["document"] = doc
        Sentence(**parts)
    # Manually attach ner_tags since results from spaCy may fluctuate.
    doc.sentences[0].ner_tags = [
        "PERSON",
        "PERSON",
        "O",
        "O",
        "O",
        "GPE",
        "O",
        "DATE",
        "O",
    ]
    doc.sentences[1].ner_tags = ["O", "O", "O", "O", "O", "ORG", "O"]
    doc.sentences[2].ner_tags = [
        "O", "O", "CARDINAL", "CARDINAL", "O", "MISC", "O"
    ]

    # The number of words and the number of ner_tags should match.
    assert len(doc.sentences[0].words) == len(doc.sentences[0].ner_tags)
    assert len(doc.sentences[1].words) == len(doc.sentences[1].ner_tags)

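    # MentionNgrams enumerates candidate spans of 1-2 tokens; each matcher below
    # should keep only spans whose tokens carry the corresponding NER tag.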
    space = MentionNgrams(n_min=1, n_max=2)

    # Test if PersonMatcher works as expected
    matcher = PersonMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"Tim Cook"}

    # Test if LocationMatcher works as expected
    matcher = LocationMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"USA"}

    # Test if DateMatcher works as expected
    matcher = DateMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"1960"}

    # Test if OrganizationMatcher works as expected
    matcher = OrganizationMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"Apple"}

    # Test if NumberMatcher works as expected
    matcher = NumberMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"100 million"}

    # Test if MiscMatcher works as expected
    matcher = MiscMatcher()
    assert set(tc.get_span()
               for tc in matcher.apply(space.apply(doc))) == {"iPhone"}


def test_spacy_support(caplog):
    caplog.set_level(logging.INFO)

    # Supported language
    lingual_parser = SpacyParser("en")
    assert lingual_parser.has_tokenizer_support()
    assert lingual_parser.has_NLP_support()

    # Alpha-supported language
    lingual_parser = SpacyParser("ja")
    assert lingual_parser.has_tokenizer_support()
    assert not lingual_parser.has_NLP_support()

    # Unsupported language
    lingual_parser = SpacyParser("non-supported-lang")
    assert not lingual_parser.has_tokenizer_support()
    assert not lingual_parser.has_NLP_support()

    # Language not specified
    with pytest.raises(TypeError):
        lingual_parser = SpacyParser()