Esempio n. 1
0
def test_features(normal_substitution):
    """Check the values returned by ``features()`` for a substitution.

    Covers, in order: rejection of unknown feature names, token-based
    features (counts and densities), lemma-based features (synonyms count and
    age-of-acquisition), their ``sentence_relative`` variants, and the NaN
    results obtained for unknown words.

    Integer counts are compared exactly. Float-valued expectations are
    compared with ``pytest.approx`` so that the test does not depend on the
    implementation performing its floating-point operations in exactly the
    same order as the expected-value expressions written here.

    ``normal_substitution`` is presumably a pytest fixture defined elsewhere;
    the assertions below expect its tokens to be ("containing", "other").
    """
    # NOTE(review): presumably resets cached values left by other tests --
    # confirm against the definition of drop_caches().
    drop_caches()
    # A shortcut.
    s = normal_substitution

    # Check we defined the right substitution.
    assert s.tokens == ("containing", "other")
    assert s.lemmas == ("contain", "other")

    # An unknown feature raises an error
    with pytest.raises(ValueError):
        s.features("unknown_feature")
    with pytest.raises(ValueError):
        s.features("unknown_feature", sentence_relative="mean")

    # Syllable, phonemes, letters counts, and densities are right,
    # and computed on tokens.
    assert s.features("syllables_count") == (3, 2)
    assert s.features("phonemes_count") == (8, 3)
    assert s.features("letters_count") == (10, 5)
    assert np.isnan(s.features("phonological_density")[0])
    assert s.features("phonological_density")[1] == pytest.approx(np.log(7))
    assert np.isnan(s.features("orthographic_density")[0])
    assert s.features("orthographic_density")[1] == pytest.approx(np.log(5))
    # Same with features computed relative to sentence.
    assert s.features("syllables_count", sentence_relative="mean") == pytest.approx(
        (3 - 7 / 5, 2 - 6 / 5)
    )
    assert s.features("phonemes_count", sentence_relative="mean") == pytest.approx(
        (8 - 18 / 5, 3 - 13 / 5)
    )
    assert s.features("letters_count", sentence_relative="mean") == pytest.approx(
        (10 - 21 / 5, 5 - 16 / 5)
    )
    assert np.isnan(s.features("phonological_density", sentence_relative="median")[0])
    assert s.features("phonological_density", sentence_relative="median")[1] == pytest.approx(
        np.log(7) - np.median(np.log([31, 24, 9, 7, 28]))
    )
    assert np.isnan(s.features("orthographic_density", sentence_relative="mean")[0])
    assert s.features("orthographic_density", sentence_relative="mean")[1] == pytest.approx(
        np.log(5) - np.log([17, 14, 11, 5, 20]).mean()
    )

    # Synonyms count and age-of-acquisition are right, and computed on lemmas.
    # The rest of the features need computed files, and are tested separately.
    assert s.features("synonyms_count") == pytest.approx((np.log(3), np.log(0.5)))
    assert s.features("aoa") == pytest.approx((7.88, 5.33))
    # Same with features computed relative to sentence.
    assert s.features("synonyms_count", sentence_relative="median") == pytest.approx(
        (
            np.log(3) - np.median(np.log([1, 1, 3, 2.4444444444444446])),
            np.log(0.5) - np.median(np.log([1, 1, 0.5, 2.4444444444444446])),
        )
    )
    assert s.features("aoa", sentence_relative="mean") == pytest.approx(
        (7.88 - 6.033333333333334, 5.33 - 5.183333333333334)
    )

    # Unknown words are ignored. Also when in the rest of the sentence.
    q1 = Quote(string="makakiki is the goal")
    q2 = Quote(string="makakiki is the moukakaka")
    s = Substitution(source=q1, destination=q2, start=0, position=3)
    assert s.features("syllables_count")[0] == 1
    # np.nan != np.nan so we can't `assert s.features(...) == (1, np.nan)`;
    # the NaN component is checked separately with np.isnan.
    assert np.isnan(s.features("syllables_count")[1])
    # 1 - 3 / 3 is a float (0.0), so compare approximately like the other
    # sentence-relative values.
    assert s.features("syllables_count", sentence_relative="mean")[0] == pytest.approx(1 - 3 / 3)
    assert np.isnan(s.features("syllables_count", sentence_relative="median")[1])