def test_features(normal_substitution):
    """Exercise `features()` on the fixture substitution and a hand-built one.

    Checks, in order: rejection of an unknown feature name, the features
    computed on tokens, the features computed on lemmas (each both raw and
    relative to the sentence), and how words without feature values are
    handled.
    """
    drop_caches()
    substitution = normal_substitution

    # Sanity check: the fixture is the substitution the values below assume.
    assert substitution.tokens == ("containing", "other")
    assert substitution.lemmas == ("contain", "other")

    # A feature name that does not exist raises, with or without
    # sentence-relative computation.
    with pytest.raises(ValueError):
        substitution.features("unknown_feature")
    with pytest.raises(ValueError):
        substitution.features("unknown_feature", sentence_relative="mean")

    # Token-based features: syllable, phoneme and letter counts...
    assert substitution.features("syllables_count") == (3, 2)
    assert substitution.features("phonemes_count") == (8, 3)
    assert substitution.features("letters_count") == (10, 5)
    # ... and densities, where the first word has no value.
    assert np.isnan(substitution.features("phonological_density")[0])
    assert substitution.features("phonological_density")[1] == np.log(7)
    assert np.isnan(substitution.features("orthographic_density")[0])
    assert substitution.features("orthographic_density")[1] == np.log(5)

    # The same token-based features, relative to the sentence.
    mean_syllables = (3 - 7 / 5, 2 - 6 / 5)
    assert (
        substitution.features("syllables_count", sentence_relative="mean")
        == mean_syllables
    )
    mean_phonemes = (8 - 18 / 5, 3 - 13 / 5)
    assert (
        substitution.features("phonemes_count", sentence_relative="mean")
        == mean_phonemes
    )
    mean_letters = (10 - 21 / 5, 5 - 16 / 5)
    assert (
        substitution.features("letters_count", sentence_relative="mean")
        == mean_letters
    )
    assert np.isnan(
        substitution.features(
            "phonological_density", sentence_relative="median"
        )[0]
    )
    median_phonological = np.log(7) - np.median(np.log([31, 24, 9, 7, 28]))
    assert (
        substitution.features(
            "phonological_density", sentence_relative="median"
        )[1]
        == median_phonological
    )
    assert np.isnan(
        substitution.features(
            "orthographic_density", sentence_relative="mean"
        )[0]
    )
    mean_orthographic = np.log(5) - np.log([17, 14, 11, 5, 20]).mean()
    assert (
        substitution.features(
            "orthographic_density", sentence_relative="mean"
        )[1]
        == mean_orthographic
    )

    # Lemma-based features: synonyms count and age-of-acquisition.
    # The remaining features need computed files and are tested separately.
    assert substitution.features("synonyms_count") == (np.log(3), np.log(0.5))
    assert substitution.features("aoa") == (7.88, 5.33)

    # The same lemma-based features, relative to the sentence.
    median_synonyms = (
        np.log(3) - np.median(np.log([1, 1, 3, 2.4444444444444446])),
        np.log(0.5) - np.median(np.log([1, 1, 0.5, 2.4444444444444446])),
    )
    assert (
        substitution.features("synonyms_count", sentence_relative="median")
        == median_synonyms
    )
    mean_aoa = (7.88 - 6.033333333333334, 5.33 - 5.183333333333334)
    assert substitution.features("aoa", sentence_relative="mean") == mean_aoa

    # Unknown words are ignored, including when they appear elsewhere in the
    # sentence rather than at the substituted position.
    source_quote = Quote(string="makakiki is the goal")
    destination_quote = Quote(string="makakiki is the moukakaka")
    unknown_substitution = Substitution(
        source=source_quote,
        destination=destination_quote,
        start=0,
        position=3,
    )
    assert unknown_substitution.features("syllables_count")[0] == 1
    # nan never compares equal to itself, so the pair cannot be checked with a
    # single `== (1, np.nan)`; test the second element with np.isnan instead.
    assert np.isnan(unknown_substitution.features("syllables_count")[1])
    assert (
        unknown_substitution.features(
            "syllables_count", sentence_relative="mean"
        )[0]
        == 1 - 3 / 3
    )
    assert np.isnan(
        unknown_substitution.features(
            "syllables_count", sentence_relative="median"
        )[1]
    )