def test_synonyms_count_none():
    """Calling `_synonyms_count` with no argument yields the full lemma list."""
    drop_caches()
    # The full set of lemmas is counted.
    assert len(SubstitutionFeaturesMixin._synonyms_count()) == 147306
    # Each lemma is lowercase, or begins/ends with a digit.
    for lemma in SubstitutionFeaturesMixin._synonyms_count():
        assert lemma.islower() or is_int(lemma[0]) or is_int(lemma[-1])
def test_transformed_feature():
    """Test `_transformed_feature`: log-transform vs. identity, doc/name
    propagation, and word-list computation for both kinds of feature."""
    # Phonological density is log-transformed.
    drop_caches()
    transformed_phonological_density = SubstitutionFeaturesMixin._transformed_feature(
        "phonological_density")
    assert transformed_phonological_density("time") == np.log(29)
    # Unknown words still yield NaN.
    assert np.isnan(transformed_phonological_density("wickiup"))
    # Doc and name are transformed too.
    assert (
        transformed_phonological_density.__doc__
        == "log(" + SubstitutionFeaturesMixin._phonological_density.__doc__ + ")"
    )
    assert transformed_phonological_density.__name__ == "_log_phonological_density"
    # And the list of words is properly computed (from an overridden, minimal
    # CLEARPOND file: columns are tab-separated, word first).
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write("dog" + 5 * "\t" + "2" + 24 * "\t" + "3\n"
                    "cat" + 5 * "\t" + "2" + 24 * "\t" + "3")
        assert set(transformed_phonological_density()) == {"dog", "cat"}

    # AoA is left untouched (no log transform).
    drop_caches()
    transformed_aoa = SubstitutionFeaturesMixin._transformed_feature("aoa")
    assert transformed_aoa("time") == 5.16
    assert transformed_aoa("vocative") == 14.27
    # Unknown words yield NaN here too.
    assert np.isnan(transformed_aoa("wickiup"))
    # Doc and name are passed on unchanged.
    assert transformed_aoa.__doc__ == SubstitutionFeaturesMixin._aoa.__doc__
    assert transformed_aoa.__name__ == "_aoa"
    # And the list of words is properly computed (from an overridden,
    # minimal AoA csv file).
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\nhave,2\ntell,3")
        assert set(transformed_aoa()) == {"have", "tell"}
def test_strict_synonyms():
    """`_strict_synonyms` yields a word's synset members (word excluded),
    or an empty set when there are none."""
    # A word with no synonyms other than itself.
    assert SubstitutionFeaturesMixin._strict_synonyms("frisbee") == set()
    # A word appearing in many synsets: union of all members, minus itself.
    expected_dog_synonyms = {
        "domestic_dog", "canis_familiaris", "frump", "cad", "bounder",
        "blackguard", "hound", "heel", "frank", "frankfurter", "hotdog",
        "hot_dog", "wiener", "wienerwurst", "weenie", "pawl", "detent",
        "click", "andiron", "firedog", "dog-iron", "chase", "chase_after",
        "trail", "tail", "tag", "give_chase", "go_after", "track",
    }
    assert SubstitutionFeaturesMixin._strict_synonyms("dog") == expected_dog_synonyms
    # An unknown word also yields an empty set.
    assert SubstitutionFeaturesMixin._strict_synonyms("makakiki") == set()
def test_orthographic_density_none():
    """Calling `_orthographic_density` with no argument yields the word list."""
    drop_caches()
    # Every listed lemma is lowercase.
    for lemma in SubstitutionFeaturesMixin._orthographic_density():
        assert lemma.islower()
    # The word list comes straight from the (here overridden) CLEARPOND file.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        fake_clearpond = "\n".join(
            word + 5 * "\t" + "2" + 24 * "\t" + "3" for word in ("dog", "cat"))
        with open(settings.CLEARPOND, "w") as f:
            f.write(fake_clearpond)
        assert set(SubstitutionFeaturesMixin._orthographic_density()) == {"dog", "cat"}
def test_aoa_none():
    """Calling `_aoa` with no argument yields the list of rated words."""
    drop_caches()
    # Every rated word is lowercase.
    for rated_word in SubstitutionFeaturesMixin._aoa():
        assert rated_word.islower()
    # The word list comes straight from the (here overridden) AoA csv file.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\nhave,2\ntell,3")
        assert set(SubstitutionFeaturesMixin._aoa()) == {"have", "tell"}
def test_synonyms_count():
    """`_synonyms_count` averages synonyms over synsets; NaN when none."""
    drop_caches()
    synonyms_count = SubstitutionFeaturesMixin._synonyms_count
    # 'hello' has a single synset of 5 members, so 4 synonyms.
    assert synonyms_count("hello") == 4
    # 'mountain' has two synsets, of 2 and 27 members,
    # so ((2-1) + (27-1))/2 synonyms.
    assert synonyms_count("mountain") == 13.5
    # 'lamp' has two synsets with a single member each:
    # no synonyms at all, which yields NaN.
    assert np.isnan(synonyms_count("lamp"))
    # 'makakiki' is unknown, which also yields NaN.
    assert np.isnan(synonyms_count("makakiki"))
def test_phonological_density():
    """`_phonological_density` gives a known word's density, NaN otherwise."""
    drop_caches()
    density = SubstitutionFeaturesMixin._phonological_density
    assert density("time") == 29
    assert np.isnan(density("wickiup"))
def test_frequency_none_with_computed():
    """The computed frequency word list only holds normalized tokens."""
    drop_caches()
    # Each token is lowercase, digit-bounded, or one of the
    # two allowed punctuation tokens.
    for token in SubstitutionFeaturesMixin._frequency():
        assert (token.islower() or is_int(token[0]) or is_int(token[-1])
                or token in ["%", "!"])
def test_frequency_none():
    """With no argument, `_frequency` yields the frequency pickle's keys."""
    drop_caches()
    fake_frequencies = {"dog": 2, "cat": 3}
    with settings.file_override("FREQUENCY"):
        with open(settings.FREQUENCY, "wb") as f:
            pickle.dump(fake_frequencies, f)
        assert set(SubstitutionFeaturesMixin._frequency()) == set(fake_frequencies)
def test_frequency():
    """`_frequency` gives a known word's corpus count, NaN otherwise."""
    drop_caches()
    frequency = SubstitutionFeaturesMixin._frequency
    assert frequency("dog") == 7865
    assert frequency("play") == 45848
    assert np.isnan(frequency("wickiup"))
def test_clustering_none():
    """With no argument, `_clustering` yields the clustering pickle's keys."""
    drop_caches()
    fake_clustering = {"dog": 2, "cat": 3}
    with settings.file_override("CLUSTERING"):
        with open(settings.CLUSTERING, "wb") as f:
            pickle.dump(fake_clustering, f)
        assert set(SubstitutionFeaturesMixin._clustering()) == set(fake_clustering)
def test_letters_count_none():
    """With no argument, `_letters_count` yields the stored token set."""
    drop_caches()
    fake_tokens = {"these", "are", "tokens"}
    with settings.file_override("TOKENS"):
        with open(settings.TOKENS, "wb") as f:
            pickle.dump(fake_tokens, f)
        assert SubstitutionFeaturesMixin._letters_count() == fake_tokens
def test_aoa():
    """`_aoa` gives a rated word's age-of-acquisition, NaN otherwise."""
    drop_caches()
    aoa = SubstitutionFeaturesMixin._aoa
    assert aoa("time") == 5.16
    assert aoa("vocative") == 14.27
    assert np.isnan(aoa("wickiup"))
def test_syllables_count():
    """`_syllables_count` counts a known word's syllables, NaN otherwise."""
    drop_caches()
    syllables_count = SubstitutionFeaturesMixin._syllables_count
    assert syllables_count("hello") == 2
    assert syllables_count("mountain") == 2
    assert np.isnan(syllables_count("makakiki"))
def test_letters_count_none_with_computed():
    """The computed letters-count word list only holds normalized tokens."""
    drop_caches()
    # Each token is lowercase, or begins/ends with a digit.
    for token in SubstitutionFeaturesMixin._letters_count():
        assert token.islower() or is_int(token[0]) or is_int(token[-1])
def test_pagerank():
    """`_pagerank` gives a known word's pagerank, NaN otherwise."""
    drop_caches()
    pagerank = SubstitutionFeaturesMixin._pagerank
    # Compare up to float tolerance.
    assert abs(pagerank("you") - 0.0006390798677378056) < 1e-15
    assert abs(pagerank("play") - 0.0012008124120435305) < 1e-15
    assert np.isnan(pagerank("wickiup"))
def test_degree_none():
    """With no argument, `_degree` yields the degree pickle's keys."""
    drop_caches()
    fake_degrees = {"dog": 2, "cat": 3}
    with settings.file_override("DEGREE"):
        with open(settings.DEGREE, "wb") as f:
            pickle.dump(fake_degrees, f)
        assert set(SubstitutionFeaturesMixin._degree()) == set(fake_degrees)
def test_degree():
    """`_degree` gives a known word's degree, NaN otherwise."""
    drop_caches()
    degree = SubstitutionFeaturesMixin._degree
    # Hand-computed expected values (10617 appears to be the network's
    # node count; the raw degree is divided by 10617 - 1).
    assert degree("abdomen") == 1 / (10617 - 1)
    assert degree("speaker") == 9 / (10617 - 1)
    assert np.isnan(degree("wickiup"))
def test_orthographic_density():
    """`_orthographic_density` gives a known word's density, NaN otherwise."""
    drop_caches()
    density = SubstitutionFeaturesMixin._orthographic_density
    assert density("time") == 13
    assert np.isnan(density("wickiup"))
def test_pagerank_none():
    """With no argument, `_pagerank` yields the pagerank pickle's keys."""
    drop_caches()
    fake_pageranks = {"dog": 2, "cat": 3}
    with settings.file_override("PAGERANK"):
        with open(settings.PAGERANK, "wb") as f:
            pickle.dump(fake_pageranks, f)
        assert set(SubstitutionFeaturesMixin._pagerank()) == set(fake_pageranks)
def test_phonemes_count():
    """`_phonemes_count` counts a known word's phonemes, NaN otherwise."""
    drop_caches()
    phonemes_count = SubstitutionFeaturesMixin._phonemes_count
    assert phonemes_count("hello") == 4
    assert phonemes_count("mountain") == 6
    assert np.isnan(phonemes_count("makakiki"))
def test_betweenness():
    """`_betweenness` gives a known word's betweenness, NaN otherwise."""
    drop_caches()
    betweenness = SubstitutionFeaturesMixin._betweenness
    assert betweenness("dog") == 0.0046938277117769605
    assert betweenness("play") == 0.008277234906313704
    assert np.isnan(betweenness("wickiup"))
def test_betweenness_none():
    """With no argument, `_betweenness` yields the betweenness pickle's keys."""
    drop_caches()
    fake_betweenness = {"dog": 2, "cat": 3}
    with settings.file_override("BETWEENNESS"):
        with open(settings.BETWEENNESS, "wb") as f:
            pickle.dump(fake_betweenness, f)
        assert set(SubstitutionFeaturesMixin._betweenness()) == set(fake_betweenness)
def test_component():
    """Test `_component`: PCA-shape assertions, unknown-feature errors,
    doc/name generation, hand-computed values, and word-list size."""
    drop_caches()
    # Create a test PCA with features alternatively log-transformed and not,
    # alternatively on tokens and lemmas.
    features = ("letters_count", "aoa", "synonyms_count", "phonological_density")
    pca = PCA(n_components=3)
    # Trying this with a PCA fitted with the wrong shape fails
    # (3 columns here vs. 4 features).
    pca.fit(np.array([[1, 1, 0], [0, 1, 0], [0, 1, 1]]))
    with pytest.raises(AssertionError):
        SubstitutionFeaturesMixin._component(0, pca, features)
    with pytest.raises(AssertionError):
        SubstitutionFeaturesMixin._component(1, pca, features)
    # Trying this with unknown features fails.
    with pytest.raises(ValueError) as excinfo:
        SubstitutionFeaturesMixin._component(
            0, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        SubstitutionFeaturesMixin._component(
            1, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)
    # Now training with the right shape (4 columns, matching `features`).
    pca.fit(np.array([[1, 0, 0, 0], [-1, 0, 0, 0], [0, 1, 0, 0],
                      [0, -1, 0, 0], [0, 0, 1, 0], [0, 0, -1, 0]]))
    with settings.file_override("TOKENS"):
        with open(settings.TOKENS, "wb") as f:
            pickle.dump({"these", "are", "tokens"}, f)
        c0 = SubstitutionFeaturesMixin._component(0, pca, features)
        c1 = SubstitutionFeaturesMixin._component(1, pca, features)
        c2 = SubstitutionFeaturesMixin._component(2, pca, features)
        # Doc and name are properly set.
        assert c0.__name__ == "_component_0"
        assert c0.__doc__ == "component 0"
        assert c1.__name__ == "_component_1"
        assert c1.__doc__ == "component 1"
        assert c2.__name__ == "_component_2"
        assert c2.__doc__ == "component 2"
        # We get the expected hand-computed values.
        assert c0("time") == -5.16
        assert c1("time") == 0.62860865942237421
        assert c2("time") == -4
        # Unknown words yield NaN on every component.
        assert np.isnan(c0("makakiki"))
        assert np.isnan(c1("makakiki"))
        assert np.isnan(c2("makakiki"))
        # And the list of words is properly computed. (These are not the true
        # values since we overrode the tokens list.)
        assert len(c0()) == 157863
        assert len(c1()) == 157863
        assert len(c2()) == 157863
def test_clustering():
    """`_clustering` gives a known word's clustering coefficient, NaN otherwise."""
    drop_caches()
    clustering = SubstitutionFeaturesMixin._clustering
    # Compare up to float tolerance.
    assert abs(clustering("dog") - 0.0009318641757868838) < 1e-17
    assert abs(clustering("play") - 0.0016238663632016216) < 1e-17
    assert np.isnan(clustering("wickiup"))
def test_phonemes_count_none():
    """With no argument, `_phonemes_count` yields the pronounced-word list."""
    drop_caches()
    # The word list is exactly the set of words with a pronunciation.
    assert SubstitutionFeaturesMixin._phonemes_count() == _get_pronunciations().keys()
    # And all those words are lowercase.
    for pronounced_word in SubstitutionFeaturesMixin._phonemes_count():
        assert pronounced_word.islower()
def test_letters_count():
    """`_letters_count` counts letters; it works for any string."""
    drop_caches()
    letters_count = SubstitutionFeaturesMixin._letters_count
    assert letters_count("hello") == 5
    assert letters_count("mountain") == 8
    # Even unknown words get a count (unlike the other word features).
    assert letters_count("makakiki") == 8