def test_average():
    drop_caches()

    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Our test feature.
    values = {"dog": 2, "hound": 3, "frisbee": 4, "chase": 6, "cad": 7, "other": 8}

    def feature(word=None):
        if word is None:
            return set(values.keys())
        else:
            return values.get(word, np.nan)

    # The global average and the average over synonyms (computed on lemmas)
    # are retrieved correctly.
    assert s1._static_average(feature) == 30 / 6
    assert s1._average(feature, False) == 30 / 6
    assert s2._static_average(feature) == 30 / 6
    assert s2._average(feature, False) == 30 / 6
    assert s1._average(feature, True) == np.mean([3, 6, 7])
    # 'frisbee' has no synonyms.
    assert np.isnan(s2._average(feature, True))

    # Even with a lot of NaNs, things still work well.
    drop_caches()
    values = {"dog": 2, "frisbee": 4, "chase": np.nan, "cad": 7, "other": 8}
    assert s1._average(feature, True) == 7
    # 'frisbee' has no synonyms.
    assert np.isnan(s2._average(feature, True))
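
# A minimal sketch (an assumption based on the assertions above, not the
# actual implementation) of the averaging convention being tested: feature
# callables return their whole vocabulary when called with no argument, and a
# single value (or np.nan) when given a word; `_static_average()` then
# behaves like a NaN-aware mean over that vocabulary, which is where the
# hand-computed 30 / 6 comes from.
def _sketch_static_average(feature):
    words = feature()  # No argument: the feature's known vocabulary.
    return np.nanmean([feature(word) for word in words])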
def test_features(normal_substitution):
    drop_caches()
    # A shortcut.
    s = normal_substitution

    # Check we defined the right substitution.
    assert s.tokens == ("containing", "other")
    assert s.lemmas == ("contain", "other")

    # An unknown feature raises an error.
    with pytest.raises(ValueError):
        s.features("unknown_feature")
    with pytest.raises(ValueError):
        s.features("unknown_feature", sentence_relative="mean")

    # Syllable, phoneme, and letter counts, as well as densities, are right,
    # and computed on tokens.
    assert s.features("syllables_count") == (3, 2)
    assert s.features("phonemes_count") == (8, 3)
    assert s.features("letters_count") == (10, 5)
    assert np.isnan(s.features("phonological_density")[0])
    assert s.features("phonological_density")[1] == np.log(7)
    assert np.isnan(s.features("orthographic_density")[0])
    assert s.features("orthographic_density")[1] == np.log(5)

    # Same with features computed relative to the sentence.
    assert s.features("syllables_count", sentence_relative="mean") == (3 - 7 / 5, 2 - 6 / 5)
    assert s.features("phonemes_count", sentence_relative="mean") == (8 - 18 / 5, 3 - 13 / 5)
    assert s.features("letters_count", sentence_relative="mean") == (10 - 21 / 5, 5 - 16 / 5)
    assert np.isnan(s.features("phonological_density", sentence_relative="median")[0])
    assert s.features("phonological_density", sentence_relative="median")[1] == np.log(7) - np.median(
        np.log([31, 24, 9, 7, 28])
    )
    assert np.isnan(s.features("orthographic_density", sentence_relative="mean")[0])
    assert (
        s.features("orthographic_density", sentence_relative="mean")[1]
        == np.log(5) - np.log([17, 14, 11, 5, 20]).mean()
    )

    # Synonyms count and age-of-acquisition are right, and computed on lemmas.
    # The rest of the features need computed files, and are tested separately.
    assert s.features("synonyms_count") == (np.log(3), np.log(0.5))
    assert s.features("aoa") == (7.88, 5.33)

    # Same with features computed relative to the sentence.
    assert s.features("synonyms_count", sentence_relative="median") == (
        np.log(3) - np.median(np.log([1, 1, 3, 2.4444444444444446])),
        np.log(0.5) - np.median(np.log([1, 1, 0.5, 2.4444444444444446])),
    )
    assert s.features("aoa", sentence_relative="mean") == (7.88 - 6.033333333333334, 5.33 - 5.183333333333334)

    # Unknown words are ignored, also when in the rest of the sentence.
    q1 = Quote(string="makakiki is the goal")
    q2 = Quote(string="makakiki is the moukakaka")
    s = Substitution(source=q1, destination=q2, start=0, position=3)
    assert s.features("syllables_count")[0] == 1
    # np.nan != np.nan, so we can't `assert s.features(...) == (1, np.nan)`.
    assert np.isnan(s.features("syllables_count")[1])
    assert s.features("syllables_count", sentence_relative="mean")[0] == 1 - 3 / 3
    assert np.isnan(s.features("syllables_count", sentence_relative="median")[1])
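
# A sketch of the `sentence_relative` behaviour the arithmetic above
# hand-checks (an assumed reading, not the actual implementation): each
# word's feature is re-expressed relative to the mean or median of that
# feature over the known words of its own sentence, e.g. 3 - 7/5 for a
# 3-syllable token in a sentence whose five known tokens total 7 syllables.
def _sketch_sentence_relative(word_value, sentence_values, aggregate="mean"):
    agg = np.mean if aggregate == "mean" else np.median
    return word_value - agg(sentence_values)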
def test_substitution_features(normal_substitution):
    drop_caches()
    # A shortcut.
    s = normal_substitution

    # Check we defined the right substitution.
    assert s.tokens == ("containing", "other")
    assert s.lemmas == ("contain", "other")

    # An unknown feature raises an error.
    with pytest.raises(ValueError):
        s._substitution_features("unknown_feature")

    # Syllable, phonemes, letters counts, and densities are right,
    # and computed on tokens.
    assert s._substitution_features("syllables_count") == (3, 2)
    assert s._substitution_features("phonemes_count") == (8, 3)
    assert s._substitution_features("letters_count") == (10, 5)
    assert np.isnan(s._substitution_features("phonological_density")[0])
    assert s._substitution_features("phonological_density")[1] == np.log(7)
    assert np.isnan(s._substitution_features("orthographic_density")[0])
    assert s._substitution_features("orthographic_density")[1] == np.log(5)

    # Synonyms count and age-of-acquisition are right, and computed on lemmas.
    # The rest of the features need computed files, and are only tested through
    # 'features()' directly so as not to make other file-dependent tests heavy
    # to read.
    assert s._substitution_features("synonyms_count") == (np.log(3), np.log(0.5))
    assert s._substitution_features("aoa") == (7.88, 5.33)

    # Unknown words are ignored. Also when in the rest of the sentence.
    q1 = Quote(string="makakiki is the goal")
    q2 = Quote(string="makakiki is the moukakaka")
    s = Substitution(source=q1, destination=q2, start=0, position=3)
    assert s._substitution_features("syllables_count")[0] == 1
    # np.nan != np.nan so we can't `assert s.features(...) == (1, np.nan)`
    assert np.isnan(s._substitution_features("syllables_count")[1])
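
# Side note on the np.isnan checks above and below (an illustration, not part
# of the suite): NaN never compares equal to itself, so a tuple assertion
# involving np.nan cannot succeed, and each NaN member must be tested with
# np.isnan instead.
def _sketch_nan_comparison():
    nan = float("nan")
    assert nan != nan  # NaN is not equal to itself...
    assert np.isnan(nan)  # ...so use isnan instead of equality.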
def test_component_average():
    drop_caches()

    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Create a test PCA that will use features we later override.
    features = ("aoa", "phonological_density")
    pca = PCA(n_components=2)

    # Trying this with a PCA fitted with the wrong shape fails.
    pca.fit(np.array([[1, 1, 0], [0, 1, 0], [0, 1, 1]]))
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features)
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=True)
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=True, sentence_relative="mean")

    # Trying this with unknown features fails.
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(0, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=True)
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=False, sentence_relative="mean"
        )
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=True, sentence_relative="mean"
        )
    assert "Unknown feature" in str(excinfo.value)

    # Now with features we override to test manual values.
    drop_caches()
    pca.fit(np.array([[2, 1], [1, -2]]))
    sign = np.sign(pca.components_[:, 0])
    with settings.file_override("AOA", "CLEARPOND"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\n"
                    "dog,2\nhound,3\nfrisbee,4\nchase,6\ncad,7\nother,8")
        with open(settings.CLEARPOND, "w") as f:
            f.write("dog" + 5 * "\t" + "0" + 24 * "\t" + "2\n"
                    "hound" + 5 * "\t" + "0" + 24 * "\t" + "3\n"
                    "frisbee" + 5 * "\t" + "0" + 24 * "\t" + "4\n"
                    "screen" + 5 * "\t" + "0" + 24 * "\t" + "5\n"
                    "chase" + 5 * "\t" + "0" + 24 * "\t" + "6\n"
                    "other" + 5 * "\t" + "0" + 24 * "\t" + "8\n"
                    "others" + 5 * "\t" + "0" + 24 * "\t" + "9")

        # We find the hand-computed values alright.
        assert abs(-sign[0] * s1.component_average(0, pca, features) - (-2.7921497899976822)) < 1e-14
        assert abs(-sign[0] * s2.component_average(0, pca, features) - (-2.7921497899976822)) < 1e-14
        assert abs(-sign[1] * s1.component_average(1, pca, features) - (-2.3369703188414315)) < 1e-14
        assert abs(-sign[1] * s2.component_average(1, pca, features) - (-2.3369703188414315)) < 1e-14

        # Same with synonyms. Computed on the synonyms of 'dog' (lemma of
        # 'dogs'). 'frisbee' has no synonyms, hence the NaN for s2.
        assert (
            abs(-sign[0] * s1.component_average(0, pca, features, source_synonyms=True) - (-2.7940486530122683))
            < 1e-14
        )
        assert np.isnan(s2.component_average(0, pca, features, source_synonyms=True))
        assert (
            abs(-sign[1] * s1.component_average(1, pca, features, source_synonyms=True) - (-2.2309281091642896))
            < 1e-14
        )
        assert np.isnan(s2.component_average(1, pca, features, source_synonyms=True))

        # Same without synonyms but with sentence_relative. Each feature uses
        # either lemmas or tokens (whereas above it was all lemmas).
        assert (
            abs(
                -sign[0] * s1.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.34030374468910285
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[0] * s2.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.34030374468910285
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[1] * s1.component_average(1, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.51902095047064112
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[1] * s2.component_average(1, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.51902095047064112
            )
            < 1e-14
        )

        # Same with synonyms and sentence_relative.
        assert (
            abs(
                -sign[0] * s1.component_average(0, pca, features, source_synonyms=True, sentence_relative="mean")
                - 0.3390378360127122
            )
            < 1e-14
        )
        assert np.isnan(s2.component_average(0, pca, features, source_synonyms=True, sentence_relative="median"))
        assert (
            abs(
                -sign[1] * s1.component_average(1, pca, features, source_synonyms=True, sentence_relative="mean")
                - 0.58971575692206901
            )
            < 1e-14
        )
        assert np.isnan(s2.component_average(1, pca, features, source_synonyms=True, sentence_relative="mean"))
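
# A sketch consistent with the hand-computed values above (an assumed
# reading, not necessarily the actual implementation): each vocabulary word
# gets a vector of raw feature values, is run through the fitted PCA's
# transform, and the requested component is averaged over the words that have
# all features defined. The `-sign[n] *` factors in the assertions only undo
# sklearn's arbitrary orientation of each component.
def _sketch_component_average(n, pca, word_vectors):
    # word_vectors: per-word feature vectors, with np.nan where unknown.
    projections = [
        pca.transform([vector])[0, n]
        for vector in word_vectors
        if not np.isnan(vector).any()
    ]
    return np.mean(projections)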
def test_feature_average():
    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Test a non-transformed feature (AoA), computed on lemmas.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\n"
                    "dog,2\nhound,3\nfrisbee,4\nchase,6\ncad,7\nother,8")
        assert s1.feature_average("aoa") == 30 / 6
        assert s2.feature_average("aoa") == 30 / 6
        assert s1.feature_average("aoa", source_synonyms=True) == np.mean([3, 6, 7])
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("aoa", source_synonyms=True))
        assert s1.feature_average("aoa", source_synonyms=False, sentence_relative="mean") == -0.33333333333333304
        assert s2.feature_average("aoa", source_synonyms=False, sentence_relative="mean") == -0.33333333333333304
        assert s1.feature_average("aoa", source_synonyms=True, sentence_relative="mean") == -0.11111111111111072
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("aoa", source_synonyms=True, sentence_relative="mean"))

    # Test a log-transformed feature (phonological density), computed on
    # tokens.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write("dog" + 5 * "\t" + "0" + 24 * "\t" + "2\n"
                    "hound" + 5 * "\t" + "0" + 24 * "\t" + "3\n"
                    "frisbee" + 5 * "\t" + "0" + 24 * "\t" + "4\n"
                    "chase" + 5 * "\t" + "0" + 24 * "\t" + "6\n"
                    "cad" + 5 * "\t" + "0" + 24 * "\t" + "7\n"
                    "other" + 5 * "\t" + "0" + 24 * "\t" + "8")
        assert s1.feature_average("phonological_density") == np.log([2, 3, 4, 6, 7, 8]).mean()
        assert s2.feature_average("phonological_density") == np.log([2, 3, 4, 6, 7, 8]).mean()
        # Even though phonological density is computed on tokens, the synonyms
        # come from the lemmas.
        assert s1.feature_average("phonological_density", source_synonyms=True) == np.log([3, 6, 7]).mean()
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("phonological_density", source_synonyms=True))
        # Features for the 'sentence_relative' part are still taken from the
        # tokens, which leads us to drop 'others'.
        assert (
            s1.feature_average("phonological_density", source_synonyms=False, sentence_relative="mean")
            == 0.20029093819187427
        )
        assert (
            s2.feature_average("phonological_density", source_synonyms=False, sentence_relative="mean")
            == 0.20029093819187427
        )
        assert (
            s1.feature_average("phonological_density", source_synonyms=True, sentence_relative="mean")
            == 0.25674084015785814
        )
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("phonological_density", source_synonyms=True, sentence_relative="median"))

    # _synonyms_count(word=None) returns a list of words, some of which have
    # _synonyms_count(word) == np.nan (because 0 synonyms is returned as
    # np.nan). So check that the synonyms_count feature average is not np.nan.
    assert np.isfinite(s1.feature_average("synonyms_count"))
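
# A note on the density assertions above: the feature average is the mean of
# the logs, not the log of the mean, and by Jensen's inequality the two
# differ for any non-constant neighborhood sizes. A minimal demonstration:
def _sketch_log_transform_order():
    sizes = np.array([2, 3, 4, 6, 7, 8])
    assert np.log(sizes).mean() < np.log(sizes.mean())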