def test_transformed_feature():
    # Phonological density is log-transformed.
    drop_caches()
    transformed_phonological_density = SubstitutionFeaturesMixin._transformed_feature(
        "phonological_density"
    )
    assert transformed_phonological_density("time") == np.log(29)
    assert np.isnan(transformed_phonological_density("wickiup"))
    # Doc and name are transformed too.
    assert (
        transformed_phonological_density.__doc__
        == "log(" + SubstitutionFeaturesMixin._phonological_density.__doc__ + ")"
    )
    assert transformed_phonological_density.__name__ == "_log_phonological_density"
    # And the list of words is properly computed.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "2" + 24 * "\t" + "3\n"
                "cat" + 5 * "\t" + "2" + 24 * "\t" + "3"
            )
        assert set(transformed_phonological_density()) == {"dog", "cat"}

    # AoA is left untouched.
    drop_caches()
    transformed_aoa = SubstitutionFeaturesMixin._transformed_feature("aoa")
    assert transformed_aoa("time") == 5.16
    assert transformed_aoa("vocative") == 14.27
    assert np.isnan(transformed_aoa("wickiup"))
    # Doc and name are passed on.
    assert transformed_aoa.__doc__ == SubstitutionFeaturesMixin._aoa.__doc__
    assert transformed_aoa.__name__ == "_aoa"
    # And the list of words is properly computed.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\nhave,2\ntell,3")
        assert set(transformed_aoa()) == {"have", "tell"}

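# The assertions above pin down the wrapper's contract: values go through
# np.log, NaNs pass through, the word list is untouched, and __name__/__doc__
# advertise the transformation. A minimal illustrative sketch of such a
# wrapper (not the package's actual _transformed_feature implementation;
# `base_feature` is assumed to return a float for a word, NaN for unknown
# words, and the full word list when called without arguments):
def _log_feature_sketch(base_feature):
    def log_feature(word=None):
        if word is None:
            # No argument: return the word list untouched.
            return base_feature()
        # np.log propagates NaN for unknown words.
        return np.log(base_feature(word))

    # Advertise the transformation, as the test expects.
    log_feature.__name__ = "_log" + base_feature.__name__
    log_feature.__doc__ = "log(" + base_feature.__doc__ + ")"
    return log_feature
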
def test_orthographic_density_none():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._orthographic_density():
        assert word.islower()
    # And it's properly computed. CLEARPOND rows are tab-separated: the
    # orthographic density sits 5 tabs after the word, the phonological
    # density 24 tabs further.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "2" + 24 * "\t" + "3\n"
                "cat" + 5 * "\t" + "2" + 24 * "\t" + "3"
            )
        assert set(SubstitutionFeaturesMixin._orthographic_density()) == {"dog", "cat"}

def test_aoa_none():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._aoa():
        assert word.islower()
    # And it's properly computed.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\nhave,2\ntell,3")
        assert set(SubstitutionFeaturesMixin._aoa()) == {"have", "tell"}

def test_component():
    drop_caches()
    # Create a test PCA with features alternately log-transformed and not,
    # alternately on tokens and lemmas.
    features = ("letters_count", "aoa", "synonyms_count", "phonological_density")
    pca = PCA(n_components=3)

    # Trying this with a PCA fitted with the wrong shape fails.
    pca.fit(np.array([[1, 1, 0], [0, 1, 0], [0, 1, 1]]))
    with pytest.raises(AssertionError):
        SubstitutionFeaturesMixin._component(0, pca, features)
    with pytest.raises(AssertionError):
        SubstitutionFeaturesMixin._component(1, pca, features)

    # Trying this with unknown features fails.
    with pytest.raises(ValueError) as excinfo:
        SubstitutionFeaturesMixin._component(
            0, pca, ("letters_count", "unknown_feature", "aoa")
        )
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        SubstitutionFeaturesMixin._component(
            1, pca, ("letters_count", "unknown_feature", "aoa")
        )
    assert "Unknown feature" in str(excinfo.value)

    # Now training with the right shape.
    pca.fit(
        np.array(
            [
                [1, 0, 0, 0],
                [-1, 0, 0, 0],
                [0, 1, 0, 0],
                [0, -1, 0, 0],
                [0, 0, 1, 0],
                [0, 0, -1, 0],
            ]
        )
    )
    with settings.file_override("TOKENS"):
        with open(settings.TOKENS, "wb") as f:
            pickle.dump({"these", "are", "tokens"}, f)
        c0 = SubstitutionFeaturesMixin._component(0, pca, features)
        c1 = SubstitutionFeaturesMixin._component(1, pca, features)
        c2 = SubstitutionFeaturesMixin._component(2, pca, features)

        # Doc and name are properly set.
        assert c0.__name__ == "_component_0"
        assert c0.__doc__ == "component 0"
        assert c1.__name__ == "_component_1"
        assert c1.__doc__ == "component 1"
        assert c2.__name__ == "_component_2"
        assert c2.__doc__ == "component 2"

        # We get the expected hand-computed values.
        assert c0("time") == -5.16
        assert c1("time") == 0.62860865942237421
        assert c2("time") == -4
        assert np.isnan(c0("makakiki"))
        assert np.isnan(c1("makakiki"))
        assert np.isnan(c2("makakiki"))

        # And the list of words is properly computed. (These are not the true
        # values since we overrode the tokens list.)
        assert len(c0()) == 157863
        assert len(c1()) == 157863
        assert len(c2()) == 157863

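# A component value is, in essence, a dot product of the fitted PCA loadings
# with the word's feature values; since the training data above is
# axis-aligned, each component picks out a single feature up to sign, which is
# consistent with c0("time") == -aoa("time") == -5.16 and
# c2("time") == -letters_count("time") == -4. A minimal sketch of that
# computation (an assumption about the mechanism, not the package's actual
# code; `feature_values` is one value per feature, in fitting order):
def _component_value_sketch(i, pca, feature_values):
    # A NaN in any feature propagates, making the whole component NaN.
    return pca.components_[i].dot(np.asarray(feature_values))
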
def test_component_average():
    drop_caches()
    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Create a test PCA that will use features we later override.
    features = ("aoa", "phonological_density")
    pca = PCA(n_components=2)

    # Trying this with a PCA fitted with the wrong shape fails.
    pca.fit(np.array([[1, 1, 0], [0, 1, 0], [0, 1, 1]]))
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features)
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=True)
    with pytest.raises(AssertionError):
        s1.component_average(
            0, pca, features, source_synonyms=False, sentence_relative="mean"
        )
    with pytest.raises(AssertionError):
        s1.component_average(
            0, pca, features, source_synonyms=True, sentence_relative="mean"
        )

    # Trying this with unknown features fails.
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(0, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=True
        )
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0,
            pca,
            ("letters_count", "unknown_feature", "aoa"),
            source_synonyms=False,
            sentence_relative="mean",
        )
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0,
            pca,
            ("letters_count", "unknown_feature", "aoa"),
            source_synonyms=True,
            sentence_relative="mean",
        )
    assert "Unknown feature" in str(excinfo.value)

    # Now with features we override to test manual values.
    drop_caches()
    pca.fit(np.array([[2, 1], [1, -2]]))
    # PCA component orientation is arbitrary, so normalize signs to make the
    # hand-computed values below independent of the fitted orientation.
    sign = np.sign(pca.components_[:, 0])
    with settings.file_override("AOA", "CLEARPOND"):
        with open(settings.AOA, "w") as f:
            f.write(
                "Word,Rating.Mean\n"
                "dog,2\nhound,3\nfrisbee,4\nchase,6\ncad,7\nother,8"
            )
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "0" + 24 * "\t" + "2\n"
                "hound" + 5 * "\t" + "0" + 24 * "\t" + "3\n"
                "frisbee" + 5 * "\t" + "0" + 24 * "\t" + "4\n"
                "screen" + 5 * "\t" + "0" + 24 * "\t" + "5\n"
                "chase" + 5 * "\t" + "0" + 24 * "\t" + "6\n"
                "other" + 5 * "\t" + "0" + 24 * "\t" + "8\n"
                "others" + 5 * "\t" + "0" + 24 * "\t" + "9"
            )

        # We find the hand-computed values alright.
        assert (
            abs(-sign[0] * s1.component_average(0, pca, features) - (-2.7921497899976822))
            < 1e-14
        )
        assert (
            abs(-sign[0] * s2.component_average(0, pca, features) - (-2.7921497899976822))
            < 1e-14
        )
        assert (
            abs(-sign[1] * s1.component_average(1, pca, features) - (-2.3369703188414315))
            < 1e-14
        )
        assert (
            abs(-sign[1] * s2.component_average(1, pca, features) - (-2.3369703188414315))
            < 1e-14
        )

        # Same with synonyms. Computed on synonyms of 'dog' (lemma of
        # 'dogs'). 'frisbee' has no synonyms, hence the NaN for s2.
        assert (
            abs(
                -sign[0] * s1.component_average(0, pca, features, source_synonyms=True)
                - (-2.7940486530122683)
            )
            < 1e-14
        )
        assert np.isnan(s2.component_average(0, pca, features, source_synonyms=True))
        assert (
            abs(
                -sign[1] * s1.component_average(1, pca, features, source_synonyms=True)
                - (-2.2309281091642896)
            )
            < 1e-14
        )
        assert np.isnan(s2.component_average(1, pca, features, source_synonyms=True))

        # Same without synonyms but with sentence_relative. Each feature uses
        # either lemmas or tokens (whereas above it was all lemmas).
        assert (
            abs(
                -sign[0]
                * s1.component_average(
                    0, pca, features, source_synonyms=False, sentence_relative="mean"
                )
                - 0.34030374468910285
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[0]
                * s2.component_average(
                    0, pca, features, source_synonyms=False, sentence_relative="mean"
                )
                - 0.34030374468910285
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[1]
                * s1.component_average(
                    1, pca, features, source_synonyms=False, sentence_relative="mean"
                )
                - 0.51902095047064112
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[1]
                * s2.component_average(
                    1, pca, features, source_synonyms=False, sentence_relative="mean"
                )
                - 0.51902095047064112
            )
            < 1e-14
        )

        # Same with synonyms and sentence_relative.
        assert (
            abs(
                -sign[0]
                * s1.component_average(
                    0, pca, features, source_synonyms=True, sentence_relative="mean"
                )
                - 0.3390378360127122
            )
            < 1e-14
        )
        assert np.isnan(
            s2.component_average(
                0, pca, features, source_synonyms=True, sentence_relative="median"
            )
        )
        assert (
            abs(
                -sign[1]
                * s1.component_average(
                    1, pca, features, source_synonyms=True, sentence_relative="mean"
                )
                - 0.58971575692206901
            )
            < 1e-14
        )
        assert np.isnan(
            s2.component_average(
                1, pca, features, source_synonyms=True, sentence_relative="mean"
            )
        )

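# `sign = np.sign(pca.components_[:, 0])` deserves a note: the orientation of
# each PCA component is arbitrary (refitting can flip a component without
# changing the captured subspace), so the test multiplies by -sign[i] to pin
# the orientation before comparing against hand-computed constants. A small
# self-contained sketch of that normalization:
def _orient_components_sketch(pca):
    # Flip each component so its first loading is positive; oriented
    # components then give reproducible values across fits.
    signs = np.sign(pca.components_[:, 0])
    return pca.components_ * signs[:, np.newaxis]
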
def test_feature_average():
    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Test a non-transformed feature (AoA), computed on lemmas.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write(
                "Word,Rating.Mean\n"
                "dog,2\nhound,3\nfrisbee,4\nchase,6\ncad,7\nother,8"
            )
        assert s1.feature_average("aoa") == 30 / 6
        assert s2.feature_average("aoa") == 30 / 6
        assert s1.feature_average("aoa", source_synonyms=True) == np.mean([3, 6, 7])
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("aoa", source_synonyms=True))
        assert s1.feature_average(
            "aoa", source_synonyms=False, sentence_relative="mean"
        ) == (-0.33333333333333304)
        assert s2.feature_average(
            "aoa", source_synonyms=False, sentence_relative="mean"
        ) == (-0.33333333333333304)
        assert s1.feature_average(
            "aoa", source_synonyms=True, sentence_relative="mean"
        ) == (-0.11111111111111072)
        # 'frisbee' has no synonyms.
        assert np.isnan(
            s2.feature_average("aoa", source_synonyms=True, sentence_relative="mean")
        )

    # Test a log-transformed feature (phonological density), computed on
    # tokens.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "0" + 24 * "\t" + "2\n"
                "hound" + 5 * "\t" + "0" + 24 * "\t" + "3\n"
                "frisbee" + 5 * "\t" + "0" + 24 * "\t" + "4\n"
                "chase" + 5 * "\t" + "0" + 24 * "\t" + "6\n"
                "cad" + 5 * "\t" + "0" + 24 * "\t" + "7\n"
                "other" + 5 * "\t" + "0" + 24 * "\t" + "8"
            )
        assert (
            s1.feature_average("phonological_density")
            == np.log([2, 3, 4, 6, 7, 8]).mean()
        )
        assert (
            s2.feature_average("phonological_density")
            == np.log([2, 3, 4, 6, 7, 8]).mean()
        )
        # Even though phonological density is computed on tokens, the synonyms
        # come from the lemmas.
        assert (
            s1.feature_average("phonological_density", source_synonyms=True)
            == np.log([3, 6, 7]).mean()
        )
        # 'frisbee' has no synonyms.
        assert np.isnan(
            s2.feature_average("phonological_density", source_synonyms=True)
        )
        # Features for the 'sentence_relative' part are still taken from the
        # tokens, which leads us to drop 'others'.
        assert (
            s1.feature_average(
                "phonological_density", source_synonyms=False, sentence_relative="mean"
            )
            == 0.20029093819187427
        )
        assert (
            s2.feature_average(
                "phonological_density", source_synonyms=False, sentence_relative="mean"
            )
            == 0.20029093819187427
        )
        assert (
            s1.feature_average(
                "phonological_density", source_synonyms=True, sentence_relative="mean"
            )
            == 0.25674084015785814
        )
        # 'frisbee' has no synonyms.
        assert np.isnan(
            s2.feature_average(
                "phonological_density",
                source_synonyms=True,
                sentence_relative="median",
            )
        )

    # _synonyms_count(word=None) returns a list of words, some of which have
    # _synonyms_count(word) == np.nan (because 0 synonyms is returned as
    # np.nan). So check that the synonyms_count feature average is not np.nan.
    assert np.isfinite(s1.feature_average("synonyms_count"))

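# The sentence_relative figures above are consistent with taking feature
# values relative to an aggregate of the feature over the sentence's words.
# This is a generic sketch of that idea only; the package's exact rule (which
# words enter the aggregate, lemmas vs. tokens, NaN handling) is defined by
# its implementation, not by this function. `feature` maps word -> float
# (possibly NaN), and `aggregate` stands in for sentence_relative="mean" or
# "median" via np.nanmean / np.nanmedian:
def _sentence_relative_sketch(word, sentence_words, feature, aggregate=np.nanmean):
    # Words unknown to the feature yield NaN and are dropped by the nan-aware
    # aggregate, like 'others' is dropped above.
    return feature(word) - aggregate([feature(w) for w in sentence_words])
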
def test_frequency_none():
    drop_caches()
    with settings.file_override("FREQUENCY"):
        with open(settings.FREQUENCY, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._frequency()) == {"dog", "cat"}


def test_clustering_none():
    drop_caches()
    with settings.file_override("CLUSTERING"):
        with open(settings.CLUSTERING, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._clustering()) == {"dog", "cat"}


def test_betweenness_none():
    drop_caches()
    with settings.file_override("BETWEENNESS"):
        with open(settings.BETWEENNESS, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._betweenness()) == {"dog", "cat"}


def test_pagerank_none():
    drop_caches()
    with settings.file_override("PAGERANK"):
        with open(settings.PAGERANK, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._pagerank()) == {"dog", "cat"}


def test_degree_none():
    drop_caches()
    with settings.file_override("DEGREE"):
        with open(settings.DEGREE, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._degree()) == {"dog", "cat"}

def test_letters_count_none():
    drop_caches()
    with settings.file_override("TOKENS"):
        with open(settings.TOKENS, "wb") as f:
            pickle.dump({"these", "are", "tokens"}, f)
        assert SubstitutionFeaturesMixin._letters_count() == {"these", "are", "tokens"}

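# The five pickle-backed tests above (test_frequency_none through
# test_degree_none) share one override-then-check pattern; if consolidation is
# ever wanted, a parametrized equivalent could look like this sketch (same
# payloads and feature names as the individual tests):
@pytest.mark.parametrize(
    "setting_name,method_name",
    [
        ("FREQUENCY", "_frequency"),
        ("CLUSTERING", "_clustering"),
        ("BETWEENNESS", "_betweenness"),
        ("PAGERANK", "_pagerank"),
        ("DEGREE", "_degree"),
    ],
)
def test_pickled_feature_none_sketch(setting_name, method_name):
    drop_caches()
    with settings.file_override(setting_name):
        with open(getattr(settings, setting_name), "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(getattr(SubstitutionFeaturesMixin, method_name)()) == {"dog", "cat"}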