Example No. 1
def test_synonyms_count_none():
    drop_caches()
    # Lemmas are properly counted.
    assert len(SubstitutionFeaturesMixin._synonyms_count()) == 147306
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._synonyms_count():
        assert word.islower() or is_int(word[0]) or is_int(word[-1])
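The `is_int` checks here (and in later examples) point to a small test helper. A minimal sketch, assuming `is_int` simply tests whether a string parses as an integer:

def is_int(s):
    """Return True if `s` parses as an integer, False otherwise."""
    try:
        int(s)
        return True
    except ValueError:
        return False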
Example No. 2
def test_transformed_feature():
    # Phonological density is log-transformed.
    drop_caches()
    transformed_phonological_density = SubstitutionFeaturesMixin._transformed_feature("phonological_density")
    assert transformed_phonological_density("time") == np.log(29)
    assert np.isnan(transformed_phonological_density("wickiup"))
    # Doc and name are transformed too.
    assert (
        transformed_phonological_density.__doc__
        == "log(" + SubstitutionFeaturesMixin._phonological_density.__doc__ + ")"
    )
    assert transformed_phonological_density.__name__ == "_log_phonological_density"
    # And the list of words is properly computed.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write("dog" + 5 * "\t" + "2" + 24 * "\t" + "3\n" "cat" + 5 * "\t" + "2" + 24 * "\t" + "3")
        assert set(transformed_phonological_density()) == {"dog", "cat"}

    # AoA is left untouched.
    drop_caches()
    transformed_aoa = SubstitutionFeaturesMixin._transformed_feature("aoa")
    assert transformed_aoa("time") == 5.16
    assert transformed_aoa("vocative") == 14.27
    assert np.isnan(transformed_aoa("wickiup"))
    # Doc and name are passed on.
    assert transformed_aoa.__doc__ == SubstitutionFeaturesMixin._aoa.__doc__
    assert transformed_aoa.__name__ == "_aoa"
    # And the list of words is properly computed.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\nhave,2\ntell,3")
        assert set(transformed_aoa()) == {"have", "tell"}
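The `__doc__` and `__name__` assertions above constrain how `_transformed_feature` must wrap a base feature. A minimal sketch of such a wrapper, assuming a hypothetical `base_feature` callable and a hard-coded set of log-transformed feature names (the mixin's actual selection logic may differ):

import numpy as np

LOG_TRANSFORMED = {"phonological_density"}  # assumed; the real set lives in the mixin

def make_transformed_feature(name, base_feature):
    """Wrap `base_feature` in a log transform when `name` calls for one."""
    if name not in LOG_TRANSFORMED:
        return base_feature  # e.g. 'aoa': doc and name pass through untouched

    def transformed(word=None):
        if word is None:
            return base_feature()  # the word list is unaffected by the transform
        return np.log(base_feature(word))  # np.log(np.nan) stays nan

    transformed.__name__ = "_log_" + name
    transformed.__doc__ = "log(" + base_feature.__doc__ + ")"
    return transformed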
Example No. 3
def test_strict_synonyms():
    assert SubstitutionFeaturesMixin._strict_synonyms("frisbee") == set()
    assert SubstitutionFeaturesMixin._strict_synonyms("dog") == {
        "domestic_dog",
        "canis_familiaris",
        "frump",
        "cad",
        "bounder",
        "blackguard",
        "hound",
        "heel",
        "frank",
        "frankfurter",
        "hotdog",
        "hot_dog",
        "wiener",
        "wienerwurst",
        "weenie",
        "pawl",
        "detent",
        "click",
        "andiron",
        "firedog",
        "dog-iron",
        "chase",
        "chase_after",
        "trail",
        "tail",
        "tag",
        "give_chase",
        "go_after",
        "track",
    }
    assert SubstitutionFeaturesMixin._strict_synonyms("makakiki") == set()
Example No. 4
def test_orthographic_density_none():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._orthographic_density():
        assert word.islower()
    # And it's properly computed.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write("dog" + 5 * "\t" + "2" + 24 * "\t" + "3\n" "cat" + 5 * "\t" + "2" + 24 * "\t" + "3")
        assert set(SubstitutionFeaturesMixin._orthographic_density()) == {"dog", "cat"}
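The override writes tab-separated lines with the word in column 0 and values in columns 5 and 29, which suggests how a CLEARPOND-style file is parsed. A minimal sketch under that assumption (which column holds orthographic versus phonological density is not pinned down by this test alone):

def read_clearpond(path):
    """Parse a CLEARPOND-style file into {word: (density_a, density_b)}."""
    densities = {}
    with open(path) as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            # Columns 5 and 29 are the assumed density positions,
            # matching the override above.
            densities[fields[0].lower()] = (int(fields[5]), int(fields[29]))
    return densities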
Example No. 5
def test_aoa_none():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._aoa():
        assert word.islower()
    # And it's properly computed.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\nhave,2\ntell,3")
        assert set(SubstitutionFeaturesMixin._aoa()) == {"have", "tell"}
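The override shows the AoA source as a CSV with `Word` and `Rating.Mean` columns; a minimal parsing sketch under that assumption:

import csv

def read_aoa(path):
    """Parse an AoA CSV into {word: mean age-of-acquisition rating}."""
    with open(path) as f:
        return {row["Word"].lower(): float(row["Rating.Mean"])
                for row in csv.DictReader(f)}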
Example No. 6
def test_synonyms_count():
    drop_caches()
    # 'hello' has a single synset, with 5 members. So 4 synonyms.
    assert SubstitutionFeaturesMixin._synonyms_count("hello") == 4
    # 'mountain' has two synsets, with 2 and 27 members.
    # So ((2-1) + (27-1))/2 synonyms.
    assert SubstitutionFeaturesMixin._synonyms_count("mountain") == 13.5
    # 'lamp' has two synsets, with only one member in each.
    # So no synonyms, which yields `np.nan`.
    assert np.isnan(SubstitutionFeaturesMixin._synonyms_count("lamp"))
    # 'makakiki' does not exist.
    assert np.isnan(SubstitutionFeaturesMixin._synonyms_count("makakiki"))
Example No. 7
def test_phonological_density():
    drop_caches()
    assert SubstitutionFeaturesMixin._phonological_density("time") == 29
    assert np.isnan(SubstitutionFeaturesMixin._phonological_density("wickiup"))
Example No. 8
def test_frequency_none_with_computed():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._frequency():
        assert word.islower() or is_int(word[0]) or is_int(word[-1]) or word in ["%", "!"]
Example No. 9
def test_frequency_none():
    drop_caches()
    with settings.file_override("FREQUENCY"):
        with open(settings.FREQUENCY, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._frequency()) == {"dog", "cat"}
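`settings.file_override(...)` recurs throughout these tests; a minimal sketch of such a context manager, assuming it points the named setting at a fresh temporary file and restores the original path afterwards (hypothetical; the project's real implementation may differ):

import os
import tempfile
from contextlib import contextmanager

@contextmanager
def file_override(settings, name):
    """Temporarily point `settings.<name>` at an empty temporary file."""
    original = getattr(settings, name)
    handle, temp_path = tempfile.mkstemp()
    os.close(handle)
    setattr(settings, name, temp_path)
    try:
        yield
    finally:
        setattr(settings, name, original)
        os.remove(temp_path)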
Example No. 10
def test_frequency():
    drop_caches()
    assert SubstitutionFeaturesMixin._frequency("dog") == 7865
    assert SubstitutionFeaturesMixin._frequency("play") == 45848
    assert np.isnan(SubstitutionFeaturesMixin._frequency("wickiup"))
Example No. 11
def test_clustering_none():
    drop_caches()
    with settings.file_override("CLUSTERING"):
        with open(settings.CLUSTERING, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._clustering()) == {"dog", "cat"}
Example No. 12
def test_letters_count_none():
    drop_caches()
    with settings.file_override("TOKENS"):
        with open(settings.TOKENS, "wb") as f:
            pickle.dump({"these", "are", "tokens"}, f)
        assert SubstitutionFeaturesMixin._letters_count() == {"these", "are", "tokens"}
Example No. 13
def test_aoa():
    drop_caches()
    assert SubstitutionFeaturesMixin._aoa("time") == 5.16
    assert SubstitutionFeaturesMixin._aoa("vocative") == 14.27
    assert np.isnan(SubstitutionFeaturesMixin._aoa("wickiup"))
Example No. 14
def test_syllables_count():
    drop_caches()
    assert SubstitutionFeaturesMixin._syllables_count("hello") == 2
    assert SubstitutionFeaturesMixin._syllables_count("mountain") == 2
    assert np.isnan(SubstitutionFeaturesMixin._syllables_count("makakiki"))
Example No. 15
def test_letters_count_none_with_computed():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._letters_count():
        assert word.islower() or is_int(word[0]) or is_int(word[-1])
Example No. 16
def test_pagerank():
    drop_caches()
    assert abs(SubstitutionFeaturesMixin._pagerank("you") - 0.0006390798677378056) < 1e-15
    assert abs(SubstitutionFeaturesMixin._pagerank("play") - 0.0012008124120435305) < 1e-15
    assert np.isnan(SubstitutionFeaturesMixin._pagerank("wickiup"))
Example No. 17
def test_degree_none():
    drop_caches()
    with settings.file_override("DEGREE"):
        with open(settings.DEGREE, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._degree()) == {"dog", "cat"}
Example No. 18
def test_degree():
    drop_caches()
    assert SubstitutionFeaturesMixin._degree("abdomen") == 1 / (10617 - 1)
    assert SubstitutionFeaturesMixin._degree("speaker") == 9 / (10617 - 1)
    assert np.isnan(SubstitutionFeaturesMixin._degree("wickiup"))
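The expected values read as normalized degree: the raw neighbor count divided by the number of other nodes (10617 nodes in the network here, hence the `10617 - 1` denominator). A minimal sketch assuming the word graph is available as a networkx-style `graph` object:

import numpy as np

def normalized_degree(graph, word):
    """Degree of `word` divided by the number of other nodes in the graph."""
    if word not in graph:
        return np.nan  # e.g. 'wickiup'
    return graph.degree(word) / (len(graph) - 1)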
Example No. 19
def test_orthographic_density():
    drop_caches()
    assert SubstitutionFeaturesMixin._orthographic_density("time") == 13
    assert np.isnan(SubstitutionFeaturesMixin._orthographic_density("wickiup"))
Example No. 20
def test_pagerank_none():
    drop_caches()
    with settings.file_override("PAGERANK"):
        with open(settings.PAGERANK, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._pagerank()) == {"dog", "cat"}
Example No. 21
def test_phonemes_count():
    drop_caches()
    assert SubstitutionFeaturesMixin._phonemes_count("hello") == 4
    assert SubstitutionFeaturesMixin._phonemes_count("mountain") == 6
    assert np.isnan(SubstitutionFeaturesMixin._phonemes_count("makakiki"))
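The `_get_pronunciations()` mapping compared against in a later example suggests a CMU-dictionary-style resource; under that assumption, a phoneme count is the length of a pronunciation and a syllable count is the number of stress-bearing phonemes. A minimal sketch with NLTK's cmudict that reproduces the values above ('hello' → HH AH0 L OW1, 'mountain' → M AW1 N T AH0 N):

import numpy as np
from nltk.corpus import cmudict

pronunciations = cmudict.dict()  # word -> list of pronunciations

def phonemes_count(word):
    """Number of phonemes in the word's first listed pronunciation."""
    if word not in pronunciations:
        return np.nan  # e.g. 'makakiki'
    return len(pronunciations[word][0])

def syllables_count(word):
    """Number of stress-marked phonemes, i.e. vowels, in the word."""
    if word not in pronunciations:
        return np.nan
    return sum(1 for phoneme in pronunciations[word][0]
               if phoneme[-1].isdigit())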
Example No. 22
def test_betweenness():
    drop_caches()
    assert SubstitutionFeaturesMixin._betweenness("dog") == 0.0046938277117769605
    assert SubstitutionFeaturesMixin._betweenness("play") == 0.008277234906313704
    assert np.isnan(SubstitutionFeaturesMixin._betweenness("wickiup"))
Example No. 23
def test_betweenness_none():
    drop_caches()
    with settings.file_override("BETWEENNESS"):
        with open(settings.BETWEENNESS, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._betweenness()) == {"dog", "cat"}
Example No. 24
def test_component():
    drop_caches()

    # Create a test PCA with features alternately log-transformed and not,
    # alternately on tokens and lemmas.
    features = ("letters_count", "aoa", "synonyms_count", "phonological_density")
    pca = PCA(n_components=3)

    # Trying this with a PCA fitted with the wrong shape fails.
    pca.fit(np.array([[1, 1, 0], [0, 1, 0], [0, 1, 1]]))
    with pytest.raises(AssertionError):
        SubstitutionFeaturesMixin._component(0, pca, features)
    with pytest.raises(AssertionError):
        SubstitutionFeaturesMixin._component(1, pca, features)
    # Trying this with unknown features fails.
    with pytest.raises(ValueError) as excinfo:
        SubstitutionFeaturesMixin._component(0, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        SubstitutionFeaturesMixin._component(1, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)

    # Now training with the right shape.
    pca.fit(np.array([[1, 0, 0, 0], [-1, 0, 0, 0], [0, 1, 0, 0], [0, -1, 0, 0], [0, 0, 1, 0], [0, 0, -1, 0]]))
    with settings.file_override("TOKENS"):
        with open(settings.TOKENS, "wb") as f:
            pickle.dump({"these", "are", "tokens"}, f)

        c0 = SubstitutionFeaturesMixin._component(0, pca, features)
        c1 = SubstitutionFeaturesMixin._component(1, pca, features)
        c2 = SubstitutionFeaturesMixin._component(2, pca, features)
        # Doc and name are properly set.
        assert c0.__name__ == "_component_0"
        assert c0.__doc__ == "component 0"
        assert c1.__name__ == "_component_1"
        assert c1.__doc__ == "component 1"
        assert c2.__name__ == "_component_2"
        assert c2.__doc__ == "component 2"
        # We get the expected hand-computed values.
        assert c0("time") == -5.16
        assert c1("time") == 0.62860865942237421
        assert c2("time") == -4
        assert np.isnan(c0("makakiki"))
        assert np.isnan(c1("makakiki"))
        assert np.isnan(c2("makakiki"))
        # And the list of words is properly computed. (These are not the true
        # values since we overrode the tokens list.)
        assert len(c0()) == 157863
        assert len(c1()) == 157863
        assert len(c2()) == 157863
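The hand-computed values behave like a projection: each component function returns the dot product of that PCA component's loadings with the word's transformed feature values, going `nan` as soon as any feature is. A minimal sketch under that assumption, with a hypothetical `transformed_features` sequence of feature callables:

import numpy as np

def make_component(i, pca, transformed_features):
    """Project a word's feature vector onto PCA component `i`."""
    def component(word):
        values = np.array([feature(word) for feature in transformed_features])
        return pca.components_[i].dot(values)  # any nan propagates through the dot

    component.__name__ = "_component_%d" % i
    component.__doc__ = "component %d" % i
    return component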
Example No. 25
def test_clustering():
    drop_caches()
    assert abs(SubstitutionFeaturesMixin._clustering("dog") - 0.0009318641757868838) < 1e-17
    assert abs(SubstitutionFeaturesMixin._clustering("play") - 0.0016238663632016216) < 1e-17
    assert np.isnan(SubstitutionFeaturesMixin._clustering("wickiup"))
Example No. 26
def test_phonemes_count_none():
    drop_caches()
    assert SubstitutionFeaturesMixin._phonemes_count() == _get_pronunciations().keys()
    for word in SubstitutionFeaturesMixin._phonemes_count():
        assert word.islower()
Example No. 27
def test_letters_count():
    drop_caches()
    assert SubstitutionFeaturesMixin._letters_count("hello") == 5
    assert SubstitutionFeaturesMixin._letters_count("mountain") == 8
    assert SubstitutionFeaturesMixin._letters_count("makakiki") == 8