Example 1
def test_transformed_feature():
    # Phonological density is log-transformed.
    drop_caches()
    transformed_phonological_density = SubstitutionFeaturesMixin._transformed_feature("phonological_density")
    assert transformed_phonological_density("time") == np.log(29)
    assert np.isnan(transformed_phonological_density("wickiup"))
    # Doc and name are transformed too.
    assert (
        transformed_phonological_density.__doc__
        == "log(" + SubstitutionFeaturesMixin._phonological_density.__doc__ + ")"
    )
    assert transformed_phonological_density.__name__ == "_log_phonological_density"
    # And the list of words is properly computed.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "2" + 24 * "\t" + "3\n"
                "cat" + 5 * "\t" + "2" + 24 * "\t" + "3"
            )
        assert set(transformed_phonological_density()) == {"dog", "cat"}

    # AoA is left untouched.
    drop_caches()
    transformed_aoa = SubstitutionFeaturesMixin._transformed_feature("aoa")
    assert transformed_aoa("time") == 5.16
    assert transformed_aoa("vocative") == 14.27
    assert np.isnan(transformed_aoa("wickiup"))
    # Doc and name are passed on.
    assert transformed_aoa.__doc__ == SubstitutionFeaturesMixin._aoa.__doc__
    assert transformed_aoa.__name__ == "_aoa"
    # And the list of words is properly computed.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\nhave,2\ntell,3")
        assert set(transformed_aoa()) == {"have", "tell"}
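
The assertions above pin down the contract of `_transformed_feature`: a feature marked as log-transformed is wrapped in `np.log` with its `__doc__` and `__name__` rewritten accordingly, while other features (like AoA) are returned untouched. A minimal sketch under those assumptions (the `LOG_TRANSFORMED` marker set and the `base_feature` argument are hypothetical, not the project's actual internals):

import numpy as np

LOG_TRANSFORMED = {"phonological_density"}  # hypothetical marker set

def transformed_feature(name, base_feature):
    # Sketch only: mirrors the behaviour asserted in test_transformed_feature.
    if name not in LOG_TRANSFORMED:
        return base_feature  # e.g. AoA is passed through untouched

    def feature(word=None):
        if word is None:
            return base_feature()  # the word list is unchanged
        return np.log(base_feature(word))  # np.log(np.nan) is nan

    feature.__doc__ = "log(" + base_feature.__doc__ + ")"
    feature.__name__ = "_log_" + name
    return feature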
Example 2
def test_orthographic_density_none():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._orthographic_density():
        assert word.islower()
    # And it's computed right.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "2" + 24 * "\t" + "3\n"
                "cat" + 5 * "\t" + "2" + 24 * "\t" + "3"
            )
        assert set(SubstitutionFeaturesMixin._orthographic_density()) == {"dog", "cat"}
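
The fixture lines make the assumed Clearpond layout explicit: tab-separated, with the word in column 0, orthographic neighbourhood density in column 5, and phonological density in column 29 (hence the runs of 5 and 24 tabs). A hedged parsing sketch under that column-layout assumption:

def read_clearpond(path):
    # Sketch of the assumed column layout, not the project's actual loader.
    orthographic, phonological = {}, {}
    with open(path) as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            word = fields[0].lower()  # lemmas are all lowercase
            orthographic[word] = float(fields[5])
            phonological[word] = float(fields[29])
    return orthographic, phonological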
Example 3
def test_aoa_none():
    drop_caches()
    # Lemmas are all lowercase.
    for word in SubstitutionFeaturesMixin._aoa():
        assert word.islower()
    # And it's properly computed.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\nhave,2\ntell,3")
        assert set(SubstitutionFeaturesMixin._aoa()) == {"have", "tell"}
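
`settings.file_override` appears in every test here; the behaviour these tests rely on is that of a context manager which temporarily points the named settings entries at scratch files and restores the originals on exit. A self-contained sketch of that contract (names and mechanics are assumptions, not the project's code):

import contextlib
import os
import tempfile

@contextlib.contextmanager
def file_override(settings, *names):
    # Sketch: repoint each settings path at a temp location, restore on exit.
    saved = {name: getattr(settings, name) for name in names}
    tmpdir = tempfile.mkdtemp()
    try:
        for name in names:
            setattr(settings, name, os.path.join(tmpdir, name))
        yield
    finally:
        for name, path in saved.items():
            setattr(settings, name, path)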
Example 4
def test_component():
    drop_caches()

    # Create a test PCA with features that alternate between log-transformed
    # and not, and between tokens and lemmas.
    features = ("letters_count", "aoa", "synonyms_count", "phonological_density")
    pca = PCA(n_components=3)

    # Trying this with a PCA fitted with the wrong shape fails.
    pca.fit(np.array([[1, 1, 0], [0, 1, 0], [0, 1, 1]]))
    with pytest.raises(AssertionError):
        SubstitutionFeaturesMixin._component(0, pca, features)
    with pytest.raises(AssertionError):
        SubstitutionFeaturesMixin._component(1, pca, features)
    # Trying this with unknown features fails.
    with pytest.raises(ValueError) as excinfo:
        SubstitutionFeaturesMixin._component(0, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        SubstitutionFeaturesMixin._component(1, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)

    # Now training with the right shape.
    pca.fit(np.array([[1, 0, 0, 0], [-1, 0, 0, 0], [0, 1, 0, 0], [0, -1, 0, 0], [0, 0, 1, 0], [0, 0, -1, 0]]))
    with settings.file_override("TOKENS"):
        with open(settings.TOKENS, "wb") as f:
            pickle.dump({"these", "are", "tokens"}, f)

        c0 = SubstitutionFeaturesMixin._component(0, pca, features)
        c1 = SubstitutionFeaturesMixin._component(1, pca, features)
        c2 = SubstitutionFeaturesMixin._component(2, pca, features)
        # Doc and name are properly set.
        assert c0.__name__ == "_component_0"
        assert c0.__doc__ == "component 0"
        assert c1.__name__ == "_component_1"
        assert c1.__doc__ == "component 1"
        assert c2.__name__ == "_component_2"
        assert c2.__doc__ == "component 2"
        # We get the expected hand-computed values.
        assert c0("time") == -5.16
        assert c1("time") == 0.62860865942237421
        assert c2("time") == -4
        assert np.isnan(c0("makakiki"))
        assert np.isnan(c1("makakiki"))
        assert np.isnan(c2("makakiki"))
        # And the list of words is properly computed. (These are not the true
        # values since we overrode the tokens list.)
        assert len(c0()) == 157863
        assert len(c1()) == 157863
        assert len(c2()) == 157863
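
Taken together, the assertions suggest `_component` builds a callable whose value for a word is the dot product of the n-th PCA loading vector with that word's feature values (so a single NaN feature makes the whole component NaN), and whose no-argument form returns the words for which the component is defined. A sketch under those assumptions (`feature_funcs`, a mapping from feature names to feature callables, is hypothetical):

import numpy as np

def make_component(n, pca, features, feature_funcs):
    # Sketch, not the project's implementation.
    for f in features:
        if f not in feature_funcs:
            raise ValueError("Unknown feature: " + f)
    assert pca.components_.shape[1] == len(features)  # wrong shape fails

    def component(word=None):
        if word is None:
            # Words for which every underlying feature is available.
            return set.intersection(*(set(feature_funcs[f]()) for f in features))
        values = np.array([feature_funcs[f](word) for f in features])
        return pca.components_[n].dot(values)  # NaN-propagating

    component.__name__ = "_component_{}".format(n)
    component.__doc__ = "component {}".format(n)
    return component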
Example 5
def test_component_average():
    drop_caches()
    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Create a test PCA that will use features we later override.
    features = ("aoa", "phonological_density")
    pca = PCA(n_components=2)

    # Trying this with a PCA fitted with the wrong shape fails.
    pca.fit(np.array([[1, 1, 0], [0, 1, 0], [0, 1, 1]]))
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features)
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=True)
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=True, sentence_relative="mean")
    # Trying this with unknown features fails.
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(0, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=True)
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=False, sentence_relative="mean"
        )
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=True, sentence_relative="mean"
        )
    assert "Unknown feature" in str(excinfo.value)

    # Now with features we override to test manual values.
    drop_caches()
    pca.fit(np.array([[2, 1], [1, -2]]))
    sign = np.sign(pca.components_[:, 0])
    with settings.file_override("AOA", "CLEARPOND"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\n" "dog,2\nhound,3\nfrisbee,4\nchase,6\ncad,7\nother,8")
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "0" + 24 * "\t" + "2\n"
                "hound" + 5 * "\t" + "0" + 24 * "\t" + "3\n"
                "frisbee" + 5 * "\t" + "0" + 24 * "\t" + "4\n"
                "screen" + 5 * "\t" + "0" + 24 * "\t" + "5\n"
                "chase" + 5 * "\t" + "0" + 24 * "\t" + "6\n"
                "other" + 5 * "\t" + "0" + 24 * "\t" + "8\n"
                "others" + 5 * "\t" + "0" + 24 * "\t" + "9"
            )

        # We recover the hand-computed values.
        assert abs(-sign[0] * s1.component_average(0, pca, features) - (-2.7921497899976822)) < 1e-14
        assert abs(-sign[0] * s2.component_average(0, pca, features) - (-2.7921497899976822)) < 1e-14
        assert abs(-sign[1] * s1.component_average(1, pca, features) - (-2.3369703188414315)) < 1e-14
        assert abs(-sign[1] * s2.component_average(1, pca, features) - (-2.3369703188414315)) < 1e-14
        # Same with synonyms. Computed on synonyms of 'dog' (lemma of
        # 'dogs'). 'frisbee' has no synonyms, hence the NaN for s2.
        assert (
            abs(-sign[0] * s1.component_average(0, pca, features, source_synonyms=True) - (-2.7940486530122683)) < 1e-14
        )
        assert np.isnan(s2.component_average(0, pca, features, source_synonyms=True))
        assert (
            abs(-sign[1] * s1.component_average(1, pca, features, source_synonyms=True) - (-2.2309281091642896)) < 1e-14
        )
        assert np.isnan(s2.component_average(1, pca, features, source_synonyms=True))
        # Same without synonyms but with sentence_relative.
        # Each feature uses either lemmas or tokens (whereas above it was
        # all lemmas).
        assert (
            abs(
                -sign[0] * s1.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.34030374468910285
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[0] * s2.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.34030374468910285
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[1] * s1.component_average(1, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.51902095047064112
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[1] * s2.component_average(1, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.51902095047064112
            )
            < 1e-14
        )
        # Same with synonyms and sentence_relative.
        assert (
            abs(
                -sign[0] * s1.component_average(0, pca, features, source_synonyms=True, sentence_relative="mean")
                - 0.3390378360127122
            )
            < 1e-14
        )
        assert np.isnan(s2.component_average(0, pca, features, source_synonyms=True, sentence_relative="median"))
        assert (
            abs(
                -sign[1] * s1.component_average(1, pca, features, source_synonyms=True, sentence_relative="mean")
                - 0.58971575692206901
            )
            < 1e-14
        )
        assert np.isnan(s2.component_average(1, pca, features, source_synonyms=True, sentence_relative="mean"))
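
The `sign = np.sign(pca.components_[:, 0])` step above exists because the sign of each PCA component is arbitrary (it depends on the SVD solver), so the test anchors every component to the sign of its first loading before comparing against hand-computed values. A small standalone check of that normalization:

import numpy as np
from sklearn.decomposition import PCA

pca = PCA(n_components=2)
pca.fit(np.array([[2, 1], [1, -2]]))
sign = np.sign(pca.components_[:, 0])
# With each row rescaled by its sign, the first column is non-negative,
# whatever convention the solver happened to pick.
normalized = sign[:, np.newaxis] * pca.components_
assert (normalized[:, 0] >= 0).all()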
Example 6
def test_feature_average():
    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Test a non-transformed feature (AoA), computed on lemmas.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\n" "dog,2\nhound,3\nfrisbee,4\nchase,6\ncad,7\nother,8")
        assert s1.feature_average("aoa") == 30 / 6
        assert s2.feature_average("aoa") == 30 / 6
        assert s1.feature_average("aoa", source_synonyms=True) == np.mean([3, 6, 7])
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("aoa", source_synonyms=True))
        assert s1.feature_average("aoa", source_synonyms=False, sentence_relative="mean") == (-0.33333333333333304)
        assert s2.feature_average("aoa", source_synonyms=False, sentence_relative="mean") == (-0.33333333333333304)
        assert s1.feature_average("aoa", source_synonyms=True, sentence_relative="mean") == (-0.11111111111111072)
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("aoa", source_synonyms=True, sentence_relative="mean"))
    # Test a log-transformed feature (phonological density), computed on
    # tokens.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "0" + 24 * "\t" + "2\n"
                "hound" + 5 * "\t" + "0" + 24 * "\t" + "3\n"
                "frisbee" + 5 * "\t" + "0" + 24 * "\t" + "4\n"
                "chase" + 5 * "\t" + "0" + 24 * "\t" + "6\n"
                "cad" + 5 * "\t" + "0" + 24 * "\t" + "7\n"
                "other" + 5 * "\t" + "0" + 24 * "\t" + "8"
            )
        assert s1.feature_average("phonological_density") == np.log([2, 3, 4, 6, 7, 8]).mean()
        assert s2.feature_average("phonological_density") == np.log([2, 3, 4, 6, 7, 8]).mean()
        # Even though phonological density is computed on tokens, the synonyms
        # come from the lemmas.
        assert s1.feature_average("phonological_density", source_synonyms=True) == np.log([3, 6, 7]).mean()
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("phonological_density", source_synonyms=True))
        # Features for the 'sentence_relative' part are still taken from the
        # tokens, which leads us to drop 'others'.
        assert (
            s1.feature_average("phonological_density", source_synonyms=False, sentence_relative="mean")
            == 0.20029093819187427
        )
        assert (
            s2.feature_average("phonological_density", source_synonyms=False, sentence_relative="mean")
            == 0.20029093819187427
        )
        assert (
            s1.feature_average("phonological_density", source_synonyms=True, sentence_relative="mean")
            == 0.25674084015785814
        )
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("phonological_density", source_synonyms=True, sentence_relative="median"))
    # _synonyms_count(word=None) returns a list of words, some of which have
    # a _synonyms_count(word) of np.nan (a word with 0 synonyms yields
    # np.nan). So check that the synonyms_count feature average is not np.nan.
    assert np.isfinite(s1.feature_average("synonyms_count"))
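
The sentence_relative figures above are consistent with one reading of the flag: each candidate's feature is taken relative to the mean feature over the destination sentence, with the candidate itself standing at the substituted position. In the destination "Others is the hound hound", only 'other' (8) and the trailing 'hound' (3) carry AoA values besides the candidate, which lets us check the two AoA numbers by hand under that assumption:

import numpy as np

pool = [2, 3, 4, 6, 7, 8]  # all words in the overridden AoA file
print(np.mean([f - np.mean([8, 3, f]) for f in pool]))  # -0.333..., as asserted

synonyms = [3, 6, 7]  # AoA of 'hound', 'chase', 'cad', synonyms of 'dog'
print(np.mean([f - np.mean([8, 3, f]) for f in synonyms]))  # -0.111..., as asserted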
Example 7
def test_frequency_none():
    drop_caches()
    with settings.file_override("FREQUENCY"):
        with open(settings.FREQUENCY, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._frequency()) == {"dog", "cat"}
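
This test and the four that follow (clustering, betweenness, pagerank, degree) all exercise the same shape of feature: one backed by a pickled {word: value} dict, where the no-argument call returns the stored words. Combined with the NaN lookups seen in Example 1, a plausible hedged sketch of the shared pattern:

import pickle
import numpy as np

def pickle_backed_feature(path):
    # Sketch of the pattern the five tests suggest, not the actual code.
    with open(path, "rb") as f:
        data = pickle.load(f)

    def feature(word=None):
        if word is None:
            return data.keys()  # the word list the tests assert on
        return data.get(word, np.nan)  # unknown words come back as NaN

    return feature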
Example 8
def test_clustering_none():
    drop_caches()
    with settings.file_override("CLUSTERING"):
        with open(settings.CLUSTERING, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._clustering()) == {"dog", "cat"}
Example 9
def test_betweenness_none():
    drop_caches()
    with settings.file_override("BETWEENNESS"):
        with open(settings.BETWEENNESS, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._betweenness()) == {"dog", "cat"}
Example 10
def test_pagerank_none():
    drop_caches()
    with settings.file_override("PAGERANK"):
        with open(settings.PAGERANK, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._pagerank()) == {"dog", "cat"}
Example 11
def test_degree_none():
    drop_caches()
    with settings.file_override("DEGREE"):
        with open(settings.DEGREE, "wb") as f:
            pickle.dump({"dog": 2, "cat": 3}, f)
        assert set(SubstitutionFeaturesMixin._degree()) == {"dog", "cat"}
Example 12
def test_letters_count_none():
    drop_caches()
    with settings.file_override("TOKENS"):
        with open(settings.TOKENS, "wb") as f:
            pickle.dump({"these", "are", "tokens"}, f)
        assert SubstitutionFeaturesMixin._letters_count() == {"these", "are", "tokens"}
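
Unlike the dict-backed features, `_letters_count()` with no argument returns the pickled token set itself, and Example 4 (where component 2 yields -4 for "time") suggests the per-word value is simply the letter count. A brief sketch under those assumptions:

import pickle

def letters_count(tokens_path, word=None):
    # Sketch: the no-argument form returns the token set, otherwise
    # the value is just len(word) (len("time") == 4, cf. Example 4).
    if word is None:
        with open(tokens_path, "rb") as f:
            return pickle.load(f)
    return len(word)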