def test_average():
    drop_caches()
    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)
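    # With start=2 the destination aligns with the source from 'others'
    # onwards, and position=3 points at the substituted word in the
    # destination: both substitutions replace the source's sixth word
    # ('dogs', then 'frisbee') with 'hound'.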

    # Our test feature: with no argument it returns the set of words it
    # covers, with a word it returns that word's value (np.nan if unknown).
    values = {"dog": 2, "hound": 3, "frisbee": 4, "chase": 6, "cad": 7, "other": 8}

    def feature(word=None):
        if word is None:
            return set(values.keys())
        else:
            return values.get(word, np.nan)

    # Global average and average over synonyms (computed on lemmas) are
    # correctly retrieved.
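    # 30 / 6 is the global mean ((2 + 3 + 4 + 6 + 7 + 8) / 6) over the six
    # words the feature knows; 3, 6 and 7 are the values of 'hound', 'chase'
    # and 'cad', the synonyms of 'dog' (lemma of the substituted 'dogs')
    # present in this feature set.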
    assert s1._static_average(feature) == 30 / 6
    assert s1._average(feature, False) == 30 / 6
    assert s2._static_average(feature) == 30 / 6
    assert s2._average(feature, False) == 30 / 6
    assert s1._average(feature, True) == np.mean([3, 6, 7])
    # 'frisbee' has no synonyms.
    assert np.isnan(s2._average(feature, True))

    # Averaging still works when many of the values are NaN.
    drop_caches()
    values = {"dog": 2, "frisbee": 4, "chase": np.nan, "cad": 7, "other": 8}
    assert s1._average(feature, True) == 7
    # 'frisbee' has no synonyms.
    assert np.isnan(s2._average(feature, True))


def test_features(normal_substitution):
    drop_caches()
    # A shortcut.
    s = normal_substitution

    # Check we defined the right substitution.
    assert s.tokens == ("containing", "other")
    assert s.lemmas == ("contain", "other")

    # An unknown feature raises an error.
    with pytest.raises(ValueError):
        s.features("unknown_feature")
    with pytest.raises(ValueError):
        s.features("unknown_feature", sentence_relative="mean")

    # Syllable, phoneme, and letter counts, as well as densities, are right,
    # and computed on tokens.
    assert s.features("syllables_count") == (3, 2)
    assert s.features("phonemes_count") == (8, 3)
    assert s.features("letters_count") == (10, 5)
    assert np.isnan(s.features("phonological_density")[0])
    assert s.features("phonological_density")[1] == np.log(7)
    assert np.isnan(s.features("orthographic_density")[0])
    assert s.features("orthographic_density")[1] == np.log(5)
    # Same with features computed relative to sentence.
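    # A relative value is the word's feature minus the mean (or median) of
    # that feature over the known words of its sentence: e.g. 'containing'
    # has 3 syllables and its source sentence's five known words total 7,
    # giving 3 - 7 / 5.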
    assert s.features("syllables_count", sentence_relative="mean") == (3 - 7 / 5, 2 - 6 / 5)
    assert s.features("phonemes_count", sentence_relative="mean") == (8 - 18 / 5, 3 - 13 / 5)
    assert s.features("letters_count", sentence_relative="mean") == (10 - 21 / 5, 5 - 16 / 5)
    assert np.isnan(s.features("phonological_density", sentence_relative="median")[0])
    assert s.features("phonological_density", sentence_relative="median")[1] == np.log(7) - np.median(
        np.log([31, 24, 9, 7, 28])
    )
    assert np.isnan(s.features("orthographic_density", sentence_relative="mean")[0])
    assert (
        s.features("orthographic_density", sentence_relative="mean")[1]
        == np.log(5) - np.log([17, 14, 11, 5, 20]).mean()
    )

    # Synonym counts and age-of-acquisition are right, and computed on lemmas.
    # The rest of the features need computed files, and are tested separately.
    assert s.features("synonyms_count") == (np.log(3), np.log(0.5))
    assert s.features("aoa") == (7.88, 5.33)
    # Same with features computed relative to sentence.
    assert s.features("synonyms_count", sentence_relative="median") == (
        np.log(3) - np.median(np.log([1, 1, 3, 2.4444444444444446])),
        np.log(0.5) - np.median(np.log([1, 1, 0.5, 2.4444444444444446])),
    )
    assert s.features("aoa", sentence_relative="mean") == (7.88 - 6.033333333333334, 5.33 - 5.183333333333334)

    # Unknown words are ignored, including in the rest of the sentence.
    q1 = Quote(string="makakiki is the goal")
    q2 = Quote(string="makakiki is the moukakaka")
    s = Substitution(source=q1, destination=q2, start=0, position=3)
    assert s.features("syllables_count")[0] == 1
    # np.nan != np.nan so we can't `assert s.features(...) == (1, np.nan)`
    assert np.isnan(s.features("syllables_count")[1])
    assert s.features("syllables_count", sentence_relative="mean")[0] == 1 - 3 / 3
    assert np.isnan(s.features("syllables_count", sentence_relative="median")[1])


def test_substitution_features(normal_substitution):
    drop_caches()
    # A shortcut.
    s = normal_substitution

    # Check we defined the right substitution.
    assert s.tokens == ("containing", "other")
    assert s.lemmas == ("contain", "other")

    # An unknown feature raises an error.
    with pytest.raises(ValueError):
        s._substitution_features("unknown_feature")

    # Syllable, phoneme, and letter counts, as well as densities, are right,
    # and computed on tokens.
    assert s._substitution_features("syllables_count") == (3, 2)
    assert s._substitution_features("phonemes_count") == (8, 3)
    assert s._substitution_features("letters_count") == (10, 5)
    assert np.isnan(s._substitution_features("phonological_density")[0])
    assert s._substitution_features("phonological_density")[1] == np.log(7)
    assert np.isnan(s._substitution_features("orthographic_density")[0])
    assert s._substitution_features("orthographic_density")[1] == np.log(5)

    # Synonym counts and age-of-acquisition are right, and computed on lemmas.
    # The rest of the features need computed files, and are only tested
    # through 'features()' directly, so as not to make the other
    # file-dependent tests heavier to read.
    assert s._substitution_features("synonyms_count") == (np.log(3), np.log(0.5))
    assert s._substitution_features("aoa") == (7.88, 5.33)

    # Unknown words are ignored, including in the rest of the sentence.
    q1 = Quote(string="makakiki is the goal")
    q2 = Quote(string="makakiki is the moukakaka")
    s = Substitution(source=q1, destination=q2, start=0, position=3)
    assert s._substitution_features("syllables_count")[0] == 1
    # np.nan != np.nan so we can't `assert s._substitution_features(...) == (1, np.nan)`
    assert np.isnan(s._substitution_features("syllables_count")[1])


def test_component_average():
    drop_caches()
    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Create a test PCA that will use features we later override.
    features = ("aoa", "phonological_density")
    pca = PCA(n_components=2)
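    # (PCA is assumed to be scikit-learn's sklearn.decomposition.PCA,
    # imported with the other test dependencies; n_components=2 matches the
    # two features above.)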

    # Trying this with a PCA fitted on the wrong number of features (three
    # here, versus the two in `features`) fails.
    pca.fit(np.array([[1, 1, 0], [0, 1, 0], [0, 1, 1]]))
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features)
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=True)
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
    with pytest.raises(AssertionError):
        s1.component_average(0, pca, features, source_synonyms=True, sentence_relative="mean")
    # Trying this with unknown features fails.
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(0, pca, ("letters_count", "unknown_feature", "aoa"))
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=True)
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=False, sentence_relative="mean"
        )
    assert "Unknown feature" in str(excinfo.value)
    with pytest.raises(ValueError) as excinfo:
        s1.component_average(
            0, pca, ("letters_count", "unknown_feature", "aoa"), source_synonyms=True, sentence_relative="mean"
        )
    assert "Unknown feature" in str(excinfo.value)

    # Now with features we override to test manual values.
    drop_caches()
    pca.fit(np.array([[2, 1], [1, -2]]))
    sign = np.sign(pca.components_[:, 0])
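    # The sign of a fitted PCA component is mathematically arbitrary, so we
    # normalise by the sign of each component's first loading before
    # comparing to the hand-computed values below.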
    with settings.file_override("AOA", "CLEARPOND"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\n" "dog,2\nhound,3\nfrisbee,4\nchase,6\ncad,7\nother,8")
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "0" + 24 * "\t" + "2\n"
                "hound" + 5 * "\t" + "0" + 24 * "\t" + "3\n"
                "frisbee" + 5 * "\t" + "0" + 24 * "\t" + "4\n"
                "screen" + 5 * "\t" + "0" + 24 * "\t" + "5\n"
                "chase" + 5 * "\t" + "0" + 24 * "\t" + "6\n"
                "other" + 5 * "\t" + "0" + 24 * "\t" + "8\n"
                "others" + 5 * "\t" + "0" + 24 * "\t" + "9"
            )
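        # Each fixture line above is tab-separated: the last of its 30
        # columns carries the phonological neighbourhood density these tests
        # read (the zeroed sixth column presumably holds its orthographic
        # counterpart).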

        # We recover the hand-computed values.
        assert abs(-sign[0] * s1.component_average(0, pca, features) - (-2.7921497899976822)) < 1e-14
        assert abs(-sign[0] * s2.component_average(0, pca, features) - (-2.7921497899976822)) < 1e-14
        assert abs(-sign[1] * s1.component_average(1, pca, features) - (-2.3369703188414315)) < 1e-14
        assert abs(-sign[1] * s2.component_average(1, pca, features) - (-2.3369703188414315)) < 1e-14
        # Same with synonyms. Computed on synonyms of 'dog' (lemma of
        # 'dogs'). 'frisbee' has no synonyms, hence the NaN for s2.
        assert (
            abs(-sign[0] * s1.component_average(0, pca, features, source_synonyms=True) - (-2.7940486530122683)) < 1e-14
        )
        assert np.isnan(s2.component_average(0, pca, features, source_synonyms=True))
        assert (
            abs(-sign[1] * s1.component_average(1, pca, features, source_synonyms=True) - (-2.2309281091642896)) < 1e-14
        )
        assert np.isnan(s2.component_average(1, pca, features, source_synonyms=True))
        # Same without synonyms but with sentence_relative.
        # Each feature uses either lemmas or tokens (whereas above it was
        # all lemmas).
        assert (
            abs(
                -sign[0] * s1.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.34030374468910285
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[0] * s2.component_average(0, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.34030374468910285
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[1] * s1.component_average(1, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.51902095047064112
            )
            < 1e-14
        )
        assert (
            abs(
                -sign[1] * s2.component_average(1, pca, features, source_synonyms=False, sentence_relative="mean")
                - 0.51902095047064112
            )
            < 1e-14
        )
        # Same with synonyms and sentence_relative.
        assert (
            abs(
                -sign[0] * s1.component_average(0, pca, features, source_synonyms=True, sentence_relative="mean")
                - 0.3390378360127122
            )
            < 1e-14
        )
        assert np.isnan(s2.component_average(0, pca, features, source_synonyms=True, sentence_relative="median"))
        assert (
            abs(
                -sign[1] * s1.component_average(1, pca, features, source_synonyms=True, sentence_relative="mean")
                - 0.58971575692206901
            )
            < 1e-14
        )
        assert np.isnan(s2.component_average(1, pca, features, source_synonyms=True, sentence_relative="mean"))


def test_feature_average():
    # Two substitutions.
    q1a = Quote(string="Chase it others is the dogs hound")
    q1b = Quote(string="Others is the hound hound")
    s1 = Substitution(source=q1a, destination=q1b, start=2, position=3)
    q2a = Quote(string="Chase it others is the frisbee hound")
    q2b = q1b
    s2 = Substitution(source=q2a, destination=q2b, start=2, position=3)

    # Test a non-transformed feature (AoA), computed on lemmas.
    drop_caches()
    with settings.file_override("AOA"):
        with open(settings.AOA, "w") as f:
            f.write("Word,Rating.Mean\n" "dog,2\nhound,3\nfrisbee,4\nchase,6\ncad,7\nother,8")
        assert s1.feature_average("aoa") == 30 / 6
        assert s2.feature_average("aoa") == 30 / 6
        assert s1.feature_average("aoa", source_synonyms=True) == np.mean([3, 6, 7])
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("aoa", source_synonyms=True))
        assert s1.feature_average("aoa", source_synonyms=False, sentence_relative="mean") == (-0.33333333333333304)
        assert s2.feature_average("aoa", source_synonyms=False, sentence_relative="mean") == (-0.33333333333333304)
        assert s1.feature_average("aoa", source_synonyms=True, sentence_relative="mean") == (-0.11111111111111072)
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("aoa", source_synonyms=True, sentence_relative="mean"))
    # Test a log-transformed feature (phonological density), computed on
    # tokens.
    drop_caches()
    with settings.file_override("CLEARPOND"):
        with open(settings.CLEARPOND, "w") as f:
            f.write(
                "dog" + 5 * "\t" + "0" + 24 * "\t" + "2\n"
                "hound" + 5 * "\t" + "0" + 24 * "\t" + "3\n"
                "frisbee" + 5 * "\t" + "0" + 24 * "\t" + "4\n"
                "chase" + 5 * "\t" + "0" + 24 * "\t" + "6\n"
                "cad" + 5 * "\t" + "0" + 24 * "\t" + "7\n"
                "other" + 5 * "\t" + "0" + 24 * "\t" + "8"
            )
        assert s1.feature_average("phonological_density") == np.log([2, 3, 4, 6, 7, 8]).mean()
        assert s2.feature_average("phonological_density") == np.log([2, 3, 4, 6, 7, 8]).mean()
        # Even though phonological density is computed on tokens, the synonyms
        # come from the lemmas.
        assert s1.feature_average("phonological_density", source_synonyms=True) == np.log([3, 6, 7]).mean()
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("phonological_density", source_synonyms=True))
        # Features for the 'sentence_relative' part are still taken from the
        # tokens, which leads us to drop 'others'.
        assert (
            s1.feature_average("phonological_density", source_synonyms=False, sentence_relative="mean")
            == 0.20029093819187427
        )
        assert (
            s2.feature_average("phonological_density", source_synonyms=False, sentence_relative="mean")
            == 0.20029093819187427
        )
        assert (
            s1.feature_average("phonological_density", source_synonyms=True, sentence_relative="mean")
            == 0.25674084015785814
        )
        # 'frisbee' has no synonyms.
        assert np.isnan(s2.feature_average("phonological_density", source_synonyms=True, sentence_relative="median"))
    # _synonyms_count(word=None) returns a list of words, some of which have
    # _synonyms_count(word) == np.nan (zero synonyms is returned as np.nan).
    # So check that the synonyms_count feature average is nonetheless finite.
    assert np.isfinite(s1.feature_average("synonyms_count"))