def exp11():
    """
    Show correlation for each feature
    """
    poems = getPoemModel().poems
    scores = {}
    scores["affect"] = getAffectRatios()
    scores["cLength"] = getLogAverageCommentLength()
    scores["rating"] = getPoemScores()
    scores["typeToken"] = getCommentTypeTokenRatio(100)
    scores["numC"] = getNumberOfComments(True)  # use log

    result = {}
    for k1, v1 in scores.items():
        for feature in poems.values()[0].keys():
            cor, p = getCorrelation(poems, v1, feature)
            if result.get(feature, None) is None:
                result[feature] = {k1: (cor, p)}
            else:
                result[feature][k1] = (cor, p)

    for k1 in sorted(result.keys(), key=lambda x: result[x]["affect"][1]):  # sort by affect
        print "\\\\", k1, "& %0.2f & %0.4f" % result[k1]["affect"], "& %0.2f & %0.4f" % result[k1][
            "typeToken"
        ], "& %0.2f & %0.4f" % result[k1]["cLength"], "& %0.2f & %0.4f" % result[k1][
            "rating"
        ], "& %0.2f & %0.4f" % result[
            k1
        ][
            "numC"
        ]
def exp08():
    """
    If high affect ratio comments are less rich in their analysis/observation,
    does this imply that they have a lower type-token ratio? Actually, this
    cannot be well predicted. (~0% reduction in accuracy)
    """
    # plot the relationship, then measure predictability via cross-validation
    poems = getPoemModel().poems
    scores = getCommentTypeTokenRatio()
    makePlots(poems, scores, "comment type token ratio", "../experiments/exp08.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp12():
    """
    Plot pairwise correlations among affect ratio, log average comment
    length, and (sampled) type-token ratio as a three-panel figure.
    """
    affect = getAffectRatios()
    cLength = getLogAverageCommentLength()
    typeToken = getCommentTypeTokenRatio(100)

    plt.figure(num=None, figsize=(18, 4.5), dpi=80, facecolor="w", edgecolor="k")

    # one (x, y, x-label, y-label) tuple per subplot panel
    panels = [
        (affect, cLength, "affect ratio", "log average comment length"),
        (affect, typeToken, "affect ratio", "type-token ratio"),
        (cLength, typeToken, "log average comment length", "type-token ratio"),
    ]
    for idx, (xs, ys, xLabel, yLabel) in enumerate(panels):
        plt.subplot(1, 3, idx + 1)
        checkCorrelation(xs, ys, xLabel, yLabel)

    plt.savefig("../experiments/exp12.pdf", format="pdf")
def exp081():
    """
    Repeat exp 8, but sample from all of the words. This should control for the
    fact that longer documents tend to have lower type-token ratios. We can
    predict this with ~15% reduction in error.

    This suggests that "richness" of response can be categorized by the type-
    token ratio, though this is not so easily predicted as affect ratio. With
    experiment 3, we know that we can predict the log of comment length with
    ~10% accuracy.

    Taking note of the sign of the correlation of each variable, this gives us
    a definition of 'richness' that includes:
        - longer comments
        - higher type-token ratio
        - lower affect ratio
    """
    poems = getPoemModel().poems
    # sample 100 words so longer comment sets don't get deflated ratios
    scores = getCommentTypeTokenRatio(100)
    makePlots(poems, scores, "sampled type-token ratio", "../experiments/exp08.1.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)