def exp11():
    """
    Show correlation for each feature
    """
    poems = getPoemModel().poems
    scores = {}
    scores["affect"] = getAffectRatios()
    scores["cLength"] = getLogAverageCommentLength()
    scores["rating"] = getPoemScores()
    scores["typeToken"] = getCommentTypeTokenRatio(100)
    scores["numC"] = getNumberOfComments(True)  # use log

    result = {}
    for k1, v1 in scores.items():
        for feature in poems.values()[0].keys():
            cor, p = getCorrelation(poems, v1, feature)
            result.setdefault(feature, {})[k1] = (cor, p)

    # print one LaTeX table row per feature, sorted by the affect p-value
    for k1 in sorted(result.keys(), key=lambda x: result[x]["affect"][1]):
        print "\\\\", k1, \
            "& %0.2f & %0.4f" % result[k1]["affect"], \
            "& %0.2f & %0.4f" % result[k1]["typeToken"], \
            "& %0.2f & %0.4f" % result[k1]["cLength"], \
            "& %0.2f & %0.4f" % result[k1]["rating"], \
            "& %0.2f & %0.4f" % result[k1]["numC"]
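

# getCorrelation is not shown in this listing. A minimal sketch of what it is
# assumed to do, based on the call above (it returns a (coefficient, p-value)
# pair for one feature): pair each poem's feature value with its score and
# compute a Pearson correlation. The name getCorrelationSketch is hypothetical.
def getCorrelationSketch(poems, scores, feature):
    from scipy.stats import pearsonr
    xs, ys = [], []
    for name, featureSet in poems.items():
        if name not in scores:
            continue  # skip poems with no comment-derived score
        xs.append(featureSet[feature])
        ys.append(scores[name])
    return pearsonr(xs, ys)  # (Pearson r, two-sided p-value)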
def exp06():
    """
    Can we predict the log of the number of responses? Not much better.
    """
    m = getPoemModel()
    poems = m.poems
    scores = getNumberOfComments(True)  # use log
    makePlots(poems, scores, "log of number of comments", "../experiments/exp06.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp05():
    """
    Can we predict the number of responses? Not so well.
    """
    m = getPoemModel()
    poems = m.poems
    scores = getNumberOfComments()
    makePlots(poems, scores, "number of comments", "../experiments/exp05.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp04():
    """
    Can we predict poem rating? No, it seems that we cannot predict it well.
    """
    m = getPoemModel()
    poems = m.poems
    scores = getPoemScores()
    makePlots(poems, scores, "poem score", "../experiments/exp04.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp02():
    """
    If comment length is the driver of affect ratio, we should find better
    correlations with comment length. However, we cannot predict this very well.
    """
    m = getPoemModel()
    poems = m.poems
    scores = getAverageCommentLength()  # plot average comment length
    makePlots(poems, scores, "average comment length", "../experiments/exp02.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp00():
    """
    Identify correlation of features with affect ratios. We can predict
    this with about 30% reduction in error over the baseline.
    """
    m = getPoemModel()
    poems = m.poems
    scores = getAffectRatios()
    makePlots(poems, scores, "affect ratio", "../experiments/exp00.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp13():
    """
    Identify outlier poems in the proportion-of-stops vs. affect-ratio plot.
    """
    m = getPoemModel()
    poems = m.poems
    scores = getAffectRatios()
    for name, poem in poems.items():
        if scores.get(name, None) is None:
            continue
        if poem["proportionOfStops"] > 0.25 and scores[name] < 15:
            print "bottom right", name
        if poem["proportionOfStops"] < 0.12 and scores[name] > 25:
            print "left", name, scores[name]
def exp08():
    """
    If high affect ratio comments are less rich in their analysis/observation,
    does this imply that they have a lower type-token ratio? In practice, this
    cannot be predicted well. (~0% reduction in error over baseline.)
    """
    m = getPoemModel()
    poems = m.poems
    scores = getCommentTypeTokenRatio()
    makePlots(poems, scores, "comment type token ratio", "../experiments/exp08.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp09():
    """
    What is the difference between predicting the affect ratio and predicting
    the NRC ratio? Wouldn't the NRC ratio also capture emotion words? We cannot
    predict this as well. (Only ~3% reduction in error over baseline.)
    """
    m = getPoemModel()
    poems = m.poems
    scores = getNRCRatios()
    makePlots(poems, scores, "NRC ratio", "../experiments/exp09.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp03():
    """
    We can predict log of average comment length with 10% reduction in error
    over baseline (better than average comment length), but this is still worse
    than predicting the affect ratio. Why? Is there another descriptive feature
    of the comments that we could predict better?

    Are the comments with different affect ratios saying the same things
    differently or saying different things?
    """
    m = getPoemModel()
    poems = m.poems
    scores = getLogAverageCommentLength()
    makePlots(poems, scores, "log of average comment length", "../experiments/exp03.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
def exp081():
    """
    Repeat exp 8, but sample from all of the words. This should control for the
    fact that longer documents tend to have lower type-token ratios. We can 
    predict this with ~15% reduction in error.

    This suggests that "richness" of response can be categorized by the type-
    token ratio, though this is not so easily predicted as affect ratio. With
    experiment 3, we know that we can predict the log of comment length with
    ~10% accuracy.

    Taking note of the sign of the correlation of each variable, this gives us
    a definition of 'richness' that includes:
        - longer comments
        - higher type-token ratio
        - lower affect ratio
    """
    m = getPoemModel()
    poems = m.poems
    scores = getCommentTypeTokenRatio(100)  # sample words
    makePlots(poems, scores, "sampled type-token ratio", "../experiments/exp08.1.pdf")
    runPredictCV(poems, scores, DEFAULT_FEATURE_LIST)
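

# A minimal sketch of the sampling idea behind getCommentTypeTokenRatio(100):
# draw a fixed-size word sample from a poem's pooled comments so that the
# type-token ratio is comparable across poems with different comment lengths
# (longer texts otherwise tend toward lower ratios). The helper name and
# tokenization are assumptions, not the original implementation.
def sampledTypeTokenRatio(words, sampleSize=100):
    import random
    if len(words) < sampleSize:
        return None  # too few words to sample fairly
    sample = random.sample(words, sampleSize)
    return len(set(sample)) / float(sampleSize)  # distinct types per token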
            print "  no score for", filename
            continue 

        featureSet = poems.get(filename, None)
        if featureSet is None:
            # only include poem if features are extracted for it
            print "  no features for", filename
            continue

        scoreArr.append(score)
        featureArr.append(filterFeatures(featureSet, useFeatureList))

        # # print featureArr for a specific poem
        # print "poem", filename
        # for key, value in poems[filename].items():
        #     print "  ", key, ":", value

    vec = DictVectorizer()
    featureArr = vec.fit_transform(featureArr).toarray().tolist()
    featureNames = vec.get_feature_names()

    print "Performing regression using %d data points..." % len(scoreArr)
    tenFold(featureArr, scoreArr, featureNames)
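

# tenFold is not shown in this listing. Given the "% reduction in error over
# baseline" wording in the docstrings above, it is assumed to run 10-fold
# cross-validated regression and compare the model's error against a
# predict-the-training-mean baseline. This sketch is an illustration, not the
# original implementation.
def tenFoldSketch(featureArr, scoreArr):
    import numpy as np
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold

    X, y = np.array(featureArr), np.array(scoreArr)
    modelErr, baseErr = 0.0, 0.0
    for trainIdx, testIdx in KFold(n_splits=10, shuffle=True).split(X):
        model = LinearRegression().fit(X[trainIdx], y[trainIdx])
        modelErr += np.mean(np.abs(model.predict(X[testIdx]) - y[testIdx]))
        baseErr += np.mean(np.abs(y[trainIdx].mean() - y[testIdx]))
    print "reduction in error over baseline: %0.1f%%" % (
        100 * (1 - modelErr / baseErr))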


if __name__ == "__main__":
    poems = getPoemModel().poems
    # scores = getPoemScores()
    # scores = getAverageCommentLength()
    scores = getAffectRatios()
    runPredictCV(poems, scores)
    print "Plotting %d feature plots..." % len(useFeatures)
    for feature in next(iter(xDict.values())).keys():
        if feature not in useFeatures:
            continue
        plt.figure(num=None, figsize=(16, 12), dpi=80, facecolor='w', edgecolor='k')
        plotFeatureVsScore(xDict, yDict, feature)
        plt.savefig("zoom_%s.jpg" % feature, format="jpg")

def makeHistogram(affectHist, filename):
    plt.figure(num=None, figsize=(18, 4.5), dpi=80)
    cats = sorted(next(iter(affectHist.values())).keys())
    for hist in affectHist.values():
        plt.plot(range(len(cats)), 
            [hist.get(cat,0) for cat in cats],
            color="blue", alpha=0.2)

    plt.ylabel("prevalence")
    plt.xlabel("emotional category")
    plt.xticks(range(len(cats)), cats)
    plt.savefig(filename, format="pdf")


if __name__ == "__main__":
    from extract_poem_features import getPoemModel
    from extract_comment_features import getAffectRatios
    m = getPoemModel()
    poems = m.poems
    scores = getAffectRatios()
    makePlots2(poems, scores, "affect ratio", "../experiments/affect-ratio.pdf")