Ejemplo n.º 1
0
    def test_entropy(self):
        """Check entropy() matches it_entropy on a valid distribution and
        rejects invalid ones (negative values, mass != 1, empty input) by
        raising AssertionError."""
        probs = np.array([0.1, 0.5, 0.01, 0.07, 0.02, 0.3, 0, 0, 0], dtype="d")

        # assertEquals is a deprecated alias (removed in Python 3.12).
        self.assertEqual(entropy.entropy(probs), it_entropy(probs))

        # BUG FIX: the original try/self.fail()/except AssertionError blocks
        # could never fail — self.fail() itself raises AssertionError, which
        # the except clause silently swallowed. assertRaises does this
        # correctly: it fails iff the call does NOT raise AssertionError.
        bad_inputs = [
            np.array([-1], dtype="d"),        # negative probability
            np.array([0.1, 0.8], dtype="d"),  # does not sum to 1
            np.array([2, -1], dtype="d"),     # out-of-range values
            np.array([], dtype="d"),          # empty distribution
        ]
        for bad in bad_inputs:
            with self.assertRaises(AssertionError):
                entropy.entropy(bad)
Ejemplo n.º 2
0
def _summarize(data, vocabulary, labels_column, num_cluster):
    """Print per-cluster summary statistics and plot cluster similarities.

    Prints: songs per cluster, top tags per cluster (by raw frequency and by
    p(c|t)), per-cluster term entropies, and the correlation between each
    cluster's term entropy and its mean kernel similarity to other clusters.
    Also shows a heatmap of pairwise cluster similarities.

    Parameters
    ----------
    data : 2-D array-like
        Song-by-tag matrix; rows are songs (assumed — TODO confirm against caller).
    vocabulary : sequence
        Maps a tag index to its display name.
    labels_column : 1-D array-like of int
        Cluster assignment for each song (row of `data`).
    num_cluster : int
        Number of clusters.
    """
    # Hoisted from the bottom of the function (imports belong at the top).
    from scipy.stats import pearsonr

    # Basic stats
    print("Number of songs per cluster")
    counter = Counter(labels_column)
    print(counter)
    print()

    prob_Ct, prob_Tc, prob_T = compute_probs(data, num_cluster, labels_column, counter)
    all_tags = range(len(prob_T))

    print("Top tags per cluster")
    for clust in xrange(num_cluster):  # NOTE(review): xrange is Python 2 only
        print(clust, "tags with max_freq_in_cluster")
        songs_in_cluster = np.where(labels_column == clust)[0]
        for tag in top_10_frequency(data[songs_in_cluster]):
            print("\t", vocabulary[tag])
        print()

        print(clust, "tags with max_prob_p(c|t)")

        # PEP 8 (E731): a named def instead of assigning a lambda.
        def sort_func(to_sort):
            return prob_Ct[to_sort][clust]

        for tag in sorted(all_tags, key=sort_func, reverse=True)[:10]:
            print("\t", vocabulary[tag])
        print()
    print()

    print("Term entropies for each cluster")
    term_entropies = []
    for clust in xrange(num_cluster):
        h = entropy.entropy(prob_Tc[clust])
        term_entropies.append(h)
        print(clust, h)
    print()

    # Pairwise cluster similarity from the p(t|c) distributions.
    X = np.zeros((num_cluster, len(all_tags)))
    for clust in xrange(num_cluster):
        for tag in all_tags:
            X[clust][tag] = prob_Tc[clust][tag]

    distances = pairwise_kernels(X)
    # Zero the diagonal so a cluster's self-similarity does not dominate
    # the plot or the per-cluster means below.
    for i in xrange(num_cluster):
        distances[i, i] = 0

    plt.imshow(distances, cmap="bone_r", interpolation="nearest")
    ax = plt.gca()
    plt.xticks(np.arange(0, num_cluster))
    plt.yticks(np.arange(0, num_cluster))
    plt.colorbar()
    plt.title("Confusion Matrix for Cluster Similarities")
    plt.ylabel("ClusterID")
    plt.xlabel("ClusterID")
    # Annotate the diagonal with each cluster's term entropy.
    for i in xrange(num_cluster):
        ax.annotate("%.3f" % term_entropies[i], xy=(i, i), horizontalalignment="center", verticalalignment="center")
    plt.show()

    print("Mean difference")
    to_corr_1 = []
    to_corr_2 = []
    for clust in xrange(num_cluster):
        to_corr_1.append(term_entropies[clust])
        to_corr_2.append(np.mean(distances[clust]))
        print(clust, term_entropies[clust], np.mean(distances[clust]))

    print("R2 ", pearsonr(to_corr_1, to_corr_2))