def main(fpath, vocab_fpath, num_cluster, pca):
    """Cluster the matrix stored at *fpath* and summarize the clusters.

    Args:
        fpath: Path handed to ``read_sparse`` to load the matrix.
        vocab_fpath: Path handed to ``read_vocabuary`` to load the vocabulary.
        num_cluster: Cluster count; forwarded to ``run_n_kmeans`` (as both its
            second and third argument) and to ``_summarize``.
        pca: If truthy, used as ``n_components`` for a ``RandomizedPCA``
            projection applied before clustering. If falsy, the matrix is
            clustered as loaded.
    """
    song_matrix = read_sparse(fpath)
    vocab = read_vocabuary(vocab_fpath)

    if pca:
        # Fit and project on the same matrix, as a two-step fit/transform.
        reducer = RandomizedPCA(n_components=pca)
        features = reducer.fit(song_matrix).transform(song_matrix)
    else:
        features = song_matrix

    # NOTE(review): the meaning of the trailing literal 2 is defined by
    # run_n_kmeans, which is not visible here -- confirm against that helper.
    cluster_labels = run_n_kmeans(features, num_cluster, num_cluster, 2)

    # The summary always receives the original matrix, not the projection.
    _summarize(song_matrix, vocab, cluster_labels, num_cluster)
def main(fpath):
    """Plot the value returned by ``kmeans_betacv`` against the cluster count.

    Loads the matrix at *fpath* with ``read_sparse``, calls
    ``kmeans_betacv`` once for every cluster count in 2..20 plus 25 and 35,
    and shows an error-bar plot of the results. A ``(0, 0)`` point with zero
    error is always included as the first plotted point.

    Args:
        fpath: Path handed to ``read_sparse`` to load the matrix.
    """
    song_matrix = read_sparse(fpath)

    # list(...) is required: on Python 3 ``range`` is a lazy sequence and
    # cannot be concatenated to a list with ``+`` (it raises TypeError).
    # On Python 2 the extra list() is a harmless copy.
    to_plot_x = [0] + list(range(2, 21)) + [25, 35]
    to_plot_y = [0]
    errors = [0]

    # Skip the leading 0, which is only the fixed origin point.
    for x in to_plot_x[1:]:
        print(x)  # progress indicator: one line per cluster count
        beta_cv, err = kmeans_betacv(song_matrix, x, True)
        to_plot_y.append(beta_cv)
        errors.append(err)

    ax = plt.gca()
    ax.set_xticks(to_plot_x)

    plt.ylabel('IntraCluster/InterCluster Ratio')
    plt.xlabel('Number of clusters')
    plt.errorbar(to_plot_x, to_plot_y, yerr=errors, fmt='bo', markersize=8,
                 elinewidth=2)
    plt.show()
def main(fpath, hex):
    """Project the matrix at *fpath* onto two components and plot it.

    Prints the fitted model's ``explained_variance_ratio_`` and then shows
    either a hexbin density plot or a plain scatter plot of the projection.

    Args:
        fpath: Path handed to ``read_sparse`` to load the matrix.
        hex: If truthy, draw a log-binned hexbin plot with a colorbar;
            otherwise draw a scatter plot.
    """
    song_matrix = read_sparse(fpath)

    projector = RandomizedPCA(n_components=2)
    projected = projector.fit(song_matrix).transform(song_matrix)
    print(projector.explained_variance_ratio_)

    first_pc = projected[:, 0]
    second_pc = projected[:, 1]

    if not hex:
        plt.scatter(first_pc, second_pc)
        # NOTE(review): the scatter call sets no label, so this legend
        # presumably has no entries -- confirm it is intended.
        plt.legend()
    else:
        plt.hexbin(
            first_pc,
            second_pc,
            cmap=cm.get_cmap('bone_r', 100),
            bins='log',
            gridsize=100,
            mincnt=2,
        )
        plt.colorbar()

    # Result is unused; the call is kept so the figure state is touched
    # exactly as before.
    plt.gca()

    plt.ylabel('Second Principal Component')
    plt.xlabel('First Principal Component')
    plt.title('PCA of the LastFM dataset')
    plt.show()