def genMostLikelyMovies(self, num_topics=15, top_n=10):
    """Print the highest-weighted movies for each topic.

    For each reported topic column of the matrix returned by
    ``self.calcPhi()``, prints a ``Topic: <n>`` header, then one
    ``<name>: <weight>`` line per selected movie in ascending order of
    weight (the strongest movie is printed last), then an empty line.

    Args:
        num_topics: Maximum number of topic columns to report. Defaults to
            15, the previously hard-coded value. It is capped at the number
            of columns phi actually has, so a model with fewer topics no
            longer fails with an IndexError.
        top_n: How many movies to list per topic. Defaults to 10, the
            previously hard-coded value. Values <= 0 list no movies.

    Returns:
        None. All output goes to stdout.
    """
    movies = parseMovies()
    phi = self.calcPhi()

    # NOTE(review): assumes phi exposes a numpy-style ``shape``; the sibling
    # print_most_likely_movies relies on the same attribute.
    topic_count = min(num_topics, phi.shape[1])

    for topic in xrange(topic_count):
        # argsort is ascending, so the largest weights sit at the tail.
        ranked = np.argsort(phi[:, topic])
        # ranked[-0:] would select *every* movie, hence the explicit guard.
        best = ranked[-top_n:] if top_n > 0 else ranked[:0]

        # Single-argument parenthesised print emits the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Topic: %d" % topic)
        print("\n".join("%s: %.4f" % (movies[movieid][0], phi[movieid, topic])
                        for movieid in best))
        print("")
def print_most_likely_movies(fname, collection_name):
    """Print the ten highest-weighted movies for every topic.

    The weights come from ``topic_distribution(fname, collection_name)``;
    one block is printed per column of that matrix. Each block is a
    ``Topic: <n>`` header, up to ten ``<name>: <weight>`` lines in ascending
    order of weight, and a trailing empty line.

    Args:
        fname: Passed through unchanged to ``topic_distribution``.
        collection_name: Passed through unchanged to ``topic_distribution``.

    Returns:
        None. All output goes to stdout.
    """
    topic_dist = topic_distribution(fname, collection_name)
    movies = parseMovies()

    topic = 0
    n_topics = topic_dist.shape[1]
    while topic < n_topics:
        # Ascending sort: the last ten entries are the strongest movies.
        ranking = np.argsort(topic_dist[:, topic])

        # Single-argument parenthesised print emits the same text under the
        # Python 2 print statement and the Python 3 print function.
        print("Topic: %d" % topic)
        rows = []
        for movie_id in ranking[-10:]:
            weight = topic_dist[movie_id, topic]
            rows.append("%s: %.4f" % (movies[movie_id][0], weight))
        print("\n".join(rows))
        print("")

        topic += 1
def genMostLikelyTopic(self):
    """Group movies by the topic in which they have the largest weight.

    Iterates over ``self.info["movies"]`` movie ids, looks up each movie's
    row in the matrix returned by ``self.calcPhi()`` and picks the column
    that sorts last under ``np.argsort``.

    Returns:
        A ``defaultdict(list)`` mapping a topic index to a list of
        ``(name, weight)`` tuples, where ``name`` is the first field of the
        movie's record from ``parseMovies()`` and ``weight`` is the movie's
        value in that topic column.
    """
    phi = self.calcPhi()
    movies = parseMovies()
    grouped = defaultdict(list)

    for movie_id in xrange(self.info["movies"]):
        # Last position of the ascending argsort == index of the max weight.
        order = np.argsort(phi[movie_id, :])
        best_topic = order[-1]
        entry = (movies[movie_id][0], phi[movie_id, best_topic])
        grouped[best_topic].append(entry)

    return grouped
def visualizePCA(self, samples=20):
    """Scatter-plot a random sample of movies on two PCA axes.

    Runs ``PCA`` on the matrix from ``self.calcPhi()``, draws ``samples``
    random movie indices, and plots columns 0 and 1 of ``pca.Y`` for those
    indices (presumably the first two principal components -- confirm
    against the PCA implementation in use). Each point is annotated with the
    first field of the movie's record, stripped of non-ASCII characters.

    NOTE(review): an identical ``visualizePCA`` definition follows this one
    in the file; if both live in the same scope, the later one replaces this.

    Args:
        samples: Number of movies to draw and plot. Defaults to 20.

    Returns:
        The figure object created by ``plt.figure()``.
    """
    phi = self.calcPhi()
    movies = parseMovies()
    pca = PCA(phi)

    chosen = sample(xrange(len(movies)), samples)
    projected = pca.Y
    xs = projected[chosen, 0]
    ys = projected[chosen, 1]

    figure = plt.figure()
    figure.set_size_inches(10, 8)
    axes = figure.add_subplot(111)
    axes.scatter(xs, ys)

    for movie_idx, x, y in izip(chosen, xs, ys):
        raw_name = movies[movie_idx][0]
        # decode/encode with 'ignore' drops any non-ASCII bytes in the label.
        label = raw_name.decode('ascii', 'ignore').encode('ascii', 'ignore')
        axes.annotate(label, (x, y))

    return figure
def visualizePCA(self, samples=20):
    """Plot randomly chosen movies in the plane of two PCA columns.

    The matrix from ``self.calcPhi()`` is passed to ``PCA``; ``samples``
    movie indices are drawn at random and their values in columns 0 and 1 of
    ``pca.Y`` become the x/y coordinates (presumably the leading principal
    components -- confirm against the PCA implementation in use). Points are
    labelled with the ASCII-only form of the first field of each movie's
    record.

    NOTE(review): an identical ``visualizePCA`` definition precedes this one
    in the file; if both live in the same scope, this later one is the
    definition that takes effect.

    Args:
        samples: Number of movies to draw and plot. Defaults to 20.

    Returns:
        The figure object created by ``plt.figure()``.
    """
    phi = self.calcPhi()
    movies = parseMovies()
    pca = PCA(phi)
    picked = sample(xrange(len(movies)), samples)
    x_coords, y_coords = pca.Y[picked, 0], pca.Y[picked, 1]

    fig = plt.figure()
    fig.set_size_inches(10, 8)
    plot_area = fig.add_subplot(111)
    plot_area.scatter(x_coords, y_coords)

    for movie_id, px, py in izip(picked, x_coords, y_coords):
        # Round-trip through ASCII with 'ignore' to discard non-ASCII bytes.
        ascii_name = movies[movie_id][0].decode('ascii', 'ignore')
        plot_area.annotate(ascii_name.encode('ascii', 'ignore'), (px, py))

    return fig