import convokit
import numpy as np
import matplotlib.pyplot as plt

# --- Load data -------------------------------------------------------------
print("Loading corpus")
# convokit.download() supplies the filename handed to the Corpus constructor.
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

# --- Hypergraph features ---------------------------------------------------
print("Computing hypergraph features")
hc = convokit.HyperConvo()
hc.fit_transform(corpus)

# --- Low-dimensional embeddings --------------------------------------------
print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(n_components=7)
te.fit_transform(corpus)

# Communities are keyed by the "subreddit" field and embedded with "tsne".
ce = convokit.CommunityEmbedder(community_key="subreddit", method="tsne")
ce.fit_transform(corpus)

# The community embedder's output is read back from the corpus-level metadata.
pts = corpus.get_meta()["communityEmbedder"]["pts"]
labels = corpus.get_meta()["communityEmbedder"]["labels"]

# --- Plot ------------------------------------------------------------------
# Split the points into parallel x / y sequences
# (assumes each point has exactly two coordinates -- TODO confirm).
xs, ys = zip(*pts)
plt.scatter(xs, ys)

# Annotate every point with the label found at the same index.
for idx, label in enumerate(labels):
    point = (xs[idx], ys[idx])
    plt.annotate(label, point)

# Save the figure under the name "tsne", then display it.
plt.savefig("tsne")
plt.show()
# convokit.download() supplies the filename handed to the Corpus constructor.
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

# --- Hypergraph features ---------------------------------------------------
print("Computing hypergraph features")
hc = convokit.HyperConvo(prefix_len=10, include_root=False)
hc.fit_transform(corpus)

# Merge every conversation's 'hyperconvo' metadata dict into one mapping
# (presumably thread id -> feature dict; verify against HyperConvo's output).
threads_feats = {}
convos = corpus.iter_conversations()
for convo in convos:
    threads_feats.update(convo.meta['hyperconvo'])

# Feature names are the sorted keys of the first merged entry
# (assumes every entry exposes the same feature keys -- TODO confirm).
first_thread_id = list(threads_feats)[0]
feat_names = sorted(threads_feats[first_thread_id].keys())

# --- Low-dimensional embeddings --------------------------------------------
print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(return_components=True)
te.fit_transform(corpus)

# The thread embedder's results are read back from corpus-level metadata.
X_threads = corpus.get_meta()["threadEmbedder"]["X"]
roots = corpus.get_meta()["threadEmbedder"]["roots"]
components = corpus.get_meta()["threadEmbedder"]["components"]

ce = convokit.CommunityEmbedder(community_key="subreddit")
ce.fit_transform(corpus)

# Likewise for the community embedder's points and their labels.
X_communities = corpus.get_meta()["communityEmbedder"]["pts"]
subreddits = corpus.get_meta()["communityEmbedder"]["labels"]

# --- Report ----------------------------------------------------------------
print("TOP THREADS")
# NOTE(review): 7 is hard-coded; presumably it matches the number of
# components produced by ThreadEmbedder -- confirm.
for d in range(7):
    print("dimension {}".format(d))
    print("- most-negative threads")
    # Pair each root with its embedding row and order the pairs ascending by
    # the d-th coordinate, so the most negative values come first.
    ranked = sorted(zip(roots, X_threads), key=lambda pair: pair[1][d])