Exemple #1
0
import convokit
import numpy as np
import matplotlib.pyplot as plt

print("Loading corpus")
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

print("Computing hypergraph features")
hc = convokit.HyperConvo()
hc.fit_transform(corpus)

print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(n_components=7)
te.fit_transform(corpus)

ce = convokit.CommunityEmbedder(community_key="subreddit", method="tsne")
ce.fit_transform(corpus)

pts = corpus.get_meta()["communityEmbedder"]["pts"]
labels = corpus.get_meta()["communityEmbedder"]["labels"]

xs, ys = zip(*pts)
plt.scatter(xs, ys)
for i, txt in enumerate(labels):
    plt.annotate(txt, (xs[i], ys[i]))
plt.savefig("tsne")
plt.show()
Exemple #2
0
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

print("Computing hypergraph features")
hc = convokit.HyperConvo(prefix_len=10, include_root=False)
hc.fit_transform(corpus)

threads_feats = dict()
convos = corpus.iter_conversations()

for convo in convos:
    threads_feats.update(convo.meta['hyperconvo'])

feat_names = list(sorted(threads_feats[list(threads_feats.keys())[0]].keys()))

print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(return_components=True)
te.fit_transform(corpus)
X_threads = corpus.get_meta()["threadEmbedder"]["X"]
roots = corpus.get_meta()["threadEmbedder"]["roots"]
components = corpus.get_meta()["threadEmbedder"]["components"]

ce = convokit.CommunityEmbedder(community_key="subreddit")
ce.fit_transform(corpus)
X_communities = corpus.get_meta()["communityEmbedder"]["pts"]
subreddits = corpus.get_meta()["communityEmbedder"]["labels"]

print("TOP THREADS")
for d in range(7):
    print("dimension {}".format(d))
    print("- most-negative threads")
    ranked = list(sorted(zip(roots, X_threads), key=lambda x: x[1][d]))