Code example #1
    def _read(self, corpus_split):
        corpus_split = corpus_split.split('_')

        corpus_name = corpus_split[0]
        self.split = corpus_split[1] if len(corpus_split) > 1 else None

        corpus = Corpus(filename=download(corpus_name))
        conversations = corpus.iter_conversations()
        if self.sample:
            conversations = itertools.islice(conversations, self.sample)

        for conv in conversations:
            meta = conv.meta

            if (meta.get('split') != self.split) and (meta.get(
                    'annotation_year', 2018) != 2018):
                continue

            label = str(meta[self.label_field])
            # turns = [u.text for u in conv.iter_utterances() if u.text.strip() and (not u.meta.get('is_section_header'))]
            turns = [
                u.meta.parsed for u in conv.iter_utterances()
                if not u.meta.get('is_section_header')
            ]

            end = len(turns) - 1 if self.forecast else None
            turns = turns[-self.max_turns:end]

            if turns and all(turns):
                inst = self.text_to_instance(turns, label)
                if inst:
                    yield inst
Code example #2
 def setUp(self) -> None:
     self.corpus = Corpus(download('subreddit-hey'))
     self.utt_df = self.corpus.get_utterances_dataframe()
     self.convo_df = self.corpus.get_conversations_dataframe()
     self.speaker_df = self.corpus.get_speakers_dataframe()
     self.new_corpus = Corpus.from_pandas(self.utt_df, self.speaker_df,
                                          self.convo_df)
Code example #3
def process_corpus(corpus_name,
                   to_download=TO_DOWNLOAD,
                   min_wc_source=MIN_WC_SOURCE,
                   max_wc_source=MAX_WC_SOURCE,
                   min_wc_target=MIN_WC_TARGET,
                   max_wc_target=MAX_WC_TARGET,
                   source_filter=SOURCE_FILTER,
                   target_filter=TARGET_FILTER,
                   text_cols=TEXT_COLS,
                   data_dir=DATA_DIR):

    if to_download:
        corpus = Corpus(download(corpus_name, data_dir=data_dir))
    else:
        corpus = Corpus(os.path.join(data_dir, corpus_name))
    corpus_name = corpus.get_meta()['name']
    print(corpus_name)
    corpus.print_summary_stats()
    print('processing', corpus.get_meta()['name'])
    corpus.load_info('utterance', ['parsed'])

    corpus = text_prep_pipe().transform(corpus)

    source_df, target_df = get_train_subset(corpus, min_wc_source,
                                            max_wc_source, min_wc_target,
                                            max_wc_target, source_filter,
                                            target_filter, text_cols)
    source_df.to_csv(os.path.join(data_dir, corpus_name + '.source.tsv'),
                     sep='\t')
    target_df.to_csv(os.path.join(data_dir, corpus_name + '.target.tsv'),
                     sep='\t')
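A minimal call sketch continuing the excerpt above. The corpus name is an illustrative assumption, and the keyword defaults (TO_DOWNLOAD, DATA_DIR, the word-count bounds and filters) must exist at module level as in the original project:

# Hypothetical call; relies on the module-level defaults referenced in the
# signature above, which are not shown in this excerpt.
process_corpus('subreddit-Cornell')
# writes <DATA_DIR>/<corpus name>.source.tsv and <DATA_DIR>/<corpus name>.target.tsv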
Code example #4
File: preprocessing.py Project: naripok/transformer
def load_conversations(corpus_name, max_samples, eval_percent=0.1):
    logging.info('Loading data.')

    def split_data(inputs, outputs, eval_percent):
        eval_index = int(len(inputs) * (1 - eval_percent))
        return (inputs[:eval_index], outputs[:eval_index], inputs[eval_index:],
                outputs[eval_index:])

    corpus = Corpus(filename=download(corpus_name))

    deleted_filter = re.compile(r'^(\[deleted]|\[removed])$')

    inputs, outputs = [], []
    for paths in corpus.iter_conversations():
        for path in paths.get_root_to_leaf_paths():
            for i in range(len(path) - 1):

                # skip pairs involving deleted/removed comments; guard i == 0 so
                # negative indexing does not wrap around to the end of the path
                if deleted_filter.match(path[i].text) \
                        or (i > 0 and deleted_filter.match(path[i - 1].text)) \
                        or deleted_filter.match(path[i + 1].text):
                    continue

                inputs.append(path[i].text)
                outputs.append(path[i + 1].text)

                if len(inputs) >= max_samples:
                    return split_data(inputs, outputs, eval_percent)

    logging.info('Done!')
    return split_data(inputs, outputs, eval_percent)
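A minimal usage sketch for load_conversations() above; the corpus name and max_samples value are illustrative assumptions, not taken from the project:

# Hypothetical driver: builds (input, reply) text pairs and a train/eval split.
train_in, train_out, eval_in, eval_out = load_conversations(
    'subreddit-Cornell', max_samples=50000, eval_percent=0.1)

print(len(train_in), 'training pairs /', len(eval_in), 'evaluation pairs')
print('input :', train_in[0])
print('output:', train_out[0])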
Code example #5
def main():
    # initialize globals
    readRegionalisms()
    addStopWords()

    #to select subreddits to process, add their name to the CurrentSubredditSet file
    toProcess = readSubredditSet()
    for corpusname in toProcess:
        print("doing, "+corpusname)
        download("subreddit-"+corpusname, data_dir=datadirectory+"/DataDownloads")

        #create the directory
        if not os.path.exists(datadirectory+"/ProcessedData/"+corpusname):
            os.makedirs(datadirectory+"/ProcessedData/"+corpusname)

        if os.path.exists(datadirectory+"/DataDownloads/subreddit-"+corpusname+".zip") and not os.path.exists(datadirectory+"/DataDownloads/"+corpusname+".corpus.zip"):
            os.rename(datadirectory+"/DataDownloads/subreddit-"+corpusname+".zip", datadirectory+"/DataDownloads/"+corpusname+".corpus.zip")

        print(datadirectory+"/DataDownloads/"+corpusname+".corpus.zip")
        with ZipFile(datadirectory+"/DataDownloads/"+corpusname+".corpus.zip", mode="r") as corpuszip:
            if not os.path.exists(datadirectory+"/ProcessedData/"+corpusname+"/utterances.jsonl"):
                corpuszip.extract("utterances.jsonl", path=datadirectory+"/ProcessedData/"+corpusname+"/")


        # make the unfiltered text files
        old_data_exists = False
        for file in getTextFileNames(corpusname, filtered=False):
            if os.path.exists(file):
                old_data_exists = True

        if not old_data_exists:
            convertToText(corpusname)
        else:
            print(corpusname + " has already been converted to unfiltered text files, moving on")

        # remove stopwords
        old_data_exists = False
        for file in getTextFileNames(corpusname):
            if os.path.exists(file):
                old_data_exists = True

        if not old_data_exists:
            removeStopwordsFromConverted(corpusname)
        else:
            print(corpusname + " has already had its text files filtered")
Code example #6
 def __init__(self, subReddit=""):
     self._startIndex = 0
     self._endIndex = 5 * (10**5)
     self._startDate = 2007
     self._endDate = 2018
     self._target = subReddit
     if self._target != "":
         Corpus(filename=download(self._target),
                utterance_start_index=self._startIndex,
                utterance_end_index=self._endIndex)
Code example #7
def main():
    downloaded_corpus = download("supreme-corpus")
    results_dir = os.path.abspath("../../results")

    if len(sys.argv) <= 1:
        print(
            "Usage: dumpModalKwic --minyear=yyyy --maxyear=yyyy --limit=n --kwic=1/0"
        )
        exit(1)
    full_cmd_arguments = sys.argv
    argument_list = full_cmd_arguments[1:]
    short_options = "y:n:x:l:"
    year = 1955

    long_options = ["year=", "maxyear=", "minyear=", "limit=", "kwic="]
    maxyear = minyear = utterance_end_index = None
    kwic = False
    arguments, values = getopt.getopt(argument_list, short_options,
                                      long_options)
    for current_argument, current_value in arguments:
        # parse --kwic=1/0 explicitly (bool() on any non-empty string is True)
        # and keep "-x" for kwic only, so it no longer collides with --maxyear
        if current_argument in ("-x", "--kwic"):
            kwic = current_value == "1"
        elif current_argument == "--maxyear":
            maxyear = int(current_value)
        elif current_argument in ("-n", "--minyear"):
            minyear = int(current_value)
        elif current_argument in ("-y", "--year"):
            year = int(current_value)
        elif current_argument in ("-l", "--limit"):
            utterance_end_index = int(current_value)
    minyear = year if ((minyear is None) and (year is not None)) else minyear
    maxyear = year if ((maxyear is None) and (year is not None)) else maxyear

    result_file = results_dir + "/kwic" + str(minyear) + "-" + str(
        maxyear) + ".csv"

    print("Initializing corpus model")
    corpus = SentenceCorpus(maxyear,
                            minyear,
                            dirname=downloaded_corpus,
                            utterance_end_index=utterance_end_index)
    print("Corpus model initialized")

    if not os.path.exists(results_dir + "/utterances" + str(minyear) + "-" +
                          str(maxyear) + ".jsonl"):
        print("Missing modal utterance json. Creating...")
        corpus.dump_modal_sentences()
    if kwic:
        print("Kwic mode selected. Creating csv...")
        corpus.dump_kwic(result_file)
Code example #8
 def test_load_dump_politeness(self):
     corpus = Corpus(download('wikipedia-politeness-corpus'))
     corpus.dump('wikipedia-politeness-corpus')
Code example #9
# The plots answer these questions:
# - Do users on the whole coordinate more to admins or nonadmins?
# - Do admins coordinate to other people more than nonadmins do?

from convokit import Utterance, Corpus, Coordination, download

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os

# load corpus; split users by whether they are an admin
# this means that if a user has spoken in the corpus as both an admin and
#   a non-admin, then we will split this user into two users, one for each of
#   these roles
path = download("wiki-corpus")
corpus = Corpus(filename=os.path.join(path, "wiki-corpus"),
                subdivide_users_by=["is_admin"])

# create coordination object
coord = Coordination(corpus)


# helper function to plot two coordination scores against each other as a chart,
#   on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores,
               b_scores,
               a_description,
               b_description,
Code example #10
File: tsne.py Project: calebchiam/cs6742-fork
import convokit
import numpy as np
import matplotlib.pyplot as plt

print("Loading corpus")
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

print("Computing hypergraph features")
hc = convokit.HyperConvo()
hc.fit_transform(corpus)

print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(n_components=7)
te.fit_transform(corpus)

ce = convokit.CommunityEmbedder(community_key="subreddit", method="tsne")
ce.fit_transform(corpus)

pts = corpus.get_meta()["communityEmbedder"]["pts"]
labels = corpus.get_meta()["communityEmbedder"]["labels"]

xs, ys = zip(*pts)
plt.scatter(xs, ys)
for i, txt in enumerate(labels):
    plt.annotate(txt, (xs[i], ys[i]))
plt.savefig("tsne")
plt.show()
Code example #11
reddit.subreddits.recommended([reddit.subreddit('askreddit'),reddit.subreddit('lifeprotips')])
top_6_subs = list(reddit.subreddits.popular(limit=6))

reddit.subreddits.recommended(top_6_subs)    

widgets = reddit.subreddit('askouija').widgets
for widget in widgets.sidebar:
    if isinstance(widget, praw.models.CommunityList):
        print(widget)
    

#####################

from convokit import Corpus, download

smalsubs = Corpus(filename=download('reddit-corpus-small')) # will not download twice if it already exists

ut_ids = smalsubs.get_utterance_ids()

len(ut_ids)

uid = ut_ids[0]

c_ids = smalsubs.get_conversation_ids()
cid = c_ids[0]

convo = smalsubs.get_conversation(cid)
convo.get_utterance_ids()
convo.get_utterance(uid)

# keep only top-level utterances (i.e., utterances that are not replies)
top_level = [uid for uid in ut_ids if smalsubs.get_utterance(uid).reply_to is None]
Code example #12
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 23:20:11 2020

@author: kach
"""

from convokit import Corpus, download
corpus = Corpus(filename=download("conversations-gone-awry-corpus"))

corpus.print_summary_stats()

reviews = open("data/reviews.txt", "w", encoding="utf-8")
label = open("data/labels.txt", "w")

#i=0
for utt in corpus.iter_utterances():
    #i+=1
    txt = str(utt.text).replace('\n', ' ')
    reviews.write(txt + '\n')
    if utt.meta['comment_has_personal_attack']:
        l = '1'
    else:
        l = '0'
    label.write(l + '\n')
    #if i>10:
    #    break

reviews.close()
label.close()
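The two files written above are line-aligned: line i of reviews.txt is an utterance and line i of labels.txt is its 0/1 personal-attack label. A small sketch of reading them back into pairs, assuming the same paths:

# Re-read the line-aligned files into (text, label) pairs.
with open("data/reviews.txt", encoding="utf-8") as rf, open("data/labels.txt") as lf:
    pairs = [(text.rstrip('\n'), int(lab)) for text, lab in zip(rf, lf)]

n_attacks = sum(lab for _, lab in pairs)
print(f"{n_attacks} of {len(pairs)} utterances are labeled as personal attacks")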
Code example #13
# The plots answer these questions:
# - Do lawyers coordinate more to justices than the other way around?
# - Do lawyers coordinate more to unfavorable or favorable justices?
# - Do unfavorable justices coordinate to lawyers more than favorable justices,
#     or vice versa?

from convokit import Utterance, Corpus, Coordination, download

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# load corpus; split users by case id and split the justices by whether they are
#     favorable to the current presenting side
# this treats the same person across two different cases as two different users
corpus = Corpus(filename=download("supreme-corpus"))
split = ["case", "justice-is-favorable"]

# create coordination object
coord = Coordination()
coord.fit(corpus)

# helper function to plot two coordination scores against each other as a chart,
#   on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)
Code example #14
 def test_load_dump_subreddit(self):
     corpus = Corpus(download('subreddit-hey'))
     corpus.dump('subreddit')
Code example #15
import os
import sys
import json

from convokit import download
'''
Some code to update datasets with the new parse format.
'''

VERBOSITY = 1000
ROOT_DIR = '/kitchen/clean-corpora'
INCREMENT_VERSION = False
PARSE = True

if __name__ == '__main__':

    print('downloading corpus')
    corpus_name = sys.argv[1]
    filename = download(corpus_name, data_dir=ROOT_DIR)

    print('purging parses')
    with open(os.path.join(filename, 'index.json')) as f:
        index = json.load(f)
    try:
        del index['utterances-index']['parsed']
    except KeyError:
        pass
    if INCREMENT_VERSION:
        index['version'] += 1
    with open(os.path.join(filename, 'index.json'), 'w') as f:
        json.dump(index, f)

    if os.path.exists(os.path.join(filename, 'utterances.json')):
        with open(os.path.join(filename, 'utterances.json')) as f:
Code example #16
# - Do lawyers coordinate more to justices than the other way around?
# - Do lawyers coordinate more to unfavorable or favorable justices?
# - Do unfavorable justices coordinate to lawyers more than favorable justices,
#     or vice versa?

from convokit import Utterance, Corpus, Coordination, download

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os

# load corpus; split users by case id and split the justices by whether they are
#     favorable to the current presenting side
# this treats the same person across two different cases as two different users
path = download("supreme-corpus")
corpus = Corpus(filename=os.path.join(path, "supreme-corpus"),
    subdivide_users_by=["case", "justice-is-favorable"])

# create coordination object
coord = Coordination(corpus)

# helper function to plot two coordination scores against each other as a chart,
#   on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(b_scores)
Code example #17
#   using the methods in the asking too much paper (http://www.cs.cornell.edu/~cristian/Asking_too_much.html) to extract question types.
#   (since there is a seed provided, multiple executions of this script will always produce the same clusters)
# This version uses precomputed motifs for speed.

import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

#Initialize QuestionTypology class

num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = download('tennis-corpus')
motifs_dir = download('tennis-motifs')

#Load the corpus
corpus = Corpus(filename=os.path.join(data_dir, 'tennis-corpus'))

#Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus,
                                    data_dir,
                                    dataset_name='tennis',
                                    motifs_dir=motifs_dir,
                                    num_dims=25,
                                    num_clusters=num_clusters,
                                    verbose=False,
                                    random_seed=125)
Code example #18
# This example uses the supreme court corpus to compute some simple information:
# - Which justices coordinate the most to others?
# - Which justices are coordinated to the most?

import convokit

# set up corpus
corpus = convokit.Corpus(filename=convokit.download("supreme-corpus"))

# compute coordination scores on this corpus
coord = convokit.Coordination()
coord.fit(corpus)

# get coordination scores
coord.transform(corpus)

# get set of all justices
justices = corpus.iter_users(lambda user: user.meta["is-justice"])
# get set of all users
everyone = corpus.iter_users()

# compute coordination from each justice to everyone
print("Justices, ranked by how much they coordinate to others:")
justices_to_everyone = coord.score(corpus, justices, everyone)
for justice, score in sorted(justices_to_everyone.averages_by_user().items(),
    key=lambda x: x[1], reverse=True):
    print(justice.name, round(score, 5))
print()

# compute coordination from everyone to each justice
print("Justices, ranked by how much others coordinate to them:")
Code example #19
import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

#Initialize QuestionTypology class

num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = os.path.join(pkg_resources.resource_filename("convokit", ""),
                        'downloads')

#Load the corpus
corpus = Corpus(filename=download("wiki-corpus"))

#Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus,
                                    data_dir,
                                    dataset_name='wiki',
                                    num_dims=25,
                                    num_clusters=num_clusters,
                                    verbose=False,
                                    random_seed=15)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
# its keys are the indices of the clusters (here 0-7). The values are dictionaries with the following keys:
# "motifs": the motifs, as a list of tuples of the motif terms
# "motif_dists": the corresponding distances of each motif from the centroid of the cluster this motif is in
# "fragments": the answer fragments, as a list of tuples of answer terms
Code example #20
import sys
import convokit

corpus = convokit.Corpus(filename=convokit.download("subreddit-Cornell"))
print(corpus.meta)
threads = corpus.utterance_threads(prefix_len=10, include_root=False)


def disp(thread, root, indent=0):
    print(" " * indent + thread[root].user.name + ": " +
          thread[root].text.replace("\n", " "))
    children = [k for k, v in thread.items() if v.reply_to == root]
    for child in children:
        disp(thread, child, indent=indent + 4)


if len(sys.argv) > 1:
    for root in sys.argv[1:]:
        print("--- {} ---".format(root))
        disp(threads[root], root)
        print()
else:
    while True:
        print("Enter thread root ID (e.g. {}): ".format(next(iter(threads))),
              end="")
        root = input()
        print("--- {} ---".format(root))
        disp(threads[root], root)
        print()
Code example #21
# This example extracts question types from the Wikipedia Moderators Dataset explained here (http://www.cs.cornell.edu/~cristian//Politeness.html)
#   using the methods in the asking too much paper (http://www.cs.cornell.edu/~cristian/Asking_too_much.html) to extract question types.
#   (since there is a seed provided, multiple executions of this script will always produce the same clusters)

import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

#Initialize QuestionTypology class

num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = download('wiki-corpus')

#Load the corpus
corpus = Corpus(filename=os.path.join(data_dir, 'wiki-corpus'))

#Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='wiki', num_dims=25,
  num_clusters=num_clusters, verbose=False, random_seed=15)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
# its keys are the indices of the clusters (here 0-7). The values are dictionaries with the following keys:
# "motifs": the motifs, as a list of tuples of the motif terms
# "motif_dists": the corresponding distances of each motif from the centroid of the cluster this motif is in
# "fragments": the answer fragments, as a list of tuples of answer terms
# "fragment_dists": the corresponding distances of each fragment from the centroid of the cluster this
# fragment is in
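A minimal inspection sketch based on the types_to_data structure described in the comments above (a cluster index mapped to a dict with "motifs", "motif_dists", "fragments" and "fragment_dists"); the exact contents depend on the corpus and random seed:

# Print a small sample of motifs and answer fragments for each question type.
for cluster_id, data in sorted(questionTypology.types_to_data.items()):
    print(f"Question type {cluster_id}")
    print("  sample motifs:   ", data["motifs"][:3])
    print("  sample fragments:", data["fragments"][:3])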
Code example #22
 def test_load_dump_switchboard(self):
     corpus = Corpus(download("switchboard-corpus"))
     corpus.dump('switchboard-corpus')
Code example #23
 def test_load_wikiconv(self):
     corpus = Corpus(download('wikiconv-2004'))
Code example #24
import csv
import itertools
import sys
import os

from convokit import download
"""
Dumps a verb CSV containing the top 20 verbs for each modal, per year.
"""

downloaded_corpus = download("supreme-corpus")
results_dir = os.path.abspath("../../results")


def get_file_list():
    linearr = []
    csv.field_size_limit(sys.maxsize)
    for fileyear in range(1950, 2020, 10):
        csvfile = results_dir + "/kwic" + str(fileyear) + "-" + str(
            fileyear + 10) + ".csv"
        with open(csvfile, 'r') as data:
            for line in csv.DictReader(data):
                linearr.append(line)
    return linearr


def get_verbs(modal_list):
    print("Assembling   modal data from files")
    linearr = get_file_list()
    filtered = {}
    baseline = {mod: {} for mod in modal_list}
Code example #25
import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

#Initialize QuestionTypology class

num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
# motifs_dir is the specific path within data_dir that contains the precomputed motifs
data_dir = os.path.join(pkg_resources.resource_filename("convokit", ""),
                        'downloads')
motifs_dir = download('parliament-motifs')

#Load the corpus
corpus = Corpus(filename=download("parliament-corpus"))

#Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus,
                                    data_dir,
                                    dataset_name='parliament',
                                    motifs_dir=motifs_dir,
                                    num_dims=25,
                                    num_clusters=num_clusters,
                                    verbose=False,
                                    random_seed=164)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
Code example #26
from convokit import Corpus, download
import sys
import random

if __name__ == '__main__':

    corpus_name = sys.argv[1]
    output_filename = sys.argv[2]
    corpus = Corpus(filename=download(corpus_name))

    char_list = ['<sos>', '<eos>', '<pad>', '<unk>']
    sequences = []
    for convo in corpus.iter_conversations():
        title = convo.meta['title']
        text = convo.get_utterance(
            convo.get_chronological_utterance_list()[0].conversation_id).text
        if text == '' or text == '[deleted]' or text == '[removed]':
            continue
        else:
            post = title + '\t' + text
            post = post.replace('\n', '').lower()
            sequence = ''
            for character in post:
                if character in char_list:
                    sequence += str(char_list.index(character)) + ' '
                else:
                    char_list.append(character)
                    sequence += str(len(char_list) - 1) + ' '
            sequences.append(sequence[:-1])

    random.shuffle(sequences)
Code example #27
# This example extracts politeness strategies from the Conversations Gone Awry dataset,
#   one of the steps in the Conversations Gone Awry paper (http://www.cs.cornell.edu/~cristian/Conversations_gone_awry.html).
#   For code reproducing the full results of the paper, see the example notebook in the
#   `conversations-gone-awry` example subdirectory.

import pandas as pd
from convokit import PolitenessStrategies, Corpus, download

print("Loading awry corpus...")
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

# extract the politeness strategies.
# Note: politeness strategies are a hand-engineered feature set, so no fitting is needed.
ps = PolitenessStrategies(verbose=100)
print("Extracting politeness strategies...")
corpus = ps.transform(corpus)

values = []
idx = []
for utterance in corpus.iter_utterances():
    values.append(utterance.meta["politeness_strategies"])
    idx.append(utterance.id)
pd.DataFrame(values, index=idx).to_csv("awry_strategy_df_v2.csv")
print("Done, results written to awry_strategy_df_v2.csv")
Code example #28
import convokit

#set up corpus
corpus = convokit.Corpus(filename=convokit.download("supreme-corpus"),
                         subdivide_users_by=["roots"])

#create count object
count = convokit.CountUtterance(corpus)

#report the result
count.print_report()
Code example #29
#
# The plots answer these questions: 
# - Do users on the whole coordinate more to admins or nonadmins?
# - Do admins coordinate to other people more than nonadmins do?

from convokit import Utterance, Corpus, Coordination, download

import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# load corpus; split users by whether they are an admin
# this means that if a user has spoken in the corpus as both an admin and
#   a non-admin, then we will split this user into two users, one for each of
#   these roles
corpus = Corpus(filename=download("wiki-corpus"))
split = ["is_admin"]

# create coordination object
coord = Coordination()
coord.fit(corpus)

# helper function to plot two coordination scores against each other as a chart,
#   on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)
Code example #30
import convokit
from sklearn.neighbors import NearestNeighbors

print("Loading corpus")
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

print("Computing hypergraph features")
hc = convokit.HyperConvo(prefix_len=10, include_root=False)
hc.fit_transform(corpus)

print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(n_components=7)
te.fit_transform(corpus)

ce = convokit.CommunityEmbedder(community_key="subreddit")
ce.fit_transform(corpus)

X_communities = corpus.get_meta()["communityEmbedder"]["pts"]
subreddits = corpus.get_meta()["communityEmbedder"]["labels"]

knn = NearestNeighbors(n_neighbors=10)
knn.fit(X_communities)

print("Nearest neighbors for each subreddit:")
for x, subreddit in zip(X_communities, subreddits):
    print(subreddit, "->", end=" ")
    for idx in knn.kneighbors([x], return_distance=False)[0][1:]:
        print(subreddits[idx], end=" ")
    print()
Code example #31
def main() -> None:

    args = parser.parse_args()

    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(args.corpus))

    if args.corpus == 'conversations-gone-awry-cmv-corpus':
        DatasetClass = ConversationsGoneAwryDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    elif args.corpus == 'winning-args-corpus':
        corpus = filter_winning_arguments_corpus(corpus)
        DatasetClass = WinningArgumentsDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    else:
        raise ValueError('Corpus {} not currently supported'.format(
            args.corpus))

    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = DatasetClass(corpus,
                                 train_conversations,
                                 tokenizer,
                                 max_len=args.max_conversation_len,
                                 max_tokenization_len=args.utterance_max)
    val_dataset = DatasetClass(corpus,
                               val_conversations,
                               tokenizer,
                               max_len=args.max_conversation_len,
                               max_tokenization_len=args.utterance_max)
    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, val_dataset.get_indices_by_len())
    train_loader = DataLoader(train_dataset,
                              batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset,
                            batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, num_labels=n_classes)
    model.to(device)

    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, criterion, optimizer, scheduler, scaler,
              device)
        validate(val_loader, model, criterion, device)
Code example #32
 def test_load_dump_tennis(self):
     corpus = Corpus(download('tennis-corpus'))
     corpus.dump('tennis-corpus')
Code example #33
def main() -> None:

    args = parser.parse_args()

    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(CORPUS))
    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = CoarseDiscourseDataset(
        corpus,
        train_conversations,
        tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset = CoarseDiscourseDataset(
        corpus,
        val_conversations,
        tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset.label_encoder = train_dataset.label_encoder
    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size * 4, 1, val_dataset.get_indices_by_len())
    train_loader = DataLoader(train_dataset,
                              batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset,
                            batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)

    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name, num_labels=len(train_dataset.label_encoder.classes_))
    model.to(device)

    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, optimizer, scheduler, scaler, device,
              tokenizer.sep_token_id)
        validate(val_loader, model, device, tokenizer.sep_token_id)
Code example #34
def get_corpus(source,
               split_sentences=False,
               punct=True,
               to_ascii=True,
               data_path=DATA_DIR,
               min_len=3,
               max_len=15,
               test_size=0.1,
               text_field='text',
               subsample_rows=False,
               save=True):
    if source.endswith('.csv'):
        csv = True
        name = source[:-4]
    else:
        csv = False
        name = source

    # compose name
    corpus_name = f'{name}{"_split" if split_sentences else ""}' \
                  f'{"_punct" if punct else ""}' \
                  f'{"_ascii" if to_ascii else ""}' \
                  f'{f"_sub{subsample_rows}" if subsample_rows else ""}' \
                  f'_{test_size}_{min_len}_{max_len}'
    corpus_train = os.path.join(data_path, f'{corpus_name}_train.csv')
    corpus_test = os.path.join(data_path, f'{corpus_name}_test.csv')

    # Load from cache
    if test_size == 1 and os.path.isfile(corpus_test):
        df = pd.read_csv(corpus_test)
        print('Loading cached data...')
        print(len(df))
        return df, f'{corpus_name}_test.csv'

    elif os.path.isfile(corpus_train) and os.path.isfile(corpus_test):
        df_train, df_val = pd.read_csv(corpus_train), pd.read_csv(corpus_test)
        print('Loading cached data...')
        print(len(df_train))
        print(len(df_val))
        return df_train, df_val, f'{corpus_name}_train.csv', f'{corpus_name}_test.csv'

    # load csv or download
    if csv:
        print('Loading dataset from csv...')
        df = pd.read_csv(os.path.join(data_path, source))
    else:
        print('Downloading dataset...')
        corp = Corpus(filename=download(name))
        df = corp.get_utterances_dataframe()
    # get only text
    df = df.rename(columns={text_field: "utterance"})[["utterance"]]

    # remove any tags
    df['utterance'] = df['utterance'].str.replace(r'<.*>', ' ', regex=True)

    # subsample
    if subsample_rows:
        df = df.sample(subsample_rows, random_state=0)

    # split sentences
    if split_sentences:
        sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        df["utterance"] = df["utterance"]
        df["utterance"] = df["utterance"].apply(sentence_detector.tokenize)
        df = flatten_sentences(df)

    print('Cleaning')
    cln_fn = lambda x: clean(
        x,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=to_ascii,          # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=False,              # replace all URLs with a special token
        no_emails=False,            # replace all email addresses with a special token
        no_phone_numbers=False,     # replace all phone numbers with a special token
        no_numbers=False,           # replace all numbers with a special token
        no_digits=False,            # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,             # remove punctuation
        lang="en"                   # set to 'de' for German special handling
    )

    # clean
    df["utterance"] = df["utterance"].apply(cln_fn)
    if not punct:
        # escape the punctuation characters so ']' and '\' stay literal in the class
        df["utterance"] = df["utterance"].str.replace(
            r"[{}]".format(re.escape(punctuation)), ' ', regex=True)

    # tokenize
    sen_by_words = df["utterance"].apply(word_tokenize)
    word_counts = sen_by_words.apply(len)
    sen_by_words = sen_by_words[(word_counts <= max_len)
                                & (word_counts >= min_len)]
    df = sen_by_words.to_frame()

    # no split
    if test_size == 1:
        if not save:
            return df
        print(len(df))
        df.to_csv(corpus_test, index=False)
        return df, f'{corpus_name}_test.csv'

    # split
    df_train, df_val = train_test_split(df,
                                        test_size=test_size,
                                        random_state=0)
    print(len(df_train))
    print(len(df_val))
    if not save:
        return df_train, df_val
    df_train.to_csv(corpus_train, index=False)
    df_val.to_csv(corpus_test, index=False)
    return df_train, df_val, f'{corpus_name}_train.csv', f'{corpus_name}_test.csv'
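A call sketch illustrating the two return shapes of get_corpus() above: a frame/filename pair when test_size == 1, and a train/validation 4-tuple otherwise. The corpus name is an illustrative assumption and the module-level DATA_DIR must point at a writable directory:

# Hypothetical calls continuing the excerpt above.
# Regular split: returns both frames plus the cached csv file names.
df_train, df_val, train_csv, test_csv = get_corpus('movie-corpus', test_size=0.1)

# test_size == 1 skips the split: returns a single frame and its csv file name.
df_all, all_csv = get_corpus('movie-corpus', test_size=1)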
Code example #35
import os
from ast import literal_eval as make_tuple
from collections import defaultdict
from scipy import sparse
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import Normalizer
from spacy.en import English
from spacy.symbols import *
from spacy.tokens.doc import Doc

from convokit import Corpus, QuestionTypology, download

#Initialize QuestionTypology class pretrained on Parliament Dataset

num_clusters = 8

data_dir = download('parliament-corpus')
motifs_dir = download('parliament-motifs')

corpus = Corpus(filename=os.path.join(data_dir, 'parliament-corpus'))

questionTypology = QuestionTypology(corpus,
                                    data_dir,
                                    dataset_name='parliament',
                                    motifs_dir=motifs_dir,
                                    num_dims=25,
                                    num_clusters=num_clusters,
                                    verbose=False,
                                    random_seed=164)

#Determine type of input question