def _read(self, corpus_split):
    corpus_split = corpus_split.split('_')
    corpus_name = corpus_split[0]
    self.split = corpus_split[1] if len(corpus_split) > 1 else None
    corpus = Corpus(filename=download(corpus_name))
    conversations = corpus.iter_conversations()
    if self.sample:
        conversations = itertools.islice(conversations, self.sample)
    for conv in conversations:
        meta = conv.meta
        if (meta.get('split') != self.split) and (meta.get('annotation_year', 2018) != 2018):
            continue
        label = str(meta[self.label_field])
        # turns = [u.text for u in conv.iter_utterances() if u.text.strip() and (not u.meta.get('is_section_header'))]
        turns = [
            u.meta.parsed for u in conv.iter_utterances()
            if not u.meta.get('is_section_header')
        ]
        end = len(turns) - 1 if self.forecast else None
        turns = turns[-self.max_turns:end]
        if turns and all(turns):
            inst = self.text_to_instance(turns, label)
            if inst:
                yield inst
def setUp(self) -> None:
    self.corpus = Corpus(download('subreddit-hey'))
    self.utt_df = self.corpus.get_utterances_dataframe()
    self.convo_df = self.corpus.get_conversations_dataframe()
    self.speaker_df = self.corpus.get_speakers_dataframe()
    self.new_corpus = Corpus.from_pandas(self.utt_df, self.speaker_df, self.convo_df)
def process_corpus(corpus_name, to_download=TO_DOWNLOAD,
                   min_wc_source=MIN_WC_SOURCE, max_wc_source=MAX_WC_SOURCE,
                   min_wc_target=MIN_WC_TARGET, max_wc_target=MAX_WC_TARGET,
                   source_filter=SOURCE_FILTER, target_filter=TARGET_FILTER,
                   text_cols=TEXT_COLS, data_dir=DATA_DIR):
    if to_download:
        corpus = Corpus(download(corpus_name, data_dir=data_dir))
    else:
        corpus = Corpus(os.path.join(data_dir, corpus_name))
    corpus_name = corpus.get_meta()['name']
    print(corpus_name)
    corpus.print_summary_stats()

    print('processing', corpus.get_meta()['name'])
    corpus.load_info('utterance', ['parsed'])
    corpus = text_prep_pipe().transform(corpus)
    source_df, target_df = get_train_subset(corpus, min_wc_source, max_wc_source,
                                            min_wc_target, max_wc_target,
                                            source_filter, target_filter, text_cols)
    source_df.to_csv(os.path.join(data_dir, corpus_name + '.source.tsv'), sep='\t')
    target_df.to_csv(os.path.join(data_dir, corpus_name + '.target.tsv'), sep='\t')
def load_conversations(corpus_name, max_samples, eval_percent=0.1):
    logging.info('Loading data.')

    def split_data(inputs, outputs, eval_percent):
        eval_index = int(len(inputs) * (1 - eval_percent))
        return (inputs[:eval_index], outputs[:eval_index],
                inputs[eval_index:], outputs[eval_index:])

    corpus = Corpus(filename=download(corpus_name))
    deleted_filter = re.compile(r'^(\[deleted]|\[removed])$')
    inputs, outputs = [], []
    for conv in corpus.iter_conversations():
        for path in conv.get_root_to_leaf_paths():
            for i in range(len(path) - 1):
                if deleted_filter.match(path[i].text) \
                        or deleted_filter.match(path[i - 1].text) \
                        or deleted_filter.match(path[i + 1].text):
                    continue
                inputs.append(path[i].text)
                outputs.append(path[i + 1].text)
                if len(inputs) >= max_samples:
                    return split_data(inputs, outputs, eval_percent)
    logging.info('Done!')
    return split_data(inputs, outputs, eval_percent)
def main():
    # initialize globals
    readRegionalisms()
    addStopWords()

    # to select subreddits to process, add their name to the CurrentSubredditSet file
    toProcess = readSubredditSet()

    for corpusname in toProcess:
        print("doing " + corpusname)
        download("subreddit-" + corpusname, data_dir=datadirectory + "/DataDownloads")

        # create the directory
        if not os.path.exists(datadirectory + "/ProcessedData/" + corpusname):
            os.makedirs(datadirectory + "/ProcessedData/" + corpusname)

        if os.path.exists(datadirectory + "/DataDownloads/subreddit-" + corpusname + ".zip") \
                and not os.path.exists(datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip"):
            os.rename(datadirectory + "/DataDownloads/subreddit-" + corpusname + ".zip",
                      datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip")

        print(datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip")
        with ZipFile(datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip", mode="r") as corpuszip:
            if not os.path.exists(datadirectory + "/ProcessedData/" + corpusname + "/utterances.jsonl"):
                corpuszip.extract("utterances.jsonl", path=datadirectory + "/ProcessedData/" + corpusname + "/")

        # make the unfiltered text files
        old_data_exists = False
        for file in getTextFileNames(corpusname, filtered=False):
            if os.path.exists(file):
                old_data_exists = True
        if not old_data_exists:
            convertToText(corpusname)
        else:
            print(corpusname + " has already been converted to unfiltered text files, moving on")

        # remove stopwords
        old_data_exists = False
        for file in getTextFileNames(corpusname):
            if os.path.exists(file):
                old_data_exists = True
        if not old_data_exists:
            removeStopwordsFromConverted(corpusname)
        else:
            print(corpusname + " has already had its text files filtered")
def __init__(self, subReddit=""): self._startIndex = 0 self._endIndex = 5 * (10**5) self._startDate = 2007 self._endDate = 2018 self._target = subReddit if self._target != "": Corpus(filename=download(self._target), utterance_start_index=self._startIndex, utterance_end_index=self._endIndex)
def main():
    downloaded_corpus = download("supreme-corpus")
    results_dir = os.path.abspath("../../results")

    if len(sys.argv) <= 1:  # no command-line arguments given
        print("Usage: dumpModalKwic --minyear=yyyy --maxyear=yyyy --limit=n --kwic=1/0")
        exit(1)

    full_cmd_arguments = sys.argv
    argument_list = full_cmd_arguments[1:]
    short_options = "y:n:x:l:"
    year = 1955
    long_options = ["year=", "maxyear=", "minyear=", "limit=", "kwic="]
    maxyear = minyear = utterance_end_index = None
    kwic = False

    arguments, values = getopt.getopt(argument_list, short_options, long_options)
    for current_argument, current_value in arguments:
        kwic = bool(current_value) if current_argument in ("-x", "--kwic") else kwic
        maxyear = int(current_value) if current_argument in ("-x", "--maxyear") else maxyear
        minyear = int(current_value) if current_argument in ("-n", "--minyear") else minyear
        year = int(current_value) if current_argument in ("-y", "--year") else year
        utterance_end_index = int(current_value) if current_argument in ("-l", "--limit") else utterance_end_index

    minyear = year if ((minyear is None) and (year is not None)) else minyear
    maxyear = year if ((maxyear is None) and (year is not None)) else maxyear

    result_file = results_dir + "/kwic" + str(minyear) + "-" + str(maxyear) + ".csv"

    print("Initializing corpus model")
    corpus = SentenceCorpus(maxyear, minyear, dirname=downloaded_corpus,
                            utterance_end_index=utterance_end_index)
    print("Corpus model initialized")

    if not os.path.exists(results_dir + "/utterances" + str(minyear) + "-" + str(maxyear) + ".jsonl"):
        print("Missing modal utterance json. Creating...")
        corpus.dump_modal_sentences()

    if kwic:
        print("Kwic mode selected. Creating csv...")
        corpus.dump_kwic(result_file)
def test_load_dump_politeness(self):
    corpus = Corpus(download('wikipedia-politeness-corpus'))
    corpus.dump('wikipedia-politeness-corpus')
# The plots answer these questions:
# - Do users on the whole coordinate more to admins or nonadmins?
# - Do admins coordinate to other people more than nonadmins do?

from convokit import Utterance, Corpus, Coordination, download
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os

# load corpus; split users by whether they are an admin
# this means that if a user has spoken in the corpus as both an admin and
# a non-admin, then we will split this user into two users, one for each of
# these roles
path = download("wiki-corpus")
corpus = Corpus(filename=os.path.join(path, "wiki-corpus"),
                subdivide_users_by=["is_admin"])

# create coordination object
coord = Coordination(corpus)

# helper function to plot two coordination scores against each other as a chart,
# on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description,
import convokit
import numpy as np
import matplotlib.pyplot as plt

print("Loading corpus")
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

print("Computing hypergraph features")
hc = convokit.HyperConvo()
hc.fit_transform(corpus)

print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(n_components=7)
te.fit_transform(corpus)

ce = convokit.CommunityEmbedder(community_key="subreddit", method="tsne")
ce.fit_transform(corpus)

pts = corpus.get_meta()["communityEmbedder"]["pts"]
labels = corpus.get_meta()["communityEmbedder"]["labels"]

xs, ys = zip(*pts)
plt.scatter(xs, ys)
for i, txt in enumerate(labels):
    plt.annotate(txt, (xs[i], ys[i]))
plt.savefig("tsne")
plt.show()
reddit.subreddits.recommended([reddit.subreddit('askreddit'), reddit.subreddit('lifeprotips')])

top_6_subs = list(reddit.subreddits.popular(limit=6))
reddit.subreddits.recommended(top_6_subs)

widgets = reddit.subreddit('askouija').widgets
for widget in widgets.sidebar:
    if isinstance(widget, praw.models.CommunityList):
        print(widget)

#####################

from convokit import Corpus, download

smalsubs = Corpus(filename=download('reddit-corpus-small'))  # will not download twice if it already exists
ut_ids = smalsubs.get_utterance_ids()
len(ut_ids)
uid = ut_ids[0]

c_ids = smalsubs.get_conversation_ids()
cid = c_ids[0]
convo = smalsubs.get_conversation(cid)
convo.get_utterance_ids()
convo.get_utterance(uid)

top_level = [uid for uid in ut_ids if sub_corn.get]
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 23:20:11 2020

@author: kach
"""
from convokit import Corpus, download

corpus = Corpus(filename=download("conversations-gone-awry-corpus"))
corpus.print_summary_stats()

reviews = open("data/reviews.txt", "w", encoding="utf-8")
label = open("data/labels.txt", "w")

# i = 0
for utt in corpus.iter_utterances():
    # i += 1
    txt = str(utt.text).replace('\n', ' ')
    reviews.write(txt + '\n')
    if utt.meta['comment_has_personal_attack']:
        l = '1'
    else:
        l = '0'
    label.write(l + '\n')
    # if i > 10:
    #     break

reviews.close()
label.close()
# The plots answer these questions:
# - Do lawyers coordinate more to justices than the other way around?
# - Do lawyers coordinate more to unfavorable or favorable justices?
# - Do unfavorable justices coordinate to lawyers more than favorable justices,
#   or vice versa?

from convokit import Utterance, Corpus, Coordination, download
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# load corpus; split users by case id and split the justices by whether they are
# favorable to the current presenting side
# this treats the same person across two different cases as two different users
corpus = Corpus(filename=download("supreme-corpus"))
split = ["case", "justice-is-favorable"]

# create coordination object
coord = Coordination()
coord.fit(corpus)

# helper function to plot two coordination scores against each other as a chart,
# on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)
def test_load_dump_subreddit(self):
    corpus = Corpus(download('subreddit-hey'))
    corpus.dump('subreddit')
import json
import os
import sys

from convokit import download

'''
some code to update datasets with new parse format.
'''

VERBOSITY = 1000
ROOT_DIR = '/kitchen/clean-corpora'
INCREMENT_VERSION = False
PARSE = True

if __name__ == '__main__':
    print('downloading corpus')
    corpus_name = sys.argv[1]
    filename = download(corpus_name, data_dir=ROOT_DIR)

    print('purging parses')
    with open(os.path.join(filename, 'index.json')) as f:
        index = json.load(f)
    try:
        del index['utterances-index']['parsed']
    except KeyError:
        pass
    if INCREMENT_VERSION:
        index['version'] += 1
    with open(os.path.join(filename, 'index.json'), 'w') as f:
        json.dump(index, f)

    if os.path.exists(os.path.join(filename, 'utterances.json')):
        with open(os.path.join(filename, 'utterances.json')) as f:
# - Do lawyers coordinate more to justices than the other way around?
# - Do lawyers coordinate more to unfavorable or favorable justices?
# - Do unfavorable justices coordinate to lawyers more than favorable justices,
#   or vice versa?

from convokit import Utterance, Corpus, Coordination, download
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os

# load corpus; split users by case id and split the justices by whether they are
# favorable to the current presenting side
# this treats the same person across two different cases as two different users
path = download("supreme-corpus")
corpus = Corpus(filename=os.path.join(path, "supreme-corpus"),
                subdivide_users_by=["case", "justice-is-favorable"])

# create coordination object
coord = Coordination(corpus)

# helper function to plot two coordination scores against each other as a chart,
# on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(b_scores)
# using the methods in the asking too much paper (http://www.cs.cornell.edu/~cristian/Asking_too_much.html) to extract question types.
# (since there is a seed provided, multiple executions of this script will always produce the same clusters)
# This version uses precomputed motifs for speed.

import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class
num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = download('tennis-corpus')
motifs_dir = download('tennis-motifs')

# Load the corpus
corpus = Corpus(filename=os.path.join(data_dir, 'tennis-corpus'))

# Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='tennis',
                                    motifs_dir=motifs_dir, num_dims=25,
                                    num_clusters=num_clusters, verbose=False,
                                    random_seed=125)
# This example uses the supreme court corpus to compute some simple information:
# - Which justices coordinate the most to others?
# - Which justices are coordinated to the most?

import convokit

# set up corpus
corpus = convokit.Corpus(filename=convokit.download("supreme-corpus"))

# compute coordination scores on this corpus
coord = convokit.Coordination()
coord.fit(corpus)

# get coordination scores
coord.transform(corpus)

# get set of all justices
justices = corpus.iter_users(lambda user: user.meta["is-justice"])
# get set of all users
everyone = corpus.iter_users()

# compute coordination from each justice to everyone
print("Justices, ranked by how much they coordinate to others:")
justices_to_everyone = coord.score(corpus, justices, everyone)
for justice, score in sorted(justices_to_everyone.averages_by_user().items(),
                             key=lambda x: x[1], reverse=True):
    print(justice.name, round(score, 5))
print()

# compute coordination from everyone to each justice
print("Justices, ranked by how much others coordinate to them:")
import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class
num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = os.path.join(pkg_resources.resource_filename("convokit", ""), 'downloads')

# Load the corpus
corpus = Corpus(filename=download("wiki-corpus"))

# Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='wiki',
                                    num_dims=25, num_clusters=num_clusters,
                                    verbose=False, random_seed=15)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
# its keys are the indices of the clusters (here 0-7). The values are dictionaries with the following keys:
# "motifs": the motifs, as a list of tuples of the motif terms
# "motif_dists": the corresponding distances of each motif from the centroid of the cluster this motif is in
# "fragments": the answer fragments, as a list of tuples of answer terms
import sys

import convokit

corpus = convokit.Corpus(filename=convokit.download("subreddit-Cornell"))
print(corpus.meta)

threads = corpus.utterance_threads(prefix_len=10, include_root=False)

def disp(thread, root, indent=0):
    print(" " * indent + thread[root].user.name + ": " +
          thread[root].text.replace("\n", " "))
    children = [k for k, v in thread.items() if v.reply_to == root]
    for child in children:
        disp(thread, child, indent=indent + 4)

if len(sys.argv) > 1:
    for root in sys.argv[1:]:
        print("--- {} ---".format(root))
        disp(threads[root], root)
        print()
else:
    while True:
        print("Enter thread root ID (e.g. {}): ".format(next(iter(threads))), end="")
        root = input()
        print("--- {} ---".format(root))
        disp(threads[root], root)
        print()
# This example extracts question types from the Wikipedia Moderators Dataset explained here (http://www.cs.cornell.edu/~cristian//Politeness.html)
# using the methods in the asking too much paper (http://www.cs.cornell.edu/~cristian/Asking_too_much.html) to extract question types.
# (since there is a seed provided, multiple executions of this script will always produce the same clusters)

import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class
num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = download('wiki-corpus')

# Load the corpus
corpus = Corpus(filename=os.path.join(data_dir, 'wiki-corpus'))

# Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='wiki',
                                    num_dims=25, num_clusters=num_clusters,
                                    verbose=False, random_seed=15)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
# its keys are the indices of the clusters (here 0-7). The values are dictionaries with the following keys:
# "motifs": the motifs, as a list of tuples of the motif terms
# "motif_dists": the corresponding distances of each motif from the centroid of the cluster this motif is in
# "fragments": the answer fragments, as a list of tuples of answer terms
# "fragment_dists": the corresponding distances of each fragment from the centroid of the cluster this
#   fragment is in
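# A minimal sketch (not part of the original script) of how the structure described
# above might be inspected, assuming types_to_data is a plain dict keyed by cluster
# index whose values carry the "motifs", "motif_dists", "fragments", and
# "fragment_dists" entries noted in the comments; exact contents may vary by
# ConvoKit version.
for cluster_idx, cluster_data in questionTypology.types_to_data.items():
    motifs = cluster_data["motifs"]
    fragments = cluster_data["fragments"]
    print("Cluster {}: {} motifs, {} answer fragments".format(
        cluster_idx, len(motifs), len(fragments)))
    # show a few example motifs (each a tuple of motif terms) for this cluster
    for motif in motifs[:3]:
        print("  motif:", " ".join(motif))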
def test_load_dump_switchboard(self):
    corpus = Corpus(download("switchboard-corpus"))
    corpus.dump('switchboard-corpus')
def test_load_wikiconv(self):
    corpus = Corpus(download('wikiconv-2004'))
import csv
import itertools
import sys
import os

from convokit import download

"""
Script dumps verb csv. Contains top 20 verbs for given modal for each year.
"""

downloaded_corpus = download("supreme-corpus")
results_dir = os.path.abspath("../../results")


def get_file_list():
    linearr = []
    csv.field_size_limit(sys.maxsize)
    for fileyear in range(1950, 2020, 10):
        csvfile = results_dir + "/kwic" + str(fileyear) + "-" + str(fileyear + 10) + ".csv"
        with open(csvfile, 'r') as data:
            for line in csv.DictReader(data):
                linearr.append(line)
    return linearr


def get_verbs(modal_list):
    print("Assembling modal data from files")
    linearr = get_file_list()
    filtered = {}
    baseline = {mod: {} for mod in modal_list}
import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class
num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
# motifs_dir is the specific path within data_dir that contains the precomputed motifs
data_dir = os.path.join(pkg_resources.resource_filename("convokit", ""), 'downloads')
motifs_dir = download('parliament-motifs')

# Load the corpus
corpus = Corpus(filename=download("parliament-corpus"))

# Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='parliament',
                                    motifs_dir=motifs_dir, num_dims=25,
                                    num_clusters=num_clusters, verbose=False,
                                    random_seed=164)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
from convokit import Corpus, download
import sys
import random

if __name__ == '__main__':
    corpus_name = sys.argv[1]
    output_filename = sys.argv[2]
    corpus = Corpus(filename=download(corpus_name))

    char_list = ['<sos>', '<eos>', '<pad>', '<unk>']
    sequences = []
    for convo in corpus.iter_conversations():
        title = convo.meta['title']
        text = convo.get_utterance(
            convo.get_chronological_utterance_list()[0].conversation_id).text
        if text == '' or text == '[deleted]' or text == '[removed]':
            continue
        else:
            post = title + '\t' + text
            post = post.replace('\n', '').lower()
            sequence = ''
            for character in post:
                if character in char_list:
                    sequence += str(char_list.index(character)) + ' '
                else:
                    char_list.append(character)
                    sequence += str(len(char_list) - 1) + ' '
            sequences.append(sequence[:-1])

    random.shuffle(sequences)
# This example extracts politeness strategies from the Conversations Gone Awry dataset,
# one of the steps in the Conversations Gone Awry paper (http://www.cs.cornell.edu/~cristian/Conversations_gone_awry.html).
# For code reproducing the full results of the paper, see the example notebook in the
# `conversations-gone-awry` example subdirectory.

import pandas as pd

from convokit import PolitenessStrategies, Corpus, download

print("Loading awry corpus...")
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

# extract the politeness strategies.
# Note: politeness strategies are a hand-engineered feature set, so no fitting is needed.
ps = PolitenessStrategies(verbose=100)

print("Extracting politeness strategies...")
corpus = ps.transform(corpus)

values = []
idx = []
for utterance in corpus.iter_utterances():
    values.append(utterance.meta["politeness_strategies"])
    idx.append(utterance.id)

pd.DataFrame(values, index=idx).to_csv("awry_strategy_df_v2.csv")
print("Done, results written to awry_strategy_df_v2.csv")
import convokit

# set up corpus
corpus = convokit.Corpus(filename=convokit.download("supreme-corpus"),
                         subdivide_users_by=["roots"])

# create count object
count = convokit.CountUtterance(corpus)

# report the result
count.print_report()
# The plots answer these questions:
# - Do users on the whole coordinate more to admins or nonadmins?
# - Do admins coordinate to other people more than nonadmins do?

from convokit import Utterance, Corpus, Coordination, download
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# load corpus; split users by whether they are an admin
# this means that if a user has spoken in the corpus as both an admin and
# a non-admin, then we will split this user into two users, one for each of
# these roles
corpus = Corpus(filename=download("wiki-corpus"))
split = ["is_admin"]

# create coordination object
coord = Coordination()
coord.fit(corpus)

# helper function to plot two coordination scores against each other as a chart,
# on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)
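# The make_chart helper above is truncated in this snippet. As a hedged sketch
# (not the original implementation), one plausible way to compare the aggregate
# scores it unpacks is a grouped bar chart; plot_aggregate_scores and its
# "agg 1/2/3" labels are hypothetical names, and only matplotlib calls already
# imported above are used.
def plot_aggregate_scores(a_aggs, b_aggs, a_description, b_description,
                          a_color="b", b_color="g"):
    # one bar position per aggregation value returned by score_report
    positions = np.arange(len(a_aggs))
    width = 0.35
    plt.bar(positions - width / 2, a_aggs, width, color=a_color)
    plt.bar(positions + width / 2, b_aggs, width, color=b_color)
    plt.xticks(positions, ["agg 1", "agg 2", "agg 3"])
    # legend built from colored patches, matching the mpatches import above
    plt.legend(handles=[mpatches.Patch(color=a_color, label=a_description),
                        mpatches.Patch(color=b_color, label=b_description)])
    plt.show()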
import convokit
from sklearn.neighbors import NearestNeighbors

print("Loading corpus")
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

print("Computing hypergraph features")
hc = convokit.HyperConvo(prefix_len=10, include_root=False)
hc.fit_transform(corpus)

print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(n_components=7)
te.fit_transform(corpus)

ce = convokit.CommunityEmbedder(community_key="subreddit")
ce.fit_transform(corpus)

X_communities = corpus.get_meta()["communityEmbedder"]["pts"]
subreddits = corpus.get_meta()["communityEmbedder"]["labels"]

knn = NearestNeighbors(n_neighbors=10)
knn.fit(X_communities)

print("Nearest neighbors for each subreddit:")
for x, subreddit in zip(X_communities, subreddits):
    print(subreddit, "->", end=" ")
    for idx in knn.kneighbors([x], return_distance=False)[0][1:]:
        print(subreddits[idx], end=" ")
    print()
def main() -> None:
    args = parser.parse_args()

    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(args.corpus))
    if args.corpus == 'conversations-gone-awry-cmv-corpus':
        DatasetClass = ConversationsGoneAwryDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    elif args.corpus == 'winning-args-corpus':
        corpus = filter_winning_arguments_corpus(corpus)
        DatasetClass = WinningArgumentsDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    else:
        raise ValueError('Corpus {} not currently supported'.format(args.corpus))
    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = DatasetClass(corpus, train_conversations, tokenizer,
                                 max_len=args.max_conversation_len,
                                 max_tokenization_len=args.utterance_max)
    val_dataset = DatasetClass(corpus, val_conversations, tokenizer,
                               max_len=args.max_conversation_len,
                               max_tokenization_len=args.utterance_max)

    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, val_dataset.get_indices_by_len())

    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, num_labels=n_classes)
    model.to(device)

    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, criterion, optimizer, scheduler, scaler, device)
        validate(val_loader, model, criterion, device)
def test_load_dump_tennis(self):
    corpus = Corpus(download('tennis-corpus'))
    corpus.dump('tennis-corpus')
def main() -> None:
    args = parser.parse_args()

    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(CORPUS))
    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = CoarseDiscourseDataset(
        corpus, train_conversations, tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset = CoarseDiscourseDataset(
        corpus, val_conversations, tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset.label_encoder = train_dataset.label_encoder

    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size * 4, 1, val_dataset.get_indices_by_len())

    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)

    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name, num_labels=len(train_dataset.label_encoder.classes_))
    model.to(device)

    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, optimizer, scheduler, scaler, device,
              tokenizer.sep_token_id)
        validate(val_loader, model, device, tokenizer.sep_token_id)
def get_corpus(source, split_sentences=False, punct=True, to_ascii=True,
               data_path=DATA_DIR, min_len=3, max_len=15, test_size=0.1,
               text_field='text', subsample_rows=False, save=True):
    if source.endswith('.csv'):
        csv = True
        name = source[:-4]
    else:
        csv = False
        name = source

    # compose name
    corpus_name = f'{name}{"_split" if split_sentences else ""}' \
                  f'{"_punct" if punct else ""}' \
                  f'{"_ascii" if to_ascii else ""}' \
                  f'{f"_sub{subsample_rows}" if subsample_rows else ""}' \
                  f'_{test_size}_{min_len}_67,828'
    corpus_train = os.path.join(data_path, f'{corpus_name}_train.csv')
    corpus_test = os.path.join(data_path, f'{corpus_name}_test.csv')

    # Load from cache
    if test_size == 1 and os.path.isfile(corpus_test):
        df = pd.read_csv(corpus_test)
        print('Loading cached data...')
        print(len(df))
        return df, f'{corpus_name}_test.csv'
    elif os.path.isfile(corpus_train) and os.path.isfile(corpus_test):
        df_train, df_val = pd.read_csv(corpus_train), pd.read_csv(corpus_test)
        print('Loading cached data...')
        print(len(df_train))
        print(len(df_val))
        return df_train, df_val, f'{corpus_name}_train.csv', f'{corpus_name}_test.csv'

    # load csv or download
    if csv:
        print('Loading dataset from csv...')
        df = pd.read_csv(os.path.join(data_path, source))
    else:
        print('Downloading dataset...')
        corp = Corpus(filename=download(name))
        df = corp.get_utterances_dataframe()

    # get only text
    df = df.rename(columns={text_field: "utterance"})[["utterance"]]
    # remove any tags
    df['utterance'] = df['utterance'].str.replace(r'<.*>', ' ')

    # subsample
    if subsample_rows:
        df = df.sample(subsample_rows, random_state=0)

    # split sentences
    if split_sentences:
        sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        df["utterance"] = df["utterance"].apply(sentence_detector.tokenize)
        df = flatten_sentences(df)

    print('Cleaning')
    cln_fn = lambda x: clean(
        x,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=to_ascii,          # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=False,              # replace all URLs with a special token
        no_emails=False,            # replace all email addresses with a special token
        no_phone_numbers=False,     # replace all phone numbers with a special token
        no_numbers=False,           # replace all numbers with a special token
        no_digits=False,            # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,             # remove punctuations
        lang="en"                   # set to 'de' for German special handling
    )

    # clean
    df["utterance"] = df["utterance"].apply(cln_fn)
    if not punct:
        df["utterance"] = df["utterance"].str.replace(r"[{}]".format(punctuation), ' ')

    # tokenize
    sen_by_words = df["utterance"].apply(word_tokenize)
    word_counts = sen_by_words.apply(len)
    sen_by_words = sen_by_words[(word_counts <= max_len) & (word_counts >= min_len)]
    df = sen_by_words.to_frame()

    # no split
    if test_size == 1:
        if not save:
            return df
        print(len(df))
        df.to_csv(corpus_test, index=False)
        return df, f'{corpus_name}_test.csv'

    # split
    df_train, df_val = train_test_split(df, test_size=test_size, random_state=0)
    print(len(df_train))
    print(len(df_val))
    if not save:
        return df_train, df_val
    df_train.to_csv(corpus_train, index=False)
    df_val.to_csv(corpus_test, index=False)
    return df_train, df_val, f'{corpus_name}_train.csv', f'{corpus_name}_test.csv'
import os
from ast import literal_eval as make_tuple
from collections import defaultdict

from scipy import sparse
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import Normalizer
from spacy.en import English
from spacy.symbols import *
from spacy.tokens.doc import Doc

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class pretrained on Parliament Dataset
num_clusters = 8
data_dir = download('parliament-corpus')
motifs_dir = download('parliament-motifs')
corpus = Corpus(filename=os.path.join(data_dir, 'parliament-corpus'))
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='parliament',
                                    motifs_dir=motifs_dir, num_dims=25,
                                    num_clusters=num_clusters, verbose=False,
                                    random_seed=164)

# Determine type of input question