def _read(self, corpus_split):
    corpus_split = corpus_split.split('_')
    corpus_name = corpus_split[0]
    self.split = corpus_split[1] if len(corpus_split) > 1 else None
    corpus = Corpus(filename=download(corpus_name))
    conversations = corpus.iter_conversations()
    if self.sample:
        conversations = itertools.islice(conversations, self.sample)
    for conv in conversations:
        meta = conv.meta
        if (meta.get('split') != self.split) and (meta.get('annotation_year', 2018) != 2018):
            continue
        label = str(meta[self.label_field])
        # turns = [u.text for u in conv.iter_utterances() if u.text.strip() and (not u.meta.get('is_section_header'))]
        turns = [
            u.meta.parsed for u in conv.iter_utterances()
            if not u.meta.get('is_section_header')
        ]
        end = len(turns) - 1 if self.forecast else None
        turns = turns[-self.max_turns:end]
        if turns and all(turns):
            inst = self.text_to_instance(turns, label)
            if inst:
                yield inst
def setUp(self) -> None:
    self.corpus = Corpus(download('subreddit-hey'))
    self.utt_df = self.corpus.get_utterances_dataframe()
    self.convo_df = self.corpus.get_conversations_dataframe()
    self.speaker_df = self.corpus.get_speakers_dataframe()
    self.new_corpus = Corpus.from_pandas(self.utt_df, self.speaker_df, self.convo_df)
def process_corpus(corpus_name, to_download=TO_DOWNLOAD,
                   min_wc_source=MIN_WC_SOURCE, max_wc_source=MAX_WC_SOURCE,
                   min_wc_target=MIN_WC_TARGET, max_wc_target=MAX_WC_TARGET,
                   source_filter=SOURCE_FILTER, target_filter=TARGET_FILTER,
                   text_cols=TEXT_COLS, data_dir=DATA_DIR):
    if to_download:
        corpus = Corpus(download(corpus_name, data_dir=data_dir))
    else:
        corpus = Corpus(os.path.join(data_dir, corpus_name))
    corpus_name = corpus.get_meta()['name']
    print(corpus_name)
    corpus.print_summary_stats()

    print('processing', corpus.get_meta()['name'])
    corpus.load_info('utterance', ['parsed'])
    corpus = text_prep_pipe().transform(corpus)
    source_df, target_df = get_train_subset(corpus, min_wc_source, max_wc_source,
                                            min_wc_target, max_wc_target,
                                            source_filter, target_filter, text_cols)
    source_df.to_csv(os.path.join(data_dir, corpus_name + '.source.tsv'), sep='\t')
    target_df.to_csv(os.path.join(data_dir, corpus_name + '.target.tsv'), sep='\t')
def load_conversations(corpus_name, max_samples, eval_percent=0.1):
    logging.info('Loading data.')

    def split_data(inputs, outputs, eval_percent):
        eval_index = int(len(inputs) * (1 - eval_percent))
        return (inputs[:eval_index], outputs[:eval_index],
                inputs[eval_index:], outputs[eval_index:])

    corpus = Corpus(filename=download(corpus_name))
    deleted_filter = re.compile(r'^(\[deleted]|\[removed])$')
    inputs, outputs = [], []
    for conv in corpus.iter_conversations():
        for path in conv.get_root_to_leaf_paths():
            for i in range(len(path) - 1):
                if deleted_filter.match(path[i].text) \
                        or deleted_filter.match(path[i - 1].text) \
                        or deleted_filter.match(path[i + 1].text):
                    continue
                inputs.append(path[i].text)
                outputs.append(path[i + 1].text)
                if len(inputs) >= max_samples:
                    return split_data(inputs, outputs, eval_percent)
    logging.info('Done!')
    return split_data(inputs, outputs, eval_percent)
def main():
    # initialize globals
    readRegionalisms()
    addStopWords()

    # to select subreddits to process, add their name to the CurrentSubredditSet file
    toProcess = readSubredditSet()

    for corpusname in toProcess:
        print("doing " + corpusname)
        download("subreddit-" + corpusname, data_dir=datadirectory + "/DataDownloads")

        # create the directory
        if not os.path.exists(datadirectory + "/ProcessedData/" + corpusname):
            os.makedirs(datadirectory + "/ProcessedData/" + corpusname)

        if os.path.exists(datadirectory + "/DataDownloads/subreddit-" + corpusname + ".zip") \
                and not os.path.exists(datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip"):
            os.rename(datadirectory + "/DataDownloads/subreddit-" + corpusname + ".zip",
                      datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip")

        print(datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip")
        with ZipFile(datadirectory + "/DataDownloads/" + corpusname + ".corpus.zip", mode="r") as corpuszip:
            if not os.path.exists(datadirectory + "/ProcessedData/" + corpusname + "/utterances.jsonl"):
                corpuszip.extract("utterances.jsonl", path=datadirectory + "/ProcessedData/" + corpusname + "/")

        # make the unfiltered text files
        old_data_exists = False
        for file in getTextFileNames(corpusname, filtered=False):
            if os.path.exists(file):
                old_data_exists = True
        if not old_data_exists:
            convertToText(corpusname)
        else:
            print(corpusname + " has already been converted to unfiltered text files, moving on")

        # remove stopwords
        old_data_exists = False
        for file in getTextFileNames(corpusname):
            if os.path.exists(file):
                old_data_exists = True
        if not old_data_exists:
            removeStopwordsFromConverted(corpusname)
        else:
            print(corpusname + " has already had its text files filtered")
def __init__(self, subReddit=""): self._startIndex = 0 self._endIndex = 5 * (10**5) self._startDate = 2007 self._endDate = 2018 self._target = subReddit if self._target != "": Corpus(filename=download(self._target), utterance_start_index=self._startIndex, utterance_end_index=self._endIndex)
def main():
    downloaded_corpus = download("supreme-corpus")
    results_dir = os.path.abspath("../../results")

    if len(sys.argv) <= 1:  # no command-line arguments given
        print("Usage: dumpModalKwic --minyear=yyyy --maxyear=yyyy --limit=n --kwic=1/0")
        exit(1)

    full_cmd_arguments = sys.argv
    argument_list = full_cmd_arguments[1:]
    short_options = "y:n:x:l:"
    year = 1955
    long_options = ["year=", "maxyear=", "minyear=", "limit=", "kwic="]
    maxyear = minyear = utterance_end_index = None
    kwic = False

    arguments, values = getopt.getopt(argument_list, short_options, long_options)
    for current_argument, current_value in arguments:
        kwic = bool(current_value) if current_argument in ("-x", "--kwic") else kwic
        maxyear = int(current_value) if current_argument in ("-x", "--maxyear") else maxyear
        minyear = int(current_value) if current_argument in ("-n", "--minyear") else minyear
        year = int(current_value) if current_argument in ("-y", "--year") else year
        utterance_end_index = int(current_value) if current_argument in ("-l", "--limit") else utterance_end_index

    minyear = year if ((minyear is None) and (year is not None)) else minyear
    maxyear = year if ((maxyear is None) and (year is not None)) else maxyear

    result_file = results_dir + "/kwic" + str(minyear) + "-" + str(maxyear) + ".csv"

    print("Initializing corpus model")
    corpus = SentenceCorpus(maxyear, minyear, dirname=downloaded_corpus,
                            utterance_end_index=utterance_end_index)
    print("Corpus model initialized")

    if not os.path.exists(results_dir + "/utterances" + str(minyear) + "-" + str(maxyear) + ".jsonl"):
        print("Missing modal utterance json. Creating...")
        corpus.dump_modal_sentences()

    if kwic:
        print("Kwic mode selected. Creating csv...")
        corpus.dump_kwic(result_file)
def test_load_dump_politeness(self):
    corpus = Corpus(download('wikipedia-politeness-corpus'))
    corpus.dump('wikipedia-politeness-corpus')
# The plots answer these questions:
# - Do users on the whole coordinate more to admins or nonadmins?
# - Do admins coordinate to other people more than nonadmins do?

from convokit import Utterance, Corpus, Coordination, download
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os

# load corpus; split users by whether they are an admin
# this means that if a user has spoken in the corpus as both an admin and
# a non-admin, then we will split this user into two users, one for each of
# these roles
path = download("wiki-corpus")
corpus = Corpus(filename=os.path.join(path, "wiki-corpus"),
                subdivide_users_by=["is_admin"])

# create coordination object
coord = Coordination(corpus)

# helper function to plot two coordination scores against each other as a chart,
# on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description,
import convokit
import numpy as np
import matplotlib.pyplot as plt

print("Loading corpus")
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

print("Computing hypergraph features")
hc = convokit.HyperConvo()
hc.fit_transform(corpus)

print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(n_components=7)
te.fit_transform(corpus)

ce = convokit.CommunityEmbedder(community_key="subreddit", method="tsne")
ce.fit_transform(corpus)

pts = corpus.get_meta()["communityEmbedder"]["pts"]
labels = corpus.get_meta()["communityEmbedder"]["labels"]

xs, ys = zip(*pts)
plt.scatter(xs, ys)
for i, txt in enumerate(labels):
    plt.annotate(txt, (xs[i], ys[i]))
plt.savefig("tsne")
plt.show()
reddit.subreddits.recommended([reddit.subreddit('askreddit'), reddit.subreddit('lifeprotips')])

top_6_subs = list(reddit.subreddits.popular(limit=6))
reddit.subreddits.recommended(top_6_subs)

widgets = reddit.subreddit('askouija').widgets
for widget in widgets.sidebar:
    if isinstance(widget, praw.models.CommunityList):
        print(widget)

#####################

from convokit import Corpus, download

smalsubs = Corpus(filename=download('reddit-corpus-small'))  # will not download twice if it already exists
ut_ids = smalsubs.get_utterance_ids()
len(ut_ids)
uid = ut_ids[0]

c_ids = smalsubs.get_conversation_ids()
cid = c_ids[0]
convo = smalsubs.get_conversation(cid)
convo.get_utterance_ids()
convo.get_utterance(uid)

top_level = [uid for uid in ut_ids if sub_corn.get]
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 27 23:20:11 2020

@author: kach
"""
from convokit import Corpus, download

corpus = Corpus(filename=download("conversations-gone-awry-corpus"))
corpus.print_summary_stats()

reviews = open("data/reviews.txt", "w", encoding="utf-8")
label = open("data/labels.txt", "w")

# i = 0
for utt in corpus.iter_utterances():
    # i += 1
    txt = str(utt.text).replace('\n', ' ')
    reviews.write(txt + '\n')
    if utt.meta['comment_has_personal_attack']:
        l = '1'
    else:
        l = '0'
    label.write(l + '\n')
    # if i > 10:
    #     break

reviews.close()
label.close()
# The plots answer these questions:
# - Do lawyers coordinate more to justices than the other way around?
# - Do lawyers coordinate more to unfavorable or favorable justices?
# - Do unfavorable justices coordinate to lawyers more than favorable justices,
#   or vice versa?

from convokit import Utterance, Corpus, Coordination, download
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# load corpus; split users by case id and split the justices by whether they are
# favorable to the current presenting side
# this treats the same person across two different cases as two different users
corpus = Corpus(filename=download("supreme-corpus"))
split = ["case", "justice-is-favorable"]

# create coordination object
coord = Coordination()
coord.fit(corpus)

# helper function to plot two coordination scores against each other as a chart,
# on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)
def test_load_dump_subreddit(self):
    corpus = Corpus(download('subreddit-hey'))
    corpus.dump('subreddit')
import json
import os
import sys

from convokit import download

'''
some code to update datasets with new parse format.
'''

VERBOSITY = 1000
ROOT_DIR = '/kitchen/clean-corpora'
INCREMENT_VERSION = False
PARSE = True

if __name__ == '__main__':
    print('downloading corpus')
    corpus_name = sys.argv[1]
    filename = download(corpus_name, data_dir=ROOT_DIR)

    print('purging parses')
    with open(os.path.join(filename, 'index.json')) as f:
        index = json.load(f)
    try:
        del index['utterances-index']['parsed']
    except KeyError:
        pass
    if INCREMENT_VERSION:
        index['version'] += 1
    with open(os.path.join(filename, 'index.json'), 'w') as f:
        json.dump(index, f)

    if os.path.exists(os.path.join(filename, 'utterances.json')):
        with open(os.path.join(filename, 'utterances.json')) as f:
# - Do lawyers coordinate more to justices than the other way around?
# - Do lawyers coordinate more to unfavorable or favorable justices?
# - Do unfavorable justices coordinate to lawyers more than favorable justices,
#   or vice versa?

from convokit import Utterance, Corpus, Coordination, download
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np
import os

# load corpus; split users by case id and split the justices by whether they are
# favorable to the current presenting side
# this treats the same person across two different cases as two different users
path = download("supreme-corpus")
corpus = Corpus(filename=os.path.join(path, "supreme-corpus"),
                subdivide_users_by=["case", "justice-is-favorable"])

# create coordination object
coord = Coordination(corpus)

# helper function to plot two coordination scores against each other as a chart,
# on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(b_scores)
# using the methods in the asking too much paper (http://www.cs.cornell.edu/~cristian/Asking_too_much.html) to extract question types.
# (since there is a seed provided, multiple executions of this script will always produce the same clusters)
# This version uses precomputed motifs for speed.

import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class
num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = download('tennis-corpus')
motifs_dir = download('tennis-motifs')

# Load the corpus
corpus = Corpus(filename=os.path.join(data_dir, 'tennis-corpus'))

# Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='tennis',
                                    motifs_dir=motifs_dir, num_dims=25,
                                    num_clusters=num_clusters, verbose=False,
                                    random_seed=125)
# This example uses the supreme court corpus to compute some simple information:
# - Which justices coordinate the most to others?
# - Which justices are coordinated to the most?

import convokit

# set up corpus
corpus = convokit.Corpus(filename=convokit.download("supreme-corpus"))

# compute coordination scores on this corpus
coord = convokit.Coordination()
coord.fit(corpus)

# get coordination scores
coord.transform(corpus)

# get set of all justices
justices = corpus.iter_users(lambda user: user.meta["is-justice"])
# get set of all users
everyone = corpus.iter_users()

# compute coordination from each justice to everyone
print("Justices, ranked by how much they coordinate to others:")
justices_to_everyone = coord.score(corpus, justices, everyone)
for justice, score in sorted(justices_to_everyone.averages_by_user().items(),
                             key=lambda x: x[1], reverse=True):
    print(justice.name, round(score, 5))
print()

# compute coordination from everyone to each justice
print("Justices, ranked by how much others coordinate to them:")
import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class
num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = os.path.join(pkg_resources.resource_filename("convokit", ""), 'downloads')

# Load the corpus
corpus = Corpus(filename=download("wiki-corpus"))

# Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='wiki',
                                    num_dims=25, num_clusters=num_clusters,
                                    verbose=False, random_seed=15)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
# its keys are the indices of the clusters (here 0-7). The values are dictionaries with the following keys:
# "motifs": the motifs, as a list of tuples of the motif terms
# "motif_dists": the corresponding distances of each motif from the centroid of the cluster this motif is in
# "fragments": the answer fragments, as a list of tuples of answer terms
import sys

import convokit

corpus = convokit.Corpus(filename=convokit.download("subreddit-Cornell"))
print(corpus.meta)

threads = corpus.utterance_threads(prefix_len=10, include_root=False)

def disp(thread, root, indent=0):
    print(" " * indent + thread[root].user.name + ": " +
          thread[root].text.replace("\n", " "))
    children = [k for k, v in thread.items() if v.reply_to == root]
    for child in children:
        disp(thread, child, indent=indent + 4)

if len(sys.argv) > 1:
    for root in sys.argv[1:]:
        print("--- {} ---".format(root))
        disp(threads[root], root)
        print()
else:
    while True:
        print("Enter thread root ID (e.g. {}): ".format(next(iter(threads))), end="")
        root = input()
        print("--- {} ---".format(root))
        disp(threads[root], root)
        print()
# This example extracts question types from the Wikipedia Moderators Dataset explained here (http://www.cs.cornell.edu/~cristian//Politeness.html)
# using the methods in the asking too much paper (http://www.cs.cornell.edu/~cristian/Asking_too_much.html) to extract question types.
# (since there is a seed provided, multiple executions of this script will always produce the same clusters)

import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class
num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
data_dir = download('wiki-corpus')

# Load the corpus
corpus = Corpus(filename=os.path.join(data_dir, 'wiki-corpus'))

# Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='wiki',
                                    num_dims=25, num_clusters=num_clusters,
                                    verbose=False, random_seed=15)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
# its keys are the indices of the clusters (here 0-7). The values are dictionaries with the following keys:
# "motifs": the motifs, as a list of tuples of the motif terms
# "motif_dists": the corresponding distances of each motif from the centroid of the cluster this motif is in
# "fragments": the answer fragments, as a list of tuples of answer terms
# "fragment_dists": the corresponding distances of each fragment from the centroid of the cluster this
#   fragment is in
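# A minimal sketch (not part of the original script) of how the structure described
# above might be inspected, assuming types_to_data is a plain dict keyed by cluster
# index whose values carry the "motifs", "motif_dists", "fragments", and
# "fragment_dists" entries noted in the comments; exact contents may vary by
# ConvoKit version.
for cluster_idx, cluster_data in questionTypology.types_to_data.items():
    motifs = cluster_data["motifs"]
    fragments = cluster_data["fragments"]
    print("Cluster {}: {} motifs, {} answer fragments".format(
        cluster_idx, len(motifs), len(fragments)))
    # show a few example motifs (each a tuple of motif terms) for this cluster
    for motif in motifs[:3]:
        print("  motif:", " ".join(motif))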
def test_load_dump_switchboard(self):
    corpus = Corpus(download("switchboard-corpus"))
    corpus.dump('switchboard-corpus')
def test_load_wikiconv(self):
    corpus = Corpus(download('wikiconv-2004'))
import csv
import itertools
import sys
import os

from convokit import download

"""
Script dumps verb csv. Contains top 20 verbs for given modal for each year.
"""

downloaded_corpus = download("supreme-corpus")
results_dir = os.path.abspath("../../results")


def get_file_list():
    linearr = []
    csv.field_size_limit(sys.maxsize)
    for fileyear in range(1950, 2020, 10):
        csvfile = results_dir + "/kwic" + str(fileyear) + "-" + str(fileyear + 10) + ".csv"
        with open(csvfile, 'r') as data:
            for line in csv.DictReader(data):
                linearr.append(line)
    return linearr


def get_verbs(modal_list):
    print("Assembling modal data from files")
    linearr = get_file_list()
    filtered = {}
    baseline = {mod: {} for mod in modal_list}
import os
import pkg_resources
import numpy as np

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class
num_clusters = 8

# Get precomputed motifs. data_dir contains the downloaded data.
# motifs_dir is the specific path within data_dir that contains the precomputed motifs
data_dir = os.path.join(pkg_resources.resource_filename("convokit", ""), 'downloads')
motifs_dir = download('parliament-motifs')

# Load the corpus
corpus = Corpus(filename=download("parliament-corpus"))

# Extract clusters of the motifs and assign questions to these clusters
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='parliament',
                                    motifs_dir=motifs_dir, num_dims=25,
                                    num_clusters=num_clusters, verbose=False,
                                    random_seed=164)

# questionTypology.types_to_data contains the necessary data that is computed in the step above
from convokit import Corpus, download
import sys
import random

if __name__ == '__main__':
    corpus_name = sys.argv[1]
    output_filename = sys.argv[2]
    corpus = Corpus(filename=download(corpus_name))

    char_list = ['<sos>', '<eos>', '<pad>', '<unk>']
    sequences = []
    for convo in corpus.iter_conversations():
        title = convo.meta['title']
        text = convo.get_utterance(
            convo.get_chronological_utterance_list()[0].conversation_id).text
        if text == '' or text == '[deleted]' or text == '[removed]':
            continue
        else:
            post = title + '\t' + text
            post = post.replace('\n', '').lower()
            sequence = ''
            for character in post:
                if character in char_list:
                    sequence += str(char_list.index(character)) + ' '
                else:
                    char_list.append(character)
                    sequence += str(len(char_list) - 1) + ' '
            sequences.append(sequence[:-1])

    random.shuffle(sequences)
# This example extracts politeness strategies from the Conversations Gone Awry dataset,
# one of the steps in the Conversations Gone Awry paper (http://www.cs.cornell.edu/~cristian/Conversations_gone_awry.html).
# For code reproducing the full results of the paper, see the example notebook in the
# `conversations-gone-awry` example subdirectory.

import pandas as pd

from convokit import PolitenessStrategies, Corpus, download

print("Loading awry corpus...")
corpus = Corpus(filename=download('conversations-gone-awry-corpus'))

# extract the politeness strategies.
# Note: politeness strategies are a hand-engineered feature set, so no fitting is needed.
ps = PolitenessStrategies(verbose=100)

print("Extracting politeness strategies...")
corpus = ps.transform(corpus)

values = []
idx = []
for utterance in corpus.iter_utterances():
    values.append(utterance.meta["politeness_strategies"])
    idx.append(utterance.id)

pd.DataFrame(values, index=idx).to_csv("awry_strategy_df_v2.csv")
print("Done, results written to awry_strategy_df_v2.csv")
import convokit

# set up corpus
corpus = convokit.Corpus(filename=convokit.download("supreme-corpus"),
                         subdivide_users_by=["roots"])

# create count object
count = convokit.CountUtterance(corpus)

# report the result
count.print_report()
# The plots answer these questions:
# - Do users on the whole coordinate more to admins or nonadmins?
# - Do admins coordinate to other people more than nonadmins do?

from convokit import Utterance, Corpus, Coordination, download
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import numpy as np

# load corpus; split users by whether they are an admin
# this means that if a user has spoken in the corpus as both an admin and
# a non-admin, then we will split this user into two users, one for each of
# these roles
corpus = Corpus(filename=download("wiki-corpus"))
split = ["is_admin"]

# create coordination object
coord = Coordination()
coord.fit(corpus)

# helper function to plot two coordination scores against each other as a chart,
# on aggregate and by coordination marker
# a is a tuple (speakers, targets)
# b is a tuple (speakers, targets)
def make_chart(a_scores, b_scores, a_description, b_description, a_color="b", b_color="g"):
    # get scores by marker and on aggregate
    _, a_score_by_marker, a_agg1, a_agg2, a_agg3 = coord.score_report(corpus, a_scores)
    _, b_score_by_marker, b_agg1, b_agg2, b_agg3 = coord.score_report(corpus, b_scores)
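# The make_chart helper above is truncated in this snippet. As a hedged sketch
# (not the original implementation), one plausible way to compare the aggregate
# scores it unpacks is a grouped bar chart; plot_aggregate_scores and its
# "agg 1/2/3" labels are hypothetical names, and only matplotlib calls already
# imported above are used.
def plot_aggregate_scores(a_aggs, b_aggs, a_description, b_description,
                          a_color="b", b_color="g"):
    # one bar position per aggregation value returned by score_report
    positions = np.arange(len(a_aggs))
    width = 0.35
    plt.bar(positions - width / 2, a_aggs, width, color=a_color)
    plt.bar(positions + width / 2, b_aggs, width, color=b_color)
    plt.xticks(positions, ["agg 1", "agg 2", "agg 3"])
    # legend built from colored patches, matching the mpatches import above
    plt.legend(handles=[mpatches.Patch(color=a_color, label=a_description),
                        mpatches.Patch(color=b_color, label=b_description)])
    plt.show()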
import convokit
from sklearn.neighbors import NearestNeighbors

print("Loading corpus")
corpus = convokit.Corpus(filename=convokit.download("reddit-corpus-small"))

print("Computing hypergraph features")
hc = convokit.HyperConvo(prefix_len=10, include_root=False)
hc.fit_transform(corpus)

print("Computing low-dimensional embeddings")
te = convokit.ThreadEmbedder(n_components=7)
te.fit_transform(corpus)

ce = convokit.CommunityEmbedder(community_key="subreddit")
ce.fit_transform(corpus)

X_communities = corpus.get_meta()["communityEmbedder"]["pts"]
subreddits = corpus.get_meta()["communityEmbedder"]["labels"]

knn = NearestNeighbors(n_neighbors=10)
knn.fit(X_communities)

print("Nearest neighbors for each subreddit:")
for x, subreddit in zip(X_communities, subreddits):
    print(subreddit, "->", end=" ")
    for idx in knn.kneighbors([x], return_distance=False)[0][1:]:
        print(subreddits[idx], end=" ")
    print()
def main() -> None:
    args = parser.parse_args()

    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(args.corpus))
    if args.corpus == 'conversations-gone-awry-cmv-corpus':
        DatasetClass = ConversationsGoneAwryDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    elif args.corpus == 'winning-args-corpus':
        corpus = filter_winning_arguments_corpus(corpus)
        DatasetClass = WinningArgumentsDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    else:
        raise ValueError('Corpus {} not currently supported'.format(args.corpus))
    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = DatasetClass(corpus, train_conversations, tokenizer,
                                 max_len=args.max_conversation_len,
                                 max_tokenization_len=args.utterance_max)
    val_dataset = DatasetClass(corpus, val_conversations, tokenizer,
                               max_len=args.max_conversation_len,
                               max_tokenization_len=args.utterance_max)

    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, val_dataset.get_indices_by_len())

    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, num_labels=n_classes)
    model.to(device)

    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, criterion, optimizer, scheduler, scaler, device)
        validate(val_loader, model, criterion, device)
def test_load_dump_tennis(self):
    corpus = Corpus(download('tennis-corpus'))
    corpus.dump('tennis-corpus')
def main() -> None:
    args = parser.parse_args()

    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(CORPUS))
    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = CoarseDiscourseDataset(
        corpus, train_conversations, tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset = CoarseDiscourseDataset(
        corpus, val_conversations, tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset.label_encoder = train_dataset.label_encoder

    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size * 4, 1, val_dataset.get_indices_by_len())

    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)

    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name, num_labels=len(train_dataset.label_encoder.classes_))
    model.to(device)

    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, optimizer, scheduler, scaler, device,
              tokenizer.sep_token_id)
        validate(val_loader, model, device, tokenizer.sep_token_id)
def get_corpus(source, split_sentences=False, punct=True, to_ascii=True,
               data_path=DATA_DIR, min_len=3, max_len=15, test_size=0.1,
               text_field='text', subsample_rows=False, save=True):
    if source.endswith('.csv'):
        csv = True
        name = source[:-4]
    else:
        csv = False
        name = source

    # compose name
    corpus_name = f'{name}{"_split" if split_sentences else ""}' \
                  f'{"_punct" if punct else ""}' \
                  f'{"_ascii" if to_ascii else ""}' \
                  f'{f"_sub{subsample_rows}" if subsample_rows else ""}' \
                  f'_{test_size}_{min_len}_67,828'
    corpus_train = os.path.join(data_path, f'{corpus_name}_train.csv')
    corpus_test = os.path.join(data_path, f'{corpus_name}_test.csv')

    # Load from cache
    if test_size == 1 and os.path.isfile(corpus_test):
        df = pd.read_csv(corpus_test)
        print('Loading cached data...')
        print(len(df))
        return df, f'{corpus_name}_test.csv'
    elif os.path.isfile(corpus_train) and os.path.isfile(corpus_test):
        df_train, df_val = pd.read_csv(corpus_train), pd.read_csv(corpus_test)
        print('Loading cached data...')
        print(len(df_train))
        print(len(df_val))
        return df_train, df_val, f'{corpus_name}_train.csv', f'{corpus_name}_test.csv'

    # load csv or download
    if csv:
        print('Loading dataset from csv...')
        df = pd.read_csv(os.path.join(data_path, source))
    else:
        print('Downloading dataset...')
        corp = Corpus(filename=download(name))
        df = corp.get_utterances_dataframe()

    # get only text
    df = df.rename(columns={text_field: "utterance"})[["utterance"]]
    # remove any tags
    df['utterance'] = df['utterance'].str.replace(r'<.*>', ' ')

    # subsample
    if subsample_rows:
        df = df.sample(subsample_rows, random_state=0)

    # split sentences
    if split_sentences:
        sentence_detector = nltk.data.load('tokenizers/punkt/english.pickle')
        df["utterance"] = df["utterance"].apply(sentence_detector.tokenize)
        df = flatten_sentences(df)

    print('Cleaning')
    cln_fn = lambda x: clean(
        x,
        fix_unicode=True,           # fix various unicode errors
        to_ascii=to_ascii,          # transliterate to closest ASCII representation
        lower=True,                 # lowercase text
        no_line_breaks=False,       # fully strip line breaks as opposed to only normalizing them
        no_urls=False,              # replace all URLs with a special token
        no_emails=False,            # replace all email addresses with a special token
        no_phone_numbers=False,     # replace all phone numbers with a special token
        no_numbers=False,           # replace all numbers with a special token
        no_digits=False,            # replace all digits with a special token
        no_currency_symbols=False,  # replace all currency symbols with a special token
        no_punct=False,             # remove punctuations
        lang="en"                   # set to 'de' for German special handling
    )

    # clean
    df["utterance"] = df["utterance"].apply(cln_fn)
    if not punct:
        df["utterance"] = df["utterance"].str.replace(r"[{}]".format(punctuation), ' ')

    # tokenize
    sen_by_words = df["utterance"].apply(word_tokenize)
    word_counts = sen_by_words.apply(len)
    sen_by_words = sen_by_words[(word_counts <= max_len) & (word_counts >= min_len)]
    df = sen_by_words.to_frame()

    # no split
    if test_size == 1:
        if not save:
            return df
        print(len(df))
        df.to_csv(corpus_test, index=False)
        return df, f'{corpus_name}_test.csv'

    # split
    df_train, df_val = train_test_split(df, test_size=test_size, random_state=0)
    print(len(df_train))
    print(len(df_val))
    if not save:
        return df_train, df_val
    df_train.to_csv(corpus_train, index=False)
    df_val.to_csv(corpus_test, index=False)
    return df_train, df_val, f'{corpus_name}_train.csv', f'{corpus_name}_test.csv'
import os
from ast import literal_eval as make_tuple
from collections import defaultdict

from scipy import sparse
from sklearn.externals import joblib
from sklearn.cluster import KMeans
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.preprocessing import Normalizer
from spacy.en import English
from spacy.symbols import *
from spacy.tokens.doc import Doc

from convokit import Corpus, QuestionTypology, download

# Initialize QuestionTypology class pretrained on Parliament Dataset
num_clusters = 8
data_dir = download('parliament-corpus')
motifs_dir = download('parliament-motifs')
corpus = Corpus(filename=os.path.join(data_dir, 'parliament-corpus'))
questionTypology = QuestionTypology(corpus, data_dir, dataset_name='parliament',
                                    motifs_dir=motifs_dir, num_dims=25,
                                    num_clusters=num_clusters, verbose=False,
                                    random_seed=164)

# Determine type of input question