def _read(self, corpus_split):
    corpus_split = corpus_split.split('_')
    corpus_name = corpus_split[0]
    self.split = corpus_split[1] if len(corpus_split) > 1 else None
    corpus = Corpus(filename=download(corpus_name))
    conversations = corpus.iter_conversations()
    if self.sample:
        conversations = itertools.islice(conversations, self.sample)
    for conv in conversations:
        meta = conv.meta
        if (meta.get('split') != self.split) and (meta.get('annotation_year', 2018) != 2018):
            continue
        label = str(meta[self.label_field])
        # turns = [u.text for u in conv.iter_utterances() if u.text.strip() and (not u.meta.get('is_section_header'))]
        turns = [
            u.meta.get('parsed') for u in conv.iter_utterances()
            if not u.meta.get('is_section_header')
        ]
        # keep only the last max_turns turns; when forecasting, drop the final turn
        end = len(turns) - 1 if self.forecast else None
        turns = turns[-self.max_turns:end]
        if turns and all(turns):
            inst = self.text_to_instance(turns, label)
            if inst:
                yield inst
def load_conversations(corpus_name, max_samples, eval_percent=0.1):
    logging.info('Loading data.')

    def split_data(inputs, outputs, eval_percent):
        eval_index = int(len(inputs) * (1 - eval_percent))
        return (inputs[:eval_index], outputs[:eval_index],
                inputs[eval_index:], outputs[eval_index:])

    corpus = Corpus(filename=download(corpus_name))
    deleted_filter = re.compile(r'^(\[deleted]|\[removed])$')
    inputs, outputs = [], []
    for conversation in corpus.iter_conversations():
        for path in conversation.get_root_to_leaf_paths():
            for i in range(len(path) - 1):
                # skip pairs where the input, its predecessor, or the reply was deleted/removed
                if deleted_filter.match(path[i].text) \
                        or (i > 0 and deleted_filter.match(path[i - 1].text)) \
                        or deleted_filter.match(path[i + 1].text):
                    continue
                inputs.append(path[i].text)
                outputs.append(path[i + 1].text)
                if len(inputs) >= max_samples:
                    return split_data(inputs, outputs, eval_percent)
    logging.info('Done!')
    return split_data(inputs, outputs, eval_percent)
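# Usage sketch (an assumption, not part of the original script): load_conversations
# returns (train_inputs, train_outputs, eval_inputs, eval_outputs), so a caller could
# build input/response pairs from one of ConvoKit's Reddit corpora like this.
# 'reddit-corpus-small' is used here purely as an illustrative corpus name.
if __name__ == '__main__':
    train_in, train_out, eval_in, eval_out = load_conversations(
        'reddit-corpus-small', max_samples=10000, eval_percent=0.1)
    print('{} training pairs, {} eval pairs'.format(len(train_in), len(eval_in)))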
def rank2(self, corpus: Corpus, score=None):
    if score is None:
        score = self.convo_length
    # bucket conversations by their score value
    h = defaultdict(list)
    for convo in corpus.iter_conversations():
        h[score(corpus, convo)].append(convo)
    return h
def rank(self, corpus: Corpus, score=None):
    if score is None:
        score = self.convo_length
    h = []
    # push (score, insertion index, conversation); the index breaks ties so
    # conversations never have to be compared directly
    for convo in corpus.iter_conversations():
        heappush(h, (score(corpus, convo), len(h), convo))
    while len(h) > 0:
        yield heappop(h)
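# Usage sketch (assumes rank/rank2 above are methods of the same ranking helper;
# the class name ConvoRanker is hypothetical, and a ConvoKit corpus is assumed to
# be loaded as `corpus`): rank() yields (score, insertion_index, conversation)
# tuples in ascending score order, while rank2() groups conversations by score.
ranker = ConvoRanker()
for score, _, convo in ranker.rank(corpus):
    print(convo.id, score)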
def add_title_to_root(corpus: Corpus):
    for conversation in corpus.iter_conversations():
        utterance = corpus.get_utterance(conversation.id)
        title = conversation.retrieve_meta('title')
        if title is None:
            title = ''
        if utterance.text is None:
            utterance.text = title
        else:
            utterance.text = title + ' ' + utterance.text
def transform(self, corpus: Corpus) -> Corpus:
    corpus = copy.deepcopy(corpus)
    for convo in corpus.iter_conversations():
        if 'rank' in convo.meta.keys():
            raise Exception(
                "'rank' is already a key in this conversation's meta! aborting")
        # rank each conversation by its total character count
        t = 0
        for utt_id in convo.get_utterance_ids():
            t += len(corpus.get_utterance(utt_id).text)
        convo.meta['rank'] = t
    return corpus
def main() -> None:
    args = parser.parse_args()
    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(CORPUS))
    add_title_to_root(corpus)
    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = CoarseDiscourseDataset(
        corpus, train_conversations, tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset = CoarseDiscourseDataset(
        corpus, val_conversations, tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset.label_encoder = train_dataset.label_encoder

    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size * 4, 1, val_dataset.get_indices_by_len())
    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)
    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name, num_labels=len(train_dataset.label_encoder.classes_))
    model.to(device)
    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, optimizer, scheduler, scaler, device,
              tokenizer.sep_token_id)
        validate(val_loader, model, device, tokenizer.sep_token_id)
def main() -> None:
    args = parser.parse_args()
    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(args.corpus))
    if args.corpus == 'conversations-gone-awry-cmv-corpus':
        DatasetClass = ConversationsGoneAwryDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    elif args.corpus == 'winning-args-corpus':
        corpus = filter_winning_arguments_corpus(corpus)
        DatasetClass = WinningArgumentsDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    else:
        raise ValueError('Corpus {} not currently supported'.format(args.corpus))
    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]
    train_dataset = DatasetClass(corpus, train_conversations, tokenizer,
                                 max_len=args.max_conversation_len,
                                 max_tokenization_len=args.utterance_max)
    val_dataset = DatasetClass(corpus, val_conversations, tokenizer,
                               max_len=args.max_conversation_len,
                               max_tokenization_len=args.utterance_max)

    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, val_dataset.get_indices_by_len())
    train_loader = DataLoader(train_dataset, batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset, batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, num_labels=n_classes)
    model.to(device)
    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, criterion, optimizer, scheduler, scaler, device)
        validate(val_loader, model, criterion, device)
from convokit import Corpus, download
import sys
import random

if __name__ == '__main__':
    corpus_name = sys.argv[1]
    output_filename = sys.argv[2]
    corpus = Corpus(filename=download(corpus_name))
    # character vocabulary; the first four indices are reserved for special tokens
    char_list = ['<sos>', '<eos>', '<pad>', '<unk>']
    sequences = []
    for convo in corpus.iter_conversations():
        title = convo.meta['title']
        # root post text: in the Reddit corpora the root utterance's id equals the conversation id
        text = convo.get_utterance(
            convo.get_chronological_utterance_list()[0].conversation_id).text
        if text == '' or text == '[deleted]' or text == '[removed]':
            continue
        post = title + '\t' + text
        post = post.replace('\n', '').lower()
        # encode each character as its index in the growing vocabulary
        sequence = ''
        for character in post:
            if character in char_list:
                sequence += str(char_list.index(character)) + ' '
            else:
                char_list.append(character)
                sequence += str(len(char_list) - 1) + ' '
        sequences.append(sequence[:-1])
    random.shuffle(sequences)
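    # Possible continuation (an assumption; the original snippet defines
    # output_filename but never writes to it): persist the shuffled sequences,
    # one space-separated index string per line.
    with open(output_filename, 'w') as f:
        f.write('\n'.join(sequences) + '\n')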
import pandas as pd

from convokit import Corpus, download

# This script downloads the following datasets using ConvoKit (https://convokit.cornell.edu/):
# - Stanford Politeness Corpus (Wikipedia)
# - Stanford Politeness Corpus (Stack Exchange)
# This code is based on the following notebook:
# - https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/conversations-gone-awry/Conversations_Gone_Awry_Prediction.ipynb

for dataset_name in ['stack-exchange-politeness-corpus',
                     'wikipedia-politeness-corpus']:
    corpus = Corpus(filename=download(dataset_name))
    kept_conversations = {c.id: c for c in corpus.iter_conversations()}
    kept_utterances = {}
    for convo_id in kept_conversations:
        for utterance in kept_conversations[convo_id].iter_utterances():
            kept_utterances[utterance.id] = utterance
    corpus.conversations = kept_conversations
    corpus.utterances = kept_utterances
    print('{}: {} utterances'.format(dataset_name, len(corpus.utterances)))

    texts = [corpus.utterances[id].text for id in iter(corpus.utterances)]
    labels = [corpus.utterances[id].meta['Binary'] for id in iter(corpus.utterances)]
    df = pd.DataFrame(data={'text': texts, 'label': labels})
    df.to_csv('./{}.csv'.format(dataset_name), index=False)
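# Quick sanity check (an assumption, not in the original script): re-load one of the
# exported CSVs and confirm the text/label columns round-trip through pandas.
check_df = pd.read_csv('./wikipedia-politeness-corpus.csv')
print(check_df['label'].value_counts())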
import warnings
warnings.filterwarnings('ignore')

# In 8
AWRY_ROOT_DIR = BASE_DIR + '/conversations-gone-awry-corpus'
awry_corpus = Corpus(AWRY_ROOT_DIR)
awry_corpus.load_info('utterance', ['parsed'])

# In 9
# first, construct a table of conversations that meet the filter criteria (annotation_year = '2018')
kept_conversations = {
    c.id: c
    for c in awry_corpus.iter_conversations()
    if c.meta['annotation_year'] == "2018"
}
# next, construct a filtered utterance table containing only the utterances in the filtered conversations
kept_utterances = {}
for convo_id in kept_conversations:
    for utterance in kept_conversations[convo_id].iter_utterances():
        kept_utterances[utterance.id] = utterance
# finally, we overwrite the `conversations` and `utterances` fields of the Corpus object to turn it into a filtered Corpus.
awry_corpus.conversations = kept_conversations
awry_corpus.utterances = kept_utterances

# make sure the size is what we expect
print(len(awry_corpus.conversations))
def main() -> None:
    global best_loss
    step = 0
    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if args.start_index is not None or args.end_index is not None:
        start_index = args.start_index
        end_index = args.end_index
        if start_index is None:
            start_index = 0
        if end_index is None:
            corpus = Corpus(filename=download(args.corpus),
                            utterance_start_index=start_index)
        else:
            corpus = Corpus(filename=download(args.corpus),
                            utterance_start_index=start_index,
                            utterance_end_index=end_index)
    else:
        corpus = Corpus(filename=download(args.corpus))
    add_title_to_root(corpus)
    conversations = list(corpus.iter_conversations())

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    dataset = ConversationPathDataset(corpus, tokenizer,
                                      min_len=args.conversation_min,
                                      max_len=args.conversation_max,
                                      n_neighbors=args.num_neighbors,
                                      max_tokenization_len=args.utterance_max)
    sampler = ConversationPathBatchSampler(args.batch_size, dataset.min_len,
                                           dataset.get_indices_by_len())
    loader = DataLoader(dataset, batch_sampler=sampler,
                        collate_fn=conversation_path_collate_fn,
                        pin_memory=device.type != 'cpu', num_workers=4)

    # utterance_encoder = AutoModel.from_pretrained(args.model_name)
    # conversation_encoder = nn.LSTM(utterance_encoder.config.hidden_size, args.hidden, args.num_layers)
    # model = ConversationClassificationHRNN(utterance_encoder, conversation_encoder, 1)
    # mlm_head = AutoModelForMaskedLM.from_pretrained(args.model_name).predictions
    model = AutoModelForMultipleChoice.from_pretrained(args.model_name)
    model.to(device)
    # mlm_head.to(device)

    criterion = nn.CrossEntropyLoss()
    # optimizer = AdamW(list(model.parameters()) + list(mlm_head.parameters()), args.learning_rate)
    optimizer = AdamW(list(model.parameters()), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * args.training_steps,
        num_training_steps=args.training_steps)
    scaler = GradScaler()

    if args.resume_path is not None:
        if os.path.isfile(args.resume_path):
            print("=> loading checkpoint '{}'".format(args.resume_path))
            checkpoint = torch.load(args.resume_path, map_location=device)
            step = checkpoint['step']
            best_loss = checkpoint['best_loss']
            model.bert.load_state_dict(checkpoint['state_dict'])
            # mlm_head.load_state_dict(checkpoint['head_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (step {})".format(
                args.resume_path, checkpoint['step']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume_path))

    while step < args.training_steps:
        loop_steps = args.loop_steps if args.training_steps - step > args.loop_steps \
            else args.training_steps - step
        # loss = train(loader, model, mlm_head, criterion, optimizer, scheduler, scaler,
        #              device, loop_steps, step // args.loop_steps)
        loss = train(loader, model, criterion, optimizer, scheduler, scaler,
                     device, loop_steps, step // args.loop_steps)
        step += loop_steps
        # checkpoint model every k training loops
        k = 2
        if step % (k * args.loop_steps) == 0 or step == args.training_steps:
            is_best = loss < best_loss
            best_loss = min(loss, best_loss)
            run_name = '{}.{}.{}.{}.{}'.format(
                args.model_name.split('/')[-1], args.corpus,
                args.conversation_max, args.num_neighbors, args.utterance_max)
            # save_checkpoint({
            #     'step': step,
            #     'model': args.model_name,
            #     'state_dict': model.state_dict(),
            #     'head_state_dict': mlm_head.state_dict(),
            #     'best_loss': best_loss,
            #     'optimizer': optimizer.state_dict(),
            #     'scheduler': scheduler.state_dict()
            # }, is_best, run_name)
            save_checkpoint(
                {
                    'step': step,
                    'model': args.model_name,
                    'state_dict': model.bert.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, is_best, run_name)
def order(self, corpus: Corpus):
    return sorted(corpus.iter_conversations(),
                  key=lambda convo: convo.meta['rank'])
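# Usage sketch (assumes transform() and order() above belong to the same
# Transformer-style class, called LengthRanker here only for illustration, and
# that a ConvoKit corpus is loaded as `corpus`): transform() writes each
# conversation's total character count into meta['rank'], and order() then
# returns the conversations sorted by that value.
ranker = LengthRanker()
ranked = ranker.transform(corpus)
for convo in ranker.order(ranked):
    print(convo.id, convo.meta['rank'])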