Code example #1
    def _read(self, corpus_split):
        corpus_split = corpus_split.split('_')

        corpus_name = corpus_split[0]
        self.split = corpus_split[1] if len(corpus_split) > 1 else None

        corpus = Corpus(filename=download(corpus_name))
        conversations = corpus.iter_conversations()
        if self.sample:
            conversations = itertools.islice(conversations, self.sample)

        for conv in conversations:
            meta = conv.meta

            if (meta.get('split') != self.split
                    and meta.get('annotation_year', 2018) != 2018):
                continue

            label = str(meta[self.label_field])
            # turns = [u.text for u in conv.iter_utterances() if u.text.strip() and (not u.meta.get('is_section_header'))]
            turns = [
                u.meta.parsed for u in conv.iter_utterances()
                if not u.meta.get('is_section_header')
            ]

            end = len(turns) - 1 if self.forecast else None
            turns = turns[-self.max_turns:end]

            if turns and all(turns):
                inst = self.text_to_instance(turns, label)
                if inst:
                    yield inst
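
The slicing at the end of _read is easy to misread, so here is a small standalone illustration of what the forecast flag does to the turn window (the turn list and max_turns value below are made up for the example):

turns = ['t1', 't2', 't3', 't4', 't5']  # hypothetical turn list
max_turns = 3

# when self.forecast is truthy: end points just before the final turn,
# so the slice keeps only the turns between -max_turns and that cutoff
end = len(turns) - 1
print(turns[-max_turns:end])   # ['t3', 't4']

# when self.forecast is falsy: end is None and the slice keeps the last max_turns turns
print(turns[-max_turns:None])  # ['t3', 't4', 't5']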
Code example #2
File: preprocessing.py Project: naripok/transformer
def load_conversations(corpus_name, max_samples, eval_percent=0.1):
    logging.info('Loading data.')

    def split_data(inputs, outputs, eval_percent):
        eval_index = int(len(inputs) * (1 - eval_percent))
        return (inputs[:eval_index], outputs[:eval_index], inputs[eval_index:],
                outputs[eval_index:])

    corpus = Corpus(filename=download(corpus_name))

    deleted_filter = re.compile(r'^(\[deleted]|\[removed])$')

    inputs, outputs = [], []
    for conversation in corpus.iter_conversations():
        for path in conversation.get_root_to_leaf_paths():
            for i in range(len(path) - 1):

                # skip pairs where this message, its reply, or the preceding
                # message (if any) was deleted or removed
                if deleted_filter.match(path[i].text) \
                or deleted_filter.match(path[i + 1].text) \
                or (i > 0 and deleted_filter.match(path[i - 1].text)):
                    continue

                inputs.append(path[i].text)
                outputs.append(path[i + 1].text)

                if len(inputs) >= max_samples:
                    return split_data(inputs, outputs, eval_percent)

    logging.info('Done!')
    return split_data(inputs, outputs, eval_percent)
Code example #3
File: Rank.py Project: jschluger/first-convokit
 def rank2(self, corpus: Corpus, score=None):
     if score is None:
         score = self.convo_length
     h = defaultdict(list)
     for convo in corpus.iter_conversations():
         h[score(corpus, convo)].append(convo)
     return h
Code example #4
File: Rank.py Project: jschluger/first-convokit
 def rank(self, corpus: Corpus, score=None):
     if score is None:
         score = self.convo_length
     h = []
     for convo in corpus.iter_conversations():
         heappush(h, (score(corpus, convo), len(h), convo))
     while len(h) > 0:
         yield heappop(h)
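
Code examples #3 and #4 are methods of a class that is not shown in full (self.convo_length in particular is defined elsewhere). As a rough, self-contained sketch of the same heap-ordering pattern over corpus.iter_conversations(), something like the following could be used; the corpus name and the score function are placeholders rather than part of the original project:

from heapq import heappop, heappush

from convokit import Corpus, download

def convo_length(corpus, convo):
    # placeholder score: number of utterances in the conversation
    return len(list(convo.iter_utterances()))

corpus = Corpus(filename=download('subreddit-Cornell'))  # placeholder corpus name

heap = []
for convo in corpus.iter_conversations():
    # the insertion index breaks ties so Conversation objects are never compared
    heappush(heap, (convo_length(corpus, convo), len(heap), convo))

while heap:
    score, _, convo = heappop(heap)
    print(score, convo.id)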
Code example #5
File: data.py Project: rlaboulaye/turn-of-phrase
def add_title_to_root(corpus: Corpus):
    for conversation in corpus.iter_conversations():
        utterance = corpus.get_utterance(conversation.id)
        title = conversation.retrieve_meta('title')
        if title is None:
            title = ''
        if utterance.text is None:
            utterance.text = title
        else:
            utterance.text = title + ' ' + utterance.text
Code example #6
File: Rank.py Project: jschluger/first-convokit
 def transform(self, corpus: Corpus) -> Corpus:
     corpus = copy.deepcopy(corpus)
     for convo in corpus.iter_conversations():
         if 'rank' in convo.meta.keys():
             raise Exception(
                 "rank is already a key in this conversation's meta! aborting"
             )
         t = 0
         for id in convo._utterance_ids:
             u = corpus.get_utterance(id)
             t += len(u.text)
         convo.meta['rank'] = t
     return corpus
Code example #7
def main() -> None:

    args = parser.parse_args()

    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(CORPUS))
    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = CoarseDiscourseDataset(
        corpus,
        train_conversations,
        tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset = CoarseDiscourseDataset(
        corpus,
        val_conversations,
        tokenizer,
        max_len=args.max_conversation_len,
        max_tokenization_len=args.utterance_max)
    val_dataset.label_encoder = train_dataset.label_encoder
    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size * 4, 1, val_dataset.get_indices_by_len())
    train_loader = DataLoader(train_dataset,
                              batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset,
                            batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)

    model = AutoModelForTokenClassification.from_pretrained(
        args.model_name, num_labels=len(train_dataset.label_encoder.classes_))
    model.to(device)

    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, optimizer, scheduler, scaler, device,
              tokenizer.sep_token_id)
        validate(val_loader, model, device, tokenizer.sep_token_id)
Code example #8
def main() -> None:

    args = parser.parse_args()

    if args.gpu is None:
        device = torch.device('cpu')
    else:
        device = torch.device('cuda:{}'.format(args.gpu))

    corpus = Corpus(filename=download(args.corpus))

    if args.corpus == 'conversations-gone-awry-cmv-corpus':
        DatasetClass = ConversationsGoneAwryDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    elif args.corpus == 'winning-args-corpus':
        corpus = filter_winning_arguments_corpus(corpus)
        DatasetClass = WinningArgumentsDataset
        n_classes = 1
        criterion = nn.BCEWithLogitsLoss()
    else:
        raise ValueError('Corpus {} not currently supported'.format(
            args.corpus))

    add_title_to_root(corpus)

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)

    conversations = list(corpus.iter_conversations())
    train_ceil = math.ceil(len(conversations) * args.train_split)
    train_conversations = conversations[:train_ceil]
    val_conversations = conversations[train_ceil:]

    train_dataset = DatasetClass(corpus,
                                 train_conversations,
                                 tokenizer,
                                 max_len=args.max_conversation_len,
                                 max_tokenization_len=args.utterance_max)
    val_dataset = DatasetClass(corpus,
                               val_conversations,
                               tokenizer,
                               max_len=args.max_conversation_len,
                               max_tokenization_len=args.utterance_max)
    train_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, train_dataset.get_indices_by_len())
    val_sampler = ConversationPathBatchSampler(
        args.batch_size, 1, val_dataset.get_indices_by_len())
    train_loader = DataLoader(train_dataset,
                              batch_sampler=train_sampler,
                              collate_fn=conversation_path_collate_fn,
                              pin_memory=True)
    val_loader = DataLoader(val_dataset,
                            batch_sampler=val_sampler,
                            collate_fn=conversation_path_collate_fn,
                            pin_memory=True)

    num_training_steps = args.epochs * len(train_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_name, num_labels=n_classes)
    model.to(device)

    if args.pretrain_path is not None:
        checkpoint = torch.load(args.pretrain_path, map_location=device)
        model.bert.load_state_dict(checkpoint['state_dict'])

    optimizer = AdamW(model.parameters(), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * num_training_steps,
        num_training_steps=num_training_steps)
    scaler = GradScaler()

    for epoch in range(args.epochs):
        print('Epoch {}'.format(epoch))
        train(train_loader, model, criterion, optimizer, scheduler, scaler,
              device)
        validate(val_loader, model, criterion, device)
Code example #9
from convokit import Corpus, download
import sys
import random

if __name__ == '__main__':

    corpus_name = sys.argv[1]
    output_filename = sys.argv[2]
    corpus = Corpus(filename=download(corpus_name))

    char_list = ['<sos>', '<eos>', '<pad>', '<unk>']
    sequences = []
    for convo in corpus.iter_conversations():
        title = convo.meta['title']
        text = convo.get_utterance(
            convo.get_chronological_utterance_list()[0].conversation_id).text
        if text == '' or text == '[deleted]' or text == '[removed]':
            continue
        else:
            post = title + '\t' + text
            post = post.replace('\n', '').lower()
            sequence = ''
            for character in post:
                if character in char_list:
                    sequence += str(char_list.index(character)) + ' '
                else:
                    char_list.append(character)
                    sequence += str(len(char_list) - 1) + ' '
            sequences.append(sequence[:-1])

    random.shuffle(sequences)
Code example #10
import pandas as pd
from convokit import Corpus, download

# This script downloads the following datasets using ConvoKit (https://convokit.cornell.edu/)
# - Stanford Politeness Corpus (Wikipedia)
# - Stanford Politeness Corpus (Stack Exchange)
# This code is based on the following notebook:
# - https://github.com/CornellNLP/Cornell-Conversational-Analysis-Toolkit/blob/master/examples/conversations-gone-awry/Conversations_Gone_Awry_Prediction.ipynb
for dataset_name in ['stack-exchange-politeness-corpus', 'wikipedia-politeness-corpus']:
    corpus = Corpus(filename=download(dataset_name))

    kept_conversations = {c.id: c for c in corpus.iter_conversations()}
    kept_utterances = {}
    for convo_id in kept_conversations:
        for utterance in kept_conversations[convo_id].iter_utterances():
            kept_utterances[utterance.id] = utterance

    corpus.conversations = kept_conversations
    corpus.utterances = kept_utterances
    print('{}: {} utterances'.format(dataset_name, len(corpus.utterances)))

    texts = [corpus.utterances[uid].text for uid in corpus.utterances]
    labels = [corpus.utterances[uid].meta['Binary'] for uid in corpus.utterances]
    df = pd.DataFrame(data={
        'text': texts,
        'label': labels
    })
    df.to_csv('./{}.csv'.format(dataset_name), index=False)
Code example #11
import warnings

warnings.filterwarnings('ignore')

# In 8

AWRY_ROOT_DIR = BASE_DIR + '/conversations-gone-awry-corpus'
awry_corpus = Corpus(AWRY_ROOT_DIR)
awry_corpus.load_info('utterance', ['parsed'])

# In 9

# first, construct a table of conversations that meet the filter criteria (annotation_year = '2018')
kept_conversations = {
    c.id: c
    for c in awry_corpus.iter_conversations()
    if c.meta['annotation_year'] == "2018"
}

# next, construct a filtered utterance table containing only the utterances in the filtered conversations
kept_utterances = {}
for convo_id in kept_conversations:
    for utterance in kept_conversations[convo_id].iter_utterances():
        kept_utterances[utterance.id] = utterance

# finally, we overwrite the `conversations` and `utterances` fields of the Corpus object to turn it into a filtered Corpus.
awry_corpus.conversations = kept_conversations
awry_corpus.utterances = kept_utterances

# make sure the size is what we expect
print(len(awry_corpus.conversations))
Code example #12
def main() -> None:
    global best_loss
    step = 0

    args = parser.parse_args()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        cudnn.deterministic = True
        warnings.warn('You have chosen to seed training. '
                      'This will turn on the CUDNN deterministic setting, '
                      'which can slow down your training considerably! '
                      'You may see unexpected behavior when restarting '
                      'from checkpoints.')

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    if args.start_index is not None or args.end_index is not None:
        start_index = args.start_index
        end_index = args.end_index
        if start_index is None:
            start_index = 0
        if end_index is None:
            corpus = Corpus(filename=download(args.corpus),
                            utterance_start_index=start_index)
        else:
            corpus = Corpus(filename=download(args.corpus),
                            utterance_start_index=start_index,
                            utterance_end_index=end_index)
    else:
        corpus = Corpus(filename=download(args.corpus))

    add_title_to_root(corpus)

    conversations = list(corpus.iter_conversations())

    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
    dataset = ConversationPathDataset(corpus,
                                      tokenizer,
                                      min_len=args.conversation_min,
                                      max_len=args.conversation_max,
                                      n_neighbors=args.num_neighbors,
                                      max_tokenization_len=args.utterance_max)
    sampler = ConversationPathBatchSampler(args.batch_size, dataset.min_len,
                                           dataset.get_indices_by_len())
    loader = DataLoader(dataset,
                        batch_sampler=sampler,
                        collate_fn=conversation_path_collate_fn,
                        pin_memory=device.type != 'cpu',
                        num_workers=4)

    # utterance_encoder = AutoModel.from_pretrained(args.model_name)
    # conversation_encoder = nn.LSTM(utterance_encoder.config.hidden_size, args.hidden, args.num_layers)
    # model = ConversationClassificationHRNN(utterance_encoder, conversation_encoder, 1)
    # mlm_head = AutoModelForMaskedLM.from_pretrained(args.model_name).predictions
    model = AutoModelForMultipleChoice.from_pretrained(args.model_name)
    model.to(device)
    # mlm_head.to(device)
    criterion = nn.CrossEntropyLoss()
    # optimizer = AdamW(list(model.parameters()) + list(mlm_head.parameters()), args.learning_rate)
    optimizer = AdamW(list(model.parameters()), args.learning_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=WARMUP_RATIO * args.training_steps,
        num_training_steps=args.training_steps)
    scaler = GradScaler()

    if args.resume_path is not None:
        if os.path.isfile(args.resume_path):
            print("=> loading checkpoint '{}'".format(args.resume_path))
            checkpoint = torch.load(args.resume_path, map_location=device)
            step = checkpoint['step']
            best_loss = checkpoint['best_loss']
            model.bert.load_state_dict(checkpoint['state_dict'])
            # mlm_head.load_state_dict(checkpoint['head_state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            scheduler.load_state_dict(checkpoint['scheduler'])
            print("=> loaded checkpoint '{}' (step {})".format(
                args.resume_path, checkpoint['step']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume_path))

    while step < args.training_steps:
        loop_steps = min(args.loop_steps, args.training_steps - step)
        # loss = train(loader, model, mlm_head, criterion, optimizer, scheduler, scaler,
        #     device, loop_steps, step // args.loop_steps)
        loss = train(loader, model, criterion, optimizer, scheduler, scaler,
                     device, loop_steps, step // args.loop_steps)
        step += loop_steps

        # checkpoint model every k training loops
        k = 2
        if step % (k * args.loop_steps) == 0 or step == args.training_steps:

            is_best = loss < best_loss
            best_loss = min(loss, best_loss)

            run_name = '{}.{}.{}.{}.{}'.format(
                args.model_name.split('/')[-1], args.corpus,
                args.conversation_max, args.num_neighbors, args.utterance_max)

            # save_checkpoint({
            #     'step': step,
            #     'model': args.model_name,
            #     'state_dict': model.state_dict(),
            #     'head_state_dict': mlm_head.state_dict(),
            #     'best_loss': best_loss,
            #     'optimizer': optimizer.state_dict(),
            #     'scheduler': scheduler.state_dict()
            # }, is_best, run_name)
            save_checkpoint(
                {
                    'step': step,
                    'model': args.model_name,
                    'state_dict': model.bert.state_dict(),
                    'best_loss': best_loss,
                    'optimizer': optimizer.state_dict(),
                    'scheduler': scheduler.state_dict()
                }, is_best, run_name)
Code example #13
File: Rank.py Project: jschluger/first-convokit
 def order(self, corpus: Corpus):
     return sorted(list(corpus.iter_conversations()),
                   key=lambda convo: convo.meta['rank'])
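
Code example #13's order method relies on the 'rank' metadata written by the transform in code example #6. As a rough, self-contained sketch of that transform-then-sort pattern (with a placeholder corpus name, and mutating the corpus in place rather than deep-copying it as example #6 does):

from convokit import Corpus, download

corpus = Corpus(filename=download('subreddit-Cornell'))  # placeholder corpus name

for convo in corpus.iter_conversations():
    # total character count over the conversation's utterances, as in code example #6
    convo.meta['rank'] = sum(len(u.text) for u in convo.iter_utterances())

# sort conversations by the newly written metadata, as in code example #13
ordered = sorted(corpus.iter_conversations(), key=lambda convo: convo.meta['rank'])
print([convo.id for convo in ordered[:5]])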