def main(): """ here is the plan: for each dialogue create a history sequence of sentences seperated by <s>. The sentences in the history must occur in a short time span from another so they are relevant. The last sentence becomes the response where the response must also be in the span :return: """ parser = argparse.ArgumentParser() parser.add_argument( "-dataset_dir", default="./datasets/personachat/raw", type=str, required=False, help="The input data dir. Should contain the xml for the task.") parser.add_argument("-output_dir", default="./datasets/personachat/", type=str, required=False, help="The output data dir.") parser.add_argument("-type", default="none_original", type=str, required=False, help="The genres you would like to use.") parser.add_argument("-max_sentence_tokens", default=30, type=int, help="the maximum amout of sentence tokens") parser.add_argument( "-a_nice_note", default="only dialogues 1-10", type=str, required=False, help="leave a nice lil note for yourself in the future") parser.add_argument( '-train_split', default=0.9, type=float, help= 'fraction of dataset to use for training, remainder is halved for val & test' ) parser.add_argument('-vocab_size', default=20000, type=int, help='maximum size of the vocabulary for training') args = parser.parse_args() filename = os.path.join(args.dataset_dir, "train_{}.txt".format(args.type)) conversations = create_dialogues(filename, args.max_sentence_tokens) for conversation in conversations: for utterance in conversation: if len(utterance) != args.max_sentence_tokens: print('Length of utterance not equal max: %s' % len(utterance)) exit() print(conversations[0]) # shuffle dataset random.seed('seed') random.shuffle(conversations) print('Number of conversations: %s' % len(conversations)) mean_n_convos = sum([len(conv) for conv in conversations]) / len(conversations) print('Average utterances per conversations: %s' % mean_n_convos) # this is format needed to train dialogue models on this domain def format_for_dialogue(conversations): conversation_length = [len(conv) for conv in conversations] sentence_length = [[ sum([1 for token in sent if token != '<pad>']) for sent in conv ] for conv in conversations] sentences = conversations return conversation_length, sentence_length, sentences val_idx = int(len(conversations) * args.train_split) test_idx = (len(conversations) + val_idx) // 2 print(val_idx) train_convos = conversations[:val_idx] val_convos = conversations[val_idx:test_idx] test_convos = conversations[test_idx:] # construct vocab vocab = Vocab() vocab.add_dataframe(train_convos, tokenized=True) vocab.update(args.vocab_size) print('Vocab size: %s' % len(vocab)) word2id_path = os.path.join(args.output_dir, 'word2id.pkl') id2word_path = os.path.join(args.output_dir, 'id2word.pkl') vocab.pickle(word2id_path, id2word_path) print('Split: train %s, val %s, test %s' % (len(train_convos), len(val_convos), len(test_convos))) os.makedirs(args.output_dir, exist_ok=True) train_convo_len, train_sent_len, train_sent = format_for_dialogue( train_convos) print('Example data') print(train_convo_len[0]) print(train_sent_len[0]) print(train_sent[0]) print() os.makedirs(os.path.join(args.output_dir, 'train'), exist_ok=True) pickle.dump( train_convo_len, open(os.path.join(args.output_dir, 'train', 'conversation_length.pkl'), 'wb')) pickle.dump( train_sent_len, open(os.path.join(args.output_dir, 'train', 'sentence_length.pkl'), 'wb')) pickle.dump( train_sent, open(os.path.join(args.output_dir, 'train', 'sentences.pkl'), 'wb')) val_convo_len, 
val_sent_len, val_sent = format_for_dialogue(val_convos) os.makedirs(os.path.join(args.output_dir, 'valid'), exist_ok=True) pickle.dump( val_convo_len, open(os.path.join(args.output_dir, 'valid', 'conversation_length.pkl'), 'wb')) pickle.dump( val_sent_len, open(os.path.join(args.output_dir, 'valid', 'sentence_length.pkl'), 'wb')) pickle.dump( val_sent, open(os.path.join(args.output_dir, 'valid', 'sentences.pkl'), 'wb')) test_convo_len, test_sent_len, test_sent = format_for_dialogue(test_convos) os.makedirs(os.path.join(args.output_dir, 'test'), exist_ok=True) pickle.dump( test_convo_len, open(os.path.join(args.output_dir, 'test', 'conversation_length.pkl'), 'wb')) pickle.dump( test_sent_len, open(os.path.join(args.output_dir, 'test', 'sentence_length.pkl'), 'wb')) pickle.dump( test_sent, open(os.path.join(args.output_dir, 'test', 'sentences.pkl'), 'wb'))
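

# Hypothetical usage sketch (not part of the original script): shows how the
# per-split pickles written by main() can be read back. It relies on the same
# `os` and `pickle` imports as main(); the helper name `_load_split` and the
# example path below are illustrative only.
def _load_split(split_dir):
    """Load the three pickles produced for one split ('train', 'valid', 'test')."""
    with open(os.path.join(split_dir, 'sentences.pkl'), 'rb') as f:
        sentences = pickle.load(f)  # [conversation][utterance][token]
    with open(os.path.join(split_dir, 'conversation_length.pkl'), 'rb') as f:
        conversation_length = pickle.load(f)  # utterances per conversation
    with open(os.path.join(split_dir, 'sentence_length.pkl'), 'rb') as f:
        sentence_length = pickle.load(f)  # non-<pad> tokens per utterance
    return sentences, conversation_length, sentence_length

# Example: sentences, conv_len, sent_len = _load_split('./datasets/personachat/train')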
    # [n_conversations, conversation_length (various)]
    conversation_length = [
        min(len(conversation), max_conv_len) for conversation in conversations
    ]

    sentences, sentence_length = preprocess_utils.pad_sentences(
        conversations,
        max_sentence_length=max_sent_len,
        max_conversation_length=max_conv_len)

    print('Saving preprocessed data at', split_data_dir)
    to_pickle(conversation_length,
              split_data_dir.joinpath('conversation_length.pkl'))
    to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
    to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))

    if split_type == 'train':
        print('Save Vocabulary...')
        vocab = Vocab(tokenizer)
        vocab.add_dataframe(conversations)
        vocab.update(max_size=max_vocab_size, min_freq=min_freq)

        print('Vocabulary size: ', len(vocab))
        vocab.pickle(ubuntu_dir.joinpath('word2id.pkl'),
                     ubuntu_dir.joinpath('id2word.pkl'))

    print('Done!')
        return tokenize_conversation(conv)

    with Pool(n_workers) as pool:
        conversations = list(
            tqdm(pool.imap(_tokenize_conversation, conv_objects),
                 total=len(conv_objects)))

    conversation_length = [min(len(conv), max_conv_len) for conv in conv_objects]

    sentences, sentence_length = preprocess_utils.pad_sentences(
        conversations,
        max_sentence_length=max_sent_len,
        max_conversation_length=max_conv_len)

    print('Saving preprocessed data at', split_data_dir)
    to_pickle(conversation_length,
              split_data_dir.joinpath('conversation_length.pkl'))
    to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
    to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))

    if split_type == 'train':
        print('Save Vocabulary...')
        vocab = Vocab(tokenizer)
        vocab.add_dataframe(conversations)
        vocab.update(max_size=max_vocab_size, min_freq=min_freq)

        print('Vocabulary size: ', len(vocab))
        vocab.pickle(data_dir.joinpath('word2id.pkl'),
                     data_dir.joinpath('id2word.pkl'))

    print('Done!')
                 total=len(conv_objects)))

    conversation_length = [
        min(len(conv['lines']), max_conv_len) for conv in conv_objects
    ]

    sentences, sentence_length = pad_sentences(
        conversations,
        max_sentence_length=max_sent_len,
        max_conversation_length=max_conv_len)

    print('Saving preprocessed data at', split_data_dir)
    to_pickle(conversation_length,
              split_data_dir.joinpath('conversation_length.pkl'))
    to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
    to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))

    if split_type == 'train':
        print('Save Vocabulary...')
        vocab = Vocab(tokenizer)
        vocab.add_dataframe(conversations)
        vocab.update(max_size=max_vocab_size, min_freq=min_freq)

        print('Vocabulary size: ', len(vocab))
        vocab.pickle(cornell_dir.joinpath('word2id.pkl'),
                     cornell_dir.joinpath('id2word.pkl'))

    print('Done!')
    with Pool(n_workers) as pool:
        conversations = list(
            tqdm(pool.imap(_tokenize_conversation, conv_objects),
                 total=len(conv_objects)))

    conversation_length = [
        min(len(conv['content']), max_conv_len) for conv in conv_objects
    ]

    sentences, sentence_length = pad_sentences(
        conversations,
        max_sentence_length=max_sent_len,
        max_conversation_length=max_conv_len)

    print('max conversation turns:', max(conversation_length))
    print('max_sentence_length:', max(flat(sentence_length)))

    print('Saving preprocessed data at', split_data_dir)
    to_pickle(conversation_length,
              split_data_dir.joinpath('conversation_length.pkl'))
    to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
    to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))

    if split_type == 'train':
        print('Save Vocabulary...')
        vocab = Vocab(tokenizer)
        vocab.add_dataframe(conversations)
        vocab.update(max_size=max_vocab_size, min_freq=min_freq)

        print('Vocabulary size: ', len(vocab))
        vocab.pickle(topical_conv_dir.joinpath('word2id.pkl'),
                     topical_conv_dir.joinpath('id2word.pkl'))

    print('Done!')
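
    # Assumption: `flat` (used above to report the longest sentence) is a
    # repo-level helper that flattens one level of nesting; a minimal
    # hypothetical equivalent would be:
    #
    #     def flat(nested):
    #         # [[1, 2], [3]] -> [1, 2, 3]
    #         return [item for sublist in nested for item in sublist]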