('test', test)]:
        # Body of the per-split loop: pad one split's conversations, pickle the
        # resulting arrays, and (for non-test splits) refresh the vocabulary.
        print(f'Processing {split_type} dataset...')
        # Each split is written to its own sub-directory of datasets_dir.
        # exist_ok=True lets a rerun reuse an existing directory; parents are
        # not created here, so datasets_dir itself must already exist.
        split_data_dir = datasets_dir.joinpath(split_type)
        split_data_dir.mkdir(exist_ok=True)
        # Number of sentences per conversation, capped at max_conv_len.
        conversation_length = [
            min(len(conv), max_conv_len) for conv in conversations
        ]

        # pad_sentences is defined outside this view; presumably it pads and
        # truncates to the two given limits and returns the padded sentences
        # with their lengths -- TODO confirm against its definition.
        sentences, sentence_length = pad_sentences(
            conversations,
            max_sentence_length=max_sent_len,
            max_conversation_length=max_conv_len)

        print('Saving preprocessed data at', split_data_dir)
        to_pickle(conversation_length,
                  split_data_dir.joinpath('conversation_length.pkl'))
        to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
        to_pickle(sentence_length,
                  split_data_dir.joinpath('sentence_length.pkl'))

        # Every split except 'test' contributes to the vocabulary.
        if split_type != 'test':
            print('Save Vocabulary...')
            # `vocab` is created outside this view.
            # NOTE(review): if it is one object shared across splits, counts
            # accumulate from split to split -- confirm this is intended.
            vocab.add_dataframe(conversations)
            vocab.update(max_size=max_vocab_size, min_freq=min_freq)

            print('Vocabulary size: ', len(vocab))
            # The output paths do not depend on split_type, so each non-test
            # split overwrites the files written by the previous one.
            vocab.pickle(datasets_dir.joinpath('word2id.pkl'),
                         datasets_dir.joinpath('id2word.pkl'))

    print('Done!')
Exemple #2
0
        # pad_sentences is defined outside this view; presumably it pads and
        # truncates to the two given limits and returns the padded sentences
        # with their lengths -- TODO confirm against its definition.
        sentences, sentence_length = pad_sentences(
            conversations,
            max_sentence_length=max_sent_len,
            max_conversation_length=max_conv_len)


        # Persist this split's arrays; `emotions` is stored as labels.pkl.
        print('Saving preprocessed data at', split_data_dir)
        to_pickle(conversation_length, split_data_dir.joinpath('conversation_length.pkl'))
        to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
        to_pickle(sentence_length, split_data_dir.joinpath('sentence_length.pkl'))
        to_pickle(emotions, split_data_dir.joinpath('labels.pkl'))

        # The vocabulary is built from the training split only.
        if split_type == 'train':


            print('Save Vocabulary...')
            vocab = Vocab(tokenizer)
            vocab.add_dataframe(conversations)
            # NOTE(review): assert is removed when Python runs with -O, so an
            # empty GLOVE_DIR would then reach vocab.update unchecked; an
            # explicit raise would be more robust.
            assert(GLOVE_DIR != "")
            # GLOVE_DIR is passed positionally; presumably the location of
            # pretrained GloVe vectors -- TODO confirm in Vocab.update.
            vocab.update(GLOVE_DIR, max_size=max_vocab_size, min_freq=min_freq)

            print('Vocabulary size: ', len(vocab))
            # Three output paths are passed; the third is named word_emb.pkl.
            vocab.pickle(dailydialog_dir.joinpath('word2id.pkl'),
                         dailydialog_dir.joinpath('id2word.pkl'),
                         dailydialog_dir.joinpath('word_emb.pkl'))
            

        print('Done!')

Exemple #3
0
        # pad_sentences is defined outside this view; presumably it pads and
        # truncates to the two given limits and returns the padded sentences
        # with their lengths -- TODO confirm against its definition.
        sentences, sentence_length = pad_sentences(
            conv_sentences,
            max_sentence_length=max_sent_len,
            max_conversation_length=max_conv_len)

        # Sanity check: each recorded conversation length must equal the number
        # of labels for that conversation.
        # NOTE(review): zip stops at the shorter sequence, so a mismatch in the
        # number of conversations is not detected; the check also disappears
        # when Python runs with -O.
        for sentence_len, label in zip(conversation_length, conv_labels):
            assert(sentence_len ==len(label))

        
        # Persist this split's arrays plus the video ids that iemocap.vids
        # holds for the split.
        print('Saving preprocessed data at', split_data_dir)
        to_pickle(conversation_length, split_data_dir.joinpath(
            'conversation_length.pkl'))
        to_pickle(sentences, split_data_dir.joinpath('sentences.pkl'))
        to_pickle(conv_labels, split_data_dir.joinpath('labels.pkl'))
        to_pickle(sentence_length, split_data_dir.joinpath(
            'sentence_length.pkl'))
        to_pickle(iemocap.vids[split_type], split_data_dir.joinpath('video_id.pkl'))

        # The vocabulary is built from the training split only.
        if split_type == 'train':

            print('Save Vocabulary...')
            vocab = Vocab(tokenizer)
            vocab.add_dataframe(conv_sentences)

            # NOTE(review): assert is removed when Python runs with -O, so an
            # empty GLOVE_DIR would then reach vocab.update unchecked.
            assert(GLOVE_DIR != "")
            # GLOVE_DIR is passed positionally; presumably the location of
            # pretrained GloVe vectors -- TODO confirm in Vocab.update.
            vocab.update(GLOVE_DIR, max_size=max_vocab_size, min_freq=min_freq)

            print('Vocabulary size: ', len(vocab))
            # Three output paths are passed; the third is named word_emb.pkl.
            vocab.pickle(iemocap_dir.joinpath('word2id.pkl'),
                         iemocap_dir.joinpath('id2word.pkl'),
                         iemocap_dir.joinpath('word_emb.pkl'))