コード例 #1
0
'''
# Load the raw VUA train and test splits through the project's data_parser.
raw_train_vua = data_parser.load_raw_train_vua()
raw_test_vua = data_parser.load_raw_test_vua()

# Report how many items each split holds (train first, then test).
print(
    'VUA dataset division: ',
    len(raw_train_vua),
    len(raw_test_vua),
)
"""
2. Data preparation
"""
'''
2.1
Get the vocabulary and GloVe embeddings for the raw dataset
'''
# Vocabulary (a set of words) gathered over both raw VUA splits.
vocab = get_vocab(raw_train_vua + raw_test_vua)

# Two lookup dictionaries; indices 0 and 1 are reserved for <PAD> and <UNK>.
word2idx, idx2word = get_word2idx_idx2word(vocab)

# GloVe embeddings for that vocabulary, left unnormalized
# (per the original note this is an nn.Embedding -- TODO confirm).
# NOTE(review): word2idx, idx2word and glove_embeddings are assigned again
# further down from train_vocab, which replaces the values built here.
glove_embeddings = get_embedding_matrix(
    word2idx, idx2word, normalization=False
)

# ELMo data for the VUA training split, opened read-only.
# NOTE(review): this handle is not closed anywhere in the visible code.
elmos_train_vua = h5py.File('../elmo/VUA_train2.hdf5', mode='r')

# Suffix embeddings: 2 suffix tags, each mapped to a 50-dimensional vector.
suffix_embeddings = nn.Embedding(num_embeddings=2, embedding_dim=50)
'''
2.2
Embed the datasets
'''
# Seed the module-level RNG with a fixed value so that the in-place shuffle
# of the raw training split below is reproducible from run to run.
random.seed(0)
random.shuffle(raw_train_vua)
# """
# Metaphor Identification
# """
# train_sample_ms, train_sam_sen_ms = get_metaphor_feature(train_data)
# test_sample_ms, test_sam_sen_ms = get_metaphor_feature(test_data)

# Names of the train and test splits handed to get_data.
train_dp, test_dp = 'train_an', 'test_an'

# get_data returns four values per split; unpack them under split-specific
# names (data, vocabulary, and the two *_ms feature collections).
(train_data,
 train_vocab,
 train_sample_ms,
 train_sam_sen_ms) = get_data(train_dp)
(test_data,
 test_vocab,
 test_sample_ms,
 test_sam_sen_ms) = get_data(test_dp)
"""
Data Embedding
Options: BERT or GloVe (default: GloVe).
"""
# Build the index maps from the training vocabulary only; the same word2idx
# is reused below when embedding the test split.
word2idx, idx2word = get_word2idx_idx2word(train_vocab)

# GloVe embeddings for the indexed vocabulary, left unnormalized.
glove_embeddings = get_embedding_matrix(
    word2idx, idx2word, normalization=False
)

# Embed each split; embed_sentences returns the embedded text together with
# the labels for that split.
train_embedded_text, train_labels = embed_sentences(
    train_data, word2idx, glove_embeddings
)
test_embedded_text, test_labels = embed_sentences(
    test_data, word2idx, glove_embeddings
)
"""
Produce Dataset & DataLoader
"""
# Wrap each split in a TextDataset. Argument order is the same for both:
# embedded text, sample_ms, sam_sen_ms, labels.
train_dataset = TextDataset(
    train_embedded_text,
    train_sample_ms,
    train_sam_sen_ms,
    train_labels,
)
test_dataset = TextDataset(
    test_embedded_text,
    test_sample_ms,
    test_sam_sen_ms,
    test_labels,
)