Example #1
# The snippets below are excerpts; helpers such as embed_sequence, get_vocab,
# get_word2idx_idx2word, get_embedding_matrix, and TextDataset come from the
# project's utility module (not shown here). Shared imports:
import ast
import random

import h5py
import torch.nn as nn
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train2.hdf5', 'r')
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
random.seed(0)
random.shuffle(raw_train_vua)

sentence_to_index_train = ast.literal_eval(
    elmos_train_vua['sentence_to_index'][0])
sentences = [
    embed_sequence(example[0], example[1], word2idx, glove_embeddings,
                   elmos_train_vua, suffix_embeddings,
                   sentence_to_index_train[example[0]])
    for example in raw_train_vua
]
labels = [example[2] for example in raw_train_vua]

assert len(sentences) == len(labels)
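# embed_sequence itself is defined elsewhere in the project. A minimal
# sketch of what such a helper plausibly does, assuming it concatenates
# GloVe, ELMo, and suffix embeddings per token (the function name, HDF5
# key layout, and the 1024-d ELMo size are assumptions, not the project's
# actual code):
import torch

def embed_sequence_sketch(sentence, suffix_tags, word2idx, glove_embeddings,
                          elmo_file, suffix_embeddings, elmo_key=None):
    words = sentence.split()
    # look up GloVe rows, falling back to index 1 (<UNK>) for unseen words
    indices = torch.tensor([word2idx.get(w, 1) for w in words])
    parts = [glove_embeddings(indices)]                        # (seq_len, 300)
    if elmo_file is not None:
        # assume the HDF5 file stores one (num_layers, seq_len, 1024) array
        # per sentence; average the layers into a (seq_len, 1024) matrix
        elmo = torch.from_numpy(elmo_file[elmo_key][()]).mean(dim=0)
        parts.append(elmo)
    parts.append(suffix_embeddings(torch.tensor(suffix_tags)))  # (seq_len, 50)
    return torch.cat(parts, dim=1)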
'''
2. 3
set up Dataloader for batching
'''
# 10 folds take up too much RAM; hold out just one fold for validation instead
fold_size = int(len(raw_train_vua) / 10)
embedded_train_vua = [[sentences[i], labels[i]]
                      for i in range(fold_size, len(sentences))]
embedded_val_vua = [[sentences[i], labels[i]] for i in range(fold_size)]
Example #2
vocab = get_vocab(raw_mohX)
# two dictionaries. <PAD>: 0, <UNK>: 1
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# To exclude ELMo: set elmos_mohx=None, and change embedding_dim in the later model initialization.
elmos_mohx = h5py.File('../elmo/MOH-X_cleaned.hdf5', 'r')
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)

'''
2. 2
embed the datasets
'''
embedded_mohX = [[embed_sequence(example[0], example[1], word2idx,
                                 glove_embeddings, elmos_mohx, suffix_embeddings), example[2]]
                 for example in raw_mohX]


'''
2. 3 10-fold cross validation
'''
# separate the embedded_sentences and labels into two lists, in order to pass them into TextDataset as arguments
sentences = [example[0] for example in embedded_mohX]
labels = [example[1] for example in embedded_mohX]
# ten_folds is a list of 10 tuples; each tuple is (list_of_embedded_sentences, list_of_corresponding_labels)
ten_folds = []
for i in range(10):
    ten_folds.append((sentences[i*65:(i+1)*65], labels[i*65:(i+1)*65]))

optimal_f1s = []
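# A sketch of how ten_folds is typically consumed: each pass holds one fold
# out for validation and trains on the other nine (TextDataset usage mirrors
# Example #4; the per-fold training loop itself is assumed, not shown):
for k in range(10):
    val_sentences, val_labels = ten_folds[k]
    train_sentences, train_labels = [], []
    for j in range(10):
        if j != k:
            train_sentences.extend(ten_folds[j][0])
            train_labels.extend(ten_folds[j][1])
    train_dataset = TextDataset(train_sentences, train_labels)
    val_dataset = TextDataset(val_sentences, val_labels)
    # ...train on train_dataset, evaluate on val_dataset, and append the
    # best validation F1 for this fold to optimal_f1s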
Example #3
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
elmos_poetry = None
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
random.seed(0)  # set a seed
random.shuffle(raw_poetry)

embedded_poetry = [[
    embed_sequence(example[0], example[1], word2idx, glove_embeddings,
                   elmos_poetry, suffix_embeddings), example[2]
] for example in raw_poetry]
'''
2. 3
set up Dataloader for batching
'''
'''
2. 3 10-fold cross validation
'''
# separate the embedded_sentences and labels into two lists, in order to pass them into TextDataset as arguments
sentences = [example[0] for example in embedded_poetry]
labels = [example[1] for example in embedded_poetry]
# ten_folds is a list of 10 tuples; each tuple is (list_of_embedded_sentences, list_of_corresponding_labels)
ten_folds = []
fold_size = int(585 / 10)
for i in range(10):
    ten_folds.append((sentences[i*fold_size:(i+1)*fold_size],
                      labels[i*fold_size:(i+1)*fold_size]))
Example #4
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
embedded_train_vua = [[
    embed_sequence(example[0], example[1], word2idx, glove_embeddings,
                   elmos_train_vua, suffix_embeddings), example[2]
] for example in raw_train_vua]
embedded_val_vua = [[
    embed_sequence(example[0], example[1], word2idx, glove_embeddings,
                   elmos_val_vua, suffix_embeddings), example[2]
] for example in raw_val_vua]
'''
2. 3
set up Dataloader for batching
'''
# Separate the inputs (embedded sequences) and labels in the indexed train/val sets.
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua])
val_dataset_vua = TextDataset([example[0] for example in embedded_val_vua],
                              [example[1] for example in embedded_val_vua])
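# The '2. 3' header promises batching; a hedged sketch of the DataLoader
# setup, assuming embedded sentences are variable-length (seq_len, dim)
# tensors and one integer label per example (pad_collate and batch_size=64
# are hypothetical, not the project's actual collate function or setting):
import torch
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    sentences, labels = zip(*batch)
    lengths = torch.tensor([s.shape[0] for s in sentences])
    # zero-pad every sentence in the batch to the longest one
    padded = pad_sequence(sentences, batch_first=True)
    return padded, lengths, torch.tensor(labels)

train_dataloader_vua = DataLoader(train_dataset_vua, batch_size=64,
                                  shuffle=True, collate_fn=pad_collate)
val_dataloader_vua = DataLoader(val_dataset_vua, batch_size=64,
                                shuffle=False, collate_fn=pad_collate)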
Example #5
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
# pass None for the ELMo file below to exclude ELMo vectors
#elmos_trofi = h5py.File('../elmo/TroFi3737.hdf5', 'r')
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)
'''
2. 2
embed the datasets
'''
random.seed(0)
random.shuffle(raw_trofi)

embedded_trofi = [[
    embed_sequence(example[0], example[1], word2idx, glove_embeddings, None,
                   suffix_embeddings), example[2]
] for example in raw_trofi]
'''
2. 3
set up Dataloader for batching
'''
'''
2. 3 10-fold cross validation
'''
# separate the embedded_sentences and labels into two lists, in order to pass them into TextDataset as arguments
sentences = [example[0] for example in embedded_trofi]
labels = [example[1] for example in embedded_trofi]
# ten_folds is a list of 10 tuples; each tuple is (list_of_embedded_sentences, list_of_corresponding_labels)
ten_folds = []
fold_size = int(3737 / 10)
for i in range(10):
    ten_folds.append((sentences[i*fold_size:(i+1)*fold_size],
                      labels[i*fold_size:(i+1)*fold_size]))
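# Example #2's comment notes that excluding ELMo changes embedding_dim in
# the later model initialization. A sketch of that bookkeeping (300-d GloVe,
# 1024-d ELMo, and 50-d suffix vectors are assumed sizes):
use_elmo = False  # this TroFi run passes None in place of the ELMo file
embedding_dim = 300 + 50 + (1024 if use_elmo else 0)
# embedding_dim would then be handed to the model constructor downstream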
Example #6
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# set elmos_trofi=None to exclude ELMo vectors
elmos_trofi = h5py.File('../elmo/TroFi3737.hdf5', 'r')
# suffix_embeddings: there are 2 suffix tags, and the suffix embedding dimension is 50
suffix_embeddings = nn.Embedding(2, 50)

'''
2. 2
embed the datasets
'''
random.seed(0)
random.shuffle(raw_trofi)

embedded_trofi = [[embed_sequence(example[0], example[1], word2idx,
                                  glove_embeddings, elmos_trofi, suffix_embeddings), example[2]]
                  for example in raw_trofi]


'''
2. 3
set up Dataloader for batching
'''
'''
2. 3 10-fold cross validation
'''
# separate the embedded_sentences and labels into two lists, in order to pass them into TextDataset as arguments
sentences = [example[0] for example in embedded_trofi]
labels = [example[1] for example in embedded_trofi]
# ten_folds is a list of 10 tuples; each tuple is (list_of_embedded_sentences, list_of_corresponding_labels)
ten_folds = []