Example #1
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
# Set elmos_mohx = None to exclude ELMo vectors; embedding_dim in the later model initialization must then be changed to match.
elmos_mohx = h5py.File('../elmo/MOH-X_cleaned.hdf5', 'r')
'''
2.2 Embed the datasets
'''
# shuffle with a fixed seed so the 10 folds below are reproducible
random.seed(0)
random.shuffle(raw_mohx)

# second argument is the POS sequence, which we don't need
embedded_mohx = [[
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           elmos_mohx, None), example[2], example[1]
] for example in raw_mohx]
'''
2.3 10-fold cross-validation
'''
# separate the embedded sentences, POS sequences, and labels into three lists so they can be passed to TextDataset
sentences = [example[0] for example in embedded_mohx]
poss = [example[1] for example in embedded_mohx]
labels = [example[2] for example in embedded_mohx]
# ten_folds is a list of 10 tuples; each tuple is (list_of_embedded_sentences, list_of_pos_sequences, list_of_labels)
ten_folds = []
fold_size = 647 // 10  # MOH-X contains 647 examples
for i in range(10):
    ten_folds.append((sentences[i * fold_size:(i + 1) * fold_size],
                      poss[i * fold_size:(i + 1) * fold_size],
                      labels[i * fold_size:(i + 1) * fold_size]))
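# A minimal sketch (not part of the original snippet) of how ten_folds could
# drive one round of cross-validation: hold out fold k for validation and
# concatenate the remaining nine folds for training. The TextDataset
# signature (sentences, poss, labels) is assumed from its usage in the other
# examples on this page.
for k in range(10):
    val_sentences, val_poss, val_labels = ten_folds[k]
    train_sentences, train_poss, train_labels = [], [], []
    for j in range(10):
        if j != k:
            s, p, lab = ten_folds[j]
            train_sentences.extend(s)
            train_poss.extend(p)
            train_labels.extend(lab)
    train_dataset = TextDataset(train_sentences, train_poss, train_labels)
    val_dataset = TextDataset(val_sentences, val_poss, val_labels)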
Example #2
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# no suffix embeddings for sequence labeling
suffix_embeddings = None
'''
2.2 Embed the datasets
'''
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_train_vua = [[
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           elmos_train_vua, suffix_embeddings), example[2],
    example[1]
] for example in raw_train_vua]
embedded_val_vua = [[
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           elmos_val_vua, suffix_embeddings), example[2],
    example[1]
] for example in raw_val_vua]
'''
2.3 Set up DataLoader for batching
'''
# Separate the input (embedded_sequence) and labels in the indexed train sets.
# embedded_train_vua: embedded_sentence, pos, labels
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua],
                                [example[2] for example in embedded_train_vua])
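# For orientation only: a plausible minimal version of TextDataset, assuming
# it simply stores three parallel lists and serves (sentence, pos, label)
# triples. The real class in this repository may differ (it likely also
# supplies a collate_fn that pads variable-length sequences for batching).
from torch.utils.data import Dataset

class MinimalTextDataset(Dataset):
    def __init__(self, embedded_sentences, poss, labels):
        assert len(embedded_sentences) == len(poss) == len(labels)
        self.embedded_sentences = embedded_sentences
        self.poss = poss
        self.labels = labels

    def __len__(self):
        return len(self.embedded_sentences)

    def __getitem__(self, idx):
        return self.embedded_sentences[idx], self.poss[idx], self.labels[idx]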
Example #3
# elmo_embeddings
# Set elmos_mohx = None to exclude ELMo vectors; embedding_dim in the later model initialization must then be changed to match.
elmos_mohx = h5py.File('../elmo/MOH-X_cleaned.hdf5', 'r')

bert_mohx = None

suffix_embeddings = None
#suffix_embeddings = nn.Embedding(15, 50)
'''
2.2 Embed the datasets
'''

# second argument is the POS sequence, which we don't need
embedded_mohx = [[
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           elmos_mohx, bert_mohx, suffix_embeddings),
    example[2], example[1]
] for example in raw_mohx]

# 100 runs of 10-fold cross-validation:
# for valid in range(100):
'''
2.3 10-fold cross-validation
'''
# separate the embedded sentences, POS sequences, and labels into three lists so they can be passed to TextDataset
sentences = [example[0] for example in embedded_mohx]
poss = [example[1] for example in embedded_mohx]
labels = [example[2] for example in embedded_mohx]
# ten_folds is a list of 10 tuples; each tuple is (list_of_embedded_sentences, list_of_pos_sequences, list_of_labels)
ten_folds = []
fold_size = 647 // 10  # MOH-X contains 647 examples
Example #4
glove_embeddings = get_embedding_matrix(word2idx,
                                        idx2word,
                                        normalization=False)

if using_GPU:
    # third argument is the CUDA device index; 0 = first GPU
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", 0)
else:
    # -1 runs ELMo on the CPU
    elmo = ElmoEmbedder("./elmo/options.json", "./elmo/weights.hdf5", -1)
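# Quick sanity check (illustrative, not in the original snippet): allennlp's
# ElmoEmbedder.embed_sentence takes a tokenized sentence and, with the
# standard 1024-d ELMo weights, returns a numpy array of shape
# (3, num_tokens, 1024): one vector per token for each of the three layers.
_demo = elmo.embed_sentence(["The", "cat", "sat", "."])
assert _demo.shape == (3, 4, 1024)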

############
# labeling #
############
embedded_test_rcc = []
logging.info("embedd test data with glove and elmo vectors")
for example in tqdm(raw_test_rcc, total=len(raw_test_rcc)):
    embedded_test_rcc.append([
        example[1],
        embed_indexed_sequence(example[0], word2idx, glove_embeddings, elmo)
    ])

# pickle.dump(embedded_test_rcc, open('./labeler_embedd_temp.data', "wb+"), protocol=-1)
# with open('./labeler_embedd_temp.data', "rb+") as infile:
#  embedded_test_rcc = pickle.load(infile)

logging.info("Set up Dataloader")
test_dataset_rcc = RNN_Testset(
    [example[0] for example in embedded_test_rcc],  # pub_id
    [example[1] for example in embedded_test_rcc])  # embedded sentence
# Set up a DataLoader for the test dataset
test_dataloader_rcc = DataLoader(dataset=test_dataset_rcc,
                                 batch_size=args.batch_size,
                                 collate_fn=RNN_Testset.collate_fn)
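# Illustrative only (not from the original snippet): peek at one batch from
# the test DataLoader. The (pub_ids, padded_sequences, lengths) layout is an
# assumption; the real structure is defined by RNN_Testset.collate_fn.
for batch in test_dataloader_rcc:
    pub_ids, padded_sequences, lengths = batch  # assumed batch layout
    logging.info("first batch: %d ids, tensor %s",
                 len(pub_ids), tuple(padded_sequences.shape))
    break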
Example #5
word2idx, idx2word = get_word2idx_idx2word(vocab)
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
#elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
#elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# no suffix embeddings for sequence labeling
suffix_embeddings = None

'''
2.2 Embed the datasets
'''
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_train_vua = [[
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           None, suffix_embeddings),
    example[2], example[1]
] for example in raw_train_vua]
embedded_val_vua = [[
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           None, suffix_embeddings),
    example[2], example[1]
] for example in raw_val_vua]


'''
2.3 Set up DataLoader for batching
'''
# Separate the input (embedded_sequence) and labels in the indexed train sets.
# embedded_train_vua: embedded_sentence, pos, labels
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua],
                                [example[2] for example in embedded_train_vua])
Example #6
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
elmos_train_vua = h5py.File('../elmo/VUA_train.hdf5', 'r')
elmos_val_vua = h5py.File('../elmo/VUA_val.hdf5', 'r')
# pos_embeddings: POS embeddings with dimension 50 (disabled here)
# pos_embeddings = nn.Embedding(len(pos2idx), 50)
pos_embeddings = None

'''
2.2 Embed the datasets
'''
# raw_train_vua: sentence, label_seq, pos_seq
# embedded_train_vua: embedded_sentence, pos, labels
embedded_train_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                              glove_embeddings, elmos_train_vua, pos_embeddings),
                       example[2], example[1]]
                      for example in raw_train_vua]
embedded_val_vua = [[embed_indexed_sequence(example[0], example[2], word2idx,
                                            glove_embeddings, elmos_val_vua, pos_embeddings),
                     example[2], example[1]]
                    for example in raw_val_vua]

'''
2.3 Set up DataLoader for batching
'''
# Separate the input (embedded_sequence) and labels in the indexed train sets.
# embedded_train_vua: embedded_sentence, pos, labels
train_dataset_vua = TextDataset([example[0] for example in embedded_train_vua],
                                [example[1] for example in embedded_train_vua],
                                [example[2] for example in embedded_train_vua])
Example #7
suffix_embeddings = None
# suffix_embeddings = nn.Embedding(15, 50)

'''
2.2 Embed the datasets
'''

# fix every RNG seed so the shuffle and training runs are reproducible
random.seed(0)
random.shuffle(raw_trofi)
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.backends.cudnn.deterministic = True

embedded_trofi = [[
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           elmos_trofi, bert_trofi, suffix_embeddings),
    example[2], example[1]
] for example in raw_trofi]

'''
2.3 10-fold cross-validation
'''
# separate the embedded sentences, POS sequences, and labels into three lists so they can be passed to TextDataset
sentences = [example[0] for example in embedded_trofi]
poss = [example[1] for example in embedded_trofi]
labels = [example[2] for example in embedded_trofi]
# ten_folds is a list of 10 tuples; each tuple is (list_of_embedded_sentences, list_of_pos_sequences, list_of_labels)
ten_folds = []
fold_size = 3737 // 10  # TroFi contains 3,737 examples
for i in range(10):
    ten_folds.append((sentences[i * fold_size:(i + 1) * fold_size],
                      poss[i * fold_size:(i + 1) * fold_size],
                      labels[i * fold_size:(i + 1) * fold_size]))
Example #8
# no suffix embeddings for sequence labeling
suffix_embeddings = None
'''
2.2 Embed the datasets
'''
# random.seed(0)
# random.shuffle(raw_train_vua)

sentence_to_index_train = ast.literal_eval(
    elmos_train_vua['sentence_to_index'][0])
labels = [example[1] for example in raw_train_vua]
poss = [example[2] for example in raw_train_vua]
sentences = [
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           elmos_train_vua, suffix_embeddings,
                           sentence_to_index_train[example[0]])
    for example in raw_train_vua
]
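# Aside (illustrative, based on the sentence_to_index usage above): the HDF5
# file maps each raw sentence string to a dataset key under which that
# sentence's precomputed ELMo vectors are stored. Reading one sentence's
# vectors could look like this; the stored array's shape (e.g.
# (num_tokens, 1024) vs. (3, num_tokens, 1024)) depends on how the file
# was generated.
example_sentence = raw_train_vua[0][0]
elmo_key = sentence_to_index_train[example_sentence]
elmo_vectors = elmos_train_vua[elmo_key][:]
print(elmo_vectors.shape)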


def train_model(train_dataloader_vua, val_dataloader_vua, fold_num):
    optimal_f1s = []
    optimal_ps = []
    optimal_rs = []
    optimal_accs = []
    predictions_all = []

    RNNseq_model = RNNSequenceModel(num_classes=2,
                                    embedding_dim=300 + 1024,  # 300-d GloVe + 1024-d ELMo
                                    hidden_size=300,
Example #9
# glove_embeddings is an nn.Embedding
glove_embeddings = get_embedding_matrix(word2idx, idx2word, normalization=False)
# elmo_embeddings
# Set elmos_trofi = None to exclude ELMo vectors; embedding_dim in the later model initialization must then be changed to match.
#elmos_trofi = h5py.File('../elmo/TroFi3737.hdf5', 'r')


'''
2.2 Embed the datasets
'''
# shuffle with a fixed seed so the 10 folds below are reproducible
random.seed(0)
random.shuffle(raw_trofi)

# second argument is the POS sequence, which we don't need
embedded_trofi = [[
    embed_indexed_sequence(example[0], example[2], word2idx, glove_embeddings,
                           None),
    example[2], example[1]
] for example in raw_trofi]
'''
2.3 10-fold cross-validation
'''
# separate the embedded sentences, POS sequences, and labels into three lists so they can be passed to TextDataset
sentences = [example[0] for example in embedded_trofi]
poss = [example[1] for example in embedded_trofi]
labels = [example[2] for example in embedded_trofi]
# ten_folds is a list of 10 tuples; each tuple is (list_of_embedded_sentences, list_of_pos_sequences, list_of_labels)
ten_folds = []
fold_size = 3737 // 10  # TroFi contains 3,737 examples