Example #1
0
# Split the (in-place shuffled) data into training and validation subsets,
# with the validation fraction taken from config['val_rate'].
random.shuffle(train_data)

num = len(train_data)
val_num = int(num * config['val_rate'])
split = num - val_num
# Validation takes the tail of the shuffled list; training keeps the head.
val_data = train_data[split:]
train_data = train_data[:split]

# Build the vocabulary from the training split only (validation text must
# not influence the word list).
vocab = Vocab()
for line in train_data:
    # Each line is "<label>\t<space-separated tokens>"; keep only the tokens.
    tokens = line.strip().split('\t')[1].split(' ')
    vocab.add_list(tokens)

word2index, index2word = vocab.get_vocab(
    max_size=config['max_size'],
    min_freq=config['min_freq'],
)

# Words present in word2index but dropped from index2word are out-of-vocabulary.
vocab_size = len(index2word)
oov_size = len(word2index) - len(index2word)

# Persist both vocabulary mappings so later runs can reload them.
for path, mapping in ((word2index_path, word2index),
                      (index2word_path, index2word)):
    with open(path, 'wb') as handle:
        pickle.dump(mapping, handle)

# Build the GloVe embedding matrix for the vocabulary and save it to disk.
glove = load_glove(config['glove_path'], vocab_size, word2index)
np.save(glove_path, glove)