def train(model, opt):
    for i in range(epoch):
        print("epoch {}".format(i + 1))
        start = time.time()
        total_loss = 0
        gen1 = gens.word_list(train_txt)
        gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
        batchs = [b for b in gen2]
        bl = list(range(len(batchs)))
        random.shuffle(bl)
        for n, j in enumerate(bl):
            # Skip sentences that are too short for the context window.
            if window_size > len(batchs[j][0]) - 4:
                continue
            tag0 = batchs[j][:]
            tags = [[int(c) for c in a[:-1]] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            batch = [convert_word(b) for b in batch]
            tags = xp.array(tags, dtype=xp.int32)
            model.zerograds()  # clear gradients accumulated by the previous batch
            accum_loss = forward(*batch, *tags, model)
            accum_loss.backward()
            opt.update()
            total_loss += accum_loss.data
        print('total_loss: {}'.format(total_loss))
        evaluate(model)
        serializers.save_npz("{}{}".format(ssweM, i), model)
        print("time: {}".format(time.time() - start))
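# fill_batch is used throughout this file but not defined here. A minimal
# sketch, assuming it pads every sequence in a batch to the length of the
# longest one; the default filler token is an assumption (the tag calls
# below pass token=-1 or token=0 explicitly):
def fill_batch(batch, token='</s>'):
    max_len = max(len(seq) for seq in batch)
    return [seq + [token] * (max_len - len(seq)) for seq in batch]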
def evaluate(model, word2id):
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    m = model.copy()
    m.volatile = True
    gen1 = gens.word_list(test_txt)
    gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
    batchs = [b for b in gen2]
    for batch in batchs:
        tag0 = batch[:]
        tags = [a[0] for a in tag0]
        batch = [b[1:] for b in batch]
        batch = fill_batch([b[-1].split() for b in batch])
        pres = forward(batch, tags, m, word2id, mode=False)
        a, b, c, d = precision_recall_f(pres, tags)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    precision = correct_p / c_p
    recall = correct_r / c_r
    f_measure = (2 * precision * recall) / (precision + recall)
    print('Precision:\t{}'.format(precision))
    print('Recall:\t{}'.format(recall))
    print('F-value\t{}'.format(f_measure))
def evaluate(model, word2id):
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    m = model.copy()
    m.volatile = True
    gen1 = gens.word_list(dev_txt)
    gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
    batchs = [b for b in gen2]
    for batch in batchs:
        tag0 = batch[:]
        tags = [a[:-1] for a in tag0]
        batch = [b[1:] for b in batch]
        batch = fill_batch([b[-1].split() for b in batch])
        tags = fill_batch(tags, token=-1)
        pres, cons = forward(batch, tags, m, word2id, mode=False)
        a, b, c, d = precision_recall_f(pres, tags, cons)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    try:
        precision = correct_p / c_p
        recall = correct_r / c_r
        # F0.5 measure: weights precision more heavily than recall.
        f_measure = (1 + 0.5**2) * precision * recall / (0.5**2 * precision + recall)
    except ZeroDivisionError:
        precision = 'nothing'
        recall = 'nothing'
        f_measure = 'nothing'
    print('Precision:\t{}'.format(precision))
    print('Recall:\t{}'.format(recall))
    print('F-value\t{}'.format(f_measure))
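# precision_recall_f is also external. A rough sketch of the counting it is
# assumed to do, given how its four return values are accumulated above
# (predicted positives, correct among predicted, gold positives, recalled gold).
# It assumes predictions have already been decoded to integer labels and that
# label 0 means "no tag"; both points are assumptions, not the original code:
def precision_recall_f(pres, tags, cons=None):
    pred_pos = correct_pred = gold_pos = recalled = 0
    for i, (pre, tag) in enumerate(zip(pres, tags)):
        for k, gold in enumerate(tag):
            if cons is not None and not cons[i][k]:
                continue  # skip padded positions
            pred = pre[k]
            if pred != 0:
                pred_pos += 1
                if pred == gold:
                    correct_pred += 1
            if gold != 0:
                gold_pos += 1
                if pred == gold:
                    recalled += 1
    return pred_pos, correct_pred, gold_pos, recalled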
def test():
    sta = pickle.load(open(state_model, "rb"))
    word2id = pickle.load(open(vocab_dict, 'rb'))
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    predicts = []
    total_predicts = []
    total_tags = []
    total_batchs = []
    model = BLSTMw2v(sta["vocab_size"], sta["embed_size"], sta["hidden_size"], output_size)
    serializers.load_npz(load_model, model)
    if gpu >= 0:
        model.to_gpu()
    for j, batchs in enumerate(gens.batch(gens.word_list(test_txt), batch_size)):
        tag0 = batchs[:]
        tags = [a[0] for a in tag0]
        batch = fill_batch([b[-1].split() for b in batchs])
        total_batchs.append(batchs)
        pres = forward(batch, tags, model, word2id, mode=False)
        a, b, c, d = precision_recall_f(pres, tags)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    evaluate(model, word2id)
def train():
    id2word = {}
    word2id = {}
    word_freq = collections.defaultdict(lambda: 0)
    id2word[0] = "<unk>"
    word2id["<unk>"] = 0
    id2word[1] = "</s>"
    word2id["</s>"] = 1
    id2word[-1] = "EOS"
    word2id["EOS"] = -1
    word2id, id2word, word_list, word_freq = make_dict(train_txt, word2id, id2word, word_freq)
    word2vec_model = gensim.models.Word2Vec.load_word2vec_format(
        './entity_vector/entity_vector.model.bin', binary=True)
    model = BLSTMw2v(vocab_size, embed_size, hidden_size, output_size)
    model.initialize_embed(word2vec_model, word_list, word2id)
    if gpu >= 0:
        cuda.get_device(gpu).use()
        model.to_gpu()
    opt = O.Adam()
    opt.setup(model)
    gen1 = gens.word_list(train_txt)
    gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
    batchs = [b for b in gen2]
    bl = list(range(len(batchs)))
    random.shuffle(bl)
    for i in range(epoch):
        print("epoch {}".format(i + 1))
        start = time.time()
        total_loss = 0
        for n, j in enumerate(bl):
            tag0 = batchs[j][:]
            tags = [a[0] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            model.zerograds()  # clear gradients from the previous batch
            accum_loss, pres = forward(batch, tags, model, word2id, mode=True)  # compute the loss
            accum_loss.backward()  # backpropagate the error
            opt.update()  # update the parameters
            total_loss += accum_loss.data
        print("total_loss {}".format(total_loss))
        serializers.save_npz("{}{}".format(load_model, i), model)
        evaluate(model, word2id)
        print("time: {}".format(time.time() - start))
    pickle.dump(dict(word2id), open(vocab_dict, 'wb'))
    serializers.save_npz(load_model, model)
    state_d = {}
    state_d["vocab_size"] = vocab_size
    state_d["hidden_size"] = hidden_size
    state_d["embed_size"] = embed_size
    pickle.dump(state_d, open(state_model, "wb"))
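# make_dict is defined elsewhere. A minimal sketch of what it is assumed to do:
# scan the training file, count word frequencies, and extend the given
# word2id / id2word mappings. The line format (tab-separated fields with the
# sentence in the last field) and the id-assignment scheme are assumptions:
def make_dict(path, word2id, id2word, word_freq):
    word_list = []
    with open(path) as f:
        for line in f:
            for w in line.rstrip('\n').split('\t')[-1].split():
                word_freq[w] += 1
                if w not in word2id:
                    new_id = len(word2id)
                    word2id[w] = new_id
                    id2word[new_id] = w
                word_list.append(w)
    return word2id, id2word, word_list, word_freq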
def train():
    id2word = {}
    word2id = {}
    word_freq = collections.defaultdict(lambda: 0)
    id2word[0] = "<unk>"
    word2id["<unk>"] = 0
    id2word[1] = "<s>"
    word2id["<s>"] = 1
    id2word[2] = "</s>"
    word2id["</s>"] = 2
    word2id, id2word, word_list, word_freq = make_dict(train_txt, word2id, id2word, word_freq)
    model = BiLSTM(vocab_size, embed_size, hidden_size, output_size, extra_hidden_size)
    model.initialize_embed('../data/embedding.txt', word2id)
    if torch.cuda.is_available():
        model.cuda()
    opt = optim.Adam(model.parameters(), lr=0.001)
    for i in range(1, epoch + 1):
        print("\nepoch {}".format(i))
        total_loss = 0
        gen1 = gens.word_list(train_txt)
        gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
        batchs = [b for b in gen2]
        bl = list(range(len(batchs)))
        random.shuffle(bl)
        for n, j in enumerate(bl):
            tag0 = batchs[j][:]
            tags = [[int(c) for c in a[:-1]] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            tags = fill_batch(tags, token=0)
            opt.zero_grad()  # clear gradients from the previous batch
            accum_loss, pres, cons = forward(batch, tags, model, word2id, mode=True)
            accum_loss.backward()
            opt.step()
            total_loss += accum_loss.data[0]
        print("total_loss {}".format(total_loss))
        evaluate(model, word2id)
        torch.save(model.state_dict(), "{}{}".format(load_model, i))
    torch.save(model.state_dict(), load_model)
    with open(vocab_dict, mode='wb') as f:
        pickle.dump(word2id, f)
def test():
    sta = pickle.load(open(state_model, "rb"))
    word2id = pickle.load(open(vocab_dict, 'rb'))
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    predicts = []
    total_predicts = []
    total_tags = []
    total_batchs = []
    model = BLSTMw2v(sta["vocab_size"], sta["embed_size"], sta["hidden_size"], output_size)
    serializers.load_npz(load_model, model)
    if gpu >= 0:
        model.to_gpu()
    for batchs in gens.batch(gens.word_list(test_txt), batch_size):
        tag0 = batchs[:]
        tags = [a[:-1] for a in tag0]
        batchs = [b[1:] for b in batchs]
        total_batchs.append(batchs)
        batchs = fill_batch([b[-1].split() for b in batchs])
        tags = fill_batch(tags, token=-1)
        pres, cons = forward(batchs, tags, model, word2id, mode=False)
        for num, a in enumerate(zip(pres, tags)):
            # Decoded label sequence for the non-padded positions (currently unused).
            pre_l = [int(xp.argmax(a[0].data[k]))
                     for k in range(len(a[0].data)) if cons[num][k]]
        a, b, c, d = precision_recall_f(pres, tags, cons)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    precision = correct_p / c_p
    recall = correct_r / c_r
    f_measure = 2 * precision * recall / (precision + recall)
    print('Precision:\t{}'.format(precision))
    print('Recall:\t{}'.format(recall))
    print('F-value\t{}'.format(f_measure))
def test():
    res = []
    word2id = pickle.load(open(vocab_dict, 'rb'))
    model = BiLSTM(vocab_size, embed_size, hidden_size, output_size, extra_hidden_size)
    model.load_state_dict(torch.load(load_model))
    if torch.cuda.is_available():
        model = model.cuda()
    for i in range(1, epoch + 1):
        print("\nepoch {}".format(i))
        total_loss = 0
        gen1 = gens.word_list(test_txt)
        gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
        batchs = [b for b in gen2]
        bl = list(range(len(batchs)))
        random.shuffle(bl)
        for n, j in enumerate(bl):
            tag0 = batchs[j][:]
            tags = [[int(c) for c in a[:-1]] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            tags = fill_batch(tags, token=0)
            accum_loss, pres, cons = forward(batch, tags, model, word2id, mode=True)
            total_loss += accum_loss.data[0]
            pres = np.array(pres, dtype=np.int64).T
            # Write each predicted label sequence next to its input sentence.
            for pre, text in zip(pres, batch):
                pre = [str(p) for p in pre]
                res.append(' '.join(pre) + '\t' + ' '.join(text))
        print("total_loss {}".format(total_loss))
    with open('./save1.txt', 'w') as f:
        f.write('\n'.join(res))
    evaluate(model, word2id)  # use this if you also want the F-measure
def evaluate(model, word2id):
    c_p = 0
    correct_p = 0
    c_r = 0
    correct_r = 0
    gen1 = gens.word_list(test_txt)
    gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
    batchs = [b for b in gen2]
    for batch in batchs:
        tag0 = batch[:]
        tags = [a[:-1] for a in tag0]
        batch = fill_batch([b[-1].split() for b in batch])
        tags = fill_batch(tags, token=-1)
        pres, cons = forward(batch, tags, model, word2id, mode=False)
        a, b, c, d = precision_recall_f(pres, tags, cons)
        c_p += a
        correct_p += b
        c_r += c
        correct_r += d
    try:
        precision = correct_p / c_p
        recall = correct_r / c_r
        f_measure = (1 + 0.5**2) * precision * recall / (0.5**2 * precision + recall)
        print('Precision:\t{}'.format(precision))
        print('Recall:\t{}'.format(recall))
        print('F-value\t{}'.format(f_measure))
    except ZeroDivisionError:
        precision = 0
        recall = 0
        f_measure = 0
        print('Precision:\tnothing')
        print('Recall:\tnothing')
        print('F-value\tnothing')
    return precision, recall, f_measure
def train():
    id2word = {}
    word2id = {}
    word_freq = collections.defaultdict(lambda: 0)
    id2word[0] = "<unk>"
    word2id["<unk>"] = 0
    id2word[1] = "<s>"
    word2id["<s>"] = 1
    id2word[-1] = "</s>"
    word2id["</s>"] = -1
    word2id, id2word, word_list, word_freq = make_dict(train_txt, word2id, id2word, word_freq)
    word2vec_model = load_word2vec_format('embedding.txt')
    model = BLSTMw2v(vocab_size, embed_size, hidden_size, output_size)
    model.initialize_embed(word2vec_model, word_list, word2id, id2word)
    if gpu >= 0:
        cuda.get_device(gpu).use()  # make the specified GPU current
        model.to_gpu()  # copy the model to the GPU
    opt = O.Adam(alpha=0.001)
    opt.setup(model)
    for i in range(epoch):
        print("epoch {}".format(i + 1))
        start = time.time()
        total_loss = 0
        gen1 = gens.word_list(train_txt)
        gen2 = gens.batch(gens.sorted_parallel(gen1, embed_size * batch_size), batch_size)
        batchs = [b for b in gen2]
        bl = list(range(len(batchs)))
        random.shuffle(bl)
        for n, j in enumerate(bl):
            tag0 = batchs[j][:]
            tags = [[int(c) for c in a[:-1]] for a in tag0]
            batch = fill_batch([b[-1].split() for b in batchs[j]])
            tags = fill_batch(tags, token=0)
            model.zerograds()  # clear gradients from the previous batch
            accum_loss, pres, cons = forward(batch, tags, model, word2id, mode=True)
            accum_loss.backward()
            opt.update()
            total_loss += accum_loss.data
        print("total_loss {}".format(total_loss))
        serializers.save_npz("{}{}".format(load_model, i), model)
        evaluate(model, word2id)
        # Dump the learned embedding matrix in word2vec text format.
        with open('embeding.txt', 'w') as ff:
            ff.write('{} {}\n'.format(len(model.x2e.W.data) - 3, embed_size))
            for num in range(2, len(model.x2e.W.data) - 1):
                ff.write('{} {}\n'.format(
                    id2word[num],
                    ' '.join(str(model.x2e.W.data[num][k]) for k in range(embed_size))))
        print("time: {}".format(time.time() - start))
    pickle.dump(dict(word2id), open(vocab_dict, 'wb'))
    serializers.save_npz(load_model, model)
    state_d = {}
    state_d["vocab_size"] = vocab_size
    state_d["hidden_size"] = hidden_size
    state_d["embed_size"] = embed_size
    pickle.dump(state_d, open(state_model, "wb"))