def tag(self, r_x, r_x_raw, idx2tag, idx2char, unk_chars, sub_dict, trans_dict, sess, transducer,
        ensemble=None, batch_size=100, outpath=None, sent_seg=False, seg_large=False, form='conll'):
    """Tag the input in ``r_x`` and either print the result or return it.

    Args:
        r_x: Sequence whose first element holds the character-index rows to
            tag. The rows are rewritten IN PLACE (see below).
        r_x_raw: Raw input, forwarded to ``toolbox.mlp_post`` and
            ``toolbox.printer``.
        idx2tag: Mapping used by ``toolbox.decode_tags``.
        idx2char: Mapping from index to character; inverted here when a
            transducer is supplied.
        unk_chars: Container of indices that are replaced by ``1``
            (presumably the unknown-character index -- confirm with the
            vocabulary builder).
        sub_dict: Mapping from an index to the index that replaces it.
        trans_dict: Forwarded to ``toolbox.generate_output``.
        sess: Sequence of sessions; the whole sequence goes to
            ``self.predict`` and ``sess[-1]`` to the transducer.
        transducer: Optional transducer model; ``None`` disables it.
        ensemble: Forwarded to ``self.predict``.
        batch_size: Nominal batch size, rescaled by the row length.
        outpath: Forwarded to ``toolbox.printer``.
        sent_seg: Forwarded to ``toolbox.printer``.
        seg_large: When true, return the result instead of printing it.
        form: Output format; ``'mlp1'``/``'mlp2'`` trigger
            ``toolbox.mlp_post``.

    Returns:
        ``(prediction_out, multi_out)`` as flat lists when ``seg_large`` is
        true, otherwise ``None`` (the result is handed to
        ``toolbox.printer``).
    """
    # Decode before the indices are rewritten so the original characters are kept.
    chars = toolbox.decode_chars(r_x[0], idx2char)

    # Substitute mapped indices and collapse unknown ones to 1, in place.
    for row in r_x[0]:
        for pos, char_idx in enumerate(row):
            if char_idx in sub_dict:
                row[pos] = sub_dict[char_idx]
            elif char_idx in unk_chars:
                row[pos] = 1

    c_len = len(r_x[0][0])
    idx = self.bucket_dit[c_len]

    # Floor division keeps the batch size an integer under true division
    # (Python 3 or ``from __future__ import division``); for ints on
    # Python 2 the result is the same as the former ``/``.
    real_batch = batch_size * 300 // c_len

    transducer_dict = None
    if transducer is not None:
        char2idx = {char: index for index, char in idx2char.items()}

        def transducer_dict(trans_str):
            return self.define_transducer_dict(trans_str, char2idx, sess[-1], transducer)

    prediction = self.predict(data=r_x, sess=sess, model=self.input_v[idx] + self.output[idx], index=idx,
                              argmax=True, batch_size=real_batch, ensemble=ensemble)
    predictions = toolbox.decode_tags(prediction, idx2tag)

    if self.is_space == 'sea':
        prediction_out, raw_out = toolbox.generate_output_sea(chars, predictions)
        multi_out = prediction_out
    else:
        prediction_out, raw_out, multi_out = toolbox.generate_output(chars, predictions, trans_dict,
                                                                     transducer_dict, multi_tok=True)

    # Flatten one level of nesting. Two separate lists are built even when
    # both names refer to the same object (the 'sea' branch).
    flat_prediction = []
    for pre in prediction_out:
        flat_prediction.extend(pre)
    flat_multi = []
    for mul in multi_out:
        flat_multi.extend(mul)
    prediction_out = flat_prediction
    multi_out = flat_multi

    if form in ('mlp1', 'mlp2'):
        prediction_out = toolbox.mlp_post(r_x_raw, prediction_out, self.is_space, form)

    if seg_large:
        return prediction_out, multi_out

    toolbox.printer(r_x_raw, prediction_out, multi_out, outpath, sent_seg, form)
def test(self, t_x1, t_x2, t_y_raw, t_y_gold, idx2tag, idx2char, unk_chars, sub_dict, trans_dict, sess,
         transducer, ensemble=None, batch_size=100, sent_seg=False, bias=-1, outpath=None,
         trans_type='mix', test_result_path=None):
    """Run the model on test data, score it, and report the scores.

    Args:
        t_x1, t_x2: Two input groups; the first element of each holds the
            index rows. Both are rewritten IN PLACE and then concatenated
            (``t_x1 + t_x2``) for prediction.
        t_y_raw: Raw reference, used by the evaluator only when ``sent_seg``.
        t_y_gold: Gold reference passed to ``evaluation.evaluator``.
        idx2tag: Mapping used by ``toolbox.decode_tags``.
        idx2char: Mapping from index to character.
        unk_chars: Container of indices replaced by ``1`` (presumably the
            unknown-character index -- confirm).
        sub_dict: Mapping from an index to the index that replaces it.
        trans_dict: Forwarded to ``toolbox.generate_output``.
        sess: Sequence of sessions; ``sess[-1]`` is used by the transducer.
        transducer: Optional transducer model; ``None`` disables it.
        ensemble: Forwarded to ``self.predict``.
        batch_size: Forwarded to ``self.predict``.
        sent_seg: When true, sentence-segmentation scores are evaluated and
            reported as well.
        bias: A negative value requests argmax decoding; otherwise the
            prediction is passed through ``toolbox.biased_out`` when
            ``self.crf == 0``.
        outpath: Optional path; ``prediction_out[0]`` is written there, one
            item per line.
        trans_type: Forwarded to ``toolbox.generate_output``.
        test_result_path: Optional path for a textual score report.

    Returns:
        None. Scores are printed and optionally written to file.
    """
    chars = toolbox.decode_chars_new(t_x1[0], idx2char)
    gold_out = t_y_gold

    # Substitute mapped indices and collapse unknown ones to 1, in place,
    # for both input groups.
    for rows in (t_x1[0], t_x2[0]):
        for row in rows:
            for pos, char_idx in enumerate(row):
                if char_idx in sub_dict:
                    row[pos] = sub_dict[char_idx]
                elif char_idx in unk_chars:
                    row[pos] = 1

    transducer_dict = None
    if transducer is not None:
        char2idx = {char: index for index, char in idx2char.items()}

        def transducer_dict(trans_str):
            return self.define_transducer_dict(trans_str, char2idx, sess[-1], transducer)

    argmax = bias < 0

    t_x = t_x1 + t_x2
    prediction = self.predict(data_v=t_x, sess=sess,
                              model=self.input_v1[0] + self.input_v2[0] + self.output[0], index=0,
                              argmax=argmax, batch_size=batch_size, ensemble=ensemble)

    if bias >= 0 and self.crf == 0:
        prediction = [toolbox.biased_out(prediction[0], bias)]

    predictions = toolbox.decode_tags(prediction, idx2tag)

    if self.is_space == 'sea':
        prediction_out, raw_out = toolbox.generate_output_sea(chars, predictions)
    else:
        prediction_out, raw_out = toolbox.generate_output(chars, predictions, trans_dict, transducer_dict,
                                                          trans_type=trans_type)

    if sent_seg:
        scores = evaluation.evaluator(prediction_out, gold_out, raw_out, t_y_raw)
    else:
        scores = evaluation.evaluator(prediction_out, gold_out, verbose=True)

    # ``with`` guarantees the handle is closed even if a write raises.
    if outpath is not None:
        with codecs.open(outpath, 'w', encoding='utf-8') as wt:
            for pre in prediction_out[0]:
                wt.write(pre + '\n')

    if test_result_path is not None:
        with codecs.open(test_result_path, 'w', encoding='utf-8') as wt:
            if sent_seg:
                wt.write('Sentence segmentation:' + '\n')
                wt.write('F score: %f' % scores[5] + '\n')
                wt.write('Precision: %f' % scores[3] + '\n')
                wt.write('Recall: %f\n' % scores[4] + '\n')
                wt.write('Word segmentation:' + '\n')
                wt.write('F score: %f' % scores[2] + '\n')
                wt.write('Precision: %f' % scores[0] + '\n')
                wt.write('Recall: %f\n' % scores[1] + '\n')
            else:
                wt.write('F score: %f' % scores[2] + '\n')
                wt.write('Precision: %f' % scores[0] + '\n')
                wt.write('Recall: %f\n' % scores[1] + '\n')
                wt.write('True negative rate: %f' % scores[3] + '\n')

    # Single-argument ``print(...)`` emits the same text on Python 2 and 3.
    print('Evaluation scores:')
    if sent_seg:
        print('Sentence segmentation:')
        print('F score: %f' % scores[5])
        print('Precision: %f' % scores[3])
        print('Recall: %f\n' % scores[4])
        print('Word segmentation:')
        print('F score: %f' % scores[2])
        print('Precision: %f' % scores[0])
        print('Recall: %f\n' % scores[1])
    else:
        print('Precision: %f' % scores[0])
        print('Recall: %f' % scores[1])
        print('F score: %f' % scores[2])
        print('True negative rate: %f' % scores[3])
def train(self, t_x1, t_x2, t_y, v_x1, v_x2, v_y_raw, v_y_gold, idx2tag, idx2char, unk_chars, trans_dict,
          sess, epochs, trained_model, transducer=None, lr=0.05, decay=0.05, decay_step=1,
          sent_seg=False, outpath=None):
    """Train on two input groups, validating and checkpointing every epoch.

    After each epoch the validation data is predicted and scored; when the
    score beats the best one so far the model is saved to ``trained_model``
    and, if ``outpath`` is given, the validation output is written there.

    Args:
        t_x1, t_x2: Training input groups. The first element of each is
            iterated three levels deep here (group -> row -> index); a
            debugger dump formerly pasted in this method showed
            ``t_x1[0]`` as a list of per-bucket arrays with shapes such as
            (5719, 50), (5473, 100), ... (351, 300). Rewritten IN PLACE.
        t_y: Training targets, concatenated with the inputs before zipping.
        v_x1, v_x2: Validation input groups (two levels deep). Rewritten
            IN PLACE.
        v_y_raw: Raw validation reference, used only when ``sent_seg``.
        v_y_gold: Gold validation reference.
        idx2tag: Mapping used by ``toolbox.decode_tags``.
        idx2char: Mapping from index to character.
        unk_chars: Container of indices replaced by ``1`` (presumably the
            unknown-character index -- confirm).
        trans_dict: Forwarded to ``toolbox.generate_output``.
        sess: Sequence of sessions; ``sess[0]`` trains and saves,
            ``sess[-1]`` serves the transducer.
        epochs: Number of epochs to run.
        trained_model: Checkpoint path handed to ``self.saver.save``.
        transducer: Optional transducer model; ``None`` disables it.
        lr: Initial learning rate.
        decay: Learning-rate decay factor; ``<= 0`` disables decay.
        decay_step: The rate is recomputed on epochs divisible by this.
        sent_seg: When true, the model-selection score is the product of
            ``scores[2]`` and ``scores[5]`` instead of ``scores[2]`` alone.
        outpath: Optional path for the validation output of the best epoch.

    Returns:
        None. Progress and final best scores are printed.
    """
    lr_r = lr
    best_epoch = 0
    best_score = [0] * 6

    # Decode before the indices are rewritten so the original characters are kept.
    chars = toolbox.decode_chars_new(v_x1[0], idx2char)

    # Collapse unknown indices to 1, in place.
    for rows in (v_x1[0], v_x2[0]):
        for row in rows:
            for pos, char_idx in enumerate(row):
                if char_idx in unk_chars:
                    row[pos] = 1

    for groups in (t_x1[0], t_x2[0]):
        for group in groups:
            for row in group:
                for pos, char_idx in enumerate(row):
                    if char_idx in unk_chars:
                        row[pos] = 1

    transducer_dict = None
    if transducer is not None:
        char2idx = {char: index for index, char in idx2char.items()}

        def transducer_dict(trans_str):
            return self.define_transducer_dict(trans_str, char2idx, sess[-1], transducer)

    for epoch in range(epochs):
        # Single-argument ``print(...)`` emits the same text on Python 2 and 3.
        print('epoch: %d' % (epoch + 1))
        t = time()

        if epoch % decay_step == 0 and decay > 0:
            lr_r = lr / (1 + decay * (epoch / decay_step))

        data_list = t_x1 + t_x2 + t_y
        # ``random.shuffle`` needs a mutable sequence; ``zip`` alone is an
        # iterator on Python 3.
        samples = list(zip(*data_list))
        random.shuffle(samples)

        for sample in samples:
            c_len = len(sample[0][0])
            idx = self.bucket_dit[c_len]
            model = self.input_v1[idx] + self.input_v2[idx] + self.output_[idx]
            Batch.train(sess=sess[0], model=model, batch_size_h=self.batch_size_h,
                        batch_size=self.real_batches[idx], config=self.train_step[idx], lr=self.l_rate,
                        lrv=lr_r, dr=self.drop_out, drv=self.drop_out_v, data=list(sample), verbose=False,
                        num_gpus=self.num_gpus)

        # Validation: the model is selected from the length of the first row.
        predictions = []
        c_len = len(v_x1[0][0])
        idx = self.bucket_dit[c_len]
        data_v = v_x1 + v_x2
        b_prediction = self.predict(data_v, sess=sess,
                                    model=self.input_v1[idx] + self.input_v2[idx] + self.output[idx],
                                    index=idx, argmax=True, batch_size=200)
        b_prediction = toolbox.decode_tags(b_prediction, idx2tag)
        predictions.append(b_prediction)

        predictions = list(zip(*predictions))
        predictions = toolbox.merge_bucket(predictions)

        if self.is_space == 'sea':
            prediction_out, raw_out = toolbox.generate_output_sea(chars, predictions)
        else:
            prediction_out, raw_out = toolbox.generate_output(chars, predictions, trans_dict,
                                                              transducer_dict)

        if sent_seg:
            scores = evaluation.evaluator(prediction_out, v_y_gold, raw_out, v_y_raw)
            c_score = scores[2] * scores[5]
            c_best_score = best_score[2] * best_score[5]
        else:
            scores = evaluation.evaluator(prediction_out, v_y_gold)
            c_score = scores[2]
            c_best_score = best_score[2]

        if c_score > c_best_score:
            best_epoch = epoch + 1
            best_score = scores
            self.saver.save(sess[0], trained_model, write_meta_graph=False)

            # NOTE(review): nesting of this dump under the "improved" branch
            # is inferred from the original statement order -- confirm.
            if outpath is not None:
                # ``with`` guarantees the handle is closed even if a write raises.
                with codecs.open(outpath, 'w', encoding='utf-8') as wt:
                    for pre in prediction_out[0]:
                        wt.write(pre + '\n')

        if sent_seg:
            print('Sentence segmentation:')
            print('F score: %f\n' % scores[5])
            print('Word segmentation:')
            print('F score: %f' % scores[2])
        else:
            print('F score: %f' % c_score)
        print('Time consumed: %d seconds' % int(time() - t))

    print('Training is finished!')
    if sent_seg:
        print('Sentence segmentation:')
        print('Best F score: %f' % best_score[5])
        print('Best Precision: %f' % best_score[3])
        print('Best Recall: %f\n' % best_score[4])
        print('Word segmentation:')
        print('Best F score: %f' % best_score[2])
        print('Best Precision: %f' % best_score[0])
        print('Best Recall: %f\n' % best_score[1])
    else:
        print('Best F score: %f' % best_score[2])
        print('Best Precision: %f' % best_score[0])
        print('Best Recall: %f\n' % best_score[1])
    print('Best epoch: %d' % best_epoch)
def train(self, t_x, t_y, v_x, v_y_raw, v_y_gold, idx2tag, idx2char, unk_chars, trans_dict, sess, epochs,
          trained_model, transducer=None, lr=0.05, decay=0.05, decay_step=1, sent_seg=False,
          outpath=None):
    """Train the model, validating and checkpointing every epoch.

    After each epoch the validation data is predicted and scored; when the
    score beats the best one so far the model is saved to ``trained_model``
    and, if ``outpath`` is given, the validation output is written there.

    Args:
        t_x: Training input; its first element is iterated three levels
            deep here (group -> row -> index). Rewritten IN PLACE.
        t_y: Training targets, concatenated with ``t_x`` before zipping.
        v_x: Validation input (two levels deep). Rewritten IN PLACE.
        v_y_raw: Raw validation reference, used only when ``sent_seg``.
        v_y_gold: Gold validation reference.
        idx2tag: Mapping used by ``toolbox.decode_tags``.
        idx2char: Mapping from index to character.
        unk_chars: Container of indices replaced by ``1`` (presumably the
            unknown-character index -- confirm).
        trans_dict: Forwarded to ``toolbox.generate_output``.
        sess: Sequence of sessions; ``sess[0]`` trains and saves,
            ``sess[-1]`` serves the transducer.
        epochs: Number of epochs to run.
        trained_model: Checkpoint path handed to ``self.saver.save``.
        transducer: Optional transducer model; ``None`` disables it.
        lr: Initial learning rate.
        decay: Learning-rate decay factor; ``<= 0`` disables decay.
        decay_step: The rate is recomputed on epochs divisible by this.
        sent_seg: When true, the model-selection score is the product of
            ``scores[2]`` and ``scores[5]`` instead of ``scores[2]`` alone.
        outpath: Optional path for the validation output of the best epoch.

    Returns:
        None. Progress and final best scores are printed.
    """
    lr_r = lr
    best_epoch = 0
    best_score = [0] * 6

    # Decode before the indices are rewritten so the original characters are kept.
    chars = toolbox.decode_chars(v_x[0], idx2char)

    # Collapse unknown indices to 1, in place.
    for row in v_x[0]:
        for pos, char_idx in enumerate(row):
            if char_idx in unk_chars:
                row[pos] = 1

    for group in t_x[0]:
        for row in group:
            for pos, char_idx in enumerate(row):
                if char_idx in unk_chars:
                    row[pos] = 1

    transducer_dict = None
    if transducer is not None:
        char2idx = {char: index for index, char in idx2char.items()}

        def transducer_dict(trans_str):
            return self.define_transducer_dict(trans_str, char2idx, sess[-1], transducer)

    for epoch in range(epochs):
        print('epoch: %d' % (epoch + 1))
        sys.stdout.flush()
        t = time()

        if epoch % decay_step == 0 and decay > 0:
            lr_r = lr / (1 + decay * (epoch / decay_step))

        data_list = t_x + t_y
        samples = list(zip(*data_list))
        random.shuffle(samples)

        for sample in samples:
            c_len = len(sample[0][0])
            idx = self.bucket_dit[c_len]
            real_batch_size = self.real_batches[idx]
            model = self.input_v[idx] + self.output_[idx]
            Batch.train(sess=sess[0], model=model, batch_size=real_batch_size, config=self.train_step[idx],
                        lr=self.l_rate, lrv=lr_r, dr=self.drop_out, drv=self.drop_out_v, data=list(sample),
                        verbose=False)

        # Validation: the model is selected from the length of the first row.
        predictions = []
        c_len = len(v_x[0][0])
        idx = self.bucket_dit[c_len]
        b_prediction = self.predict(data=v_x, sess=sess, model=self.input_v[idx] + self.output[idx],
                                    index=idx, argmax=True, batch_size=200)
        b_prediction = toolbox.decode_tags(b_prediction, idx2tag)
        predictions.append(b_prediction)

        predictions = list(zip(*predictions))
        predictions = toolbox.merge_bucket(predictions)

        if self.is_space == 'sea':
            prediction_out, raw_out = toolbox.generate_output_sea(chars, predictions)
        else:
            prediction_out, raw_out = toolbox.generate_output(chars, predictions, trans_dict,
                                                              transducer_dict)

        if sent_seg:
            scores = evaluation.evaluator(prediction_out, v_y_gold, raw_out, v_y_raw)
            c_score = scores[2] * scores[5]
            c_best_score = best_score[2] * best_score[5]
        else:
            scores = evaluation.evaluator(prediction_out, v_y_gold)
            c_score = scores[2]
            c_best_score = best_score[2]

        if c_score > c_best_score:
            best_epoch = epoch + 1
            best_score = scores
            self.saver.save(sess[0], trained_model, write_meta_graph=True)

            # NOTE(review): nesting of this dump under the "improved" branch
            # is inferred from the original statement order -- confirm.
            if outpath is not None:
                # ``with`` guarantees the handle is closed even if a write raises.
                with codecs.open(outpath, 'w', encoding='utf-8') as wt:
                    for pre in prediction_out[0]:
                        wt.write(pre + '\n')

        if sent_seg:
            print('Sentence segmentation F-score: %f' % scores[5])
            print('Word segmentation F-score: %f' % scores[2])
        else:
            print('F score: %f' % c_score)
        print('Time consumed: %d seconds\n' % int(time() - t))
        sys.stdout.flush()

    print('Training is finished!')
    if sent_seg:
        print('Sentence segmentation:')
        print('Best F score: %f' % best_score[5])
        print('Best Precision: %f' % best_score[3])
        print('Best Recall: %f\n' % best_score[4])
        print('Word segmentation:')
        print('Best F score: %f' % best_score[2])
        print('Best Precision: %f' % best_score[0])
        print('Best Recall: %f\n' % best_score[1])
    else:
        print('Best F score: %f' % best_score[2])
        print('Best Precision: %f' % best_score[0])
        print('Best Recall: %f\n' % best_score[1])
    print('Best epoch: %d' % best_epoch)