def infer(args):
    id2word_dict = reader.load_dict(args.word_dict_path)
    word2id_dict = reader.load_reverse_dict(args.word_dict_path)

    id2label_dict = reader.load_dict(args.label_dict_path)
    label2id_dict = reader.load_reverse_dict(args.label_dict_path)
    q2b_dict = reader.load_dict(args.word_rep_dict_path)
    test_data = paddle.batch(
        reader.test_reader(args.test_data_dir, word2id_dict, label2id_dict,
                           q2b_dict),
        batch_size=args.batch_size)
    place = fluid.CPUPlace()
    exe = fluid.Executor(place)
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(args.model_path, exe)
        for data in test_data():
            full_out_str = ""
            word_idx = to_lodtensor([x[0] for x in data], place)
            word_list = [x[1] for x in data]
            (crf_decode, ) = exe.run(inference_program,
                                     feed={"word": word_idx},
                                     fetch_list=fetch_targets,
                                     return_numpy=False)
            # The CRF decoder returns one flat tag sequence for the whole
            # batch; the LoD offsets mark where each sentence starts and ends.
            lod_info = (crf_decode.lod())[0]
            np_data = np.array(crf_decode)
            assert len(data) == len(lod_info) - 1
            for sen_index in xrange(len(data)):
                assert len(data[sen_index][0]) == lod_info[
                    sen_index + 1] - lod_info[sen_index]
                word_index = 0
                outstr = ""
                cur_full_word = ""
                cur_full_tag = ""
                words = word_list[sen_index]
                for tag_index in xrange(lod_info[sen_index],
                                        lod_info[sen_index + 1]):
                    cur_word = words[word_index]
                    cur_tag = id2label_dict[str(np_data[tag_index][0])]
                    # Tags follow a {type}-B / {type}-I / O scheme: a "-B" or
                    # "O" tag opens a new chunk, so flush the previous one.
                    if cur_tag.endswith("-B") or cur_tag.endswith("O"):
                        if len(cur_full_word) != 0:
                            outstr += cur_full_word.encode(
                                'utf8') + "/" + cur_full_tag.encode(
                                    'utf8') + " "
                        cur_full_word = cur_word
                        cur_full_tag = get_real_tag(cur_tag)
                    else:
                        cur_full_word += cur_word
                    word_index += 1
                # Flush the last chunk of the sentence.
                outstr += cur_full_word.encode(
                    'utf8') + "/" + cur_full_tag.encode('utf8') + " "
                outstr = outstr.strip()
                full_out_str += outstr + "\n"
            print full_out_str.strip()
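
# `infer` relies on a `to_lodtensor` helper that is defined elsewhere in this
# file. The sketch below is the conventional Paddle 1.x implementation,
# assuming each sample is a list of int64 token ids; the repo's exact helper
# may differ in details.
def to_lodtensor(data, place):
    # Record the sequence boundaries as level-of-detail (LoD) offsets, then
    # flatten the batch into a single [total_len, 1] int64 tensor.
    seq_lens = [len(seq) for seq in data]
    cur_len = 0
    lod = [cur_len]
    for seq_len in seq_lens:
        cur_len += seq_len
        lod.append(cur_len)
    flattened_data = np.concatenate(data, axis=0).astype("int64")
    flattened_data = flattened_data.reshape([len(flattened_data), 1])
    res = fluid.LoDTensor()
    res.set(flattened_data, place)
    res.set_lod([lod])
    return res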
def train(args):
    """
    Train the network.
    """
    if not os.path.exists(args.model_save_dir):
        os.mkdir(args.model_save_dir)

    word2id_dict = reader.load_reverse_dict(args.word_dict_path)
    label2id_dict = reader.load_reverse_dict(args.label_dict_path)
    word_rep_dict = reader.load_dict(args.word_rep_dict_path)
    word_dict_len = max(map(int, word2id_dict.values())) + 1
    label_dict_len = max(map(int, label2id_dict.values())) + 1

    avg_cost, crf_decode, word, target = lex_net(args, word_dict_len,
                                                 label_dict_len)
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=args.base_learning_rate)
    sgd_optimizer.minimize(avg_cost)

    (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
     num_correct_chunks) = fluid.layers.chunk_eval(
         input=crf_decode,
         label=target,
         chunk_scheme="IOB",
         num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))
    chunk_evaluator = fluid.metrics.ChunkEvaluator()
    chunk_evaluator.reset()

    # Build one batched, shuffled reader per corpus; each corpus contributes
    # a share of the mini-batch proportional to corpus_proportion_list.
    train_reader_list = []
    corpus_num = len(args.corpus_type_list)
    for i in xrange(corpus_num):
        train_reader = paddle.batch(
            paddle.reader.shuffle(
                reader.file_reader(args.traindata_dir, word2id_dict,
                                   label2id_dict, word_rep_dict,
                                   args.corpus_type_list[i]),
                buf_size=args.traindata_shuffle_buffer),
            batch_size=int(args.batch_size * args.corpus_proportion_list[i]))
        train_reader_list.append(train_reader)
    test_reader = paddle.batch(
        reader.file_reader(args.testdata_dir, word2id_dict, label2id_dict,
                           word_rep_dict),
        batch_size=args.batch_size)
    train_reader_itr_list = []
    for train_reader in train_reader_list:
        cur_reader_itr = train_reader()
        train_reader_itr_list.append(cur_reader_itr)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, target], place=place)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    batch_id = 0
    start_time = time.time()
    eval_list = []
    iter = 0
    while True:
        # Draw one mini-batch from every corpus reader, restarting any
        # reader whose pass has finished, then shuffle the merged batch.
        full_batch = []
        cur_batch = []
        for i in xrange(corpus_num):
            reader_itr = train_reader_itr_list[i]
            try:
                cur_batch = next(reader_itr)
            except StopIteration:
                print(args.corpus_type_list[i] +
                      " corpus finished a pass of training")
                new_reader = train_reader_list[i]
                train_reader_itr_list[i] = new_reader()
                cur_batch = next(train_reader_itr_list[i])
            full_batch += cur_batch
        random.shuffle(full_batch)

        cost_var, nums_infer, nums_label, nums_correct = exe.run(
            fluid.default_main_program(),
            fetch_list=[
                avg_cost, num_infer_chunks, num_label_chunks,
                num_correct_chunks
            ],
            feed=feeder.feed(full_batch))
        print("batch_id:" + str(batch_id) + ", avg_cost:" + str(cost_var[0]))
        chunk_evaluator.update(nums_infer, nums_label, nums_correct)
        batch_id += 1

        if (batch_id % args.save_model_per_batchs == 1):
            save_exe = fluid.Executor(place)
            save_dirname = os.path.join(args.model_save_dir,
                                        "params_batch_%d" % batch_id)
            fluid.io.save_inference_model(save_dirname, ['word'],
                                          [crf_decode], save_exe)
            temp_save_model = os.path.join(args.model_save_dir,
                                           "temp_model_for_test")
            fluid.io.save_inference_model(
                temp_save_model, ['word', 'target'],
                [num_infer_chunks, num_label_chunks, num_correct_chunks],
                save_exe)

            precision, recall, f1_score = chunk_evaluator.eval()
            print("[train] batch_id:" + str(batch_id) + ", precision:" +
                  str(precision) + ", recall:" + str(recall) + ", f1:" +
                  str(f1_score))
            chunk_evaluator.reset()
            p, r, f1 = test(exe, chunk_evaluator, temp_save_model,
                            test_reader, place)
            chunk_evaluator.reset()
            print("[test] batch_id:" + str(batch_id) + ", precision:" +
                  str(p) + ", recall:" + str(r) + ", f1:" + str(f1))
            end_time = time.time()
            print("cur_batch_id:" + str(batch_id) + ", last " +
                  str(args.save_model_per_batchs) + " batches, time_cost:" +
                  str(end_time - start_time))
            start_time = time.time()

            # Early stopping: keep a sliding window of recent test f1 scores
            # and stop once the newer window no longer beats the older one.
            if len(eval_list) < 2 * args.eval_window:
                eval_list.append(f1)
            else:
                eval_list.pop(0)
                eval_list.append(f1)
                last_avg_f1 = sum(
                    eval_list[0:args.eval_window]) / args.eval_window
                cur_avg_f1 = sum(eval_list[args.eval_window:2 *
                                           args.eval_window]) / args.eval_window
                if cur_avg_f1 <= last_avg_f1:
                    return
                else:
                    print "keep training!"
        iter += 1
        if (iter == args.num_iterations):
            return
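
# `train` calls a `test` helper that is defined elsewhere in this file. A
# minimal sketch under the same conventions is shown below: it reloads the
# chunk-counting program saved to `temp_save_model`, runs it over the test
# reader, and folds the counts into the shared ChunkEvaluator. Treat the
# details as assumptions rather than the repo's exact implementation; it
# also reuses the `to_lodtensor` helper sketched earlier.
def test(exe, chunk_evaluator, save_dirname, test_reader, place):
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(save_dirname, exe)
        for data in test_reader():
            word = to_lodtensor([x[0] for x in data], place)
            target = to_lodtensor([x[1] for x in data], place)
            result_list = exe.run(inference_program,
                                  feed={"word": word,
                                        "target": target},
                                  fetch_list=fetch_targets)
            number_infer = np.array(result_list[0])
            number_label = np.array(result_list[1])
            number_correct = np.array(result_list[2])
            chunk_evaluator.update(number_infer[0], number_label[0],
                                   number_correct[0])
    return chunk_evaluator.eval()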
    results = []
    # get out data from output tensor
    output_names = predictor.get_output_names()
    for i, name in enumerate(output_names):
        output_tensor = predictor.get_output_tensor(name)
        output_data = output_tensor.copy_to_cpu()
        results.append(output_data)
    return results


if __name__ == '__main__':
    args = parse_args()
    word2id_dict = reader.load_reverse_dict(args.word_dict_path)
    label2id_dict = reader.load_reverse_dict(args.label_dict_path)
    word_rep_dict = reader.load_dict(args.word_rep_dict_path)
    word_dict_len = max(map(int, word2id_dict.values())) + 1
    label_dict_len = max(map(int, label2id_dict.values())) + 1

    pred = create_predictor(args)

    test_data = paddle.batch(
        reader.file_reader(args.testdata_dir, word2id_dict, label2id_dict,
                           word_rep_dict),
        batch_size=1)
    batch_id = 0
    id2word = {v: k for k, v in word2id_dict.items()}
    id2label = {v: k for k, v in label2id_dict.items()}
    for data in test_data():
        batch_id += 1
        word_data, word_lod = to_lodtensor(list(map(lambda x: x[0], data)))
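
# The fragment above relies on a `create_predictor` helper built on Paddle's
# native (zero-copy) inference API. A plausible sketch is given below; the
# argument name `model_dir` and the CPU-only configuration are assumptions,
# not the repo's exact implementation.
from paddle.fluid.core import AnalysisConfig, create_paddle_predictor


def create_predictor(args):
    config = AnalysisConfig(args.model_dir)  # `model_dir` flag is assumed
    config.disable_gpu()
    # The zero-copy get_input/output_tensor API used above requires the
    # feed/fetch ops to be switched off.
    config.switch_use_feed_fetch_ops(False)
    predictor = create_paddle_predictor(config)
    return predictor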
def train(train_data_path,
          test_data_path,
          src_dict_path,
          trg_dict_path,
          enc_conv_blocks,
          dec_conv_blocks,
          emb_dim=256,
          pos_size=200,
          drop_rate=0.,
          use_bn=False,
          batch_size=32,
          num_passes=15):
    """
    Train the convolution sequence-to-sequence model.

    :param train_data_path: The path of the training set.
    :type train_data_path: str
    :param test_data_path: The path of the test set.
    :type test_data_path: str
    :param src_dict_path: The path of the source dictionary.
    :type src_dict_path: str
    :param trg_dict_path: The path of the target dictionary.
    :type trg_dict_path: str
    :param enc_conv_blocks: The scale list of the encoder's convolution
        blocks. Each element of the list contains the output dimension and
        the context length of the corresponding convolution block.
    :type enc_conv_blocks: list of tuple
    :param dec_conv_blocks: The scale list of the decoder's convolution
        blocks. Each element of the list contains the output dimension and
        the context length of the corresponding convolution block.
    :type dec_conv_blocks: list of tuple
    :param emb_dim: The dimension of the embedding vector.
    :type emb_dim: int
    :param pos_size: The total number of the position indexes, which means
        the maximum value of the index is pos_size - 1.
    :type pos_size: int
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :param use_bn: Whether to use batch normalization. Default is False.
    :type use_bn: bool
    :param batch_size: The size of a mini-batch.
    :type batch_size: int
    :param num_passes: The total number of the passes to train.
    :type num_passes: int
    """
    # load dict
    src_dict = reader.load_dict(src_dict_path)
    trg_dict = reader.load_dict(trg_dict_path)
    src_dict_size = len(src_dict)
    trg_dict_size = len(trg_dict)

    optimizer = paddle.optimizer.Adam(learning_rate=1e-3)

    cost = conv_seq2seq(src_dict_size=src_dict_size,
                        trg_dict_size=trg_dict_size,
                        pos_size=pos_size,
                        emb_dim=emb_dim,
                        enc_conv_blocks=enc_conv_blocks,
                        dec_conv_blocks=dec_conv_blocks,
                        drop_rate=drop_rate,
                        with_bn=use_bn,
                        is_infer=False)

    # create parameters and trainer
    parameters = paddle.parameters.create(cost)
    trainer = paddle.trainer.SGD(cost=cost,
                                 parameters=parameters,
                                 update_equation=optimizer)

    # Each decoder block left-pads its input by context_len - 1, so the
    # reader needs the total amount of padding.
    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
    padding_num = reduce(lambda x, y: x + y, padding_list)
    train_reader, test_reader = create_reader(padding_num=padding_num,
                                              train_data_path=train_data_path,
                                              test_data_path=test_data_path,
                                              src_dict=src_dict,
                                              trg_dict=trg_dict,
                                              pos_size=pos_size,
                                              batch_size=batch_size)

    feeding = {
        'src_word': 0,
        'src_word_pos': 1,
        'trg_word': 2,
        'trg_word_pos': 3,
        'trg_next_word': 4
    }

    # create event handler
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            if event.batch_id % 20 == 0:
                cur_time = time.strftime('%Y.%m.%d %H:%M:%S',
                                         time.localtime())
                print "[%s]: Pass: %d, Batch: %d, TrainCost: %f, %s" % (
                    cur_time, event.pass_id, event.batch_id, event.cost,
                    event.metrics)
                sys.stdout.flush()

        if isinstance(event, paddle.event.EndPass):
            if test_reader is not None:
                cur_time = time.strftime('%Y.%m.%d %H:%M:%S',
                                         time.localtime())
                result = trainer.test(reader=test_reader, feeding=feeding)
                print "[%s]: Pass: %d, TestCost: %f, %s" % (
                    cur_time, event.pass_id, result.cost, result.metrics)
                sys.stdout.flush()
            with gzip.open("output/params.pass-%d.tar.gz" % event.pass_id,
                           'w') as f:
                trainer.save_parameter_to_tar(f)

    if not os.path.exists('output'):
        os.mkdir('output')

    trainer.train(reader=train_reader,
                  event_handler=event_handler,
                  num_passes=num_passes,
                  feeding=feeding)
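
# A hedged usage sketch for `train`. The data and dictionary paths are
# placeholders, and the block configuration is illustrative: each
# (dimension, context_len) tuple describes one convolution block, so
# [(256, 3)] * 5 stacks five width-3 blocks with 256 output channels.
train(train_data_path='data/train',    # placeholder path
      test_data_path='data/test',      # placeholder path
      src_dict_path='data/src_dict',   # placeholder path
      trg_dict_path='data/trg_dict',   # placeholder path
      enc_conv_blocks=[(256, 3)] * 5,  # illustrative encoder stack
      dec_conv_blocks=[(256, 3)] * 3,  # illustrative decoder stack
      emb_dim=256,
      pos_size=200,
      drop_rate=0.1,
      batch_size=32,
      num_passes=15)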
def infer(infer_data_path,
          src_dict_path,
          trg_dict_path,
          model_path,
          enc_conv_blocks,
          dec_conv_blocks,
          emb_dim=256,
          pos_size=200,
          drop_rate=0.,
          use_bn=False,
          max_len=100,
          batch_size=1,
          beam_size=1,
          is_show_attention=False):
    """
    Inference.

    :param infer_data_path: The path of the data for inference.
    :type infer_data_path: str
    :param src_dict_path: The path of the source dictionary.
    :type src_dict_path: str
    :param trg_dict_path: The path of the target dictionary.
    :type trg_dict_path: str
    :param model_path: The path of a trained model.
    :type model_path: str
    :param enc_conv_blocks: The scale list of the encoder's convolution
        blocks. Each element of the list contains the output dimension and
        the context length of the corresponding convolution block.
    :type enc_conv_blocks: list of tuple
    :param dec_conv_blocks: The scale list of the decoder's convolution
        blocks. Each element of the list contains the output dimension and
        the context length of the corresponding convolution block.
    :type dec_conv_blocks: list of tuple
    :param emb_dim: The dimension of the embedding vector.
    :type emb_dim: int
    :param pos_size: The total number of the position indexes, which means
        the maximum value of the index is pos_size - 1.
    :type pos_size: int
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :param use_bn: Whether to use batch normalization. Default is False.
    :type use_bn: bool
    :param max_len: The maximum length of the sentence to be generated.
    :type max_len: int
    :param batch_size: The size of a mini-batch.
    :type batch_size: int
    :param beam_size: The width of beam expansion.
    :type beam_size: int
    :param is_show_attention: Whether to show attention weights. Default is
        False.
    :type is_show_attention: bool
    """
    # load dict
    src_dict = reader.load_dict(src_dict_path)
    trg_dict = reader.load_dict(trg_dict_path)
    src_dict_size = len(src_dict)
    trg_dict_size = len(trg_dict)

    prob, weight = conv_seq2seq(src_dict_size=src_dict_size,
                                trg_dict_size=trg_dict_size,
                                pos_size=pos_size,
                                emb_dim=emb_dim,
                                enc_conv_blocks=enc_conv_blocks,
                                dec_conv_blocks=dec_conv_blocks,
                                drop_rate=drop_rate,
                                with_bn=use_bn,
                                is_infer=True)

    # load parameters
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))

    padding_list = [context_len - 1 for (size, context_len) in dec_conv_blocks]
    padding_num = reduce(lambda x, y: x + y, padding_list)
    infer_reader = reader.data_reader(data_file=infer_data_path,
                                      src_dict=src_dict,
                                      trg_dict=trg_dict,
                                      pos_size=pos_size,
                                      padding_num=padding_num)

    if is_show_attention:
        attention_inferer = paddle.inference.Inference(output_layer=weight,
                                                       parameters=parameters)
        for i, data in enumerate(infer_reader()):
            src_len = len(data[0])
            trg_len = len(data[2])
            attention_weight = attention_inferer.infer([data],
                                                       field='value',
                                                       flatten_result=False)
            # Reshape the flat weights into a (trg_len, src_len) matrix so
            # each row gives the source attention of one target position.
            attention_weight = [
                weight.reshape((trg_len, src_len))
                for weight in attention_weight
            ]
            print attention_weight
            break
        return

    infer_data = []
    for i, raw_data in enumerate(infer_reader()):
        infer_data.append([raw_data[0], raw_data[1]])

    inferer = paddle.inference.Inference(output_layer=prob,
                                         parameters=parameters)
    searcher = BeamSearch(inferer=inferer,
                          trg_dict=trg_dict,
                          pos_size=pos_size,
                          padding_num=padding_num,
                          max_len=max_len,
                          batch_size=batch_size,
                          beam_size=beam_size)
    searcher.search(infer_data)
    return
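
# A matching inference sketch, assuming a model checkpoint produced by the
# `train` sketch above and the same block configuration used for training.
# All paths are placeholders and the beam width is only illustrative.
infer(infer_data_path='data/infer',                # placeholder path
      src_dict_path='data/src_dict',               # placeholder path
      trg_dict_path='data/trg_dict',               # placeholder path
      model_path='output/params.pass-14.tar.gz',   # as saved by train()
      enc_conv_blocks=[(256, 3)] * 5,              # must match training
      dec_conv_blocks=[(256, 3)] * 3,              # must match training
      emb_dim=256,
      pos_size=200,
      max_len=100,
      beam_size=5,                                 # wider beam, better search
      is_show_attention=False)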