def create_reader(padding_num,
                  train_data_path,
                  test_data_path=None,
                  src_dict=None,
                  trg_dict=None,
                  pos_size=200,
                  batch_size=32):
    """
    Create the batched, shuffled readers for training and (optionally) testing.

    :param padding_num: Forwarded to reader.data_reader as padding_num.
    :param train_data_path: The path of the training data.
    :type train_data_path: str
    :param test_data_path: The path of the test data. When it is empty or
                           None, no test reader is created.
    :type test_data_path: str
    :param src_dict: Forwarded to reader.data_reader as src_dict.
    :param trg_dict: Forwarded to reader.data_reader as trg_dict.
    :param pos_size: Forwarded to reader.data_reader as pos_size.
    :type pos_size: int
    :param batch_size: The number of samples in each batch.
    :type batch_size: int
    :return: A tuple (train_reader, test_reader); test_reader is None when
             test_data_path is not given.
    """

    def _shuffled_batches(data_path):
        # data_reader -> shuffle (buf_size=10240) -> batch, identical for
        # the training and the test file.
        samples = reader.data_reader(
            data_file=data_path,
            src_dict=src_dict,
            trg_dict=trg_dict,
            pos_size=pos_size,
            padding_num=padding_num)
        shuffled = paddle.reader.shuffle(reader=samples, buf_size=10240)
        return paddle.batch(reader=shuffled, batch_size=batch_size)

    train_reader = _shuffled_batches(train_data_path)
    test_reader = _shuffled_batches(test_data_path) if test_data_path else None

    return train_reader, test_reader
def build_reader(data_dir, batch_size):
    """Build the batched training and validation readers for this model.

    Arguments:
        - data_dir:     The path of training data.
        - batch_size:   Batch size for the training task.

    Returns:
        A tuple (train_reader, test_reader, number of training samples).
    """
    train_samples, valid_samples = choose_samples(data_dir)

    # Training samples pass through a shuffle stage before being batched.
    shuffled_train = paddle.reader.shuffle(
        reader.data_reader(train_samples), buf_size=102400)
    train_reader = paddle.batch(shuffled_train, batch_size=batch_size)

    # Validation samples are batched directly, without a shuffle stage.
    valid_source = reader.data_reader(valid_samples, is_train=False)
    test_reader = paddle.batch(valid_source, batch_size=batch_size)

    return train_reader, test_reader, len(train_samples)
def infer(model_path, batch_size, test_data_file, vocab_file, target_file,
          use_gpu):
    """
    Run the inference model stored under model_path over the test data and
    print one line per token: word, gold tag and predicted tag, separated by
    tabs. An empty line is printed after every sentence.

    Returns nothing.
    """
    word_dict = load_dict(vocab_file)
    word_reverse_dict = load_reverse_dict(vocab_file)

    label_dict = load_dict(target_file)
    label_reverse_dict = load_reverse_dict(target_file)

    batched_reader = paddle.batch(
        reader.data_reader(test_data_file, word_dict, label_dict),
        batch_size=batch_size)

    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    # The model is loaded and executed inside its own scope.
    with fluid.scope_guard(fluid.Scope()):
        inference_program, feed_target_names, fetch_targets = \
            fluid.io.load_inference_model(model_path, exe)

        for batch in batched_reader():
            feed = {
                "word": to_lodtensor([sample[0] for sample in batch], place),
                "mark": to_lodtensor([sample[1] for sample in batch], place)
            }
            fetched = exe.run(inference_program,
                              feed=feed,
                              fetch_list=fetch_targets,
                              return_numpy=False)
            decoded = fetched[0]

            # offsets[i]..offsets[i + 1] is the row range of sentence i in
            # the flat decoding result, so there is one more offset than
            # there are sentences.
            offsets = (decoded.lod())[0]
            predicted = np.array(decoded)
            assert len(batch) == len(offsets) - 1

            for sen_index, sample in enumerate(batch):
                begin = offsets[sen_index]
                end = offsets[sen_index + 1]
                assert len(sample[0]) == end - begin

                # word_index walks the sentence while tag_index walks the
                # matching rows of the flat prediction array.
                for word_index, tag_index in enumerate(
                        six.moves.xrange(begin, end)):
                    word = word_reverse_dict[sample[0][word_index]]
                    gold_tag = label_reverse_dict[sample[2][word_index]]
                    tag = label_reverse_dict[predicted[tag_index][0]]
                    print(word + "\t" + gold_tag + "\t" + tag)
                print("")
def infer(model_path,
          data_dir,
          batch_size,
          config,
          use_gpu=False,
          trainer_count=1):
    """ The inferring process.

    Arguments:
        - model_path:    The path of trained model.
        - data_dir:      The directory path of test data.
        - batch_size:    The batch_size.
        - config:        The model configuration.
        - use_gpu:       Whether to run the inferring on GPU.
        - trainer_count: The thread number used in inferring. When
                         use_gpu=True, trainer_count cannot exceed the
                         number of GPU devices in your computer.
    """

    assert os.path.exists(model_path), "The model does not exist."
    paddle.init(use_gpu=use_gpu, trainer_count=trainer_count)

    ids_2_word = load_reverse_dict(config.dict_path)

    outputs = GNR(config, is_infer=True)

    # load the trained models; the archive handle is released as soon as
    # from_tar returns instead of being left open for the whole run.
    with gzip.open(model_path, "r") as param_file:
        parameters = paddle.parameters.Parameters.from_tar(param_file)
    logger.info("loading parameter is done.")

    inferer = paddle.inference.Inference(
        output_layer=outputs, parameters=parameters)

    _, valid_samples = choose_samples(data_dir)
    test_reader = reader.data_reader(valid_samples, is_train=False)

    output_count = len(outputs)
    test_batch = []
    for item in test_reader():
        test_batch.append(item)
        if len(test_batch) == batch_size:
            infer_a_batch(inferer, test_batch, ids_2_word, output_count)
            test_batch = []

    # Flush the samples left over when the data size is not a multiple of
    # batch_size.
    if test_batch:
        infer_a_batch(inferer, test_batch, ids_2_word, output_count)
def infer(model_path, batch_size, test_data_file, vocab_file, target_file):
    """
    Tag the test data with the trained model under model_path and print, for
    every token, the word and its predicted label separated by a tab.

    Arguments:
        - model_path:     The path of the gzip-compressed parameter archive.
        - batch_size:     The number of samples inferred per call.
        - test_data_file: The path of the data to be tagged.
        - vocab_file:     The path of the word dictionary.
        - target_file:    The path of the label dictionary.
    """

    def _infer_a_batch(inferer, test_data, id_2_word, id_2_label):
        probs = inferer.infer(input=test_data, field=["id"])
        assert len(probs) == sum(len(x[0]) for x in test_data)

        # probs is flat over all tokens of the batch (see the assertion
        # above), so the offset must keep growing across samples. It is
        # initialised once, before the loop; resetting it per sample would
        # label every sentence with the predictions of the first one.
        start_id = 0
        for test_sample in test_data:
            words = test_sample[0]
            for w, tag in zip(words, probs[start_id:start_id + len(words)]):
                print("%s\t%s" % (id_2_word[w], id_2_label[tag]))
            print("\n")
            start_id += len(words)

    word_dict = load_dict(vocab_file)
    word_dict_len = len(word_dict)
    word_reverse_dict = load_reverse_dict(vocab_file)

    label_dict = load_dict(target_file)
    label_reverse_dict = load_reverse_dict(target_file)
    label_dict_len = len(label_dict)

    # initialize PaddlePaddle
    paddle.init(use_gpu=False, trainer_count=1)

    # The archive handle is closed as soon as the parameters are loaded.
    with gzip.open(model_path, "r") as param_file:
        parameters = paddle.parameters.Parameters.from_tar(param_file)

    predict = ner_net(
        word_dict_len=word_dict_len,
        label_dict_len=label_dict_len,
        is_train=False)

    inferer = paddle.inference.Inference(
        output_layer=predict, parameters=parameters)

    test_data = []
    for item in reader.data_reader(test_data_file, word_dict, label_dict)():
        test_data.append([item[0], item[1]])
        if len(test_data) == batch_size:
            _infer_a_batch(inferer, test_data, word_reverse_dict,
                           label_reverse_dict)
            test_data = []

    # Only flush a non-empty remainder: when the data size is a multiple of
    # batch_size (or the data is empty) there is nothing left to infer.
    if test_data:
        _infer_a_batch(inferer, test_data, word_reverse_dict,
                       label_reverse_dict)
def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
         model_save_dir, num_passes, use_gpu, parallel):
    """
    Train the NER network with SGD, evaluate it on the test data after every
    pass, save an inference model per pass and record KPI values.

    Arguments:
        - train_data_file: The path of the training data.
        - test_data_file:  The path of the test data.
        - vocab_file:      The path of the word dictionary.
        - target_file:     The path of the label dictionary.
        - emb_file:        The path of the embedding values loaded by
                           get_embedding.
        - model_save_dir:  The directory the per-pass models are saved to;
                           created when it does not exist.
        - num_passes:      The number of training passes.
        - use_gpu:         Run on CUDAPlace(0) when true, on CPU otherwise.
        - parallel:        Forwarded to ner_net.
    """
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    BATCH_SIZE = 200
    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)

    word_vector_values = get_embedding(emb_file)

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)

    avg_cost, feature_out, word, mark, target = ner_net(
        word_dict_len, label_dict_len, parallel)

    sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
    sgd_optimizer.minimize(avg_cost)

    # Decoding uses the parameter named 'crfw'.
    # NOTE(review): presumably the CRF weights created inside ner_net --
    # confirm against its definition.
    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
        input=crf_decode,
        label=target,
        chunk_scheme="IOB",
        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))

    # Build the program used for testing from a clone of the main program,
    # fetching the evaluator's metrics and states.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        test_target = chunk_evaluator.metrics + chunk_evaluator.states
        inference_program = fluid.io.get_inference_program(test_target)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.data_reader(train_data_file, word_dict, label_dict),
            buf_size=20000),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.data_reader(test_data_file, word_dict, label_dict),
            buf_size=20000),
        batch_size=BATCH_SIZE)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
    exe = fluid.Executor(place)

    exe.run(fluid.default_startup_program())

    # Overwrite the 'emb' variable with the loaded embedding values after
    # the startup program has run.
    embedding_name = 'emb'
    embedding_param = fluid.global_scope().find_var(
        embedding_name).get_tensor()
    embedding_param.set(word_vector_values, place)

    batch_id = 0
    # Accumulated training time over all passes; evaluation on the test
    # data and model saving are not included.
    total_time = 0.0
    for pass_id in xrange(num_passes):
        chunk_evaluator.reset(exe)
        start_time = time.time()
        for data in train_reader():
            cost, batch_precision, batch_recall, batch_f1_score = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=[avg_cost] + chunk_evaluator.metrics)
            batch_id = batch_id + 1
        t1 = time.time()
        total_time += t1 - start_time
        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(exe)
        # KPI values are recorded for the last pass only: the training
        # precision of that pass and the mean training time per pass.
        # NOTE(review): train_acc_kpi and pass_duration_kpi are not defined
        # in this function -- presumably module-level KPI trackers.
        if pass_id == num_passes - 1:
            train_acc_kpi.add_record(pass_precision)
            pass_duration_kpi.add_record(total_time / num_passes)
        # Metrics are printed every 100 passes only.
        if pass_id % 100 == 0:
            print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
                  str(pass_precision) + " pass_recall:" + str(pass_recall) +
                  " pass_f1_score:" + str(pass_f1_score))

        # The pass_* names are rebound to the test-set metrics here.
        pass_precision, pass_recall, pass_f1_score = test(
            exe, chunk_evaluator, inference_program, test_reader, place)
        if pass_id % 100 == 0:
            print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
                  str(pass_precision) + " pass_recall:" + str(pass_recall) +
                  " pass_f1_score:" + str(pass_f1_score))

        # One inference model directory is written per pass.
        save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
        fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
                                      [crf_decode], exe)
    train_acc_kpi.persist()
    pass_duration_kpi.persist()
def main(train_data_file, test_data_file, vocab_file, target_file, emb_file,
         model_save_dir, num_passes, use_gpu, parallel):
    """
    Train the NER network either locally or as one role (trainer or
    parameter server) of a distributed job.

    Arguments:
        - train_data_file: The path of the training data.
        - test_data_file:  The path of the test data.
        - vocab_file:      The path of the word dictionary.
        - target_file:     The path of the label dictionary.
        - emb_file:        The path of the embedding values loaded by
                           get_embedding.
        - model_save_dir:  Created when it does not exist (model saving
                           itself is currently commented out).
        - num_passes:      The number of training passes.
        - use_gpu:         Run on CUDAPlace(0) when true, on CPU otherwise.
        - parallel:        Forwarded to ner_net.

    Environment variables read:
        - BATCH_SIZE:             Batch size, defaults to "200".
        - LOCAL:                  "TRUE" selects single-process training.
        - PADDLE_INIT_PSERVERS:   Comma separated pserver IPs.
        - PADDLE_INIT_PORT:       Port shared by all pservers.
        - TRAINERS:               Total trainer count.
        - PADDLE_INIT_TRAINER_ID: Id of this trainer, defaults to "0".
        - POD_IP:                 IP of the current node.
        - TRAINING_ROLE:          "TRAINER" (default) or "PSERVER".
    """
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    BATCH_SIZE = int(os.getenv("BATCH_SIZE", "200"))
    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)

    word_vector_values = get_embedding(emb_file)

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)

    avg_cost, feature_out, word, mark, target = ner_net(
        word_dict_len, label_dict_len, parallel)

    # The learning rate is divided by the batch size.
    sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3 / BATCH_SIZE)
    # The optimize ops and (param, grad) pairs are kept for the transpiler.
    optimize_ops, params_grads = sgd_optimizer.minimize(avg_cost)

    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    chunk_evaluator = fluid.evaluator.ChunkEvaluator(
        input=crf_decode,
        label=target,
        chunk_scheme="IOB",
        num_chunk_types=int(math.ceil((label_dict_len - 1) / 2.0)))

    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        test_target = chunk_evaluator.metrics + chunk_evaluator.states
        inference_program = fluid.io.get_inference_program(test_target)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.data_reader(train_data_file, word_dict, label_dict),
            buf_size=20000),
        batch_size=BATCH_SIZE)
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.data_reader(test_data_file, word_dict, label_dict),
            buf_size=20000),
        batch_size=BATCH_SIZE)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
    exe = fluid.Executor(place)

    # NOTE: the `reader` parameter below shadows the `reader` module inside
    # train_loop only; the enclosing function still sees the module.
    def train_loop(exe, trainer_prog, trainer_id=0, reader=train_reader):
        # Overwrite the 'emb' variable with the loaded embedding values.
        # The startup program must already have been run by the caller.
        embedding_name = 'emb'
        embedding_param = fluid.global_scope().find_var(
            embedding_name).get_tensor()
        embedding_param.set(word_vector_values, place)

        batch_id = 0
        for pass_id in xrange(num_passes):
            chunk_evaluator.reset(exe)
            start_time = time.time()
            # NOTE(review): the profiler is taken to wrap the batch loop
            # only -- confirm the intended scope.
            with profiler.profiler(
                    "CPU", 'total',
                    profile_path="/usr/local/nvidia/lib64/tmp") as prof:
                for data in reader():
                    cost, batch_precision, batch_recall, batch_f1_score = exe.run(
                        trainer_prog,
                        feed=feeder.feed(data),
                        fetch_list=[avg_cost] + chunk_evaluator.metrics)
                    if batch_id % 5 == 0:
                        print("Pass " + str(pass_id) + ", Batch " + str(
                            batch_id) + ", Cost " + str(cost[0]) +
                              ", Precision " + str(batch_precision[0]) +
                              ", Recall " + str(batch_recall[0]) +
                              ", F1_score" + str(batch_f1_score[0]))
                    batch_id = batch_id + 1

            pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval(
                exe)
            spent = time.time() - start_time
            # NOTE(review): 14987.0 is presumably the number of training
            # samples, making "speed" samples per second -- confirm.
            print("pass_id: %d, precision: %f, recall: %f, f1: %f, spent: %f, speed: %f" %
                  (pass_id, pass_precision, pass_recall, pass_f1_score, spent,
                   14987.0 / spent))
            pass_precision, pass_recall, pass_f1_score = test(
                exe, chunk_evaluator, inference_program, test_reader, place)
            print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
                  str(pass_precision) + " pass_recall:" + str(pass_recall) +
                  " pass_f1_score:" + str(pass_f1_score))

            # save_dirname = os.path.join(model_save_dir,
            #     "params_pass_%d_trainer%d" % (pass_id, trainer_id))
            # fluid.io.save_inference_model(save_dirname, ['word', 'mark', 'target'],
            #                               [crf_decode], exe)

    # Dump the untranspiled program for debugging.
    with open("/tmp/origin_prog", "w") as fn:
        fn.write(fluid.default_main_program().__str__())

    if os.getenv("LOCAL") == "TRUE":
        exe.run(fluid.default_startup_program())
        train_loop(exe, fluid.default_main_program())
    else:
        pserver_ips = os.getenv(
            "PADDLE_INIT_PSERVERS")  # all pserver endpoints
        eplist = []
        port = os.getenv("PADDLE_INIT_PORT")
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)
        trainers = int(os.getenv("TRAINERS"))  # total trainer count
        trainer_id = int(os.getenv("PADDLE_INIT_TRAINER_ID", "0"))
        current_endpoint = os.getenv(
            "POD_IP") + ":" + port  # current pserver endpoint
        training_role = os.getenv(
            "TRAINING_ROLE",
            "TRAINER")  # get the training role: trainer/pserver
        t = fluid.DistributeTranspiler()
        t.transpile(
            optimize_ops,
            params_grads,
            trainer_id,
            pservers=pserver_endpoints,
            trainers=trainers)

        print("endpoints: %s, current: %s, trainers: %d, trainer_id: %d, role: %s" %
              (pserver_endpoints, current_endpoint, trainers, trainer_id,
               training_role))

        if training_role == "PSERVER":
            # NOTE(review): current_endpoint always contains at least ":",
            # so this check can never fire; a missing POD_IP already fails
            # at the concatenation above.
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            print("######## pserver prog #############")
            with open("/tmp/pserver_prog", "w") as f:
                f.write(pserver_prog.__str__())
            print("######## pserver prog #############")
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            with open("/tmp/pserver_startup", "w") as f:
                f.write(pserver_startup.__str__())
            print("starting server side startup")
            exe.run(pserver_startup)
            print("starting parameter server...")
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            exe.run(fluid.default_startup_program())
            trainer_prog = t.get_trainer_program()
            # Each trainer gets its own reader, built from the trainer
            # count and this trainer's id.
            cluster_train_reader = paddle.batch(
                paddle.reader.shuffle(
                    reader.cluster_data_reader(train_data_file, word_dict,
                                               label_dict, trainers,
                                               trainer_id),
                    buf_size=20000),
                batch_size=BATCH_SIZE)
            print("######## trainer prog #############")
            with open("/tmp/trainer_prog", "w") as f:
                f.write(trainer_prog.__str__())
            print("######## trainer prog #############")
            train_loop(exe, trainer_prog, trainer_id, cluster_train_reader)
        else:
            # The message names the variable that is actually read above.
            print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
def main(train_data_file,
         test_data_file,
         vocab_file,
         target_file,
         emb_file,
         model_save_dir,
         num_passes=10,
         batch_size=32):
    """
    Train the NER network with the PaddlePaddle v2 trainer, testing on the
    test data during training and saving the parameters after every pass.

    Arguments:
        - train_data_file: The path of the training data.
        - test_data_file:  The path of the test data.
        - vocab_file:      The path of the word dictionary.
        - target_file:     The path of the label dictionary.
        - emb_file:        The path of the embedding values loaded by
                           get_embedding.
        - model_save_dir:  The directory the parameter archives are written
                           to; created (with parents) when missing.
        - num_passes:      The number of training passes.
        - batch_size:      The number of samples in each batch.
    """
    # makedirs also creates missing parent directories, so a nested output
    # path does not fail.
    if not os.path.exists(model_save_dir):
        os.makedirs(model_save_dir)

    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)

    word_vector_values = get_embedding(emb_file)

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)

    paddle.init(use_gpu=False, trainer_count=1)

    # define network topology
    crf_cost, crf_dec, target = ner_net(word_dict_len, label_dict_len)
    evaluator.sum(name="error", input=crf_dec)
    evaluator.chunk(
        name="ner_chunk",
        input=crf_dec,
        label=target,
        chunk_scheme="IOB",
        # Floor division keeps this an int even under true division.
        num_chunk_types=(label_dict_len - 1) // 2)

    # create parameters
    parameters = paddle.parameters.create(crf_cost)
    parameters.set("emb", word_vector_values)

    # create optimizer
    optimizer = paddle.optimizer.Momentum(
        momentum=0,
        learning_rate=2e-4,
        regularization=paddle.optimizer.L2Regularization(rate=8e-4),
        gradient_clipping_threshold=25,
        model_average=paddle.optimizer.ModelAverage(
            average_window=0.5, max_average_window=10000), )

    trainer = paddle.trainer.SGD(
        cost=crf_cost,
        parameters=parameters,
        update_equation=optimizer,
        extra_layers=crf_dec)

    train_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.data_reader(train_data_file, word_dict, label_dict),
            buf_size=1000),
        batch_size=batch_size)
    test_reader = paddle.batch(
        paddle.reader.shuffle(
            reader.data_reader(test_data_file, word_dict, label_dict),
            buf_size=1000),
        batch_size=batch_size)

    # Position of each input layer inside a sample tuple.
    feeding = {"word": 0, "mark": 1, "target": 2}

    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            # NOTE: a period of 1 means both branches run on every batch,
            # i.e. the whole test set is evaluated after each batch.
            if event.batch_id % 1 == 0:
                logger.info("Pass %d, Batch %d, Cost %f, %s" % (
                    event.pass_id, event.batch_id, event.cost, event.metrics))
            if event.batch_id % 1 == 0:
                result = trainer.test(reader=test_reader, feeding=feeding)
                logger.info("\nTest with Pass %d, Batch %d, %s" %
                            (event.pass_id, event.batch_id, result.metrics))

        if isinstance(event, paddle.event.EndPass):
            # save parameters
            with gzip.open(
                    os.path.join(model_save_dir, "params_pass_%d.tar.gz" %
                                 event.pass_id), "w") as f:
                parameters.to_tar(f)

            result = trainer.test(reader=test_reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" % (event.pass_id,
                                                     result.metrics))

    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        num_passes=num_passes,
        feeding=feeding)
def main(train_data_file,
         test_data_file,
         vocab_file,
         target_file,
         emb_file,
         model_save_dir,
         num_passes,
         use_gpu,
         parallel,
         batch_size=200):
    """
    Train the NER network with SGD, printing train and test chunk metrics
    after every pass.

    When the environment variable CE_MODE_X is set, the startup program gets
    a fixed random seed, the readers are not shuffled, no model is saved and
    KPI lines are printed after the last pass. Otherwise the readers are
    shuffled and an inference model is saved after every pass.

    Arguments:
        - train_data_file: The path of the training data.
        - test_data_file:  The path of the test data.
        - vocab_file:      The path of the word dictionary.
        - target_file:     The path of the label dictionary.
        - emb_file:        The path of the embedding values loaded by
                           get_embedding.
        - model_save_dir:  The directory the per-pass models are saved to;
                           created when it does not exist.
        - num_passes:      The number of training passes.
        - use_gpu:         Run on CUDAPlace(0) when true, on CPU otherwise.
        - parallel:        Forwarded to ner_net.
        - batch_size:      The number of samples in each batch.
    """
    if not os.path.exists(model_save_dir):
        os.mkdir(model_save_dir)

    word_dict = load_dict(vocab_file)
    label_dict = load_dict(target_file)
    word_vector_values = get_embedding(emb_file)

    word_dict_len = len(word_dict)
    label_dict_len = len(label_dict)

    if "CE_MODE_X" in os.environ:
        fluid.default_startup_program().random_seed = 110

    # ---- network, decoding and chunk metrics --------------------------
    avg_cost, feature_out, word, mark, target = ner_net(
        word_dict_len, label_dict_len, parallel)

    crf_decode = fluid.layers.crf_decoding(
        input=feature_out, param_attr=fluid.ParamAttr(name='crfw'))

    chunk_type_count = int(math.ceil((label_dict_len - 1) / 2.0))
    (precision, recall, f1_score, num_infer_chunks, num_label_chunks,
     num_correct_chunks) = fluid.layers.chunk_eval(
         input=crf_decode,
         label=target,
         chunk_scheme="IOB",
         num_chunk_types=chunk_type_count)
    chunk_evaluator = fluid.metrics.ChunkEvaluator()

    # The test program is cloned before the optimizer is applied to the
    # main program.
    inference_program = fluid.default_main_program().clone(for_test=True)
    test_fetch_list = [num_infer_chunks, num_label_chunks, num_correct_chunks]

    sgd_optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
    sgd_optimizer.minimize(avg_cost)

    # ---- data ---------------------------------------------------------
    def _batched_reader(data_file):
        # Samples are shuffled unless CE_MODE_X is set.
        samples = reader.data_reader(data_file, word_dict, label_dict)
        if "CE_MODE_X" not in os.environ:
            samples = paddle.reader.shuffle(samples, buf_size=20000)
        return paddle.batch(samples, batch_size=batch_size)

    train_reader = _batched_reader(train_data_file)
    test_reader = _batched_reader(test_data_file)

    # ---- executor -----------------------------------------------------
    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[word, mark, target], place=place)
    exe = fluid.Executor(place)

    exe.run(fluid.default_startup_program())

    # Overwrite the 'emb' variable with the loaded embedding values.
    embedding_tensor = fluid.global_scope().find_var('emb').get_tensor()
    embedding_tensor.set(word_vector_values, place)

    train_fetch_list = [
        avg_cost, num_infer_chunks, num_label_chunks, num_correct_chunks
    ]

    # ---- training -----------------------------------------------------
    time_begin = time.time()
    for pass_id in six.moves.xrange(num_passes):
        chunk_evaluator.reset()
        for batch_id, data in enumerate(train_reader()):
            cost_var, nums_infer, nums_label, nums_correct = exe.run(
                fluid.default_main_program(),
                feed=feeder.feed(data),
                fetch_list=train_fetch_list)
            if batch_id % 5 == 0:
                print("Pass " + str(pass_id) + ", Batch " + str(batch_id) +
                      ", Cost " + str(cost_var[0]))
            # Accumulate the chunk counts of every batch for the pass-level
            # metrics.
            chunk_evaluator.update(nums_infer, nums_label, nums_correct)

        pass_precision, pass_recall, pass_f1_score = chunk_evaluator.eval()
        print("[TrainSet] pass_id:" + str(pass_id) + " pass_precision:" +
              str(pass_precision) + " pass_recall:" + str(pass_recall) +
              " pass_f1_score:" + str(pass_f1_score))

        test_pass_precision, test_pass_recall, test_pass_f1_score = test(
            exe, chunk_evaluator, inference_program, test_reader,
            test_fetch_list, place)
        print("[TestSet] pass_id:" + str(pass_id) + " pass_precision:" +
              str(test_pass_precision) + " pass_recall:" +
              str(test_pass_recall) + " pass_f1_score:" +
              str(test_pass_f1_score))

        save_dirname = os.path.join(model_save_dir, "params_pass_%d" % pass_id)
        if "CE_MODE_X" not in os.environ:
            fluid.io.save_inference_model(save_dirname, ['word', 'mark'],
                                          crf_decode, exe)

    # KPI lines report the metrics of the last pass and the wall time since
    # training started.
    if "CE_MODE_X" in os.environ:
        print("kpis train_precision %f" % pass_precision)
        print("kpis test_precision %f" % test_pass_precision)
        print("kpis train_duration %f" % (time.time() - time_begin))
def infer(infer_data_path,
          src_dict_path,
          trg_dict_path,
          model_path,
          enc_conv_blocks,
          dec_conv_blocks,
          emb_dim=256,
          pos_size=200,
          drop_rate=0.,
          use_bn=False,
          max_len=100,
          batch_size=1,
          beam_size=1,
          is_show_attention=False):
    """
    Inference.

    :param infer_data_path: The path of the data for inference.
    :type infer_data_path: str
    :param src_dict_path: The path of the source dictionary.
    :type src_dict_path: str
    :param trg_dict_path: The path of the target dictionary.
    :type trg_dict_path: str
    :param model_path: The path of a trained model.
    :type model_path: str
    :param enc_conv_blocks: The scale list of the encoder's convolution blocks. And each element of
                            the list contains output dimension and context length of the corresponding
                            convolution block.
    :type enc_conv_blocks: list of tuple
    :param dec_conv_blocks: The scale list of the decoder's convolution blocks. And each element of
                            the list contains output dimension and context length of the corresponding
                            convolution block.
    :type dec_conv_blocks: list of tuple
    :param emb_dim: The dimension of the embedding vector.
    :type emb_dim: int
    :param pos_size: The total number of the position indexes, which means
                     the maximum value of the index is pos_size - 1.
    :type pos_size: int
    :param drop_rate: Dropout rate.
    :type drop_rate: float
    :param use_bn: Whether to use batch normalization or not. False is the default value.
    :type use_bn: bool
    :param max_len: The maximum length of the sentence to be generated.
    :type max_len: int
    :param batch_size: Forwarded to BeamSearch as its batch_size.
    :type batch_size: int
    :param beam_size: The width of beam expansion.
    :type beam_size: int
    :param is_show_attention: Whether to show attention weight or not. False is the default value.
                              When True, only the attention weight of the first
                              sample is printed and no translation is done.
    :type is_show_attention: bool
    """
    # load dict
    src_dict = reader.load_dict(src_dict_path)
    trg_dict = reader.load_dict(trg_dict_path)
    src_dict_size = len(src_dict)
    trg_dict_size = len(trg_dict)

    prob, weight = conv_seq2seq(
        src_dict_size=src_dict_size,
        trg_dict_size=trg_dict_size,
        pos_size=pos_size,
        emb_dim=emb_dim,
        enc_conv_blocks=enc_conv_blocks,
        dec_conv_blocks=dec_conv_blocks,
        drop_rate=drop_rate,
        with_bn=use_bn,
        is_infer=True)

    # load parameters; the archive handle is closed once they are read.
    with gzip.open(model_path) as param_file:
        parameters = paddle.parameters.Parameters.from_tar(param_file)

    # Total padding is the sum of (context_len - 1) over the decoder
    # blocks. sum() needs no reduce() builtin and yields 0 for an empty
    # block list.
    padding_num = sum(context_len - 1 for (_, context_len) in dec_conv_blocks)

    infer_reader = reader.data_reader(
        data_file=infer_data_path,
        src_dict=src_dict,
        trg_dict=trg_dict,
        pos_size=pos_size,
        padding_num=padding_num)

    if is_show_attention:
        attention_inferer = paddle.inference.Inference(
            output_layer=weight, parameters=parameters)
        # Only the first sample is inspected.
        for data in infer_reader():
            src_len = len(data[0])
            trg_len = len(data[2])
            attention_weight = attention_inferer.infer(
                [data], field='value', flatten_result=False)
            # Use a distinct loop name so the `weight` layer is not
            # shadowed.
            attention_weight = [
                w.reshape((trg_len, src_len)) for w in attention_weight
            ]
            # Parenthesised single-argument print behaves the same as a
            # statement on Python 2 and as a function call on Python 3.
            print(attention_weight)
            break
        return

    # Only the first two fields of every sample are passed to the search.
    infer_data = [[raw_data[0], raw_data[1]] for raw_data in infer_reader()]

    inferer = paddle.inference.Inference(
        output_layer=prob, parameters=parameters)

    searcher = BeamSearch(
        inferer=inferer,
        trg_dict=trg_dict,
        pos_size=pos_size,
        padding_num=padding_num,
        max_len=max_len,
        batch_size=batch_size,
        beam_size=beam_size)

    searcher.search(infer_data)
    return