def prepare_feed_dict_list(data_generator, init_flag, count):
    """
    Prepare the list of feed dict for multi-devices.

    Builds one feed dict per device. When `data_generator` is given, each
    device gets the inputs produced by `prepare_batch_input` for its slice of
    the batch; when `init_flag` is set, the position-encoding tables are also
    merged into each device's feed dict (creating the dict if the data pass
    above did not).

    Returns the list of `count` feed dicts, or None when fewer than `count`
    could be assembled (e.g. no data generator and init_flag False).
    """
    feed_dict_list = []
    if data_generator is not None:  # use_py_reader == False
        data_input_names = encoder_data_input_fields + \
            decoder_data_input_fields[:-1] + label_data_input_fields
        data = next(data_generator)
        for idx, data_buffer in enumerate(data):
            # num_token is unused here; only the input dict is fed.
            data_input_dict, num_token = prepare_batch_input(
                data_buffer, data_input_names, ModelHyperParams.eos_idx,
                ModelHyperParams.eos_idx, ModelHyperParams.n_head,
                ModelHyperParams.d_model)
            feed_dict_list.append(data_input_dict)
    if init_flag:
        for idx in range(count):
            pos_enc_tables = dict()
            for pos_enc_param_name in pos_enc_param_names:
                pos_enc_tables[pos_enc_param_name] = position_encoding_init(
                    ModelHyperParams.max_length + 1, ModelHyperParams.d_model)
            if len(feed_dict_list) <= idx:
                # No data feed for this device yet: feed only the tables.
                feed_dict_list.append(pos_enc_tables)
            else:
                # Merge tables into the existing feed dict for this device.
                feed_dict_list[idx] = dict(
                    list(pos_enc_tables.items()) +
                    list(feed_dict_list[idx].items()))
    return feed_dict_list if len(feed_dict_list) == count else None
def train_loop(exe, train_progm, init, num_iters, train_data, dev_count,
               sum_cost, avg_cost, lr_scheduler, token_num, predict):
    """
    Run at most `num_iters` training batches and report timing.

    Feeds each device its slice of the batch plus the current learning rate;
    on the first batch also feeds the position-encoding tables. Returns a
    tuple (total wall time, executor-only time).
    """
    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields
    start_time = time.time()
    exec_time = 0.0
    for batch_id, data in enumerate(train_data()):
        if batch_id >= num_iters:
            break
        feed_list = []
        total_num_token = 0
        # BUGFIX: update the learning rate once per batch, not once per
        # device. The original called update_learning_rate() inside the
        # per-device loop, advancing the warmup step dev_count times per
        # batch (the sibling `main` below computes it once per batch).
        lr_rate = lr_scheduler.update_learning_rate()
        for place_id, data_buffer in enumerate(
                split_data(data, num_part=dev_count)):
            data_input_dict, util_input_dict, num_token = prepare_batch_input(
                data_buffer, data_input_names, util_input_names,
                ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            total_num_token += num_token
            # list(...) so the concatenation also works on Python 3, where
            # dict.items() returns a view (consistent with
            # prepare_feed_dict_list above).
            feed_kv_pairs = list(data_input_dict.items()) + \
                list(util_input_dict.items())
            feed_kv_pairs += list(
                {lr_scheduler.learning_rate.name: lr_rate}.items())
            feed_list.append(dict(feed_kv_pairs))
            if not init:
                # First batch only: feed the position-encoding tables.
                for pos_enc_param_name in pos_enc_param_names:
                    pos_enc = position_encoding_init(
                        ModelHyperParams.max_length + 1,
                        ModelHyperParams.d_model)
                    feed_list[place_id][pos_enc_param_name] = pos_enc
        # Custom gradient scale: average the cost over all tokens.
        for feed_dict in feed_list:
            feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token
        exe_start_time = time.time()
        if dev_count > 1:  # parallel executor
            outs = exe.run(fetch_list=[sum_cost.name, token_num.name],
                           feed=feed_list)
        else:  # executor
            outs = exe.run(fetch_list=[sum_cost, token_num],
                           feed=feed_list[0])
        exec_time += time.time() - exe_start_time
        sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
        total_sum_cost = sum_cost_val.sum()  # sum the cost from multi-devices
        total_token_num = token_num_val.sum()
        total_avg_cost = total_sum_cost / total_token_num
        print("batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
              (batch_id, total_sum_cost, total_avg_cost,
               np.exp([min(total_avg_cost, 100)])))
        init = True
    return time.time() - start_time, exec_time
def main():
    """
    Train the Transformer on the wmt16 dataset, validate after each pass and
    record CE KPI metrics on the final pass.
    """
    args = parse_args()
    place = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        TrainTaskConfig.label_smooth_eps)

    lr_scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                         TrainTaskConfig.warmup_steps,
                                         TrainTaskConfig.learning_rate)
    optimizer = fluid.optimizer.Adam(learning_rate=lr_scheduler.learning_rate,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimizer.minimize(sum_cost)

    # BUGFIX: only query the CUDA device count when running on GPU. The
    # original called get_cuda_device_count() unconditionally, which yields
    # 0 devices (breaking read_multiple/split_data and the 1/token scaling)
    # on a CPU-only run.
    dev_count = fluid.core.get_cuda_device_count() \
        if TrainTaskConfig.use_gpu else 1

    train_data = paddle.batch(paddle.dataset.wmt16.train(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size),
                              batch_size=TrainTaskConfig.batch_size)

    # Program to do validation.
    test_program = fluid.default_main_program().clone()
    with fluid.program_guard(test_program):
        test_program = fluid.io.get_inference_program([avg_cost])
    val_data = paddle.batch(paddle.dataset.wmt16.validation(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size),
                            batch_size=TrainTaskConfig.batch_size)

    def test(exe):
        # Run the full validation set; return (avg token cost, perplexity).
        test_total_cost = 0
        test_total_token = 0
        test_data = read_multiple(reader=val_data, count=dev_count)
        for batch_id, data in enumerate(test_data()):
            feed_list = []
            for place_id, data_buffer in enumerate(data):
                data_input_dict, util_input_dict, _ = prepare_batch_input(
                    data_buffer, data_input_names, util_input_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                feed_list.append(
                    dict(list(data_input_dict.items()) +
                         list(util_input_dict.items())))
            outs = exe.run(feed=feed_list,
                           fetch_list=[sum_cost.name, token_num.name])
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            test_total_cost += sum_cost_val.sum()
            test_total_token += token_num_val.sum()
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    # Initialize the parameters: resume from checkpoint when configured,
    # otherwise run the startup program.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        exe.run(fluid.framework.default_startup_program())

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields

    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=sum_cost.name)
    test_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                      main_program=test_program,
                                      share_vars_from=train_exe)

    init = False
    train_data = read_multiple(reader=train_data, count=dev_count)
    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            feed_list = []
            total_num_token = 0
            # One learning-rate step per batch, shared by all devices.
            lr_rate = lr_scheduler.update_learning_rate()
            for place_id, data_buffer in enumerate(data):
                data_input_dict, util_input_dict, num_token = \
                    prepare_batch_input(
                        data_buffer, data_input_names, util_input_names,
                        ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                        ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                feed_list.append(
                    dict(list(data_input_dict.items()) +
                         list(util_input_dict.items()) +
                         list({lr_scheduler.learning_rate.name:
                               lr_rate}.items())))
                if not init:
                    # First batch only: feed the position-encoding tables.
                    for pos_enc_param_name in pos_enc_param_names:
                        tensor = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = tensor
            # Gradient scale: 1/token_count for token-averaged cost,
            # otherwise a plain 1.0.
            for feed_dict in feed_list:
                feed_dict[sum_cost.name + "@GRAD"] = (
                    1. / total_num_token if TrainTaskConfig.use_avg_cost
                    else np.asarray([1.], dtype="float32"))
            outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
                                 feed=feed_list)
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            total_sum_cost = sum_cost_val.sum()  # sum over multi devices
            total_token_num = token_num_val.sum()
            total_avg_cost = total_sum_cost / total_token_num
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f"
                  % (pass_id, batch_id, total_sum_cost, total_avg_cost,
                     np.exp([min(total_avg_cost, 100)])))
            init = True
        pass_end_time = time.time()
        # Validate and save the model for inference.
        val_avg_cost, val_ppl = test(test_exe)
        time_consumed = pass_end_time - pass_start_time
        print("pass_id = " + str(pass_id) + " time_consumed = " +
              str(time_consumed))
        # Record CE KPIs on the last pass only.
        if pass_id == TrainTaskConfig.pass_num - 1:
            if args.gpu_card_num == 1:
                test_avg_ppl_kpi.add_record(np.array(val_ppl, dtype='float32'))
                train_pass_duration_kpi.add_record(time_consumed)
                test_avg_ppl_kpi.persist()
                train_pass_duration_kpi.persist()
            else:
                test_avg_ppl_kpi_card4.add_record(
                    np.array(val_ppl, dtype='float32'))
                train_pass_duration_kpi_card4.add_record(time_consumed)
                test_avg_ppl_kpi_card4.persist()
                train_pass_duration_kpi_card4.persist()
def do_predict(args):
    """
    Run dygraph beam-search inference with a trained Transformer and write
    the n-best tokenized translations (one per line) to args.output_file.
    """
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # define the data generator
    processor = reader.DataProcessor(fpattern=args.predict_file,
                                     src_vocab_fpath=args.src_vocab_fpath,
                                     trg_vocab_fpath=args.trg_vocab_fpath,
                                     token_delimiter=args.token_delimiter,
                                     use_token_batch=False,
                                     batch_size=args.batch_size,
                                     device_count=1,
                                     pool_size=args.pool_size,
                                     sort_type=reader.SortType.NONE,
                                     shuffle=False,
                                     shuffle_batch=False,
                                     start_mark=args.special_token[0],
                                     end_mark=args.special_token[1],
                                     unk_mark=args.special_token[2],
                                     max_length=args.max_length,
                                     n_head=args.n_head)
    batch_generator = processor.data_generator(phase="predict", place=place)
    # BUGFIX: get_vocab_summary() was unpacked twice with identical results;
    # a single call is sufficient.
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()
    trg_idx2word = reader.DataProcessor.load_dict(
        dict_path=args.trg_vocab_fpath, reverse=True)

    with fluid.dygraph.guard(place):
        # define data loader
        test_loader = fluid.io.DataLoader.from_generator(capacity=10)
        test_loader.set_batch_generator(batch_generator, places=place)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value, args.d_model,
            args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        # load the trained model
        assert args.init_from_params, (
            "Please set init_from_params to load the infer model.")
        model_dict, _ = fluid.load_dygraph(
            os.path.join(args.init_from_params, "transformer"))
        # to avoid a longer length than training, reset the size of position
        # encoding to max_length
        model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        transformer.load_dict(model_dict)

        # set evaluate mode
        transformer.eval()

        # BUGFIX: open the output file in a `with` block so the handle is
        # closed (and buffers flushed) even when inference raises.
        with open(args.output_file, "wb") as f:
            for input_data in test_loader():
                (src_word, src_pos, src_slf_attn_bias, trg_word,
                 trg_src_attn_bias) = input_data
                finished_seq, finished_scores = transformer.beam_search(
                    src_word,
                    src_pos,
                    src_slf_attn_bias,
                    trg_word,
                    trg_src_attn_bias,
                    bos_id=args.bos_idx,
                    eos_id=args.eos_idx,
                    beam_size=args.beam_size,
                    max_len=args.max_out_len)
                finished_seq = finished_seq.numpy()
                finished_scores = finished_scores.numpy()
                for ins in finished_seq:
                    for beam_idx, beam in enumerate(ins):
                        if beam_idx >= args.n_best:
                            break
                        id_list = post_process_seq(beam, args.bos_idx,
                                                   args.eos_idx)
                        # `word_idx` instead of `id` to avoid shadowing the
                        # builtin.
                        word_list = [trg_idx2word[word_idx]
                                     for word_idx in id_list]
                        sequence = b" ".join(word_list) + b"\n"
                        f.write(sequence)
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler,
               token_num, predict):
    """
    Train with a ParallelExecutor, logging loss/ppl every 100 batches and
    saving a checkpoint plus an inference model after each pass.

    NOTE(review): relies on module-level `args`, `reader`, `read_multiple`,
    `split_data`, `test_context` etc. — confirm against the enclosing module.
    """
    # Initialize the parameters: resume from checkpoint when configured,
    # otherwise run the startup program.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        logging.info("init fluid.framework.default_startup_program")
        exe.run(fluid.framework.default_startup_program())
    logging.info("begin reader")
    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        token_delimiter=args.token_delimiter,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type=args.sort_type,
        shuffle=args.shuffle,
        shuffle_batch=args.shuffle_batch,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        # count start and end tokens out
        max_length=ModelHyperParams.max_length - 2,
        clip_last_batch=False)
    logging.info("begin read multiple")
    train_data = read_multiple(reader=train_data.batch_generator,
                               count=dev_count if args.use_token_batch else 1)
    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize gradient scale to
    # use token average cost among multi-devices. and the gradient scale is
    # `1 / token_number` for average cost.
    build_strategy.gradient_scale_strategy = \
        fluid.BuildStrategy.GradientScaleStrategy.Customized
    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=sum_cost.name,
                                       main_program=train_progm,
                                       build_strategy=build_strategy)
    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields
    if args.val_file_pattern is not None:
        test = test_context(train_progm, avg_cost, train_exe, dev_count,
                            data_input_names, util_input_names, sum_cost,
                            token_num)
    # the best cross-entropy value with label smoothing
    loss_normalizer = -((1. - TrainTaskConfig.label_smooth_eps) * np.log(
        (1. - TrainTaskConfig.label_smooth_eps)) +
                        TrainTaskConfig.label_smooth_eps *
                        np.log(TrainTaskConfig.label_smooth_eps /
                               (ModelHyperParams.trg_vocab_size - 1) + 1e-20))
    logging.info("begin train:")
    init = False
    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        logging.info("pass_id:{0}".format(pass_id))
        avg_batch_time = time.time()
        for batch_id, data in enumerate(train_data()):
            logging.info("batch_id:{0} data_len:{1}".format(
                batch_id, len(data)))
            feed_list = []
            total_num_token = 0
            # Learning rate is advanced once per batch (local mode only).
            if args.local:
                lr_rate = lr_scheduler.update_learning_rate()
            for place_id, data_buffer in enumerate(
                    split_data(data, num_part=dev_count)):
                data_input_dict, util_input_dict, num_token = \
                    prepare_batch_input(
                        data_buffer, data_input_names, util_input_names,
                        ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                        ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                # NOTE(review): dict.items() concatenation is Python-2 only;
                # on Python 3 these would need list() wrapping.
                feed_kv_pairs = data_input_dict.items(
                ) + util_input_dict.items()
                if args.local:
                    feed_kv_pairs += {
                        lr_scheduler.learning_rate.name: lr_rate
                    }.items()
                feed_list.append(dict(feed_kv_pairs))
                if not init:
                    # First batch only: feed the position-encoding tables.
                    for pos_enc_param_name in pos_enc_param_names:
                        pos_enc = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = pos_enc
            # Custom gradient scale: average cost over all tokens.
            for feed_dict in feed_list:
                feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token
            #outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
            #                     feed=feed_list)
            # Fetch only every 100th batch to avoid the fetch overhead.
            outs = train_exe.run(
                fetch_list=[sum_cost.name, token_num.name]
                if batch_id % 100 == 0 else [],
                feed=feed_list)
            if batch_id % 100 == 0 and batch_id > 0:
                sum_cost_val, token_num_val = np.array(outs[0]), np.array(
                    outs[1])
                total_sum_cost = sum_cost_val.sum(
                )  # sum the cost from multi-devices
                total_token_num = token_num_val.sum()
                total_avg_cost = total_sum_cost / total_token_num
                logging.info(
                    "epoch: %d, batch: %d, avg loss: %f, normalized loss: %f,"
                    " ppl: %f" %
                    (pass_id, batch_id, total_avg_cost,
                     total_avg_cost - loss_normalizer,
                     np.exp([min(total_avg_cost, 100)])))
                logging.info("speed: {0} batch/s".format(
                    100.0 / (time.time() - avg_batch_time)))
            """
            if batch_id > 0 and batch_id % 1000 == 0:
                fluid.io.save_persistables(
                    exe,
                    os.path.join(TrainTaskConfig.ckpt_dir,
                                 "latest.checkpoint"))
            """
            init = True
            if batch_id % 100 == 0 and batch_id > 0:
                avg_batch_time = time.time()
        time_consumed = time.time() - pass_start_time
        # Validate and save the model for inference.
        if args.val_file_pattern is not None:
            val_avg_cost, val_ppl = test()
            logging.info(
                "epoch: %d, val avg loss: %f, val normalized loss: %f, val ppl: %f,"
                " consumed %fs" %
                (pass_id, val_avg_cost, val_avg_cost - loss_normalizer,
                 val_ppl, time_consumed))
        else:
            logging.info("epoch: %d, consumed %fs" % (pass_id, time_consumed))
        fluid.io.save_persistables(
            exe,
            os.path.join(TrainTaskConfig.ckpt_dir,
                         "pass_" + str(pass_id) + ".checkpoint"))
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),
            data_input_names[:-2] + util_input_names, [predict], exe)
    # NOTE(review): total_avg_cost / val_avg_cost may be unbound here when no
    # batch hit the %100 logging condition or val_file_pattern is None —
    # confirm enable_ce runs always satisfy both.
    if args.enable_ce:  # For CE
        print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost))
        print("kpis\ttest_cost_card%d\t%f" % (dev_count, val_avg_cost))
        print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))
def main():
    """
    Model train entry: builds the Transformer and Adam/noam-decay optimizer,
    then trains either locally or in distributed mode via the fluid
    DistributeTranspiler (PSERVER / TRAINER roles from the environment).
    """
    is_local = os.getenv("PADDLE_IS_LOCAL", "0")
    if is_local == '0':
        args.local = False
    else:
        args.local = True

    # init
    place = fluid.CUDAPlace(0) if args.device == 'GPU' else fluid.CPUPlace()
    training_role = os.getenv("TRAINING_ROLE", "TRAINER")
    if training_role == "PSERVER":
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size, ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1, ModelHyperParams.n_layer,
        ModelHyperParams.n_head, ModelHyperParams.d_key,
        ModelHyperParams.d_value, ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid, ModelHyperParams.dropout,
        ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx,
        ModelHyperParams.pos_pad_idx)

    warmup_steps = get_var("warmup_steps", value=TrainTaskConfig.warmup_steps)
    d_model = get_var("d_model", value=ModelHyperParams.d_model)

    lr_decay = fluid.layers \
        .learning_rate_scheduler \
        .noam_decay(d_model, warmup_steps)

    optimizer = fluid.optimizer.Adam(learning_rate=lr_decay,
                                     beta1=TrainTaskConfig.beta1,
                                     beta2=TrainTaskConfig.beta2,
                                     epsilon=TrainTaskConfig.eps)
    optimize_ops, params_grads = optimizer.minimize(
        avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)

    # Program to do validation.
    inference_program = fluid.default_main_program().clone()
    with fluid.program_guard(inference_program):
        inference_program = fluid.io.get_inference_program([avg_cost])

    def test(exe):
        # Run the validation reader; return (avg token cost, perplexity).
        test_total_cost = 0
        test_total_token = 0
        for batch_id, data in enumerate(test_reader()):
            data_input = prepare_batch_input(
                data, encoder_input_data_names +
                decoder_input_data_names[:-1] + label_data_names,
                ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            test_sum_cost, test_token_num = exe.run(
                inference_program,
                feed=data_input,
                fetch_list=[sum_cost, token_num],
                use_program_cache=True)
            test_total_cost += test_sum_cost
            test_total_token += test_token_num
        test_avg_cost = test_total_cost / test_total_token
        test_ppl = np.exp([min(test_avg_cost, 100)])
        return test_avg_cost, test_ppl

    def train_loop(exe, trainer_prog):
        # Per-pass loop; skips ragged batches so shapes stay uniform.
        for pass_id in xrange(args.pass_num):
            ts = time.time()
            total = 0
            pass_start_time = time.time()
            for batch_id, data in enumerate(train_reader):
                if len(data) != args.batch_size:
                    continue
                total += len(data)
                start_time = time.time()
                data_input = prepare_batch_input(
                    data, encoder_input_data_names +
                    decoder_input_data_names[:-1] + label_data_names,
                    ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                    ModelHyperParams.n_head, ModelHyperParams.d_model)
                outs = exe.run(trainer_prog,
                               feed=data_input,
                               fetch_list=[sum_cost, avg_cost],
                               use_program_cache=True)
                sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(
                    outs[1])
                print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, "
                      "ppl: %f, speed: %.2f" %
                      (pass_id, batch_id, sum_cost_val, avg_cost_val,
                       np.exp([min(avg_cost_val[0], 100)]),
                       len(data) / (time.time() - start_time)))
                if args.test_save:
                    if batch_id == args.exit_batch_id:
                        print("batch_id: %d exit!" % batch_id)
                        break
            # Validate and save the model for inference.
            # val_avg_cost, val_ppl = test(exe)
            val_avg_cost, val_ppl = 0, 0
            pass_end_time = time.time()
            time_consumed = pass_end_time - pass_start_time
            print("pass_id = %s time_consumed = %s val_avg_cost=%f "
                  "val_ppl=%f speed: %.2f" %
                  (str(pass_id), str(time_consumed), val_avg_cost, val_ppl,
                   total / (time.time() - ts)))
            fluid.io.save_inference_model(
                os.path.join(args.model_path,
                             "pass_" + str(pass_id) + "_" +
                             str(args.task_index) + ".infer.model"),
                encoder_input_data_names + decoder_input_data_names[:-1],
                [predict], exe)
            if args.test_save:
                break

    if args.local:
        # Initialize the parameters.
        print("local start_up:")
        exe.run(fluid.framework.default_startup_program())
        for pos_enc_param_name in pos_enc_param_names:
            pos_enc_param = fluid.global_scope().find_var(
                pos_enc_param_name).get_tensor()
            pos_enc_param.set(
                position_encoding_init(ModelHyperParams.max_length + 1,
                                       ModelHyperParams.d_model), place)
        train_reader = data_util.DataLoader(
            src_vocab_fpath="./thirdparty/nist06n/cn_30001.dict",
            trg_vocab_fpath="./thirdparty/nist06n/en_30001.dict",
            # BUGFIX: the original pattern was "./train/*" % (args.task_index)
            # which raises TypeError (no placeholder in the format string);
            # use the same pattern as the distributed trainer branch.
            fpattern="./train/part-*",
            batch_size=args.batch_size,
            token_batch_size=TrainTaskConfig.token_batch_size,
            sort_by_length=TrainTaskConfig.sort_by_length,
            shuffle=True)
        train_loop(exe, fluid.default_main_program())
    else:
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))
        t = fluid.DistributeTranspiler()
        t.transpile(trainer_id,
                    pservers=pserver_endpoints,
                    trainers=trainers)
        if training_role == "PSERVER":
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                "PADDLE_PORT")
            if not current_endpoint:
                print("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)
            if args.save_graph:
                # Use `blk`, not `t`, to avoid clobbering the transpiler.
                block_no = 0
                for blk in pserver_startup.blocks:
                    block_name = "pserver_startup_block_%04d" % block_no
                    print(block_name)
                    print(debuger.draw_block_graphviz(
                        blk, path="./" + block_name + ".dot"))
                    block_no += 1
                block_no = 0
                for blk in pserver_prog.blocks:
                    block_name = "pserver_prog_block_%04d" % block_no
                    print(debuger.draw_block_graphviz(
                        blk, path="./" + block_name + ".dot"))
                    block_no += 1
            print("begin run")
            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            # Parameter initialization
            exe.run(fluid.default_startup_program())
            for pos_enc_param_name in pos_enc_param_names:
                pos_enc_param = fluid.global_scope().find_var(
                    pos_enc_param_name).get_tensor()
                pos_enc_param.set(
                    position_encoding_init(ModelHyperParams.max_length + 1,
                                           ModelHyperParams.d_model), place)
            train_reader = data_util.DataLoader(
                src_vocab_fpath="./thirdparty/nist06n/cn_30001.dict",
                trg_vocab_fpath="./thirdparty/nist06n/en_30001.dict",
                fpattern="./train/part-*",
                batch_size=args.batch_size,
                token_batch_size=TrainTaskConfig.token_batch_size,
                sort_by_length=TrainTaskConfig.sort_by_length,
                shuffle=True)
            trainer_prog = t.get_trainer_program()
            train_loop(exe, trainer_prog)
        else:
            # BUGFIX: message named the wrong env var ("TRAINER_ROLE") and
            # had a typo ("os" for "or"); the code reads TRAINING_ROLE.
            print("environment var TRAINING_ROLE should be TRAINER or PSERVER")
def train_loop(exe, train_progm):
    """
    Train `train_progm` with a ParallelExecutor, optionally validating after
    each pass, then save a checkpoint and an inference model per pass.

    NOTE(review): closes over module-level `dev_count`, `args`, `sum_cost`,
    `avg_cost`, `token_num`, `predict`, `lr_scheduler` — confirm against the
    enclosing module.
    """

    def read_multiple(reader,
                      count=dev_count if args.use_token_batch else 1,
                      clip_last=True):
        """
        Stack data from reader for multi-devices.
        """

        def __impl__():
            res = []
            for item in reader():
                res.append(item)
                if len(res) == count:
                    yield res
                    res = []
            if len(res) == count:
                yield res
            elif not clip_last:
                # Redistribute the leftover instances evenly over `count`
                # parts instead of dropping them.
                data = []
                for item in res:
                    data += item
                if len(data) > count:
                    inst_num_per_part = len(data) // count
                    yield [
                        data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
                        for i in range(count)
                    ]

        return __impl__

    def split_data(data, num_part=dev_count):
        """
        Split data for each device.
        """
        if len(data) == num_part:
            return data
        data = data[0]
        inst_num_per_part = len(data) // num_part
        return [
            data[inst_num_per_part * i:inst_num_per_part * (i + 1)]
            for i in range(num_part)
        ]

    # Initialize the parameters: resume from checkpoint when configured,
    # otherwise run the startup program.
    if TrainTaskConfig.ckpt_path:
        fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path)
        lr_scheduler.current_steps = TrainTaskConfig.start_step
    else:
        print "init fluid.framework.default_startup_program"
        exe.run(fluid.framework.default_startup_program())

    train_data = reader.DataReader(
        src_vocab_fpath=args.src_vocab_fpath,
        trg_vocab_fpath=args.trg_vocab_fpath,
        fpattern=args.train_file_pattern,
        use_token_batch=args.use_token_batch,
        batch_size=args.batch_size *
        (1 if args.use_token_batch else dev_count),
        pool_size=args.pool_size,
        sort_type=args.sort_type,
        shuffle=args.shuffle,
        shuffle_batch=args.shuffle_batch,
        start_mark=args.special_token[0],
        end_mark=args.special_token[1],
        unk_mark=args.special_token[2],
        clip_last_batch=False)
    train_data = read_multiple(reader=train_data.batch_generator)

    build_strategy = fluid.BuildStrategy()
    # Since the token number differs among devices, customize gradient scale to
    # use token average cost among multi-devices. and the gradient scale is
    # `1 / token_number` for average cost.
    build_strategy.gradient_scale_strategy = \
        fluid.BuildStrategy.GradientScaleStrategy.Customized
    #'''
    train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                       loss_name=sum_cost.name,
                                       main_program=train_progm,
                                       build_strategy=build_strategy)
    #'''

    def test_context():
        # Context to do validation.
        test_program = train_progm.clone()
        with fluid.program_guard(test_program):
            test_program = fluid.io.get_inference_program([avg_cost])

        val_data = reader.DataReader(
            src_vocab_fpath=args.src_vocab_fpath,
            trg_vocab_fpath=args.trg_vocab_fpath,
            fpattern=args.val_file_pattern,
            use_token_batch=args.use_token_batch,
            batch_size=args.batch_size *
            (1 if args.use_token_batch else dev_count),
            pool_size=args.pool_size,
            sort_type=args.sort_type,
            start_mark=args.special_token[0],
            end_mark=args.special_token[1],
            unk_mark=args.special_token[2],
            clip_last_batch=False,
            shuffle=False,
            shuffle_batch=False)

        test_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu,
                                          main_program=test_program,
                                          share_vars_from=train_exe)

        def test(exe=test_exe):
            # Run the full validation set; return (avg cost, ppl).
            test_total_cost = 0
            test_total_token = 0
            test_data = read_multiple(reader=val_data.batch_generator)
            for batch_id, data in enumerate(test_data()):
                feed_list = []
                for place_id, data_buffer in enumerate(split_data(data)):
                    data_input_dict, util_input_dict, _ = prepare_batch_input(
                        data_buffer, data_input_names, util_input_names,
                        ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                        ModelHyperParams.n_head, ModelHyperParams.d_model)
                    feed_list.append(
                        dict(data_input_dict.items() +
                             util_input_dict.items()))
                outs = exe.run(feed=feed_list,
                               fetch_list=[sum_cost.name, token_num.name])
                sum_cost_val, token_num_val = np.array(outs[0]), np.array(
                    outs[1])
                test_total_cost += sum_cost_val.sum()
                test_total_token += token_num_val.sum()
            test_avg_cost = test_total_cost / test_total_token
            test_ppl = np.exp([min(test_avg_cost, 100)])
            return test_avg_cost, test_ppl

        return test

    if args.val_file_pattern is not None:
        test = test_context()

    data_input_names = encoder_data_input_fields + \
        decoder_data_input_fields[:-1] + label_data_input_fields
    util_input_names = encoder_util_input_fields + decoder_util_input_fields
    init = False
    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            feed_list = []
            total_num_token = 0
            #lr_rate = lr_scheduler.update_learning_rate()
            for place_id, data_buffer in enumerate(split_data(data)):
                data_input_dict, util_input_dict, num_token = \
                    prepare_batch_input(
                        data_buffer, data_input_names, util_input_names,
                        ModelHyperParams.eos_idx, ModelHyperParams.eos_idx,
                        ModelHyperParams.n_head, ModelHyperParams.d_model)
                total_num_token += num_token
                feed_list.append(
                    dict(data_input_dict.items() + util_input_dict.items()))
                if not init:
                    # First batch only: feed the position-encoding tables.
                    for pos_enc_param_name in pos_enc_param_names:
                        pos_enc = position_encoding_init(
                            ModelHyperParams.max_length + 1,
                            ModelHyperParams.d_model)
                        feed_list[place_id][pos_enc_param_name] = pos_enc
            # Gradient scale: 1/token_count for token-averaged cost,
            # otherwise a plain 1.0.
            for feed_dict in feed_list:
                feed_dict[sum_cost.name + "@GRAD"] = 1. / total_num_token \
                    if TrainTaskConfig.use_avg_cost else np.asarray(
                        [1.], dtype="float32")
            outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name],
                                 feed=feed_list)
            #outs = exe.run(train_progm,fetch_list=[sum_cost.name, token_num.name],feed=feed_list[0])
            sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1])
            total_sum_cost = sum_cost_val.sum(
            )  # sum the cost from multi-devices
            total_token_num = token_num_val.sum()
            total_avg_cost = total_sum_cost / total_token_num
            print(
                "epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                (pass_id, batch_id, total_sum_cost, total_avg_cost,
                 np.exp([min(total_avg_cost, 100)])))
            init = True
        # Validate and save the model for inference.
        print("epoch: %d, " % pass_id +
              ("val avg loss: %f, val ppl: %f, " % test()
               if args.val_file_pattern is not None else "") +
              "consumed %fs" % (time.time() - pass_start_time))
        fluid.io.save_persistables(
            exe,
            os.path.join(TrainTaskConfig.ckpt_dir,
                         "pass_" + str(pass_id) + ".checkpoint"))
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),
            data_input_names[:-2] + util_input_names, [predict], exe)
def train_loop(exe, train_progm, dev_count, sum_cost, avg_cost, lr_scheduler, token_num, predict): # Initialize the parameters. if TrainTaskConfig.ckpt_path: fluid.io.load_persistables(exe, TrainTaskConfig.ckpt_path) lr_scheduler.current_steps = TrainTaskConfig.start_step else: print "init fluid.framework.default_startup_program" exe.run(fluid.framework.default_startup_program()) train_data = reader.DataReader( src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, fpattern=args.train_file_pattern, use_token_batch=args.use_token_batch, batch_size=args.batch_size * (1 if args.use_token_batch else dev_count), pool_size=args.pool_size, sort_type=args.sort_type, shuffle=args.shuffle, shuffle_batch=args.shuffle_batch, start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], # count start and end tokens out max_length=ModelHyperParams.max_length - 2, clip_last_batch=False) train_data = read_multiple(reader=train_data.batch_generator, count=dev_count if args.use_token_batch else 1) build_strategy = fluid.BuildStrategy() # Since the token number differs among devices, customize gradient scale to # use token average cost among multi-devices. and the gradient scale is # `1 / token_number` for average cost. 
build_strategy.gradient_scale_strategy = fluid.BuildStrategy.GradientScaleStrategy.Customized train_exe = fluid.ParallelExecutor(use_cuda=TrainTaskConfig.use_gpu, loss_name=sum_cost.name, main_program=train_progm, build_strategy=build_strategy) data_input_names = encoder_data_input_fields + decoder_data_input_fields[: -1] + label_data_input_fields util_input_names = encoder_util_input_fields + decoder_util_input_fields if args.val_file_pattern is not None: test = test_context(train_progm, avg_cost, train_exe, dev_count, data_input_names, util_input_names, sum_cost, token_num) init = False for pass_id in xrange(TrainTaskConfig.pass_num): pass_start_time = time.time() for batch_id, data in enumerate(train_data()): feed_list = [] total_num_token = 0 for place_id, data_buffer in enumerate( split_data(data, num_part=dev_count)): data_input_dict, util_input_dict, num_token = prepare_batch_input( data_buffer, data_input_names, util_input_names, ModelHyperParams.eos_idx, ModelHyperParams.eos_idx, ModelHyperParams.n_head, ModelHyperParams.d_model) total_num_token += num_token feed_kv_pairs = data_input_dict.items( ) + util_input_dict.items() if args.local: lr_rate = lr_scheduler.update_learning_rate() feed_kv_pairs += { lr_scheduler.learning_rate.name: lr_rate }.items() feed_list.append(dict(feed_kv_pairs)) if not init: for pos_enc_param_name in pos_enc_param_names: pos_enc = position_encoding_init( ModelHyperParams.max_length + 1, ModelHyperParams.d_model) feed_list[place_id][pos_enc_param_name] = pos_enc for feed_dict in feed_list: feed_dict[sum_cost.name + "@GRAD"] = 1. 
/ total_num_token outs = train_exe.run(fetch_list=[sum_cost.name, token_num.name], feed=feed_list) sum_cost_val, token_num_val = np.array(outs[0]), np.array(outs[1]) total_sum_cost = sum_cost_val.sum( ) # sum the cost from multi-devices total_token_num = token_num_val.sum() total_avg_cost = total_sum_cost / total_token_num print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" % (pass_id, batch_id, total_sum_cost, total_avg_cost, np.exp([min(total_avg_cost, 100)]))) init = True # Validate and save the model for inference. print("epoch: %d, " % pass_id + ("val avg loss: %f, val ppl: %f, " % test() if args.val_file_pattern is not None else "") + "consumed %fs" % (time.time() - pass_start_time)) fluid.io.save_persistables( exe, os.path.join(TrainTaskConfig.ckpt_dir, "pass_" + str(pass_id) + ".checkpoint")) fluid.io.save_inference_model( os.path.join(TrainTaskConfig.model_dir, "pass_" + str(pass_id) + ".infer.model"), data_input_names[:-2] + util_input_names, [predict], exe)
def main():
    """Train a Transformer on NIST data with a single-device Executor.

    Builds the model and Adam optimizer, initializes all parameters
    (including the constant position-encoding tables), then runs the
    configured number of passes, saving an inference model after each.
    """
    # Select the run device from the task configuration.
    device = fluid.CUDAPlace(0) if TrainTaskConfig.use_gpu else fluid.CPUPlace()
    executor = fluid.Executor(device)

    # `+ 0` on the vocab sizes is kept from the original; the sizes are
    # used unchanged.
    sum_cost, avg_cost, predict, token_num = transformer(
        ModelHyperParams.src_vocab_size + 0,
        ModelHyperParams.trg_vocab_size + 0,
        ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer,
        ModelHyperParams.n_head,
        ModelHyperParams.d_key,
        ModelHyperParams.d_value,
        ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid,
        ModelHyperParams.dropout,
        ModelHyperParams.src_pad_idx,
        ModelHyperParams.trg_pad_idx,
        ModelHyperParams.pos_pad_idx)

    # Warmup-style learning-rate schedule; the schedule feeds the rate in
    # through a variable read by the optimizer.
    scheduler = LearningRateScheduler(ModelHyperParams.d_model,
                                      TrainTaskConfig.warmup_steps, device,
                                      TrainTaskConfig.learning_rate)
    adam = fluid.optimizer.Adam(learning_rate=scheduler.learning_rate,
                                beta1=TrainTaskConfig.beta1,
                                beta2=TrainTaskConfig.beta2,
                                epsilon=TrainTaskConfig.eps)
    # Optimize either the averaged or the summed cost, per configuration.
    adam.minimize(avg_cost if TrainTaskConfig.use_avg_cost else sum_cost)

    train_data = paddle.batch(
        paddle.reader.shuffle(
            nist_data_provider.train("data", ModelHyperParams.src_vocab_size,
                                     ModelHyperParams.trg_vocab_size),
            buf_size=100000),
        batch_size=TrainTaskConfig.batch_size)

    # Initialize the parameters.
    executor.run(fluid.framework.default_startup_program())
    # The position-encoding tables are fixed, so fill them in once here.
    for pos_enc_param_name in pos_enc_param_names:
        pos_enc_param = fluid.global_scope().find_var(
            pos_enc_param_name).get_tensor()
        pos_enc_param.set(
            position_encoding_init(ModelHyperParams.max_length + 1,
                                   ModelHyperParams.d_model), device)

    for pass_id in xrange(TrainTaskConfig.pass_num):
        pass_start_time = time.time()
        for batch_id, data in enumerate(train_data()):
            data_input = prepare_batch_input(
                data, encoder_input_data_names +
                decoder_input_data_names[:-1] + label_data_names,
                ModelHyperParams.src_pad_idx, ModelHyperParams.trg_pad_idx,
                ModelHyperParams.n_head, ModelHyperParams.d_model)
            scheduler.update_learning_rate(data_input)
            outs = executor.run(fluid.framework.default_main_program(),
                                feed=data_input,
                                fetch_list=[sum_cost, avg_cost],
                                use_program_cache=True)
            sum_cost_val, avg_cost_val = np.array(outs[0]), np.array(outs[1])
            # Clip the loss at 100 before exponentiating to avoid overflow
            # in the reported perplexity.
            print("epoch: %d, batch: %d, sum loss: %f, avg loss: %f, ppl: %f" %
                  (pass_id, batch_id, sum_cost_val, avg_cost_val,
                   np.exp([min(avg_cost_val[0], 100)])))
        pass_end_time = time.time()
        time_consumed = pass_end_time - pass_start_time
        print("pass_id = " + str(pass_id) + " time_consumed = " +
              str(time_consumed))
        # Persist a deployable inference model for this pass.
        fluid.io.save_inference_model(
            os.path.join(TrainTaskConfig.model_dir,
                         "pass_" + str(pass_id) + ".infer.model"),
            encoder_input_data_names + decoder_input_data_names[:-1],
            [predict], executor)
def do_predict(args):
    """Decode ``args.predict_file`` with a trained dygraph Transformer and
    write beam-search outputs to ``args.output_file`` (opened in binary mode).

    In streaming mode (``args.stream``) the writer emits one chunk per line,
    aligned with the simultaneous-translation read/write schedule carried in
    ``real_read``; otherwise each hypothesis becomes one space-joined line.
    """
    # Pick the device for dygraph execution.
    if args.use_cuda:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()

    # define the data generator
    '''
    # old reader
    processor = reader.DataProcessor(fpattern=args.predict_file,
                                     src_vocab_fpath=args.src_vocab_fpath,
                                     trg_vocab_fpath=args.trg_vocab_fpath,
                                     token_delimiter=args.token_delimiter,
                                     use_token_batch=False,
                                     batch_size=args.batch_size,
                                     device_count=1,
                                     pool_size=args.pool_size,
                                     sort_type=reader.SortType.NONE,
                                     shuffle=False,
                                     shuffle_batch=False,
                                     start_mark=args.special_token[0],
                                     end_mark=args.special_token[1],
                                     unk_mark=args.special_token[2],
                                     max_length=args.max_length,
                                     n_head=args.n_head)
    '''
    # Current reader: adds only_src / stream / src_bpe_dict on top of the
    # old configuration above.
    processor = reader.DataProcessor(fpattern=args.predict_file,
                                     src_vocab_fpath=args.src_vocab_fpath,
                                     trg_vocab_fpath=args.trg_vocab_fpath,
                                     token_delimiter=args.token_delimiter,
                                     use_token_batch=False,
                                     batch_size=args.batch_size,
                                     device_count=1,
                                     pool_size=args.pool_size,
                                     sort_type=reader.SortType.NONE,
                                     shuffle=False,
                                     shuffle_batch=False,
                                     only_src=args.only_src,
                                     start_mark=args.special_token[0],
                                     end_mark=args.special_token[1],
                                     unk_mark=args.special_token[2],
                                     max_length=args.max_length,
                                     n_head=args.n_head,
                                     stream=args.stream,
                                     src_bpe_dict=args.src_bpe_dict)
    batch_generator = processor.data_generator(phase="predict", place=place)
    # Propagate vocab sizes and special-token ids onto args for the model
    # construction below.
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()
    trg_idx2word = reader.DataProcessor.load_dict(
        dict_path=args.trg_vocab_fpath, reverse=True)
    # NOTE(review): this second get_vocab_summary() assignment is redundant
    # with the one just above.
    args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \
        args.unk_idx = processor.get_vocab_summary()
    with fluid.dygraph.guard(place):
        # define data loader
        test_loader = fluid.io.DataLoader.from_generator(capacity=10)
        test_loader.set_batch_generator(batch_generator, places=place)

        # define model
        transformer = Transformer(
            args.src_vocab_size, args.trg_vocab_size, args.max_length + 1,
            args.n_layer, args.n_head, args.d_key, args.d_value,
            args.d_model, args.d_inner_hid, args.prepostprocess_dropout,
            args.attention_dropout, args.relu_dropout, args.preprocess_cmd,
            args.postprocess_cmd, args.weight_sharing, args.bos_idx,
            args.eos_idx)

        # load the trained model
        assert args.init_from_params, (
            "Please set init_from_params to load the infer model.")
        model_dict, _ = fluid.load_dygraph(
            os.path.join(args.init_from_params, "transformer"))
        # to avoid a longer length than training, reset the size of position
        # encoding to max_length
        model_dict["encoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        model_dict["decoder.pos_encoder.weight"] = position_encoding_init(
            args.max_length + 1, args.d_model)
        transformer.load_dict(model_dict)

        # set evaluate mode
        transformer.eval()

        f = open(args.output_file, "wb")
        detok = MosesDetokenizer(lang='en')
        detc = MosesDetruecaser()

        for input_data in test_loader():
            # Streaming batches additionally carry `real_read` (per sentence,
            # the read counts of the read/write schedule).
            if args.stream:
                (src_word, src_pos, src_slf_attn_bias, trg_word,
                 trg_src_attn_bias, real_read) = input_data
            else:
                (src_word, src_pos, src_slf_attn_bias, trg_word,
                 trg_src_attn_bias) = input_data
            finished_seq, finished_scores = transformer.beam_search(
                src_word,
                src_pos,
                src_slf_attn_bias,
                trg_word,
                trg_src_attn_bias,
                bos_id=args.bos_idx,
                eos_id=args.eos_idx,
                beam_size=args.beam_size,
                max_len=args.max_out_len,
                waitk=args.waitk,
                stream=args.stream)
            finished_seq = finished_seq.numpy()
            finished_scores = finished_scores.numpy()
            for idx, ins in enumerate(finished_seq):
                for beam_idx, beam in enumerate(ins):
                    # Keep only the n_best highest-ranked hypotheses.
                    if beam_idx >= args.n_best:
                        break
                    id_list = post_process_seq(beam, args.bos_idx,
                                               args.eos_idx)
                    word_list = [trg_idx2word[id] for id in id_list]
                    if args.stream:
                        if args.waitk > 0:
                            # for wait-k models, wait k words in the beginning
                            word_list = [b''] * (args.waitk - 1) + word_list
                        else:
                            # for full sentence model, wait until the end
                            word_list = [b''] * (
                                len(real_read[idx].numpy()) - 1) + word_list
                        final_output = []
                        real_output = []
                        _read = real_read[idx].numpy()
                        sent = ''
                        bpe_flag = False
                        for j in range(max(len(_read), len(word_list))):
                            # append number of reads at step j
                            r = _read[j] if j < len(_read) else 0
                            if r > 0:
                                # r reads precede this write: emit r - 1
                                # empty chunks to stay aligned with the
                                # read schedule.
                                final_output += [b''] * (r - 1)
                            # append number of writes at step j
                            w = word_list[j] if j < len(word_list) else b''
                            w = w.decode('utf-8')
                            real_output.append(w)
                            # if bpe_flag:
                            #     _sent = ('%s@@ %s'%(sent, w)).strip()
                            # else:
                            #     _sent = ('%s %s'%(sent, w)).strip()
                            _sent = ' '.join(real_output)
                            # Append a one-char sentinel word before
                            # detokenizing; it is cut off again below via
                            # `_sent[:-1]` — presumably keeps the
                            # detokenizer from treating the partial prefix
                            # as a finished sentence. TODO confirm.
                            if len(_sent) > 0:
                                _sent += ' a'
                            _sent = ' '.join(_sent.split())
                            # if _sent.endswith('@@ a'):
                            #     bpe_flag = True
                            # else:
                            #     bpe_flag = False
                            # Undo BPE, then detokenize and detruecase the
                            # whole prefix seen so far.
                            _sent = _sent.replace('@@ ', '')
                            _sent = detok.detokenize(_sent.split())
                            _sent = detc.detruecase(_sent)
                            _sent = ' '.join(_sent)
                            _sent = _sent[:-1].strip()
                            # Only the newly produced suffix relative to the
                            # previous prefix is written at this step.
                            incre = _sent[len(sent):]
                            #print('_sent0:', _sent)
                            sent = _sent
                            #print('sent:', sent)
                            if r > 0:
                                # if there is read, append a word to write
                                # final_output.append(w)
                                final_output.append(str.encode(incre))
                            else:
                                # if there is no read, append word to the
                                # final write
                                if j >= len(word_list):
                                    break
                                # final_output[-1] += b' '+w
                                final_output[-1] += str.encode(incre)
                            #print(final_output)
                            #print('incre:', incre)
                            #print('_sent1:', _sent)
                        # f.write(bytes('part:'+_sent+'\n'))
                        sequence = b"\n".join(final_output) + b" \n"
                        f.write(sequence)
                        # embed()
                    else:
                        sequence = b" ".join(word_list) + b"\n"
                        f.write(sequence)
                    f.flush()