def do_train(args): if args.use_cuda: if num_trainers > 1: # for multi-process gpu training dev_count = 1 else: dev_count = fluid.core.get_cuda_device_count() gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) else: dev_count = int(os.environ.get('CPU_NUM', 1)) place = fluid.CPUPlace() # define the data generator processor = reader.DataProcessor(fpattern=args.training_file, src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, token_delimiter=args.token_delimiter, use_token_batch=args.use_token_batch, batch_size=args.batch_size, device_count=dev_count, pool_size=args.pool_size, sort_type=args.sort_type, shuffle=args.shuffle, shuffle_batch=args.shuffle_batch, start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], max_length=args.max_length, n_head=args.n_head) batch_generator = processor.data_generator(phase="train") if num_trainers > 1: # for multi-process gpu training batch_generator = fluid.contrib.reader.distributed_batch_reader( batch_generator) args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ args.unk_idx = processor.get_vocab_summary() train_prog = fluid.default_main_program() startup_prog = fluid.default_startup_program() random_seed = eval(str(args.random_seed)) if random_seed is not None: train_prog.random_seed = random_seed startup_prog.random_seed = random_seed with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): # define input and reader input_field_names = desc.encoder_data_input_fields + \ desc.decoder_data_input_fields[:-1] + desc.label_data_input_fields input_descs = desc.get_input_descs(args.args) input_slots = [{ "name": name, "shape": input_descs[name][0], "dtype": input_descs[name][1] } for name in input_field_names] input_field = InputField(input_slots) input_field.build(build_pyreader=True) # define the network sum_cost, avg_cost, token_num = create_net(is_training=True, model_input=input_field, args=args) # define the optimizer with fluid.default_main_program()._lr_schedule_guard(): learning_rate = fluid.layers.learning_rate_scheduler.noam_decay( args.d_model, args.warmup_steps) * args.learning_rate optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, beta1=args.beta1, beta2=args.beta2, epsilon=float(args.eps)) optimizer.minimize(avg_cost) # prepare training ## decorate the pyreader with batch_generator input_field.loader.set_batch_generator(batch_generator) ## define the executor and program for training exe = fluid.Executor(place) exe.run(startup_prog) # init position_encoding for pos_enc_param_name in desc.pos_enc_param_names: pos_enc_param = fluid.global_scope().find_var( pos_enc_param_name).get_tensor() pos_enc_param.set( position_encoding_init(args.max_length + 1, args.d_model), place) assert (args.init_from_checkpoint == "") or (args.init_from_pretrain_model == "") ## init from some checkpoint, to resume the previous training if args.init_from_checkpoint: load(train_prog, os.path.join(args.init_from_checkpoint, "transformer"), exe) print("finish initing model from checkpoint from %s" % (args.init_from_checkpoint)) ## init from some pretrain models, to better solve the current task if args.init_from_pretrain_model: load(train_prog, os.path.join(args.init_from_pretrain_model, "transformer"), exe) print("finish initing model from pretrained params from %s" % (args.init_from_pretrain_model)) build_strategy = fluid.compiler.BuildStrategy() build_strategy.enable_inplace = True exec_strategy = fluid.ExecutionStrategy() if num_trainers > 1: dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog) exec_strategy.num_threads = 1 compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=avg_cost.name, build_strategy=build_strategy, exec_strategy=exec_strategy) # the best cross-entropy value with label smoothing loss_normalizer = -( (1. - args.label_smooth_eps) * np.log((1. - args.label_smooth_eps)) + args.label_smooth_eps * np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) # start training step_idx = 0 total_batch_num = 0 # this is for benchmark total_batch_token_num = 0 # this is for benchmark word count for pass_id in range(args.epoch): pass_start_time = time.time() input_field.loader.start() batch_id = 0 while True: if args.max_iter and total_batch_num == args.max_iter: # this for benchmark return try: outs = exe.run(compiled_train_prog, fetch_list=[sum_cost.name, token_num.name]) total_batch_token_num += np.asarray(outs[1]).sum() if step_idx % args.print_step == 0: sum_cost_val, token_num_val = np.asarray( outs[0]), np.asarray(outs[1]) # sum the cost from multi-devices total_sum_cost = sum_cost_val.sum() total_token_num = token_num_val.sum() total_avg_cost = total_sum_cost / total_token_num if step_idx == 0: logging.info( "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "normalized loss: %f, ppl: %f" % (step_idx, pass_id, batch_id, total_avg_cost, total_avg_cost - loss_normalizer, np.exp([min(total_avg_cost, 100)]))) avg_batch_time = time.time() else: logging.info( "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "normalized loss: %f, ppl: %f, batch speed: %.2f steps/s, ips: %.2f words/sec" % (step_idx, pass_id, batch_id, total_avg_cost, total_avg_cost - loss_normalizer, np.exp([min(total_avg_cost, 100) ]), args.print_step / (time.time() - avg_batch_time), total_batch_token_num / (time.time() - avg_batch_time))) avg_batch_time = time.time() total_batch_token_num = 0 if step_idx % args.save_step == 0 and step_idx != 0: if args.save_model_path: model_path = os.path.join(args.save_model_path, "step_" + str(step_idx), "transformer") fluid.save(train_prog, model_path) batch_id += 1 step_idx += 1 total_batch_num = total_batch_num + 1 # this is for benchmark # profiler tools for benchmark if args.is_profiler and pass_id == 0 and batch_id == args.print_step: profiler.start_profiler("All") elif args.is_profiler and pass_id == 0 and batch_id == args.print_step + 5: profiler.stop_profiler("total", args.profiler_path) return except fluid.core.EOFException: input_field.loader.reset() break time_consumed = time.time() - pass_start_time if args.save_model_path: model_path = os.path.join(args.save_model_path, "step_final", "transformer") fluid.save(train_prog, model_path) if args.enable_ce: # For CE print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost)) print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))
def do_predict(args): if args.use_cuda: dev_count = fluid.core.get_cuda_device_count() place = fluid.CUDAPlace(0) else: dev_count = int(os.environ.get('CPU_NUM', 1)) place = fluid.CPUPlace() # define the data generator processor = reader.DataProcessor(fpattern=args.predict_file, src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, token_delimiter=args.token_delimiter, use_token_batch=False, batch_size=args.batch_size, device_count=dev_count, pool_size=args.pool_size, sort_type=reader.SortType.NONE, shuffle=False, shuffle_batch=False, start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], max_length=args.max_length, n_head=args.n_head) batch_generator = processor.data_generator(phase="predict", place=place) args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ args.unk_idx = processor.get_vocab_summary() trg_idx2word = reader.DataProcessor.load_dict( dict_path=args.trg_vocab_fpath, reverse=True) test_prog = fluid.default_main_program() startup_prog = fluid.default_startup_program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): # define input and reader input_field_names = desc.encoder_data_input_fields + desc.fast_decoder_data_input_fields input_slots = [{ "name": name, "shape": desc.input_descs[name][0], "dtype": desc.input_descs[name][1] } for name in input_field_names] input_field = InputField(input_slots) input_field.build(build_pyreader=True) # define the network out_ids, out_scores, weight_matrix = create_net( is_training=False, model_input=input_field, args=args) out_ids.persistable = out_scores.persistable = weight_matrix.persistable = True # This is used here to set dropout to the test mode. test_prog = test_prog.clone(for_test=True) # prepare predicting ## define the executor and program for training exe = fluid.Executor(place) exe.run(startup_prog) assert (args.init_from_params) or (args.init_from_pretrain_model) if args.init_from_params: init_from_params(args, exe, test_prog) elif args.init_from_pretrain_model: init_from_pretrain_model(args, exe, test_prog) # to avoid a longer length than training, reset the size of position encoding to max_length for pos_enc_param_name in desc.pos_enc_param_names: pos_enc_param = fluid.global_scope().find_var( pos_enc_param_name).get_tensor() pos_enc_param.set( position_encoding_init(args.max_length + 1, args.d_model), place) exe_strategy = fluid.ExecutionStrategy() # to clear tensor array after each iteration exe_strategy.num_iteration_per_drop_scope = 1 compiled_test_prog = fluid.CompiledProgram(test_prog).with_data_parallel( exec_strategy=exe_strategy, places=place) f = open(args.output_file, "wb") # start predicting ## decorate the pyreader with batch_generator input_field.reader.decorate_batch_generator(batch_generator) input_field.reader.start() while True: try: #print(input_field.src_word) seq_ids, seq_scores, out_weight = exe.run( test_prog, fetch_list=[out_ids.name, out_scores.name, weight_matrix], return_numpy=False) # print(out_weight) #print(weight_matrix) # How to parse the results: # Suppose the lod of seq_ids is: # [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]] # then from lod[0]: # there are 2 source sentences, beam width is 3. # from lod[1]: # the first source sentence has 3 hyps; the lengths are 12, 12, 16 # the second source sentence has 3 hyps; the lengths are 14, 13, 15 hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)] scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)] for i in range(len(seq_ids.lod()[0]) - 1): # for each source sentence start = seq_ids.lod()[0][i] end = seq_ids.lod()[0][i + 1] for j in range(end - start): # for each candidate sub_start = seq_ids.lod()[1][start + j] sub_end = seq_ids.lod()[1][start + j + 1] hyps[i].append(b" ".join([ trg_idx2word[idx] for idx in post_process_seq( np.array(seq_ids)[sub_start:sub_end], args.bos_idx, args.eos_idx) ])) scores[i].append(np.array(seq_scores)[sub_end - 1]) f.write(hyps[i][-1] + b"\n") if len(hyps[i]) >= args.n_best: break except fluid.core.EOFException: break f.close()