            # NOTE(review): the two lines below are the tail of a training
            # function whose start is outside this view; their indentation is
            # reconstructed (appears to close a kpi print) — confirm against
            # the full file.
            (task_name, card_num, ce_loss))
        print("kpis\ttrain_acc_%s_card%s\t%f" % (task_name, card_num, ce_acc))

    # evaluate on test set
    if not args.do_train and args.do_val:
        print("Final test result:")
        evaluate(test_exe, test_prog, test_loader,
                 [loss.name, accuracy.name, num_seqs.name], "test")
    # infer
    if args.do_infer:
        print("Final infer result:")
        infer(test_exe, test_prog, infer_loader, [probs.name], "infer")


def get_cards():
    """Return the number of GPU cards visible via CUDA_VISIBLE_DEVICES.

    Returns 0 when the variable is unset or empty (CPU-only run).
    """
    num = 0
    cards = os.environ.get('CUDA_VISIBLE_DEVICES', '')
    if cards != '':
        # The variable is a comma-separated list of device ids.
        num = len(cards.split(","))
    return num


if __name__ == "__main__":
    # Build the run configuration from config.json, validate the
    # environment (CUDA availability, paddle version), then run.
    args = PDConfig('config.json')
    args.build()
    args.print_arguments()
    check_cuda(args.use_cuda)
    check_version()
    main(args)
def fast_infer(args):
    """
    Inference by beam search decoder based solely on Fluid operators.

    Builds the fast decoder graph, loads trained parameters from
    ``InferTaskConfig.model_path``, then iterates over the test data and
    prints up to ``InferTaskConfig.n_best`` decoded hypotheses per source
    sentence.
    """
    out_ids, out_scores, pyreader = fast_decoder(
        ModelHyperParams.src_vocab_size,
        ModelHyperParams.trg_vocab_size,
        ModelHyperParams.max_length + 1,
        ModelHyperParams.n_layer,
        ModelHyperParams.n_head,
        ModelHyperParams.d_key,
        ModelHyperParams.d_value,
        ModelHyperParams.d_model,
        ModelHyperParams.d_inner_hid,
        ModelHyperParams.prepostprocess_dropout,
        ModelHyperParams.attention_dropout,
        ModelHyperParams.relu_dropout,
        ModelHyperParams.preprocess_cmd,
        ModelHyperParams.postprocess_cmd,
        ModelHyperParams.weight_sharing,
        InferTaskConfig.beam_size,
        InferTaskConfig.max_out_len,
        ModelHyperParams.bos_idx,
        ModelHyperParams.eos_idx,
        use_py_reader=args.use_py_reader)

    # This is used here to set dropout to the test mode.
    infer_program = fluid.default_main_program().clone(for_test=True)

    if args.use_mem_opt:
        fluid.memory_optimize(infer_program)

    if InferTaskConfig.use_gpu:
        check_cuda(InferTaskConfig.use_gpu)
        place = fluid.CUDAPlace(0)
        dev_count = fluid.core.get_cuda_device_count()
    else:
        place = fluid.CPUPlace()
        dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    # Restore only trainable parameters; other variables (e.g. reader
    # state) are created fresh by the startup program above.
    fluid.io.load_vars(
        exe,
        InferTaskConfig.model_path,
        vars=[
            var for var in infer_program.list_vars()
            if isinstance(var, fluid.framework.Parameter)
        ])

    exec_strategy = fluid.ExecutionStrategy()
    # For faster executor
    exec_strategy.use_experimental_executor = True
    exec_strategy.num_threads = 1
    build_strategy = fluid.BuildStrategy()
    infer_exe = fluid.ParallelExecutor(
        use_cuda=TrainTaskConfig.use_gpu,
        main_program=infer_program,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    # data reader settings for inference: reuse the training reader but
    # disable batching-by-token, sorting and shuffling so output order
    # matches input order.
    args.train_file_pattern = args.test_file_pattern
    args.use_token_batch = False
    args.sort_type = reader.SortType.NONE
    args.shuffle = False
    args.shuffle_batch = False
    test_data = prepare_data_generator(
        args,
        is_test=False,
        count=dev_count,
        pyreader=pyreader,
        py_reader_provider_wrapper=py_reader_provider_wrapper,
        place=place)
    if args.use_py_reader:
        pyreader.start()
        data_generator = None
    else:
        data_generator = test_data()

    trg_idx2word = reader.DataReader.load_dict(
        dict_path=args.trg_vocab_fpath, reverse=True)

    while True:
        try:
            feed_dict_list = prepare_feed_dict_list(data_generator, dev_count,
                                                    place)
            if args.use_parallel_exe:
                seq_ids, seq_scores = infer_exe.run(
                    fetch_list=[out_ids.name, out_scores.name],
                    feed=feed_dict_list,
                    return_numpy=False)
            else:
                seq_ids, seq_scores = exe.run(
                    program=infer_program,
                    fetch_list=[out_ids.name, out_scores.name],
                    feed=feed_dict_list[0]
                    if feed_dict_list is not None else None,
                    return_numpy=False,
                    use_program_cache=False)
            # Fix: the original conditional expression bound only to the
            # second tuple element, so seq_ids_list was always [seq_ids]
            # and, in the non-LoDTensor case, seq_scores_list wrongly became
            # (seq_ids, seq_scores).  Parenthesize both branches so a single
            # LoDTensor is wrapped in a list and a list of per-device
            # tensors is passed through unchanged.
            seq_ids_list, seq_scores_list = (
                ([seq_ids], [seq_scores])
                if isinstance(seq_ids, paddle.fluid.LoDTensor) else
                (seq_ids, seq_scores))
            for seq_ids, seq_scores in zip(seq_ids_list, seq_scores_list):
                # How to parse the results:
                #   Suppose the lod of seq_ids is:
                #     [[0, 3, 6], [0, 12, 24, 40, 54, 67, 82]]
                #   then from lod[0]:
                #     there are 2 source sentences, beam width is 3.
                #   from lod[1]:
                #     the first source sentence has 3 hyps; the lengths are
                #     12, 12, 16
                #     the second source sentence has 3 hyps; the lengths are
                #     14, 13, 15
                hyps = [[] for i in range(len(seq_ids.lod()[0]) - 1)]
                scores = [[] for i in range(len(seq_scores.lod()[0]) - 1)]
                for i in range(len(seq_ids.lod()[0]) -
                               1):  # for each source sentence
                    start = seq_ids.lod()[0][i]
                    end = seq_ids.lod()[0][i + 1]
                    for j in range(end - start):  # for each candidate
                        sub_start = seq_ids.lod()[1][start + j]
                        sub_end = seq_ids.lod()[1][start + j + 1]
                        hyps[i].append(" ".join([
                            trg_idx2word[idx]
                            for idx in post_process_seq(
                                np.array(seq_ids)[sub_start:sub_end])
                        ]))
                        # Beam-search scores are cumulative; the last token's
                        # score is the whole hypothesis' score.
                        scores[i].append(np.array(seq_scores)[sub_end - 1])
                        print(hyps[i][-1])
                        if len(hyps[i]) >= InferTaskConfig.n_best:
                            break
        except (StopIteration, fluid.core.EOFException):
            # The data pass is over.
            if args.use_py_reader:
                pyreader.reset()
            break
def main():
    """Train and evaluate the PTB LSTM language model.

    Builds the train and inference programs, optionally restores
    pretrained parameters, trains for ``config.max_epoch`` epochs (with
    per-epoch validation and checkpointing), then reports test perplexity.
    """
    args = parse_args()

    # check if set use_gpu=True in paddlepaddle cpu version
    check_cuda(args.use_gpu)
    # check if paddlepaddle version is satisfied
    check_version()

    logger = logging.getLogger("lm")
    logger.setLevel(logging.INFO)
    formatter = logging.Formatter(
        '%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    if args.log_path:
        file_handler = logging.FileHandler(args.log_path)
        file_handler.setLevel(logging.INFO)
        file_handler.setFormatter(formatter)
        logger.addHandler(file_handler)
    else:
        console_handler = logging.StreamHandler()
        console_handler.setLevel(logging.INFO)
        console_handler.setFormatter(formatter)
        logger.addHandler(console_handler)

    logger.info('Running with args : {}'.format(args))

    config = RNNConfig(args)

    # define train program
    main_program = fluid.Program()
    startup_program = fluid.Program()
    if args.enable_ce:
        # Fixed seed for continuous-evaluation reproducibility.
        startup_program.random_seed = SEED
    with fluid.program_guard(main_program, startup_program):
        with fluid.unique_name.guard():
            res_vars = lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                config.batch_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=args.use_dataloader)

            if args.use_dataloader:
                # With a dataloader the model returns it as the extra
                # trailing element.
                dataloader = res_vars[-1]
                res_vars = res_vars[:-1]
            loss, last_hidden, last_cell, feed_order = res_vars

            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=config.max_grad_norm))

            # The learning rate is fed each step (see prepare_input), so
            # create it as a persistable global var instead of a scheduler.
            learning_rate = fluid.layers.create_global_var(
                name="learning_rate",
                shape=[1],
                value=1.0,
                dtype='float32',
                persistable=True)

            optimizer = fluid.optimizer.SGD(learning_rate=learning_rate)
            optimizer.minimize(loss)

    # define inference program
    inference_program = fluid.Program()
    inference_startup_program = fluid.Program()
    with fluid.program_guard(inference_program, inference_startup_program):
        with fluid.unique_name.guard():
            lm_model.lm_model(
                config.hidden_size,
                config.vocab_size,
                config.batch_size,
                num_layers=config.num_layers,
                num_steps=config.num_steps,
                init_scale=config.init_scale,
                dropout=config.dropout,
                rnn_model=config.rnn_model,
                use_dataloader=False)

    # Some op behaves differently for train and inference, we need to call
    # this clone function to ensure every op is right for inference.
    inference_program = inference_program.clone(for_test=True)

    place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace()
    exe = Executor(place)
    exe.run(startup_program)

    if args.init_from_pretrain_model:
        if not os.path.exists(args.init_from_pretrain_model + '.pdparams'):
            print(args.init_from_pretrain_model)
            # Fix: a dead `return` used to follow this raise; it was
            # unreachable and has been removed.
            raise Warning("The pretrained params do not exist.")
        fluid.load(main_program, args.init_from_pretrain_model)
        print("finish initing model from pretrained params from %s" %
              (args.init_from_pretrain_model))

    device_count = len(fluid.cuda_places()) if args.use_gpu else len(
        fluid.cpu_places())

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = device_count
    exec_strategy.num_iteration_per_drop_scope = 100

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True

    if args.parallel:
        train_program = fluid.compiler.CompiledProgram(
            main_program).with_data_parallel(
                loss_name=loss.name,
                build_strategy=build_strategy,
                exec_strategy=exec_strategy)
    else:
        train_program = fluid.compiler.CompiledProgram(main_program)

    data_path = args.data_path
    print("begin to load data")
    ptb_data = reader.get_ptb_data(data_path)
    print("finished load data")
    train_data, valid_data, test_data = ptb_data

    def generate_init_data():
        # Zero initial RNN state; the "lod" model variant expects
        # (batch, layers, hidden), the others (layers, batch, hidden).
        if args.rnn_model == "lod":
            init_hidden = np.zeros(
                (config.batch_size, config.num_layers, config.hidden_size),
                dtype='float32')
            init_cell = np.zeros(
                (config.batch_size, config.num_layers, config.hidden_size),
                dtype='float32')
        else:
            init_hidden = np.zeros(
                (config.num_layers, config.batch_size, config.hidden_size),
                dtype='float32')
            init_cell = np.zeros(
                (config.num_layers, config.batch_size, config.hidden_size),
                dtype='float32')
        return init_hidden, init_cell

    def generate_new_lr(epoch_id=0, device_count=1):
        # Exponential decay after epoch_start_decay; one lr value per device.
        new_lr = config.base_learning_rate * (config.lr_decay**max(
            epoch_id + 1 - config.epoch_start_decay, 0.0))
        lr = np.ones((device_count), dtype='float32') * new_lr
        return lr

    def prepare_input(batch,
                      init_hidden=None,
                      init_cell=None,
                      epoch_id=0,
                      with_lr=True,
                      device_count=1):
        # Build the feed dict for one step: inputs, labels, carried RNN
        # state and (for training) the current learning rate.
        x, y = batch
        batch_size = x.shape[0]
        x = x.reshape((-1, config.num_steps, 1))
        y = y.reshape((-1, 1))

        if args.rnn_model == "lod":
            # The lod variant consumes flat sequences with an explicit LoD.
            x = to_lodtensor(x.reshape((-1, 1)), place, [
                range(0, (batch_size + 1) * config.num_steps,
                      config.num_steps)
            ])
            y = to_lodtensor(y.reshape((-1, 1)), place, [
                range(0, (batch_size + 1) * config.num_steps,
                      config.num_steps)
            ])
        res = {}
        res['x'] = x
        res['y'] = y
        if init_hidden is not None:
            res['init_hidden'] = init_hidden
        if init_cell is not None:
            res['init_cell'] = init_cell
        if with_lr:
            res['learning_rate'] = generate_new_lr(epoch_id, device_count)

        return res

    def eval(data):
        # when eval the batch_size set to 1
        eval_data_iter = reader.get_data_iter(data, config.batch_size,
                                              config.num_steps)
        total_loss = 0.0
        iters = 0
        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(eval_data_iter):
            input_data_feed = prepare_input(
                batch, init_hidden, init_cell, epoch_id=0, with_lr=False)
            fetch_outs = exe.run(
                program=inference_program,
                feed=input_data_feed,
                fetch_list=[loss.name, last_hidden.name, last_cell.name],
                use_program_cache=False)

            cost_eval = np.array(fetch_outs[0])
            # Carry the RNN state across batches (stateful evaluation).
            init_hidden = np.array(fetch_outs[1])
            init_cell = np.array(fetch_outs[2])

            total_loss += cost_eval
            iters += config.num_steps

        ppl = np.exp(total_loss / iters)
        return ppl

    def get_log_interval(data_len):
        # Log roughly 10 times per epoch, at least every batch.
        num_batchs = data_len // config.batch_size
        epoch_size = (num_batchs - 1) // config.num_steps
        log_interval = max(1, epoch_size // 10)
        return log_interval

    def train_an_epoch(epoch_id, batch_times):
        # get train epoch size
        log_interval = get_log_interval(len(train_data))
        train_data_iter = reader.get_data_iter(train_data, config.batch_size,
                                               config.num_steps)

        total_loss = 0
        iters = 0
        init_hidden, init_cell = generate_init_data()
        for batch_id, batch in enumerate(train_data_iter):
            input_data_feed = prepare_input(
                batch,
                init_hidden=init_hidden,
                init_cell=init_cell,
                epoch_id=epoch_id,
                with_lr=True,
                device_count=device_count)

            batch_start_time = time.time()
            fetch_outs = exe.run(train_program,
                                 feed=input_data_feed,
                                 fetch_list=[
                                     loss.name, "learning_rate",
                                     last_hidden.name, last_cell.name
                                 ],
                                 use_program_cache=True)
            batch_time = time.time() - batch_start_time
            batch_times.append(batch_time)

            cost_train = np.array(fetch_outs[0])
            lr = np.array(fetch_outs[1])
            # Carry the RNN state into the next batch.
            init_hidden = np.array(fetch_outs[2])
            init_cell = np.array(fetch_outs[3])

            total_loss += cost_train
            iters += config.num_steps
            if batch_id > 0 and batch_id % log_interval == 0:
                ppl = np.exp(total_loss / iters)
                print(
                    "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                    % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

        ppl = np.exp(total_loss / iters)
        return ppl

    def train_an_epoch_dataloader(epoch_id, batch_times):
        # get train epoch size
        log_interval = get_log_interval(len(train_data))

        init_hidden, init_cell = generate_init_data()

        total_loss = 0
        iters = 0

        dataloader.start()
        batch_id = 0
        try:
            while True:
                data_feeds = {}
                if batch_id == 0:
                    # First iteration only starts the timer; its batch
                    # time is not representative (warm-up).
                    batch_time = 0
                    batch_start_time = time.time()
                else:
                    batch_time = time.time() - batch_start_time
                    batch_times.append(batch_time)
                    batch_start_time = time.time()

                new_lr = generate_new_lr(epoch_id, device_count)
                data_feeds['learning_rate'] = new_lr
                data_feeds["init_hidden"] = init_hidden
                data_feeds["init_cell"] = init_cell

                fetch_outs = exe.run(train_program,
                                     feed=data_feeds,
                                     fetch_list=[
                                         loss.name, "learning_rate",
                                         last_hidden.name, last_cell.name
                                     ],
                                     use_program_cache=True)

                cost_train = np.array(fetch_outs[0])
                lr = np.array(fetch_outs[1])
                init_hidden = np.array(fetch_outs[2])
                init_cell = np.array(fetch_outs[3])

                total_loss += cost_train
                iters += config.num_steps

                if batch_id > 0 and (log_interval == 0 or
                                     batch_id % log_interval == 0):
                    ppl = np.exp(total_loss / iters)
                    print(
                        "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f"
                        % (epoch_id, batch_id, batch_time, ppl[0], lr[0]))

                batch_id += 1
        except fluid.core.EOFException:
            # End of data pass.
            dataloader.reset()

        batch_times.append(time.time() - batch_start_time)
        ppl = np.exp(total_loss / iters)
        return ppl

    def train():
        if args.use_dataloader:

            def data_gen():
                # Each device gets batch_size // device_count samples.
                data_iter_size = config.batch_size // device_count
                train_batches = reader.get_data_iter(
                    train_data, data_iter_size, config.num_steps)
                for batch in train_batches:
                    x, y = batch
                    x = x.reshape((-1, config.num_steps, 1))
                    y = y.reshape((-1, 1))
                    if args.rnn_model == "lod":
                        x = to_lodtensor(x.reshape((-1, 1)), place, [
                            range(0, (data_iter_size + 1) * config.num_steps,
                                  config.num_steps)
                        ])
                        y = to_lodtensor(y.reshape((-1, 1)), place, [
                            range(0, (data_iter_size + 1) * config.num_steps,
                                  config.num_steps)
                        ])
                    yield x, y

            dataloader.set_batch_generator(data_gen)

        total_time = 0.0
        for epoch_id in range(config.max_epoch):
            batch_times = []
            epoch_start_time = time.time()
            if args.use_dataloader:
                train_ppl = train_an_epoch_dataloader(epoch_id, batch_times)
            else:
                train_ppl = train_an_epoch(epoch_id, batch_times)
            epoch_time = time.time() - epoch_start_time
            total_time += epoch_time
            print(
                "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; avg_time: %.5f steps/s \n"
                % (epoch_id, epoch_time, train_ppl[0],
                   len(batch_times) / sum(batch_times)))

            # FIXME(zjl): ppl[0] increases as batch_size increases.
            # We should find a better way to calculate ppl by normalizing batch_size.
            if device_count == 1 and config.batch_size <= 20 and epoch_id == 0 and train_ppl[
                    0] > 1000:
                # for bad init, after first epoch, the loss is over 1000
                # no more need to continue
                print(
                    "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch."
                )
                print("Abort this training process and please start again.")
                return

            if epoch_id == config.max_epoch - 1 and args.enable_ce:
                # kpis
                print("ptblm\tlstm_language_model_%s_duration_card%d\t%s" %
                      (args.rnn_model, device_count,
                       total_time / config.max_epoch))
                print("ptblm\tlstm_language_model_%s_loss_card%d\t%s" %
                      (args.rnn_model, device_count, train_ppl[0]))

            # NOTE(zjl): sometimes we have not enough data for eval if batch_size is large, i.e., 2100
            # Just skip to avoid error
            def is_valid_data(data, batch_size, num_steps):
                data_len = len(data)
                batch_len = data_len // batch_size
                epoch_size = (batch_len - 1) // num_steps
                return epoch_size >= 1

            valid_data_valid = is_valid_data(valid_data, config.batch_size,
                                             config.num_steps)
            if valid_data_valid:
                valid_ppl = eval(valid_data)
                print("Valid ppl: %.5f" % valid_ppl[0])
            else:
                print(
                    'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}'.
                    format(
                        len(valid_data), config.batch_size, config.num_steps))

            save_model_dir = os.path.join(args.save_model_dir,
                                          str(epoch_id), "params")
            fluid.save(main_program, save_model_dir)
            print("Saved model to: %s.\n" % save_model_dir)

    with profile_context(args.profile):
        train()

    test_ppl = eval(test_data)
    print("Test ppl:", test_ppl[0])
def train(args):
    """Train the transformer model, locally or distributed.

    Builds the training program and optimizer, then dispatches to one of
    three modes: local single-process training, NCCL2 multi-trainer
    training, or transpiler-based pserver/trainer training, selected by
    ``args`` and the TRAINING_ROLE / PADDLE_* environment variables.
    """
    # priority: ENV > args > config
    is_local = os.getenv("PADDLE_IS_LOCAL", "1")
    if is_local == '0':
        args.local = False
    logging.info(args)

    if args.device == 'CPU':
        TrainTaskConfig.use_gpu = False

    training_role = os.getenv("TRAINING_ROLE", "TRAINER")

    # Parameter servers always run on CPU; trainers use GPU when enabled.
    if training_role == "PSERVER" or (not TrainTaskConfig.use_gpu):
        place = fluid.CPUPlace()
        # the default setting of CPU_NUM in paddle framework is 1
        dev_count = int(os.environ.get('CPU_NUM', 1))
    else:
        check_cuda(TrainTaskConfig.use_gpu)
        gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0))
        place = fluid.CUDAPlace(gpu_id)
        dev_count = get_device_num()

    exe = fluid.Executor(place)

    train_prog = fluid.Program()
    startup_prog = fluid.Program()

    if args.enable_ce:
        # Fixed seeds for continuous-evaluation reproducibility.
        train_prog.random_seed = 1000
        startup_prog.random_seed = 1000

    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            sum_cost, avg_cost, predict, token_num, pyreader = transformer(
                ModelHyperParams.src_vocab_size,
                ModelHyperParams.trg_vocab_size,
                ModelHyperParams.max_length + 1,
                ModelHyperParams.n_layer,
                ModelHyperParams.n_head,
                ModelHyperParams.d_key,
                ModelHyperParams.d_value,
                ModelHyperParams.d_model,
                ModelHyperParams.d_inner_hid,
                ModelHyperParams.prepostprocess_dropout,
                ModelHyperParams.attention_dropout,
                ModelHyperParams.relu_dropout,
                ModelHyperParams.preprocess_cmd,
                ModelHyperParams.postprocess_cmd,
                ModelHyperParams.weight_sharing,
                TrainTaskConfig.label_smooth_eps,
                ModelHyperParams.bos_idx,
                use_py_reader=args.use_py_reader,
                is_test=False)

            optimizer = None
            if args.sync:
                # Noam schedule (warmup then inverse-sqrt decay) with Adam.
                lr_decay = fluid.layers.learning_rate_scheduler.noam_decay(
                    ModelHyperParams.d_model, TrainTaskConfig.warmup_steps)
                logging.info("before adam")

                with fluid.default_main_program()._lr_schedule_guard():
                    learning_rate = lr_decay * TrainTaskConfig.learning_rate

                optimizer = fluid.optimizer.Adam(
                    learning_rate=learning_rate,
                    beta1=TrainTaskConfig.beta1,
                    beta2=TrainTaskConfig.beta2,
                    epsilon=TrainTaskConfig.eps)
            else:
                # Async updates use plain SGD with a fixed learning rate.
                optimizer = fluid.optimizer.SGD(0.003)
            optimizer.minimize(avg_cost)

    if args.local:
        logging.info("local start_up:")
        train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                   avg_cost, token_num, predict, pyreader)
    else:
        if args.update_method == "nccl2":
            trainer_id = int(os.getenv("PADDLE_TRAINER_ID", "0"))
            port = os.getenv("PADDLE_PORT")
            worker_ips = os.getenv("PADDLE_TRAINERS")
            worker_endpoints = []
            for ip in worker_ips.split(","):
                worker_endpoints.append(':'.join([ip, port]))
            trainers_num = len(worker_endpoints)
            current_endpoint = os.getenv("POD_IP") + ":" + port
            if trainer_id == 0:
                # Give the other trainers time to come up before rank 0
                # starts NCCL initialization.
                logging.info("train_id == 0, sleep 60s")
                time.sleep(60)
            logging.info("trainers_num:{}".format(trainers_num))
            logging.info("worker_endpoints:{}".format(worker_endpoints))
            logging.info("current_endpoint:{}".format(current_endpoint))
            append_nccl2_prepare(startup_prog, trainer_id, worker_endpoints,
                                 current_endpoint)
            train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                       avg_cost, token_num, predict, pyreader, trainers_num,
                       trainer_id)
            return

        # Transpiler-based pserver/trainer mode.
        port = os.getenv("PADDLE_PORT", "6174")
        pserver_ips = os.getenv("PADDLE_PSERVERS")  # ip,ip...
        eplist = []
        for ip in pserver_ips.split(","):
            eplist.append(':'.join([ip, port]))
        pserver_endpoints = ",".join(eplist)  # ip:port,ip:port...
        trainers = int(os.getenv("PADDLE_TRAINERS_NUM", "0"))
        current_endpoint = os.getenv("POD_IP") + ":" + port
        trainer_id = int(os.getenv("PADDLE_TRAINER_ID"))

        logging.info("pserver_endpoints:{}".format(pserver_endpoints))
        logging.info("current_endpoint:{}".format(current_endpoint))
        logging.info("trainer_id:{}".format(trainer_id))
        logging.info("pserver_ips:{}".format(pserver_ips))
        logging.info("port:{}".format(port))

        t = fluid.DistributeTranspiler()
        t.transpile(
            trainer_id,
            pservers=pserver_endpoints,
            trainers=trainers,
            program=train_prog,
            startup_program=startup_prog)

        if training_role == "PSERVER":
            logging.info("distributed: pserver started")
            current_endpoint = os.getenv("POD_IP") + ":" + os.getenv(
                "PADDLE_PORT")
            if not current_endpoint:
                logging.critical("need env SERVER_ENDPOINT")
                exit(1)
            pserver_prog = t.get_pserver_program(current_endpoint)
            pserver_startup = t.get_startup_program(current_endpoint,
                                                    pserver_prog)

            exe.run(pserver_startup)
            exe.run(pserver_prog)
        elif training_role == "TRAINER":
            logging.info("distributed: trainer started")
            trainer_prog = t.get_trainer_program()
            train_loop(exe, train_prog, startup_prog, dev_count, sum_cost,
                       avg_cost, token_num, predict, pyreader)
        else:
            # Fix: message previously named the wrong variable
            # ("TRAINER_ROLE") and had a typo ("os" for "or"); the code
            # reads TRAINING_ROLE above.
            logging.critical(
                "environment var TRAINING_ROLE should be TRAINER or PSERVER")
            exit(1)