def test_single_gpu(self):
    paddle.enable_static()
    fleet.init(is_collective=True)
    sharding_program = paddle.static.Program()
    sharding_startup_program = paddle.static.Program()
    strategy = fleet.DistributedStrategy()
    strategy.without_graph_optimization = True
    with fluid.program_guard(sharding_program, sharding_startup_program):
        with fluid.unique_name.guard():
            input_x = paddle.static.data(name="x", shape=[None, 32], dtype='float32')
            input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
            cost = self.mlp(input_x=input_x, input_y=input_y)
            output_name = cost.name
            optimizer = fleet.distributed_optimizer(fluid.optimizer.Adam(), strategy)
            optimizer.minimize(cost)

    trainer_id = fleet.worker_index()
    exe = paddle.static.Executor(paddle.CUDAPlace(trainer_id))
    rank = fleet.worker_index()
    exe.run(sharding_startup_program)
    exe.run(program=sharding_program, feed=self.gen_data())
def run_worker(self):
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and (not os.path.exists(save_model_path)):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    gpus_env = os.getenv("FLAGS_selected_gpus")
    self.PSGPU = paddle.fluid.core.PSGPU()
    gpuslot = [int(i) for i in range(1, self.model.sparse_inputs_slots)]
    print("gpuslot: {}".format(gpuslot))
    self.PSGPU.set_slot_vector(gpuslot)
    self.PSGPU.init_gpu_ps([int(s) for s in gpus_env.split(",")])
    opt_info = paddle.fluid.default_main_program()._fleet_opt
    opt_info['stat_var_names'] = []

    for epoch in range(epochs):
        epoch_start_time = time.time()

        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif sync_mode == "gpubox":
            self.dataset_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if fleet.is_first_worker() and save_model_path and is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
def run_online_worker(self):
    logger.info("Run Online Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and (not os.path.exists(save_model_path)):
        os.makedirs(save_model_path)

    days = os.popen("echo -n " + self.config.get("runner.days")).read().split(" ")
    pass_per_day = int(self.config.get("runner.pass_per_day"))

    for day_index in range(len(days)):
        day = days[day_index]
        for pass_index in range(1, pass_per_day + 1):
            logger.info("Day: {} Pass: {} Begin.".format(day, pass_index))

            prepare_data_start_time = time.time()
            dataset = self.wait_and_prepare_dataset(day, pass_index)
            prepare_data_end_time = time.time()
            logger.info("Prepare Dataset Done, using time {} second.".format(
                prepare_data_end_time - prepare_data_start_time))

            train_start_time = time.time()
            self.dataset_train_loop(dataset, day, pass_index)
            train_end_time = time.time()
            logger.info("Train Dataset Done, using time {} second.".format(
                train_end_time - train_start_time))

            model_dir = "{}/{}/{}".format(save_model_path, day, pass_index)
            if fleet.is_first_worker() and save_model_path and is_distributed_env():
                fleet.save_inference_model(
                    self.exe, model_dir,
                    [feed.name for feed in self.input_data],
                    self.inference_target_var,
                    mode=2)
        if fleet.is_first_worker() and save_model_path and is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var,
                mode=0)
def run_worker(self):
    logger.info("Run Worker Begin")
    use_cuda = int(config.get("runner.use_gpu"))
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    self.exe = paddle.static.Executor(place)

    with open("./{}_worker_main_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_main_program()))
    with open("./{}_worker_startup_program.prototxt".format(
            fleet.worker_index()), 'w+') as f:
        f.write(str(paddle.static.default_startup_program()))

    self.exe.run(paddle.static.default_startup_program())
    fleet.init_worker()

    save_model_path = self.config.get("runner.model_save_path")
    if save_model_path and not os.path.exists(save_model_path):
        os.makedirs(save_model_path)

    reader_type = self.config.get("runner.reader_type", None)
    epochs = int(self.config.get("runner.epochs"))
    sync_mode = self.config.get("runner.sync_mode")

    for epoch in range(epochs):
        epoch_start_time = time.time()

        if sync_mode == "heter":
            self.heter_train_loop(epoch)
        elif reader_type == "QueueDataset":
            self.dataset_train_loop(epoch)
        elif reader_type == "DataLoader":
            self.dataloader_train_loop(epoch)
        elif reader_type is None or reader_type == "RecDataset":
            self.recdataset_train_loop(epoch)

        epoch_time = time.time() - epoch_start_time
        epoch_speed = self.example_nums / epoch_time
        logger.info(
            "Epoch: {}, using time {} second, ips {} {}/sec.".format(
                epoch, epoch_time, epoch_speed, self.count_method))
        self.train_result_dict["speed"].append(epoch_speed)

        model_dir = "{}/{}".format(save_model_path, epoch)
        if fleet.is_first_worker() and save_model_path and is_distributed_env():
            fleet.save_inference_model(
                self.exe, model_dir,
                [feed.name for feed in self.input_data],
                self.inference_target_var)
def test_single_run_ps_minimize(self):
    paddle.enable_static()
    input_x = paddle.static.data(name="x", shape=[-1, 32], dtype='float32')
    input_y = paddle.static.data(name="y", shape=[-1, 1], dtype='int64')

    fc_1 = fluid.layers.fc(input=input_x, size=64, act='tanh')
    prediction = fluid.layers.fc(input=fc_1, size=2, act='softmax')
    cost = fluid.layers.cross_entropy(input=prediction, label=input_y)
    avg_cost = paddle.mean(x=cost)

    fleet.init()
    strategy = paddle.distributed.fleet.DistributedStrategy()
    optimizer = fluid.optimizer.SGD(learning_rate=0.01)
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(avg_cost)

    if fleet.is_server():
        fleet.init_server()
        fleet.run_server()
    elif fleet.is_worker():
        place = fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(paddle.static.default_startup_program())
        step = 10
        for i in range(step):
            cost_val = exe.run(program=fluid.default_main_program(),
                               feed=self.gen_data(),
                               fetch_list=[avg_cost.name])
            print("worker_index: %d, step%d cost = %f" %
                  (fleet.worker_index(), i, cost_val[0]))
def minimize(self,
             loss,
             startup_program=None,
             parameter_list=None,
             no_grad_set=None,
             auto_dp=False,
             rank_table_file=None,
             precision_mode="must_keep_origin_dtype"):
    minimized = None
    if self.inner_opt:
        minimized = self.inner_opt.minimize(
            loss, startup_program=startup_program)

    self.ascend_instance = core.AscendInstance()

    from paddle.distributed import fleet
    if auto_dp and fleet.world_size() > 1:
        from paddle.fluid.transpiler import ascend_transpiler
        t = ascend_transpiler.AscendTranspiler(startup_program,
                                               loss.block.program)
        t.transpile()
        # print(loss.block.program)

    # Config about Graph Engine can be found in https://support.huaweicloud.com/
    config = {
        "ge.exec.deviceId": str(fleet.local_device_ids()),
        "ge.graphRunMode": "1",
        "ge.exec.precision_mode": precision_mode,
    }
    # if multi trainers
    if rank_table_file and fleet.world_size() > 1:
        config["ge.exec.rankTableFile"] = rank_table_file
        config["ge.exec.rankId"] = str(fleet.worker_index())
        config["ge.exec.isUseHcom"] = "1"
        config["ge.exec.deployMode"] = "0"
    print("ge_initialize config:", config)
    core.ge_initialize(config)

    # Init Session
    self.ascend_instance.init_global_resources()

    main_block = loss.block
    self.parser = AscendIRParser(
        auto_dp=auto_dp, world_rank_size=fleet.world_size())

    input_varlist = self._get_input_varlist(main_block.program)

    startup_graph, main_graph = self.parser.parse_program(
        startup_program, main_block.program, input_varlist, self.fetch_list)

    for cfg in self.parser.groups_to_create:
        print("create group (%s), nranks: %d, rank_ids: %s" %
              (cfg.name, cfg.nranks, cfg.rank_ids))
        hccl.create_group(cfg.name, cfg.nranks, cfg.rank_ids)

    self.ascend_instance.add_ascend_subgraph(0, startup_graph)
    self.ascend_instance.add_ascend_subgraph(1, main_graph)

    return minimized
def get_file_list(data_path, config):
    assert os.path.exists(data_path)
    file_list = [data_path + "/%s" % x for x in os.listdir(data_path)]
    if config.get("runner.split_file_list"):
        logger.info("Split file list for worker {}".format(
            fleet.worker_index()))
        file_list = fleet.util.get_file_shard(file_list)
    logger.info("File list: {}".format(file_list))
    return file_list
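# --- Added example (not from the original sources) ---
# A minimal sketch of how `fleet.util.get_file_shard` (used by get_file_list
# above) splits a file list per worker. The role maker setup, endpoints, and
# file names below are illustrative assumptions, not taken from the snippets.
import paddle.distributed.fleet as fleet
import paddle.distributed.fleet.base.role_maker as role_maker

role = role_maker.UserDefinedRoleMaker(
    is_collective=False,
    init_gloo=False,
    current_id=0,                 # this process acts as worker 0 of 2
    role=role_maker.Role.WORKER,
    worker_endpoints=["127.0.0.1:6003", "127.0.0.1:6004"],
    server_endpoints=["127.0.0.1:6001", "127.0.0.1:6002"])
fleet.init(role)

# With two workers, worker 0 receives roughly the first half of the list.
print(fleet.util.get_file_shard(["file1", "file2", "file3"]))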
def init_fleet_with_gloo(self, use_gloo=False):
    if use_gloo:
        os.environ["PADDLE_WITH_GLOO"] = "1"
        fleet.init(self.role_maker)
    else:
        fleet.init()

    if fleet.is_server():
        print("server: {} started".format(fleet.server_index()))
    else:
        print("worker: {} started".format(fleet.worker_index()))
def get_model(self, batch_size=2, use_dgc=False, dist_strategy=None):
    # Input data
    seq_len = 2
    data_in = fluid.data(name='data_in',
                         shape=[batch_size, seq_len, hidden],
                         dtype=DTYPE)

    if dist_strategy:
        data_loader = fluid.io.DataLoader.from_generator(
            feed_list=[data_in],
            capacity=64,
            use_double_buffer=False,
            iterable=False)

    if dist_strategy:
        fleet.init(is_collective=True)
        strategy = fleet.DistributedStrategy()
        strategy.tensor_parallel = True
        strategy.tensor_parallel_configs = {'tensor_parallel_degree': 2}

    rank = fleet.worker_index() if dist_strategy else None
    avg_cost = create_model(data_in, rank)
    opt = fluid.optimizer.SGD(0.1)

    if dist_strategy:
        dist_opt = fleet.distributed_optimizer(optimizer=opt,
                                               strategy=strategy)
        dist_opt.minimize(avg_cost)
    else:
        opt.minimize(avg_cost)

    def gen_data():
        np.random.seed(2021)
        while True:
            data = [np.random.random([seq_len, hidden]).astype(DTYPE)]
            yield data

    train_reader = paddle.batch(gen_data, batch_size=batch_size)

    if dist_strategy:
        return None, avg_cost, train_reader, None, None, None, data_loader
    else:
        return None, avg_cost, train_reader, None, None, None
def get_samples_mapping(indexed_dataset, data_prefix, num_epochs,
                        max_num_samples, max_seq_length, short_seq_prob,
                        seed, name, binary_head, share_folder):
    """Get a list that maps a sample index to a starting sentence index,
    end sentence index, and length."""

    if not num_epochs:
        if not max_num_samples:
            raise ValueError("Need to specify either max_num_samples "
                             "or num_epochs")
        num_epochs = np.iinfo(np.int32).max - 1
    if not max_num_samples:
        max_num_samples = np.iinfo(np.int64).max - 1

    # Filename of the index mapping
    indexmap_filename = data_prefix
    indexmap_filename += '_{}_indexmap'.format(name)
    if num_epochs != (np.iinfo(np.int32).max - 1):
        indexmap_filename += '_{}ep'.format(num_epochs)
    if max_num_samples != (np.iinfo(np.int64).max - 1):
        indexmap_filename += '_{}mns'.format(max_num_samples)
    indexmap_filename += '_{}msl'.format(max_seq_length)
    indexmap_filename += '_{:0.2f}ssp'.format(short_seq_prob)
    indexmap_filename += '_{}s'.format(seed)
    indexmap_filename += '.npy'

    local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank())
    if share_folder:
        local_rank = fleet.worker_index()

    # Build the indexed mapping if it does not exist.
    if local_rank == 0 and not os.path.isfile(indexmap_filename):
        print(' > WARNING: could not find index map file {}, building '
              'the indices on rank 0 ...'.format(indexmap_filename))

        # Make sure the types match the helpers input types.
        assert indexed_dataset.doc_idx.dtype == np.int64
        print(indexed_dataset.sizes.dtype)
        assert indexed_dataset.sizes.dtype == np.int32

        # Build samples mapping
        verbose = local_rank == 0
        start_time = time.time()
        print_rank_0(' > building samples index mapping for {} ...'.format(name))

        # First compile and then import.
        if local_rank == 0:
            compile_helper()
        import data_tools.helpers as helpers
        samples_mapping = helpers.build_mapping(indexed_dataset.doc_idx,
                                                indexed_dataset.sizes,
                                                num_epochs, max_num_samples,
                                                max_seq_length, short_seq_prob,
                                                seed, verbose,
                                                2 if binary_head else 1)
        print_rank_0(' > done building samples index mapping')
        np.save(indexmap_filename, samples_mapping, allow_pickle=True)
        print_rank_0(' > saved the index mapping in {}'.format(indexmap_filename))
        # Make sure all the ranks have built the mapping
        print_rank_0(' > elapsed time to build and save samples mapping '
                     '(seconds): {:4f}'.format(time.time() - start_time))
    else:
        while True:
            if not os.path.isfile(indexmap_filename):
                time.sleep(3)
            else:
                try:
                    np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
                    break
                except Exception:
                    print("%s file is still writing or damaged, please "
                          "wait a moment." % indexmap_filename)
                    time.sleep(3)

    # This should be a barrier, but the nccl barrier assumes
    # device_index=rank, which is not the case for the model-parallel case.
    if paddle.distributed.get_world_size() > 1:
        if paddle.in_dynamic_mode():
            paddle.distributed.barrier()

    # Load indexed dataset.
    print_rank_0(' > loading indexed mapping from {}'.format(indexmap_filename))
    start_time = time.time()
    samples_mapping = np.load(indexmap_filename, allow_pickle=True, mmap_mode='r')
    print_rank_0(' loaded indexed file in {:3.3f} seconds'.format(
        time.time() - start_time))
    print_rank_0(' total number of samples: {}'.format(
        samples_mapping.shape[0]))

    return samples_mapping
def do_train(args): # Initialize the paddle and paddle fleet execute environment paddle.enable_static() fleet.init(is_collective=True) # Create the random seed for the worker random.seed(args.seed) np.random.seed(args.seed) paddle.seed(args.seed) get_rng_state_tracker().add('global_seed', args.seed) get_rng_state_tracker().add('local_seed', args.seed + fleet.worker_index() + 2021) assert args.device in [ "cpu", "gpu", "xpu" ], "Invalid device! Available device should be cpu, gpu, or xpu." place = paddle.set_device(args.device) worker_num = fleet.worker_num() worker_index = fleet.worker_index() topo = Topology(device_rank=worker_index, world_size=worker_num, dp_degree=args.dp_degree, pp_degree=args.pp_degree, sharding_degree=args.sharding_degree, mp_degree=args.mp_degree) logger.info("The topo of hybrid parallelism:\n{}".format(topo)) dist_strategy = dist_optimizer(args, topo) # Create log write, train results show on last card of pipeline. if topo.is_last: log_writer_path = os.path.join( args.output_dir, "train_log", "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format( args.model_name_or_path, args.global_batch_size, args.use_amp, args.use_recompute, worker_index).lower()) if os.path.exists(log_writer_path): import shutil shutil.rmtree(log_writer_path) log_writer = LogWriter(log_writer_path) # Define the input data in the static mode model_class, tokenizer_class = MODEL_CLASSES[args.model_type] pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) data_file = get_train_data_file(args) main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() with paddle.static.program_guard(main_program, startup_program): with paddle.utils.unique_name.guard(): with paddle.static.device_guard('gpu:0'): data_holders = create_data_holder(args) [tokens, loss_mask, attention_mask, position_ids, labels] = data_holders tokenizer = tokenizer_class.from_pretrained( args.model_name_or_path) eos_id = tokenizer.eos_token_id train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset( args, data_file, data_world_size=topo.data_info.size, data_world_rank=topo.data_info.rank, eos_id=eos_id, max_seq_len=args.max_seq_len, places=paddle.static.cuda_places(), data_holders=data_holders, pipeline_mode=False, ) if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] model_config[ "hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model_config["topo"] = topo model = guard(f'gpu:{args.pp_degree -1}')( GPTForPretraining)( guard(f'gpu:0')(GPTModel)(**model_config)) else: model, _ = GPTForPretraining.from_pretrained( args.model_name_or_path, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args. 
attention_probs_dropout_prob, topo=topo) # Create the model for the gpt pretrain preds = model(tokens, position_ids, attention_mask) criterion = guard(f'gpu:{args.pp_degree -1}')( GPTPretrainingCriterion)(topo) loss = criterion(preds, labels, loss_mask) # Create the learning_rate sheduler and optimizer if args.decay_steps is None: args.decay_steps = args.max_steps warmup_step = args.warmup_rate * args.decay_steps # TODO @ZHUI Use paddle network to support lr scheduler lr_scheduler = lr.CosineAnnealingWithWarmupDecay( max_lr=args.max_lr, min_lr=args.min_lr, warmup_step=warmup_step, decay_step=args.decay_steps) clip = None if args.grad_clip > 0: clip = paddle.fluid.clip.GradientClipByGlobalNorm( clip_norm=args.grad_clip) decay_param = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, grad_clip=clip, weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_param) # alias optimizer.apply_optimize = optimizer._apply_optimize if args.use_recompute: dist_strategy.recompute = True dist_strategy.recompute_configs = { "checkpoints": model.gpt.checkpoints } # Use the fleet api to compile the distributed optimizer optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) optimizer.minimize(loss) logger.info(f'final strategy: {fleet._final_strategy()}') logger.info("The training meta optimizer is/are %s" % fleet._get_applied_meta_list()) program_desc_dir = os.path.join(args.output_dir, "program_desc") if not os.path.isdir(program_desc_dir): os.mkdir(program_desc_dir) with open(program_desc_dir + "/main_program.txt.%d" % worker_index, 'w') as f: f.write(str(main_program)) with open(program_desc_dir + "/startup_program.txt.%d" % worker_index, 'w') as f: f.write(str(startup_program)) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) test_program = main_program.clone(for_test=True) if args.model_name_or_path not in pretrained_models_list: logger.info("Try to load checkpoint from %s " % args.model_name_or_path) dygrah_path = os.path.join(args.model_name_or_path, "model_state.pdparams") static_path = os.path.join(args.model_name_or_path, "static_vars") flag_loaded = False if os.path.exists(static_path): if args.mp_degree > 1: logger.warning("MP should init with dygraph params") else: logger.info("Loading parameters from %s" % static_path) paddle.static.load(main_program, static_path, exe) flag_loaded = True if not flag_loaded and os.path.exists(dygrah_path): if args.sharding_degree > 1: logger.warning("Sharding should init with static vars") else: logger.info("Loading parameters from %s" % dygrah_path) init_static_with_params( model, paddle.load(dygrah_path, return_numpy=True), topo, main_program) flag_loaded = True if not flag_loaded: logger.error("No checkpoint load.") global_step = 0 tic_train = time.time() epoch = 0 learning_rate = main_program.global_block().vars["learning_rate_0"] while True: fetchs = [] if topo.is_last: fetchs = [loss, learning_rate] # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader # many times. and start a new random dataloader. 
valid_data_loader = valid_data_loader() test_data_loader = test_data_loader() for step, batch in enumerate(train_data_loader()): global_step += 1 ret = exe.run(main_program, feed=batch, fetch_list=fetchs, use_program_cache=True) # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_freq == 0: if topo.is_last: loss_return, lr_return = ret speed = args.logging_freq / (time.time() - tic_train) logger.info( "global step %d, epoch: %d, batch: %d, loss: %.9f, speed: %.2f steps/s, ips: %.0f tokens/s, learning rate: %.5e" % (global_step, epoch, step, loss_return[0], speed, speed * args.global_batch_size * args.max_seq_len, lr_return[0])) log_writer.add_scalar("loss", loss_return[0], global_step) log_writer.add_scalar("learning_rate", lr_return[0], global_step) tic_train = time.time() if args.check_accuracy: if global_step >= args.max_steps: return else: continue if global_step % args.eval_freq == 0: # TODO, check the input data of validation eval_fetch = [] if topo.is_last: eval_fetch = [loss] run_evaluate(valid_data_loader, exe, test_program, args.eval_iters, log_writer, global_step, args, epoch, topo.is_last, eval_fetch, "valid") tic_train = time.time() if global_step % args.save_steps == 0 or global_step >= args.max_steps: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) logger.debug("saving models to {}".format(output_dir)) save_persistables(exe, os.path.join(output_dir, "static_vars"), main_program) if global_step == args.save_steps: model.init_config["init_args"][0].init_config.pop( "topo", None) model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) tic_train = time.time() if global_step >= args.max_steps: eval_fetch = [] if topo.is_last: eval_fetch = [loss] run_evaluate(test_data_loader, exe, test_program, args.test_iters, log_writer, global_step, args, epoch, topo.is_last, eval_fetch, "test") del train_data_loader return epoch += 1
def __init__(self, args, place):
    self.args = args
    self.place = place

    self.init_checkpoint = args.init_checkpoint
    self.init_pretraining_params = args.init_pretraining_params

    # optimizer related
    self.optimizer = args.optimizer
    self.learning_rate = args.learning_rate
    self.beta1 = args.beta1
    self.beta2 = args.beta2
    self.warmup_steps = args.warmup_steps
    self.lr_scheduler = args.lr_scheduler
    self.max_training_steps = args.max_training_steps
    self.min_learning_rate = args.min_learning_rate
    self.weight_decay = args.weight_decay
    self.max_grad_norm = args.max_grad_norm

    # training related
    self.is_distributed = args.get("is_distributed", False)
    self.use_recompute = args.use_recompute
    self.checkpointing_every_n_layers = args.checkpointing_every_n_layers
    self.use_amp = args.use_amp
    self.amp_loss_scaling = args.amp_loss_scaling
    self.use_sharding = args.use_sharding
    self.dp_degree = args.dp_degree
    self.sharding_degree = args.sharding_degree
    self.mp_degree = args.mp_degree
    self.pp_degree = args.pp_degree

    # setup topology
    if self.is_distributed:
        fleet.init(is_collective=True)
        if self.use_sharding:
            self.topo = Topology(device_rank=fleet.worker_index(),
                                 world_size=fleet.worker_num(),
                                 dp_degree=self.dp_degree,
                                 pp_degree=self.pp_degree,
                                 sharding_degree=self.sharding_degree,
                                 mp_degree=self.mp_degree)
        else:
            self.topo = Topology(device_rank=fleet.worker_index(),
                                 world_size=fleet.worker_num(),
                                 dp_degree=fleet.worker_num())
    else:
        self.topo = Topology(device_rank=0, world_size=1)
        if self.use_recompute:
            print("[WARN] Cannot support recomputation in non-distributed mode.")
        if self.use_amp:
            print("[WARN] Cannot support AMP in non-distributed mode.")

    self.exe = fluid.Executor(place)

    # model mode
    self.run_infer = args.get("run_infer", False)
    self.batch_size = args.get("batch_size", 1)

    self._build_programs()
    return
def do_train(args): # Initialize the paddle and paddle fleet execute enviroment paddle.enable_static() place = paddle.set_device(args.select_device) fleet.init(is_collective=True) # paddle.distributed.init_parallel_env() worker_num = fleet.worker_num() worker_index = fleet.worker_index() # Create the random seed for the worker set_seed(args.seed) # worker_init = WorkerInitObj(args.seed + worker_index) worker_init = WorkerInitObj(args.seed) tracker = get_rng_state_tracker() tracker.add('global_seed', args.seed) tracker.add('local_seed', args.seed + worker_index + 2021) # Define the input data in the static mode main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() data_holders = create_data_holder(args) [ input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels, masked_lm_scale ] = data_holders # Define the model structure in static mode args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) config = model_class.pretrained_init_configuration[args.model_name_or_path] if config["vocab_size"] % 8 != 0: config["vocab_size"] += 8 - (config["vocab_size"] % 8) config['num_partitions'] = args.num_partitions model = BertForPretraining(BertModel(**config), args.num_partitions) criterion = BertPretrainingCriterion(model.bert.config["vocab_size"]) prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_positions=masked_lm_positions) loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) # Define the dynamic learing_reate scheduler and optimizer lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, lambda current_step, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps if args.max_steps > 0 else (len(train_data_loader) * args.num_train_epochs): float( current_step) / float(max(1, num_warmup_steps)) if current_step < num_warmup_steps else max( 0.0, float(num_training_steps - current_step) / float( max(1, num_training_steps - num_warmup_steps)))) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) # if worker_num == 1 and args.use_amp: # amp_list = paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists( # custom_white_list=['softmax', 'layer_norm', 'gelu']) # optimizer = paddle.fluid.contrib.mixed_precision.decorate( # optimizer, # amp_list, # init_loss_scaling=args.scale_loss, # use_dynamic_loss_scaling=True) if fleet.worker_num() > 1: # Use the fleet api to compile the distributed optimizer optimizer = dist_optimizer(args, optimizer) optimizer.minimize(loss) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) # state_dict = model.state_dict() # Use the state dict to update the parameter # reset_state_dict = reset_program_state_dict(model, state_dict) # paddle.static.set_program_state(main_program, reset_state_dict) # if worker_num == 1: # # Construct the compiled program # main_program = build_compiled_program(main_program, loss) main_program._graph = None if fleet.worker_index() == 0: with open('startup_%d' % fleet.worker_num(), 
'w') as f: f.writelines(str(startup_program)) with open('main_%d' % fleet.worker_num(), 'w') as f: f.writelines(str(main_program)) pool = ThreadPoolExecutor(1) global_step = 0 tic_train = time.time() epoch = 0 while True: files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in f ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 # Select one file for each worker and create the DataLoader for the file data_file = select_dataset_file_for_each_worker( files, f_start_id, 1, 0) #files, f_start_id, worker_num, worker_index) train_data_loader, _ = create_pretraining_dataset( data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) for f_id in range(f_start_id + 1, len(files)): data_file = select_dataset_file_for_each_worker(files, f_id, 1, 0) # files, f_id, worker_num, worker_index) dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) for step, batch in enumerate(train_data_loader): global_step += 1 if step == 10 and worker_index == 0: profiler.start_profiler("All") if step == 20 and worker_index == 0: profiler.stop_profiler("total", "/tmp/profile") loss_return = exe.run(main_program, feed=batch, fetch_list=[loss]) # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_steps == 0: time_cost = time.time() - tic_train print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, ips: %.2f sequences/s" % (global_step, epoch, step, loss_return[0], args.logging_steps / time_cost, args.logging_steps * args.batch_size / time_cost)) tic_train = time.time() if global_step % args.save_steps == 0: if worker_index == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # TODO(fangzeyang): Udpate the save_params to paddle.static paddle.fluid.io.save_params(exe, output_dir) tokenizer.save_pretrained(output_dir) if global_step >= args.max_steps: del train_data_loader return del train_data_loader train_data_loader, data_file = dataset_future.result(timeout=None) epoch += 1
def do_train(args): # Initialize the paddle and paddle fleet execute environment paddle.enable_static() fleet.init(is_collective=True) # Create the random seed for the worker random.seed(args.seed) np.random.seed(args.seed) paddle.seed(args.seed) get_rng_state_tracker().add('global_seed', args.seed) get_rng_state_tracker().add('local_seed', args.seed + fleet.worker_index() + 2021) assert args.device in [ "cpu", "gpu", "xpu" ], "Invalid device! Available device should be cpu, gpu, or xpu." place = paddle.set_device(args.device) worker_num = fleet.worker_num() worker_index = fleet.worker_index() assert args.dp_degree * args.sharding_degree * args.mp_degree * args.pp_degree == worker_num, \ "The product of degree num should be equal to worker_num." topo = Topology(device_rank=worker_index, world_size=worker_num, dp_degree=args.dp_degree, pp_degree=args.pp_degree, sharding_degree=args.sharding_degree, mp_degree=args.mp_degree) logger.info("The topo of hybrid parallelism:\n{}".format(topo)) dist_strategy = dist_optimizer(args, topo) # Create log write, train results show on last card of pipeline. if topo.is_last: log_writer_path = os.path.join( args.output_dir, "train_log", "{}_globalbsz_{}_amp_{}_recompute_{}_card_{}".format( args.model_name_or_path, args.global_batch_size, args.use_amp, args.use_recompute, worker_index).lower()) # if os.path.exists(log_writer_path): # shutil.rmtree(log_writer_path) log_writer = LogWriter(log_writer_path) # Define the input data in the static mode base_class, model_class, criterion_class, tokenizer_class = MODEL_CLASSES[ args.model_type] pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) # load config in checkpoint global_step = 0 consumed_samples = 0 checkpoint_dir = os.path.join(args.output_dir, "model_last") if os.path.exists(checkpoint_dir): if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")): with open(os.path.join(checkpoint_dir, "./config.yml"), "r") as f: step_config = yaml.load(f, Loader=yaml.FullLoader) assert step_config[ "global_batch_size"] == args.global_batch_size, "Please ensure checkpoint global batch size is the same. Folder: {}".format( checkpoint_dir) consumed_samples = step_config["consumed_samples"] global_step = step_config["global_step"] data_file = get_train_data_file(args) main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() with paddle.static.program_guard(main_program, startup_program): data_holders = create_data_holder(args) # 0. input_ids, # 1. segment_ids, # 2. input_mask, # 3. masked_lm_positions, # 4. masked_lm_labels, # 5. 
next_sentence_labels [ input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels ] = data_holders tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) train_data_loader, valid_data_loader, test_data_loader = create_pretrained_dataset( args, data_file, tokenizer, data_world_size=topo.data_info.size, data_world_rank=topo.data_info.rank, max_seq_len=args.max_seq_len, places=paddle.static.cuda_places(), data_holders=data_holders, current_step=global_step) fleet.init(is_collective=True) if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] if model_config["vocab_size"] % 8 != 0: model_config["vocab_size"] += 8 - (model_config["vocab_size"] % 8) model_config["hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model = model_class(base_class(**model_config)) else: model, _ = model_class.from_pretrained( args.model_name_or_path, hidden_dropout_prob=args.hidden_dropout_prob, attention_probs_dropout_prob=args.attention_probs_dropout_prob, ) # Create the model for the gpt pretrain prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, position_ids=None, attention_mask=input_mask, masked_positions=masked_lm_positions) criterion = criterion_class(with_nsp_loss=args.binary_head) if args.binary_head: lm_loss, sop_loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels) loss = lm_loss + sop_loss else: loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels) # Create the learning_rate sheduler and optimizer if args.decay_steps is None: args.decay_steps = args.max_steps # lr_scheduler = CosineAnnealingWithWarmupDecay( # max_lr=args.max_lr, # min_lr=args.min_lr, # warmup_step=args.warmup_rate * args.max_steps, # decay_step=args.decay_steps, last_epoch=global_step) lr_scheduler = LinearDecayWithWarmup(args.max_lr, args.max_steps, args.warmup_rate, last_epoch=global_step) clip = None if args.grad_clip > 0: clip = paddle.fluid.clip.GradientClipByGlobalNorm( clip_norm=args.grad_clip) decay_param = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] logger.info("Using paddle.optimizer.AdamW.") optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, beta1=args.adam_beta1, beta2=args.adam_beta2, epsilon=args.adam_epsilon, grad_clip=clip, weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_param) # alias optimizer.apply_optimize = optimizer._apply_optimize # if args.use_recompute: # dist_strategy.recompute = True # dist_strategy.recompute_configs = { # "checkpoints": model.bert.checkpoints # } # Use the fleet api to compile the distributed optimizer optimizer = fleet.distributed_optimizer(optimizer, strategy=dist_strategy) optimizer.minimize(loss) logger.info(f'final strategy: {fleet._final_strategy()}') logger.info("The training meta optimizer is/are %s" % fleet._get_applied_meta_list()) program_desc_dir = os.path.join(args.output_dir, "program_desc") if not os.path.isdir(program_desc_dir): os.mkdir(program_desc_dir) with open(program_desc_dir + "/main_program.txt.%d" % worker_index, 'w') as f: f.write(str(main_program)) with open(program_desc_dir + "/startup_program.txt.%d" % worker_index, 'w') as f: f.write(str(startup_program)) # Define the Executor for running the static model exe = paddle.static.Executor(place) 
exe.run(startup_program) test_program = main_program.clone(for_test=True) if args.model_name_or_path not in pretrained_models_list: logger.info("Try to load checkpoint from %s " % args.model_name_or_path) dygrah_path = os.path.join(args.model_name_or_path, "model_state.pdparams") static_path = os.path.join(args.model_name_or_path, "static_vars") flag_loaded = False if os.path.exists(static_path): if args.mp_degree > 1: logger.warning("MP should init with dygraph params") else: logger.info("Loading parameters from %s" % static_path) paddle.static.load(main_program, static_path, exe) flag_loaded = True if not flag_loaded and os.path.exists(dygrah_path): if args.sharding_degree > 1: logger.warning("Sharding should init with static vars") else: logger.info("Loading parameters from %s" % dygrah_path) init_static_with_params( model, paddle.load(dygrah_path, return_numpy=True), topo, main_program) flag_loaded = True if not flag_loaded: logger.error("No checkpoint load.") # load checkpoint vars if os.path.exists(checkpoint_dir): if os.path.isfile(os.path.join(checkpoint_dir, "./config.yml")): paddle.static.load(main_program, os.path.join(checkpoint_dir, "static_vars"), exe) fetch_loss_vars = collections.OrderedDict() fetch_other_vars = collections.OrderedDict() fetch_loss_vars["loss"] = loss if args.binary_head: fetch_loss_vars["lm_loss"] = lm_loss fetch_loss_vars["sop_loss"] = sop_loss fetch_other_vars["learning_rate"] = main_program.global_block( ).vars["learning_rate_0"] additional_vars = collections.OrderedDict() if args.use_amp: for key in ["loss_scaling", "num_good_steps", "num_bad_steps"]: additional_vars[key] = main_program.global_block().vars[key + "_0"] tic_train = time.time() while True: fetchs = [] fetchs_keys = [] if topo.is_last: fetchs = list(fetch_loss_vars.values()) + list( fetch_other_vars.values()) + list(additional_vars.values()) fetchs_keys = list(fetch_loss_vars.keys()) + list( fetch_other_vars.keys()) + list(additional_vars.keys()) # Bug fix, if not call valid_data_loader, the enumerate will call valid_data_loader # many times. and start a new random dataloader. 
valid_data_loader = valid_data_loader() test_data_loader = test_data_loader() for step, batch in enumerate(train_data_loader()): ret = exe.run(main_program, feed=batch, fetch_list=fetchs, use_program_cache=True) # Skip for accumulate_steps in global step if (step + 1) % args.accumulate_steps != 0: continue global_step += 1 # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_freq == 0: if topo.is_last: res = collections.defaultdict(float) for k, v in zip(fetchs_keys, ret): res[k] = v[0] speed = args.logging_freq / (time.time() - tic_train) loss_info = "loss: %.6f, lm_loss: %.6f, sop_loss: %.6f" loss_info = ", ".join([ "{}: {:.6f}".format(k, res[k]) for k in fetch_loss_vars.keys() ]) common_loginfo = "global step %d, %s, speed: %.2f steps/s, ips: %.2f seqs/s, learning rate: %.5e" % ( global_step, loss_info, speed, speed * args.global_batch_size, res["learning_rate"]) additional_loginfo = ", ".join([ "{}: {}".format(k, res[k]) for k in additional_vars.keys() ]) if additional_loginfo: common_loginfo += ", " + additional_loginfo logger.info(common_loginfo) for k, v in res.items(): log_writer.add_scalar(k, v, global_step) tic_train = time.time() #if args.check_accuracy: # if global_step >= args.max_steps: # return # else: # continue if global_step % args.eval_freq == 0: # TODO, check the input data of validation eval_fetch = collections.OrderedDict() if topo.is_last: eval_fetch["loss"] = loss if args.binary_head: eval_fetch["lm_loss"] = lm_loss eval_fetch["sop_loss"] = sop_loss run_evaluate(valid_data_loader, exe, test_program, args.eval_iters, log_writer, global_step, args, topo.is_last, eval_fetch, "valid") tic_train = time.time() if global_step % args.save_steps == 0 or global_step >= args.max_steps: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) logger.debug("saving models to {}".format(output_dir)) save_persistables(exe, os.path.join(output_dir, "static_vars"), main_program) if global_step == args.save_steps: model.init_config["init_args"][0].init_config.pop( "topo", None) model.save_pretrained(output_dir) tokenizer.save_pretrained(output_dir) tic_train = time.time() if global_step % args.checkpoint_steps == 0: output_dir = os.path.join(args.output_dir, "model_last") if worker_index == 0: if not os.path.exists(output_dir): os.mkdir(output_dir) output_dir_bak = os.path.join(args.output_dir, "model_last_bak") if os.path.exists(output_dir): if os.path.exists(output_dir_bak): shutil.rmtree(output_dir_bak) shutil.move(output_dir, output_dir_bak) os.mkdir(output_dir) step_config = { "model_name": args.model_name_or_path, "global_step": global_step, "global_batch_size": args.global_batch_size, "consumed_samples": global_step * args.global_batch_size, } with open(os.path.join(output_dir, "config.yml"), "w") as f: yaml.dump(step_config, f, encoding='utf-8', allow_unicode=True) fleet.barrier_worker() logger.debug("saving models to {}".format(output_dir)) if args.sharding_degree <= 1: # Save on the first worker by default. 
if worker_index == 0: paddle.static.save( main_program, os.path.join(output_dir, "static_vars")) else: # Use save_persistables in sharding, but more slower save_persistables(exe, os.path.join(output_dir, "static_vars"), main_program) if global_step >= args.max_steps: eval_fetch = collections.OrderedDict() if topo.is_last: eval_fetch["loss"] = loss if args.binary_head: eval_fetch["lm_loss"] = lm_loss eval_fetch["sop_loss"] = sop_loss run_evaluate(test_data_loader, exe, test_program, args.test_iters, log_writer, global_step, args, topo.is_last, eval_fetch, "test") del train_data_loader return
def main(args): np.random.seed(9001) run_id = args.run_id if not os.path.isdir(run_id): os.system('mkdir -p {}'.format(run_id)) profile = False batch_size = 512 * 50 lr = 1e-4 fleet.init(is_collective=True) # load Bert_large / Bert_base model model = X.applications.BertLarge(lang="en") model.main_prog.random_seed = 9001 model.startup_prog.random_seed = 9001 local_path = "./data" data_loader = model.get_val_dataloader(data_dir='{}'.format(local_path), max_seq_len=512, batch_size=batch_size, in_tokens=True, shuffle=False) place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = 2 exec_strategy.num_iteration_per_drop_scope = 1 build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = True dist_strategy = fleet.DistributedStrategy() dist_strategy.execution_strategy = exec_strategy dist_strategy.build_strategy = build_strategy dist_strategy.nccl_comm_num = 1 dist_strategy.amp = args.use_amp # recompute checkpoints = [ 'elementwise_add_{}.tmp_0'.format(i * 2) for i in range(1, 24) ] dist_strategy.recompute = args.use_recompute if args.use_recompute: dist_strategy.recompute_configs = {"checkpoints": checkpoints} scheduled_lr = X.utils.linear_warmup_decay(lr, warmup_steps=4000, num_train_steps=1000000) optimizer = fluid.optimizer.Adam(learning_rate=scheduled_lr) optimizer = fleet.distributed_optimizer(optimizer, dist_strategy) clip_norm_thres = 1.0 fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm( clip_norm=clip_norm_thres)) ops, param_grads = optimizer.minimize(model.loss) filename = "./" + args.run_id + "/main_program.txt" with open(filename + str(int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f: f.write(str(fluid.default_main_program())) filename = "./" + args.run_id + "/start_program.txt" with open(filename + str(int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f: f.write(str(fluid.default_startup_program())) exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) model.main_prog.random_seed = 9001 model.startup_prog.random_seed = 9001 np.random.seed(9001) fetch_list = [model.loss.name] + list(model.target.values()) + \ [scheduled_lr.name, "loss_scaling_0"] start_time = -1 speeds = [] profile = False costs = [] accs = [] print("============start training============") for i, data in enumerate(data_loader()): # profile if profile and i == 2050: print("begin profiler") profiler.start_profiler("All") elif profile and i == 2065: print("end profiler") filename = "./run_id/profile_" + str(fleet.worker_index()) profiler.stop_profiler("total", filename) print("end profiler break!") print("avg speed = {} step / s".format(np.mean(speeds))) sys.exit("profile finish !") cost_val, next_sent_acc, lm_loss, np_lr, loss_scaling_0 = exe.run( fluid.default_main_program(), feed=data, fetch_list=fetch_list, use_program_cache=True) costs.append(cost_val[0]) accs.append(next_sent_acc[0]) # count speed if (i + 1) % 10 == 0: duration = time.time() - start_time speed = 10 / duration print("step {}, loss {}, acc {}, np_lr {}". \ format(i, np.mean(costs), np.mean(accs), np_lr[0])) start_time = time.time() costs = [] accs = []
def infer_dst(args):
    """Inference main function."""
    if args.is_distributed:
        fleet.init(is_collective=True)
        dev_count = fluid.core.get_cuda_device_count()
        gpu_id = int(os.getenv("FLAGS_selected_gpus"))
        trainers_num = fleet.worker_num()
        trainer_id = fleet.worker_index()
        phase = "distributed_test"
    else:
        dev_count = 1
        gpu_id = 0
        trainers_num = 1
        trainer_id = 0
        phase = "test"
    place = fluid.CUDAPlace(gpu_id)

    task = tasks.create_task(args)
    model = models.create_model(args, place)
    # task.debug()

    schema = get_schema(args.dataset)
    empty_ds_seq = "<ds/> " + " ".join(flatten_ds({}, schema)) + " </ds>"

    # record original order and init status
    output_order = []
    # {"dial_id": {"prev_ds": "", "turns": [{"utts": utts, "turn_idx": turn_idx}], "cur_idx": 0}}
    dial_status = defaultdict(dict)
    with open(args.infer_file, "r") as fin:
        next(fin)
        for line in fin:
            dial_id, turn_idx, utts = line.strip().split("\t")
            output_order.append(f"{dial_id}-{turn_idx}")
            if dial_id not in dial_status:
                dial_status[dial_id]["prev_ds"] = empty_ds_seq
                dial_status[dial_id]["turns"] = []
                dial_status[dial_id]["cur_idx"] = 0
            dial_status[dial_id]["turns"].append({
                "utts": utts,
                "turn_idx": turn_idx
            })
    dial_ids = list(dial_status.keys())

    # batch inference
    outputs = {}
    timer = Timer()
    while len(dial_ids) > 0:
        timer.start()
        cur_dial_ids = dial_ids[:args.dial_batch_size]
        logger.info(f"Sampled dialogue ids: {cur_dial_ids}")

        # 1st: basic generation
        basic_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_idx = dial_status[cur_dial_id]["cur_idx"]
            cur_dial_turn = dial_status[cur_dial_id]["turns"][cur_idx]
            cur_utts = cur_dial_turn["utts"]
            prev_ds = dial_status[cur_dial_id]["prev_ds"]
            src = f"<gen/> {cur_utts} [SEP] {prev_ds} </gen>\x010"
            basic_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        basic_outputs = generate(basic_inputs, model, task)

        # 2nd: amending generation
        amending_inputs = {}
        for cur_dial_id in cur_dial_ids:
            cur_idx = dial_status[cur_dial_id]["cur_idx"]
            cur_dial_turn = dial_status[cur_dial_id]["turns"][cur_idx]
            cur_utts = cur_dial_turn["utts"]
            basic_ds = basic_outputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"]
            src = f"<amend/> {cur_utts} [SEP] {basic_ds} </amend>\x010"
            amending_inputs[f"{cur_dial_id}-{cur_dial_turn['turn_idx']}"] = src
        amending_outputs = generate(amending_inputs, model, task)
        outputs.update(amending_outputs)

        time_cost_infer = timer.pass_time
        logger.info(f"Time cost: {time_cost_infer}")

        # debug info
        for dial_turn_tag in basic_inputs:
            logger.debug(f"[basic input]: {basic_inputs[dial_turn_tag]}")
            logger.debug(f"[basic output]: {basic_outputs[dial_turn_tag]}")
            logger.debug(f"[amending input]: {amending_inputs[dial_turn_tag]}")
            logger.debug(f"[amending output]: {amending_outputs[dial_turn_tag]}")

        # update dial_status
        for dial_turn_tag in amending_outputs:
            dial_id, _ = dial_turn_tag.split("-")
            dial_status[dial_id]["cur_idx"] += 1
            if dial_status[dial_id]["cur_idx"] >= len(dial_status[dial_id]["turns"]):
                dial_ids.remove(dial_id)
            else:
                dial_status[dial_id]["prev_ds"] = outputs[dial_turn_tag]
        timer.reset()

    # reorder and output
    if gpu_id == 0:
        pred_seqs = []
        pred_labels = []
        for dial_turn_tag in output_order:
            pred_seqs.append(outputs[dial_turn_tag])
            pred_label = parse_ds(outputs[dial_turn_tag], schema)
            pred_labels.append(pred_label)

        out_seq_file = os.path.join(args.save_path, "inference_output.txt")
        out_label_file = os.path.join(args.save_path, "inference_labels.json")
        with open(out_seq_file, "w") as fout_seq, open(out_label_file, "w") as fout_label:
            fout_seq.write("\n".join(pred_seqs))
            json.dump(pred_labels, fout_label, indent=2)
        logger.info(f"Save inference sequences to `{out_seq_file}`")
        logger.info(f"Save inference labels to `{out_label_file}`")
input_y = paddle.static.data(name="y", shape=[None, 1], dtype='int64')
cost = mlp(input_x, input_y)
optimizer = paddle.optimizer.SGD(learning_rate=0.01)

role = role_maker.PaddleCloudRoleMaker()
fleet.init(role)

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.a_sync = True
optimizer = fleet.distributed_optimizer(optimizer, strategy)
optimizer.minimize(cost)

if fleet.is_server():
    fleet.init_server()
    fleet.run_server()
elif fleet.is_worker():
    place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())
    step = 1001
    for i in range(step):
        cost_val = exe.run(program=paddle.static.default_main_program(),
                           feed=gen_data(),
                           fetch_list=[cost.name])
        print("worker_index: %d, step%d cost = %f" %
              (fleet.worker_index(), i, cost_val[0]))
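# --- Added example (not from the original sources) ---
# The fragment above calls a `gen_data()` helper that is defined elsewhere.
# A rough sketch of what such a helper could look like, assuming the usual
# x=[None, 32] float32 / y=[None, 1] int64 data layers used in the companion
# snippets; the batch size of 32 is an arbitrary illustrative choice.
import numpy as np

def gen_data(batch_size=32):
    # Build one feed batch keyed by the data-layer names "x" and "y".
    return {
        "x": np.random.random(size=(batch_size, 32)).astype("float32"),
        "y": np.random.randint(2, size=(batch_size, 1)).astype("int64"),
    }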
def do_train(args):
    # Initialize the paddle and paddle fleet execute environment
    paddle.enable_static()
    place = paddle.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0)))
    fleet.init(is_collective=True)

    # Create the random seed for the worker
    set_seed(args.seed)
    worker_init = WorkerInitObj(args.seed + fleet.worker_index())

    # Define the input data in the static mode
    data_holders = create_data_holder(args)
    [
        input_ids, segment_ids, input_mask, masked_lm_positions,
        masked_lm_labels, next_sentence_labels, masked_lm_scale
    ] = data_holders

    # Define the model structure in static mode
    args.model_type = args.model_type.lower()
    model_class, tokenizer_class = MODEL_CLASSES[args.model_type]
    tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path)
    model = BertForPretraining(
        BertModel(**model_class.pretrained_init_configuration[
            args.model_name_or_path]))
    criterion = BertPretrainingCriterion(model.bert.config["vocab_size"])
    prediction_scores, seq_relationship_score = model(
        input_ids=input_ids,
        token_type_ids=segment_ids,
        attention_mask=input_mask,
        masked_positions=masked_lm_positions)
    loss = criterion(prediction_scores, seq_relationship_score,
                     masked_lm_labels, next_sentence_labels, masked_lm_scale)

    num_training_steps = args.max_steps if args.max_steps > 0 else len(
        train_data_loader) * args.num_train_epochs

    # Define the dynamic learning_rate scheduler and optimizer
    lr_scheduler = LinearDecayWithWarmup(args.learning_rate,
                                         num_training_steps,
                                         args.warmup_steps)

    optimizer = paddle.optimizer.AdamW(
        learning_rate=lr_scheduler,
        epsilon=args.adam_epsilon,
        parameters=model.parameters(),
        weight_decay=args.weight_decay,
        apply_decay_param_fun=lambda x: x in [
            p.name for n, p in model.named_parameters()
            if not any(nd in n for nd in ["bias", "norm"])
        ])

    # Use the fleet api to compile the distributed optimizer
    strategy = fleet.DistributedStrategy()
    optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
    optimizer.minimize(loss)

    # Define the Executor for running the static model
    exe = paddle.static.Executor(place)
    exe.run(paddle.static.default_startup_program())
    state_dict = model.state_dict()

    # Use the state dict to update the parameter
    reset_state_dict = reset_program_state_dict(model, state_dict)
    paddle.static.set_program_state(paddle.static.default_main_program(),
                                    reset_state_dict)

    pool = ThreadPoolExecutor(1)
    global_step = 0
    tic_train = time.time()
    worker_num = fleet.worker_num()
    worker_index = fleet.worker_index()
    epoch = 0
    while True:
        files = [
            os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir)
            if os.path.isfile(os.path.join(args.input_dir, f)) and
            "training" in f
        ]
        files.sort()
        num_files = len(files)
        random.Random(args.seed + epoch).shuffle(files)
        f_start_id = 0

        # Select one file for each worker and create the DataLoader for the file
        data_file = select_dataset_file_for_each_worker(
            files, f_start_id, worker_num, worker_index)
        train_data_loader, _ = create_pretraining_dataset(
            data_file, args.max_predictions_per_seq, args, data_holders,
            worker_init, paddle.static.cuda_places())

        for f_id in range(f_start_id + 1, len(files)):
            data_file = select_dataset_file_for_each_worker(
                files, f_id, worker_num, worker_index)
            dataset_future = pool.submit(create_pretraining_dataset, data_file,
                                         args.max_predictions_per_seq, args,
                                         data_holders, worker_init,
                                         paddle.static.cuda_places())
            for step, batch in enumerate(train_data_loader):
                global_step += 1
                loss_return = exe.run(paddle.static.default_main_program(),
                                      feed=batch,
                                      fetch_list=[loss])
                # In the new 2.0 api, must call this function to change the learning_rate
                lr_scheduler.step()

                if global_step % args.logging_steps == 0:
                    time_cost = time.time() - tic_train
                    print(
                        "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, ips: %.2f sequences/s"
                        % (global_step, epoch, step, loss_return[0],
                           args.logging_steps / time_cost,
                           args.logging_steps * args.batch_size / time_cost))
                    tic_train = time.time()

                if global_step % args.save_steps == 0:
                    if worker_index == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  "model_%d" % global_step)
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        # TODO(fangzeyang): Update the save_params to paddle.static
                        paddle.fluid.io.save_params(exe, output_dir)
                        tokenizer.save_pretrained(output_dir)

                if global_step >= args.max_steps:
                    del train_data_loader
                    return
            del train_data_loader
            train_data_loader, data_file = dataset_future.result(timeout=None)
        epoch += 1
def test_worker_index():
    """test_worker_index"""
    assert fleet.worker_index() == 0
    print("{} ... ok".format(sys._getframe().f_code.co_name))
def test_worker_index(self):
    role = role_maker.PaddleCloudRoleMaker(is_collective=True)
    fleet.init(role)
    print(fleet.worker_index())
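# --- Added example (not from the original sources) ---
# Context for the two tests above: with the collective PaddleCloudRoleMaker,
# the worker index comes from launcher-provided environment variables, so
# fleet.worker_index() can be checked against PADDLE_TRAINER_ID directly.
# The environment values below are illustrative single-process defaults that
# `python -m paddle.distributed.launch` would normally set for you.
import os
import paddle.distributed.fleet as fleet

os.environ.setdefault("PADDLE_TRAINER_ID", "0")
os.environ.setdefault("PADDLE_TRAINER_ENDPOINTS", "127.0.0.1:6170")
os.environ.setdefault("PADDLE_CURRENT_ENDPOINT", "127.0.0.1:6170")

fleet.init(is_collective=True)
assert fleet.worker_index() == int(os.environ["PADDLE_TRAINER_ID"])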
def write_xbox_donefile(output_path,
                        day,
                        pass_id,
                        xbox_base_key,
                        client,
                        hadoop_fs_name="",
                        monitor_data="",
                        donefile_name=None):
    """
    write xbox donefile when save xbox model

    Args:
        output_path(str): output path
        day(str|int): training day
        pass_id(str|int): training pass id
        xbox_base_key(str|int): xbox base key
        client(HDFSClient): hadoop client
        donefile_name(str): donefile name, default is "xbox_patch_done.txt"
    """
    day = str(day)
    pass_id = str(pass_id)
    xbox_base_key = int(xbox_base_key)
    mode = None

    if pass_id != "-1":
        mode = "patch"
        suffix_name = "/%s/delta-%s/" % (day, pass_id)
        model_path = output_path.rstrip("/") + suffix_name
        if donefile_name is None:
            donefile_name = "xbox_patch_done.txt"
    else:
        mode = "base"
        suffix_name = "/%s/base/" % day
        model_path = output_path.rstrip("/") + suffix_name
        if donefile_name is None:
            donefile_name = "xbox_base_done.txt"

    if fleet.worker_index() == 0:
        donefile_path = output_path + "/" + donefile_name
        xbox_str = _get_xbox_str(model_path=model_path,
                                 xbox_base_key=xbox_base_key,
                                 hadoop_fs_name=hadoop_fs_name,
                                 monitor_data=monitor_data,
                                 mode=mode)

        if not is_local(donefile_path):
            if client.is_file(donefile_path):
                pre_content = client.cat(donefile_path)
                last_line = pre_content.split("\n")[-1]
                if last_line == '':
                    last_line = pre_content.split("\n")[-2]
                last_dict = json.loads(last_line)
                last_day = last_dict["input"].split("/")[-3]
                last_pass = last_dict["input"].split("/")[-2].split("-")[-1]

                exist = False
                if int(day) < int(last_day) or \
                        int(day) == int(last_day) and \
                        int(pass_id) <= int(last_pass):
                    exist = True
                if not exist:
                    with open(donefile_name, "w") as f:
                        f.write(pre_content + "\n")
                        f.write(xbox_str + "\n")
                    client.delete(donefile_path)
                    client.upload(donefile_name,
                                  output_path,
                                  multi_processes=1,
                                  overwrite=False)
                    logger.info("write %s/%s %s success" %
                                (day, pass_id, donefile_name))
                else:
                    logger.info("do not write %s because %s/%s already "
                                "exists" % (donefile_name, day, pass_id))
            else:
                with open(donefile_name, "w") as f:
                    f.write(xbox_str + "\n")
                client.upload(donefile_name,
                              output_path,
                              multi_processes=1,
                              overwrite=False)
                logger.info("write %s/%s %s success" %
                            (day, pass_id, donefile_name))
        else:
            file = Path(donefile_path)
            if not file.is_file():
                with open(donefile_path, "w") as f:
                    f.write(xbox_str + "\n")
                return
            with open(donefile_path, encoding='utf-8') as f:
                pre_content = f.read().strip("\n")
            exist = False
            last_line = pre_content.split("\n")[-1]
            last_dict = json.loads(last_line, strict=False)
            last_day = last_dict["input"].split("/")[-3]
            last_pass = last_dict["input"].split("/")[-2].split("-")[-1]
            if int(day) < int(last_day) or \
                    int(day) == int(last_day) and \
                    int(pass_id) <= int(last_pass):
                exist = True
            if not exist:
                with open(donefile_path, "w") as f:
                    f.write(pre_content + "\n")
                    f.write(xbox_str + "\n")
def write_model_donefile(output_path, day, pass_id, xbox_base_key, client, donefile_name="donefile.txt"): """ write donefile when save model Args: output_path(str): output path day(str|int): training day pass_id(str|int): training pass id xbox_base_key(str|int): xbox base key client(HDFSClient): hadoop client donefile_name(str): donefile name, default is "donefile.txt"r """ day = str(day) pass_id = str(pass_id) xbox_base_key = int(xbox_base_key) if pass_id != "-1": suffix_name = "/%s/%s/" % (day, pass_id) model_path = output_path.rstrip("/") + suffix_name else: suffix_name = "/%s/0/" % day model_path = output_path.rstrip("/") + suffix_name if fleet.worker_index() == 0: donefile_path = output_path + "/" + donefile_name content = "%s\t%lu\t%s\t%s\t%d" % (day, xbox_base_key, \ model_path, pass_id, 0) if not is_local(model_path): if client.is_file(donefile_path): pre_content = client.cat(donefile_path) pre_content_list = pre_content.split("\n") day_list = [i.split("\t")[0] for i in pre_content_list] pass_list = [i.split("\t")[3] for i in pre_content_list] exist = False for i in range(len(day_list)): if int(day) == int(day_list[i]) and \ int(pass_id) == int(pass_list[i]): exist = True break if not exist: with open(donefile_name, "w") as f: f.write(pre_content + "\n") f.write(content + "\n") client.delete(donefile_path) client.upload(donefile_name, output_path) logger.info("write %s/%s %s succeed" % \ (day, pass_id, donefile_name)) else: logger.info("not write %s because %s/%s already " "exists" % (donefile_name, day, pass_id)) else: with open(donefile_name, "w") as f: f.write(content + "\n") client.upload(donefile_name, output_path) logger.info("write %s/%s %s succeed" % \ (day, pass_id, donefile_name)) else: file = Path(donefile_path) logger.info("model done file path = {}, content = {}".format( donefile_path, content)) if not file.is_file(): logger.info(" {} doesn't exist ".format(donefile_path)) with open(donefile_path, "w") as f: f.write(content + "\n") return with open(donefile_path, encoding='utf-8') as f: pre_content = f.read().strip("\n") logger.info("pre_content = {}".format(pre_content)) pre_content_list = pre_content.split("\n") day_list = [i.split("\t")[0] for i in pre_content_list] pass_list = [i.split("\t")[3] for i in pre_content_list] exist = False for i in range(len(day_list)): if int(day) == int(day_list[i]) and \ int(pass_id) == int(pass_list[i]): exist = True break if not exist: with open(donefile_path, "w") as f: f.write(pre_content + "\n") logger.info("write donefile {}".format(pre_content)) f.write(content + "\n") logger.info("write donefile {}".format(content)) logger.info("write %s/%s %s succeed" % \ (day, pass_id, donefile_name)) else: logger.info("not write %s because %s/%s already " "exists" % (donefile_name, day, pass_id))
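A hedged usage sketch of the helper above, mirroring the call sites in the online-training snippets further down; the output path, day, pass and key values are illustrative, and hadoop_client stands in for the HDFSClient mentioned in the docstring:

from paddle.distributed import fleet

output_path = "afs:/user/example/output"                  # illustrative output directory
day, pass_id, xbox_base_key = "20230801", 3, 1690848000   # illustrative values

if fleet.is_first_worker():
    # Appends one tab-separated record to donefile.txt: day, key, model_path, pass_id, 0.
    write_model_donefile(
        output_path=output_path,
        day=day,
        pass_id=pass_id,
        xbox_base_key=xbox_base_key,
        client=hadoop_client)
fleet.barrier_worker()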
def train(args): log.info("pretraining start") profile = False place = fluid.CUDAPlace(int(os.environ.get('FLAGS_selected_gpus', 0))) # set seed random.seed(args.seed) np.random.seed(args.seed) paddle.seed(args.seed) get_rng_state_tracker().add('global_seed', args.seed) get_rng_state_tracker().add('local_seed', args.seed + fleet.worker_index() + 2021) # define execution strategy exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = 2 exec_strategy.num_iteration_per_drop_scope = 1 # define distribution strategy dist_strategy = fleet.DistributedStrategy() dist_strategy.execution_strategy = exec_strategy dist_strategy.nccl_comm_num = 3 if args.use_recompute: log.info("using recompute.") dist_strategy.recompute = args.use_recompute dist_strategy.sharding = args.use_sharding dist_strategy.pipeline = args.num_pp > 1 # define topology structure for dp/pp/mp topo = Topology(rank=fleet.worker_index(), world_size=fleet.worker_num(), dp=args.num_dp, pp=args.num_pp, sharding=args.num_sharding, mp=args.num_mp) is_last = False if topo.pp.rank == (topo.pp.size - 1): is_last = True dp_sharding_rank = topo.dp.rank * topo.sharding.size + topo.sharding.rank dp_worldsize = topo.dp.size * topo.sharding.size bsz_per_dp = args.global_bsz // dp_worldsize micro_bsz = args.micro_bsz assert args.global_bsz % micro_bsz == 0, f"cannot do gradient accumulate, globa_bsz: {args.bsz} micro_bsz: {micro_bsz}" acc_steps = bsz_per_dp // micro_bsz # sharding \ model parallel \ pipeline assert dist_strategy.sharding == True dist_strategy.sharding_configs = { "segment_broadcast_MB": 32, "sharding_degree": args.num_sharding, "mp_degree": args.num_mp, "pp_degree": args.num_pp, "dp_degree": args.num_dp, "optimize_offload": True, } dist_strategy.pipeline_configs = { "schedule_mode": "1F1B", "micro_batch_size": micro_bsz, "accumulate_steps": acc_steps, } log.info( f"using globa_bsz: {args.global_bsz} micro_bsz: {micro_bsz}, acc_steps: {acc_steps}" ) dist_strategy.amp = args.use_amp dist_strategy.amp_configs = { "custom_white_list": ['softmax', 'layer_norm', 'gelu'], "init_loss_scaling": 32768, "decr_every_n_nan_or_inf": 2, "incr_every_n_steps": 1000, "incr_ratio": 2.0, "use_dynamic_loss_scaling": True, "decr_ratio": 0.5, "use_pure_fp16": False, "use_fp16_guard": False, } dist_strategy.lamb = args.use_lamb dist_strategy.lamb_configs = { 'lamb_weight_decay': 0.01, 'exclude_from_weight_decay': ['layer_norm_bias', 'layer_norm_scale', '.b_0'] } train_program = fluid.Program() startup_program = fluid.Program() with fluid.program_guard(train_program, startup_program): with fluid.unique_name.guard(): graph_vars = create_model(args, 'train', micro_bsz, dp_sharding_rank, dp_worldsize, topo) data_loader = graph_vars['data_loader'] for op in train_program.global_block().ops: if op.type == 'fill_constant': op._set_attr( 'op_device', "gpu:0" ) # XXX: hack: https://github.com/PaddlePaddle/Paddle/blob/develop/python/paddle/fluid/layers/tensor.py#L1376 if args.use_recompute: dist_strategy.recompute_configs = { "checkpoints": graph_vars['checkpoints'], # "enable_offload": args.use_offload, # "checkpoint_shape": [micro_bsz, args.max_seq_len, 4096], } log.debug("base lr: {}".format(args.learning_rate)) scheduled_lr = linear_warmup_decay( learning_rate=args.learning_rate, warmup_steps=args.warmup_steps, num_train_steps=args.num_train_steps) clip_norm_thres = 1.0 if paddlenlp.ops.optimizer._jit_compile(): optimizer = paddlenlp.ops.optimizer.AdamwOptimizer( learning_rate=scheduled_lr, grad_clip=fluid.clip.GradientClipByGlobalNorm( 
clip_norm=clip_norm_thres), weight_decay=args.weight_decay, apply_decay_param_fun=apply_weight_decay_fun) else: optimizer = fluid.optimizer.Adam( learning_rate=scheduled_lr, grad_clip=fluid.clip.GradientClipByGlobalNorm( clip_norm=clip_norm_thres), #multi_precision=True, #weight_decay=args.weight_decay, # merge this pr to use weight_decay: https://github.com/PaddlePaddle/Paddle/pull/29248 #exclude_from_weight_decay_fn=exclude_from_weight_decay ) optimizer = fleet.distributed_optimizer(optimizer, dist_strategy) log.info(f"using dist strategy: {dist_strategy}") optimizer.minimize(graph_vars['total_loss']) final_strategy = fleet._final_strategy() applied_meta_list = fleet._get_applied_meta_list() log.info("final strategy: {}".format(final_strategy)) log.info("applied_meta_list: {}".format(applied_meta_list)) program_desc_dir = os.path.join(args.output_dir, "program_desc") if not os.path.isdir(program_desc_dir): os.mkdir(program_desc_dir) with open( program_desc_dir + "/main_program.txt.%d" % (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f: f.write(str(train_program)) with open( program_desc_dir + "/startup_program.txt.%d" % (int(os.environ.get('FLAGS_selected_gpus', 0))), 'w') as f: f.write(str(startup_program)) exe = fluid.Executor(place) exe.run(startup_program) optimizer.amp_init(place) #save_path = os.path.join(args.output_dir, 'step_0') #log.debug("saving models to {}".format(save_path)) #save_persistables(exe, save_path, train_program) if args.init_checkpoint and args.init_checkpoint != "": log.info(' ') log.info( '############################WARNING############################') log.info( '####### using ini_checkpoint, not init_pretraining_params ####') log.info( '## meaning hyper param e.g. lr will inherit from checkpoint ##') log.info( '###############################################################') init_checkpoint(exe, args.init_checkpoint, train_program) log.info(' ') output_dir = args.output_dir save_steps = args.save_steps total_time = 0 cost_vals, lm_losses, sop_accs = [], [], [] global_steps = args.global_steps + 1 steps = 0 log_path = 'train_log/node-%d' % fleet.worker_index() start_time = time.time() with LogWriter(os.path.join(args.output_dir, log_path)) as swriter: data_loader.start() while True: #if steps < global_steps: # steps += 1 # continue if not is_last: fetch_list = [] else: fetch_list = [ graph_vars['total_loss'], graph_vars['mean_mask_lm_loss'], scheduled_lr ] if args.use_sop: fetch_list.extend( [graph_vars['sop_acc'], graph_vars['sop_loss']]) if args.use_amp: loss_scaling = train_program.global_block( ).vars['loss_scaling_0'] fetch_list.append(loss_scaling) ret = exe.run(train_program, fetch_list=fetch_list ) # run one mini-batch(=acc_steps micro-batch) #use_program_cache=True) steps += 1 if is_last: if args.use_sop and args.use_amp: cost_val, lm_loss, lr, sop_acc, sop_loss, loss_scaling_0 = ret elif args.use_sop: cost_val, lm_loss, lr, sop_acc, sop_loss = ret elif args.use_amp: cost_val, lm_loss, lr, loss_scaling_0 = ret else: cost_val, lm_loss, lr = ret cost_vals.append(cost_val[0]) lm_losses.append(lm_loss[0]) if args.use_sop: sop_accs.append(sop_acc[0]) if steps > 0 and (steps % args.log_steps) == 0: end_time = time.time() total_time = end_time - start_time cost_val = np.mean(cost_vals) lm_loss = np.mean(lm_losses) swriter.add_scalar('loss/total_loss', cost_val, steps) swriter.add_scalar('loss/mlm_loss', lm_loss, steps) swriter.add_scalar('lr/scheduled_lr', lr[0], steps) if args.use_sop: sop_acc = np.mean(sop_accs) 
swriter.add_scalar('loss/sop_loss', sop_loss, steps) swriter.add_scalar('train/sop_acc', sop_acc, steps) else: sop_acc = 0.0 if args.use_amp: swriter.add_scalar('lr/loss_scaling', loss_scaling_0[0], steps) else: loss_scaling_0 = [0.0] log.info( "worker_index: %d, step: %d, cost: %f, " "mlm loss: %f, sentence order acc: %f, " "speed: %f steps/s, " "speed: %f samples/s, " "speed: %f tokens/s, " "learning rate: %.3e, loss_scalings: %f" % (fleet.worker_index(), steps, cost_val, lm_loss, sop_acc, args.log_steps / total_time, args.log_steps * args.global_bsz / total_time, args.log_steps * args.global_bsz * args.max_seq_len / total_time, lr[0], loss_scaling_0[0])) cost_vals, lm_losses, sop_accs = [], [], [] start_time = time.time() # TODO: add evaluation if steps > 0 and args.eval_steps > 0 and steps % args.eval_steps == 0: pass if steps > 0 and args.save_steps > 0 and steps % args.save_steps == 0: if args.use_hybrid_dp and fleet.worker_index() > 8: continue save_path = os.path.join(output_dir, 'step_' + str(steps)) log.debug("saving models to {}".format(save_path)) save_persistables(exe, save_path, train_program) if steps == args.num_train_steps: if args.use_hybrid_dp and fleet.worker_index() > 8: continue save_path = os.path.join(output_dir, 'final_step_' + str(steps)) save_persistables(exe, save_path, train_program) log.debug("saving final models to {}".format(save_path)) log.debug("end of training, total steps: {}".format(steps))
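The hybrid-parallel setup in the train() snippet above mixes sharding, pipeline, AMP and LAMB options in one long block. The core parallelism knobs, pulled out into a hedged standalone sketch (degrees are illustrative stand-ins for the args.num_* values):

import paddle.distributed.fleet as fleet

dist_strategy = fleet.DistributedStrategy()
dist_strategy.sharding = True
dist_strategy.sharding_configs = {
    "segment_broadcast_MB": 32,
    "sharding_degree": 2,      # args.num_sharding
    "mp_degree": 1,            # args.num_mp
    "pp_degree": 2,            # args.num_pp
    "dp_degree": 1,            # args.num_dp
    "optimize_offload": True,
}
dist_strategy.pipeline = True  # enabled in the snippet whenever args.num_pp > 1
dist_strategy.pipeline_configs = {
    "schedule_mode": "1F1B",
    "micro_batch_size": 8,     # micro_bsz
    "accumulate_steps": 4,     # acc_steps = (global_bsz // dp_worldsize) // micro_bsz
}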
def do_train(args): # Initialize the paddle and paddle fleet execute environment paddle.enable_static() place = paddle.set_device(args.device) fleet.init(is_collective=True) worker_num = fleet.worker_num() worker_index = fleet.worker_index() # Create the random seed for the worker set_seed(args.seed) worker_init = WorkerInitObj(args.seed + worker_index) # Define the input data in the static mode main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() data_holders = create_data_holder(args) [ input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels, masked_lm_scale ] = data_holders # Define the model structure in static mode args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) config = model_class.pretrained_init_configuration[args.model_name_or_path] if config["vocab_size"] % 8 != 0: config["vocab_size"] += 8 - (config["vocab_size"] % 8) model = BertForPretraining(BertModel(**config)) criterion = BertPretrainingCriterion(model.bert.config["vocab_size"]) prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_positions=masked_lm_positions) loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) # Define the dynamic learning_rate scheduler and optimizer num_training_steps = args.max_steps if args.max_steps > 0 else len( train_data_loader) * args.num_train_epochs lr_scheduler = LinearDecayWithWarmup(args.learning_rate, num_training_steps, args.warmup_steps) # Generate parameter names needed to perform weight decay. # All bias and LayerNorm parameters are excluded.
decay_params = [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ] optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in decay_params, multi_precision=args.use_pure_fp16) # Use the fleet api to compile the distributed optimizer optimizer = dist_optimizer(args, optimizer) optimizer.minimize(loss) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) state_dict = model.state_dict() # Use the state dict to update the parameter reset_state_dict = reset_program_state_dict(model, state_dict) paddle.static.set_program_state(main_program, reset_state_dict) if args.use_amp: optimizer.amp_init(place) pool = ThreadPoolExecutor(1) global_step = 0 tic_train = time.time() epoch = 0 while True: files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in f ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 # Select one file for each worker and create the DataLoader for the file data_file = select_dataset_file_for_each_worker( files, f_start_id, worker_num, worker_index) train_data_loader, _ = create_pretraining_dataset( data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) for f_id in range(f_start_id + 1, len(files)): data_file = select_dataset_file_for_each_worker( files, f_id, worker_num, worker_index) dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) train_cost_avg = TimeCostAverage() reader_cost_avg = TimeCostAverage() total_samples = 0 batch_start = time.time() for step, batch in enumerate(train_data_loader): train_reader_cost = time.time() - batch_start reader_cost_avg.record(train_reader_cost) global_step += 1 train_start = time.time() loss_return = exe.run(main_program, feed=batch, fetch_list=[loss]) total_samples += args.batch_size # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() train_run_cost = time.time() - batch_start train_cost_avg.record(train_run_cost) # Profile for model benchmark if args.profiler_options is not None: profiler.add_profiler_step(args.profiler_options) if global_step % args.logging_steps == 0: print( "tobal step: %d, epoch: %d, batch: %d, loss: %f, " "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" % (global_step, epoch, step, loss_return[0], reader_cost_avg.get_average(), train_cost_avg.get_average(), total_samples / args.logging_steps, args.batch_size / ( reader_cost_avg.get_average() + train_cost_avg.get_average()))) total_samples = 0 train_cost_avg.reset() reader_cost_avg.reset() if global_step % args.save_steps == 0: if worker_index == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) model.save_model_config(output_dir) paddle.static.save(main_program, os.path.join(output_dir, "model_state")) tokenizer.save_pretrained(output_dir) if global_step >= args.max_steps: reader_start = time.time() del train_data_loader return batch_start = time.time() del train_data_loader train_data_loader, data_file = dataset_future.result(timeout=None) epoch += 1
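select_dataset_file_for_each_worker is called in the do_train snippets here but its body is not shown. A hypothetical sketch of what such a per-worker file selector might look like (the real helper may handle the worker_num > num_files case differently):

def pick_file_for_worker(files, f_id, worker_num, worker_index):
    # Stride through the shuffled file list so that, for a given round f_id,
    # each worker trains on a different shard of the corpus.
    return files[(f_id * worker_num + worker_index) % len(files)]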
def do_generation(args): # Initialize the paddle and paddle fleet execute environment paddle.enable_static() assert args.dp_degree == 1, "Data parallel is not supported in inference" assert args.sharding_degree == 1, "Sharding parallel is temporarily not supported in inference" assert args.pp_degree == 1, "Pipeline parallel will be supported later" if args.mp_degree == 1: args.mp_degree = paddle.distributed.get_world_size() else: assert args.mp_degree == paddle.distributed.get_world_size(), \ "If mp_degree is specified, the size must be the same as world_size" strategy = fleet.DistributedStrategy() strategy.tensor_parallel = True strategy.tensor_parallel_configs = { "tensor_parallel_degree": args.mp_degree } fleet.init(is_collective=True, strategy=strategy) # temp use dynamic init, use HybridParallelInferenceHelper in future? paddle.distributed.init_parallel_env() # Create the random seed for the worker random.seed(args.seed) np.random.seed(args.seed) paddle.seed(args.seed) get_rng_state_tracker().add('global_seed', args.seed) get_rng_state_tracker().add('local_seed', args.seed + fleet.worker_index() + 2021) if args.use_amp and args.amp_level == "O2": assert (args.mp_degree == 1 and args.pp_degree == 1 ), "When amp level is O2, mp_degree and pp_degree should be 1." assert (args.use_sharding == False ), "When amp level is O2, use_sharding should be False." assert args.device in [ "cpu", "gpu", "xpu" ], "Invalid device! Available device should be cpu, gpu, or xpu." place = paddle.set_device(args.device) worker_num = fleet.worker_num() worker_index = fleet.worker_index() local_rank = 0 if fleet.local_rank() is None else int(fleet.local_rank()) topo = Topology( device_rank=worker_index, world_size=worker_num, dp_degree=args.dp_degree, pp_degree=args.pp_degree, sharding_degree=args.sharding_degree, mp_degree=args.mp_degree) logger.info("The topo of hybrid parallelism:\n{}".format(topo)) model_class, tokenizer_class = MODEL_CLASSES[args.model_type] pretrained_models_list = list( model_class.pretrained_init_configuration.keys()) data_file = get_data_file(args) main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() with paddle.static.program_guard(main_program, startup_program): with paddle.utils.unique_name.guard(): with paddle.static.device_guard('gpu:0'): feeds = create_data_holder(args) tokenizer = tokenizer_class.from_pretrained( args.model_name_or_path) eos_id = tokenizer.eos_token_id _, _, test_data_loader = create_pretrained_dataset( args, data_file, local_rank=local_rank, data_world_size=topo.data_info.size, data_world_rank=topo.data_info.rank, eos_id=eos_id, max_seq_len=args.max_seq_len, places=paddle.static.cuda_places(), data_holders=feeds, pipeline_mode=False) if args.model_name_or_path in pretrained_models_list: model_config = model_class.pretrained_init_configuration[ args.model_name_or_path] model_config[ "hidden_dropout_prob"] = args.hidden_dropout_prob model_config[ "attention_probs_dropout_prob"] = args.attention_probs_dropout_prob model_config["topo"] = topo model_config["fuse"] = args.fuse model = GPTForGeneration( GPTModel(**model_config), max_length=args.max_dec_len, decoding_strategy=args.decoding_strategy, temperature=args.temperature, top_k=args.topk, top_p=args.topp, eos_id=eos_id, fuse=args.fuse) else: logger.error("No checkpoint load.") model.eval() ins = {v.name: v for v in feeds} preds = model(ins) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) 
main_program = main_program.clone(for_test=True) model_urls = model.pretrained_resource_files_map['model_state'] model_path = args.model_name_or_path if model_path in pretrained_models_list and model_path in model_urls: flag_loaded = False from paddle.utils.download import get_weights_path_from_url dygraph_path = get_weights_path_from_url(model_urls[model_path]) if os.path.exists(dygraph_path): if args.sharding_degree > 1: logger.warning("Sharding should init with static vars") else: logger.info("Loading parameters from %s" % dygraph_path) init_static_with_params( model, paddle.load( dygraph_path, return_numpy=True), topo, main_program) flag_loaded = True if not flag_loaded: logger.error("No checkpoint load.") global_step = 0 epoch = 0 fetchs = [preds] ### check results text = [ "Question: Where is the capital of China? Answer:", "Question:Who is the CEO of Apple? Answer:" ] inputs = tokenizer( text, padding=True, return_attention_mask=True, return_position_ids=True) ids = np.array(inputs["input_ids"]).reshape(len(text), -1).astype('int64') position_ids = np.array(inputs["position_ids"]).reshape(len(text), -1).astype('int64') attention_mask = np.array(inputs["attention_mask"]).reshape( len(text), -1).astype('float32') t_ids = paddle.fluid.core.Tensor() t_ids.set(ids, place) t_mask = paddle.fluid.core.Tensor() t_mask.set(attention_mask, place) t_pos = paddle.fluid.core.Tensor() t_pos.set(position_ids, place) feed_data = {'src_ids': t_ids, 'pos_ids': t_pos, 'input_mask': t_mask} ret = exe.run(main_program, feed=feed_data, fetch_list=fetchs) ret = np.array(ret[0]) for i in range(ret.shape[0]): o = [int(x) for x in ret[i]] ret_str = tokenizer.convert_ids_to_string(o) ret_str = text[i] + ret_str logger.info(ret_str) ################## for step, batch in enumerate(test_data_loader()): ret = exe.run(main_program, feed=batch, fetch_list=fetchs) if step == 5: break if args.save_inference_model_then_exist: save_inference_model_dir = 'inference_model_pp{pp_degree}mp{mp_degree}'.format( pp_degree=args.pp_degree, mp_degree=args.mp_degree) inference_save_path = os.path.join(save_inference_model_dir, 'rank_' + str(fleet.worker_index()), 'step_' + str(0)) print("saving inference model to {}".format(inference_save_path)) feed_names = [v.name for v in feeds] fetchs_names = [v.name for v in fetchs] print('feeds: ', feed_names, 'fetches: ', fetchs_names) paddle.static.save_inference_model( inference_save_path, feeds, fetchs, exe, program=main_program)
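A hedged follow-up to the paddle.static.save_inference_model call above, showing how the saved artifacts could be loaded back for serving; the path prefix is an illustrative stand-in for the inference_save_path used when saving:

import paddle

paddle.enable_static()
exe = paddle.static.Executor(paddle.CPUPlace())
# Returns the pruned inference program plus the feed names and fetch targets.
inference_program, feed_names, fetch_targets = paddle.static.load_inference_model(
    "inference_model_pp1mp1/rank_0/step_0", exe)  # illustrative prefix
# exe.run(inference_program,
#         feed={name: arr for name, arr in zip(feed_names, input_arrays)},
#         fetch_list=fetch_targets)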
def run_online_worker(self): logger.info("Run Online Worker Begin") use_cuda = int(config.get("runner.use_gpu")) place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace() self.exe = paddle.static.Executor(place) with open("./{}_worker_main_program.prototxt".format( fleet.worker_index()), 'w+') as f: f.write(str(paddle.static.default_main_program())) with open("./{}_worker_startup_program.prototxt".format( fleet.worker_index()), 'w+') as f: f.write(str(paddle.static.default_startup_program())) self.exe.run(paddle.static.default_startup_program()) fleet.init_worker() self.online_intervals = get_online_pass_interval( self.split_interval, self.split_per_pass, False) if is_local(self.save_model_path) and self.save_model_path and ( not os.path.exists(self.save_model_path)): os.makedirs(self.save_model_path) last_day, last_pass, last_path, xbox_base_key = get_last_save_model( self.save_model_path, self.hadoop_client) logger.info( "get_last_save_model last_day = {}, last_pass = {}, last_path = {}, xbox_base_key = {}". format(last_day, last_pass, last_path, xbox_base_key)) if last_day != -1 and fleet.is_first_worker(): load_model(last_path, 0, self.hadoop_client) fleet.barrier_worker() day = self.start_day infer_first = True while int(day) <= int(self.end_day): logger.info("training a new day {}, end_day = {}".format( day, self.end_day)) if last_day != -1 and int(day) < last_day: day = get_next_day(day) continue # base_model_saved = False for pass_id in range(1, 1 + len(self.online_intervals)): print(last_day, day, last_pass, pass_id) if (last_day != -1 and int(day) == last_day) and ( last_pass != -1 and int(pass_id) <= last_pass): continue if self.save_first_base and fleet.is_first_worker(): self.save_first_base = False last_base_day, last_base_path, tmp_xbox_base_key = \ get_last_save_xbox_base(self.save_model_path, self.hadoop_client) logger.info( "get_last_save_xbox_base, last_base_day = {}, last_base_path = {}, tmp_xbox_base_key = {}". format(last_base_day, last_base_path, tmp_xbox_base_key)) if int(day) > last_base_day: xbox_base_key = int(time.time()) save_xbox_model(self.save_model_path, day, -1, self.exe, self.inference_feed_vars, self.inference_target_var, self.hadoop_client) write_xbox_donefile( output_path=self.save_model_path, day=day, pass_id=-1, xbox_base_key=xbox_base_key, client=self.hadoop_client) elif int(day) == last_base_day: xbox_base_key = tmp_xbox_base_key fleet.barrier_worker() logger.info("training a new day = {} new pass = {}".format( day, pass_id)) logger.info("Day:{}, Pass: {}, Prepare Dataset Begin.".format( day, pass_id)) begin_train = time.time() begin = time.time() dataset = self.wait_and_prepare_dataset(day, pass_id) end = time.time() read_data_cost = (end - begin) / 60.0 logger.info("Prepare Dataset Done, using time {} mins.".format( read_data_cost)) infer_cost = 0 infer_metric_cost = 0 if infer_first: infer_first = False else: logger.info("Day:{}, Pass: {}, Infering Dataset Begin.". format(day, pass_id)) begin = time.time() self.dataset_infer_loop(dataset, day, pass_id) end = time.time() infer_cost = (end - begin) / 60.0 logger.info("Infering Dataset Done, using time {} mins.". format(infer_cost)) begin = time.time() metric_str = get_global_metrics_str(fluid.global_scope(), self.metric_list, "") logger.info("Day:{}, Pass: {}, Infer Global Metric: {}". 
format(day, pass_id, metric_str)) clear_metrics(fluid.global_scope(), self.metric_list, self.metric_types) end = time.time() infer_metric_cost = (end - begin) / 60.0 logger.info("Day:{}, Pass: {}, Training Dataset Begin.".format( day, pass_id)) begin = time.time() self.dataset_train_loop(dataset, day, pass_id, self.need_train_dump) end = time.time() avg_cost = get_avg_cost_mins(end - begin) get_max_cost_mins(end - begin) get_min_cost_mins(end - begin) train_cost = avg_cost logger.info("Training Dataset Done, using time {} mins.". format(train_cost)) begin = time.time() dataset.release_memory() end = time.time() release_cost = (end - begin) / 60.0 begin = time.time() metric_str = get_global_metrics_str(fluid.global_scope(), self.metric_list, "") logger.info("Day:{}, Pass: {}, Train Global Metric: {}".format( day, pass_id, metric_str)) clear_metrics(fluid.global_scope(), self.metric_list, self.metric_types) end = time.time() metric_cost = (end - begin) / 60 end_train = time.time() total_cost = (end_train - begin_train) / 60 other_cost = total_cost - read_data_cost - train_cost - release_cost - metric_cost - infer_cost - infer_metric_cost log_str = "finished train epoch %d time cost:%s min job time cost" \ ":[read_data:%s min][train: %s min][metric: %s min][release: %s min]" \ "[infer:%s min][infer_metric: %s min][other:%s min]" \ % (pass_id, total_cost, read_data_cost, train_cost, metric_cost, release_cost, infer_cost, infer_metric_cost, other_cost) logger.info(log_str) if self.need_infer_dump: prepare_data_start_time = time.time() dump_dataset = self.wait_and_prepare_infer_dataset(day, pass_id) prepare_data_end_time = time.time() logger.info( "Prepare Infer Dump Dataset Done, using time {} second.". format(prepare_data_end_time - prepare_data_start_time)) dump_start_time = time.time() self.dataset_infer_loop(dump_dataset, day, pass_id, True) dump_end_time = time.time() logger.info( "Infer Dump Dataset Done, using time {} second.". format(dump_end_time - dump_start_time)) dump_dataset.release_memory() if fleet.is_first_worker(): if pass_id % self.checkpoint_per_pass == 0: save_model(self.exe, self.save_model_path, day, pass_id) write_model_donefile( output_path=self.save_model_path, day=day, pass_id=pass_id, xbox_base_key=xbox_base_key, client=self.hadoop_client) if pass_id % self.save_delta_frequency == 0: last_xbox_day, last_xbox_pass, last_xbox_path, _ = get_last_save_xbox( self.save_model_path, self.hadoop_client) if int(day) < last_xbox_day or int( day) == last_xbox_day and int( pass_id) <= last_xbox_pass: log_str = "delta model exists" logger.info(log_str) else: save_xbox_model(self.save_model_path, day, pass_id, self.exe, self.inference_feed_vars, self.inference_target_var, self.hadoop_client) # 1 delta write_xbox_donefile( output_path=self.save_model_path, day=day, pass_id=pass_id, xbox_base_key=xbox_base_key, client=self.hadoop_client, hadoop_fs_name=self.hadoop_fs_name, monitor_data=metric_str) fleet.barrier_worker() logger.info("shrink table") begin = time.time() fleet.shrink() end = time.time() logger.info("shrink table done, cost %s min" % ( (end - begin) / 60.0)) if fleet.is_first_worker(): last_base_day, last_base_path, last_base_key = get_last_save_xbox_base( self.save_model_path, self.hadoop_client) logger.info( "one epoch finishes, get_last_save_xbox, last_base_day = {}, last_base_path = {}, last_base_key = {}". 
format(last_base_day, last_base_path, last_base_key)) next_day = get_next_day(day) if int(next_day) <= last_base_day: logger.info("batch model/base xbox model exists") else: xbox_base_key = int(time.time()) save_xbox_model(self.save_model_path, next_day, -1, self.exe, self.inference_feed_vars, self.inference_target_var, self.hadoop_client) write_xbox_donefile( output_path=self.save_model_path, day=next_day, pass_id=-1, xbox_base_key=xbox_base_key, client=self.hadoop_client, hadoop_fs_name=self.hadoop_fs_name, monitor_data=metric_str) save_batch_model(self.exe, self.save_model_path, next_day) write_model_donefile( output_path=self.save_model_path, day=next_day, pass_id=-1, xbox_base_key=xbox_base_key, client=self.hadoop_client) fleet.barrier_worker() day = get_next_day(day)
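get_next_day and the other calendar helpers used throughout run_online_worker are not shown in these snippets. A hypothetical sketch, under the assumption that days are YYYYMMDD strings (consistent with the int(day) comparisons above):

import datetime

def next_day(day_str):
    # "20230801" -> "20230802"
    d = datetime.datetime.strptime(day_str, "%Y%m%d") + datetime.timedelta(days=1)
    return d.strftime("%Y%m%d")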
def do_train(args): # Initialize the paddle and paddle fleet execute enviroment paddle.enable_static() place = paddle.set_device(args.select_device) fleet.init(is_collective=True) worker_num = fleet.worker_num() worker_index = fleet.worker_index() # Create the random seed for the worker set_seed(args.seed) worker_init = WorkerInitObj(args.seed + worker_index) # Define the input data in the static mode main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() data_holders = create_data_holder(args) [ input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels, masked_lm_scale ] = data_holders # Define the model structure in static mode args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) config = model_class.pretrained_init_configuration[args.model_name_or_path] if config["vocab_size"] % 8 != 0: config["vocab_size"] += 8 - (config["vocab_size"] % 8) model = BertForPretraining(BertModel(**config)) criterion = BertPretrainingCriterion(model.bert.config["vocab_size"]) prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_positions=masked_lm_positions) loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) # Define the dynamic learing_reate scheduler and optimizer lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, lambda current_step, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps if args.max_steps > 0 else (len(train_data_loader) * args.num_train_epochs): float( current_step) / float(max(1, num_warmup_steps)) if current_step < num_warmup_steps else max( 0.0, float(num_training_steps - current_step) / float( max(1, num_training_steps - num_warmup_steps)))) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ], multi_precision=args.use_pure_fp16) if worker_num == 1 and args.use_amp: custom_black_list = (['lookup_table', 'lookup_table_v2'] if args.use_pure_fp16 else None) amp_list = paddle.static.amp.AutoMixedPrecisionLists( custom_white_list=['softmax', 'layer_norm', 'gelu'], custom_black_list=custom_black_list) optimizer = paddle.static.amp.decorate( optimizer, amp_list, init_loss_scaling=args.scale_loss, use_dynamic_loss_scaling=True, use_pure_fp16=args.use_pure_fp16) if worker_num > 1: # Use the fleet api to compile the distributed optimizer optimizer = dist_optimizer(args, optimizer) optimizer.minimize(loss) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) state_dict = model.state_dict() # Use the state dict to update the parameter reset_state_dict = reset_program_state_dict(model, state_dict) paddle.static.set_program_state(main_program, reset_state_dict) if args.use_amp: optimizer.amp_init(place) if worker_num == 1: # Construct the compiled program main_program = build_compiled_program(main_program, loss) pool = ThreadPoolExecutor(1) global_step = 0 tic_train = time.time() epoch = 0 while True: files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, 
f)) and "training" in f ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 # Select one file for each worker and create the DataLoader for the file data_file = select_dataset_file_for_each_worker( files, f_start_id, worker_num, worker_index) train_data_loader, _ = create_pretraining_dataset( data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) for f_id in range(f_start_id + 1, len(files)): data_file = select_dataset_file_for_each_worker( files, f_id, worker_num, worker_index) dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 reader_start = time.time() for step, batch in enumerate(train_data_loader): train_reader_cost += time.time() - reader_start global_step += 1 train_start = time.time() loss_return = exe.run(main_program, feed=batch, fetch_list=[loss]) train_run_cost += time.time() - train_start total_samples += args.batch_size # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_steps == 0: print( "tobal step: %d, epoch: %d, batch: %d, loss: %f, " "avg_reader_cost: %.5f sec, avg_batch_cost: %.5f sec, avg_samples: %.5f, ips: %.5f sequences/sec" % (global_step, epoch, step, loss_return[0], train_reader_cost / args.logging_steps, (train_reader_cost + train_run_cost) / args.logging_steps, total_samples / args.logging_steps, total_samples / (train_reader_cost + train_run_cost))) train_reader_cost = 0.0 train_run_cost = 0.0 total_samples = 0 if global_step % args.save_steps == 0: if worker_index == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # TODO(fangzeyang): Udpate the save_params to paddle.static paddle.fluid.io.save_params(exe, output_dir) tokenizer.save_pretrained(output_dir) if global_step >= args.max_steps: reader_start = time.time() del train_data_loader return reader_start = time.time() del train_data_loader train_data_loader, data_file = dataset_future.result(timeout=None) epoch += 1
def infer(args): """Inference main function.""" if args.is_distributed: fleet.init(is_collective=True) dev_count = fluid.core.get_cuda_device_count() gpu_id = int(os.getenv("FLAGS_selected_gpus")) trainers_num = fleet.worker_num() trainer_id = fleet.worker_index() phase = "distributed_test" else: dev_count = 1 gpu_id = 0 trainers_num = 1 trainer_id = 0 phase = "test" place = fluid.CUDAPlace(gpu_id) task = tasks.create_task(args) model = models.create_model(args, place) infer_generator = task.get_data_loader(model, input_file=args.infer_file, num_part=trainers_num, part_id=trainer_id, phase=phase, is_infer=True) # run inference timer = Timer() timer.start() infer_out = {} step = 0 for step, data in enumerate(infer_generator(), 1): predictions = task.infer_step(model, data) for pred in predictions: infer_out[pred["data_id"]] = pred if step % args.log_steps == 0: time_cost = timer.pass_time print(f"\tstep: {step}, time: {time_cost:.3f}, " f"queue size: {infer_generator.queue.size()}, " f"speed: {step / time_cost:.3f} steps/s") time_cost = timer.pass_time print(f"[infer] steps: {step} time cost: {time_cost}, " f"speed: {step / time_cost} steps/s") if args.is_distributed: # merge inference outputs in distributed mode. part_file = os.path.join(args.save_path, f"inference_output.part_{gpu_id}") with open(part_file, "w") as fp: json.dump(infer_out, fp, ensure_ascii=False, indent=2) part_finish_file = os.path.join( args.save_path, f"inference_output.part_{gpu_id}.finish") with open(part_finish_file, "w"): pass # Only run on master GPU in each node if gpu_id != 0: return if args.is_distributed: part_files = f"inference_output.part_*.finish" while True: ret = subprocess.getoutput( f"find {args.save_path} -maxdepth 1 -name {part_files}") num_completed = len(ret.split("\n")) if num_completed != dev_count: time.sleep(1) continue infer_out = {} for dev_id in range(dev_count): part_file = os.path.join(args.save_path, f"inference_output.part_{dev_id}") with open(part_file, "r") as fp: part_infer_out = json.load(fp) for data_id in part_infer_out: infer_out[data_id] = part_infer_out[data_id] break subprocess.getoutput( "rm " + os.path.join(args.save_path, f"inference_output.part*")) # save inference outputs inference_output = os.path.join(args.save_path, args.save_name) save_array = [] for i in range(len(infer_out)): save_array.append(infer_out[str(i)]["emb"]) np_array = np.array(save_array) np.save(inference_output, np_array) return
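The distributed merge in infer() blocks on per-device .finish marker files by shelling out to find. A hedged equivalent of that polling step using glob (a simplification of the snippet's loop, which also re-reads the part files once all markers exist):

import glob
import os
import time

def wait_for_part_files(save_path, dev_count, pattern="inference_output.part_*.finish"):
    # Poll until every device on this node has written its marker file.
    while len(glob.glob(os.path.join(save_path, pattern))) < dev_count:
        time.sleep(1)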