def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader,
                          train_fetch_list, train_metrics, epochs=10,
                          log_interval=0, valid_interval=0, save_dir='./',
                          save_model_name='model', fix_random_seed=False,
                          compiled_test_prog=None, test_dataloader=None,
                          test_fetch_list=None, test_metrics=None,
                          is_profiler=None, profiler_path=None):
    if not train_dataloader:
        logger.error("[TRAIN] get dataloader failed.")
    epoch_periods = []
    train_loss = 0
    for epoch in range(epochs):
        log_lr_and_step()

        train_iter = 0
        epoch_periods = []

        for data in train_dataloader():
            cur_time = time.time()
            train_outs = exe.run(compiled_train_prog,
                                 fetch_list=train_fetch_list,
                                 feed=data)
            period = time.time() - cur_time
            epoch_periods.append(period)
            if log_interval > 0 and (train_iter % log_interval == 0):
                train_metrics.calculate_and_log_out(
                    train_outs,
                    info='[TRAIN] Epoch {}, iter {} '.format(epoch, train_iter))
            train_iter += 1

            # NOTE: profiler tools, used for benchmark
            if is_profiler and epoch == 0 and train_iter == log_interval:
                profiler.start_profiler("All")
            elif is_profiler and epoch == 0 and train_iter == log_interval + 5:
                profiler.stop_profiler("total", profiler_path)
                return

        if len(epoch_periods) < 1:
            logger.info(
                'No iteration was executed, please check the data reader')
            sys.exit(1)

        logger.info(
            '[TRAIN] Epoch {} training finished, average time: {}'.format(
                epoch, np.mean(epoch_periods[1:])))
        save_model(exe, train_prog, save_dir, save_model_name,
                   "_epoch{}".format(epoch))
        if compiled_test_prog and valid_interval > 0 and (
                epoch + 1) % valid_interval == 0:
            test_with_dataloader(exe, compiled_test_prog, test_dataloader,
                                 test_fetch_list, test_metrics, log_interval,
                                 save_model_name)

    save_model(exe, train_prog, save_dir, save_model_name)
    # when fix_random_seed is set for debugging
    if fix_random_seed:
        cards = os.environ.get('CUDA_VISIBLE_DEVICES')
        gpu_num = len(cards.split(","))
        print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, train_loss))
        print("kpis\ttrain_speed_card{}\t{}".format(gpu_num,
                                                    np.mean(epoch_periods)))
def train_prog(exe, program, loss, node2vec_pyreader, args, train_steps):
    step = 0
    node2vec_pyreader.start()
    profiler.start_profiler("All")
    while True:
        try:
            begin_time = time.time()
            loss_val = exe.run(program, fetch_list=[loss])
            log.info("step %s: loss %.5f speed: %.5f s/step" %
                     (step, np.mean(loss_val), time.time() - begin_time))
            step += 1
        except F.core.EOFException:
            node2vec_pyreader.reset()

        if step % args.steps_per_save == 0 or step == train_steps:
            profiler.stop_profiler("total", "/tmp/profile")
            model_save_dir = args.save_path
            model_path = os.path.join(model_save_dir, str(step))
            if not os.path.exists(model_save_dir):
                os.makedirs(model_save_dir)
            #fleet.save_persistables(exe, model_path)
            F.io.save_params(exe, dirname=model_path, main_program=program)

        if step == train_steps:
            break
def infer(place, save_dirname=None, trans=False):
    if save_dirname is None:
        return

    exe = fluid.Executor(place)
    inference_scope = fluid.core.Scope()
    with fluid.scope_guard(inference_scope):
        # Use fluid.io.load_inference_model to obtain the inference program desc,
        # the feed_target_names (the names of variables that will be fed
        # data using feed operators), and the fetch_targets (variables that
        # we want to obtain data from using fetch operators).
        [inference_program, feed_target_names,
         fetch_targets] = fluid.io.load_inference_model(
             save_dirname, exe, model_filename='model', params_filename='params')

        assert feed_target_names[0] == 'data'
        #assert feed_target_names[1] == 'label'
        print(feed_target_names)
        print(fetch_targets)

        if trans:
            inference_transpiler_program = inference_program.clone()
            t = fluid.transpiler.InferenceTranspiler()
            t.transpile(inference_transpiler_program, place)
            prog = inference_transpiler_program
        else:
            prog = inference_program
        """
        for block in inference_program.blocks:
            for op in block.ops:
                print(op.type)
        print("----------------")
        for block in inference_transpiler_program.blocks:
            for op in block.ops:
                print(op.type)
        print(debugger.pprint_program_codes(inference_program))
        print("----------------")
        print(debugger.pprint_program_codes(inference_transpiler_program))
        exit()
        """
        # Run nine warm-up batches, then profile the tenth.
        for i in range(10):
            img_data = np.random.random([1, 3, 224, 224]).astype('float32')
            if i == 9:
                profiler.start_profiler("All")
            exe.run(prog,
                    feed={feed_target_names[0]: img_data},
                    fetch_list=fetch_targets)
            if i == 9:
                profiler.stop_profiler("total", "/tmp/profile")
def train_loop_pyreader():
    py_reader.start()
    train_stats = TrainingStats(cfg.log_window, keys)
    try:
        start_time = time.time()
        prev_start_time = start_time
        for iter_id in range(cfg.max_iter):
            prev_start_time = start_time
            start_time = time.time()
            outs = train_exe.run(fetch_list=[v.name for v in fetch_list])
            stats = {
                k: np.array(v).mean()
                for k, v in zip(keys, outs[:-1])
            }
            train_stats.update(stats)
            logs = train_stats.log()
            strs = '{}, iter: {}, lr: {:.5f}, {}, time: {:.3f}'.format(
                now_time(), iter_id,
                np.mean(outs[-1]), logs, start_time - prev_start_time)
            print(strs)
            sys.stdout.flush()
            if (iter_id + 1) % cfg.TRAIN.snapshot_iter == 0:
                save_model("model_iter{}".format(iter_id))
            # profiler tools, used for benchmark
            if args.is_profiler and iter_id == 10:
                profiler.start_profiler("All")
            elif args.is_profiler and iter_id == 15:
                profiler.stop_profiler("total", args.profiler_path)
                return

        end_time = time.time()
        total_time = end_time - start_time
        last_loss = np.array(outs[0]).mean()
        if cfg.enable_ce:
            gpu_num = devices_num
            epoch_idx = iter_id + 1
            loss = last_loss
            print("kpis\teach_pass_duration_card%s\t%s" %
                  (gpu_num, total_time / epoch_idx))
            print("kpis\ttrain_loss_card%s\t%s" % (gpu_num, loss))
    except (StopIteration, fluid.core.EOFException):
        py_reader.reset()
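# The loops above all repeat the same hand-rolled profiling window: call
# profiler.start_profiler("All") at some iteration and
# profiler.stop_profiler("total", path) a few iterations later, then return.
# Below is a minimal, hedged sketch of that shared pattern factored into a
# helper. The name maybe_profile and its arguments are illustrative and do
# not come from the original scripts; the sketch only assumes the legacy
# paddle.fluid.profiler start/stop API already used throughout this section.
import paddle.fluid.profiler as profiler


def maybe_profile(iter_id, start_iter, stop_iter, profiler_path="/tmp/profile"):
    """Profile iterations [start_iter, stop_iter) and dump a summary.

    Returns True once profiling has been stopped, so the caller can return
    or break the way the training loops above do.
    """
    if iter_id == start_iter:
        # "All" records both CPU and GPU (CUDA kernel) events.
        profiler.start_profiler("All")
    elif iter_id == stop_iter:
        # Sort the report by total op time and write it to profiler_path.
        profiler.stop_profiler("total", profiler_path)
        return True
    return False


# Usage inside one of the loops above would look roughly like:
#     if args.is_profiler and maybe_profile(iter_id, 10, 15, args.profiler_path):
#         return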
def train(args): """Train model Args: args: all arguments. """ startup_prog = fluid.Program() train_prog = fluid.Program() train_out = build_program( is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args) train_data_loader = train_out[-1] if args.use_ema: train_fetch_vars = train_out[:-2] ema = train_out[-2] else: train_fetch_vars = train_out[:-1] train_fetch_list = [var.name for var in train_fetch_vars] if args.validate: test_prog = fluid.Program() test_out = build_program( is_train=False, main_prog=test_prog, startup_prog=startup_prog, args=args) test_data_loader = test_out[-1] test_fetch_vars = test_out[:-1] test_fetch_list = [var.name for var in test_fetch_vars] #Create test_prog and set layers' is_test params to True test_prog = test_prog.clone(for_test=True) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) #init model by checkpoint or pretrianed model. init_model(exe, args, train_prog) num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) if args.use_dali: import dali train_iter = dali.train(settings=args) if trainer_id == 0: test_iter = dali.val(settings=args) else: imagenet_reader = reader.ImageNetReader(0 if num_trainers > 1 else None) train_reader = imagenet_reader.train(settings=args) if args.use_gpu: if num_trainers <= 1: places = fluid.framework.cuda_places() else: places = place else: if num_trainers <= 1: places = fluid.framework.cpu_places() else: places = place train_data_loader.set_sample_list_generator(train_reader, places) if args.validate: test_reader = imagenet_reader.val(settings=args) test_data_loader.set_sample_list_generator(test_reader, places) compiled_train_prog = best_strategy_compiled(args, train_prog, train_fetch_vars[0], exe) #NOTE: this for benchmark total_batch_num = 0 for pass_id in range(args.num_epochs): if num_trainers > 1 and not args.use_dali: imagenet_reader.set_shuffle_seed(pass_id + ( args.random_seed if args.random_seed else 0)) train_batch_id = 0 train_batch_time_record = [] train_batch_metrics_record = [] if not args.use_dali: train_iter = train_data_loader() if args.validate: test_iter = test_data_loader() t1 = time.time() for batch in train_iter: #NOTE: this is for benchmark if args.max_iter and total_batch_num == args.max_iter: return train_batch_metrics = exe.run(compiled_train_prog, feed=batch, fetch_list=train_fetch_list) t2 = time.time() train_batch_elapse = t2 - t1 train_batch_time_record.append(train_batch_elapse) train_batch_metrics_avg = np.mean( np.array(train_batch_metrics), axis=1) train_batch_metrics_record.append(train_batch_metrics_avg) if trainer_id == 0: print_info("batch", train_batch_metrics_avg, train_batch_elapse, pass_id, train_batch_id, args.print_step) sys.stdout.flush() train_batch_id += 1 t1 = time.time() #NOTE: this for benchmark profiler total_batch_num = total_batch_num + 1 if args.is_profiler and pass_id == 0 and train_batch_id == args.print_step: profiler.start_profiler("All") elif args.is_profiler and pass_id == 0 and train_batch_id == args.print_step + 5: profiler.stop_profiler("total", args.profiler_path) return if args.use_dali: train_iter.reset() if trainer_id == 0 and args.validate: if args.use_ema: logger.info('ExponentialMovingAverage validate start...') with ema.apply(exe): validate(args, test_iter, exe, test_prog, test_fetch_list, pass_id, train_batch_metrics_record, compiled_train_prog) 
logger.info('ExponentialMovingAverage validate over!') validate(args, test_iter, exe, test_prog, test_fetch_list, pass_id, train_batch_metrics_record, train_batch_time_record, compiled_train_prog) if args.use_dali: test_iter.reset() if pass_id % args.save_step == 0: save_model(args, exe, train_prog, pass_id)
def train(): # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) if cfg.debug or args.enable_ce: fluid.default_startup_program().random_seed = 1000 fluid.default_main_program().random_seed = 1000 random.seed(0) np.random.seed(0) if not os.path.exists(cfg.model_save_dir): os.makedirs(cfg.model_save_dir) model = YOLOv3() model.build_model() input_size = cfg.input_size loss = model.loss() loss.persistable = True devices_num = get_device_num() if cfg.use_gpu else 1 print("Found {} CUDA/CPU devices.".format(devices_num)) learning_rate = cfg.learning_rate boundaries = cfg.lr_steps gamma = cfg.lr_gamma step_num = len(cfg.lr_steps) values = [learning_rate * (gamma**i) for i in range(step_num + 1)] optimizer = fluid.optimizer.Momentum( learning_rate=exponential_with_warmup_decay( learning_rate=learning_rate, boundaries=boundaries, values=values, warmup_iter=cfg.warm_up_iter, warmup_factor=cfg.warm_up_factor), regularization=fluid.regularizer.L2Decay(cfg.weight_decay), momentum=cfg.momentum) optimizer.minimize(loss) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) if cfg.pretrain: if not os.path.exists(cfg.pretrain): print("Pretrain weights not found: {}".format(cfg.pretrain)) def if_exist(var): return os.path.exists(os.path.join(cfg.pretrain, var.name)) fluid.io.load_vars(exe, cfg.pretrain, predicate=if_exist) build_strategy = fluid.BuildStrategy() build_strategy.memory_optimize = False #gc and memory optimize may conflict syncbn = cfg.syncbn if (syncbn and devices_num <= 1) or num_trainers > 1: print("Disable syncbn in single device") syncbn = False build_strategy.sync_batch_norm = syncbn exec_strategy = fluid.ExecutionStrategy() if cfg.use_gpu and num_trainers > 1: dist_utils.prepare_for_multi_process(exe, build_strategy, fluid.default_main_program()) exec_strategy.num_threads = 1 compile_program = fluid.compiler.CompiledProgram( fluid.default_main_program()).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) random_sizes = [cfg.input_size] if cfg.random_shape: random_sizes = [32 * i for i in range(10, 20)] total_iter = cfg.max_iter - cfg.start_iter mixup_iter = total_iter - cfg.no_mixup_iter shuffle = True if args.enable_ce: shuffle = False shuffle_seed = None # NOTE: yolov3 is a special model, if num_trainers > 1, each process # trian the completed dataset. 
# if num_trainers > 1: shuffle_seed = 1 train_reader = reader.train( input_size, batch_size=cfg.batch_size, shuffle=shuffle, shuffle_seed=shuffle_seed, total_iter=total_iter * devices_num, mixup_iter=mixup_iter * devices_num, random_sizes=random_sizes, use_multiprocess_reader=cfg.use_multiprocess_reader, num_workers=cfg.worker_num) py_reader = model.py_reader py_reader.decorate_paddle_reader(train_reader) def save_model(postfix): model_path = os.path.join(cfg.model_save_dir, postfix) if os.path.isdir(model_path): shutil.rmtree(model_path) fluid.io.save_persistables(exe, model_path) fetch_list = [loss] py_reader.start() smoothed_loss = SmoothedValue() try: start_time = time.time() prev_start_time = start_time snapshot_loss = 0 snapshot_time = 0 for iter_id in range(cfg.start_iter, cfg.max_iter): prev_start_time = start_time start_time = time.time() losses = exe.run(compile_program, fetch_list=[v.name for v in fetch_list]) smoothed_loss.add_value(np.mean(np.array(losses[0]))) snapshot_loss += np.mean(np.array(losses[0])) snapshot_time += start_time - prev_start_time lr = np.array( fluid.global_scope().find_var('learning_rate').get_tensor()) print("Iter {:d}, lr {:.6f}, loss {:.6f}, time {:.5f}".format( iter_id, lr[0], smoothed_loss.get_mean_value(), start_time - prev_start_time)) sys.stdout.flush() #add profiler tools if args.is_profiler and iter_id == 5: profiler.start_profiler("All") elif args.is_profiler and iter_id == 10: profiler.stop_profiler("total", args.profiler_path) return if (iter_id + 1) % cfg.snapshot_iter == 0: save_model("model_iter{}".format(iter_id)) print("Snapshot {} saved, average loss: {}, \ average time: {}".format( iter_id + 1, snapshot_loss / float(cfg.snapshot_iter), snapshot_time / float(cfg.snapshot_iter))) if args.enable_ce and iter_id == cfg.max_iter - 1: if devices_num == 1: print("kpis\ttrain_cost_1card\t%f" % (snapshot_loss / float(cfg.snapshot_iter))) print("kpis\ttrain_duration_1card\t%f" % (snapshot_time / float(cfg.snapshot_iter))) else: print("kpis\ttrain_cost_8card\t%f" % (snapshot_loss / float(cfg.snapshot_iter))) print("kpis\ttrain_duration_8card\t%f" % (snapshot_time / float(cfg.snapshot_iter))) snapshot_loss = 0 snapshot_time = 0 except fluid.core.EOFException: py_reader.reset() save_model('model_final')
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader,
                   batch_acc, args, train_prog, startup_prog, nccl_id_var,
                   num_trainers, trainer_id):
    feed_var_list = [
        var for var in train_prog.global_block().vars.itervalues()
        if var.is_data
    ]
    # generate fake data:
    if args.use_fake_data:
        for var in feed_var_list:
            v = startup_prog.global_block().clone_variable(var)
            var.persistable = True
            v.persistable = True

            real_shape = list(var.shape)
            real_shape[0] = args.batch_size / args.gpus
            startup_prog.global_block().append_op(
                outputs={"Out": v},
                type="fill_constant",
                attrs={"shape": real_shape,
                       "value": 1.0,
                       "dtype": var.dtype})

    place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0)
    if nccl_id_var and trainer_id == 0:
        #FIXME(wuyi): wait other trainer to start listening
        time.sleep(30)

    startup_exe = fluid.Executor(place)
    startup_exe.run(startup_prog)
    strategy = fluid.ExecutionStrategy()
    strategy.num_threads = 1
    strategy.allow_op_delay = False
    exe = fluid.ParallelExecutor(
        True,
        avg_loss.name,
        exec_strategy=strategy,
        num_trainers=num_trainers,
        trainer_id=trainer_id)
    feeder = fluid.DataFeeder(feed_var_list, place)

    for pass_id in range(args.pass_num):
        num_samples = 0
        iters = 0
        start_time = time.time()
        for batch_id, data in enumerate(train_reader()):
            if args.profile and pass_id == 0 and batch_id == 5:
                profiler.start_profiler("All")
            elif args.profile and pass_id == 0 and batch_id == 10:
                profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id)

            if iters == args.skip_batch_num:
                start_time = time.time()
                num_samples = 0
            if iters == args.iterations:
                break
            if args.use_fake_data:
                loss, = exe.run([avg_loss.name])
            else:
                loss, = exe.run([avg_loss.name], feed=feeder.feed(data))
            if args.update_method == "pserver":
                exe.bcast_params()
            num_samples += len(data)
            iters += 1
            if batch_id % 1 == 0:
                print("Pass %d, batch %d, loss %s" %
                      (pass_id, batch_id, np.array(loss)))

        train_elapsed = time.time() - start_time
        examples_per_sec = num_samples / train_elapsed
        print('\nTotal examples: %d, total time: %.5f, %.5f examples/sec\n' %
              (num_samples, train_elapsed, examples_per_sec))
        if not args.no_test and batch_acc is not None:
            test_acc = test(startup_exe, infer_prog, test_reader, feeder,
                            batch_acc)
            print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc))
    exit(0)
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, batch_size_tensor, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() if var.is_data ] # generate fake: if args.use_fake_data: for var in feed_var_list: v = startup_prog.global_block().clone_variable(var) var.persistable = True v.persistable = True real_shape = list(var.shape) real_shape[0] = args.batch_size / args.gpus startup_prog.global_block().append_op(outputs={"Out": v}, type="fill_constant", attrs={ "shape": real_shape, "value": 1.0, "dtype": var.dtype }) place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) if nccl_id_var and trainer_id == 0: #FIXME(wuyi): wait other trainer to start listening time.sleep(30) startup_exe = fluid.Executor(place) startup_exe.run(startup_prog) strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 strategy.allow_op_delay = False exe = fluid.ParallelExecutor(True, avg_loss.name, exec_strategy=strategy, num_trainers=num_trainers, trainer_id=trainer_id) feeder = fluid.DataFeeder(feed_var_list, place) acc_4passes = None converge_speed = None accuracy_evaluator = fluid.metrics.Accuracy() fetch_list = [avg_loss.name] if batch_acc is not None: fetch_list.append(batch_acc.name) start_time = time.time() for pass_id in range(args.pass_num): num_samples = 0 iters = 0 pass_start_time = time.time() accuracy_evaluator.reset() for batch_id, data in enumerate(train_reader()): if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") elif args.profile and pass_id == 0 and batch_id == 10: profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id) if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 if iters == args.iterations: break if args.use_fake_data: outs = exe.run(fetch_list) else: outs = exe.run(fetch_list, feed=feeder.feed(data)) if args.update_method == "pserver": exe.bcast_params() num_samples += len(data) iters += 1 if batch_acc is not None: acc = np.mean(outs[1]).item() accuracy_evaluator.update(value=acc, weight=len(data)) else: acc = None if batch_id % 1 == 0: print("Pass %d, batch %d, loss %s, acc %s" % (pass_id, batch_id, np.mean(outs[0]), str(acc))) if converge_speed is None and args.acc_target and acc >= args.acc_target: converge_speed = time.time() - start_time print("converge_speed set with %f" % converge_speed) pass_elapsed = time.time() - pass_start_time examples_per_sec = num_samples / pass_elapsed if batch_acc is not None: pass_train_acc = accuracy_evaluator.eval() else: pass_train_acc = None if pass_id == 4 and batch_acc is not None: print("acc_4passes set with %f" % pass_train_acc) acc_4passes = float(pass_train_acc) output_metric_data(pass_id, examples_per_sec, pass_train_acc, acc_4passes, converge_speed) if not args.no_test and batch_acc != None: test_acc = test(startup_exe, infer_prog, test_reader, feeder, batch_acc) print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) exit(0)
def train(cfg): startup_prog = fluid.Program() train_prog = fluid.Program() test_prog = fluid.Program() if args.enable_ce: startup_prog.random_seed = 1000 train_prog.random_seed = 1000 drop_last = True dataset = SegDataset( file_list=cfg.DATASET.TRAIN_FILE_LIST, mode=ModelPhase.TRAIN, shuffle=True, data_dir=cfg.DATASET.DATA_DIR) def data_generator(): if args.use_mpio: data_gen = dataset.multiprocess_generator( num_processes=cfg.DATALOADER.NUM_WORKERS, max_queue_size=cfg.DATALOADER.BUF_SIZE) else: data_gen = dataset.generator() batch_data = [] for b in data_gen: batch_data.append(b) if len(batch_data) == (cfg.BATCH_SIZE // cfg.NUM_TRAINERS): for item in batch_data: yield item[0], item[1], item[2] batch_data = [] # If use sync batch norm strategy, drop last batch if number of samples # in batch_data is less then cfg.BATCH_SIZE to avoid NCCL hang issues if not cfg.TRAIN.SYNC_BATCH_NORM: for item in batch_data: yield item[0], item[1], item[2] # Get device environment gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() places = fluid.cuda_places() if args.use_gpu else fluid.cpu_places() # Get number of GPU dev_count = cfg.NUM_TRAINERS if cfg.NUM_TRAINERS > 1 else len(places) print_info("#Device count: {}".format(dev_count)) # Make sure BATCH_SIZE can divided by GPU cards assert cfg.BATCH_SIZE % dev_count == 0, ( 'BATCH_SIZE:{} not divisble by number of GPUs:{}'.format( cfg.BATCH_SIZE, dev_count)) # If use multi-gpu training mode, batch data will allocated to each GPU evenly batch_size_per_dev = cfg.BATCH_SIZE // dev_count print_info("batch_size_per_dev: {}".format(batch_size_per_dev)) data_loader, avg_loss, lr, pred, grts, masks = build_model( train_prog, startup_prog, phase=ModelPhase.TRAIN) build_model(test_prog, fluid.Program(), phase=ModelPhase.EVAL) data_loader.set_sample_generator( data_generator, batch_size=batch_size_per_dev, drop_last=drop_last) exe = fluid.Executor(place) exe.run(startup_prog) exec_strategy = fluid.ExecutionStrategy() # Clear temporary variables every 100 iteration if args.use_gpu: exec_strategy.num_threads = fluid.core.get_cuda_device_count() exec_strategy.num_iteration_per_drop_scope = 100 build_strategy = fluid.BuildStrategy() if cfg.NUM_TRAINERS > 1 and args.use_gpu: dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog) exec_strategy.num_threads = 1 if cfg.TRAIN.SYNC_BATCH_NORM and args.use_gpu: if dev_count > 1: # Apply sync batch norm strategy print_info("Sync BatchNorm strategy is effective.") build_strategy.sync_batch_norm = True else: print_info( "Sync BatchNorm strategy will not be effective if GPU device" " count <= 1") compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=avg_loss.name, exec_strategy=exec_strategy, build_strategy=build_strategy) # Resume training begin_epoch = cfg.SOLVER.BEGIN_EPOCH if cfg.TRAIN.RESUME_MODEL_DIR: begin_epoch = load_checkpoint(exe, train_prog) # Load pretrained model elif os.path.exists(cfg.TRAIN.PRETRAINED_MODEL_DIR): load_pretrained_weights(exe, train_prog, cfg.TRAIN.PRETRAINED_MODEL_DIR) else: print_info( 'Pretrained model dir {} not exists, training from scratch...'. 
format(cfg.TRAIN.PRETRAINED_MODEL_DIR)) fetch_list = [avg_loss.name, lr.name] if args.debug: # Fetch more variable info and use streaming confusion matrix to # calculate IoU results if in debug mode np.set_printoptions( precision=4, suppress=True, linewidth=160, floatmode="fixed") fetch_list.extend([pred.name, grts.name, masks.name]) cm = ConfusionMatrix(cfg.DATASET.NUM_CLASSES, streaming=True) if args.use_vdl: if not args.vdl_log_dir: print_info("Please specify the log directory by --vdl_log_dir.") exit(1) from visualdl import LogWriter log_writer = LogWriter(args.vdl_log_dir) # trainer_id = int(os.getenv("PADDLE_TRAINER_ID", 0)) # num_trainers = int(os.environ.get('PADDLE_TRAINERS_NUM', 1)) step = 0 all_step = cfg.DATASET.TRAIN_TOTAL_IMAGES // cfg.BATCH_SIZE if cfg.DATASET.TRAIN_TOTAL_IMAGES % cfg.BATCH_SIZE and drop_last != True: all_step += 1 all_step *= (cfg.SOLVER.NUM_EPOCHS - begin_epoch + 1) avg_loss = 0.0 best_mIoU = 0.0 timer = Timer() timer.start() if begin_epoch > cfg.SOLVER.NUM_EPOCHS: raise ValueError( ("begin epoch[{}] is larger than cfg.SOLVER.NUM_EPOCHS[{}]").format( begin_epoch, cfg.SOLVER.NUM_EPOCHS)) if args.use_mpio: print_info("Use multiprocess reader") else: print_info("Use multi-thread reader") for epoch in range(begin_epoch, cfg.SOLVER.NUM_EPOCHS + 1): data_loader.start() while True: try: if args.debug: # Print category IoU and accuracy to check whether the # traning process is corresponed to expectation loss, lr, pred, grts, masks = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) cm.calculate(pred, grts, masks) avg_loss += np.mean(np.array(loss)) step += 1 if step % args.log_steps == 0: speed = args.log_steps / timer.elapsed_time() avg_loss /= args.log_steps category_acc, mean_acc = cm.accuracy() category_iou, mean_iou = cm.mean_iou() print_info(( "epoch={} step={} lr={:.5f} loss={:.4f} acc={:.5f} mIoU={:.5f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, mean_acc, mean_iou, speed, calculate_eta(all_step - step, speed))) print_info("Category IoU: ", category_iou) print_info("Category Acc: ", category_acc) if args.use_vdl: log_writer.add_scalar('Train/mean_iou', mean_iou, step) log_writer.add_scalar('Train/mean_acc', mean_acc, step) log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/step/sec', speed, step) sys.stdout.flush() avg_loss = 0.0 cm.zero_matrix() timer.restart() else: # If not in debug mode, avoid unnessary log and calculate loss, lr = exe.run( program=compiled_train_prog, fetch_list=fetch_list, return_numpy=True) avg_loss += np.mean(np.array(loss)) step += 1 if step % args.log_steps == 0 and cfg.TRAINER_ID == 0: avg_loss /= args.log_steps speed = args.log_steps / timer.elapsed_time() print(( "epoch={} step={} lr={:.5f} loss={:.4f} step/sec={:.3f} | ETA {}" ).format(epoch, step, lr[0], avg_loss, speed, calculate_eta(all_step - step, speed))) if args.use_vdl: log_writer.add_scalar('Train/loss', avg_loss, step) log_writer.add_scalar('Train/lr', lr[0], step) log_writer.add_scalar('Train/speed', speed, step) sys.stdout.flush() avg_loss = 0.0 timer.restart() # NOTE : used for benchmark, profiler tools if args.is_profiler and epoch == 1 and step == args.log_steps: profiler.start_profiler("All") elif args.is_profiler and epoch == 1 and step == args.log_steps + 5: profiler.stop_profiler("total", args.profiler_path) return except fluid.core.EOFException: data_loader.reset() break except Exception as e: print(e) if (epoch % 
cfg.TRAIN.SNAPSHOT_EPOCH == 0 or epoch == cfg.SOLVER.NUM_EPOCHS) and cfg.TRAINER_ID == 0: ckpt_dir = save_checkpoint(train_prog, epoch) save_infer_program(test_prog, ckpt_dir) if args.do_eval: print("Evaluation start") _, mean_iou, _, mean_acc = evaluate( cfg=cfg, ckpt_dir=ckpt_dir, use_gpu=args.use_gpu, use_mpio=args.use_mpio) if args.use_vdl: log_writer.add_scalar('Evaluate/mean_iou', mean_iou, step) log_writer.add_scalar('Evaluate/mean_acc', mean_acc, step) if mean_iou > best_mIoU: best_mIoU = mean_iou update_best_model(ckpt_dir) print_info("Save best model {} to {}, mIoU = {:.4f}".format( ckpt_dir, os.path.join(cfg.TRAIN.MODEL_SAVE_DIR, 'best_model'), mean_iou)) # Use VisualDL to visualize results if args.use_vdl and cfg.DATASET.VIS_FILE_LIST is not None: visualize( cfg=cfg, use_gpu=args.use_gpu, vis_file_list=cfg.DATASET.VIS_FILE_LIST, vis_dir="visual", ckpt_dir=ckpt_dir, log_writer=log_writer) # save final model if cfg.TRAINER_ID == 0: ckpt_dir = save_checkpoint(train_prog, 'final') save_infer_program(test_prog, ckpt_dir)
def train(args): # parameters from arguments model_name = args.model checkpoint = args.checkpoint pretrained_model = args.pretrained_model with_memory_optimization = args.with_mem_opt model_save_dir = args.model_save_dir startup_prog = fluid.Program() train_prog = fluid.Program() test_prog = fluid.Program() if args.enable_ce: startup_prog.random_seed = 1000 train_prog.random_seed = 1000 #train_py_reader, train_cost, train_acc1, train_acc5 = build_program( infer_prog, train_out, train_cost, train_acc1, train_acc5 = build_program( is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args) #test_py_reader, test_cost, test_acc1, test_acc5 = build_program( test_cost, test_acc1, test_acc5 = build_program( is_train=False, main_prog=test_prog, startup_prog=startup_prog, args=args) test_prog = test_prog.clone(for_test=True) if with_memory_optimization: fluid.memory_optimize(train_prog) fluid.memory_optimize(test_prog) """ print("-------------------------------------") for block in train_prog.blocks: for op in block.ops: print("op_train: ", op.type) print("-------------------------------------") for block in test_prog.blocks: for op in block.ops: print("op_infer: ", op.type) exit() """ #place = fluid.CUDAPlace(0) if args.use_gpu else fluid.CPUPlace() #place = fluid.XSIMPlace() #place = fluid.XCPUPlace() if args.place == "cuda": place = fluid.CUDAPlace(0) elif args.place == "xsim": place = fluid.XSIMPlace() elif args.place == "xpu": place = fluid.XPUPlace() else: print("Unsurpported place!") exit() exe = fluid.Executor(place) print("Run startup...") exe.run(startup_prog) train_fetch_list = [train_cost.name, train_acc1.name, train_acc5.name] if (args.run_mode == "train"): prog = train_prog elif (args.run_mode == "infer"): prog = test_prog elif (args.run_mode == "fused_infer"): print("Transpiling...") inference_transpiler_program = test_prog.clone() t = fluid.transpiler.InferenceXPUTranspiler() config = { "use_fake_max": True, "conv_weight_type": args.precision, "fc_weight_type": args.precision, "fc_pretrans_a": False, "fc_pretrans_b": True, "batch_size": args.batch_size } t.transpile_xpu(inference_transpiler_program, place, config) prog = inference_transpiler_program else: print("bad run_mode: ", args.run_mode) exit() print("Running...") img_data = np.random.random([args.batch_size, 3, 224, 224]).astype('float32') y_data = np.random.random([args.batch_size, 1]).astype('int64') if args.place == "cuda": # warm up loss, acc1, acc5 = exe.run(prog, feed={"data": img_data, "label": y_data}, fetch_list=train_fetch_list) profiler.start_profiler("All") loss, acc1, acc5 = exe.run(prog, feed={"data": img_data, "label": y_data}, fetch_list=train_fetch_list) if args.place == "cuda": profiler.stop_profiler("total", "/tmp/profile")
def do_train(args): # Initialize the paddle and paddle fleet execute enviroment paddle.enable_static() place = paddle.set_device(args.select_device) fleet.init(is_collective=True) # paddle.distributed.init_parallel_env() worker_num = fleet.worker_num() worker_index = fleet.worker_index() # Create the random seed for the worker set_seed(args.seed) # worker_init = WorkerInitObj(args.seed + worker_index) worker_init = WorkerInitObj(args.seed) tracker = get_rng_state_tracker() tracker.add('global_seed', args.seed) tracker.add('local_seed', args.seed + worker_index + 2021) # Define the input data in the static mode main_program = paddle.static.default_main_program() startup_program = paddle.static.default_startup_program() data_holders = create_data_holder(args) [ input_ids, segment_ids, input_mask, masked_lm_positions, masked_lm_labels, next_sentence_labels, masked_lm_scale ] = data_holders # Define the model structure in static mode args.model_type = args.model_type.lower() model_class, tokenizer_class = MODEL_CLASSES[args.model_type] tokenizer = tokenizer_class.from_pretrained(args.model_name_or_path) config = model_class.pretrained_init_configuration[args.model_name_or_path] if config["vocab_size"] % 8 != 0: config["vocab_size"] += 8 - (config["vocab_size"] % 8) config['num_partitions'] = args.num_partitions model = BertForPretraining(BertModel(**config), args.num_partitions) criterion = BertPretrainingCriterion(model.bert.config["vocab_size"]) prediction_scores, seq_relationship_score = model( input_ids=input_ids, token_type_ids=segment_ids, attention_mask=input_mask, masked_positions=masked_lm_positions) loss = criterion(prediction_scores, seq_relationship_score, masked_lm_labels, next_sentence_labels, masked_lm_scale) # Define the dynamic learing_reate scheduler and optimizer lr_scheduler = paddle.optimizer.lr.LambdaDecay( args.learning_rate, lambda current_step, num_warmup_steps=args.warmup_steps, num_training_steps=args.max_steps if args.max_steps > 0 else (len(train_data_loader) * args.num_train_epochs): float( current_step) / float(max(1, num_warmup_steps)) if current_step < num_warmup_steps else max( 0.0, float(num_training_steps - current_step) / float( max(1, num_training_steps - num_warmup_steps)))) optimizer = paddle.optimizer.AdamW( learning_rate=lr_scheduler, epsilon=args.adam_epsilon, parameters=model.parameters(), weight_decay=args.weight_decay, apply_decay_param_fun=lambda x: x in [ p.name for n, p in model.named_parameters() if not any(nd in n for nd in ["bias", "norm"]) ]) # if worker_num == 1 and args.use_amp: # amp_list = paddle.fluid.contrib.mixed_precision.AutoMixedPrecisionLists( # custom_white_list=['softmax', 'layer_norm', 'gelu']) # optimizer = paddle.fluid.contrib.mixed_precision.decorate( # optimizer, # amp_list, # init_loss_scaling=args.scale_loss, # use_dynamic_loss_scaling=True) if fleet.worker_num() > 1: # Use the fleet api to compile the distributed optimizer optimizer = dist_optimizer(args, optimizer) optimizer.minimize(loss) # Define the Executor for running the static model exe = paddle.static.Executor(place) exe.run(startup_program) # state_dict = model.state_dict() # Use the state dict to update the parameter # reset_state_dict = reset_program_state_dict(model, state_dict) # paddle.static.set_program_state(main_program, reset_state_dict) # if worker_num == 1: # # Construct the compiled program # main_program = build_compiled_program(main_program, loss) main_program._graph = None if fleet.worker_index() == 0: with open('startup_%d' % fleet.worker_num(), 
'w') as f: f.writelines(str(startup_program)) with open('main_%d' % fleet.worker_num(), 'w') as f: f.writelines(str(main_program)) pool = ThreadPoolExecutor(1) global_step = 0 tic_train = time.time() epoch = 0 while True: files = [ os.path.join(args.input_dir, f) for f in os.listdir(args.input_dir) if os.path.isfile(os.path.join(args.input_dir, f)) and "training" in f ] files.sort() num_files = len(files) random.Random(args.seed + epoch).shuffle(files) f_start_id = 0 # Select one file for each worker and create the DataLoader for the file data_file = select_dataset_file_for_each_worker( files, f_start_id, 1, 0) #files, f_start_id, worker_num, worker_index) train_data_loader, _ = create_pretraining_dataset( data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) for f_id in range(f_start_id + 1, len(files)): data_file = select_dataset_file_for_each_worker(files, f_id, 1, 0) # files, f_id, worker_num, worker_index) dataset_future = pool.submit(create_pretraining_dataset, data_file, args.max_predictions_per_seq, args, data_holders, worker_init, paddle.static.cuda_places()) for step, batch in enumerate(train_data_loader): global_step += 1 if step == 10 and worker_index == 0: profiler.start_profiler("All") if step == 20 and worker_index == 0: profiler.stop_profiler("total", "/tmp/profile") loss_return = exe.run(main_program, feed=batch, fetch_list=[loss]) # In the new 2.0 api, must call this function to change the learning_rate lr_scheduler.step() if global_step % args.logging_steps == 0: time_cost = time.time() - tic_train print( "global step %d, epoch: %d, batch: %d, loss: %f, speed: %.2f step/s, ips: %.2f sequences/s" % (global_step, epoch, step, loss_return[0], args.logging_steps / time_cost, args.logging_steps * args.batch_size / time_cost)) tic_train = time.time() if global_step % args.save_steps == 0: if worker_index == 0: output_dir = os.path.join(args.output_dir, "model_%d" % global_step) if not os.path.exists(output_dir): os.makedirs(output_dir) # TODO(fangzeyang): Udpate the save_params to paddle.static paddle.fluid.io.save_params(exe, output_dir) tokenizer.save_pretrained(output_dir) if global_step >= args.max_steps: del train_data_loader return del train_data_loader train_data_loader, data_file = dataset_future.result(timeout=None) epoch += 1
def train(args): # parameters from arguments model_name = args.model checkpoint = args.checkpoint pretrained_model = args.pretrained_model model_save_dir = args.model_save_dir use_mixup = args.use_mixup use_ngraph = os.getenv('FLAGS_use_ngraph') startup_prog = fluid.Program() train_prog = fluid.Program() test_prog = fluid.Program() exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = args.num_threads exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope dist_strategy = DistributedStrategy() dist_strategy.exec_strategy = exec_strategy dist_strategy.enable_inplace = args.with_inplace if not args.fuse: dist_strategy.fuse_all_reduce_ops = False dist_strategy.nccl_comm_num = args.nccl_comm_num dist_strategy.fuse_elewise_add_act_ops=args.fuse_elewise_add_act_ops role = role_maker.PaddleCloudRoleMaker(is_collective=True) fleet.init(role) b_out = build_program( is_train=True, main_prog=train_prog, startup_prog=startup_prog, args=args, dist_strategy=dist_strategy, data_layout=args.data_format) if use_mixup: train_data_loader, train_cost, global_lr = b_out[0], b_out[1], b_out[2] train_fetch_vars = [train_cost, global_lr] train_fetch_list = [] for var in train_fetch_vars: var.persistable=True train_fetch_list.append(var.name) else: train_data_loader, train_cost, train_acc1, train_acc5, global_lr = b_out[0],b_out[1],b_out[2],b_out[3],b_out[4] train_fetch_vars = [train_cost, train_acc1, train_acc5, global_lr] train_fetch_list = [] for var in train_fetch_vars: var.persistable=True train_fetch_list.append(var.name) train_prog = fleet.main_program b_out_test = build_program( is_train=False, main_prog=test_prog, startup_prog=startup_prog, args=args, dist_strategy=dist_strategy, data_layout=args.data_format) test_data_loader, test_cost, test_acc1, test_acc5 = b_out_test[0],b_out_test[1],b_out_test[2],b_out_test[3] test_prog = test_prog.clone(for_test=True) test_prog = compiler.CompiledProgram(test_prog).with_data_parallel(loss_name=test_cost.name, exec_strategy=exec_strategy) gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) if args.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(startup_prog) if checkpoint is not None: fluid.io.load_persistables(exe, checkpoint, main_program=train_prog) if pretrained_model: def if_exist(var): return os.path.exists(os.path.join(pretrained_model, var.name)) fluid.io.load_vars( exe, pretrained_model, main_program=train_prog, predicate=if_exist) if args.use_gpu: device_num = get_device_num() else: device_num = 1 train_batch_size = args.batch_size print("train_batch_size: %d device_num:%d" % (train_batch_size, device_num)) test_batch_size = args.batch_size # NOTE: the order of batch data generated by batch_reader # must be the same in the respective processes. 
shuffle_seed = 1 if num_trainers > 1 else None if args.use_dali: import dali train_iter = dali.train(settings=args, trainer_id=trainer_id, trainers_num=num_trainers, gpu_id=gpu_id, data_layout=args.data_format) else: train_reader = reader.train(settings=args, data_dir=args.data_dir, pass_id_as_seed=shuffle_seed, data_layout=args.data_format, threads=10) train_batch_reader=paddle.batch(train_reader, batch_size=train_batch_size) test_reader = reader.val(settings=args, data_dir=args.data_dir, data_layout=args.data_format, threads=10) test_batch_reader=paddle.batch(test_reader, batch_size=test_batch_size) places = place if num_trainers <= 1 and args.use_gpu: places = fluid.framework.cuda_places() train_data_loader.set_sample_list_generator(train_batch_reader, places) test_data_loader.set_sample_list_generator(test_batch_reader, place) test_fetch_vars = [test_cost, test_acc1, test_acc5] test_fetch_list = [] for var in test_fetch_vars: var.persistable=True test_fetch_list.append(var.name) train_exe = exe params = models.__dict__[args.model]().params train_speed_list = [] acc1_logs = [] acc5_logs = [] for pass_id in range(params["num_epochs"]): train_info = [[], [], []] test_info = [[], [], []] train_begin=time.time() batch_id = 0 time_record=[] if not args.use_dali: train_iter = train_data_loader() for data in train_iter: t1 = time.time() if batch_id % args.fetch_steps != 0: train_exe.run(train_prog, feed=data) else: if use_mixup: loss, lr = train_exe.run(train_prog, feed=data, fetch_list=train_fetch_list) else: loss, acc1, acc5, lr = train_exe.run(train_prog, feed=data, fetch_list=train_fetch_list) acc1 = np.mean(np.array(acc1)) acc5 = np.mean(np.array(acc5)) train_info[1].append(acc1) train_info[2].append(acc5) t2 = time.time() period = t2 - t1 time_record.append(period) if args.profile and batch_id == 100: print("begin profiler") if trainer_id == 0: profiler.start_profiler("All") elif args.profile and batch_id == 105: print("begin to end profiler") if trainer_id == 0: profiler.stop_profiler("total", "./profile_pass_%d" % (pass_id)) print("end profiler break!") args.profile=False if batch_id % args.fetch_steps == 0: loss = np.mean(np.array(loss)) train_info[0].append(loss) lr = np.mean(np.array(lr)) period = np.mean(time_record) speed = args.batch_size * 1.0 / period time_record=[] if use_mixup: print("Pass {0}, trainbatch {1}, loss {2}, lr {3}, time {4}, speed {5}" .format(pass_id, batch_id, "%.5f"%loss, "%.5f" %lr, "%2.4f sec" % period, "%.2f" % speed)) else: print("Pass {0}, trainbatch {1}, loss {2}, \ acc1 {3}, acc5 {4}, lr {5}, time {6}, speed {7}" .format(pass_id, batch_id, "%.5f"%loss, "%.5f"%acc1, "%.5f"%acc5, "%.5f" % lr, "%2.4f sec" % period, "%.2f" % speed)) sys.stdout.flush() batch_id += 1 if args.use_dali: train_iter.reset() train_loss = np.array(train_info[0]).mean() if not use_mixup: train_acc1 = np.array(train_info[1]).mean() train_acc5 = np.array(train_info[2]).mean() train_end=time.time() train_speed = (batch_id * train_batch_size) / (train_end - train_begin) train_speed_list.append(train_speed) if trainer_id == 0 and (args.do_test or (pass_id + 1) == params["num_epochs"]): if args.use_dali: test_iter = dali.val(settings=args, trainer_id=trainer_id, trainers_num=num_trainers, gpu_id=gpu_id, data_layout=args.data_format) else: test_iter = test_data_loader() test_batch_id = 0 for data in test_iter: t1 = time.time() loss, acc1, acc5 = exe.run(program=test_prog, feed=data, fetch_list=test_fetch_list) t2 = time.time() period = t2 - t1 loss = np.mean(loss) acc1 = np.mean(acc1) acc5 
= np.mean(acc5) test_info[0].append(loss) test_info[1].append(acc1) test_info[2].append(acc5) if test_batch_id % 10 == 0: test_speed = test_batch_size * 1.0 / period print("Pass {0},testbatch {1},loss {2}, \ acc1 {3},acc5 {4},time {5},speed {6}" .format(pass_id, test_batch_id, "%.5f"%loss,"%.5f"%acc1, "%.5f"%acc5, "%2.2f sec" % period, "%.2f" % test_speed)) sys.stdout.flush() test_batch_id += 1 if args.use_dali: test_iter.reset() del test_iter test_loss = np.array(test_info[0]).mean() test_acc1 = np.array(test_info[1]).mean() test_acc5 = np.array(test_info[2]).mean() acc1_logs.append(test_acc1) acc5_logs.append(test_acc5) if use_mixup: print("End pass {0}, train_loss {1}, test_loss {2}, test_acc1 {3}, test_acc5 {4}, speed {5}".format( pass_id, "%.5f"%train_loss, "%.5f"%test_loss, "%.5f"%test_acc1, "%.5f"%test_acc5, "%.2f" % train_speed)) else: print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, " "test_loss {4}, test_acc1 {5}, test_acc5 {6}, speed {7}".format( pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.5f"%test_loss, "%.5f"%test_acc1, "%.5f"%test_acc5, "%.2f" % train_speed)) else: if use_mixup: print("End pass {0}, train_loss {1}, speed {2}".format(pass_id, "%.5f"%train_loss, "%.2f" % train_speed)) else: print("End pass {0}, train_loss {1}, train_acc1 {2}, train_acc5 {3}, ""speed {4}".format( pass_id, "%.5f"%train_loss, "%.5f"%train_acc1, "%.5f"%train_acc5, "%.2f" % train_speed)) sys.stdout.flush() # save in last epoch if trainer_id == 0: model_path = os.path.join(model_save_dir + '/' + model_name, str(pass_id)) if not os.path.isdir(model_path): os.makedirs(model_path) fluid.io.save_persistables(exe, model_path, main_program=fleet._origin_program) if args.benchmark_test: if not os.path.isdir("./benchmark_logs/"): os.makedirs("./benchmark_logs/") with open("./benchmark_logs/log_%d" % trainer_id, 'w') as f: result = dict() result['0'] = dict() result['0']['acc1'] = test_acc1 result['0']['acc5'] = test_acc5 result['0']['result_log'] = dict() result['0']['result_log']['acc1'] = acc1_logs result['0']['result_log']['acc5'] = acc5_logs # maximum speed of all epochs result['1'] = max(train_speed_list) * num_trainers result['14'] = args.batch_size print(str(result)) f.writelines(str(result))
def train_parallel(avg_loss, infer_prog, optimizer, train_reader, test_reader, batch_acc, args, train_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) if not args.use_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() if var.is_data ] feeder = fluid.DataFeeder(feed_var_list, place) # generate fake: if args.use_fake_data: for var in feed_var_list: v = startup_prog.global_block()._clone_variable(var) var.persistable = True v.persistable = True real_shape = list(var.shape) real_shape[0] = args.batch_size / args.gpus startup_prog.global_block().append_op(outputs={"Out": v}, type="fill_constant", attrs={ "shape": real_shape, "value": 1.0, "dtype": var.dtype }) if nccl_id_var and trainer_id == 0: #FIXME(wuyi): wait other trainer to start listening time.sleep(30) startup_exe = fluid.Executor(place) startup_exe.run(startup_prog) strategy = fluid.ExecutionStrategy() strategy.num_threads = 1 strategy.allow_op_delay = False exe = fluid.ParallelExecutor(True, avg_loss.name, exec_strategy=strategy, num_trainers=num_trainers, trainer_id=trainer_id) for pass_id in range(args.pass_num): num_samples = 0 iters = 0 start_time = time.time() if not args.use_reader_op: reader_generator = train_reader() batch_id = 0 data = None while True: if not args.use_reader_op: data = next(reader_generator, None) if data == None: break if iters == args.iterations: break if args.profile and pass_id == 0 and batch_id == 5: profiler.start_profiler("All") elif args.profile and pass_id == 0 and batch_id == 10: profiler.stop_profiler("total", "/tmp/profile_%d" % trainer_id) if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 if args.use_fake_data or args.use_reader_op: try: loss, = exe.run([avg_loss.name]) except fluid.core.EnforceNotMet as ex: break else: loss, = exe.run([avg_loss.name], feed=feeder.feed(data)) if args.use_reader_op: num_samples += args.batch_size * args.gpus else: num_samples += len(data) iters += 1 if batch_id % 1 == 0: print("Pass %d, batch %d, loss %s" % (pass_id, batch_id, np.array(loss))) batch_id += 1 print_train_time(start_time, time.time(), num_samples) print("current activate thread num: ", threading.active_count()) if not args.no_test and batch_acc and not args.use_reader_op: # we have not implement record io for test # skip test when use args.use_reader_op test_acc = test(startup_exe, infer_prog, test_reader, feeder, batch_acc) print("Pass: %d, Test Accuracy: %f\n" % (pass_id, test_acc)) print_test_acc(pass_id, test_acc)
def train_parallel(train_args, test_args, args, train_prog, test_prog, startup_prog, nccl_id_var, num_trainers, trainer_id): over_all_start = time.time() place = core.CPUPlace() if args.device == 'CPU' else core.CUDAPlace(0) feeder = None if not args.use_reader_op: feed_var_list = [ var for var in train_prog.global_block().vars.itervalues() if var.is_data ] feeder = fluid.DataFeeder(feed_var_list, place) # generate fake: if args.use_fake_data: for var in feed_var_list: v = startup_prog.global_block()._clone_variable(var) var.persistable = True v.persistable = True real_shape = list(var.shape) real_shape[0] = args.batch_size / args.gpus startup_prog.global_block().append_op(outputs={"Out": v}, type="fill_constant", attrs={ "shape": real_shape, "value": 1.0, "dtype": var.dtype }) if nccl_id_var and trainer_id == 0: #FIXME(wuyi): wait other trainer to start listening time.sleep(30) startup_exe = fluid.Executor(place) startup_exe.run(startup_prog) strategy = fluid.ExecutionStrategy() strategy.num_threads = args.cpus strategy.allow_op_delay = False build_strategy = fluid.BuildStrategy() if args.reduce_strategy == "reduce": build_strategy.reduce_strategy = fluid.BuildStrategy( ).ReduceStrategy.Reduce else: build_strategy.reduce_strategy = fluid.BuildStrategy( ).ReduceStrategy.AllReduce build_strategy.fuse_broadcast_op = args.fuse_broadcast_op avg_loss = train_args[0] if args.update_method == "pserver": # parameter server mode distributed training, merge # gradients on local server, do not initialize # ParallelExecutor with multi server all-reduce mode. num_trainers = 1 trainer_id = 0 exe = fluid.ParallelExecutor(True, avg_loss.name, main_program=train_prog, exec_strategy=strategy, build_strategy=build_strategy, num_trainers=num_trainers, trainer_id=trainer_id) if not args.no_test: if args.update_method == "pserver": test_scope = None else: # NOTE: use an empty scope to avoid test exe using NCCLID test_scope = fluid.Scope() test_exe = fluid.ParallelExecutor(True, main_program=test_prog, share_vars_from=exe) for pass_id in range(args.pass_num): num_samples = 0 iters = 0 start_time = time.time() if not args.use_reader_op: reader_generator = train_args[3]() #train_reader batch_id = 0 data = None if args.use_reader_op: train_args[4].start() while True: if not args.use_reader_op: data = next(reader_generator, None) if data == None: break if args.profile and batch_id == 5: profiler.start_profiler("All") profiler.reset_profiler() elif args.profile and batch_id == 10: print("profiling total time: ", time.time() - start_time) profiler.stop_profiler( "total", "/tmp/profile_%d_pass%d" % (trainer_id, pass_id)) if iters == args.iterations: reader_generator.close() break if iters == args.skip_batch_num: start_time = time.time() num_samples = 0 fetch_list = [avg_loss.name] acc_name_list = [v.name for v in train_args[2]] fetch_list.extend(acc_name_list) if args.use_fake_data or args.use_reader_op: try: fetch_ret = exe.run(fetch_list) except fluid.core.EOFException as eof: break except fluid.core.EnforceNotMet as ex: traceback.print_exc() break else: fetch_ret = exe.run(fetch_list, feed=feeder.feed(data)) if args.use_reader_op: num_samples += args.batch_size * args.gpus else: num_samples += len(data) iters += 1 if batch_id % 1 == 0: fetched_data = [np.mean(np.array(d)) for d in fetch_ret] print("Pass %d, batch %d, loss %s, accucacys: %s" % (pass_id, batch_id, fetched_data[0], fetched_data[1:])) batch_id += 1 print_train_time(start_time, time.time(), num_samples) if args.use_reader_op: train_args[4].reset() # 
reset reader handle else: del reader_generator if not args.no_test and test_args[2]: test_feeder = None if not args.use_reader_op: test_feed_var_list = [ var for var in test_prog.global_block().vars.itervalues() if var.is_data ] test_feeder = fluid.DataFeeder(test_feed_var_list, place) test_ret = test_parallel(test_exe, test_args, args, test_prog, test_feeder) print("Pass: %d, Test Accuracy: %s\n" % (pass_id, [np.mean(np.array(v)) for v in test_ret])) print("total train time: ", time.time() - over_all_start)
def main(args): bert_config = BertConfig(args.bert_config_path) bert_config.print_config() if args.use_cuda: place = fluid.CUDAPlace(int(os.getenv('FLAGS_selected_gpus', '0'))) dev_count = get_device_num() else: place = fluid.CPUPlace() dev_count = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) exe = fluid.Executor(place) task_name = args.task_name.lower() processors = { 'xnli': reader.XnliProcessor, 'cola': reader.ColaProcessor, 'mrpc': reader.MrpcProcessor, 'mnli': reader.MnliProcessor, } processor = processors[task_name](data_dir=args.data_dir, vocab_path=args.vocab_path, max_seq_len=args.max_seq_len, do_lower_case=args.do_lower_case, in_tokens=args.in_tokens, random_seed=args.random_seed) num_labels = len(processor.get_labels()) if not (args.do_train or args.do_val or args.do_test): raise ValueError("For args `do_train`, `do_val` and `do_test`, at " "least one of them must be True.") train_program = fluid.Program() startup_prog = fluid.Program() if args.random_seed is not None: startup_prog.random_seed = args.random_seed train_program.random_seed = args.random_seed if args.do_train: # NOTE: If num_trainers > 1, the shuffle_seed must be set, because # the order of batch data generated by reader # must be the same in the respective processes. shuffle_seed = 1 if num_trainers > 1 else None train_data_generator = processor.data_generator( batch_size=args.batch_size, phase='train', epoch=args.epoch, dev_count=dev_count, shuffle=args.shuffle, shuffle_seed=shuffle_seed) num_train_examples = processor.get_num_examples(phase='train') if args.in_tokens: max_train_steps = args.epoch * num_train_examples // ( args.batch_size // args.max_seq_len) // dev_count else: max_train_steps = args.epoch * num_train_examples // args.batch_size // dev_count warmup_steps = int(max_train_steps * args.warmup_proportion) print("Device count: %d" % dev_count) print("Num train examples: %d" % num_train_examples) print("Max train steps: %d" % max_train_steps) print("Num warmup steps: %d" % warmup_steps) with fluid.program_guard(train_program, startup_prog): with fluid.unique_name.guard(): train_data_loader, loss, probs, accuracy, num_seqs = create_model( args, bert_config=bert_config, num_labels=num_labels) scheduled_lr, loss_scaling = optimization( loss=loss, warmup_steps=warmup_steps, num_train_steps=max_train_steps, learning_rate=args.learning_rate, train_program=train_program, startup_prog=startup_prog, weight_decay=args.weight_decay, scheduler=args.lr_scheduler, use_fp16=args.use_fp16, use_dynamic_loss_scaling=args.use_dynamic_loss_scaling, init_loss_scaling=args.init_loss_scaling, incr_every_n_steps=args.incr_every_n_steps, decr_every_n_nan_or_inf=args.decr_every_n_nan_or_inf, incr_ratio=args.incr_ratio, decr_ratio=args.decr_ratio) if args.do_val: dev_prog = fluid.Program() with fluid.program_guard(dev_prog, startup_prog): with fluid.unique_name.guard(): dev_data_loader, loss, probs, accuracy, num_seqs = create_model( args, bert_config=bert_config, num_labels=num_labels) dev_prog = dev_prog.clone(for_test=True) dev_data_loader.set_batch_generator( processor.data_generator(batch_size=args.batch_size, phase='dev', epoch=1, dev_count=1, shuffle=False), place) if args.do_test: test_prog = fluid.Program() with fluid.program_guard(test_prog, startup_prog): with fluid.unique_name.guard(): test_data_loader, loss, probs, accuracy, num_seqs = create_model( args, bert_config=bert_config, num_labels=num_labels) test_prog = test_prog.clone(for_test=True) test_data_loader.set_batch_generator( 
processor.data_generator(batch_size=args.batch_size, phase='test', epoch=1, dev_count=1, shuffle=False), place) exe.run(startup_prog) if args.do_train: if args.init_checkpoint and args.init_pretraining_params: print( "WARNING: args 'init_checkpoint' and 'init_pretraining_params' " "both are set! Only arg 'init_checkpoint' is made valid.") if args.init_checkpoint: init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) elif args.init_pretraining_params: init_pretraining_params(exe, args.init_pretraining_params, main_program=startup_prog, use_fp16=args.use_fp16) elif args.do_val or args.do_test: if not args.init_checkpoint: raise ValueError("args 'init_checkpoint' should be set if" "only doing validation or testing!") init_checkpoint(exe, args.init_checkpoint, main_program=startup_prog, use_fp16=args.use_fp16) if args.do_train: exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_experimental_executor = args.use_fast_executor exec_strategy.num_threads = dev_count exec_strategy.num_iteration_per_drop_scope = args.num_iteration_per_drop_scope build_strategy = fluid.BuildStrategy() if args.use_cuda and num_trainers > 1: assert shuffle_seed is not None dist_utils.prepare_for_multi_process(exe, build_strategy, train_program) train_data_generator = fluid.contrib.reader.distributed_batch_reader( train_data_generator) train_compiled_program = fluid.CompiledProgram( train_program).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy) train_data_loader.set_batch_generator(train_data_generator, place) if args.do_train: train_data_loader.start() steps = 0 total_cost, total_acc, total_num_seqs = [], [], [] time_begin = time.time() throughput = [] ce_info = [] total_batch_num = 0 # used for benchmark while True: try: steps += 1 total_batch_num += 1 # used for benchmark if args.max_iter and total_batch_num == args.max_iter: # used for benchmark return if steps % args.skip_steps == 0: if args.use_fp16: fetch_list = [ loss.name, accuracy.name, scheduled_lr.name, num_seqs.name, loss_scaling.name ] else: fetch_list = [ loss.name, accuracy.name, scheduled_lr.name, num_seqs.name ] else: fetch_list = [] outputs = exe.run(train_compiled_program, fetch_list=fetch_list) if steps % args.skip_steps == 0: if args.use_fp16: np_loss, np_acc, np_lr, np_num_seqs, np_scaling = outputs else: np_loss, np_acc, np_lr, np_num_seqs = outputs total_cost.extend(np_loss * np_num_seqs) total_acc.extend(np_acc * np_num_seqs) total_num_seqs.extend(np_num_seqs) if args.verbose: verbose = "train data_loader queue size: %d, " % train_data_loader.queue.size( ) verbose += "learning rate: %f" % np_lr[0] if args.use_fp16: verbose += ", loss scaling: %f" % np_scaling[0] print(verbose) current_example, current_epoch = processor.get_train_progress( ) time_end = time.time() used_time = time_end - time_begin # profiler tools if args.is_profiler and current_epoch == 0 and steps == args.skip_steps: profiler.start_profiler("All") elif args.is_profiler and current_epoch == 0 and steps == args.skip_steps * 2: profiler.stop_profiler("total", args.profiler_path) return log_record = "epoch: {}, progress: {}/{}, step: {}, ave loss: {}, ave acc: {}".format( current_epoch, current_example, num_train_examples, steps, np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs)) ce_info.append([ np.sum(total_cost) / np.sum(total_num_seqs), np.sum(total_acc) / np.sum(total_num_seqs), used_time ]) if steps > 0: throughput.append(args.skip_steps / used_time) log_record = 
log_record + ", speed: %f steps/s" % ( args.skip_steps / used_time) print(log_record) else: print(log_record) total_cost, total_acc, total_num_seqs = [], [], [] time_begin = time.time() if steps % args.save_steps == 0: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.save(program=train_program, model_path=save_path) if steps % args.validation_steps == 0: print("Average throughtput: %s" % (np.average(throughput))) throughput = [] # evaluate dev set if args.do_val: evaluate(exe, dev_prog, dev_data_loader, [loss.name, accuracy.name, num_seqs.name], "dev") # evaluate test set if args.do_test: evaluate(exe, test_prog, test_data_loader, [loss.name, accuracy.name, num_seqs.name], "test") except fluid.core.EOFException: save_path = os.path.join(args.checkpoints, "step_" + str(steps)) fluid.save(program=train_program, model_path=save_path) train_data_loader.reset() break if args.enable_ce: card_num = get_cards() ce_cost = 0 ce_acc = 0 ce_time = 0 try: ce_cost = ce_info[-2][0] ce_acc = ce_info[-2][1] ce_time = ce_info[-2][2] except: print("ce info error") print("kpis\ttrain_duration_%s_card%s\t%s" % (args.task_name, card_num, ce_time)) print("kpis\ttrain_cost_%s_card%s\t%f" % (args.task_name, card_num, ce_cost)) print("kpis\ttrain_acc_%s_card%s\t%f" % (args.task_name, card_num, ce_acc)) # final eval on dev set if args.do_val: print("Final validation result:") evaluate(exe, dev_prog, dev_data_loader, [loss.name, accuracy.name, num_seqs.name], "dev") # final eval on test set if args.do_test: print("Final test result:") evaluate(exe, test_prog, test_data_loader, [loss.name, accuracy.name, num_seqs.name], "test")
def train_with_dataloader(exe, train_prog, compiled_train_prog, train_dataloader, train_fetch_list, train_metrics, train_batch_size=None, epochs=10, log_interval=0, valid_interval=0, save_dir='./', num_trainers=1, trainer_id=0, save_model_name='model', fix_random_seed=False, compiled_test_prog=None, test_dataloader=None, test_fetch_list=None, test_metrics=None, is_profiler=None, profiler_path=None): if not train_dataloader: logger.error("[TRAIN] get dataloader failed.") train_loss = 0 epoch_periods = [] reader_cost_averager = TimeAverager() batch_cost_averager = TimeAverager() for epoch in range(epochs): log_lr_and_step() train_iter = 0 epoch_periods = [] batch_start = time.time() for data in train_dataloader(): reader_cost_averager.record(time.time() - batch_start) train_outs = exe.run(compiled_train_prog, fetch_list=train_fetch_list, feed=data) batch_cost = time.time() - batch_start epoch_periods.append(batch_cost) batch_cost_averager.record(batch_cost, num_samples=train_batch_size) local_time = time.localtime(time.time()) str_time = time.strftime("%Y-%m-%d %H:%M:%S", local_time) if log_interval > 0 and (train_iter % log_interval == 0): time_info_str = "batch_cost: {:.5f} sec, reader_cost: {:.5f} sec".format( batch_cost_averager.get_average(), reader_cost_averager.get_average()) if train_batch_size: time_info_str += ", ips: {:.5f} samples/sec".format( batch_cost_averager.get_ips_average()) train_metrics.calculate_and_log_out( train_outs, info='[TRAIN {}] Epoch {}, iter {}, {}'.format( str_time, epoch, train_iter, time_info_str)) reader_cost_averager.reset() batch_cost_averager.reset() train_iter += 1 batch_start = time.time() # NOTE: profiler tools, used for benchmark if is_profiler and epoch == 0 and train_iter == log_interval: profiler.start_profiler("All") elif is_profiler and epoch == 0 and train_iter == log_interval + 5: profiler.stop_profiler("total", profiler_path) return if len(epoch_periods) < 1: logger.info( 'No iteration was executed, please check the data reader') sys.exit(1) logger.info( '[TRAIN] Epoch {} training finished, average time: {:.5f} sec'. format(epoch, np.mean(epoch_periods[1:]))) if trainer_id == 0: save_model(exe, train_prog, save_dir, save_model_name, "_epoch{}".format(epoch)) if compiled_test_prog and valid_interval > 0 and ( epoch + 1) % valid_interval == 0: test_with_dataloader(exe, compiled_test_prog, test_dataloader, test_fetch_list, test_metrics, log_interval, save_model_name) if trainer_id == 0: save_model(exe, train_prog, save_dir, save_model_name) #when fix_random seed for debug if fix_random_seed: cards = os.environ.get('CUDA_VISIBLE_DEVICES') gpu_num = len(cards.split(",")) print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, train_loss)) print("kpis\ttrain_speed_card{}\t{}".format(gpu_num, np.mean(epoch_periods)))
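# NOTE: train_with_dataloader above relies on a TimeAverager helper imported
# from elsewhere in the codebase. A minimal sketch of the interface the loop
# assumes (record / reset / get_average / get_ips_average); the real utility
# may differ in detail:
class TimeAverager(object):
    def __init__(self):
        self.reset()

    def reset(self):
        self._total_time = 0.0
        self._total_samples = 0
        self._count = 0

    def record(self, usetime, num_samples=None):
        # accumulate one batch's elapsed time and, optionally, its sample count
        self._total_time += usetime
        self._count += 1
        if num_samples:
            self._total_samples += num_samples

    def get_average(self):
        # average seconds per recorded batch
        if self._count == 0:
            return 0.0
        return self._total_time / self._count

    def get_ips_average(self):
        # samples processed per second (instances per second)
        if self._total_time == 0.0:
            return 0.0
        return self._total_samples / self._total_time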
def do_train(args): if args.use_cuda: if num_trainers > 1: # for multi-process gpu training dev_count = 1 else: dev_count = fluid.core.get_cuda_device_count() gpu_id = int(os.environ.get('FLAGS_selected_gpus', 0)) place = fluid.CUDAPlace(gpu_id) else: dev_count = int(os.environ.get('CPU_NUM', 1)) place = fluid.CPUPlace() # define the data generator processor = reader.DataProcessor(fpattern=args.training_file, src_vocab_fpath=args.src_vocab_fpath, trg_vocab_fpath=args.trg_vocab_fpath, token_delimiter=args.token_delimiter, use_token_batch=args.use_token_batch, batch_size=args.batch_size, device_count=dev_count, pool_size=args.pool_size, sort_type=args.sort_type, shuffle=args.shuffle, shuffle_batch=args.shuffle_batch, start_mark=args.special_token[0], end_mark=args.special_token[1], unk_mark=args.special_token[2], max_length=args.max_length, n_head=args.n_head) batch_generator = processor.data_generator(phase="train") if num_trainers > 1: # for multi-process gpu training batch_generator = fluid.contrib.reader.distributed_batch_reader( batch_generator) args.src_vocab_size, args.trg_vocab_size, args.bos_idx, args.eos_idx, \ args.unk_idx = processor.get_vocab_summary() train_prog = fluid.default_main_program() startup_prog = fluid.default_startup_program() random_seed = eval(str(args.random_seed)) if random_seed is not None: train_prog.random_seed = random_seed startup_prog.random_seed = random_seed with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): # define input and reader input_field_names = desc.encoder_data_input_fields + \ desc.decoder_data_input_fields[:-1] + desc.label_data_input_fields input_descs = desc.get_input_descs(args.args) input_slots = [{ "name": name, "shape": input_descs[name][0], "dtype": input_descs[name][1] } for name in input_field_names] input_field = InputField(input_slots) input_field.build(build_pyreader=True) # define the network sum_cost, avg_cost, token_num = create_net(is_training=True, model_input=input_field, args=args) # define the optimizer with fluid.default_main_program()._lr_schedule_guard(): learning_rate = fluid.layers.learning_rate_scheduler.noam_decay( args.d_model, args.warmup_steps) * args.learning_rate optimizer = fluid.optimizer.Adam(learning_rate=learning_rate, beta1=args.beta1, beta2=args.beta2, epsilon=float(args.eps)) optimizer.minimize(avg_cost) # prepare training ## decorate the pyreader with batch_generator input_field.loader.set_batch_generator(batch_generator) ## define the executor and program for training exe = fluid.Executor(place) exe.run(startup_prog) # init position_encoding for pos_enc_param_name in desc.pos_enc_param_names: pos_enc_param = fluid.global_scope().find_var( pos_enc_param_name).get_tensor() pos_enc_param.set( position_encoding_init(args.max_length + 1, args.d_model), place) assert (args.init_from_checkpoint == "") or (args.init_from_pretrain_model == "") ## init from some checkpoint, to resume the previous training if args.init_from_checkpoint: load(train_prog, os.path.join(args.init_from_checkpoint, "transformer"), exe) print("finish initing model from checkpoint from %s" % (args.init_from_checkpoint)) ## init from some pretrain models, to better solve the current task if args.init_from_pretrain_model: load(train_prog, os.path.join(args.init_from_pretrain_model, "transformer"), exe) print("finish initing model from pretrained params from %s" % (args.init_from_pretrain_model)) build_strategy = fluid.compiler.BuildStrategy() build_strategy.enable_inplace = True exec_strategy = 
fluid.ExecutionStrategy() if num_trainers > 1: dist_utils.prepare_for_multi_process(exe, build_strategy, train_prog) exec_strategy.num_threads = 1 compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=avg_cost.name, build_strategy=build_strategy, exec_strategy=exec_strategy) # the best cross-entropy value with label smoothing loss_normalizer = -( (1. - args.label_smooth_eps) * np.log((1. - args.label_smooth_eps)) + args.label_smooth_eps * np.log(args.label_smooth_eps / (args.trg_vocab_size - 1) + 1e-20)) # start training step_idx = 0 total_batch_num = 0 # this is for benchmark total_batch_token_num = 0 # this is for benchmark word count for pass_id in range(args.epoch): pass_start_time = time.time() input_field.loader.start() batch_id = 0 while True: if args.max_iter and total_batch_num == args.max_iter: # this for benchmark return try: outs = exe.run(compiled_train_prog, fetch_list=[sum_cost.name, token_num.name]) total_batch_token_num += np.asarray(outs[1]).sum() if step_idx % args.print_step == 0: sum_cost_val, token_num_val = np.asarray( outs[0]), np.asarray(outs[1]) # sum the cost from multi-devices total_sum_cost = sum_cost_val.sum() total_token_num = token_num_val.sum() total_avg_cost = total_sum_cost / total_token_num if step_idx == 0: logging.info( "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "normalized loss: %f, ppl: %f" % (step_idx, pass_id, batch_id, total_avg_cost, total_avg_cost - loss_normalizer, np.exp([min(total_avg_cost, 100)]))) avg_batch_time = time.time() else: logging.info( "step_idx: %d, epoch: %d, batch: %d, avg loss: %f, " "normalized loss: %f, ppl: %f, batch speed: %.2f steps/s, ips: %.2f words/sec" % (step_idx, pass_id, batch_id, total_avg_cost, total_avg_cost - loss_normalizer, np.exp([min(total_avg_cost, 100) ]), args.print_step / (time.time() - avg_batch_time), total_batch_token_num / (time.time() - avg_batch_time))) avg_batch_time = time.time() total_batch_token_num = 0 if step_idx % args.save_step == 0 and step_idx != 0: if args.save_model_path: model_path = os.path.join(args.save_model_path, "step_" + str(step_idx), "transformer") fluid.save(train_prog, model_path) batch_id += 1 step_idx += 1 total_batch_num = total_batch_num + 1 # this is for benchmark # profiler tools for benchmark if args.is_profiler and pass_id == 0 and batch_id == args.print_step: profiler.start_profiler("All") elif args.is_profiler and pass_id == 0 and batch_id == args.print_step + 5: profiler.stop_profiler("total", args.profiler_path) return except fluid.core.EOFException: input_field.loader.reset() break time_consumed = time.time() - pass_start_time if args.save_model_path: model_path = os.path.join(args.save_model_path, "step_final", "transformer") fluid.save(train_prog, model_path) if args.enable_ce: # For CE print("kpis\ttrain_cost_card%d\t%f" % (dev_count, total_avg_cost)) print("kpis\ttrain_duration_card%d\t%f" % (dev_count, time_consumed))
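# NOTE: do_train above seeds the position-encoding parameters with
# position_encoding_init(), which lives in the model code. A minimal sketch,
# assuming the usual sinusoidal formulation (sin on even channels, cos on odd
# channels); the actual implementation may differ:
def position_encoding_init(n_position, d_pos_vec):
    # position_enc[pos, 2i]   = sin(pos / 10000^(2i / d_pos_vec))
    # position_enc[pos, 2i+1] = cos(pos / 10000^(2i / d_pos_vec))
    position_enc = np.array([[
        pos / np.power(10000, 2.0 * (j // 2) / d_pos_vec)
        for j in range(d_pos_vec)
    ] for pos in range(n_position)])
    position_enc[:, 0::2] = np.sin(position_enc[:, 0::2])
    position_enc[:, 1::2] = np.cos(position_enc[:, 1::2])
    return position_enc.astype("float32")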
def main(): args = parse_args() model_type = args.model_type rnn_model = args.rnn_model logger = logging.getLogger("lm") logger.setLevel(logging.INFO) formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s') if args.log_path: file_handler = logging.FileHandler(args.log_path) file_handler.setLevel(logging.INFO) file_handler.setFormatter(formatter) logger.addHandler(file_handler) else: console_handler = logging.StreamHandler() console_handler.setLevel(logging.INFO) console_handler.setFormatter(formatter) logger.addHandler(console_handler) logger.info('Running with args : {}'.format(args)) vocab_size = 10000 if model_type == "test": num_layers = 1 batch_size = 2 hidden_size = 10 num_steps = 3 init_scale = 0.1 max_grad_norm = 5.0 epoch_start_decay = 1 max_epoch = 1 dropout = 0.0 lr_decay = 0.5 base_learning_rate = 1.0 elif model_type == "small": num_layers = 2 batch_size = 20 hidden_size = 200 num_steps = 20 init_scale = 0.1 max_grad_norm = 5.0 epoch_start_decay = 4 max_epoch = 13 dropout = 0.0 lr_decay = 0.5 base_learning_rate = 1.0 elif model_type == "medium": num_layers = 2 batch_size = 20 hidden_size = 650 num_steps = 35 init_scale = 0.05 max_grad_norm = 5.0 epoch_start_decay = 6 max_epoch = 39 dropout = 0.5 lr_decay = 0.8 base_learning_rate = 1.0 elif model_type == "large": num_layers = 2 batch_size = 20 hidden_size = 1500 num_steps = 35 init_scale = 0.04 max_grad_norm = 10.0 epoch_start_decay = 14 max_epoch = 55 dropout = 0.65 lr_decay = 1.0 / 1.15 base_learning_rate = 1.0 else: print("model type not support") return if not args.save_model_dir: save_model_dir = model_type + "_models" if args.use_gpu: save_model_dir = "gpu_" + save_model_dir else: save_model_dir = "cpu_" + save_model_dir if args.inference_only: save_model_dir = "infer_" + save_model_dir else: save_model_dir = "train_" + save_model_dir else: save_model_dir = args.save_model_dir if args.batch_size > 0: batch_size = args.batch_size if args.max_epoch > 0: max_epoch = args.max_epoch if args.profile: print( "\nProfiler is enabled, only 1 epoch will be ran (set max_epoch = 1).\n" ) max_epoch = 1 main_program = fluid.Program() startup_program = fluid.Program() if args.enable_ce: startup_program.random_seed = SEED with fluid.program_guard(main_program, startup_program): # Training process loss, last_hidden, last_cell, feed_order = lm_model.lm_model( hidden_size, vocab_size, batch_size, num_layers=num_layers, num_steps=num_steps, init_scale=init_scale, dropout=dropout, rnn_model=rnn_model) # clone from default main program and use it as the validation program inference_program = fluid.default_main_program().clone(for_test=True) #print(inference_program) fluid.clip.set_gradient_clip(clip=fluid.clip.GradientClipByGlobalNorm( clip_norm=max_grad_norm)) learning_rate = fluid.layers.create_global_var(name="learning_rate", shape=[1], value=1.0, dtype='float32', persistable=True) optimizer = fluid.optimizer.SGD(learning_rate=learning_rate) optimizer.minimize(loss) place = core.CUDAPlace(0) if args.use_gpu else core.CPUPlace() exe = Executor(place) exe.run(startup_program) device_count = fluid.core.get_cuda_device_count() exec_strategy = fluid.ExecutionStrategy() exec_strategy.num_threads = device_count exec_strategy.use_experimental_executor = False exec_strategy.num_iteration_per_drop_scope = 100 build_strategy = fluid.BuildStrategy() build_strategy.enable_inplace = True build_strategy.memory_optimize = True build_strategy.remove_unnecessary_lock = True build_strategy.enable_sequential_execution = False 
build_strategy.cache_runtime_context = True build_strategy.cache_expected_kernel = True build_strategy.fuse_all_optimizer_ops = True if args.parallel: train_program = fluid.compiler.CompiledProgram( main_program).with_data_parallel(loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) else: train_program = fluid.compiler.CompiledProgram(main_program) data_path = args.data_path print("begin to load data") raw_data = reader.ptb_raw_data(data_path) print("finished load data") train_data, valid_data, test_data, _ = raw_data def prepare_input(batch, init_hidden, init_cell, epoch_id=0, with_lr=True, device_count=1): x, y = batch new_lr = base_learning_rate * (lr_decay**max( epoch_id + 1 - epoch_start_decay, 0.0)) res = {} if device_count > 1 and args.parallel: lr = np.ones((device_count), dtype='float32') * new_lr x = x.reshape((-1, num_steps, 1)) y = y.reshape((-1, 1)) else: lr = np.ones((1), dtype='float32') * new_lr x = x.reshape((-1, num_steps, 1)) y = y.reshape((-1, 1)) res['x'] = x res['y'] = y res['init_hidden'] = init_hidden res['init_cell'] = init_cell if with_lr: res['learning_rate'] = lr return res def eval(data): if args.inference_only and args.init_params_path: dirname = args.init_params_path filename = None if not os.path.isdir(args.init_params_path): dirname = os.path.dirname(args.init_params_path) filename = os.path.basename(args.init_params_path) fluid.io.load_persistables(exe, dirname, main_program=main_program, filename=filename) print("Load parameters from: %s." % args.init_params_path) batch_times = [] start_time = time.time() # when eval the batch_size set to 1 eval_data_iter = reader.get_data_iter(data, batch_size, num_steps) total_loss = 0.0 iters = 0 init_hidden = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') init_cell = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') for batch_id, batch in enumerate(eval_data_iter): input_data_feed = prepare_input(batch, init_hidden, init_cell, epoch_id=0, with_lr=False) batch_start_time = time.time() # eval should not run the grad op and change the parameters. # use Executor to eval fetch_outs = exe.run( program=inference_program, feed=input_data_feed, fetch_list=[loss.name, last_hidden.name, last_cell.name], use_program_cache=True) batch_times.append(time.time() - batch_start_time) cost_train = np.array(fetch_outs[0]) init_hidden = np.array(fetch_outs[1]) init_cell = np.array(fetch_outs[2]) total_loss += cost_train iters += num_steps ppl = np.exp(total_loss / iters) eval_time_total = time.time() - start_time eval_time_run = np.sum(batch_times) # Benchmark if args.inference_only: print("\n======== Benchmark Result ========") print( "Eval batch_size: %d; Time (total): %.5f s; Time (only run): %.5f s; ppl: %.5f" % (batch_size, eval_time_total, eval_time_run, ppl[0])) print("") # Save the inference model for C++ inference purpose fluid.io.save_inference_model(save_model_dir, feed_order, [loss, last_hidden, last_cell], exe, main_program=inference_program, model_filename="model", params_filename="params") print("Save inference model to: %s." 
% save_model_dir) return ppl def train_an_epoch(epoch_id, batch_times): # get train epoch size num_batchs = len(train_data) // batch_size epoch_size = (num_batchs - 1) // num_steps if args.profile: log_interval = 1 else: log_interval = max(1, epoch_size // 10) data_iter_size = batch_size if device_count > 1 and args.parallel: data_iter_size = batch_size * device_count train_data_iter = reader.get_data_iter(train_data, data_iter_size, num_steps) total_loss = 0 iters = 0 if device_count > 1 and args.parallel: init_hidden = np.zeros( (num_layers * device_count, batch_size, hidden_size), dtype='float32') init_cell = np.zeros( (num_layers * device_count, batch_size, hidden_size), dtype='float32') else: init_hidden = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') init_cell = np.zeros((num_layers, batch_size, hidden_size), dtype='float32') for batch_id, batch in enumerate(train_data_iter): input_data_feed = prepare_input(batch, init_hidden, init_cell, epoch_id=epoch_id, device_count=device_count) batch_start_time = time.time() fetch_outs = exe.run(train_program, feed=input_data_feed, fetch_list=[ loss.name, last_hidden.name, last_cell.name, "learning_rate" ], use_program_cache=True) batch_time = time.time() - batch_start_time batch_times.append(batch_time) cost_train = np.array(fetch_outs[0]) init_hidden = np.array(fetch_outs[1]) init_cell = np.array(fetch_outs[2]) lr = np.array(fetch_outs[3]) total_loss += cost_train iters += num_steps if batch_id > 0 and batch_id % log_interval == 0: ppl = np.exp(total_loss / iters) print( "-- Epoch:[%d]; Batch:[%d]; Time: %.5f s; ppl: %.5f, lr: %.5f" % (epoch_id, batch_id, batch_time, ppl[0], lr[0])) if args.profile: if batch_id == 1: profiler.reset_profiler() elif batch_id >= 11: break ppl = np.exp(total_loss / iters) return ppl def train(): total_time = 0.0 for epoch_id in range(max_epoch): batch_times = [] epoch_start_time = time.time() train_ppl = train_an_epoch(epoch_id, batch_times) epoch_time = time.time() - epoch_start_time total_time += epoch_time print( "\nTrain epoch:[%d]; epoch Time: %.5f; ppl: %.5f; avg_time: %.5f steps/s \n" % (epoch_id, epoch_time, train_ppl[0], len(batch_times) / sum(batch_times))) # FIXME(zjl): ppl[0] increases as batch_size increases. # We should find a better way to calculate ppl by normalizing batch_size. if device_count == 1 and batch_size <= 20 and epoch_id == 0 and train_ppl[ 0] > 1000: # for bad init, after first epoch, the loss is over 1000 # no more need to continue print( "Parameters are randomly initialized and not good this time because the loss is over 1000 after the first epoch." 
) print("Abort this training process and please start again.") return if epoch_id == max_epoch - 1 and args.enable_ce: # kpis print("ptblm\tlstm_language_model_duration\t%s" % (total_time / max_epoch)) print("ptblm\tlstm_language_model_loss\t%s" % train_ppl[0]) if not args.profile: # NOTE(zjl): sometimes we have not enough data for eval if batch_size is large, i.e., 2100 # Just skip to avoid error def is_valid_data(data, batch_size, num_steps): data_len = len(data) batch_len = data_len // batch_size epoch_size = (batch_len - 1) // num_steps return epoch_size >= 1 valid_data_valid = is_valid_data(valid_data, batch_size, num_steps) test_data_valid = is_valid_data(test_data, batch_size, num_steps) if valid_data_valid and test_data_valid: valid_ppl = eval(valid_data) print("Valid ppl: %.5f" % valid_ppl[0]) test_ppl = eval(test_data) print("Test ppl: %.5f" % test_ppl[0]) else: if not valid_data_valid: print( 'WARNING: length of valid_data is {}, which is not enough for batch_size {} and num_steps {}' .format(len(valid_data), batch_size, num_steps)) if not test_data_valid: print( 'WARNING: length of test_data is {}, which is not enough for batch_size {} and num_steps {}' .format(len(test_data), batch_size, num_steps)) filename = "params_%05d" % epoch_id fluid.io.save_persistables(executor=exe, dirname=save_model_dir, main_program=main_program, filename=filename) print("Saved model to: %s/%s.\n" % (save_model_dir, filename)) if args.profile: if args.use_gpu: profiler.start_profiler("All") if not args.inference_only: profile_filename = "train_padding_rnn.gpu.profile" train() else: profile_filename = "infer_padding_rnn.gpu.profile" eval(test_data) profiler.stop_profiler("total", profile_filename) else: profiler.start_profiler("CPU") if not args.inference_only: profile_filename = "train_padding_rnn.cpu.profile" train() else: profile_filename = "infer_padding_rnn.cpu.profile" eval(test_data) profiler.stop_profiler("total", profile_filename) else: if not args.inference_only: train() else: eval(test_data)
def train(args):
    # get the GPU place for this process
    place = fluid.CUDAPlace(fluid.dygraph.parallel.Env().dev_id)
    print(place)
    with fluid.dygraph.guard(place):
        # multi-card context
        strategy = fluid.dygraph.parallel.prepare_context()
        print('strategy', strategy)

        # parse config
        config = parse_config(args.config)
        train_config = merge_configs(config, 'train', vars(args))
        valid_config = merge_configs(config, 'valid', vars(args))
        print_configs(train_config, 'Train')
        print(train_config)

        # if args.fix_random_seed:
        #     startup.random_seed = 1000
        #     train_prog.random_seed = 1000

        train_model = Tpn_Model(
            None, cfg=train_config,
            mode='train')  # models.get_model(args.model_name, train_config, mode='train')
        valid_model = Tpn_Model(
            None)  # models.get_model(args.model_name, valid_config, mode='valid')
        train_model.build_input()
        train_dataloader = train_model.dataloader()
        opt = train_model.optimizer()

        # load weights
        weight, _ = fluid.load_dygraph('./ckpt/k400_tpn_r50f32s2')
        model_weights = train_model.state_dict()
        model_weights.update(
            {k: v for k, v in weight.items() if k in model_weights})
        train_model.load_dict(model_weights)
        print('load model success')

        # wrap the model for multi-GPU data-parallel training
        train_model = fluid.dygraph.parallel.DataParallel(
            train_model, strategy)

        log_interval = args.log_interval
        is_profiler = args.is_profiler
        profiler_path = args.profiler_path
        trainer_id = 0
        fix_random_seed = args.fix_random_seed
        save_dir = args.save_dir
        save_model_name = args.model_name

        # if args.resume:
        #     # if resume weights is given, load resume weights directly
        #     assert os.path.exists(args.resume + '.pdparams'), \
        #         "Given resume weight dir {}.pdparams not exist.".format(args.resume)
        #     fluid.load(train_prog, model_path=args.resume, executor=exe)
        # else:
        #     # if not in resume mode, load pretrain weights
        #     if args.pretrain:
        #         assert os.path.exists(args.pretrain), \
        #             "Given pretrain weight dir {} not exist.".format(args.pretrain)
        #     pretrain = args.pretrain or train_model.get_pretrain_weights()
        #     if pretrain:
        #         train_model.load_pretrain_params(exe, pretrain, train_prog, place)

        # get reader
        bs_denominator = 1
        if args.use_gpu:
            # check number of GPUs
            gpus = os.getenv("CUDA_VISIBLE_DEVICES", "")
            if gpus == "":
                pass
            else:
                gpus = gpus.split(",")
                num_gpus = len(gpus)
                assert num_gpus == train_config.TRAIN.num_gpus, \
                    "num_gpus({}) set by CUDA_VISIBLE_DEVICES " \
                    "should be the same as that " \
                    "set in {}({})".format(
                        num_gpus, args.config, train_config.TRAIN.num_gpus)
            bs_denominator = train_config.TRAIN.num_gpus

        train_config.TRAIN.batch_size = int(train_config.TRAIN.batch_size /
                                            bs_denominator)
        valid_config.VALID.batch_size = int(valid_config.VALID.batch_size /
                                            bs_denominator)

        train_reader = get_reader(args.model_name.upper(), 'train', train_config)
        valid_reader = get_reader(args.model_name.upper(), 'valid', valid_config)

        # get metrics
        train_metrics = get_metrics(args.model_name.upper(), 'train', train_config)
        valid_metrics = get_metrics(args.model_name.upper(), 'valid', valid_config)

        epochs = args.epoch  # or train_model.epoch_num()
        print()
        train_dataloader.set_sample_list_generator(train_reader, places=place)
        # valid_dataloader.set_sample_list_generator(valid_reader, places=exe_places)

        # multi-GPU data loading: each process must read different data
        train_dataloader = fluid.contrib.reader.distributed_batch_reader(
            train_dataloader)

        train_model.train()
        for epoch in range(epochs):
            log_lr_and_step()
            train_iter = 0
            epoch_periods = []
            cur_time = time.time()
            for data in train_dataloader():
                train_outs = train_model(data)
                losses, _, _ = train_outs
                log_vars = OrderedDict()
                for loss_name, loss_value in losses.items():
                    # print(loss_name, ':', loss_value.numpy())
                    log_vars[loss_name] = fluid.layers.reduce_mean(loss_value)
                    # print(loss_name, ':', log_vars[loss_name].numpy())
                loss = sum(_value for _key, _value in log_vars.items()
                           if 'loss' in _key)
                # print('total loss', loss.numpy())
                train_outs = [
                    loss.numpy(), train_outs[1].numpy(), train_outs[2].numpy()
                ]
                # print(train_outs[0])
                # print(train_outs[1].shape)
                # print(train_outs[2].shape)
                #
                # # classification results
                # prob = softmax(train_outs[1].squeeze())
                #
                # idx = np.argsort(-prob)
                # # print('idx', idx)
                # for i in range(0, 5):
                #     print('{:.3f} -> {}'.format(prob[idx[i]], [idx[i]]), train_outs[2])

                avg_loss = loss
                # for multi-GPU training the loss must be scaled and the
                # parameter gradients aggregated across devices
                # avg_loss = train_model.scale_loss(avg_loss)
                avg_loss.backward()
                # multi-GPU
                # train_model.apply_collective_grads()
                opt.minimize(avg_loss)
                train_model.clear_gradients()

                period = time.time() - cur_time
                epoch_periods.append(period)
                timeStamp = time.time()
                localTime = time.localtime(timeStamp)
                strTime = time.strftime("%Y-%m-%d %H:%M:%S", localTime)
                if log_interval > 0 and (train_iter % log_interval == 0):
                    train_metrics.calculate_and_log_out(train_outs, \
                        info='[TRAIN {}] Epoch {}, iter {}, time {}, '.format(strTime, epoch, train_iter, period))
                    # print('[TRAIN {}] Epoch {}, iter {}, time {}, total_loss {}, loss_cls {},loss_aux {}'.
                    #       format(strTime, epoch, train_iter, period, loss.numpy(),
                    #              log_vars['loss_cls'].numpy(), log_vars['loss_aux'].numpy()
                    #              ))
                train_iter += 1
                cur_time = time.time()

                # NOTE: profiler tools, used for benchmark
                if is_profiler and epoch == 0 and train_iter == log_interval:
                    profiler.start_profiler("All")
                elif is_profiler and epoch == 0 and train_iter == log_interval + 5:
                    profiler.stop_profiler("total", profiler_path)
                    return

            if len(epoch_periods) < 1:
                logger.info(
                    'No iteration was executed, please check the data reader')
                sys.exit(1)

            logger.info(
                '[TRAIN] Epoch {} training finished, average time: {}'.format(
                    epoch, np.mean(epoch_periods[1:])))

            # if trainer_id == 0:
            #     save_model(exe, train_prog, save_dir, save_model_name,
            #                "_epoch{}".format(epoch))
            # if compiled_test_prog and valid_interval > 0 and (
            #         epoch + 1) % valid_interval == 0:
            #     test_with_dataloader(exe, compiled_test_prog, test_dataloader,
            #                          test_fetch_list, test_metrics, log_interval,
            #                          save_model_name)

        if trainer_id == 0:
            # save_model(exe, train_prog, save_dir, save_model_name)
            fluid.save_dygraph(train_model.state_dict(),
                               "{}/{}".format(save_dir, save_model_name))
            fluid.save_dygraph(opt.state_dict(),
                               "{}/{}".format(save_dir, save_model_name))

        # when fix_random_seed is set for debugging
        if fix_random_seed:
            cards = os.environ.get('CUDA_VISIBLE_DEVICES')
            gpu_num = len(cards.split(","))
            print("kpis\ttrain_cost_card{}\t{}".format(gpu_num, loss))
            print("kpis\ttrain_speed_card{}\t{}".format(
                gpu_num, np.mean(epoch_periods)))
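# NOTE: in the dygraph loop above, the scale_loss() and apply_collective_grads()
# calls are commented out. When training on several GPUs with
# fluid.dygraph.parallel.DataParallel, the usual pattern is to scale the loss
# before backward and to aggregate gradients across devices before the
# optimizer step. A minimal sketch (the function and argument names below are
# placeholders, not taken from the script above):
def parallel_backward_step(parallel_model, optimizer, avg_loss):
    # scale the loss so gradients are averaged over all trainers
    scaled_loss = parallel_model.scale_loss(avg_loss)
    scaled_loss.backward()
    # all-reduce parameter gradients across devices
    parallel_model.apply_collective_grads()
    optimizer.minimize(scaled_loss)
    parallel_model.clear_gradients()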
def main(): env = os.environ FLAGS.dist = 'PADDLE_TRAINER_ID' in env and 'PADDLE_TRAINERS_NUM' in env if FLAGS.dist: trainer_id = int(env['PADDLE_TRAINER_ID']) local_seed = (99 + trainer_id) random.seed(local_seed) np.random.seed(local_seed) if FLAGS.enable_ce: random.seed(0) np.random.seed(0) cfg = load_config(FLAGS.config) merge_config(FLAGS.opt) check_config(cfg) # check if set use_gpu=True in paddlepaddle cpu version check_gpu(cfg.use_gpu) # check if paddlepaddle version is satisfied check_version() save_only = getattr(cfg, 'save_prediction_only', False) if save_only: raise NotImplementedError('The config file only support prediction,' ' training stage is not implemented now') main_arch = cfg.architecture if cfg.use_gpu: devices_num = fluid.core.get_cuda_device_count() else: devices_num = int(os.environ.get('CPU_NUM', 1)) if 'FLAGS_selected_gpus' in env: device_id = int(env['FLAGS_selected_gpus']) else: device_id = 0 place = fluid.CUDAPlace(device_id) if cfg.use_gpu else fluid.CPUPlace() exe = fluid.Executor(place) lr_builder = create('LearningRate') optim_builder = create('OptimizerBuilder') # build program startup_prog = fluid.Program() train_prog = fluid.Program() if FLAGS.enable_ce: startup_prog.random_seed = 1000 train_prog.random_seed = 1000 with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) if FLAGS.fp16: assert (getattr(model.backbone, 'norm_type', None) != 'affine_channel'), \ '--fp16 currently does not support affine channel, ' \ ' please modify backbone settings to use batch norm' with mixed_precision_context(FLAGS.loss_scale, FLAGS.fp16) as ctx: inputs_def = cfg['TrainReader']['inputs_def'] feed_vars, train_loader = model.build_inputs(**inputs_def) train_fetches = model.train(feed_vars) loss = train_fetches['loss'] if FLAGS.fp16: loss *= ctx.get_loss_scale_var() lr = lr_builder() optimizer = optim_builder(lr) optimizer.minimize(loss) if FLAGS.fp16: loss /= ctx.get_loss_scale_var() if 'use_ema' in cfg and cfg['use_ema']: global_steps = _decay_step_counter() ema = ExponentialMovingAverage( cfg['ema_decay'], thres_steps=global_steps) ema.update() # parse train fetches train_keys, train_values, _ = parse_fetches(train_fetches) train_values.append(lr) if FLAGS.eval: eval_prog = fluid.Program() with fluid.program_guard(eval_prog, startup_prog): with fluid.unique_name.guard(): model = create(main_arch) inputs_def = cfg['EvalReader']['inputs_def'] feed_vars, eval_loader = model.build_inputs(**inputs_def) fetches = model.eval(feed_vars) eval_prog = eval_prog.clone(True) eval_reader = create_reader(cfg.EvalReader, devices_num=1) eval_loader.set_sample_list_generator(eval_reader, place) # parse eval fetches extra_keys = [] if cfg.metric == 'COCO': extra_keys = ['im_info', 'im_id', 'im_shape'] if cfg.metric == 'VOC': extra_keys = ['gt_bbox', 'gt_class', 'is_difficult'] if cfg.metric == 'WIDERFACE': extra_keys = ['im_id', 'im_shape', 'gt_bbox'] eval_keys, eval_values, eval_cls = parse_fetches(fetches, eval_prog, extra_keys) # compile program for multi-devices build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_optimizer_ops = False # only enable sync_bn in multi GPU devices sync_bn = getattr(model.backbone, 'norm_type', None) == 'sync_bn' build_strategy.sync_batch_norm = sync_bn and devices_num > 1 \ and cfg.use_gpu exec_strategy = fluid.ExecutionStrategy() # iteration number when CompiledProgram tries to drop local execution scopes. 
# Set it to be 1 to save memory usages, so that unused variables in # local execution scopes can be deleted after each iteration. exec_strategy.num_iteration_per_drop_scope = 1 if FLAGS.dist: dist_utils.prepare_for_multi_process(exe, build_strategy, startup_prog, train_prog) exec_strategy.num_threads = 1 exe.run(startup_prog) compiled_train_prog = fluid.CompiledProgram(train_prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, exec_strategy=exec_strategy) if FLAGS.eval: compiled_eval_prog = fluid.CompiledProgram(eval_prog) fuse_bn = getattr(model.backbone, 'norm_type', None) == 'affine_channel' ignore_params = cfg.finetune_exclude_pretrained_params \ if 'finetune_exclude_pretrained_params' in cfg else [] start_iter = 0 if FLAGS.resume_checkpoint: checkpoint.load_checkpoint(exe, train_prog, FLAGS.resume_checkpoint) start_iter = checkpoint.global_step() elif cfg.pretrain_weights and fuse_bn and not ignore_params: checkpoint.load_and_fusebn(exe, train_prog, cfg.pretrain_weights) elif cfg.pretrain_weights: checkpoint.load_params( exe, train_prog, cfg.pretrain_weights, ignore_params=ignore_params) train_reader = create_reader( cfg.TrainReader, (cfg.max_iters - start_iter) * devices_num, cfg, devices_num=devices_num) train_loader.set_sample_list_generator(train_reader, place) # whether output bbox is normalized in model output layer is_bbox_normalized = False if hasattr(model, 'is_bbox_normalized') and \ callable(model.is_bbox_normalized): is_bbox_normalized = model.is_bbox_normalized() # if map_type not set, use default 11point, only use in VOC eval map_type = cfg.map_type if 'map_type' in cfg else '11point' train_stats = TrainingStats(cfg.log_smooth_window, train_keys) train_loader.start() start_time = time.time() end_time = time.time() cfg_name = os.path.basename(FLAGS.config).split('.')[0] save_dir = os.path.join(cfg.save_dir, cfg_name) time_stat = deque(maxlen=cfg.log_smooth_window) best_box_ap_list = [0.0, 0] #[map, iter] # use VisualDL to log data if FLAGS.use_vdl: from visualdl import LogWriter vdl_writer = LogWriter(FLAGS.vdl_log_dir) vdl_loss_step = 0 vdl_mAP_step = 0 for it in range(start_iter, cfg.max_iters): start_time = end_time end_time = time.time() time_stat.append(end_time - start_time) time_cost = np.mean(time_stat) eta_sec = (cfg.max_iters - it) * time_cost eta = str(datetime.timedelta(seconds=int(eta_sec))) outs = exe.run(compiled_train_prog, fetch_list=train_values) stats = {k: np.array(v).mean() for k, v in zip(train_keys, outs[:-1])} # use vdl-paddle to log loss if FLAGS.use_vdl: if it % cfg.log_iter == 0: for loss_name, loss_value in stats.items(): vdl_writer.add_scalar(loss_name, loss_value, vdl_loss_step) vdl_loss_step += 1 train_stats.update(stats) logs = train_stats.log() if it % cfg.log_iter == 0 and (not FLAGS.dist or trainer_id == 0): strs = 'iter: {}, lr: {:.6f}, {}, time: {:.3f}, eta: {}'.format( it, np.mean(outs[-1]), logs, time_cost, eta) logger.info(strs) # NOTE : profiler tools, used for benchmark if FLAGS.is_profiler and it == 5: profiler.start_profiler("All") elif FLAGS.is_profiler and it == 10: profiler.stop_profiler("total", FLAGS.profiler_path) return if (it > 0 and it % cfg.snapshot_iter == 0 or it == cfg.max_iters - 1) \ and (not FLAGS.dist or trainer_id == 0): save_name = str(it) if it != cfg.max_iters - 1 else "model_final" if 'use_ema' in cfg and cfg['use_ema']: exe.run(ema.apply_program) checkpoint.save(exe, train_prog, os.path.join(save_dir, save_name)) if FLAGS.eval: # evaluation resolution = None if 'Mask' in 
cfg.architecture: resolution = model.mask_head.resolution results = eval_run( exe, compiled_eval_prog, eval_loader, eval_keys, eval_values, eval_cls, cfg, resolution=resolution) box_ap_stats = eval_results( results, cfg.metric, cfg.num_classes, resolution, is_bbox_normalized, FLAGS.output_eval, map_type, cfg['EvalReader']['dataset']) # use vdl_paddle to log mAP if FLAGS.use_vdl: vdl_writer.add_scalar("mAP", box_ap_stats[0], vdl_mAP_step) vdl_mAP_step += 1 if box_ap_stats[0] > best_box_ap_list[0]: best_box_ap_list[0] = box_ap_stats[0] best_box_ap_list[1] = it checkpoint.save(exe, train_prog, os.path.join(save_dir, "best_model")) logger.info("Best test box ap: {}, in iter: {}".format( best_box_ap_list[0], best_box_ap_list[1])) if 'use_ema' in cfg and cfg['use_ema']: exe.run(ema.restore_program) train_loader.reset()
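# NOTE: every training loop in this file uses the same benchmark hook: start
# the profiler after a few warm-up iterations, stop it a fixed number of
# iterations later, dump the timeline to profiler_path and exit. A small
# helper sketch of that window (illustrative only, not part of the scripts
# above, which inline the two branches directly):
def maybe_profile(it, start_iter, stop_iter, profiler_path):
    """Return True when the caller should stop because the profile was dumped."""
    if it == start_iter:
        profiler.start_profiler("All")
    elif it == stop_iter:
        profiler.stop_profiler("total", profiler_path)
        return True
    return False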
def train(num_pass=300, use_cuda=False, mem_opt=False): dict_size = 100000 hash_size = 100000 print_iter = 100 eval_iter = 6000 batch_size = 1280 cpu_num = int(os.environ.get('CPU_NUM', multiprocessing.cpu_count())) debug = False fluid.default_startup_program().random_seed = 1 fluid.default_main_program().random_seed = 1 np.random.seed = 1 # construct network loss, pos_sim, train_program, test_program = net(hash_size=hash_size, dict_size=dict_size) # optimizer = fluid.optimizer.Adam(learning_rate=1e-4) # optimizer = fluid.optimizer.SGD(learning_rate=1e-4) # optimizer.minimize(loss) # memory optimize if mem_opt: fluid.memory_optimize(fluid.default_main_program()) for var in train_program.blocks[0].vars: # if "GRAD" not in var and not train_program.blocks[0].var(var).is_data: if not train_program.blocks[0].var(var).is_data: train_program.blocks[0].var(var).persistable = True print(var, train_program.blocks[0].var(var).persistable, train_program.blocks[0].var(var).shape) # initialize place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace() exe = fluid.Executor(place) exe.run(fluid.default_startup_program()) print('startup_program', fluid.default_startup_program()) print('train_program', train_program) # print('test_program', test_program) if debug: var_name_list = ( "cos_sim_1.tmp_0@GRAD", "fc_2.tmp_1@GRAD", "fc_2.tmp_0@GRAD", "softsign_2.tmp_0@GRAD", "reduce_sum_2.tmp_0@GRAD", "stack_2.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD@RENAME@0", "PyramidHash_emb_0@GRAD@RENAME@1", "PyramidHash_emb_0@GRAD@RENAME@2", "PyramidHash_emb_0@GRAD@RENAME@3", "PairwiseMarginLoss_0.tmp_0@GRAD", "cos_sim_1.tmp_0", "cos_sim_1.tmp_0@GRAD", "fc_2.tmp_1@GRAD", "fc_2.tmp_0@GRAD", "softsign_2.tmp_0@GRAD", "reduce_sum_2.tmp_0@GRAD", "stack_2.tmp_0@GRAD", "sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD", "FC_1@GRAD", "EmbeddingWithVSum_emb_0@GRAD", "fc_0.w_0@GRAD", "PairwiseMarginLoss_0.tmp_0", "PairwiseMarginLoss_0.tmp_1") # var_name_list = ("sequence_pool_23.tmp_0@GRAD", "embedding_23.tmp_0@GRAD", "PyramidHash_emb_0@GRAD@RENAME@0", "PyramidHash_emb_0@GRAD", "FC_1@GRAD", "EmbeddingWithVSum_emb_0@GRAD", "fc_0.w_0@GRAD", "PairwiseMarginLoss_0.tmp_0", "PairwiseMarginLoss_0.tmp_1") for name in var_name_list: train_program.blocks[0].var(name).persistable = True print('find var', name, train_program.blocks[0].var(name).persistable) # PE exec_strategy = fluid.ExecutionStrategy() exec_strategy.use_cuda = use_cuda exec_strategy.allow_op_delay = True exec_strategy.num_threads = 1 # exec_strategy.num_threads = int(os.environ.get('THREAD_NUM', 1)) * cpu_num - 1 # exec_strategy.num_threads = 25 exec_strategy.use_experimental_executor = True build_strategy = fluid.BuildStrategy() build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce # build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce # build_strategy.optimize_strategy = fluid.BuildStrategy.OptimizeStrategy.NoLock # pass_builder = build_strategy._create_passes_from_strategy() # pass_builder.insert_pass(0, "lock_free_optimize_pass") train_exe = fluid.ParallelExecutor(use_cuda=use_cuda, loss_name=None, main_program=train_program, build_strategy=build_strategy, exec_strategy=exec_strategy) test_exe = fluid.ParallelExecutor( use_cuda=use_cuda, main_program=test_program, share_vars_from=train_exe, ) # DataFeeder feed_var_names = [ 'query_basic', 'query_phrase', 'pos_title_basic', 'pos_title_phrase', 'neg_title_basic', 
'neg_title_phrase', 'label' ] feed_list = [ train_program.global_block().var(var_name) for var_name in feed_var_names ] feeder = fluid.DataFeeder(feed_list, place) # batch_train_reader = feeder.decorate_reader( # paddle.batch(reader.train_reader, batch_size=batch_size // cpu_num), # multi_devices=true) batch_train_reader = feeder.decorate_reader(paddle.batch( reader.train_reader, batch_size=1280), multi_devices=True) test_feed_var_names = [ 'query_basic', 'query_phrase', 'pos_title_basic', 'pos_title_phrase', 'neg_title_basic', 'neg_title_phrase' ] test_feed_list = [ train_program.global_block().var(var_name) for var_name in test_feed_var_names ] test_feeder = fluid.DataFeeder(test_feed_list, place) # train for epoch in six.moves.xrange(num_pass): count = 0 total_loss = .0 total_time = .0 read_data_start = time.time() for train_data in batch_train_reader(): read_data_end = time.time() # print('read data: ', read_data_end - read_data_start) if count == 1 and epoch >= 1: # if count % eval_iter == 0: print('start eval') t2 = time.time() # with open('./eval_log/train_mini_data_' + str(epoch) + '_' + str(count) + '_' + str(time.time()), 'w') as f: with open( './eval_res/z_' + paddle.version.commit + 'sgd_nolock_result_' + str(epoch) + '_' + str(time.time()), 'w') as f: test_batch_reader = paddle.batch( reader.test_reader, # batch_size=cpu_num * 128) batch_size=1280) for test_data in test_batch_reader(): qids = [] labels = [] data_list = [] for one_data in test_data: qids.append(one_data[0]) labels.append(int(one_data[-1][0])) data_list.append((one_data[1:-1])) predicts = test_exe.run( feed=test_feeder.feed(data_list), fetch_list=[pos_sim.name]) scores = np.array(predicts[0]) for qid, label, score in six.moves.zip( qids, labels, scores): f.write( str(qid) + '\t' + str(score[0]) + '\t' + str(label) + '\n') print('end eval', time.time() - t2) start = time.time() if epoch == 0 and count == 5: profiler.start_profiler("CPU") elif epoch == 0 and count == 10: profiler.stop_profiler("total", "/paddle/Pyramid_DNN/fluid/profile") t1 = time.time() cost = train_exe.run(feed=train_data, fetch_list=[]) total_time += time.time() - t1 # total_loss += np.array(cost[0]).mean() count += 1 if debug: for name in var_name_list: var = np.array( fluid.executor._fetch_var(name, return_numpy=False)) if name == "PyramidHash_emb_0@GRAD@RENAME@0": print('fetch var', name, var) print('check not zero', name, np.count_nonzero(var)) print('fetch var', name, var) print('check nan var', name, np.isnan(var).any()) print('check inf var', name, np.isinf(var).any()) if count % print_iter == 0: print('epoch: %d, batch_id: %d, avg_cost: %s, avg_time: %f' % (epoch, count, total_loss / print_iter, float(total_time) / print_iter)) import sys sys.stdout.flush() total_time = .0 total_loss = .0 read_data_start = time.time()
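# NOTE: when running on CPU, ParallelExecutor derives the number of device
# places from the CPU_NUM environment variable, which train() above also
# reads to size its reader. A minimal usage sketch (the entry point and
# argument values below are illustrative, not part of the original script):
if __name__ == '__main__':
    # e.g. export CPU_NUM=8 so that ParallelExecutor creates 8 CPU places
    os.environ.setdefault('CPU_NUM', str(multiprocessing.cpu_count()))
    train(num_pass=300, use_cuda=False, mem_opt=False)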