def check_network_convergence(self, use_cuda, build_strategy=None):
    os.environ['CPU_NUM'] = str(4)
    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        loss = simple_fc_net()
        test_program = main.clone(for_test=True)

        opt = fluid.optimizer.SGD(learning_rate=0.001)
        opt.minimize(loss)

    batch_size = 32
    image = np.random.normal(size=(batch_size, 784)).astype('float32')
    label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)
    feed_dict = {'image': image, 'label': label}

    train_cp = compiler.CompiledProgram(main).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)
    # the test program shares parameters with the train program
    test_cp = compiler.CompiledProgram(test_program).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        share_vars_from=train_cp)

    for i in range(5):
        _ = exe.run(train_cp, fetch_list=[loss.name], feed=feed_dict)
        test_loss, = exe.run(test_cp, fetch_list=[loss.name], feed=feed_dict)
        train_loss, = exe.run(train_cp, fetch_list=[loss.name], feed=feed_dict)

        avg_test_loss_val = np.array(test_loss).mean()
        if math.isnan(float(avg_test_loss_val)):
            sys.exit("got NaN loss, testing failed.")

        avg_train_loss_val = np.array(train_loss).mean()
        if math.isnan(float(avg_train_loss_val)):
            sys.exit("got NaN loss, training failed.")

        self.assertTrue(
            np.allclose(train_loss, test_loss, atol=1e-8),
            "Train loss: " + str(train_loss) + "\n Test loss:" +
            str(test_loss))
def main(self,
         network_func,
         iter=10,
         iter_per_pe=10,
         use_gpu=True,
         use_experimental_executor=False):
    if use_gpu and not fluid.core.is_compiled_with_cuda():
        logging.warning(
            "Paddle is not compiled with CUDA, skip GPU unittests")
        return

    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    scope = fluid.Scope()
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.scope_guard(scope):
            loss = network_func()
            exe = fluid.Executor(
                fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace())
            exe.run(startup_prog)

            exe_strategy = fluid.ExecutionStrategy()
            exe_strategy._dry_run = True
            exe_strategy.use_experimental_executor = use_experimental_executor
            train_cp = compiler.CompiledProgram(
                main_prog).with_data_parallel(
                    loss_name=loss.name, exec_strategy=exe_strategy)
            for _ in six.moves.xrange(iter):
                for _ in six.moves.xrange(iter_per_pe):
                    exe.run(train_cp)
def _feed_data_in_executor(self, in_size, label_size, feed_in_data,
                           feed_label, use_cuda, use_parallel_executor):
    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        in_data, label, loss = self._simple_fc_net(
            in_size, label_size, self.class_num, self.hidden_sizes)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_program)

    train_program = main_program
    if use_parallel_executor:
        train_program = compiler.CompiledProgram(
            main_program).with_data_parallel(loss_name=loss.name)

    for i in range(self.iterations):
        fetches = exe.run(train_program,
                          feed={
                              in_data.name: feed_in_data,
                              label.name: feed_label
                          },
                          fetch_list=[loss.name])
def parallel_exe(self, use_cuda, run_parallel_exe, seed=1):
    main_program = fluid.Program()
    startup = fluid.Program()
    startup.random_seed = seed
    with fluid.program_guard(main_program, startup):
        data = fluid.layers.data(
            name='image', shape=[3, 224, 224], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        out = Lenet(data, class_dim=102)
        loss = fluid.layers.cross_entropy(input=out, label=label)
        loss = fluid.layers.mean(loss)
        opt = fluid.optimizer.Momentum(
            learning_rate=0.1,
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
        opt.minimize(loss)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    # FIXME: force disable enable_inplace and memory_optimize to pass the
    # unittest
    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False
    train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

    run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
def compare(self, place, layout, only_forward, activation, alpha, use_cuda):
    seed = 10
    os.environ['FLAGS_cudnn_deterministic'] = "1"
    data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2

    fetch_outs = []
    fetch_names = []
    for inplace in [False, True]:
        main, startup, outs = self.build_program(
            place,
            layout,
            seed,
            only_forward,
            activation,
            alpha,
            inplace=inplace)
        exe = fluid.Executor(place)
        exe.run(startup)

        fetch_name = [v.name for v in outs] + [
            'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
        ]
        if not only_forward:
            others = [
                'inplace_abn_0.tmp_0' if inplace else 'batch_norm_0.tmp_0',
                'inplace_abn_0.tmp_1' if inplace else 'batch_norm_0.tmp_1',
                'bn_scale@GRAD',
                'bn_bias@GRAD',
                'input@GRAD',
            ]
            fetch_name += others
        for nm in fetch_name:
            fv = fluid.framework._get_var(str(nm), program=main)
            fv.persistable = True

        build_strategy = fluid.BuildStrategy()
        build_strategy.sync_batch_norm = use_cuda and \
            fluid.core.get_cuda_device_count() > 1
        build_strategy.enable_inplace = inplace
        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.num_threads = 1 if os.name == 'nt' else 0
        comp_prog1 = compiler.CompiledProgram(main).with_data_parallel(
            outs[0].name if not only_forward else None,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
        # run the compiled program; running the raw `main` would silently
        # ignore the inplace build strategy configured above
        bn_fetches = exe.run(program=comp_prog1,
                             feed={'input': data},
                             fetch_list=fetch_name)
        fetch_outs.append(bn_fetches)
        fetch_names.append(fetch_name)

    for bn_val, inplace_abn_val, name1, name2 in zip(*(fetch_outs +
                                                       fetch_names)):
        self.assertTrue(
            np.allclose(bn_val, inplace_abn_val, atol=1e-2),
            "Output (" + name1 + ":" + name2 +
            ") has diff on {} with {} layout and {} activation. \n".format(
                place, layout, activation) + "\nBN " + str(bn_val) + "\n" +
            "Inplace ABN " + str(inplace_abn_val))
def init_infer_program(self):
    # define the inferer
    self.infer_program = fluid.Program()
    startup_prog = fluid.Program()

    # prepare the network
    with fluid.program_guard(self.infer_program, startup_prog):
        with fluid.unique_name.guard():
            self.infer_feeder, self.infer_log_probs, _ = \
                self.create_network(is_infer=True)

    self.infer_program = self.infer_program.clone(for_test=True)
    self.infer_exe = fluid.Executor(self._place)
    self.infer_exe.run(startup_prog)

    # init params from the pretrained model
    if not self._init_from_pretrained_model:
        exit("The pretrained model file does not exist!")
    self.init_from_pretrained_model(self.infer_exe, self.infer_program)

    # support multi-card inference
    build_strategy = compiler.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()
    self.infer_compiled_prog = compiler.CompiledProgram(
        self.infer_program).with_data_parallel(
            build_strategy=build_strategy, exec_strategy=exec_strategy)
def train(exe, train_program, train_out, test_program, test_out, args):
    loss, acc, global_lr, train_reader = train_out
    fetch_list_train = [loss.name, acc.name, global_lr.name]

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_all_optimizer_ops = True
    compiled_prog = compiler.CompiledProgram(
        train_program).with_data_parallel(
            loss_name=loss.name, build_strategy=build_strategy)

    best_ave = 0
    for epoch_id in range(args.start_epoch, args.total_epoch):
        for batch_id, data in enumerate(train_reader()):
            loss, acc, global_lr = exe.run(compiled_prog,
                                           feed=data,
                                           fetch_list=fetch_list_train)
            avg_loss = np.mean(np.array(loss))
            avg_acc = np.mean(np.array(acc))
            print(
                '{} Epoch: {:^4d} step: {:^4d} loss: {:.6f}, acc: {:.6f}, lr: {}'.
                format(now(), epoch_id, batch_id, avg_loss, avg_acc,
                       float(np.mean(np.array(global_lr)))))

            if batch_id % args.save_frequency == 0:
                model_path = os.path.join(args.save_ckpt, str(epoch_id))
                fluid.io.save_persistables(
                    executor=exe,
                    dirname=model_path,
                    main_program=train_program)

                temp_ave = test(exe, test_program, test_out, args)
                if temp_ave > best_ave:
                    best_ave = temp_ave
                    print('Best AVE: {}'.format(best_ave))
                    out_feature, test_reader, flods, flags = test_out
                    fluid.io.save_inference_model(
                        executor=exe,
                        dirname='./out_inference',
                        feeded_var_names=['image_test'],
                        target_vars=[out_feature],
                        main_program=test_program)
def run_parallel_exe(self,
                     place,
                     feed_list,
                     loss,
                     use_reduce=False,
                     use_fast_executor=False,
                     use_ir_memory_optimize=False):
    exe = fluid.Executor(place)
    feeder = fluid.DataFeeder(feed_list=feed_list, place=place)
    exe.run(fluid.default_startup_program())

    exec_strategy = fluid.ExecutionStrategy()
    if use_fast_executor:
        exec_strategy.use_experimental_executor = True

    build_strategy = fluid.BuildStrategy()
    build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce \
        if use_reduce else fluid.BuildStrategy.ReduceStrategy.AllReduce
    build_strategy.memory_optimize = use_ir_memory_optimize

    train_cp = compiler.CompiledProgram(
        fluid.default_main_program()).with_data_parallel(
            loss_name=loss.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)

    loss_set = []
    for data in self.train_data:
        out = exe.run(train_cp,
                      feed=feeder.feed(data),
                      fetch_list=[loss.name])
        loss_set.append(np.average(out))

    return loss_set
def train(network, use_cuda, use_parallel_executor, batch_size=32,
          pass_num=2):
    if use_cuda and not core.is_compiled_with_cuda():
        print('Skip use_cuda=True because Paddle is not compiled with cuda')
        return

    if use_parallel_executor and os.name == 'nt':
        print(
            'Skip use_parallel_executor=True because Paddle comes without '
            'parallel support on windows')
        return

    word_dict_size = 5147
    reader = fake_imdb_reader(word_dict_size, batch_size * 40)
    train_reader = paddle.batch(reader, batch_size=batch_size)

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    cost = network(data, label, word_dict_size)
    cost.persistable = True
    optimizer = fluid.optimizer.Adagrad(learning_rate=0.2)
    optimizer.minimize(cost)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    reader = feeder.decorate_reader(
        train_reader, multi_devices=use_parallel_executor)

    exe = fluid.Executor(place)
    fluid.default_startup_program().random_seed = 1
    fluid.default_main_program().random_seed = 1
    exe.run(fluid.default_startup_program())

    train_cp = fluid.default_main_program()
    if use_parallel_executor:
        train_cp = compiler.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(
                loss_name=cost.name)
        fetch_list = [cost.name]
    else:
        fetch_list = [cost]

    for pass_id in six.moves.xrange(pass_num):
        batch_id = 0
        for data in reader():
            exe.run(train_cp,
                    feed=data,
                    fetch_list=fetch_list if batch_id % 4 == 0 else [])
            batch_id += 1
            if batch_id > 16:
                break
def _try_to_compile(self, startup_program, main_program):
    node_num = self._node_num()
    assert node_num >= 1, "nccl2 node_num must >= 1, now:{}".format(node_num)

    self._strategy.fuse_all_reduce_ops = True
    exec_strategy = self._strategy.exec_strategy

    if node_num <= 1:
        if self._strategy.nccl_comm_num > 1:
            logging.warn("set nccl_comm_num=1 since you only have 1 node.")
        self._strategy.nccl_comm_num = 1

        if self._strategy.use_hierarchical_allreduce:
            logging.warn(
                "set use_hierarchical_allreduce=False since you only have "
                "1 node.")
        self._strategy.use_hierarchical_allreduce = False

    sync_allreduce = os.getenv("FLAGS_sync_nccl_allreduce")
    if sync_allreduce is None or sync_allreduce == "1":
        exec_strategy.num_threads = self._strategy.nccl_comm_num + 1
        if self._strategy.use_hierarchical_allreduce:
            exec_strategy.num_threads = 2 * self._strategy.nccl_comm_num + 1
        if exec_strategy.num_threads > 4:
            logging.warn(
                "if you use use_hierarchical_allreduce or "
                "with multi nccl comm, please export "
                "FLAGS_sync_nccl_allreduce = 0")

    if self.print_config:
        print("node_num:", node_num, "num_threads:",
              exec_strategy.num_threads, "use_hierarchical_allreduce:",
              self._strategy.use_hierarchical_allreduce, "nccl_comm_num:",
              self._strategy.nccl_comm_num, "FLAGS_sync_nccl_allreduce:",
              sync_allreduce)

    self._transpile(startup_program, main_program)

    if self._strategy.mode == "collective":
        return main_program

    self._strategy.num_trainers = fleet.worker_num()
    self._strategy.trainer_id = fleet.worker_index()
    self._strategy.trainers_endpoints = fleet.worker_endpoints()
    self._strategy.enable_backward_optimizer_op_deps = True

    self._compiled_program = compiler.CompiledProgram(main_program)
    self._compiled_program.with_data_parallel(
        loss_name=self._loss.name,
        build_strategy=self._strategy,
        exec_strategy=self._strategy.exec_strategy,
        share_vars_from=None)

    return self._compiled_program
def _try_to_compile(self, main_program, loss):
    dist_strategy = self._get_distributed_strategy()

    build_strategy = dist_strategy.get_build_strategy()
    exec_strategy = dist_strategy.get_execute_strategy()

    self._compiled_program = compiler.CompiledProgram(main_program)
    self._compiled_program.with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy,
        share_vars_from=None)

    return self._compiled_program
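# The two _try_to_compile variants above reduce to the same core pattern:
# compile the program once, then hand the CompiledProgram to Executor.run in
# place of the raw Program. A minimal sketch of that shared pattern follows;
# the helper name and its arguments are illustrative, not part of any fleet
# API, and both strategies may be None, in which case with_data_parallel
# falls back to its defaults.
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler


def compile_for_data_parallel(main_program, loss, build_strategy=None,
                              exec_strategy=None):
    # compile once; the returned object is reused for every subsequent run
    return compiler.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy,
        share_vars_from=None)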
def main(self, with_double_buffer):
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        image = fluid.layers.data(
            name='image', shape=self.ins_shape, dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        data_reader_handle = fluid.io.PyReader(
            feed_list=[image, label],
            capacity=16,
            iterable=False,
            use_double_buffer=with_double_buffer)
        fetch_list = [image.name, label.name]

    place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    data_reader_handle.decorate_sample_list_generator(
        paddle.batch(self.prepare_data(), batch_size=self.batch_size))
    train_cp = compiler.CompiledProgram(main_prog).with_data_parallel(
        places=[place])

    batch_id = 0
    pass_count = 0
    while pass_count < self.test_pass_num:
        data_reader_handle.start()
        try:
            while True:
                data_val, label_val = exe.run(train_cp,
                                              fetch_list=fetch_list,
                                              return_numpy=True)
                ins_num = data_val.shape[0]
                broadcasted_label = np.ones(
                    (ins_num, ) + tuple(self.ins_shape)) * label_val.reshape(
                        (ins_num, 1))
                self.assertEqual(data_val.all(), broadcasted_label.all())
                batch_id += 1
        except fluid.core.EOFException:
            data_reader_handle.reset()
            pass_count += 1

    self.assertEqual(pass_count * self.batch_num, batch_id)
    self.assertEqual(pass_count, self.test_pass_num)
def test_main(use_cuda, use_py_func_op, use_parallel_executor):
    if use_cuda and not fluid.core.is_compiled_with_cuda():
        return None

    with fluid.program_guard(fluid.Program(), fluid.Program()):
        with fluid.scope_guard(fluid.core.Scope()):
            fluid.default_main_program().random_seed = 1
            fluid.default_startup_program().random_seed = 1
            np.random.seed(1)

            img = fluid.layers.data(
                name='image', shape=[784], dtype='float32')
            label = fluid.layers.data(name='label', shape=[1], dtype='int64')
            loss = simple_fc_net(img, label, use_py_func_op)
            optimizer = fluid.optimizer.SGD(learning_rate=1e-3)
            optimizer.minimize(loss)

            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
            feeder = fluid.DataFeeder(feed_list=[img, label], place=place)
            r = paddle.batch(reader, batch_size=10)

            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())

            # FIXME: force the old memory-optimize strategy here to pass the
            # unittest, since enabling the new strategy crashes the unittest
            fluid.memory_optimize(fluid.default_main_program())

            train_cp = compiler.CompiledProgram(fluid.default_main_program())
            if use_parallel_executor:
                train_cp = train_cp.with_data_parallel(loss_name=loss.name)
                fetch_list = [loss.name]
            else:
                fetch_list = [loss]

            ret = []
            for epoch_id in six.moves.range(2):
                for d in r():
                    L, = exe.run(train_cp,
                                 feed=feeder.feed(d),
                                 fetch_list=fetch_list)
                    ret.append(L)
            return np.array(ret)
def main(self, with_double_buffer):
    main_prog = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(main_prog, startup_prog):
        data_reader_handle = fluid.layers.io.open_files(
            filenames=[self.data_file_name],
            shapes=[[-1] + self.ins_shape, [-1, 1]],
            lod_levels=[0, 0],
            dtypes=['float32', 'int64'],
            thread_num=1,
            pass_num=1)
        data_reader = fluid.layers.io.batch(data_reader_handle,
                                            self.batch_size)
        if with_double_buffer:
            data_reader = fluid.layers.double_buffer(data_reader)
        image, label = fluid.layers.read_file(data_reader)
        fetch_list = [image.name, label.name]

    place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup_prog)

    train_cp = compiler.CompiledProgram(main_prog).with_data_parallel()

    pass_count = 0
    while True:
        try:
            data_val, label_val = exe.run(train_cp,
                                          fetch_list=fetch_list,
                                          return_numpy=True)
            ins_num = data_val.shape[0]
            broadcasted_label = np.ones(
                (ins_num, ) + tuple(self.ins_shape)) * label_val.reshape(
                    (ins_num, 1))
            self.assertEqual(data_val.all(), broadcasted_label.all())
        except fluid.core.EOFException:
            pass_count += 1
            if pass_count < self.test_pass_num:
                data_reader_handle.reset()
            else:
                break
def check_pass_conflict(cls,
                        method,
                        use_device=DeviceType.CUDA,
                        feed_dict=None,
                        get_data_from_feeder=None,
                        use_reduce=False,
                        use_ir_memory_optimize=True,
                        enable_inplace=True,
                        fuse_elewise_add_act_ops=False,
                        fuse_all_optimizer_ops=False,
                        fuse_all_reduce_ops=False,
                        fuse_relu_depthwise_conv=False,
                        optimizer=fluid.optimizer.Adam,
                        use_fast_executor=True,
                        enable_sequential_execution=False):
    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
                                          main, method, optimizer)

    place = fluid.CUDAPlace(0) if use_device == DeviceType.CUDA \
        else fluid.XPUPlace(0) if use_device == DeviceType.XPU \
        else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    build_strategy, exec_strategy = cls.set_strategy(
        enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops,
        fuse_all_reduce_ops, fuse_elewise_add_act_ops,
        fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize,
        use_reduce, use_device)

    binary = compiler.CompiledProgram(main).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    exe.run(binary, feed=feed_dict, fetch_list=[loss.name])
def parallel_exe(self,
                 use_cuda,
                 run_parallel_exe,
                 use_faster_executor=False,
                 num_threads=4,
                 seed=1):
    main_program = fluid.Program()
    startup = fluid.Program()
    startup.random_seed = seed
    with fluid.program_guard(main_program, startup):
        data = fluid.layers.data(
            name='image', shape=[3, 224, 224], dtype='float32')
        label = fluid.layers.data(name='label', shape=[1], dtype='int64')
        out = Lenet(data, class_dim=102)
        loss = fluid.layers.cross_entropy(input=out, label=label)
        loss = fluid.layers.mean(loss)
        opt = fluid.optimizer.Momentum(
            learning_rate=0.1,
            momentum=0.9,
            regularization=fluid.regularizer.L2Decay(1e-4))
        opt.minimize(loss)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    build_strategy = fluid.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.use_experimental_executor = use_faster_executor
    exec_strategy.num_threads = num_threads
    train_cp = compiler.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name,
        build_strategy=build_strategy,
        exec_strategy=exec_strategy)

    run_parallel_exe(train_cp, exe, use_cuda, data, label, loss)
def train(self,
          train_batch_reader,
          dev_batch_reader,
          learning_rate,
          gradient_clipping,
          num_epoch,
          batch_size,
          num_samples,
          test_off=False):
    """Train the model.

    :param train_batch_reader: Train data reader.
    :type train_batch_reader: callable
    :param dev_batch_reader: Validation data reader.
    :type dev_batch_reader: callable
    :param learning_rate: Learning rate for ADAM optimizer.
    :type learning_rate: float
    :param gradient_clipping: Gradient clipping threshold.
    :type gradient_clipping: float
    :param num_epoch: Number of training epochs.
    :type num_epoch: int
    :param batch_size: Number of batch size.
    :type batch_size: int
    :param num_samples: The num of train samples.
    :type num_samples: int
    :param test_off: Turn off testing.
    :type test_off: bool
    """
    # prepare model output directory
    if not os.path.exists(self._output_model_dir):
        mkpath(self._output_model_dir)

    if isinstance(self._place, fluid.CUDAPlace):
        dev_count = fluid.core.get_cuda_device_count()
        learning_rate = learning_rate * dev_count
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))

    # prepare the network
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_reader, _, ctc_loss = self.create_network()
            # learning rate with exponential decay
            learning_rate = fluid.layers.exponential_decay(
                learning_rate=learning_rate,
                decay_steps=num_samples / batch_size / dev_count,
                decay_rate=0.83,
                staircase=True)
            # prepare the optimizer
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=learning_rate,
                regularization=fluid.regularizer.L2Decay(0.0001),
                grad_clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=gradient_clipping))
            optimizer.minimize(loss=ctc_loss)

    exe = fluid.Executor(self._place)
    exe.run(startup_prog)

    # init from some pretrained models, to better solve the current task
    pre_epoch = 0
    if self._init_from_pretrained_model:
        pre_epoch = self.init_from_pretrained_model(exe, train_program)

    build_strategy = compiler.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()

    # pass the build_strategy to the with_data_parallel API
    train_compiled_prog = compiler.CompiledProgram(
        train_program).with_data_parallel(
            loss_name=ctc_loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

    train_reader.set_batch_generator(train_batch_reader)

    train_step = 0
    test_step = 0
    num_batch = -1
    # run train
    for epoch_id in range(num_epoch):
        train_reader.start()
        epoch_loss = []
        time_begin = time.time()
        batch_id = 0
        while True:
            try:
                fetch_list = [ctc_loss.name, learning_rate.name]
                if batch_id % 100 == 0:
                    fetch = exe.run(program=train_compiled_prog,
                                    fetch_list=fetch_list,
                                    return_numpy=False)
                    each_loss = fetch[0]
                    each_learning_rate = np.array(fetch[1])[0]
                    epoch_loss.extend(np.array(each_loss[0]) / batch_size)

                    print("Train [%s] epoch: [%d/%d], batch: [%d/%d], "
                          "learning rate: %f, train loss: %f\n" %
                          (datetime.now(), epoch_id, num_epoch, batch_id,
                           num_batch, each_learning_rate,
                           np.mean(each_loss[0]) / batch_size))
                    # log the training loss
                    self.writer.add_scalar(
                        'Train loss',
                        np.mean(each_loss[0]) / batch_size, train_step)
                    self.writer.add_scalar('Learning rate',
                                           each_learning_rate, train_step)
                    train_step += 1
                else:
                    _ = exe.run(program=train_compiled_prog,
                                fetch_list=[],
                                return_numpy=False)
                # save the model every 2000 batches
                if batch_id % 2000 == 0 and batch_id != 0:
                    self.save_param(exe, train_program,
                                    "epoch_" + str(epoch_id + pre_epoch))
                batch_id = batch_id + 1
            except fluid.core.EOFException:
                train_reader.reset()
                break
        num_batch = batch_id
        # save the model once per epoch
        self.save_param(exe, train_program,
                        "epoch_" + str(epoch_id + pre_epoch))

        used_time = time.time() - time_begin
        if test_off:
            print('======================last Train=====================')
            print("Train time: %f sec, epoch: %d, train loss: %f\n" %
                  (used_time, epoch_id, np.mean(np.array(epoch_loss))))
            print('======================last Train=====================')
        else:
            print('\n======================Begin test=====================')
            # point the restore path at the model that was just saved
            self._init_from_pretrained_model = self.save_model_path
            # run the test
            test_result = self.test(test_reader=dev_batch_reader)
            print("Train time: %f sec, epoch: %d, train loss: %f, "
                  "test %s: %f" %
                  (used_time, epoch_id + pre_epoch,
                   np.mean(np.array(epoch_loss)), self.error_rate_type,
                   test_result))
            print('======================Stop Train=====================\n')
            # log the test result
            self.writer.add_scalar('Test %s' % self.error_rate_type,
                                   test_result, test_step)
            test_step += 1

    self.save_param(exe, train_program, "step_final")
    print("\n------------Training finished!!!-------------")
def run_trainer(self, args):
    self.lr = args.lr
    if args.nccl2_reduce_layer_local_run:
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, single_device=True)
    elif args.use_dgc:
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size, use_dgc=args.use_dgc)
    else:
        test_program, avg_cost, train_reader, test_reader, batch_acc, predict = \
            self.get_model(batch_size=args.batch_size)

    if args.update_method == "pserver":
        print_to_err(
            type(self).__name__,
            "begin to run transpile on trainer with pserver mode")
        t = self.get_transpiler(
            trainer_id=args.trainer_id,
            main_program=fluid.default_main_program(),
            pserver_endpoints=args.endpoints,
            trainers=args.trainers,
            sync_mode=args.sync_mode,
            dc_asgd=args.dc_asgd,
            hogwild_mode=args.hogwild)
        trainer_prog = t.get_trainer_program()
        print_to_err(
            type(self).__name__,
            "get trainer program done with pserver mode.")
    elif args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
        # transpile for nccl2
        config = fluid.DistributeTranspilerConfig()
        config.mode = "nccl2"
        config.nccl_comm_num = args.nccl_comm_num
        if args.use_hallreduce:
            config.use_hierarchical_allreduce = True
            config.hierarchical_allreduce_inter_nranks = \
                args.hallreduce_inter_nranks
        print_to_err(
            type(self).__name__,
            "begin to run transpile on trainer with nccl2 mode")
        nccl2_t = fluid.DistributeTranspiler(config=config)
        nccl2_t.transpile(
            args.trainer_id,
            program=fluid.default_main_program(),
            startup_program=fluid.default_startup_program(),
            trainers=args.endpoints,
            current_endpoint=args.current_endpoint)
        print_to_err(
            type(self).__name__, "get trainer program done. with nccl2 mode")
        trainer_prog = fluid.default_main_program()
    else:
        print_to_err(
            type(self).__name__,
            "do nothing about main program, just use it")
        trainer_prog = fluid.default_main_program()
        print_to_err(type(self).__name__, "use main program done.")

    # FIXME(gongwb): wait pserver initialization.
    time.sleep(1)

    if args.use_cuda:
        device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
        place = fluid.CUDAPlace(device_id)
    else:
        place = fluid.CPUPlace()

    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())
    print_to_err(type(self).__name__, "run worker startup program done.")

    exec_strategy = fluid.ExecutionStrategy()
    exec_strategy.num_threads = 1

    build_stra = fluid.BuildStrategy()
    # FIXME: force disable enable_inplace and memory_optimize
    build_stra.enable_inplace = False
    build_stra.memory_optimize = False

    if args.hogwild:
        build_stra.async_mode = True

    if args.enable_backward_deps:
        build_stra.enable_backward_optimizer_op_deps = True

    if args.use_reduce:
        build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
    else:
        build_stra.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.AllReduce

    pass_builder = None
    if args.batch_merge_repeat > 1:
        pass_builder = build_stra._finalize_strategy_and_create_passes()
        mypass = pass_builder.insert_pass(0, "multi_batch_merge_pass")
        mypass.set("num_repeats", args.batch_merge_repeat)

    if args.update_method == "nccl2" or args.update_method == "nccl2_reduce_layer":
        build_stra.num_trainers = len(args.endpoints.split(","))
        build_stra.trainer_id = args.trainer_id
    else:
        # case args.update_method == "nccl2_reduce_layer":
        build_stra.num_trainers = 1
        build_stra.trainer_id = 0

    if args.use_dgc:
        # fuse_all_reduce_ops requires that gradients are not sparse types
        build_stra.fuse_all_reduce_ops = False

    print_to_err(type(self).__name__, "begin to compile with data parallel")
    binary = compiler.CompiledProgram(trainer_prog).with_data_parallel(
        loss_name=avg_cost.name,
        build_strategy=build_stra,
        exec_strategy=exec_strategy)
    print_to_err(type(self).__name__, "program compiled with data parallel")

    feed_var_list = [
        var for var in trainer_prog.global_block().vars.values()
        if var.is_data
    ]

    feeder = fluid.DataFeeder(feed_var_list, place)
    reader_generator = train_reader()

    def get_data():
        origin_batch = next(reader_generator)
        if args.update_method != "local" and args.use_reader_alloc:
            new_batch = []
            for offset, item in enumerate(origin_batch):
                if offset % 2 == args.trainer_id:
                    new_batch.append(item)
            return new_batch
        else:
            return origin_batch

    print_to_err(type(self).__name__, "begin to train on trainer")
    out_losses = []
    for i in six.moves.xrange(RUN_STEP):
        loss, = exe.run(binary,
                        fetch_list=[avg_cost.name],
                        feed=feeder.feed(get_data()))
        out_losses.append(loss[0])
        print_to_err(type(self).__name__, "run step %d finished" % i)
    print_to_err(type(self).__name__, "trainer run finished")

    print_to_out(out_losses)
data = fluid.data(name="char", shape=[None, 50], dtype="int64", lod_level=0) #data = fluid.data(name="char", shape=[None, 50], dtype="float32", lod_level=0) label = fluid.data(name="label", shape=[None, 1], dtype="int64", lod_level=0) reader = fluid.io.PyReader(feed_list=[data, label], capacity=40, iterable=True, return_list=False) reader.decorate_sample_list_generator(train_reader, place) emb = fluid.embedding(data, size=[10, 64]) prob = fluid.layers.fc(emb, size=2, act='softmax') #prob = fluid.layers.fc(data, size=2, act='softmax') ce = fluid.layers.cross_entropy(prob, label) loss = fluid.layers.mean(ce) exe = fluid.Executor(place[0]) fluid.optimizer.SGD(learning_rate=0.01).minimize(loss) exe.run(fluid.default_startup_program()) build_strategy = fluid.BuildStrategy() build_strategy.fuse_all_reduce_ops = False compiled_train_prog = compiler.CompiledProgram( fluid.default_main_program()).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy) for data in reader(): loss_data = exe.run(compiled_train_prog, feed=data, fetch_list=[loss.name]) break print(loss_data)
def train(self,
          train_batch_reader,
          dev_batch_reader,
          feeding_dict,
          learning_rate,
          gradient_clipping,
          num_epoch,
          batch_size,
          num_samples,
          save_epoch=100,
          num_iterations_print=100,
          test_off=False):
    """Train the model.

    :param train_batch_reader: Train data reader.
    :type train_batch_reader: callable
    :param dev_batch_reader: Validation data reader.
    :type dev_batch_reader: callable
    :param feeding_dict: Feeding is a map of field name and tuple index
                         of the data that reader returns.
    :type feeding_dict: dict|list
    :param learning_rate: Learning rate for ADAM optimizer.
    :type learning_rate: float
    :param gradient_clipping: Gradient clipping threshold.
    :type gradient_clipping: float
    :param num_epoch: Number of training epochs.
    :type num_epoch: int
    :param batch_size: Number of batch size.
    :type batch_size: int
    :param num_samples: The num of train samples.
    :type num_samples: int
    :param save_epoch: Number of training iterations for save checkpoint
                       and params.
    :type save_epoch: int
    :param num_iterations_print: Number of training iterations for printing
                                 a training loss.
    :type num_iterations_print: int
    :param test_off: Turn off testing.
    :type test_off: bool
    """
    # prepare model output directory
    if not os.path.exists(self._output_model_dir):
        mkpath(self._output_model_dir)

    # adapt the feeding dict according to the network
    adapted_feeding_dict = self._adapt_feeding_dict(feeding_dict)

    if isinstance(self._place, fluid.CUDAPlace):
        dev_count = fluid.core.get_cuda_device_count()
    else:
        dev_count = int(os.environ.get('CPU_NUM', 1))

    # prepare the network
    train_program = fluid.Program()
    startup_prog = fluid.Program()
    with fluid.program_guard(train_program, startup_prog):
        with fluid.unique_name.guard():
            train_reader, log_probs, ctc_loss = self.create_network()
            # prepare optimizer
            optimizer = fluid.optimizer.AdamOptimizer(
                learning_rate=fluid.layers.exponential_decay(
                    learning_rate=learning_rate,
                    decay_steps=num_samples / batch_size / dev_count,
                    decay_rate=0.83,
                    staircase=True))
            fluid.clip.set_gradient_clip(
                clip=fluid.clip.GradientClipByGlobalNorm(
                    clip_norm=gradient_clipping))
            optimizer.minimize(loss=ctc_loss)

    test_prog = fluid.Program()
    with fluid.program_guard(test_prog, startup_prog):
        with fluid.unique_name.guard():
            test_reader, _, ctc_loss = self.create_network()
    test_prog = test_prog.clone(for_test=True)

    exe = fluid.Executor(self._place)
    exe.run(startup_prog)

    # init from some pretrained models, to better solve the current task
    pre_epoch = 0
    if self._init_from_pretrained_model:
        pre_epoch = self.init_from_pretrained_model(exe, train_program)

    build_strategy = compiler.BuildStrategy()
    exec_strategy = fluid.ExecutionStrategy()

    # pass the build_strategy to the with_data_parallel API
    compiled_prog = compiler.CompiledProgram(
        train_program).with_data_parallel(
            loss_name=ctc_loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)

    train_reader.set_batch_generator(train_batch_reader)
    test_reader.set_batch_generator(dev_batch_reader)

    # run train
    for epoch_id in range(num_epoch):
        train_reader.start()
        epoch_loss = []
        time_begin = time.time()
        batch_id = 0
        step = 0
        while True:
            try:
                fetch_list = [ctc_loss.name]
                if batch_id % num_iterations_print == 0:
                    fetch = exe.run(program=compiled_prog,
                                    fetch_list=fetch_list,
                                    return_numpy=False)
                    each_loss = fetch[0]
                    epoch_loss.extend(np.array(each_loss[0]) / batch_size)
                    print("epoch: %d, batch: %d, train loss: %f\n" %
                          (epoch_id, batch_id,
                           np.mean(each_loss[0]) / batch_size))
                else:
                    each_loss = exe.run(program=compiled_prog,
                                        fetch_list=[],
                                        return_numpy=False)
                batch_id = batch_id + 1
            except fluid.core.EOFException:
                train_reader.reset()
                break
        time_end = time.time()
        used_time = time_end - time_begin

        if test_off:
            print("\n--------Time: %f sec, epoch: %d, train loss: %f\n" %
                  (used_time, epoch_id, np.mean(np.array(epoch_loss))))
        else:
            print('\n----------Begin test...')
            test_loss = self.test(
                exe,
                dev_batch_reader=dev_batch_reader,
                test_program=test_prog,
                test_reader=test_reader,
                fetch_list=[ctc_loss])
            print(
                "--------Time: %f sec, epoch: %d, train loss: %f, "
                "test loss: %f" %
                (used_time, epoch_id + pre_epoch,
                 np.mean(np.array(epoch_loss)), test_loss / batch_size))

        if (epoch_id + 1) % save_epoch == 0:
            self.save_param(exe, train_program,
                            "epoch_" + str(epoch_id + pre_epoch))

    self.save_param(exe, train_program, "step_final")
    print("\n------------Training finished!!!-------------")
def run_main(self, place, with_data_parallel):
    self.place = place
    self.with_data_parallel = with_data_parallel

    if not core.is_compiled_with_cuda() and isinstance(self.place,
                                                       core.CUDAPlace):
        return

    if isinstance(self.place, core.CUDAPlace):
        device_cnt = core.get_cuda_device_count(
        ) if self.with_data_parallel else 1
    else:
        device_cnt = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count())
        ) if self.with_data_parallel else 1

    d0 = layers.data(
        "d0", shape=[10], append_batch_size=False, dtype='float32')
    d1 = layers.data(
        "d1", shape=[10], append_batch_size=False, dtype='float32')
    d2 = layers.data(
        "d2", shape=[10], append_batch_size=False, dtype='float32')

    i = layers.zeros(shape=[1], dtype='int64')
    i.stop_gradient = True

    init = layers.zeros(shape=[10], dtype='float32')
    mem_array = layers.array_write(x=init, i=i)
    data_array = layers.array_write(x=d0, i=i)

    i = layers.increment(i)
    layers.array_write(d1, i, array=data_array)

    i = layers.increment(i)
    layers.array_write(d2, i, array=data_array)

    i = layers.zeros(shape=[1], dtype='int64')
    i.stop_gradient = True

    array_len = layers.fill_constant(shape=[1], dtype='int64', value=1)
    array_len.stop_gradient = True
    cond = layers.less_than(x=i, y=array_len)

    j = layers.fill_constant(shape=[1], dtype='int64', value=1)
    j.stop_gradient = True

    array_len2 = layers.fill_constant(shape=[1], dtype='int64', value=3)
    array_len2.stop_gradient = True
    cond2 = layers.less_than(x=j, y=array_len2)

    while_op = layers.While(cond=cond)
    while_op2 = layers.While(cond=cond2)
    with while_op.block():
        d = layers.array_read(array=data_array, i=i)
        prev = layers.array_read(array=mem_array, i=i)
        d = layers.reshape(d, shape=[10])
        prev = layers.reshape(prev, shape=[10])
        result = layers.sums(input=[d, prev])

        i = layers.increment(x=i, in_place=True)
        layers.array_write(result, i=i, array=mem_array)
        layers.less_than(x=i, y=array_len, cond=cond)

        with while_op2.block():
            d2 = layers.array_read(array=data_array, i=j)
            prev2 = layers.array_read(array=mem_array, i=j)
            d2 = layers.reshape(d2, shape=[10])
            prev2 = layers.reshape(prev2, shape=[10])
            result2 = layers.sums(input=[d2, prev2])

            j = layers.increment(x=j, in_place=True)
            layers.array_write(result2, i=j, array=mem_array)
            layers.less_than(x=j, y=array_len2, cond=cond2)

    sum_result = layers.array_read(array=mem_array, i=j)
    sum_result.persistable = True
    tmp = layers.unsqueeze(sum_result, axes=[0])
    tmp = layers.expand(tmp, expand_times=[10, 1])
    fc = layers.fc(tmp, size=256)
    loss = layers.mean(sum_result)

    optim = fluid.optimizer.Adam(learning_rate=1e-3)
    optim.minimize(loss)

    exe = Executor(self.place)
    exe.run(fluid.default_startup_program())

    prog = fluid.default_main_program()
    if self.with_data_parallel:
        prog = compiler.CompiledProgram(
            fluid.default_main_program()).with_data_parallel(
                loss_name=loss.name)

    for _ in range(5):
        d = []
        for i in range(3):
            tmp = numpy.random.random(size=[10]).astype('float32')
            if not self.with_data_parallel:
                d.append(tmp)
            else:
                d.append(numpy.array([tmp] * device_cnt))

        outs = exe.run(program=prog,
                       feed={'d0': d[0],
                             'd1': d[1],
                             'd2': d[2]},
                       fetch_list=[sum_result])
        self.assertAlmostEqual(
            numpy.sum(d), numpy.sum(outs[0]), delta=0.01)
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler
import numpy
import os

place = fluid.CUDAPlace(0)  # fluid.CPUPlace()
exe = fluid.Executor(place)

data = fluid.layers.data(name='X', shape=[1], dtype='float32')
hidden = fluid.layers.fc(input=data, size=10)
loss = fluid.layers.mean(hidden)
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

fluid.default_startup_program().random_seed = 1
exe.run(fluid.default_startup_program())
compiled_prog = compiler.CompiledProgram(fluid.default_main_program())

x = numpy.random.random(size=(10, 1)).astype('float32')
loss_data, = exe.run(compiled_prog, feed={"X": x}, fetch_list=[loss.name])
print("loss: {}".format(loss_data[0]))
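# For comparison with the single-device script above, a minimal sketch of
# the same toy program compiled with with_data_parallel. The CPU_NUM value
# and the CUDA fallback are assumptions added here; loss_name tells the
# compiler which variable to average across device replicas.
import os
import numpy
import paddle.fluid as fluid
import paddle.fluid.compiler as compiler

os.environ['CPU_NUM'] = '2'  # give the CPU fallback two places
use_cuda = fluid.core.is_compiled_with_cuda()
place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
exe = fluid.Executor(place)

data = fluid.layers.data(name='X', shape=[1], dtype='float32')
hidden = fluid.layers.fc(input=data, size=10)
loss = fluid.layers.mean(hidden)
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

fluid.default_startup_program().random_seed = 1
exe.run(fluid.default_startup_program())

# replicate the program across devices; the feed batch is split among them
compiled_prog = compiler.CompiledProgram(
    fluid.default_main_program()).with_data_parallel(loss_name=loss.name)

x = numpy.random.random(size=(10, 1)).astype('float32')
loss_data, = exe.run(compiled_prog, feed={"X": x}, fetch_list=[loss.name])
print("loss: {}".format(loss_data[0]))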
def check_network_convergence(self, use_cuda=True, use_mem_opt=False,
                              iter_num=5):
    prog = Program()
    startup_prog = Program()
    prog.random_seed = 100
    startup_prog.random_seed = 100
    with program_guard(prog, startup_prog):
        image = layers.data(name='x', shape=[784], dtype='float32')
        label = layers.data(name='y', shape=[1], dtype='int64')
        limit = layers.fill_constant(shape=[1], dtype='int64', value=5)
        cond = layers.less_than(x=label, y=limit)
        ie = layers.IfElse(cond)

        with ie.true_block():
            true_image = ie.input(image)
            hidden = layers.fc(input=true_image, size=100, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            ie.output(prob)

        with ie.false_block():
            false_image = ie.input(image)
            hidden = layers.fc(input=false_image, size=200, act='tanh')
            prob = layers.fc(input=hidden, size=10, act='softmax')
            ie.output(prob)

        prob = ie()
        loss = layers.cross_entropy(input=prob[0], label=label)
        avg_loss = layers.mean(loss)

        optimizer = MomentumOptimizer(learning_rate=0.001, momentum=0.9)
        optimizer.minimize(avg_loss, startup_prog)

        train_reader = paddle.batch(
            paddle.dataset.mnist.train(), batch_size=200)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = Executor(place)

        exec_strategy = fluid.ExecutionStrategy()
        exec_strategy.use_cuda = use_cuda

        build_strategy = fluid.BuildStrategy()
        build_strategy.memory_optimize = use_mem_opt

        train_cp = compiler.CompiledProgram(fluid.default_main_program())
        train_cp = train_cp.with_data_parallel(
            loss_name=avg_loss.name,
            exec_strategy=exec_strategy,
            build_strategy=build_strategy)
        fetch_list = [avg_loss.name]

        exe.run(startup_prog)
        PASS_NUM = 100
        loop = 0
        ret = []
        for pass_id in range(PASS_NUM):
            for data in train_reader():
                x_data = np.array([x[0] for x in data]).astype("float32")
                y_data = np.array([x[1] for x in data]).astype("int64")
                y_data = y_data.reshape((y_data.shape[0], 1))

                outs = exe.run(train_cp,
                               feed={'x': x_data,
                                     'y': y_data},
                               fetch_list=fetch_list)
                loop += 1
                ret.append(outs[0])
                if iter_num == loop:
                    return ret
        return ret
def check_network_convergence(self,
                              network,
                              use_cuda=True,
                              memory_opt=True,
                              use_ir_memory_optimize=True,
                              enable_inplace=True,
                              iter=5):
    if use_cuda and not core.is_compiled_with_cuda():
        print('Skip use_cuda=True because Paddle is not compiled with cuda')
        return

    if os.name == 'nt':
        print(
            'Skip use_parallel_executor=True because Paddle comes without '
            'parallel support on windows')
        return

    fluid.default_startup_program().random_seed = 100
    fluid.default_main_program().random_seed = 100

    data = fluid.layers.data(
        name="words", shape=[1], dtype="int64", lod_level=1)
    label = fluid.layers.data(name="label", shape=[1], dtype="int64")

    cost = network(data, label, len(self.word_dict))
    optimizer = fluid.optimizer.Adam(learning_rate=0.001)
    optimizer.minimize(cost)

    build_strategy = fluid.BuildStrategy()
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False
    if memory_opt:
        fluid.memory_optimize(fluid.default_main_program())
    else:
        build_strategy.enable_inplace = enable_inplace
        build_strategy.memory_optimize = use_ir_memory_optimize

    # execution
    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    feeder = fluid.DataFeeder(feed_list=[data, label], place=place)
    reader = feeder.decorate_reader(self.train_reader, multi_devices=True)
    exe = fluid.Executor(place)
    exe.run(fluid.default_startup_program())

    train_cp = compiler.CompiledProgram(fluid.default_main_program())
    train_cp = train_cp.with_data_parallel(
        loss_name=cost.name, build_strategy=build_strategy)
    fetch_list = [cost.name]

    begin = time.time()
    first_loss, last_loss = None, None
    step_id = 0
    custom_iter = getattr(self, "iter", None)
    if custom_iter is not None:
        iter = custom_iter
    for data in reader():
        ret = exe.run(train_cp, feed=data, fetch_list=fetch_list)
        print(ret)
        step_id += 1
        if step_id == 1:
            first_loss = ret[0]
        if step_id == iter:
            last_loss = ret[0]
            break
    end = time.time()

    print("%.4f Instance per second" % ((self.batch_size * iter) /
                                        (end - begin)))

    print(first_loss, last_loss)

    avg_last_loss_val = np.array(last_loss).mean()
    avg_first_loss_val = np.array(first_loss).mean()
    if math.isnan(float(avg_last_loss_val)) or math.isnan(
            float(avg_first_loss_val)):
        sys.exit("got NaN loss, training failed.")

    return first_loss, last_loss
def check_network_convergence(self,
                              is_sparse,
                              build_strategy=None,
                              use_cuda=True):
    os.environ['CPU_NUM'] = str(4)
    main = fluid.Program()
    startup = fluid.Program()
    with fluid.program_guard(main, startup):
        word = fluid.layers.data(
            name='word_data', shape=[1], dtype='int64', lod_level=1)
        predicate = fluid.layers.data(
            name='verb_data', shape=[1], dtype='int64', lod_level=1)
        ctx_n2 = fluid.layers.data(
            name='ctx_n2_data', shape=[1], dtype='int64', lod_level=1)
        ctx_n1 = fluid.layers.data(
            name='ctx_n1_data', shape=[1], dtype='int64', lod_level=1)
        ctx_0 = fluid.layers.data(
            name='ctx_0_data', shape=[1], dtype='int64', lod_level=1)
        ctx_p1 = fluid.layers.data(
            name='ctx_p1_data', shape=[1], dtype='int64', lod_level=1)
        ctx_p2 = fluid.layers.data(
            name='ctx_p2_data', shape=[1], dtype='int64', lod_level=1)
        mark = fluid.layers.data(
            name='mark_data', shape=[1], dtype='int64', lod_level=1)

        feature_out = db_lstm(**locals())
        target = fluid.layers.data(
            name='target', shape=[1], dtype='int64', lod_level=1)
        crf_cost = fluid.layers.linear_chain_crf(
            input=feature_out,
            label=target,
            param_attr=fluid.ParamAttr(name='crfw', learning_rate=1e-1))
        avg_cost = fluid.layers.mean(crf_cost)

        sgd_optimizer = fluid.optimizer.SGD(
            learning_rate=fluid.layers.exponential_decay(
                learning_rate=0.01,
                decay_steps=100000,
                decay_rate=0.5,
                staircase=True))
        sgd_optimizer.minimize(avg_cost)

        train_data = paddle.batch(
            paddle.reader.shuffle(paddle.dataset.conll05.test(),
                                  buf_size=8192),
            batch_size=16)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
        exe = fluid.Executor(place)
        exe.run(startup)

        train_cp = compiler.CompiledProgram(main).with_data_parallel(
            loss_name=avg_cost.name, build_strategy=build_strategy)

        feeder = fluid.DataFeeder(
            feed_list=[
                word, ctx_n2, ctx_n1, ctx_0, ctx_p1, ctx_p2, predicate,
                mark, target
            ],
            place=fluid.CPUPlace())

        data = train_data()
        for i in range(10):
            cur_batch = next(data)
            print(
                exe.run(train_cp,
                        feed=feeder.feed(cur_batch),
                        fetch_list=[avg_cost.name])[0])
def minimize(self,
             loss,
             startup_program=None,
             parameter_list=None,
             no_grad_set=None):
    """
    Add distributed operations to minimize ``loss`` by updating
    ``parameter_list``.

    Args:
        loss (Tensor): A ``Tensor`` containing the value to minimize.
        startup_program (Program, optional): :ref:`api_fluid_Program` for
            initializing parameters in ``parameter_list``. The default value
            is None, at this time :ref:`api_fluid_default_startup_program`
            will be used.
        parameter_list (Iterable, optional): Iterable of ``Tensor`` or
            ``Tensor.name`` to update to minimize ``loss``. The default
            value is None, at this time all parameters will be updated.
        no_grad_set (set, optional): Set of ``Tensor`` or ``Tensor.name``
            that don't need to be updated. The default value is None.

    Returns:
        tuple: tuple (optimize_ops, params_grads), A list of operators
        appended by minimize and a list of (param, grad) tensor pairs, param
        is ``Parameter``, grad is the gradient value corresponding to the
        parameter. The returned tuple can be passed to ``fetch_list`` in
        ``Executor.run()`` to indicate program pruning. If so, the program
        will be pruned by ``feed`` and ``fetch_list`` before run, see
        details in ``Executor``.

    Examples:
        .. code-block:: python

            import paddle
            paddle.enable_static()
            import paddle.distributed.fleet as fleet
            import paddle.nn.functional as F

            hid_dim = 10
            label_dim = 2
            input_x = paddle.static.data(
                name='x', shape=[None, 13], dtype='float32')
            input_y = paddle.static.data(
                name='y', shape=[None, 1], dtype='int64')
            fc_1 = paddle.static.nn.fc(
                x=input_x, size=hid_dim, activation='tanh')
            fc_2 = paddle.static.nn.fc(
                x=fc_1, size=hid_dim, activation='tanh')
            prediction = paddle.static.nn.fc(
                x=[fc_2], size=label_dim, activation='softmax')
            cost = F.cross_entropy(input=prediction, label=input_y)
            avg_cost = paddle.mean(x=cost)

            fleet.init(is_collective=True)
            strategy = fleet.DistributedStrategy()
            optimizer = paddle.optimizer.SGD(learning_rate=0.001)
            optimizer = fleet.distributed_optimizer(
                optimizer, strategy=strategy)
            optimizer.minimize(avg_cost)

            # for more examples, please reference
            # https://github.com/PaddlePaddle/FleetX
    """
    context = {}
    context["user_defined_strategy"] = copy.deepcopy(
        self._user_defined_strategy)
    if paddle.fluid.framework.in_dygraph_mode():
        # imitate target optimizer retrieval
        target_opt = self.user_defined_optimizer
        self._context = context
        return target_opt.minimize(loss)

    # cache original feed forward program
    self.origin_main_program = loss.block.program
    context["origin_main_program"] = self.origin_main_program
    context["loss"] = loss
    if startup_program is None:
        self.origin_startup_program = \
            paddle.static.default_startup_program().clone(for_test=False)
        startup_program = paddle.static.default_startup_program()
    else:
        self.origin_startup_program = \
            startup_program.clone(for_test=False)

    context["origin_startup_program"] = startup_program
    context["role_maker"] = self._role_maker

    # compile time
    distributed_optimizer_list = \
        MetaOptimizerFactory()._get_valid_meta_optimizers(
            self.user_defined_optimizer)

    context["user_defined_strategy"] = copy.deepcopy(
        self._user_defined_strategy)
    copy_user_defined_strategy = copy.deepcopy(self._user_defined_strategy)

    # trigger the auto-parallel in very strict condition
    # strategy = DistributedStrategy()
    # strategy.auto = True
    # optimizer = paddle.optimizer.SGD(learning_rate=0.1)
    # optimizer = fleet.distributed_optimizer(optimizer, strategy)
    if copy_user_defined_strategy._is_strict_auto():
        # turn on all the strategy for each optimizer
        for opt in distributed_optimizer_list:
            opt._enable_strategy(copy_user_defined_strategy, context)

    valid_optimizer_list = []
    valid_graph_optimizer_list = []
    can_not_apply_optimizer_list = []
    # recall meta optimizers for ranking
    for opt in distributed_optimizer_list:
        opt._set_basic_info(loss, self._role_maker,
                            self.user_defined_optimizer,
                            copy_user_defined_strategy)
        if opt._can_apply() and not opt._is_graph_out():
            valid_optimizer_list.append(opt)
        elif opt._can_apply() and opt._is_graph_out():
            valid_graph_optimizer_list.append(opt)
        else:
            can_not_apply_optimizer_list.append(opt)

    # combine recalled meta optimizers to be a valid meta optimizer
    meta_optimizer, graph_optimizer = \
        self.strategy_compiler.generate_optimizer(
            loss, self._role_maker, self.user_defined_optimizer,
            copy_user_defined_strategy, valid_optimizer_list,
            valid_graph_optimizer_list)

    valid_strategy = self.strategy_compiler._get_valid_strategy(
        copy_user_defined_strategy, can_not_apply_optimizer_list)

    context["valid_strategy"] = copy.deepcopy(valid_strategy)

    applied_meta_list = self.strategy_compiler._get_applied_meta_list()
    applied_graph_list = self.strategy_compiler._get_applied_graph_list()

    context['applied_meta_list'] = applied_meta_list
    context['applied_graph_list'] = applied_graph_list

    self._context = context

    self.valid_strategy = valid_strategy
    self.valid_strategy._enable_env()

    optimize_ops = []
    params_grads = []

    if self._role_maker._is_non_distributed() and not self._is_collective:
        if self._runtime_handle is None:
            self._runtime_handle = RuntimeFactory()._create_runtime(context)

        compiled_program = compiler.CompiledProgram(
            self.origin_main_program).with_data_parallel(
                loss_name=loss.name, share_vars_from=None)
        loss.block.program._graph = compiled_program
        return self.user_defined_optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set=no_grad_set)

    if meta_optimizer:
        optimize_ops, params_grads = meta_optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set=no_grad_set)

        default_program = paddle.static.default_main_program()

        if id(default_program) != id(loss.block.program):
            paddle.fluid.framework.switch_main_program(loss.block.program)

    else:
        optimize_ops, params_grads = self.user_defined_optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set=no_grad_set)

    context["program_optimize_ops"] = optimize_ops
    context["program_params_grads"] = params_grads

    if graph_optimizer:
        optimize_ops, params_grads = graph_optimizer.minimize(
            loss, startup_program, parameter_list, no_grad_set=no_grad_set)
        # since we do not encourage users to use graph operations,
        # if a graph optimizer takes effect, mostly optimize_ops and
        # params_grads are None, i.e. users can not modify the current
        # computation graph anymore
        context["graph_optimize_ops"] = optimize_ops
        context["graph_optimize_grads"] = params_grads

    if self._runtime_handle is None:
        self._runtime_handle = RuntimeFactory()._create_runtime(context)

    import paddle.distributed.fleet as fleet
    fleet.util._set_strategy(context["valid_strategy"])

    return optimize_ops, params_grads
def _compare(self, place, layout, only_forward):
    """Compare results."""
    seed = 10
    os.environ['FLAGS_cudnn_deterministic'] = "1"
    scope = core.Scope()
    data = np.random.random(size=self.dshape).astype(self.dtype) * 4. - 2
    data = create_or_get_tensor(scope, "input",
                                OpTest.np_dtype_to_fluid_dtype(data), place)

    # Single-GPU, N = 32 per GPU
    main, startup, outs = self._build_program(place, layout, seed, False,
                                              only_forward)
    exe = fluid.Executor(place)
    exe.run(startup)
    fetch_names = [v.name for v in outs] + [
        'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
    ]
    if not only_forward:
        others = [
            'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
            'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
        ]
        fetch_names += others
    bn_fetches = exe.run(program=main,
                         feed={'input': data},
                         fetch_list=fetch_names)

    #####################################################################
    # Multi-GPUs, self.N / core.get_cuda_device_count() per GPU
    assert core.get_cuda_device_count() > 1
    main, startup, outs = self._build_program(place, layout, seed, True,
                                              only_forward)
    exe = fluid.Executor(place)
    exe.run(startup)
    fetch_names = [v.name for v in outs] + [
        'bn_moving_mean', 'bn_moving_variance', 'bn_scale', 'bn_bias'
    ]
    if not only_forward:
        others = [
            'batch_norm_0.tmp_0', 'batch_norm_0.tmp_1', 'bn_scale@GRAD',
            'bn_bias@GRAD', 'batch_norm_0.tmp_2@GRAD', 'conv2d_0.tmp_0@GRAD'
        ]
        fetch_names += others
    for nm in fetch_names:
        fv = fluid.framework._get_var(str(nm), program=main)
        fv.persistable = True

    build_strategy = fluid.BuildStrategy()
    build_strategy.sync_batch_norm = True
    build_strategy.enable_inplace = False
    build_strategy.memory_optimize = False
    comp_prog = compiler.CompiledProgram(main).with_data_parallel(
        outs[0].name if not only_forward else None,
        build_strategy=build_strategy)
    sync_bn_fetches = exe.run(program=comp_prog,
                              feed={'input': data},
                              fetch_list=fetch_names)

    for i in six.moves.xrange(1, len(sync_bn_fetches)):
        bn_val = bn_fetches[i]
        sync_bn_val = sync_bn_fetches[i]
        if sync_bn_val.shape != bn_val.shape:
            sync_bn_val = sync_bn_val[:bn_val.shape[0]]
        self.assertTrue(
            np.allclose(bn_val, sync_bn_val, atol=self.atol),
            "Output (" + fetch_names[i] + ") has diff. \n" + "\nBN " +
            str(bn_val) + "\n" + "Sync BN " + str(sync_bn_val))
def main(self,
         use_cuda=True,
         use_parallel_executor=False,
         use_double_buffer=False,
         use_feed_list=False,
         use_decorate_paddle_reader=False):
    assert not use_cuda or use_cuda and core.is_compiled_with_cuda()

    self.use_cuda = use_cuda
    self.use_parallel_executor = use_parallel_executor
    self.use_double_buffer = use_double_buffer
    self.use_feed_list = use_feed_list
    self.use_decorate_paddle_reader = use_decorate_paddle_reader

    startup_program = fluid.Program()
    main_program = fluid.Program()

    with fluid.program_guard(main_program, startup_program):
        in_data, label, loss, optimizer, feed_queue, py_reader = \
            simple_fc_net(
                in_size=self.in_size,
                class_num=self.class_num,
                hidden_sizes=self.hidden_sizes,
                batch_size=self.batch_size,
                queue_capacity=self.queue_capacity,
                use_double_buffer=self.use_double_buffer,
                use_feed_list=self.use_feed_list)

        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()

        exe = fluid.Executor(place)
        exe.run(startup_program)

        train_cp = main_program
        if use_parallel_executor:
            train_cp = compiler.CompiledProgram(
                main_program).with_data_parallel(loss_name=loss.name)
            if use_cuda:
                self.batch_size_times = core.get_cuda_device_count()
            else:
                self.batch_size_times = int(
                    os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
        else:
            self.batch_size_times = 1

        reader = self.tensor_reader(use_decorate_paddle_reader)
        batch_reader = paddle.batch(reader, batch_size=self.batch_size)

        self.inputs = []
        self.outputs = []

        if use_decorate_paddle_reader:
            if use_feed_list:
                py_reader.decorate_paddle_reader(batch_reader)
            else:
                py_reader.decorate_sample_list_generator(batch_reader)
            py_reader.start()
        else:
            thread = threading.Thread(
                target=feed_data, args=(feed_queue, batch_reader))
            thread.daemon = True
            thread.start()

        try:
            while True:
                fetches = exe.run(train_cp,
                                  fetch_list=[in_data.name, label.name])
                fetches = [as_numpy(fetch) for fetch in fetches]
                self.outputs.append(fetches)
        except fluid.core.EOFException:
            pass

        feed_queue.close()
        self.validate()
        if use_decorate_paddle_reader:
            py_reader.exited = True
            py_reader.thread.join()
        else:
            thread.join()
def check_network_convergence(cls,
                              method,
                              use_cuda=True,
                              iter=5,
                              batch_size=None,
                              feed_dict=None,
                              feed_data_reader=None,
                              get_data_from_feeder=None,
                              use_parallel_executor=True,
                              use_reduce=False,
                              use_ir_memory_optimize=True,
                              enable_inplace=True,
                              fuse_elewise_add_act_ops=False,
                              fuse_all_optimizer_ops=False,
                              fuse_all_reduce_ops=False,
                              fuse_relu_depthwise_conv=False,
                              optimizer=fluid.optimizer.Adam,
                              use_fast_executor=False,
                              enable_sequential_execution=False):
    def run_executor(exe, binary, feed, fetch_list):
        if feed_data_reader is None:
            res = exe.run(binary, feed=feed, fetch_list=fetch_list)
        else:
            res = exe.run(binary,
                          feed=feed_data_reader.get_next(exe, binary),
                          fetch_list=fetch_list)
        return res

    if feed_data_reader is not None:
        assert isinstance(
            feed_data_reader,
            FeedDataReader), "feed_data_reader must be type of FeedDataReader"

    paddle.manual_seed(1)
    paddle.framework.random._manual_program_seed(1)
    main = fluid.Program()
    startup = fluid.Program()

    with fluid.program_guard(main, startup):
        feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
                                          main, method, optimizer)

    place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
    exe = fluid.Executor(place)
    exe.run(startup)

    build_strategy, exec_strategy = cls.set_strategy(
        enable_inplace, enable_sequential_execution, fuse_all_optimizer_ops,
        fuse_all_reduce_ops, fuse_elewise_add_act_ops,
        fuse_relu_depthwise_conv, use_fast_executor, use_ir_memory_optimize,
        use_reduce, use_cuda)

    if use_parallel_executor:
        binary = compiler.CompiledProgram(main).with_data_parallel(
            loss_name=loss.name,
            build_strategy=build_strategy,
            exec_strategy=exec_strategy)
    else:
        binary = main

    if batch_size is not None:
        batch_size *= fluid.core.get_cuda_device_count(
        ) if use_cuda else int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))

    begin = time.time()
    first_loss, = run_executor(
        exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])

    for _ in range(iter):
        run_executor(exe=exe, binary=binary, feed=feed_dict, fetch_list=[])

    last_loss, = run_executor(
        exe=exe, binary=binary, feed=feed_dict, fetch_list=[loss.name])
    end = time.time()

    if batch_size is not None:
        print("%.4f Instance per second" % ((batch_size * iter + 2) /
                                            (end - begin)))

    avg_last_loss_val = np.array(last_loss).mean()
    avg_first_loss_val = np.array(first_loss).mean()
    if math.isnan(float(avg_last_loss_val)) or math.isnan(
            float(avg_first_loss_val)):
        sys.exit("got NaN loss, training failed.")

    print(first_loss, last_loss)
    # self.assertGreater(first_loss[0], last_loss[0])
    return first_loss, last_loss
def _try_to_compile(self, startup_program, main_program, loss):
    dist_strategy = self.user_defined_strategy
    local_build_strategy = dist_strategy.build_strategy

    local_build_strategy.use_hierarchical_allreduce = \
        dist_strategy.use_hierarchical_allreduce
    local_build_strategy.hierarchical_allreduce_inter_nranks = \
        dist_strategy.hierarchical_allreduce_inter_nranks
    local_build_strategy.sync_batch_norm = \
        dist_strategy.sync_batch_norm
    local_build_strategy.fuse_all_reduce_ops = \
        dist_strategy.fuse_all_reduce_ops
    local_build_strategy.nccl_comm_num = \
        dist_strategy.nccl_comm_num

    gradient_scale_configs = self.user_defined_strategy.gradient_scale_configs
    scale_strategys = {
        'avg': BuildStrategy.GradientScaleStrategy.CoeffNumDevice,
        'sum': BuildStrategy.GradientScaleStrategy.One,
        'customized': BuildStrategy.GradientScaleStrategy.Customized,
    }
    assert gradient_scale_configs['scale_strategy'] in scale_strategys, \
        "gradient_scale_configs.scale_strategy must be 'avg', 'sum' or 'customized'"
    local_build_strategy.gradient_scale_strategy = \
        scale_strategys[gradient_scale_configs['scale_strategy']]

    if self.user_defined_strategy.recompute == True:
        logging.warn(
            "set enable_sequential_execution=True since you have enabled "
            "the recompute strategy")
        local_build_strategy.enable_sequential_execution = True

    exe_strategy = self.user_defined_strategy.execution_strategy
    worker_num = self.role_maker._worker_num()
    node_num = self.role_maker._node_num()

    if self.role_maker._is_collective:
        assert worker_num >= 1, \
            "nccl2 worker_num must >= 1, now:{}".format(worker_num)

    if worker_num <= 1:
        # local mode
        if local_build_strategy.nccl_comm_num > 1:
            logging.warn("set nccl_comm_num=1 since you only have 1 node.")
        local_build_strategy.nccl_comm_num = 1

    if node_num <= 1:
        if local_build_strategy.use_hierarchical_allreduce:
            logging.warn(
                "set hierachical_allreduce=False since you only have 1 node."
            )
        local_build_strategy.use_hierarchical_allreduce = False

    sync_allreduce = dist_strategy.sync_nccl_allreduce
    if sync_allreduce:
        exe_strategy.num_threads = max(
            local_build_strategy.nccl_comm_num + 1,
            exe_strategy.num_threads)
        if local_build_strategy.nccl_comm_num > 1:
            logging.warn(
                "nccl_comm_num > 1, you may need to set "
                "sync_nccl_allreduce=False to ensure that different nccl "
                "comms can overlap")

    sync_batch_norm = local_build_strategy.sync_batch_norm
    if sync_batch_norm:
        local_build_strategy.nccl_comm_num = 1
        local_build_strategy.use_hierarchical_allreduce = False
        exe_strategy.num_threads = 1
        logging.warn(
            "use sync_batch_norm will hang when set num_threads > 1, so "
            "set num_threads=1, nccl_comm_num=1, hierachical_allreduce=False."
        )

    # NOTE: compatible with compiler; otherwise these values will be
    # overwritten by the compiler
    main_program._nccl_comm_num = local_build_strategy.nccl_comm_num
    main_program._use_hierarchical_allreduce = \
        local_build_strategy.use_hierarchical_allreduce
    main_program._hierarchical_allreduce_inter_nranks = \
        local_build_strategy.hierarchical_allreduce_inter_nranks

    # TODO(guru4elephant): should be an independent optimizer
    if worker_num > 1:
        self._setup_nccl_op(startup_program, main_program,
                            local_build_strategy)

    local_build_strategy.num_trainers = self.role_maker._worker_num()
    local_build_strategy.trainer_id = self.role_maker._worker_index()
    local_build_strategy.trainers_endpoints = \
        self.role_maker._get_trainer_endpoints()
    local_build_strategy.enable_backward_optimizer_op_deps = True

    self._compiled_program = compiler.CompiledProgram(main_program)

    self._compiled_program.with_data_parallel(
        loss_name=loss.name,
        build_strategy=local_build_strategy,
        exec_strategy=exe_strategy,
        share_vars_from=None)

    return self._compiled_program