def _compare_fused_optimizer_ops(self,
                                 model,
                                 use_cuda,
                                 optimizer=fluid.optimizer.Adam):
    """Assert that fusing optimizer ops does not change the reported losses.

    Runs ``self.check_network_convergence`` twice on ``model`` with the same
    feed data, once with ``fuse_all_optimizer_ops=False`` and once with
    ``fuse_all_optimizer_ops=True``, then compares the first and the last
    losses of both runs element-wise with an absolute tolerance of 1e-6.

    Returns without running anything when ``use_cuda`` is truthy and
    ``core.is_compiled_with_cuda()`` is false.

    Args:
        model: Network builder forwarded to ``check_network_convergence``.
        use_cuda: Forwarded as ``use_cuda``; also drives the skip guard.
        optimizer: Forwarded as ``optimizer``; defaults to
            ``fluid.optimizer.Adam``.
    """
    if use_cuda and not core.is_compiled_with_cuda():
        return

    img, label = init_data()
    feed_dict = {"image": img, "label": label}

    def _run(fuse_all_optimizer_ops):
        # Both runs share every setting except the fusion switch.
        return self.check_network_convergence(
            model,
            feed_dict=feed_dict,
            use_cuda=use_cuda,
            fuse_all_optimizer_ops=fuse_all_optimizer_ops,
            memory_opt=False,
            # avoid the gradient's name changed in Python side.
            optimizer=optimizer)

    not_fuse_op_first_loss, not_fuse_op_last_loss = _run(False)
    fuse_op_first_loss, fuse_op_last_loss = _run(True)

    # assertAlmostEqual replaces the deprecated alias assertAlmostEquals,
    # which was removed from unittest in Python 3.12.
    for unfused, fused in zip(not_fuse_op_first_loss, fuse_op_first_loss):
        self.assertAlmostEqual(unfused, fused, delta=1e-6)
    for unfused, fused in zip(not_fuse_op_last_loss, fuse_op_last_loss):
        self.assertAlmostEqual(unfused, fused, delta=1e-6)
def check_simple_fc_parallel_accuracy(self, use_device):
    """Assert that parallel and single execution of ``simple_fc_net`` agree.

    Runs ``self.check_network_convergence`` with
    ``use_parallel_executor=False`` and then with ``True`` on identical feed
    data. The mean of the parallel first/last loss must equal the
    corresponding single-executor loss within an absolute tolerance of 1e-6.

    Args:
        use_device: Forwarded as ``use_device``; also drives the skip guard.
    """
    # NOTE(review): this guard skips for ANY truthy `use_device`, not only a
    # CUDA one -- confirm that is intended for the device values callers pass.
    if use_device and not core.is_compiled_with_cuda():
        return

    img, label = init_data()

    def _run(use_parallel_executor):
        # A fresh feed dict is built for every run, as the two calls only
        # differ in the executor switch.
        return self.check_network_convergence(
            method=simple_fc_net,
            feed_dict={"image": img, "label": label},
            use_device=use_device,
            use_parallel_executor=use_parallel_executor)

    single_first_loss, single_last_loss = _run(False)
    parallel_first_loss, parallel_last_loss = _run(True)

    # assertAlmostEqual replaces the deprecated alias assertAlmostEquals,
    # which was removed from unittest in Python 3.12.
    self.assertAlmostEqual(
        np.mean(parallel_first_loss), single_first_loss, delta=1e-6)
    self.assertAlmostEqual(
        np.mean(parallel_last_loss), single_last_loss, delta=1e-6)
def test_backward(self):
    """Run ``self.check_backward`` on each case network with its feed data.

    The data comes from ``init_data`` with a batch size of 2, image shape
    ``[784]`` and ``label_range=9``. The first two cases are fed both image
    and label, the third only the label, and the fourth an empty feed.
    """
    img, label = init_data(2, img_shape=[784], label_range=9)
    full_feed = {'image': img, 'label': label}

    cases = (
        (case1_fill_grad_vars, full_feed),
        (case2_prune_no_grad_branch, full_feed),
        (case3_prune_no_grad_branch2, {'label': label}),
        (case4_with_no_grad_op_maker, {}),
    )
    for network, feed in cases:
        self.check_backward(network, feed)
def setUpClass(cls):
    """Prepare the shared paths, CPU executor and batch data on the class.

    Sets the save directory and the model/parameter file names, creates a
    ``fluid.Executor`` on ``fluid.CPUPlace()``, and stores the result of
    ``init_data()`` as a list of ``[img, label]`` pairs in ``cls.batch_data``.
    """
    cls.save_dirname = "./"
    cls.model_filename = "test_parallel_executor_run_load_infer_program_model"
    cls.params_filename = "test_parallel_executor_run_load_infer_program_parameter"

    cls.place = fluid.CPUPlace()
    cls.exe = fluid.Executor(cls.place)

    images, labels = init_data()
    # One [image, label] entry per zipped pair.
    cls.batch_data = [[image, label] for image, label in zip(images, labels)]
def setUpClass(cls):
    """Set ``CPU_NUM`` and build the feed data shared by the tests.

    Writes ``"4"`` to the ``CPU_NUM`` environment variable, then stores the
    image and label from ``init_data`` (batch size 4, image shape ``[784]``,
    ``label_range=9``) on the class, together with a feed dict that also
    carries a float32 ``learning_rate`` array holding 1.0.
    """
    # presumably read by the executor as the number of CPU devices -- confirm
    os.environ['CPU_NUM'] = "4"

    images, labels = init_data(4, img_shape=[784], label_range=9)
    cls.img = images
    cls.label = labels

    learning_rate = numpy.array([1.0]).astype("float32")
    cls.feed_dict = {
        'image': images,
        'label': labels,
        'learning_rate': learning_rate,
    }
def check_simple_fc_convergence(self, use_device, use_reduce=False):
    """Run the convergence check for ``simple_fc_net`` on ``use_device``.

    Returns without running anything when ``use_device`` equals
    ``DeviceType.CUDA`` and ``core.is_compiled_with_cuda()`` is false.

    Args:
        use_device: Forwarded to ``check_network_convergence``.
        use_reduce: Forwarded to ``check_network_convergence``.
    """
    cuda_requested = use_device == DeviceType.CUDA
    if cuda_requested and not core.is_compiled_with_cuda():
        return

    image_batch, label_batch = init_data()
    feed = {"image": image_batch, "label": label_batch}
    self.check_network_convergence(
        simple_fc_net,
        feed_dict=feed,
        use_device=use_device,
        use_reduce=use_reduce)
def check_model(self, use_cuda):
    """Run the convergence check for ``conv_net`` on float16 image data.

    The image from ``init_data`` is converted to float16 and its bits are
    reinterpreted as uint16 before being fed. The check runs for 10
    iterations with ``fuse_all_reduce_ops=True`` and the ``_optimizer``
    defined outside this method.

    Args:
        use_cuda: Forwarded to ``check_network_convergence``.
    """
    image_batch, label_batch = init_data(
        batch_size=batch_size, img_shape=img_shape, label_range=9)

    # presumably the feed path expects float16 tensors as uint16 bit
    # patterns -- confirm against the executor's feed handling
    half_precision = np.float16(image_batch)
    image_bits = half_precision.view(np.uint16)

    TestParallelExecutorBase.check_network_convergence(
        conv_net,
        feed_dict={"image": image_bits, "label": label_batch},
        iter=10,
        use_cuda=use_cuda,
        fuse_all_reduce_ops=True,
        optimizer=_optimizer)
def test_batchnorm_fc(self):
    """Run ``self.check_prune_correctness`` on ``fc_with_batchnorm``.

    The optimizer factory passed along builds an SGD optimizer with a
    learning rate of 0.001 and L2 decay of 1e-4. Data creation and the
    check both happen inside ``self.program_scope_guard()``.
    """

    def build_optimizer():
        # Factory handed to the check; it builds a new SGD on every call.
        return fluid.optimizer.SGD(
            learning_rate=0.001,
            regularization=fluid.regularizer.L2Decay(1e-4))

    with self.program_scope_guard():
        image_batch, label_batch = init_data()
        feed = {"image": image_batch, "label": label_batch}
        self.check_prune_correctness(
            method=fc_with_batchnorm,
            feed_dict=feed,
            optimizer=build_optimizer)
def _compare_fuse_elewise_add_act_ops(self, model, use_cuda):
    """Assert that fusing elementwise-add/activation ops keeps losses equal.

    Runs ``self.check_network_convergence`` twice on ``model`` with the same
    data, once with ``fuse_elewise_add_act_ops=False`` and once with
    ``True``, then compares the first and the last losses of both runs
    element-wise with an absolute tolerance of 1e-6.

    Returns without running anything when ``use_cuda`` is truthy and
    ``core.is_compiled_with_cuda()`` is false.

    Args:
        model: Network builder forwarded to ``check_network_convergence``.
        use_cuda: Forwarded as ``use_cuda``; also drives the skip guard.
    """
    if use_cuda and not core.is_compiled_with_cuda():
        return

    img, label = init_data()

    def _optimizer(learning_rate=1e-6):
        return fluid.optimizer.SGD(
            learning_rate=learning_rate,
            regularization=fluid.regularizer.L2Decay(1e-6))

    def _run(fuse_elewise_add_act_ops):
        # NOTE(dzh):
        # need to make it compatible with elewise fuse act
        # FIXME (liuwei12)
        # the new memory optimize strategy will crash this unittest
        # add enable_inplace=False here to force pass the unittest
        return self.check_network_convergence(
            model,
            feed_dict={"image": img, "label": label},
            use_cuda=use_cuda,
            fuse_elewise_add_act_ops=fuse_elewise_add_act_ops,
            memory_opt=False,
            use_ir_memory_optimize=False,
            enable_inplace=False,
            optimizer=_optimizer)

    not_fuse_op_first_loss, not_fuse_op_last_loss = _run(False)
    fuse_op_first_loss, fuse_op_last_loss = _run(True)

    # assertAlmostEqual replaces the deprecated alias assertAlmostEquals,
    # which was removed from unittest in Python 3.12.
    for unfused, fused in zip(not_fuse_op_first_loss, fuse_op_first_loss):
        self.assertAlmostEqual(unfused, fused, delta=1e-6)
    for unfused, fused in zip(not_fuse_op_last_loss, fuse_op_last_loss):
        self.assertAlmostEqual(unfused, fused, delta=1e-6)
def test_trainable(self):
    """Run ``self.check_trainable`` twice with different expected op counts.

    The first call passes no optimizer and expects one 'adam' op; the second
    passes ``fluid.optimizer.Adamax(learning_rate=0.2)`` and expects one
    'adamax' op and one 'scale' op. Both expect zero 'mul_grad' ops.

    The ``test_trainable`` passed to ``check_trainable`` is the name resolved
    from the enclosing scope, not this method.
    """
    img, label = init_data(2, img_shape=[784], label_range=9)
    feed = {'image': img, 'label': label}

    # Note that, because the Weight of FC is not trainable and the x is stop_gradient,
    # so the 'mul_grad' should not be appended.
    expected_default = {'adam': 1, 'scale': 0, 'mul_grad': 0}
    self.check_trainable(test_trainable, feed, op_count=expected_default)

    expected_adamax = {'adamax': 1, 'scale': 1, 'mul_grad': 0}
    adamax = fluid.optimizer.Adamax(learning_rate=0.2)
    self.check_trainable(
        test_trainable, feed, op_count=expected_adamax, optimizer=adamax)
def check_backward(self, use_cuda):
    """Build ``simple_fc_net`` with a Print op on the loss, then run it once.

    After ``Adam().minimize`` the main program's first block must contain
    exactly two 'print' ops. The startup program is run, then the main
    program is compiled with data parallelism and executed with one batch
    from ``init_data()``.

    Args:
        use_cuda: Run on ``paddle.CUDAPlace(0)`` when truthy, otherwise on
            ``paddle.CPUPlace()``.
    """
    main_program = paddle.static.Program()
    startup_program = paddle.static.Program()
    with program_guard(main_program, startup_program):
        loss = paddle.static.Print(simple_fc_net())
        paddle.optimizer.Adam().minimize(loss)

    # presumably one print op for the forward pass and one added by the
    # backward pass -- confirm against paddle.static.Print
    print_op_count = len(
        [op for op in main_program.blocks[0].ops if op.type == 'print'])
    assert print_op_count == 2, "The number of print op should be 2"

    if use_cuda:
        place = paddle.CUDAPlace(0)
    else:
        place = paddle.CPUPlace()
    executor = paddle.static.Executor(place)
    executor.run(startup_program)

    compiled = paddle.static.CompiledProgram(main_program)
    parallel_program = compiled.with_data_parallel(loss_name=loss.name)

    image_batch, label_batch = init_data()
    executor.run(parallel_program,
                 {"image": image_batch, "label": label_batch})
# Closing statement of a function whose header lies above this excerpt; its
# indentation must follow that function's body.
return optimizer


# `model` is bound to SE_ResNeXt50Small, which is defined outside this excerpt.
model = SE_ResNeXt50Small


def batch_size():
    """Return the batch size (12) used for both the GPU and the CPU data."""
    return 12


def iter(use_cuda):
    """Return the iteration count: 10 if `use_cuda` is truthy, otherwise 2."""
    if use_cuda:
        return 10
    return 2


# Two separate init_data calls with identical arguments; one result backs the
# GPU feed dict and the other the CPU feed dict.
gpu_img, gpu_label = init_data(batch_size=batch_size(), img_shape=img_shape, label_range=999)
cpu_img, cpu_label = init_data(batch_size=batch_size(), img_shape=img_shape, label_range=999)
feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
feed_dict_cpu = {"image": cpu_img, "label": cpu_label}


def feed_dict(use_cuda):
    """Return `feed_dict_gpu` if `use_cuda` is truthy, else `feed_dict_cpu`."""
    if use_cuda:
        return feed_dict_gpu
    return feed_dict_cpu
def _get_feed_dict(self):
    """Return a feed dict with the ``init_data()`` image and label.

    Returns:
        A dict with the keys "image" and "label".
    """
    image_batch, label_batch = init_data()
    return dict(image=image_batch, label=label_batch)
def batch_size(use_device):
    """Return the batch size: 4 for ``DeviceType.CUDA``, otherwise 12."""
    # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
    return 4 if use_device == DeviceType.CUDA else 12


def iter(use_device):
    """Return the iteration count: 10 for ``DeviceType.CUDA``, otherwise 1."""
    return 10 if use_device == DeviceType.CUDA else 1


# Data is created once per device kind, GPU first, each with that device's
# batch size.
gpu_img, gpu_label = init_data(
    batch_size=batch_size(use_device=DeviceType.CUDA),
    img_shape=img_shape,
    label_range=999)
cpu_img, cpu_label = init_data(
    batch_size=batch_size(use_device=DeviceType.CPU),
    img_shape=img_shape,
    label_range=999)
feed_dict_gpu = dict(image=gpu_img, label=gpu_label)
feed_dict_cpu = dict(image=cpu_img, label=cpu_label)


def feed_dict(use_device):
    """Return the GPU feed dict for ``DeviceType.CUDA``, else the CPU one."""
    return feed_dict_gpu if use_device == DeviceType.CUDA else feed_dict_cpu
# `model` is bound to SE_ResNeXt50Small, which is defined outside this excerpt.
model = SE_ResNeXt50Small


def batch_size(use_cuda):
    """Return the batch size: 8 if ``use_cuda`` is truthy, otherwise 12."""
    # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
    return 8 if use_cuda else 12


def iter(use_cuda):
    """Return the iteration count: 10 if ``use_cuda`` is truthy, else 1."""
    return 10 if use_cuda else 1


# Data is created once per device kind, GPU first, each with that device's
# batch size.
gpu_img, gpu_label = init_data(
    batch_size=batch_size(use_cuda=True),
    img_shape=img_shape,
    label_range=999)
cpu_img, cpu_label = init_data(
    batch_size=batch_size(use_cuda=False),
    img_shape=img_shape,
    label_range=999)
feed_dict_gpu = dict(image=gpu_img, label=gpu_label)
feed_dict_cpu = dict(image=cpu_img, label=cpu_label)


def feed_dict(use_cuda):
    """Return the GPU feed dict if ``use_cuda`` is truthy, else the CPU one."""
    return feed_dict_gpu if use_cuda else feed_dict_cpu