def train(dot_save_dir, prefix, seed=1234):
    """Run 100 data-parallel training steps and return the fetched losses.

    Args:
        dot_save_dir: Directory joined with ``prefix`` to form the build
            strategy's ``debug_graphviz_path``.
        prefix: Last path component of ``debug_graphviz_path``.
        seed: Seed applied to both NumPy and Paddle before building.

    Returns:
        A list with one entry per step, taken as ``[0][0][0]`` of the
        un-merged fetch result.
    """
    np.random.seed(seed)
    paddle.seed(seed)
    use_cuda = paddle.is_compiled_with_cuda()
    if use_cuda:
        # Ask cuDNN for deterministic kernels on CUDA builds.
        paddle.set_flags({'FLAGS_cudnn_deterministic': 1})

    startup_program = paddle.static.Program()
    main_program = paddle.static.Program()
    img, label, loss = build_program(main_program, startup_program)

    if use_cuda:
        place = paddle.CUDAPlace(0)
    else:
        place = paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    exe.run(startup_program)

    build_strategy = paddle.static.BuildStrategy()
    build_strategy.debug_graphviz_path = os.path.join(dot_save_dir, prefix)
    compiled_program = paddle.static.CompiledProgram(
        main_program, build_strategy).with_data_parallel(loss_name=loss.name)

    iters = 100
    feed = rand_data(img.name, label.name, iters)
    return [
        exe.run(compiled_program,
                feed=feed[step],
                fetch_list=[loss],
                return_merged=False)[0][0][0]
        for step in range(iters)
    ]
def setUp(self):
    """Collect the custom ops, dtypes and devices the tests iterate over."""
    self.custom_ops = [
        custom_module.custom_relu,
        custom_module.custom_relu_dup,
    ]
    self.dtypes = ['float32', 'float64']
    self.devices = ['cpu']
    # float16 and the gpu device are only added on CUDA builds.
    if paddle.is_compiled_with_cuda():
        self.dtypes.append('float16')
        self.devices.append('gpu')
def test_large_data(self):
    """Compare dygraph and static-graph ``paddle.gather`` on a large input.

    Skipped (returns immediately) when Paddle is not compiled with CUDA.
    """
    if not paddle.is_compiled_with_cuda():
        return

    x = np.random.rand(226862, 256).astype("float32")
    index = np.random.randint(0, 22682, size=(11859027))

    def run_dygraph():
        # Eager execution path.
        with fluid.dygraph.guard():
            gathered = paddle.gather(paddle.to_tensor(x),
                                     paddle.to_tensor(index))
            return gathered.numpy()

    @switch_to_static_graph
    def run_static_graph():
        # Static-graph path, run explicitly on CUDAPlace(0).
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        with paddle.static.program_guard(main_prog, startup_prog):
            x_var = paddle.static.data(name="x",
                                       dtype=x.dtype,
                                       shape=x.shape)
            index_var = paddle.static.data(name="index",
                                           dtype=index.dtype,
                                           shape=index.shape)
            out_var = paddle.gather(x_var, index_var)
            executor = paddle.static.Executor(paddle.CUDAPlace(0))
            results = executor.run(
                feed={x_var.name: x, index_var.name: index},
                fetch_list=[out_var])
            return results[0]

    self.assertTrue(np.array_equal(run_dygraph(), run_static_graph()))
def test_set_current_stream_raise_error(self):
    """``_set_current_stream`` must raise TypeError for non-stream inputs."""
    if not paddle.is_compiled_with_cuda():
        return
    set_stream = paddle.device.cuda._set_current_stream
    for bad_stream in (np.zeros(5), None):
        self.assertRaises(TypeError, set_stream, bad_stream)
def test_sparse_coo_tensor_sorted(self):
    """Check the indices/values reported by a COO tensor built from
    unsorted, duplicated indices.

    The expected values below treat the two entries at (0, 1) as merged
    (2.0 + 3.0 = 5.0) and the indices as sorted.
    """
    with _test_eager_guard():
        for device in devices:
            # Only 'cpu', and 'gpu' on CUDA builds, are exercised.
            runnable = device == 'cpu' or (
                device == 'gpu' and paddle.is_compiled_with_cuda())
            if not runnable:
                continue
            paddle.device.set_device(device)

            expected_indices = [[0, 1], [1, 0]]

            # Scalar non-zero values with unsorted and duplicate indices.
            indices = paddle.to_tensor([[1, 0, 0], [0, 1, 1]], dtype='int32')
            values = paddle.to_tensor([1.0, 2.0, 3.0], dtype='float32')
            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                indices, values)
            expected_values = [5.0, 1.0]
            assert np.array_equal(expected_indices,
                                  sparse_x.indices().numpy())
            assert np.array_equal(expected_values,
                                  sparse_x.values().numpy())

            # Each non-zero value is itself a vector.
            values = paddle.to_tensor([[1.0, 1.0], [2.0, 2.0], [3.0, 3.0]],
                                      dtype='float32')
            sparse_x = paddle.incubate.sparse.sparse_coo_tensor(
                indices, values)
            expected_values = [[5.0, 5.0], [1.0, 1.0]]
            assert np.array_equal(expected_indices,
                                  sparse_x.indices().numpy())
            assert np.array_equal(expected_values,
                                  sparse_x.values().numpy())
def run_test(clip_after_allreduce=True,
             max_global_norm=-1.0,
             gradient_merge_steps=1):
    """Launch the test file through ``paddle.distributed.launch`` and check
    that it exits with status 0 and creates the success touch file.

    The three arguments are passed to the child process through the
    ``CLIP_AFTER_ALLREDUCE``, ``MAX_GLOBAL_NORM`` and
    ``GRADIENT_MERGE_STEPS`` environment variables. Does nothing on
    non-CUDA builds or on Windows (``os.name == 'nt'``).
    """
    if not paddle.is_compiled_with_cuda() or os.name == 'nt':
        return

    # Snapshot the arguments first so the failure message shows only them.
    args = locals()

    pid = os.getpid()
    log_dir = 'log_{}'.format(pid)
    launch_argv = [
        sys.executable,
        '-u',
        '-m',
        'paddle.distributed.launch',
        '--log_dir',
        log_dir,
        get_test_file(),
    ]
    cmd = ' '.join(shlex.quote(part) for part in launch_argv)

    touch_file_env = 'SUCCESS_TOUCH_FILE'
    touch_file_name = 'distributed_fused_lamb_touch_file_{}'.format(pid)
    os.environ.update({
        'CLIP_AFTER_ALLREDUCE': str(clip_after_allreduce),
        'MAX_GLOBAL_NORM': str(max_global_norm),
        'GRADIENT_MERGE_STEPS': str(gradient_merge_steps),
        touch_file_env: touch_file_name,
    })

    # Make sure a stale touch file cannot fake a success.
    remove_file_if_exists(touch_file_name)
    try:
        succeeded = os.system(cmd) == 0 and os.path.exists(touch_file_name)
        assert succeeded, 'Test failed when {}'.format(args)
    finally:
        remove_file_if_exists(touch_file_name)
        remove_file_if_exists(log_dir)
def test_fast_math(self):
    """Approximate GELU with ``FLAGS_use_fast_math`` on must match the
    flag-off result (forward and gradient) within rtol=1e-5, atol=5e-4.

    Skipped when Paddle is not compiled with CUDA.
    """
    if not paddle.is_compiled_with_cuda():
        return

    shape = [11, 17, 8]
    x_np = np.random.uniform(-1, 1, size=shape).astype(np.float16)
    y_g_np = np.random.uniform(-1, 1, size=shape).astype(np.float16)

    def gelu_forward_backward(fast_math, approximate):
        # Toggle the flag, then run forward and backward in dygraph mode.
        paddle.set_flags({'FLAGS_use_fast_math': fast_math})
        with dg.guard():
            x = paddle.to_tensor(x_np)
            x.stop_gradient = False
            y = F.gelu(x, approximate=approximate)
            x_grad = paddle.grad([y], [x], [paddle.to_tensor(y_g_np)])[0]
            return y.numpy(), x_grad.numpy()

    y_fast_math, x_g_fast_math = gelu_forward_backward(True, True)
    y_ref, x_g_ref = gelu_forward_backward(False, True)

    tolerance = dict(rtol=1e-5, atol=5e-4)
    self.assertTrue(np.allclose(y_ref, y_fast_math, **tolerance))
    self.assertTrue(np.allclose(x_g_ref, x_g_fast_math, **tolerance))
def run_program(self, device_type):
    """Compare gradients from a multi-place run with per-place runs.

    Builds ``simple_fc_net`` once, compiles it for all ``places`` and for
    ``places[0]`` only (both with the ``_NoReduce`` reduce strategy), and
    checks that the gradients fetched from the multi-place run are close
    to the single-place gradients collected over the split feed,
    concatenated along axis 0 and divided by ``len(places)``.

    Args:
        device_type: ``DeviceType.CUDA`` or ``DeviceType.CPU``. The CUDA
            case returns immediately when Paddle is not compiled with CUDA.
    """
    if device_type == DeviceType.CUDA:
        if not paddle.is_compiled_with_cuda():
            return
        places = paddle.static.cuda_places()
    else:
        self.assertEqual(device_type, DeviceType.CPU)
        places = paddle.static.cpu_places(4)

    paddle.seed(10)
    with paddle.fluid.unique_name.guard():
        main = paddle.static.Program()
        startup = paddle.static.Program()
        with paddle.static.program_guard(main, startup):
            loss = simple_fc_net(use_feed=True)
            # learning_rate=0.0: presumably keeps parameters unchanged
            # across the repeated runs below -- TODO confirm.
            optimizer = paddle.optimizer.SGD(learning_rate=0.0)
            optimizer.minimize(loss)

            # Names of the gradient variables fetched from both programs.
            grads = [p.name + '@GRAD' for p in main.all_parameters()]
            no_reduce = paddle.static.BuildStrategy.ReduceStrategy._NoReduce

            # Program compiled for every place.
            build_strategy = paddle.static.BuildStrategy()
            build_strategy.reduce_strategy = no_reduce
            main_multi_place = paddle.static.CompiledProgram(
                main).with_data_parallel(loss_name=loss.name,
                                         build_strategy=build_strategy,
                                         places=places)

            # Clone of the same program compiled for the first place only.
            build_strategy = paddle.static.BuildStrategy()
            build_strategy.reduce_strategy = no_reduce
            main_single_place = paddle.static.CompiledProgram(
                main.clone()).with_data_parallel(loss_name=loss.name,
                                                 build_strategy=build_strategy,
                                                 places=places[0])

            image, label = init_data()
            feed = {'image': image, 'label': label}
            exe = paddle.static.Executor(places[0])
            scope = paddle.static.Scope()
            with paddle.static.scope_guard(scope):
                exe.run(startup)
                grads_multi_place = exe.run(main_multi_place,
                                            feed=feed,
                                            fetch_list=[grads])

                # Run the single-place program once per feed slice and
                # collect each gradient's per-slice values.
                feeds = self.split_feed(feed, len(places))
                grads_single_place = [list() for _ in range(len(grads))]
                for f in feeds:
                    gs = exe.run(main_single_place,
                                 feed=f,
                                 fetch_list=[grads])
                    for i, g in enumerate(gs):
                        grads_single_place[i].append(g)

                # Concatenate the slices and scale by the number of places
                # so the values are comparable with the multi-place fetch.
                for i in range(len(grads)):
                    grads_single_place[i] = np.concatenate(
                        grads_single_place[i], axis=0) / len(places)

            self.assertEqual(len(grads_multi_place), len(grads_single_place))
            for g1, g2 in zip(grads_multi_place, grads_single_place):
                self.assertTrue(np.allclose(g1, g2),
                                'g1 = {}\ng2 = {}\n'.format(g1, g2))
def test_combined_loss(self):
    """``CombinedLoss`` with one DMLLoss entry must match the NumPy
    reference from ``self.np_combined_loss`` on every available device."""
    shape = [32, 16]
    x_feat_name = "student"
    y_feat_name = "teacher"
    pairs = [[x_feat_name, y_feat_name]]

    paddle.seed(0)
    predicts = {
        x_feat_name: paddle.rand(shape),
        y_feat_name: paddle.rand(shape),
    }

    dml_cfg = {
        "loss_function": "DMLLoss",
        "weight": 1.0,
        "act": "softmax",
        "model_name_pairs": pairs,
    }
    loss_cfg_list = [dml_cfg]

    devices = ["cpu", "gpu"] if paddle.is_compiled_with_cuda() else ["cpu"]
    for device in devices:
        paddle.set_device(device)
        loss_func = CombinedLoss(loss_config_list=loss_cfg_list)
        pd_result_dict = loss_func(predicts, None)
        np_result_dict = self.np_combined_loss(predicts, loss_cfg_list)
        for name, pd_tensor in pd_result_dict.items():
            self.assertTrue(
                np.allclose(np_result_dict[name], pd_tensor.numpy()))
def calc_distillation_distance_loss(self, predicts, pairs, key=None):
    """Check ``DistillationLoss`` (DistanceLoss) against the NumPy reference.

    Every combination of device, reduction and mode is exercised, and the
    Paddle result must be close to ``self.dist_np_distance_loss`` for every
    key in the reference result.

    Args:
        predicts: Model outputs passed to both the Paddle loss and the
            NumPy reference.
        pairs: Model-name pairs forwarded as ``model_name_pairs``.
        key: Optional layer name. When given, ``[key, key]`` is passed as
            ``layers_name``; when ``None``, ``layers_name`` is ``None``.
    """
    modes = ["l1", "l2", "smooth_l1"]
    reductions = ["none", "mean", "sum"]

    devices = ["cpu"]
    if paddle.is_compiled_with_cuda():
        devices.append("gpu")

    for device in devices:
        paddle.set_device(device)
        for reduction in reductions:
            for mode in modes:
                loss_func = DistillationLoss(
                    mode=mode,
                    loss_function='DistanceLoss',
                    model_name_pairs=pairs,
                    # Identity comparison is the correct test for None.
                    layers_name=[key, key] if key is not None else None,
                    reduction=reduction)
                np_result_dict = self.dist_np_distance_loss(
                    predicts,
                    loss_function='DistanceLoss',
                    mode=mode,
                    reduction=reduction,
                    model_name_pairs=pairs,
                    key=key)
                pd_result_dict = loss_func(predicts, None)
                for k in np_result_dict:
                    pd_result = pd_result_dict[k].numpy()
                    np_result = np_result_dict[k]
                    self.assertTrue(np.allclose(np_result, pd_result))
def __init__(self,
             net,
             size,
             mean=0.0,
             std=1.0,
             nms_method=None,
             iou_threshold=0.3,
             filter_threshold=0.01,
             candidate_size=200,
             sigma=0.5,
             device=None):
    """Store the network and the prediction settings, then put the network
    on the target device in eval mode.

    Args:
        net: Network; it is moved with ``net.to(device)`` and switched to
            eval mode.
        size, mean, std: Forwarded to ``PredictionTransform``.
        nms_method, iou_threshold, filter_threshold, candidate_size, sigma:
            Stored unchanged on the instance.
        device: Device to use. When falsy, the current Paddle device is set
            to ``"gpu"`` on CUDA builds and ``"cpu"`` otherwise, and the
            place returned by ``paddle.set_device`` is used.
    """
    self.net = net
    self.transform = PredictionTransform(size, mean, std)
    self.iou_threshold = iou_threshold
    self.filter_threshold = filter_threshold
    self.candidate_size = candidate_size
    self.nms_method = nms_method
    self.sigma = sigma

    if device:
        self.device = device
    else:
        # Paddle names the CUDA device "gpu"; "cuda" is not an accepted
        # device string for paddle.set_device.
        self.device = paddle.set_device(
            "gpu" if paddle.is_compiled_with_cuda() else "cpu")

    self.net.to(self.device)
    self.net.eval()

    self.timer = Timer()
def test_synchronize(self):
    """``cuda.synchronize`` returns None for no argument, an int id and a
    CUDAPlace, and raises ValueError for the string "gpu:0"."""
    if not paddle.is_compiled_with_cuda():
        return

    result = cuda.synchronize()
    self.assertIsNone(result)

    result = cuda.synchronize(0)
    self.assertIsNone(result)

    result = cuda.synchronize(paddle.CUDAPlace(0))
    self.assertIsNone(result)

    self.assertRaises(ValueError, cuda.synchronize, "gpu:0")
def run_main(self, use_fp16, use_master_param_norm=True):
    """Run the model with and without ``use_distributed_lamb`` and require
    the paired results to differ by less than a precision-dependent atol.

    Args:
        use_fp16: Forwarded to ``run_model``; also selects the tolerance.
        use_master_param_norm: Forwarded to ``run_model``. Must be truthy
            when ``use_fp16`` is falsy.
    """
    if not paddle.is_compiled_with_cuda():
        return

    if not use_fp16:
        self.assertTrue(use_master_param_norm)

    base_config = self.config()
    shared = dict(use_fp16=use_fp16,
                  use_master_param_norm=use_master_param_norm)
    config1 = dict(base_config, use_distributed_lamb=True, **shared)
    config2 = dict(base_config, use_distributed_lamb=False, **shared)

    result1 = run_model(**config1)
    result2 = run_model(**config2)
    self.assertEqual(len(result1), len(result2))

    # fp32 results must agree almost exactly; fp16 gets a looser bound.
    if not use_fp16:
        atol = 1e-7
    elif use_master_param_norm:
        atol = 8e-4
    else:
        atol = 1e-3

    for ret1, ret2 in zip(result1, result2):
        max_diff = np.max(np.abs(ret1 - ret2))
        msg = 'max_diff = {} atol = {} when use_fp16 = {} , use_master_param_norm = {}'.format(
            max_diff, atol, use_fp16, use_master_param_norm)
        self.assertTrue(max_diff < atol, msg)
        print(msg)
def test_check_output_gpu(self):
    """Validate ``paddle.linalg.eigh`` on CUDAPlace(0) in dygraph mode.

    Skipped when Paddle is not compiled with CUDA.
    """
    if not paddle.is_compiled_with_cuda():
        return
    paddle.disable_static(place=paddle.CUDAPlace(0))
    x_tensor = paddle.to_tensor(self.x_np)
    eigenvalues, eigenvectors = paddle.linalg.eigh(x_tensor, self.UPLO)
    valid_eigh_result(self.x_np, eigenvalues.numpy(), eigenvectors.numpy(),
                      self.UPLO)
def test_check_output_gpu(self):
    """Compare ``paddle.linalg.eigvalsh`` on CUDAPlace(0) with NumPy.

    Skipped when Paddle is not compiled with CUDA.
    """
    if not paddle.is_compiled_with_cuda():
        return
    paddle.disable_static(place=paddle.CUDAPlace(0))
    x_tensor = paddle.to_tensor(self.x_np)
    reference = np.linalg.eigvalsh(self.x_np)
    computed = paddle.linalg.eigvalsh(x_tensor)
    compare_result(computed.numpy(), reference)
def _enable_gpu(self):
    """Return ``self.resource_quota.on_gpu``.

    Raises:
        BentoMLException: If GPU use is requested but the installed Paddle
            is not compiled with CUDA.
    """
    requested = self.resource_quota.on_gpu
    if not requested:
        return requested
    if paddle.is_compiled_with_cuda():  # type: ignore
        return requested
    raise BentoMLException(
        "`resource_quota.on_gpu=True` while CUDA is not currently supported by existing paddlepaddle."
        " Make sure to install `paddlepaddle-gpu` and try again.")
def _build_predict_fn(self, rebuild: bool = False):
    """Create ``self.predict_fn`` if it is missing or ``rebuild`` is set.

    The built function takes a 4-D array, runs ``self.paddle_model`` under
    ``paddle.no_grad()`` and returns the softmax over axis 1 as a NumPy
    array.

    Args:
        rebuild: When True, replace an already existing ``predict_fn``.
    """
    if self.predict_fn is not None:
        assert callable(self.predict_fn), "predict_fn is predefined before, but is not callable." \
            "Check it again."

    import paddle

    if self.predict_fn is not None and not rebuild:
        return

    if not paddle.is_compiled_with_cuda() and self.device.startswith('gpu'):
        print("Paddle is not installed with GPU support. Change to CPU version now.")
        self.device = 'cpu'

    # self.device is one of ['cpu', 'gpu:0', 'gpu:1', ...]
    paddle.set_device(self.device)

    # Only predictions are needed here, so the model is put in eval mode.
    self.paddle_model.eval()

    def predict_fn(data):
        assert len(data.shape) == 4  # [bs, h, w, 3]
        with paddle.no_grad():
            # logits: [bs, num_c]
            logits = self.paddle_model(paddle.to_tensor(data))
            # Turn logits into probabilities.
            probas = paddle.nn.functional.softmax(logits, axis=1)
        return probas.numpy()

    self.predict_fn = predict_fn
def train(self, place, iters, feed, use_cinn=False, seed=1234):
    """Train for ``iters`` data-parallel steps and return the losses.

    Args:
        place: Place the executor runs on.
        iters: Number of steps; ``feed[step]`` is fed at each step.
        feed: Indexable collection of per-step feed data.
        use_cinn: Forwarded to ``set_cinn_flag`` before building.
        seed: Seed applied to both NumPy and Paddle.

    Returns:
        A list with ``[0][0]`` of each step's fetched loss.
    """
    np.random.seed(seed)
    paddle.seed(seed)
    if paddle.is_compiled_with_cuda():
        # Ask cuDNN for deterministic kernels on CUDA builds.
        paddle.set_flags({'FLAGS_cudnn_deterministic': 1})
    set_cinn_flag(use_cinn)

    startup_program = paddle.static.Program()
    main_program = paddle.static.Program()
    loss = self.build_program(main_program, startup_program)

    exe = paddle.static.Executor(place)
    compiled_prog = paddle.static.CompiledProgram(
        main_program).with_data_parallel(loss_name=loss.name)

    # Run inside a fresh scope so variables do not leak between calls.
    with paddle.static.scope_guard(paddle.static.Scope()):
        exe.run(startup_program)
        loss_vals = [
            exe.run(compiled_prog,
                    feed=feed[step],
                    fetch_list=[loss],
                    return_numpy=True)[0][0]
            for step in range(iters)
        ]
    return loss_vals
def test_fixed_random_number(self):
    """With fixed seeds, ``paddle.bernoulli`` on GPU must reproduce the
    recorded non-zero index sums and a recorded slice, for float64 and
    then float32 input."""
    # Test GPU Fixed random number, which is generated by 'curandStatePhilox4_32_10_t'
    if not paddle.is_compiled_with_cuda():
        return

    print("Test Fixed Random number on GPU------>")
    paddle.disable_static()
    paddle.set_device('gpu')
    paddle.seed(100)
    np.random.seed(100)

    x_np = np.random.rand(32, 1024, 1024)

    # (dtype, expected sums of the non-zero indices per axis, expected slice)
    cases = (
        ('float64',
         (260028995, 8582429431, 8581445798),
         [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.]),
        ('float32',
         (260092343, 8583509076, 8582778540),
         [0., 0., 1., 1., 1., 1., 0., 1., 1., 1.]),
    )
    for dtype, index_sums, expect in cases:
        sample = paddle.bernoulli(paddle.to_tensor(x_np, dtype=dtype)).numpy()
        for axis_indices, expected_sum in zip(np.nonzero(sample), index_sums):
            self.assertEqual(np.sum(axis_indices), expected_sum)
        self.assertTrue(np.array_equal(sample[16, 500, 500:510], expect))

    paddle.enable_static()
def test_identity(self):
    """Run ``_test_identity`` on CPU, then on CUDAPlace(0) for CUDA builds."""
    self.place = paddle.CPUPlace()
    self._test_identity()

    if not paddle.is_compiled_with_cuda():
        return
    self.place = paddle.CUDAPlace(0)
    self._test_identity()
def setUp(self):
    """Install the custom relu egg, import it, and set up test inputs.

    Runs ``custom_relu_setup.py install`` from this file's directory, adds
    the installed egg to ``sys.path``, then records the custom ops, dtypes
    and devices to test and seeds Paddle.
    """
    cur_dir = os.path.dirname(os.path.abspath(__file__))
    on_windows = os.name == 'nt'

    # Compile and install the custom op egg into site-packages.
    if on_windows:
        install_template = 'cd /d {} && python custom_relu_setup.py install'
    else:
        install_template = 'cd {} && python custom_relu_setup.py install'
    run_cmd(install_template.format(cur_dir))

    # NOTE(Aurelius84): Users normally do not need the code below. The
    # install above happens while this interpreter is already running, so
    # the interpreter does not notice the new site-packages entry and
    # sys.path has to be updated by hand.
    # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3
    #
    # NOTE(zhouwei25): on Windows getsitepackages() returns
    # [python install dir, site packages dir], hence index 1 there.
    site_dir = site.getsitepackages()[1 if on_windows else 0]
    matched_eggs = [
        entry for entry in os.listdir(site_dir)
        if 'custom_relu_module_setup' in entry
    ]
    assert len(matched_eggs) == 1, "Matched egg number is %d." % len(
        matched_eggs)
    sys.path.append(os.path.join(site_dir, matched_eggs[0]))

    # Usage: import the installed package directly.
    import custom_relu_module_setup

    # `custom_relu_dup` is the same as `custom_relu`.
    self.custom_ops = [
        custom_relu_module_setup.custom_relu,
        custom_relu_module_setup.custom_relu_dup,
    ]

    self.dtypes = ['float32', 'float64']
    self.devices = ['cpu']
    # float16 and the gpu device are only added on CUDA builds.
    if paddle.is_compiled_with_cuda():
        self.dtypes.append('float16')
        self.devices.append('gpu')

    # Seed configuration.
    seed = 2021
    paddle.seed(seed)
    paddle.framework.random._manual_program_seed(seed)
def setUp(self):
    """Initialise input data, tolerances, the target place and NumPy's seed."""
    self.init_input_data()
    self.UPLO = 'L'
    # Tolerances for test_eigh_grad.
    self.rtol = self.atol = 1e-5
    if paddle.is_compiled_with_cuda():
        self.place = paddle.CUDAPlace(0)
    else:
        self.place = paddle.CPUPlace()
    np.random.seed(123)
def execute(main_program, startup_program):
    """Run ``startup_program`` and then ``main_program`` on CUDAPlace(0) for
    CUDA builds, otherwise on CPU."""
    use_cuda = paddle.is_compiled_with_cuda()
    place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
    exe = paddle.static.Executor(place)
    for program in (startup_program, main_program):
        exe.run(program)
def setUpClass(cls):
    """On CUDA builds: enable static mode, set the cuDNN deterministic flag,
    call ``_clip_by_global_norm_using_mp_type(True)`` and initialise fleet.
    Does nothing otherwise."""
    if paddle.is_compiled_with_cuda():
        paddle.enable_static()
        paddle.set_flags({'FLAGS_cudnn_deterministic': True})
        _clip_by_global_norm_using_mp_type(True)
        role_maker = get_role_maker()
        fleet.init(role_maker=role_maker)
def test_check_output_gpu(self):
    """``paddle.linalg.eigvalsh`` on CUDAPlace(0) must match NumPy within
    ``self.rtol`` / ``self.atol``.

    Skipped when Paddle is not compiled with CUDA.
    """
    if not paddle.is_compiled_with_cuda():
        return
    paddle.disable_static(place=paddle.CUDAPlace(0))
    x_tensor = paddle.to_tensor(self.x_np)
    reference = np.linalg.eigvalsh(self.x_np)
    computed = paddle.linalg.eigvalsh(x_tensor)
    np.testing.assert_allclose(computed,
                               reference,
                               rtol=self.rtol,
                               atol=self.atol)
def test_stream_guard_default_stream(self):
    """The current stream object must be the same before and after a
    ``stream_guard`` entered with that same stream."""
    if not paddle.is_compiled_with_cuda():
        return
    stream_before = paddle.device.cuda.current_stream()
    with paddle.device.cuda.stream_guard(stream_before):
        pass
    stream_after = paddle.device.cuda.current_stream()
    same_object = id(stream_before) == id(stream_after)
    self.assertTrue(same_object)
def setUp(self):
    """Set the seed, layer sizes, batch size and the devices to test."""
    self.seed = 2021
    self.in_size = self.out_size = 10
    self.batch_size = 4
    # The gpu device is only exercised on CUDA builds.
    if paddle.is_compiled_with_cuda():
        self.devices = ["cpu", "gpu"]
    else:
        self.devices = ["cpu"]
def setUp(self):
    """Prepare the dtype/device lists and the NumPy input, weight and bias."""
    self.dtypes = ['float32', 'float64']
    # The gpu device is only exercised on CUDA builds.
    if paddle.is_compiled_with_cuda():
        self.devices = ['cpu', 'gpu']
    else:
        self.devices = ['cpu']

    random_input = np.random.random((3, 2))
    self.np_x = random_input.astype("float32")
    self.np_weight = np.full([2, 4], 0.5, dtype="float32")
    self.np_bias = np.ones([4], dtype="float32")
def setUp(self):
    """Register the 'seed0' and 'seed1' generators (both seeded with 123),
    look each one up once, and list the places to test."""
    random_api = paddle.framework.random
    generator_names = ('seed0', 'seed1')
    for name in generator_names:
        random_api.set_random_seed_generator(name, 123)
    # The lookups are kept, but their results are not used here.
    for name in generator_names:
        random_api.get_random_seed_generator(name)

    self.places = [paddle.CPUPlace()]
    if paddle.is_compiled_with_cuda():
        self.places.append(paddle.CUDAPlace(0))
def test_ptq(self):
    """Train an fp32 LeNet on MNIST, quantize it with PTQ, and require the
    top-1 accuracy drop to stay below 0.002.

    The quantized model is calibrated on the test reader and saved to
    "./tmp/model" before being evaluated.
    """
    seed = 1
    np.random.seed(seed)
    paddle.static.default_main_program().random_seed = seed
    paddle.static.default_startup_program().random_seed = seed

    _logger.info("create the fp32 model")
    fp32_lenet = ImperativeLenet()

    _logger.info("prepare data")
    batch_size = 64
    transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])])
    dataset_kwargs = dict(backend='cv2', transform=transform)
    train_dataset = paddle.vision.datasets.MNIST(mode='train',
                                                 **dataset_kwargs)
    val_dataset = paddle.vision.datasets.MNIST(mode='test', **dataset_kwargs)

    if paddle.is_compiled_with_cuda():
        place = paddle.CUDAPlace(0)
    else:
        place = paddle.CPUPlace()

    train_reader = paddle.io.DataLoader(train_dataset,
                                        drop_last=True,
                                        places=place,
                                        batch_size=batch_size,
                                        return_list=True)
    test_reader = paddle.io.DataLoader(val_dataset,
                                       places=place,
                                       batch_size=batch_size,
                                       return_list=True)

    _logger.info("train the fp32 model")
    self.model_train(fp32_lenet, train_reader)

    _logger.info("test fp32 model")
    fp32_top1, fp32_top5 = self.model_test(fp32_lenet, test_reader)

    _logger.info("quantize the fp32 model")
    quanter = PTQ()
    quant_lenet = quanter.quantize(fp32_lenet, fuse=True)

    _logger.info("calibrate")
    self.calibrate(quant_lenet, test_reader)

    _logger.info("save and test the quantized model")
    save_path = "./tmp/model"
    input_spec = paddle.static.InputSpec(shape=[None, 1, 28, 28],
                                         dtype='float32')
    quanter.save_quantized_model(quant_lenet,
                                 save_path,
                                 input_spec=[input_spec])
    quant_top1, quant_top5 = self.model_test(quant_lenet, test_reader)

    _logger.info("FP32 acc: top1: {}, top5: {}".format(fp32_top1, fp32_top5))
    _logger.info("Int acc: top1: {}, top5: {}".format(quant_top1, quant_top5))

    # Largest top-1 accuracy drop tolerated after quantization.
    diff = 0.002
    accuracy_drop = fp32_top1 - quant_top1
    self.assertTrue(
        accuracy_drop < diff,
        msg="The acc of quant model is too lower than fp32 model")