def test_smooth_l1_loss(_):
    """Check ``flow.smooth_l1_loss`` against the NumPy reference values.

    Every argument combination produced by ``GenArgList`` (device, prediction
    shape, data type, beta) is run through a training job.  Both the loss
    returned by the job and the gradient observed on the trainable variable
    are compared with the results of ``gen_numpy_data``.
    """
    arg_dict = OrderedDict(
        [
            ("device_type", ["gpu", "cpu"]),
            ("prediction_shape", [(100,), (10, 10)]),
            ("data_type", ["float32", "double"]),
            ("beta", [0, 0.5, 1]),
        ]
    )

    for case in GenArgList(arg_dict):
        device_type, prediction_shape, data_type, beta = case
        assert device_type in ["gpu", "cpu"]
        assert data_type in ["float32", "double", "int8", "int32", "int64"]

        flow.clear_default_session()
        func_config = flow.FunctionConfig()
        func_config.default_data_type(flow.float)

        np_dtype = type_name_to_np_type[data_type]
        prediction = np.random.randn(*prediction_shape).astype(np_dtype)
        label = np.random.randn(*prediction_shape).astype(np_dtype)
        np_result = gen_numpy_data(prediction, label, beta)

        def assert_prediction_grad(b):
            # Callback for watch_diff: compare the observed gradient with the
            # NumPy reference gradient of the current case.
            expected_grad = np_result["prediction_grad"]
            assert expected_grad.dtype == type_name_to_np_type[data_type]
            assert np.allclose(expected_grad, b.numpy()), (
                case,
                expected_grad,
                b.numpy(),
            )

        flow_dtype = type_name_to_flow_type[data_type]

        @flow.global_function(type="train", function_config=func_config)
        def TestJob(
            prediction: oft.Numpy.Placeholder(prediction_shape, dtype=flow_dtype),
            label: oft.Numpy.Placeholder(prediction_shape, dtype=flow_dtype),
        ):
            # A zero-initialised trainable variable is added to the input so
            # that a gradient can be watched.
            v = flow.get_variable(
                "prediction",
                shape=prediction_shape,
                dtype=type_name_to_flow_type[data_type],
                initializer=flow.constant_initializer(0),
                trainable=True,
            )
            flow.watch_diff(v, assert_prediction_grad)
            prediction += v
            with flow.scope.placement(device_type, "0:0"):
                loss = flow.smooth_l1_loss(prediction, label, beta)
                lr_schedule = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
                flow.optimizer.SGD(lr_schedule, momentum=0).minimize(loss)
                return loss

        expected_loss = np_result["loss"]
        assert expected_loss.dtype == type_name_to_np_type[data_type]
        actual_loss = TestJob(prediction, label).get().numpy()
        assert np.allclose(expected_loss, actual_loss), (
            case,
            expected_loss,
            actual_loss,
        )
def _run_test_moving_average_min_max_observer(
    test_case,
    device_type,
    device_num,
    dtype,
    activation_shape,
    quantization_bit,
    quantization_scheme,
    momentum,
):
    """Run the moving-average min/max observer job ten times and check it.

    Each iteration feeds a fresh random activation (values in [-0.5, 0.5))
    to the job and hands the returned scale / zero point, together with the
    running min/max buffers, to ``_check_moving_average_min_max_observer``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()

    # Configure the device count on the matching device family.
    set_device_num = (
        flow.config.cpu_device_num
        if device_type == "cpu"
        else flow.config.gpu_device_num
    )
    set_device_num(device_num)

    @flow.global_function(type="train", function_config=flow.FunctionConfig())
    def QuantizeJob(
        activation: oft.Numpy.Placeholder(
            activation_shape, dtype=type_name_to_flow_type[dtype]
        )
    ):
        with flow.scope.placement(device_type, "0:0-%d" % (device_num - 1)):
            x = flow.get_variable(
                "x",
                shape=activation_shape,
                dtype=activation.dtype,
                initializer=flow.zeros_initializer(activation.dtype),
                trainable=True,
            )
            observed = flow.quantization.moving_average_min_maxObserver(
                activation, quantization_bit, quantization_scheme, momentum,
            )
            scale, zero_point = observed
            # The loss exists only so the train-type job has something to
            # minimise.
            loss = flow.math.reduce_mean(x + activation)
            lr_schedule = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
            flow.optimizer.Adam(lr_schedule).minimize(loss)
        return scale, zero_point

    flow.train.CheckPoint().init()

    # Running buffers passed to the checker on every iteration.
    moving_max_np = np.zeros((1,))
    moving_min_np = np.zeros((1,))
    np_dtype = type_name_to_np_type[dtype]
    for _ in range(10):
        activation = (np.random.random(activation_shape) - 0.5).astype(np_dtype)
        scale, zero_point = QuantizeJob(activation).get()
        _check_moving_average_min_max_observer(
            test_case,
            activation,
            scale.numpy(),
            zero_point.numpy(),
            moving_max_np,
            moving_min_np,
            quantization_bit,
            quantization_scheme,
            momentum,
        )
def compare_with_tensorflow(device_type, activation_type, shape, data_type):
    """Compare a OneFlow activation (forward and backward) with TensorFlow.

    Args:
        device_type: ``"gpu"`` or ``"cpu"``.
        activation_type: key into the activation maps below
            (``"relu"``, ``"sigmoid"`` or ``"tanh"``).
        shape: shape of the trainable input variable.
        data_type: OneFlow data type; ``flow.float16`` enables auto mixed
            precision and the variable itself is created as ``flow.float``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    flow.config.enable_debug_mode(True)
    func_config = flow.FunctionConfig()
    if data_type == flow.float16:
        func_config.enable_auto_mixed_precision(True)
        data_type = flow.float
    func_config.default_data_type(data_type)

    of_activation_map = {
        "relu": flow.nn.relu,
        "sigmoid": flow.math.sigmoid,
        "tanh": flow.math.tanh,
    }
    tf_activation_map = {
        "relu": tf.nn.relu,
        "sigmoid": tf.math.sigmoid,
        "tanh": tf.math.tanh,
        # "gelu": tfa.activations.gelu,
    }

    @flow.global_function(type="train", function_config=func_config)
    def ActivationJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=shape,
                dtype=data_type,
                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
                trainable=True,
            )
            loss = of_activation_map[activation_type](x)
            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)

            # Record values and gradients so the TensorFlow side can replay
            # exactly the same input.
            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    # OneFlow
    check_point = flow.train.CheckPoint()
    check_point.init()
    of_out = ActivationJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x"))
        tf_out = tf_activation_map[activation_type](x)
    loss_diff = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)

    # Strings must be compared by value: ``is`` tests object identity, which
    # is an interpreter detail for str literals and triggers a SyntaxWarning.
    is_gelu = activation_type == "gelu"
    rtol = 1e-3 if is_gelu else 1e-5
    atol = 1e-3 if is_gelu else 1e-5
    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol, atol)
    assert np.allclose(
        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol, atol
    )
def CompareNnBnWithTensorFlow(
    device_type,
    input_shape,
    data_type,
    axis,
    epsilon,
    input_minval=-10,
    input_maxval=10,
    y_rtol=1e-5,
    y_atol=1e-5,
    x_diff_rtol=1e-5,
    x_diff_atol=1e-5,
):
    """Compare ``flow.nn.batch_normalization`` with TensorFlow.

    Random input and per-channel parameters are generated with NumPy, fed
    through a OneFlow training job and through ``tf.nn.batch_normalization``;
    the outputs and the input gradients must agree within the tolerances.

    Args:
        device_type: device string used for the OneFlow placement scope.
        input_shape: shape of the input tensor.
        data_type: ``"float16"`` casts the input to half precision on both
            sides; any other value keeps float32.
        axis: channel axis of ``input_shape``.
        epsilon: variance epsilon passed to both implementations.
        input_minval, input_maxval: range of the uniform random data
            (variance is drawn from ``[0, input_maxval)``).
        y_rtol, y_atol: tolerances for the forward output.
        x_diff_rtol, x_diff_atol: tolerances for the input gradient.
    """
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_logical_view(flow.scope.consistent_view())
    func_config.default_data_type(flow.float32)

    x = np.random.uniform(
        low=input_minval, high=input_maxval, size=input_shape
    ).astype(np.float32)
    param_shape = input_shape[axis]
    mean = np.random.uniform(
        low=input_minval, high=input_maxval, size=param_shape
    ).astype(np.float32)
    variance = np.random.uniform(
        low=0, high=input_maxval, size=param_shape
    ).astype(np.float32)
    offset = np.random.uniform(
        low=input_minval, high=input_maxval, size=param_shape
    ).astype(np.float32)
    scale = np.random.uniform(
        low=input_minval, high=input_maxval, size=param_shape
    ).astype(np.float32)

    @flow.global_function(type="train", function_config=func_config)
    def FlowNnBnJob(
        x_full_precision: oft.Numpy.Placeholder(x.shape),
        mean: oft.Numpy.Placeholder(mean.shape),
        variance: oft.Numpy.Placeholder(variance.shape),
        offset: oft.Numpy.Placeholder(offset.shape),
        scale: oft.Numpy.Placeholder(scale.shape),
    ):
        with flow.scope.placement(device_type, "0:0"):
            # Adding a zero variable makes the input part of the trainable
            # graph so its gradient can be watched.
            x_full_precision += flow.get_variable(
                name="v1",
                shape=(1,),
                dtype=flow.float32,
                initializer=flow.zeros_initializer(),
            )
            if data_type == "float16":
                x = flow.cast(x_full_precision, flow.float16)
            else:
                x = x_full_precision
            y = flow.nn.batch_normalization(
                x, mean, variance, offset, scale, epsilon, axis=axis
            )
            y = flow.cast(y, flow.float32)
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0]), momentum=0
            ).minimize(y)
            flow.watch_diff(x_full_precision, test_global_storage.Setter("x_diff"))
            return y

    check_point = flow.train.CheckPoint()
    check_point.init()
    of_y = FlowNnBnJob(x, mean, variance, offset, scale).get().numpy()
    of_x_diff = test_global_storage.Get("x_diff")

    def TensorFlowNnBn(x, mean, variance, offset, scale):
        # Reshape the 1-D parameters so they broadcast along ``axis``.  The
        # rank follows the input instead of being fixed at four, so inputs of
        # any rank broadcast correctly (4-D inputs behave as before).
        tf_params_shape = [1] * len(input_shape)
        tf_params_shape[axis] = input_shape[axis]
        with tf.GradientTape(persistent=True) as tape:
            x = tf.Variable(x)
            if data_type == "float16":
                x = tf.cast(x, tf.float16)
            mean = tf.Variable(mean.reshape(tf_params_shape))
            variance = tf.Variable(variance.reshape(tf_params_shape))
            offset = tf.Variable(offset.reshape(tf_params_shape))
            scale = tf.Variable(scale.reshape(tf_params_shape))
            y = tf.cast(
                tf.nn.batch_normalization(
                    x, mean, variance, offset, scale, epsilon
                ),
                tf.float32,
            )
        x_diff = tape.gradient(y, x)
        return y.numpy(), x_diff.numpy()

    tf_y, tf_x_diff = TensorFlowNnBn(x, mean, variance, offset, scale)
    assert np.allclose(of_y, tf_y, rtol=y_rtol, atol=y_atol)
    assert np.allclose(of_x_diff, tf_x_diff, rtol=x_diff_rtol, atol=x_diff_atol)
def setUp(self):
    """Prepare the OneFlow environment before each test.

    When a node list is configured, the workers are initialised once per
    process (guarded by ``_unittest_worker_initilized``), either from the
    host list or from bootstrap confs.  The OneFlow env itself is
    initialised once (guarded by ``_unittest_env_initilized``).  Every call
    then clears the default session and applies the eager-execution and
    typing-check switches.

    Environment variables read: ``ONEFLOW_TEST_MASTER_PORT`` and
    ``ONEFLOW_TEST_SSH_PORT`` (both required for worker initialisation),
    ``ONEFLOW_TEST_CTRL_PORT``, ``ONEFLOW_TEST_DATA_PORT`` and
    ``ONEFLOW_TEST_LOG_DIR`` (optional).
    """
    global _unittest_env_initilized
    global _unittest_worker_initilized

    if has_node_list():
        assert node_size() > 1
        if not _unittest_worker_initilized:
            master_port = os.getenv("ONEFLOW_TEST_MASTER_PORT")
            assert master_port, "env var ONEFLOW_TEST_MASTER_PORT not set"
            oneflow.env.ctrl_port(int(master_port))

            if enable_init_by_host_list():
                oneflow.env.machine(node_list())
                init_worker_kwargs = {}
                delete_worker = oneflow.deprecated.delete_worker
            else:
                ctrl_port = os.getenv("ONEFLOW_TEST_CTRL_PORT")
                config_rank_ctrl_port = int(ctrl_port) if ctrl_port else -1
                config_world_size = world_size() if has_world_size() else 0
                bootstrap_conf_list = oneflow.env.init_bootstrap_confs(
                    node_list(),
                    int(master_port),
                    config_world_size,
                    config_rank_ctrl_port,
                )
                init_worker_kwargs = {"bootstrap_conf_list": bootstrap_conf_list}
                delete_worker = oneflow.deprecated.delete_worker_by_bootstrap

            data_port = os.getenv("ONEFLOW_TEST_DATA_PORT")
            if data_port:
                oneflow.env.data_port(int(data_port))

            ssh_port = os.getenv("ONEFLOW_TEST_SSH_PORT")
            # Fail with a clear message instead of the TypeError that
            # int(None) would raise when the variable is missing.
            assert ssh_port, "env var ONEFLOW_TEST_SSH_PORT not set"

            print("initializing worker...")
            oneflow.deprecated.init_worker(
                scp_binary=True,
                use_uuid=True,
                ssh_port=int(ssh_port),
                **init_worker_kwargs,
            )
            # The cleanup hook receives the raw environment string, exactly
            # as before.
            atexit.register(delete_worker, ssh_port=ssh_port)
            _unittest_worker_initilized = True

    log_dir = os.getenv("ONEFLOW_TEST_LOG_DIR")
    if log_dir:
        oneflow.env.log_dir(log_dir)

    if not _unittest_env_initilized:
        oneflow.env.init()
        _unittest_env_initilized = True

    oneflow.clear_default_session()
    oneflow.enable_eager_execution(eager_execution_enabled())
    oneflow.experimental.enable_typing_check(typing_check_enabled())
def compare_with_numpy_indexed_slices_sgdw(
    device_type,
    model_shape,
    ids_shape,
    grad_shape,
    momentum_beta,
    learning_rate,
    train_iters,
    mul_scalar,
    weight_decay,
):
    """Compare OneFlow's indexed-slices SGDW update with a NumPy reference.

    The job is run ``train_iters + 1`` times and returns the embedding table
    on every call; the first returned table is used as the initial value and
    the last one is compared with the result of replaying ``train_iters``
    updates in NumPy.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)
    func_config.indexed_slices_optimizer_conf(
        {"include_op_names": {"op_name": ["embeddings"]}}
    )

    @flow.global_function(type="train", function_config=func_config)
    def testIndexedSlicesSGDW(
        sparse_ids: flow.typing.Numpy.Placeholder(ids_shape, dtype=flow.int32),
    ) -> flow.typing.Numpy:
        with flow.scope.placement(device_type, "0:0"):
            embedding_table = flow.get_variable(
                name="embeddings",
                shape=model_shape,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
            )
            embedding = flow.gather(
                params=embedding_table * mul_scalar, indices=sparse_ids
            )
            loss = flow.math.reduce_mean(embedding)
            lr_schedule = flow.optimizer.PiecewiseConstantScheduler(
                [], [learning_rate]
            )
            flow.optimizer.SGDW(
                lr_schedule, momentum=momentum_beta, weight_decay=weight_decay
            ).minimize(loss)
            return embedding_table

    sparse_ids = np.random.randint(model_shape[0], size=ids_shape).astype(np.int32)

    init_value = None
    for step in range(train_iters + 1):
        x = testIndexedSlicesSGDW(sparse_ids)
        if step == 0:
            init_value = np.copy(x)

    def indexed_slices_update_numpy(
        param, unique_dict, iter, momentum, lr=0.001, momentum_beta=0, weight_decay=0.9,
    ):
        # One SGDW step applied only to the rows present in ``unique_dict``.
        new_param = np.copy(param)
        new_momentum = np.copy(momentum)
        for row, row_grad in unique_dict.items():
            row_momentum = momentum_beta * new_momentum[row] - lr * row_grad
            new_momentum[row] = row_momentum
            new_param[row] = (
                param[row] + row_momentum - lr * weight_decay * param[row]
            )
        return new_param, new_momentum

    param = init_value
    gradient = np.full(grad_shape, float(mul_scalar) / np.prod(grad_shape))
    momentum = np.zeros(param.shape)
    unique_dict = unique_grads(sparse_ids, gradient)
    for step in range(train_iters):
        param, momentum = indexed_slices_update_numpy(
            param,
            unique_dict,
            step,
            momentum,
            learning_rate,
            momentum_beta,
            weight_decay,
        )

    assert np.allclose(x.flatten(), param.flatten(), rtol=1e-4, atol=1e-4)
def compare_with_tensorflow_adam(
    device_type, x_shape, beta1, beta2, epsilon, learning_rate, train_iters
):
    """Compare OneFlow's Adam optimizer with ``tf.keras.optimizers.Adam``.

    The OneFlow job is run ``train_iters + 1`` times, each time with its own
    random mask.  The variable returned by the first call seeds a TensorFlow
    variable, which is then trained for ``train_iters`` steps with the same
    masks; the final values must match.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)

    @flow.global_function(type="train", function_config=func_config)
    def testAdam(
        random_mask: flow.typing.Numpy.Placeholder(x_shape, dtype=flow.float32)
    ) -> flow.typing.Numpy:
        with flow.scope.placement(device_type, "0:0-0"):
            x = flow.get_variable(
                name="x",
                shape=x_shape,
                dtype=flow.float32,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
                trainable=True,
            )
            loss = flow.math.reduce_mean(x * random_mask)
            lr_schedule = flow.optimizer.PiecewiseConstantScheduler(
                [], [learning_rate]
            )
            flow.optimizer.Adam(
                lr_schedule,
                beta1=beta1,
                beta2=beta2,
                epsilon=epsilon,
                do_bias_correction=True,
            ).minimize(loss)
            return x

    # generate random number sequences
    random_masks_seq = [
        np.random.uniform(size=x_shape).astype(np.float32)
        for _ in range(train_iters + 1)
    ]

    init_value = None
    for step, mask in enumerate(random_masks_seq):
        x = testAdam(mask)
        if step == 0:
            init_value = np.copy(x)

    # Replay the same training with TensorFlow, starting from the value the
    # first OneFlow call returned.
    var = tf.Variable(init_value)
    opt = tf.keras.optimizers.Adam(
        learning_rate=learning_rate,
        beta_1=beta1,
        beta_2=beta2,
        epsilon=epsilon,
        amsgrad=False,
    )
    for mask in random_masks_seq[:train_iters]:
        with tf.GradientTape() as tape:
            random_mask = tf.Variable(mask)
            loss = tf.reduce_mean(var * random_mask)
        gradients = tape.gradient(loss, var)
        opt.apply_gradients([(gradients, var)])

    assert np.allclose(
        x.flatten(), var.numpy().flatten(), rtol=1e-4, atol=1e-4
    )
def compare_with_tensorflow(
    device_type,
    x_shape,
    filters,
    kernel_size,
    groups,
    of_padding="SAME",
    tf_padding="SAME",
    stride_h=1,
    stride_w=1,
    data_format="NCHW",
    dilation_h=1,
    dilation_w=1,
):
    """Compare ``flow.nn.conv2d`` (forward and backward) with TensorFlow.

    Args:
        device_type: ``"gpu"`` or ``"cpu"``.
        x_shape: shape of the input variable, laid out as ``data_format``.
        filters: number of output channels.
        kernel_size: side length of the square kernel.
        groups: number of convolution groups; must divide both the input
            channel count and ``filters``.
        of_padding, tf_padding: padding arguments for OneFlow / TensorFlow.
        stride_h, stride_w: strides.
        data_format: ``"NCHW"``; any other value is handled as
            channels-last.
        dilation_h, dilation_w: dilation rates.
    """
    assert device_type in ["gpu", "cpu"]

    # Validate the grouping before any job is built.  The channel axis
    # depends on the layout: axis 1 for NCHW, axis 3 otherwise.
    channel_axis = 1 if data_format == "NCHW" else 3
    assert groups > 0
    assert x_shape[channel_axis] % groups == 0
    assert filters % groups == 0

    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.consistent_view())

    # Permutations that bring OneFlow tensors into the NHWC / HWIO layout
    # used on the TensorFlow side.
    if data_format == "NCHW":
        xy_data_transpose = (0, 2, 3, 1)
        weight_data_transpose = (2, 3, 1, 0)
    else:
        xy_data_transpose = (0, 1, 2, 3)
        weight_data_transpose = (1, 2, 3, 0)

    @flow.global_function(type="train", function_config=func_config)
    def ConvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
                trainable=True,
            )
            if data_format == "NCHW":
                weight_shape = (
                    filters,
                    x.shape[1] // groups,
                    kernel_size,
                    kernel_size,
                )
            else:
                weight_shape = (
                    filters,
                    kernel_size,
                    kernel_size,
                    x.shape[3] // groups,
                )
            weight = flow.get_variable(
                "conv-weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
            )
            loss = flow.nn.conv2d(
                x,
                weight,
                strides=[stride_h, stride_w],
                padding=of_padding,
                data_format=data_format,
                dilations=[dilation_h, dilation_w],
                groups=groups,
            )
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
            ).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(weight, test_global_storage.Setter("weight"))
            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    # OneFlow
    of_out = ConvJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x").transpose(xy_data_transpose))
        weight = tf.Variable(
            test_global_storage.Get("weight").transpose(weight_data_transpose)
        )
        tf_out = tf.nn.conv2d(
            x,
            weight,
            strides=[1, stride_h, stride_w, 1],
            padding=tf_padding,
            data_format="NHWC",
            dilations=[1, dilation_h, dilation_w, 1],
        )
    loss_diff = test_global_storage.Get("loss_diff").transpose(xy_data_transpose)
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)

    assert np.allclose(
        of_out.numpy().transpose(xy_data_transpose),
        tf_out.numpy(),
        rtol=1e-5,
        atol=1e-5,
    )
    assert np.allclose(
        test_global_storage.Get("x_diff").transpose(xy_data_transpose),
        tf_x_diff.numpy(),
        rtol=1e-4,
        atol=1e-4,
    )
    assert np.allclose(
        test_global_storage.Get("weight_diff").transpose(weight_data_transpose),
        tf_weight_diff.numpy(),
        rtol=1e-5,
        atol=1e-5,
    )
def compare_with_tensorflow(device_type, data_type, x_shape, case):
    """Compare a tensor-by-scalar-tensor arithmetic op with TensorFlow.

    Args:
        device_type: ``"gpu"`` or ``"cpu"``.
        data_type: accepted for signature compatibility; not used by the
            body (variables are always created as ``flow.float``).
        x_shape: shape of the tensor operand ``x``; the other operand ``y``
            has shape ``(1,)``.
        case: one of ``"add"``, ``"sub"``, ``"mul"``, ``"div"``.
    """
    assert device_type in ["gpu", "cpu"]

    of_ops = {
        "add": flow.math.add,
        "sub": flow.math.subtract,
        "mul": flow.math.multiply,
        "div": flow.math.divide,
    }
    tf_ops = {
        "add": lambda a, b: a + b,
        "sub": lambda a, b: a - b,
        "mul": lambda a, b: a * b,
        "div": lambda a, b: a / b,
    }
    # Reject unknown cases up front; previously they fell through the
    # if/elif chain and surfaced later as an unbound ``loss`` variable.
    assert case in of_ops, "unsupported case: {!r}".format(case)

    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    @flow.global_function(type="train", function_config=func_config)
    def ScalarAddByTensorJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
                trainable=True,
            )
            y = flow.get_variable(
                "y",
                shape=(1,),
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
                trainable=True,
            )
            loss = of_ops[case](x, y)
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
            ).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch(y, test_global_storage.Setter("y"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch_diff(y, test_global_storage.Setter("y_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    # OneFlow
    of_out = ScalarAddByTensorJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x"))
        y = tf.Variable(test_global_storage.Get("y"))
        tf_out = tf_ops[case](x, y)
    loss_diff = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
    tf_y_diff = tape.gradient(tf_out, y, loss_diff)

    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=1e-5, atol=1e-5)
    assert np.allclose(
        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), rtol=1e-5, atol=1e-5
    )
    assert np.allclose(
        test_global_storage.Get("y_diff"), tf_y_diff.numpy(), rtol=1e-5, atol=1e-5
    )
def _compare_mseloss_with_np(
    input_shape, target_shape, device_type, machine_ids, device_counts
):
    """Compare ``flow.nn.MSELoss`` with a NumPy implementation.

    The three reductions (``"none"``, ``"mean"``, ``"sum"``) are checked on
    the forward pass; the gradient is checked for the ``"mean"`` reduction
    only.  ``machine_ids`` is accepted but not used by the body.
    """
    input_np = np.random.random(size=input_shape).astype(np.float32)
    target_np = np.random.random(size=target_shape).astype(np.float32)

    assert device_type in ["cpu", "gpu"]
    flow.clear_default_session()
    if device_type == "cpu":
        flow.config.cpu_device_num(device_counts)
    else:
        flow.config.gpu_device_num(device_counts)
    func_config = flow.FunctionConfig()

    # Use Numpy to compute mseloss
    squared_error = np.square(target_np - input_np)
    np_out_mseloss_dict = {
        "np_mse_loss": squared_error,
        "np_mse_loss_mean": np.mean(squared_error),
        "np_mse_loss_sum": np.sum(squared_error),
    }

    # Use Numpy to compute mseloss grad
    # TODO: the gradient for reduction="sum" would be
    # -2 * (np_target - np_input)
    np_grad_dict = {
        "np_mse_grad_mean": (-2 * (target_np - input_np)) / input_np.size,
    }

    def assert_prediction_grad(blob: tp.Numpy):
        # Evaluate the gradient. Here we only test the reduction type == "mean"
        assert np.allclose(blob, np_grad_dict["np_mse_grad_mean"])

    @flow.global_function(type="train", function_config=func_config)
    def oneflow_mseloss(
        of_input: tp.Numpy.Placeholder(shape=input_np.shape),
        of_target: tp.Numpy.Placeholder(shape=target_np.shape),
    ) -> Dict[str, tp.Numpy]:
        with flow.scope.placement(device_type, "0:0"):
            v = flow.get_variable(
                shape=input_np.shape,
                dtype=flow.float32,
                initializer=flow.zeros_initializer(),
                name="x_var",
            )
            x_var = of_input + v

        flow.watch_diff(x_var, assert_prediction_grad)

        mseloss = flow.nn.MSELoss(
            x_var, of_target, reduction="none", name="of_mseloss"
        )
        mseloss_mean = flow.nn.MSELoss(
            x_var, of_target, reduction="mean", name="of_mseloss_reduce_mean"
        )
        mseloss_sum = flow.nn.MSELoss(
            x_var, of_target, reduction="sum", name="of_mseloss_reduce_sum"
        )

        with flow.scope.placement(device_type, "0:0"):
            lr_schedule = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
            flow.optimizer.SGD(lr_schedule, momentum=0).minimize(mseloss_mean)

        return {
            "of_mse_loss": mseloss,
            "of_mse_loss_mean": mseloss_mean,
            "of_mse_loss_sum": mseloss_sum,
        }

    of_out_mseloss_dict = oneflow_mseloss(input_np, target_np)

    for of_key, np_key in (
        ("of_mse_loss", "np_mse_loss"),
        ("of_mse_loss_mean", "np_mse_loss_mean"),
        ("of_mse_loss_sum", "np_mse_loss_sum"),
    ):
        assert np.allclose(of_out_mseloss_dict[of_key], np_out_mseloss_dict[np_key])
def _make_op_function(
    test_case,
    input,
    padding,
    grad,
    device_type,
    value_type,
    machine_ids,
    device_counts,
):
    """Build a training job that applies ``flow.reflection_pad2d``.

    The returned global function depends on ``value_type`` (float32/float64,
    int32 or float16).  In every variant the gradient observed on the input
    is compared with ``grad`` through ``test_case.assertTrue``.  For any
    other ``value_type`` the function returns ``None``.
    """
    flow.clear_default_session()
    if device_type == "cpu":
        flow.config.cpu_device_num(device_counts)
    else:
        flow.config.gpu_device_num(device_counts)

    func_config = flow.FunctionConfig()
    # global function needs float32 as type of argument and return value
    func_config.default_data_type(
        flow.float32 if value_type == flow.float16 else value_type
    )
    func_config.default_placement_scope(
        flow.scope.placement(device_type, machine_ids)
    )
    func_config.default_logical_view(flow.scope.consistent_view())

    def _compare_diff(blob: tp.Numpy):
        test_case.assertTrue(np.allclose(grad, blob, 1e-3, 1e-3))

    if value_type in (flow.float32, flow.float64):

        @flow.global_function(type="train", function_config=func_config)
        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=value_type)):
            with flow.scope.placement(device_type, "0:0"):
                x += flow.get_variable(
                    name="input",
                    shape=input.shape,
                    dtype=value_type,
                    initializer=flow.zeros_initializer(),
                )
                out = flow.reflection_pad2d(x, padding)
                zero_lr = flow.optimizer.PiecewiseConstantScheduler([], [0])
                flow.optimizer.SGD(zero_lr, momentum=0).minimize(out)

            flow.watch_diff(x, _compare_diff)
            return out

        return op_function

    if value_type == flow.int32:

        @flow.global_function(type="train", function_config=func_config)
        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
            with flow.scope.placement(device_type, "0:0"):
                x += flow.get_variable(
                    name="input",
                    shape=input.shape,
                    dtype=flow.float32,
                    initializer=flow.zeros_initializer(),
                )
                y_int32 = flow.reflection_pad2d(x, padding)
                y_fp32 = flow.cast(y_int32, dtype=flow.float32)
                zero_lr = flow.optimizer.PiecewiseConstantScheduler([], [0])
                flow.optimizer.SGD(zero_lr, momentum=0).minimize(y_fp32)

            flow.watch_diff(x, _compare_diff)
            return y_fp32

        return op_function

    if value_type == flow.float16:

        @flow.global_function(type="train", function_config=func_config)
        def op_function(x: tp.Numpy.Placeholder(input.shape, dtype=flow.float32)):
            with flow.scope.placement(device_type, "0:0"):
                x_var = flow.get_variable(
                    name="input",
                    shape=input.shape,
                    dtype=flow.float32,
                    initializer=flow.constant_initializer(0),
                )
                x_var = flow.cast_to_current_logical_view(x_var)
                input_x = x_var + x
                x_fp32 = flow.cast(input_x, flow.float32)
                # The op itself runs in half precision; the result is cast
                # back to float32 before it is minimised and returned.
                x_fp16 = flow.cast(input_x, dtype=flow.float16)
                y_fp16 = flow.reflection_pad2d(x_fp16, padding)
                y_fp32 = flow.cast(y_fp16, dtype=flow.float32)
                zero_lr = flow.optimizer.PiecewiseConstantScheduler([], [0])
                flow.optimizer.SGD(zero_lr, momentum=0).minimize(y_fp32)

            flow.watch_diff(x_fp32, _compare_diff)
            return y_fp32

        return op_function

    return None
def _test_batchnorm_add_relu(test_case, input_shape, axis, data_type):
    """Check the fused batch-norm + add + relu layer against the unfused ops.

    One job evaluates ``flow.layers.batch_normalization_add_relu`` and the
    equivalent ``relu(batch_normalization(x) + addend)`` on identical
    inputs; the outputs and the gradients on both inputs must agree.
    """
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_logical_view(flow.scope.consistent_view())
    func_config.default_data_type(flow.float32)

    @flow.global_function(type="train", function_config=func_config)
    def test_job(
        x: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
        addend: oft.Numpy.Placeholder(input_shape, dtype=flow.float32),
    ):
        v = flow.get_variable(
            name="v",
            shape=(1,),
            dtype=flow.float32,
            initializer=flow.zeros_initializer(),
        )
        x = x + v
        addend = addend + v

        # Separate identity copies give each path its own gradient to watch.
        fused_x = flow.identity(x)
        plain_x = flow.identity(x)
        fused_addend = flow.identity(addend)
        plain_addend = flow.identity(addend)
        flow.watch_diff(fused_x, test_global_storage.Setter("x1_diff"))
        flow.watch_diff(plain_x, test_global_storage.Setter("x2_diff"))
        flow.watch_diff(fused_addend, test_global_storage.Setter("addend1_diff"))
        flow.watch_diff(plain_addend, test_global_storage.Setter("addend2_diff"))

        fused_x = flow.cast(fused_x, data_type)
        plain_x = flow.cast(plain_x, data_type)
        fused_addend = flow.cast(fused_addend, data_type)
        plain_addend = flow.cast(plain_addend, data_type)

        fused_y = flow.layers.batch_normalization_add_relu(
            fused_x, addend=fused_addend, axis=axis, name="BN1"
        )
        plain_y = flow.math.relu(
            flow.layers.batch_normalization(plain_x, axis=axis, name="BN2")
            + plain_addend
        )
        fused_y = flow.cast(fused_y, flow.float32)
        plain_y = flow.cast(plain_y, flow.float32)
        flow.watch(fused_y, test_global_storage.Setter("y1"))
        flow.watch(plain_y, test_global_storage.Setter("y2"))

        loss = flow.math.reduce_mean(fused_y + plain_y)
        lr_schedule = flow.optimizer.PiecewiseConstantScheduler([], [0.001])
        flow.optimizer.SGD(lr_schedule, momentum=0).minimize(
            flow.math.reduce_sum(loss)
        )
        return loss

    x = np.random.rand(*input_shape).astype(np.float32)
    addend = np.random.rand(*input_shape).astype(np.float32)
    test_job(x, addend).get()

    for fused_key, plain_key in (
        ("y1", "y2"),
        ("x1_diff", "x2_diff"),
        ("addend1_diff", "addend2_diff"),
    ):
        test_case.assertTrue(
            np.allclose(
                test_global_storage.Get(fused_key),
                test_global_storage.Get(plain_key),
            )
        )
def _compare_with_np(
    test_case,
    shape,
    index_shape,
    dynamic_shape=None,
    dynamic_index_shape=None,
    dtype="float32",
    index_dtype="int32",
    device_type="gpu",
    device_num=1,
    dynamic=False,
):
    """Compare the gather_nd job built by ``_make_gather_nd_fn`` with NumPy.

    Passing ``dynamic_shape`` or ``dynamic_index_shape`` forces the dynamic
    path, in which one input set per device is generated and results are
    compared element by element.  The gradient callback is only handed to
    the job when ``device_num == 1``.
    """
    need_grad = bool(_is_floating_dtype(dtype))
    x_of_dtype = type_name_to_flow_type[dtype]
    index_of_dtype = type_name_to_flow_type[index_dtype]
    x_dtype = type_name_to_np_type[dtype]
    index_dtype = type_name_to_np_type[index_dtype]

    # Any explicitly given dynamic shape switches to dynamic mode; missing
    # ones fall back to the static shapes.
    if dynamic_shape is not None or dynamic_index_shape is not None:
        dynamic = True
    if dynamic_shape is None:
        dynamic_shape = shape
    if dynamic_index_shape is None:
        dynamic_index_shape = index_shape

    if not dynamic:
        x, index = _random_inputs(
            dynamic_shape, x_dtype, dynamic_index_shape, index_dtype
        )
        y, dx = _gather_nd_np(x, index, need_grad)

        def comp_diff(dx_blob: flow.typing.Numpy):
            test_case.assertTrue(np.array_equal(dx_blob, dx))

    else:
        x, index, y, dx = [], [], [], []
        for _ in range(device_num):
            x_part, index_part = _random_inputs(
                dynamic_shape, x_dtype, dynamic_index_shape, index_dtype
            )
            y_part, dx_part = _gather_nd_np(x_part, index_part, need_grad)
            x.append(x_part)
            index.append(index_part)
            y.append(y_part)
            dx.append(dx_part)

        def comp_diff(dx_blob: flow.typing.ListNumpy):
            for actual, expected in zip(dx_blob, dx):
                test_case.assertTrue(np.array_equal(actual, expected))

    flow.clear_default_session()
    diff_callback = comp_diff if device_num == 1 else None
    gather_nd_fn = _make_gather_nd_fn(
        shape,
        index_shape,
        x_of_dtype,
        index_of_dtype,
        device_type,
        device_num,
        dynamic,
        need_grad,
        diff_callback,
    )
    ret_y = gather_nd_fn(x, index)

    if not dynamic:
        test_case.assertTrue(np.array_equal(ret_y, y))
    else:
        for actual, expected in zip(ret_y, y):
            test_case.assertTrue(np.array_equal(actual, expected))
def test_alexnet(test_case, batch_size=DEFAULT_BATCH_SIZE, num_batchs=6):
    """Save an AlexNet inference function and check the reloaded model.

    The function is exported with ``flow.saved_model.ModelBuilder``, then
    evaluated directly on ``num_batchs`` batches of half the original batch
    size.  The saved model is loaded into a ``flow.serving.InferenceSession``
    and run on the same batches; both sets of outputs must be close.
    """
    init_env()
    alexnet_infer, input_lbns, output_lbns = make_alexnet_infer_func(
        batch_size, (DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE, 3)
    )
    flow.load_variables(flow.checkpoint.get(DEFAULT_CHECKPOINT_DIR))

    # save model
    saved_model_path = "alexnet_models"
    model_name = "alexnet"
    model_version = 1
    model_version_path = os.path.join(saved_model_path, str(model_version))
    if os.path.isdir(saved_model_path):
        warning = (
            "WARNING: The model version path '{}' already exist"
            ", old version directory will be removed"
        )
        print(warning.format(model_version_path))
        shutil.rmtree(saved_model_path)

    saved_model_builder = flow.saved_model.ModelBuilder(saved_model_path)
    signature_builder = (
        saved_model_builder.ModelName(model_name)
        .Version(model_version)
        .AddFunction(alexnet_infer)
        .AddSignature("regress")
    )
    for input_name, lbn in input_lbns.items():
        signature_builder.Input(input_name, lbn)
    for output_name, lbn in output_lbns.items():
        signature_builder.Output(output_name, lbn)
    saved_model_builder.Save()

    # test data
    new_batch_size = int(batch_size / 2)
    dataset = ImageNetRecordDataset(
        batch_size=new_batch_size,
        image_resize_size=DEFAULT_IMAGE_SIZE,
        data_format="NHWC",
    )
    image_list, label_list = dataset.load_batchs(num_batchs)
    assert image_list[0].shape[0] == new_batch_size
    image_size = tuple(image_list[0].shape[1:])

    # Rebuild the function for the new batch size and record its outputs.
    flow.clear_default_session()
    alexnet_infer, _, _ = make_alexnet_infer_func(new_batch_size, image_size)
    flow.load_variables(flow.checkpoint.get(DEFAULT_CHECKPOINT_DIR))
    print("alexnet inference result:")
    origin_outputs = []
    for step, (image, label) in enumerate(zip(image_list, label_list)):
        output = alexnet_infer(image, label)
        origin_outputs.append(output)
        print("iter#{:<6} output:".format(step), output)
    origin_outputs = np.array(origin_outputs, dtype=np.float32)

    # load model and run
    flow.clear_default_session()
    model_meta_file_path = os.path.join(model_version_path, "saved_model.prototxt")
    saved_model_proto = load_saved_model(model_meta_file_path)
    sess = flow.serving.InferenceSession()
    checkpoint_path = os.path.join(
        model_version_path, saved_model_proto.checkpoint_dir
    )
    sess.set_checkpoint_path(checkpoint_path)

    graph_name = saved_model_proto.default_graph_name
    graph_def = saved_model_proto.graphs[graph_name]
    signature_def = graph_def.signatures[graph_def.default_signature_name]
    with sess.open(graph_name, signature_def, new_batch_size):
        sess.compile(graph_def.op_list)

    # sess.print_job_set()
    sess.launch()

    job_name = sess.list_jobs()[0]
    input_names = sess.list_inputs()
    print("input names:", input_names)
    for input_name in input_names:
        info = sess.input_info(input_name, job_name)
        print('input "{}" info: {}'.format(input_name, info))
    output_names = sess.list_outputs()
    print("output names:", output_names)
    for output_name in output_names:
        info = sess.output_info(output_name, job_name)
        print('output "{}" info: {}'.format(output_name, info))

    print("load saved alexnet and inference result:")
    print_input_info = False  # flip to True to dump the inputs of each step
    cmp_outputs = []
    for step, (image, label) in enumerate(zip(image_list, label_list)):
        if print_input_info:
            print("image shape: {}, dtype: {}".format(image.shape, image.dtype))
            print(
                "label shape: {}, dtype: {}, data: {}".format(
                    label.shape, label.dtype, label
                )
            )
            if step > 1:
                print((image - image_list[step - 1]).mean())

        outputs = sess.run(alexnet_infer.__name__, image=image, label=label)
        cmp_outputs.append(outputs[0])
        print("iter#{:<6} output:".format(step), outputs[0])
    cmp_outputs = np.array(cmp_outputs, dtype=np.float32)

    test_case.assertTrue(np.allclose(origin_outputs, cmp_outputs))
    sess.close()
def compare_with_numpy_lazy_adam(
    device_type, x_shape, beta1, beta2, epsilon, learning_rate, train_iters,
):
    """Compare OneFlow's LazyAdam optimizer with a NumPy reference.

    The job is run ``train_iters + 1`` times and returns the variable on
    every call; the first returned value seeds the NumPy replay of
    ``train_iters`` updates, whose result must match the last returned value.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)

    @flow.global_function(type="train", function_config=func_config)
    def testLazyAdam() -> flow.typing.Numpy:
        with flow.scope.placement(device_type, "0:0-0"):
            x = flow.get_variable(
                name="x",
                shape=x_shape,
                dtype=flow.float32,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
                trainable=True,
            )
            loss = flow.math.reduce_mean(x)
            lr_schedule = flow.optimizer.PiecewiseConstantScheduler(
                [], [learning_rate]
            )
            flow.optimizer.LazyAdam(
                lr_schedule, beta1=beta1, beta2=beta2, epsilon=epsilon
            ).minimize(loss)
            return x

    init_value = None
    for step in range(train_iters + 1):
        x = testLazyAdam()
        if step == 0:
            init_value = np.copy(x)

    def lazy_adam_update_numpy(
        param, gradient, iter, m, v, lr=0.001, beta1=0.9, beta2=0.999, epsilon=1e-7,
    ):
        # One Adam step; the bias correction is folded into the step size.
        lr_t = lr * np.sqrt(1 - beta2**(iter + 1)) / (1 - beta1**(iter + 1))
        new_m = beta1 * m + (1 - beta1) * gradient
        new_v = beta2 * v + (1 - beta2) * gradient * gradient
        new_param = param - lr_t * new_m / (np.sqrt(new_v) + epsilon)
        return new_param, new_m, new_v

    param = init_value
    # Gradient of reduce_mean(x) with respect to x: 1 / element count.
    gradient = np.full(param.shape, 1.0 / np.prod(param.shape))
    m = np.zeros(param.shape)
    v = np.zeros(param.shape)
    for step in range(train_iters):
        param, m, v = lazy_adam_update_numpy(
            param, gradient, step, m, v, learning_rate, beta1, beta2, epsilon
        )

    assert np.allclose(x.flatten(), param.flatten(), rtol=1e-4, atol=1e-4)
def compare_with_tensorflow(device_type, x_shape, data_type, axis):
    """Compare OneFlow softmax (forward and backward) with TensorFlow.

    For ``data_type == "float16"`` the variable is kept in ``flow.float`` and
    only the softmax itself is computed in half precision; the comparison
    tolerance is then relaxed from ``1e-5`` to ``1e-3``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()

    use_half = data_type == "float16"
    dtype = flow.float if use_half else type_name_to_flow_type[data_type]

    @flow.global_function(type="train", function_config=func_config)
    def SoftmaxJob():
        with flow.scope.placement(device_type, "0:0"):
            var = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=dtype,
                initializer=flow.random_uniform_initializer(minval=-1.0,
                                                            maxval=1.0),
                trainable=True,
            )
            x = flow.identity(var)
            if data_type == "float16":
                half_in = flow.cast(x, dtype=flow.float16)
                half_out = flow.nn.softmax(half_in, axis=axis)
                loss = flow.cast(half_out, dtype=flow.float)
            else:
                loss = flow.nn.softmax(x, axis=axis)

            # Record values and gradients for the comparison below.
            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

            total_loss = loss * var
            scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(scheduler, momentum=0).minimize(total_loss)
            return loss

    # OneFlow
    of_out = SoftmaxJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        tf_x = tf.Variable(test_global_storage.Get("x"))
        tf_out = tf.nn.softmax(tf_x, axis=axis)
    upstream_grad = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, tf_x, upstream_grad)

    tolerance = 1e-3 if data_type == "float16" else 1e-5
    assert np.allclose(of_out.numpy(),
                       tf_out.numpy(),
                       rtol=tolerance,
                       atol=tolerance)
    assert np.allclose(
        test_global_storage.Get("x_diff"),
        tf_x_diff.numpy(),
        rtol=tolerance,
        atol=tolerance,
    )
def compare_with_numpy_lars(
    device_type,
    x_shape,
    momentum_beta,
    epsilon,
    lars_coefficient,
    learning_rate,
    weight_decay,
    train_iters,
):
    """Compare OneFlow's LARS optimizer with a NumPy re-implementation.

    The job minimizes ``reduce_mean(x * random_mask)`` with a fresh random
    mask per call.  The value returned by the first call seeds the NumPy
    reference, which replays ``train_iters`` updates using the same masks.
    Both results must agree within ``rtol = atol = 1e-4``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)

    @flow.global_function(type="train", function_config=func_config)
    def testLars(random_mask: flow.typing.Numpy.Placeholder(
        x_shape, dtype=flow.float32)) -> flow.typing.Numpy:
        with flow.scope.placement(device_type, "0:0-0"):
            x = flow.get_variable(
                name="x",
                shape=x_shape,
                dtype=flow.float32,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            loss = flow.math.reduce_mean(x * random_mask)
            scheduler = flow.optimizer.PiecewiseConstantScheduler(
                [], [learning_rate])
            optimizer = flow.optimizer.LARS(
                scheduler,
                momentum_beta=momentum_beta,
                epsilon=epsilon,
                lars_coefficient=lars_coefficient,
                weight_decay=weight_decay,
            )
            optimizer.minimize(loss)
            return x

    # generate random number sequences
    random_masks_seq = [
        np.random.uniform(size=x_shape).astype(np.float32)
        for _ in range(train_iters + 1)
    ]

    init_value = None
    for step in range(train_iters + 1):
        x = testLars(random_masks_seq[step])
        if init_value is None:
            init_value = np.copy(x)

    def lars_update_numpy(
        param,
        gradient,
        momentum,
        learning_rate,
        momentum_beta,
        weight_decay,
        epsilon,
        lars_coefficient,
    ):
        """Apply one LARS step and return (param, momentum)."""
        import math

        model_norm = math.sqrt(np.sum(param * param))
        model_diff_norm = math.sqrt(np.sum(gradient * gradient))

        # Trust ratio; falls back to 1.0 when either norm is zero.
        lars = 1.0
        if model_norm > 0 and model_diff_norm > 0:
            lars = (lars_coefficient * model_norm /
                    (model_diff_norm + weight_decay * model_norm + epsilon))

        local_learning_rate = learning_rate * lars
        new_momentum = (momentum_beta * momentum -
                        local_learning_rate * gradient)
        new_param = (param + new_momentum -
                     local_learning_rate * weight_decay * param)
        return new_param, new_momentum

    param = init_value
    gradient = np.full(param.shape, 1.0 / np.prod(param.shape))
    momentum = np.zeros(param.shape)
    for step in range(train_iters):
        param, momentum = lars_update_numpy(
            param,
            gradient * random_masks_seq[step],
            momentum,
            learning_rate,
            momentum_beta,
            weight_decay,
            epsilon,
            lars_coefficient,
        )

    matches = np.allclose(
        x.flatten(),
        param.flatten(),
        rtol=1e-4,
        atol=1e-4,
    )
    assert matches
def compare_with_not_fused(test_case, device_type, x_shape, data_type,
                           data_format):
    """Check ``fused_bias_add_gelu`` against ``gelu(bias_add(...))``.

    Both variants are built in the same job from identical inputs; their
    outputs and the gradients w.r.t. the value and the bias must agree within
    ``rtol = atol = 1e-5`` (NaNs compare equal).

    Args:
        test_case: object providing ``assertTrue``.
        device_type: ``"gpu"`` or ``"cpu"``.
        x_shape: shape of the value tensor.
        data_type: dtype name; ``"float16"`` runs the ops in half precision
            while inputs/outputs stay ``flow.float``.
        data_format: ``"NCHW"`` or ``"NHWC"``; selects the channel axis that
            defines the bias length.

    Raises:
        ValueError: if ``data_format`` is neither ``"NCHW"`` nor ``"NHWC"``.
    """
    assert device_type in ["gpu", "cpu"]

    # Validate the layout before touching the session: previously an unknown
    # data_format left ``bias_shape`` unbound and failed much later with an
    # unrelated-looking UnboundLocalError.
    if data_format == "NCHW":
        bias_shape = (x_shape[1], )
    elif data_format == "NHWC":
        bias_shape = (x_shape[-1], )
    else:
        raise ValueError(
            "unsupported data_format {!r}; expected 'NCHW' or 'NHWC'".format(
                data_format))

    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    if data_type == "float16":
        dtype = flow.float
    else:
        dtype = type_name_to_flow_type[data_type]

    @flow.global_function(type="train", function_config=func_config)
    def FlowJob(
            value: oft.Numpy.Placeholder(x_shape),
            bias: oft.Numpy.Placeholder(bias_shape),
    ):
        with flow.scope.placement(device_type, "0:0"):
            # Zero-initialised variables are added so that gradients flow
            # back to both placeholders' sums.
            value += flow.get_variable(
                name="v1",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            bias += flow.get_variable(
                name="v2",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            # Separate identities give each variant its own gradient blob.
            x1 = flow.identity(value)
            x2 = flow.identity(value)
            bias1 = flow.identity(bias)
            bias2 = flow.identity(bias)
            flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
            flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
            flow.watch_diff(bias1, test_global_storage.Setter("bias1_diff"))
            flow.watch_diff(bias2, test_global_storage.Setter("bias2_diff"))
            if data_type == "float16":
                y1 = flow.cast(
                    flow.math.gelu(
                        flow.nn.bias_add(
                            flow.cast(x1, dtype=flow.float16),
                            flow.cast(bias1, dtype=flow.float16),
                            data_format=data_format,
                        ), ),
                    dtype=flow.float,
                )
                y2 = flow.cast(
                    flow.nn.fused_bias_add_gelu(
                        flow.cast(x2, dtype=flow.float16),
                        flow.cast(bias2, dtype=flow.float16),
                        data_format=data_format,
                    ),
                    dtype=flow.float,
                )
            else:
                y1 = flow.math.gelu(
                    flow.nn.bias_add(x1, bias1, data_format=data_format))
                y2 = flow.nn.fused_bias_add_gelu(x2,
                                                 bias2,
                                                 data_format=data_format)
            flow.watch(y1, test_global_storage.Setter("y1"))
            flow.watch(y2, test_global_storage.Setter("y2"))
            flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
            flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))
            loss = y1 + y2
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0.001]),
                momentum=0).minimize(flow.math.reduce_sum(loss))
            return loss

    x = np.random.uniform(low=0, high=10, size=x_shape).astype(np.float32)
    bias = np.random.uniform(low=0, high=10,
                             size=bias_shape).astype(np.float32)
    # Only the watched blobs are inspected; the job's return value is unused.
    FlowJob(x, bias).get()

    tol = 1e-5
    y1 = test_global_storage.Get("y1")
    y2 = test_global_storage.Get("y2")
    test_case.assertTrue(
        np.allclose(y1, y2, rtol=tol, atol=tol, equal_nan=True))
    x1_diff = test_global_storage.Get("x1_diff")
    x2_diff = test_global_storage.Get("x2_diff")
    test_case.assertTrue(
        np.allclose(x1_diff, x2_diff, rtol=tol, atol=tol, equal_nan=True))
    bias1_diff = test_global_storage.Get("bias1_diff")
    bias2_diff = test_global_storage.Get("bias2_diff")
    test_case.assertTrue(
        np.allclose(bias1_diff, bias2_diff, rtol=tol, atol=tol,
                    equal_nan=True))
def compare_with_numpy_indexed_slices_adamw(
    device_type,
    model_shape,
    ids_shape,
    grad_shape,
    beta1,
    beta2,
    epsilon,
    learning_rate,
    train_iters,
    mul_scalar,
    weight_decay,
):
    """Compare indexed-slices AdamW in OneFlow with a NumPy reference.

    The job gathers rows of an ``"embeddings"`` table (scaled by
    ``mul_scalar``) at fixed random ids and minimizes their mean.  The NumPy
    reference updates only the rows listed in the dict returned by
    ``unique_grads`` and must match the final table within
    ``rtol = atol = 1e-4``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)
    func_config.indexed_slices_optimizer_conf(
        dict(include_op_names=dict(op_name=["embeddings"])))

    @flow.global_function(type="train", function_config=func_config)
    def testIndexedSlicesAdamW(
        sparse_ids: flow.typing.Numpy.Placeholder(ids_shape,
                                                  dtype=flow.int32),
    ) -> flow.typing.Numpy:
        with flow.scope.placement(device_type, "0:0"):
            embedding_table = flow.get_variable(
                name="embeddings",
                shape=model_shape,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
            )
            embedding = flow.gather(params=embedding_table * mul_scalar,
                                    indices=sparse_ids)
            loss = flow.math.reduce_mean(embedding)
            scheduler = flow.optimizer.PiecewiseConstantScheduler(
                [], [learning_rate])
            optimizer = flow.optimizer.AdamW(
                scheduler,
                beta1=beta1,
                beta2=beta2,
                epsilon=epsilon,
                do_bias_correction=True,
                weight_decay=weight_decay,
            )
            optimizer.minimize(loss)
            return embedding_table

    sparse_ids = np.random.randint(model_shape[0],
                                   size=ids_shape).astype(np.int32)

    init_value = None
    for _ in range(train_iters + 1):
        x = testIndexedSlicesAdamW(sparse_ids)
        if init_value is None:
            init_value = np.copy(x)

    def indexed_slices_update_numpy(
        param,
        unique_dict,
        step,
        m,
        v,
        lr=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-7,
        weight_decay=0.9,
    ):
        """Apply one AdamW step to the rows keyed in ``unique_dict``."""
        new_param = np.copy(param)
        new_m = np.copy(m)
        new_v = np.copy(v)
        for row, row_grad in unique_dict.items():
            lr_t = (lr * np.sqrt(1 - beta2**(step + 1)) /
                    (1 - beta1**(step + 1)))
            new_m[row] = beta1 * m[row] + (1 - beta1) * row_grad
            new_v[row] = beta2 * v[row] + (1 - beta2) * row_grad * row_grad
            new_param[row] = param[row] - lr_t * (
                new_m[row] / (np.sqrt(new_v[row]) + epsilon) +
                weight_decay * param[row])
        return new_param, new_m, new_v

    param = init_value
    gradient = np.full(grad_shape, float(mul_scalar) / np.prod(grad_shape))
    m = np.zeros(param.shape)
    v = np.zeros(param.shape)
    unique_dict = unique_grads(sparse_ids, gradient)
    for step in range(train_iters):
        param, m, v = indexed_slices_update_numpy(
            param,
            unique_dict,
            step,
            m,
            v,
            learning_rate,
            beta1,
            beta2,
            epsilon,
            weight_decay,
        )

    matches = np.allclose(
        x.flatten(),
        param.flatten(),
        rtol=1e-4,
        atol=1e-4,
    )
    assert matches
def _compare_instance_norm_2d_with_np(input_shape, device_type, machine_ids,
                                      device_counts, eps, affine):
    """Compare ``flow.nn.InstanceNorm2d`` (forward/backward) with NumPy.

    A random 4-D input and a random multiplier (used as the upstream
    gradient) are generated.  The NumPy reference normalizes over axes
    ``(2, 3)`` with ``gamma`` fixed to ones; both the output and the input
    gradient must match within ``atol = 1e-5``.
    """
    assert device_type in ["cpu", "gpu"]
    assert len(input_shape) == 4
    flow.clear_default_session()
    if device_type == "cpu":
        flow.config.cpu_device_num(device_counts)
    else:
        flow.config.gpu_device_num(device_counts)

    func_config = flow.FunctionConfig()
    func_config.default_placement_scope(
        flow.scope.placement(device_type, machine_ids))

    x_np = np.random.random(size=input_shape).astype(np.float32)
    gout = np.random.random(size=input_shape).astype(np.float32)

    # compute instance normalization in numpy
    spatial_axes = (2, 3)
    gamma = np.ones((1, input_shape[1], 1, 1), dtype=np.float32)
    mean_np = np.mean(x_np, axis=spatial_axes, keepdims=True)
    in_sub_mean = x_np - mean_np
    var_np = np.mean(np.square(in_sub_mean), axis=spatial_axes, keepdims=True)
    invar_np = 1.0 / np.sqrt(var_np + eps)
    out_np = in_sub_mean * invar_np * gamma

    def assert_prediction_grad(gin_of: tp.Numpy):
        # compute the gradient of variance
        gvar = gout * gamma * in_sub_mean * -0.5 * np.power(
            var_np + eps, -1.5)
        gvar = np.sum(gvar, axis=spatial_axes, keepdims=True)
        # compute the gradient of mean
        gmean = np.sum(gout * gamma, axis=spatial_axes, keepdims=True)
        gmean *= -invar_np
        scale = 1.0 / (input_shape[2] * input_shape[3])
        gmean += scale * np.sum(
            -2.0 * in_sub_mean, axis=spatial_axes, keepdims=True) * gvar
        # compute the gradient of input
        gin_np = (gout * gamma * invar_np +
                  gvar * scale * 2.0 * in_sub_mean + gmean * scale)
        assert np.allclose(gin_of, gin_np, atol=1e-5)

    @flow.global_function(type="train", function_config=func_config)
    def instanceNormJob(
        of_input: tp.Numpy.Placeholder(shape=x_np.shape),
        multipler: tp.Numpy.Placeholder(shape=x_np.shape),
    ) -> tp.Numpy:
        with flow.scope.placement(device_type, "0:0"):
            v = flow.get_variable(
                shape=of_input.shape,
                dtype=flow.float32,
                initializer=flow.constant_initializer(0),
                name="v",
            )
            x_var = of_input + v
            # watch the gradient
            flow.watch_diff(x_var, assert_prediction_grad)

        out = flow.nn.InstanceNorm2d(x_var, eps=eps, affine=affine)

        with flow.scope.placement(device_type, "0:0"):
            scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
            flow.optimizer.SGD(scheduler,
                               momentum=0).minimize(out * multipler)
        return out

    flow.train.CheckPoint().init()
    of_out = instanceNormJob(x_np, gout)
    assert np.allclose(of_out, out_np, atol=1e-5)
def compare_with_flow_job_fused_adam_model_update(device_type, x_shape, beta1,
                                                  beta2, epsilon,
                                                  learning_rate, train_iters):
    """Compare Adam training with and without fused model-update ops.

    Two jobs build the same network (variables ``"x1"`` and ``"x2"``); only
    the second enables ``enable_fuse_model_update_ops``.  Both are fed the
    same sequence of random masks, and their final outputs must agree within
    ``rtol = atol = 1e-4``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()

    def flow_net(var_name, random_mask):
        """Build the shared network and attach an Adam optimizer."""
        with flow.scope.placement(device_type, "0:0-0"):
            x = flow.get_variable(
                name=var_name,
                shape=x_shape,
                dtype=flow.float32,
                initializer=flow.ones_initializer(),
                trainable=True,
            )
            constant_val = flow.constant(3.0, dtype=flow.float32, shape=(1, ))
            x = x * constant_val
            x = x * 2.0
            if device_type == "gpu":
                # Round trip through float16 on GPU only.
                x = flow.cast(x, flow.float16)
                x = flow.math.relu(x)
                x = flow.cast(x, flow.float)
            loss = flow.math.reduce_mean(x * random_mask)
            scheduler = flow.optimizer.PiecewiseConstantScheduler(
                [], [learning_rate])
            optimizer = flow.optimizer.Adam(
                scheduler,
                beta1=beta1,
                beta2=beta2,
                epsilon=epsilon,
                do_bias_correction=True,
            )
            optimizer.minimize(loss)
            return x

    def make_adam_job():
        func_config = flow.FunctionConfig()
        func_config.default_data_type(flow.float32)

        @flow.global_function(type="train", function_config=func_config)
        def testAdam(random_mask: flow.typing.Numpy.Placeholder(
            x_shape, dtype=flow.float32)) -> flow.typing.Numpy:
            return flow_net("x1", random_mask)

        return testAdam

    def make_fused_adam_job():
        func_config = flow.FunctionConfig()
        func_config.default_data_type(flow.float32)
        func_config.enable_fuse_model_update_ops(True)

        @flow.global_function(type="train", function_config=func_config)
        def testFusedAdam(random_mask: flow.typing.Numpy.Placeholder(
            x_shape, dtype=flow.float32)) -> flow.typing.Numpy:
            return flow_net("x2", random_mask)

        return testFusedAdam

    adam_job = make_adam_job()
    fused_adam_job = make_fused_adam_job()

    # generate random number sequences
    random_masks_seq = [
        np.random.uniform(size=x_shape).astype(np.float32)
        for _ in range(train_iters + 1)
    ]

    # Run each job over the full mask sequence; keep only the last result.
    for mask in random_masks_seq:
        var1 = adam_job(mask)
    for mask in random_masks_seq:
        var2 = fused_adam_job(mask)

    matches = np.allclose(
        var1.flatten(),
        var2.flatten(),
        rtol=1e-4,
        atol=1e-4,
    )
    assert matches
def compare_with_tensorflow(device_type, params_case, dilations, data_format):
    """Compare ``flow.nn.conv2d_transpose`` with TensorFlow's counterpart.

    ``params_case`` unpacks to ``(input_shape, output_shape, padding,
    strides, kernel_size)``.  TensorFlow always runs in NHWC, so for
    ``"NCHW"`` the OneFlow tensors are transposed before the comparison.
    Output, input gradient and weight gradient are all checked.
    """
    input_shape, output_shape, padding, strides, kernel_size = params_case
    assert data_format in ["NCHW", "NHWC"]
    channel_axis = 1 if data_format == "NCHW" else 3
    out_channels = output_shape[channel_axis]
    in_channels = input_shape[channel_axis]
    assert device_type in ["gpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    @flow.global_function(type="train", function_config=func_config)
    def DeconvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=input_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=-10,
                                                            maxval=10),
                trainable=True,
            )
            # The weight layout follows the data format.
            if data_format == "NCHW":
                weight_shape = (in_channels, out_channels, kernel_size,
                                kernel_size)
            else:
                weight_shape = (in_channels, kernel_size, kernel_size,
                                out_channels)
            weight = flow.get_variable(
                "weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=-10,
                                                            maxval=10),
                trainable=True,
            )
            loss = flow.nn.conv2d_transpose(
                x,
                weight,
                strides=strides,
                output_shape=output_shape,
                dilations=dilations,
                padding=padding,
                data_format=data_format,
            )
            scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(scheduler, momentum=0).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(weight, test_global_storage.Setter("weight"))
            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    # OneFlow
    flow.train.CheckPoint().init()
    of_out = DeconvJob().get()

    # Tensorflow
    tf_strides = [1, strides, strides, 1]
    if data_format == "NCHW":
        nhwc_output_shape = (
            output_shape[0],
            output_shape[2],
            output_shape[3],
            output_shape[1],
        )
        with tf.GradientTape(persistent=True) as tape:
            x = tf.Variable(
                test_global_storage.Get("x").transpose(0, 2, 3, 1))
            w = tf.Variable(
                test_global_storage.Get("weight").transpose(2, 3, 1, 0))
            tf_out = tf.nn.conv2d_transpose(
                x,
                w,
                output_shape=nhwc_output_shape,
                strides=tf_strides,
                padding=padding,
                data_format="NHWC",
            )
        loss_diff = test_global_storage.Get("loss_diff").transpose(0, 2, 3, 1)
        tf_x_diff = tape.gradient(tf_out, x, loss_diff)
        tf_weight_diff = tape.gradient(tf_out, w, loss_diff)

        assert np.allclose(of_out.numpy().transpose(0, 2, 3, 1),
                           tf_out.numpy(),
                           rtol=1e-02,
                           atol=1e-02)
        assert np.allclose(
            test_global_storage.Get("x_diff").transpose(0, 2, 3, 1),
            tf_x_diff.numpy(),
            rtol=1e-4,
            atol=1e-4,
        )
        assert np.allclose(
            test_global_storage.Get("weight_diff").transpose(2, 3, 1, 0),
            tf_weight_diff.numpy(),
            rtol=1e-4,
            atol=1e-4,
        )
        return

    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x"))
        w = tf.Variable(
            test_global_storage.Get("weight").transpose(1, 2, 3, 0))
        tf_out = tf.nn.conv2d_transpose(
            x,
            w,
            output_shape=output_shape,
            strides=tf_strides,
            padding=padding,
            data_format="NHWC",
        )
    loss_diff = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
    tf_weight_diff = tape.gradient(tf_out, w, loss_diff)

    assert np.allclose(of_out.numpy(),
                       tf_out.numpy(),
                       rtol=1e-02,
                       atol=1e-02), (of_out.numpy() - tf_out.numpy())
    assert np.allclose(test_global_storage.Get("x_diff"),
                       tf_x_diff.numpy(),
                       rtol=1e-02,
                       atol=1e-02)
    assert np.allclose(
        test_global_storage.Get("weight_diff").transpose(1, 2, 3, 0),
        tf_weight_diff.numpy(),
        rtol=1e-2,
        atol=1e-2,
    )
def compare_with_tensorflow(device_type, data_type, shape):
    """Compare ``sigmoid_cross_entropy_with_logits`` with TensorFlow.

    Logits are a trainable OneFlow variable; labels are the sigmoid of random
    integers in ``[0, 10)``.  The loss and the gradient w.r.t. the logits
    must match TensorFlow within ``rtol = atol = 1e-5``.  The default session
    is cleared again before returning.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    dtype = type_name_to_flow_type[data_type]

    @flow.global_function(type="train", function_config=func_config)
    def SigmoidCrossEntropyWithLogitsJob(labels: oft.Numpy.Placeholder(
        shape, dtype)):
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=shape,
                dtype=type_name_to_flow_type[data_type],
                initializer=flow.random_uniform_initializer(minval=-10,
                                                            maxval=10),
                trainable=True,
            )
            loss = flow.nn.sigmoid_cross_entropy_with_logits(labels=labels,
                                                             logits=x)
            scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(scheduler, momentum=0).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    # fake labels
    raw_labels = np.random.randint(0, 10, size=shape)
    labels = (1 / (1 + np.exp(-raw_labels))).astype(
        type_name_to_np_type[data_type])

    # OneFlow
    flow.train.CheckPoint().init()
    of_out = SigmoidCrossEntropyWithLogitsJob(labels).get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        tf_x = tf.Variable(test_global_storage.Get("x"))
        tf_out = tf.nn.sigmoid_cross_entropy_with_logits(labels, tf_x)
    upstream_grad = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, tf_x, upstream_grad)

    tolerance = 1e-5
    assert np.allclose(of_out.numpy(),
                       tf_out.numpy(),
                       rtol=tolerance,
                       atol=tolerance)
    assert np.allclose(
        test_global_storage.Get("x_diff"),
        tf_x_diff.numpy(),
        rtol=tolerance,
        atol=tolerance,
    )
    flow.clear_default_session()
def _make_dim_gather_fn(
    test_case,
    input,
    index,
    dim,
    grad,
    device_type,
    value_type,
    index_type,
    machine_ids,
    device_counts,
):
    """Build a training job that runs ``flow.dim_gather`` for ``value_type``.

    The returned job takes the input values and the indices, adds the values
    to a zero-initialised variable, gathers along ``dim`` and minimizes the
    result.  A ``watch_diff`` callback asserts the observed gradient is close
    to ``grad``.  Returns ``None`` for a ``value_type`` other than float16,
    float32, float64 or int32.
    """
    flow.clear_default_session()
    if device_type == "cpu":
        flow.config.cpu_device_num(device_counts)
    else:
        flow.config.gpu_device_num(device_counts)

    func_config = flow.FunctionConfig()
    # global function needs float32 as type of argument and return value
    default_dtype = flow.float32 if value_type == flow.float16 else value_type
    func_config.default_data_type(default_dtype)
    func_config.default_placement_scope(
        flow.scope.placement(device_type, machine_ids))
    func_config.default_logical_view(flow.scope.consistent_view())

    def _compare_diff(blob: oft.Numpy):
        test_case.assertTrue(np.allclose(grad, blob))

    def _shifted_input(params_def, var_dtype):
        """Return ``variable("input") + params_def`` on device 0:0."""
        with flow.scope.placement(device_type, "0:0"):
            x_var = flow.get_variable(
                "input",
                shape=input.shape,
                dtype=var_dtype,
                initializer=flow.constant_initializer(0),
            )
            x_var = flow.cast_to_current_logical_view(x_var)
            return x_var + params_def

    def _minimize(loss):
        """Attach an SGD optimizer for ``loss`` on device 0:0."""
        with flow.scope.placement(device_type, "0:0"):
            scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
            flow.optimizer.SGD(scheduler, momentum=0).minimize(loss)

    if value_type == flow.float16:

        @flow.global_function(type="train", function_config=func_config)
        def gather_fn(
            params_def: oft.Numpy.Placeholder(input.shape,
                                              dtype=flow.float32),
            indices_def: oft.Numpy.Placeholder(index.shape,
                                               dtype=index_type),
        ) -> oft.Numpy:
            x = _shifted_input(params_def, flow.float32)
            x_f16 = flow.cast(x, flow.float16)
            y_f16 = flow.dim_gather(x_f16, dim, indices_def)
            x_f32 = flow.cast(x, flow.float32)
            y_f32 = flow.cast(y_f16, flow.float32)
            # Result is not used below; the op is kept in the graph as-is.
            y = flow.dim_gather(x, dim, indices_def)
            _minimize(y_f32)
            flow.watch_diff(x_f32, _compare_diff)
            return y_f32

        return gather_fn

    if value_type in (flow.float32, flow.float64):

        @flow.global_function(type="train", function_config=func_config)
        def gather_fn(
            params_def: oft.Numpy.Placeholder(input.shape, dtype=value_type),
            indices_def: oft.Numpy.Placeholder(index.shape,
                                               dtype=index_type),
        ) -> oft.Numpy:
            x = _shifted_input(params_def, value_type)
            y = flow.dim_gather(x, dim, indices_def)
            _minimize(y)
            flow.watch_diff(x, _compare_diff)
            return y

        return gather_fn

    if value_type == flow.int32:

        @flow.global_function(type="train", function_config=func_config)
        def gather_fn(
            params_def: oft.Numpy.Placeholder(input.shape,
                                              dtype=flow.float32),
            indices_def: oft.Numpy.Placeholder(index.shape,
                                               dtype=index_type),
        ) -> oft.Numpy:
            x = _shifted_input(params_def, flow.float32)
            # Result is not used below; the gather reads ``x`` directly.
            x_int32 = flow.cast(x, dtype=flow.int32)
            y_int32 = flow.dim_gather(x, dim, indices_def)
            y_fp32 = flow.cast(y_int32, dtype=flow.float32)
            _minimize(y_fp32)
            flow.watch_diff(x, _compare_diff)
            return y_fp32

        return gather_fn

    return None
def setUp(self):
    """Clear the default OneFlow session and turn eager execution off."""
    # NOTE(review): presumably the unittest per-test fixture hook; the
    # enclosing class is not visible here -- confirm.
    oneflow.clear_default_session()
    oneflow.enable_eager_execution(False)
def test_resnet(test_case, batch_size=DEFAULT_BATCH_SIZE, num_batchs=6):
    """Save a ResNet inference function and check the reloaded model.

    Steps:
      1. Build the inference function, load its variables and record the
         arg-max prediction for ``num_batchs`` batches.
      2. Export the function as version 1 under ``resnet50_models``
         (replacing an existing version directory).
      3. Reload it through ``flow.serving.InferenceSession`` and assert that
         the predictions for the same batches are unchanged.

    Args:
        test_case: object providing ``assertTrue``.
        batch_size: batch size for the inference function and the dataset.
        num_batchs: number of batches to load and compare.
    """
    init_env()
    # input image format NCHW
    image_size = (3, DEFAULT_IMAGE_SIZE, DEFAULT_IMAGE_SIZE)
    resnet_infer, input_lbns, output_lbns = make_resnet_infer_func(
        batch_size, image_size)
    # resnet inference model parameters
    flow.load_variables(flow.checkpoint.get(DEFAULT_CHECKPOINT_DIR))
    # test data
    dataset = ImageNetRecordDataset(
        batch_size=batch_size,
        image_resize_size=DEFAULT_IMAGE_SIZE,
        data_format="NCHW",
    )
    image_list, label_list = dataset.load_batchs(num_batchs)

    print("resnet inference result:")
    origin_outputs = []
    for i, (image, label) in enumerate(zip(image_list, label_list)):
        output = resnet_infer(image)
        arg_max = np.argmax(output, axis=1)
        origin_outputs.append(arg_max)
        print("iter#{:<6} predict: ".format(i), arg_max, "label: ", label)
    origin_outputs = np.array(origin_outputs, dtype=np.float32)

    # save model
    saved_model_path = "resnet50_models"
    model_version = 1
    model_version_path = os.path.join(saved_model_path, str(model_version))
    if os.path.exists(model_version_path) and os.path.isdir(
            model_version_path):
        print("WARNING: The model version path '{}' already exist"
              ", old version directory will be removed".format(
                  model_version_path))
        shutil.rmtree(model_version_path)

    saved_model_builder = flow.saved_model.ModelBuilder(saved_model_path)
    signature_builder = (saved_model_builder.ModelName("resnet50").Version(
        model_version).AddFunction(resnet_infer).AddSignature("regress"))
    for input_name, lbn in input_lbns.items():
        signature_builder.Input(input_name, lbn)
    for output_name, lbn in output_lbns.items():
        signature_builder.Output(output_name, lbn)
    saved_model_builder.Save()

    # load model and run
    flow.clear_default_session()
    sess = flow.serving.InferenceSession()
    sess.load_saved_model(saved_model_path)
    # sess.print_job_set()
    sess.launch()
    # From here on the session is live: close it even when inference or the
    # final assertion fails, so a failing test does not leak the session.
    try:
        job_name = sess.list_jobs()[0]
        input_names = sess.list_inputs()
        print("input names:", input_names)
        for input_name in input_names:
            print('input "{}" info: {}'.format(
                input_name, sess.input_info(input_name, job_name)))

        print("load saved resnet and inference result:")
        cmp_outputs = []
        for i, (image, label) in enumerate(zip(image_list, label_list)):
            outputs = sess.run(resnet_infer.__name__, image=image)
            arg_max = np.argmax(outputs[0], axis=1)
            cmp_outputs.append(arg_max)
            print("iter#{:<6} output:".format(i), arg_max, "label: ", label)
        cmp_outputs = np.array(cmp_outputs, dtype=np.float32)

        test_case.assertTrue(np.allclose(origin_outputs, cmp_outputs))
    finally:
        sess.close()
def _run_test_fake_quantize(
    test_case,
    device_type,
    device_num,
    dtype,
    in_shape,
    quantization_bit,
    quantization_scheme,
    per_layer_quantization,
):
    """Run min-max observer + fake quantization and verify the result.

    Random input in ``[-0.5, 0.5)`` is pushed through
    ``flow.quantization.min_max_observer`` and
    ``flow.quantization.fake_quantization``; the output and the recorded
    input gradient are handed to ``_check_fake_quantize``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    if device_type == "cpu":
        flow.config.cpu_device_num(device_num)
    else:
        flow.config.gpu_device_num(device_num)

    @flow.global_function(type="train", function_config=flow.FunctionConfig())
    def QuantizeJob(input: oft.Numpy.Placeholder(
        in_shape, dtype=type_name_to_flow_type[dtype])):
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=in_shape,
                dtype=input.dtype,
                initializer=flow.zeros_initializer(input.dtype),
                trainable=True,
            )
            input_x = input + x

        flow.watch_diff(input_x, test_global_storage.Setter("input_diff"))

        all_devices = "0:0-%d" % (device_num - 1)
        with flow.scope.placement(device_type, all_devices):
            scale, zero_point = flow.quantization.min_max_observer(
                input_x,
                quantization_bit,
                quantization_scheme,
                per_layer_quantization,
            )
            out = flow.quantization.fake_quantization(
                input_x,
                scale,
                zero_point,
                quantization_bit,
                quantization_scheme,
            )
            loss = flow.math.reduce_mean(out)
            scheduler = flow.optimizer.PiecewiseConstantScheduler([],
                                                                  [0.001])
            flow.optimizer.Adam(scheduler).minimize(loss)
        return out

    flow.train.CheckPoint().init()

    data = (np.random.random(in_shape) - 0.5).astype(
        type_name_to_np_type[dtype])
    out = QuantizeJob(data).get()
    input_diff = test_global_storage.Get("input_diff")

    _check_fake_quantize(
        test_case,
        data,
        input_diff.flatten(),
        out.numpy().flatten(),
        quantization_bit,
        quantization_scheme,
        per_layer_quantization,
    )
def compare_with_numpy_adamw(
    device_type,
    x_shape,
    beta1,
    beta2,
    epsilon,
    weight_decay,
    learning_rate,
    train_iters,
):
    """Compare OneFlow's AdamW optimizer with a NumPy re-implementation.

    The job minimizes ``reduce_mean(x * random_mask)`` with a fresh random
    mask per call.  The value returned by the first call seeds the NumPy
    reference, which replays ``train_iters`` bias-corrected AdamW updates
    using the same masks.  Both results must agree within
    ``rtol = atol = 1e-4``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)

    @flow.global_function(type="train", function_config=func_config)
    def testAdamW(random_mask: flow.typing.Numpy.Placeholder(
        x_shape, dtype=flow.float32)) -> flow.typing.Numpy:
        with flow.scope.placement(device_type, "0:0-0"):
            x = flow.get_variable(
                name="x",
                shape=x_shape,
                dtype=flow.float32,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            loss = flow.math.reduce_mean(x * random_mask)
            scheduler = flow.optimizer.PiecewiseConstantScheduler(
                [], [learning_rate])
            optimizer = flow.optimizer.AdamW(
                scheduler,
                beta1=beta1,
                beta2=beta2,
                epsilon=epsilon,
                weight_decay=weight_decay,
                do_bias_correction=True,
            )
            optimizer.minimize(loss)
            return x

    # generate random number sequences
    random_masks_seq = [
        np.random.uniform(size=x_shape).astype(np.float32)
        for _ in range(train_iters + 1)
    ]

    init_value = None
    for step in range(train_iters + 1):
        x = testAdamW(random_masks_seq[step])
        if init_value is None:
            init_value = np.copy(x)

    def adamw_update_numpy(
        param,
        gradient,
        step,
        m,
        v,
        lr=0.001,
        beta1=0.9,
        beta2=0.999,
        epsilon=1e-7,
        weight_decay=0.9,
    ):
        """Apply one bias-corrected AdamW step; return (param, m, v)."""
        lr_t = lr * np.sqrt(1 - beta2**(step + 1)) / (1 - beta1**(step + 1))
        new_m = beta1 * m + (1 - beta1) * gradient
        new_v = beta2 * v + (1 - beta2) * gradient * gradient
        adam_term = new_m / (np.sqrt(new_v) + epsilon)
        new_param = param - lr_t * (adam_term + weight_decay * param)
        return new_param, new_m, new_v

    param = init_value
    gradient = np.full(param.shape, 1.0 / np.prod(param.shape))
    m = np.zeros(param.shape)
    v = np.zeros(param.shape)
    for step in range(train_iters):
        param, m, v = adamw_update_numpy(
            param,
            gradient * random_masks_seq[step],
            step,
            m,
            v,
            learning_rate,
            beta1,
            beta2,
            epsilon,
            weight_decay,
        )

    matches = np.allclose(
        x.flatten(),
        param.flatten(),
        rtol=1e-4,
        atol=1e-4,
    )
    assert matches
def _test_hybrid_concat(test_case,
                        static_shape,
                        axis,
                        max_dim_size=None,
                        verbose=False):
    """Concatenate a variable, two list inputs and a constant along ``axis``.

    The job runs in mirrored view.  It checks the static shape of the concat
    result, that the variable's gradient is all ones, and that the concat
    output equals ``np.concatenate`` of the variable, the expected output
    from ``_rand_inputs`` and a block of ones.
    """
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_logical_view(flow.scope.mirrored_view())

    def compare_var_diff(var_blob):
        expected = np.ones(shape=static_shape, dtype=np.single)
        test_case.assertTrue(np.array_equal(var_blob.numpy(), expected))

    # Same shape as the static one, except for a random extent on ``axis``.
    sub_dims = list(static_shape)
    sub_dims[axis] = random.randrange(1, static_shape[axis])
    rand_sub_shape = tuple(sub_dims)

    @flow.global_function(type="train", function_config=func_config)
    def hybrid_concat_job(
        input_0_def: oft.ListNumpy.Placeholder(shape=static_shape,
                                               dtype=flow.float),
        input_1_def: oft.ListNumpy.Placeholder(shape=static_shape,
                                               dtype=flow.float),
    ):
        var = flow.get_variable(
            "var",
            shape=static_shape,
            dtype=flow.float,
            initializer=flow.random_uniform_initializer(),
            trainable=True,
        )
        constant = flow.constant(1.0, dtype=flow.float, shape=rand_sub_shape)
        inputs = [
            flow.cast_to_current_logical_view(blob)
            for blob in (var, input_0_def, input_1_def, constant)
        ]
        concated = flow.concat(
            inputs,
            axis=axis,
            max_dim_size=max_dim_size,
        )
        if verbose:
            print("concated static shape:", concated.shape)

        scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
        flow.optimizer.SGD(scheduler, momentum=0).minimize(concated)
        flow.watch_diff(var, compare_var_diff)

        if max_dim_size is None:
            expected_dim = static_shape[axis] * 3 + rand_sub_shape[axis]
            test_case.assertTrue(concated.shape[axis] == expected_dim)
        else:
            test_case.assertTrue(concated.shape[axis] == max_dim_size)
        return var, concated

    output, inputs = _rand_inputs(static_shape, axis, 2)
    if verbose:
        print("static_shape:", static_shape)
        print("input_0 shape:", inputs[0].shape)
        print("input_1 shape:", inputs[1].shape)
        print("output shape:", output.shape)
        print("rand_sub_shape:", rand_sub_shape)

    var, concated = hybrid_concat_job([inputs[0]], [inputs[1]]).get()
    if verbose:
        print("var shape:", var.numpy().shape)
        print("concated shape:", concated.numpy(0).shape)

    expected_concat = np.concatenate(
        [
            var.numpy(),
            output,
            np.ones(shape=rand_sub_shape, dtype=np.single),
        ],
        axis=axis,
    )
    test_case.assertTrue(np.array_equal(expected_concat, concated.numpy(0)))
def _test_masked_fill_fw_bw(test_case,
                            device,
                            x_shape,
                            mask_shape,
                            type_name,
                            value=0):
    """Check ``flow.masked_fill`` forward and backward against NumPy.

    For ``type_name == "float16"`` the job I/O stays in ``flow.float`` /
    ``np.float32`` and only the op runs in half precision, with the
    tolerance relaxed from ``1e-5`` to ``1e-3``.  The reference values come
    from ``_masked_fill_np_fw_bw``.
    """
    flow.clear_default_session()
    func_config = flow.FunctionConfig()

    is_half = type_name == "float16"
    if is_half:
        flow_type, np_type = flow.float, np.float32
    else:
        flow_type = type_name_to_flow_type[type_name]
        np_type = type_name_to_np_type[type_name]
    func_config.default_data_type(flow_type)

    @flow.global_function(type="train", function_config=func_config)
    def test_masked_fill_fw_bw_job(
            x: oft.Numpy.Placeholder(x_shape, dtype=flow_type),
            mask: oft.Numpy.Placeholder(mask_shape, dtype=flow_type),
    ):
        with flow.scope.placement(device, "0:0"):
            y = flow.get_variable(
                name="vx",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            x += flow.cast(y, flow_type)
            mask = flow.cast(mask, dtype=flow.int8)
            if type_name == "float16":
                half_x = flow.cast(x, flow.float16)
                half_out = flow.masked_fill(half_x, mask, value)
                out = flow.cast(half_out, flow.float)
            else:
                out = flow.masked_fill(x, mask, value)

            scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(scheduler, momentum=0).minimize(out)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(out, test_global_storage.Setter("out"))
            flow.watch_diff(out, test_global_storage.Setter("out_diff"))
            return out

    flow.train.CheckPoint().init()

    x = np.random.randint(low=0, high=100, size=x_shape)
    mask = np.random.randint(low=0, high=2, size=mask_shape)
    test_masked_fill_fw_bw_job(x.astype(np_type), mask.astype(np_type)).get()

    out_diff = test_global_storage.Get("out_diff")
    np_out, np_x_diff = _masked_fill_np_fw_bw(x, mask, out_diff, np_type,
                                              value)

    tolerance = 1e-3 if type_name == "float16" else 1e-5
    forward_ok = np.allclose(np_out,
                             test_global_storage.Get("out"),
                             rtol=tolerance,
                             atol=tolerance)
    test_case.assertTrue(forward_ok)
    backward_ok = np.allclose(np_x_diff,
                              test_global_storage.Get("x_diff"),
                              rtol=tolerance,
                              atol=tolerance)
    test_case.assertTrue(backward_ok)