def _of_tensor_scatter_nd_add(
    params,
    indices,
    updates,
    device_type,
    mirrored,
    params_grad_watcher,
    updates_grad_watcher,
):
    """Run ``flow.tensor_scatter_nd_add`` inside a OneFlow training job.

    Zero-initialised variables are added to ``params`` and ``updates`` so
    that both operands take part in back-propagation; their gradients are
    handed to the two watcher callbacks via ``flow.watch_diff``.

    Args:
        params: array fed as the tensor being scattered into.
        indices: array of indices, fed with dtype ``flow.int32``.
        updates: array fed as the values to add.
        device_type: device tag passed to ``flow.scope.placement``.
        mirrored: when truthy the job runs under the mirrored view and the
            inputs are fed as one-element lists; otherwise the consistent
            view is used.
        params_grad_watcher: callback registered on the params operand.
        updates_grad_watcher: callback registered on the updates operand.

    Returns:
        The job output: ``numpy_list()[0]`` in mirrored mode, ``numpy()``
        otherwise.
    """
    flow.clear_default_session()
    job_conf = flow.FunctionConfig()
    job_conf.default_data_type(flow.float)

    def build_graph(params_blob, indices_blob, updates_blob):
        def zero_variable(var_name, like_blob):
            # Trainable zero tensor with the same shape as the fed blob.
            return flow.get_variable(
                var_name,
                shape=like_blob.shape,
                dtype=flow.float32,
                initializer=flow.constant_initializer(0),
            )

        with flow.scope.placement(device_type, "0:0"):
            p_var = zero_variable("params", params_blob)
            u_var = zero_variable("updates", updates_blob)
            p_var = flow.cast_to_current_logical_view(p_var)
            p_fed = flow.cast_to_current_logical_view(params_blob)
            u_fed = flow.cast_to_current_logical_view(updates_blob)
            u_var = flow.cast_to_current_logical_view(u_var)
            p_operand = p_var + p_fed
            u_operand = u_var + u_fed
            result = flow.tensor_scatter_nd_add(p_operand, indices_blob, u_operand)
            schedule = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
            flow.optimizer.SGD(schedule, momentum=0).minimize(result)

        flow.watch_diff(p_operand, params_grad_watcher)
        flow.watch_diff(u_operand, updates_grad_watcher)
        return result

    if mirrored:
        job_conf.default_logical_view(flow.scope.mirrored_view())

        @flow.global_function(type="train", function_config=job_conf)
        def tensor_scatter_nd_add_fn(
            params_def: oft.ListNumpy.Placeholder(params.shape, dtype=flow.float),
            indices_def: oft.ListNumpy.Placeholder(indices.shape, dtype=flow.int32),
            updates_def: oft.ListNumpy.Placeholder(updates.shape, dtype=flow.float),
        ):
            return build_graph(params_def, indices_def, updates_def)

        future = tensor_scatter_nd_add_fn([params], [indices], [updates])
        return future.get().numpy_list()[0]

    job_conf.default_logical_view(flow.scope.consistent_view())

    @flow.global_function(type="train", function_config=job_conf)
    def tensor_scatter_nd_add_fn(
        params_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float),
        indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
        updates_def: oft.Numpy.Placeholder(updates.shape, dtype=flow.float),
    ):
        return build_graph(params_def, indices_def, updates_def)

    future = tensor_scatter_nd_add_fn(params, indices, updates)
    return future.get().numpy()
def _of_clip_by_value(values, min, max, device_type="gpu", dynamic=False, grad_cb=None):
    """Evaluate ``flow.clip_by_value`` on ``values`` through a OneFlow job.

    Args:
        values: numpy array to clip; its dtype selects the OneFlow dtype
            through ``_np_dtype_to_of_dtype``.
        min: lower bound forwarded to ``flow.clip_by_value``.
        max: upper bound forwarded to ``flow.clip_by_value``.
        device_type: device tag passed to ``flow.scope.placement``.
        dynamic: when truthy, run under the mirrored view and feed the input
            as a one-element list; otherwise use the consistent view.
        grad_cb: optional callback. When callable, the input is routed
            through a zero variable, the output is registered as a loss and
            the callback watches the gradient of the clip input.

    Returns:
        The clipped result as a numpy array.
    """
    of_dtype = _np_dtype_to_of_dtype(values.dtype)

    def clip_with_grad(values_blob):
        with flow.scope.placement(device_type, "0:0"):
            var = flow.get_variable(
                "values",
                shape=values.shape,
                dtype=of_dtype,
                initializer=flow.constant_initializer(0),
            )
            var = flow.cast_to_current_logical_view(var)
            clip_input = var + values_blob
            clipped = flow.clip_by_value(clip_input, min, max)
            flow.losses.add_loss(clipped)

        flow.watch_diff(clip_input, grad_cb)
        return clipped

    def clip_forward_only(values_blob):
        with flow.scope.placement(device_type, "0:0"):
            return flow.clip_by_value(values_blob, min, max, name="Clip")

    clip = clip_with_grad if callable(grad_cb) else clip_forward_only

    flow.clear_default_session()
    job_conf = flow.FunctionConfig()
    job_conf.default_data_type(of_dtype)
    # Training settings are applied for any non-None callback, even one
    # that is not callable (the graph above is then forward-only).
    if grad_cb is not None:
        job_conf.train.primary_lr(1e-3)
        job_conf.train.model_update_conf({"naive_conf": {}})

    if dynamic:
        job_conf.default_logical_view(flow.scope.mirrored_view())

        @flow.global_function(job_conf)
        def clip_fn(values_def: oft.ListNumpy.Placeholder(values.shape, dtype=of_dtype)):
            return clip(values_def)

        ckpt = flow.train.CheckPoint()
        ckpt.init()
        return clip_fn([values]).get().numpy_list()[0]

    job_conf.default_logical_view(flow.scope.consistent_view())

    @flow.global_function(job_conf)
    def clip_fn(values_def: oft.Numpy.Placeholder(values.shape, dtype=of_dtype)):
        return clip(values_def)

    ckpt = flow.train.CheckPoint()
    ckpt.init()
    return clip_fn(values).get().numpy()
def compare_with_tensorflow(
    device_type,
    x_shape,
    filters,
    kernel_size,
    groups,
    data_format="NCHW",
    padding="VALID",
    stride=1,
):
    """Check ``flow.nn.conv2d`` against TensorFlow, forward and backward.

    A OneFlow training job convolves a randomly initialised variable ``x``
    with a randomly initialised ``conv-weight`` variable and publishes the
    blobs and their gradients through ``test_global_storage``. The stored
    values are then replayed through TensorFlow in NHWC layout, and the
    output, input gradient and weight gradient are compared with
    ``np.allclose``.

    Args:
        device_type: ``"gpu"`` or ``"cpu"``.
        x_shape: shape of the input variable.
        filters: number of output channels.
        kernel_size: side length of the square kernel.
        groups: number of convolution groups.
        data_format: ``"NCHW"`` selects the channel-first transposes; any
            other value is treated as channel-last.
        padding: padding mode forwarded to both frameworks.
        stride: stride applied to both spatial dimensions.

    Raises:
        AssertionError: on invalid arguments or when results differ.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    job_conf = flow.FunctionConfig()
    job_conf.default_data_type(flow.float)

    channels_first = data_format == "NCHW"
    # Permutations that bring OneFlow activations / weights into the
    # layouts handed to TensorFlow.
    if channels_first:
        xy_perm = (0, 2, 3, 1)
        weight_perm = (2, 3, 1, 0)
    else:
        xy_perm = (0, 1, 2, 3)
        weight_perm = (1, 2, 3, 0)

    @flow.global_function(type="train", function_config=job_conf)
    def ConvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
                trainable=True,
            )
            if channels_first:
                weight_shape = (filters, x.shape[1] // groups, kernel_size, kernel_size)
            else:
                weight_shape = (filters, kernel_size, kernel_size, x.shape[3] // groups)
            weight = flow.get_variable(
                "conv-weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
            )
            loss = flow.nn.conv2d(
                x,
                weight,
                strides=[stride, stride],
                padding=padding,
                data_format=data_format,
                dilations=[1, 1],
                groups=groups,
                name="conv",
            )
            schedule = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(schedule, momentum=0).minimize(loss)

            # Publish every blob and its gradient under "<key>" / "<key>_diff".
            for blob, key in ((x, "x"), (weight, "weight"), (loss, "loss")):
                flow.watch(blob, test_global_storage.Setter(key))
                flow.watch_diff(blob, test_global_storage.Setter(key + "_diff"))
            return loss

    # OneFlow
    ckpt = flow.train.CheckPoint()
    ckpt.init()
    of_out = ConvJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        tf_x = tf.Variable(test_global_storage.Get("x").transpose(xy_perm))
        assert groups > 0
        assert filters % groups == 0
        tf_weight = tf.Variable(
            test_global_storage.Get("weight").transpose(weight_perm)
        )
        if groups == 1:
            tf_out = tf.nn.conv2d(
                tf_x,
                tf_weight,
                strides=[1, stride, stride, 1],
                padding=padding,
                data_format="NHWC",
            )
        else:
            tf_out = grouped_convolution2D(
                tf_x, tf_weight, padding=padding, num_groups=groups
            )

    upstream_grad = test_global_storage.Get("loss_diff").transpose(xy_perm)
    tf_x_diff = tape.gradient(tf_out, tf_x, upstream_grad)
    tf_weight_diff = tape.gradient(tf_out, tf_weight, upstream_grad)

    # Largest forward deviation; used as the assertion message below.
    max_diff = np.max(
        np.absolute(of_out.numpy().transpose(xy_perm) - tf_out.numpy())
    )
    assert np.allclose(
        of_out.numpy().transpose(xy_perm),
        tf_out.numpy(),
        rtol=1e-5,
        atol=1e-5,
    ), max_diff
    assert np.allclose(
        test_global_storage.Get("x_diff").transpose(xy_perm),
        tf_x_diff.numpy(),
        rtol=1e-4,
        atol=1e-4,
    )
    assert np.allclose(
        test_global_storage.Get("weight_diff").transpose(weight_perm),
        tf_weight_diff.numpy(),
        rtol=1e-5,
        atol=1e-5,
    )
def _test_fused_scale_tril_fw_bw(test_case, device, shape, type_name, diagonal,
                                 fill_value, scale):
    """Check ``flow.math.fused_scale_tril`` forward and backward against numpy.

    Args:
        test_case: object providing ``assertTrue``.
        device: device tag passed to ``flow.scope.placement``.
        shape: shape of the random integer-valued input.
        type_name: element type name. ``"float16"`` is fed as float32 and
            cast to ``flow.float16`` around the op inside the job.
        diagonal: diagonal offset forwarded to the op and to ``np.tril``.
        fill_value: value expected above the kept triangle. It is used only
            for the numpy reference; the OneFlow call does not receive it.
        scale: scale factor forwarded to the op.
    """
    flow.clear_default_session()
    job_conf = flow.FunctionConfig()
    job_conf.default_data_type(flow.float)

    is_half = type_name == "float16"
    if is_half:
        flow_type, np_type = flow.float, np.float32
    else:
        flow_type = type_name_to_flow_type[type_name]
        np_type = type_name_to_np_type[type_name]

    @flow.global_function(type="train", function_config=job_conf)
    def test_fused_scale_tril_fw_bw_job(
        x: oft.Numpy.Placeholder(shape, dtype=flow_type),
    ):
        with flow.scope.placement(device, "0:0"):
            # Adding a zero variable makes the fed input part of the
            # trainable graph so that its gradient can be watched.
            zero_var = flow.get_variable(
                name="xv",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            x += flow.cast(zero_var, dtype=flow_type)
            if is_half:
                half_out = flow.math.fused_scale_tril(
                    flow.cast(x, flow.float16), diagonal, scale=scale)
                out = flow.cast(half_out, flow.float)
            else:
                out = flow.math.fused_scale_tril(x, diagonal, scale=scale)
            schedule = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(schedule, momentum=0).minimize(out)

            for blob, key in ((x, "x"), (out, "out")):
                flow.watch(blob, test_global_storage.Setter(key))
                flow.watch_diff(blob, test_global_storage.Setter(key + "_diff"))
            return out

    ckpt = flow.train.CheckPoint()
    ckpt.init()
    host_x = np.random.randint(low=0, high=100, size=shape)
    test_fused_scale_tril_fw_bw_job(host_x.astype(np_type)).get()

    # Reference forward: scaled input inside the triangle, fill_value outside.
    keep_mask = np.tril(np.ones(shape), diagonal)
    scaled_x = test_global_storage.Get("x") * scale
    filler = np.full(shape, fill_value).astype(np_type)
    expected_out = np.where(keep_mask, scaled_x, filler)
    # Reference backward: the upstream gradient restricted to the triangle.
    expected_x_diff = np.tril(test_global_storage.Get("out_diff"), diagonal) * scale

    tolerance = 1e-3 if is_half else 1e-5
    test_case.assertTrue(
        np.allclose(expected_out,
                    test_global_storage.Get("out"),
                    rtol=tolerance,
                    atol=tolerance))
    test_case.assertTrue(
        np.allclose(expected_x_diff,
                    test_global_storage.Get("x_diff"),
                    rtol=tolerance,
                    atol=tolerance))
def test_layer_norm(_):
    """Compare ``flow.layers.layer_norm`` with Keras ``LayerNormalization``.

    Every combination produced by ``GenArgList`` over the argument grid
    below is run: the TensorFlow result and input gradient are computed
    first, then a OneFlow training job is executed on the same random input.
    The input gradient is checked inside the ``watch_diff`` callback and the
    forward output is checked after the job returns.

    Note that ``epsilon`` and ``trainable`` are forwarded only to the
    TensorFlow layer; the OneFlow call below receives neither.
    NOTE(review): that mismatch is presumably why the forward comparison
    uses a loose ``atol=2e-3`` - confirm.

    Raises:
        AssertionError: when shapes, outputs or gradients differ.
    """
    confs = [{
        "x_shape": (4, 5, 2, 6),
        "begin_norm_axis": -1,
        "begin_params_axis": -1
    }]
    arg_dict = OrderedDict()
    arg_dict["device_type"] = ["gpu"]
    arg_dict["confs"] = confs
    arg_dict["data_type"] = ["float32"]
    arg_dict["trainable"] = [True, False]
    arg_dict["center"] = [True, False]
    arg_dict["scale"] = [True, False]
    arg_dict["epsilon"] = [0.0, 1e-10]

    for case in GenArgList(arg_dict):
        # `conf` (singular) is one entry of `confs`; a separate name avoids
        # rebinding the list while it is being enumerated.
        (device_type, conf, data_type, trainable, center, scale,
         epsilon) = case
        x_shape = conf["x_shape"]
        begin_norm_axis = conf["begin_norm_axis"]
        begin_params_axis = conf["begin_params_axis"]
        flow.clear_default_session()

        # Random inputs
        x = np.random.randn(*x_shape).astype(type_name_to_np_type[data_type])

        # TF results
        with tf.GradientTape(persistent=True) as tape:
            x_tf = tf.Variable(x)
            y_tf = tf.keras.layers.LayerNormalization(
                axis=begin_norm_axis,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer="zeros",
                gamma_initializer="ones",
                beta_regularizer=None,
                gamma_regularizer=None,
                beta_constraint=None,
                gamma_constraint=None,
                trainable=trainable,
            )(x_tf)
        dx_tf = tape.gradient(y_tf, x_tf, tf.constant(1.0, shape=y_tf.shape))

        def assert_grad(b):
            # Invoked by OneFlow with the gradient of the watched variable.
            assert np.allclose(dx_tf.numpy(), b.numpy(), rtol=1e-5,
                               atol=1e-5), (
                                   case,
                                   dx_tf.numpy(),
                                   b.numpy(),
                               )

        # 1F results
        dtype = type_name_to_flow_type[data_type]
        func_config = flow.FunctionConfig()
        func_config.default_data_type(flow.float)
        func_config.train.primary_lr(1e-4)
        func_config.train.model_update_conf(dict(naive_conf={}))

        @flow.global_function(func_config)
        def test_job(x: oft.Numpy.Placeholder(x_shape, dtype=dtype)):
            # Zero variable added to the input so a gradient w.r.t. the
            # input can be observed through watch_diff.
            v = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=dtype,
                initializer=flow.constant_initializer(0),
                trainable=True,
            )
            flow.watch_diff(v, assert_grad)
            x += v
            with flow.scope.placement(device_type, "0:0"):
                y = flow.layers.layer_norm(
                    x,
                    begin_norm_axis=begin_norm_axis,
                    begin_params_axis=begin_params_axis,
                    center=center,
                    scale=scale,
                )
                flow.losses.add_loss(y)
                return y

        check_point = flow.train.CheckPoint()
        check_point.init()
        y = test_job(x).get()

        assert y.numpy().shape == y_tf.numpy().shape, (
            y.numpy().shape,
            y_tf.numpy().shape,
        )
        diff = y.numpy() - y_tf.numpy()
        max_diff = np.max(np.abs(diff))
        assert np.allclose(y.numpy(), y_tf.numpy(), rtol=1e-5, atol=2e-3), (
            case,
            max_diff,
        )
def compare_with_not_fused(
    test_case,
    device_type,
    x_shape,
    data_type,
    data_format,
    rate,
    seed,
    fuse_add_to_output,
):
    """Check ``fused_bias_add_dropout`` against ``bias_add`` + ``dropout``.

    A single training job evaluates both pipelines on identical inputs with
    the same dropout ``rate`` and ``seed``, each followed by adding
    ``addend``. Outputs and the gradients of the value, bias and addend
    inputs are published through ``test_global_storage`` and compared
    pairwise.

    Args:
        test_case: object providing ``assertTrue``.
        device_type: ``"gpu"`` or ``"cpu"``.
        x_shape: shape of the value and addend inputs.
        data_type: element type name; ``"float16"`` wraps the ops in casts
            to/from ``flow.float16``.
        data_format: ``"NCHW"`` or ``"NHWC"``; selects the bias length.
        rate: dropout rate.
        seed: dropout seed shared by both pipelines.
        fuse_add_to_output: forwarded to
            ``FunctionConfig.enable_fuse_add_to_output``.

    Raises:
        ValueError: if ``data_format`` is neither ``"NCHW"`` nor ``"NHWC"``.
        KeyError: if ``data_type`` is not ``"float16"`` and is missing from
            ``type_name_to_flow_type``.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.enable_fuse_add_to_output(fuse_add_to_output)

    # NOTE(review): `dtype` is not consumed by the job below (the
    # placeholders rely on their default dtype). The lookup is kept because
    # it rejects unknown type names - confirm whether it should be wired in.
    if data_type == "float16":
        dtype = flow.float
    else:
        dtype = type_name_to_flow_type[data_type]

    if data_format == "NCHW":
        bias_shape = (x_shape[1], )
    elif data_format == "NHWC":
        bias_shape = (x_shape[len(x_shape) - 1], )
    else:
        # Previously this fell through and failed later with a NameError.
        raise ValueError(
            "unsupported data_format: {!r}".format(data_format))

    @flow.global_function(type="train", function_config=func_config)
    def FlowJob(
            value: oft.Numpy.Placeholder(x_shape),
            bias: oft.Numpy.Placeholder(bias_shape),
            addend: oft.Numpy.Placeholder(x_shape),
    ):
        with flow.scope.placement(device_type, "0:0"):
            # Zero variables make the fed inputs part of the trainable graph
            # so their gradients can be watched.
            value += flow.get_variable(
                name="v1",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            bias += flow.get_variable(
                name="v2",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            addend += flow.get_variable(
                name="v3",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )

            # Separate identity copies per pipeline so each receives its own
            # gradient.
            x1 = flow.identity(value)
            x2 = flow.identity(value)
            bias1 = flow.identity(bias)
            bias2 = flow.identity(bias)
            addend1 = flow.identity(addend)
            addend2 = flow.identity(addend)
            flow.watch_diff(x1, test_global_storage.Setter("x1_diff"))
            flow.watch_diff(x2, test_global_storage.Setter("x2_diff"))
            flow.watch_diff(bias1, test_global_storage.Setter("bias1_diff"))
            flow.watch_diff(bias2, test_global_storage.Setter("bias2_diff"))
            flow.watch_diff(addend1,
                            test_global_storage.Setter("addend1_diff"))
            flow.watch_diff(addend2,
                            test_global_storage.Setter("addend2_diff"))

            if data_type == "float16":
                out1 = flow.nn.dropout(
                    flow.nn.bias_add(
                        flow.cast(x1, dtype=flow.float16),
                        flow.cast(bias1, dtype=flow.float16),
                        data_format=data_format,
                    ),
                    rate=rate,
                    seed=seed,
                    name="dropout",
                )
                y1 = flow.cast(
                    out1 + flow.cast(addend1, dtype=flow.float16),
                    dtype=flow.float,
                )
                out2 = flow.nn.fused_bias_add_dropout(
                    flow.cast(x2, dtype=flow.float16),
                    flow.cast(bias2, dtype=flow.float16),
                    data_format=data_format,
                    rate=rate,
                    seed=seed,
                )
                y2 = flow.cast(
                    out2 + flow.cast(addend2, dtype=flow.float16),
                    dtype=flow.float,
                )
            else:
                y1 = (flow.nn.dropout(
                    flow.nn.bias_add(x1, bias1, data_format=data_format),
                    rate=rate,
                    seed=seed,
                    name="dropout",
                ) + addend1)
                y2 = (flow.nn.fused_bias_add_dropout(
                    x2,
                    bias2,
                    data_format=data_format,
                    rate=rate,
                    seed=seed,
                ) + addend2)

            flow.watch(y1, test_global_storage.Setter("y1"))
            flow.watch(y2, test_global_storage.Setter("y2"))
            flow.watch_diff(y1, test_global_storage.Setter("y1_diff"))
            flow.watch_diff(y2, test_global_storage.Setter("y2_diff"))
            loss = y1 + y2
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0.001]),
                momentum=0).minimize(flow.math.reduce_sum(loss))
            return loss

    x = np.random.uniform(low=0, high=10, size=x_shape).astype(np.float32)
    bias = np.random.uniform(low=0, high=10,
                             size=bias_shape).astype(np.float32)
    add = np.random.uniform(low=0, high=10, size=x_shape).astype(np.float32)
    # Only the watched blobs are inspected; the job's return value is unused.
    FlowJob(x, bias, add).get()

    tol = 1e-5
    # (unfused key, fused key) pairs that must agree. The addend gradients
    # were watched but never compared before; the bias pair was checked
    # twice instead.
    compared_pairs = (
        ("y1", "y2"),
        ("x1_diff", "x2_diff"),
        ("bias1_diff", "bias2_diff"),
        ("addend1_diff", "addend2_diff"),
    )
    for unfused_key, fused_key in compared_pairs:
        unfused = test_global_storage.Get(unfused_key)
        fused = test_global_storage.Get(fused_key)
        test_case.assertTrue(
            np.allclose(unfused, fused, rtol=tol, atol=tol, equal_nan=True),
            "{} and {} differ".format(unfused_key, fused_key),
        )
def main(args):
    """Train AlexNet with OneFlow and save the per-iteration loss curve.

    Configures the machine/GPU counts from ``args``, defines a training job
    and an evaluation job, initialises or loads the model, runs
    ``args.iter_num`` training iterations (printing the mean loss of each),
    saves a checkpoint every 100 iterations and finally writes all recorded
    losses to ``./of_loss/alexnet/<nodes>n<total_gpus>c.npy``.

    Args:
        args: parsed options. This function reads ``num_nodes``,
            ``gpu_num_per_node``, ``train_dir``, ``eval_dir``,
            ``model_load_dir`` and ``iter_num``; ``args`` is also forwarded
            to ``_data_load_layer`` and ``alexnet``.
    """
    flow.config.machine_num(args.num_nodes)
    flow.config.gpu_device_num(args.gpu_num_per_node)

    func_config = flow.FunctionConfig()
    func_config.default_distribute_strategy(flow.scope.consistent_view())
    func_config.default_data_type(flow.float)
    func_config.train.primary_lr(0.00001)
    func_config.train.model_update_conf(dict(naive_conf={}))
    func_config.cudnn_conv_force_fwd_algo(0)
    func_config.cudnn_conv_force_bwd_data_algo(1)
    func_config.cudnn_conv_force_bwd_filter_algo(1)

    @flow.global_function(func_config)
    def alexnet_train_job():
        (labels, images) = _data_load_layer(args, args.train_dir)
        loss = alexnet(args, images, labels)
        flow.losses.add_loss(loss)
        return loss

    # A fresh config without the training settings for the evaluation job.
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    # The eval job is still defined (and therefore registered with the
    # session) although the loop below does not currently invoke it.
    @flow.global_function(func_config)
    def alexnet_eval_job():
        with flow.scope.consistent_view():
            (labels, images) = _data_load_layer(args, args.eval_dir)
            return alexnet(args, images, labels, False)

    check_point = flow.train.CheckPoint()
    if not args.model_load_dir:
        check_point.init()
    else:
        check_point.load(args.model_load_dir)

    num_nodes = args.num_nodes
    print("Traning alexnet: num_gpu_per_node = {}, num_nodes = {}.".format(
        args.gpu_num_per_node, num_nodes))
    print("{:>12} {:>12} {:>12}".format("iter", "loss type", "loss value"))

    fmt_str = "{:>12} {:>12} {:>12.6f}"  # loop-invariant row format
    loss = []
    for i in range(args.iter_num):
        train_loss = alexnet_train_job().get().mean()
        loss.append(train_loss)
        print(fmt_str.format(i, "train loss:", train_loss))

        # Periodic evaluation through alexnet_eval_job is disabled here.
        if (i + 1) % 100 == 0:
            check_point.save(_MODEL_SAVE_DIR + str(i))

    # save loss to file
    loss_file = "{}n{}c.npy".format(str(num_nodes),
                                    str(args.gpu_num_per_node * num_nodes))
    loss_path = "./of_loss/alexnet"
    # exist_ok avoids the check-then-create race when several processes
    # start at the same time.
    os.makedirs(loss_path, exist_ok=True)
    numpy.save(os.path.join(loss_path, loss_file), loss)
def compare_with_tensorflow(
    device_type,
    x_shape,
    filters,
    kernel_size,
    groups,
    of_padding="SAME",
    tf_padding="SAME",
    stride=1,
    data_format="NCHW",
):
    """Check dynamic-shape ``flow.nn.conv2d`` against ``tf.nn.conv2d``.

    The OneFlow job runs under the mirrored view with a list-of-numpy
    input. Its output, input gradient and weight gradient are compared with
    TensorFlow (in NHWC layout) using ``np.allclose``.

    Args:
        device_type: ``"gpu"`` or ``"cpu"``; on ``"cpu"`` the input-gradient
            tolerances are relaxed by a factor of 100.
        x_shape: shape of the random input that is fed.
        filters: number of output channels.
        kernel_size: side length of the square kernel.
        groups: number of convolution groups.
        of_padding: padding mode passed to OneFlow.
        tf_padding: padding mode passed to TensorFlow.
        stride: stride applied to both spatial dimensions.
        data_format: ``"NCHW"`` selects the channel-first layout; any other
            value is treated as channel-last.

    Raises:
        AssertionError: on invalid arguments or when results differ.
    """
    assert device_type in ["gpu", "cpu"]

    channels_first = data_format == "NCHW"
    # The channel axis depends on the layout. Validate against that axis,
    # and before `// groups` is evaluated, so a bad `groups` fails with a
    # clear assertion instead of a ZeroDivisionError or a truncated shape.
    in_channels = x_shape[1] if channels_first else x_shape[3]
    assert groups > 0
    assert in_channels % groups == 0
    assert filters % groups == 0

    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.mirrored_view())

    if channels_first:
        xy_data_transpose = (0, 2, 3, 1)
        weight_data_transpose = (2, 3, 1, 0)
        weight_shape = (filters, in_channels // groups, kernel_size,
                        kernel_size)
    else:
        xy_data_transpose = (0, 1, 2, 3)
        weight_data_transpose = (1, 2, 3, 0)
        weight_shape = (filters, kernel_size, kernel_size,
                        in_channels // groups)

    # NOTE(review): the placeholder shape is hard-coded rather than derived
    # from x_shape; presumably it is the static upper bound for the dynamic
    # input - confirm.
    @flow.global_function(type="train", function_config=func_config)
    def DynamicConvJob(x: oft.ListNumpy.Placeholder((10, 3, 100, 100))):
        with flow.scope.placement(device_type, "0:0"):
            # Zero variable so the fed input takes part in back-propagation.
            x_var = flow.get_variable(
                name="v1",
                shape=(1, ),
                dtype=flow.float,
                initializer=flow.zeros_initializer(),
            )
            x_var = flow.cast_to_current_logical_view(x_var)
            x += x_var
            weight = flow.get_variable(
                "conv-weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
            )
            weight = flow.cast_to_current_logical_view(weight)
            loss = flow.nn.conv2d(
                x,
                weight,
                strides=[stride, stride],
                padding=of_padding,
                data_format=data_format,
                dilations=[1, 1],
                groups=groups,
            )
            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-4]), momentum=0).minimize(loss)
            flow.watch(x, global_storage_setter("x"))
            flow.watch_diff(x, global_storage_setter("x_diff"))
            flow.watch(weight, global_storage_setter("weight"))
            flow.watch_diff(weight, global_storage_setter("weight_diff"))
            flow.watch(loss, global_storage_setter("loss"))
            flow.watch_diff(loss, global_storage_setter("loss_diff"))
            return loss

    # OneFlow
    check_point = flow.train.CheckPoint()
    check_point.init()
    data = [np.random.rand(*x_shape).astype(np.float32)]
    of_out = DynamicConvJob(data).get().numpy_list()[0]

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(data[0].transpose(xy_data_transpose))
        weight = tf.Variable(
            global_storage["weight"].numpy().transpose(weight_data_transpose))
        tf_out = tf.nn.conv2d(
            x,
            weight,
            strides=[1, stride, stride, 1],
            padding=tf_padding,
            data_format="NHWC",
        )

    assert np.allclose(
        of_out.transpose(xy_data_transpose),
        tf_out.numpy(),
        rtol=1e-5,
        atol=1e-5,
    )

    loss_diff = global_storage["loss_diff"].numpy_list()[0].transpose(
        xy_data_transpose)
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)

    rtol = 1e-4
    atol = 1e-4
    if device_type == "cpu":
        rtol *= 100
        atol *= 100

    of_x_diff = global_storage["x_diff"].numpy_list()[0].transpose(
        xy_data_transpose)
    assert np.allclose(
        of_x_diff,
        tf_x_diff.numpy(),
        rtol=rtol,
        atol=atol,
    ), (of_x_diff - tf_x_diff.numpy())
    assert np.allclose(
        global_storage["weight_diff"].numpy().transpose(weight_data_transpose),
        tf_weight_diff.numpy(),
        rtol=1e-5,
        atol=1e-5,
    )
def train(self, epochs):
    """Train the main network for ``epochs`` epochs with periodic validation.

    The method builds six OneFlow jobs (a training and an evaluation variant
    each of the LTE feature extractor, the search/transfer step and the main
    network), loads weights from ``self.vgg_path``, and then alternates
    between training batches and, every ``self.val_every`` epochs, a
    validation pass that accumulates PSNR and SSIM. The best-PSNR checkpoint
    after epoch 10 is kept under ``self.checkpoint_path`` (the previous best
    directory is removed), and the recorded losses and validation metrics
    are written as ``.npy`` files under ``self.loss_path``.

    Args:
        epochs: number of training epochs; also embedded in the names of
            the saved ``.npy`` files.

    NOTE(review): ``args`` is not a parameter of this method; it is read
    from an enclosing/module scope - confirm it is always defined.
    """
    # build the training and validation datasets
    train_data = TrainSet(args)
    val_data = TestSet(args)

    # save loss, psnr, ssim
    Loss = []
    Val_psnr = []
    Val_ssim = []

    # config
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.double)
    flow.config.gpu_device_num(self.gpu_num_per_node)
    flow.config.enable_debug_mode(True)

    # train config: a constant learning rate of self.lr
    lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [self.lr])

    # Feature extractor used during training; declared as a "predict" job,
    # so no optimizer is attached to it here.
    @flow.global_function(type="predict", function_config=func_config)
    def train_lte(input: tp.Numpy.Placeholder(
        (self.batch_size, 3, 160, 160))) -> Tuple[tp.Numpy, tp.Numpy, tp.Numpy]:
        x_lv1, x_lv2, x_lv3 = self.LTE(input, trainable=True)
        return x_lv1, x_lv2, x_lv3

    # Search (relevance between LR and Ref features) and transfer (gather
    # of reference features at the best-matching positions).
    @flow.global_function(type="predict", function_config=func_config)
    def train_searchtransfer(lrsr_lv3_unfold: tp.Numpy.Placeholder(
        (self.batch_size, 2304, 1600)), refsr_lv3_unfold: tp.Numpy.Placeholder(
            (self.batch_size, 2304, 1600)), ref_lv3_unfold: tp.Numpy.Placeholder(
                (self.batch_size, 2304, 1600)), ref_lv2_unfold: tp.Numpy.Placeholder(
                    (self.batch_size, 4608, 1600)), ref_lv1_unfold: tp.Numpy.Placeholder(
                        (self.batch_size, 9216, 1600))) -> Tuple[tp.Numpy, tp.Numpy, tp.Numpy, tp.Numpy]:
        refsr_lv3_unfold = flow.transpose(refsr_lv3_unfold, perm=[0, 2, 1])
        refsr_lv3_unfold = flow.math.l2_normalize(
            refsr_lv3_unfold, axis=2)  # [N, Hr*Wr, C*k*k]
        lrsr_lv3_unfold = flow.math.l2_normalize(lrsr_lv3_unfold,
                                                 axis=1)  # [N, C*k*k, H*W]
        R_lv3 = flow.matmul(refsr_lv3_unfold,
                            lrsr_lv3_unfold)  # [N, Hr*Wr, H*W]
        R_lv3_star = flow.math.reduce_max(R_lv3, axis=1)  # [N, H*W]
        R_lv3_star_arg = flow.math.argmax(R_lv3, axis=1)  # [N, H*W]
        T_lv3_unfold = self.bis(ref_lv3_unfold, R_lv3_star_arg)
        T_lv2_unfold = self.bis(ref_lv2_unfold, R_lv3_star_arg)
        T_lv1_unfold = self.bis(ref_lv1_unfold, R_lv3_star_arg)
        return R_lv3_star, T_lv3_unfold, T_lv2_unfold, T_lv1_unfold

    # The only "train" job: mean absolute error between the super-resolved
    # output and the HR target, minimised with Adam.
    @flow.global_function(type="train", function_config=func_config)
    def train_mainnet(lr: tp.Numpy.Placeholder(
        (self.batch_size, 3, 40, 40)), S: tp.Numpy.Placeholder(
            (self.batch_size, 1, 40, 40)), T_lv3: tp.Numpy.Placeholder(
                (self.batch_size, 256, 40, 40)), T_lv2: tp.Numpy.Placeholder(
                    (self.batch_size, 128, 80, 80)), T_lv1: tp.Numpy.Placeholder(
                        (self.batch_size, 64, 160, 160)), hr: tp.Numpy.Placeholder(
                            (self.batch_size, 3, 160, 160))) -> tp.Numpy:
        sr = self.mainnet(lr, S, T_lv3, T_lv2, T_lv1, trainable=True)
        loss = flow.math.reduce_mean(
            flow.math.abs(flow.math.subtract(sr, hr)))
        flow.optimizer.Adam(lr_scheduler, 0.9, 0.999).minimize(loss)
        return loss

    # Evaluation variants: same graphs with batch size 1 and
    # trainable=False.
    @flow.global_function(type="predict", function_config=func_config)
    def eval_lte(input: tp.Numpy.Placeholder(
        (1, 3, 160, 160))) -> Tuple[tp.Numpy, tp.Numpy, tp.Numpy]:
        x_lv1, x_lv2, x_lv3 = self.LTE(input, trainable=False)
        return x_lv1, x_lv2, x_lv3

    @flow.global_function(type="predict", function_config=func_config)
    def eval_searchtransfer(lrsr_lv3_unfold: tp.Numpy.Placeholder(
        (1, 2304, 1600)), refsr_lv3_unfold: tp.Numpy.Placeholder(
            (1, 2304, 1600)), ref_lv3_unfold: tp.Numpy.Placeholder(
                (1, 2304, 1600)), ref_lv2_unfold: tp.Numpy.Placeholder(
                    (1, 4608, 1600)), ref_lv1_unfold: tp.Numpy.Placeholder(
                        (1, 9216, 1600))) -> Tuple[tp.Numpy, tp.Numpy, tp.Numpy, tp.Numpy]:
        refsr_lv3_unfold = flow.transpose(refsr_lv3_unfold, perm=[0, 2, 1])
        refsr_lv3_unfold = flow.math.l2_normalize(
            refsr_lv3_unfold, axis=2)  # [N, Hr*Wr, C*k*k]
        lrsr_lv3_unfold = flow.math.l2_normalize(lrsr_lv3_unfold,
                                                 axis=1)  # [N, C*k*k, H*W]
        R_lv3 = flow.matmul(refsr_lv3_unfold,
                            lrsr_lv3_unfold)  # [N, Hr*Wr, H*W]
        R_lv3_star = flow.math.reduce_max(R_lv3, axis=1)  # [N, H*W]
        R_lv3_star_arg = flow.math.argmax(R_lv3, axis=1)  # [N, H*W]
        T_lv3_unfold = self.bis(ref_lv3_unfold, R_lv3_star_arg)
        T_lv2_unfold = self.bis(ref_lv2_unfold, R_lv3_star_arg)
        T_lv1_unfold = self.bis(ref_lv1_unfold, R_lv3_star_arg)
        return R_lv3_star, T_lv3_unfold, T_lv2_unfold, T_lv1_unfold

    @flow.global_function(type="predict", function_config=func_config)
    def eval_mainnet(lr: tp.Numpy.Placeholder(
        (1, 3, 40, 40)), S: tp.Numpy.Placeholder(
            (1, 1, 40, 40)), T_lv3: tp.Numpy.Placeholder((1, 256, 40, 40)), T_lv2: tp.Numpy.Placeholder(
                (1, 128, 80, 80)), T_lv1: tp.Numpy.Placeholder(
                    (1, 64, 160, 160))) -> tp.Numpy:
        sr = self.mainnet(lr, S, T_lv3, T_lv2, T_lv1, trainable=False)
        return sr

    # Load weights from self.vgg_path before training starts.
    check_point = flow.train.CheckPoint()
    check_point.load(self.vgg_path)

    # Incomplete trailing batches are dropped by the floor division.
    batch_num = len(train_data) // self.batch_size
    # pre_best: epoch number of the currently saved best checkpoint
    # (-1 = none saved yet); best_psnr: its validation PSNR.
    pre_best, best_psnr = -1, 0
    print("****************** start training *****************")
    for epoch_idx in range(epochs):
        start = time.time()
        train_data.shuffle(epoch_idx)
        print("****************** train *****************")
        for batch_idx in range(batch_num):
            # Assemble one batch by stacking samples along a new leading axis.
            lr, lr_sr, hr, ref, ref_sr = [], [], [], [], []
            for idx in range(self.batch_size):
                sample = train_data[batch_idx * self.batch_size + idx]
                lr.append(sample['LR'][np.newaxis, :])
                lr_sr.append(sample['LR_sr'][np.newaxis, :])
                hr.append(sample['HR'][np.newaxis, :])
                ref.append(sample['Ref'][np.newaxis, :])
                ref_sr.append(sample['Ref_sr'][np.newaxis, :])
            lr = np.ascontiguousarray(np.concatenate(lr, axis=0))
            lr_sr = np.ascontiguousarray(np.concatenate(lr_sr, axis=0))
            hr = np.ascontiguousarray(np.concatenate(hr, axis=0))
            ref = np.ascontiguousarray(np.concatenate(ref, axis=0))
            ref_sr = np.ascontiguousarray(np.concatenate(ref_sr, axis=0))

            # Inputs are rescaled by (x + 1) / 2 before feature extraction.
            _, _, lrsr_lv3 = train_lte((lr_sr + 1.) / 2.)
            _, _, refsr_lv3 = train_lte((ref_sr + 1.) / 2.)
            ref_lv1, ref_lv2, ref_lv3 = train_lte((ref + 1.) / 2.)

            ### search
            lrsr_lv3_unfold = self.unfold(lrsr_lv3)
            refsr_lv3_unfold = self.unfold(refsr_lv3)

            ### transfer
            ref_lv3_unfold = self.unfold(ref_lv3)
            ref_lv2_unfold = self.unfold(ref_lv2,
                                         kernel_size=6,
                                         padding=2,
                                         stride=2)
            ref_lv1_unfold = self.unfold(ref_lv1,
                                         kernel_size=12,
                                         padding=4,
                                         stride=4)
            R_lv3_star, T_lv3_unfold, T_lv2_unfold, T_lv1_unfold = train_searchtransfer(
                lrsr_lv3_unfold, refsr_lv3_unfold, ref_lv3_unfold,
                ref_lv2_unfold, ref_lv1_unfold)

            # Fold the transferred patches back into feature maps; every
            # folded result is divided by 3 * 3.
            T_lv3 = self.fold(T_lv3_unfold,
                              output_size=lrsr_lv3.shape[-2:],
                              kernel_size=3,
                              padding=1,
                              stride=1) / (3. * 3.)
            T_lv2 = self.fold(
                T_lv2_unfold,
                output_size=(lrsr_lv3.shape[2] * 2, lrsr_lv3.shape[3] * 2),
                kernel_size=6,
                padding=2,
                stride=2) / (3. * 3.)
            T_lv1 = self.fold(
                T_lv1_unfold,
                output_size=(lrsr_lv3.shape[2] * 4, lrsr_lv3.shape[3] * 4),
                kernel_size=12,
                padding=4,
                stride=4) / (3. * 3.)
            # Reshape the per-position maximum relevance into a
            # one-channel map.
            S = np.reshape(R_lv3_star, [
                R_lv3_star.shape[0], 1, lrsr_lv3.shape[2], lrsr_lv3.shape[3]
            ])

            loss = train_mainnet(lr, S, T_lv3, T_lv2, T_lv1, hr)
            if (batch_idx + 1) % self.print_interval == 0:
                print("{}th epoch, {}th batch, loss:{} ".format(
                    epoch_idx + 1, batch_idx + 1, loss))
                # NOTE(review): the loss is recorded only at print
                # intervals, assuming this statement belongs to the `if`
                # above - confirm.
                Loss.append(loss)

        print("Time for epoch {} is {} sec.".format(
            epoch_idx + 1,
            time.time() - start))

        if (epoch_idx + 1) % self.val_every == 0:
            val_psnr, val_ssim = 0., 0.
            val_batch_num = len(val_data)
            # Validation runs one sample at a time (batch size 1).
            for batch_idx in range(val_batch_num):
                sample = val_data[batch_idx]
                lr = np.ascontiguousarray(sample['LR'][np.newaxis, :])
                lr_sr = np.ascontiguousarray(
                    sample['LR_sr'][np.newaxis, :])
                hr = np.ascontiguousarray(sample['HR'][np.newaxis, :])
                ref = np.ascontiguousarray(sample['Ref'][np.newaxis, :])
                ref_sr = np.ascontiguousarray(
                    sample['Ref_sr'][np.newaxis, :])

                _, _, lrsr_lv3 = eval_lte((lr_sr + 1.) / 2.)
                _, _, refsr_lv3 = eval_lte((ref_sr + 1.) / 2.)
                ref_lv1, ref_lv2, ref_lv3 = eval_lte((ref + 1.) / 2.)

                ### search
                lrsr_lv3_unfold = self.unfold(lrsr_lv3)
                refsr_lv3_unfold = self.unfold(refsr_lv3)

                ### transfer
                ref_lv3_unfold = self.unfold(ref_lv3)
                ref_lv2_unfold = self.unfold(ref_lv2,
                                             kernel_size=6,
                                             padding=2,
                                             stride=2)
                ref_lv1_unfold = self.unfold(ref_lv1,
                                             kernel_size=12,
                                             padding=4,
                                             stride=4)
                R_lv3_star, T_lv3_unfold, T_lv2_unfold, T_lv1_unfold = eval_searchtransfer(
                    lrsr_lv3_unfold, refsr_lv3_unfold, ref_lv3_unfold,
                    ref_lv2_unfold, ref_lv1_unfold)

                T_lv3 = self.fold(T_lv3_unfold,
                                  output_size=lrsr_lv3.shape[-2:],
                                  kernel_size=3,
                                  padding=1,
                                  stride=1) / (3. * 3.)
                T_lv2 = self.fold(T_lv2_unfold,
                                  output_size=(lrsr_lv3.shape[2] * 2,
                                               lrsr_lv3.shape[3] * 2),
                                  kernel_size=6,
                                  padding=2,
                                  stride=2) / (3. * 3.)
                T_lv1 = self.fold(T_lv1_unfold,
                                  output_size=(lrsr_lv3.shape[2] * 4,
                                               lrsr_lv3.shape[3] * 4),
                                  kernel_size=12,
                                  padding=4,
                                  stride=4) / (3. * 3.)
                S = np.reshape(R_lv3_star, [
                    R_lv3_star.shape[0], 1, lrsr_lv3.shape[2],
                    lrsr_lv3.shape[3]
                ])

                sr = eval_mainnet(lr, S, T_lv3, T_lv2, T_lv1)

                # sr: range [-1, 1]
                # hr: range [-1, 1]
                ### prepare data: rescale with (x + 1) * 127.5, round, and
                ### move the leading axis last via transpose (1, 2, 0)
                sr = (sr + 1.) * 127.5
                hr = (hr + 1.) * 127.5
                sr = np.transpose(np.round(np.squeeze(sr)), (1, 2, 0))
                hr = np.transpose(np.round(np.squeeze(hr)), (1, 2, 0))

                ### calculate psnr and ssim
                val_psnr += self.calc_psnr(sr, hr)
                val_ssim += self.calc_ssim(sr, hr)

            # Average the accumulated metrics over the validation samples.
            val_psnr = val_psnr / val_batch_num
            val_ssim = val_ssim / val_batch_num
            Val_psnr.append(val_psnr)
            Val_ssim.append(val_ssim)
            print("****************** evalute *****************")
            print("{}th epoch, val_psnr:{}, val_ssim:{}.".format(
                epoch_idx + 1, val_psnr, val_ssim))

            # Checkpoints are only considered after the 10th epoch.
            if epoch_idx + 1 > 10 and val_psnr > best_psnr:
                best_psnr = val_psnr
                if pre_best != -1:
                    # delete the previous best checkpoint
                    print(
                        "delete the previous best {}th epoch model".format(
                            pre_best))
                    shutil.rmtree(
                        os.path.join(self.checkpoint_path,
                                     "{}th_epoch".format(pre_best)))

                # save parameters
                check_point.save(
                    os.path.join(self.checkpoint_path,
                                 "{}th_epoch".format(epoch_idx + 1)))
                pre_best = epoch_idx + 1
                print("save the best {}th epoch model at {}.".format(
                    epoch_idx + 1,
                    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))

    # save train loss and val
    np.save(os.path.join(self.loss_path, 'loss_{}.npy'.format(epochs)),
            Loss)
    np.save(os.path.join(self.loss_path, 'Val_psnr_{}.npy'.format(epochs)),
            Val_psnr)
    np.save(os.path.join(self.loss_path, 'Val_ssim_{}.npy'.format(epochs)),
            Val_ssim)
    print("*************** Train {} done ***************** ".format(
        self.path))
def _compare_kldivloss_with_np(
    input_shape,
    target_shape,
    log_target,
    device_type,
    machine_ids,
    device_counts,
):
    """Check ``flow.nn.KLDivLoss`` against a numpy reference.

    Random float32 input and target arrays are fed to a OneFlow training
    job that evaluates the loss with ``reduction`` set to ``"none"``,
    ``"mean"`` and ``"sum"``. The three results are compared with numpy,
    and the gradient of the mean-reduced loss w.r.t. the input is checked in
    a ``watch_diff`` callback.

    Args:
        input_shape: shape of the random input.
        target_shape: shape of the random target.
        log_target: sequence whose first element is the boolean
            ``log_target`` flag passed to the op.
        device_type: ``"cpu"`` or ``"gpu"``.
        machine_ids: machine/device string for the default placement scope.
        device_counts: number of devices to configure.

    Raises:
        AssertionError: on an invalid ``device_type`` or mismatching results.
    """
    input_arr = np.random.random(size=input_shape).astype(np.float32)
    target_arr = np.random.random(size=target_shape).astype(np.float32)
    log_target = log_target[0]

    assert device_type in ["cpu", "gpu"]

    flow.clear_default_session()
    if device_type == "cpu":
        flow.config.cpu_device_num(device_counts)
    else:
        flow.config.gpu_device_num(device_counts)

    func_config = flow.FunctionConfig()
    func_config.default_placement_scope(
        flow.scope.placement(device_type, machine_ids))
    func_config.default_logical_view(flow.scope.consistent_view())

    def np_kldivloss(np_input, np_target, np_log_target):
        # The reference now depends only on its own arguments; it used to
        # read the enclosing `target` / `log_target` instead.
        if np_log_target:
            np_kl_div_loss = np.exp(np_target) * (np_target - np_input)
        else:
            np_kl_div_out_loss = np_target * (np.log(np_target) - np_input)
            np_zeros = np.zeros_like(np_kl_div_out_loss, dtype=np.float32)
            # Keep the loss only where target > 0; everywhere else use 0 so
            # that log(0) cannot leak a `nan` into the result.
            np_kl_div_loss = np.where(np_target > 0, np_kl_div_out_loss,
                                      np_zeros)

        return {
            "np_kldivloss": np_kl_div_loss,
            "np_kldivloss_mean": np.mean(np_kl_div_loss),
            "np_kldivloss_sum": np.sum(np_kl_div_loss),
        }

    np_out_kldivloss_dict = np_kldivloss(input_arr, target_arr, log_target)

    def np_kldivloss_diff(np_input, np_target, np_log_target):
        elem_cnt = np_input.size
        if np_log_target:
            _np_diff = -np.exp(np_target)
        else:
            _np_diff = -np_target
            # With np_log_target == False the loss is zeroed wherever
            # target <= 0, so the gradient is masked the same way.
            _zero_index = np.where(np_target > 0, 1, 0)
            _np_diff = _np_diff * _zero_index

        return {
            "np_kldivloss_grad": _np_diff,
            "np_kldivloss_grad_mean": _np_diff / elem_cnt,
        }

    np_grad_dict = np_kldivloss_diff(input_arr, target_arr, log_target)

    def assert_prediction_grad(blob: tp.Numpy):
        # Validate the gradient of the mean-reduced loss w.r.t. the input.
        assert np.allclose(blob,
                           np_grad_dict["np_kldivloss_grad_mean"],
                           atol=1e-4)

    @flow.global_function(
        type="train",
        function_config=func_config,
    )
    def oneflow_kldivloss(
            of_input: tp.Numpy.Placeholder(shape=input_arr.shape),
            of_target: tp.Numpy.Placeholder(shape=target_arr.shape),
    ) -> Dict[str, tp.Numpy]:
        with flow.scope.placement(device_type, "0:0"):
            # Zero variable so the fed input takes part in back-propagation.
            v = flow.get_variable(
                shape=input_arr.shape,
                dtype=flow.float32,
                initializer=flow.zeros_initializer(),
                name="x_var",
            )
            of_input = of_input + v

        flow.watch_diff(of_input, assert_prediction_grad)

        of_kldivloss = flow.nn.KLDivLoss(
            of_input,
            of_target,
            log_target=log_target,
            reduction="none",
            name="kldivloss",
        )
        of_kldivloss_mean = flow.nn.KLDivLoss(
            of_input,
            of_target,
            log_target=log_target,
            reduction="mean",
            name="kldivloss_mean",
        )
        of_kldivloss_sum = flow.nn.KLDivLoss(
            of_input,
            of_target,
            log_target=log_target,
            reduction="sum",
            name="kldivloss_sum",
        )

        with flow.scope.placement(device_type, "0:0"):
            # Only the mean-reduced loss is optimised, which is why the
            # watched gradient is compared with the "_grad_mean" reference.
            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-3]), momentum=0).minimize(of_kldivloss_mean)

        return {
            "of_kldivloss": of_kldivloss,
            "of_kldivloss_mean": of_kldivloss_mean,
            "of_kldivloss_sum": of_kldivloss_sum,
        }

    of_out_kldivloss_dict = oneflow_kldivloss(input_arr, target_arr)

    assert np.allclose(
        of_out_kldivloss_dict["of_kldivloss"],
        np_out_kldivloss_dict["np_kldivloss"],
        atol=1e-5,
    )
    assert np.allclose(
        of_out_kldivloss_dict["of_kldivloss_mean"],
        np_out_kldivloss_dict["np_kldivloss_mean"],
    )
    assert np.allclose(
        of_out_kldivloss_dict["of_kldivloss_sum"],
        np_out_kldivloss_dict["np_kldivloss_sum"],
    )
def compare_with_tensorflow(test_case, device_type, x_shape, filters,
                            kernel_size, groups):
    """Compare ``flow.layers.conv2d`` (forward and gradients) with TensorFlow.

    A OneFlow train job builds an NCHW convolution over a randomly
    initialised variable ``x``. The watched tensors are then replayed through
    TensorFlow in NHWC layout, and the output, the input gradient and the
    weight gradient are compared with ``np.allclose``.

    Args:
        test_case: object providing ``assertTrue`` (unittest style).
        device_type: "gpu" or "cpu".
        x_shape: shape of the input variable, indexed as NCHW.
        filters: number of output channels.
        kernel_size: kernel side length, used for both spatial dimensions.
        groups: number of convolution groups.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    @flow.global_function(type="train", function_config=func_config)
    def ConvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            loss = flow.layers.conv2d(
                x,
                filters,
                kernel_size=kernel_size,
                strides=[1, 1],
                padding="valid",
                data_format="NCHW",
                dilation_rate=1,
                groups=groups,
                use_bias=False,
                kernel_initializer=flow.random_uniform_initializer(
                    minval=0, maxval=100),
                weight_name="conv2d_weight",
            )
            # Fetch the kernel created by the layer (same variable name) so
            # that its value and gradient can be watched.
            weight_shape = (filters, x.shape[1] // groups, kernel_size,
                            kernel_size)
            weight = flow.get_variable(
                name="conv2d_weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
            )
            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-4]), momentum=0).minimize(loss)
            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(weight, test_global_storage.Setter("weight"))
            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    def _fail_info(of_arr, tf_arr):
        # Shared diagnostic so every failing comparison reports its error.
        return "\nshape (of vs. tf): {} vs. {}\nmax_abs_diff: {}".format(
            of_arr.shape, tf_arr.shape, np.max(np.absolute(of_arr - tf_arr)))

    # OneFlow
    of_out = ConvJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        # NCHW -> NHWC for the input, OIHW -> HWIO for the kernel.
        x = tf.Variable(test_global_storage.Get("x").transpose(0, 2, 3, 1))
        assert groups > 0
        assert x_shape[1] % groups == 0
        assert filters % groups == 0
        weight = tf.Variable(
            test_global_storage.Get("weight").transpose(2, 3, 1, 0))
        if groups == 1:
            tf_out = tf.nn.conv2d(x,
                                  weight,
                                  strides=[1, 1, 1, 1],
                                  padding="VALID",
                                  data_format="NHWC")
        else:
            tf_out = grouped_convolution2D(x,
                                           weight,
                                           padding="VALID",
                                           num_groups=groups)

    loss_diff = test_global_storage.Get("loss_diff").transpose(0, 2, 3, 1)
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)

    # Forward output.
    of_out_np = of_out.numpy().transpose(0, 2, 3, 1)
    tf_out_np = tf_out.numpy()
    test_case.assertTrue(
        np.allclose(of_out_np, tf_out_np, rtol=1e-5, atol=1e-5),
        _fail_info(of_out_np, tf_out_np))

    # Gradient w.r.t. the input.
    of_x_diff_arr = test_global_storage.Get("x_diff").transpose(0, 2, 3, 1)
    tf_x_diff_arr = tf_x_diff.numpy()
    test_case.assertTrue(
        np.allclose(of_x_diff_arr, tf_x_diff_arr, rtol=1e-5, atol=1e-4),
        _fail_info(of_x_diff_arr, tf_x_diff_arr))

    # Gradient w.r.t. the kernel.
    of_weight_diff_arr = test_global_storage.Get("weight_diff").transpose(
        2, 3, 1, 0)
    tf_weight_diff_arr = tf_weight_diff.numpy()
    test_case.assertTrue(
        np.allclose(of_weight_diff_arr,
                    tf_weight_diff_arr,
                    rtol=1e-5,
                    atol=1e-5),
        _fail_info(of_weight_diff_arr, tf_weight_diff_arr))
def test_ccrelu_2n1c(test_case):
    """Run ``fixed_tensor_def_test`` with a consistent-view FunctionConfig."""
    consistent_view = flow.scope.consistent_view()
    job_config = flow.FunctionConfig()
    job_config.default_logical_view(consistent_view)
    fixed_tensor_def_test(test_case, job_config)
def test_mirror_ccrelu(test_case):
    """Run ``mirrored_tensor_def_test`` with a mirrored-view FunctionConfig."""
    mirrored_view = flow.scope.mirrored_view()
    job_config = flow.FunctionConfig()
    job_config.default_logical_view(mirrored_view)
    mirrored_tensor_def_test(test_case, job_config)
def main(args):
    """Train VGG with OneFlow and save the per-iteration training loss.

    Defines a train job and an eval job, initialises or loads a checkpoint,
    runs ``args.iter_num`` training iterations (saving a checkpoint every 100
    iterations) and finally writes the collected losses to
    ``./of_loss/vgg16/<nodes>n<total_gpus>c.npy``.

    Args:
        args: namespace providing ``num_nodes``, ``gpu_num_per_node``,
            ``enable_auto_mixed_precision``, ``train_dir``, ``eval_dir``,
            ``model_load_dir`` and ``iter_num``.
    """
    flow.config.machine_num(args.num_nodes)
    flow.config.gpu_device_num(args.gpu_num_per_node)

    train_config = flow.FunctionConfig()
    train_config.default_logical_view(flow.scope.consistent_view())
    train_config.default_data_type(flow.float)
    train_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)

    @flow.global_function(type="train", function_config=train_config)
    def vgg_train_job():
        (labels, images) = _data_load_layer(args, args.train_dir)
        to_return = vgg(images, labels)
        loss = to_return[-1]
        flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
            [], [0.00001]), momentum=0).minimize(loss)
        return loss

    eval_config = flow.FunctionConfig()
    eval_config.default_logical_view(flow.scope.consistent_view())
    eval_config.default_data_type(flow.float)
    eval_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)

    # The eval job is defined (and thereby registered) but is not invoked
    # below.
    @flow.global_function(function_config=eval_config)
    def vgg_eval_job():
        (labels, images) = _data_load_layer(args, args.eval_dir)
        return vgg(images, labels, False)

    check_point = flow.train.CheckPoint()
    if not args.model_load_dir:
        check_point.init()
    else:
        check_point.load(args.model_load_dir)

    num_nodes = args.num_nodes
    print("Traning vgg16: num_gpu_per_node = {}, num_nodes = {}.".format(
        args.gpu_num_per_node, num_nodes))
    print("{:>12} {:>12} {:>12}".format("iter", "loss type", "loss value"))

    fmt_str = "{:>12} {:>12} {:>12.6f}"
    loss = []
    for i in range(args.iter_num):
        train_loss = vgg_train_job().get().mean()
        loss.append(train_loss)
        print(fmt_str.format(i, "train loss:", train_loss))
        if (i + 1) % 100 == 0:
            check_point.save(_MODEL_SAVE_DIR + str(i))

    # save loss to file
    loss_file = "{}n{}c.npy".format(str(num_nodes),
                                    str(args.gpu_num_per_node * num_nodes))
    loss_path = "./of_loss/vgg16"
    # exist_ok avoids the check-then-create race of exists() + makedirs().
    os.makedirs(loss_path, exist_ok=True)
    numpy.save(os.path.join(loss_path, loss_file), loss)
def compare_with_tensorflow(device_type,
                            x_shape,
                            filters,
                            kernel_size,
                            groups,
                            padding="VALID",
                            stride=1):
    """Check ``flow.nn.conv2d`` (output and gradients) against TensorFlow.

    The OneFlow job convolves a random NCHW variable ``x`` with a random
    kernel; the watched values are transposed to NHWC/HWIO and fed through
    ``tf.nn.conv2d`` (or ``grouped_convolution2D`` when ``groups`` != 1).

    Args:
        device_type: "gpu" or "cpu".
        x_shape: input shape, indexed as NCHW.
        filters: number of output channels.
        kernel_size: kernel side length, used for both spatial dimensions.
        groups: number of convolution groups.
        padding: padding mode passed to both frameworks.
        stride: stride used for both spatial dimensions.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    job_config = flow.FunctionConfig()
    job_config.default_data_type(flow.float)
    job_config.train.primary_lr(1e-4)
    job_config.train.model_update_conf(dict(naive_conf={}))

    @flow.global_function(job_config)
    def ConvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            in_channels_per_group = int(x.shape[1] / groups)
            weight = flow.get_variable(
                "conv-weight",
                shape=(filters, in_channels_per_group, kernel_size,
                       kernel_size),
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
            )
            loss = flow.nn.conv2d(
                x,
                weight,
                strides=[stride, stride],
                padding=padding,
                data_format="NCHW",
                dilations=[1, 1],
                groups=groups,
            )
            flow.losses.add_loss(loss)

            # Record values and gradients for the TensorFlow replay.
            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(weight, test_global_storage.Setter("weight"))
            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    # OneFlow
    check_point = flow.train.CheckPoint()
    check_point.init()
    of_out = ConvJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x").transpose(0, 2, 3, 1))
        assert groups > 0
        assert x_shape[1] % groups == 0
        assert filters % groups == 0
        # The kernel is converted identically for both code paths.
        weight = tf.Variable(
            test_global_storage.Get("weight").transpose(2, 3, 1, 0))
        if groups != 1:
            tf_out = grouped_convolution2D(x,
                                           weight,
                                           padding=padding,
                                           num_groups=groups)
        else:
            tf_out = tf.nn.conv2d(
                x,
                weight,
                strides=[1, stride, stride, 1],
                padding=padding,
                data_format="NHWC",
            )

    loss_diff = test_global_storage.Get("loss_diff").transpose(0, 2, 3, 1)
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)

    of_out_nhwc = of_out.numpy().transpose(0, 2, 3, 1)
    tf_out_np = tf_out.numpy()
    max_diff = np.max(np.absolute(of_out_nhwc - tf_out_np))
    assert np.allclose(of_out_nhwc, tf_out_np, rtol=1e-5,
                       atol=1e-5), max_diff

    of_x_diff = test_global_storage.Get("x_diff").transpose(0, 2, 3, 1)
    assert np.allclose(of_x_diff, tf_x_diff.numpy(), rtol=1e-4, atol=1e-4)

    of_weight_diff = test_global_storage.Get("weight_diff").transpose(
        2, 3, 1, 0)
    assert np.allclose(of_weight_diff,
                       tf_weight_diff.numpy(),
                       rtol=1e-5,
                       atol=1e-5)
def test(self, model_path):
    """Evaluate a saved model on the test set and print mean PSNR / SSIM.

    Three predict jobs are defined (feature extraction, search-and-transfer,
    main network), the checkpoint at ``model_path`` is loaded, and every
    sample of the test set is pushed through the pipeline.

    Args:
        model_path: checkpoint directory passed to ``CheckPoint.load``.
    """
    # download data
    # NOTE(review): relies on a module-level ``args`` object.
    val_data = TestSet(args)

    # config
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.double)
    flow.config.gpu_device_num(self.gpu_num_per_node)
    flow.config.enable_debug_mode(True)

    @flow.global_function(type="predict", function_config=func_config)
    def eval_lte(
        input: tp.Numpy.Placeholder((1, 3, 160, 160))
    ) -> Tuple[tp.Numpy, tp.Numpy, tp.Numpy]:
        x_lv1, x_lv2, x_lv3 = self.LTE(input, trainable=False)
        return x_lv1, x_lv2, x_lv3

    @flow.global_function(type="predict", function_config=func_config)
    def eval_searchtransfer(
        lrsr_lv3_unfold: tp.Numpy.Placeholder((1, 2304, 1600)),
        refsr_lv3_unfold: tp.Numpy.Placeholder((1, 2304, 1600)),
        ref_lv3_unfold: tp.Numpy.Placeholder((1, 2304, 1600)),
        ref_lv2_unfold: tp.Numpy.Placeholder((1, 4608, 1600)),
        ref_lv1_unfold: tp.Numpy.Placeholder((1, 9216, 1600)),
    ) -> Tuple[tp.Numpy, tp.Numpy, tp.Numpy, tp.Numpy]:
        refsr_lv3_unfold = flow.transpose(refsr_lv3_unfold, perm=[0, 2, 1])
        # [N, Hr*Wr, C*k*k]
        refsr_lv3_unfold = flow.math.l2_normalize(refsr_lv3_unfold, axis=2)
        # [N, C*k*k, H*W]
        lrsr_lv3_unfold = flow.math.l2_normalize(lrsr_lv3_unfold, axis=1)

        # [N, Hr*Wr, H*W]
        R_lv3 = flow.matmul(refsr_lv3_unfold, lrsr_lv3_unfold)
        R_lv3_star = flow.math.reduce_max(R_lv3, axis=1)  # [N, H*W]
        R_lv3_star_arg = flow.math.argmax(R_lv3, axis=1)  # [N, H*W]

        T_lv3_unfold = self.bis(ref_lv3_unfold, R_lv3_star_arg)
        T_lv2_unfold = self.bis(ref_lv2_unfold, R_lv3_star_arg)
        T_lv1_unfold = self.bis(ref_lv1_unfold, R_lv3_star_arg)
        return R_lv3_star, T_lv3_unfold, T_lv2_unfold, T_lv1_unfold

    @flow.global_function(type="predict", function_config=func_config)
    def eval_mainnet(
        lr: tp.Numpy.Placeholder((1, 3, 40, 40)),
        S: tp.Numpy.Placeholder((1, 1, 40, 40)),
        T_lv3: tp.Numpy.Placeholder((1, 256, 40, 40)),
        T_lv2: tp.Numpy.Placeholder((1, 128, 80, 80)),
        T_lv1: tp.Numpy.Placeholder((1, 64, 160, 160)),
    ) -> tp.Numpy:
        sr = self.mainnet(lr, S, T_lv3, T_lv2, T_lv1, trainable=False)
        return sr

    check_point = flow.train.CheckPoint()
    check_point.load(model_path)

    def _batched(sample, key):
        # Add a leading batch axis and force a contiguous buffer.
        return np.ascontiguousarray(sample[key][np.newaxis, :])

    psnr_total, ssim_total = 0., 0.
    val_batch_num = len(val_data)
    for batch_idx in range(val_batch_num):
        sample = val_data[batch_idx]
        lr = _batched(sample, 'LR')
        lr_sr = _batched(sample, 'LR_sr')
        hr = _batched(sample, 'HR')
        ref = _batched(sample, 'Ref')
        ref_sr = _batched(sample, 'Ref_sr')

        # Inputs are shifted from [-1, 1] to [0, 1] before feature
        # extraction.
        _, _, lrsr_lv3 = eval_lte((lr_sr + 1.) / 2.)
        _, _, refsr_lv3 = eval_lte((ref_sr + 1.) / 2.)
        ref_lv1, ref_lv2, ref_lv3 = eval_lte((ref + 1.) / 2.)

        ### search
        lrsr_lv3_unfold = self.unfold(lrsr_lv3)
        refsr_lv3_unfold = self.unfold(refsr_lv3)

        ### transfer
        ref_lv3_unfold = self.unfold(ref_lv3)
        ref_lv2_unfold = self.unfold(ref_lv2,
                                     kernel_size=6,
                                     padding=2,
                                     stride=2)
        ref_lv1_unfold = self.unfold(ref_lv1,
                                     kernel_size=12,
                                     padding=4,
                                     stride=4)

        (R_lv3_star, T_lv3_unfold, T_lv2_unfold,
         T_lv1_unfold) = eval_searchtransfer(lrsr_lv3_unfold,
                                             refsr_lv3_unfold,
                                             ref_lv3_unfold, ref_lv2_unfold,
                                             ref_lv1_unfold)

        base_h = lrsr_lv3.shape[2]
        base_w = lrsr_lv3.shape[3]
        T_lv3 = self.fold(T_lv3_unfold,
                          output_size=lrsr_lv3.shape[-2:],
                          kernel_size=3,
                          padding=1,
                          stride=1) / (3. * 3.)
        T_lv2 = self.fold(T_lv2_unfold,
                          output_size=(base_h * 2, base_w * 2),
                          kernel_size=6,
                          padding=2,
                          stride=2) / (3. * 3.)
        T_lv1 = self.fold(T_lv1_unfold,
                          output_size=(base_h * 4, base_w * 4),
                          kernel_size=12,
                          padding=4,
                          stride=4) / (3. * 3.)

        S = np.reshape(R_lv3_star,
                       [R_lv3_star.shape[0], 1, base_h, base_w])

        sr = eval_mainnet(lr, S, T_lv3, T_lv2, T_lv1)
        # sr: range [-1, 1]
        # hr: range [-1, 1]

        ### prepare data
        sr = (sr + 1.) * 127.5
        hr = (hr + 1.) * 127.5
        sr = np.transpose(np.round(np.squeeze(sr)), (1, 2, 0))
        hr = np.transpose(np.round(np.squeeze(hr)), (1, 2, 0))

        ### calculate psnr and ssim
        psnr_total += self.calc_psnr(sr, hr)
        ssim_total += self.calc_ssim(sr, hr)

    val_psnr = psnr_total / val_batch_num
    val_ssim = ssim_total / val_batch_num
    print("****************** evalute *****************")
    print("val_psnr:{}, val_ssim:{}.".format(val_psnr, val_ssim))
def compare_with_tensorflow(device_type, x_shape, data_type, axis):
    """Check ``flow.nn.softmax`` (output and input gradient) against TF.

    For ``data_type == "float16"`` the variable is kept in float32 and cast
    to float16 around the softmax; the comparison tolerance is relaxed to
    1e-3 in that case (1e-5 otherwise).

    Args:
        device_type: "gpu" or "cpu".
        x_shape: shape of the random input variable.
        data_type: type name; looked up in ``type_name_to_flow_type`` unless
            it is "float16".
        axis: axis along which softmax is taken.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    job_config = flow.FunctionConfig()

    is_half = data_type == "float16"
    var_dtype = flow.float if is_half else type_name_to_flow_type[data_type]

    @flow.global_function(type="train", function_config=job_config)
    def SoftmaxJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=var_dtype,
                initializer=flow.random_uniform_initializer(minval=-0.1,
                                                            maxval=0.1),
                trainable=True,
            )
            if is_half:
                half_x = flow.cast(x, dtype=flow.float16)
                loss = flow.cast(flow.nn.softmax(half_x, axis=axis),
                                 dtype=flow.float)
            else:
                loss = flow.nn.softmax(x, axis=axis)

            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-4]), momentum=0).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    # OneFlow
    check_point = flow.train.CheckPoint()
    check_point.init()
    of_out = SoftmaxJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x"))
        tf_out = tf.nn.softmax(x, axis=axis)

    loss_diff = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)

    tolerance = 1e-3 if is_half else 1e-5
    assert np.allclose(of_out.numpy(),
                       tf_out.numpy(),
                       rtol=tolerance,
                       atol=tolerance)
    assert np.allclose(
        test_global_storage.Get("x_diff"),
        tf_x_diff.numpy(),
        rtol=tolerance,
        atol=tolerance,
    )
def summary_demo():
    """Exercise the ``flow.summary`` writers against a temporary log dir.

    Defines one mirrored-view job per summary kind (scalar, histogram,
    protobuf, image), then writes text/hparams protobufs, scalars,
    histograms, projector data, images and the structure graph, and finally
    flushes the summary writer. Everything runs inside a
    ``tempfile.TemporaryDirectory`` context.
    """
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.mirrored_view())

    def _as_int8(text):
        # Binary-mode np.fromstring is deprecated; np.frombuffer is the
        # documented replacement. The copy keeps the array writable, as
        # np.fromstring's result was.
        return np.frombuffer(text.encode(), dtype=np.int8).copy()

    with tempfile.TemporaryDirectory() as logdir:

        @flow.global_function(function_config=func_config)
        def CreateWriter():
            flow.summary.create_summary_writer(logdir)

        @flow.global_function(function_config=func_config)
        def ScalarJob(
                value: flow.typing.ListNumpy.Placeholder((1, ),
                                                         dtype=flow.float),
                step: flow.typing.ListNumpy.Placeholder((1, ),
                                                        dtype=flow.int64),
                tag: flow.typing.ListNumpy.Placeholder((1000, ),
                                                       dtype=flow.int8),
        ):
            flow.summary.scalar(value, step, tag)

        @flow.global_function(function_config=func_config)
        def HistogramJob(
                value: flow.typing.ListNumpy.Placeholder((200, 200, 200),
                                                         dtype=flow.float),
                step: flow.typing.ListNumpy.Placeholder((1, ),
                                                        dtype=flow.int64),
                tag: flow.typing.ListNumpy.Placeholder((9, ),
                                                       dtype=flow.int8),
        ):
            flow.summary.histogram(value, step, tag)

        @flow.global_function(function_config=func_config)
        def PbJob(
                value: flow.typing.ListNumpy.Placeholder((1500, ),
                                                         dtype=flow.int8),
                step: flow.typing.ListNumpy.Placeholder((1, ),
                                                        dtype=flow.int64),
        ):
            flow.summary.pb(value, step=step)

        @flow.global_function(function_config=func_config)
        def ImageJob(
                value: flow.typing.ListNumpy.Placeholder(
                    shape=(100, 2000, 2000, 4), dtype=flow.uint8),
                step: flow.typing.ListNumpy.Placeholder((1, ),
                                                        dtype=flow.int64),
                tag: flow.typing.ListNumpy.Placeholder((10, ),
                                                       dtype=flow.int8),
        ):
            flow.summary.image(value, step=step, tag=tag)

        @flow.global_function(function_config=func_config)
        def FlushJob():
            flow.summary.flush_summary_writer()

        CreateWriter()
        projecotr = flow.summary.Projector(logdir)
        projecotr.create_embedding_projector()
        projecotr.create_exception_projector()

        hparams = {
            flow.summary.HParam("learning_rate",
                                flow.summary.RealRange(1e-2, 1e-1)):
            0.02,
            flow.summary.HParam("dense_layers",
                                flow.summary.IntegerRange(2, 7)):
            5,
            flow.summary.HParam("optimizer",
                                flow.summary.ValueSet(["adam", "sgd"])):
            "adam",
            flow.summary.HParam("accuracy",
                                flow.summary.RealRange(1e-2, 1e-1)):
            0.001,
            flow.summary.HParam("magic",
                                flow.summary.ValueSet([False, True])):
            True,
            flow.summary.Metric("loss", float):
            0.02,
            "dropout":
            0.6,
        }

        # Text and hparams summaries go through the generic protobuf job.
        for i in range(200):
            t = ["vgg16", "resnet50", "mask-rcnn", "yolov3"]
            pb = flow.summary.text(t)
            value = _as_int8(str(pb))
            step = np.array([i], dtype=np.int64)
            PbJob([value], [step])

            pb2 = flow.summary.hparams(hparams)
            value = _as_int8(str(pb2))
            step = np.array([i], dtype=np.int64)
            PbJob([value], [step])

        for idx in range(100):
            value = np.array([idx], dtype=np.float32)
            step = np.array([idx], dtype=np.int64)
            tag = _as_int8("scalar")
            ScalarJob([value], [step], [tag])

        for idx in range(20):
            value = np.random.rand(100, 100, 100).astype(np.float32)
            step = np.array([idx], dtype=np.int64)
            tag = _as_int8("histogram")
            HistogramJob([value], [step], [tag])

        # NOTE(review): ``value_`` is never used; the projector calls below
        # receive ``value`` (the last histogram array). Possibly ``value_``
        # was intended - confirm before changing.
        value_ = np.random.rand(10, 10, 10).astype(np.float32)
        label = (np.random.rand(10) * 10).astype(np.int64)
        x = (np.random.rand(10, 10, 10) * 255).astype(np.uint8)
        sample_name = "sample"
        sample_type = "image"
        step = 1
        tag_exception = "exception_projector"
        tag_embedding = "embedding_projector"
        for i in range(20):
            projecotr.exception_projector(
                value=value,
                tag=tag_exception,
                step=step,
                sample_name=sample_name,
                sample_type=sample_type,
                x=x,
            )
            projecotr.embedding_projector(
                value=value,
                label=label,
                tag=tag_embedding,
                step=step,
                sample_name=sample_name,
                sample_type=sample_type,
                x=x,
            )

        images = [
            cv2.cvtColor(np.ones([512, 512], np.uint8),
                         cv2.COLOR_BGR2RGB).astype(np.uint8),
            cv2.cvtColor(np.ones([512, 512], np.uint8),
                         cv2.COLOR_BGR2RGB).astype(np.uint8),
        ]
        images = np.array(images, dtype=np.uint8)
        imageRed = np.ones([512, 512, 3]).astype(np.uint8)
        Red = np.array([0, 255, 255], dtype=np.uint8)
        imageNew = np.multiply(imageRed, Red)
        imageNew = np.expand_dims(imageNew, axis=0)
        images = np.concatenate((images, imageNew), axis=0)
        step = np.array([1], dtype=np.int64)
        tag = _as_int8("image")
        for i in range(20):
            ImageJob([images], [step], [tag])

        graph = flow.summary.Graph(logdir)
        graph.write_structure_graph()

        # Pause on both sides of the flush, as the original did.
        time.sleep(1)
        FlushJob()
        time.sleep(1)
def train(self, epochs):
    """Train the generator/discriminator pair and record loss histories.

    Loads the train/val ``.npy`` image arrays from ``self.data_dir``,
    defines one predict job and two train jobs, loads the VGG weights from
    ``self.vgg_path``, then alternates discriminator and generator updates
    for ``epochs`` epochs. After every epoch the validation set is scored
    (L2 error, SSIM, PSNR); once past epoch 50 the checkpoint with the best
    PSNR so far is kept. All histories are saved under ``self.loss_path``.

    Args:
        epochs: number of training epochs; also embedded in the names of the
            saved history files.
    """
    # download data npy
    hr_template = "{}_{}hr_imgs.npy"
    lr_template = "{}_{}lr_imgs.npy"
    train_hr_data_path = os.path.join(
        self.data_dir, hr_template.format("train", self.hr_size))
    train_lr_data_path = os.path.join(
        self.data_dir, lr_template.format("train", self.lr_size))
    val_hr_data_path = os.path.join(self.data_dir,
                                    hr_template.format("val", self.hr_size))
    val_lr_data_path = os.path.join(self.data_dir,
                                    lr_template.format("val", self.lr_size))

    train_hr_data = np.load(train_hr_data_path)
    train_lr_data = np.load(train_lr_data_path)
    val_hr_data = np.load(val_hr_data_path)
    val_lr_data = np.load(val_lr_data_path)

    expected_train_hr = (16700, 3, self.hr_size, self.hr_size)
    assert train_hr_data.shape == expected_train_hr, \
        "The shape of train_hr_data is {}".format(train_hr_data.shape)
    expected_val_lr = (425, 3, self.lr_size, self.lr_size)
    assert val_lr_data.shape == expected_val_lr, \
        "The shape of val_lr_data is {}".format(val_lr_data.shape)

    # save loss
    G_l2_loss = []
    G_gan_loss = []
    G_perceptual_loss = []
    G_tv_loss = []
    G_total_loss = []
    D_total_loss = []
    Val_l2_error = []
    Val_ssim = []
    Val_psnr = []

    # config
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    flow.config.gpu_device_num(self.gpu_num_per_node)
    # train config
    lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [self.lr])

    lr_batch_shape = (self.batch_size, 3, self.lr_size, self.lr_size)
    hr_batch_shape = (self.batch_size, 3, self.hr_size, self.hr_size)

    @flow.global_function(type="predict", function_config=func_config)
    def eval_generator(
            input: tp.Numpy.Placeholder(lr_batch_shape)) -> tp.Numpy:
        g_out = self.Generator(input, trainable=False)
        return g_out

    @flow.global_function(type="train", function_config=func_config)
    def train_generator(
        input: tp.Numpy.Placeholder(lr_batch_shape),
        target: tp.Numpy.Placeholder(hr_batch_shape)
    ) -> Tuple[tp.Numpy, tp.Numpy, tp.Numpy, tp.Numpy, tp.Numpy, tp.Numpy]:
        g_out = self.Generator(input, trainable=True)
        g_logits = self.Discriminator(g_out, trainable=False)

        # Adversarial Loss
        g_gan_loss = 0.001 * flow.math.reduce_mean(1 - g_logits)
        # Image Loss
        g_l2_loss = self.mseloss(g_out, target)
        # TV Loss
        g_tv_loss = self.total_variance_loss(g_out, weight=2e-8)

        # Perceptual loss
        def perceptual_loss(fake, real, weight=1.0):
            fake_feature = self.vgg16bn(fake, trainable=False)
            real_feature = self.vgg16bn(real, trainable=False, reuse=True)
            return self.mseloss(fake_feature, real_feature, weight=weight)

        g_perceptual_loss = perceptual_loss(g_out, target, weight=0.006)

        g_total_loss = (g_l2_loss + g_gan_loss + g_perceptual_loss +
                        g_tv_loss)
        flow.optimizer.Adam(lr_scheduler, beta1=0.5,
                            beta2=0.999).minimize(g_total_loss)
        return (g_l2_loss, g_gan_loss, g_perceptual_loss, g_tv_loss,
                g_total_loss, g_out)

    @flow.global_function(type="train", function_config=func_config)
    def train_discriminator(
        input: tp.Numpy.Placeholder(lr_batch_shape),
        target: tp.Numpy.Placeholder(hr_batch_shape)
    ) -> tp.Numpy:
        g_out = self.Generator(input, trainable=False)
        g_logits = self.Discriminator(g_out, trainable=True)
        d_logits = self.Discriminator(target, trainable=True, reuse=True)
        d_loss = 1 - flow.math.reduce_mean(d_logits - g_logits)
        flow.optimizer.Adam(lr_scheduler, beta1=0.5,
                            beta2=0.999).minimize(d_loss)
        return d_loss

    # load trained weight of vgg16bn and initialize automatically GAN model
    flow.load_variables(flow.checkpoint.get(self.vgg_path))
    # trained weights of vgg need to be changed, because vgg is used twice
    # like Discriminator. Please use weights in of_vgg16bn_reuse path to load
    # vgg for perceptual loss.
    # flow.checkpoint.save("vgg_checkpoint")

    batch_num = len(train_hr_data) // self.batch_size
    pre_best, best_psnr = -1, 0
    print("****************** start training *****************")
    for epoch_idx in range(epochs):
        start = time.time()
        print("****************** train *****************")
        for batch_idx in range(batch_num):
            first = batch_idx * self.batch_size
            last = (batch_idx + 1) * self.batch_size
            inputs = train_lr_data[first:last].astype(np.float32, order="C")
            target = train_hr_data[first:last].astype(np.float32, order="C")

            # Discriminator step first, then generator step.
            d_loss = train_discriminator(inputs, target)
            (g_l2_loss, g_gan_loss, g_perceptual_loss, g_tv_loss,
             g_total_loss, g_out) = train_generator(inputs, target)

            d_loss = d_loss.mean()
            g_l2_loss = g_l2_loss.mean()
            g_gan_loss = g_gan_loss.mean()
            g_perceptual_loss = g_perceptual_loss.mean()
            g_tv_loss = g_tv_loss.mean()
            g_total_loss = g_total_loss.mean()

            if (batch_idx + 1) % self.print_interval == 0:
                print(
                    "{}th epoch, {}th batch, g_l2_loss:{}, g_gan_loss:{}, g_perceptual_loss:{}, g_tv_loss:{}, gloss:{}, dloss:{} "
                    .format(epoch_idx + 1, batch_idx + 1, g_l2_loss,
                            g_gan_loss, g_perceptual_loss, g_tv_loss,
                            g_total_loss, d_loss))

                G_l2_loss.append(g_l2_loss)
                G_gan_loss.append(g_gan_loss)
                G_perceptual_loss.append(g_perceptual_loss)
                G_tv_loss.append(g_tv_loss)
                G_total_loss.append(g_total_loss)
                D_total_loss.append(d_loss)

        print("Time for epoch {} is {} sec.".format(epoch_idx + 1,
                                                    time.time() - start))

        # Validation interval is 1, i.e. this runs after every epoch.
        if (epoch_idx + 1) % 1 == 0:
            # save train images
            # self.save_images(g_out, inputs, target, epoch_idx, name="train")

            # save val images, trainable = False
            # and calculate MSE, SSIMs, SSIM, PSNR
            val_l2_error, val_ssim, val_psnr = 0, 0, 0
            val_batch_num = len(val_hr_data) // self.batch_size
            for val_batch_idx in range(val_batch_num):
                v_first = val_batch_idx * self.batch_size
                v_last = (val_batch_idx + 1) * self.batch_size
                val_inputs = val_lr_data[v_first:v_last].astype(np.float32,
                                                                order="C")
                val_target = val_hr_data[v_first:v_last].astype(np.float32,
                                                                order="C")
                val_g_out = eval_generator(val_inputs)

                val_l2_error += (np.square(val_g_out - val_target).mean())
                val_ssim += self.ssim(val_target.transpose(0, 2, 3, 1),
                                      val_g_out.transpose(0, 2, 3, 1))
                # val_ssims += (pytorch_ssim.ssim(val_g_out, val_target, oneflow=True).item())
                val_psnr += self.psnr(val_target.transpose(0, 2, 3, 1),
                                      val_g_out.transpose(0, 2, 3, 1))

            # save val images
            self.save_images(val_g_out,
                             val_inputs,
                             val_target,
                             epoch_idx,
                             name="val")

            val_l2_error = val_l2_error / val_batch_num
            val_ssim = val_ssim / val_batch_num
            val_psnr = val_psnr / val_batch_num
            # val_psnr = 10 * np.log10(1 / val_l2_error)

            Val_l2_error.append(val_l2_error)
            Val_ssim.append(val_ssim)
            Val_psnr.append(val_psnr)
            print("****************** evalute *****************")
            print(
                "{}th epoch, {}th batch, val_l2_error:{}, val_ssim:{}, val_psnr:{}."
                .format(epoch_idx + 1, batch_idx + 1, val_l2_error, val_ssim,
                        val_psnr))

            if epoch_idx + 1 > 50 and val_psnr > best_psnr:
                best_psnr = val_psnr
                if pre_best != -1:
                    # delete the previous best checkpoint
                    print("delete the previous best {}th epoch model".format(
                        pre_best))
                    shutil.rmtree(
                        os.path.join(self.checkpoint_path,
                                     "{}th_epoch".format(pre_best)))

                # save parameters
                flow.checkpoint.save(
                    os.path.join(self.checkpoint_path,
                                 "{}th_epoch".format(epoch_idx + 1)))
                pre_best = epoch_idx + 1
                print("save the best {}th epoch model at {}.".format(
                    epoch_idx + 1,
                    str(datetime.now().strftime("%Y-%m-%d-%H:%M:%S"))))

    # save train loss and val error to plot
    histories = [
        ('G_l2_loss_{}.npy', G_l2_loss),
        ('G_gan_loss_{}.npy', G_gan_loss),
        ('G_perceptual_loss_{}.npy', G_perceptual_loss),
        ('G_tv_loss_{}.npy', G_tv_loss),
        ('G_total_loss_{}.npy', G_total_loss),
        ('D_total_loss_{}.npy', D_total_loss),
        ('Val_l2_error_{}.npy', Val_l2_error),
        ('Val_ssim_{}.npy', Val_ssim),
        ('Val_psnr_{}.npy', Val_psnr),
    ]
    for file_template, history in histories:
        np.save(os.path.join(self.loss_path, file_template.format(epochs)),
                history)
    print("*************** Train {} done ***************** ".format(
        self.path))
def compare_with_tensorflow(device_type, data_type, x_shape, case):
    """Check tensor-by-scalar-tensor arithmetic against TensorFlow.

    The OneFlow job combines a random variable ``x`` of shape ``x_shape``
    with a random one-element variable ``y`` using the operation selected by
    ``case``; output and both gradients are compared to TensorFlow.

    Args:
        device_type: "gpu" or "cpu".
        data_type: accepted for signature compatibility; not read here.
        x_shape: shape of ``x``.
        case: one of "add", "sub", "mul", "div".
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    job_config = flow.FunctionConfig()
    job_config.default_data_type(flow.float)

    @flow.global_function(type="train", function_config=job_config)
    def ScalarAddByTensorJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            y = flow.get_variable(
                "y",
                shape=(1,),
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0,
                                                            maxval=100),
                trainable=True,
            )
            if case == "add":
                loss = flow.math.add(x, y)
            elif case == "sub":
                loss = flow.math.subtract(x, y)
            elif case == "mul":
                loss = flow.math.multiply(x, y)
            elif case == "div":
                loss = flow.math.divide(x, y)

            scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(scheduler, momentum=0).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch(y, test_global_storage.Setter("y"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch_diff(y, test_global_storage.Setter("y_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            return loss

    # OneFlow
    check_point = flow.train.CheckPoint()
    check_point.init()
    of_out = ScalarAddByTensorJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x"))
        y = tf.Variable(test_global_storage.Get("y"))
        if case == "add":
            tf_out = x + y
        elif case == "sub":
            tf_out = x - y
        elif case == "mul":
            tf_out = x * y
        elif case == "div":
            tf_out = x / y

    upstream_grad = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, x, upstream_grad)
    tf_y_diff = tape.gradient(tf_out, y, upstream_grad)

    tol = dict(rtol=1e-5, atol=1e-5)
    assert np.allclose(of_out.numpy(), tf_out.numpy(), **tol)
    assert np.allclose(test_global_storage.Get("x_diff"), tf_x_diff.numpy(),
                       **tol)
    assert np.allclose(test_global_storage.Get("y_diff"), tf_y_diff.numpy(),
                       **tol)
def test_layer_norm(_):
    """Compare ``flow.nn.layer_norm`` with Keras ``LayerNormalization``.

    Iterates over the cartesian product produced by ``GenArgList`` (device,
    shape config, dtype, trainable, center, scale, epsilon, fuse flag). For
    each case it computes the TensorFlow reference for ``y`` and for the
    gradients of ``z = y + x``, then runs a OneFlow train job whose
    ``watch_diff`` callbacks assert gradient agreement, and finally compares
    the forward output. CPU cases with float16 or with
    ``fuse_add_to_output`` enabled are skipped.

    Args:
        _: unused (the test-case argument supplied by the runner).
    """
    confs = [
        {
            "x_shape": (40, 64),
            "begin_norm_axis": -1,
            "begin_params_axis": -1
        },
    ]
    arg_dict = OrderedDict()
    arg_dict["device_type"] = ["cpu", "gpu"]
    arg_dict["confs"] = confs
    arg_dict["data_type"] = ["float32", "float16"]
    arg_dict["trainable"] = [True, False]
    arg_dict["center"] = [True, False]
    arg_dict["scale"] = [True, False]
    arg_dict["epsilon"] = [1e-5, 1e-10]
    arg_dict["fuse_add_to_output"] = [True, False]

    for case in GenArgList(arg_dict):
        (
            device_type,
            confs,
            data_type,
            trainable,
            center,
            scale,
            epsilon,
            fuse_add_to_output,
        ) = case
        # float16 and fused add-to-output are only exercised on GPU.
        if device_type == "cpu" and (data_type == "float16"
                                     or fuse_add_to_output):
            continue

        x_shape = confs["x_shape"]
        begin_norm_axis = confs["begin_norm_axis"]
        begin_params_axis = confs["begin_params_axis"]
        flow.clear_default_session()
        assert (begin_norm_axis == begin_params_axis
                ), "tf doesn't support a dedicated begin_params_axis"

        # Random inputs
        if data_type == "float16":
            # Round-trip through float16 so the float32 input is exactly
            # representable in half precision.
            x = (np.random.uniform(low=-1, high=1, size=x_shape).astype(
                np.float16).astype(np.float32))
        else:
            x = np.random.uniform(low=-1, high=1, size=x_shape).astype(
                type_name_to_np_type[data_type])

        # TF results
        with tf.GradientTape(persistent=True) as tape:
            x_tf = tf.Variable(x)
            if data_type == "float16":
                x_tf = tf.cast(x_tf, dtype=tf.float16)
                tf.keras.backend.set_floatx("float16")
            layer = tf.keras.layers.LayerNormalization(
                axis=begin_norm_axis,
                epsilon=epsilon,
                center=center,
                scale=scale,
                beta_initializer="zeros",
                gamma_initializer="ones",
                beta_regularizer=None,
                gamma_regularizer=None,
                beta_constraint=None,
                gamma_constraint=None,
                trainable=trainable,
            )
            y_tf = layer(x_tf)
            z_tf = y_tf + x_tf

        if data_type == "float16":
            dx_tf = tape.gradient(
                z_tf, x_tf,
                tf.constant(1.0, shape=z_tf.shape, dtype=tf.float16))
        else:
            dx_tf = tape.gradient(z_tf, x_tf,
                                  tf.constant(1.0, shape=z_tf.shape))
        grad = tape.gradient(z_tf, layer.trainable_variables)
        if trainable:
            # Gamma, when present, is the first trainable variable and beta
            # follows it.
            if scale:
                tf_gamma_diff = grad[0]
            if center:
                tf_beta_diff = grad[1] if scale else grad[0]

        def assert_grad(b):
            diff = dx_tf.numpy() - b.numpy()
            max_diff = np.max(np.abs(diff))
            tolerance = 3e-3 if data_type == "float16" else 1e-5
            assert np.allclose(dx_tf.numpy(),
                               b.numpy(),
                               rtol=tolerance,
                               atol=tolerance), (
                                   case,
                                   max_diff,
                               )

        def assert_grad_gamma(b):
            diff = tf_gamma_diff.numpy() - b.numpy()
            max_diff = np.max(np.abs(diff))
            assert np.allclose(tf_gamma_diff.numpy(),
                               b.numpy(),
                               rtol=1e-4,
                               atol=1e-4), (
                                   case,
                                   max_diff,
                               )

        def assert_grad_beta(b):
            diff = tf_beta_diff.numpy() - b.numpy()
            max_diff = np.max(np.abs(diff))
            assert np.allclose(tf_beta_diff.numpy(),
                               b.numpy(),
                               rtol=1e-5,
                               atol=1e-5), (
                                   case,
                                   max_diff,
                               )

        # 1F results
        if data_type == "float16":
            dtype = flow.float
        else:
            dtype = type_name_to_flow_type[data_type]

        func_config = flow.FunctionConfig()
        func_config.default_data_type(flow.float)
        func_config.enable_fuse_add_to_output(fuse_add_to_output)

        @flow.global_function(type="train", function_config=func_config)
        def test_job(x: oft.Numpy.Placeholder(x_shape, dtype=dtype)):
            v = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=dtype,
                initializer=flow.constant_initializer(0),
                trainable=True,
            )
            flow.watch_diff(v, assert_grad)
            x += v
            if data_type == "float16":
                x = flow.cast(x, dtype=flow.float16)
            with flow.scope.placement(device_type, "0:0"):
                param_shape = x.shape[begin_params_axis:]
                gamma = None
                beta = None
                if center:
                    with flow.scope.namespace("LayerNorm"):
                        beta = flow.get_variable(
                            name="beta",
                            shape=param_shape,
                            dtype=flow.float,
                            initializer=flow.constant_initializer(0.0),
                            trainable=trainable,
                            model_name="beta",
                            reuse=False,
                        )
                        if trainable:
                            flow.watch_diff(beta, assert_grad_beta)
                        if data_type == "float16":
                            beta = flow.cast(beta, dtype=flow.float16)
                if scale:
                    with flow.scope.namespace("LayerNorm"):
                        gamma = flow.get_variable(
                            name="gamma",
                            shape=param_shape,
                            dtype=flow.float,
                            initializer=flow.constant_initializer(1.0),
                            trainable=trainable,
                            model_name="gamma",
                            reuse=False,
                        )
                        if trainable:
                            if data_type == "float16":
                                # The half-precision gamma gradient is
                                # stored and checked after the job finishes.
                                flow.watch_diff(
                                    gamma,
                                    test_global_storage.Setter("gamma_diff"))
                            else:
                                flow.watch_diff(gamma, assert_grad_gamma)
                        if data_type == "float16":
                            gamma = flow.cast(gamma, dtype=flow.float16)
                x = flow.identity(x)
                y = flow.nn.layer_norm(
                    x,
                    gamma=gamma,
                    beta=beta,
                    begin_norm_axis=begin_norm_axis,
                    begin_params_axis=begin_params_axis,
                    epsilon=epsilon,
                )
                z = y + x
            if data_type == "float16":
                y = flow.cast(y, dtype=flow.float)
                z = flow.cast(z, dtype=flow.float)
            flow.optimizer.SGD(flow.optimizer.PiecewiseConstantScheduler(
                [], [1e-4]), momentum=0).minimize(z)
            return y

        y = test_job(x).get()
        assert y.numpy().shape == y_tf.numpy().shape, (
            y.numpy().shape,
            y_tf.numpy().shape,
        )
        diff = y.numpy() - y_tf.numpy()
        max_diff = np.max(np.abs(diff))
        assert np.allclose(y.numpy(), y_tf.numpy(), rtol=1e-5, atol=2e-3), (
            case,
            max_diff,
        )

        if data_type == "float16" and trainable and scale:
            # Reference gamma gradient for an all-ones upstream gradient.
            np_dy = np.ones(x.shape).astype(np.float32)
            np_gamma_diff = np.sum(np_dy * y.numpy().astype(np.float32),
                                   axis=0).astype(np.float16)
            of_gamma_diff = test_global_storage.Get("gamma_diff").astype(
                np.float16)
            max_diff = np.max(np.abs(np_gamma_diff - of_gamma_diff))
            assert np.allclose(
                np_gamma_diff,
                of_gamma_diff,
                rtol=5e-2,
                atol=5e-2,
            ), (
                case,
                max_diff,
            )
def compare_with_tensorflow(device_type, params_case, dilations, data_format):
    """Check OneFlow ``conv2d_transpose`` against TensorFlow for one case.

    A OneFlow job applies ``flow.nn.conv2d_transpose`` to randomly
    initialised ``x`` and ``weight`` variables and records values and
    gradients in ``test_global_storage``.  The same values are then replayed
    through ``tf.nn.conv2d_transpose`` (always called with ``NHWC``) and the
    output, ``x`` gradient and ``weight`` gradient are compared.

    Args:
        device_type: Placement device; only ``"gpu"`` is accepted.
        params_case: Tuple ``(input_shape, output_shape, padding, strides,
            kernel_size)``.
        dilations: Passed to the OneFlow op only; the TensorFlow call does
            not receive it.
        data_format: ``"NCHW"`` or ``"NHWC"``.

    Raises:
        AssertionError: On an unsupported argument or a numerical mismatch.
    """
    input_shape, output_shape, padding, strides, kernel_size = params_case
    assert data_format in ["NCHW", "NHWC"]
    channels_first = data_format == "NCHW"
    channel_axis = 1 if channels_first else 3
    out_channels = output_shape[channel_axis]
    in_channels = input_shape[channel_axis]
    assert device_type in ["gpu"]

    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.train.primary_lr(1e-4)
    func_config.train.model_update_conf({"naive_conf": {}})

    # The kernel layout depends on the data format: channels-first keeps both
    # channel axes in front, channels-last puts out_channels at the end.
    if channels_first:
        weight_shape = (in_channels, out_channels, kernel_size, kernel_size)
    else:
        weight_shape = (in_channels, kernel_size, kernel_size, out_channels)

    @flow.global_function(func_config)
    def DeconvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=input_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
                trainable=True,
            )
            weight = flow.get_variable(
                "weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
                trainable=True,
            )
            loss = flow.nn.conv2d_transpose(
                x,
                weight,
                strides=strides,
                output_shape=output_shape,
                dilations=dilations,
                padding=padding,
                data_format=data_format,
            )
            flow.losses.add_loss(loss)

            # Record each blob and its gradient under "<key>" / "<key>_diff".
            for key, blob in (("x", x), ("weight", weight), ("loss", loss)):
                flow.watch(blob, test_global_storage.Setter(key))
                flow.watch_diff(blob, test_global_storage.Setter(key + "_diff"))

            return loss

    # OneFlow
    check_point = flow.train.CheckPoint()
    check_point.init()
    of_out = DeconvJob().get()

    # Tensorflow
    stored = test_global_storage.Get
    tf_strides = [1, strides, strides, 1]
    if channels_first:
        # TensorFlow is run in NHWC, so every NCHW array is permuted first.
        to_nhwc = (0, 2, 3, 1)
        with tf.GradientTape(persistent=True) as tape:
            x = tf.Variable(stored("x").transpose(to_nhwc))
            tf_output_shape = (
                output_shape[0],
                output_shape[2],
                output_shape[3],
                output_shape[1],
            )
            # (in, out, kh, kw) -> (kh, kw, out, in)
            w = tf.Variable(stored("weight").transpose(2, 3, 1, 0))
            tf_out = tf.nn.conv2d_transpose(
                x,
                w,
                output_shape=tf_output_shape,
                strides=tf_strides,
                padding=padding,
                data_format="NHWC",
            )

        loss_diff = stored("loss_diff").transpose(to_nhwc)
        tf_x_diff = tape.gradient(tf_out, x, loss_diff)
        tf_weight_diff = tape.gradient(tf_out, w, loss_diff)

        assert np.allclose(
            of_out.numpy().transpose(to_nhwc),
            tf_out.numpy(),
            rtol=1e-02,
            atol=1e-02,
        )
        assert np.allclose(
            stored("x_diff").transpose(to_nhwc),
            tf_x_diff.numpy(),
            rtol=1e-4,
            atol=1e-4,
        )
        assert np.allclose(
            stored("weight_diff").transpose(2, 3, 1, 0),
            tf_weight_diff.numpy(),
            rtol=1e-4,
            atol=1e-4,
        )
    else:
        with tf.GradientTape(persistent=True) as tape:
            x = tf.Variable(stored("x"))
            # (in, kh, kw, out) -> (kh, kw, out, in)
            w = tf.Variable(stored("weight").transpose(1, 2, 3, 0))
            tf_out = tf.nn.conv2d_transpose(
                x,
                w,
                output_shape=output_shape,
                strides=tf_strides,
                padding=padding,
                data_format="NHWC",
            )

        loss_diff = stored("loss_diff")
        tf_x_diff = tape.gradient(tf_out, x, loss_diff)
        tf_weight_diff = tape.gradient(tf_out, w, loss_diff)

        assert np.allclose(
            of_out.numpy(), tf_out.numpy(), rtol=1e-02, atol=1e-02
        ), (of_out.numpy() - tf_out.numpy())
        assert np.allclose(
            stored("x_diff"), tf_x_diff.numpy(), rtol=1e-02, atol=1e-02
        )
        assert np.allclose(
            stored("weight_diff").transpose(1, 2, 3, 0),
            tf_weight_diff.numpy(),
            rtol=1e-2,
            atol=1e-2,
        )
def _compare_mish_with_np(input_shape, device_type, machine_ids, device_counts):
    """Compare ``flow.math.mish`` (forward and gradient) with a NumPy reference.

    A random float32 input of ``input_shape`` is pushed through a OneFlow
    training job; the job output is checked against ``x * tanh(log1p(exp(x)))``
    and the gradient observed on the job input is checked against the
    analytic derivative computed with NumPy.

    Args:
        input_shape: Shape of the random input array.
        device_type: ``"cpu"`` or ``"gpu"``.
        machine_ids: Machine/device ids for the default placement scope.
        device_counts: Number of devices to configure for ``device_type``.

    Raises:
        AssertionError: On an unsupported device type or a mismatch.
    """
    input_1 = np.random.random(size=input_shape).astype(np.float32)

    assert device_type in ["cpu", "gpu"]

    flow.clear_default_session()
    set_device_num = (
        flow.config.cpu_device_num
        if device_type == "cpu"
        else flow.config.gpu_device_num
    )
    set_device_num(device_counts)

    func_config = flow.FunctionConfig()
    func_config.default_placement_scope(
        flow.scope.placement(device_type, machine_ids))

    def np_mish(values):
        # mish(x) = x * tanh(softplus(x))
        softplus = np.log1p(np.exp(values))
        return values * np.tanh(softplus)

    np_out_mish = np_mish(input_1)

    def np_diff(values):
        # d/dx mish(x) = tanh(sp) + x * (1 - tanh(sp)^2) * sigmoid(x),
        # where sp = softplus(x).
        softplus = np.log1p(np.exp(values))
        sigmoid = np.exp(values) / (1 + np.exp(values))
        return np.tanh(softplus) + values * (1 - np.tanh(softplus)**2) * sigmoid

    _np_grad = np_diff(input_1)

    def assert_prediction_grad(blob: tp.Numpy):
        assert np.allclose(blob, _np_grad)

    @flow.global_function(
        type="train",
        function_config=func_config,
    )
    def oneflow_mish(
        of_input_1: tp.Numpy.Placeholder(shape=input_1.shape),
    ) -> tp.Numpy:
        # A zero-initialised variable is added to the input so the job has a
        # trainable parameter and a gradient flows back to ``x_var``.
        with flow.scope.placement(device_type, "0:0"):
            v = flow.get_variable(
                shape=input_1.shape,
                dtype=flow.float32,
                initializer=flow.zeros_initializer(),
                name="x_var",
            )
            x_var = of_input_1 + v

        flow.watch_diff(x_var, assert_prediction_grad)

        of_mish_out = flow.math.mish(x_var)

        with flow.scope.placement(device_type, "0:0"):
            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(of_mish_out)

        return of_mish_out

    of_out_mish = oneflow_mish(input_1)

    assert np.allclose(of_out_mish, np_out_mish)
def _compare_triplet_margin_loss_with_np(
    anchor_shape,
    pos_shape,
    neg_shape,
    eps,
    margin,
    p,
    swap,
    device_type,
    machine_ids,
    device_counts,
):
    """Compare ``flow.nn.TripletMarginLoss`` with a NumPy reference.

    Random float32 anchor/positive/negative arrays are generated, the loss
    is evaluated in OneFlow with ``reduction`` set to ``"none"``, ``"mean"``
    and ``"sum"``, and each result is compared with the NumPy computation.
    The gradient observed on the anchor (the job minimises the ``"mean"``
    loss) is compared with a NumPy gradient at ``rtol=1e-3``.

    Args:
        anchor_shape: Shape of the random anchor array.
        pos_shape: Shape of the random positive array.
        neg_shape: Shape of the random negative array.
        eps: Epsilon used by the NumPy forward reference.  The NumPy gradient
            reference uses a fixed ``1e-6`` and ``eps`` is not passed to the
            OneFlow op.
        margin: Margin of the loss.
        p: Norm degree of the pairwise distance.
        swap: Whether the NumPy forward reference also considers the
            positive/negative distance; forwarded to the OneFlow op.
        device_type: ``"cpu"`` or ``"gpu"``.
        machine_ids: Machine/device ids for the default placement scope.
        device_counts: Number of devices to configure for ``device_type``.

    Raises:
        AssertionError: On an unsupported device type or a mismatch.
    """
    anchor = np.random.random(size=anchor_shape).astype(np.float32)
    pos = np.random.random(size=pos_shape).astype(np.float32)
    neg = np.random.random(size=neg_shape).astype(np.float32)

    assert device_type in ["cpu", "gpu"]

    flow.clear_default_session()
    set_device_num = (
        flow.config.cpu_device_num
        if device_type == "cpu"
        else flow.config.gpu_device_num
    )
    set_device_num(device_counts)

    func_config = flow.FunctionConfig()
    func_config.default_placement_scope(
        flow.scope.placement(device_type, machine_ids))
    func_config.default_logical_view(flow.scope.consistent_view())

    def np_triplet_margin_loss(np_anchor, np_pos, np_neg, eps, np_margin,
                               np_p, swap):
        def _p_norm(diff):
            # p-norm over the last axis of |diff + eps|
            powered = np.power(np.abs((diff + eps)), np_p)
            return np.power(np.sum(powered, axis=-1), 1.0 / np_p)

        np_d_1 = _p_norm(np_anchor - np_pos)
        np_d_2 = _p_norm(np_anchor - np_neg)

        if swap:
            # With swap, the negative distance is the smaller of
            # d(anchor, neg) and d(pos, neg).
            np_d_2 = np.minimum(np_d_2, _p_norm(np_pos - np_neg))

        per_sample = np.maximum((np_margin + np_d_1 - np_d_2), 0)

        return {
            "np_triplet_margin_loss": per_sample,
            "np_triplet_margin_loss_mean": np.mean(per_sample),
            "np_triplet_margin_loss_sum": np.sum(per_sample),
        }

    np_out_tripletloss_dict = np_triplet_margin_loss(anchor, pos, neg, eps,
                                                     margin, p, swap)

    def np_triplet_loss_diff(anchor, pos, neg, margin, p):
        def _compute_distance(x1, x2, x3):
            d_1_norm = np.power(np.abs((x1 - x2 + 1e-6)), p)
            d_2_norm = np.power(np.abs((x1 - x3 + 1e-6)), p)
            d_1 = np.power(np.sum(d_1_norm, axis=-1), 1.0 / p)
            d_2 = np.power(np.sum(d_2_norm, axis=-1), 1.0 / p)

            return d_1 - d_2 + margin

        def _compute_per_diff(x1, x2, p, eps=1e-6):
            delta = x1 - x2
            # Sign of the difference: +1 / -1, and 0 where the elements are
            # equal (the -1 from the first term is cancelled by the +1).
            sign = np.where(delta > 0, 1, -1) + np.where(delta == 0, 1, 0)

            # Add epsilon to avoid divided by zero
            abs_val = np.abs(delta + eps)
            sum_val = np.sum(np.power(abs_val, p), axis=1, keepdims=True)

            # Add epsilon to avoid divided by zero
            outer = np.power(sum_val + eps, 1.0 / p - 1)
            inner = np.power(abs_val, p - 1)
            grad = outer * inner
            # In-place so the result keeps the dtype of ``grad`` rather than
            # being promoted by the integer sign array.
            grad *= sign

            return grad / x1.shape[0]

        d = _compute_distance(anchor, pos, neg)

        # The loss is max(x, 0), so entries whose value is below zero
        # contribute no gradient; find them so they can be cleared.
        zero_index = np.where(d < -1e-6)

        total_grad = (_compute_per_diff(anchor, pos, p)
                      - _compute_per_diff(anchor, neg, p))

        for idx in zero_index:
            total_grad[idx] = 0

        return {
            "np_triplet_loss_grad_mean": total_grad,
        }

    np_grad_dict = np_triplet_loss_diff(anchor, pos, neg, margin, p)

    def assert_prediction_grad(blob: tp.Numpy):
        # Evaluate the gradient
        assert np.allclose(blob, np_grad_dict["np_triplet_loss_grad_mean"],
                           rtol=1e-3)

    @flow.global_function(
        type="train",
        function_config=func_config,
    )
    def oneflow_marginloss(
        of_anchor: tp.Numpy.Placeholder(shape=anchor.shape),
        of_pos: tp.Numpy.Placeholder(shape=pos.shape),
        of_neg: tp.Numpy.Placeholder(shape=neg.shape),
    ) -> Dict[str, tp.Numpy]:
        # A zero-initialised variable is added to the anchor so the job has a
        # trainable parameter and a gradient flows back to ``x_anchor``.
        with flow.scope.placement(device_type, "0:0"):
            v = flow.get_variable(
                shape=anchor.shape,
                dtype=flow.float32,
                initializer=flow.constant_initializer(0),
                name="x_var",
            )
            x_anchor = of_anchor + v

        flow.watch_diff(x_anchor, assert_prediction_grad)

        triplet_marginloss = flow.nn.TripletMarginLoss(
            x_anchor,
            of_pos,
            of_neg,
            margin=margin,
            p=p,
            swap=swap,
            reduction="none",
            name="of_tripletmarginloss",
        )
        triplet_marginloss_mean = flow.nn.TripletMarginLoss(
            x_anchor,
            of_pos,
            of_neg,
            margin=margin,
            p=p,
            swap=swap,
            reduction="mean",
            name="of_tripletmarginloss_mean",
        )
        triplet_marginloss_sum = flow.nn.TripletMarginLoss(
            x_anchor,
            of_pos,
            of_neg,
            margin=margin,
            p=p,
            swap=swap,
            reduction="sum",
            name="of_tripletmarginloss_sum",
        )

        with flow.scope.placement(device_type, "0:0"):
            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-3])
            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(
                triplet_marginloss_mean)

        return {
            "of_triplet_margin_loss": triplet_marginloss,
            "of_triplet_margin_loss_mean": triplet_marginloss_mean,
            "of_triplet_margin_loss_sum": triplet_marginloss_sum,
        }

    of_out_tripletloss_dict = oneflow_marginloss(anchor, pos, neg)

    assert np.allclose(
        of_out_tripletloss_dict["of_triplet_margin_loss"],
        np_out_tripletloss_dict["np_triplet_margin_loss"],
    )
    assert np.allclose(
        of_out_tripletloss_dict["of_triplet_margin_loss_mean"],
        np_out_tripletloss_dict["np_triplet_margin_loss_mean"],
    )
    assert np.allclose(
        of_out_tripletloss_dict["of_triplet_margin_loss_sum"],
        np_out_tripletloss_dict["np_triplet_margin_loss_sum"],
    )
def compare_with_tensorflow(device_type, data_type, shape):
    """Check ``flow.nn.sigmoid_cross_entropy_with_logits`` against TensorFlow.

    The OneFlow job creates a randomly initialised logits variable ``x``,
    evaluates the loss against the supplied labels and records ``x``, the
    loss and their gradients in ``test_global_storage``.  TensorFlow is then
    run on the recorded ``x`` and the loss and ``x`` gradient are compared
    with ``rtol = atol = 1e-5``.

    Args:
        device_type: ``"gpu"`` or ``"cpu"``.
        data_type: Key into ``type_name_to_flow_type`` and
            ``type_name_to_np_type``.
        shape: Shape of both the logits variable and the labels.

    Raises:
        AssertionError: On an unsupported device type or a mismatch.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()

    dtype = type_name_to_flow_type[data_type]

    def np_sigmoid(values):
        return 1 / (1 + np.exp(-values))

    @flow.global_function(type="train", function_config=func_config)
    def SigmoidCrossEntropyWithLogitsJob(
        labels: oft.Numpy.Placeholder(shape, dtype)
    ):
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=shape,
                dtype=dtype,
                initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
                trainable=True,
            )
            loss = flow.nn.sigmoid_cross_entropy_with_logits(
                labels=labels, logits=x)

            lr_scheduler = flow.optimizer.PiecewiseConstantScheduler([], [1e-4])
            flow.optimizer.SGD(lr_scheduler, momentum=0).minimize(loss)

            # Record each blob and its gradient under "<key>" / "<key>_diff".
            for key, blob in (("x", x), ("loss", loss)):
                flow.watch(blob, test_global_storage.Setter(key))
                flow.watch_diff(blob, test_global_storage.Setter(key + "_diff"))

            return loss

    # fake labels
    raw_labels = np.random.randint(0, 10, size=shape)
    labels = np_sigmoid(raw_labels).astype(type_name_to_np_type[data_type])

    # OneFlow
    of_out = SigmoidCrossEntropyWithLogitsJob(labels).get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x"))
        tf_out = tf.nn.sigmoid_cross_entropy_with_logits(labels, x)

    loss_diff = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)

    tolerance = 1e-5
    close = dict(rtol=tolerance, atol=tolerance)
    assert np.allclose(of_out.numpy(), tf_out.numpy(), **close)
    assert np.allclose(
        test_global_storage.Get("x_diff"), tf_x_diff.numpy(), **close
    )
    flow.clear_default_session()
def make_matmul_func(
    a_shape,
    b_shape,
    trans_a,
    trans_b,
    alpha,
    dtype,
    device_type,
    test_add_to_output,
    fuse_add_to_output,
    tf32,
):
    """Build a OneFlow training job that runs ``flow.matmul`` on two variables.

    The returned job creates variables ``a``, ``b`` and ``c`` (all float32),
    multiplies ``a`` and ``b``, optionally adds ``c`` to the product, and
    minimises the result with the optimizer returned by ``get_optimizer()``.
    Gradients are stored in ``test_global_storage`` under ``a_diff``,
    ``b_diff``, ``c_diff`` and, when ``test_add_to_output`` is set,
    ``add_to_diff``.

    Args:
        a_shape: Shape of variable ``a``.
        b_shape: Shape of variable ``b``.
        trans_a: Forwarded to ``flow.matmul`` as its third positional argument.
        trans_b: Forwarded to ``flow.matmul`` as its fourth positional argument.
        alpha: Forwarded to ``flow.matmul`` as its fifth positional argument.
        dtype: When this is ``flow.float16`` the operands are wrapped in
            ``flow.amp_white_identity``.
        device_type: ``"gpu"`` or ``"cpu"``.
        test_add_to_output: Whether the ``c`` variable is added to the product.
        fuse_add_to_output: Value for ``enable_fuse_add_to_output``.
        tf32: Value for ``enable_tensor_float_32_compute``.

    Returns:
        The job function; calling it yields ``(a_var, b_var, add_to, c)``.

    Raises:
        AssertionError: If ``device_type`` is not supported.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    flow.config.enable_tensor_float_32_compute(tf32)

    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.enable_fuse_add_to_output(fuse_add_to_output)
    func_config.default_placement_scope(flow.scope.placement(device_type, "0:0"))

    use_half = dtype is flow.float16

    def _uniform_variable(name, shape, low, high):
        # Trainable float32 variable with a uniform random initializer.
        return flow.get_variable(
            name,
            shape=shape,
            dtype=flow.float32,
            initializer=flow.random_uniform_initializer(minval=low, maxval=high),
            trainable=True,
        )

    @flow.global_function(type="train", function_config=func_config)
    def matmul_job() -> typing.Tuple[
        flow.typing.Numpy, flow.typing.Numpy, flow.typing.Numpy, flow.typing.Numpy
    ]:
        a_var = _uniform_variable("a", a_shape, 0, 1)
        b_var = _uniform_variable("b", b_shape, 0, 1)
        flow.watch_diff(a_var, test_global_storage.Setter("a_diff"))
        flow.watch_diff(b_var, test_global_storage.Setter("b_diff"))

        if use_half:
            a = flow.amp_white_identity(a_var)
            b = flow.amp_white_identity(b_var)
        else:
            a, b = a_var, b_var

        c = flow.matmul(a, b, trans_a, trans_b, alpha)

        # The "c" variable is always created (and returned), but only takes
        # part in the computation when test_add_to_output is set.
        add_to = _uniform_variable("c", c.shape, -1, 1)
        if test_add_to_output:
            flow.watch_diff(add_to, test_global_storage.Setter("add_to_diff"))
            if use_half:
                add_to = flow.amp_white_identity(add_to)
            c = c + add_to

        flow.watch_diff(c, test_global_storage.Setter("c_diff"))
        get_optimizer().minimize(c)
        return a_var, b_var, add_to, c

    return matmul_job
def compare_with_tensorflow(
    device_type,
    x_shape,
    filters,
    kernel_size,
    groups,
    of_padding="SAME",
    tf_padding="SAME",
    stride_d=1,
    stride_h=1,
    stride_w=1,
    data_format="NCDHW",
    dilation_d=1,
    dilation_h=1,
    dilation_w=1,
):
    """Check OneFlow ``conv3d`` (forward and gradients) against TensorFlow.

    The OneFlow job builds random ``x`` and ``conv-weight`` variables, runs
    ``flow.nn.conv3d`` and records values and gradients in
    ``test_global_storage``.  The recorded arrays are permuted to TensorFlow's
    ``NDHWC`` layout, ``tf.nn.conv3d`` is evaluated on them, and the output,
    ``x`` gradient and weight gradient are compared.

    Args:
        device_type: ``"gpu"`` or ``"cpu"``.
        x_shape: Shape of the input variable, laid out per ``data_format``.
        filters: Number of output channels (first axis of the weight).
        kernel_size: Edge length used for all three kernel dimensions.
        groups: Number of convolution groups; must be positive and divide
            both the input channel count and ``filters``.
        of_padding: Padding argument for the OneFlow op.
        tf_padding: Padding argument for the TensorFlow op.
        stride_d: Stride along the first spatial dimension.
        stride_h: Stride along the second spatial dimension.
        stride_w: Stride along the third spatial dimension.
        data_format: ``"NCDHW"``; any other value is treated as channels-last
            (channel axis 4).
        dilation_d: Dilation along the first spatial dimension.
        dilation_h: Dilation along the second spatial dimension.
        dilation_w: Dilation along the third spatial dimension.

    Raises:
        AssertionError: On an unsupported device type, an invalid ``groups``
            value, or a numerical mismatch.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.consistent_view())
    func_config.cudnn_conv_heuristic_search_algo(False)

    channels_first = data_format == "NCDHW"
    # Axis of ``x`` holding the channels; used for the weight shape and for
    # the group divisibility check below.
    channel_axis = 1 if channels_first else 4
    if channels_first:
        xy_data_transpose = (0, 2, 3, 4, 1)
        weight_data_transpose = (2, 3, 4, 1, 0)
    else:
        xy_data_transpose = (0, 1, 2, 3, 4)
        weight_data_transpose = (1, 2, 3, 4, 0)

    @flow.global_function(type="train", function_config=func_config)
    def ConvJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
                trainable=True,
            )
            channels_per_group = x.shape[channel_axis] // groups
            if channels_first:
                weight_shape = (
                    filters,
                    channels_per_group,
                    kernel_size,
                    kernel_size,
                    kernel_size,
                )
            else:
                weight_shape = (
                    filters,
                    kernel_size,
                    kernel_size,
                    kernel_size,
                    channels_per_group,
                )
            weight = flow.get_variable(
                "conv-weight",
                shape=weight_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=0, maxval=100),
            )
            loss = flow.nn.conv3d(
                x,
                weight,
                strides=[stride_d, stride_h, stride_w],
                padding=of_padding,
                data_format=data_format,
                dilations=[dilation_d, dilation_h, dilation_w],
                groups=groups,
            )
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
            ).minimize(loss)

            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(weight, test_global_storage.Setter("weight"))
            flow.watch_diff(weight, test_global_storage.Setter("weight_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))

            return loss

    # OneFlow
    check_point = flow.train.CheckPoint()
    check_point.init()
    of_out = ConvJob().get()

    # The input channel count must be divisible by ``groups``.  The channel
    # axis depends on the data format, so it is looked up rather than assumed
    # to be axis 1.
    assert groups > 0
    assert x_shape[channel_axis] % groups == 0
    assert filters % groups == 0

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(
            test_global_storage.Get("x").transpose(xy_data_transpose))
        weight = tf.Variable(
            test_global_storage.Get("weight").transpose(weight_data_transpose))
        tf_out = tf.nn.conv3d(
            x,
            weight,
            strides=[1, stride_d, stride_h, stride_w, 1],
            padding=tf_padding,
            data_format="NDHWC",
            dilations=[1, dilation_d, dilation_h, dilation_w, 1],
        )

    loss_diff = test_global_storage.Get("loss_diff").transpose(
        xy_data_transpose)
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
    tf_weight_diff = tape.gradient(tf_out, weight, loss_diff)

    assert np.allclose(
        of_out.numpy().transpose(xy_data_transpose),
        tf_out.numpy(),
        rtol=1e-5,
        atol=1e-5,
    )
    assert np.allclose(
        test_global_storage.Get("x_diff").transpose(xy_data_transpose),
        tf_x_diff.numpy(),
        rtol=1e-4,
        atol=1e-4,
    )
    assert np.allclose(
        test_global_storage.Get("weight_diff").transpose(
            weight_data_transpose),
        tf_weight_diff.numpy(),
        rtol=1e-5,
        atol=1e-5,
    )
def compare_with_tensorflow(
    device_type,
    a_shape,
    b_shape,
    transpose_a,
    transpose_b,
    data_type,
    fuse_add_to_output,
    enable_tf32,
    alpha,
):
    """Check OneFlow ``matmul`` plus bias-like add against TensorFlow.

    The OneFlow job computes ``matmul(a, b) + c`` on random variables
    (casting the operands to float16 when ``data_type == "float16"``) and
    records values and gradients in ``test_global_storage``.  TensorFlow
    recomputes ``matmul(a, b) * alpha + c`` from the recorded values and the
    output and the three gradients are compared.

    Args:
        device_type: ``"gpu"`` or ``"cpu"``.
        a_shape: Shape of variable ``a``.
        b_shape: Shape of variable ``b``.
        transpose_a: Transpose flag for the first operand.
        transpose_b: Transpose flag for the second operand.
        data_type: ``"float16"`` or a key of ``type_name_to_flow_type``.
        fuse_add_to_output: Value for ``enable_fuse_add_to_output``.
        enable_tf32: Value for ``enable_tensor_float_32_compute``.
        alpha: Scale applied to the matrix product.

    Raises:
        AssertionError: On an unsupported device type or a mismatch.
    """
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.enable_fuse_add_to_output(fuse_add_to_output)
    flow.config.enable_tensor_float_32_compute(enable_tf32)

    half = data_type == "float16"
    # In the float16 case the variables stay float and are cast when used.
    dtype = flow.float if half else type_name_to_flow_type[data_type]

    def _uniform_variable(name, shape, low, high):
        # Trainable variable of ``dtype`` with a uniform random initializer.
        return flow.get_variable(
            name,
            shape=shape,
            dtype=dtype,
            initializer=flow.random_uniform_initializer(minval=low, maxval=high),
            trainable=True,
        )

    @flow.global_function(type="train", function_config=func_config)
    def MatmulJob():
        with flow.scope.placement(device_type, "0:0"):
            a = _uniform_variable("a", a_shape, 0, 1)
            b = _uniform_variable("b", b_shape, 0, 1)

            if half:
                out = flow.matmul(
                    flow.cast(a, dtype=flow.float16),
                    flow.cast(b, dtype=flow.float16),
                    transpose_a,
                    transpose_b,
                    alpha,
                )
            else:
                out = flow.matmul(a, b, transpose_a, transpose_b, alpha)

            c = _uniform_variable("c", out.shape, -1, 1)

            if half:
                loss = flow.cast(
                    out + flow.cast(c, dtype=flow.float16), dtype=flow.float
                )
            else:
                loss = out + c

            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [1e-4]), momentum=0
            ).minimize(loss)

            # Record each blob and its gradient under "<key>" / "<key>_diff".
            for key, blob in (("a", a), ("b", b), ("c", c), ("loss", loss)):
                flow.watch(blob, test_global_storage.Setter(key))
                flow.watch_diff(blob, test_global_storage.Setter(key + "_diff"))

            return loss

    # OneFlow
    of_out = MatmulJob().get()

    # TensorFlow
    with tf.GradientTape(persistent=True) as tape:
        a = tf.Variable(test_global_storage.Get("a"))
        b = tf.Variable(test_global_storage.Get("b"))
        c = tf.Variable(test_global_storage.Get("c"))
        if half:
            # Gradients below are taken with respect to these cast tensors.
            a = tf.cast(a, tf.float16)
            b = tf.cast(b, tf.float16)
            c = tf.cast(c, tf.float16)
        tf_out = tf.matmul(a, b, transpose_a, transpose_b)
        tf_out = tf_out * alpha
        tf_out = tf_out + c
        if half:
            tf_out = tf.cast(tf_out, tf.float32)

    loss_diff = test_global_storage.Get("loss_diff")
    tf_a_diff = tape.gradient(tf_out, a, loss_diff)
    tf_b_diff = tape.gradient(tf_out, b, loss_diff)
    tf_c_diff = tape.gradient(tf_out, c, loss_diff)

    tolerance = 2e-3 if half else 1e-3

    assert np.allclose(
        of_out.numpy(), tf_out.numpy(), rtol=tolerance, atol=tolerance
    ), np.max(np.abs(of_out.numpy() - tf_out.numpy()))

    for key, tf_diff in (
        ("a_diff", tf_a_diff),
        ("b_diff", tf_b_diff),
        ("c_diff", tf_c_diff),
    ):
        assert np.allclose(
            test_global_storage.Get(key),
            tf_diff.numpy(),
            rtol=tolerance,
            atol=tolerance,
        )
http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. """ import unittest import numpy as np import oneflow as flow import oneflow.typing as oft func_config = flow.FunctionConfig() func_config.default_logical_view(flow.scope.mirrored_view()) func_config.default_data_type(flow.float) @flow.unittest.skip_unless_1n1d() class TestUnpackPack(flow.unittest.TestCase): def test_unpack_pack(test_case): if flow.eager_execution_enabled(): return @flow.global_function(function_config=func_config) def UnpackPackJob(a: oft.Numpy.Placeholder((3, 4))): return flow.pack(flow.unpack(a, 3), 3) x = np.random.rand(3, 4).astype(np.float32)
def _compare_scatter_nd_update_with_tf(
    test_case,
    device_type,
    params_shape,
    indices_shape,
    updates_shape,
    allow_duplicate_index=False,
    verbose=False,
):
    """Compare ``flow.tensor_scatter_nd_update`` with TensorFlow.

    Inputs come from ``_random_inputs``.  TensorFlow computes the scatter
    result and the gradients with respect to the params and the updates; a
    OneFlow training job computes the same op and its watched gradients are
    checked against the TensorFlow ones, followed by the forward result.

    Args:
        test_case: Object providing ``assertTrue``.
        device_type: Device used for the OneFlow placement scope.
        params_shape: Forwarded to ``_random_inputs``.
        indices_shape: Forwarded to ``_random_inputs``.
        updates_shape: Forwarded to ``_random_inputs``.
        allow_duplicate_index: Forwarded to ``_random_inputs``.
        verbose: When exactly ``True``, print the inputs and both results.
    """
    params, updates, indices = _random_inputs(
        params_shape, indices_shape, updates_shape, allow_duplicate_index)

    x_const = tf.constant(params)
    y_const = tf.constant(updates)
    i_const = tf.constant(indices)

    # Gradient with respect to the params (updates held constant).
    with tf.GradientTape() as params_tape:
        x = tf.Variable(params)
        z1 = tf.tensor_scatter_nd_update(x, i_const, y_const)
    dz_dx = params_tape.gradient(z1, x)

    # Gradient with respect to the updates (params held constant).
    with tf.GradientTape() as updates_tape:
        y = tf.Variable(updates)
        z2 = tf.tensor_scatter_nd_update(x_const, i_const, y)
    dz_dy = updates_tape.gradient(z2, y)

    test_case.assertTrue(np.allclose(z1.numpy(), z2.numpy()))

    def compare_dz_dx(params_grad):
        test_case.assertTrue(np.allclose(dz_dx.numpy(), params_grad.numpy()))

    def compare_dz_dy(updates_grad):
        test_case.assertTrue(np.allclose(dz_dy.numpy(), updates_grad.numpy()))

    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.consistent_view())

    def _zero_variable(name, shape):
        # Zero-initialised float32 variable; adding it to a placeholder gives
        # the job a parameter to train without changing the input values.
        return flow.get_variable(
            name,
            shape=shape,
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )

    @flow.global_function(type="train", function_config=func_config)
    def scatter_nd_update_grad_fn(
        x_def: oft.Numpy.Placeholder(params.shape, dtype=flow.float),
        indices_def: oft.Numpy.Placeholder(indices.shape, dtype=flow.int32),
        y_def: oft.Numpy.Placeholder(updates.shape, dtype=flow.float),
    ):
        with flow.scope.placement(device_type, "0:0"):
            x_var = _zero_variable("params", params.shape)
            y_var = _zero_variable("updates", updates.shape)
            x_in = x_var + x_def
            y_in = y_var + y_def
            z = flow.tensor_scatter_nd_update(x_in, indices_def, y_in)
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [1e-3]), momentum=0
            ).minimize(z)

        flow.watch_diff(x_in, compare_dz_dx)
        flow.watch_diff(y_in, compare_dz_dy)
        return z

    of_z = scatter_nd_update_grad_fn(params, indices, updates).get()

    if verbose is True:
        for label, value in (
            ("device_type:", device_type),
            ("x:", params),
            ("y:", updates),
            ("indices:", indices),
            ("tf_z:", z1.numpy()),
            ("of_z:", of_z.numpy()),
        ):
            print(label, value)

    test_case.assertTrue(np.allclose(z1.numpy(), of_z.numpy()))