def main(args):
    flow.config.machine_num(args.num_nodes)
    flow.config.gpu_device_num(args.gpu_num_per_node)
    flow.config.enable_legacy_model_io(True)
    func_config = flow.FunctionConfig()
    func_config.default_logical_view(flow.scope.consistent_view())
    func_config.default_data_type(flow.float)
    func_config.cudnn_conv_force_fwd_algo(0)
    func_config.cudnn_conv_force_bwd_data_algo(1)
    func_config.cudnn_conv_force_bwd_filter_algo(1)
    func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)

    @flow.global_function(type="train", function_config=func_config)
    def alexnet_train_job():
        (labels, images) = _data_load_layer(args, args.train_dir)
        loss = alexnet(args, images, labels)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [1e-05]), momentum=0
        ).minimize(loss)
        return loss

    # A separate config for the eval job: same data type and AMP setting, but
    # without the training-only cudnn algorithm overrides.
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.enable_auto_mixed_precision(args.enable_auto_mixed_precision)

    @flow.global_function(function_config=func_config)
    def alexnet_eval_job():
        with flow.scope.consistent_view():
            (labels, images) = _data_load_layer(args, args.eval_dir)
            return alexnet(args, images, labels, False)

    check_point = flow.train.CheckPoint()
    if not args.model_load_dir:
        check_point.init()
    else:
        check_point.load(args.model_load_dir)
    num_nodes = args.num_nodes
    print(
        "Training alexnet: num_gpu_per_node = {}, num_nodes = {}.".format(
            args.gpu_num_per_node, num_nodes
        )
    )
    print("{:>12} {:>12} {:>12}".format("iter", "loss type", "loss value"))
    loss = []
    for i in range(args.iter_num):
        train_loss = alexnet_train_job().get().mean()
        loss.append(train_loss)
        fmt_str = "{:>12} {:>12} {:>12.6f}"
        print(fmt_str.format(i, "train loss:", train_loss))
        if (i + 1) % 100 == 0:
            check_point.save(_MODEL_SAVE_DIR + str(i))
    loss_file = "{}n{}c.npy".format(
        str(num_nodes), str(args.gpu_num_per_node * num_nodes)
    )
    loss_path = "./of_loss/alexnet"
    if not os.path.exists(loss_path):
        os.makedirs(loss_path)
    numpy.save(os.path.join(loss_path, loss_file), loss)

def test_shuffle(_):
    arg_dict = OrderedDict()
    arg_dict["device_type"] = ["gpu", "cpu"]
    arg_dict["x_shape"] = [(100,), (10, 1000), (10, 10, 2000)]
    arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
    for (device_type, x_shape, data_type) in GenArgList(arg_dict):
        assert device_type in ["gpu", "cpu"]
        assert data_type in ["float32", "double", "int8", "int32", "int64"]
        flow.clear_default_session()
        func_config = flow.FunctionConfig()
        func_config.default_data_type(flow.float)

        @flow.global_function(function_config=func_config)
        def TestJob(
            x: oft.Numpy.Placeholder(x_shape, dtype=type_name_to_flow_type[data_type])
        ):
            with flow.scope.placement(device_type, "0:0"):
                return flow.random.shuffle(x)

        x = np.random.randn(*x_shape).astype(type_name_to_np_type[data_type])
        ret = TestJob(x).get().numpy()
        # Shuffling along axis 0 should permute the rows without changing them.
        assert not np.array_equal(x, ret), x_shape
        x.sort(0)
        ret.sort(0)
        assert np.array_equal(x, ret), x_shape

        assert device_type in ["gpu", "cpu"]
        assert data_type in ["float32", "double", "int8", "int32", "int64"]
        flow.clear_default_session()
        func_config = flow.FunctionConfig()
        func_config.default_data_type(flow.float)

        @flow.global_function(function_config=func_config)
        def TestJob1(
            x: oft.Numpy.Placeholder(x_shape, dtype=type_name_to_flow_type[data_type])
        ):
            with flow.scope.placement(device_type, "0:0"):
                return flow.random.generate_random_batch_permutation_indices(x)

        x = np.random.randn(*x_shape).astype(type_name_to_np_type[data_type])
        ret = TestJob1(x).get().numpy()
        idx = np.arange(x_shape[0]).astype(np.int32)
        # The result should be a (non-identity) permutation of 0..n-1.
        assert not np.array_equal(idx, ret), x_shape
        idx.sort()
        ret.sort()
        assert np.array_equal(idx, ret), x_shape

def run_fuse_cast_scale_mlir(test_case, device=None, in_type=None, out_type=None, shape=None):
    flow.clear_default_session()
    func_config = flow.FunctionConfig()

    @flow.global_function(function_config=func_config)
    def FuseCastScaleJob(
        x: oft.Numpy.Placeholder(shape, dtype=in_type)
    ) -> Tuple[oft.Numpy, oft.Numpy]:
        with flow.scope.placement(device, "0:0-0"):
            scale = flow.get_variable(
                "scale",
                shape=(1,),
                dtype=out_type,
                initializer=flow.random_uniform_initializer(),
                trainable=False,
            )
            loss = flow.cast(x, dtype=out_type) * scale
            return (loss, scale)

    np_in_type = dtype_util.convert_oneflow_dtype_to_numpy_dtype(in_type)
    x = (np.random.rand(*shape) * 10).astype(np_in_type)
    ret = FuseCastScaleJob(x)
    (loss, scale) = ret
    test_case.assertTrue(np.allclose(loss, x * scale))

def test_sync_dynamic_resize(_):
    arg_dict = OrderedDict()
    arg_dict["device_type"] = ["gpu", "cpu"]
    arg_dict["x_shape"] = [(100,), (1000, 10)]
    arg_dict["data_type"] = ["float32", "double", "int32", "int64"]
    arg_dict["size_type"] = ["int32", "int64"]
    for (device_type, x_shape, data_type, size_type) in GenArgList(arg_dict):
        flow.clear_default_session()
        func_config = flow.FunctionConfig()
        func_config.default_data_type(flow.float)

        @flow.global_function(function_config=func_config)
        def TestJob(
            x: oft.Numpy.Placeholder(x_shape, dtype=type_name_to_flow_type[data_type]),
            size: oft.Numpy.Placeholder((1,), dtype=type_name_to_flow_type[size_type]),
        ):
            with flow.scope.placement(device_type, "0:0"):
                return flow.sync_dynamic_resize(x, size)

        size = np.random.randint(0, x_shape[0])
        x = np.random.rand(*x_shape).astype(type_name_to_np_type[data_type])
        y = (
            TestJob(x, np.array([size]).astype(type_name_to_np_type[size_type]))
            .get()
            .numpy_list()[0]
        )
        assert np.array_equal(y, x[:size])

def test_multi_node_comm_net(test_case):
    func_config = flow.FunctionConfig()
    func_config.default_logical_view(flow.scope.consistent_view())
    func_config.default_data_type(flow.float)
    flow.config.gpu_device_num(1)

    @flow.global_function(function_config=func_config)
    def ReluJob(x: oft.Numpy.Placeholder((10, 2))):
        with flow.scope.placement("gpu", "0:0"):
            out0 = ccrelu(x, "my_op_0_0")
        with flow.scope.placement("gpu", "1:0"):
            out1 = ccrelu(out0, "my_op_1_0")
        with flow.scope.placement("gpu", "0:0"):
            out2 = ccrelu(out1, "my_op_print")
        return out2

    index = [-2, -1, 0, 1, 2]
    data = []
    for i in index:
        data.append(np.ones((10, 2), dtype=np.float32) * i)
    for i in range(5):
        ret = ReluJob(data[i]).get().numpy()
        print(ret)
        if index[i] > 0:
            test_case.assertTrue(
                np.array_equal(ret, np.ones((10, 2), dtype=np.float32) * index[i])
            )
        else:
            test_case.assertTrue(
                np.array_equal(ret, np.zeros((10, 2), dtype=np.float32))
            )

def distribute_reshape_test(device_type, device_num, input_shape, shape):
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    flow.config.gpu_device_num(device_num)
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    @flow.global_function(type="train", function_config=func_config)
    def ReshapeJob():
        with flow.scope.placement(device_type, "0:0-{}".format(device_num - 1)):
            x = flow.get_variable(
                "var_x",
                shape=input_shape,
                dtype=flow.float,
                initializer=flow.random_uniform_initializer(minval=2, maxval=5),
                trainable=True,
                distribute=flow.distribute.split(2),
            )
            loss = flow.reshape(x, shape)
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
            ).minimize(loss)
            return (x, loss)

    (x, loss) = ReshapeJob().get()

def WatchDiff(test_case, device_type, input_shape, dtype):
    assert device_type in ["gpu", "cpu"]
    assert dtype in ["float32", "double"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    def CheckOnes(diff):
        ones = np.ones(input_shape)
        test_case.assertTrue(np.allclose(diff.numpy(), ones, rtol=1e-05, atol=1e-05))

    @flow.global_function(type="train", function_config=func_config)
    def TrainJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "in",
                shape=input_shape,
                dtype=type_name_to_flow_type[dtype],
                initializer=flow.random_uniform_initializer(),
                trainable=True,
            )
            flow.watch_diff(x, CheckOnes)
            # Minimizing x itself makes the gradient w.r.t. x all ones,
            # which is what CheckOnes verifies.
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
            ).minimize(x)

    TrainJob()

def _run_multi_count_test(test_case, device_type, x1_shape, x2_shape, dtype, x1_count, x2_count):
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    @flow.global_function(function_config=func_config)
    def multi_count_not_finite_job(
        x1: oft.Numpy.Placeholder(x1_shape, dtype=type_name_to_flow_type[dtype]),
        x2: oft.Numpy.Placeholder(x2_shape, dtype=type_name_to_flow_type[dtype]),
    ):
        x_list = []
        for i in range(x1_count):
            x_list.append(x1)
        for i in range(x2_count):
            x_list.append(x2)
        with flow.scope.placement(device_type, "0:0"):
            return flow.multi_count_not_finite(x_list)

    x1 = np.random.randn(*x1_shape).astype(type_name_to_np_type[dtype])
    x1[0] = np.nan
    x1[3] = np.inf
    x2 = np.random.randn(*x2_shape).astype(type_name_to_np_type[dtype])
    x2[2] = np.inf
    x2[6, 5] = np.nan
    y = multi_count_not_finite_job(x1, x2).get()
    x1_not_finite = x1.size - np.sum(np.isfinite(x1))
    x2_not_finite = x2.size - np.sum(np.isfinite(x2))
    np_y = x1_not_finite * x1_count + x2_not_finite * x2_count
    assert y.numpy() == np_y

def _test_split_to_broadcast(test_case, src_device_type, dst_device_type, src_axis):
    flow.clear_default_session()
    flow.config.gpu_device_num(4)
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.consistent_view())

    def build_s2b(input_blob, src_device_num, dst_device_num):
        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
            src = flow.identity(input_blob.with_distribute(flow.distribute.split(src_axis)))
        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
            dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
        return dst

    @flow.global_function(function_config=func_config)
    def split_to_broadcast_job(input_blob: oft.Numpy.Placeholder((96, 96))):
        result_list = []
        for i in (1, 2, 3):
            for j in (1, 2, 3):
                result_list.append(build_s2b(input_blob, i, j))
        return tuple(result_list)

    x = np.random.rand(96, 96).astype(np.float32)
    result_tuple = split_to_broadcast_job(x).get()
    for out in result_tuple:
        test_case.assertTrue(np.array_equal(x, out.numpy()))

def _test_multi_lbi(test_case, src_device_type, dst_device_type, src_device_num, dst_device_num):
    flow.clear_default_session()
    flow.config.gpu_device_num(4)
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.consistent_view())

    @flow.global_function(function_config=func_config)
    def multi_lbi_job(x: oft.Numpy.Placeholder((96, 96, 96))):
        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
            src_s0 = flow.identity(x.with_distribute(flow.distribute.split(0)))
            src_s1 = flow.identity(x.with_distribute(flow.distribute.split(1)))
            src_b = flow.identity(x.with_distribute(flow.distribute.split(1)))
            (t0_0, t0_1, t0_2) = flow.identity_n((src_s0, src_s1, src_b))
        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
            t0_0 = t0_0.with_distribute(flow.distribute.split(1))
            t0_1 = t0_1.with_distribute(flow.distribute.broadcast())
            t0_2 = t0_2.with_distribute(flow.distribute.split(1))
            (t1_0, t1_1, t1_2) = flow.identity_n((t0_0, t0_1, t0_2))
        return (t1_0, t1_1, t1_2)

    x = np.random.uniform(-1e-05, 1e-05, (96, 96, 96)).astype(np.float32)
    # Run the job once and check all three outputs.
    (r0, r1, r2) = multi_lbi_job(x).get()
    test_case.assertTrue(np.array_equal(x, r0.numpy()))
    test_case.assertTrue(np.array_equal(x, r1.numpy()))
    test_case.assertTrue(np.array_equal(x, r2.numpy()))

def _test_partial_sum_to_broadcast(test_case, src_device_type, dst_device_type):
    flow.clear_default_session()
    flow.config.gpu_device_num(4)
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.consistent_view())

    def build_p2b(input_blob, src_device_num, dst_device_num):
        with flow.scope.placement(src_device_type, "0:0-" + str(src_device_num - 1)):
            src = flow.identity(input_blob.with_distribute(flow.distribute.split(0)))
            src = flow.math.reduce_sum(src, axis=0)
        with flow.scope.placement(dst_device_type, "0:0-" + str(dst_device_num - 1)):
            dst = flow.identity(src.with_distribute(flow.distribute.broadcast()))
        return dst

    @flow.global_function(function_config=func_config)
    def partial_sum_to_broadcast_job(input_blob: oft.Numpy.Placeholder((96, 96, 96))):
        result_list = []
        for i in (2, 3):
            for j in (1, 2, 3):
                result_list.append(build_p2b(input_blob, i, j))
        return tuple(result_list)

    x = np.random.uniform(-1e-05, 1e-05, (96, 96, 96)).astype(np.float32)
    result_tuple = partial_sum_to_broadcast_job(x).get()
    for out in result_tuple:
        test_case.assertTrue(np.allclose(np.sum(x, axis=0), out.numpy()))

def _test_slice_update(
    test_case,
    input,
    update,
    slice_args,
    output,
    dtype=flow.float32,
    device_tag=DEFAULT_DEVICE_TAG,
    verbose=False,
):
    input = input.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
    update = update.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
    output = output.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
    flow.clear_default_session()
    func_cfg = flow.FunctionConfig()
    func_cfg.default_data_type(dtype)
    func_cfg.default_placement_scope(flow.scope.placement(device_tag, "0:0"))
    slice_func = _make_slice_update_func(
        slice_args, input.shape, update.shape, dtype, func_cfg
    )
    of_output = slice_func(input, update)
    if verbose:
        print("input:\n{}".format(input))
        print("update:\n{}".format(update))
        print("slice_args:", slice_args)
        print("output:\n{}".format(output))
        print("dtype:", dtype)
        print("device_tag:", device_tag)
        print("of_output:\n{}".format(of_output))
    test_case.assertTrue(np.array_equal(output, of_output))

def _test_slice_dynamic(
    test_case,
    input,
    slice_args,
    outputs,
    static_shape=None,
    dtype=flow.float32,
    device_tag=DEFAULT_DEVICE_TAG,
):
    input = input.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
    outputs = [
        output.astype(flow.convert_oneflow_dtype_to_numpy_dtype(dtype))
        for output in outputs
    ]
    if static_shape is None:
        static_shape = input.shape
    flow.clear_default_session()
    func_cfg = flow.FunctionConfig()
    func_cfg.default_data_type(dtype)
    func_cfg.default_placement_scope(flow.scope.placement(device_tag, "0:0"))
    func_cfg.default_logical_view(flow.scope.mirrored_view())
    slice_func = _make_slice_dynamic_func(slice_args, static_shape, dtype, func_cfg)
    of_outputs = slice_func([input])
    for (out, of_out) in zip(outputs, of_outputs):
        test_case.assertTrue(np.array_equal(out, of_out[0]))

def _of_target_resize_bbox_scale(images, bbox_list, target_size, max_size):
    image_shape = _get_images_static_shape(images)
    bbox_shape = _get_bbox_static_shape(bbox_list)
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.mirrored_view())

    @flow.global_function(function_config=func_config)
    def target_resize_bbox_scale_job(
        image_def: oft.ListListNumpy.Placeholder(shape=tuple(image_shape), dtype=flow.float),
        bbox_def: oft.ListListNumpy.Placeholder(shape=tuple(bbox_shape), dtype=flow.float),
    ):
        images_buffer = flow.tensor_list_to_tensor_buffer(image_def)
        (resized_images_buffer, new_size, scale) = flow.image_target_resize(
            images_buffer, target_size=target_size, max_size=max_size
        )
        bbox_buffer = flow.tensor_list_to_tensor_buffer(bbox_def)
        scaled_bbox = flow.object_bbox_scale(bbox_buffer, scale)
        scaled_bbox_list = flow.tensor_buffer_to_tensor_list(
            scaled_bbox, shape=bbox_shape[1:], dtype=flow.float
        )
        return (scaled_bbox_list, new_size)

    input_image_list = [np.expand_dims(image, axis=0) for image in images]
    input_bbox_list = [np.expand_dims(bbox, axis=0) for bbox in bbox_list]
    (output_bbox_list, output_image_size) = target_resize_bbox_scale_job(
        [input_image_list], [input_bbox_list]
    ).get()
    return (output_bbox_list.numpy_lists()[0], output_image_size.numpy_list()[0])

def _of_image_decode(images):
    image_files = [open(im, "rb") for im in images]
    images_bytes = [imf.read() for imf in image_files]
    static_shape = (len(images_bytes), max([len(bys) for bys in images_bytes]))
    for imf in image_files:
        imf.close()
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.mirrored_view())

    @flow.global_function(function_config=func_config)
    def image_decode_job(
        images_def: oft.ListListNumpy.Placeholder(shape=static_shape, dtype=flow.int8)
    ):
        images_buffer = flow.tensor_list_to_tensor_buffer(images_def)
        decoded_images_buffer = flow.image_decode(images_buffer)
        return flow.tensor_buffer_to_tensor_list(
            decoded_images_buffer, shape=(640, 640, 3), dtype=flow.uint8
        )

    images_np_arr = [
        np.frombuffer(bys, dtype=np.byte).reshape(1, -1) for bys in images_bytes
    ]
    decoded_images = image_decode_job([images_np_arr]).get().numpy_lists()
    return decoded_images[0]

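# Illustrative call for _of_image_decode (the file paths are placeholders, not
# part of the original test; any image files that decode within the 640x640x3
# static shape declared above would fit):
#
#   decoded = _of_image_decode(["/path/to/a.jpg", "/path/to/b.jpg"])
#   for img in decoded:
#       print(img.shape)  # one decoded HWC image per input file
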
def compare_with_tensorflow(device_type, x_shape, axis):
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    def check_grad(x_diff_blob):
        assert np.array_equal(x_diff_blob.numpy(), np.ones(x_shape))

    @flow.global_function(type="train", function_config=func_config)
    def ExpandDimsJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "var",
                shape=x_shape,
                dtype=flow.float,
                initializer=flow.ones_initializer(),
                trainable=True,
            )
            flow.watch_diff(x, check_grad)
            loss = flow.expand_dims(x, axis)
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
            ).minimize(loss)
            return loss

    of_out = ExpandDimsJob().get().numpy()
    tf_out = tf.expand_dims(np.ones(x_shape, dtype=np.float32), axis).numpy()
    assert np.array_equal(of_out, tf_out)

def _compare_ones_with_np(input_shape, device_type, machine_ids, device_counts):
    assert device_type in ["cpu", "gpu"]
    flow.clear_default_session()
    if device_type == "cpu":
        flow.config.cpu_device_num(device_counts)
    else:
        flow.config.gpu_device_num(device_counts)
    func_config = flow.FunctionConfig()
    func_config.default_placement_scope(flow.scope.placement(device_type, machine_ids))
    np_out_ones = np.ones(shape=input_shape, dtype=np.float32)

    @flow.global_function(type="train", function_config=func_config)
    def oneflow_ones() -> tp.Numpy:
        with flow.scope.placement(device_type, "0:0"):
            # A dummy trainable variable so the train job has something to optimize.
            v = flow.get_variable(
                shape=np_out_ones.shape,
                dtype=flow.float32,
                initializer=flow.zeros_initializer(),
                name="x_var",
            )
            of_ones = flow.ones(shape=input_shape, dtype=flow.float32)
            of_out = of_ones + v
        with flow.scope.placement(device_type, "0:0"):
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
            ).minimize(of_out)
        return of_ones

    of_out_ones = oneflow_ones()
    assert np.allclose(of_out_ones, np_out_ones)

def compare_with_tensorflow(
    device_type, data_type, input_shape, axis, keepdims, rtol=1e-05, atol=1e-05
):
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    @flow.global_function(function_config=func_config)
    def ReduceSumLikeJob(x: oft.Numpy.Placeholder(input_shape)):
        with flow.scope.placement(device_type, "0:0"):
            if data_type == "float16":
                x = flow.cast(x, dtype=flow.float16)
                like = flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
                y = reduce_sum_like(x, like, axis=axis)
                y = flow.cast(y, dtype=flow.float32)
            else:
                like = flow.math.reduce_sum(x, axis=axis, keepdims=keepdims)
                y = reduce_sum_like(x, like, axis=axis)
            return y

    x = np.random.rand(*input_shape).astype(np.float16).astype(np.float32)
    of_out = ReduceSumLikeJob(x).get()
    tf_out = tf.math.reduce_sum(x, axis=axis, keepdims=keepdims)
    if data_type == "float16":
        tf_out = tf.cast(tf_out, dtype=tf.float16)
        tf_out = tf.cast(tf_out, dtype=tf.float32)
    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=rtol, atol=atol), (
        of_out.numpy(),
        tf_out.numpy(),
    )

def _make_gpt_data_loader_func(
    data_file_prefix,
    seq_length,
    num_samples,
    batch_size,
    dtype,
    shuffle=None,
    random_seed=None,
    split_sizes=None,
    split_index=None,
    machine_num=1,
    device_num=1,
    nd_sbp=None,
    start_from_saved_progress=False,
):
    assert machine_num > 0
    assert device_num > 0 and device_num <= 4
    parallel_hierarchy = None
    if machine_num == 1:
        device_strs = "0:0-{}".format(device_num - 1)
    elif machine_num > 1:
        device_strs = [
            "{}:0-{}".format(machine_id, device_num - 1)
            for machine_id in range(machine_num)
        ]
        parallel_hierarchy = (machine_num, device_num)
    else:
        raise ValueError("invalid machine_num", machine_num)
    flow.clear_default_session()
    flow.config.cpu_device_num(4)
    flow.config.enable_legacy_model_io(True)
    func_cfg = flow.FunctionConfig()
    func_cfg.default_logical_view(flow.scope.consistent_view())

    @flow.global_function("predict", function_config=func_cfg)
    def gpt_loader_fn() -> flow.typing.Numpy:
        with flow.scope.placement("cpu", device_strs, parallel_hierarchy):
            tokens = flow.data.megatron_gpt_mmap_data_loader(
                data_file_prefix=data_file_prefix,
                seq_length=seq_length,
                num_samples=num_samples,
                batch_size=batch_size,
                dtype=dtype,
                shuffle=shuffle,
                random_seed=random_seed,
                split_sizes=split_sizes,
                split_index=split_index,
                nd_sbp=nd_sbp,
                start_from_saved_progress=start_from_saved_progress,
                name="GPTDataLoader",
            )
            if isinstance(nd_sbp, list) and len(nd_sbp) > 1:
                tokens = flow.hierarchical_parallel_cast(tokens, nd_sbp=["B", "B"])
        tokens = flow.hierarchical_parallel_cast(tokens, nd_sbp=["B"])
        return tokens

    check_point = flow.train.CheckPoint()
    check_point.init()
    return gpt_loader_fn

def compare_with_tensorflow(device_type, x_shape, data_type, axis):
    assert device_type in ["gpu", "cpu"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    if data_type == "float16":
        dtype = flow.float
    else:
        dtype = type_name_to_flow_type[data_type]

    @flow.global_function(type="train", function_config=func_config)
    def SoftmaxJob():
        with flow.scope.placement(device_type, "0:0"):
            x = flow.get_variable(
                "x",
                shape=x_shape,
                dtype=dtype,
                initializer=flow.random_uniform_initializer(minval=-1.0, maxval=1.0),
                trainable=True,
            )
            x1 = x
            x = flow.identity(x)
            if data_type == "float16":
                loss = flow.cast(
                    flow.nn.softmax(flow.cast(x, dtype=flow.float16), axis=axis),
                    dtype=flow.float,
                )
            else:
                loss = flow.nn.softmax(x, axis=axis)
            flow.watch(x, test_global_storage.Setter("x"))
            flow.watch_diff(x, test_global_storage.Setter("x_diff"))
            flow.watch(loss, test_global_storage.Setter("loss"))
            flow.watch_diff(loss, test_global_storage.Setter("loss_diff"))
            total_loss = loss * x1
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
            ).minimize(total_loss)
            return loss

    of_out = SoftmaxJob().get()
    with tf.GradientTape(persistent=True) as tape:
        x = tf.Variable(test_global_storage.Get("x"))
        tf_out = tf.nn.softmax(x, axis=axis)
    loss_diff = test_global_storage.Get("loss_diff")
    tf_x_diff = tape.gradient(tf_out, x, loss_diff)
    if data_type == "float16":
        tolerance = 0.001
    else:
        tolerance = 1e-05
    assert np.allclose(of_out.numpy(), tf_out.numpy(), rtol=tolerance, atol=tolerance)
    assert np.allclose(
        test_global_storage.Get("x_diff"),
        tf_x_diff.numpy(),
        rtol=tolerance,
        atol=tolerance,
    )

def GetSeveralLossesAsNumpy(enable_inplace, num_iters=10):
    flow.config.enable_debug_mode(True)
    flow.config.gpu_device_num(1)
    train_config = flow.FunctionConfig()
    train_config.default_logical_view(flow.scope.consistent_view())
    train_config.enable_inplace(enable_inplace)

    @flow.global_function(type="train", function_config=train_config)
    def PretrainJob():
        loss = BuildPreTrainNet(
            batch_size=FLAGS.batch_size,
            data_part_num=FLAGS.data_part_num,
            seq_length=FLAGS.seq_length,
            max_position_embeddings=FLAGS.max_position_embeddings,
            num_hidden_layers=1,
            num_attention_heads=FLAGS.num_attention_heads,
            hidden_dropout_prob=FLAGS.hidden_dropout_prob,
            attention_probs_dropout_prob=FLAGS.attention_probs_dropout_prob,
            vocab_size=FLAGS.vocab_size,
            type_vocab_size=FLAGS.type_vocab_size,
            max_predictions_per_seq=FLAGS.max_predictions_per_seq,
        )
        CreateOptimizer().minimize(loss)
        return loss

    check_point = flow.train.CheckPoint()
    check_point.load(FLAGS.model_load_dir)
    ret = [PretrainJob().get().mean() for _ in range(num_iters)]
    flow.clear_default_session()
    return np.array(ret)

def _of_assign_and_relu(value, dtype, device_type, assign=flow.assign):
    flow.clear_default_session()
    if os.getenv("ONEFLOW_TEST_CPU_ONLY") is None:
        flow.config.gpu_device_num(1)
    flow.config.cpu_device_num(1)
    func_config = flow.FunctionConfig()
    func_config.default_data_type(dtype)
    func_config.default_placement_scope(flow.scope.placement(device_type, "0:0"))

    @flow.global_function(function_config=func_config)
    def assign_fn(value_def: oft.Numpy.Placeholder(value.shape, dtype=dtype)):
        var = flow.get_variable(
            name="var",
            shape=value.shape,
            dtype=dtype,
            initializer=flow.constant_initializer(0),
        )
        assign(var, value_def)

    @flow.global_function(function_config=func_config)
    def relu_fn():
        var = flow.get_variable(
            name="var",
            shape=value.shape,
            dtype=dtype,
            initializer=flow.constant_initializer(0),
        )
        return flow.nn.relu(var)

    assign_fn(value)
    return relu_fn().get().numpy()

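# A minimal sketch of how _of_assign_and_relu could be checked (the input values
# are assumptions, not part of the original file). Since assign_fn writes `value`
# into the shared variable before relu_fn reads it, the result should equal an
# elementwise np.maximum(value, 0):
#
#   value = np.random.uniform(-1, 1, (10, 2)).astype(np.float32)
#   of_out = _of_assign_and_relu(value, flow.float, "gpu")
#   assert np.allclose(of_out, np.maximum(value, 0))
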
def test_dynamic_reshape(test_case):
    data_shape = (10, 10, 10)
    flow.config.gpu_device_num(2)
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)
    func_config.default_logical_view(flow.scope.mirrored_view())

    @flow.global_function(type="train", function_config=func_config)
    def DynamicReshapeJob(x: oft.ListNumpy.Placeholder(data_shape)):
        reshape_out1 = flow.reshape(x, (-1, 20))
        my_model = flow.get_variable(
            "my_model",
            shape=(20, 32),
            dtype=flow.float,
            initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
            trainable=True,
        )
        my_model = flow.cast_to_current_logical_view(my_model)
        mm_out = flow.matmul(reshape_out1, my_model)
        reshape_out2 = flow.reshape(mm_out, (-1, 8, 4))
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(reshape_out2)
        return reshape_out1

    data = [np.random.rand(*data_shape).astype(np.float32) for i in range(2)]
    out = DynamicReshapeJob(data).get().numpy_list()
    for i in range(2):
        test_case.assertTrue(np.array_equal(np.reshape(data[i], (50, 20)), out[i]))

def compare_with_tensorflow(device_type, in_shape, axis, k, data_type, sorted):
    assert device_type in ["gpu", "cpu"]
    assert data_type in ["float32", "double", "int8", "int32", "int64"]
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_logical_view(flow.scope.mirrored_view())
    func_config.default_data_type(flow.float)

    @flow.global_function(function_config=func_config)
    def TopKJob(
        input: oft.ListNumpy.Placeholder(
            tuple([dim + 10 for dim in in_shape]),
            dtype=type_name_to_flow_type[data_type],
        )
    ):
        with flow.scope.placement(device_type, "0:0"):
            return flow.math.top_k(input, axis, k, sorted)

    input = (np.random.random(in_shape) * 100).astype(type_name_to_np_type[data_type])
    of_out = TopKJob([input]).get().numpy_list()[0]
    if k <= in_shape[axis]:
        perm = get_perm_when_transpose_axis_to_last_dim(len(in_shape), axis)
        x = tf.transpose(input, perm)
        (_, indices) = tf.math.top_k(x, k, sorted)
        tf_out = tf.transpose(indices, get_inversed_perm(perm))
    else:
        tf_out = tf.argsort(input, axis, direction="DESCENDING", stable=True)
    assert np.array_equal(of_out, tf_out.numpy())

def test_multi_node_comm_net_dynamic(test_case):
    func_config = flow.FunctionConfig()
    func_config.default_logical_view(flow.scope.mirrored_view())
    func_config.default_placement_scope(flow.scope.placement("gpu", "0:0"))
    func_config.default_data_type(flow.float)
    flow.config.machine_num(2)
    flow.config.gpu_device_num(1)

    @flow.global_function(function_config=func_config)
    def ReluJob(x: oft.ListNumpy.Placeholder((10, 2))):
        with flow.scope.placement("gpu", "0:0"):
            out0 = flow.math.relu(x)
        with flow.scope.placement("gpu", "1:0"):
            out1 = flow.math.relu(out0)
        with flow.scope.placement("gpu", "0:0"):
            out2 = flow.math.relu(out1)
        return out2

    index = [-2, -1, 0, 1, 2]
    data = []
    for i in index:
        data.append(np.ones((5, 2), dtype=np.float32) * i)
    for i in range(5):
        ret = ReluJob([data[i]]).get().numpy_list()[0]
        print(ret)
        if index[i] > 0:
            test_case.assertTrue(
                np.array_equal(ret, np.ones((5, 2), dtype=np.float32) * index[i])
            )
        else:
            test_case.assertTrue(
                np.array_equal(ret, np.zeros((5, 2), dtype=np.float32))
            )

def test_name_scope(test_case):
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float)

    def get_var(var_name):
        return flow.get_variable(
            name=var_name,
            shape=(2, 256, 14, 14),
            dtype=flow.float32,
            initializer=flow.random_uniform_initializer(),
        )

    @flow.global_function(function_config=func_config)
    def test_name_scope_job():
        # Nested namespaces join with "-", so var1 and var2 both resolve to the
        # variable named "backbone-branch-var"; var3 spells that full name out
        # directly, so all three must share the same storage.
        with flow.scope.namespace("backbone"):
            with flow.scope.namespace("branch"):
                var1 = get_var("var")
            with flow.scope.namespace("branch"):
                var2 = get_var("var")
        var3 = get_var("backbone-branch-var")
        return (var1, var2, var3)

    (var1, var2, var3) = test_name_scope_job().get()
    test_case.assertTrue(np.array_equal(var1.numpy(), var2.numpy()))
    test_case.assertTrue(np.array_equal(var1.numpy(), var3.numpy()))

def _test_reshape(test_case):
    flow.clear_default_session()
    flow.config.gpu_device_num(4)
    flow.config.enable_legacy_model_io(True)
    flow.config.enable_model_io_v2(True)
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)

    @flow.global_function(type="train", function_config=func_config)
    def FlowJob(x: flow.typing.Numpy.Placeholder((4, 6), dtype=flow.float)):
        with flow.scope.placement("gpu", "0:0-3", (2, 2)):
            v = flow.get_variable(
                "x",
                shape=(4, 6),
                dtype=flow.float,
                initializer=flow.constant_initializer(0),
                trainable=True,
                nd_sbp=["S(0)", "S(1)"],
            )
            x = flow.hierarchical_parallel_cast(x, nd_sbp=["S(0)", "S(1)"])
            x += v
            loss = flow.reshape(x, (4, 2, 3))
        loss = flow.hierarchical_parallel_cast(loss, nd_sbp=["S(0)"])
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
        ).minimize(loss)
        return loss

    x = np.random.randn(4, 6).astype(np.float32)
    my_loss = FlowJob(x).get()
    test_case.assertTrue(np.allclose(x.flatten(), my_loss.numpy().flatten()))

def run_job(test_case, device=None, in_type=None, shape=None):
    assert shape is not None
    flow.clear_default_session()
    func_config = flow.FunctionConfig()

    @flow.global_function(type="train", function_config=func_config)
    def FuseBnAddReluJob(x: oft.Numpy.Placeholder(shape, dtype=in_type)) -> oft.Numpy:
        addend = flow.constant_like(x, 2)
        with flow.scope.placement(device, "0:0-0"):
            x = (
                flow.get_variable(
                    "x1",
                    shape=shape,
                    dtype=in_type,
                    initializer=flow.random_uniform_initializer(minval=-10, maxval=10),
                    trainable=True,
                )
                + x
            )
            loss = flow.nn.relu(_batch_norm(x, last=False) + addend) + 1
            flow.optimizer.SGD(
                flow.optimizer.PiecewiseConstantScheduler([], [0.0001]), momentum=0
            ).minimize(loss)
            return loss

    np_in_type = dtype_util.convert_oneflow_dtype_to_numpy_dtype(in_type)
    x = (np.random.rand(*shape) * 10).astype(np_in_type)
    FuseBnAddReluJob(x)

def _make_dim_gather_fn(test_case, sample, datashape):
    flow.clear_default_session()
    func_config = flow.FunctionConfig()
    func_config.default_data_type(flow.float32)
    func_config.default_logical_view(flow.scope.mirrored_view())
    func_config.default_placement_scope(flow.scope.placement("gpu", "0:0"))

    def _compare_diff(blob: oft.ListNumpy):
        test_case.assertTrue(np.allclose(sample["grad"], blob[0]))

    @flow.global_function(type="train", function_config=func_config)
    def DynamicDimGatherJob(
        params_def: oft.ListNumpy.Placeholder(datashape, dtype=flow.float32),
        index_def: oft.ListNumpy.Placeholder(datashape, dtype=flow.int32),
    ) -> oft.ListNumpy:
        x_var = flow.get_variable(
            "input",
            shape=(1,),
            dtype=flow.float32,
            initializer=flow.constant_initializer(0),
        )
        x_var = flow.cast_to_current_logical_view(x_var)
        x = x_var + params_def
        y = flow.dim_gather(x, sample["dim"], index_def)
        flow.optimizer.SGD(
            flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
        ).minimize(y)
        flow.watch_diff(x, _compare_diff)
        return y

    return DynamicDimGatherJob

def _get_train_conf():
    train_conf = flow.FunctionConfig()
    train_conf.default_data_type(flow.float)
    train_conf.indexed_slices_optimizer_conf(
        dict(include_op_names=dict(op_name=["wide_embedding", "deep_embedding"]))
    )
    return train_conf

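# A minimal sketch of how _get_train_conf might be consumed, mirroring the
# flow.global_function pattern used throughout this section (the job body is an
# assumption; only variables named "wide_embedding" / "deep_embedding" would be
# updated through the indexed-slices optimizer path configured above):
#
#   train_conf = _get_train_conf()
#
#   @flow.global_function(type="train", function_config=train_conf)
#   def train_job():
#       loss = ...  # wide & deep net built around those embedding variables
#       flow.optimizer.SGD(
#           flow.optimizer.PiecewiseConstantScheduler([], [0.001]), momentum=0
#       ).minimize(loss)
#       return loss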