def _test_0d_randint(test_case, device, shape, low, high):
    y1 = flow.randint(low, high, shape, device=flow.device(device))
    y2 = flow.randint(low, high, shape, device=flow.device(device))
    # The tested shapes contain a 0, so y1.numpy() and y2.numpy() are both
    # empty ([] and []) and allclose holds trivially.
    test_case.assertTrue(np.allclose(y1.numpy(), y2.numpy(), atol=1e-4, rtol=1e-4))
    test_case.assertTrue(shape == y1.shape)
def setup_class(cls):
    cls.config_file_path = "/workspace/models/nlp/chinese_wwm_ext/bert_config.json"
    cls.tf_checkpoint_path = "/workspace/models/nlp/chinese_wwm_ext/bert_model.ckpt"
    cls.huggingface_model_path = "/workspace/models/nlp/chinese_wwm_ext"
    cls.model_path = "/workspace/models/nlp/chinese_wwm_ext/oneflow"
    model_cfg = dict(
        type="OFBertForPreTraining",
        config=dict(type="ConfigBase", json_file=cls.config_file_path),
    )
    cls.config = build_config(model_cfg["config"])
    cls.model_tf = build_of_models(model_cfg)
    cls.model_hf = build_of_models(model_cfg)
    cls.model_base = transformers.BertModel.from_pretrained(cls.huggingface_model_path)
    cls.model_base.eval()
    cls.model_base_mlm = transformers.BertForPreTraining.from_pretrained(
        cls.huggingface_model_path
    )
    cls.model_base_mlm.eval()
    model_cfg.update({"model_path": cls.model_path})
    cls.model = build_of_models(model_cfg)
    cls.model.eval()
    cls.batch_size = 4
    cls.seq_length = 10
    cls.tokens_tensor = {
        "input_ids": flow.randint(
            low=1, high=100, size=(cls.batch_size, cls.seq_length), dtype=flow.long
        ),
        "attention_mask": flow.randint(
            low=0, high=2, size=(cls.batch_size, cls.seq_length), dtype=flow.long
        ),
        "token_type_ids": flow.randint(
            low=0, high=2, size=(cls.batch_size, cls.seq_length), dtype=flow.long
        ),
        "position_ids": flow.randint(
            low=0,
            high=cls.seq_length,
            size=(cls.batch_size, cls.seq_length),
            dtype=flow.long,
        ),
    }
def __init__(
    self,
    batch_size,
    image_size=224,
    num_classes=1000,
    placement=None,
    sbp=None,
    channel_last=False,
):
    super().__init__()
    if channel_last:
        self.image_shape = (batch_size, image_size, image_size, 3)
    else:
        self.image_shape = (batch_size, 3, image_size, image_size)
    self.label_shape = (batch_size,)
    self.num_classes = num_classes
    self.placement = placement
    self.sbp = sbp

    if self.placement is not None and self.sbp is not None:
        self.image = flow.nn.Parameter(
            flow.randint(
                0,
                high=256,
                size=self.image_shape,
                dtype=flow.float32,
                placement=self.placement,
                sbp=self.sbp,
            ),
            requires_grad=False,
        )
        self.label = flow.nn.Parameter(
            flow.randint(
                0,
                high=self.num_classes,
                size=self.label_shape,
                placement=self.placement,
                sbp=self.sbp,
            ).to(dtype=flow.int32),
            requires_grad=False,
        )
    else:
        self.image = flow.randint(
            0, high=256, size=self.image_shape, dtype=flow.float32, device="cuda"
        )
        self.label = flow.randint(
            0, high=self.num_classes, size=self.label_shape, device="cuda"
        ).to(dtype=flow.int32)
def _test_consistent_randint(test_case, shape, placement, sbp, dtype):
    x = flow.randint(1, 10, shape, placement=placement, sbp=sbp, dtype=dtype)
    test_case.assertEqual(x.shape, flow.Size(shape))
    test_case.assertEqual(x.sbp, sbp)
    test_case.assertEqual(x.placement, placement)
    test_case.assertEqual(x.dtype, dtype)
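# A minimal sketch of how the checker above might be driven. The function name
# below is hypothetical, and the single-rank CPU placement, broadcast sbp tuple,
# and int64 dtype are illustrative assumptions, not values taken from the
# original test suite.
def _example_drive_consistent_randint(test_case):
    placement = flow.placement("cpu", {0: [0]})
    sbp = (flow.sbp.broadcast,)
    _test_consistent_randint(test_case, (4, 2), placement, sbp, flow.int64)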
def build(self):
    # shape, placement, sbp and dtype are captured from the enclosing test scope.
    x = flow.randint(1, 10, shape, placement=placement, sbp=sbp, dtype=dtype)
    return x
def _rand_input(placement=None, sbp=None):
    # Seed the generator so that every call (and every rank) draws the same input.
    generator = flow.Generator()
    generator.manual_seed(0)
    return flow.randint(0, 10, (8,), generator=generator, placement=placement, sbp=sbp)
def _test_with_generator(test_case, device, shape, low, high):
    gen = flow.Generator()
    gen.manual_seed(0)
    y1 = flow.randint(
        low, high, shape, dtype=flow.float32, device=flow.device(device), generator=gen
    )
    gen.manual_seed(0)
    y2 = flow.randint(
        low, high, shape, dtype=flow.float32, device=flow.device(device), generator=gen
    )
    test_case.assertTrue(np.allclose(y1.numpy(), y2.numpy(), atol=1e-4, rtol=1e-4))
def test_model_split(test_case):
    pred = flow.randn(8, 10)
    label = flow.randint(0, 10, (8,))
    placement = flow.placement("cuda", list(range(flow.env.get_world_size())))
    pred = pred.to_global(placement=placement, sbp=flow.sbp.broadcast())
    label = label.to_global(placement=placement, sbp=flow.sbp.broadcast())
    _compare_with_nn_cross_entropy_loss(
        test_case, pred, label, flow.sbp.split(1), flow.sbp.broadcast()
    )
def test_partial_fc(test_case):
    p = flow.env.all_device_placement("cuda")
    w = flow.randn(50000, 128, placement=p, sbp=flow.sbp.broadcast)
    label = flow.randint(0, 50000, (512,), placement=p, sbp=flow.sbp.broadcast)
    num_sample = 5000
    out = flow.distributed_partial_fc_sample(w, label, num_sample)
    test_case.assertTrue(out[0].shape == flow.Size([512]))
    test_case.assertTrue(out[1].shape == flow.Size([5000]))
    test_case.assertTrue(out[2].shape == flow.Size([5000, 128]))
def test_partial_fc(test_case):
    p = flow.env.all_device_placement("cuda")
    w = flow.randn(50000, 128, placement=p, sbp=flow.sbp.broadcast, requires_grad=True)
    label = flow.randint(0, 50000, (512,), placement=p, sbp=flow.sbp.broadcast)
    num_sample = 5000
    out = flow.distributed_partial_fc_sample(w, label, num_sample)
    test_case.assertTrue(out[0].shape == flow.Size([512]))
    test_case.assertTrue(out[1].shape == flow.Size([5000]))
    test_case.assertTrue(out[2].shape == flow.Size([5000, 128]))
    # test the gradient function
    sample_weight = out[2]
    sample_weight.sum().backward()
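# For reference, a hedged reading of the three outputs asserted above; the
# names are descriptive assumptions, the shapes follow the test:
#   mapped_label, sampled_label, sampled_weight = out
#   mapped_label:   (512,)        labels remapped into the sampled class space
#   sampled_label:  (5000,)       indices of the classes kept by the sampler
#   sampled_weight: (5000, 128)   rows of w gathered for the sampled classes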
def _test_different_dtype(test_case, device, shape, low, high):
    for dtype in [
        flow.uint8,
        flow.int8,
        flow.int32,
        flow.int64,
        flow.float32,
        flow.float64,
    ]:
        y = flow.randint(low, high, shape, dtype=dtype, device=flow.device(device))
        test_case.assertTrue(y.dtype == dtype)
        test_case.assertTrue(y.shape == shape)
def test_consistent_different_types(test_case):
    for dtype in [
        flow.int8,
        flow.int32,
        flow.int64,
        flow.float32,
        flow.float64,
    ]:
        placement = flow.placement("cpu", {0: [0]})
        sbp = (flow.sbp.broadcast,)
        x = flow.randint(0, 16, (10, 1), placement=placement, sbp=sbp, dtype=dtype)
        test_case.assertEqual(x.dtype, dtype)
        test_case.assertEqual(x.sbp, sbp)
        test_case.assertEqual(x.placement, placement)
def test_2d_split(test_case):
    pred = flow.randn(8, 10)
    label = flow.randint(0, 10, (8,))
    placement = flow.placement(
        "cuda", np.array(range(flow.env.get_world_size())).reshape(2, 2)
    )
    pred = pred.to_global(
        placement=placement, sbp=[flow.sbp.broadcast(), flow.sbp.broadcast()]
    )
    label = label.to_global(
        placement=placement, sbp=[flow.sbp.broadcast(), flow.sbp.broadcast()]
    )
    _compare_with_nn_cross_entropy_loss(
        test_case,
        pred,
        label,
        [flow.sbp.split(0), flow.sbp.split(1)],
        [flow.sbp.split(0), flow.sbp.broadcast()],
    )
def _test_consistent_rand(test_case, low, high, shape, placement, sbp):
    x = flow.randint(low, high, shape, placement=placement, sbp=sbp)
    test_case.assertEqual(x.shape, shape)
    test_case.assertEqual(x.sbp, sbp)
    test_case.assertEqual(x.placement, placement)
def test_local(test_case):
    pred = flow.randn(8, 10).to("cuda")
    label = flow.randint(0, 10, (8,)).to("cuda")
    _compare_with_nn_cross_entropy_loss(test_case, pred, label)
def __init__(self, batch_size=8, feat1=10, feat2=8, device="cuda", parallel_mode=None):
    input = flow.randn(batch_size, feat1).to(device)
    param1 = flow.randn(feat2, feat1).to(device)
    param2 = flow.randn(feat2, feat1).to(device)
    target = flow.randint(0, 10, (batch_size,)).to(device)

    ranks = np.array(range(flow.env.get_world_size()))
    placement = flow.placement(device, ranks)
    self.input = input.to_global(placement, sbp=flow.sbp.broadcast)
    self.param1 = param1.to_global(placement, sbp=flow.sbp.broadcast)
    self.param2 = param2.to_global(placement, sbp=flow.sbp.broadcast)
    self.target = target.to_global(placement, sbp=flow.sbp.broadcast)

    self.input_sbp = None
    self.target_sbp = None
    self.param1_sbp = None
    self.param2_sbp = None
    self.placement1 = None
    self.placement2 = None

    if parallel_mode is not None:
        assert isinstance(parallel_mode, (str, list, tuple))
        if isinstance(parallel_mode, str):
            parallel_mode = [parallel_mode]
        # Normalize to upper case so the comparisons below match the assertion
        # (otherwise a lowercase "dp" would pass the check but hit ValueError).
        parallel_mode = [p.upper() for p in parallel_mode]
        assert all(p in ("DP", "MP", "PP") for p in parallel_mode)
        assert len(parallel_mode) > 0 and len(parallel_mode) <= 2

        self.input_sbp = []
        self.target_sbp = []
        self.param1_sbp = []
        self.param2_sbp = []
        has_pp = False
        for p in parallel_mode:
            if p == "DP":
                self.input_sbp.append(flow.sbp.split(0))
                self.target_sbp.append(flow.sbp.split(0))
                self.param1_sbp.append(flow.sbp.broadcast())
                self.param2_sbp.append(flow.sbp.broadcast())
            elif p == "MP":
                self.input_sbp.append(flow.sbp.broadcast())
                self.target_sbp.append(flow.sbp.broadcast())
                self.param1_sbp.append(flow.sbp.split(0))
                self.param2_sbp.append(flow.sbp.split(0))
            elif p == "PP":
                ranks = ranks.reshape(2, -1)
                self.placement1 = flow.placement(device, ranks[0])
                self.placement2 = flow.placement(device, ranks[1])
                has_pp = True
            else:
                raise ValueError

        if len(parallel_mode) > 1 and not has_pp:
            ranks = ranks.reshape(2, -1)
            self.placement1 = flow.placement(device, ranks)
            self.placement2 = flow.placement(device, ranks)

        if len(self.input_sbp) == 0:
            self.input_sbp = None
        if len(self.target_sbp) == 0:
            self.target_sbp = None
        if len(self.param1_sbp) == 0:
            self.param1_sbp = None
        if len(self.param2_sbp) == 0:
            self.param2_sbp = None
def test_graph_process_num_greater_than_device(test_case):
    # NOTE(chengcheng): this test case is ONLY for 1n8d running in a 4-device
    # env, i.e. the process num is greater than the device num.
    if not (flow.env.get_node_size() == 1 and flow.env.get_world_size() == 8):
        return
    if not oneflow.sysconfig.has_rpc_backend_grpc():
        return

    BATCH_SIZE = 64
    BROADCAST = [flow.sbp.broadcast]
    P0 = flow.placement("cpu", ranks=[0, 1, 2, 3])
    P1 = flow.placement("cpu", ranks=[4, 5, 6, 7])

    class Stage0Module(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.flatten = flow.nn.Flatten()
            self.linear0 = flow.nn.Linear(28 * 28, 512)
            self.relu0 = flow.nn.ReLU()

        def forward(self, x):
            out = self.flatten(x)
            out = self.linear0(out)
            out = self.relu0(out)
            return out

    class Stage1Module(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear1 = flow.nn.Linear(512, 512)
            self.relu1 = flow.nn.ReLU()
            self.linear2 = flow.nn.Linear(512, 10)
            self.relu2 = flow.nn.ReLU()

        def forward(self, x):
            out = self.linear1(x)
            out = self.relu1(out)
            out = self.linear2(out)
            out = self.relu2(out)
            return out

    class PipelineModule(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.m_stage0 = Stage0Module()
            self.m_stage1 = Stage1Module()
            self.m_stage0.to_global(placement=P0, sbp=BROADCAST)
            self.m_stage1.to_global(placement=P1, sbp=BROADCAST)

        def forward(self, x):
            out_stage0 = self.m_stage0(x)
            in_stage1 = out_stage0.to_global(placement=P1, sbp=flow.sbp.split(0))
            out_stage1 = self.m_stage1(in_stage1)
            return out_stage1

    module_pipeline = PipelineModule()
    sgd = flow.optim.SGD(module_pipeline.parameters(), lr=0.001)

    class PipelineGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.module_pipeline = module_pipeline
            self.module_pipeline.m_stage0.config.stage_id = 0
            self.module_pipeline.m_stage1.config.stage_id = 1
            self.loss_fn = flow.nn.CrossEntropyLoss(reduction="none")
            self.config.set_gradient_accumulation_steps(2)
            self.add_optimizer(sgd)

        def build(self, x, y):
            out = self.module_pipeline(x)
            loss = self.loss_fn(out, y).sum()
            loss = loss.to_global(placement=P1, sbp=BROADCAST)
            loss.backward()
            return loss

    graph_pipeline = PipelineGraph()
    graph_pipeline.debug(1)

    x = flow.randn(BATCH_SIZE, 1, 28, 28)
    x = x.to_global(P0, sbp=flow.sbp.split(0))
    y = flow.randint(0, 10, (BATCH_SIZE, 1))
    y = y.to_global(P1, sbp=flow.sbp.split(0))

    for i in range(2):
        loss = graph_pipeline(x, y)
        print(">>>>>>>", flow.env.get_rank(), loss.to_local().numpy(), flush=True)
def _test_graph_reshape_acc(test_case):
    class StageLayerModule(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear1 = flow.nn.Linear(10, 8, False)
            self.linear2 = flow.nn.Linear(8, 10, False)
            flow.nn.init.constant_(self.linear1.weight, 0.023)
            flow.nn.init.constant_(self.linear2.weight, 1.23)

        def forward(self, x):
            out0 = self.linear1(x)
            out0 = flow.reshape(out0, (-1, 2, 4))
            out0 = out0 + 1.0
            out0 = out0 * 2.0
            out0 = flow.reshape(out0, (-1, 8))
            out1 = self.linear2(out0)
            return out1

    P0 = flow.placement("cuda", {0: [0]})
    P1 = flow.placement("cuda", {0: [1]})
    B = flow.sbp.broadcast

    class PipelineModule(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.layer_0 = StageLayerModule()
            self.layer_1 = StageLayerModule()
            self.layer_0.to_consistent(P0, B)
            self.layer_1.to_consistent(P1, B)

        def forward(self, x):
            # stage 0
            x = flow.flatten(x, start_dim=1)
            in0 = x.to_consistent(P0, B)
            out0 = self.layer_0(in0)
            # stage 1
            in1 = out0.to_consistent(P1, B)
            out1 = self.layer_1(in1)
            return out1

    pp_m = PipelineModule()
    pp_m.train()
    sgd = flow.optim.SGD(pp_m.parameters(), lr=0.001)

    class PipelineGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.pp_m = pp_m
            self.pp_m.layer_0.config.stage_id = 0
            self.pp_m.layer_1.config.stage_id = 1
            self.loss_fn = flow.nn.CrossEntropyLoss()
            self.config.set_gradient_accumulation_steps(2)
            self.add_optimizer(sgd)

        def build(self, x, y):
            out = self.pp_m(x)
            y = y.to_consistent(P1, B)
            loss = self.loss_fn(out, y)
            loss.backward()
            return loss

    pp_g = PipelineGraph()

    for i in range(20):
        x = flow.randn(6, 2, 5)
        y = flow.randint(0, 10, (6,))
        x = x.to_consistent(P0, B)
        y = y.to_consistent(P1, B)
        out = pp_g(x, y)
def test_consistent_naive(test_case):
    placement = flow.placement("cpu", {0: [0]})
    sbp = (flow.sbp.broadcast,)
    x = flow.randint(0, 16, (10, 1), placement=placement, sbp=sbp)
    test_case.assertEqual(x.sbp, sbp)
    test_case.assertEqual(x.placement, placement)
def _test_0rank(test_case, device, shape, low, high):
    y1 = flow.randint(low, high, shape, device=flow.device(device))
    test_case.assertTrue(y1.shape == shape)
def train_with_graph(iter_num=1):
    # zero_stage and test_case are captured from the enclosing test scope.
    P = flow.placement("cuda", ranks=[0, 1])
    B = flow.sbp.broadcast
    S0 = flow.sbp.split(0)
    linear = flow.nn.Linear(8, 4)
    linear = linear.to_global(placement=P, sbp=B)
    flow.nn.init.constant_(linear.weight, 2.068758)
    flow.nn.init.constant_(linear.bias, 0.23)
    of_sgd = flow.optim.SGD(linear.parameters(), lr=0.001, momentum=0.9)
    grad_scaler = flow.amp.StaticGradScaler(200)
    x = flow.randint(1, 100, (4, 8), dtype=flow.float32, placement=P, sbp=S0)

    class LinearTrainGraphWithZeRO(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.linear = linear
            self.add_optimizer(of_sgd)
            self.config.enable_amp(True)
            self.set_grad_scaler(grad_scaler)
            if zero_stage == 1:
                print("zero stage 1 optimization")
                self.config.set_zero_redundancy_optimizer_mode("distributed_split")
                self.config.set_zero_redundancy_optimizer_min_size_after_split(1)
            if zero_stage == 2:
                self.config.set_zero_redundancy_optimizer_mode("distributed_split")
                self.config.set_zero_redundancy_optimizer_min_size_after_split(1)
                flow.boxing.nccl.enable_use_compute_stream(True)
            if zero_stage == 3:
                print("zero stage 3 optimization")
                self.config.set_zero_redundancy_optimizer_mode("distributed_split")
                self.config.set_zero_redundancy_optimizer_min_size_after_split(1)
                flow.boxing.nccl.enable_use_compute_stream(True)
                flow.boxing.nccl.disable_group_boxing_by_dst_parallel(True)

        def build(self, x):
            out = self.linear(x)
            loss = out.sum()
            loss.backward()
            return out

    class LinearEvalGraphWithZeRO(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.linear = linear
            self.config.enable_amp(True)

        def build(self, x):
            out = self.linear(x)
            return out

    linear_t_g = LinearTrainGraphWithZeRO()
    linear_e_g = LinearEvalGraphWithZeRO()

    def one_train_iter():
        out = linear_t_g(x)

    def one_eval_iter():
        out = linear_e_g(x)

    for i in range(iter_num):
        one_train_iter()

    # After the pass rewrite in the training graph, the parameters' sbp has
    # been changed from flow.sbp.broadcast to flow.sbp.split(0).
    test_case.assertEqual(linear.weight.sbp[0], S0)
    test_case.assertEqual(linear.bias.sbp[0], S0)

    # In the evaluation graph, the parameters' sbp is flow.sbp.split(0), but
    # their consumers will consume them as flow.sbp.broadcast.
    one_eval_iter()
def __init__(
    self,
    num_dense_fields: int = 13,
    num_wide_sparse_fields: int = 2,
    num_deep_sparse_fields: int = 26,
    batch_size: int = 1,
    total_batch_size: int = 1,
    placement=None,
    sbp=None,
):
    super(SyntheticDataLoader, self).__init__()
    print("use synthetic data")
    self.batch_size = batch_size
    self.total_batch_size = total_batch_size
    self.placement = placement
    self.sbp = sbp

    self.label_shape = (batch_size, 1)
    self.dense_fields_shape = (batch_size, num_dense_fields)
    self.wide_sparse_fields_shape = (batch_size, num_wide_sparse_fields)
    self.deep_sparse_fields_shape = (batch_size, num_deep_sparse_fields)

    if self.placement is not None and self.sbp is not None:
        self.labels = flow.randint(
            0,
            high=2,
            size=self.label_shape,
            dtype=flow.int32,
            placement=self.placement,
            sbp=self.sbp,
        )
        self.dense_fields = flow.randint(
            0,
            high=256,
            size=self.dense_fields_shape,
            dtype=flow.float,
            placement=self.placement,
            sbp=self.sbp,
        )
        self.wide_sparse_fields = flow.randint(
            0,
            high=256,
            size=self.wide_sparse_fields_shape,
            dtype=flow.int32,
            placement=self.placement,
            sbp=self.sbp,
        )
        self.deep_sparse_fields = flow.randint(
            0,
            high=256,
            size=self.deep_sparse_fields_shape,
            dtype=flow.int32,
            placement=self.placement,
            sbp=self.sbp,
        )
    else:
        self.labels = flow.randint(
            0, high=2, size=self.label_shape, dtype=flow.int32, device="cpu"
        )
        self.dense_fields = flow.randint(
            0, high=256, size=self.dense_fields_shape, dtype=flow.float, device="cpu"
        )
        self.wide_sparse_fields = flow.randint(
            0,
            high=256,
            size=self.wide_sparse_fields_shape,
            dtype=flow.int32,
            device="cpu",
        )
        self.deep_sparse_fields = flow.randint(
            0,
            high=256,
            size=self.deep_sparse_fields_shape,
            dtype=flow.int32,
            device="cpu",
        )
def test_graph_reuse_var(test_case):
    rank = flow.env.get_rank()
    P = flow.placement("cuda", ranks=[0, 1])
    B = flow.sbp.broadcast

    class ReuseVarModule(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear1 = flow.nn.Linear(2, 2)
            self.linear2 = flow.nn.Linear(2, 2)
            # Reuse linear1's parameter in linear2.
            self.linear2.weight = self.linear1.weight

        def forward(self, x):
            # Allow the user to access a parameter outside its module.
            self.linear1.weight
            x = self.linear1(x)
            x = self.linear2(x)
            return x

    reuse_var_m = ReuseVarModule()
    reuse_var_m.to_global(placement=P, sbp=B)
    of_sgd = flow.optim.SGD(reuse_var_m.parameters(), lr=0.001, momentum=0.9)

    class ReuseVarGraph(flow.nn.Graph):
        def __init__(self):
            super().__init__()
            self.reuse_var_m = reuse_var_m
            self.add_optimizer(of_sgd)

        def build(self, x):
            x = self.reuse_var_m(x)
            loss = x.sum()
            loss.backward()
            return loss

    x = flow.randint(0, 1, (2, 2), placement=P, sbp=B, dtype=flow.float32)
    reuse_var_g = ReuseVarGraph()
    loss = reuse_var_g(x)

    # Check the lazy tensor builder.
    block = reuse_var_g.reuse_var_m
    test_case.assertEqual(
        block.linear1.weight.lazy_origin_builder().name,
        "reuse_var_m.linear1.weight",
    )
    test_case.assertEqual(
        block.linear1.weight.lazy_origin_builder().name,
        block.linear2.weight.lazy_origin_builder().name,
    )

    # Check the optimizer's variable list: the reused linear2.weight does not
    # appear as a separate variable.
    var_list = [
        "reuse_var_m.linear1.weight",
        "reuse_var_m.linear1.bias",
        "reuse_var_m.linear2.bias",
    ]
    var_list_in_conf = reuse_var_g._graph_proto.job_conf.train_conf.optimizer_conf[
        0
    ].variable_op_names
    test_case.assertEqual(len(var_list_in_conf), 3)
    for idx in range(3):
        test_case.assertEqual(var_list[idx], var_list_in_conf[idx])
        if rank == 0:
            print(var_list_in_conf[idx])
def build(self):
    # low, high, shape, placement and sbp are captured from the enclosing test scope.
    x = flow.randint(low, high, shape, placement=placement, sbp=sbp)
    return x
def test_non_default_device(test_case):
    x = flow.randint(low=1, high=2, size=flow.Size((2, 3)), device="cuda:1")
    test_case.assertEqual(x.device, flow.device("cuda:1"))
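# A quick-reference sketch of the basic contract exercised throughout this
# file: flow.randint mirrors torch.randint, sampling integers uniformly from
# the half-open interval [low, high). The function name below is hypothetical,
# and the int64 default dtype is an assumption carried over from the torch API.
def _example_randint_contract():
    x = flow.randint(0, 10, (2, 3))  # entries drawn from {0, ..., 9}
    assert x.shape == flow.Size((2, 3))
    assert x.dtype == flow.int64  # assumed torch-style default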