def _test_broadcast_buffer(test_case, dev_type):
    rank = flow.env.get_rank()

    class CustomModule(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.register_buffer("buf", flow.tensor([1, 2]) * (rank + 1))

        def forward(self, x):
            res = self.buf + x
            self.buf.copy_(x)
            return res

    x = flow.tensor([2, 3]) * (rank + 1)
    x = x.to(dev_type)

    m = CustomModule()
    m = m.to(dev_type)
    m = ddp(m)

    y1 = m(x)
    y2 = m(x)

    m = CustomModule()
    m = m.to(dev_type)
    m = ddp(m, broadcast_buffers=False)

    y3 = m(x)
    y4 = m(x)

    if rank == 0:
        test_case.assertTrue(np_allclose_with_shape(y1.numpy(), np.array([3, 5])))
        test_case.assertTrue(np_allclose_with_shape(y2.numpy(), np.array([4, 6])))
        test_case.assertTrue(np_allclose_with_shape(y3.numpy(), np.array([3, 5])))
        test_case.assertTrue(np_allclose_with_shape(y4.numpy(), np.array([4, 6])))
    elif rank == 1:
        test_case.assertTrue(np_allclose_with_shape(y1.numpy(), np.array([5, 8])))
        test_case.assertTrue(np_allclose_with_shape(y2.numpy(), np.array([6, 9])))
        test_case.assertTrue(np_allclose_with_shape(y3.numpy(), np.array([6, 10])))
        test_case.assertTrue(np_allclose_with_shape(y4.numpy(), np.array([8, 12])))
    else:
        raise ValueError()
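# The DDP tests in this section assume a preamble along these lines (a
# sketch; the exact helper lives in the original test file and may differ):
import numpy as np
import oneflow as flow
from oneflow.nn.parallel import DistributedDataParallel as ddp


def np_allclose_with_shape(a, b, *args, **kwargs):
    # Compare shape as well as values, since np.allclose alone would
    # broadcast mismatched shapes and report a false positive.
    return a.shape == b.shape and np.allclose(a, b, *args, **kwargs)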
def train_eager(self):
    self.train_module = ddp(self.train_module)
    for epoch in range(self.start_epoch, self.cfg.num_epoch):
        self.train_module.train()
        one_epoch_steps = len(self.train_data_loader)
        for steps in range(one_epoch_steps):
            self.global_step += 1
            image, label = self.train_data_loader()
            image = image.to("cuda")
            label = label.to("cuda")

            features_fc7 = self.train_module(image, label)
            features_fc7 = self.margin_softmax(features_fc7, label) * 64
            loss = self.of_cross_entropy(features_fc7, label)

            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

            loss = loss.numpy()
            self.losses.update(loss, 1)

            self.callback_logging(
                self.global_step, self.losses, epoch, False,
                self.scheduler.get_last_lr()[0],
            )
            self.callback_verification(self.global_step, self.backbone)
            self.scheduler.step()
        self.callback_checkpoint(self.global_step, epoch, self.train_module)
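# A multi-process script like the above is typically started with OneFlow's
# distributed launcher (a sketch; the flags mirror torch.distributed.launch,
# and the script name is a placeholder):
#
#   python3 -m oneflow.distributed.launch --nproc_per_node 2 train.py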
def init_model(self):
    self.logger.print("***** Model Init *****", print_ranks=[0])
    start_t = time.perf_counter()
    if self.is_global:
        placement = flow.env.all_device_placement("cuda")
        self.model = self.model.to_global(placement=placement, sbp=flow.sbp.broadcast)
    else:
        self.model = self.model.to("cuda")
    if self.load_path is None:
        self.legacy_init_parameters()
    else:
        self.load_state_dict()
    if self.ddp:
        self.model = ddp(self.model)
    if self.save_init:
        self.save("init")
    end_t = time.perf_counter()
    self.logger.print(
        f"***** Model Init Finish, time elapsed: {end_t - start_t:.5f} s *****",
        print_ranks=[0],
    )
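# Minimal sketch of the global-tensor path taken when `is_global` is set,
# shown on a plain tensor (assumes the configured CUDA devices are visible):
#
#   import oneflow as flow
#
#   t = flow.ones(2, 2)
#   placement = flow.env.all_device_placement("cuda")
#   # Broadcast SBP replicates the same data on every device in the placement;
#   # the non-global branch instead keeps a local tensor and relies on ddp.
#   t_global = t.to_global(placement=placement, sbp=flow.sbp.broadcast)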
def _test_ddp_with_unused_param(test_case, dev_type):
    class Model(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.w = flow.nn.Parameter(flow.Tensor([1]))
            self.used_only_in_rank0 = flow.nn.Parameter(flow.Tensor([2]))
            self.unused_in_all_ranks = flow.nn.Parameter(flow.Tensor([3]))

        def forward(self, x):
            x = x * self.w
            if flow.env.get_rank() == 0:
                x = x * self.used_only_in_rank0
            return x

    rank = flow.env.get_rank()
    if rank == 0:
        x = flow.Tensor([1])
    elif rank == 1:
        x = flow.Tensor([2])
    else:
        raise ValueError()

    x = x.to(dev_type)
    m = Model().to(dev_type)
    m = ddp(m)
    y = m(x)
    y.backward()

    test_case.assertTrue(np_allclose_with_shape(m.w.grad.numpy(), np.array([2])))
    test_case.assertTrue(
        np_allclose_with_shape(m.used_only_in_rank0.grad.numpy(), np.array([0.5]))
    )
    test_case.assertTrue(
        np_allclose_with_shape(m.unused_in_all_ranks.grad.numpy(), np.array([0]))
    )
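# Why these gradients: ddp AllReduces each gradient and averages it over the
# two ranks, treating a parameter that was never used on a rank as having a
# zero gradient there (so the collective does not hang):
#   w.grad:                   rank 0 -> used_only_in_rank0 * x = 2 * 1 = 2,
#                             rank 1 -> x = 2;  (2 + 2) / 2 = 2
#   used_only_in_rank0.grad:  rank 0 -> w * x = 1, rank 1 -> 0;  1 / 2 = 0.5
#   unused_in_all_ranks.grad: unused everywhere, so the average is 0.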
def _test_ddp_basic(test_case, dev_type):
    class Mul(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.w = flow.nn.Parameter(flow.Tensor([1, 1]))

        def forward(self, x):
            return x * self.w

    rank = flow.env.get_rank()
    if rank == 0:
        x = flow.Tensor([1, 1])
    elif rank == 1:
        x = flow.Tensor([2, 2])
    else:
        raise ValueError()

    x = x.to(dev_type)
    m = Mul().to(dev_type)
    m = ddp(m)
    y = m(x)
    y.sum().backward()

    test_case.assertTrue(
        np_allclose_with_shape(m.w.grad.numpy(), np.array([1.5, 1.5]))
    )
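# Expected gradient check: d(y.sum())/dw = x on each rank, and ddp averages
# across ranks, so ([1, 1] + [2, 2]) / 2 = [1.5, 1.5].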
def _test_ddp_multiple_buckets(test_case, dev_type):
    class Mul(flow.nn.Module):
        def __init__(self):
            super().__init__()
            for i in range(10):
                self.register_parameter(
                    f"w{i}", flow.nn.Parameter(flow.Tensor([i % 2 + 1, i % 2 + 1]))
                )

        def forward(self, x):
            for i in range(10):
                x = x * getattr(self, f"w{i}")
            return x

    rank = flow.env.get_rank()
    if rank == 0:
        x = flow.Tensor([1, 1])
    elif rank == 1:
        x = flow.Tensor([2, 2])
    else:
        raise ValueError()

    x = x.to(dev_type)
    m = Mul().to(dev_type)
    m = ddp(m, bucket_size=3)
    y = m(x)
    y.sum().backward()

    for i in range(10):
        test_case.assertTrue(
            np_allclose_with_shape(
                getattr(m, f"w{i}").grad.numpy(),
                np.array([48, 48]) if i % 2 == 0 else np.array([24, 24]),
            )
        )
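# Expected gradients: the ten weights multiply to 2 ** 5 = 32 (five weights
# are 1, five are 2), so d(y.sum())/d(w_i) = (32 / w_i) * x per rank.
# Averaged over x = [1, 1] and x = [2, 2]:
#   even i (w_i = 1): (32 + 64) / 2 = 48;  odd i (w_i = 2): (16 + 32) / 2 = 24.
# bucket_size=3 forces the gradients into several AllReduce buckets instead
# of one, which is what this test exercises.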
def train(test_case, train_x, device, output, requires_grad):
    m = Model().to(device)
    m = ddp(m)
    loss = flow.nn.MSELoss(reduction="sum")
    optimizer = flow.optim.SGD(m.parameters(), m.lr)

    for i in range(0, m.iter_count):
        rank = flow.env.get_rank()
        x = train_x[rank].clone().to(device)
        y = output[rank].clone().to(device)
        y.requires_grad = requires_grad

        y_pred, y2 = m(x, y)
        test_case.assertEqual(y2.requires_grad, y.requires_grad)

        l = loss(y_pred, y)
        l.backward()
        optimizer.step()
        optimizer.zero_grad()
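# `Model` is defined elsewhere in the original test; a hypothetical stand-in
# with the attributes this function touches (lr, iter_count, a forward that
# echoes its second input) might look like the following sketch:
class Model(flow.nn.Module):
    def __init__(self):
        super().__init__()
        self.lr = 0.01  # hypothetical learning rate
        self.iter_count = 10  # hypothetical number of training iterations
        self.linear = flow.nn.Linear(3, 1)

    def forward(self, x, y):
        # Return the prediction together with a tensor derived from y, so the
        # caller can check that requires_grad propagates through the ddp wrapper.
        return self.linear(x), y + 1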
def _test_out_of_order_execution(test_case, dev_type):
    class Model(flow.nn.Module):
        def __init__(self):
            super().__init__()
            self.w1 = flow.nn.Parameter(flow.Tensor([1]))
            self.w2 = flow.nn.Parameter(flow.Tensor([2]))
            self.w3 = flow.nn.Parameter(flow.Tensor([3]))

        def forward(self, x):
            if flow.env.get_rank() == 0:
                x *= self.w1
                x *= self.w2
                x *= self.w3
            else:
                x *= self.w3
                x *= self.w2
                x *= self.w1
            return x

    rank = flow.env.get_rank()
    if rank == 0:
        x = flow.Tensor([1])
    elif rank == 1:
        x = flow.Tensor([2])
    else:
        raise ValueError()

    x = x.to(dev_type)
    m = Model().to(dev_type)
    m = ddp(m, bucket_size=1)
    y = m(x)
    y.backward()

    test_case.assertTrue(np_allclose_with_shape(m.w1.grad.numpy(), np.array([9])))
    test_case.assertTrue(np_allclose_with_shape(m.w2.grad.numpy(), np.array([4.5])))
    test_case.assertTrue(np_allclose_with_shape(m.w3.grad.numpy(), np.array([3])))
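# With bucket_size=1 every parameter gets its own AllReduce bucket, and the
# two ranks multiply by w1/w2/w3 in opposite orders, so gradients become
# ready in a different order on each rank. The test checks ddp still matches
# the averaged expectations, e.g. d(y)/d(w1) = x * w2 * w3 = 6x per rank,
# giving (6 * 1 + 6 * 2) / 2 = 9.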
def main():
    args = get_config()
    if args.with_cuda:
        device = flow.device("cuda")
    else:
        device = flow.device("cpu")

    print("Creating Dataloader")
    train_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="train",
        dataset_size=args.train_dataset_size,
        batch_size=args.train_batch_size,
        data_part_num=args.train_data_part,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=False,
    )

    test_data_loader = OfRecordDataLoader(
        ofrecord_dir=args.ofrecord_path,
        mode="test",
        dataset_size=1024,
        batch_size=args.val_batch_size,
        data_part_num=4,
        seq_length=args.seq_length,
        max_predictions_per_seq=args.max_predictions_per_seq,
        consistent=False,
    )

    print("Building BERT Model")
    hidden_size = 64 * args.num_attention_heads
    intermediate_size = 4 * hidden_size
    bert_model = BertForPreTraining(
        args.vocab_size,
        args.seq_length,
        hidden_size,
        args.num_hidden_layers,
        args.num_attention_heads,
        intermediate_size,
        nn.GELU(),
        args.hidden_dropout_prob,
        args.attention_probs_dropout_prob,
        args.max_position_embeddings,
        args.type_vocab_size,
    )

    # Load the same initial parameters with lazy model.
    # from utils.compare_lazy_outputs import load_params_from_lazy
    # load_params_from_lazy(
    #     bert_model.state_dict(),
    #     "../../OneFlow-Benchmark/LanguageModeling/BERT/initial_model",
    # )

    bert_model = bert_model.to(device)
    if args.use_ddp:
        bert_model = ddp(bert_model)

    optimizer = build_optimizer(
        args.optim_name,
        bert_model,
        args.lr,
        args.weight_decay,
        weight_decay_excludes=["bias", "LayerNorm", "layer_norm"],
        clip_grad_max_norm=1,
        clip_grad_norm_type=2.0,
    )

    steps = args.epochs * len(train_data_loader)
    warmup_steps = int(steps * args.warmup_proportion)

    lr_scheduler = PolynomialLR(optimizer, steps=steps, end_learning_rate=0.0)
    lr_scheduler = flow.optim.lr_scheduler.WarmUpLR(
        lr_scheduler,
        warmup_factor=0,
        warmup_iters=warmup_steps,
        warmup_method="linear",
    )

    ns_criterion = nn.CrossEntropyLoss(reduction="mean")
    mlm_criterion = nn.CrossEntropyLoss(reduction="none")

    def get_masked_lm_loss(
        logit_blob,
        masked_lm_positions,
        masked_lm_labels,
        label_weights,
        max_prediction_per_seq,
    ):
        # Gather valid position indices.
        logit_blob = flow.gather(
            logit_blob,
            index=masked_lm_positions.unsqueeze(2).repeat(1, 1, args.vocab_size),
            dim=1,
        )

        logit_blob = flow.reshape(logit_blob, [-1, args.vocab_size])
        label_id_blob = flow.reshape(masked_lm_labels, [-1])

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        pre_example_loss = mlm_criterion(logit_blob, label_id_blob)
        pre_example_loss = flow.reshape(pre_example_loss, [-1, max_prediction_per_seq])
        numerator = flow.sum(pre_example_loss * label_weights)
        denominator = flow.sum(label_weights) + 1e-5
        loss = numerator / denominator
        return loss

    train_total_losses = []
    for epoch in range(args.epochs):
        metric = Metric(
            desc="bert pretrain",
            print_steps=args.loss_print_every_n_iters,
            batch_size=args.train_batch_size,
            keys=["total_loss", "mlm_loss", "nsp_loss", "pred_acc"],
        )

        # Train
        bert_model.train()
        for step in range(len(train_data_loader)):
            bert_outputs = pretrain(
                train_data_loader,
                bert_model,
                ns_criterion,
                partial(
                    get_masked_lm_loss,
                    max_prediction_per_seq=args.max_predictions_per_seq,
                ),
                optimizer,
                lr_scheduler,
            )

            if flow.env.get_rank() == 0:
                metric.metric_cb(step, epoch=epoch)(bert_outputs)

            train_total_losses.append(bert_outputs["total_loss"])

        # Eval
        bert_model.eval()
        val_acc = validation(
            epoch, test_data_loader, bert_model, args.val_print_every_n_iters
        )

        save_model(bert_model, args.checkpoint_path, epoch, val_acc, False)
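# Entry-point guard (not part of the original snippet; the script is
# presumably started through oneflow.distributed.launch when args.use_ddp
# is set, and run directly otherwise):
if __name__ == "__main__":
    main()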