def create_and_check_bert_for_pretraining(
    self, config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels
):
    # Fix the random seeds so the PyTorch and ONNX Runtime runs are reproducible.
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    onnxruntime.set_seed(seed)

    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels,
    )
    model_desc = ModelDescription(
        [
            self.input_ids_desc,
            self.attention_mask_desc,
            self.token_type_ids_desc,
            self.masked_lm_labels_desc,
            self.next_sentence_label_desc,
        ],
        [self.loss_desc, self.prediction_scores_desc, self.seq_relationship_scores_desc],
    )

    from collections import namedtuple

    MyArgs = namedtuple(
        "MyArgs",
        "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len",
    )
    args = MyArgs(
        local_rank=0,
        world_size=1,
        max_steps=100,
        learning_rate=0.00001,
        warmup_proportion=0.01,
        batch_size=13,
        seq_len=7,
    )

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler("loss_scale_input_name", True, up_scale_window=2000)

    # Ideally we would test both with and without mixed precision and allreduce_post_accumulation.
    # However, stress-testing all 4 combinations is not stable, at least on the test machine.
    # Here we only test mixed precision with allreduce_post_accumulation, since that is the most useful configuration.
    option_fp16 = [True]
    option_allreduce_post_accumulation = [True]
    option_gradient_accumulation_steps = [1, 8]
    option_use_internal_get_lr_this_step = [True, False]
    option_use_internal_loss_scaler = [True, False]
    option_split_batch = [BatchArgsOption.ListAndDict]

    for fp16 in option_fp16:
        for allreduce_post_accumulation in option_allreduce_post_accumulation:
            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                    for use_internal_loss_scaler in option_use_internal_loss_scaler:
                        for split_batch in option_split_batch:
                            print("gradient_accumulation_steps:", gradient_accumulation_steps)
                            print("use_internal_loss_scaler:", use_internal_loss_scaler)

                            loss_ort, prediction_scores_ort, seq_relationship_score_ort = run_test(
                                model,
                                model_desc,
                                self.device,
                                args,
                                gradient_accumulation_steps,
                                fp16,
                                allreduce_post_accumulation,
                                get_lr_this_step,
                                use_internal_get_lr_this_step,
                                loss_scaler,
                                use_internal_loss_scaler,
                                split_batch,
                            )

                            print(loss_ort)
                            print(prediction_scores_ort)
                            print(seq_relationship_score_ort)
def create_and_check_bert_for_pretraining(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    option_fp16,
    option_allreduce_post_accumulation,
    option_gradient_accumulation_steps,
    option_split_batch,
    option_use_internal_get_lr_this_step=[True],
    option_use_internal_loss_scaler=[True],
):
    # Fix the random seeds so the PyTorch and ONNX Runtime runs are reproducible.
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    onnxruntime.set_seed(seed)

    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels,
    )
    model_desc = ModelDescription(
        [
            self.input_ids_desc,
            self.attention_mask_desc,
            self.token_type_ids_desc,
            self.masked_lm_labels_desc,
            self.next_sentence_label_desc,
        ],
        [self.loss_desc, self.prediction_scores_desc, self.seq_relationship_scores_desc],
    )

    from collections import namedtuple

    MyArgs = namedtuple(
        "MyArgs",
        "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len",
    )
    dataset_len = 100
    epochs = 8
    max_steps = epochs * dataset_len
    args = MyArgs(
        local_rank=0,
        world_size=1,
        max_steps=max_steps,
        learning_rate=0.00001,
        warmup_proportion=0.01,
        batch_size=13,
        seq_len=7,
    )

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler("loss_scale_input_name", True, up_scale_window=2000)

    for fp16 in option_fp16:
        for allreduce_post_accumulation in option_allreduce_post_accumulation:
            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                    for use_internal_loss_scaler in option_use_internal_loss_scaler:
                        for split_batch in option_split_batch:
                            print("gradient_accumulation_steps:", gradient_accumulation_steps)
                            print("split_batch:", split_batch)

                            # Re-seed before each run so the old-API and new-API paths
                            # see identical randomness and their outputs are comparable.
                            seed = 42
                            random.seed(seed)
                            np.random.seed(seed)
                            torch.manual_seed(seed)
                            torch.cuda.manual_seed_all(seed)
                            onnxruntime.set_seed(seed)

                            (
                                old_api_loss_ort,
                                old_api_prediction_scores_ort,
                                old_api_seq_relationship_score_ort,
                            ) = run_test(
                                model,
                                model_desc,
                                self.device,
                                args,
                                gradient_accumulation_steps,
                                fp16,
                                allreduce_post_accumulation,
                                get_lr_this_step,
                                use_internal_get_lr_this_step,
                                loss_scaler,
                                use_internal_loss_scaler,
                                split_batch,
                                dataset_len,
                                epochs,
                                use_new_api=False,
                            )

                            random.seed(seed)
                            np.random.seed(seed)
                            torch.manual_seed(seed)
                            torch.cuda.manual_seed_all(seed)
                            onnxruntime.set_seed(seed)
                            # Only compare against the new API when the internal LR scheduler
                            # and internal loss scaler are used, since that is the covered path.
                            if use_internal_get_lr_this_step and use_internal_loss_scaler:
                                (
                                    new_api_loss_ort,
                                    new_api_prediction_scores_ort,
                                    new_api_seq_relationship_score_ort,
                                ) = run_test(
                                    model,
                                    model_desc,
                                    self.device,
                                    args,
                                    gradient_accumulation_steps,
                                    fp16,
                                    allreduce_post_accumulation,
                                    get_lr_this_step,
                                    use_internal_get_lr_this_step,
                                    loss_scaler,
                                    use_internal_loss_scaler,
                                    split_batch,
                                    dataset_len,
                                    epochs,
                                    use_new_api=True,
                                )

                                assert_allclose(old_api_loss_ort, new_api_loss_ort)
                                assert_allclose(old_api_prediction_scores_ort, new_api_prediction_scores_ort)
                                assert_allclose(old_api_seq_relationship_score_ort, new_api_seq_relationship_score_ort)
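# A minimal usage sketch for the parameterized variant above. This is an assumption, not
# code from this section: the test-case name, the model_tester attribute, and the
# prepare_config_and_inputs helper are hypothetical stand-ins for however the surrounding
# test class wires up its fixtures. It only illustrates that the caller now chooses the
# option lists (here restricted to mixed precision with allreduce_post_accumulation).
def test_for_pretraining_mixed_precision(self):
    # hypothetical helper returning (config, input_ids, token_type_ids, input_mask,
    # sequence_labels, token_labels, choice_labels)
    config_and_inputs = self.model_tester.prepare_config_and_inputs()
    self.model_tester.create_and_check_bert_for_pretraining(
        *config_and_inputs,
        option_fp16=[True],
        option_allreduce_post_accumulation=[True],
        option_gradient_accumulation_steps=[1],
        option_split_batch=[BatchArgsOption.ListAndDict],
    )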