def test_bert(self):
    device = torch.device("cpu")

    model_tester = BertModelTest.BertModelTester(self)
    (config, input_ids, token_type_ids, input_mask, sequence_labels,
     token_labels, choice_labels) = model_tester.prepare_config_and_inputs()

    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels)

    model_desc = ModelDescription([
        model_tester.input_ids_desc, model_tester.attention_mask_desc,
        model_tester.token_type_ids_desc, model_tester.masked_lm_labels_desc,
        model_tester.next_sentence_label_desc
    ], [
        model_tester.loss_desc, model_tester.prediction_scores_desc,
        model_tester.seq_relationship_scores_desc
    ])

    from collections import namedtuple
    MyArgs = namedtuple(
        "MyArgs",
        "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len")
    args = MyArgs(local_rank=0,
                  world_size=1,
                  max_steps=100,
                  learning_rate=0.00001,
                  warmup_proportion=0.01,
                  batch_size=13,
                  seq_len=7)

    dataset_len = 100
    dataloader = create_ort_test_dataloader(model_desc.inputs_,
                                            args.batch_size, args.seq_len,
                                            dataset_len, device)
    # take the first batch from the dataloader
    for b in dataloader:
        batch = b
        break
    learning_rate = torch.tensor([1.00e+00]).to(device)
    inputs = batch + [learning_rate]

    onnx_model = self.get_onnx_model(model,
                                     model_desc,
                                     inputs,
                                     device,
                                     _extra_postprocess=postprocess_model)

    self._bert_helper(onnx_model)
def testWrapModelLossFnStateDict(self):
    torch.manual_seed(1)
    device = torch.device("cuda")

    class LinearModel(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.linear = torch.nn.Linear(2, 4)

        def forward(self, y=None, x=None):
            if y is not None:
                return self.linear(x) + y
            else:
                return self.linear(x) + torch.ones(2, 4)

    pt_model = LinearModel()
    data = torch.randn(2, 2)
    label = torch.tensor([0, 1], dtype=torch.int64)

    input_desc = IODescription('x', [2, 2], torch.float32)
    label_desc = IODescription('label', [2, ], torch.int64, num_classes=4)
    output_desc = IODescription('output', [2, 4], torch.float32)
    loss_desc = IODescription('loss', [], torch.float32)
    model_desc = ModelDescription([input_desc, label_desc],
                                  [loss_desc, output_desc])

    def loss_fn(x, label):
        return F.nll_loss(F.log_softmax(x, dim=1), label)

    def get_lr_this_step(global_step):
        learning_rate = 0.02
        return torch.tensor([learning_rate])

    ort_trainer = ORTTrainer(
        pt_model, loss_fn, model_desc, "SGDOptimizer", None,
        IODescription('Learning_Rate', [1, ], torch.float32), device,
        get_lr_this_step=get_lr_this_step)
    ort_trainer.train_step(x=data, label=label)
    state_dict = ort_trainer.state_dict()
    assert state_dict.keys() == {'linear.bias', 'linear.weight'}
def test_layer_norm(self):
    class LayerNormNet(nn.Module):
        def __init__(self, target):
            super(LayerNormNet, self).__init__()
            self.ln_1 = nn.LayerNorm(10)
            self.loss = nn.CrossEntropyLoss()
            self.target = target

        def forward(self, x):
            output1 = self.ln_1(x)
            loss = self.loss(output1, self.target)
            return loss, output1

    device = torch.device("cpu")
    target = torch.ones(20, 10, 10, dtype=torch.int64).to(device)
    model = LayerNormNet(target)
    input = torch.randn(20, 5, 10, 10, dtype=torch.float32).to(device)

    input_desc = IODescription('input', [], "float32")
    output0_desc = IODescription('output0', [], "float32")
    output1_desc = IODescription('output1', [20, 5, 10, 10], "float32")
    model_desc = ModelDescription([input_desc],
                                  [output0_desc, output1_desc])

    learning_rate = torch.tensor([1.0000000e+00]).to(device)
    input_args = [input, learning_rate]

    onnx_model = self.get_onnx_model(model, model_desc, input_args, device)

    count_layer_norm = self.count_nodes(onnx_model, "LayerNormalization")
    count_nodes = self.count_all_nodes(onnx_model)

    assert count_layer_norm == 1
    assert count_nodes == 3
def bert_model_description():
    vocab_size = 30528
    input_ids_desc = IODescription('input_ids', ['batch', 'max_seq_len_in_batch'],
                                   torch.int64, num_classes=vocab_size)
    segment_ids_desc = IODescription('segment_ids', ['batch', 'max_seq_len_in_batch'],
                                     torch.int64, num_classes=2)
    input_mask_desc = IODescription('input_mask', ['batch', 'max_seq_len_in_batch'],
                                    torch.int64, num_classes=2)
    masked_lm_labels_desc = IODescription('masked_lm_labels', ['batch', 'max_seq_len_in_batch'],
                                          torch.int64, num_classes=vocab_size)
    next_sentence_labels_desc = IODescription('next_sentence_labels', ['batch', ],
                                              torch.int64, num_classes=2)
    loss_desc = IODescription('loss', [], torch.float32)

    return ModelDescription([
        input_ids_desc, segment_ids_desc, input_mask_desc,
        masked_lm_labels_desc, next_sentence_labels_desc
    ], [loss_desc])
def bart_model_description(args):
    vocab_size = 50349
    batch = 3
    max_tokens_valid = 1023
    max_tokens = 3069

    # allow variable input sizes:
    src_tokens_desc = IODescription('src_tokens', ['batch', 'max_src_tokens'],
                                    torch.int64, num_classes=vocab_size)
    src_lengths_desc = IODescription('src_lengths', ['batch'],
                                     torch.int64, num_classes=args.max_tokens_valid)
    prev_output_tokens_desc = IODescription('prev_output_tokens', ['batch', 'max_out_tokens'],
                                            torch.int64, num_classes=vocab_size)
    target_desc = IODescription('target', ['max_tgt_tokens'],
                                torch.int64, num_classes=vocab_size)

    # set concrete input sizes to permit optimization:
    # src_tokens_desc = IODescription('src_tokens', [batch, max_tokens_valid], torch.int64, num_classes=vocab_size)
    # src_lengths_desc = IODescription('src_lengths', [batch], torch.int64, num_classes=args.max_tokens_valid)
    # prev_output_tokens_desc = IODescription('prev_output_tokens', [batch, max_tokens_valid], torch.int64, num_classes=vocab_size)
    # target_desc = IODescription('target', [max_tokens], torch.int64, num_classes=vocab_size)

    loss_desc = IODescription('loss', [], torch.float32)

    # return ModelDescription([src_tokens_desc, src_lengths_desc, prev_output_tokens_desc, target_desc], [loss_desc])
    return ModelDescription(
        [src_tokens_desc, prev_output_tokens_desc, target_desc], [loss_desc])
def model_description():
    input_desc = IODescription('src', [bptt, batch_size], torch.float32)
    label_desc = IODescription('label', [bptt, batch_size, ntokens], torch.int64)
    loss_desc = IODescription('loss', [], torch.float32)
    output_desc = IODescription('output', [bptt, batch_size, ntokens], torch.float32)
    return ModelDescription([input_desc, label_desc], [loss_desc, output_desc])
def model_to_desc(self, model_name, model):
    if model_name.startswith('bert') or model_name.startswith('xlnet'):
        model_desc = ModelDescription([
            IODescription('input_ids', ['batch', 'max_seq_len_in_batch'],
                          torch.int64, num_classes=model.config.vocab_size),
            IODescription('attention_mask', ['batch', 'max_seq_len_in_batch'],
                          torch.int64, num_classes=2),
            IODescription('token_type_ids', ['batch', 'max_seq_len_in_batch'],
                          torch.int64, num_classes=2),
            IODescription('labels', ['batch', ], torch.int64, num_classes=2)
        ], [
            IODescription('loss', [], torch.float32),
            IODescription('logits', ['batch', 2], torch.float32)
        ])
    elif model_name.startswith('roberta'):
        model_desc = ModelDescription([
            IODescription('input_ids', ['batch', 'max_seq_len_in_batch'],
                          torch.int64, num_classes=model.config.vocab_size),
            IODescription('attention_mask', ['batch', 'max_seq_len_in_batch'],
                          torch.int64, num_classes=2),
            IODescription('labels', ['batch', ], torch.int64, num_classes=2)
        ], [
            IODescription('loss', [], torch.float32),
            IODescription('logits', ['batch', 2], torch.float32)
        ])
    else:
        raise RuntimeError("unsupported base model name {}.".format(model_name))

    return model_desc
def create_and_check_bert_model(self, config, input_ids, token_type_ids, input_mask,
                                sequence_labels, token_labels, choice_labels):
    model = BertModel(config=config)
    model.to(input_ids.device)
    model.eval()

    sequence_output, pooled_output = model(input_ids,
                                           attention_mask=input_mask,
                                           token_type_ids=token_type_ids)

    # fails because there is no loss output
    model_desc = ModelDescription([
        self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc
    ], [self.last_hidden_state_desc, self.pooler_output_desc])

    args_gradient_accumulation_steps = 8
    args_local_rank = 0
    args_world_size = 1
    args_fp16 = True
    args_allreduce_post_accumulation = True

    model = ORTTrainer(
        model, None, model_desc, "LambOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [1, ], torch.float32),
        device=self.device,
        postprocess_model=postprocess_model,
        gradient_accumulation_steps=args_gradient_accumulation_steps,
        world_rank=args_local_rank,
        world_size=args_world_size,
        use_mixed_precision=args_fp16,
        allreduce_post_accumulation=args_allreduce_post_accumulation)

    sequence_output, pooled_output = model(input_ids, token_type_ids=token_type_ids)
    sequence_output, pooled_output = model(input_ids)

    result = {
        "sequence_output": sequence_output,
        "pooled_output": pooled_output,
    }
    self.parent.assertListEqual(
        list(result["sequence_output"].size()),
        [self.batch_size, self.seq_length, self.hidden_size])
    self.parent.assertListEqual(list(result["pooled_output"].size()),
                                [self.batch_size, self.hidden_size])
def transformer_model_description():
    input_desc = IODescription('input1', [bptt, batch_size], torch.float32)
    label_desc = IODescription('label', [bptt, batch_size, ntokens], torch.int64)
    loss_desc = IODescription('loss', [], torch.float32)
    return (ModelDescription([input_desc, label_desc], [loss_desc]),
            IODescription('Learning_Rate', [lr, ], torch.float32))
def mnist_model_description():
    input_desc = IODescription('input1', ['batch', 784], torch.float32)
    label_desc = IODescription('label', ['batch', ], torch.int64, num_classes=10)
    loss_desc = IODescription('loss', [], torch.float32)
    probability_desc = IODescription('probability', ['batch', 10], torch.float32)
    return ModelDescription([input_desc, label_desc],
                            [loss_desc, probability_desc])
def BertForPreTraining_descs(self):
    return ModelDescription(
        [
            self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc,
            self.masked_lm_labels_desc, self.next_sentence_label_desc
        ],
        # loss_desc is returned only if both masked_lm_labels_desc and next_sentence_label_desc
        # are provided; hidden_states_desc and attentions_desc shall be included according to
        # config.output_hidden_states and config.output_attentions.
        [
            self.loss_desc, self.prediction_scores_desc,
            self.seq_relationship_scores_desc,
            # hidden_states_desc, attentions_desc
        ])
def mnist_model_description():
    input_desc = IODescription("input1", ["batch", 784], torch.float32)
    label_desc = IODescription(
        "label",
        ["batch", ],
        torch.int64,
        num_classes=10,
    )
    loss_desc = IODescription("loss", [], torch.float32)
    probability_desc = IODescription("probability", ["batch", 10], torch.float32)
    return ModelDescription([input_desc, label_desc], [loss_desc, probability_desc])
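# Illustrative sketch of how a description like the one above is typically consumed:
# it is handed to ORTTrainer together with a loss function and a learning-rate
# description, mirroring the call pattern used in testWrapModelLossFnStateDict.
# `MNISTNet` and `mnist_loss` below are hypothetical placeholders, not symbols
# defined in this file.
def build_mnist_trainer(device):
    model_desc = mnist_model_description()
    lr_desc = IODescription("Learning_Rate", [1, ], torch.float32)

    def get_lr_this_step(global_step):
        # constant learning rate, purely for illustration
        return torch.tensor([0.01])

    return ORTTrainer(MNISTNet(), mnist_loss, model_desc, "SGDOptimizer", None,
                      lr_desc, device, get_lr_this_step=get_lr_this_step)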
def testTrainingAndEvalDropout(self):
    # Temporarily disable this test.
    # The graph below will trigger ORT to sort the backward graph before the
    # forward graph, which gives an incorrect result.
    # TODO: re-enable when that is fixed.
    return

    class TwoDropoutNet(nn.Module):
        def __init__(self, drop_prb_1, drop_prb_2, dim_size):
            super(TwoDropoutNet, self).__init__()
            self.drop_1 = nn.Dropout(drop_prb_1)
            self.drop_2 = nn.Dropout(drop_prb_2)
            self.weight_1 = torch.nn.Parameter(torch.zeros(dim_size, dtype=torch.float32))

        def forward(self, x):
            x = x + self.weight_1
            x = self.drop_1(x)
            x = self.drop_2(x)
            output = x
            return output[0]

    dim_size = 3
    device = torch.device("cuda", 0)

    # This will drop all values, therefore expecting all 0 in the output tensor.
    model = TwoDropoutNet(0.999, 0.999, dim_size)
    input_desc = IODescription('input', [dim_size], torch.float32)
    output_desc = IODescription('output', [], torch.float32)
    model_desc = ModelDescription([input_desc], [output_desc])
    lr_desc = ort_trainer_learning_rate_description()
    model = ORTTrainer(model, None, model_desc, "LambOptimizer",
                       map_optimizer_attributes, lr_desc, device,
                       postprocess_model=process_dropout,
                       world_rank=0, world_size=1)

    input = torch.ones(dim_size, dtype=torch.float32).to(device)
    expected_training_output = [0.0]
    expected_eval_output = [1.0]
    learning_rate = torch.tensor([1.0000000e+00]).to(device)
    input_args = [input, learning_rate]
    train_output = model.train_step(*input_args)

    rtol = 1e-04
    assert_allclose(expected_training_output, train_output.item(), rtol=rtol,
                    err_msg="dropout training loss mismatch")

    eval_output = model.eval_step(input)
    assert_allclose(expected_eval_output, eval_output.item(), rtol=rtol,
                    err_msg="dropout eval loss mismatch")

    # Do another train step to make sure it is still using the original ratios.
    train_output_2 = model.train_step(*input_args)
    assert_allclose(expected_training_output, train_output_2.item(), rtol=rtol,
                    err_msg="dropout training loss 2 mismatch")
def create_and_check_bert_for_masked_lm(self, config, input_ids, token_type_ids, input_mask,
                                        sequence_labels, token_labels, choice_labels):
    model = BertForMaskedLM(config=config)
    model.eval()
    loss, prediction_scores = model(input_ids,
                                    attention_mask=input_mask,
                                    token_type_ids=token_type_ids,
                                    masked_lm_labels=token_labels)

    model_desc = ModelDescription([
        self.input_ids_desc, self.attention_mask_desc,
        self.token_type_ids_desc, self.masked_lm_labels_desc
    ], [self.loss_desc, self.prediction_scores_desc])

    args_gradient_accumulation_steps = 8
    args_local_rank = 0
    args_world_size = 1
    args_fp16 = True
    args_allreduce_post_accumulation = True

    model = ORTTrainer(
        model, None, model_desc, "LambOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [1, ], torch.float32),
        device=self.device,
        postprocess_model=postprocess_model,
        gradient_accumulation_steps=args_gradient_accumulation_steps,
        world_rank=args_local_rank,
        world_size=args_world_size,
        use_mixed_precision=args_fp16,
        allreduce_post_accumulation=args_allreduce_post_accumulation)

    model(input_ids,
          attention_mask=input_mask,
          token_type_ids=token_type_ids,
          masked_lm_labels=token_labels)
def gpt2_model_description(self, n_head, vocab_size, n_hidden, n_layer, n_ctx, batch_size):
    logger.info("****num of head is: {}".format(n_head))
    logger.info("****vocab size is: {}".format(vocab_size))
    logger.info("****num of hidden layer is: {}".format(n_hidden))
    logger.info("****num of layer is: {}".format(n_layer))
    logger.info("****seq length is: {}".format(n_ctx))

    input_ids_desc = IODescription('input_ids', [batch_size, n_ctx],
                                   torch.int64, num_classes=vocab_size)
    labels_desc = IODescription('labels', [batch_size, n_ctx],
                                torch.int64, num_classes=vocab_size)
    loss_desc = IODescription('loss', [], torch.float32)

    return ModelDescription([input_ids_desc, labels_desc], [loss_desc])
def bert_model_description(args):
    vocab_size = 30528

    # allow variable input sizes:
    # input_ids_desc = IODescription('input_ids', ['batch', 'max_seq_len_in_batch'], torch.int64, num_classes=vocab_size)
    # segment_ids_desc = IODescription('segment_ids', ['batch', 'max_seq_len_in_batch'], torch.int64, num_classes=2)
    # input_mask_desc = IODescription('input_mask', ['batch', 'max_seq_len_in_batch'], torch.int64, num_classes=2)
    # masked_lm_labels_desc = IODescription('masked_lm_labels', ['batch', 'max_seq_len_in_batch'], torch.int64, num_classes=vocab_size)
    # next_sentence_labels_desc = IODescription('next_sentence_labels', ['batch',], torch.int64, num_classes=2)

    # set concrete input sizes to permit optimization
    input_ids_desc = IODescription('input_ids', [args.train_batch_size, args.max_seq_length],
                                   torch.int64, num_classes=vocab_size)
    segment_ids_desc = IODescription('segment_ids', [args.train_batch_size, args.max_seq_length],
                                     torch.int64, num_classes=2)
    input_mask_desc = IODescription('input_mask', [args.train_batch_size, args.max_seq_length],
                                    torch.int64, num_classes=2)
    masked_lm_labels_desc = IODescription('masked_lm_labels', [args.train_batch_size, args.max_seq_length],
                                          torch.int64, num_classes=vocab_size)
    next_sentence_labels_desc = IODescription('next_sentence_labels', [args.train_batch_size, 2],
                                              torch.int64, num_classes=2)

    loss_desc = IODescription('loss', [], torch.float32)

    return ModelDescription([
        input_ids_desc, segment_ids_desc, input_mask_desc,
        masked_lm_labels_desc, next_sentence_labels_desc
    ], [loss_desc])
def bert_model_description():
    vocab_size = 30528
    input_ids_desc = IODescription(
        "input_ids",
        ["batch", "max_seq_len_in_batch"],
        torch.int64,
        num_classes=vocab_size,
    )
    segment_ids_desc = IODescription("segment_ids", ["batch", "max_seq_len_in_batch"], torch.int64, num_classes=2)
    input_mask_desc = IODescription("input_mask", ["batch", "max_seq_len_in_batch"], torch.int64, num_classes=2)
    masked_lm_labels_desc = IODescription(
        "masked_lm_labels",
        ["batch", "max_seq_len_in_batch"],
        torch.int64,
        num_classes=vocab_size,
    )
    next_sentence_labels_desc = IODescription(
        "next_sentence_labels",
        ["batch", ],
        torch.int64,
        num_classes=2,
    )
    loss_desc = IODescription("loss", [], torch.float32)

    return ModelDescription(
        [
            input_ids_desc,
            segment_ids_desc,
            input_mask_desc,
            masked_lm_labels_desc,
            next_sentence_labels_desc,
        ],
        [loss_desc],
    )
def test_expand(self):
    class ExpandNet(nn.Module):
        def __init__(self, target):
            super(ExpandNet, self).__init__()
            self.loss = nn.CrossEntropyLoss()
            self.target = target
            self.linear = torch.nn.Linear(2, 2)

        def forward(self, x, x1):
            output = x.expand_as(x1)
            output = self.linear(output)
            output = output + output
            loss = self.loss(output, self.target)
            return loss, output

    device = torch.device("cpu")
    target = torch.ones(5, 5, 2, dtype=torch.int64).to(device)
    model = ExpandNet(target).to(device)

    x = torch.randn(5, 3, 1, 2, dtype=torch.float32).to(device)
    x1 = torch.randn(5, 3, 5, 2, dtype=torch.float32).to(device)

    input0_desc = IODescription('x', [5, 3, 1, 2], "float32")
    input1_desc = IODescription('x1', [5, 3, 5, 2], "float32")
    output0_desc = IODescription('output0', [], "float32")
    output1_desc = IODescription('output1', [5, 3, 5, 2], "float32")
    model_desc = ModelDescription([input0_desc, input1_desc],
                                  [output0_desc, output1_desc])

    learning_rate = torch.tensor([1.0000000e+00]).to(device)
    input_args = [x, x1, learning_rate]

    onnx_model = self.get_onnx_model(model, model_desc, input_args, device)

    # check that the Expand output has the expected shape
    expand_nodes = self.find_nodes(onnx_model, "Expand")
    assert len(expand_nodes) == 1

    model_info = onnx_model.graph.value_info
    assert model_info[0].name == expand_nodes[0].output[0]
    assert model_info[0].type == onnx_model.graph.input[1].type
def create_and_check_bert_for_pretraining(
    self,
    config,
    input_ids,
    token_type_ids,
    input_mask,
    sequence_labels,
    token_labels,
    choice_labels,
    option_fp16,
    option_allreduce_post_accumulation,
    option_gradient_accumulation_steps,
    option_split_batch,
    option_use_internal_get_lr_this_step=[True],
    option_use_internal_loss_scaler=[True],
):
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    onnxruntime.set_seed(seed)

    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels,
    )
    model_desc = ModelDescription(
        [
            self.input_ids_desc,
            self.attention_mask_desc,
            self.token_type_ids_desc,
            self.masked_lm_labels_desc,
            self.next_sentence_label_desc,
        ],
        [self.loss_desc, self.prediction_scores_desc, self.seq_relationship_scores_desc],
    )

    from collections import namedtuple

    MyArgs = namedtuple(
        "MyArgs",
        "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len",
    )

    dataset_len = 100
    epochs = 8
    max_steps = epochs * dataset_len
    args = MyArgs(
        local_rank=0,
        world_size=1,
        max_steps=max_steps,
        learning_rate=0.00001,
        warmup_proportion=0.01,
        batch_size=13,
        seq_len=7,
    )

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler("loss_scale_input_name", True, up_scale_window=2000)

    for fp16 in option_fp16:
        for allreduce_post_accumulation in option_allreduce_post_accumulation:
            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                    for use_internal_loss_scaler in option_use_internal_loss_scaler:
                        for split_batch in option_split_batch:
                            print("gradient_accumulation_steps:", gradient_accumulation_steps)
                            print("split_batch:", split_batch)

                            seed = 42
                            random.seed(seed)
                            np.random.seed(seed)
                            torch.manual_seed(seed)
                            torch.cuda.manual_seed_all(seed)
                            onnxruntime.set_seed(seed)

                            (
                                old_api_loss_ort,
                                old_api_prediction_scores_ort,
                                old_api_seq_relationship_score_ort,
                            ) = run_test(
                                model,
                                model_desc,
                                self.device,
                                args,
                                gradient_accumulation_steps,
                                fp16,
                                allreduce_post_accumulation,
                                get_lr_this_step,
                                use_internal_get_lr_this_step,
                                loss_scaler,
                                use_internal_loss_scaler,
                                split_batch,
                                dataset_len,
                                epochs,
                                use_new_api=False,
                            )

                            random.seed(seed)
                            np.random.seed(seed)
                            torch.manual_seed(seed)
                            torch.cuda.manual_seed_all(seed)
                            onnxruntime.set_seed(seed)
                            if use_internal_get_lr_this_step and use_internal_loss_scaler:
                                (
                                    new_api_loss_ort,
                                    new_api_prediction_scores_ort,
                                    new_api_seq_relationship_score_ort,
                                ) = run_test(
                                    model,
                                    model_desc,
                                    self.device,
                                    args,
                                    gradient_accumulation_steps,
                                    fp16,
                                    allreduce_post_accumulation,
                                    get_lr_this_step,
                                    use_internal_get_lr_this_step,
                                    loss_scaler,
                                    use_internal_loss_scaler,
                                    split_batch,
                                    dataset_len,
                                    epochs,
                                    use_new_api=True,
                                )

                                assert_allclose(old_api_loss_ort, new_api_loss_ort)
                                assert_allclose(old_api_prediction_scores_ort,
                                                new_api_prediction_scores_ort)
                                assert_allclose(old_api_seq_relationship_score_ort,
                                                new_api_seq_relationship_score_ort)
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids,
                                          input_mask, sequence_labels, token_labels,
                                          choice_labels):
    seed = 42
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    onnxruntime.set_seed(seed)

    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels)
    model_desc = ModelDescription([
        self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc,
        self.masked_lm_labels_desc, self.next_sentence_label_desc
    ], [
        self.loss_desc, self.prediction_scores_desc,
        self.seq_relationship_scores_desc
    ])

    from collections import namedtuple
    MyArgs = namedtuple(
        "MyArgs",
        "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len")
    args = MyArgs(local_rank=0,
                  world_size=1,
                  max_steps=100,
                  learning_rate=0.00001,
                  warmup_proportion=0.01,
                  batch_size=13,
                  seq_len=7)

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000)

    # It would be better to test both with and without mixed precision and
    # allreduce_post_accumulation. However, a stress test of all 4 cases is not
    # stable, at least on the test machine. Therefore we only test mixed precision
    # with allreduce_post_accumulation, which is the most useful use case.
    option_fp16 = [True]
    option_allreduce_post_accumulation = [True]
    option_gradient_accumulation_steps = [1, 8]
    option_use_internal_get_lr_this_step = [True, False]
    option_use_internal_loss_scaler = [True, False]
    option_split_batch = [BatchArgsOption.ListAndDict]

    for fp16 in option_fp16:
        for allreduce_post_accumulation in option_allreduce_post_accumulation:
            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                    for use_internal_loss_scaler in option_use_internal_loss_scaler:
                        for split_batch in option_split_batch:
                            print("gradient_accumulation_steps:", gradient_accumulation_steps)
                            print("use_internal_loss_scaler:", use_internal_loss_scaler)
                            loss_ort, prediction_scores_ort, seq_relationship_score_ort = \
                                run_test(model, model_desc, self.device, args,
                                         gradient_accumulation_steps, fp16,
                                         allreduce_post_accumulation,
                                         get_lr_this_step,
                                         use_internal_get_lr_this_step,
                                         loss_scaler, use_internal_loss_scaler,
                                         split_batch)
                            print(loss_ort)
                            print(prediction_scores_ort)
                            print(seq_relationship_score_ort)
def run_glue(self, model_name, task_name, fp16):
    model_args = ModelArguments(model_name_or_path=model_name,
                                cache_dir=self.cache_dir)
    data_args = GlueDataTrainingArguments(
        task_name=task_name,
        data_dir=self.data_dir + "/" + task_name,
        max_seq_length=self.max_seq_length)
    training_args = TrainingArguments(
        output_dir=self.output_dir + "/" + task_name,
        do_train=True,
        do_eval=True,
        per_gpu_train_batch_size=self.train_batch_size,
        learning_rate=self.learning_rate,
        num_train_epochs=self.num_train_epochs,
        local_rank=self.local_rank,
        overwrite_output_dir=self.overwrite_output_dir,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        fp16=fp16,
        logging_steps=self.logging_steps)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)
    onnxruntime.set_seed(training_args.seed)

    try:
        num_labels = glue_tasks_num_labels[data_args.task_name]
        output_mode = glue_output_modes[data_args.task_name]
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForSequenceClassification.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    train_dataset = (GlueDataset(data_args, tokenizer=tokenizer)
                     if training_args.do_train else None)
    eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev")
                    if training_args.do_eval else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        if output_mode == "classification":
            preds = np.argmax(p.predictions, axis=1)
        elif output_mode == "regression":
            preds = np.squeeze(p.predictions)
        return glue_compute_metrics(data_args.task_name, preds, p.label_ids)

    model_desc = ModelDescription([
        IODescription('input_ids', ['batch', 'max_seq_len_in_batch'],
                      torch.int64, num_classes=model.config.vocab_size),
        IODescription('attention_mask', ['batch', 'max_seq_len_in_batch'],
                      torch.int64, num_classes=2),
        IODescription('token_type_ids', ['batch', 'max_seq_len_in_batch'],
                      torch.int64, num_classes=2),
        IODescription('labels', ['batch', ], torch.int64, num_classes=2)
    ], [
        IODescription('loss', [], torch.float32),
        IODescription('logits', ['batch', 2], torch.float32)
    ])

    # Initialize the ORTTrainer within ORTTransformerTrainer
    trainer = ORTTransformerTrainer(
        model=model,
        model_desc=model_desc,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        logger.info("***** Eval results {} *****".format(data_args.task_name))
        for key, value in result.items():
            logger.info("  %s = %s", key, value)

        results.update(result)

    return results
def model_to_desc(self, model_name, model):
    if model_name.startswith('bert') or model_name.startswith('xlnet'):
        new_model_desc = {
            'inputs': [
                ('input_ids', ['batch', 'max_seq_len_in_batch'],),
                ('attention_mask', ['batch', 'max_seq_len_in_batch'],),
                ('token_type_ids', ['batch', 'max_seq_len_in_batch'],),
                ('labels', ['batch', ],)
            ],
            'outputs': [('loss', [], True), ('logits', ['batch', 2])]
        }
        model_desc = ModelDescription([
            IODescription('input_ids', ['batch', 'max_seq_len_in_batch']),
            IODescription('attention_mask', ['batch', 'max_seq_len_in_batch']),
            IODescription('token_type_ids', ['batch', 'max_seq_len_in_batch']),
            IODescription('labels', ['batch', ])
        ], [
            IODescription('loss', []),
            IODescription('logits', ['batch', 2])
        ])
    elif model_name.startswith('roberta'):
        new_model_desc = {
            'inputs': [
                ('input_ids', ['batch', 'max_seq_len_in_batch'],),
                ('attention_mask', ['batch', 'max_seq_len_in_batch'],),
                ('labels', ['batch', ],)
            ],
            'outputs': [('loss', [], True), ('logits', ['batch', 2])]
        }
        model_desc = ModelDescription([
            IODescription('input_ids', ['batch', 'max_seq_len_in_batch']),
            IODescription('attention_mask', ['batch', 'max_seq_len_in_batch']),
            IODescription('labels', ['batch', ])
        ], [
            IODescription('loss', []),
            IODescription('logits', ['batch', 2])
        ])
    else:
        raise RuntimeError("unsupported base model name {}.".format(model_name))

    return model_desc, new_model_desc
def run_multiple_choice(self, model_name, task_name, fp16):
    model_args = ModelArguments(model_name_or_path=model_name,
                                cache_dir=self.cache_dir)
    data_args = DataTrainingArguments(task_name=task_name,
                                      data_dir=self.data_dir,
                                      max_seq_length=self.max_seq_length)
    training_args = TrainingArguments(
        output_dir=os.path.join(self.output_dir, task_name),
        do_train=True,
        do_eval=True,
        per_gpu_train_batch_size=self.train_batch_size,
        per_gpu_eval_batch_size=self.eval_batch_size,
        learning_rate=self.learning_rate,
        num_train_epochs=self.num_train_epochs,
        local_rank=self.local_rank,
        overwrite_output_dir=self.overwrite_output_dir,
        gradient_accumulation_steps=self.gradient_accumulation_steps,
        fp16=fp16,
        logging_steps=self.logging_steps)

    # Setup logging
    logging.basicConfig(
        format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
        datefmt="%m/%d/%Y %H:%M:%S",
        level=logging.INFO if training_args.local_rank in [-1, 0] else logging.WARN,
    )
    logger.warning(
        "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
        training_args.local_rank,
        training_args.device,
        training_args.n_gpu,
        bool(training_args.local_rank != -1),
        training_args.fp16,
    )
    logger.info("Training/evaluation parameters %s", training_args)

    set_seed(training_args.seed)
    onnxruntime.set_seed(training_args.seed)

    try:
        processor = SwagProcessor()
        label_list = processor.get_labels()
        num_labels = len(label_list)
    except KeyError:
        raise ValueError("Task not found: %s" % (data_args.task_name))

    config = AutoConfig.from_pretrained(
        model_args.config_name if model_args.config_name else model_args.model_name_or_path,
        num_labels=num_labels,
        finetuning_task=data_args.task_name,
        cache_dir=model_args.cache_dir,
    )
    tokenizer = AutoTokenizer.from_pretrained(
        model_args.tokenizer_name if model_args.tokenizer_name else model_args.model_name_or_path,
        cache_dir=model_args.cache_dir,
    )
    model = AutoModelForMultipleChoice.from_pretrained(
        model_args.model_name_or_path,
        from_tf=bool(".ckpt" in model_args.model_name_or_path),
        config=config,
        cache_dir=model_args.cache_dir,
    )

    # Get datasets
    train_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        processor=processor,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.train,
    ) if training_args.do_train else None)
    eval_dataset = (MultipleChoiceDataset(
        data_dir=data_args.data_dir,
        tokenizer=tokenizer,
        task=data_args.task_name,
        processor=processor,
        max_seq_length=data_args.max_seq_length,
        overwrite_cache=data_args.overwrite_cache,
        mode=Split.dev,
    ) if training_args.do_eval else None)

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": simple_accuracy(preds, p.label_ids)}

    if model_name.startswith('bert'):
        model_desc = ModelDescription([
            IODescription('input_ids',
                          [self.train_batch_size, num_labels, data_args.max_seq_length],
                          torch.int64, num_classes=model.config.vocab_size),
            IODescription('attention_mask',
                          [self.train_batch_size, num_labels, data_args.max_seq_length],
                          torch.int64, num_classes=2),
            IODescription('token_type_ids',
                          [self.train_batch_size, num_labels, data_args.max_seq_length],
                          torch.int64, num_classes=2),
            IODescription('labels', [self.train_batch_size, num_labels],
                          torch.int64, num_classes=num_labels)
        ], [
            IODescription('loss', [], torch.float32),
            IODescription('reshaped_logits', [self.train_batch_size, num_labels],
                          torch.float32)
        ])
    else:
        model_desc = ModelDescription([
            IODescription('input_ids',
                          ['batch', num_labels, 'max_seq_len_in_batch'],
                          torch.int64, num_classes=model.config.vocab_size),
            IODescription('attention_mask',
                          ['batch', num_labels, 'max_seq_len_in_batch'],
                          torch.int64, num_classes=2),
            IODescription('labels', ['batch', num_labels],
                          torch.int64, num_classes=num_labels)
        ], [
            IODescription('loss', [], torch.float32),
            IODescription('reshaped_logits', ['batch', num_labels], torch.float32)
        ])

    # Initialize the ORTTrainer within ORTTransformerTrainer
    trainer = ORTTransformerTrainer(
        model=model,
        model_desc=model_desc,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    # Training
    if training_args.do_train:
        trainer.train()
        trainer.save_model()

    # Evaluation
    results = {}
    if training_args.do_eval and training_args.local_rank in [-1, 0]:
        logger.info("*** Evaluate ***")

        result = trainer.evaluate()

        logger.info("***** Eval results {} *****".format(data_args.task_name))
        for key, value in result.items():
            logger.info("  %s = %s", key, value)

        results.update(result)

    return results
def create_and_check_bert_for_pretraining(self, config, input_ids, token_type_ids,
                                          input_mask, sequence_labels, token_labels,
                                          choice_labels):
    model = BertForPreTraining(config=config)
    model.eval()
    loss, prediction_scores, seq_relationship_score = model(
        input_ids,
        attention_mask=input_mask,
        token_type_ids=token_type_ids,
        masked_lm_labels=token_labels,
        next_sentence_label=sequence_labels)
    model_desc = ModelDescription([
        self.input_ids_desc, self.attention_mask_desc, self.token_type_ids_desc,
        self.masked_lm_labels_desc, self.next_sentence_label_desc
    ], [
        self.loss_desc, self.prediction_scores_desc,
        self.seq_relationship_scores_desc
    ])

    import argparse
    args_ = argparse.Namespace(fp16=True, amp_opt_level='O1')

    from collections import namedtuple
    MyArgs = namedtuple(
        "MyArgs",
        "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len")
    args = MyArgs(local_rank=0,
                  world_size=1,
                  max_steps=100,
                  learning_rate=0.00001,
                  warmup_proportion=0.01,
                  batch_size=13,
                  seq_len=7)

    from train_with_ort_trainer import get_lr

    def get_lr_this_step(global_step):
        return get_lr(args, global_step)

    loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000)

    option_gradient_accumulation_steps = [8]
    option_fp16 = [True, False]
    option_allreduce_post_accumulation = True
    option_use_internal_get_lr_this_step = False
    option_use_internal_loss_scaler = False
    # TODO: with fetches

    for gradient_accumulation_steps in option_gradient_accumulation_steps:
        for fp16 in option_fp16:
            for option_split_batch in BatchArgsOption:
                loss_ort, prediction_scores_ort, seq_relationship_score_ort = \
                    run_test(model, model_desc, self.device, args,
                             gradient_accumulation_steps, fp16,
                             option_allreduce_post_accumulation,
                             get_lr_this_step,
                             option_use_internal_get_lr_this_step,
                             loss_scaler, option_use_internal_loss_scaler,
                             option_split_batch)
                print(loss_ort)
                print(prediction_scores_ort)
                print(seq_relationship_score_ort)
def test_extra_postpass(self):
    def postpass_replace_first_add_with_sub(model):
        # this post pass replaces the first Add node with Sub in the model.
        # Previous graph
        #   (subgraph 1)        (subgraph 2)
        #        |                   |
        #        |                   |
        #        |________   ________|
        #                 | |
        #                 Add
        #                  |
        #            (subgraph 3)
        #
        # Post graph
        #   (subgraph 1)        (subgraph 2)
        #        |                   |
        #        |                   |
        #        |________   ________|
        #                 | |
        #                 Sub
        #                  |
        #            (subgraph 3)
        add_nodes = [n for n in model.graph.node if n.op_type == 'Add']
        add_nodes[0].op_type = "Sub"

    class MultiAdd(nn.Module):
        def __init__(self, target):
            super(MultiAdd, self).__init__()
            self.loss = nn.CrossEntropyLoss()
            self.target = target
            self.linear = torch.nn.Linear(2, 2, bias=False)

        def forward(self, x, x1):
            output = x + x1
            output = output + x
            output = output + x1
            output = self.linear(output)
            loss = self.loss(output, self.target)
            return loss, output

    device = torch.device("cpu")
    target = torch.ones(5, 2, dtype=torch.int64).to(device)
    model = MultiAdd(target).to(device)

    x = torch.randn(5, 5, 2, dtype=torch.float32).to(device)
    x1 = torch.randn(5, 5, 2, dtype=torch.float32).to(device)

    input0_desc = IODescription('x', [5, 5, 2], "float32")
    input1_desc = IODescription('x1', [5, 5, 2], "float32")
    output0_desc = IODescription('output0', [], "float32")
    output1_desc = IODescription('output1', [5, 5, 2], "float32")
    model_desc = ModelDescription([input0_desc, input1_desc],
                                  [output0_desc, output1_desc])

    learning_rate = torch.tensor([1.0000000e+00]).to(device)
    input_args = [x, x1, learning_rate]

    onnx_model = self.get_onnx_model(
        model, model_desc, input_args, device,
        _extra_postprocess=postpass_replace_first_add_with_sub)

    # check that the extra postpass is called, and called only once.
    add_nodes = self.find_nodes(onnx_model, "Add")
    sub_nodes = self.find_nodes(onnx_model, "Sub")
    assert len(add_nodes) == 2
    assert len(sub_nodes) == 1

    unprocessed_onnx_model = self.get_onnx_model(
        model, model_desc, input_args, device,
        _extra_postprocess=None, _enable_internal_postprocess=False)
    # check that the model is unchanged.
    add_nodes = self.find_nodes(unprocessed_onnx_model, "Add")
    sub_nodes = self.find_nodes(unprocessed_onnx_model, "Sub")
    assert len(add_nodes) == 3
    assert len(sub_nodes) == 0

    processed_onnx_model = self.get_onnx_model(
        unprocessed_onnx_model, model_desc, input_args, device,
        _extra_postprocess=postpass_replace_first_add_with_sub)
    # check that the extra postpass is called, and called only once.
    add_nodes = self.find_nodes(processed_onnx_model, "Add")
    sub_nodes = self.find_nodes(processed_onnx_model, "Sub")
    assert len(add_nodes) == 2
    assert len(sub_nodes) == 1