Code example #1
    def test_bert(self):
        device = torch.device("cpu")

        model_tester = BertModelTest.BertModelTester(self)
        config, input_ids, token_type_ids, input_mask, sequence_labels, token_labels, choice_labels = model_tester.prepare_config_and_inputs(
        )

        model = BertForPreTraining(config=config)
        model.eval()

        loss, prediction_scores, seq_relationship_score = model(
            input_ids,
            attention_mask=input_mask,
            token_type_ids=token_type_ids,
            masked_lm_labels=token_labels,
            next_sentence_label=sequence_labels)

        model_desc = ModelDescription([
            model_tester.input_ids_desc, model_tester.attention_mask_desc,
            model_tester.token_type_ids_desc,
            model_tester.masked_lm_labels_desc,
            model_tester.next_sentence_label_desc
        ], [
            model_tester.loss_desc, model_tester.prediction_scores_desc,
            model_tester.seq_relationship_scores_desc
        ])

        from collections import namedtuple
        MyArgs = namedtuple(
            "MyArgs",
            "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
        )
        args = MyArgs(local_rank=0,
                      world_size=1,
                      max_steps=100,
                      learning_rate=0.00001,
                      warmup_proportion=0.01,
                      batch_size=13,
                      seq_len=7)

        dataset_len = 100
        dataloader = create_ort_test_dataloader(model_desc.inputs_,
                                                args.batch_size, args.seq_len,
                                                dataset_len, device)
        learning_rate = torch.tensor(1.0e+0, dtype=torch.float32).to(device)
        for b in dataloader:
            batch = b
            break
        learning_rate = torch.tensor([1.00e+00]).to(device)
        inputs = batch + [
            learning_rate,
        ]

        onnx_model = self.get_onnx_model(model,
                                         model_desc,
                                         inputs,
                                         device,
                                         _extra_postprocess=postprocess_model)

        self._bert_helper(onnx_model)
Code example #2
    def testWrapModelLossFnStateDict(self):
        torch.manual_seed(1)
        device = torch.device("cuda")
        class LinearModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = torch.nn.Linear(2, 4)
            def forward(self, y=None, x=None):
                if y is not None:
                    return self.linear(x) + y
                else:
                    return self.linear(x) + torch.ones(2, 4)

        pt_model = LinearModel()
        data = torch.randn(2, 2)
        label = torch.tensor([0, 1], dtype=torch.int64)
        input_desc = IODescription('x', [2, 2], torch.float32)
        label_desc = IODescription('label', [2, ], torch.int64, num_classes=4)
        output_desc = IODescription('output', [2, 4], torch.float32)
        loss_desc = IODescription('loss', [], torch.float32)
        model_desc = ModelDescription([input_desc, label_desc], [loss_desc, output_desc])
        def loss_fn(x, label):
            return F.nll_loss(F.log_softmax(x, dim=1), label)

        def get_lr_this_step(global_step):
            learningRate = 0.02
            return torch.tensor([learningRate])

        ort_trainer = ORTTrainer(
            pt_model, loss_fn, model_desc, "SGDOptimizer", None,
            IODescription('Learning_Rate', [1, ], torch.float32), device,
            get_lr_this_step=get_lr_this_step)
        ort_trainer.train_step(x=data, label=label)
        state_dict = ort_trainer.state_dict()
        assert state_dict.keys() == {'linear.bias', 'linear.weight'}
Code example #3
    def test_layer_norm(self):
        class LayerNormNet(nn.Module):
            def __init__(self, target):
                super(LayerNormNet, self).__init__()
                self.ln_1 = nn.LayerNorm(10)
                self.loss = nn.CrossEntropyLoss()
                self.target = target

            def forward(self, x):
                output1 = self.ln_1(x)
                loss = self.loss(output1, self.target)
                return loss, output1

        device = torch.device("cpu")
        target = torch.ones(20, 10, 10, dtype=torch.int64).to(device)
        model = LayerNormNet(target)
        input = torch.randn(20, 5, 10, 10, dtype=torch.float32).to(device)

        input_desc = IODescription('input', [], "float32")
        output0_desc = IODescription('output0', [], "float32")
        output1_desc = IODescription('output1', [20, 5, 10, 10], "float32")
        model_desc = ModelDescription([input_desc], [output0_desc, output1_desc])

        learning_rate = torch.tensor([1.0000000e+00]).to(device)
        input_args=[input, learning_rate]

        onnx_model = self.get_onnx_model(model, model_desc, input_args, device)

        count_layer_norm = self.count_nodes(onnx_model, "LayerNormalization")
        count_nodes = self.count_all_nodes(onnx_model)

        assert count_layer_norm == 1
        assert count_nodes == 3
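A note on the helpers used above: count_nodes and count_all_nodes belong to the test harness and are not shown in this listing. A minimal sketch of equivalent helpers, assuming onnx_model is an onnx.ModelProto:

    def count_nodes(onnx_model, op_type):
        # number of nodes of the given op type in the exported training graph
        return len([n for n in onnx_model.graph.node if n.op_type == op_type])

    def count_all_nodes(onnx_model):
        # total number of nodes in the graph
        return len(onnx_model.graph.node)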
Code example #4
def bert_model_description():
    vocab_size = 30528
    input_ids_desc = IODescription('input_ids',
                                   ['batch', 'max_seq_len_in_batch'],
                                   torch.int64,
                                   num_classes=vocab_size)
    segment_ids_desc = IODescription('segment_ids',
                                     ['batch', 'max_seq_len_in_batch'],
                                     torch.int64,
                                     num_classes=2)
    input_mask_desc = IODescription('input_mask',
                                    ['batch', 'max_seq_len_in_batch'],
                                    torch.int64,
                                    num_classes=2)
    masked_lm_labels_desc = IODescription('masked_lm_labels',
                                          ['batch', 'max_seq_len_in_batch'],
                                          torch.int64,
                                          num_classes=vocab_size)
    next_sentence_labels_desc = IODescription('next_sentence_labels', [
        'batch',
    ],
                                              torch.int64,
                                              num_classes=2)
    loss_desc = IODescription('loss', [], torch.float32)

    return ModelDescription([
        input_ids_desc, segment_ids_desc, input_mask_desc,
        masked_lm_labels_desc, next_sentence_labels_desc
    ], [loss_desc])
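A hypothetical usage sketch, not part of the original file: a description like the one returned above is the kind of object handed to the legacy ORTTrainer, mirroring the constructor calls in code examples #8 and #14 of this listing; model, map_optimizer_attributes and device are assumed to be defined as in those examples.

    model_desc = bert_model_description()
    trainer = ORTTrainer(
        model,                     # e.g. a BertForPreTraining instance (assumed)
        None,                      # loss is computed inside the model itself
        model_desc,
        "LambOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [1, ], torch.float32),
        device=device,
        world_rank=0,
        world_size=1)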
Code example #5
def bart_model_description(args):
    vocab_size = 50349
    batch = 3
    max_tokens_valid = 1023
    max_tokens = 3069
    #'''
    # allow variable input sizes:
    src_tokens_desc = IODescription('src_tokens', ['batch', 'max_src_tokens'],
                                    torch.int64,
                                    num_classes=vocab_size)
    src_lengths_desc = IODescription('src_lengths', ['batch'],
                                     torch.int64,
                                     num_classes=args.max_tokens_valid)
    prev_output_tokens_desc = IODescription('prev_output_tokens',
                                            ['batch', 'max_out_tokens'],
                                            torch.int64,
                                            num_classes=vocab_size)
    target_desc = IODescription('target', ['max_tgt_tokens'],
                                torch.int64,
                                num_classes=vocab_size)
    #'''
    '''
    # set concrete input sizes to permit optimization
    src_tokens_desc = IODescription('src_tokens', [batch, max_tokens_valid], torch.int64, num_classes = vocab_size)
    src_lengths_desc = IODescription('src_lengths', [batch], torch.int64, num_classes = args.max_tokens_valid)
    prev_output_tokens_desc = IODescription('prev_output_tokens', [batch, max_tokens_valid], torch.int64, num_classes = vocab_size)
    target_desc = IODescription('target', [max_tokens], torch.int64, num_classes = vocab_size)
    '''
    loss_desc = IODescription('loss', [], torch.float32)
    #return ModelDescription([src_tokens_desc, src_lengths_desc, prev_output_tokens_desc, target_desc], [loss_desc])
    return ModelDescription(
        [src_tokens_desc, prev_output_tokens_desc, target_desc], [loss_desc])
Code example #6
def model_description():
    input_desc = IODescription('src', [bptt, batch_size], torch.float32)
    label_desc = IODescription('label', [bptt, batch_size, ntokens],
                               torch.int64)
    loss_desc = IODescription('loss', [], torch.float32)
    output_desc = IODescription('output', [bptt, batch_size, ntokens],
                                torch.float32)
    return ModelDescription([input_desc, label_desc], [loss_desc, output_desc])
Code example #7
    def model_to_desc(self, model_name, model):
        if model_name.startswith('bert') or model_name.startswith('xlnet'):
            model_desc = ModelDescription([
                IODescription('input_ids', ['batch', 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=model.config.vocab_size),
                IODescription('attention_mask',
                              ['batch', 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=2),
                IODescription('token_type_ids',
                              ['batch', 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=2),
                IODescription(
                    'labels', [
                        'batch',
                    ], torch.int64, num_classes=2)
            ], [
                IODescription('loss', [], torch.float32),
                IODescription('logits', ['batch', 2], torch.float32)
            ])
        elif model_name.startswith('roberta'):
            model_desc = ModelDescription([
                IODescription('input_ids', ['batch', 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=model.config.vocab_size),
                IODescription('attention_mask',
                              ['batch', 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=2),
                IODescription(
                    'labels', [
                        'batch',
                    ], torch.int64, num_classes=2)
            ], [
                IODescription('loss', [], torch.float32),
                IODescription('logits', ['batch', 2], torch.float32)
            ])
        else:
            raise RuntimeError(
                "unsupported base model name {}.".format(model_name))

        return model_desc
Code example #8
        def create_and_check_bert_model(self, config, input_ids,
                                        token_type_ids, input_mask,
                                        sequence_labels, token_labels,
                                        choice_labels):
            model = BertModel(config=config)
            model.to(input_ids.device)
            model.eval()

            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids)

            # this fails because there is no loss output
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc
            ], [self.last_hidden_state_desc, self.pooler_output_desc])
            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model,
                None,
                model_desc,
                "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription(
                    'Learning_Rate', [
                        1,
                    ], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=True if args_fp16 else False,
                allreduce_post_accumulation=True
                if args_allreduce_post_accumulation else False)

            sequence_output, pooled_output = model(
                input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()),
                                        [self.batch_size, self.hidden_size])
Code example #9
def transformer_model_description():
    input_desc = IODescription('input1', [bptt, batch_size], torch.float32)
    label_desc = IODescription('label', [bptt, batch_size, ntokens],
                               torch.int64)
    loss_desc = IODescription('loss', [], torch.float32)
    return ModelDescription([input_desc, label_desc],
                            [loss_desc]), IODescription(
                                'Learning_Rate', [
                                    lr,
                                ], torch.float32)
Code example #10
 def mnist_model_description():
     input_desc = IODescription('input1', ['batch', 784], torch.float32)
     label_desc = IODescription('label', [
         'batch',
     ],
                                torch.int64,
                                num_classes=10)
     loss_desc = IODescription('loss', [], torch.float32)
     probability_desc = IODescription('probability', ['batch', 10],
                                      torch.float32)
     return ModelDescription([input_desc, label_desc],
                             [loss_desc, probability_desc])
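A hypothetical training-step sketch, not in the original file: trainer is assumed to be an ORTTrainer already built from this description with a get_lr_this_step callback (as in code example #2 above), so no learning-rate tensor is passed explicitly.

    images = torch.randn(64, 784)                             # a flattened batch of 28x28 images
    labels = torch.randint(0, 10, (64,), dtype=torch.int64)
    # expected to return (loss, probability), matching the two output descriptions
    loss, probability = trainer.train_step(input1=images, label=labels)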
Code example #11
 def BertForPreTraining_descs(self):
     return ModelDescription(
         [
             self.input_ids_desc, self.attention_mask_desc,
             self.token_type_ids_desc, self.masked_lm_labels_desc,
             self.next_sentence_label_desc
         ],
         # loss_desc is returned only if both masked_lm_labels_desc and next_sentence_label_desc are provided;
         # hidden_states_desc and attentions_desc should be added according to config.output_hidden_states and config.output_attentions
         [
             self.loss_desc,
             self.prediction_scores_desc,
             self.seq_relationship_scores_desc,
             #hidden_states_desc, attentions_desc
         ])
Code example #12
File: mnist_training.py Project: xadupre/onnxruntime
def mnist_model_description():
    input_desc = IODescription("input1", ["batch", 784], torch.float32)
    label_desc = IODescription(
        "label",
        [
            "batch",
        ],
        torch.int64,
        num_classes=10,
    )
    loss_desc = IODescription("loss", [], torch.float32)
    probability_desc = IODescription("probability", ["batch", 10],
                                     torch.float32)
    return ModelDescription([input_desc, label_desc],
                            [loss_desc, probability_desc])
Code example #13
    def testTrainingAndEvalDropout(self):
        # Temporarily disable this test.
        # The graph below triggers ORT to sort the backward graph before the
        # forward graph, which gives an incorrect result.
        # TODO: re-enable when that is fixed.
        return
        class TwoDropoutNet(nn.Module):
            def __init__(self, drop_prb_1, drop_prb_2, dim_size):
                super(TwoDropoutNet, self).__init__()
                self.drop_1 = nn.Dropout(drop_prb_1)
                self.drop_2 = nn.Dropout(drop_prb_2)
                self.weight_1 = torch.nn.Parameter(torch.zeros(dim_size, dtype=torch.float32))
            def forward(self, x):
                x = x + self.weight_1
                x = self.drop_1(x)
                x = self.drop_2(x)
                output = x
                return output[0]
        dim_size = 3
        device = torch.device("cuda", 0)
        # This will drop all values, so the output tensor is expected to be all zeros
        model = TwoDropoutNet(0.999, 0.999, dim_size)
        input_desc = IODescription('input', [dim_size], torch.float32)
        output_desc = IODescription('output', [], torch.float32)
        model_desc = ModelDescription([input_desc], [output_desc])
        lr_desc = ort_trainer_learning_rate_description()
        model = ORTTrainer(model, None, model_desc, "LambOptimizer",
                        map_optimizer_attributes,
                        lr_desc,
                        device,
                        postprocess_model=process_dropout,
                        world_rank=0, world_size=1)
        input = torch.ones(dim_size, dtype=torch.float32).to(device)
        expected_training_output = [0.0]
        expected_eval_output = [1.0]
        learning_rate = torch.tensor([1.0000000e+00]).to(device)
        input_args=[input, learning_rate]
        train_output = model.train_step(*input_args)

        rtol = 1e-04
        assert_allclose(expected_training_output, train_output.item(), rtol=rtol, err_msg="dropout training loss mismatch")

        eval_output = model.eval_step(input)
        assert_allclose(expected_eval_output, eval_output.item(), rtol=rtol, err_msg="dropout eval loss mismatch")
 
        # Do another train step to make sure it's using original ratios
        train_output_2 = model.train_step(*input_args)
        assert_allclose(expected_training_output, train_output_2.item(), rtol=rtol, err_msg="dropout training loss 2 mismatch")
Code example #14
        def create_and_check_bert_for_masked_lm(self, config, input_ids,
                                                token_type_ids, input_mask,
                                                sequence_labels, token_labels,
                                                choice_labels):
            model = BertForMaskedLM(config=config)
            model.eval()
            loss, prediction_scores = model(input_ids,
                                            attention_mask=input_mask,
                                            token_type_ids=token_type_ids,
                                            masked_lm_labels=token_labels)

            #####
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc, self.masked_lm_labels_desc
            ], [self.loss_desc, self.prediction_scores_desc])
            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model,
                None,
                model_desc,
                "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription(
                    'Learning_Rate', [
                        1,
                    ], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=True if args_fp16 else False,
                allreduce_post_accumulation=True
                if args_allreduce_post_accumulation else False)
            model(input_ids,
                  attention_mask=input_mask,
                  token_type_ids=token_type_ids,
                  masked_lm_labels=token_labels)
Code example #15
    def gpt2_model_description(self, n_head, vocab_size, n_hidden, n_layer,
                               n_ctx, batch_size):

        logger.info("****num of head is: {}".format(n_head))
        logger.info("****vocab size is: {}".format(vocab_size))
        logger.info("****num of hidden layer is: {}".format(n_hidden))
        logger.info("****num of layer is: {}".format(n_layer))
        logger.info("****seq length is: {}".format(n_ctx))

        input_ids_desc = IODescription('input_ids', [batch_size, n_ctx],
                                       torch.int64,
                                       num_classes=vocab_size)
        labels_desc = IODescription('labels', [batch_size, n_ctx],
                                    torch.int64,
                                    num_classes=vocab_size)

        loss_desc = IODescription('loss', [], torch.float32)

        return ModelDescription([input_ids_desc, labels_desc], [loss_desc])
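A hypothetical call sketch, not in the original file, invoked from within the same test class and using GPT-2-small-sized values purely for illustration:

    # illustrative GPT-2-small-sized arguments; any consistent values would do
    model_desc = self.gpt2_model_description(n_head=12,
                                             vocab_size=50257,
                                             n_hidden=768,
                                             n_layer=12,
                                             n_ctx=1024,
                                             batch_size=8)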
Code example #16
File: train.py Project: ganik/DeBERTa
def bert_model_description(args):
    vocab_size = 30528

    # allow variable input sizes:
    # input_ids_desc = IODescription('input_ids', ['batch', 'max_seq_len_in_batch'], torch.int64, num_classes = vocab_size)
    # segment_ids_desc = IODescription('segment_ids', ['batch', 'max_seq_len_in_batch'], torch.int64, num_classes = 2)
    # input_mask_desc = IODescription('input_mask', ['batch', 'max_seq_len_in_batch'], torch.int64, num_classes = 2)
    # masked_lm_labels_desc = IODescription('masked_lm_labels', ['batch', 'max_seq_len_in_batch'], torch.int64, num_classes = vocab_size)
    # next_sentence_labels_desc = IODescription('next_sentence_labels', ['batch',], torch.int64, num_classes = 2)

    # set concrete input sizes to permit optimization
    input_ids_desc = IODescription('input_ids', [args.train_batch_size, args.max_seq_length], torch.int64, num_classes = vocab_size)
    segment_ids_desc = IODescription('segment_ids', [args.train_batch_size, args.max_seq_length], torch.int64, num_classes = 2)
    input_mask_desc = IODescription('input_mask', [args.train_batch_size, args.max_seq_length], torch.int64, num_classes = 2)
    masked_lm_labels_desc = IODescription('masked_lm_labels', [args.train_batch_size, args.max_seq_length], torch.int64, num_classes = vocab_size)
    next_sentence_labels_desc = IODescription('next_sentence_labels', [args.train_batch_size,2], torch.int64, num_classes = 2)

    loss_desc = IODescription('loss', [], torch.float32)
    return ModelDescription([input_ids_desc, segment_ids_desc, input_mask_desc, masked_lm_labels_desc, next_sentence_labels_desc], [loss_desc])
Code example #17
def bert_model_description():
    vocab_size = 30528
    input_ids_desc = IODescription(
        "input_ids",
        ["batch", "max_seq_len_in_batch"],
        torch.int64,
        num_classes=vocab_size,
    )
    segment_ids_desc = IODescription("segment_ids",
                                     ["batch", "max_seq_len_in_batch"],
                                     torch.int64,
                                     num_classes=2)
    input_mask_desc = IODescription("input_mask",
                                    ["batch", "max_seq_len_in_batch"],
                                    torch.int64,
                                    num_classes=2)
    masked_lm_labels_desc = IODescription(
        "masked_lm_labels",
        ["batch", "max_seq_len_in_batch"],
        torch.int64,
        num_classes=vocab_size,
    )
    next_sentence_labels_desc = IODescription(
        "next_sentence_labels",
        [
            "batch",
        ],
        torch.int64,
        num_classes=2,
    )
    loss_desc = IODescription("loss", [], torch.float32)

    return ModelDescription(
        [
            input_ids_desc,
            segment_ids_desc,
            input_mask_desc,
            masked_lm_labels_desc,
            next_sentence_labels_desc,
        ],
        [loss_desc],
    )
Code example #18
    def test_expand(self):
        class ExpandNet(nn.Module):
            def __init__(self, target):
                super(ExpandNet, self).__init__()
                self.loss = nn.CrossEntropyLoss()
                self.target = target
                self.linear = torch.nn.Linear(2, 2)

            def forward(self, x, x1):
                output = x.expand_as(x1)
                output = self.linear(output)
                output = output + output
                loss = self.loss(output, self.target)
                return loss, output

        device = torch.device("cpu")
        target = torch.ones(5, 5, 2, dtype=torch.int64).to(device)
        model = ExpandNet(target).to(device)

        x = torch.randn(5, 3, 1, 2, dtype=torch.float32).to(device)
        x1 = torch.randn(5, 3, 5, 2, dtype=torch.float32).to(device)

        input0_desc = IODescription('x', [5, 3, 1, 2], "float32")
        input1_desc = IODescription('x1', [5, 3, 5, 2], "float32")
        output0_desc = IODescription('output0', [], "float32")
        output1_desc = IODescription('output1', [5, 3, 5, 2], "float32")
        model_desc = ModelDescription([input0_desc, input1_desc],
                                      [output0_desc, output1_desc])

        learning_rate = torch.tensor([1.0000000e+00]).to(device)
        input_args = [x, x1, learning_rate]

        onnx_model = self.get_onnx_model(model, model_desc, input_args, device)

        # check that the Expand node's output has shape/type info matching the second graph input
        expand_nodes = self.find_nodes(onnx_model, "Expand")
        assert len(expand_nodes) == 1

        model_info = onnx_model.graph.value_info
        assert model_info[0].name == expand_nodes[0].output[0]
        assert model_info[0].type == onnx_model.graph.input[1].type
Code example #19
        def create_and_check_bert_for_pretraining(
            self,
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
            option_fp16,
            option_allreduce_post_accumulation,
            option_gradient_accumulation_steps,
            option_split_batch,
            option_use_internal_get_lr_this_step=[True],
            option_use_internal_loss_scaler=[True],
        ):
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            onnxruntime.set_seed(seed)

            model = BertForPreTraining(config=config)
            model.eval()
            loss, prediction_scores, seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                next_sentence_label=sequence_labels,
            )
            model_desc = ModelDescription(
                [
                    self.input_ids_desc,
                    self.attention_mask_desc,
                    self.token_type_ids_desc,
                    self.masked_lm_labels_desc,
                    self.next_sentence_label_desc,
                ],
                [
                    self.loss_desc, self.prediction_scores_desc,
                    self.seq_relationship_scores_desc
                ],
            )

            from collections import namedtuple

            MyArgs = namedtuple(
                "MyArgs",
                "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
            )

            dataset_len = 100
            epochs = 8
            max_steps = epochs * dataset_len
            args = MyArgs(
                local_rank=0,
                world_size=1,
                max_steps=max_steps,
                learning_rate=0.00001,
                warmup_proportion=0.01,
                batch_size=13,
                seq_len=7,
            )

            def get_lr_this_step(global_step):
                return get_lr(args, global_step)

            loss_scaler = LossScaler("loss_scale_input_name",
                                     True,
                                     up_scale_window=2000)

            for fp16 in option_fp16:
                for allreduce_post_accumulation in option_allreduce_post_accumulation:
                    for gradient_accumulation_steps in option_gradient_accumulation_steps:
                        for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                            for use_internal_loss_scaler in option_use_internal_loss_scaler:
                                for split_batch in option_split_batch:
                                    print("gradient_accumulation_steps:",
                                          gradient_accumulation_steps)
                                    print("split_batch:", split_batch)

                                    seed = 42
                                    random.seed(seed)
                                    np.random.seed(seed)
                                    torch.manual_seed(seed)
                                    torch.cuda.manual_seed_all(seed)
                                    onnxruntime.set_seed(seed)

                                    (
                                        old_api_loss_ort,
                                        old_api_prediction_scores_ort,
                                        old_api_seq_relationship_score_ort,
                                    ) = run_test(
                                        model,
                                        model_desc,
                                        self.device,
                                        args,
                                        gradient_accumulation_steps,
                                        fp16,
                                        allreduce_post_accumulation,
                                        get_lr_this_step,
                                        use_internal_get_lr_this_step,
                                        loss_scaler,
                                        use_internal_loss_scaler,
                                        split_batch,
                                        dataset_len,
                                        epochs,
                                        use_new_api=False,
                                    )

                                    random.seed(seed)
                                    np.random.seed(seed)
                                    torch.manual_seed(seed)
                                    torch.cuda.manual_seed_all(seed)
                                    onnxruntime.set_seed(seed)
                                    if use_internal_get_lr_this_step and use_internal_loss_scaler:
                                        (
                                            new_api_loss_ort,
                                            new_api_prediction_scores_ort,
                                            new_api_seq_relationship_score_ort,
                                        ) = run_test(
                                            model,
                                            model_desc,
                                            self.device,
                                            args,
                                            gradient_accumulation_steps,
                                            fp16,
                                            allreduce_post_accumulation,
                                            get_lr_this_step,
                                            use_internal_get_lr_this_step,
                                            loss_scaler,
                                            use_internal_loss_scaler,
                                            split_batch,
                                            dataset_len,
                                            epochs,
                                            use_new_api=True,
                                        )

                                        assert_allclose(
                                            old_api_loss_ort, new_api_loss_ort)
                                        assert_allclose(
                                            old_api_prediction_scores_ort,
                                            new_api_prediction_scores_ort)
                                        assert_allclose(
                                            old_api_seq_relationship_score_ort,
                                            new_api_seq_relationship_score_ort)
Code example #20
        def create_and_check_bert_for_pretraining(self, config, input_ids,
                                                  token_type_ids, input_mask,
                                                  sequence_labels,
                                                  token_labels, choice_labels):
            seed = 42
            random.seed(seed)
            np.random.seed(seed)
            torch.manual_seed(seed)
            torch.cuda.manual_seed_all(seed)
            onnxruntime.set_seed(seed)

            model = BertForPreTraining(config=config)
            model.eval()
            loss, prediction_scores, seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                next_sentence_label=sequence_labels)
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc, self.masked_lm_labels_desc,
                self.next_sentence_label_desc
            ], [
                self.loss_desc, self.prediction_scores_desc,
                self.seq_relationship_scores_desc
            ])

            from collections import namedtuple
            MyArgs = namedtuple(
                "MyArgs",
                "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
            )
            args = MyArgs(local_rank=0,
                          world_size=1,
                          max_steps=100,
                          learning_rate=0.00001,
                          warmup_proportion=0.01,
                          batch_size=13,
                          seq_len=7)

            def get_lr_this_step(global_step):
                return get_lr(args, global_step)

            loss_scaler = LossScaler('loss_scale_input_name',
                                     True,
                                     up_scale_window=2000)

            # It would be better to test both with and without mixed precision and allreduce_post_accumulation.
            # However, stress testing all 4 combinations is not stable, at least on the test machine.
            # Therefore we only test with mixed precision and allreduce_post_accumulation enabled, as that is the most useful case.
            option_fp16 = [True]
            option_allreduce_post_accumulation = [True]
            option_gradient_accumulation_steps = [1, 8]
            option_use_internal_get_lr_this_step = [True, False]
            option_use_internal_loss_scaler = [True, False]
            option_split_batch = [BatchArgsOption.ListAndDict]

            for fp16 in option_fp16:
                for allreduce_post_accumulation in option_allreduce_post_accumulation:
                    for gradient_accumulation_steps in option_gradient_accumulation_steps:
                        for use_internal_get_lr_this_step in option_use_internal_get_lr_this_step:
                            for use_internal_loss_scaler in option_use_internal_loss_scaler:
                                for split_batch in option_split_batch:
                                    print("gradient_accumulation_steps:",
                                          gradient_accumulation_steps)
                                    print("use_internal_loss_scaler:",
                                          use_internal_loss_scaler)
                                    loss_ort, prediction_scores_ort, seq_relationship_score_ort =\
                                        run_test(model, model_desc, self.device, args, gradient_accumulation_steps, fp16,
                                                allreduce_post_accumulation,
                                                get_lr_this_step, use_internal_get_lr_this_step,
                                                loss_scaler, use_internal_loss_scaler,
                                                split_batch)

                                    print(loss_ort)
                                    print(prediction_scores_ort)
                                    print(seq_relationship_score_ort)
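As a refactoring sketch, not part of the original test: the six nested loops above only sweep independent option lists, so the same combinations can be generated with itertools.product.

    import itertools

    # every combination of the option lists defined above, in the same order
    for (fp16, allreduce_post_accumulation, gradient_accumulation_steps,
         use_internal_get_lr_this_step, use_internal_loss_scaler,
         split_batch) in itertools.product(
             option_fp16, option_allreduce_post_accumulation,
             option_gradient_accumulation_steps,
             option_use_internal_get_lr_this_step,
             option_use_internal_loss_scaler, option_split_batch):
        loss_ort, prediction_scores_ort, seq_relationship_score_ort = run_test(
            model, model_desc, self.device, args, gradient_accumulation_steps,
            fp16, allreduce_post_accumulation, get_lr_this_step,
            use_internal_get_lr_this_step, loss_scaler,
            use_internal_loss_scaler, split_batch)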
Code example #21
    def run_glue(self, model_name, task_name, fp16):
        model_args = ModelArguments(model_name_or_path=model_name,
                                    cache_dir=self.cache_dir)
        data_args = GlueDataTrainingArguments(
            task_name=task_name,
            data_dir=self.data_dir + "/" + task_name,
            max_seq_length=self.max_seq_length)

        training_args = TrainingArguments(
            output_dir=self.output_dir + "/" + task_name,
            do_train=True,
            do_eval=True,
            per_gpu_train_batch_size=self.train_batch_size,
            learning_rate=self.learning_rate,
            num_train_epochs=self.num_train_epochs,
            local_rank=self.local_rank,
            overwrite_output_dir=self.overwrite_output_dir,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            fp16=fp16,
            logging_steps=self.logging_steps)

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO
            if training_args.local_rank in [-1, 0] else logging.WARN,
        )
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            training_args.local_rank,
            training_args.device,
            training_args.n_gpu,
            bool(training_args.local_rank != -1),
            training_args.fp16,
        )
        logger.info("Training/evaluation parameters %s", training_args)

        set_seed(training_args.seed)
        onnxruntime.set_seed(training_args.seed)

        try:
            num_labels = glue_tasks_num_labels[data_args.task_name]
            output_mode = glue_output_modes[data_args.task_name]
        except KeyError:
            raise ValueError("Task not found: %s" % (data_args.task_name))

        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        model = AutoModelForSequenceClassification.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

        train_dataset = (GlueDataset(data_args, tokenizer=tokenizer)
                         if training_args.do_train else None)

        eval_dataset = (GlueDataset(data_args, tokenizer=tokenizer, mode="dev")
                        if training_args.do_eval else None)

        def compute_metrics(p: EvalPrediction) -> Dict:
            if output_mode == "classification":
                preds = np.argmax(p.predictions, axis=1)
            elif output_mode == "regression":
                preds = np.squeeze(p.predictions)
            return glue_compute_metrics(data_args.task_name, preds,
                                        p.label_ids)

        model_desc = ModelDescription([
            IODescription('input_ids', ['batch', 'max_seq_len_in_batch'],
                          torch.int64,
                          num_classes=model.config.vocab_size),
            IODescription('attention_mask', ['batch', 'max_seq_len_in_batch'],
                          torch.int64,
                          num_classes=2),
            IODescription('token_type_ids', ['batch', 'max_seq_len_in_batch'],
                          torch.int64,
                          num_classes=2),
            IODescription('labels', [
                'batch',
            ], torch.int64, num_classes=2)
        ], [
            IODescription('loss', [], torch.float32),
            IODescription('logits', ['batch', 2], torch.float32)
        ])

        # Initialize the ORTTrainer within ORTTransformerTrainer
        trainer = ORTTransformerTrainer(
            model=model,
            model_desc=model_desc,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        # Training
        if training_args.do_train:
            trainer.train()
            trainer.save_model()

        # Evaluation
        results = {}
        if training_args.do_eval and training_args.local_rank in [-1, 0]:
            logger.info("*** Evaluate ***")

            result = trainer.evaluate()

            logger.info("***** Eval results {} *****".format(
                data_args.task_name))
            for key, value in result.items():
                logger.info("  %s = %s", key, value)

            results.update(result)

        return results
Code example #22
    def model_to_desc(self, model_name, model):
        if model_name.startswith('bert') or model_name.startswith('xlnet'):
            new_model_desc = {
                'inputs': [(
                    'input_ids',
                    ['batch', 'max_seq_len_in_batch'],
                ), (
                    'attention_mask',
                    ['batch', 'max_seq_len_in_batch'],
                ), (
                    'token_type_ids',
                    ['batch', 'max_seq_len_in_batch'],
                ), (
                    'labels',
                    [
                        'batch',
                    ],
                )],
                'outputs': [('loss', [], True), ('logits', ['batch', 2])]
            }
            model_desc = ModelDescription([
                IODescription('input_ids', ['batch', 'max_seq_len_in_batch']),
                IODescription('attention_mask',
                              ['batch', 'max_seq_len_in_batch']),
                IODescription('token_type_ids',
                              ['batch', 'max_seq_len_in_batch']),
                IODescription('labels', [
                    'batch',
                ])
            ], [
                IODescription('loss', []),
                IODescription('logits', ['batch', 2])
            ])
        elif model_name.startswith('roberta'):
            new_model_desc = {
                'inputs': [(
                    'input_ids',
                    ['batch', 'max_seq_len_in_batch'],
                ), (
                    'attention_mask',
                    ['batch', 'max_seq_len_in_batch'],
                ), (
                    'labels',
                    [
                        'batch',
                    ],
                )],
                'outputs': [('loss', [], True), ('logits', ['batch', 2])]
            }
            model_desc = ModelDescription([
                IODescription('input_ids', ['batch', 'max_seq_len_in_batch']),
                IODescription('attention_mask',
                              ['batch', 'max_seq_len_in_batch']),
                IODescription('labels', [
                    'batch',
                ])
            ], [
                IODescription('loss', []),
                IODescription('logits', ['batch', 2])
            ])
        else:
            raise RuntimeError(
                "unsupported base model name {}.".format(model_name))

        return model_desc, new_model_desc
Code example #23
    def run_multiple_choice(self, model_name, task_name, fp16):
        model_args = ModelArguments(model_name_or_path=model_name,
                                    cache_dir=self.cache_dir)
        data_args = DataTrainingArguments(task_name=task_name,
                                          data_dir=self.data_dir,
                                          max_seq_length=self.max_seq_length)

        training_args = TrainingArguments(
            output_dir=os.path.join(self.output_dir, task_name),
            do_train=True,
            do_eval=True,
            per_gpu_train_batch_size=self.train_batch_size,
            per_gpu_eval_batch_size=self.eval_batch_size,
            learning_rate=self.learning_rate,
            num_train_epochs=self.num_train_epochs,
            local_rank=self.local_rank,
            overwrite_output_dir=self.overwrite_output_dir,
            gradient_accumulation_steps=self.gradient_accumulation_steps,
            fp16=fp16,
            logging_steps=self.logging_steps)

        # Setup logging
        logging.basicConfig(
            format="%(asctime)s - %(levelname)s - %(name)s -   %(message)s",
            datefmt="%m/%d/%Y %H:%M:%S",
            level=logging.INFO
            if training_args.local_rank in [-1, 0] else logging.WARN,
        )
        logger.warning(
            "Process rank: %s, device: %s, n_gpu: %s, distributed training: %s, 16-bits training: %s",
            training_args.local_rank,
            training_args.device,
            training_args.n_gpu,
            bool(training_args.local_rank != -1),
            training_args.fp16,
        )
        logger.info("Training/evaluation parameters %s", training_args)

        set_seed(training_args.seed)
        onnxruntime.set_seed(training_args.seed)

        try:
            processor = SwagProcessor()
            label_list = processor.get_labels()
            num_labels = len(label_list)
        except KeyError:
            raise ValueError("Task not found: %s" % (data_args.task_name))

        config = AutoConfig.from_pretrained(
            model_args.config_name
            if model_args.config_name else model_args.model_name_or_path,
            num_labels=num_labels,
            finetuning_task=data_args.task_name,
            cache_dir=model_args.cache_dir,
        )
        tokenizer = AutoTokenizer.from_pretrained(
            model_args.tokenizer_name
            if model_args.tokenizer_name else model_args.model_name_or_path,
            cache_dir=model_args.cache_dir,
        )

        model = AutoModelForMultipleChoice.from_pretrained(
            model_args.model_name_or_path,
            from_tf=bool(".ckpt" in model_args.model_name_or_path),
            config=config,
            cache_dir=model_args.cache_dir,
        )

        # Get datasets
        train_dataset = (MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            processor=processor,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.train,
        ) if training_args.do_train else None)
        eval_dataset = (MultipleChoiceDataset(
            data_dir=data_args.data_dir,
            tokenizer=tokenizer,
            task=data_args.task_name,
            processor=processor,
            max_seq_length=data_args.max_seq_length,
            overwrite_cache=data_args.overwrite_cache,
            mode=Split.dev,
        ) if training_args.do_eval else None)

        def compute_metrics(p: EvalPrediction) -> Dict:
            preds = np.argmax(p.predictions, axis=1)
            return {"acc": simple_accuracy(preds, p.label_ids)}

        if model_name.startswith('bert'):
            model_desc = ModelDescription([
                IODescription('input_ids', [
                    self.train_batch_size, num_labels, data_args.max_seq_length
                ],
                              torch.int64,
                              num_classes=model.config.vocab_size),
                IODescription('attention_mask', [
                    self.train_batch_size, num_labels, data_args.max_seq_length
                ],
                              torch.int64,
                              num_classes=2),
                IODescription('token_type_ids', [
                    self.train_batch_size, num_labels, data_args.max_seq_length
                ],
                              torch.int64,
                              num_classes=2),
                IODescription('labels', [self.train_batch_size, num_labels],
                              torch.int64,
                              num_classes=num_labels)
            ], [
                IODescription('loss', [], torch.float32),
                IODescription('reshaped_logits',
                              [self.train_batch_size, num_labels],
                              torch.float32)
            ])
        else:
            model_desc = ModelDescription([
                IODescription('input_ids',
                              ['batch', num_labels, 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=model.config.vocab_size),
                IODescription('attention_mask',
                              ['batch', num_labels, 'max_seq_len_in_batch'],
                              torch.int64,
                              num_classes=2),
                IODescription('labels', ['batch', num_labels],
                              torch.int64,
                              num_classes=num_labels)
            ], [
                IODescription('loss', [], torch.float32),
                IODescription('reshaped_logits', ['batch', num_labels],
                              torch.float32)
            ])

        # Initialize the ORTTrainer within ORTTransformerTrainer
        trainer = ORTTransformerTrainer(
            model=model,
            model_desc=model_desc,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=eval_dataset,
            compute_metrics=compute_metrics,
        )

        # Training
        if training_args.do_train:
            trainer.train()
            trainer.save_model()

        # Evaluation
        results = {}
        if training_args.do_eval and training_args.local_rank in [-1, 0]:
            logger.info("*** Evaluate ***")

            result = trainer.evaluate()

            logger.info("***** Eval results {} *****".format(
                data_args.task_name))
            for key, value in result.items():
                logger.info("  %s = %s", key, value)

            results.update(result)

        return results
Code example #24
        def create_and_check_bert_for_pretraining(self, config, input_ids,
                                                  token_type_ids, input_mask,
                                                  sequence_labels,
                                                  token_labels, choice_labels):
            model = BertForPreTraining(config=config)
            model.eval()
            loss, prediction_scores, seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                next_sentence_label=sequence_labels)
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc, self.masked_lm_labels_desc,
                self.next_sentence_label_desc
            ], [
                self.loss_desc, self.prediction_scores_desc,
                self.seq_relationship_scores_desc
            ])

            import argparse
            args_ = argparse.Namespace(fp16=True, amp_opt_level='O1')

            from collections import namedtuple
            MyArgs = namedtuple(
                "MyArgs",
                "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
            )
            args = MyArgs(local_rank=0,
                          world_size=1,
                          max_steps=100,
                          learning_rate=0.00001,
                          warmup_proportion=0.01,
                          batch_size=13,
                          seq_len=7)

            from train_with_ort_trainer import get_lr

            def get_lr_this_step(global_step):
                return get_lr(args, global_step)

            loss_scaler = LossScaler('loss_scale_input_name',
                                     True,
                                     up_scale_window=2000)

            option_gradient_accumulation_steps = [8]
            option_fp16 = [True, False]
            option_allreduce_post_accumulation = True
            option_use_internal_get_lr_this_step = False
            option_use_internal_loss_scaler = False
            # TODO: with fetches

            for gradient_accumulation_steps in option_gradient_accumulation_steps:
                for fp16 in option_fp16:
                    for option_split_batch in BatchArgsOption:
                        loss_ort, prediction_scores_ort, seq_relationship_score_ort =\
                            run_test(model, model_desc, self.device, args, gradient_accumulation_steps, fp16,
                                     option_allreduce_post_accumulation,
                                     get_lr_this_step, option_use_internal_get_lr_this_step,
                                     loss_scaler, option_use_internal_loss_scaler,
                                     option_split_batch)

                        print(loss_ort)
                        print(prediction_scores_ort)
                        print(seq_relationship_score_ort)
Code example #25
    def test_extra_postpass(self):
        def postpass_replace_first_add_with_sub(model):
            # this post pass replaces the first Add node with Sub in the model.
            # Previous graph
            #   (subgraph 1)        (subgraph 2)
            #        |                   |
            #        |                   |
            #        |________   ________|
            #                 | |
            #                 Add
            #                  |
            #             (subgraph 3)
            #
            # Post graph
            #   (subgraph 1)        (subgraph 2)
            #        |                   |
            #        |                   |
            #        |________   ________|
            #                 | |
            #                 Sub
            #                  |
            #             (subgraph 3)
            add_nodes = [n for n in model.graph.node if n.op_type == 'Add']
            add_nodes[0].op_type = "Sub"

        class MultiAdd(nn.Module):
            def __init__(self, target):
                super(MultiAdd, self).__init__()
                self.loss = nn.CrossEntropyLoss()
                self.target = target
                self.linear = torch.nn.Linear(2, 2, bias=False)

            def forward(self, x, x1):
                output = x + x1
                output = output + x
                output = output + x1
                output = self.linear(output)
                loss = self.loss(output, self.target)
                return loss, output

        device = torch.device("cpu")
        target = torch.ones(5, 2, dtype=torch.int64).to(device)
        model = MultiAdd(target).to(device)

        x = torch.randn(5, 5, 2, dtype=torch.float32).to(device)
        x1 = torch.randn(5, 5, 2, dtype=torch.float32).to(device)

        input0_desc = IODescription('x', [5, 5, 2], "float32")
        input1_desc = IODescription('x1', [5, 5, 2], "float32")
        output0_desc = IODescription('output0', [], "float32")
        output1_desc = IODescription('output1', [5, 5, 2], "float32")
        model_desc = ModelDescription([input0_desc, input1_desc], [output0_desc, output1_desc])

        learning_rate = torch.tensor([1.0000000e+00]).to(device)
        input_args = [x, x1, learning_rate]

        onnx_model = self.get_onnx_model(model, model_desc, input_args, device,
                _extra_postprocess=postpass_replace_first_add_with_sub)

        # check that extra postpass is called, and called only once.
        add_nodes = self.find_nodes(onnx_model, "Add")
        sub_nodes = self.find_nodes(onnx_model, "Sub")
        assert len(add_nodes) == 2
        assert len(sub_nodes) == 1


        unprocessed_onnx_model = self.get_onnx_model(model, model_desc, input_args, device,
                _extra_postprocess=None, _enable_internal_postprocess=False)
        # check that the model is unchanged.
        add_nodes = self.find_nodes(unprocessed_onnx_model, "Add")
        sub_nodes = self.find_nodes(unprocessed_onnx_model, "Sub")
        assert len(add_nodes) == 3
        assert len(sub_nodes) == 0

        processed_onnx_model = self.get_onnx_model(unprocessed_onnx_model, model_desc, input_args, device,
                _extra_postprocess=postpass_replace_first_add_with_sub)
        # check that extra postpass is called, and called only once.
        add_nodes = self.find_nodes(processed_onnx_model, "Add")
        sub_nodes = self.find_nodes(processed_onnx_model, "Sub")
        assert len(add_nodes) == 2
        assert len(sub_nodes) == 1