Example #1
0
        def create_and_check_bert_for_pretraining(self, config, input_ids,
                                                  token_type_ids, input_mask,
                                                  sequence_labels,
                                                  token_labels, choice_labels):
            """Call ``run_test`` on a ``BertForPreTraining`` model over a grid of options.

            Every random number generator used here is seeded with 42, the
            model is built from ``config`` and switched to eval mode, and one
            PyTorch forward pass is made with the supplied inputs and labels.
            ``run_test`` is then invoked once per option combination and its
            three outputs are printed; nothing is asserted about them.

            ``choice_labels`` is accepted but not used by this method.
            """
            from collections import namedtuple
            from itertools import product

            # Seed every RNG before the model is created.
            rng_seed = 42
            for seed_fn in (random.seed, np.random.seed, torch.manual_seed,
                            torch.cuda.manual_seed_all, onnxruntime.set_seed):
                seed_fn(rng_seed)

            bert = BertForPreTraining(config=config)
            bert.eval()
            # The results are not read afterwards; the unpacking still requires
            # the forward pass to yield exactly three outputs.
            _loss, _prediction_scores, _seq_relationship_score = bert(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                next_sentence_label=sequence_labels)

            input_descs = [
                self.input_ids_desc,
                self.attention_mask_desc,
                self.token_type_ids_desc,
                self.masked_lm_labels_desc,
                self.next_sentence_label_desc,
            ]
            output_descs = [
                self.loss_desc,
                self.prediction_scores_desc,
                self.seq_relationship_scores_desc,
            ]
            bert_desc = ModelDescription(input_descs, output_descs)

            MyArgs = namedtuple(
                "MyArgs",
                "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
            )
            args = MyArgs(
                local_rank=0,
                world_size=1,
                max_steps=100,
                learning_rate=0.00001,
                warmup_proportion=0.01,
                batch_size=13,
                seq_len=7,
            )

            def get_lr_this_step(global_step):
                # Learning-rate callback handed to run_test.
                return get_lr(args, global_step)

            loss_scaler = LossScaler('loss_scale_input_name',
                                     True,
                                     up_scale_window=2000)

            # Ideally mixed precision and allreduce_post_accumulation would each
            # be tested both on and off. However, stress-testing all four cases
            # is not stable, at least on the test machine, so only the most
            # useful case (both enabled) is exercised.
            #
            # product() varies its last argument fastest, which matches the
            # order of the equivalent nested loops.
            option_grid = product(
                [True],                         # fp16
                [True],                         # allreduce_post_accumulation
                [1, 8],                         # gradient_accumulation_steps
                [True, False],                  # use_internal_get_lr_this_step
                [True, False],                  # use_internal_loss_scaler
                [BatchArgsOption.ListAndDict],  # split_batch
            )

            for (fp16, allreduce_post_accumulation, grad_accum_steps,
                 internal_lr, internal_scaler, split_batch) in option_grid:
                print("gradient_accumulation_steps:", grad_accum_steps)
                print("use_internal_loss_scaler:", internal_scaler)

                ort_outputs = run_test(
                    bert, bert_desc, self.device, args, grad_accum_steps,
                    fp16, allreduce_post_accumulation, get_lr_this_step,
                    internal_lr, loss_scaler, internal_scaler, split_batch)
                loss_ort, prediction_scores_ort, seq_relationship_score_ort = ort_outputs

                for ort_value in (loss_ort, prediction_scores_ort,
                                  seq_relationship_score_ort):
                    print(ort_value)
Example #2
0
        def create_and_check_bert_for_pretraining(
            self,
            config,
            input_ids,
            token_type_ids,
            input_mask,
            sequence_labels,
            token_labels,
            choice_labels,
            option_fp16,
            option_allreduce_post_accumulation,
            option_gradient_accumulation_steps,
            option_split_batch,
            option_use_internal_get_lr_this_step=(True,),
            option_use_internal_loss_scaler=(True,),
        ):
            """Compare ``run_test`` results between the old and the new API.

            A ``BertForPreTraining`` model is built from ``config``, put in eval
            mode and run once through PyTorch with the supplied inputs and
            labels. Then, for every combination of the ``option_*`` iterables,
            ``run_test`` is called with ``use_new_api=False``. When both the
            internal learning-rate schedule and the internal loss scaler are
            selected, it is called again with ``use_new_api=True`` and the
            three outputs of both runs are compared with ``assert_allclose``.

            Args:
                config: Configuration passed to ``BertForPreTraining``.
                input_ids, token_type_ids, input_mask: Inputs of the PyTorch
                    forward pass.
                sequence_labels: Passed as ``next_sentence_label``.
                token_labels: Passed as ``masked_lm_labels``.
                choice_labels: Accepted but not used by this method.
                option_fp16: Iterable of ``fp16`` values to try.
                option_allreduce_post_accumulation: Iterable of
                    ``allreduce_post_accumulation`` values to try.
                option_gradient_accumulation_steps: Iterable of gradient
                    accumulation step counts to try.
                option_split_batch: Iterable of ``split_batch`` values to try.
                option_use_internal_get_lr_this_step: Iterable of booleans
                    forwarded to ``run_test``; defaults to ``(True,)``.
                option_use_internal_loss_scaler: Iterable of booleans forwarded
                    to ``run_test``; defaults to ``(True,)``.

            Raises:
                AssertionError: Raised by ``assert_allclose`` when an old-API
                    output and its new-API counterpart differ.
            """
            from collections import namedtuple
            from itertools import product

            seed = 42

            def reseed():
                # Put every RNG back in the same state so that successive runs
                # start from identical conditions.
                random.seed(seed)
                np.random.seed(seed)
                torch.manual_seed(seed)
                torch.cuda.manual_seed_all(seed)
                onnxruntime.set_seed(seed)

            reseed()

            model = BertForPreTraining(config=config)
            model.eval()
            # The results are not read afterwards; the unpacking still requires
            # the forward pass to yield exactly three outputs.
            _loss, _prediction_scores, _seq_relationship_score = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids,
                masked_lm_labels=token_labels,
                next_sentence_label=sequence_labels,
            )
            model_desc = ModelDescription(
                [
                    self.input_ids_desc,
                    self.attention_mask_desc,
                    self.token_type_ids_desc,
                    self.masked_lm_labels_desc,
                    self.next_sentence_label_desc,
                ],
                [
                    self.loss_desc,
                    self.prediction_scores_desc,
                    self.seq_relationship_scores_desc,
                ],
            )

            MyArgs = namedtuple(
                "MyArgs",
                "local_rank world_size max_steps learning_rate warmup_proportion batch_size seq_len"
            )

            dataset_len = 100
            epochs = 8
            max_steps = epochs * dataset_len
            args = MyArgs(
                local_rank=0,
                world_size=1,
                max_steps=max_steps,
                learning_rate=0.00001,
                warmup_proportion=0.01,
                batch_size=13,
                seq_len=7,
            )

            def get_lr_this_step(global_step):
                return get_lr(args, global_step)

            # NOTE(review): this single LossScaler instance is handed to every
            # run_test call below. If it keeps state between calls, the old-
            # and new-API runs do not start from the same scale; confirm.
            loss_scaler = LossScaler("loss_scale_input_name",
                                     True,
                                     up_scale_window=2000)

            # product() varies its last argument fastest, which matches the
            # order of the equivalent nested loops.
            option_grid = product(
                option_fp16,
                option_allreduce_post_accumulation,
                option_gradient_accumulation_steps,
                option_use_internal_get_lr_this_step,
                option_use_internal_loss_scaler,
                option_split_batch,
            )

            for (fp16, allreduce_post_accumulation,
                 gradient_accumulation_steps, use_internal_get_lr_this_step,
                 use_internal_loss_scaler, split_batch) in option_grid:
                print("gradient_accumulation_steps:",
                      gradient_accumulation_steps)
                print("split_batch:", split_batch)

                # Positional arguments shared by the old- and new-API runs.
                run_args = (
                    model,
                    model_desc,
                    self.device,
                    args,
                    gradient_accumulation_steps,
                    fp16,
                    allreduce_post_accumulation,
                    get_lr_this_step,
                    use_internal_get_lr_this_step,
                    loss_scaler,
                    use_internal_loss_scaler,
                    split_batch,
                    dataset_len,
                    epochs,
                )

                reseed()
                old_api_outputs = run_test(*run_args, use_new_api=False)
                (
                    old_api_loss_ort,
                    old_api_prediction_scores_ort,
                    old_api_seq_relationship_score_ort,
                ) = old_api_outputs

                # Reseeding is kept unconditional so the global RNG state left
                # behind does not depend on the last option combination.
                reseed()
                # The new-API run is only made when both internal helpers are
                # selected.
                if not (use_internal_get_lr_this_step
                        and use_internal_loss_scaler):
                    continue

                new_api_outputs = run_test(*run_args, use_new_api=True)
                (
                    new_api_loss_ort,
                    new_api_prediction_scores_ort,
                    new_api_seq_relationship_score_ort,
                ) = new_api_outputs

                assert_allclose(old_api_loss_ort, new_api_loss_ort)
                assert_allclose(old_api_prediction_scores_ort,
                                new_api_prediction_scores_ort)
                assert_allclose(old_api_seq_relationship_score_ort,
                                new_api_seq_relationship_score_ort)