def testToyBERTModelGradientAccumulationLegacyExperimental(
        gradient_accumulation_steps):
    # Common setup
    total_steps = 10
    device = "cuda"
    seed = 1

    # EXPERIMENTAL IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    optim_config = optim.LambConfig()
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'batch': {
            'gradient_accumulation_steps': gradient_accumulation_steps
        },
    })
    trainer = orttrainer.ORTTrainer(model,
                                    model_desc,
                                    optim_config,
                                    options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        loss = trainer.train_step(*sample_input)
        experimental_losses.append(loss.cpu().item())

    # LEGACY IMPLEMENTATION
    device = torch.device(device)
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(
        optim_config.lr)
    legacy_trainer = Legacy_ORTTrainer(
        model,
        None,
        legacy_model_desc,
        "LambOptimizer",
        None,
        learning_rate_description,
        device,
        _use_deterministic_compute=True,
        gradient_accumulation_steps=gradient_accumulation_steps)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses,
                                       legacy_losses,
                                       rtol=1e-6)
Code example #2
    def testWrapModelLossFnStateDict(self):
        torch.manual_seed(1)
        device = torch.device("cuda")
        class LinearModel(torch.nn.Module):
            def __init__(self):
                super().__init__()
                self.linear = torch.nn.Linear(2, 4)
            def forward(self, y=None, x=None):
                if y is not None:
                    return self.linear(x) + y
                else:
                    return self.linear(x) + torch.ones(2, 4)

        pt_model = LinearModel()
        data = torch.randn(2, 2)
        label = torch.tensor([0, 1], dtype=torch.int64)
        input_desc = IODescription('x', [2, 2], torch.float32)
        label_desc = IODescription('label', [2, ], torch.int64, num_classes=4)
        output_desc = IODescription('output', [2, 4], torch.float32)
        loss_desc = IODescription('loss', [], torch.float32)
        model_desc = ModelDescription([input_desc, label_desc], [loss_desc, output_desc])
        def loss_fn(x, label):
            return F.nll_loss(F.log_softmax(x, dim=1), label)

        def get_lr_this_step(global_step):
            learningRate = 0.02
            return torch.tensor([learningRate])

        ort_trainer = ORTTrainer(
            pt_model, loss_fn, model_desc, "SGDOptimizer", None,
            IODescription('Learning_Rate', [1, ], torch.float32), device,
            get_lr_this_step=get_lr_this_step)
        ort_trainer.train_step(x=data, label=label)
        state_dict = ort_trainer.state_dict()
        assert state_dict.keys() == {'linear.bias', 'linear.weight'}
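As a quick sanity check, here is a minimal sketch (not part of the original test) of loading
the returned state dict back into a plain PyTorch module; it assumes ORTTrainer.state_dict()
yields torch tensors keyed by the PyTorch parameter names, which the assertion above suggests.

# Hedged sketch: restore the ORT-trained weights into a fresh PyTorch module.
# Assumes the state dict values are torch tensors (not numpy arrays).
restored_model = LinearModel()
restored_model.load_state_dict(state_dict)
with torch.no_grad():
    # forward pass with the restored weights; y defaults to None
    restored_output = restored_model(x=data)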
Code example #3
    def get_onnx_model(self,
                       model,
                       model_desc,
                       inputs,
                       device,
                       _enable_internal_postprocess=True,
                       _extra_postprocess=None):
        lr_desc = IODescription('Learning_Rate', [
            1,
        ], torch.float32)
        model = ORTTrainer(
            model,
            None,
            model_desc,
            "LambOptimizer",
            map_optimizer_attributes,
            lr_desc,
            device,
            world_rank=0,
            world_size=1,
            _opset_version=12,
            _enable_internal_postprocess=_enable_internal_postprocess,
            _extra_postprocess=_extra_postprocess)

        train_output = model.train_step(*inputs)
        return model.onnx_model_
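A hedged usage sketch (file name is illustrative, not from the original test): the ModelProto
returned by get_onnx_model can be saved or inspected with the onnx package from inside the
same test class.

# Hedged sketch; `model`, `model_desc`, `inputs` and `device` come from the caller.
import onnx

exported = self.get_onnx_model(model, model_desc, inputs, device)
onnx.save(exported, "bert_training_graph.onnx")  # persist the exported training graph
print(len(exported.graph.node))                  # number of nodes in the exported graph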
Code example #4
def testToyBERTModelMixedPrecisionLossScalerLegacyExperimental(
        loss_scaler, legacy_loss_scaler):
    # Common setup
    total_steps = 128
    device = "cuda"
    seed = 1

    # EXPERIMENTAL IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    optim_config = optim.AdamConfig(lr=0.001)
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'mixed_precision': {
            'enabled': True,
            'loss_scaler': loss_scaler
        }
    })
    trainer = orttrainer.ORTTrainer(model,
                                    model_desc,
                                    optim_config,
                                    options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(
            trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(
        optim_config.lr)
    legacy_trainer = Legacy_ORTTrainer(model,
                                       None,
                                       legacy_model_desc,
                                       "AdamOptimizer",
                                       None,
                                       learning_rate_description,
                                       device,
                                       _use_deterministic_compute=True,
                                       use_mixed_precision=True,
                                       loss_scaler=legacy_loss_scaler)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses)
def testToyBERTModelLegacyExperimentalBasicTraining(optimizer_config):
    # Common setup
    train_steps = 512

    device = 'cuda'
    seed = 1
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)

    # EXPERIMENTAL API
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    opts = orttrainer.ORTTrainerOptions({
        'debug' : {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
    })
    optim_config = optimizer_config(lr=0.01)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)
    experimental_losses = []
    for i in range(train_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)

    if optimizer_config == optim.AdamConfig:
        legacy_optimizer = 'AdamOptimizer'
    elif optimizer_config == optim.LambConfig:
        legacy_optimizer = 'LambOptimizer'
    elif optimizer_config == optim.SGDConfig:
        legacy_optimizer = 'SGDOptimizer'
    else:
        raise RuntimeError("Invalid optimizer_config")

    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(lr=optim_config.lr)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc, legacy_optimizer,
                       None,
                       learning_rate_description,
                       device,
                       _use_deterministic_compute=True)
    legacy_losses = []
    for i in range(train_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses, True)
Code example #6
def testToyBERTModelLegacyExperimentalCustomOptimParameters(params, legacy_optim_map):
    # Common setup
    total_steps = 128
    device = "cuda"
    seed = 1

    # EXPERIMENTAL API
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    model_desc = bert_model_description()
    model = load_bert_onnx_model()

    optim_config = optim.AdamConfig(
        params, alpha=0.9, beta=0.999, lambda_coef=0.01, epsilon=1e-6, do_bias_correction=False
    )
    opts = orttrainer.ORTTrainerOptions(
        {
            "debug": {"deterministic_compute": True},
            "device": {
                "id": device,
            },
        }
    )
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, options=opts)

    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(trainer.optim_config.lr)

    legacy_trainer = Legacy_ORTTrainer(
        model,
        None,
        legacy_model_desc,
        "AdamOptimizer",
        legacy_optim_map,
        learning_rate_description,
        device,
        _use_deterministic_compute=True,
    )
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        legacy_sample_input = [*sample_input, learning_rate]
        legacy_losses.append(legacy_trainer.train_step(legacy_sample_input).cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses)
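The legacy_optim_map argument above is a callable that maps each parameter name to Adam
attributes. A minimal sketch of what such a map might look like (illustrative only, mirroring
the map_optimizer_attributes helpers in later examples and the AdamConfig values used above):

def example_legacy_optim_map(name):
    # Illustrative sketch: skip weight decay (lambda) for bias/gamma/beta/LayerNorm
    # parameters, matching alpha=0.9, beta=0.999, lambda_coef=0.01, epsilon=1e-6 above.
    no_decay = any(key in name for key in ["bias", "gamma", "beta", "LayerNorm"])
    if no_decay:
        return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6}
    return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6}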
Code example #7
    def _training_step(self, model: ORTTrainer,
                       inputs: Dict[str, torch.Tensor]) -> float:
        model.train()
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        outputs = model(**inputs)
        # model outputs are always a tuple in transformers (see doc)
        loss = outputs[0]

        return loss.item()
def testToyBERTModelLegacyExperimentalBasicTraining():
    # Common setup
    train_steps = 10
    device = 'cuda'
    seed = 1
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)

    # EXPERIMENTAL API
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    params = optimizer_parameters(model)
    optim_config = optim.LambConfig()
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
    })
    trainer = orttrainer.ORTTrainer(model,
                                    model_desc,
                                    optim_config,
                                    options=opts)
    experimental_losses = []
    for i in range(train_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(
            trainer.train_step(*sample_input).cpu().item())

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(
        lr=0.001)
    legacy_trainer = Legacy_ORTTrainer(model, None, legacy_model_desc,
                                       "LambOptimizer", None,
                                       learning_rate_description, device)
    legacy_losses = []
    for i in range(train_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input, learning_rate)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses,
                                       legacy_losses,
                                       True,
                                       rtol=1e-5)
Code example #9
def testORTTrainerLegacyAndExperimentalPrecisionLossScaler(seed, device):
    # Common data
    total_steps = 5
    bptt = 35

    # Setup experimental API
    torch.manual_seed(seed)
    set_seed(seed)
    loss_scaler = amp.DynamicLossScaler()
    options = orttrainer.ORTTrainerOptions({'device' : {'id' : device},
                                            'mixed_precision' : {
                                                'enabled' : True,
                                                'loss_scaler' : loss_scaler},
                                            'debug' : {'deterministic_compute' : True,}})
    model, model_desc, my_loss, batcher_fn, train_data, val_data, _ = _load_pytorch_transformer_model(device)
    optim_config = optim.LambConfig(lr=0.001)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=options)
    # Training loop
    experimental_loss = []
    experimental_preds_dtype = []
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        exp_loss, exp_preds = trainer.train_step(data, targets)
        experimental_loss.append(exp_loss.cpu())
        experimental_preds_dtype.append(exp_preds.dtype)

    # Setup legacy API
    torch.manual_seed(seed)
    set_seed(seed)
    model, (model_desc, lr_desc), _, _, _, _, _ = _load_pytorch_transformer_model(device, legacy_api=True)
    loss_scaler = Legacy_LossScaler('ort_test_input_loss_scalar', True)
    legacy_trainer = Legacy_ORTTrainer(model, my_loss, model_desc, "LambOptimizer",
                                       None, lr_desc, device=device,
                                       _use_deterministic_compute=True,
                                       use_mixed_precision=True,
                                       loss_scaler=loss_scaler)
    # Training loop
    legacy_loss = []
    legacy_preds_dtype = []
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        leg_loss, leg_preds = legacy_trainer.train_step(data, targets, torch.tensor([optim_config.lr]))
        legacy_loss.append(leg_loss.cpu())
        legacy_preds_dtype.append(leg_preds.dtype)

    # Compare legacy vs experimental APIs
    assert experimental_preds_dtype == legacy_preds_dtype
    _test_helpers.assert_legacy_onnx_weights(trainer, legacy_trainer, rtol=1e-4, atol=1e-2)
    _test_helpers.assert_model_outputs(legacy_loss, experimental_loss, rtol=1e-4)
Code example #10
File: train.py  Project: ganik/DeBERTa
def create_ort_trainer(args, device, model):
    # set GPU memory limitation
    from onnxruntime.capi._pybind_state import set_cuda_mem_limit
    ort_cuda_mem_limit_in_gbs = 1
    set_cuda_mem_limit(int(ort_cuda_mem_limit_in_gbs * 1024 * 1024 * 1024))

    # BertLAMB default initial settings: b1=0.9, b2=0.999, e=1e-6
    def map_optimizer_attributes(name):
        no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
        no_decay = False
        for no_decay_key in no_decay_keys:
            if no_decay_key in name:
                no_decay = True
                break
        if no_decay:
            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6}
        else:
            return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6}

    # we request ORTTrainer to create a LambOptimizer with given optimizer_attributes. 
    # train_step does forward, backward, and optimize step.
    model = ORTTrainer(model, None, bert_model_description(args), "LambOptimizer",
                       map_optimizer_attributes,
                       IODescription('Learning_Rate', [1, ], torch.float32),
                       device,
                       _opset_version=10)

    if args.fp16:
        setattr(args, 'ort_loss_scale', LossScaler(model.loss_scale_input_name, True, up_scale_window=2000))

    return model
Code example #11
 def get_trainer(
     self,
     model,
     model_desc,
     device,
     onnx_opset_ver=12,
     frozen_weights=[],
     internal_loss_fn=False,
     get_lr_this_step=None,
     optimizer="SGDOptimizer",
 ):
     loss_fn = MNISTWrapper.my_loss if not internal_loss_fn else None
     return ORTTrainer(
         model,
         loss_fn,
         model_desc,
         optimizer,
         None,
         IODescription(
             "Learning_Rate",
             [
                 1,
             ],
             torch.float32,
         ),
         device,
         _opset_version=onnx_opset_ver,
         frozen_weights=frozen_weights,
         get_lr_this_step=get_lr_this_step,
     )
Code example #12
def create_ort_trainer(gradient_accumulation_steps,
                        use_mixed_precision,
                        allreduce_post_accumulation,
                        use_simple_model_desc=True,
                        loss_scaler=None,
                        partition_optimizer=False):
    model_desc = bert_model_description()
    simple_model_desc = remove_extra_info(model_desc) if use_simple_model_desc else model_desc
    learning_rate_description = ort_trainer_learning_rate_description()
    device = torch.device("cuda", 0)

    onnx_model = onnx.load(get_name("bert_toy_postprocessed.onnx"))

    model = ORTTrainer(onnx_model, None, simple_model_desc, "LambOptimizer",
                       map_optimizer_attributes,
                       learning_rate_description,
                       device, postprocess_model=None,
                       gradient_accumulation_steps=gradient_accumulation_steps,
                       world_rank=0, world_size=1,
                       loss_scaler=loss_scaler,
                       use_mixed_precision=use_mixed_precision,
                       allreduce_post_accumulation=allreduce_post_accumulation,
                       partition_optimizer=partition_optimizer)
    
    return model, model_desc, device
Code example #13
def create_ort_trainer(args, device, model):

    # set GPU memory limitation (per card!)
    from onnxruntime.capi._pybind_state import set_cuda_mem_limit
    ort_cuda_mem_limit_in_gbs = args.gpu_memory_limit_gb
    set_cuda_mem_limit(int(ort_cuda_mem_limit_in_gbs * 1024 * 1024 * 1024))

    # BertLAMB default initial settings: b1=0.9, b2=0.999, e=1e-6
    def map_optimizer_attributes(name):
        no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
        no_decay = False
        for no_decay_key in no_decay_keys:
            if no_decay_key in name:
                no_decay = True
                break
        if no_decay:
            return {
                "alpha": 0.9,
                "beta": 0.999,
                "lambda": 0.0,
                "epsilon": 1e-6
            }
        else:
            return {
                "alpha": 0.9,
                "beta": 0.999,
                "lambda": 0.01,
                "epsilon": 1e-6
            }

    # we request ORTTrainer to create a LambOptimizer with given optimizer_attributes.
    # train_step does forward, backward, and optimize step.
    model = ORTTrainer(
        model,
        None,
        bert_model_description(args),
        "LambOptimizer",
        map_optimizer_attributes,
        IODescription('Learning_Rate', [
            1,
        ], torch.float32),
        device,
        gradient_accumulation_steps=args.gradient_accumulation_steps,
        world_rank=args.world_rank,
        world_size=args.world_size,
        use_mixed_precision=True if args.fp16 else False,
        allreduce_post_accumulation=True
        if args.allreduce_post_accumulation else False,
        deepspeed_zero_stage=1 if args.deepspeed_zero_stage else 0,
        _opset_version=12)

    if args.fp16:
        setattr(
            args, 'ort_loss_scale',
            LossScaler(model.loss_scale_input_name, True,
                       up_scale_window=2000))

    return model
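A hedged sketch (not from the original file; `batch` and `lr` are placeholders) of how a
trainer returned by create_ort_trainer is typically driven. Because the LossScaler is kept
on args rather than passed to the ORTTrainer constructor, the caller feeds the current loss
scale to train_step alongside the learning rate, mirroring the run_test examples later on
this page.

# Hedged usage sketch; assumes `batch` is the list of model inputs for one step.
trainer = create_ort_trainer(args, device, model)
learning_rate = torch.tensor([lr])
if args.fp16:
    # external loss scaler: append the loss-scale tensor after the learning rate
    loss_scale = torch.tensor([args.ort_loss_scale.loss_scale_])
    loss = trainer.train_step(*batch, learning_rate, loss_scale)
else:
    loss = trainer.train_step(*batch, learning_rate)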
Code example #14
        def create_and_check_bert_model(self, config, input_ids,
                                        token_type_ids, input_mask,
                                        sequence_labels, token_labels,
                                        choice_labels):
            model = BertModel(config=config)
            model.to(input_ids.device)
            model.eval()

            sequence_output, pooled_output = model(
                input_ids,
                attention_mask=input_mask,
                token_type_ids=token_type_ids)

            # fails because there is no loss output
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc
            ], [self.last_hidden_state_desc, self.pooler_output_desc])
            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model,
                None,
                model_desc,
                "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription(
                    'Learning_Rate', [
                        1,
                    ], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=True if args_fp16 else False,
                allreduce_post_accumulation=True
                if args_allreduce_post_accumulation else False)

            sequence_output, pooled_output = model(
                input_ids, token_type_ids=token_type_ids)
            sequence_output, pooled_output = model(input_ids)

            result = {
                "sequence_output": sequence_output,
                "pooled_output": pooled_output,
            }
            self.parent.assertListEqual(
                list(result["sequence_output"].size()),
                [self.batch_size, self.seq_length, self.hidden_size])
            self.parent.assertListEqual(list(result["pooled_output"].size()),
                                        [self.batch_size, self.hidden_size])
Code example #15
 def get_trainer(self, model, model_desc, device, onnx_opset_ver=12):
     return ORTTrainer(model,
                       MNISTWrapper.my_loss,
                       model_desc,
                       "SGDOptimizer",
                       None,
                       IODescription('Learning_Rate', [
                           1,
                       ], torch.float32),
                       device,
                       _opset_version=onnx_opset_ver)
Code example #16
    def to_ort_model(self, model, config, args):
        model_desc = self.gpt2_model_description(config.n_head,
                                                 config.vocab_size,
                                                 config.n_embd, config.n_layer,
                                                 config.n_ctx,
                                                 args.per_gpu_train_batch_size)
        learning_rate_description = self.ort_trainer_learning_rate_description()

        def map_optimizer_attributes(name):
            no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
            no_decay = False
            for no_decay_key in no_decay_keys:
                if no_decay_key in name:
                    no_decay = True
                    break
            if no_decay:
                return {
                    "alpha": 0.9,
                    "beta": 0.999,
                    "lambda": 0.0,
                    "epsilon": args.adam_epsilon
                }
            else:
                return {
                    "alpha": 0.9,
                    "beta": 0.999,
                    "lambda": args.weight_decay,
                    "epsilon": args.adam_epsilon
                }

        from onnxruntime.capi._pybind_state import set_cuda_device_id, set_arena_extend_strategy, ArenaExtendStrategy
        set_arena_extend_strategy(ArenaExtendStrategy.kSameAsRequested)
        set_cuda_device_id(self.args.local_rank)

        model = ORTTrainer(
            model,
            None,
            model_desc,
            "AdamOptimizer",
            map_optimizer_attributes,
            learning_rate_description,
            args.device,
            gradient_accumulation_steps=args.gradient_accumulation_steps,
            world_rank=self.args.world_rank,
            world_size=self.args.world_size,
            use_mixed_precision=self.args.fp16,
            allreduce_post_accumulation=True,
            _opset_version=12)

        logger.info("****************************Model converted to ORT")
        return model
Code example #17
def testORTTrainerLegacyAndExperimentalWeightsCheck(seed, device):
    # Common data
    total_steps = 5
    bptt = 35

    # Setup for the experimental ORTTrainer run
    torch.manual_seed(seed)
    set_seed(seed)
    optim_config = optim.LambConfig()
    opts = orttrainer.ORTTrainerOptions({
        'device' : {
            'id' : device
        },
        'debug' : {
            'deterministic_compute': True
        },
    })
    model, model_desc, my_loss, batcher_fn, train_data, val_data, _ = _load_pytorch_transformer_model(device)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=opts)
    # Training loop
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        _ = trainer.train_step(data, targets)

    # Setup for the legacy ORTTrainer run
    torch.manual_seed(seed)
    set_seed(seed)
    model, (model_desc, lr_desc), _, _, _, _, _ = _load_pytorch_transformer_model(device, legacy_api=True)
    legacy_trainer = Legacy_ORTTrainer(model, my_loss, model_desc, "LambOptimizer", None, lr_desc,
                                       device, _use_deterministic_compute=True)
    # Training loop
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        _, _ = legacy_trainer.train_step(data, targets, torch.tensor([optim_config.lr]))

    # Compare legacy vs experimental APIs
    _test_helpers.assert_legacy_onnx_weights(trainer, legacy_trainer, rtol=1e-4)
Code example #18
def testORTTrainerLegacyAndExperimentalGradientAccumulation(seed, device, gradient_accumulation_steps, total_steps):
    # Common data
    torch.set_printoptions(precision=10)

    # Setup experimental API
    torch.manual_seed(seed)
    set_seed(seed)
    options = orttrainer.ORTTrainerOptions({'device' : {'id' : device},
                                            'batch' : {'gradient_accumulation_steps' : gradient_accumulation_steps},
                                            'debug' : {'deterministic_compute' : True}})
    model, model_desc, my_loss, batcher_fn, train_data, val_data, _ = _load_pytorch_transformer_model(device)
    optim_config = optim.LambConfig(lr=0.001)
    trainer = orttrainer.ORTTrainer(model, model_desc, optim_config, loss_fn=my_loss, options=options)
    # Training loop
    experimental_loss = []
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        exp_loss, exp_preds = trainer.train_step(data, targets)
        experimental_loss.append(exp_loss.cpu())

    # Setup legacy API
    torch.manual_seed(seed)
    set_seed(seed)
    model, (model_desc, lr_desc), _, _, _, _, _ = _load_pytorch_transformer_model(device, legacy_api=True)
    legacy_trainer = Legacy_ORTTrainer(model, my_loss, model_desc, "LambOptimizer",
                                       None, lr_desc, device=device,
                                       _use_deterministic_compute=True,
                                       gradient_accumulation_steps=gradient_accumulation_steps)
    # Training loop
    legacy_loss = []
    for i in range(total_steps):
        data, targets = batcher_fn(train_data, i)
        leg_loss, leg_preds = legacy_trainer.train_step(data, targets, torch.tensor([optim_config.lr]))
        legacy_loss.append(leg_loss.cpu())

    # Compare legacy vs experimental APIs
    _test_helpers.assert_model_outputs(legacy_loss, experimental_loss, rtol=1e-6)
Code example #19
    def testTrainingAndEvalDropout(self):
        # Temporarily disable this test.
        # The graph below will trigger ORT
        # to sort backward graph before forward graph which gives incorrect result.
        # TODO Re-enable when that is fixed.
        return
        class TwoDropoutNet(nn.Module):
            def __init__(self, drop_prb_1, drop_prb_2, dim_size):
                super(TwoDropoutNet, self).__init__()
                self.drop_1 = nn.Dropout(drop_prb_1)
                self.drop_2 = nn.Dropout(drop_prb_2)
                self.weight_1 = torch.nn.Parameter(torch.zeros(dim_size, dtype=torch.float32))
            def forward(self, x):
                x = x + self.weight_1
                x = self.drop_1(x)
                x = self.drop_2(x)
                output = x
                return output[0]
        dim_size = 3
        device = torch.device("cuda", 0)
        # This will drop all values, so we expect all zeros in the output tensor
        model = TwoDropoutNet(0.999, 0.999, dim_size)
        input_desc = IODescription('input', [dim_size], torch.float32)
        output_desc = IODescription('output', [], torch.float32)
        model_desc = ModelDescription([input_desc], [output_desc])
        lr_desc = ort_trainer_learning_rate_description()
        model = ORTTrainer(model, None, model_desc, "LambOptimizer",
                        map_optimizer_attributes,
                        lr_desc,
                        device,
                        postprocess_model=process_dropout,
                        world_rank=0, world_size=1)
        input = torch.ones(dim_size, dtype=torch.float32).to(device)
        expected_training_output = [0.0]
        expected_eval_output = [1.0]
        learning_rate = torch.tensor([1.0000000e+00]).to(device)
        input_args=[input, learning_rate]
        train_output = model.train_step(*input_args)

        rtol = 1e-04
        assert_allclose(expected_training_output, train_output.item(), rtol=rtol, err_msg="dropout training loss mismatch")

        eval_output = model.eval_step(input)
        assert_allclose(expected_eval_output, eval_output.item(), rtol=rtol, err_msg="dropout eval loss mismatch")
 
        # Do another train step to make sure it's using original ratios
        train_output_2 = model.train_step(*input_args)
        assert_allclose(expected_training_output, train_output_2.item(), rtol=rtol, err_msg="dropout training loss 2 mismatch")
Code example #20
        def create_and_check_bert_for_masked_lm(self, config, input_ids,
                                                token_type_ids, input_mask,
                                                sequence_labels, token_labels,
                                                choice_labels):
            model = BertForMaskedLM(config=config)
            model.eval()
            loss, prediction_scores = model(input_ids,
                                            attention_mask=input_mask,
                                            token_type_ids=token_type_ids,
                                            masked_lm_labels=token_labels)

            #####
            model_desc = ModelDescription([
                self.input_ids_desc, self.attention_mask_desc,
                self.token_type_ids_desc, self.masked_lm_labels_desc
            ], [self.loss_desc, self.prediction_scores_desc])
            args_gradient_accumulation_steps = 8
            args_local_rank = 0
            args_world_size = 1
            args_fp16 = True
            args_allreduce_post_accumulation = True

            model = ORTTrainer(
                model,
                None,
                model_desc,
                "LambOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription(
                    'Learning_Rate', [
                        1,
                    ], torch.float32),
                device=self.device,
                postprocess_model=postprocess_model,
                gradient_accumulation_steps=args_gradient_accumulation_steps,
                world_rank=args_local_rank,
                world_size=args_world_size,
                use_mixed_precision=True if args_fp16 else False,
                allreduce_post_accumulation=True
                if args_allreduce_post_accumulation else False)
            model(input_ids,
                  attention_mask=input_mask,
                  token_type_ids=token_type_ids,
                  masked_lm_labels=token_labels)
Code example #21
def testToyBERTModelLegacyExperimentalLRScheduler(initial_lr, lr_scheduler,
                                                  legacy_lr_scheduler):
    ############################################################################
    # These tests require hard-coded values for 'total_steps' and 'initial_lr' #
    ############################################################################

    # Common setup
    total_steps = 128
    device = 'cuda'
    seed = 1
    warmup = 0.05
    cycles = 0.5
    power = 1.
    lr_end = 1e-7

    # Setup both Experimental and Legacy LR Schedulers before the experimental loop
    if legacy_lr_scheduler == _test_commons.legacy_constant_lr_scheduler or legacy_lr_scheduler == _test_commons.legacy_linear_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler,
                                      initial_lr=initial_lr,
                                      total_steps=total_steps,
                                      warmup=warmup)
    elif legacy_lr_scheduler == _test_commons.legacy_cosine_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler,
                                      initial_lr=initial_lr,
                                      total_steps=total_steps,
                                      warmup=warmup,
                                      cycles=cycles)
    elif legacy_lr_scheduler == _test_commons.legacy_poly_lr_scheduler:
        legacy_lr_scheduler = partial(legacy_lr_scheduler,
                                      initial_lr=initial_lr,
                                      total_steps=total_steps,
                                      warmup=warmup,
                                      power=power,
                                      lr_end=lr_end)
    else:
        raise RuntimeError("Invalid legacy_lr_scheduler")
    if lr_scheduler == optim.lr_scheduler.ConstantWarmupLRScheduler or lr_scheduler == optim.lr_scheduler.LinearWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps, warmup=warmup)
    elif lr_scheduler == optim.lr_scheduler.CosineWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps,
                                    warmup=warmup,
                                    cycles=cycles)
    elif lr_scheduler == optim.lr_scheduler.PolyWarmupLRScheduler:
        lr_scheduler = lr_scheduler(total_steps=total_steps,
                                    warmup=warmup,
                                    power=power,
                                    lr_end=lr_end)
    else:
        raise RuntimeError("Invalid lr_scheduler")

    # EXPERIMENTAL API
    model_desc = bert_model_description()
    model = load_bert_onnx_model()
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    optim_config = optim.AdamConfig(lr=initial_lr)
    opts = orttrainer.ORTTrainerOptions({
        'debug': {
            'deterministic_compute': True
        },
        'device': {
            'id': device,
        },
        'lr_scheduler': lr_scheduler
    })
    trainer = orttrainer.ORTTrainer(model,
                                    model_desc,
                                    optim_config,
                                    options=opts)
    experimental_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        experimental_losses.append(
            trainer.train_step(*sample_input).cpu().item())
        assert_allclose(trainer.options.lr_scheduler.get_last_lr()[0],
                        legacy_lr_scheduler(i))

    # LEGACY IMPLEMENTATION
    torch.manual_seed(seed)
    onnxruntime.set_seed(seed)
    device = torch.device(device)
    model = load_bert_onnx_model()
    legacy_model_desc, learning_rate_description, learning_rate = legacy_model_params(
        initial_lr)
    legacy_trainer = Legacy_ORTTrainer(model,
                                       None,
                                       legacy_model_desc,
                                       "AdamOptimizer",
                                       None,
                                       learning_rate_description,
                                       device,
                                       _use_deterministic_compute=True,
                                       get_lr_this_step=legacy_lr_scheduler)
    legacy_losses = []
    for i in range(total_steps):
        sample_input = generate_random_input_from_model_desc(model_desc, i)
        leg_loss = legacy_trainer.train_step(*sample_input)
        legacy_losses.append(leg_loss.cpu().item())

    # Check results
    _test_helpers.assert_model_outputs(experimental_losses, legacy_losses)
Code example #22
    def train(self):
        """
        Main training entry point.
        """
        train_dataloader = self.get_train_dataloader()

        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            num_train_epochs = (
                self.args.max_steps // (len(train_dataloader) // self.args.gradient_accumulation_steps) + 1
            )
        else:
            t_total = int(len(train_dataloader) // self.args.gradient_accumulation_steps * self.args.num_train_epochs)
            num_train_epochs = self.args.num_train_epochs

        if self.use_new_api:
            lr_scheduler = orttrainer.optim.LinearWarmupLRScheduler(t_total, self.args.warmup_steps/float(t_total))

            loss_scaler = amp.DynamicLossScaler() if self.args.fp16 else None
            device = self.args.device.type
            device = f'{device}:{self.args.device.index}' if self.args.device.index else f'{device}:0'
            options = orttrainer.ORTTrainerOptions({'batch' : {
                                                        'gradient_accumulation_steps' : self.args.gradient_accumulation_steps},
                                                    'device': {'id': device},
                                                    'mixed_precision': {
                                                        'enabled': self.args.fp16,
                                                        'loss_scaler': loss_scaler},
                                                    'debug': {'deterministic_compute': True, },
                                                    'utils': {
                                                        'grad_norm_clip': False},
                                                    'distributed': {'allreduce_post_accumulation': True},
                                                    'lr_scheduler': lr_scheduler
                                                    })

            param_optimizer = list(self.model.named_parameters())
            params = [{
                'params': [n for n, p in param_optimizer if "bias" in n or "LayerNorm.weight" in n],
                "weight_decay_mode": 1, }, {
                'params': [n for n, p in param_optimizer if not ("bias" in n or "LayerNorm.weight" in n)],
                "weight_decay_mode": 1, }
                ]

            optim_config = optim.AdamConfig(params=params, lr=2e-5, do_bias_correction=True)
            self.model = orttrainer.ORTTrainer(self.model, self.new_model_desc, optim_config, options=options)
        else:
            def map_optimizer_attributes(name):
                no_decay = "bias" in name or "LayerNorm.weight" in name
                if no_decay:
                    return {"weight_decay_mode" : 1}
                else:
                    return {"weight_decay_mode" : 1}
            get_lr_this_step = get_linear_schedule_with_warmup(self.args.warmup_steps, t_total, self.args.learning_rate)
            loss_scaler = LossScaler('loss_scale_input_name', True, up_scale_window=2000) if self.args.fp16 else None
            self.model = ORTTrainer(self.model, None,
                self.model_desc,
                "AdamOptimizer",
                map_optimizer_attributes=map_optimizer_attributes,
                learning_rate_description=IODescription('Learning_Rate', [1,], torch.float32),
                device=self.args.device,
                gradient_accumulation_steps=self.args.gradient_accumulation_steps,
                use_mixed_precision=self.args.fp16,
                allreduce_post_accumulation=True,
                get_lr_this_step=get_lr_this_step,
                loss_scaler=loss_scaler,
                enable_grad_norm_clip=False,
                _opset_version=12,
                _use_deterministic_compute=True)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataloader.dataset))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per GPU = %d", self.args.per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.args.train_batch_size
            * self.args.gradient_accumulation_steps
            * (torch.distributed.get_world_size() if self.args.local_rank != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d", self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0

        tr_loss = 0.0
        logging_loss = 0.0
        train_iterator = trange(
            epochs_trained, int(num_train_epochs), desc="Epoch", disable=self.args.local_rank not in [-1, 0],
        )

        for epoch in train_iterator:
            epoch_iterator = tqdm(train_dataloader, desc="Iteration", disable=self.args.local_rank not in [-1, 0])
            for step, inputs in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                tr_loss += self._training_step(self.model, inputs)

                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                    len(epoch_iterator) <= self.args.gradient_accumulation_steps
                    and (step + 1) == len(epoch_iterator)
                ):
                    global_step += 1

                    if self.args.local_rank in [-1, 0]:
                        if (self.args.logging_steps > 0 and global_step % self.args.logging_steps == 0) or (
                            global_step == 1 and self.args.logging_first_step
                        ):
                            logs = {}
                            if self.args.evaluate_during_training:
                                results = self.evaluate()
                                for key, value in results.items():
                                    eval_key = "eval_{}".format(key)
                                    logs[eval_key] = value

                            loss_scalar = (tr_loss - logging_loss) / self.args.logging_steps
                            if not self.use_new_api:
                                learning_rate_scalar = get_lr_this_step(global_step)
                                logs["learning_rate"] = learning_rate_scalar
                            logs["loss"] = loss_scalar
                            logging_loss = tr_loss

                            epoch_iterator.write(json.dumps({**logs, **{"step": global_step}}))

                if self.args.max_steps > 0 and global_step > self.args.max_steps:
                    epoch_iterator.close()
                    break
            if self.args.max_steps > 0 and global_step > self.args.max_steps:
                train_iterator.close()
                break

        logger.info("\n\nTraining completed. \n\n")
        return TrainOutput(global_step, tr_loss / global_step)
Code example #23
    def train(self):
        """
        Main training entry point.
        """
        train_dataloader = self.get_train_dataloader()

        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            num_train_epochs = (self.args.max_steps //
                                (len(train_dataloader) //
                                 self.args.gradient_accumulation_steps) + 1)
        else:
            t_total = int(
                len(train_dataloader) //
                self.args.gradient_accumulation_steps *
                self.args.num_train_epochs)
            num_train_epochs = self.args.num_train_epochs

        get_lr_this_step = get_linear_schedule_with_warmup(
            self.args.warmup_steps, t_total, self.args.learning_rate)
        loss_scaler = LossScaler('loss_scale_input_name',
                                 True,
                                 up_scale_window=2000)

        def map_optimizer_attributes(name):
            # no_decay_keys = ["bias", "LayerNorm.weight"]
            no_decay = "bias" in name or "LayerNorm.weight" in name
            if no_decay:
                return {"weight_decay": 0.0, "weight_decay_mode": 1}
            else:
                return {
                    "weight_decay": self.args.weight_decay,
                    "weight_decay_mode": 1
                }

        self.model = ORTTrainer(
            self.model,
            None,
            self.model_desc,
            "AdamOptimizer",
            map_optimizer_attributes=map_optimizer_attributes,
            learning_rate_description=IODescription('Learning_Rate', [
                1,
            ], torch.float32),
            device=self.args.device,
            gradient_accumulation_steps=self.args.gradient_accumulation_steps,
            world_rank=0,
            world_size=1,  # only support single GPU cases
            use_mixed_precision=self.args.fp16,
            allreduce_post_accumulation=True,
            get_lr_this_step=get_lr_this_step,
            loss_scaler=loss_scaler,
            enable_grad_norm_clip=False,
            _opset_version=12,
            _use_deterministic_compute=True)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataloader.dataset))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per GPU = %d",
                    self.args.per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.args.train_batch_size *
            self.args.gradient_accumulation_steps *
            (torch.distributed.get_world_size()
             if self.args.local_rank != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d",
                    self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0

        tr_loss = 0.0
        logging_loss = 0.0
        train_iterator = trange(
            epochs_trained,
            int(num_train_epochs),
            desc="Epoch",
            disable=self.args.local_rank not in [-1, 0],
        )

        for epoch in train_iterator:
            epoch_iterator = tqdm(train_dataloader,
                                  desc="Iteration",
                                  disable=self.args.local_rank not in [-1, 0])
            for step, inputs in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                tr_loss += self._training_step(self.model, inputs)

                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                        len(epoch_iterator) <=
                        self.args.gradient_accumulation_steps and
                    (step + 1) == len(epoch_iterator)):
                    global_step += 1

                    if self.args.local_rank in [-1, 0]:
                        if (self.args.logging_steps > 0
                                and global_step % self.args.logging_steps
                                == 0) or (global_step == 1
                                          and self.args.logging_first_step):
                            logs = {}
                            if self.args.evaluate_during_training:
                                results = self.evaluate()
                                for key, value in results.items():
                                    eval_key = "eval_{}".format(key)
                                    logs[eval_key] = value

                            loss_scalar = (tr_loss - logging_loss
                                           ) / self.args.logging_steps
                            learning_rate_scalar = get_lr_this_step(
                                global_step)
                            logs["learning_rate"] = learning_rate_scalar
                            logs["loss"] = loss_scalar
                            logging_loss = tr_loss

                            epoch_iterator.write(
                                json.dumps({
                                    **logs,
                                    **{
                                        "step": global_step
                                    }
                                }))

                if self.args.max_steps > 0 and global_step > self.args.max_steps:
                    epoch_iterator.close()
                    break
            if self.args.max_steps > 0 and global_step > self.args.max_steps:
                train_iterator.close()
                break

        logger.info("\n\nTraining completed. \n\n")
        return TrainOutput(global_step, tr_loss / global_step)
Code example #24
                                    lr,
                                ], torch.float32)


model_desc, lr_desc = transformer_model_description()


def get_lr_this_step(global_step):
    return 1


trainer = ORTTrainer(
    model,
    my_loss,
    model_desc,
    "LambOptimizer",
    None,
    lr_desc,
    device,
    _use_deterministic_compute=True)  #, get_lr_this_step=get_lr_this_step)
second_trainer = ORTTrainer(
    model,
    my_loss,
    model_desc,
    "LambOptimizer",
    None,
    lr_desc,
    device,
    _use_deterministic_compute=True)  #, get_lr_this_step=get_lr_this_step)

import time
Code example #25
def run_test(model, model_desc, device, args, gradient_accumulation_steps,
             fp16, allreduce_post_accumulation, get_lr_this_step,
             use_internal_get_lr_this_step, loss_scaler,
             use_internal_loss_scaler, batch_args_option, dataset_len, epochs,
             use_new_api):
    dataloader = create_ort_test_dataloader(model_desc.inputs_,
                                            args.batch_size, args.seq_len,
                                            dataset_len, device)

    if use_new_api:
        assert use_internal_loss_scaler, 'new api should always use internal loss scaler'

        new_api_lr_scheduler = WrapLRScheduler(get_lr_this_step)

        new_api_loss_scaler = amp.DynamicLossScaler() if fp16 else None
        options = orttrainer.ORTTrainerOptions({
            'batch': {
                'gradient_accumulation_steps': gradient_accumulation_steps
            },
            'device': {
                'id': device
            },
            'mixed_precision': {
                'enabled': fp16,
                'loss_scaler': new_api_loss_scaler
            },
            'debug': {
                'deterministic_compute': True,
            },
            'utils': {
                'grad_norm_clip': True
            },
            'distributed': {
                'allreduce_post_accumulation': True
            },
            'lr_scheduler':
            new_api_lr_scheduler
        })

        param_optimizer = list(model.named_parameters())
        params = [{
            'params': [n for n, p in param_optimizer if "bias" in n or "LayerNorm.weight" in n],
            "alpha": 0.9,
            "beta": 0.999,
            "lambda": 0.0,
            "epsilon": 1e-6
        }, {
            'params': [n for n, p in param_optimizer if not ("bias" in n or "LayerNorm.weight" in n)],
            "alpha": 0.9,
            "beta": 0.999,
            "lambda": 0.0,
            "epsilon": 1e-6
        }]

        vocab_size = 99
        new_model_desc = {
            'inputs': [
                ('input_ids', ['batch', 'max_seq_len_in_batch']),
                ('attention_mask', ['batch', 'max_seq_len_in_batch']),
                ('token_type_ids', ['batch', 'max_seq_len_in_batch']),
                ('masked_lm_labels', ['batch', 'max_seq_len_in_batch']),
                ('next_sentence_label', ['batch'])],
            'outputs': [
                ('loss', [1], True),
                ('prediction_scores', ['batch', 'max_seq_len_in_batch', vocab_size]),
                ('seq_relationship_scores', ['batch', 2])]
        }

        optim_config = optim.LambConfig(params=params, lr=2e-5)
        model = orttrainer.ORTTrainer(model,
                                      new_model_desc,
                                      optim_config,
                                      options=options)
        print("running with new frontend API")
    else:
        model = ORTTrainer(
            model,
            None,
            model_desc,
            "LambOptimizer",
            map_optimizer_attributes=map_optimizer_attributes,
            learning_rate_description=IODescription('Learning_Rate', [
                1,
            ], torch.float32),
            device=device,
            _enable_internal_postprocess=True,
            gradient_accumulation_steps=gradient_accumulation_steps,
            # BertLAMB default initial settings: b1=0.9, b2=0.999, e=1e-6
            world_rank=args.local_rank,
            world_size=args.world_size,
            use_mixed_precision=fp16,
            allreduce_post_accumulation=allreduce_post_accumulation,
            get_lr_this_step=get_lr_this_step
            if use_internal_get_lr_this_step else None,
            loss_scaler=loss_scaler if use_internal_loss_scaler else None,
            _opset_version=14,
            _use_deterministic_compute=True)
        print("running with old frontend API")

    # training loop
    eval_batch = None
    if not use_new_api:
        model.train()
    for epoch in range(epochs):
        for step, batch in enumerate(dataloader):
            if eval_batch is None:
                eval_batch = batch

            if not use_internal_get_lr_this_step:
                lr = get_lr_this_step(step)
                learning_rate = torch.tensor([lr])

            if not use_internal_loss_scaler and fp16:
                loss_scale = torch.tensor([loss_scaler.loss_scale_])

            if batch_args_option == BatchArgsOption.List:
                if not use_internal_get_lr_this_step:
                    batch = batch + [
                        learning_rate,
                    ]
                if not use_internal_loss_scaler and fp16:
                    batch = batch + [
                        loss_scale,
                    ]
                outputs = model.train_step(*batch)
            elif batch_args_option == BatchArgsOption.Dict:
                args, kwargs = split_batch(batch, model_desc.inputs_, 0)
                if not use_internal_get_lr_this_step:
                    kwargs['Learning_Rate'] = learning_rate
                if not use_internal_loss_scaler and fp16:
                    kwargs[model.loss_scale_input_name] = loss_scale
                outputs = model.train_step(*args, **kwargs)
            else:
                args_count = len(model_desc.inputs_) // 2  # approximately half args, half kwargs
                args, kwargs = split_batch(batch, model_desc.inputs_,
                                           args_count)
                if not use_internal_get_lr_this_step:
                    kwargs['Learning_Rate'] = learning_rate
                if not use_internal_loss_scaler and fp16:
                    kwargs[model.loss_scale_input_name] = loss_scale
                outputs = model.train_step(*args, **kwargs)

    # eval
    if batch_args_option == BatchArgsOption.List:
        outputs = model.eval_step(*batch)
    elif batch_args_option == BatchArgsOption.Dict:
        args, kwargs = split_batch(batch, model_desc.inputs_, 0)
        outputs = model.eval_step(*args, **kwargs)
    else:
        args_count = len(model_desc.inputs_) // 2  # approximately half args, half kwargs
        args, kwargs = split_batch(batch, model_desc.inputs_, args_count)
        outputs = model.eval_step(*args, **kwargs)

    return (output.cpu().numpy() for output in outputs)
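
Note: the three BatchArgsOption branches above differ only in how the batch is split between positional and keyword arguments before train_step/eval_step. Below is a minimal sketch of a split_batch helper inferred from that usage; the real helper is defined elsewhere in this test file, so treat the exact signature and the name_ attribute as assumptions.

def split_batch(batch, input_descs, num_args):
    # Sketch only: keep the first `num_args` tensors positional and key the rest
    # by the corresponding input description's name (assumed to be `name_`,
    # matching the legacy frontend's trailing-underscore convention).
    args = list(batch[:num_args])
    kwargs = {
        desc.name_: tensor
        for desc, tensor in zip(input_descs[num_args:], batch[num_args:])
    }
    return args, kwargs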
Code example #26
def run_test(model, model_desc, device, args, gradient_accumulation_steps,
             fp16, allreduce_post_accumulation, get_lr_this_step,
             use_internal_get_lr_this_step, loss_scaler,
             use_internal_loss_scaler, batch_args_option):
    dataloader = create_ort_test_dataloader(model_desc.inputs_,
                                            args.batch_size, args.seq_len,
                                            device)

    model = ORTTrainer(
        model,
        None,
        model_desc,
        "LambOptimizer",
        map_optimizer_attributes=map_optimizer_attributes,
        learning_rate_description=IODescription('Learning_Rate', [1], torch.float32),
        device=device,
        postprocess_model=postprocess_model,
        gradient_accumulation_steps=gradient_accumulation_steps,
        # BertLAMB default initial settings: b1=0.9, b2=0.999, e=1e-6
        world_rank=args.local_rank,
        world_size=args.world_size,
        use_mixed_precision=fp16,
        allreduce_post_accumulation=allreduce_post_accumulation,
        get_lr_this_step=get_lr_this_step
        if use_internal_get_lr_this_step else None,
        loss_scaler=loss_scaler if use_internal_loss_scaler else None,
        _opset_version=12)

    # training loop
    eval_batch = None
    model.train()
    for step, batch in enumerate(dataloader):
        if eval_batch is None:
            eval_batch = batch

        if not use_internal_get_lr_this_step:
            lr = get_lr_this_step(step)
            learning_rate = torch.tensor([lr])

        if not use_internal_loss_scaler and fp16:
            loss_scale = torch.tensor([loss_scaler.loss_scale_])

        if batch_args_option == BatchArgsOption.List:
            if not use_internal_get_lr_this_step:
                batch = batch + [
                    learning_rate,
                ]
            if not use_internal_loss_scaler and fp16:
                batch = batch + [
                    loss_scale,
                ]
            outputs = model(*batch)
        elif batch_args_option == BatchArgsOption.Dict:
            args, kwargs = split_batch(batch, model_desc.inputs_, 0)
            if not use_internal_get_lr_this_step:
                kwargs['Learning_Rate'] = learning_rate
            if not use_internal_loss_scaler and fp16:
                kwargs[model.loss_scale_input_name] = loss_scale
            outputs = model(*args, **kwargs)
        else:
            args_count = len(model_desc.inputs_) // 2  # approximately half args, half kwargs
            args, kwargs = split_batch(batch, model_desc.inputs_, args_count)
            if not use_internal_get_lr_this_step:
                kwargs['Learning_Rate'] = learning_rate
            if not use_internal_loss_scaler and fp16:
                kwargs[model.loss_scale_input_name] = loss_scale
            outputs = model(*args, **kwargs)

        print(outputs[0])

    # eval
    model.eval()
    if batch_args_option == BatchArgsOption.List:
        outputs = model(*batch)
    elif batch_args_option == BatchArgsOption.Dict:
        args, kwargs = split_batch(batch, model_desc.inputs_, 0)
        outputs = model(*args, **kwargs)
    else:
        args_count = len(model_desc.inputs_) // 2  # approximately half args, half kwargs
        args, kwargs = split_batch(batch, model_desc.inputs_, args_count)
        outputs = model(*args, **kwargs)

    return (output.cpu().numpy() for output in outputs)
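
Both run_test variants above hand the legacy ORTTrainer a map_optimizer_attributes callback and a BatchArgsOption value that are defined elsewhere in this test module. The sketch below shows plausible definitions consistent with how they are used; the no-decay key list and the 0.01 weight decay for the remaining parameters are assumptions.

from enum import Enum


class BatchArgsOption(Enum):
    # Assumed enum: controls whether the batch is passed positionally, as
    # keyword arguments, or split between the two (see the branches above).
    List = 1
    Dict = 2
    ListAndDict = 3


def map_optimizer_attributes(name):
    # Assumed callback: called once per parameter name, returning the LAMB
    # hyperparameters for that parameter (mirroring the per-group values used
    # with the new frontend earlier in this file).
    no_decay_keys = ["bias", "gamma", "beta", "LayerNorm"]
    if any(key in name for key in no_decay_keys):
        return {"alpha": 0.9, "beta": 0.999, "lambda": 0.0, "epsilon": 1e-6}
    return {"alpha": 0.9, "beta": 0.999, "lambda": 0.01, "epsilon": 1e-6}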
Code example #27
File: mnist_training.py  Project: xadupre/onnxruntime
def main():
    # Training settings
    parser = argparse.ArgumentParser(description="PyTorch MNIST Example")
    parser.add_argument("--batch-size",
                        type=int,
                        default=64,
                        metavar="N",
                        help="input batch size for training (default: 64)")
    parser.add_argument("--test-batch-size",
                        type=int,
                        default=1000,
                        metavar="N",
                        help="input batch size for testing (default: 1000)")
    parser.add_argument("--epochs",
                        type=int,
                        default=10,
                        metavar="N",
                        help="number of epochs to train (default: 10)")
    parser.add_argument("--lr",
                        type=float,
                        default=0.01,
                        metavar="LR",
                        help="learning rate (default: 0.01)")
    parser.add_argument("--no-cuda",
                        action="store_true",
                        default=False,
                        help="disables CUDA training")
    parser.add_argument("--seed",
                        type=int,
                        default=1,
                        metavar="S",
                        help="random seed (default: 1)")
    parser.add_argument(
        "--log-interval",
        type=int,
        default=10,
        metavar="N",
        help="how many batches to wait before logging training status",
    )

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    kwargs = {"num_workers": 0, "pin_memory": True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=True,
            download=True,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.batch_size,
        shuffle=True,
        **kwargs,
    )
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST(
            "../data",
            train=False,
            transform=transforms.Compose([
                transforms.ToTensor(),
                transforms.Normalize((0.1307, ), (0.3081, ))
            ]),
        ),
        batch_size=args.test_batch_size,
        shuffle=True,
        **kwargs,
    )

    comm = MPI.COMM_WORLD
    args.local_rank = (int(os.environ["OMPI_COMM_WORLD_LOCAL_RANK"]) if
                       ("OMPI_COMM_WORLD_LOCAL_RANK" in os.environ) else 0)
    args.world_rank = int(os.environ["OMPI_COMM_WORLD_RANK"]) if (
        "OMPI_COMM_WORLD_RANK" in os.environ) else 0
    args.world_size = comm.Get_size()
    if use_cuda:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        set_cuda_device_id(args.local_rank)
    else:
        device = torch.device("cpu")

    input_size = 784
    hidden_size = 500
    num_classes = 10
    model = NeuralNet(input_size, hidden_size, num_classes)

    model_desc = mnist_model_description()
    # use log_interval as gradient accumulate steps
    trainer = ORTTrainer(
        model,
        my_loss,
        model_desc,
        "SGDOptimizer",
        None,
        IODescription("Learning_Rate", [1], torch.float32),
        device,
        1,
        args.world_rank,
        args.world_size,
        use_mixed_precision=False,
        allreduce_post_accumulation=True,
    )
    print("\nBuild ort model done.")

    for epoch in range(1, args.epochs + 1):
        train_with_trainer(args, trainer, device, train_loader, epoch)
        test_with_trainer(args, trainer, device, test_loader)
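
NeuralNet, my_loss and mnist_model_description() are defined earlier in mnist_training.py. A sketch consistent with the sizes used above (784 inputs, 500 hidden units, 10 classes) could look as follows; the I/O names and the import path for the legacy descriptions are assumptions.

import torch
import torch.nn.functional as F
from onnxruntime.capi.ort_trainer import IODescription, ModelDescription


class NeuralNet(torch.nn.Module):
    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.relu = torch.nn.ReLU()
        self.fc2 = torch.nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))


def my_loss(x, target):
    # negative log-likelihood over log-softmax scores
    return F.nll_loss(F.log_softmax(x, dim=1), target)


def mnist_model_description():
    # Two graph inputs (flattened image, label) and two outputs (loss, scores);
    # the names here are illustrative placeholders.
    input_desc = IODescription("input1", ["batch", 784], torch.float32)
    label_desc = IODescription("label", ["batch"], torch.int64, num_classes=10)
    loss_desc = IODescription("loss", [], torch.float32)
    probability_desc = IODescription("probability", ["batch", 10], torch.float32)
    return ModelDescription([input_desc, label_desc], [loss_desc, probability_desc])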
Code example #28
class ORTTransformerTrainer:
    """
    """

    model: PreTrainedModel
    args: TrainingArguments
    train_dataset: Dataset
    eval_dataset: Dataset
    compute_metrics: Callable[[EvalPrediction], Dict]

    def __init__(
        self,
        model: PreTrainedModel,
        model_desc: ModelDescription,
        args: TrainingArguments,
        train_dataset: Dataset,
        eval_dataset: Dataset,
        compute_metrics: Callable[[EvalPrediction], Dict],
    ):
        """
        """

        self.model = model
        self.model_desc = model_desc
        self.args = args
        self.data_collator = DefaultDataCollator()
        self.train_dataset = train_dataset
        self.eval_dataset = eval_dataset
        self.compute_metrics = compute_metrics
        set_seed(self.args.seed)
        # Create output directory if needed
        if self.args.local_rank in [-1, 0]:
            os.makedirs(self.args.output_dir, exist_ok=True)

    def get_train_dataloader(self) -> DataLoader:
        if self.train_dataset is None:
            raise ValueError("Trainer: training requires a train_dataset.")
        train_sampler = (SequentialSampler(self.train_dataset)
                         if self.args.local_rank == -1 else DistributedSampler(
                             self.train_dataset))
        return DataLoader(
            self.train_dataset,
            batch_size=self.args.train_batch_size,
            sampler=train_sampler,
            collate_fn=self.data_collator.collate_batch,
        )

    def get_eval_dataloader(self) -> DataLoader:
        return DataLoader(
            self.eval_dataset,
            batch_size=self.args.eval_batch_size,
            shuffle=False,
            collate_fn=self.data_collator.collate_batch,
        )

    def get_test_dataloader(self, test_dataset: Dataset) -> DataLoader:
        # We use the same batch_size as for eval.
        return DataLoader(
            test_dataset,
            batch_size=self.args.eval_batch_size,
            shuffle=False,
            collate_fn=self.data_collator.collate_batch,
        )

    def train(self):
        """
        Main training entry point.
        """
        train_dataloader = self.get_train_dataloader()

        if self.args.max_steps > 0:
            t_total = self.args.max_steps
            num_train_epochs = (self.args.max_steps //
                                (len(train_dataloader) //
                                 self.args.gradient_accumulation_steps) + 1)
        else:
            t_total = int(
                len(train_dataloader) //
                self.args.gradient_accumulation_steps *
                self.args.num_train_epochs)
            num_train_epochs = self.args.num_train_epochs

        get_lr_this_step = get_linear_schedule_with_warmup(
            self.args.warmup_steps, t_total, self.args.learning_rate)
        loss_scaler = LossScaler('loss_scale_input_name',
                                 True,
                                 up_scale_window=2000)

        def map_optimizer_attributes(name):
            # no_decay_keys = ["bias", "LayerNorm.weight"]
            no_decay = "bias" in name or "LayerNorm.weight" in name
            if no_decay:
                return {"weight_decay": 0.0, "weight_decay_mode": 1}
            else:
                return {
                    "weight_decay": self.args.weight_decay,
                    "weight_decay_mode": 1
                }

        self.model = ORTTrainer(
            self.model,
            None,
            self.model_desc,
            "AdamOptimizer",
            map_optimizer_attributes=map_optimizer_attributes,
            learning_rate_description=IODescription('Learning_Rate', [
                1,
            ], torch.float32),
            device=self.args.device,
            gradient_accumulation_steps=self.args.gradient_accumulation_steps,
            world_rank=0,
            world_size=1,  # only support single GPU cases
            use_mixed_precision=self.args.fp16,
            allreduce_post_accumulation=True,
            get_lr_this_step=get_lr_this_step,
            loss_scaler=loss_scaler,
            enable_grad_norm_clip=False,
            _opset_version=12,
            _use_deterministic_compute=True)

        # Train!
        logger.info("***** Running training *****")
        logger.info("  Num examples = %d", len(train_dataloader.dataset))
        logger.info("  Num Epochs = %d", num_train_epochs)
        logger.info("  Instantaneous batch size per GPU = %d",
                    self.args.per_gpu_train_batch_size)
        logger.info(
            "  Total train batch size (w. parallel, distributed & accumulation) = %d",
            self.args.train_batch_size *
            self.args.gradient_accumulation_steps *
            (torch.distributed.get_world_size()
             if self.args.local_rank != -1 else 1),
        )
        logger.info("  Gradient Accumulation steps = %d",
                    self.args.gradient_accumulation_steps)
        logger.info("  Total optimization steps = %d", t_total)

        global_step = 0
        epochs_trained = 0
        steps_trained_in_current_epoch = 0

        tr_loss = 0.0
        logging_loss = 0.0
        train_iterator = trange(
            epochs_trained,
            int(num_train_epochs),
            desc="Epoch",
            disable=self.args.local_rank not in [-1, 0],
        )

        for epoch in train_iterator:
            epoch_iterator = tqdm(train_dataloader,
                                  desc="Iteration",
                                  disable=self.args.local_rank not in [-1, 0])
            for step, inputs in enumerate(epoch_iterator):

                # Skip past any already trained steps if resuming training
                if steps_trained_in_current_epoch > 0:
                    steps_trained_in_current_epoch -= 1
                    continue

                tr_loss += self._training_step(self.model, inputs)

                if (step + 1) % self.args.gradient_accumulation_steps == 0 or (
                        len(epoch_iterator) <=
                        self.args.gradient_accumulation_steps and
                    (step + 1) == len(epoch_iterator)):
                    global_step += 1

                    if self.args.local_rank in [-1, 0]:
                        if (self.args.logging_steps > 0
                                and global_step % self.args.logging_steps
                                == 0) or (global_step == 1
                                          and self.args.logging_first_step):
                            logs = {}
                            if self.args.evaluate_during_training:
                                results = self.evaluate()
                                for key, value in results.items():
                                    eval_key = "eval_{}".format(key)
                                    logs[eval_key] = value

                            loss_scalar = (tr_loss - logging_loss
                                           ) / self.args.logging_steps
                            learning_rate_scalar = get_lr_this_step(
                                global_step)
                            logs["learning_rate"] = learning_rate_scalar
                            logs["loss"] = loss_scalar
                            logging_loss = tr_loss

                            epoch_iterator.write(
                                json.dumps({
                                    **logs,
                                    **{
                                        "step": global_step
                                    }
                                }))

                if self.args.max_steps > 0 and global_step > self.args.max_steps:
                    epoch_iterator.close()
                    break
            if self.args.max_steps > 0 and global_step > self.args.max_steps:
                train_iterator.close()
                break

        logger.info("\n\nTraining completed. \n\n")
        return TrainOutput(global_step, tr_loss / global_step)

    def _training_step(self, model: ORTTrainer,
                       inputs: Dict[str, torch.Tensor]) -> float:
        for k, v in inputs.items():
            inputs[k] = v.to(self.args.device)

        outputs = model(**inputs)
        loss = outputs[0]  # model outputs are always tuple in transformers (see doc)

        return loss.item()

    def save_model(self, output_dir: Optional[str] = None):
        output_dir = output_dir if output_dir is not None else self.args.output_dir
        os.makedirs(output_dir, exist_ok=True)
        self.model.save_as_onnx(os.path.join(output_dir, "transformer.onnx"))

    def evaluate(self) -> Dict[str, float]:
        """
        Run evaluation and return metrics.

        Returns:
            A dict containing:
                - the eval loss
                - the potential metrics computed from the predictions
        """
        eval_dataloader = self.get_eval_dataloader()

        output = self._prediction_loop(eval_dataloader,
                                       description="Evaluation")
        return output.metrics

    def predict(self, test_dataset: Dataset) -> PredictionOutput:
        """
        Run prediction and return predictions and potential metrics.

        Depending on the dataset and your use case, your test dataset may contain labels.
        In that case, this method will also return metrics, like in evaluate().
        """
        test_dataloader = self.get_test_dataloader(test_dataset)
        return self._prediction_loop(test_dataloader, description="Prediction")

    def _prediction_loop(self, dataloader: DataLoader,
                         description: str) -> PredictionOutput:
        """
        Prediction/evaluation loop, shared by `evaluate()` and `predict()`.

        Works both with or without labels.
        """

        logger.info("***** Running %s *****", description)
        logger.info("  Num examples = %d", len(dataloader.dataset))
        logger.info("  Batch size = %d", dataloader.batch_size)
        eval_losses: List[float] = []
        preds: Optional[np.ndarray] = None
        label_ids: Optional[np.ndarray] = None
        self.model.eval()

        for inputs in tqdm(dataloader, desc=description):
            has_labels = any(
                inputs.get(k) is not None
                for k in ["labels", "masked_lm_labels"])

            for k, v in inputs.items():
                inputs[k] = v.to(self.args.device)

            with torch.no_grad():
                outputs = self.model(**inputs)
                if has_labels:
                    step_eval_loss, logits = outputs[:2]
                    eval_losses += [step_eval_loss.mean().item()]
                else:
                    logits = outputs[0]

            if preds is None:
                preds = logits.detach().cpu().numpy()
            else:
                preds = np.append(preds, logits.detach().cpu().numpy(), axis=0)
            if inputs.get("labels") is not None:
                if label_ids is None:
                    label_ids = inputs["labels"].detach().cpu().numpy()
                else:
                    label_ids = np.append(
                        label_ids,
                        inputs["labels"].detach().cpu().numpy(),
                        axis=0)

        if self.compute_metrics is not None and preds is not None and label_ids is not None:
            metrics = self.compute_metrics(
                EvalPrediction(predictions=preds, label_ids=label_ids))
        else:
            metrics = {}
        if len(eval_losses) > 0:
            metrics["loss"] = np.mean(eval_losses)

        return PredictionOutput(predictions=preds,
                                label_ids=label_ids,
                                metrics=metrics)
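
train() above expects get_linear_schedule_with_warmup(warmup_steps, t_total, learning_rate) to return a plain callable that maps the global step to a learning-rate value (unlike the PyTorch scheduler of the same name, which wraps an optimizer). A self-contained sketch of such a helper, offered purely as an assumption about its shape, is:

def get_linear_schedule_with_warmup(num_warmup_steps, num_training_steps, base_lr):
    # Linear warmup to base_lr, then linear decay to zero; returns a callable
    # usable as ORTTrainer's get_lr_this_step.
    def lr_this_step(global_step):
        if global_step < num_warmup_steps:
            return base_lr * float(global_step) / float(max(1, num_warmup_steps))
        remaining = float(num_training_steps - global_step)
        span = float(max(1, num_training_steps - num_warmup_steps))
        return base_lr * max(0.0, remaining / span)

    return lr_this_step


# e.g. peak LR 2e-5, 100 warmup steps, 1000 total steps
schedule = get_linear_schedule_with_warmup(100, 1000, 2e-5)
assert abs(schedule(50) - 1e-5) < 1e-12   # halfway through warmup
assert schedule(1000) == 0.0              # fully decayed at the last step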
Code example #29
def main():
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=0.01, metavar='LR',
                        help='learning rate (default: 0.01)')
    parser.add_argument('--momentum', type=float, default=0.5, metavar='M',
                        help='SGD momentum (default: 0.5)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--log-interval', type=int, default=10, metavar='N',
                        help='how many batches to wait before logging training status')

    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')

    parser.add_argument('--use-ort', action='store_true', default=False,
                        help='to use onnxruntime as training backend')

    parser.add_argument('--use-ort-trainer', action='store_true', default=False,
                        help='to use onnxruntime ORTTrainer (instead of ORTModel) as training backend')

    args = parser.parse_args()
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    torch.manual_seed(args.seed)

    kwargs = {'num_workers': 0, 'pin_memory': True}
    train_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=True, download=True,
                       transform=transforms.Compose([
                           transforms.ToTensor(),
                           transforms.Normalize((0.1307,), (0.3081,))
                       ])),
        batch_size=args.batch_size, shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(
        datasets.MNIST('../data', train=False, transform=transforms.Compose([
            transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,))])),
        batch_size=args.test_batch_size, shuffle=True, **kwargs)


    comm = MPI.COMM_WORLD
    args.local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) if ('OMPI_COMM_WORLD_LOCAL_RANK' in os.environ) else 0
    args.world_rank = int(os.environ['OMPI_COMM_WORLD_RANK']) if ('OMPI_COMM_WORLD_RANK' in os.environ) else 0
    args.world_size = comm.Get_size()
    if use_cuda:
        torch.cuda.set_device(args.local_rank)
        device = torch.device("cuda", args.local_rank)
        args.n_gpu = 1
        set_cuda_device_id(args.local_rank)
    else:
        device = torch.device("cpu")

    input_size = 784
    hidden_size = 500
    num_classes = 10
    model = NeuralNet(input_size, hidden_size, num_classes)

    model_desc = mnist_model_description()
    if args.use_ort_trainer:
        # use log_interval as gradient accumulate steps
        trainer = ORTTrainer(model, my_loss, model_desc, "LambOptimizer", None,
                             IODescription('Learning_Rate', [1], torch.float32),
                             device, 1, None, args.world_rank, args.world_size,
                             use_mixed_precision=False,
                             allreduce_post_accumulation=True)
        print('\nBuild ort model done.')

        for epoch in range(1, args.epochs + 1):
            train_with_trainer(args, trainer, device, train_loader, epoch)
            test_with_trainer(args, trainer, device, test_loader)
    else:
        model = ORTModel(model, my_loss, model_desc, device, None, args.world_rank, args.world_size)
        print('\nBuild ort model done.')

        optimizer = optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

        for epoch in range(1, args.epochs + 1):
            train_with_model(args, model, device, train_loader, optimizer, epoch)
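
train_with_trainer and train_with_model are defined earlier in this script. A rough sketch of train_with_trainer consistent with how it is called above is shown below; the flattening of each 28x28 image to 784 features, the tensor-wrapped learning rate and the logging format are assumptions.

import torch


def train_with_trainer(args, trainer, device, train_loader, epoch):
    # Sketch only: drive the legacy ORTTrainer one batch at a time.
    for batch_idx, (data, target) in enumerate(train_loader):
        data = data.reshape(data.shape[0], -1).to(device)  # 28x28 -> 784 features
        target = target.to(device)
        learning_rate = torch.tensor([args.lr])

        outputs = trainer.train_step(data, target, learning_rate)
        # model_desc lists (loss, probability) as outputs, so take the first one
        loss = outputs[0] if isinstance(outputs, (list, tuple)) else outputs

        if batch_idx % args.log_interval == 0:
            print("Train Epoch: {} [{}/{}]\tLoss: {:.6f}".format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                loss.item()))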
Code example #30
model_desc = mnist_model_description()
# use log_interval as gradient accumulate steps

with open("models/mnist_ort_ONNX.pt", "rb") as f:
    bin_str = f.read()

model = onnx.load_model_from_string(bin_str)

trainer = ORTTrainer(model,
                     None,
                     model_desc,
                     "LambOptimizer",
                     None,
                     IODescription('Learning_Rate', [1], torch.float32),
                     device,
                     gradient_accumulation_steps=1,
                     world_rank=world_rank,
                     world_size=world_size,
                     use_mixed_precision=False,
                     allreduce_post_accumulation=True)
print('\nBuild ort model done.')

ort_sd = trainer.state_dict()

printSizes(torch_model, "PyTorch")
printSizes(trainer, "ORT")

compareModels(torch_model, trainer)
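
printSizes and compareModels are defined earlier in this script. Hypothetical versions consistent with how they are used above (comparing the PyTorch module's state_dict with the ORT trainer's) could look like this; the tolerance values and the tensor/array handling are assumptions.

import numpy as np
import torch


def _as_numpy(value):
    # state_dict values may be torch tensors or numpy arrays depending on the backend
    return value.detach().cpu().numpy() if isinstance(value, torch.Tensor) else np.asarray(value)


def printSizes(model_or_trainer, tag):
    state = model_or_trainer.state_dict()
    total = sum(_as_numpy(v).size for v in state.values())
    print("{}: {} tensors, {} parameters".format(tag, len(state), total))


def compareModels(torch_model, ort_trainer, rtol=1e-4, atol=1e-5):
    torch_state = {k: _as_numpy(v) for k, v in torch_model.state_dict().items()}
    ort_state = {k: _as_numpy(v) for k, v in ort_trainer.state_dict().items()}
    for name, torch_value in torch_state.items():
        if name not in ort_state:
            print("{}: missing from ORT state_dict".format(name))
            continue
        matches = np.allclose(torch_value, ort_state[name], rtol=rtol, atol=atol)
        print("{}: {}".format(name, "match" if matches else "MISMATCH"))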