# Example No. 1 (score: 0)
def test_repeating_loader():
    """RepeatingLoader must cycle endlessly over the wrapped iterable."""
    loader = RepeatingLoader([1, 2, 3])

    # 50 full passes: the loader restarts at 1 after yielding 3 each time.
    for _cycle in range(50):
        observed = [next(loader), next(loader), next(loader)]
        assert observed == [1, 2, 3]
        def _run():
            """Round-trip save/load check for a pipeline-parallel GPT2 model.

            Closure: relies on self, mp_size, pp_size, world_size and tmpdir
            from the enclosing scope (not visible in this chunk). Saves a
            checkpoint, reloads it, and asserts eval outputs are unchanged.
            """
            # Model hyperparameters forwarded to GPT2ModelPipe.
            args_defaults = {
                'num_layers': 8,
                'hidden_size': 128,
                'num_attention_heads': 8,
                'max_position_embeddings': 128,
            }

            # Build the process topology and the pipeline model, then wrap
            # it in a DeepSpeed engine.
            topo = self.get_topology(mp_size, pp_size, world_size)
            gpt2_pipe_model = GPT2ModelPipe(num_layers=8,
                                            num_stages=pp_size,
                                            mp_size=mp_size,
                                            args_others=args_defaults,
                                            topo=topo)
            model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir)

            # Save a checkpoint, recording the Megatron checkpoint version
            # in the client state so the loader can interpret it later.
            tag = 'pp_basic'
            state_dict = {}
            state_dict['checkpoint_version'] = get_megatron_version()
            model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)

            # Only the first/last pipeline stages feed/consume data;
            # intermediate stages pass data_iter=None.
            if model.is_first_stage() or model.is_last_stage():
                inputs = self.get_inputs()
                loader = RepeatingLoader([(inputs[0], 0)])
                data_iter = iter(loader)
            else:
                data_iter = None

            # Reference forward pass taken before reloading the checkpoint.
            baseline = model.eval_batch(data_iter=data_iter,
                                        compute_loss=False,
                                        reduce_output=None)

            # Reload just-saved weights (optimizer/LR state skipped); the
            # barriers keep all ranks in step around the load.
            dist.barrier()
            model.load_checkpoint(tmpdir,
                                  tag=tag,
                                  load_optimizer_states=False,
                                  load_lr_scheduler_states=False)
            dist.barrier()

            test = model.eval_batch(data_iter=data_iter,
                                    compute_loss=False,
                                    reduce_output=None)

            # Ranks that produced output compare it microbatch-by-microbatch
            # against the pre-reload baseline; float tensors only.
            if test is not None:
                assert len(baseline) == len(test)
                # Compare outputs of each microbatch
                for mb in range(len(baseline)):
                    for b, t in zip(baseline[mb], test[mb]):
                        if b.is_floating_point():  # don't compare masks
                            assert torch.allclose(
                                b, t, atol=1e-07
                            ), f"Baseline output {baseline} is not equal to save-then-load output {test}"
        def _run_resize(inputs, tag, output, quit_event):
            """Reload a checkpoint under a resized mp/pp topology and emit output.

            Closure: relies on self, mp_resize, pp_resize and tmpdir from the
            enclosing scope (not visible in this chunk).

            Args:
                inputs: sequence of tensors; inputs[0] and inputs[1] form the batch.
                tag: checkpoint tag to load.
                output: queue that receives a CPU copy of the eval output tensor.
                quit_event: event the caller sets when this worker may exit.
            """
            reset_random()
            # Same model hyperparameters as the baseline run.
            args_defaults = {
                'num_layers': 8,
                'hidden_size': 128,
                'num_attention_heads': 8,
                'max_position_embeddings': 128,
            }

            # Topology built from the *resized* parallelism degrees.
            topo = self.get_topology(mp_resize, pp_resize,
                                     mp_resize * pp_resize)
            gpt2_pipe_model = GPT2ModelPipe(num_layers=8,
                                            num_stages=pp_resize,
                                            mp_size=mp_resize,
                                            args_others=args_defaults,
                                            topo=topo)
            model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir)

            with torch.no_grad():
                # Weights only: optimizer/LR state is irrelevant for eval.
                model.load_checkpoint(tmpdir,
                                      tag=tag,
                                      load_optimizer_states=False,
                                      load_lr_scheduler_states=False)
                inputs = [x.cuda() for x in inputs]
                # Only the first/last pipeline stages handle data.
                if model.is_first_stage() or model.is_last_stage():
                    loader = RepeatingLoader([((inputs[0], inputs[1]), 0)])
                    data_iter = iter(loader)
                else:
                    data_iter = None

                test = model.eval_batch(data_iter=data_iter,
                                        compute_loss=False,
                                        reduce_output=None)

                if test is not None:
                    # test should be [[hidden, True]]]
                    assert len(test) == 1
                    assert len(test[0]) == 2
                    assert torch.is_tensor(test[0][0])
                    assert test[0][1].numel() == 1
                    output.put(test[0][0].cpu())

            # Keep the process alive until the parent has consumed the queue.
            quit_event.wait()
        def _run_baseline(inputs, tag, output, quit_event):
            """Produce the reference eval output and save a checkpoint for it.

            Closure: relies on self, mp_size, pp_size and tmpdir from the
            enclosing scope (not visible in this chunk).

            Args:
                inputs: sequence of tensors; inputs[0] and inputs[1] form the batch.
                tag: checkpoint tag to save under.
                output: queue that receives a CPU copy of the eval output tensor.
                quit_event: event the caller sets when this worker may exit.
            """
            reset_random()
            # Model hyperparameters forwarded to GPT2ModelPipe.
            args_defaults = {
                'num_layers': 8,
                'hidden_size': 128,
                'num_attention_heads': 8,
                'max_position_embeddings': 128,
            }

            # Topology built from the original (pre-resize) parallelism degrees.
            topo = self.get_topology(mp_size, pp_size, mp_size * pp_size)
            gpt2_pipe_model = GPT2ModelPipe(num_layers=8,
                                            num_stages=pp_size,
                                            mp_size=mp_size,
                                            args_others=args_defaults,
                                            topo=topo)
            model = self.get_deepspeed_model(gpt2_pipe_model, tmpdir)

            with torch.no_grad():
                inputs = [x.cuda() for x in inputs]
                # Only the first/last pipeline stages handle data.
                if model.is_first_stage() or model.is_last_stage():
                    loader = RepeatingLoader([((inputs[0], inputs[1]), 0)])
                    data_iter = iter(loader)
                else:
                    data_iter = None

                baseline = model.eval_batch(data_iter=data_iter,
                                            compute_loss=False,
                                            reduce_output=None)

                if baseline is not None:
                    # baseline should be [[hidden, True]]]
                    assert len(baseline) == 1
                    assert len(baseline[0]) == 2
                    assert torch.is_tensor(baseline[0][0])
                    assert baseline[0][1].numel() == 1
                    output.put(baseline[0][0].cpu())

                # Save a checkpoint (with the Megatron version in client
                # state) for the resize run to reload, then wait for the
                # parent to signal shutdown.
                state_dict = {}
                state_dict['checkpoint_version'] = get_megatron_version()
                model.save_checkpoint(tmpdir, tag=tag, client_state=state_dict)
                quit_event.wait()
# Example No. 5 (score: 0)
def train_base(args):
    """Train AlexNet on CIFAR-10 with a plain (non-pipeline) DeepSpeed engine.

    Args:
        args: parsed CLI namespace; reads .seed, .local_rank and .steps here,
              and is also forwarded to deepspeed.initialize.
    """
    torch.manual_seed(args.seed)

    # VGG also works :-)
    #net = vgg19(num_classes=10)
    net = AlexNet(num_classes=10)

    trainset = cifar_trainset(args.local_rank)

    engine, _, dataloader, __ = deepspeed.initialize(
        args=args,
        model=net,
        model_parameters=[p for p in net.parameters() if p.requires_grad],
        training_data=trainset)

    # Wrap the loader so iteration never exhausts the dataset.
    dataloader = RepeatingLoader(dataloader)
    data_iter = iter(dataloader)

    rank = dist.get_rank()
    # FIX: `gas` was computed but never used; the original re-queried
    # engine.gradient_accumulation_steps() twice below. Use it instead.
    gas = engine.gradient_accumulation_steps()

    criterion = torch.nn.CrossEntropyLoss()

    # One optimizer step per `gas` micro-batches.
    total_steps = args.steps * gas
    step = 0
    for micro_step in range(total_steps):
        batch = next(data_iter)
        inputs = batch[0].to(engine.device)
        labels = batch[1].to(engine.device)

        outputs = engine(inputs)
        loss = criterion(outputs, labels)
        engine.backward(loss)
        engine.step()

        # Count a full step once per accumulation window; rank 0 logs
        # every 10th step.
        if micro_step % gas == 0:
            step += 1
            if rank == 0 and (step % 10 == 0):
                print(f'step: {step:3d} / {args.steps:3d} loss: {loss}')
# Example No. 6 (score: 0)
    def _helper():
        """Check that a 4-stage pipeline model reproduces the sequential model.

        Closure: relies on sequential_model, batch_input and simple_args from
        the enclosing scope (not visible in this chunk).
        """
        # Reference: run the unmodified sequential model on a detached copy
        # of the input.
        base_model = copy.deepcopy(sequential_model)
        base_input = batch_input.clone().detach()
        base_output = base_model(base_input)
        base_params = sum(p.numel() for p in base_model.parameters())

        pipe_model = copy.deepcopy(sequential_model)
        pipe_model = PipelineModule(layers=pipe_model, num_stages=4)

        # Ensure all parameters are accounted for across pipeline ranks.
        my_params = sum(p.numel() for p in pipe_model.parameters())
        total_pipe_params = torch.LongTensor([my_params]).to('cuda')
        dist.all_reduce(total_pipe_params)
        total_pipe_params = total_pipe_params.item()
        assert total_pipe_params == base_params

        pipe_model, _, _, _ = deepspeed.initialize(
            args=simple_args,
            model=pipe_model,
            model_parameters=[p for p in pipe_model.parameters()])

        # FIX: is_first_stage/is_last_stage are methods; without the call
        # parentheses the bound-method objects are always truthy, so every
        # rank built a data iterator. Call them, matching usage elsewhere
        # in this file.
        if pipe_model.is_first_stage() or pipe_model.is_last_stage():
            pipe_input = base_input.clone().detach().to('cuda')
            # label 0 is meaningless
            dataset = [(pipe_input, 0)]
            loader = RepeatingLoader(dataset)
            data_iter = iter(loader)
        else:
            data_iter = None

        pipe_output = pipe_model.eval_batch(data_iter=data_iter)

        # Compare on CPU. (The no-op `base_output = base_output` from the
        # original was dropped.)
        base_output = base_output.to('cpu')
        pipe_output = pipe_output.to('cpu')

        assert torch.allclose(base_output, pipe_output)