Example #1
    def _helper(topo, tmpdir, steps=500):
        assert steps >= 100

        base_net = copy.deepcopy(init_net)
        base_model = PipelineModule(layers=base_net.to_layers(),
                                    num_stages=1,
                                    loss_fn=nn.CrossEntropyLoss())

        # Train with just data parallelism
        base_losses = train_cifar(base_model,
                                  args,
                                  num_steps=steps,
                                  fp16=config_dict['fp16']['enabled'])

        # Train the same initial weights with the test parallel topology
        test_net = copy.deepcopy(init_net)
        test_model = PipelineModule(layers=test_net.to_layers(),
                                    topology=topo,
                                    loss_fn=nn.CrossEntropyLoss())

        #test_model = AlexNetPipe(num_classes=10,
        #                         topology=test_topo,
        #                         seed_layers=config_dict['pipeline']['seed_layers'])
        test_losses = train_cifar(test_model,
                                  args,
                                  num_steps=steps,
                                  fp16=config_dict['fp16']['enabled'])

        abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)]
        rel_diffs = [
            rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)
        ]
        if dist.get_rank() == 0:
            print(
                f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}'
            )
            print(
                f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}'
            )
            print(
                f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}'
            )

            for lastX in [1, 10, 100]:
                base_avg = sum(base_losses[-lastX:]) / lastX
                test_avg = sum(test_losses[-lastX:]) / lastX
                print(
                    f'last-{lastX}: base={base_avg} test={test_avg} abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}'
                )

        lastX = 100
        base = base_losses[-lastX:]
        base_avg = sum(base) / len(base)
        test = test_losses[-lastX:]
        test_avg = sum(test) / len(test)
        assert rel_diff(base_avg, test_avg) < 0.03
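
The helper above compares per-step losses via a rel_diff function that is not defined in these excerpts. A minimal sketch consistent with how it is called (scalar loss values, used both per step and on trailing averages) might be:

def rel_diff(test, base):
    """Relative difference between a test value and a baseline value."""
    return abs(test - base) / abs(base)

Dividing by the baseline keeps the 3% tolerance independent of the absolute loss scale.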
Example #2
    def _helper(topo, tmpdir, steps=500):
        assert steps >= 100

        # Train with pipeline parallelism using the given topology
        test_net = copy.deepcopy(init_net)
        test_model = PipelineModule(layers=test_net.to_layers(),
                                    topology=topo,
                                    loss_fn=nn.CrossEntropyLoss())

        test_losses = train_cifar(test_model,
                                  args,
                                  num_steps=steps,
                                  fp16=config_dict['fp16']['enabled'])
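
Both _helper variants rely on init_net, args, config_dict, and train_cifar from the enclosing test scope, so they are not runnable on their own. A hypothetical invocation, assuming the PipeTopo alias shown in Examples #3 and #4 and illustrative topology sizes:

# Hypothetical driver; 2 pipeline stages x 2 data-parallel replicas = 4 ranks.
topo = PipeTopo(num_pp=2, num_dp=2)
_helper(topo, tmpdir, steps=500)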
Example #3
    def test(self, topo_config):
        config_dict = {
            "train_batch_size": 16,
            "train_micro_batch_size_per_gpu": 4,
            "steps_per_print": 20,
            "optimizer": {
                "type": "ZeroOneAdam",
                "params": {
                    "lr": 0.00001,
                    "betas": [0.9, 0.999],
                    "eps": 1e-8,
                    "weight_decay": 3e-7,
                    "var_freeze_step": 4,
                    "var_update_scaler": 1,
                    "local_step_scaler": 1,
                    "local_step_clipper": 2,
                    "cuda_aware": False,
                    "comm_backend_name": "nccl",
                },
            },
            "gradient_clipping": 1.0,
            "zero_optimization": {
                "stage": 0
            },
            "fp16": {
                "enabled": True,
                "loss_scale": 0,
                "initial_scale_power": 16
            },
            "pipeline": {
                "seed_layers": True,
                "activation_checkpoint_interval": 1
            },
        }

        topo = PipeTopo(**topo_config)
        steps = 500  # Must be >=100

        # Allocate model for consistent initial weights.
        init_net = AlexNetPipe()

        test_net = copy.deepcopy(init_net)
        test_model = PipelineModule(layers=test_net.to_layers(),
                                    topology=topo,
                                    loss_fn=nn.CrossEntropyLoss())

        test_losses = train_cifar(
            test_model,
            config=config_dict,
            num_steps=steps,
            fp16=config_dict["fp16"]["enabled"],
        )
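
PipeTopo is presumably an alias for DeepSpeed's combined pipeline/data-parallel topology class, and topo_config arrives through test parametrization. A sketch of that assumed setup (the alias and the exact parameter sets are assumptions, not shown in the excerpt):

import pytest
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology

PipeTopo = PipeDataParallelTopology  # assumed alias used by these tests

# Illustrative ways to split 4 ranks between pipeline stages (num_pp)
# and data-parallel replicas (num_dp).
@pytest.mark.parametrize("topo_config", [
    {"num_pp": 1, "num_dp": 4},
    {"num_pp": 2, "num_dp": 2},
    {"num_pp": 4, "num_dp": 1},
])
def test(topo_config):
    topo = PipeTopo(**topo_config)  # as in the example body above
    ...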
Example #4
    def test(self, topo_config):
        config_dict = {
            "train_batch_size": 16,
            "train_micro_batch_size_per_gpu": 4,
            "steps_per_print": 20,
            "optimizer": {
                "type": "Adam",
                "params": {
                    "lr": 0.001,
                    "betas": [0.9, 0.999],
                    "eps": 1e-8,
                    "weight_decay": 3e-7
                }
            },
            "zero_optimization": {
                "stage": 0
            },
            "fp16": {
                "enabled": False
            },
            "pipeline": {
                "seed_layers": True,
                "activation_checkpoint_interval": 1
            }
        }

        topo = PipeTopo(**topo_config)
        steps = 500  # must be >=100

        # Allocate model for consistent initial weights.
        init_net = AlexNetPipe()

        base_net = copy.deepcopy(init_net)
        base_model = PipelineModule(layers=base_net.to_layers(),
                                    num_stages=1,
                                    loss_fn=nn.CrossEntropyLoss())

        # Train with just data parallelism
        base_losses = train_cifar(base_model,
                                  config=config_dict,
                                  num_steps=steps,
                                  fp16=config_dict['fp16']['enabled'])

        test_net = copy.deepcopy(init_net)
        test_model = PipelineModule(layers=test_net.to_layers(),
                                    topology=topo,
                                    loss_fn=nn.CrossEntropyLoss())

        test_losses = train_cifar(test_model,
                                  config=config_dict,
                                  num_steps=steps,
                                  fp16=config_dict['fp16']['enabled'])

        abs_diffs = [l0 - l1 for l0, l1 in zip(base_losses, test_losses)]
        rel_diffs = [
            rel_diff(l0, l1) for l0, l1 in zip(base_losses, test_losses)
        ]
        if dist.get_rank() == 0:
            print(
                f'abs min={min(abs_diffs)} max={max(abs_diffs)} avg={sum(abs_diffs)/len(abs_diffs)}'
            )
            print(
                f'rel min={min(rel_diffs)} max={max(rel_diffs)} avg={sum(rel_diffs)/len(rel_diffs)}'
            )
            print(
                f'first: base={base_losses[0]} test={test_losses[0]} abs={abs_diffs[0]} rel={rel_diffs[0]}'
            )

            for lastX in [1, 10, 100]:
                base_avg = sum(base_losses[-lastX:]) / lastX
                test_avg = sum(test_losses[-lastX:]) / lastX
                print(
                    f'last-{lastX}: base={base_avg} test={test_avg} abs={base_avg - test_avg} rel={rel_diff(base_avg, test_avg)}'
                )

        lastX = 100
        base = base_losses[-lastX:]
        base_avg = sum(base) / len(base)
        test = test_losses[-lastX:]
        test_avg = sum(test) / len(test)
        assert rel_diff(
            base_avg, test_avg
        ) < 0.05  # Relaxed from the original 0.03 due to instability observed on AMD hardware
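
The final assertion compares 100-step trailing averages of the two loss curves. Factored into a standalone helper (a sketch; the name and signature are illustrative), the acceptance criterion is:

def losses_converged(base_losses, test_losses, last_x=100, tol=0.05):
    # Average the last `last_x` losses to smooth step-to-step noise,
    # then require the averages to agree within a relative tolerance.
    base_avg = sum(base_losses[-last_x:]) / last_x
    test_avg = sum(test_losses[-last_x:]) / last_x
    return abs(test_avg - base_avg) / abs(base_avg) < tol

Averaging the tail of the curve makes the check reflect converged behavior rather than the noise of any single batch.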