Example #1
def main():
    torch.manual_seed(config["seed"] + config["rank"])
    np.random.seed(config["seed"] + config["rank"])

    # Run on the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(0)

    timer = Timer(verbosity_level=config["log_verbosity"], log_fn=log_metric)

    init_distributed_pytorch()
    assert config["n_workers"] == torch.distributed.get_world_size()

    if torch.distributed.get_rank() == 0:
        if config["task"] == "Cifar":
            download_cifar()
        elif config["task"] == "LSTM":
            download_wikitext2()
    torch.distributed.barrier()

    task = tasks.build(task_name=config["task"],
                       device=device,
                       timer=timer,
                       **config)

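    # Two parameter groups: batch-norm parameters are exempt from weight decay,
    # while all other parameters use the weight_decay value configured below.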
    local_optimizer = torch.optim.SGD(
        [
            {
                "params": [
                    p for p, name in zip(task.state, task.parameter_names)
                    if parameter_type(name) == "batch_norm"
                ],
                "weight_decay":
                0.0,
            },
            {
                "params": [
                    p for p, name in zip(task.state, task.parameter_names)
                    if parameter_type(name) != "batch_norm"
                ]
            },
        ],
        lr=config["learning_rate"],  # to correct for summed up gradients
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=(config["momentum"] > 0),
    )

    scheduler = torch.optim.lr_scheduler.LambdaLR(local_optimizer,
                                                  learning_rate_schedule)

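    # Build the communication topology and wrap the local SGD step in the
    # distributed optimizer that handles communication between workers.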
    topology = get_topology()
    optimizer = get_optimizer(timer, topology, task.state,
                              local_optimizer.step)

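    # The LSTM data loaders can yield a different number of batches per worker,
    # so take the minimum across workers to keep the step count in sync.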
    if "LSTM" in config["task"]:
        train_loader = task.train_iterator(config["batch_size"])
        batches_per_epoch = torch.tensor(len(train_loader))
        torch.distributed.all_reduce(batches_per_epoch,
                                     op=torch.distributed.ReduceOp.MIN)
        batches_per_epoch = batches_per_epoch.item()

    for epoch in range(config["num_epochs"]):
        timer.epoch = epoch

        epoch_metrics = MeanAccumulator()

        if not "LSTM" in config["task"]:
            train_loader = task.train_iterator(config["batch_size"])
            batches_per_epoch = len(train_loader)

        with timer("epoch.body"):
            my_rank = torch.distributed.get_rank()
            print(
                f"Worker {my_rank} starting epoch {epoch} with {len(train_loader)} batches"
            )
            for i, batch in enumerate(train_loader):
                if i >= batches_per_epoch:
                    break
                epoch_frac = epoch + i / batches_per_epoch
                scheduler.step(
                    epoch + (i + 1) /
                    batches_per_epoch)  # for compatibility with Choco code
                timer.epoch = epoch_frac
                info({"state.progress": epoch_frac / config["num_epochs"]})

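                # The distributed optimizer evaluates the loss/gradient closure,
                # communicates with the other workers, and applies the update.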
                metrics = optimizer.step(
                    lambda: task.batch_loss_and_gradient(batch))
                epoch_metrics.add(metrics)

        with timer("epoch.post"):
            for key, value in epoch_metrics.value().items():
                log_metric(
                    key,
                    {
                        "value": value.item(),
                        "epoch": epoch + 1.0,
                        "bits": optimizer.bits_sent,
                        "messages": optimizer.messages_sent,
                    },
                    tags={"split": "train"},
                )

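        # Optionally log the singular-value spectrum of the parameter difference
        # between configured (receiver, sender) worker pairs.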
        if (epoch + 1) in config["spectrum_logging_epochs"]:
            with timer("spectrum_logging"):
                print("spectrum logging at epoch {}".format(epoch + 1))
                my_rank = torch.distributed.get_rank()
                for working_node, sending_node in config[
                        "spectrum_logging_worker_pairs"]:
                    for param, name in zip(task.state, task.parameter_names):
                        # print(name)
                        if name in config["spectrum_logging_params"]:
                            if my_rank == sending_node:
                                print(f"{my_rank} sending {name}")
                                torch.cuda.synchronize()
                                torch.distributed.send(param, working_node)
                            elif my_rank == working_node:
                                print(f"{my_rank} receiving {name}")
                                other_workers_param = torch.empty_like(param)
                                torch.cuda.synchronize()
                                torch.distributed.recv(other_workers_param,
                                                       sending_node)
                                u, s, v = torch.svd(
                                    (param - other_workers_param).view(
                                        param.shape[0], -1).cpu())
                                for i, val in enumerate(s):
                                    print(f"{i} / {val.cpu().item()}")
                                    log_metric(
                                        "spectrum",
                                        {
                                            "value": val.cpu().item(),
                                            "index": i
                                        },
                                        tags={
                                            "workers":
                                            f"{working_node}-{sending_node}",
                                            "parameter": name,
                                            "epoch": epoch + 1,
                                        },
                                    )
                                del u, s, v

        with timer("epoch.test"):
            test_stats = task.test()
            for key, value in test_stats.items():
                log_metric(
                    key,
                    {
                        "value": value.item(),
                        "epoch": epoch + 1.0,
                        "bits": optimizer.bits_sent,
                        "messages": optimizer.messages_sent,
                    },
                    tags={"split": "test"},
                )

            # Compute and test the average model + consensus distance
            buffer, shapes = pack(
                [t.float() for t in task.state_dict().values()])
            local_buffer = buffer.clone()
            torch.distributed.all_reduce(buffer)
            buffer /= torch.distributed.get_world_size()
            if torch.distributed.get_rank() == 0:
                log_metric(
                    "consensus_distance",
                    {
                        "value": (local_buffer - buffer).norm().item(),
                        "epoch": epoch + 1.0
                    },
                    {"type": "full_state_vector"},
                )
                if config["evaluate_average_model"]:
                    avg_model = {
                        key: value
                        for key, value in zip(task.state_dict().keys(),
                                              unpack(buffer, shapes))
                    }
                    test_stats = task.test(state_dict=avg_model)
                    for key, value in test_stats.items():
                        log_metric(
                            key,
                            {
                                "value": value.item(),
                                "epoch": epoch + 1.0,
                                "bits": optimizer.bits_sent,
                                "messages": optimizer.messages_sent,
                            },
                            tags={"split": "test_avg"},
                        )
            del local_buffer, buffer, shapes

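        # Consensus distance over the trainable parameters only, as opposed to
        # the full state vector measured above.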
        params_flat, shapes = pack(task.state)
        avg_params_flat = params_flat.clone()
        torch.distributed.all_reduce(avg_params_flat)
        avg_params_flat /= torch.distributed.get_world_size()
        if torch.distributed.get_rank() == 0:
            log_metric(
                "consensus_distance",
                {
                    "value": (params_flat - avg_params_flat).norm().item(),
                    "epoch": epoch + 1.0
                },
                {"type": "params_only"},
            )
        del params_flat, shapes, avg_params_flat

        for entry in timer.transcript():
            log_runtime(entry["event"], entry["mean"], entry["std"],
                        entry["instances"])

    info({"state.progress": 1.0})
Example #2
def main():
    torch.manual_seed(config["seed"] + config["rank"])
    np.random.seed(config["seed"] + config["rank"])

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    timer = Timer(verbosity_level=config["log_verbosity"], log_fn=metric)

    if torch.distributed.is_available():
        if config["distributed_init_file"] is None:
            config["distributed_init_file"] = os.path.join(
                output_dir, "dist_init")
        print("Distributed init: rank {}/{} - {}".format(
            config["rank"], config["n_workers"],
            config["distributed_init_file"]))
        torch.distributed.init_process_group(
            backend=config["distributed_backend"],
            init_method="file://" +
            os.path.abspath(config["distributed_init_file"]),
            timeout=datetime.timedelta(seconds=120),
            world_size=config["n_workers"],
            rank=config["rank"],
        )

    task = tasks.build(task_name=config["task"],
                       device=device,
                       timer=timer,
                       **config)
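    # The reducer implements the (possibly compressed) gradient aggregation
    # across workers and reports the number of bits it communicated.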
    reducer = get_reducer(device, timer)

    bits_communicated = 0
    runavg_model = MeanAccumulator()

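    # Per-parameter buffers: error-feedback memories, momentum, and the send
    # buffers that are handed to the reducer each step.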
    memories = [torch.zeros_like(param) for param in task.state]
    momenta = [torch.empty_like(param)
               for param in task.state]  # need initialization
    send_buffers = [torch.zeros_like(param) for param in task.state]
    for epoch in range(config["num_epochs"]):
        epoch_metrics = MeanAccumulator()
        info({
            "state.progress": float(epoch) / config["num_epochs"],
            "state.current_epoch": epoch
        })

        # This seems fine ...
        # check_model_consistency_across_workers(task._model, epoch)

        # Determine per-parameter optimization parameters
        wds = [get_weight_decay(epoch, name) for name in task.parameter_names]

        # Reset running average of the model
        if epoch % config["average_reset_epoch_interval"] == 0:
            runavg_model.reset()

        train_loader = task.train_iterator(config["optimizer_batch_size"])
        for i, batch in enumerate(train_loader):
            epoch_frac = epoch + i / len(train_loader)
            lrs = [
                get_learning_rate(epoch_frac, name)
                for name in task.parameter_names
            ]

            with timer("batch", epoch_frac):
                _, grads, metrics = task.batch_loss_and_gradient(batch)
                epoch_metrics.add(metrics)

                # Compute some derived metrics from the raw gradients
                with timer("batch.reporting.lr", epoch_frac, verbosity=2):
                    for name, param, grad, lr in zip(task.parameter_names,
                                                     task.state, grads, lrs):
                        if np.random.rand() < 0.001:  # with a small probability
                            tags = {"weight": name.replace("module.", "")}
                            metric(
                                "effective_lr",
                                {
                                    "epoch":
                                    epoch_frac,
                                    "value":
                                    lr / max(l2norm(param).item()**2, 1e-8),
                                },
                                tags,
                            )
                            metric(
                                "grad_norm",
                                {
                                    "epoch": epoch_frac,
                                    "value": l2norm(grad).item()
                                },
                                tags,
                            )

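                # Weight decay and momentum can each be applied either before
                # or after the gradient reduction, selected by the config flags.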
                if config["optimizer_wd_before_reduce"]:
                    with timer("batch.weight_decay", epoch_frac, verbosity=2):
                        for grad, param, wd in zip(grads, task.state, wds):
                            if wd > 0:
                                grad.add_(param.detach(), alpha=wd)

                if config["optimizer_mom_before_reduce"]:
                    with timer("batch.momentum", epoch_frac, verbosity=2):
                        for grad, momentum in zip(grads, momenta):
                            if epoch == 0 and i == 0:
                                momentum.data = grad.clone().detach()
                            else:
                                if (config["optimizer_momentum_type"] ==
                                        "exponential_moving_average"):
                                    momentum.mul_(
                                        config["optimizer_momentum"]).add_(
                                            grad,
                                            alpha=1 -
                                            config["optimizer_momentum"])
                                else:
                                    momentum.mul_(
                                        config["optimizer_momentum"]).add_(
                                            grad)
                            replace_grad_by_momentum(grad, momentum)

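                # Error feedback: fold the residual left over from the previous
                # compression step into the gradient before it is reduced.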
                with timer("batch.accumulate", epoch_frac, verbosity=2):
                    for grad, memory, send_bfr in zip(grads, memories,
                                                      send_buffers):
                        if config["optimizer_memory"]:
                            send_bfr.data[:] = grad + memory
                        else:
                            send_bfr.data[:] = grad

                with timer("batch.reduce", epoch_frac):
                    # Set 'grads' to the averaged value from the workers
                    bits_communicated += reducer.reduce(
                        send_buffers, grads, memories)

                if config["optimizer_memory"]:
                    with timer("batch.reporting.compr_err", verbosity=2):
                        for name, memory, send_bfr in zip(
                                task.parameter_names, memories, send_buffers):
                            if np.random.rand() < 0.001:
                                tags = {"weight": name.replace("module.", "")}
                                rel_compression_error = l2norm(
                                    memory) / l2norm(send_bfr)
                                metric(
                                    "rel_compression_error",
                                    {
                                        "epoch": epoch_frac,
                                        "value": rel_compression_error.item()
                                    },
                                    tags,
                                )

                if not config["optimizer_wd_before_reduce"]:
                    with timer("batch.wd", epoch_frac, verbosity=2):
                        for grad, param, wd in zip(grads, task.state, wds):
                            if wd > 0:
                                grad.add_(param.detach(), alpha=wd)

                if not config["optimizer_mom_before_reduce"]:
                    with timer("batch.mom", epoch_frac, verbosity=2):
                        for grad, momentum in zip(grads, momenta):
                            if epoch == 0 and i == 0:
                                momentum.data = grad.clone().detach()
                            else:
                                if (config["optimizer_momentum_type"] ==
                                        "exponential_moving_average"):
                                    momentum.mul_(
                                        config["optimizer_momentum"]).add_(
                                            grad,
                                            alpha=1 -
                                            config["optimizer_momentum"])
                                else:
                                    momentum.mul_(
                                        config["optimizer_momentum"]).add_(
                                            grad)
                            replace_grad_by_momentum(grad, momentum)

                with timer("batch.step", epoch_frac, verbosity=2):
                    for param, grad, lr in zip(task.state, grads, lrs):
                        param.data.add_(grad, alpha=-lr)

                if config["fix_conv_weight_norm"]:
                    with timer("batch.normfix", epoch_frac, verbosity=2):
                        for param_name, param in zip(task.parameter_names,
                                                     task.state):
                            if is_conv_param(param_name):
                                param.data[:] /= l2norm(param)

                with timer("batch.update_runavg", epoch_frac, verbosity=2):
                    #print(type(task.state_dict()))
                    #print(isinstance(task.state_dict(), dict))
                    runavg_model.add(task.state_dict())

                if config["optimizer_memory"]:
                    with timer("batch.reporting.memory_norm",
                               epoch_frac,
                               verbosity=2):
                        if np.random.rand() < 0.001:
                            sum_of_sq = 0.0
                            for parameter_name, memory in zip(
                                    task.parameter_names, memories):
                                tags = {
                                    "weight":
                                    parameter_name.replace("module.", "")
                                }
                                sq_norm = torch.sum(memory**2)
                                sum_of_sq += torch.sqrt(sq_norm)
                                metric(
                                    "memory_norm",
                                    {
                                        "epoch": epoch_frac,
                                        "value": torch.sqrt(sq_norm).item()
                                    },
                                    tags,
                                )
                            metric(
                                "compression_error",
                                {
                                    "epoch": epoch_frac,
                                    "value": torch.sqrt(sum_of_sq).item()
                                },
                            )

        with timer("epoch_metrics.collect", epoch + 1.0, verbosity=2):
            epoch_metrics.reduce()
            for key, value in epoch_metrics.value().items():
                metric(
                    key,
                    {
                        "value": value,
                        "epoch": epoch + 1.0,
                        "bits": bits_communicated
                    },
                    tags={"split": "train"},
                )
                metric(
                    f"last_{key}",
                    {
                        "value": value,
                        "epoch": epoch + 1.0,
                        "bits": bits_communicated
                    },
                    tags={"split": "train"},
                )

        with timer("test.last", epoch):
            test_stats = task.test()
            for key, value in test_stats.items():
                metric(
                    f"last_{key}",
                    {
                        "value": value,
                        "epoch": epoch + 1.0,
                        "bits": bits_communicated
                    },
                    tags={"split": "test"},
                )

        with timer("test.runavg", epoch):
            test_stats = task.test(state_dict=runavg_model.value())
            for key, value in test_stats.items():
                metric(
                    f"runavg_{key}",
                    {
                        "value": value,
                        "epoch": epoch + 1.0,
                        "bits": bits_communicated
                    },
                    tags={"split": "test"},
                )

        if (epoch in config["checkpoints"]
                and torch.distributed.get_rank() == 0):
            with timer("checkpointing"):
                save(
                    os.path.join(output_dir, "epoch_{:03d}".format(epoch)),
                    task.state_dict(),
                    epoch + 1.0,
                    test_stats,
                )
                # Save running average model @TODO

        print(timer.summary())
        if config["rank"] == 0:
            timer.save_summary(os.path.join(output_dir, "timer_summary.json"))

    info({"state.progress": 1.0})
Example #3
def main():
    output_dir = "../output"
    seed = int(config["seed"])
    rank = int(config["rank"])
    n_workers = int(config["n_workers"])
    seed_everything(seed + rank)
    print('rank:{0}/{1}, local rank:{2}/{3}'.format(
        config["rank"], config["n_workers"], config["local_rank"],
        config["local_world_size"]))

    print('rank: {0}, available devices:{1}'.format(config["rank"],
                                                    torch.cuda.device_count()))

    device = torch.device(
        "cuda:" +
        str(config["local_rank"]) if torch.cuda.is_available() else "cpu")
    print('rank: {0}, current device:{1}'.format(config["rank"], device))
    timer = Timer(verbosity_level=config["log_verbosity"], log_fn=metric)
    if torch.distributed.is_available():
        if config["distributed_init_file"] is None:
            config["distributed_init_file"] = os.path.join(
                output_dir, "dist_init")
        print("Distributed init: rank {}/{} - {}".format(
            config["rank"], config["n_workers"],
            config["distributed_init_file"]))
        torch.distributed.init_process_group(
            backend=config["distributed_backend"],
            init_method="file://" +
            os.path.abspath(config["distributed_init_file"]),
            timeout=datetime.timedelta(seconds=120),
            world_size=n_workers,
            rank=rank,
        )
    task = tasks.build(task_name=config["task"],
                       device=device,
                       timer=timer,
                       **config)
    # calculate total dim here
    total_dim = get_total_dim(task.state)
    n_layers = len(task.state)
    reducer = get_reducer(device, timer, total_dim, n_layers)

    bits_communicated = 0
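    # Per-parameter error-feedback memories, momentum buffers, and send buffers.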
    memories = [torch.zeros_like(param) for param in task.state]
    momenta = [torch.empty_like(param) for param in task.state]
    send_buffers = [torch.zeros_like(param) for param in task.state]

    # collect info
    all_test_losses = []
    all_test_accs = []
    all_alphas = []
    all_bytes_communicated = []

    for epoch in range(config["num_epochs"]):
        print("state.progress: {0}/{1}, current epoch:{2}".format(
            float(epoch), config["num_epochs"], epoch))

        # Determine per-parameter optimization parameters
        wds = [get_weight_decay(epoch, name) for name in task.parameter_names]

        train_loader = task.train_iterator(config["optimizer_batch_size"])

        for i, batch in enumerate(train_loader):
            epoch_frac = epoch + i / len(train_loader)
            lrs = [
                get_learning_rate(epoch_frac, name)
                for name in task.parameter_names
            ]

            with timer("batch", epoch_frac):

                _, grads, _ = task.batch_loss_and_gradient(batch)

                if config["optimizer_wd_before_reduce"]:
                    with timer("batch.weight_decay", epoch_frac, verbosity=2):
                        for grad, param, wd in zip(grads, task.state, wds):
                            if wd > 0:
                                grad.add_(param.detach(), alpha=wd)

                if config["optimizer_mom_before_reduce"]:
                    with timer("batch.momentum", epoch_frac, verbosity=2):
                        for grad, momentum in zip(grads, momenta):
                            if epoch == 0 and i == 0:
                                momentum.data = grad.clone().detach()
                            else:
                                if (config["optimizer_momentum_type"] ==
                                        "exponential_moving_average"):
                                    momentum.mul_(
                                        config["optimizer_momentum"]).add_(
                                            grad,
                                            alpha=1 -
                                            config["optimizer_momentum"])
                                else:
                                    momentum.mul_(
                                        config["optimizer_momentum"]).add_(
                                            grad)
                            replace_grad_by_momentum(grad, momentum)

                with timer("batch.accumulate", epoch_frac, verbosity=2):
                    for grad, memory, send_bfr in zip(grads, memories,
                                                      send_buffers):
                        if config["optimizer_memory"]:
                            send_bfr.data[:] = grad + memory
                        else:
                            send_bfr.data[:] = grad

                with timer("batch.reduce", epoch_frac):
                    bits_communicated += reducer.reduce(
                        send_buffers, grads, memories)

                if not config["optimizer_wd_before_reduce"]:
                    with timer("batch.wd", epoch_frac, verbosity=2):
                        for grad, param, wd in zip(grads, task.state, wds):
                            if wd > 0:
                                grad.add_(param.detach(), alpha=wd)

                if not config["optimizer_mom_before_reduce"]:
                    with timer("batch.mom", epoch_frac, verbosity=2):
                        for grad, momentum in zip(grads, momenta):
                            if epoch == 0 and i == 0:
                                momentum.data = grad.clone().detach()
                            else:
                                if (config["optimizer_momentum_type"] ==
                                        "exponential_moving_average"):
                                    momentum.mul_(
                                        config["optimizer_momentum"]).add_(
                                            grad,
                                            alpha=1 -
                                            config["optimizer_momentum"])
                                else:
                                    momentum.mul_(
                                        config["optimizer_momentum"]).add_(
                                            grad)
                            replace_grad_by_momentum(grad, momentum)

                with timer("batch.step", epoch_frac, verbosity=2):
                    for param, grad, lr in zip(task.state, grads, lrs):
                        param.data.add_(grad, alpha=-lr)

        with timer("test.last", epoch):
            test_stats = task.test()

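            # Record test loss/accuracy and, for the quantization reducers, the
            # current scaling factor alpha.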
            all_test_info = test_stats
            if config["optimizer_reducer"] in [
                    "IntQuantReducer", "HintQuantReducer"
            ]:
                if torch.is_tensor(reducer.alpha):
                    alpha_val = reducer.alpha.item()
                else:
                    alpha_val = reducer.alpha
                all_alphas.append(alpha_val)

            if torch.is_tensor(all_test_info['cross_entropy']):
                ce_val = all_test_info['cross_entropy'].item()
            else:
                ce_val = all_test_info['cross_entropy']

            if torch.is_tensor(all_test_info['accuracy']):
                acc_val = all_test_info['accuracy'].item()
            else:
                acc_val = all_test_info['accuracy']
            all_test_losses.append(ce_val)
            all_test_accs.append(acc_val)
            all_bytes_communicated.append(bits_communicated / (8 * 1e6))

        if torch.distributed.get_rank() == 0:
            print("Epoch: {0}, Test loss: {1}, test acc: {2}".format(
                epoch, ce_val, acc_val))
            method_name = config['optimizer_reducer']
            if config["optimizer_reducer"] == "RankKReducer":
                method_name += ('_' + str(config['optimizer_memory']))
            elif config["optimizer_reducer"] == "IntQuantReducer":
                method_name += ('_' +
                                str(config['optimizer_reducer_rand_round']))
                method_name += ('_' +
                                str(config['optimizer_overflow_handling']))
                method_name += ('_' + str(config['optimizer_reducer_int']))
            elif config["optimizer_reducer"] == "HintQuantReducer":
                method_name += ('_' +
                                str(config['optimizer_reducer_rand_round']))
                method_name += ('_' +
                                str(config['optimizer_overflow_handling']))
                method_name += ('_' + str(config['optimizer_reducer_int']))
            fl_name = config[
                'task_architecture'] + "_" + method_name + "_" + str(
                    seed) + "_" + str(
                        config["n_workers"]) + "_timer_summary.json"
            timer.save_summary(os.path.join(output_dir, fl_name))

    method_name = config['optimizer_reducer']
    if config["optimizer_reducer"] == "RankKReducer":
        method_name += ('_' + str(config['optimizer_memory']))
    elif config["optimizer_reducer"] == "IntQuantReducer":
        method_name += ('_' + str(config['optimizer_reducer_rand_round']))
        method_name += ('_' + str(config['optimizer_overflow_handling']))
        method_name += ('_' + str(config['optimizer_reducer_int']))
    elif config["optimizer_reducer"] == "HintQuantReducer":
        method_name += ('_' + str(config['optimizer_reducer_rand_round']))
        method_name += ('_' + str(config['optimizer_overflow_handling']))
        method_name += ('_' + str(config['optimizer_reducer_int']))
    save_results(mbs=np.array(all_bytes_communicated),
                 test_losses=np.array(all_test_losses),
                 test_acc=np.array(all_test_accs),
                 seed=seed,
                 n_workers=config['n_workers'],
                 all_alphas=np.array(all_alphas),
                 method_name=method_name,
                 experiment=config['task_architecture'])
Example #4
 def test_missing_destination(self):
     shutil.rmtree(tasks.CSS_DIR)
     tasks.build()
Example #5
 def test_existing_desintation(self):
     shutil.rmtree(tasks.CSS_DIR)
     os.makedirs(tasks.CSS_DIR)
     tasks.build()
Example #6
 def build(repo, tag='latest'):
     build(repo, tag)
Example #7
 def compile(self):
     tasks.build()
Example #8
 def on_any_event(self, event):
     build()
Example #9
 def test_existing_desintation(self):
     shutil.rmtree(tasks.CSS_DIR)
     os.makedirs(tasks.CSS_DIR)
     tasks.build()
Example #10
 def test_missing_destination(self):
     shutil.rmtree(tasks.CSS_DIR)
     tasks.build()
Example #11
 def compile(self):
     tasks.build()
Example #12
 def HandleBuild(self, args):
     build(args.Repo, args.Tag, args.dry_run)