def main():
    torch.manual_seed(config["seed"] + config["rank"])
    np.random.seed(config["seed"] + config["rank"])

    # Run on the GPU if one is available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if torch.cuda.is_available():
        torch.cuda.set_device(0)

    timer = Timer(verbosity_level=config["log_verbosity"], log_fn=log_metric)

    init_distributed_pytorch()
    assert config["n_workers"] == torch.distributed.get_world_size()

    # Only rank 0 downloads the dataset; the other workers wait at the barrier.
    if torch.distributed.get_rank() == 0:
        if config["task"] == "Cifar":
            download_cifar()
        elif config["task"] == "LSTM":
            download_wikitext2()
    torch.distributed.barrier()

    task = tasks.build(task_name=config["task"], device=device, timer=timer, **config)

    local_optimizer = torch.optim.SGD(
        [
            {
                # Batch-norm parameters are excluded from weight decay.
                "params": [
                    p
                    for p, name in zip(task.state, task.parameter_names)
                    if parameter_type(name) == "batch_norm"
                ],
                "weight_decay": 0.0,
            },
            {
                "params": [
                    p
                    for p, name in zip(task.state, task.parameter_names)
                    if parameter_type(name) != "batch_norm"
                ]
            },
        ],
        lr=config["learning_rate"],  # to correct for summed-up gradients
        momentum=config["momentum"],
        weight_decay=config["weight_decay"],
        nesterov=(config["momentum"] > 0),
    )
    scheduler = torch.optim.lr_scheduler.LambdaLR(local_optimizer, learning_rate_schedule)

    topology = get_topology()
    optimizer = get_optimizer(timer, topology, task.state, local_optimizer.step)

    if "LSTM" in config["task"]:
        # Workers may hold different numbers of batches; use the minimum so that
        # all workers perform the same number of steps per epoch.
        train_loader = task.train_iterator(config["batch_size"])
        batches_per_epoch = torch.tensor(len(train_loader))
        torch.distributed.all_reduce(batches_per_epoch, op=torch.distributed.ReduceOp.MIN)
        batches_per_epoch = batches_per_epoch.item()

    for epoch in range(config["num_epochs"]):
        timer.epoch = epoch
        epoch_metrics = MeanAccumulator()

        if "LSTM" not in config["task"]:
            train_loader = task.train_iterator(config["batch_size"])
            batches_per_epoch = len(train_loader)

        with timer("epoch.body"):
            my_rank = torch.distributed.get_rank()
            print(f"Worker {my_rank} starting epoch {epoch} with {len(train_loader)} batches")
            for i, batch in enumerate(train_loader):
                if i >= batches_per_epoch:
                    break
                epoch_frac = epoch + i / batches_per_epoch
                scheduler.step(epoch + (i + 1) / batches_per_epoch)  # for compatibility with Choco code
                timer.epoch = epoch_frac
                info({"state.progress": epoch_frac / config["num_epochs"]})
                metrics = optimizer.step(lambda: task.batch_loss_and_gradient(batch))
                epoch_metrics.add(metrics)

        with timer("epoch.post"):
            for key, value in epoch_metrics.value().items():
                log_metric(
                    key,
                    {
                        "value": value.item(),
                        "epoch": epoch + 1.0,
                        "bits": optimizer.bits_sent,
                        "messages": optimizer.messages_sent,
                    },
                    tags={"split": "train"},
                )

        if (epoch + 1) in config["spectrum_logging_epochs"]:
            with timer("spectrum_logging"):
                print("spectrum logging at epoch {}".format(epoch + 1))
                my_rank = torch.distributed.get_rank()
                for working_node, sending_node in config["spectrum_logging_worker_pairs"]:
                    for param, name in zip(task.state, task.parameter_names):
                        if name in config["spectrum_logging_params"]:
                            if my_rank == sending_node:
                                print(f"{my_rank} sending {name}")
                                torch.cuda.synchronize()
                                torch.distributed.send(param, working_node)
                            elif my_rank == working_node:
                                print(f"{my_rank} receiving {name}")
                                other_workers_param = torch.empty_like(param)
                                torch.cuda.synchronize()
                                torch.distributed.recv(other_workers_param, sending_node)
                                # Log the singular value spectrum of the difference
                                # between the two workers' copies of this parameter.
                                u, s, v = torch.svd(
                                    (param - other_workers_param).view(param.shape[0], -1).cpu()
                                )
                                for j, val in enumerate(s):
                                    print(f"{j} / {val.cpu().item()}")
                                    log_metric(
                                        "spectrum",
                                        {"value": val.cpu().item(), "index": j},
                                        tags={
                                            "workers": f"{working_node}-{sending_node}",
                                            "parameter": name,
                                            "epoch": epoch + 1,
                                        },
                                    )
                                del u, s, v

        with timer("epoch.test"):
            test_stats = task.test()
            for key, value in test_stats.items():
                log_metric(
                    key,
                    {
                        "value": value.item(),
                        "epoch": epoch + 1.0,
                        "bits": optimizer.bits_sent,
                        "messages": optimizer.messages_sent,
                    },
                    tags={"split": "test"},
                )

            # Compute and test the average model + consensus distance
            buffer, shapes = pack([t.float() for t in task.state_dict().values()])
            local_buffer = buffer.clone()
            torch.distributed.all_reduce(buffer)
            buffer /= torch.distributed.get_world_size()
            if torch.distributed.get_rank() == 0:
                log_metric(
                    "consensus_distance",
                    {"value": (local_buffer - buffer).norm().item(), "epoch": epoch + 1.0},
                    {"type": "full_state_vector"},
                )
            if config["evaluate_average_model"]:
                avg_model = {
                    key: value
                    for key, value in zip(task.state_dict().keys(), unpack(buffer, shapes))
                }
                test_stats = task.test(state_dict=avg_model)
                for key, value in test_stats.items():
                    log_metric(
                        key,
                        {
                            "value": value.item(),
                            "epoch": epoch + 1.0,
                            "bits": optimizer.bits_sent,
                            "messages": optimizer.messages_sent,
                        },
                        tags={"split": "test_avg"},
                    )
            del local_buffer, buffer, shapes

            params_flat, shapes = pack(task.state)
            avg_params_flat = params_flat.clone()
            torch.distributed.all_reduce(avg_params_flat)
            avg_params_flat /= torch.distributed.get_world_size()
            if torch.distributed.get_rank() == 0:
                log_metric(
                    "consensus_distance",
                    {"value": (params_flat - avg_params_flat).norm().item(), "epoch": epoch + 1.0},
                    {"type": "params_only"},
                )
            del params_flat, shapes, avg_params_flat

        for entry in timer.transcript():
            log_runtime(entry["event"], entry["mean"], entry["std"], entry["instances"])

    info({"state.progress": 1.0})
def main():
    torch.manual_seed(config["seed"] + config["rank"])
    np.random.seed(config["seed"] + config["rank"])

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    timer = Timer(verbosity_level=config["log_verbosity"], log_fn=metric)

    if torch.distributed.is_available():
        if config["distributed_init_file"] is None:
            config["distributed_init_file"] = os.path.join(output_dir, "dist_init")
        print(
            "Distributed init: rank {}/{} - {}".format(
                config["rank"], config["n_workers"], config["distributed_init_file"]
            )
        )
        torch.distributed.init_process_group(
            backend=config["distributed_backend"],
            init_method="file://" + os.path.abspath(config["distributed_init_file"]),
            timeout=datetime.timedelta(seconds=120),
            world_size=config["n_workers"],
            rank=config["rank"],
        )

    task = tasks.build(task_name=config["task"], device=device, timer=timer, **config)
    reducer = get_reducer(device, timer)

    bits_communicated = 0
    runavg_model = MeanAccumulator()

    # Error-feedback memories, momentum buffers, and send buffers for compressed communication
    memories = [torch.zeros_like(param) for param in task.state]
    momenta = [torch.empty_like(param) for param in task.state]  # need initialization
    send_buffers = [torch.zeros_like(param) for param in task.state]

    for epoch in range(config["num_epochs"]):
        epoch_metrics = MeanAccumulator()
        info({
            "state.progress": float(epoch) / config["num_epochs"],
            "state.current_epoch": epoch,
        })

        # This seems fine ...
        # check_model_consistency_across_workers(task._model, epoch)

        # Determine per-parameter optimization parameters
        wds = [get_weight_decay(epoch, name) for name in task.parameter_names]

        # Reset running average of the model
        if epoch % config["average_reset_epoch_interval"] == 0:
            runavg_model.reset()

        train_loader = task.train_iterator(config["optimizer_batch_size"])
        for i, batch in enumerate(train_loader):
            epoch_frac = epoch + i / len(train_loader)
            lrs = [get_learning_rate(epoch_frac, name) for name in task.parameter_names]

            with timer("batch", epoch_frac):
                _, grads, metrics = task.batch_loss_and_gradient(batch)
                epoch_metrics.add(metrics)

                # Compute some derived metrics from the raw gradients
                with timer("batch.reporting.lr", epoch_frac, verbosity=2):
                    for name, param, grad, lr in zip(task.parameter_names, task.state, grads, lrs):
                        if np.random.rand() < 0.001:  # with a small probability
                            tags = {"weight": name.replace("module.", "")}
                            metric(
                                "effective_lr",
                                {
                                    "epoch": epoch_frac,
                                    "value": lr / max(l2norm(param).item() ** 2, 1e-8),
                                },
                                tags,
                            )
                            metric(
                                "grad_norm",
                                {"epoch": epoch_frac, "value": l2norm(grad).item()},
                                tags,
                            )

                if config["optimizer_wd_before_reduce"]:
                    with timer("batch.weight_decay", epoch_frac, verbosity=2):
                        for grad, param, wd in zip(grads, task.state, wds):
                            if wd > 0:
                                grad.add_(param.detach(), alpha=wd)

                if config["optimizer_mom_before_reduce"]:
                    with timer("batch.momentum", epoch_frac, verbosity=2):
                        for grad, momentum in zip(grads, momenta):
                            if epoch == 0 and i == 0:
                                momentum.data = grad.clone().detach()
                            else:
                                if config["optimizer_momentum_type"] == "exponential_moving_average":
                                    momentum.mul_(config["optimizer_momentum"]).add_(
                                        grad, alpha=1 - config["optimizer_momentum"]
                                    )
                                else:
                                    momentum.mul_(config["optimizer_momentum"]).add_(grad)
                            replace_grad_by_momentum(grad, momentum)

                with timer("batch.accumulate", epoch_frac, verbosity=2):
                    for grad, memory, send_bfr in zip(grads, memories, send_buffers):
                        if config["optimizer_memory"]:
                            send_bfr.data[:] = grad + memory
                        else:
                            send_bfr.data[:] = grad

                with timer("batch.reduce", epoch_frac):
                    # Set 'grads' to the averaged value from the workers
                    bits_communicated += reducer.reduce(send_buffers, grads, memories)

                if config["optimizer_memory"]:
                    with timer("batch.reporting.compr_err", verbosity=2):
                        for name, memory, send_bfr in zip(task.parameter_names, memories, send_buffers):
                            if np.random.rand() < 0.001:
                                tags = {"weight": name.replace("module.", "")}
                                rel_compression_error = l2norm(memory) / l2norm(send_bfr)
                                metric(
                                    "rel_compression_error",
                                    {"epoch": epoch_frac, "value": rel_compression_error.item()},
                                    tags,
                                )

                if not config["optimizer_wd_before_reduce"]:
                    with timer("batch.wd", epoch_frac, verbosity=2):
                        for grad, param, wd in zip(grads, task.state, wds):
                            if wd > 0:
                                grad.add_(param.detach(), alpha=wd)

                if not config["optimizer_mom_before_reduce"]:
                    with timer("batch.mom", epoch_frac, verbosity=2):
                        for grad, momentum in zip(grads, momenta):
                            if epoch == 0 and i == 0:
                                momentum.data = grad.clone().detach()
                            else:
                                if config["optimizer_momentum_type"] == "exponential_moving_average":
                                    momentum.mul_(config["optimizer_momentum"]).add_(
                                        grad, alpha=1 - config["optimizer_momentum"]
                                    )
                                else:
                                    momentum.mul_(config["optimizer_momentum"]).add_(grad)
                            replace_grad_by_momentum(grad, momentum)

                with timer("batch.step", epoch_frac, verbosity=2):
                    for param, grad, lr in zip(task.state, grads, lrs):
                        param.data.add_(grad, alpha=-lr)

                if config["fix_conv_weight_norm"]:
                    with timer("batch.normfix", epoch_frac, verbosity=2):
                        for param_name, param in zip(task.parameter_names, task.state):
                            if is_conv_param(param_name):
                                param.data[:] /= l2norm(param)

                with timer("batch.update_runavg", epoch_frac, verbosity=2):
                    runavg_model.add(task.state_dict())

                if config["optimizer_memory"]:
                    with timer("batch.reporting.memory_norm", epoch_frac, verbosity=2):
                        if np.random.rand() < 0.001:
                            sum_of_sq = 0.0
                            for parameter_name, memory in zip(task.parameter_names, memories):
                                tags = {"weight": parameter_name.replace("module.", "")}
                                sq_norm = torch.sum(memory ** 2)
                                sum_of_sq += torch.sqrt(sq_norm)
                                metric(
                                    "memory_norm",
                                    {"epoch": epoch_frac, "value": torch.sqrt(sq_norm).item()},
                                    tags,
                                )
                            metric(
                                "compression_error",
                                {"epoch": epoch_frac, "value": torch.sqrt(sum_of_sq).item()},
                            )

        with timer("epoch_metrics.collect", epoch + 1.0, verbosity=2):
            epoch_metrics.reduce()
            for key, value in epoch_metrics.value().items():
                metric(
                    key,
                    {"value": value, "epoch": epoch + 1.0, "bits": bits_communicated},
                    tags={"split": "train"},
                )
                metric(
                    f"last_{key}",
                    {"value": value, "epoch": epoch + 1.0, "bits": bits_communicated},
                    tags={"split": "train"},
                )

        with timer("test.last", epoch):
            test_stats = task.test()
            for key, value in test_stats.items():
                metric(
                    f"last_{key}",
                    {"value": value, "epoch": epoch + 1.0, "bits": bits_communicated},
                    tags={"split": "test"},
                )

        with timer("test.runavg", epoch):
            test_stats = task.test(state_dict=runavg_model.value())
            for key, value in test_stats.items():
                metric(
                    f"runavg_{key}",
                    {"value": value, "epoch": epoch + 1.0, "bits": bits_communicated},
                    tags={"split": "test"},
                )

        if epoch in config["checkpoints"] and torch.distributed.get_rank() == 0:
            with timer("checkpointing"):
                save(
                    os.path.join(output_dir, "epoch_{:03d}".format(epoch)),
                    task.state_dict(),
                    epoch + 1.0,
                    test_stats,
                )
                # Save running average model @TODO

    print(timer.summary())
    if config["rank"] == 0:
        timer.save_summary(os.path.join(output_dir, "timer_summary.json"))

    info({"state.progress": 1.0})
def main():
    output_dir = "../output"
    seed = int(config["seed"])
    rank = int(config["rank"])
    n_workers = int(config["n_workers"])
    seed_everything(seed + rank)

    print("rank: {0}/{1}, local rank: {2}/{3}".format(
        config["rank"], config["n_workers"], config["local_rank"], config["local_world_size"]))
    print("rank: {0}, available devices: {1}".format(config["rank"], torch.cuda.device_count()))

    device = torch.device(
        "cuda:" + str(config["local_rank"]) if torch.cuda.is_available() else "cpu")
    print("rank: {0}, current device: {1}".format(config["rank"], device))

    timer = Timer(verbosity_level=config["log_verbosity"], log_fn=metric)

    if torch.distributed.is_available():
        if config["distributed_init_file"] is None:
            config["distributed_init_file"] = os.path.join(output_dir, "dist_init")
        print("Distributed init: rank {}/{} - {}".format(
            config["rank"], config["n_workers"], config["distributed_init_file"]))
        torch.distributed.init_process_group(
            backend=config["distributed_backend"],
            init_method="file://" + os.path.abspath(config["distributed_init_file"]),
            timeout=datetime.timedelta(seconds=120),
            world_size=n_workers,
            rank=rank,
        )

    task = tasks.build(task_name=config["task"], device=device, timer=timer, **config)

    # calculate total dim here
    total_dim = get_total_dim(task.state)
    n_layers = len(task.state)
    reducer = get_reducer(device, timer, total_dim, n_layers)

    bits_communicated = 0
    memories = [torch.zeros_like(param) for param in task.state]
    momenta = [torch.empty_like(param) for param in task.state]
    send_buffers = [torch.zeros_like(param) for param in task.state]

    # collect info
    all_test_losses = []
    all_test_accs = []
    all_alphas = []
    all_bytes_communicated = []

    for epoch in range(config["num_epochs"]):
        print("state.progress: {0}/{1}, current epoch: {2}".format(
            float(epoch), config["num_epochs"], epoch))

        # Determine per-parameter optimization parameters
        wds = [get_weight_decay(epoch, name) for name in task.parameter_names]

        train_loader = task.train_iterator(config["optimizer_batch_size"])
        for i, batch in enumerate(train_loader):
            epoch_frac = epoch + i / len(train_loader)
            lrs = [get_learning_rate(epoch_frac, name) for name in task.parameter_names]

            with timer("batch", epoch_frac):
                _, grads, _ = task.batch_loss_and_gradient(batch)

                if config["optimizer_wd_before_reduce"]:
                    with timer("batch.weight_decay", epoch_frac, verbosity=2):
                        for grad, param, wd in zip(grads, task.state, wds):
                            if wd > 0:
                                grad.add_(param.detach(), alpha=wd)

                if config["optimizer_mom_before_reduce"]:
                    with timer("batch.momentum", epoch_frac, verbosity=2):
                        for grad, momentum in zip(grads, momenta):
                            if epoch == 0 and i == 0:
                                momentum.data = grad.clone().detach()
                            else:
                                if config["optimizer_momentum_type"] == "exponential_moving_average":
                                    momentum.mul_(config["optimizer_momentum"]).add_(
                                        grad, alpha=1 - config["optimizer_momentum"])
                                else:
                                    momentum.mul_(config["optimizer_momentum"]).add_(grad)
                            replace_grad_by_momentum(grad, momentum)

                with timer("batch.accumulate", epoch_frac, verbosity=2):
                    for grad, memory, send_bfr in zip(grads, memories, send_buffers):
                        if config["optimizer_memory"]:
                            send_bfr.data[:] = grad + memory
                        else:
                            send_bfr.data[:] = grad

                with timer("batch.reduce", epoch_frac):
                    bits_communicated += reducer.reduce(send_buffers, grads, memories)

                if not config["optimizer_wd_before_reduce"]:
                    with timer("batch.wd", epoch_frac, verbosity=2):
                        for grad, param, wd in zip(grads, task.state, wds):
                            if wd > 0:
                                grad.add_(param.detach(), alpha=wd)

                if not config["optimizer_mom_before_reduce"]:
                    with timer("batch.mom", epoch_frac, verbosity=2):
                        for grad, momentum in zip(grads, momenta):
                            if epoch == 0 and i == 0:
                                momentum.data = grad.clone().detach()
                            else:
                                if config["optimizer_momentum_type"] == "exponential_moving_average":
                                    momentum.mul_(config["optimizer_momentum"]).add_(
                                        grad, alpha=1 - config["optimizer_momentum"])
                                else:
                                    momentum.mul_(config["optimizer_momentum"]).add_(grad)
                            replace_grad_by_momentum(grad, momentum)

                with timer("batch.step", epoch_frac, verbosity=2):
                    for param, grad, lr in zip(task.state, grads, lrs):
                        param.data.add_(grad, alpha=-lr)

        with timer("test.last", epoch):
            test_stats = task.test()
            all_test_info = test_stats

        if config["optimizer_reducer"] in ["IntQuantReducer", "HintQuantReducer"]:
            if torch.is_tensor(reducer.alpha):
                alpha_val = reducer.alpha.item()
            else:
                alpha_val = reducer.alpha
            all_alphas.append(alpha_val)

        if torch.is_tensor(all_test_info["cross_entropy"]):
            ce_val = all_test_info["cross_entropy"].item()
        else:
            ce_val = all_test_info["cross_entropy"]
        if torch.is_tensor(all_test_info["accuracy"]):
            acc_val = all_test_info["accuracy"].item()
        else:
            acc_val = all_test_info["accuracy"]

        all_test_losses.append(ce_val)
        all_test_accs.append(acc_val)
        all_bytes_communicated.append(bits_communicated / (8 * 1e6))

        if torch.distributed.get_rank() == 0:
            print("Epoch: {0}, Test loss: {1}, test acc: {2}".format(epoch, ce_val, acc_val))

            method_name = config["optimizer_reducer"]
            if config["optimizer_reducer"] == "RankKReducer":
                method_name += "_" + str(config["optimizer_memory"])
            elif config["optimizer_reducer"] == "IntQuantReducer":
                method_name += "_" + str(config["optimizer_reducer_rand_round"])
                method_name += "_" + str(config["optimizer_overflow_handling"])
                method_name += "_" + str(config["optimizer_reducer_int"])
            elif config["optimizer_reducer"] == "HintQuantReducer":
                method_name += "_" + str(config["optimizer_reducer_rand_round"])
                method_name += "_" + str(config["optimizer_overflow_handling"])
                method_name += "_" + str(config["optimizer_reducer_int"])

            fl_name = (config["task_architecture"] + "_" + method_name + "_" + str(seed)
                       + "_" + str(config["n_workers"]) + "_timer_summary.json")
            timer.save_summary(os.path.join(output_dir, fl_name))

            save_results(
                mbs=np.array(all_bytes_communicated),
                test_losses=np.array(all_test_losses),
                test_acc=np.array(all_test_accs),
                seed=seed,
                n_workers=config["n_workers"],
                all_alphas=np.array(all_alphas),
                method_name=method_name,
                experiment=config["task_architecture"],
            )
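# NOTE (sketch, assumption): `seed_everything` and `get_total_dim` are used above
# but not defined in this file. Minimal versions consistent with that usage:
import random

import numpy as np
import torch


def seed_everything(seed):
    """Seed the Python, NumPy, and PyTorch RNGs for reproducibility."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


def get_total_dim(state):
    """Total number of scalar parameters across all tensors in `state`."""
    return sum(param.numel() for param in state)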
def test_missing_destination(self):
    shutil.rmtree(tasks.CSS_DIR)
    tasks.build()
def test_existing_destination(self):
    shutil.rmtree(tasks.CSS_DIR)
    os.makedirs(tasks.CSS_DIR)
    tasks.build()
def build(repo, tag='latest'):
    build(repo, tag)
def compile(self):
    tasks.build()
def on_any_event(self, event):
    build()
def HandleBuild(self, args):
    build(args.Repo, args.Tag, args.dry_run)