def test_horovod_allreduce_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different rank or dimension."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # Same rank, different dimension
    torch.manual_seed(1234)
    dims = [17 + rank] * 3
    tensor = torch.FloatTensor(*dims).random_(-100, 100)
    try:
        hvd.allreduce(tensor)
        assert False, 'hvd.allreduce did not throw error'
    except torch.FatalError:
        pass

    # Same number of elements, different rank
    torch.manual_seed(1234)
    if rank == 0:
        dims = [17, 23 * 57]
    else:
        dims = [17, 23, 57]
    tensor = torch.FloatTensor(*dims).random_(-100, 100)
    try:
        hvd.allreduce(tensor)
        assert False, 'hvd.allreduce did not throw error'
    except torch.FatalError:
        pass
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        gathered = hvd.allgather(tensor)

        assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)

        for i in range(size):
            rank_tensor = gathered[i * 17:(i + 1) * 17]
            assert list(rank_tensor.shape) == [17] * dim, \
                'hvd.allgather produces incorrect gathered shape'
            assert rank_tensor.data.min() == i, \
                'hvd.allgather produces incorrect gathered tensor'
            assert rank_tensor.data.max() == i, \
                'hvd.allgather produces incorrect gathered tensor'
def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different ranks try to
    perform reduction on CPU and GPU."""
    # Only do this test if there are GPUs available.
    if not torch.cuda.is_available():
        return

    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # Same shape on every rank, but mixed CPU and GPU placement
    dims = [17] * 3
    if rank % 2 == 0:
        tensor = torch.cuda.FloatTensor(*dims)
    else:
        tensor = torch.FloatTensor(*dims)
    try:
        hvd.allreduce(tensor)
        assert False, 'hvd.allreduce did not throw error'
    except torch.FatalError:
        pass
def test_horovod_broadcast_inplace(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
        root_tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(root_rank)
        tensor = tensor.type(dtype)
        root_tensor = root_tensor.type(dtype)
        broadcasted_tensor = hvd.broadcast_(tensor, root_rank)
        assert (tensor == broadcasted_tensor).min() == 1, \
            'hvd.broadcast does not modify source tensor'
        assert (broadcasted_tensor == root_tensor).min() == 1, \
            'hvd.broadcast produces incorrect broadcasted tensor'
def test_horovod_broadcast_grad(self):
    """Test the correctness of the broadcast gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)

        broadcasted_tensor = hvd.broadcast(tensor, root_rank)
        broadcasted_tensor.backward(torch.ones([17] * dim))
        grad_out = tensor.grad.data.numpy()

        c = size if rank == root_rank else 0
        expected = np.ones([17] * dim) * c
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def backward(ctx, grad_output):
    grad_reduced = allreduce(grad_output, average=False)

    dim_t = torch.IntTensor([ctx.dim])
    dim = allgather(dim_t).view(size())

    r = rank()
    offset = torch.sum(dim.narrow(0, 0, r)).data[0] if r != 0 else 0
    return grad_reduced.narrow(0, offset, ctx.dim), None
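# The backward above is the gradient of an allgather autograd Function: it
# all-reduces (sums) the incoming gradient and then narrows out this rank's
# slice. A minimal sketch of the matching forward is shown below, assuming the
# surrounding class is a torch.autograd.Function and that torch, allgather,
# rank and size are available as in the snippet above. The class name
# _AllgatherSketch is illustrative, not taken from the original source.
class _AllgatherSketch(torch.autograd.Function):
    @staticmethod
    def forward(ctx, tensor):
        # Remember how many rows this rank contributed so backward can
        # narrow the reduced gradient back down to the local slice.
        ctx.dim = tensor.shape[0]
        return allgather(tensor)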
def test_horovod_broadcast_rank_error(self):
    """Test that the broadcast returns an error if different ranks
    specify different root rank."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    tensor = torch.FloatTensor(*([17] * 3)).fill_(1)
    try:
        hvd.broadcast(tensor, rank)
        assert False, 'hvd.broadcast did not throw error'
    except torch.FatalError:
        pass
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        tensor = torch.FloatTensor(
            *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)

        grad_list = []
        # Use a distinct loop variable so the world size `size` is not shadowed.
        for r, tensor_size in enumerate(tensor_sizes):
            grad_list.append(torch.ones([tensor_size] + [17] * (dim - 1)) * r)
        grad_ys = torch.cat(grad_list, dim=0)

        gathered = hvd.allgather(tensor)
        gathered.backward(grad_ys)
        grad_out = tensor.grad.data.numpy()

        expected = np.ones(
            [tensor_sizes[rank]] + [17] * (dim - 1)
        ) * rank * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    tensor_size = [17] * 3
    tensor_size[1] = 10 * (rank + 1)
    tensor = torch.FloatTensor(*tensor_size).fill_(1).mul_(rank)
    try:
        hvd.broadcast(tensor, 0)
        assert False, 'hvd.broadcast did not throw error'
    except torch.FatalError:
        pass
def test_horovod_broadcast_type_error(self):
    """Test that the broadcast returns an error if the types being broadcasted
    differ among the processes."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    tensor_size = [17] * 3
    if rank % 2 == 0:
        tensor = torch.IntTensor(*tensor_size)
    else:
        tensor = torch.FloatTensor(*tensor_size)
    try:
        hvd.broadcast(tensor, 0)
        assert False, 'hvd.broadcast did not throw error'
    except torch.FatalError:
        pass
def test():
    model.eval()
    test_loss = 0.
    test_accuracy = 0.
    for data, target in test_loader:
        if args.cuda:
            data, target = data.cuda(), target.cuda()
        output = model(data)
        # sum up batch loss
        test_loss += F.nll_loss(output, target, size_average=False).item()
        # get the index of the max log-probability
        pred = output.data.max(1, keepdim=True)[1]
        test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum()

    test_loss /= len(test_sampler)
    test_accuracy /= len(test_sampler)

    test_loss = metric_average(test_loss, 'avg_loss')
    test_accuracy = metric_average(test_accuracy, 'avg_accuracy')

    if hvd.rank() == 0:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
            test_loss, 100. * test_accuracy))
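# The metric_average helper used above is not shown in this snippet. A minimal
# sketch of how it is commonly written in Horovod examples is given here; the
# exact helper in the original source may differ. It assumes torch and
# horovod.torch (as hvd) are already imported, as in the surrounding code.
def metric_average(val, name):
    tensor = torch.tensor(val)
    # hvd.allreduce averages across workers by default.
    avg_tensor = hvd.allreduce(tensor, name=name)
    return avg_tensor.item()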
def test_horovod_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try to
    send tensors of different type."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # Same dimension, different type
    dims = [17] * 3
    if rank % 2 == 0:
        tensor = torch.IntTensor(*dims)
    else:
        tensor = torch.FloatTensor(*dims)
    try:
        hvd.allreduce(tensor)
        assert False, 'hvd.allreduce did not throw error'
    except torch.FatalError:
        pass
def test_horovod_allgather_variable_size(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
    even if those tensors have different sizes along the first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
        tensor_sizes = tensor_sizes[:size]

        tensor = torch.FloatTensor(
            *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        gathered = hvd.allgather(tensor)

        expected_size = sum(tensor_sizes)
        assert list(gathered.shape) == [expected_size] + [17] * (dim - 1)

        for i in range(size):
            rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
            rank_tensor = gathered[sum(
                tensor_sizes[:i]):sum(tensor_sizes[:i + 1])]
            assert list(rank_tensor.shape) == rank_size
            assert rank_tensor.data.min() == i
            assert rank_tensor.data.max() == i
args.independent_distributed_sampling = False

args.kd_ratio = 0.0
args.kd_type = 'ce'

if __name__ == '__main__':
    os.makedirs(args.path, exist_ok=True)

    # Initialize Horovod
    hvd.init()
    # Pin GPU to be used to process local rank (one GPU per process)
    torch.cuda.set_device(hvd.local_rank())

    args.teacher_path = download_url(
        'https://hanlab.mit.edu/files/OnceForAll/ofa_checkpoints/ofa_D4_E6_K7',
        model_dir='.torch/ofa_checkpoints/%d' % hvd.rank())

    num_gpus = hvd.size()

    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed_all(args.manual_seed)
    np.random.seed(args.manual_seed)
    random.seed(args.manual_seed)

    # image size
    args.image_size = [
        int(img_size) for img_size in args.image_size.split(',')
    ]
    if len(args.image_size) == 1:
        args.image_size = args.image_size[0]
    MyRandomResizedCrop.CONTINUOUS = args.continuous_size
def main():
    start_epoch = args.start_epoch  # start from epoch 0 or last checkpoint epoch

    # Data
    print('==> Preparing dataset %s' % args.dataset)
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010)),
    ])
    if args.dataset == 'cifar10':
        dataloader = datasets.CIFAR10
        num_classes = 10
    else:
        dataloader = datasets.CIFAR100
        num_classes = 100

    trainset = dataloader(root=args.dataroot, train=True, download=True,
                          transform=transform_train)
    sampler = torch.utils.data.distributed.DistributedSampler(
        trainset, num_replicas=hvd.size(), rank=hvd.rank())
    trainloader = data.DataLoader(dataset=trainset,
                                  batch_size=args.train_batch * world_size,
                                  shuffle=False,
                                  sampler=sampler)

    testset = dataloader(root=args.dataroot, train=False, download=False,
                         transform=transform_test)
    testloader = data.DataLoader(testset,
                                 batch_size=args.test_batch * world_size,
                                 shuffle=False,
                                 num_workers=args.workers)

    # Model
    print("==> creating model '{}'".format("vgg19"))
    model = vgg19_bn(num_classes=num_classes)
    device = torch.device('cuda', local_rank)
    model = model.to(device)
    # model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[local_rank], output_device=local_rank)
    print('Model on cuda:%d' % local_rank)
    print(' Total params: %.2fM' %
          (sum(p.numel() for p in model.parameters()) / 1000000.0))

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    # Wrap the optimizer with Horovod's DistributedOptimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    # Broadcast parameters from rank 0 to all other processes
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    # Train and val
    for epoch in range(start_epoch, args.epochs):
        adjust_learning_rate(optimizer, epoch)

        train_loss, train_acc = train(trainloader, model, criterion, optimizer,
                                      epoch, use_cuda)
        test_loss, test_acc = test(testloader, model, criterion, epoch,
                                   use_cuda)
        print(
            'Rank:{} Epoch[{}/{}]: LR: {:.3f}, Train loss: {:.5f}, Test loss: {:.5f}, Train acc: {:.2f}, Test acc: {:.2f}.'
            .format(local_rank, epoch + 1, args.epochs, state['lr'],
                    train_loss, test_loss, train_acc, test_acc))
def __init__(self, hps, result_subdir, step, epoch, devices, data_device, batch_size, verbose=True): """ Network trainer :param hps: hyper-parameters for this network :param result_subdir: path to result sub-directory :type result_subdir: str :param step: global step of model :type step: int :param epoch: global epoch of model :type epoch: int :param devices: list of available devices for model running :type devices: list :param data_device: available device for data loading :type data_device: str or int :param batch_size: number of inputs in mini-batch :type batch_size: int or dict :param verbose: whether or not to print running messages :type verbose: bool """ super().__init__(verbose) # general self.hps = hps self.result_subdir = result_subdir self.distributed = hps.device.distributed.enabled # horovod: print logs on the first worker. if self.distributed: self.verbose = hvd.rank() == 0 # state self.step = step self.epoch = epoch self.devices = devices self.num_device = len(devices) # data self.data_device = data_device self.batch_size = batch_size self.num_classes = self.hps.dataset.num_classes # logging self.is_output_rank = self.verbose if hps.logging.tensorboard.enabled: self.writer = SummaryWriter( logdir=self.result_subdir) if self.is_output_rank else None if hps.logging.comet.enabled: self.experiment = Experiment( project_name=hps.logging.comet.project_name, workspace=hps.logging.comet.workspace ) if self.is_output_rank else None if self.is_output_rank and self.experiment.alive is False: raise RuntimeError('Something went wrong w/ comet.ml') self.log_profile(self.hps) self.interval_scalar = self.hps.logging.interval.scalar self.interval_snapshot = self.hps.logging.interval.snapshot
args.independent_distributed_sampling = False

args.kd_ratio = 1.0
args.kd_type = 'ce'

if __name__ == '__main__':
    os.makedirs(args.path, exist_ok=True)

    # Initialize Horovod
    hvd.init()
    # Pin GPU to be used to process local rank (one GPU per process)
    torch.cuda.set_device(hvd.local_rank())

    args.teacher_path = download_url(
        '/NAS_REMOTE/shaozl/Fine-grained/once-for-all-master/.torch/ofa_checkpoints/ofa_ws_D4_E6_K7',
        model_dir='.torch/ofa_checkpoints/%d' % hvd.rank())

    num_gpus = hvd.size()

    torch.manual_seed(args.manual_seed)
    torch.cuda.manual_seed_all(args.manual_seed)
    np.random.seed(args.manual_seed)
    random.seed(args.manual_seed)

    # image size
    args.image_size = [
        int(img_size) for img_size in args.image_size.split(',')
    ]
    if len(args.image_size) == 1:
        args.image_size = args.image_size[0]
    MyRandomResizedCrop.CONTINUOUS = args.continuous_size
                    type=float,
                    help='The alpha value used in mix up training')
parser.add_argument('--label_smoothing', type=float, default=0)

args = parser.parse_args()

hvd.init()

# Horovod: pin GPU to local rank.
torch.cuda.set_device(hvd.local_rank())
cudnn.benchmark = True

device = 'cuda'
log_writer = None
verbose = 1 if hvd.rank() == 0 else 0

# Horovod: limit # of CPU threads to be used per worker.
torch.set_num_threads(args.workers)

# create model
if args.arch == 'proxyless':
    from tinynas.nn.networks import ProxylessNASNets
    with open(args.net_config) as f:
        config = json.load(f)
    args.resolution = config['resolution']
    model = ProxylessNASNets.build_from_config(config)
else:
    raise NotImplementedError
model = model.to(device)
def log(s, nl=True):
    if hvd.rank() != 0:
        return
    print(s, end='\n' if nl else '')
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if opts.gradient_accumulation_steps < 1: raise ValueError("Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format( opts.gradient_accumulation_steps)) set_random_seed(opts.seed) # train_examples = None LOGGER.info(f"Loading Train Dataset {opts.train_txt_db}, " f"{opts.train_img_db}") if "paired" in opts.model: DatasetCls = Nlvr2PairedDataset EvalDatasetCls = Nlvr2PairedEvalDataset collate_fn = nlvr2_paired_collate eval_collate_fn = nlvr2_paired_eval_collate if opts.model == "paired": ModelCls = UniterForNlvr2Paired elif opts.model == "paired-attn": ModelCls = UniterForNlvr2PairedAttn else: raise ValueError("unrecognized model type") elif opts.model == "triplet": DatasetCls = Nlvr2TripletDataset EvalDatasetCls = Nlvr2TripletEvalDataset ModelCls = UniterForNlvr2Triplet collate_fn = nlvr2_triplet_collate eval_collate_fn = nlvr2_triplet_eval_collate else: raise ValueError("unrecognized model type") # data loaders train_dataloader = create_dataloader( opts.train_img_db, opts.train_txt_db, opts.train_batch_size, True, DatasetCls, collate_fn, opts, ) val_dataloader = create_dataloader( opts.val_img_db, opts.val_txt_db, opts.val_batch_size, False, EvalDatasetCls, eval_collate_fn, opts, ) test_dataloader = create_dataloader( opts.test_img_db, opts.test_txt_db, opts.val_batch_size, False, EvalDatasetCls, eval_collate_fn, opts, ) # Prepare model if opts.checkpoint: checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} model = ModelCls.from_pretrained(opts.model_config, state_dict=checkpoint, img_dim=IMG_DIM) model.init_type_embedding() model.to(device) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) # Prepare optimizer optimizer = build_optimizer(model, opts) model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16, opt_level="O2") global_step = 0 if rank == 0: save_training_meta(opts) TB_LOGGER.create(join(opts.output_dir, "log")) pbar = tqdm(total=opts.num_train_steps) model_saver = ModelSaver(join(opts.output_dir, "ckpt")) os.makedirs(join(opts.output_dir, "results")) # store val predictions add_log_to_file(join(opts.output_dir, "log", "log.txt")) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Num examples = %d", len(train_dataloader.dataset)) LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) running_loss = RunningMeter("loss") model.train() n_examples = 0 n_epoch = 0 start = time() # quick hack for amp delay_unscale bug optimizer.zero_grad() optimizer.step() while True: for step, batch in enumerate(train_dataloader): targets = batch["targets"] n_examples += targets.size(0) loss = model(batch, compute_loss=True) loss = loss.mean() delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() if not delay_unscale: # gather gradients from every processes # do this before unscaling to make sure every process uses # the same gradient scale 
grads = [ p.grad.data for p in model.parameters() if p.requires_grad and p.grad is not None ] all_reduce_and_rescale_tensors(grads, float(1)) running_loss(loss.item()) if (step + 1) % opts.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling lr_this_step = get_lr_sched(global_step, opts) for param_group in optimizer.param_groups: param_group["lr"] = lr_this_step TB_LOGGER.add_scalar("lr", lr_this_step, global_step) # log loss # NOTE: not gathered across GPUs for efficiency TB_LOGGER.add_scalar("loss", running_loss.val, global_step) TB_LOGGER.step() # update model params if opts.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), opts.grad_norm) TB_LOGGER.add_scalar("grad_norm", grad_norm, global_step) optimizer.step() optimizer.zero_grad() pbar.update(1) if global_step % 100 == 0: # monitor training throughput tot_ex = sum(all_gather_list(n_examples)) ex_per_sec = int(tot_ex / (time() - start)) LOGGER.info(f"Step {global_step}: " f"{tot_ex} examples trained at " f"{ex_per_sec} ex/s") TB_LOGGER.add_scalar("perf/ex_per_s", ex_per_sec, global_step) if global_step % opts.valid_steps == 0: for split, loader in [ ("val", val_dataloader), ("test", test_dataloader), ]: LOGGER.info(f"Step {global_step}: start running " f"validation on {split} split...") log, results = validate(model, loader, split) with open( f"{opts.output_dir}/results/" f"{split}_results_{global_step}_" f"rank{rank}.csv", "w", ) as f: for id_, ans in results: f.write(f"{id_},{ans}\n") TB_LOGGER.log_scaler_dict(log) model_saver.save(model, global_step) if global_step >= opts.num_train_steps: break if global_step >= opts.num_train_steps: break n_epoch += 1 LOGGER.info(f"Step {global_step}: finished {n_epoch} epochs") if opts.num_train_steps % opts.valid_steps != 0: for split, loader in [("val", val_dataloader), ("test", test_dataloader)]: LOGGER.info(f"Step {global_step}: start running " f"validation on {split} split...") log, results = validate(model, loader, split) with open( f"{opts.output_dir}/results/" f"{split}_results_{global_step}_" f"rank{rank}.csv", "w", ) as f: for id_, ans in results: f.write(f"{id_},{ans}\n") TB_LOGGER.log_scaler_dict(log) model_saver.save(model, global_step)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-dataPath", default='', type=str, help="path of data files")
    parser.add_argument("-seed_model", help="the seed neural network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform", help="feature transformation matrix or mvn statistics")
    parser.add_argument("-criterion", type=str, choices=["mmi", "mpfe", "smbr"],
                        help="set the sequence training criterion")
    parser.add_argument("-trans_model",
                        help="the HMM transition model, used for lattice generation")
    parser.add_argument("-prior_path",
                        help="the prior for decoder, usually named as final.occs in kaldi setup")
    parser.add_argument("-den_dir",
                        help="the decoding graph directory to find HCLG and words.txt files")
    parser.add_argument("-lr", type=float, help="set the learning rate")
    parser.add_argument("-ce_ratio", default=0.1, type=float,
                        help="the ratio for ce regularization")
    parser.add_argument("-momentum", default=0, type=float, help="set the momentum")
    parser.add_argument("-batch_size", default=32, type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-dropout", default=0, type=float, help="set the dropout ratio")
    parser.add_argument("-nheads", default=4, type=int,
                        help="the number of attention heads")
    parser.add_argument("-dim_model", default=512, type=int, help="the model dimension")
    parser.add_argument("-ff_size", default=2048, type=int,
                        help="the size of feed-forward layer")
    parser.add_argument("-nlayers", default=6, type=int, help="the number of layers")
    parser.add_argument("-look_ahead", default=-1, type=int,
                        help="the number of frames to look ahead")
    parser.add_argument("-data_loader_threads", default=0, type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm", default=5, type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size", default=100, type=float,
                        help="process n hours of data per sweep (default:60)")
    parser.add_argument("-num_epochs", default=1, type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument('-print_freq', default=10, type=int, metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq', default=1000, type=int, metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()
    #args.exp_dir = args.modelPath

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config['data_path'] = args.dataPath
    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()
    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
        dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    model = transformer.TransformerAM(model_config["feat_dim"], args.dim_model,
                                      args.nheads, args.ff_size, args.nlayers,
                                      args.dropout, model_config["label_size"])
    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(),
                             lr=args.lr,
                             momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        model.load_state_dict(state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n' %
                         (args.seed_model))
        sys.exit(0)

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(0)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' %
                         (words_txt))
        sys.exit(0)

    if not os.path.isfile(silence_phones):
        sys.stderr.write('ERROR: The silence phone file %s does not exist!\n' %
                         (silence_phones))
        sys.exit(0)
    with open(silence_phones) as f:
        silence_ids = [int(i) for i in f.readline().strip().split(':')]
        f.close()

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' %
                         (args.trans_model))
        sys.exit(0)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = False  # To produce raw state-level lattice instead of compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model, HCLG, words_txt,
        acoustic_scale=acoustic_scale, decoder_opts=decoder_opts)

    prior = kaldi_util.io.read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    model.train()

    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])
    print(params)

    for epoch in range(args.num_epochs):

        run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader,
                        epoch, asr_decoder, trans_model, silence_ids, args)

        # save model
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
def run_train_epoch(model, optimizer, log_prior, dataloader, epoch, asr_decoder, trans_model, silence_ids, args): batch_time = utils.AverageMeter('Time', ':6.3f') losses = utils.AverageMeter('Loss', ':.4e') grad_norm = utils.AverageMeter('grad_norm', ':.4e') progress = utils.ProgressMeter(len(dataloader), batch_time, losses, grad_norm, prefix="Epoch: [{}]".format(epoch)) ce_criterion = nn.CrossEntropyLoss(ignore_index=-100, reduction='sum') if args.criterion == "mmi": se_criterion = ops.MMIFunction.apply else: se_criterion = ops.sMBRFunction.apply end = time.time() for i, batch in enumerate(dataloader, 0): feat = batch["x"] label = batch["y"] #pdf-ids for ce loss num_frs = batch["num_frs"] utt_ids = batch["utt_ids"] aux = batch["aux"] #trans_ids for se loss x = feat.to(th.float32) y = label.long() x = x.cuda() y = y.cuda() x = x.transpose(0, 1) key_padding_mask = th.ones((x.size(1), x.size(0))) for utt in range(len(num_frs)): key_padding_mask[utt, :num_frs[utt]] = 0 src_mask = None if (args.look_ahead > -1): src_mask = th.tril(th.ones(x.size(0), x.size(0)), diagonal=args.look_ahead) src_mask = src_mask.float().masked_fill(src_mask == 0, float('-inf')).masked_fill( src_mask == 1, float(0.0)) src_mask = src_mask.cuda() key_padding_mask = key_padding_mask.bool().cuda() prediction = model(x, src_mask, key_padding_mask) prediction = prediction.transpose(0, 1).contiguous() ce_loss = ce_criterion(prediction.view(-1, prediction.shape[2]), y.view(-1)) se_loss = 0.0 for j in range(len(num_frs)): log_like_j = prediction[j, :, :] log_like_j = log_like_j[:num_frs[j], :] log_like_j = log_like_j - log_prior trans_id = th.from_numpy(aux[j][0][0].astype(int)).tolist() if args.criterion == "mmi": se_loss += se_criterion(log_like_j, asr_decoder, trans_model, trans_id) else: se_loss += se_criterion(log_like_j, asr_decoder, trans_model, trans_id, args.criterion, silence_ids) loss = se_loss.cuda() + args.ce_ratio * ce_loss optimizer.zero_grad() loss.backward() # Gradient Clipping (th 5.0) norm = nn.utils.clip_grad_norm_(model.parameters(), args.max_grad_norm) optimizer.step() grad_norm.update(norm) # update loss tot_frs = np.array(num_frs).sum() losses.update(loss.item() / tot_frs) # measure elapsed time batch_time.update(time.time() - end) # save model if hvd.rank() == 0 and i % args.save_freq == 0: checkpoint = {} checkpoint['model'] = model.state_dict() checkpoint['optimizer'] = optimizer.state_dict() output_file = args.exp_dir + '/model.se.' + str(i) + '.tar' th.save(checkpoint, output_file) if hvd.rank() == 0 and i % args.print_freq == 0: progress.print(i)
def proc_rank(self):
    return hvd.rank()
def _init_distributed_setting(self):
    if self.distributed:
        import horovod.torch as hvd
        self._world_size = hvd.size()
        self._rank_id = hvd.rank()
        self._local_rank_id = hvd.local_rank()
def train(serialized_model, optimizer_cls, model_opt_state_serialized, train_rows, val_rows, avg_row_size): from petastorm import TransformSpec, make_reader, make_batch_reader from petastorm.pytorch import BatchedDataLoader, InMemBatchedDataLoader import torch import horovod.torch as hvd if random_seed is not None: torch.manual_seed(random_seed) # Deserializing objects model_opt_state = torch.load(model_opt_state_serialized) model = deserialize(serialized_model) if loss_fns_pre_train: loss_fns = loss_fns_pre_train if loss_constructors: local_vars = locals() loss_fns = [loss_constructor(**local_vars) for loss_constructor in loss_constructors] # Horovod: initialize library. hvd.init() if user_verbose: import horovod as _horovod print(f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}") # If user specifies any user_shuffle_buffer_size (even 0), we should honor it. if user_shuffle_buffer_size is None: shuffle_buffer_size = \ calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size()) else: if user_shuffle_buffer_size < 0: raise ValueError("user_shuffle_buffer_size cannot be negative!") shuffle_buffer_size = user_shuffle_buffer_size if not should_use_gpu and user_verbose: print("Skip pinning current process to the GPU.") cuda_available = torch.cuda.is_available() if cuda_available and not should_use_gpu: print("GPU is available but use_gpu is set to False." "Training will proceed without GPU support.") cuda_available = False # We need to check all ranks have same device type for traning. # Horovod doesn't support heterogeneous allreduce for gradients. cuda_avail_list = hvd.allgather_object(cuda_available, name='device type') if cuda_avail_list.count(cuda_available) != hvd.size(): raise RuntimeError("All ranks don't have same device type!") if cuda_available: # Horovod: pin GPU to local rank or the assigned GPU from spark. torch.cuda.set_device(_get_assigned_gpu_or_default(default=hvd.local_rank())) # Move model to GPU. model.cuda() # Optimizer object needs to be re-instantiated. Internally, it uses memory addresses of # objects as their identity and therefore it cannot be serialized and then # deserialized. The deserialized optimizer object stores the names of the parameters # with their old memory addresses but in reality those are different than the # reconstructed deserialized object and that creates problem. # Learning rate is a required parameters in SGD optimizer. It will be overridden with # load_state_dict. optimizer = optimizer_cls(model.parameters(), lr=1) optimizer_state = model_opt_state['optimizer'] if last_checkpoint_state is not None: model.load_state_dict(last_checkpoint_state['model']) optimizer.load_state_dict(last_checkpoint_state['optimizer']) else: # scale the learning rate with the number of horovod workers for i in range(len(optimizer_state['param_groups'])): optimizer_state['param_groups'][i]['lr'] = \ optimizer_state['param_groups'][i]['lr'] * hvd.size() optimizer.load_state_dict(optimizer_state) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) for group in optimizer.param_groups: for p in group['params']: if id(p) not in optimizer.state_dict()['state']: p.grad = p.data.new(p.size()).zero_() optimizer.step() hvd.broadcast_optimizer_state(optimizer, root_rank=0) dist_optimizer_args = dict(optimizer=optimizer, named_parameters=model.named_parameters()) if gradient_compression: # Pass the compression arg only if it is specified by the user. 
dist_optimizer_args['compression'] = gradient_compression # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer(**dist_optimizer_args) # This function takes the current optimizer and constructs a new optimizer with the # same state except with learning rate scaled down with the number of horovod workers. # This is important the retraining of the model. User may retrain the model with # different number of workers and we need the raw learning rate to adjust with the # new number of workers. transform_spec = None if transformation: transform_spec = TransformSpec(transformation) schema_fields = feature_columns + label_columns if sample_weight_col: schema_fields.append(sample_weight_col) if train_steps_per_epoch is None: steps_per_epoch = int(math.floor(float(train_rows) / batch_size / hvd.size())) else: steps_per_epoch = train_steps_per_epoch with remote_store.get_local_output_dir() as run_output_dir: logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir) log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None ckpt_file = os.path.join(run_output_dir, remote_store.checkpoint_filename) def save_checkpoint(): model.cpu() optimizer_with_scaled_down_lr = \ get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls, model) state = { 'model': model.state_dict(), 'optimizer': optimizer_with_scaled_down_lr.state_dict(), } torch.save(state, ckpt_file) if cuda_available: model.cuda() if hvd.rank() == 0 and user_verbose: print(f"Training parameters: Epochs: {epochs}\n" f"Train rows: {train_rows}, Train batch size: {batch_size}, Train_steps_per_epoch: {steps_per_epoch}\n" f"Shuffle buffer size: {shuffle_buffer_size}, Random seed: {random_seed}\n" f"Checkpoint file: {ckpt_file}, Logs dir: {logs_dir}\n") # In general, make_batch_reader is faster than make_reader for reading the dataset. # However, we found out that make_reader performs data transformations much faster than # make_batch_reader with parallel worker processes. Therefore, the default reader # we choose is make_batch_reader unless there are data transformations. reader_factory = None reader_factory_kwargs = dict() if transform_spec: reader_factory = make_reader reader_factory_kwargs['pyarrow_serialize'] = True else: reader_factory = make_batch_reader # Petastorm: read data from the store with the correct shard for this rank # setting num_epochs=None will cause an infinite iterator # and enables ranks to perform training and validation with # unequal number of samples with reader_factory(remote_store.train_data_path, num_epochs=None, cur_shard=hvd.rank(), reader_pool_type=reader_pool_type, workers_count=train_reader_worker_count, shard_count=hvd.size(), hdfs_driver=PETASTORM_HDFS_DRIVER, schema_fields=schema_fields, transform_spec=transform_spec, storage_options=storage_options, # Don't shuffle row groups without shuffling. 
shuffle_row_groups=True if shuffle_buffer_size > 0 else False, **reader_factory_kwargs) as train_reader: with reader_factory(remote_store.val_data_path, num_epochs=None, cur_shard=hvd.rank(), reader_pool_type=reader_pool_type, workers_count=val_reader_worker_count, shard_count=hvd.size(), hdfs_driver=PETASTORM_HDFS_DRIVER, schema_fields=schema_fields, transform_spec=transform_spec, storage_options=storage_options, shuffle_row_groups=False, **reader_factory_kwargs) \ if should_validate else empty_batch_reader() as val_reader: if inmemory_cache_all: # Petastorm introduced InMemBatchedDataLoader class in v0.11.0 train_loader = InMemBatchedDataLoader(train_reader, batch_size=batch_size, num_epochs=epochs, rows_capacity=steps_per_epoch*batch_size, shuffle=True) else: train_loader = BatchedDataLoader(train_reader, batch_size=batch_size, shuffling_queue_capacity=shuffle_buffer_size) train_loader_iter = iter(train_loader) def prepare_batch(row): inputs = [ prepare_np_data( row[col].float(), col, metadata).reshape(shape) for col, shape in zip(feature_columns, input_shapes)] labels = [ prepare_np_data( row[col].float(), col, metadata) for col in label_columns] sample_weights = row.get(sample_weight_col, None) if sample_weights is not None: sample_weights = sample_weights.float() if cuda_available: inputs = [input.cuda() for input in inputs] labels = [label.cuda() for label in labels] if sample_weights is not None: sample_weights = sample_weights.cuda() return inputs, labels, sample_weights def transform_outputs(outputs, labels): if not isinstance(outputs, tuple) and not isinstance(outputs, list): outputs = [outputs] # reshape labels to match the output shape of the model if hasattr(outputs[0], 'shape'): if label_shapes: labels = [label.reshape(label_shape) for label, label_shape in zip(labels, label_shapes)] else: # If label_shapes parameter is not provided, reshape the label # columns data to match the shape of the model output labels = [label.reshape(output.shape) if output.shape.numel() == label.shape.numel() else label for label, output in zip(labels, outputs)] return outputs, labels def aggregate_metrics(stage, epoch, loss, metric_value_groups): all_metric_groups_values = get_metric_avgs(metric_value_groups) if remote_store.saving_runs: write_metrics_summary( stage, epoch, loss, all_metric_groups_values, log_writer) return { loss.name: loss.avg.item(), 'all_metrics': all_metric_groups_values } def loss_fn(outputs, labels, sample_weights): loss = calculate_loss(outputs, labels, loss_weights, loss_fns, sample_weights) return loss def print_metrics(batch_idx, loss, metric_value_groups, phase): if user_verbose > 0 and hvd.rank() == 0 and \ batch_idx % METRIC_PRINT_FREQUENCY == 0: print("{phase}\tepoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}". 
format(phase=phase, epoch=epoch, batch_idx=batch_idx, metrics=aggregate_metrics(phase, epoch, loss, metric_value_groups))) def _train(epoch): model.train() train_loss = metric_cls('loss', hvd) metric_value_groups = construct_metric_value_holders( metric_cls, metric_fn_groups, label_columns, hvd) # iterate on one epoch for batch_idx in range(steps_per_epoch): row = next(train_loader_iter) inputs, labels, sample_weights = prepare_batch(row) outputs, loss = train_minibatch(model, optimizer, transform_outputs, loss_fn, inputs, labels, sample_weights) update_metrics(metric_value_groups, outputs, labels) train_loss.update(loss) print_metrics(batch_idx, train_loss, metric_value_groups, 'train') return aggregate_metrics('train', epoch, train_loss, metric_value_groups) if should_validate: if validation_steps_per_epoch is None: validation_steps = int(math.ceil(float(val_rows) / val_batch_size / hvd.size())) else: validation_steps = validation_steps_per_epoch if hvd.rank() == 0 and user_verbose: print(f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {validation_steps}\n") if inmemory_cache_all: # Petastorm introduced InMemBatchedDataLoader class in v0.11.0 val_loader = InMemBatchedDataLoader(val_reader, batch_size=val_batch_size, num_epochs=epochs, rows_capacity=validation_steps*val_batch_size, shuffle=False) else: val_loader = BatchedDataLoader(val_reader, batch_size=val_batch_size, shuffling_queue_capacity=0) val_loader_iter = iter(val_loader) def _validate(epoch): model.eval() val_loss = metric_cls('loss', hvd) metric_value_groups = construct_metric_value_holders( metric_cls, metric_fn_groups, label_columns, hvd) # iterate on one epoch for batch_idx in range(validation_steps): row = next(val_loader_iter) inputs, labels, sample_weights = prepare_batch(row) outputs = model(*inputs) outputs, labels = transform_outputs(outputs, labels) loss = calculate_loss( outputs, labels, loss_weights, loss_fns, sample_weights) val_loss.update(loss) update_metrics(metric_value_groups, outputs, labels) print_metrics(batch_idx, val_loss, metric_value_groups, 'val') return aggregate_metrics('val', epoch, val_loss, metric_value_groups) history = [] for epoch in range(epochs): epoch_metrics = { 'epoch': epoch, 'train': _train(epoch) } if should_validate: epoch_metrics['validation'] = _validate(epoch) if user_verbose > 0: pdt_dt = datetime.now(timezone.utc) pdt_time_str = pdt_dt.strftime("%Y-%b-%d %H:%M:%S UTC") print(pdt_time_str, epoch_metrics) history.append(epoch_metrics) if hvd.rank() == 0: # Save model after every epoch save_checkpoint() if remote_store.saving_runs: remote_store.sync(run_output_dir) if hvd.rank() == 0: best_checkpoint = torch.load(ckpt_file) serialized_checkpoint = io.BytesIO() torch.save(best_checkpoint, serialized_checkpoint) serialized_checkpoint.seek(0) return history, serialized_checkpoint
def simple_fn(num_epochs):
    import horovod.torch as hvd
    hvd.init()
    return hvd.rank() * num_epochs
args.cuda = not args.no_cuda and torch.cuda.is_available()

# Horovod: initialize library.
hvd.init()
torch.manual_seed(args.seed)

if args.cuda:
    # Horovod: pin GPU to local rank.
    torch.cuda.set_device(hvd.local_rank())
    torch.cuda.manual_seed(args.seed)

cudnn.benchmark = True

kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
train_dataset = \
    datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
# Horovod: use DistributedSampler to partition the training data.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)

test_dataset = \
    datasets.MNIST('data-%d' % hvd.rank(), train=False,
                   transform=transforms.Compose([
                       transforms.ToTensor(),
                       transforms.Normalize((0.1307,), (0.3081,))
                   ]))
def train_main(args, splits): # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) if torch.cuda.is_available(): # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) rank = hvd.rank() model = MyModel(annotation, use_bn=False) # By default, Adasum doesn"t need scaling up learning rate. if torch.cuda.is_available(): # Move model to GPU. model.cuda() optimizers = construct_optimizers(model) loss_function = huber_loss # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) for opt in optimizers: hvd.broadcast_optimizer_state(opt, root_rank=0) def _train(epoch, train_dataset): model.train() # Horovod: set epoch to sampler for shuffling. # train_dataset.set_epoch(epoch) start_epoch = timeit.default_timer() last_batch_time = start_epoch batch_wait_times = [] for batch_idx, (data, target) in enumerate(train_dataset): batch_wait_times.append(timeit.default_timer() - last_batch_time) if torch.cuda.is_available(): data = data.cuda() target = target.cuda() for opt in optimizers: opt.zero_grad() batch = OrderedDict() batch["embeddings"] = OrderedDict() batch["one_hot"] = OrderedDict() for i, name in enumerate(annotation["embeddings"]): batch["embeddings"][name] = data[:, i : i + 1] batch["one_hot"]["hot0"] = data[:, -2:-1] batch["one_hot"]["hot1"] = data[:, -1:] batch_pred = model(batch) if batch_idx % args.log_interval == 0: print( f"Processing batch {batch_idx} in epoch {epoch} on worker " f"{rank}." ) time.sleep(args.mock_train_step_time) loss = loss_function(batch_pred, target, delta=60) loss.mean().backward() for opt in optimizers: opt.step() last_batch_time = timeit.default_timer() epoch_duration = timeit.default_timer() - start_epoch avg_batch_wait_time = np.mean(batch_wait_times) std_batch_wait_time = np.std(batch_wait_times) max_batch_wait_time = np.max(batch_wait_times) min_batch_wait_time = np.min(batch_wait_times) print( f"\nEpoch {epoch}, worker {rank} stats over " f"{len(batch_wait_times)} steps: {epoch_duration:.3f}" ) print( f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- " f"{std_batch_wait_time}" ) print(f"Max batch wait time: {max_batch_wait_time:.3f}s") print(f"Min batch wait time: {min_batch_wait_time:.3f}s") return batch_wait_times print(f"Starting training on worker {rank}.") batch_wait_times = [] for epoch, split_ds in enumerate(splits[rank].iter_epochs()): train_dataset = create_torch_iterator(split_ds, args.batch_size, rank) new_batch_times = _train(epoch, train_dataset) new_batch_times.pop(0) batch_wait_times.extend(new_batch_times) print(f"Done training on worker {rank}.") avg_batch_wait_time = np.mean(batch_wait_times) std_batch_wait_time = np.std(batch_wait_times) max_batch_wait_time = np.max(batch_wait_times) min_batch_wait_time = np.min(batch_wait_times) print(f"\nWorker {rank} training stats over {args.epochs} epochs:") print( f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- " f"{std_batch_wait_time}" ) print(f"Max batch wait time: {max_batch_wait_time:.3f}s") print(f"Min batch wait time: {min_batch_wait_time:.3f}s")
def train(config, checkpoint_dir=None): import horovod.torch as hvd hvd.init() device = torch.device("cuda" if torch.cuda.is_available() else "cpu") net = ResNet18(None).to(device) optimizer = torch.optim.SGD( net.parameters(), lr=config["lr"], ) epoch = 0 if checkpoint_dir: with open(os.path.join(checkpoint_dir, "checkpoint")) as f: model_state, optimizer_state, epoch = torch.load(f) net.load_state_dict(model_state) optimizer.load_state_dict(optimizer_state) criterion = nn.CrossEntropyLoss() optimizer = hvd.DistributedOptimizer(optimizer) np.random.seed(1 + hvd.rank()) torch.manual_seed(1234) # To ensure consistent initialization across slots, hvd.broadcast_parameters(net.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) trainset = ray.get(config["data"]) trainloader = DataLoader(trainset, batch_size=int(config["batch_size"]), shuffle=True, num_workers=4) for epoch in range(epoch, 40): # loop over the dataset multiple times running_loss = 0.0 epoch_steps = 0 for i, data in enumerate(trainloader): # get the inputs; data is a list of [inputs, labels] inputs, labels = data inputs, labels = inputs.to(device), labels.to(device) # zero the parameter gradients optimizer.zero_grad() # forward + backward + optimize outputs = net(inputs) loss = criterion(outputs, labels) loss.backward() optimizer.step() # print statistics running_loss += loss.item() epoch_steps += 1 tune.report(loss=running_loss / epoch_steps) if i % 2000 == 1999: # print every 2000 mini-batches print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1, running_loss / epoch_steps)) with distributed_checkpoint_dir(step=epoch) as checkpoint_dir: print("this checkpoint dir: ", checkpoint_dir) path = os.path.join(checkpoint_dir, "checkpoint") torch.save((net.state_dict(), optimizer.state_dict(), epoch), path)
    if hvd.rank() == 0:
        print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format(
            test_loss, 100. * test_accuracy))


if __name__ == '__main__':
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if args.hvd:
        local_rank = hvd.local_rank()
        rank = hvd.rank()
        local_size = hvd.local_size()
        size = hvd.size()
    else:
        local_rank = 0
        rank = 0
        local_size = 1
        size = 1

    if args.cuda:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
def single_point(self, with_tqdm=True, hdf5_group='single_point'):
    """Performs a single point calculation

    Args:
        with_tqdm (bool, optional): use tqdm for sampling. Defaults to True.
        hdf5_group (str, optional): hdf5 group where to store the data.
            Defaults to 'single_point'.

    Returns:
        SimpleNamespace: contains the local energy, positions, ...
    """

    logd(hvd.rank(), '')
    logd(
        hvd.rank(),
        ' Single Point Calculation : {nw} walkers | {ns} steps'.format(
            nw=self.sampler.nwalkers, ns=self.sampler.nstep))

    # check if we have to compute and store the grads
    grad_mode = torch.no_grad()
    if self.wf.kinetic == 'auto':
        grad_mode = torch.enable_grad()

    # distribute the calculation
    num_threads = 1
    hvd.broadcast_parameters(self.wf.state_dict(), root_rank=0)
    torch.set_num_threads(num_threads)

    with grad_mode:

        # sample the wave function
        pos = self.sampler(self.wf.pdf)
        if self.wf.cuda and pos.device.type == 'cpu':
            pos = pos.to(self.device)

        # compute energy/variance/error
        eloc = self.wf.local_energy(pos)
        e, s, err = torch.mean(eloc), torch.var(
            eloc), self.wf.sampling_error(eloc)

        # gather all data
        eloc_all = hvd.allgather(eloc, name='local_energies')
        e, s, err = torch.mean(eloc_all), torch.var(
            eloc_all), self.wf.sampling_error(eloc_all)

        # print
        if hvd.rank() == 0:
            log.options(style='percent').info(
                ' Energy : %f +/- %f' % (e.detach().item(), err.detach().item()))
            log.options(style='percent').info(
                ' Variance : %f' % s.detach().item())

        # dump data to hdf5
        obs = SimpleNamespace(pos=pos,
                              local_energy=eloc_all,
                              energy=e,
                              variance=s,
                              error=err)

        # dump to file
        if hvd.rank() == 0:
            dump_to_hdf5(obs, self.hdf5file, root_name=hdf5_group)
            add_group_attr(self.hdf5file, hdf5_group,
                           {'type': 'single_point'})

    return obs
def test_broadcast_state(self): hvd.init() N, D_in, H, D_out = 64, 100, 10, 10 x = torch.autograd.Variable(torch.randn(N, D_in), requires_grad=True) y = torch.autograd.Variable(torch.randn(N, D_out), requires_grad=False) def create_model(create_opt): model = torch.nn.Sequential( torch.nn.Linear(D_in, H), torch.nn.ReLU(), torch.nn.Linear(H, D_out), ) optimizer = create_opt(model) optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) return model, optimizer def get_model_param_values(model): params = sorted(model.state_dict().items()) return [(k, v.clone()) for k, v in params] def get_optimizer_param_values(optimizer): results = [] state_dict = optimizer.state_dict() for group in state_dict['param_groups']: for param_id in group['params']: params = sorted(state_dict['state'][param_id].items()) for k, v in params: results.append( (k, v.clone() if torch.is_tensor(v) else v)) return results opt_params = dict(lr=0.2, momentum=0.9, weight_decay=0.1, centered=True) def new_optimizer(cls): p = { k: v for k, v in opt_params.items() if k in inspect.getargspec(cls.__init__).args } return lambda m: cls(m.parameters(), **p) # L-BFGS is currently unsupported, as are sparse tensors, which are # required by SparseAdam optimizer optimizers = [ (subclass.__name__, new_optimizer(subclass)) for subclass in torch.optim.Optimizer.__subclasses__() if subclass.__module__.startswith('torch.optim') and subclass != torch.optim.LBFGS and subclass != torch.optim.SparseAdam ] optimizers.sort() for opt_name, create_opt in optimizers: model, optimizer = create_model(create_opt) y_pred = model(x) loss = F.mse_loss(y_pred, y, size_average=False) optimizer.zero_grad() loss.backward() optimizer.step() model_param_values = get_model_param_values(model) for name, model_param_value in model_param_values: hvd.broadcast_(model_param_value, root_rank=0) opt_param_values_updated = [] opt_param_values = get_optimizer_param_values(optimizer) for name, opt_param_value in opt_param_values: is_tensor = torch.is_tensor(opt_param_value) if not is_tensor: t = type(opt_param_value) opt_param_value = torch.Tensor([opt_param_value]) hvd.broadcast_(opt_param_value, root_rank=0) if not is_tensor: opt_param_value = t(opt_param_value.numpy()[0]) opt_param_values_updated.append((name, opt_param_value)) opt_param_values = opt_param_values_updated if hvd.rank() == 0: state = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), } _, fname = tempfile.mkstemp('.pt') torch.save(state, fname) model, optimizer = create_model(create_opt) if hvd.rank() == 0: checkpoint = torch.load(fname) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) os.remove(fname) hvd.broadcast_parameters(model.state_dict(), root_rank=0) model_param_value_after = get_model_param_values(model) for before, after in zip(model_param_values, model_param_value_after): name, model_param_value = before name_after, model_param_value_after = after self.assertEqual(name, name_after) self.assertEqual(type(model_param_value), type(model_param_value_after)) self.assertTrue( (model_param_value == model_param_value_after).all()) hvd.broadcast_optimizer_state(optimizer, root_rank=0) self.assertEqual(len(optimizer.state_dict()['state'].values()), 4) opt_param_values_after = get_optimizer_param_values(optimizer) for before, after in zip(opt_param_values, opt_param_values_after): name, opt_param_value = before name_after, opt_param_value_after = after self.assertEqual(name, name_after) 
self.assertEqual(type(opt_param_value), type(opt_param_value_after)) if torch.is_tensor(opt_param_value): self.assertTrue( (opt_param_value == opt_param_value_after).all()) else: self.assertEqual(opt_param_value, opt_param_value_after)
def train(state, dir):
    state.rendezvous += 1
    logging.info('rank %s: rendezvous %s', hvd.rank(), state.rendezvous)

    for state.epoch in range(state.epoch, epochs):
        logging.info('rank %s: start epoch %s at batch %s',
                     hvd.rank(), state.epoch, state.batch)

        for state.batch in range(state.batch, batches_per_epoch):
            check_fail(dir, hvd.rank(), state.epoch, state.batch)

            optimizer.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            optimizer.step()

            # TODO: this sleep makes the fault tolerant test fail
            # torch all gather throws an RuntimeError which should be a HorovodInternalError
            #import time
            #time.sleep(0.2)

            if state.batch % batches_per_commit == 0:
                logging.info('rank %s: allgather', hvd.rank())
                hvd.allgather(torch.tensor(
                    [hvd.rank(), state.epoch, state.batch, state.rendezvous]),
                    'state').tolist()
                logging.info('rank %s: commit epoch %s batch %s',
                             hvd.rank(), state.epoch, state.batch)
                state.commits += 1
                state.commit()

        logging.info('rank %s: allgather', hvd.rank())
        hvd.allgather(torch.tensor(
            [hvd.rank(), state.epoch, state.batch, state.rendezvous]),
            'state').tolist()
        logging.info('rank %s: commit epoch %s', hvd.rank(), state.epoch)
        state.commits += 1
        state.commit()
        state.batch = 0

    res = hvd.allgather(torch.tensor(
        [hvd.rank(), state.epoch, state.batch, state.rendezvous]),
        'state').tolist()
    logging.info('rank %s: returning', hvd.rank())
    return res, hvd.rank()
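# A state-aware train function like the one above is typically driven by
# Horovod's elastic API: the function is decorated with hvd.elastic.run and
# receives a TorchState that carries the model, optimizer and bookkeeping
# fields (epoch, batch, commits, ...), which are restored after a rescale.
# This is only a minimal sketch under those assumptions; model, optimizer and
# epochs come from the surrounding script, and the exact wiring in the
# original test may differ.
import horovod.torch as hvd

@hvd.elastic.run
def elastic_train(state):
    for state.epoch in range(state.epoch, epochs):
        # ... run one epoch of training on state.model / state.optimizer ...
        state.commit()

state = hvd.elastic.TorchState(model=model, optimizer=optimizer,
                               epoch=0, batch=0)
elastic_train(state)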
def run(self, nepoch, batchsize=None, loss='energy', clip_loss=False,
        grad='manual', hdf5_group='wf_opt', num_threads=1, chkpt_every=None):
    """Run the optimization

    Args:
        nepoch (int): Number of optimization steps
        batchsize (int, optional): Number of samples in a mini batch.
            If None, all samples are used. Defaults to None.
        loss (str, optional): method to compute the loss: variance or energy.
            Defaults to 'energy'.
        clip_loss (bool, optional): Clip the loss values at +/- 5std.
            Defaults to False.
        grad (str, optional): method to compute the gradients: 'auto' or
            'manual'. Defaults to 'manual'.
        hdf5_group (str, optional): name of the hdf5 group where to store
            the data. Defaults to 'wf_opt'.
        num_threads (int, optional): number of torch threads per process.
            Defaults to 1.
        chkpt_every (int, optional): save a checkpoint every `chkpt_every`
            epochs. Defaults to None (no checkpointing).
    """
    logd(hvd.rank(), '')
    logd(hvd.rank(),
         '  Distributed Optimization on {num} processes'.format(
             num=hvd.size()))
    log.info('   - Process {id} using {nw} walkers'.format(
        id=hvd.rank(), nw=self.sampler.nwalkers))

    # observable
    if not hasattr(self, 'observable'):
        self.track_observable(['local_energy'])

    self.evaluate_gradient = {
        'auto': self.evaluate_grad_auto,
        'manual': self.evaluate_grad_manual
    }[grad]

    if 'lpos_needed' not in self.opt.__dict__.keys():
        self.opt.lpos_needed = False

    self.wf.train()

    hvd.broadcast_parameters(self.wf.state_dict(), root_rank=0)
    torch.set_num_threads(num_threads)

    # get the loss
    self.loss = Loss(self.wf, method=loss, clip=clip_loss)
    self.loss.use_weight = (self.resampling_options.resample_every > 1)

    # orthogonalization penalty for the MO coeffs
    self.ortho_loss = OrthoReg()

    self.prepare_optimization(batchsize, chkpt_every)

    # log data
    if hvd.rank() == 0:
        self.log_data_opt(nepoch, 'wave function optimization')

    # sample the wave function
    if hvd.rank() == 0:
        pos = self.sampler(self.wf.pdf)
    else:
        pos = self.sampler(self.wf.pdf, with_tqdm=False)

    # required to build the distributed data container
    pos.requires_grad_(False)

    # handle the batch size
    if batchsize is None:
        batchsize = len(pos)

    # get the initial observable
    if hvd.rank() == 0:
        self.store_observable(pos)

    # change the number of steps/walker size
    _nstep_save = self.sampler.nstep
    _ntherm_save = self.sampler.ntherm
    _nwalker_save = self.sampler.walkers.nwalkers
    if self.resampling_options.mode == 'update':
        self.sampler.ntherm = -1
        self.sampler.nstep = self.resampling_options.nstep_update
        self.sampler.walkers.nwalkers = pos.shape[0]
        self.sampler.nwalkers = pos.shape[0]

    # create the data loader
    self.dataset = DataSet(pos)

    if self.cuda:
        kwargs = {'num_workers': num_threads, 'pin_memory': True}
    else:
        kwargs = {'num_workers': num_threads}

    self.dataloader = DataLoader(self.dataset,
                                 batch_size=batchsize,
                                 **kwargs)

    min_loss = 1E3

    for n in range(nepoch):
        tstart = time()
        logd(hvd.rank(), '')
        logd(hvd.rank(), '  epoch %d' % n)

        cumulative_loss = 0.
        for ibatch, data in enumerate(self.dataloader):

            # get data
            lpos = data.to(self.device)
            lpos.requires_grad = True

            # get the gradient
            loss, eloc = self.evaluate_gradient(lpos)
            cumulative_loss += loss

            # optimize the parameters
            self.optimization_step(lpos)

            # observable
            if hvd.rank() == 0:
                self.store_observable(pos, local_energy=eloc, ibatch=ibatch)

        cumulative_loss = self.metric_average(cumulative_loss, 'cum_loss')

        if hvd.rank() == 0:

            if n == 0 or cumulative_loss < min_loss:
                self.observable.models.best = dict(self.wf.state_dict())
                min_loss = cumulative_loss

            if self.chkpt_every is not None:
                if (n > 0) and (n % chkpt_every == 0):
                    self.save_checkpoint(n, cumulative_loss)

            self.print_observable(cumulative_loss)

        # resample the data
        pos = self.resample(n, pos)
        pos.requires_grad = False

        # scheduler step
        if self.scheduler is not None:
            self.scheduler.step()

        logd(hvd.rank(), '  epoch done in %1.2f sec.' % (time() - tstart))

    # restore the sampler number of steps
    self.sampler.nstep = _nstep_save
    self.sampler.ntherm = _ntherm_save
    self.sampler.walkers.nwalkers = _nwalker_save
    self.sampler.nwalkers = _nwalker_save

    if hvd.rank() == 0:
        dump_to_hdf5(self.observable, self.hdf5file, hdf5_group)
        add_group_attr(self.hdf5file, hdf5_group, {'type': 'opt'})

    return self.observable
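The epoch loop above averages `cumulative_loss` across workers via `self.metric_average`, whose implementation is not shown here. A hedged sketch of what such a helper commonly looks like; this is an assumption for illustration, not the class's actual code:

# Hedged sketch of a metric_average-style helper: average a Python scalar
# across all Horovod workers.
import torch
import horovod.torch as hvd

def metric_average(val, name):
    tensor = torch.tensor(val, dtype=torch.float32)
    # hvd.allreduce averages by default, so this returns the mean over ranks.
    avg_tensor = hvd.allreduce(tensor, name=name)
    return avg_tensor.item()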
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format( device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True hps_file = f'{opts.output_dir}/log/hps.json' model_opts = Struct(json.load(open(hps_file))) model_config = f'{opts.output_dir}/log/model_config.json' # load DBs and image dirs video_ids = get_video_ids(opts.query_txt_db) video_db = load_video_sub_dataset( opts.vfeat_db, opts.sub_txt_db, model_opts.vfeat_interval, model_opts) assert opts.split in opts.query_txt_db q_txt_db = QaQueryTokLmdb(opts.query_txt_db, -1) eval_dataset = ViolinEvalDataset( video_ids, video_db, q_txt_db, sampled_by_q=model_opts.sampled_by_q) collate_fn = violin_eval_collate # Prepare model if exists(opts.checkpoint): ckpt_file = opts.checkpoint else: ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt' checkpoint = torch.load(ckpt_file) img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\ ".position_embeddings.weight" assert img_pos_embed_weight_key in checkpoint max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) model = HeroForViolin.from_pretrained( model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len ) model.to(device) if opts.fp16: model = amp.initialize(model, enabled=opts.fp16, opt_level='O2') eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=collate_fn) eval_dataloader = PrefetchLoader(eval_dataloader) _, results, logits = validate_violin( model, eval_dataloader, opts.split, opts.save_logits) result_dir = f'{opts.output_dir}/results_{opts.split}' if opts.save_logits: result_dir += '_w_logit' if not exists(result_dir) and hvd.rank() == 0: os.makedirs(result_dir) all_results = {} for id2res in all_gather_list(results): all_results.update(id2res) if opts.save_logits: all_logits = {} for id2logit in all_gather_list(logits): all_logits.update(id2logit) if hvd.rank() == 0: save_json( all_results, f'{result_dir}/results_{opts.checkpoint}_all.json') LOGGER.info('All results written......') if opts.save_logits: save_pickle( all_logits, f'{result_dir}/logits_{opts.checkpoint}_all.pkl') LOGGER.info('All logits written......')
print('\n\n') torch.manual_seed(args.seed) device = torch.device('cuda' if torch.cuda.is_available() else 'cpu') os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu if not os.path.exists(os.path.join(args.save_folder, 'inference')): os.makedirs(os.path.join(args.save_folder, 'inference')) # Horovod settings hvd.init() torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(hvd.size()) args.distributed = hvd.size() > 1 args.rank = hvd.rank() args.size = hvd.size() # CREATE THE NETWORK ARCHITECTURE AND LOAD THE BEST MODEL if args.heatmaps: from models.bonet_heatmap import BoNet else: from models.bonet import BoNet net = BoNet() if args.rank == 0: print('---> Number of params: {}'.format( sum([p.data.nelement() for p in net.parameters()]))) if osp.exists(args.snapshot):
import torch
from torchvision import datasets, transforms
import torch.utils.data.distributed
from distutils.version import LooseVersion as LV
import os

import horovod.torch as hvd

torch.manual_seed(42)

if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

hvd.init()
if torch.cuda.is_available():
    # pin this process to the GPU matching its local rank
    torch.cuda.set_device(hvd.local_rank())

if hvd.rank() == 0:
    print('Using PyTorch version:', torch.__version__, ' Device:', device)
assert LV(torch.__version__) >= LV("1.0.0")

subpath = 'dogs-vs-cats/train-2000'
if 'DATADIR' in os.environ:
    DATADIR = os.environ['DATADIR']
else:
    DATADIR = "/scratch/project_2002675/extracted"

datapath = os.path.join(DATADIR, subpath)
if hvd.rank() == 0:
    print('Reading data from path:', datapath)
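The script above stops after resolving `datapath`. A hedged continuation sketch of how such a script typically shards the image data across workers; it reuses the imports above, and the subfolder name, transforms, and batch size are illustrative assumptions rather than the original values:

# Hedged continuation sketch: 'train' subfolder, transforms, and batch size
# are assumptions for illustration only.
train_dataset = datasets.ImageFolder(
    root=os.path.join(datapath, 'train'),
    transform=transforms.Compose([
        transforms.Resize((160, 160)),
        transforms.ToTensor(),
    ]))

# Shard the dataset so each worker sees a distinct subset.
train_sampler = torch.utils.data.distributed.DistributedSampler(
    train_dataset, num_replicas=hvd.size(), rank=hvd.rank())

train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=25, sampler=train_sampler)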
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-exp_dir")
    parser.add_argument("-dataPath", default='', type=str,
                        help="path of data files")
    parser.add_argument("-train_config")
    parser.add_argument("-data_config")
    parser.add_argument("-lr", default=0.0001, type=float,
                        help="Override the LR in the config")
    parser.add_argument("-batch_size", default=32, type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads", default=1, type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm", default=5, type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size", default=200, type=float,
                        help="process n hours of data per sweep (default:200)")
    parser.add_argument("-num_epochs", default=1, type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument("-global_mvn", default=False, type=bool,
                        help="if apply global mean and variance normalization")
    parser.add_argument("-resume_from_model", type=str,
                        help="the model from which you want to resume training")
    parser.add_argument("-dropout", type=float, help="set the dropout ratio")
    parser.add_argument("-aneal_lr_epoch", default=2, type=int,
                        help="start to anneal the learning rate from this epoch")
    parser.add_argument("-aneal_lr_ratio", default=0.5, type=float,
                        help="the ratio to anneal the learning rate")
    parser.add_argument('-p', '--print-freq', default=100, type=int,
                        metavar='N', help='print frequency (default: 100)')

    args = parser.parse_args()

    with open(args.train_config) as f:
        config = yaml.safe_load(f)

    config["sweep_size"] = args.sweep_size

    with open(args.data_config) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]
    if 'dir_noise' in data:
        config["dir_noise_paths"] = [j for i, j in data['dir_noise'].items()]
    if 'rir' in data:
        config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()
    th.cuda.set_device(hvd.local_rank())
    print("Run experiments with world size {}".format(hvd.size()))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    trainset = SpeechDataset(config)
    train_dataloader = ChunkDataloader(trainset,
                                       batch_size=args.batch_size,
                                       distributed=True,
                                       num_workers=args.data_loader_threads)

    if args.global_mvn:
        transform = reader.preprocess.GlobalMeanVarianceNormalization()
        print("Estimating global mean and variance of feature vectors...")
        transform.learn_mean_and_variance_from_train_loader(
            train_dataloader,
            train_dataloader.stream_keys_for_transform,
            n_sample_to_use=2000)
        train_dataloader.transform = transform
        print("Global mean and variance transform trained successfully!")

        with open(args.exp_dir + "/transform.pkl", 'wb') as f:
            pickle.dump(transform, f, pickle.HIGHEST_PROTOCOL)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2,
                   model_config["label_size"])

    # Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    # Broadcast parameters and optimizer state from rank 0 to all other
    # processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:

        assert os.path.isfile(args.resume_from_model), \
            "ERROR: model file {} does not exist!".format(args.resume_from_model)

        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' ".format(args.resume_from_model))

    model.train()

    for epoch in range(start_epoch, args.num_epochs):

        # anneal learning rate
        if epoch > args.aneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.aneal_lr_ratio

        run_train_epoch(model, optimizer, criterion, train_dataloader,
                        epoch, args)

        # save model
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
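One knob the script above leaves at its single-process value is the learning rate. A common Horovod convention, not something this script does, is to scale the serial learning rate by the worker count, since the effective batch size grows with hvd.size(); a minimal sketch with placeholder names:

# Illustrative only: scale the single-worker learning rate by the number of
# Horovod workers; the model and base_lr are placeholders.
import torch as th
import horovod.torch as hvd

hvd.init()
model = th.nn.Linear(40, 10)   # placeholder model
base_lr = 0.0001               # the single-process learning rate
optimizer = th.optim.Adam(model.parameters(),
                          lr=base_lr * hvd.size(), amsgrad=True)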
OPTS.model_path = os.path.join(HOME_DIR, "checkpoints", "lvm", OPTS.dtok, os.path.basename(OPTS.model_path)) OPTS.result_path = os.path.join(HOME_DIR, "checkpoints", "lvm", OPTS.dtok, os.path.basename(OPTS.result_path)) os.makedirs(os.path.dirname(OPTS.model_path), exist_ok=True) #OPTS.fixbug2 = True # Determine the number of GPUs to use horovod_installed = importlib.util.find_spec("horovod") is not None if envswitch.who() != "shu": horovod_installed = False if torch.cuda.is_available() and horovod_installed: import horovod.torch as hvd hvd.init() torch.cuda.set_device(hvd.local_rank()) part_index = hvd.rank() part_num = hvd.size() gpu_num = hvd.size() else: part_index = 0 part_num = 1 gpu_num = 1 # Tensorboard Logging tb_logdir = None OPTS.trains_task = None if is_root_node(): print("Running on {} GPUs".format(gpu_num)) if OPTS.tensorboard: try: from trains import Task
def fn(): hvd.init() res = hvd.allgather(torch.tensor([hvd.rank()])).tolist() return res, hvd.rank()
# If set > 0, will resume training from a given checkpoint. resume_from_epoch = 0 for try_epoch in range(args.epochs, 0, -1): if os.path.exists(args.checkpoint_format.format(epoch=try_epoch)): resume_from_epoch = try_epoch break # Horovod: broadcast resume_from_epoch from rank 0 (which will have # checkpoints) to other ranks. resume_from_epoch = hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0, name='resume_from_epoch').item() # Horovod: print logs on the first worker. verbose = 1 if hvd.rank() == 0 else 0 # Horovod: write TensorBoard logs on first worker. try: if LooseVersion(torch.__version__) >= LooseVersion('1.2.0'): from torch.utils.tensorboard import SummaryWriter else: from tensorboardX import SummaryWriter log_writer = SummaryWriter(args.log_dir) if hvd.rank() == 0 else None except ImportError: log_writer = None # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(4) kwargs = {'num_workers': 4, 'pin_memory': True} if args.cuda else {}
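Further down, scripts of this shape typically restore the checkpoint on rank 0 and broadcast the result so every worker resumes from identical state. A hedged sketch of that step; `model`, `optimizer`, and `args.checkpoint_format` are assumed to be defined as in the surrounding script, and the checkpoint keys are an assumption about its save format:

# Hedged sketch: restore on rank 0 only, then broadcast to the other ranks.
if resume_from_epoch > 0 and hvd.rank() == 0:
    filepath = args.checkpoint_format.format(epoch=resume_from_epoch)
    checkpoint = torch.load(filepath)
    model.load_state_dict(checkpoint['model'])          # assumed key
    optimizer.load_state_dict(checkpoint['optimizer'])  # assumed key

# Make sure every worker starts from the rank-0 weights and optimizer state.
hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)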
def simple_fn(worker): import horovod.torch as hvd hvd.init() return hvd.rank()
help='how many batches to wait before logging training status') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() hvd.init() torch.manual_seed(args.seed) if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) test_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) test_sampler = torch.utils.data.distributed.DistributedSampler(
def global_rank(self) -> int: return hvd.rank()
def backward(ctx, grad_output): grad_reduced = allreduce(grad_output, average=False) if rank() != ctx.root_rank: grad_reduced *= 0 return grad_reduced, None, None
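For context, a simplified reconstruction (not the library's exact source, and the class name is illustrative) of the autograd Function this backward belongs to: the forward broadcasts from root_rank, and the backward sums gradients across workers but keeps them only on the root, which is what the snippet above implements.

# Simplified reconstruction for illustration; the real implementation uses
# asynchronous ops, but the gradient logic matches the snippet above.
import torch
from horovod.torch import allreduce, broadcast, rank

class BroadcastFn(torch.autograd.Function):
    @staticmethod
    def forward(ctx, tensor, root_rank, name):
        ctx.root_rank = root_rank
        # Every worker receives root_rank's tensor.
        return broadcast(tensor, root_rank, name=name)

    @staticmethod
    def backward(ctx, grad_output):
        # Sum the incoming gradients across workers ...
        grad_reduced = allreduce(grad_output, average=False)
        # ... but only the root owns the broadcast value, so the other ranks
        # must not apply the gradient.
        if rank() != ctx.root_rank:
            grad_reduced *= 0
        return grad_reduced, None, None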
def train(self):
    # eval(cls) resolves the dataset class by name (e.g. DAVISDataset) and
    # (**params) passes its constructor arguments, e.g. DAVISDataset(**params).
    # The resulting datasets are then concatenated.
    dset = ConcatDataset(
        [eval(cls)(**params) for cls, params in self.dataset])

    # Partition dataset among workers using DistributedSampler
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        dset, num_replicas=hvd.size(), rank=hvd.rank())
    loader = DataLoader(dset, batch_size=self.batch_size,
                        sampler=train_sampler,
                        num_workers=self.num_workers,
                        pin_memory=True, shuffle=False)

    # Add Horovod Distributed Optimizer
    backward_passes_per_step = dset.datasets[0].sample_size - 1
    # e.g. 3 frames lead to 2 backward() calls per step
    self.optimizer = hvd.DistributedOptimizer(
        self.optimizer,
        named_parameters=self.model.named_parameters(),
        backward_passes_per_step=backward_passes_per_step)

    # Broadcast parameters from rank 0 to all other processes.
    hvd.broadcast_parameters(self.model.state_dict(), root_rank=0)

    for epoch in range(self.epoch + 1, self.max_epochs + 1):
        self.epoch = epoch
        self.stats = ddict(AverageMeter)

        t0 = None
        runtime = AverageMeter()
        for i, batch in enumerate(loader, 1):
            t0 = time() if t0 is None else t0  # Ignore loader startup pause

            self.optimizer.zero_grad()
            stats = self.model(*batch)
            self.optimizer.step()

            runtime.update(time() - t0)
            t0 = time()

            stats['stats/lr'] = self.scheduler.get_last_lr()[0]
            self.update_stats(stats, i, len(loader), runtime, do_print=True)

        if hvd.rank() == 0:
            self.log_stats()  # tensorboard

        self.scheduler.step()
        lr_dict = hvd.broadcast_object(self.scheduler.state_dict(), 0)
        if hvd.rank() > 0:
            self.scheduler.load_state_dict(lr_dict)

        if self.epoch % self.save_interval == 0 and hvd.rank() == 0:
            self.save_checkpoint()

    print("%s done" % self.name)
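Neither the MNIST loader earlier nor the trainer above calls set_epoch() on its DistributedSampler, so the sampler reuses the same shuffling order every epoch. A small, hedged sketch of the usual per-epoch pattern, reusing the trainer's names (train_sampler, loader, max_epochs):

# Hedged sketch: set_epoch() reseeds the sampler so each epoch reshuffles
# consistently across all workers (the sampler's default shuffle=True).
for epoch in range(1, max_epochs + 1):
    train_sampler.set_epoch(epoch)
    for batch in loader:
        pass  # training step goes here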