def test_horovod_allreduce_grad(self):
    """Test the correctness of the allreduce gradient."""
    hvd.init()
    size = hvd.size()
    dtypes = [torch.IntTensor, torch.LongTensor,
              torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                   torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)
        summed = hvd.allreduce(tensor, average=False)
        summed.backward(torch.ones([17] * dim))
        grad_out = tensor.grad.data.numpy()

        expected = np.ones([17] * dim) * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
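# Explanatory note (not in the original file): the backward pass of a summing
# allreduce is itself a summing allreduce. Every rank passes back a gradient
# of ones, so each rank's input gradient comes out as ones * hvd.size(),
# which is exactly what `expected` encodes above.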
def test_horovod_allgather(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        gathered = hvd.allgather(tensor)

        assert list(gathered.shape) == [17 * size] + [17] * (dim - 1)

        for i in range(size):
            rank_tensor = gathered[i * 17:(i + 1) * 17]
            assert list(rank_tensor.shape) == [17] * dim, \
                'hvd.allgather produces incorrect gathered shape'
            assert rank_tensor.data.min() == i, \
                'hvd.allgather produces incorrect gathered tensor'
            assert rank_tensor.data.max() == i, \
                'hvd.allgather produces incorrect gathered tensor'
def test_horovod_allreduce_cpu_gpu_error(self):
    """Test that the allreduce raises an error if different ranks try
    to perform reduction on CPU and GPU."""
    # Only do this test if there are GPUs available.
    if not torch.cuda.is_available():
        return

    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # Same rank and dimension, but different device placement
    dims = [17] * 3
    if rank % 2 == 0:
        tensor = torch.cuda.FloatTensor(*dims)
    else:
        tensor = torch.FloatTensor(*dims)

    try:
        hvd.allreduce(tensor)
        assert False, 'hvd.allreduce did not throw error'
    except torch.FatalError:
        pass
def test_horovod_broadcast_grad(self):
    """Test the correctness of the broadcast gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)
        broadcasted_tensor = hvd.broadcast(tensor, root_rank)
        broadcasted_tensor.backward(torch.ones([17] * dim))
        grad_out = tensor.grad.data.numpy()

        c = size if rank == root_rank else 0
        expected = np.ones([17] * dim) * c
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
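# Explanatory note (not in the original file): in hvd.broadcast's backward
# pass the gradients from all ranks flow back to the broadcast root. With
# every rank feeding back ones, the root accumulates ones * size while all
# other ranks receive a zero gradient; hence `c = size if rank == root_rank
# else 0` above.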
def test_horovod_allreduce_error(self):
    """Test that the allreduce raises an error if different ranks try
    to send tensors of different rank or dimension."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # Same rank, different dimension
    torch.manual_seed(1234)
    dims = [17 + rank] * 3
    tensor = torch.FloatTensor(*dims).random_(-100, 100)
    try:
        hvd.allreduce(tensor)
        assert False, 'hvd.allreduce did not throw error'
    except torch.FatalError:
        pass

    # Same number of elements, different rank
    torch.manual_seed(1234)
    if rank == 0:
        dims = [17, 23 * 57]
    else:
        dims = [17, 23, 57]
    tensor = torch.FloatTensor(*dims).random_(-100, 100)
    try:
        hvd.allreduce(tensor)
        assert False, 'hvd.allreduce did not throw error'
    except torch.FatalError:
        pass
def test_horovod_broadcast_inplace(self):
    """Test that the broadcast correctly broadcasts 1D, 2D, 3D tensors."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    root_ranks = list(range(size))
    for dtype, dim, root_rank in itertools.product(dtypes, dims, root_ranks):
        tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(rank)
        root_tensor = torch.FloatTensor(*([17] * dim)).fill_(1).mul_(root_rank)
        tensor = tensor.type(dtype)
        root_tensor = root_tensor.type(dtype)
        broadcasted_tensor = hvd.broadcast_(tensor, root_rank)
        assert (tensor == broadcasted_tensor).min() == 1, \
            'hvd.broadcast does not modify source tensor'
        assert (broadcasted_tensor == root_tensor).min() == 1, \
            'hvd.broadcast produces incorrect broadcasted tensor'
def test_horovod_allreduce_inplace(self):
    """Test that the allreduce correctly sums 1D, 2D, 3D tensors."""
    hvd.init()
    size = hvd.size()
    dtypes = [torch.IntTensor, torch.LongTensor,
              torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                   torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        multiplied = tensor * size
        hvd.allreduce_(tensor, average=False)
        max_difference = tensor.sub(multiplied).max()

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                  torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def test_horovod_broadcast_rank_error(self):
    """Test that the broadcast returns an error if different ranks
    specify different root rank."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    tensor = torch.FloatTensor(*([17] * 3)).fill_(1)
    try:
        hvd.broadcast(tensor, rank)
        assert False, 'hvd.broadcast did not throw error'
    except torch.FatalError:
        pass
def test_horovod_allreduce_async_fused(self):
    """Test that the allreduce correctly sums 1D, 2D, 3D tensors
    with Tensor Fusion."""
    hvd.init()
    size = hvd.size()
    dtypes = [torch.IntTensor, torch.LongTensor,
              torch.FloatTensor, torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.IntTensor, torch.cuda.LongTensor,
                   torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    tests = []
    is_hvd_poll_false_once = False
    for dtype, dim in itertools.product(dtypes, dims):
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        tensor = tensor.type(dtype)
        handle = hvd.allreduce_async(tensor, average=False)
        if not hvd.poll(handle):
            is_hvd_poll_false_once = True
        multiplied = tensor * size
        tests.append((dtype, multiplied, handle))

    # Make sure it's an asynchronous operation.
    assert is_hvd_poll_false_once, 'hvd.poll() always returns True, not an async op?'

    for dtype, multiplied, handle in tests:
        summed = hvd.synchronize(handle)
        max_difference = summed.sub(multiplied).max()

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.IntTensor, torch.LongTensor,
                                  torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
def test_horovod_allgather_grad(self):
    """Test the correctness of the allgather gradient."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        tensor_sizes = [3, 2, 7, 4, 6, 8, 10] * 5
        tensor_sizes = tensor_sizes[:size]

        tensor = torch.FloatTensor(
            *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        tensor = torch.autograd.Variable(tensor, requires_grad=True)

        grad_list = []
        # NOTE: the loop variable is named `tensor_size` so it does not
        # shadow hvd.size(), which `expected` below relies on (the original
        # reused the name `size` here, silently corrupting the expectation).
        for r, tensor_size in enumerate(tensor_sizes):
            grad_list.append(torch.ones([tensor_size] + [17] * (dim - 1)) * r)
        grad_ys = torch.cat(grad_list, dim=0)

        gathered = hvd.allgather(tensor)
        gathered.backward(grad_ys)
        grad_out = tensor.grad.data.numpy()

        expected = np.ones(
            [tensor_sizes[rank]] + [17] * (dim - 1)
        ) * rank * size
        err = np.linalg.norm(expected - grad_out)
        self.assertLess(err, 0.00000001,
                        "gradient %s differs from expected %s, "
                        "error: %s" % (grad_out, expected, str(err)))
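# Explanatory note (not in the original file): hvd.allgather's backward pass
# sums the incoming gradients across ranks and then takes the slice belonging
# to this rank. Every rank feeds back ones * r for slice r, so this rank's
# gradient is ones * rank summed over all workers, i.e. ones * rank * size.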
def test_horovod_broadcast_error(self):
    """Test that the broadcast returns an error if any dimension besides
    the first is different among the tensors being broadcasted."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    tensor_size = [17] * 3
    tensor_size[1] = 10 * (rank + 1)
    tensor = torch.FloatTensor(*tensor_size).fill_(1).mul_(rank)
    try:
        hvd.broadcast(tensor, 0)
        assert False, 'hvd.broadcast did not throw error'
    except torch.FatalError:
        pass
def test_horovod_broadcast_type_error(self):
    """Test that the broadcast returns an error if the types being
    broadcasted differ among the processes"""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    tensor_size = [17] * 3
    if rank % 2 == 0:
        tensor = torch.IntTensor(*tensor_size)
    else:
        tensor = torch.FloatTensor(*tensor_size)
    try:
        hvd.broadcast(tensor, 0)
        assert False, 'hvd.broadcast did not throw error'
    except torch.FatalError:
        pass
def test_horovod_allreduce_type_error(self):
    """Test that the allreduce raises an error if different ranks try
    to send tensors of different type."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    # This test does not apply if there is only one worker.
    if size == 1:
        return

    # Same rank and dimension, but different type
    dims = [17] * 3
    if rank % 2 == 0:
        tensor = torch.IntTensor(*dims)
    else:
        tensor = torch.FloatTensor(*dims)

    try:
        hvd.allreduce(tensor)
        assert False, 'hvd.allreduce did not throw error'
    except torch.FatalError:
        pass
def test_horovod_allgather_variable_size(self):
    """Test that the allgather correctly gathers 1D, 2D, 3D tensors,
    even if those tensors have different sizes along the first dim."""
    hvd.init()
    rank = hvd.rank()
    size = hvd.size()

    dtypes = [torch.ByteTensor, torch.CharTensor, torch.ShortTensor,
              torch.IntTensor, torch.LongTensor, torch.FloatTensor,
              torch.DoubleTensor]
    if torch.cuda.is_available():
        dtypes += [torch.cuda.ByteTensor, torch.cuda.CharTensor,
                   torch.cuda.ShortTensor, torch.cuda.IntTensor,
                   torch.cuda.LongTensor, torch.cuda.FloatTensor,
                   torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        # Support tests up to MPI Size of 35
        if size > 35:
            break

        tensor_sizes = [17, 32, 81, 12, 15, 23, 22] * 5
        tensor_sizes = tensor_sizes[:size]

        tensor = torch.FloatTensor(
            *([tensor_sizes[rank]] + [17] * (dim - 1))).fill_(1).mul_(rank)
        tensor = tensor.type(dtype)
        gathered = hvd.allgather(tensor)

        expected_size = sum(tensor_sizes)
        assert list(gathered.shape) == [expected_size] + [17] * (dim - 1)

        for i in range(size):
            rank_size = [tensor_sizes[i]] + [17] * (dim - 1)
            rank_tensor = gathered[sum(
                tensor_sizes[:i]):sum(tensor_sizes[:i + 1])]
            assert list(rank_tensor.shape) == rank_size
            assert rank_tensor.data.min() == i
            assert rank_tensor.data.max() == i
def test_horovod_allreduce_multi_gpu(self):
    """Test that the allreduce works on multiple GPUs."""
    # Only do this test if there are GPUs available.
    if not torch.cuda.is_available():
        return

    hvd.init()
    local_rank = hvd.local_rank()
    size = hvd.size()

    iter = 0
    dtypes = [torch.cuda.IntTensor, torch.cuda.LongTensor,
              torch.cuda.FloatTensor, torch.cuda.DoubleTensor]
    dims = [1, 2, 3]
    for dtype, dim in itertools.product(dtypes, dims):
        iter += 1
        torch.manual_seed(1234)
        tensor = torch.FloatTensor(*([17] * dim)).random_(-100, 100)
        device = local_rank * 2 + (iter + local_rank) % 2
        tensor = tensor.cuda(device).type(dtype)
        multiplied = tensor * size
        hvd.allreduce_(tensor, average=False)
        max_difference = tensor.sub(multiplied).max()

        # Threshold for floating point equality depends on number of
        # ranks, since we're comparing against precise multiplication.
        if size <= 3 or dtype in [torch.cuda.IntTensor, torch.cuda.LongTensor]:
            threshold = 0
        elif size < 10:
            threshold = 1e-4
        elif size < 15:
            threshold = 5e-4
        else:
            break

        assert max_difference <= threshold, 'hvd.allreduce produces incorrect results'
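# Note (illustrative, not from the original test file): tests like the ones
# above only exercise multi-worker behavior when launched with more than one
# process, e.g. (assuming the tests live in a file named test_torch.py):
#
#   horovodrun -np 2 python -m pytest -x test_torch.py
#   # or, with a plain MPI launcher:
#   mpirun -np 2 python -m pytest -x test_torch.py
#
# When run with a single process, the `if size == 1: return` branches skip
# most of the interesting checks.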
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    rank = hvd.rank()
    opts.rank = rank
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(
                    device, n_gpu, hvd.rank(), opts.fp16))

    if opts.gradient_accumulation_steps < 1:
        raise ValueError("Invalid gradient_accumulation_steps parameter: {}, "
                         "should be >= 1".format(
                             opts.gradient_accumulation_steps))

    set_random_seed(opts.seed)

    # load DBs and image dirs
    all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb,
                                 opts.num_bb, opts.compressed_db)
    # train
    LOGGER.info(f"Loading Train Dataset "
                f"{opts.train_txt_dbs}, {opts.train_img_dbs}")
    train_datasets = []
    for txt_path, img_path in zip(opts.train_txt_dbs, opts.train_img_dbs):
        img_db, img_db_gt = load_img_feat(img_path, all_img_dbs, opts)
        qa_txt_db = VcrTxtTokLmdb(txt_path, opts.max_txt_len, task="qa")
        qar_txt_db = VcrTxtTokLmdb(txt_path, opts.max_txt_len, task="qar")
        train_datasets.append(
            VcrDataset(qa_txt_db, img_db_gt=img_db_gt, img_db=img_db))
        train_datasets.append(
            VcrDataset(qar_txt_db, img_db_gt=img_db_gt, img_db=img_db))
    train_dataset = ConcatDatasetWithLens(train_datasets)
    train_dataloader = build_dataloader(train_dataset, vcr_collate, True, opts)
    # val
    LOGGER.info(f"Loading Val Dataset {opts.val_txt_db}, {opts.val_img_db}")
    val_img_db, val_img_db_gt = load_img_feat(opts.val_img_db, all_img_dbs, opts)
    val_txt_db = VcrTxtTokLmdb(opts.val_txt_db, -1, task="qa")
    val_dataset = VcrEvalDataset(
        "val", val_txt_db, img_db=val_img_db, img_db_gt=val_img_db_gt)
    val_final_dataset = VcrEvalDataset(  # "test"
        "val", val_txt_db, img_db=val_img_db, img_db_gt=val_img_db_gt)
    val_dataloader = build_dataloader(val_dataset, vcr_eval_collate,
                                      False, opts)
    val_final_dataloader = build_dataloader(
        val_final_dataset, vcr_eval_collate, False, opts)

    # Prepare model
    if opts.checkpoint and opts.checkpoint_from == "pretrain":
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}

    all_dbs = opts.train_txt_dbs + [opts.val_txt_db]
    toker = json.load(open(f'{all_dbs[0]}/meta.json'))['bert']
    assert all(toker == json.load(open(f'{db}/meta.json'))['bert']
               for db in all_dbs)

    model = UniterForVisualCommonsenseReasoning.from_pretrained(
        opts.model_config, checkpoint, img_dim=IMG_DIM)
    model.init_type_embedding()
    model.init_type_embedding_know()
    model.init_word_embedding(NUM_SPECIAL_TOKENS)
    if opts.checkpoint_from == "vcr_pretrain":
        checkpoint = torch.load(opts.checkpoint)
        state_dict = checkpoint.get('model_state', checkpoint)
        matched_state_dict = {}
        unexpected_keys = set()
        missing_keys = set()
        for name, param in model.named_parameters():
            missing_keys.add(name)
        for key, data in state_dict.items():
            if key in missing_keys:
                matched_state_dict[key] = data
                missing_keys.remove(key)
            else:
                unexpected_keys.add(key)
        print("Unexpected_keys:", list(unexpected_keys))
        print("Missing_keys:", list(missing_keys))
        model.load_state_dict(matched_state_dict, strict=False)
        del checkpoint
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    model, optimizer = amp.initialize(model, optimizer,
                                      enabled=opts.fp16, opt_level='O2')
    global_step = 0
    if rank == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        os.makedirs(join(opts.output_dir, 'results'))  # store VCR predictions
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()

    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info("  Num examples = %d", len(train_dataset) * hvd.size())
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    running_loss = RunningMeter('loss')
    model.train()
    n_examples = 0
    n_epoch = 0
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    optimizer.step()
    while True:
        for step, batch in enumerate(train_dataloader):
            n_examples += batch['input_ids'].size(0)

            loss = model(batch, compute_loss=True)
            loss = loss.mean()
            delay_unscale = (step+1) % opts.gradient_accumulation_steps != 0
            with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale
                                ) as scaled_loss:
                scaled_loss.backward()
                if not delay_unscale:
                    # gather gradients from every processes
                    # do this before unscaling to make sure every process uses
                    # the same gradient scale
                    grads = [p.grad.data for p in model.parameters()
                             if p.requires_grad and p.grad is not None]
                    all_reduce_and_rescale_tensors(grads, float(1))

            running_loss(loss.item())

            if (step + 1) % opts.gradient_accumulation_steps == 0:
                global_step += 1

                # learning rate scheduling
                lr_this_step = get_lr_sched(global_step, opts)
                for i, param_group in enumerate(optimizer.param_groups):
                    if i == 0 or i == 1:
                        param_group['lr'] = lr_this_step * opts.lr_mul
                    elif i == 2 or i == 3:
                        param_group['lr'] = lr_this_step
                    else:
                        raise ValueError()
                TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

                # log loss
                # NOTE: not gathered across GPUs for efficiency
                TB_LOGGER.add_scalar('loss', running_loss.val, global_step)
                TB_LOGGER.step()

                # update model params
                if opts.grad_norm != -1:
                    grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                                opts.grad_norm)
                    TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
                optimizer.step()
                optimizer.zero_grad()
                pbar.update(1)

                if global_step % 100 == 0:
                    # monitor training throughput
                    LOGGER.info(f'============Step {global_step}=============')
                    tot_ex = sum(all_gather_list(n_examples))
                    ex_per_sec = int(tot_ex / (time()-start))
                    LOGGER.info(f'{tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar('perf/ex_per_s',
                                         ex_per_sec, global_step)
                    LOGGER.info('===========================================')

                if global_step % opts.valid_steps == 0:
                    val_log, results = validate(model, val_dataloader)
                    TB_LOGGER.log_scaler_dict(val_log)
                    model_saver.save(model, global_step)
            if global_step >= opts.num_train_steps:
                break
        if global_step >= opts.num_train_steps:
            break
        n_epoch += 1
        LOGGER.info(f"finished {n_epoch} epochs")

    if global_step % opts.valid_steps != 0:
        val_log, results = validate(model, val_dataloader)
        TB_LOGGER.log_scaler_dict(val_log)
    val_log, results = validate(model, val_final_dataloader)
    with open(f'{opts.output_dir}/results/'
              f'results_{global_step}_final_qa_qar_'
              f'rank{rank}.json', 'w') as f:
        json.dump(results, f)
    TB_LOGGER.log_scaler_dict(val_log)
    model_saver.save(model, global_step)
import horovod.torch as hvd  # noqa: E402, isort:skip

parser = argparse.ArgumentParser(
    description="Train a multi-gpu model with Torch and Horovod")
parser.add_argument("--dir_in", default=None, help="Input directory")
parser.add_argument("--batch_size", default=None, help="Batch size")
parser.add_argument("--cats", default=None, help="Categorical columns")
parser.add_argument("--cats_mh", default=None,
                    help="Categorical multihot columns")
parser.add_argument("--conts", default=None, help="Continuous columns")
parser.add_argument("--labels", default=None, help="Label columns")
parser.add_argument("--epochs", default=1, help="Training epochs")
args = parser.parse_args()

hvd.init()

gpu_to_use = hvd.local_rank()

if torch.cuda.is_available():
    torch.cuda.set_device(gpu_to_use)

BASE_DIR = os.path.expanduser(args.dir_in or "./data/")
BATCH_SIZE = int(args.batch_size or 16384)  # Batch Size
CATEGORICAL_COLUMNS = args.cats or ["movieId", "userId"]  # Single-hot
CATEGORICAL_MH_COLUMNS = args.cats_mh or ["genres"]  # Multi-hot
NUMERIC_COLUMNS = args.conts or []

# Output from ETL-with-NVTabular
TRAIN_PATHS = sorted(glob.glob(os.path.join(BASE_DIR, "train", "*.parquet")))
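# Illustrative sketch (an assumption, not part of the original script): with
# Horovod, each worker typically consumes a disjoint shard of the input
# files. One simple approach is a round-robin split of the parquet paths by
# rank; the actual sharding depends on the data loader used downstream.
TRAIN_PATHS_SHARD = TRAIN_PATHS[hvd.rank()::hvd.size()]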
def main():
    ''' simple starter program that can be copied for use when starting a new script. '''

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-c','--config',help='configuration file in json format',required=True)
    parser.add_argument('--horovod',default=False, action='store_true', help="Setup for distributed training")
    parser.add_argument('--random_seed',default=0,type=int,help='numpy random seed')
    parser.add_argument('--debug', dest='debug', default=False, action='store_true', help="Set Logger to DEBUG")
    parser.add_argument('--error', dest='error', default=False, action='store_true', help="Set Logger to ERROR")
    parser.add_argument('--warning', dest='warning', default=False, action='store_true', help="Set Logger to WARNING")
    parser.add_argument('--logfilename',dest='logfilename',default=None,help='if set, logging information will go to file')
    args = parser.parse_args()

    logging_format = '%(asctime)s %(levelname)s:%(name)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'
    logging_level = logging.INFO

    if args.debug and not args.error and not args.warning:
        logging_level = logging.DEBUG
    elif not args.debug and args.error and not args.warning:
        logging_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        logging_level = logging.WARNING

    rank = 0
    nranks = 1
    hvd = None
    if args.horovod:
        import horovod.torch as hvd
        hvd.init()
        rank = hvd.rank()
        nranks = hvd.size()
        logging_format = '%(asctime)s %(levelname)s:' + '{:05d}'.format(rank) + ':%(name)s:%(process)s:%(thread)s:%(message)s'

        if rank > 0 and logging_level == logging.INFO:
            logging_level = logging.WARNING

    logging.basicConfig(level=logging_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)

    np.random.seed(args.random_seed)

    config = json.load(open(args.config))

    # detect device available
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    config['rank'] = rank
    config['nranks'] = nranks
    config['hvd'] = hvd
    config['device'] = device

    logger.info('rank %s of %s',rank,nranks)
    logger.info('hostname: %s',socket.gethostname())
    logger.info('python version: %s',sys.version)
    logger.info('config file: %s',args.config)
    logger.info('random_seed: %s',args.random_seed)
    logger.info('horovod: %s',args.horovod)
    logger.info('device: %s',device)

    model = pointnet.PointNet1d_SemSeg(config).to(device)
    logger.info('got model')

    loss_func = loss_acc.get_loss(config)
    acc_func = loss_acc.get_accuracy(config)

    opt_class = optimizer.get_optimizer(config)
    opt = opt_class(model.parameters(),**config['optimizer']['args'])

    ds = data_gen.get_dataset(config)

    accuracies = []
    losses = []
    epochs = config['training']['epochs']
    for epoch in range(epochs):
        logger.info('starting epoch %s of %s',epoch + 1,epochs)

        for batch_number,(inputs,weights,targets) in enumerate(ds):
            inputs = inputs.to(device)
            weights = weights.to(device)
            targets = targets.to(device)
            logger.info('inputs = %s weights = %s targets = %s',inputs.shape,weights.shape,targets.shape)

            opt.zero_grad()
            pred,endpoints = model(inputs)

            loss = loss_func(pred,targets,endpoints,weights,device=device)
            loss.backward()
            opt.step()

            acc = acc_func(pred,targets,device)
            if 'mean_class_iou' in config['loss']['acc']:
                acc = acc.mean()

            accuracies.append(acc)
            losses.append(loss)

            if batch_number % config['training']['status'] == 0:
                acc = torch.median(torch.Tensor(accuracies))
                loss = torch.median(torch.Tensor(losses))
                logger.info('<[%3d of %3d, %5d of %5d]> train loss: %6.4f train acc: %6.4f',
                            epoch + 1,epochs,batch_number,len(ds),loss.item(),acc.item())
                accuracies = []
                losses = []
def main():
    ''' simple starter program that can be copied for use when starting a new script. '''

    parser = argparse.ArgumentParser(description='')
    parser.add_argument('-c', '--config_file', help='configuration file in json format', required=True)
    parser.add_argument('--num_files', '-n', default=-1, type=int,
                        help='limit the number of files to process. default is all')
    parser.add_argument('--model_save',
                        help='base name of saved model parameters for later loading')
    parser.add_argument('--nsave', default=100, type=int,
                        help='frequency in batch number to save model')
    parser.add_argument('--nval', default=100, type=int,
                        help='frequency to evaluate validation sample in batch numbers')
    parser.add_argument('--nval_tests', default=-1, type=int,
                        help='number batches to test per validation run')
    parser.add_argument('--status', default=20, type=int,
                        help='frequency to print loss status in batch numbers')
    parser.add_argument('--batch', default=-1, type=int,
                        help='set batch size, overrides file config')
    parser.add_argument('--random_seed', default=0, type=int,
                        help='numpy random seed')
    parser.add_argument('--valid_only', default=False, action='store_true',
                        help='flag that triggers validation run. prints confusion matrix.')
    parser.add_argument('--batch_limiter', type=int,
                        help='if set to an integer, will limit the number of batches during training. '
                             'Use this to create short training runs for profiling.')
    parser.add_argument('-i', '--input_model_pars',
                        help='if provided, the file will be used to fill the models state dict from a previous run.')
    parser.add_argument('-e', '--epochs', type=int, default=-1, help='number of epochs')
    parser.add_argument('-l', '--logdir', help='log directory for tensorboardx')
    parser.add_argument('--horovod', default=False, action='store_true',
                        help="Setup for distributed training")
    parser.add_argument('--cpu-only', default=False, action='store_true',
                        help='set to force CPU only running')
    parser.add_argument('--debug', dest='debug', default=False, action='store_true',
                        help="Set Logger to DEBUG")
    parser.add_argument('--error', dest='error', default=False, action='store_true',
                        help="Set Logger to ERROR")
    parser.add_argument('--warning', dest='warning', default=False, action='store_true',
                        help="Set Logger to WARNING")
    parser.add_argument('--logfilename', dest='logfilename', default=None,
                        help='if set, logging information will go to file')
    args = parser.parse_args()

    logging_format = '%(asctime)s %(levelname)s:%(name)s:%(process)s:%(thread)s:%(message)s'
    logging_datefmt = '%Y-%m-%d %H:%M:%S'
    log_level = logging.INFO

    if args.debug and not args.error and not args.warning:
        log_level = logging.DEBUG
    elif not args.debug and args.error and not args.warning:
        log_level = logging.ERROR
    elif not args.debug and not args.error and args.warning:
        log_level = logging.WARNING

    rank = 0
    nranks = 1
    local_rank = 0
    local_size = 1
    hvd = None
    if args.horovod:
        print('importing horovod')
        import horovod.torch as hvd
        print('imported horovod')
        hvd.init()
        rank = hvd.rank()
        nranks = hvd.size()
        local_rank = hvd.local_rank()
        local_size = hvd.local_size()
        logging_format = '%(asctime)s %(levelname)s:' + '{:05d}'.format(
            rank) + ':%(name)s:%(process)s:%(thread)s:%(message)s'

        if rank > 0 and log_level == logging.INFO:
            log_level = logging.WARNING

    logging.basicConfig(level=log_level,
                        format=logging_format,
                        datefmt=logging_datefmt,
                        filename=args.logfilename)

    device = torch.device('cpu')
    if torch.cuda.is_available() and not args.cpu_only:
        device = torch.device('cuda:%d' % local_rank)
        torch.cuda.set_device(device)

    model_save = args.model_save
    if model_save is None:
        model_save = os.path.join(args.logdir, 'model')

    logger.warning('rank %6s of %6s local rank %6s of %6s', rank, nranks, local_rank, local_size)
    logger.info('hostname: %s', socket.gethostname())
    logger.info('python version: %s', sys.version)
    logger.info('num_threads: %s', torch.get_num_threads())
    logger.info('torch version: %s', torch.__version__)
    logger.info('torch file: %s', torch.__file__)
    logger.info('config file: %s', args.config_file)
    logger.info('num files: %s', args.num_files)
    logger.info('model_save: %s', model_save)
    logger.info('random_seed: %s', args.random_seed)
    logger.info('valid_only: %s', args.valid_only)
    logger.info('nsave: %s', args.nsave)
    logger.info('nval: %s', args.nval)
    logger.info('nval_tests: %s', args.nval_tests)
    logger.info('status: %s', args.status)
    logger.info('input_model_pars: %s', args.input_model_pars)
    logger.info('epochs: %s', args.epochs)
    logger.info('horovod: %s', args.horovod)
    logger.info('cpu_only: %s', args.cpu_only)
    logger.info('logdir: %s', args.logdir)

    np.random.seed(args.random_seed)

    config_file = json.load(open(args.config_file))
    config_file['rank'] = rank
    config_file['nranks'] = nranks
    config_file['input_model_pars'] = args.input_model_pars
    config_file['horovod'] = args.horovod
    config_file['status'] = args.status
    config_file['nval'] = args.nval
    config_file['nval_tests'] = args.nval_tests
    config_file['nsave'] = args.nsave
    config_file['model_save'] = model_save
    config_file['valid_only'] = args.valid_only
    config_file['batch_limiter'] = args.batch_limiter
    config_file['cpu_only'] = args.cpu_only

    if args.valid_only and not args.input_model_pars:
        logger.error('if valid_only set, must provide input model')
        return

    if args.batch > 0:
        logger.info('setting batch size from command line: %s', args.batch)
        config_file['training']['batch_size'] = args.batch
    if args.epochs > 0:
        logger.info('setting epochs from command line: %s', args.epochs)
        config_file['training']['epochs'] = args.epochs

    logger.info('configuration = \n%s', json.dumps(config_file, indent=4, sort_keys=True))
    config_file['hvd'] = hvd

    # get datasets for training and validation
    trainds, testds = data_handler.get_datasets(config_file)

    # setup tensorboard
    writer = None
    if args.logdir and rank == 0:
        if not os.path.exists(args.logdir):
            os.makedirs(args.logdir)
        writer = tensorboardX.SummaryWriter(args.logdir)

    logger.info('building model')
    torch.manual_seed(args.random_seed)
    net = model.get_model(config_file)
    logger.info('model = \n %s', net)

    total_params = sum(p.numel() for p in net.parameters())
    logger.info('trainable parameters: %s', total_params)

    if args.valid_only:
        # validation-only mode; get_datasets returns (train, test), so the
        # test split is used here (the original referenced an undefined
        # `validds`)
        valid_model(net, testds, config_file)
    else:
        train_model(net, trainds, testds, config_file, device, writer)
import horovod.torch as hvd_torch  # alias assumed from the surrounding module


def simple_fn():
    hvd_torch.init()
    return hvd_torch.rank()
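# Hypothetical usage (not from the original source): a function like this is
# executed once per Horovod worker, e.g.
#
#   horovodrun -np 4 python -c "from my_module import simple_fn; print(simple_fn())"
#
# where `my_module` is a stand-in for wherever simple_fn is defined; each of
# the four workers prints its own rank (0 through 3).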
def train_main(args, splits):
    # Horovod: initialize library.
    hvd.init()
    torch.manual_seed(args.seed)

    if torch.cuda.is_available():
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    rank = hvd.rank()

    model = MyModel(annotation, use_bn=False)
    # By default, Adasum doesn't need scaling up learning rate.
    if torch.cuda.is_available():
        # Move model to GPU.
        model.cuda()
    optimizers = construct_optimizers(model)
    loss_function = huber_loss
    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for opt in optimizers:
        hvd.broadcast_optimizer_state(opt, root_rank=0)

    def _train(epoch, train_dataset):
        model.train()
        # Horovod: set epoch to sampler for shuffling.
        # train_dataset.set_epoch(epoch)
        start_epoch = timeit.default_timer()
        last_batch_time = start_epoch
        batch_wait_times = []
        for batch_idx, (data, target) in enumerate(train_dataset):
            batch_wait_times.append(timeit.default_timer() - last_batch_time)
            if torch.cuda.is_available():
                data = data.cuda()
                target = target.cuda()
            for opt in optimizers:
                opt.zero_grad()
            batch = OrderedDict()
            batch["embeddings"] = OrderedDict()
            batch["one_hot"] = OrderedDict()
            for i, name in enumerate(annotation["embeddings"]):
                batch["embeddings"][name] = data[:, i:i + 1]
            batch["one_hot"]["hot0"] = data[:, -2:-1]
            batch["one_hot"]["hot1"] = data[:, -1:]
            batch_pred = model(batch)
            if batch_idx % args.log_interval == 0:
                print(f"Processing batch {batch_idx} in epoch {epoch} on worker "
                      f"{rank}.")
            time.sleep(args.mock_train_step_time)
            loss = loss_function(batch_pred, target, delta=60)
            loss.mean().backward()
            for opt in optimizers:
                opt.step()
            last_batch_time = timeit.default_timer()
        epoch_duration = timeit.default_timer() - start_epoch
        avg_batch_wait_time = np.mean(batch_wait_times)
        std_batch_wait_time = np.std(batch_wait_times)
        max_batch_wait_time = np.max(batch_wait_times)
        min_batch_wait_time = np.min(batch_wait_times)
        print(f"\nEpoch {epoch}, worker {rank} stats over "
              f"{len(batch_wait_times)} steps: {epoch_duration:.3f}")
        print(f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- "
              f"{std_batch_wait_time}")
        print(f"Max batch wait time: {max_batch_wait_time:.3f}s")
        print(f"Min batch wait time: {min_batch_wait_time:.3f}s")
        return batch_wait_times

    print(f"Starting training on worker {rank}.")
    batch_wait_times = []
    for epoch, split_ds in enumerate(splits[rank].iter_datasets()):
        train_dataset = create_torch_iterator(split_ds, args.batch_size, rank)
        new_batch_times = _train(epoch, train_dataset)
        new_batch_times.pop(0)
        batch_wait_times.extend(new_batch_times)
    print(f"Done training on worker {rank}.")

    avg_batch_wait_time = np.mean(batch_wait_times)
    std_batch_wait_time = np.std(batch_wait_times)
    max_batch_wait_time = np.max(batch_wait_times)
    min_batch_wait_time = np.min(batch_wait_times)
    print(f"\nWorker {rank} training stats over {args.epochs} epochs:")
    print(f"Mean batch wait time: {avg_batch_wait_time:.3f}s +- "
          f"{std_batch_wait_time}")
    print(f"Max batch wait time: {max_batch_wait_time:.3f}s")
    print(f"Min batch wait time: {min_batch_wait_time:.3f}s")
def train_loop_per_worker(config):
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    net = ResNet18(None).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    epoch = 0

    checkpoint = train.load_checkpoint()
    if checkpoint:
        model_state = checkpoint["model_state"]
        optimizer_state = checkpoint["optimizer_state"]
        epoch = checkpoint["epoch"]

        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    criterion = nn.CrossEntropyLoss()
    optimizer = hvd.DistributedOptimizer(optimizer)
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)

    # To ensure consistent initialization across workers, broadcast the
    # initial model and optimizer state from rank 0.
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    trainset = ray.get(config["data"])
    trainloader = DataLoader(trainset,
                             batch_size=int(config["batch_size"]),
                             shuffle=True,
                             num_workers=4)

    for epoch in range(epoch, 40):  # loop over the dataset multiple times
        running_loss = 0.0
        epoch_steps = 0
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            train.report(loss=running_loss / epoch_steps)
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" %
                      (epoch + 1, i + 1, running_loss / epoch_steps))

        train.save_checkpoint(
            model_state=net.state_dict(),
            optimizer_state=optimizer.state_dict(),
            epoch=epoch,
        )
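# Hypothetical launcher sketch (an assumption; Ray Train's API has changed
# across versions, so treat this as a shape rather than a recipe): the loop
# above follows the legacy `ray.train.Trainer` contract with a Horovod
# backend, roughly:
#
#   from ray.train import Trainer
#   trainer = Trainer(backend="horovod", num_workers=2, use_gpu=True)
#   trainer.start()
#   trainer.run(train_loop_per_worker,
#               config={"lr": 0.1, "batch_size": 64, "data": dataset_ref})
#   trainer.shutdown()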
def update_args_for_hvd(args):
    import horovod.torch as hvd

    hvd.init()
    args.local_rank = hvd.local_rank()
    args.global_rank = hvd.rank()
    args.world_size = hvd.size()
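# Illustrative usage (an assumption, not from the original source): apply the
# helper to a parsed argparse namespace so downstream code can read the
# Horovod process layout straight from `args`:
#
#   import argparse
#   args = argparse.ArgumentParser().parse_args([])
#   update_args_for_hvd(args)
#   print(args.local_rank, args.global_rank, args.world_size)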
def test_broadcast_state(self):
    hvd.init()

    N, D_in, H, D_out = 64, 100, 10, 10
    x = torch.autograd.Variable(torch.randn(N, D_in), requires_grad=True)
    y = torch.autograd.Variable(torch.randn(N, D_out), requires_grad=False)

    def create_model(create_opt):
        model = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out),
        )

        optimizer = create_opt(model)
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())

        return model, optimizer

    def get_model_param_values(model):
        params = sorted(model.state_dict().items())
        return [(k, v.clone()) for k, v in params]

    def get_optimizer_param_values(optimizer):
        results = []
        state_dict = optimizer.state_dict()
        for group in state_dict['param_groups']:
            for param_id in group['params']:
                params = sorted(state_dict['state'][param_id].items())
                for k, v in params:
                    results.append(
                        (k, v.clone() if torch.is_tensor(v) else v))
        return results

    opt_params = dict(lr=0.2, momentum=0.9, weight_decay=0.1, centered=True)

    def new_optimizer(cls):
        p = {
            k: v for k, v in opt_params.items()
            if k in inspect.getargspec(cls.__init__).args
        }
        return lambda m: cls(m.parameters(), **p)

    # L-BFGS is currently unsupported, as are sparse tensors, which are
    # required by SparseAdam optimizer
    optimizers = [
        (subclass.__name__, new_optimizer(subclass))
        for subclass in torch.optim.Optimizer.__subclasses__()
        if subclass.__module__.startswith('torch.optim') and
        subclass != torch.optim.LBFGS and
        subclass != torch.optim.SparseAdam
    ]
    optimizers.sort()

    for opt_name, create_opt in optimizers:
        model, optimizer = create_model(create_opt)
        y_pred = model(x)
        loss = F.mse_loss(y_pred, y, size_average=False)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        model_param_values = get_model_param_values(model)
        for name, model_param_value in model_param_values:
            hvd.broadcast_(model_param_value, root_rank=0)

        opt_param_values_updated = []
        opt_param_values = get_optimizer_param_values(optimizer)
        for name, opt_param_value in opt_param_values:
            is_tensor = torch.is_tensor(opt_param_value)
            if not is_tensor:
                t = type(opt_param_value)
                opt_param_value = torch.Tensor([opt_param_value])
            hvd.broadcast_(opt_param_value, root_rank=0)
            if not is_tensor:
                opt_param_value = t(opt_param_value.numpy()[0])
            opt_param_values_updated.append((name, opt_param_value))
        opt_param_values = opt_param_values_updated

        if hvd.rank() == 0:
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            _, fname = tempfile.mkstemp('.pt')
            torch.save(state, fname)

        model, optimizer = create_model(create_opt)
        if hvd.rank() == 0:
            checkpoint = torch.load(fname)
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            os.remove(fname)

        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        model_param_value_after = get_model_param_values(model)
        for before, after in zip(model_param_values,
                                 model_param_value_after):
            name, model_param_value = before
            name_after, model_param_value_after = after
            self.assertEqual(name, name_after)
            self.assertEqual(type(model_param_value),
                             type(model_param_value_after))
            self.assertTrue(
                (model_param_value == model_param_value_after).all())

        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        self.assertEqual(len(optimizer.state_dict()['state'].values()), 4)

        opt_param_values_after = get_optimizer_param_values(optimizer)
        for before, after in zip(opt_param_values, opt_param_values_after):
            name, opt_param_value = before
            name_after, opt_param_value_after = after
            self.assertEqual(name, name_after)
            self.assertEqual(type(opt_param_value),
                             type(opt_param_value_after))
            if torch.is_tensor(opt_param_value):
                self.assertTrue(
                    (opt_param_value == opt_param_value_after).all())
            else:
                self.assertEqual(opt_param_value, opt_param_value_after)
def train_and_eval(tag, dataroot, test_ratio=0.0, cv_fold=0, reporter=None, metric='last', save_path=None, only_eval=False, horovod=False):
    if horovod:
        import horovod.torch as hvd
        hvd.init()
        device = torch.device('cuda', hvd.local_rank())
        torch.cuda.set_device(device)

    if not reporter:
        reporter = lambda **kwargs: 0

    max_epoch = C.get()['epoch']
    trainsampler, trainloader, validloader, testloader_ = get_dataloaders(C.get()['dataset'], C.get()['batch'], dataroot, test_ratio, split_idx=cv_fold, horovod=horovod)

    # create a model & an optimizer
    model = get_model(C.get()['model'], num_class(C.get()['dataset']), data_parallel=(not horovod))

    criterion = nn.CrossEntropyLoss()
    if C.get()['optimizer']['type'] == 'sgd':
        optimizer = optim.SGD(
            model.parameters(),
            lr=C.get()['lr'],
            momentum=C.get()['optimizer'].get('momentum', 0.9),
            weight_decay=C.get()['optimizer']['decay'],
            nesterov=C.get()['optimizer']['nesterov']
        )
    else:
        raise ValueError('invalid optimizer type=%s' % C.get()['optimizer']['type'])

    is_master = True
    if horovod:
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())
        optimizer._requires_update = set()  # issue : https://github.com/horovod/horovod/issues/1099
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        if hvd.rank() != 0:
            is_master = False
    logger.debug('is_master=%s' % is_master)

    lr_scheduler_type = C.get()['lr_schedule'].get('type', 'cosine')
    if lr_scheduler_type == 'cosine':
        scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=C.get()['epoch'], eta_min=0.)
    elif lr_scheduler_type == 'resnet':
        scheduler = adjust_learning_rate_resnet(optimizer)
    elif lr_scheduler_type == 'pyramid':
        scheduler = adjust_learning_rate_pyramid(optimizer, C.get()['epoch'])
    else:
        raise ValueError('invalid lr_scheduler=%s' % lr_scheduler_type)

    if C.get()['lr_schedule'].get('warmup', None):
        scheduler = GradualWarmupScheduler(
            optimizer,
            multiplier=C.get()['lr_schedule']['warmup']['multiplier'],
            total_epoch=C.get()['lr_schedule']['warmup']['epoch'],
            after_scheduler=scheduler
        )

    if not tag or not is_master:
        from FastAutoAugment.metrics import SummaryWriterDummy as SummaryWriter
        logger.warning('tag not provided, no tensorboard log.')
    else:
        from tensorboardX import SummaryWriter
    writers = [SummaryWriter(log_dir='./logs/%s/%s' % (tag, x)) for x in ['train', 'valid', 'test']]

    result = OrderedDict()
    epoch_start = 1
    if save_path and os.path.exists(save_path):
        logger.info('%s file found. loading...' % save_path)
        data = torch.load(save_path)
        if 'model' in data:
            logger.info('checkpoint epoch@%d' % data['epoch'])
            if not isinstance(model, DataParallel):
                model.load_state_dict({k.replace('module.', ''): v for k, v in data['model'].items()})
            else:
                model.load_state_dict({k if 'module.' in k else 'module.'+k: v for k, v in data['model'].items()})
            optimizer.load_state_dict(data['optimizer'])
            if data['epoch'] < C.get()['epoch']:
                epoch_start = data['epoch']
            else:
                only_eval = True
        else:
            model.load_state_dict({k: v for k, v in data.items()})
        del data
    else:
        logger.info('"%s" file not found. skip to pretrain weights...' % save_path)
        if only_eval:
            logger.warning('model checkpoint not found. only-evaluation mode is off.')
            only_eval = False

    if only_eval:
        logger.info('evaluation only+')
        model.eval()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, None, desc_default='train', epoch=0, writer=writers[0])
        rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', epoch=0, writer=writers[1])
        rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=0, writer=writers[2])
        for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
            if setname not in rs:
                continue
            result['%s_%s' % (key, setname)] = rs[setname][key]
        result['epoch'] = 0
        return result

    # train loop
    best_top1 = 0
    for epoch in range(epoch_start, max_epoch + 1):
        if horovod:
            trainsampler.set_epoch(epoch)
        model.train()
        rs = dict()
        rs['train'] = run_epoch(model, trainloader, criterion, optimizer, desc_default='train', epoch=epoch, writer=writers[0], verbose=is_master, scheduler=scheduler)
        model.eval()

        if math.isnan(rs['train']['loss']):
            raise Exception('train loss is NaN.')

        if epoch % 5 == 0 or epoch == max_epoch:
            rs['valid'] = run_epoch(model, validloader, criterion, None, desc_default='valid', epoch=epoch, writer=writers[1], verbose=is_master)
            rs['test'] = run_epoch(model, testloader_, criterion, None, desc_default='*test', epoch=epoch, writer=writers[2], verbose=is_master)

            if metric == 'last' or rs[metric]['top1'] > best_top1:
                if metric != 'last':
                    best_top1 = rs[metric]['top1']
                for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']):
                    result['%s_%s' % (key, setname)] = rs[setname][key]
                result['epoch'] = epoch

                writers[1].add_scalar('valid_top1/best', rs['valid']['top1'], epoch)
                writers[2].add_scalar('test_top1/best', rs['test']['top1'], epoch)

                reporter(
                    loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'],
                    loss_test=rs['test']['loss'], top1_test=rs['test']['top1']
                )

                # save checkpoint
                if is_master and save_path:
                    logger.info('save model@%d to %s' % (epoch, save_path))
                    torch.save({
                        'epoch': epoch,
                        'log': {
                            'train': rs['train'].get_dict(),
                            'valid': rs['valid'].get_dict(),
                            'test': rs['test'].get_dict(),
                        },
                        'optimizer': optimizer.state_dict(),
                        'model': model.state_dict()
                    }, save_path)

    del model

    result['top1_test'] = best_top1
    return result
def main():
    global args, best_prec1, best_prec5
    args = parser.parse_args()
    args.cuda = not args.no_cuda and torch.cuda.is_available()

    # horovod initialize
    hvd.init()

    log = None
    if hvd.rank() == 0:
        log = SummaryWriter(log_dir=args.log_dir)
        print('The Training Model is %s' % args.arch)

    # Check the save_dir exists or not
    if not os.path.exists(args.save_dir):
        os.makedirs(args.save_dir)

    if args.cuda:
        torch.cuda.set_device(hvd.local_rank())

    normalize = transforms.Normalize((0.4914, 0.4822, 0.4465),
                                     (0.2023, 0.1994, 0.2010))

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {}
    # When supported, use 'forkserver' to spawn dataloader workers instead of 'fork' to prevent
    # issues with Infiniband implementations that are not fork-safe
    if (kwargs.get('num_workers', 0) > 0 and hasattr(mp, '_supports_context')
            and mp._supports_context
            and 'forkserver' in mp.get_all_start_methods()):
        kwargs['multiprocessing_context'] = 'forkserver'

    train_dataset = datasets.CIFAR10('data-%d' % hvd.local_rank(),
                                     train=True,
                                     transform=transforms.Compose([
                                         transforms.RandomCrop(32, padding=4),
                                         transforms.RandomHorizontalFlip(),
                                         transforms.ToTensor(),
                                         normalize,
                                     ]),
                                     download=True)
    val_dataset = datasets.CIFAR10('data-%d' % hvd.local_rank(),
                                   train=False,
                                   transform=transforms.Compose([
                                       transforms.ToTensor(),
                                       normalize,
                                   ]))

    # Horovod: partition the training data
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    val_sampler = torch.utils.data.distributed.DistributedSampler(
        val_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs)
    val_loader = torch.utils.data.DataLoader(
        val_dataset, batch_size=args.batch_size, sampler=val_sampler, **kwargs)

    # model = torch.nn.DataParallel(resnet.__dict__[args.arch]())
    if args.arch in resnet.__dict__:
        model = resnet.__dict__[args.arch]()
    elif args.arch == 'alexnet':
        model = models.AlexNet()
    elif args.arch == 'vgg16':
        model = models.VGG16()

    if hvd.rank() == 0:
        numel = sum(p.numel() for p in model.parameters())
        print('Total params: {:d}'.format(numel))

    lr_scaler = hvd.size()
    if args.cuda:
        model.cuda()
        if args.use_adasum and hvd.nccl_built():
            lr_scaler = hvd.local_size()

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            checkpoint = torch.load(args.resume)
            args.start_epoch = checkpoint['epoch']
            best_prec1 = checkpoint['best_prec1']
            model.load_state_dict(checkpoint['state_dict'])
            # NOTE: report the resumed file; the original printed
            # args.evaluate here, which is a different flag.
            print("=> loaded checkpoint '{}' (epoch {})"
                  .format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().cuda()

    if args.half:
        model.half()
        criterion.half()

    base_optimizer = torch.optim.SGD(model.parameters(),
                                     args.lr * lr_scaler,
                                     momentum=args.momentum,
                                     weight_decay=args.weight_decay)

    # lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(base_optimizer,
    #     milestones=[100, 150], last_epoch=args.start_epoch - 1)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(base_optimizer, root_rank=0)

    # Compression
    # compression = Allgather(MGCCompressor(0.05), ResidualMemory(), hvd.size())
    # compression = Allgather(TernGradCompressor(), ResidualMemory(), hvd.size())
    compression = Allreduce(NoneCompressor(), NoneMemory())
    # compression = Allgather(DgcCompressor(0.01), ResidualMemory(), hvd.size())
    # compression = Allgather(LowQSGDCompressor(), ResidualMemory(), hvd.size())

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(base_optimizer, compression,
                                         named_parameters=model.named_parameters())

    if hvd.rank() == 0:
        log.add_scalar('train/accuracy', 0., 0)
        log.add_scalar('test/accuracy', 0., 0)

    for epoch in range(args.start_epoch + 1, args.epochs + 1):
        adjust_learning_rate(optimizer, epoch, size=lr_scaler)
        if hvd.rank() == 0:
            print('current lr {:.5e}'.format(optimizer.param_groups[0]['lr']))

        # train for one epoch
        train(train_loader, model, criterion, optimizer, epoch, log=log)

        # evaluate on validation set
        prec1, prec5 = validate(val_loader, model, criterion, epoch, log=log)

        # remember best prec@1 and save checkpoint
        best_prec1 = max(prec1, best_prec1)
        best_prec5 = max(prec5, best_prec5)
        if hvd.rank() == 0:
            print('Best Prec@1: {:.2f}%, Prec@5: {:.2f}%\n'.format(best_prec1, best_prec5))

        # if epoch > 0 and epoch % args.save_every == 0:
        #     save_checkpoint({
        #         'epoch': epoch + 1,
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'checkpoint.th'))
        #
        #     save_checkpoint({
        #         'state_dict': model.state_dict(),
        #         'best_prec1': best_prec1,
        #     }, is_best, filename=os.path.join(args.save_dir, 'model.th'))

    if hvd.rank() == 0:
        log.close()
def main(opts):
    hvd.init()
    n_gpu = hvd.size()
    device = torch.device("cuda", hvd.local_rank())
    torch.cuda.set_device(hvd.local_rank())
    opts.n_gpu = n_gpu
    LOGGER.info("device: {} n_gpu: {}, rank: {}, "
                "16-bits training: {}".format(device, n_gpu,
                                              hvd.rank(), opts.fp16))
    if hvd.rank() != 0:
        LOGGER.disabled = True
    set_random_seed(opts.seed)

    # train_examples = None
    LOGGER.info(f"Loading the whole video dataset {opts.sub_txt_db}, "
                f"{opts.vfeat_db}")
    video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db,
                                      opts.vfeat_interval, opts)

    # data loaders
    # train
    video_ids = get_video_ids(opts.train_query_txt_db)
    train_q_txt_db = QaQueryTokLmdb(opts.train_query_txt_db, opts.max_txt_len)
    train_dataloaders = build_downstream_dataloaders([opts.task],
                                                     video_db, video_ids,
                                                     True, opts,
                                                     q_txt_db=train_q_txt_db,
                                                     shuffle=True)
    meta_loader = MetaLoader(train_dataloaders,
                             accum_steps=opts.gradient_accumulation_steps,
                             distributed=n_gpu > 1)
    meta_loader = PrefetchLoader(meta_loader)

    # val
    video_ids = get_video_ids(opts.val_query_txt_db)
    val_q_txt_db = QaQueryTokLmdb(opts.val_query_txt_db, -1)
    val_dataloaders = build_downstream_dataloaders([opts.task],
                                                   video_db, video_ids,
                                                   False, opts,
                                                   q_txt_db=val_q_txt_db)
    # test
    video_ids = get_video_ids(opts.test_query_txt_db)
    test_q_txt_db = QaQueryTokLmdb(opts.test_query_txt_db, -1)
    test_dataloaders = build_downstream_dataloaders([opts.task],
                                                    video_db, video_ids,
                                                    False, opts,
                                                    q_txt_db=test_q_txt_db)

    # Prepare model
    if opts.checkpoint:
        checkpoint = torch.load(opts.checkpoint)
    else:
        checkpoint = {}
    img_pos_embed_weight_key = "v_encoder.f_encoder.img_embeddings" +\
        ".position_embeddings.weight"
    max_frm_seq_len = MAX_FRM_SEQ_LEN
    if img_pos_embed_weight_key in checkpoint:
        checkpoint_img_seq_len = len(checkpoint[img_pos_embed_weight_key])
        if checkpoint_img_seq_len < max_frm_seq_len:
            old_weight = checkpoint[img_pos_embed_weight_key]
            new_weight = torch.zeros(max_frm_seq_len, old_weight.shape[1])
            new_weight.data[:checkpoint_img_seq_len, :].copy_(old_weight)
            checkpoint[img_pos_embed_weight_key] = new_weight
        else:
            max_frm_seq_len = checkpoint_img_seq_len

    model = HeroForViolin.from_pretrained(opts.model_config,
                                          state_dict=checkpoint,
                                          vfeat_dim=VFEAT_DIM,
                                          max_frm_seq_len=max_frm_seq_len)
    model.to(device)
    # make sure every process has same model parameters in the beginning
    broadcast_tensors([p.data for p in model.parameters()], 0)
    set_dropout(model, opts.dropout)

    # Prepare optimizer
    optimizer = build_optimizer(model, opts)
    task2scaler = {t: i for i, t in enumerate(train_dataloaders.keys())}
    model, optimizer = amp.initialize(model, optimizer,
                                      num_losses=len(task2scaler),
                                      enabled=opts.fp16, opt_level='O2')
    restorer = TrainingRestorer(opts, model, optimizer)
    global_step = restorer.global_step
    TB_LOGGER.global_step = global_step
    if hvd.rank() == 0:
        save_training_meta(opts)
        TB_LOGGER.create(join(opts.output_dir, 'log'))
        pbar = tqdm(total=opts.num_train_steps)
        model_saver = ModelSaver(join(opts.output_dir, 'ckpt'))
        if not exists(join(opts.output_dir, 'results')):
            # store violin predictions
            os.makedirs(join(opts.output_dir, 'results'))
        add_log_to_file(join(opts.output_dir, 'log', 'log.txt'))
    else:
        LOGGER.disabled = True
        pbar = NoOp()
        model_saver = NoOp()
        restorer = NoOp()

    if global_step > 0:
        pbar.update(global_step)
    LOGGER.info(f"***** Running training with {n_gpu} GPUs *****")
    LOGGER.info(opts)
    LOGGER.info("  Batch size = %d", opts.train_batch_size)
    LOGGER.info("  Accumulate steps = %d", opts.gradient_accumulation_steps)
    LOGGER.info("  Num steps = %d", opts.num_train_steps)

    task2loss = {
        task: RunningMeter(f'loss/{task}')
        for task in train_dataloaders.keys()
    }
    model.train()
    n_examples = defaultdict(int)
    start = time()
    # quick hack for amp delay_unscale bug
    optimizer.zero_grad()
    if global_step == 0:
        optimizer.step()
    for step, (task, batch) in enumerate(meta_loader):
        n_examples[task] += opts.train_batch_size

        loss = model(batch, task=task, compute_loss=True)
        loss = loss.mean()
        task2loss[task](loss.item())

        delay_unscale = (step + 1) % opts.gradient_accumulation_steps != 0
        with amp.scale_loss(loss, optimizer,
                            delay_unscale=delay_unscale,
                            loss_id=task2scaler[task]) as scaled_loss:
            scaled_loss.backward()
            if not delay_unscale:
                # gather gradients from every processes
                # do this before unscaling to make sure every process uses
                # the same gradient scale
                grads = [
                    p.grad.data for p in model.parameters()
                    if p.requires_grad and p.grad is not None
                ]
                all_reduce_and_rescale_tensors(grads, float(1))

        if (step + 1) % opts.gradient_accumulation_steps == 0:
            global_step += 1

            # learning rate scheduling
            lr_this_step = get_lr_sched(global_step, opts)
            for i, param_group in enumerate(optimizer.param_groups):
                if i == 0 or i == 1:
                    param_group['lr'] = lr_this_step * opts.lr_mul
                elif i == 2 or i == 3:
                    param_group['lr'] = lr_this_step
                else:
                    raise ValueError()
            TB_LOGGER.add_scalar('lr', lr_this_step, global_step)

            TB_LOGGER.log_scaler_dict({
                ll.name: ll.val
                for ll in task2loss.values() if ll.val is not None
            })
            TB_LOGGER.step()

            # update model params
            if opts.grad_norm != -1:
                grad_norm = clip_grad_norm_(amp.master_params(optimizer),
                                            opts.grad_norm)
                TB_LOGGER.add_scalar('grad_norm', grad_norm, global_step)
            optimizer.step()
            optimizer.zero_grad()
            restorer.step()
            pbar.update(1)

            if global_step % 100 == 0:
                # monitor training throughput
                LOGGER.info('-------------------------------------------')
                LOGGER.info(f'Step {global_step}:')
                for t in train_dataloaders.keys():
                    tot_ex = sum(all_gather_list(n_examples[t]))
                    ex_per_sec = int(tot_ex / (time() - start))
                    LOGGER.info(f'{t}: {tot_ex} examples trained at '
                                f'{ex_per_sec} ex/s')
                    TB_LOGGER.add_scalar(f'perf/{t}_ex_per_s',
                                         ex_per_sec, global_step)

            if global_step % opts.valid_steps == 0:
                LOGGER.info('===========================================')
                LOGGER.info(f"Step {global_step}: start running validation")
                validate(model, val_dataloaders, "val", opts,
                         global_step=global_step)
                validate(model, test_dataloaders, "test", opts,
                         global_step=global_step)
                LOGGER.info('===========================================')
                model_saver.save(model, global_step)
        if global_step >= opts.num_train_steps:
            break

    LOGGER.info('===========================================')
    if global_step % opts.valid_steps != 0:
        LOGGER.info('===========================================')
        LOGGER.info(f"Step {global_step}: start running validation")
        validate(model, val_dataloaders, "val", opts,
                 global_step=global_step)
        validate(model, test_dataloaders, "test", opts,
                 global_step=global_step)
        LOGGER.info('===========================================')
    model_saver.save(model, f'{global_step}_final')
def main(opts, checkpoint_dir=None, tuning=False): from utils.logger import LOGGER, TB_LOGGER, RunningMeter, add_log_to_file with logger.catch(reraise=True): logger.info(f"{opts}") if isinstance(opts, dict): opts = edict(opts) hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() opts.rank = rank LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if opts.gradient_accumulation_steps < 1: raise ValueError( "Invalid gradient_accumulation_steps parameter: {}, " "should be >= 1".format(opts.gradient_accumulation_steps)) set_random_seed(opts.seed) """ # load DBs and image dirs """ all_img_dbs = ImageLmdbGroup(opts.conf_th, opts.max_bb, opts.min_bb, opts.num_bb, opts.compressed_db) # val LOGGER.info( f"Loading Val Dataset {opts.val_txt_db}, {opts.val_img_db}") val_img_db = all_img_dbs[opts.val_img_db] val_txt_db = TxtTokLmdb(opts.val_txt_db, -1) val_dataset = MemeEvalDataset(1, val_txt_db, val_img_db) val_dataloader = build_dataloader(val_dataset, meme_eval_collate, False, opts) val_itm_dataloader = build_dataloader(val_dataset, meme_eval_itm_ot_collate, False, opts) test_img_db = val_img_db test_txt_db = TxtTokLmdb(opts.test_txt_db, -1) test_dataset = MemeEvalDataset(1, test_txt_db, test_img_db) test_dataloader = build_dataloader(test_dataset, meme_eval_collate, False, opts) """ # Prepare model """ if opts.checkpoint: logger.info(f"Load checkpoint: {opts.checkpoint}") checkpoint = torch.load(opts.checkpoint) else: checkpoint = {} all_dbs = opts.train_txt_dbs + [opts.val_txt_db] model = UniterForITM.from_pretrained(opts.model_config, checkpoint, img_dim=IMG_DIM, num_answer=1) model.to(device) if hasattr(opts, 'tune_checkpoint') and isinstance( model, UniterForITM): model_state = torch.load(opts.tune_checkpoint)[0] model.load_state_dict(model_state) # make sure every process has same model parameters in the beginning broadcast_tensors([p.data for p in model.parameters()], 0) set_dropout(model, opts.dropout) """ # Prepare optimizer """ optimizer = build_optimizer(model, opts) model, optimizer = amp.initialize(model, optimizer, enabled=opts.fp16, opt_level='O2') global_step = 0 LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info(" Num examples = %d", len(val_dataset) * hvd.size()) LOGGER.info(" Batch size = %d", opts.train_batch_size) LOGGER.info(" Accumulate steps = %d", opts.gradient_accumulation_steps) LOGGER.info(" Num steps = %d", opts.num_train_steps) model.eval() val_log, results = validate(model, val_dataloader, None) with open( f'{opts.output_dir}/results/' f'results_{global_step}_' f'rank{rank}.json', 'w') as f: json.dump(results, f) pd.DataFrame.from_dict(results).to_csv( f'{opts.output_dir}/results/' f'results_{global_step}_' f'rank{rank}.csv', index=False) test_log, results = test(model, test_dataloader, None) os.makedirs(f'{opts.output_dir}/results/', exist_ok=True) with open( f'{opts.output_dir}/results/' f'results_{global_step}_' f'test.json', 'w') as f: json.dump(results, f) test_csv = pd.DataFrame.from_dict(results)[['id', 'proba', 'label']] test_csv = reorder_csv_rows( os.path.join(HERE, 'asset', 'test_unseen.jsonl'), test_csv, ) test_csv.to_csv(f'{opts.output_dir}/' f'test.csv', index=False) output_path = (f'{opts.output_dir}/' f'test.csv') print('Save test predict to: ', output_path) if opts.checkpoint: try: shutil.copy(opts.checkpoint, os.path.join(opts.output_dir, 'final.pt')) except shutil.SameFileError: 
logger.info('Rerun of the same checkpoint; skip re-copying it as final.pt')
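# NOTE (added sketch): the shutil.SameFileError guard above makes reruns
# idempotent when opts.checkpoint already is output_dir/final.pt. A
# self-contained sketch of the same guard:
import os
import shutil
import tempfile

def copy_final(src, dst):
    try:
        shutil.copy(src, dst)
    except shutil.SameFileError:
        pass  # rerun with the same checkpoint; nothing to do

with tempfile.TemporaryDirectory() as d:
    ckpt = os.path.join(d, 'model.pt')
    open(ckpt, 'wb').close()
    copy_final(ckpt, ckpt)  # same file: tolerated as a no-op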
def fn(batches_per_commit, batches_per_epoch, epochs, dir=None): @run def train(state, dir): state.rendezvous += 1 logging.info('rank %s: rendezvous %s', hvd.rank(), state.rendezvous) for state.epoch in range(state.epoch, epochs): logging.info('rank %s: start epoch %s at batch %s', hvd.rank(), state.epoch, state.batch) for state.batch in range(state.batch, batches_per_epoch): check_fail(dir, hvd.rank(), state.epoch, state.batch) optimizer.zero_grad() output = model(data) loss = F.cross_entropy(output, target) loss.backward() optimizer.step() # TODO: this sleep makes the fault tolerant test fail # torch all gather throws an RuntimeError which should be a HorovodInternalError #import time #time.sleep(0.2) if state.batch % batches_per_commit == 0: logging.info('rank %s: allgather', hvd.rank()) hvd.allgather( torch.tensor([ hvd.rank(), state.epoch, state.batch, state.rendezvous ]), 'state').tolist() logging.info('rank %s: commit epoch %s batch %s', hvd.rank(), state.epoch, state.batch) state.commits += 1 state.commit() logging.info('rank %s: allgather', hvd.rank()) hvd.allgather( torch.tensor( [hvd.rank(), state.epoch, state.batch, state.rendezvous]), 'state').tolist() logging.info('rank %s: commit epoch %s', hvd.rank(), state.epoch) state.commits += 1 state.commit() state.batch = 0 res = hvd.allgather( torch.tensor( [hvd.rank(), state.epoch, state.batch, state.rendezvous]), 'state').tolist() logging.info('rank %s: returning', hvd.rank()) return res, hvd.rank() logging.getLogger().setLevel(logging.DEBUG) logging.basicConfig( format= '%(asctime)-15s %(levelname)1.1s %(filename)s:%(lineno)d %(funcName)s() - %(message)s' ) hvd.init() batch_size = 32 data = torch.randn(batch_size, 2) target = torch.LongTensor(batch_size).random_() % 2 v = 1.0 model = torch.nn.Sequential(torch.nn.Linear(2, 2)) model.load_state_dict({ '0.weight': torch.tensor([[v, v], [v, v]]), '0.bias': torch.tensor([v, v]) }) optimizer = torch.optim.SGD(model.parameters(), lr=0.001) state = hvd.elastic.TorchState(model, optimizer, batch=0, epoch=0, commits=0, rendezvous=0) return train(state, dir)
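# NOTE (added sketch): the elastic test above relies on the commit/restore
# contract of hvd.elastic: state.commit() snapshots the model, optimizer and
# extra counters, and on a worker failure @hvd.elastic.run rolls every rank
# back to the last commit. A minimal, hedged sketch of that contract:
import horovod.torch as hvd
import torch

hvd.init()
model = torch.nn.Linear(2, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

@hvd.elastic.run
def train(state):
    for state.batch in range(state.batch, 100):
        optimizer.zero_grad()
        loss = model(torch.randn(4, 2)).sum()
        loss.backward()
        optimizer.step()
        if state.batch % 10 == 0:
            state.commit()  # safe point: progress up to here survives a rescale

state = hvd.elastic.TorchState(model, optimizer, batch=0)
train(state)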
def __init__(self, parent_comm, process_comm, parent_rank=None, num_epochs=1, data=None, algo=None, model_builder=None, verbose=False, monitor=False, custom_objects={}, checkpoint=None, checkpoint_interval=5): """If the rank of the parent is given, initialize this process and immediately start training. If no parent is indicated, training should be launched with train(). Parameters: parent_comm: MPI intracommunicator used to communicate with parent parent_rank (integer): rank of this node's parent in parent_comm num_epochs: number of training epochs data: Data object used to generate training or validation data algo: Algo object used to configure the training process model_builder: ModelBuilder object specifying model verbose: whether to print verbose output monitor: whether to monitor CPU/GPU usage """ self.parent_comm = parent_comm self.process_comm = process_comm self.parent_rank = parent_rank self.num_epochs = num_epochs self.data = data self.algo = algo self.model_builder = model_builder self.verbose = verbose self.histories = {} self.custom_objects = custom_objects self.update = None self.stop_training = False self.time_step = 0 self._short_batches = int(os.environ.get('NNLO_SHORT_BATCHES', 0)) self._is_shadow = (self.process_comm is not None and self.process_comm.Get_rank() != 0) self.monitor = Monitor() if monitor else None process_type = self.__class__.__name__.replace('MPI', '')[0] set_logging_prefix( MPI.COMM_WORLD.Get_rank(), self.parent_comm.Get_rank() if self.parent_comm is not None else '-', self.process_comm.Get_rank() if self.process_comm is not None else '-', process_type) if self.process_comm is not None and self.process_comm.Get_size() > 1: if self.model_builder.get_backend_name() == 'pytorch': import horovod.torch as hvd else: import horovod.keras as hvd logging.debug("initializing horovod") self.process_comm.Barrier() hvd.init(comm=self.process_comm) self.process_comm.Barrier() self.algo.worker_optimizer_builder.horovod_wrapper = True self.rank = parent_comm.Get_rank() if parent_comm else 0 self.ranks = "{0}:{1}:{2}".format( MPI.COMM_WORLD.Get_rank(), self.parent_comm.Get_rank() if self.parent_comm is not None else '-', self.process_comm.Get_rank() if self.process_comm is not None else '-') Timeline.set_process_name(type(self).__name__ + " " + self.ranks) self.epoch = 0 self.checkpoint = checkpoint self.checkpoint_interval = checkpoint_interval if self.algo.restore and self.checkpoint is not None: try: with open(self.checkpoint + '.latest', 'r') as latest: latest_file = latest.read().splitlines()[-1] epoch = int(latest_file.split('-')[-1]) except: epoch = 0 logging.error( "Failed to restore epoch from checkpoint {}".format( self.checkpoint)) if epoch < num_epochs: self.epoch = epoch self.num_epochs = num_epochs - epoch logging.info( "Continuing training from epoch {} for {} epochs".format( self.epoch, self.num_epochs)) self.build_model() if (self.parent_rank is not None and self.parent_comm is not None): self.bcast_weights(self.parent_comm) if (self.parent_rank is not None and self.parent_comm is not None) or (self.process_comm): self.train()
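# NOTE (added sketch): __init__ above passes a sub-communicator to
# hvd.init(comm=...), so Horovod spans only this worker group while other MPI
# ranks play parent/master roles. A hedged sketch of that initialization,
# with a hypothetical even/odd split of MPI.COMM_WORLD:
from mpi4py import MPI
import horovod.torch as hvd

world = MPI.COMM_WORLD
process_comm = world.Split(color=world.Get_rank() % 2, key=world.Get_rank())
if world.Get_rank() % 2 == 0:
    hvd.init(comm=process_comm)  # Horovod only spans the even world ranks
    print('horovod rank', hvd.rank(), 'of', hvd.size())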
def train(serialized_model, optimizer_cls, model_opt_state_serialized, train_rows, val_rows, avg_row_size): from petastorm import TransformSpec, make_reader, make_batch_reader from petastorm.pytorch import BatchedDataLoader, InMemBatchedDataLoader import torch import horovod.torch as hvd # Deserializing objects model_opt_state = torch.load(model_opt_state_serialized) model = deserialize(serialized_model) if loss_fns_pre_train: loss_fns = loss_fns_pre_train if loss_constructors: local_vars = locals() loss_fns = [ loss_constructor(**local_vars) for loss_constructor in loss_constructors ] # Horovod: initialize library. hvd.init() if not user_shuffle_buffer_size: shuffle_buffer_size = \ calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size()) else: shuffle_buffer_size = user_shuffle_buffer_size cuda_available = torch.cuda.is_available() if cuda_available: # Horovod: pin GPU to local rank or the assigned GPU from spark. torch.cuda.set_device( _get_assigned_gpu_or_default(default=hvd.local_rank())) # Move model to GPU. model.cuda() # The optimizer object needs to be re-instantiated. Internally, it uses memory addresses of # objects as their identity, so it cannot be serialized and then deserialized. The # deserialized optimizer stores parameter names keyed by their old memory addresses, which # no longer match the reconstructed objects, and that causes problems. # Learning rate is a required parameter of the SGD optimizer. It will be overridden by # load_state_dict. optimizer = optimizer_cls(model.parameters(), lr=1) optimizer_state = model_opt_state['optimizer'] if last_checkpoint_state is not None: model.load_state_dict(last_checkpoint_state['model']) optimizer.load_state_dict(last_checkpoint_state['optimizer']) else: # scale the learning rate with the number of horovod workers for i in range(len(optimizer_state['param_groups'])): optimizer_state['param_groups'][i]['lr'] = \ optimizer_state['param_groups'][i]['lr'] * hvd.size() optimizer.load_state_dict(optimizer_state) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) for group in optimizer.param_groups: for p in group['params']: if id(p) not in optimizer.state_dict()['state']: p.grad = p.data.new(p.size()).zero_() optimizer.step() hvd.broadcast_optimizer_state(optimizer, root_rank=0) dist_optimizer_args = dict(optimizer=optimizer, named_parameters=model.named_parameters()) if gradient_compression: # Pass the compression arg only if it is specified by the user. dist_optimizer_args['compression'] = gradient_compression # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer(**dist_optimizer_args) # This function takes the current optimizer and constructs a new optimizer with the # same state, except the learning rate is scaled down by the number of horovod workers. # This matters for retraining: the user may retrain the model with a different number of # workers, so we need the raw learning rate to adjust to the new worker count.
transform_spec = None if transformation: transform_spec = TransformSpec(transformation) schema_fields = feature_columns + label_columns if sample_weight_col: schema_fields.append(sample_weight_col) if train_steps_per_epoch is None: steps_per_epoch = int( math.floor(float(train_rows) / batch_size / hvd.size())) else: steps_per_epoch = train_steps_per_epoch with remote_store.get_local_output_dir() as run_output_dir: logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir) log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None ckpt_file = os.path.join(run_output_dir, remote_store.checkpoint_filename) def save_checkpoint(): model.cpu() optimizer_with_scaled_down_lr = \ get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls, model) state = { 'model': model.state_dict(), 'optimizer': optimizer_with_scaled_down_lr.state_dict(), } torch.save(state, ckpt_file) if cuda_available: model.cuda() # In general, make_batch_reader is faster than make_reader for reading the dataset. # However, we found out that make_reader performs data transformations much faster than # make_batch_reader with parallel worker processes. Therefore, the default reader # we choose is make_batch_reader unless there are data transformations. reader_factory = None reader_factory_kwargs = dict() if transform_spec: reader_factory = make_reader reader_factory_kwargs['pyarrow_serialize'] = True else: reader_factory = make_batch_reader # Petastorm: read data from the store with the correct shard for this rank # setting num_epochs=None will cause an infinite iterator # and enables ranks to perform training and validation with # unequal number of samples with reader_factory(remote_store.train_data_path, num_epochs=None, cur_shard=hvd.rank(), reader_pool_type=reader_pool_type, workers_count=train_reader_worker_count, shard_count=hvd.size(), hdfs_driver=PETASTORM_HDFS_DRIVER, schema_fields=schema_fields, transform_spec=transform_spec, **reader_factory_kwargs) as train_reader: with reader_factory(remote_store.val_data_path, num_epochs=None, cur_shard=hvd.rank(), reader_pool_type=reader_pool_type, workers_count=val_reader_worker_count, shard_count=hvd.size(), hdfs_driver=PETASTORM_HDFS_DRIVER, schema_fields=schema_fields, transform_spec=transform_spec, **reader_factory_kwargs) \ if should_validate else empty_batch_reader() as val_reader: if inmemory_cache_all: # Petastorm introduced InMemBatchedDataLoader class in v0.11.0 train_loader = InMemBatchedDataLoader( train_reader, batch_size=batch_size, num_epochs=epochs, rows_capacity=steps_per_epoch * batch_size, shuffle=True) else: train_loader = BatchedDataLoader( train_reader, batch_size=batch_size, shuffling_queue_capacity=shuffle_buffer_size) train_loader_iter = iter(train_loader) def prepare_batch(row): inputs = [ prepare_np_data(row[col].float(), col, metadata).reshape(shape) for col, shape in zip(feature_columns, input_shapes) ] labels = [ prepare_np_data(row[col].float(), col, metadata) for col in label_columns ] sample_weights = row.get(sample_weight_col, None) if sample_weights is not None: sample_weights = sample_weights.float() if cuda_available: inputs = [input.cuda() for input in inputs] labels = [label.cuda() for label in labels] if sample_weights is not None: sample_weights = sample_weights.cuda() return inputs, labels, sample_weights def transform_outputs(outputs, labels): if not isinstance(outputs, tuple) and not isinstance( outputs, list): outputs = [outputs] # reshape labels to match the output shape of the model if hasattr(outputs[0], 'shape'): if 
label_shapes: labels = [ label.reshape(label_shape) for label, label_shape in zip( labels, label_shapes) ] else: # If label_shapes parameter is not provided, reshape the label # columns data to match the shape of the model output labels = [ label.reshape(output.shape) if output.shape.numel() == label.shape.numel() else label for label, output in zip(labels, outputs) ] return outputs, labels def aggregate_metrics(stage, epoch, loss, metric_value_groups): all_metric_groups_values = get_metric_avgs( metric_value_groups) if remote_store.saving_runs: write_metrics_summary(stage, epoch, loss, all_metric_groups_values, log_writer) return { loss.name: loss.avg.item(), 'all_metrics': all_metric_groups_values } def loss_fn(outputs, labels, sample_weights): loss = calculate_loss(outputs, labels, loss_weights, loss_fns, sample_weights) return loss def print_metrics(batch_idx, loss, metric_value_groups, phase): if user_verbose > 0 and hvd.rank() == 0 and \ batch_idx % METRIC_PRINT_FREQUENCY == 0: print( "{phase}\tepoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}" .format(phase=phase, epoch=epoch, batch_idx=batch_idx, metrics=aggregate_metrics( phase, epoch, loss, metric_value_groups))) def _train(epoch): model.train() train_loss = metric_cls('loss', hvd) metric_value_groups = construct_metric_value_holders( metric_cls, metric_fn_groups, label_columns, hvd) # iterate on one epoch for batch_idx in range(steps_per_epoch): row = next(train_loader_iter) inputs, labels, sample_weights = prepare_batch(row) outputs, loss = train_minibatch( model, optimizer, transform_outputs, loss_fn, inputs, labels, sample_weights) update_metrics(metric_value_groups, outputs, labels) train_loss.update(loss) print_metrics(batch_idx, train_loss, metric_value_groups, 'train') return aggregate_metrics('train', epoch, train_loss, metric_value_groups) if should_validate: if validation_steps_per_epoch is None: validation_steps = int( math.ceil( float(val_rows) / val_batch_size / hvd.size())) else: validation_steps = validation_steps_per_epoch if inmemory_cache_all: # Petastorm introduced InMemBatchedDataLoader class in v0.11.0 val_loader = InMemBatchedDataLoader( val_reader, batch_size=val_batch_size, num_epochs=epochs, rows_capacity=validation_steps * val_batch_size, shuffle=False) else: val_loader = BatchedDataLoader( val_reader, batch_size=val_batch_size, shuffling_queue_capacity=0) val_loader_iter = iter(val_loader) def _validate(epoch): model.eval() val_loss = metric_cls('loss', hvd) metric_value_groups = construct_metric_value_holders( metric_cls, metric_fn_groups, label_columns, hvd) # iterate on one epoch for batch_idx in range(validation_steps): row = next(val_loader_iter) inputs, labels, sample_weights = prepare_batch( row) outputs = model(*inputs) outputs, labels = transform_outputs( outputs, labels) loss = calculate_loss(outputs, labels, loss_weights, loss_fns, sample_weights) val_loss.update(loss) update_metrics(metric_value_groups, outputs, labels) print_metrics(batch_idx, val_loss, metric_value_groups, 'val') return aggregate_metrics('val', epoch, val_loss, metric_value_groups) history = [] for epoch in range(epochs): epoch_metrics = { 'epoch': epoch, 'train': _train(epoch) } if should_validate: epoch_metrics['validation'] = _validate(epoch) if user_verbose > 0: pdt_dt = datetime.now(timezone.utc) pdt_time_str = pdt_dt.strftime( "%Y-%b-%d %H:%M:%S UTC") print(pdt_time_str, epoch_metrics) history.append(epoch_metrics) if hvd.rank() == 0: # Save model after every epoch save_checkpoint() if remote_store.saving_runs: 
remote_store.sync(run_output_dir) if hvd.rank() == 0: best_checkpoint = torch.load(ckpt_file) serialized_checkpoint = io.BytesIO() torch.save(best_checkpoint, serialized_checkpoint) serialized_checkpoint.seek(0) return history, serialized_checkpoint
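# NOTE (added sketch): the comments above describe an LR-unscaling helper
# (get_optimizer_with_unscaled_lr) that stores the raw, per-worker learning
# rate in the checkpoint. A hedged sketch of that scaling logic on a plain
# optimizer state dict (unscale_lr_state is a hypothetical name):
import torch

def unscale_lr_state(optimizer_state, num_workers):
    # divide each param group's LR by the worker count to recover the raw LR
    for group in optimizer_state['param_groups']:
        group['lr'] = group['lr'] / num_workers
    return optimizer_state

opt = torch.optim.SGD(torch.nn.Linear(2, 2).parameters(), lr=0.4)
state = unscale_lr_state(opt.state_dict(), num_workers=4)
assert state['param_groups'][0]['lr'] == 0.1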
def train(serialized_model): import horovod.torch as hvd if random_seed is not None: pl.utilities.seed.seed_everything(seed=random_seed) # Horovod: initialize library. hvd.init() if verbose: import horovod as _horovod print( f"Shared lib path is pointing to: {_horovod.common.process_sets._basics.MPI_LIB_CTYPES}" ) _checkpoint_callback = None require_checkpoint = False with remote_store.get_local_output_dir() as run_output_dir: logs_path = os.path.join(run_output_dir, remote_store.logs_subdir) os.makedirs(logs_path, exist_ok=True) print(f"Made directory {logs_path} for horovod rank {hvd.rank()}") ckpt_dir = run_output_dir ckpt_filename = remote_store.checkpoint_filename if logger is None: # Use default logger if no logger is supplied train_logger = TensorBoardLogger(logs_path) print(f"Setup logger: Using TensorBoardLogger: {train_logger}") elif isinstance(logger, CometLogger): if logger._experiment_key: # use logger passed in. train_logger = logger train_logger._save_dir = logs_path print( f"Setup logger: change save_dir of the logger to {logs_path}" ) elif logger_experiment_key: # Resume logger experiment with new log path if key passed correctly from CPU. train_logger = CometLogger( save_dir=logs_path, api_key=logger.api_key, experiment_key=logger_experiment_key, ) print( f"Setup logger: Resume comet logger: {vars(train_logger)}" ) else: print( f"Failed to set up or resume comet logger. Original logger: {vars(logger)}" ) else: # use logger passed in. train_logger = logger train_logger.save_dir = logs_path print( f"Setup logger: Using logger passed from estimator: {train_logger}" ) # Lightning requires checkpoint callbacks to be added on all ranks. # Otherwise training hangs. for cb in callbacks: if isinstance(cb, ModelCheckpoint): cb.dirpath = ckpt_dir cb.filename = ckpt_filename _checkpoint_callback = cb require_checkpoint = True break if not _checkpoint_callback: # By default 'monitor'=None, which saves a checkpoint only for the last epoch. _checkpoint_callback = ModelCheckpoint(dirpath=ckpt_dir, filename=ckpt_filename, verbose=True) callbacks.append(_checkpoint_callback) if remote_store.saving_runs and hvd.rank() == 0: # Horovod: sync checkpoint and logging files only on rank 0 to # prevent other ranks from corrupting them. class _SyncCallback(Callback): def on_epoch_end(self, trainer: "pl.Trainer", pl_module: "pl.LightningModule") -> None: remote_store.sync(run_output_dir) callbacks.append(_SyncCallback()) model = deserialize(serialized_model) _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \ int(math.floor(float(train_rows) / batch_size / hvd.size())) _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \ int(math.floor(float(val_rows) / val_batch_size / hvd.size())) shuffle_size = calculate_shuffle_buffer_size() if verbose: print( f"Training data of rank[{hvd.local_rank()}]: Epochs: {epochs}, " f"Shuffle_size: {shuffle_size}, Random seed: {random_seed}\n" f"Train rows: {train_rows}, Train batch size: {batch_size}, Train_steps_per_epoch: {_train_steps_per_epoch}\n" f"Val rows: {val_rows}, Val batch size: {val_batch_size}, Val_steps_per_epoch: {_val_steps_per_epoch}\n" f"Checkpoint file: {remote_store.checkpoint_path}, Logs dir: {remote_store.logs_path}\n" ) cuda_available = torch.cuda.is_available() # We need to check that all ranks have the same device type for training. # Horovod doesn't support heterogeneous allreduce for gradients.
cuda_avail_list = hvd.allgather_object(cuda_available, name='device type') if cuda_avail_list.count(cuda_available) != hvd.size(): raise RuntimeError("All ranks don't have the same device type!") if cuda_available: # Horovod: pin GPU to local rank or the assigned GPU from spark. torch.cuda.set_device( _get_assigned_gpu_or_default(default=hvd.local_rank())) # Move model to GPU. model.cuda() _num_gpus = num_gpus if _num_gpus is None: _num_gpus = 1 if cuda_available else 0 # Set bar refresh to once per epoch; detailed loss and metrics are available in the # logger, so there is no need to print to screen here. Users can still override this in trainer_args progress_bar_refresh_rate = _train_steps_per_epoch kwargs = { 'accelerator': 'horovod', 'gpus': _num_gpus, 'callbacks': callbacks, 'max_epochs': epochs, 'logger': train_logger, 'log_every_n_steps': log_every_n_steps, 'num_sanity_val_steps': 0, 'reload_dataloaders_every_epoch': False, 'progress_bar_refresh_rate': progress_bar_refresh_rate, 'terminate_on_nan': terminate_on_nan, 'profiler': profiler } if trainer_args: kwargs.update(trainer_args) if verbose and hvd.rank() == 0: print("Creating trainer with: \n ", kwargs) trainer = Trainer(**kwargs) if profiler != 'simple' and trainer.profiler: print( f"Set profiler's logs_path for {hvd.rank()} to {logs_path}" ) trainer.profiler.dirpath = logs_path # filename where the profiler results will be saved instead of # printing to stdout. The .txt extension will be used automatically. trainer.profiler.filename = "profile" if verbose and hvd.rank() == 0: print(f"pytorch_lightning version={pl.__version__}") data_module_kwargs = { 'train_dir': remote_store.train_data_path, 'val_dir': remote_store.val_data_path, 'num_train_epochs': epochs, 'has_val': should_validate is not None, 'train_batch_size': batch_size, 'val_batch_size': val_batch_size, 'shuffle_size': shuffle_size, 'num_reader_epochs': loader_num_epochs, 'reader_pool_type': reader_pool_type, 'reader_worker_count': train_reader_worker_count, 'transform_spec': transformation, 'inmemory_cache_all': inmemory_cache_all, 'cur_shard': hvd.rank(), 'shard_count': hvd.size(), 'schema_fields': schema_fields, 'storage_options': storage_options, 'steps_per_epoch_train': _train_steps_per_epoch, 'steps_per_epoch_val': _val_steps_per_epoch, 'verbose': verbose, 'debug_data_loader': debug_data_loader, 'train_async_data_loader_queue_size': train_async_data_loader_queue_size, 'val_async_data_loader_queue_size': val_async_data_loader_queue_size, } if debug_data_loader and hvd.rank() == 0: print( f"Creating data module with args:\n {data_module_kwargs}") dataset = data_module(**data_module_kwargs) trainer.fit(model, dataset) if hvd.rank() == 0: if remote_store.saving_runs and trainer.profiler: # One more file sync to push profiler result. remote_store.sync(logs_path) # rank 0 overwrites model with best checkpoint and returns. if require_checkpoint: if verbose: print("load from checkpoint best model path:", _checkpoint_callback.best_model_path) best_model = model.load_from_checkpoint( _checkpoint_callback.best_model_path) else: best_model = model serialized_checkpoint = io.BytesIO() module = best_model if not is_legacy else best_model._model output = { 'model': module.state_dict(), 'logged_metrics': trainer.logged_metrics } torch.save(output, serialized_checkpoint) return serialized_checkpoint
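# NOTE (added sketch): the hvd.allgather_object check above generalizes to any
# per-rank property that must agree across workers before collective ops run.
# A hedged, standalone version of that fail-fast check:
import horovod.torch as hvd
import torch

hvd.init()
prop = torch.cuda.is_available()
gathered = hvd.allgather_object(prop, name='device type')
if gathered.count(gathered[0]) != len(gathered):
    raise RuntimeError('Ranks disagree on device type: %s' % gathered)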
parser.add_argument('--epochs', type=int, default=10, metavar='N', help='number of epochs to train (default: 10)') parser.add_argument('--lr', type=float, default=0.01, metavar='LR', help='learning rate (default: 0.01)') parser.add_argument('--momentum', type=float, default=0.5, metavar='M', help='SGD momentum (default: 0.5)') parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=42, metavar='S', help='random seed (default: 42)') parser.add_argument('--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') args = parser.parse_args() args.cuda = not args.no_cuda and torch.cuda.is_available() hvd.init() torch.manual_seed(args.seed) if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ]))
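# NOTE (added sketch): the 'data-%d' % hvd.rank() directory above gives every
# rank its own MNIST copy so concurrent downloads cannot clobber each other.
# A hedged alternative: download once on rank 0 and use a tiny allreduce as a
# barrier so the other ranks wait (a common Horovod idiom, not this script's code):
import horovod.torch as hvd
import torch
from torchvision import datasets, transforms

hvd.init()
if hvd.rank() == 0:
    datasets.MNIST('data', train=True, download=True)
hvd.allreduce(torch.tensor(0), name='download_barrier')  # acts as a barrier
train_dataset = datasets.MNIST('data', train=True, download=False,
                               transform=transforms.ToTensor())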
def hvd_init(): if HAS_HOROVOD: hvd.init() return HAS_HOROVOD
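# NOTE (added sketch): hvd_init above assumes a HAS_HOROVOD flag set at import
# time. A hedged sketch of the guarded import that usually accompanies it:
try:
    import horovod.torch as hvd
    HAS_HOROVOD = True
except ImportError:
    HAS_HOROVOD = False

def hvd_init():
    if HAS_HOROVOD:
        hvd.init()
    return HAS_HOROVOD

rank = hvd.rank() if hvd_init() else 0  # fall back to rank 0 without Horovod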
def run_horovod(): # Temporarily patch this script until the MNIST dataset download issue gets resolved # https://github.com/pytorch/vision/issues/1938 import urllib try: # For python 2 class AppURLopener(urllib.FancyURLopener): version = "Mozilla/5.0" urllib._urlopener = AppURLopener() except AttributeError: # For python 3 opener = urllib.request.build_opener() opener.addheaders = [('User-agent', 'Mozilla/5.0')] urllib.request.install_opener(opener) batch_size = 64 test_batch_size = 1000 epochs = 10 lr = 0.01 momentum = 0.5 seed = 43 log_interval = 10 fp16_allreduce = False use_adasum = False # Horovod: initialize library. hvd.init() torch.manual_seed(seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(4) kwargs = {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=batch_size, sampler=train_sampler, **kwargs) test_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=False, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the test data. test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=test_batch_size, sampler=test_sampler, **kwargs) class Net(nn.Module): def __init__(self): super(Net, self).__init__() self.conv1 = nn.Conv2d(1, 10, kernel_size=5) self.conv2 = nn.Conv2d(10, 20, kernel_size=5) self.conv2_drop = nn.Dropout2d() self.fc1 = nn.Linear(320, 50) self.fc2 = nn.Linear(50, 10) def forward(self, x): x = F.relu(F.max_pool2d(self.conv1(x), 2)) x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2)) x = x.view(-1, 320) x = F.relu(self.fc1(x)) x = F.dropout(x, training=self.training) x = self.fc2(x) return F.log_softmax(x, dim=1) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not use_adasum else 1 # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=lr * lr_scaler, momentum=momentum) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: (optional) compression algorithm. compression = hvd.Compression.fp16 if fp16_allreduce else hvd.Compression.none # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if use_adasum else hvd.Average) def train(epoch): model.train() # Horovod: set epoch to sampler for shuffling. train_sampler.set_epoch(epoch) for batch_idx, (data, target) in enumerate(train_loader): optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() optimizer.step() if batch_idx % log_interval == 0: # Horovod: use train_sampler to determine the number of examples in # this worker's partition. print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_sampler), 100.
* batch_idx / len(train_loader), loss.item())) def metric_average(val, name): tensor = torch.tensor(val) avg_tensor = hvd.allreduce(tensor, name=name) return avg_tensor.item() def test(): model.eval() test_loss = 0. test_accuracy = 0. for data, target in test_loader: output = model(data) # sum up batch loss test_loss += F.nll_loss(output, target, reduction='sum').item() # get the index of the max log-probability pred = output.data.max(1, keepdim=True)[1] test_accuracy += pred.eq(target.data.view_as(pred)).cpu().float().sum() # Horovod: use test_sampler to determine the number of examples in # this worker's partition. test_loss /= len(test_sampler) test_accuracy /= len(test_sampler) # Horovod: average metric values across workers. test_loss = metric_average(test_loss, 'avg_loss') test_accuracy = metric_average(test_accuracy, 'avg_accuracy') # Horovod: print output only on first rank. if hvd.rank() == 0: print('\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) for epoch in range(1, epochs + 1): train(epoch) test()
def cli( gpu_embedding_gb_per_rank, gpu_embedding_dim, gpu_num_tables, gpu_bag_size, cpu_embedding_gb_per_rank, cpu_embedding_dim, cpu_num_tables, cpu_bag_size, dense_features_dim, over_dim, batch_size, iters, fp16, ): fp16 = int(fp16) hvd.init(comm=MPI.COMM_WORLD.Dup()) import socket import random ip = socket.gethostbyname(socket.gethostname()) # TODO: less hacky port = random.randint(20000, 60000) (master_ip, master_port) = MPI.COMM_WORLD.bcast((ip, port), root=0) MPI.COMM_WORLD.Barrier() dist.init_process_group( "nccl", init_method=f"file:///private/home/tulloch/src/bigads_{master_ip}_{master_port}.rendevouz", rank=hvd.rank(), world_size=hvd.size(), ) logging.info( f"Horovod initialized: size={hvd.size()}, rank={hvd.rank()}, local_rank={hvd.local_rank()}" ) torch.cuda.set_device(0) # hvd.local_rank()) elem_size = 4 if not fp16 else 2 cpu_num_embeddings = div_round_up( cpu_embedding_gb_per_rank * hvd.size() * 1024 * 1024 * 1024 / (elem_size * cpu_num_tables * cpu_embedding_dim), hvd.size(), ) cpu_embedding_dim = div_round_up(cpu_embedding_dim, hvd.size()) gpu_num_tables = div_round_up(gpu_num_tables, hvd.size()) gpu_num_embeddings = int( ( gpu_embedding_gb_per_rank * 1024 * 1024 * 1024 / (elem_size * (gpu_num_tables // hvd.size()) * gpu_embedding_dim) ) ) batch_size = div_round_up(batch_size, hvd.size()) name = "mixed-gpu-cpu" benchmark_torch_mixed_snn( name, gpu_num_tables, gpu_num_embeddings, gpu_embedding_dim, cpu_num_tables, cpu_num_embeddings, cpu_embedding_dim, dense_features_dim, over_dim, batch_size, gpu_bag_size, cpu_bag_size, iters=iters, fp16=fp16, )
def train(args): hvd.init() print("Hello from local_rank {}/{}, rank {}/{}".format( hvd.local_rank(), hvd.local_size(), hvd.rank(), hvd.size())) verbose = hvd.rank() == 0 if verbose: print('Using PyTorch version:', torch.__version__) print('Horovod version: {}, CUDA: {}, ROCM: {}, NCCL: {}, MPI: {}'.format( hvd_version, hvd.cuda_built(), hvd.rocm_built(), hvd.nccl_built(), hvd.mpi_built())) print(torch.__config__.show()) cudnn.benchmark = True torch.cuda.set_device(hvd.local_rank()) world_size = hvd.size() # Set up standard model. if verbose: print('Using {} model'.format(args.model)) model = getattr(models, args.model)() model = model.cuda() # import torch.multiprocessing as mp # # # assert "forkserver" in mp.get_all_start_methods() # mp.set_start_method("forkserver") lr_scaler = hvd.size() criterion = nn.CrossEntropyLoss().cuda() optimizer = torch.optim.SGD(model.parameters(), 1e-4 * lr_scaler) optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters()) train_dataset = dataset_from_datadir(args.datadir, verbose) train_sampler = DistributedSampler(train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = DataLoader(dataset=train_dataset, batch_size=args.batchsize, shuffle=False, num_workers=args.workers, pin_memory=False, sampler=train_sampler, multiprocessing_context='forkserver') hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) total_step = args.steps if args.steps is not None else len(train_loader) # For each block of printed steps last_start = datetime.now() last_images = 0 # For final average avg_images = 0 avg_start = None tot_steps = 0 for epoch in range(args.epochs): for i, (images, labels) in enumerate(train_loader): images = images.cuda(non_blocking=True) labels = labels.cuda(non_blocking=True) outputs = model(images) loss = criterion(outputs, labels) optimizer.zero_grad() loss.backward() optimizer.step() li = len(images) last_images += li tot_steps += 1 if tot_steps == args.warmup_steps: avg_start = datetime.now() elif tot_steps > args.warmup_steps: avg_images += li if (i + 1) % args.print_steps == 0 and verbose: now = datetime.now() last_secs = (now-last_start).total_seconds() print(f'Epoch [{epoch+1}/{args.epochs}], Step [{i+1}/{total_step}], ' f'Loss: {loss.item():.4f}, ' f'Images/sec: {last_images*world_size/last_secs:.2f} ' f'(last {args.print_steps} steps)') last_start = now last_images = 0 if args.steps is not None and i >= args.steps: break if verbose: dur = datetime.now() - avg_start print(f"Training completed in: {dur}") print(f"Images/sec: {avg_images*world_size/dur.total_seconds():.2f} " f"(average, skipping {args.warmup_steps} warmup steps)")
def train(args): logger.debug("Number of gpus available - {}".format(args.num_gpus)) # Horovod: initialize library hvd.init() torch.manual_seed(args.seed) # Horovod: pin GPU to local rank torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) # Horovod: limit number of CPU threads to be used per worker torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} train_loader = _get_train_data_loader(args.batch_size, args.data_dir, **kwargs) test_loader = _get_test_data_loader(args.test_batch_size, args.data_dir, **kwargs) logger.debug("Processes {}/{} ({:.0f}%) of train data".format( len(train_loader.sampler), len(train_loader.dataset), 100. * len(train_loader.sampler) / len(train_loader.dataset))) logger.debug("Processes {}/{} ({:.0f}%) of test data".format( len(test_loader.sampler), len(test_loader.dataset), 100. * len(test_loader.sampler) / len(test_loader.dataset))) model = Net() lr_scaler = hvd.size() model.cuda() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD(model.parameters(), lr=args.lr * lr_scaler, momentum=args.momentum) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) for epoch in range(1, args.epochs + 1): model.train() for batch_idx, (data, target) in enumerate(train_loader, 1): data, target = data.cuda(), target.cuda() optimizer.zero_grad() output = model(data) loss = F.nll_loss(output, target) loss.backward() optimizer.step() if batch_idx % args.log_interval == 0: logger.info( 'Train Epoch: {} [{}/{} ({:.0f}%)] Loss: {:.6f}'.format( epoch, batch_idx * len(data), len(train_loader.sampler), 100. * batch_idx / len(train_loader), loss.item())) test(model, test_loader) save_model(model, args.model_dir)
def train(serialized_model): import horovod.torch as hvd # Horovod: initialize library. hvd.init() with tempfile.TemporaryDirectory( ) as last_ckpt_dir, remote_store.get_local_output_dir( ) as run_output_dir: last_ckpt_file = os.path.join(last_ckpt_dir, 'last.ckpt') if ckpt_bytes: with open(last_ckpt_file, 'wb') as f: f.write(ckpt_bytes) # TODO: Pass the logger from estimator constructor logs_path = os.path.join(run_output_dir, remote_store.logs_subdir) # Use default logger if no logger is supplied train_logger = logger if train_logger is None: train_logger = TensorBoardLogger(logs_path) # TODO: find a way to use a ckpt_path created from the remote store, but ingest all other parameters from the estimator config # ckpt_path = os.path.join(run_output_dir, remote_store.checkpoint_filename) # os.makedirs(ckpt_path, exist_ok=True) # model_checkpoint_callback = ModelCheckpoint(dirpath=ckpt_path) # callbacks.append(model_checkpoint_callback) is_model_checkpoint_callback_exist = False if callbacks is not None: for cb in callbacks: if isinstance(cb, ModelCheckpoint): is_model_checkpoint_callback_exist = True break model = deserialize(serialized_model) _train_steps_per_epoch = train_steps_per_epoch if train_steps_per_epoch else \ int(math.floor(float(train_rows) / batch_size / hvd.size())) _val_steps_per_epoch = val_steps_per_epoch if val_steps_per_epoch else \ int(math.floor(float(val_rows) / val_batch_size / hvd.size())) print( f"Training data of rank[{hvd.local_rank()}]: train_rows:{train_rows}, batch_size:{batch_size}, _train_steps_per_epoch:{_train_steps_per_epoch}." ) print( f"Validation data of rank[{hvd.local_rank()}]: val_rows:{val_rows}, val_batch_size:{val_batch_size}, _val_steps_per_epoch:{_val_steps_per_epoch}, should_validate:{should_validate}" ) cuda_available = torch.cuda.is_available() # We need to check that all ranks have the same device type for training. # Horovod doesn't support heterogeneous allreduce for gradients. cuda_avail_list = hvd.allgather_object(cuda_available, name='device type') if cuda_avail_list.count(cuda_available) != hvd.size(): raise RuntimeError("All ranks don't have the same device type!") if cuda_available: # Horovod: pin GPU to local rank or the assigned GPU from spark. torch.cuda.set_device( _get_assigned_gpu_or_default(default=hvd.local_rank())) # Move model to GPU.
model.cuda() _num_gpus = num_gpus if _num_gpus is None: _num_gpus = 1 if cuda_available else 0 kwargs = { 'accelerator': 'horovod', 'gpus': _num_gpus, 'callbacks': callbacks, 'max_epochs': epochs, 'logger': train_logger, 'log_every_n_steps': log_every_n_steps, 'resume_from_checkpoint': (last_ckpt_file if ckpt_bytes else None), 'checkpoint_callback': is_model_checkpoint_callback_exist, 'num_sanity_val_steps': 0, 'reload_dataloaders_every_epoch': False, 'progress_bar_refresh_rate': _train_steps_per_epoch // 10 } print("Creating trainer with: \n ", kwargs) trainer = Trainer(**kwargs) print(f"pytorch_lightning version={pl.__version__}") # print row group # pq.ParquetFile(remote_store.train_data_path) # for rowgroup in range(pq_file.metadata.num_row_groups): # row_group = pq_file.metadata.row_group(rowgroup) # print(row_group) with set_data_loader(model, remote_store.train_data_path, 'train_dataloader', train_reader_worker_count, reader_pool_type, calculate_shuffle_buffer_size(), name="train_dataloader", limit_step_per_epoch=_train_steps_per_epoch), \ set_data_loader(model, remote_store.val_data_path, 'val_dataloader', val_reader_worker_count, reader_pool_type, 0, should_validate, name="val_dataloader", limit_step_per_epoch=_val_steps_per_epoch): trainer.fit(model) serialized_checkpoint = io.BytesIO() module = model if not is_legacy else model._model # TODO: find a way to pass trainer.logged_metrics out. output = {'model': module.state_dict()} torch.save(output, serialized_checkpoint) serialized_checkpoint.seek(0) return serialized_checkpoint
def main(opts): hvd.init() n_gpu = hvd.size() device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) rank = hvd.rank() LOGGER.info("device: {} n_gpu: {}, rank: {}, 16-bits training: {}".format(device, n_gpu, hvd.rank(), opts.fp16)) if hvd.rank() != 0: LOGGER.disabled = True hps_file = f'{opts.output_dir}/log/hps.json' model_opts = Struct(load_json(hps_file)) model_config = f'{opts.output_dir}/log/model_config.json' # load DBs and image dirs video_ids = get_video_ids(opts.query_txt_db) if opts.task != "didemo_video_only": video_db = load_video_sub_dataset(opts.vfeat_db, opts.sub_txt_db, model_opts.vfeat_interval, model_opts) else: txt_meta = load_json(os.path.join(opts.query_txt_db, "meta.json")) video_db = load_video_only_dataset(opts.vfeat_db, txt_meta, model_opts.vfeat_interval, model_opts) assert opts.split in opts.query_txt_db q_txt_db = QueryTokLmdb(opts.query_txt_db, -1) if opts.task != "didemo_video_only": inf_dataset = VcmrFullEvalDataset else: inf_dataset = VcmrVideoOnlyFullEvalDataset eval_dataset = inf_dataset(video_ids, video_db, q_txt_db, distributed=model_opts.distributed_eval) # Prepare model if exists(opts.checkpoint): ckpt_file = opts.checkpoint else: ckpt_file = f'{opts.output_dir}/ckpt/model_step_{opts.checkpoint}.pt' checkpoint = torch.load(ckpt_file) img_pos_embed_weight_key = ("v_encoder.f_encoder.img_embeddings.position_embeddings.weight") assert img_pos_embed_weight_key in checkpoint max_frm_seq_len = len(checkpoint[img_pos_embed_weight_key]) model = HeroForVcmr.from_pretrained( model_config, state_dict=checkpoint, vfeat_dim=VFEAT_DIM, max_frm_seq_len=max_frm_seq_len, lw_neg_ctx=model_opts.lw_neg_ctx, lw_neg_q=model_opts.lw_neg_q, lw_st_ed=0, ranking_loss_type=model_opts.ranking_loss_type, use_hard_negative=False, hard_pool_size=model_opts.hard_pool_size, margin=model_opts.margin, use_all_neg=model_opts.use_all_neg, drop_svmr_prob=model_opts.drop_svmr_prob) model.to(device) if opts.fp16: model = amp.initialize(model, enabled=opts.fp16, opt_level='O2') eval_dataloader = DataLoader(eval_dataset, batch_size=opts.batch_size, num_workers=opts.n_workers, pin_memory=opts.pin_mem, collate_fn=vcmr_full_eval_collate) eval_dataloader = PrefetchLoader(eval_dataloader) _, results = validate_full_vcmr(model, eval_dataloader, opts.split, opts, model_opts) result_dir = f'{opts.output_dir}/results_{opts.split}' if not exists(result_dir) and rank == 0: os.makedirs(result_dir) all_results_list = all_gather_list(results) if hvd.rank() == 0: # save for only one time all_results = {"video2idx": all_results_list[0]["video2idx"]} for rank_id in range(hvd.size()): for key, val in all_results_list[rank_id].items(): if key == "video2idx": continue if key not in all_results: all_results[key] = [] all_results[key].extend(all_results_list[rank_id][key]) LOGGER.info('All results joined......') # save_vr(all_results, f'{result_dir}/results_{opts.checkpoint}_{opts.split}_vr.json') # save_vcmr_base_on_vr(all_results, f'{result_dir}/results_{opts.checkpoint}_{opts.split}_vcmr_base_on_vr.json') save_vcmr(all_results, f'{result_dir}/results_{opts.checkpoint}_{opts.split}_vcmr.json')
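# NOTE (added sketch): all_gather_list above is a repo-specific helper. The
# same per-rank result merge can be written with Horovod's own
# hvd.allgather_object; a hedged sketch with illustrative result keys:
import horovod.torch as hvd

hvd.init()
local_results = {'query_ids': [hvd.rank()], 'scores': [0.5]}
gathered = hvd.allgather_object(local_results)  # list with one dict per rank
if hvd.rank() == 0:
    merged = {k: sum((r[k] for r in gathered), []) for k in gathered[0]}
    print(merged)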
def fn(): hvd.init() devices = get_available_devices() return devices, hvd.local_rank()
def __init__(self, model: torch.nn.Module, input_dimension: int, output_dimension: int, pde_loss: PDELoss, initial_condition: InitialCondition, boundary_condition, use_gpu=True, use_horovod=False): """ Initializes a physics-informed neural network (PINN). A PINN consists of a model which represents the solution of the underlying partial differential equation (PDE) u, three loss terms representing the initial condition (IC), the boundary condition (BC) and the PDE, and a dataset which represents the bounded domain U. Args: model : is the model which is trained to represent the underlying PDE input_dimension : represents the dimension of the input vector x output_dimension : represents the dimension of the solution u pde_loss: Instance of the PDELoss class. Represents the underlying PDE initial_condition: Instance of the InitialCondition class. Represents the initial condition boundary_condition (BoundaryCondition, list): Instance of the BoundaryCondition class or a list of instances of the BoundaryCondition class use_gpu: enables gpu usage use_horovod: enables horovod support """ super(PINN, self).__init__() # check that the model is a torch module; more thorough model checking could be added self.use_gpu = use_gpu self.use_horovod = use_horovod self.rank = 0 # initialize rank 0 by default in order to make the fit method more flexible if self.use_horovod: # Initialize Horovod hvd.init() # Pin GPU to be used to process local rank (one GPU per process) torch.cuda.set_device(hvd.local_rank()) self.rank = hvd.rank() if isinstance(model, nn.Module): self.model = model if self.use_gpu: self.model.cuda() self.dtype = torch.cuda.FloatTensor else: self.dtype = torch.FloatTensor else: raise TypeError("Only models of type torch.nn.Module are allowed") # checking if the input dimension is well defined if not type(input_dimension) is int: raise TypeError("Only integers are allowed as input dimension") elif input_dimension <= 0: raise ValueError("Input dimension has to be greater than zero") else: self.input_dimension = input_dimension # checking if the output dimension is well defined if not type(output_dimension) is int: raise TypeError("Only integers are allowed as output dimension") elif output_dimension <= 0: raise ValueError("Output dimension has to be greater than zero") else: self.output_dimension = output_dimension if isinstance(pde_loss, PDELoss): self.pde_loss = pde_loss self.is_hpm = False else: raise TypeError( "PDE loss has to be an instance of a PDE Loss class") if isinstance(pde_loss, HPMLoss): self.is_hpm = True if isinstance(initial_condition, InitialCondition): self.initial_condition = initial_condition else: raise TypeError( "Initial condition has to be an instance of the InitialCondition class" ) joined_datasets = { "Initial_Condition": initial_condition.dataset, "PDE": pde_loss.dataset } if not self.is_hpm: if type(boundary_condition) is list: for bc in boundary_condition: if not isinstance(bc, BoundaryCondition): raise TypeError( "Boundary Condition has to be an instance of the BoundaryCondition class " ) self.boundary_condition = boundary_condition joined_datasets[bc.name] = bc.dataset else: if isinstance(boundary_condition, BoundaryCondition): self.boundary_condition = boundary_condition else: raise TypeError( "Boundary Condition has to be an instance of the BoundaryCondition class " "or a list of instances of the BoundaryCondition class" ) self.dataset = JoinedDataset(joined_datasets)
def train_fn(): # Horovod: initialize library. hvd.init() torch.manual_seed(args.seed) args.cuda = not args.no_cuda and torch.cuda.is_available() if args.cuda: # Horovod: pin GPU to local rank. torch.cuda.set_device(hvd.local_rank()) torch.cuda.manual_seed(args.seed) # Horovod: limit # of CPU threads to be used per worker. torch.set_num_threads(1) kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} train_dataset = \ datasets.MNIST('data-%d' % hvd.rank(), train=True, download=True, transform=transforms.Compose([ transforms.ToTensor(), transforms.Normalize((0.1307,), (0.3081,)) ])) # Horovod: use DistributedSampler to partition the training data. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) transformations = transforms.Compose( [transforms.ToTensor(), transforms.Normalize((0.1307, ), (0.3081, ))]) test_dataset = datasets.MNIST( 'data-%d' % hvd.rank(), train=False, transform=transformations) # Horovod: use DistributedSampler to partition the test data. test_sampler = torch.utils.data.distributed.DistributedSampler( test_dataset, num_replicas=hvd.size(), rank=hvd.rank()) test_loader = torch.utils.data.DataLoader( test_dataset, batch_size=args.test_batch_size, sampler=test_sampler, **kwargs) model = Net() # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() if not args.use_adasum else 1 if args.cuda: # Move model to GPU. model.cuda() # If using GPU Adasum allreduce, scale learning rate by local_size. if args.use_adasum and hvd.nccl_built(): lr_scaler = hvd.local_size() # Horovod: scale learning rate by lr_scaler. optimizer = optim.SGD( model.parameters(), lr=args.lr * lr_scaler, momentum=args.momentum) # Horovod: (optional) compression algorithm. compression = (hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none) @hvd.elastic.run def train(state): # post synchronization event (worker added, worker removed) init ... for state.epoch in range(state.epoch, args.epochs + 1): state.model.train() train_sampler.set_epoch(state.epoch) steps_remaining = len(train_loader) - state.batch for state.batch, (data, target) in enumerate(train_loader): if state.batch >= steps_remaining: break check_rank(state.epoch) if args.cuda: data, target = data.cuda(), target.cuda() state.optimizer.zero_grad() output = state.model(data) loss = F.nll_loss(output, target) loss.backward() state.optimizer.step() if state.batch % args.log_interval == 0: # Horovod: use train_sampler to determine # the number of examples in this worker's partition. print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'. format(state.epoch, state.batch * len(data), len(train_sampler), 100.0 * state.batch / len(train_loader), loss.item())) state.commit() state.batch = 0 def test(): model.eval() test_loss = 0. test_accuracy = 0. for data, target in test_loader: if args.cuda: data, target = data.cuda(), target.cuda() output = model(data) # sum up batch loss test_loss += F.nll_loss(output, target, size_average=False).item() # get the index of the max log-probability pred = output.data.max(1, keepdim=True)[1] test_accuracy += pred.eq( target.data.view_as(pred)).cpu().float().sum() # Horovod: use test_sampler to determine the number of examples in # this worker's partition. test_loss /= len(test_sampler) test_accuracy /= len(test_sampler) # Horovod: average metric values across workers. 
test_loss = metric_average(test_loss, 'avg_loss') test_accuracy = metric_average(test_accuracy, 'avg_accuracy') # Horovod: print output only on first rank. if hvd.rank() == 0: print( '\nTest set: Average loss: {:.4f}, Accuracy: {:.2f}%\n'.format( test_loss, 100. * test_accuracy)) # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, op=hvd.Adasum if args.use_adasum else hvd.Average) # adjust learning rate on reset def on_state_reset(): for param_group in optimizer.param_groups: param_group['lr'] = args.lr * hvd.size() state = hvd.elastic.TorchState(model, optimizer, epoch=1, batch=0) state.register_reset_callbacks([on_state_reset]) train(state) test()
def train(args): # initialize Horovod library hvd.init() # Horovod: limit CPU threads to be used per worker torch.set_num_threads(1) # disable logging for all processes except local rank 0 on every node if hvd.local_rank() != 0: f = open(os.devnull, "w") sys.stdout = sys.stderr = f elif not os.path.exists(args.dir): # create 40 random image/mask pairs on the master node for training print( f"generating synthetic data to {args.dir} (this may take a while)") os.makedirs(args.dir) # set random seed to generate same random data for every node np.random.seed(seed=0) for i in range(40): im, seg = create_test_image_3d(128, 128, 128, num_seg_classes=1, channel_dim=-1) n = nib.Nifti1Image(im, np.eye(4)) nib.save(n, os.path.join(args.dir, f"img{i:d}.nii.gz")) n = nib.Nifti1Image(seg, np.eye(4)) nib.save(n, os.path.join(args.dir, f"seg{i:d}.nii.gz")) images = sorted(glob(os.path.join(args.dir, "img*.nii.gz"))) segs = sorted(glob(os.path.join(args.dir, "seg*.nii.gz"))) train_files = [{"img": img, "seg": seg} for img, seg in zip(images, segs)] # define transforms for image and segmentation train_transforms = Compose([ LoadNiftid(keys=["img", "seg"]), AsChannelFirstd(keys=["img", "seg"], channel_dim=-1), ScaleIntensityd(keys="img"), RandCropByPosNegLabeld(keys=["img", "seg"], label_key="seg", spatial_size=[96, 96, 96], pos=1, neg=1, num_samples=4), RandRotate90d(keys=["img", "seg"], prob=0.5, spatial_axes=[0, 2]), ToTensord(keys=["img", "seg"]), ]) # create a training data loader train_ds = Dataset(data=train_files, transform=train_transforms) # create a training data sampler train_sampler = DistributedSampler(train_ds, num_replicas=hvd.size(), rank=hvd.rank()) # when supported, use "forkserver" to spawn dataloader workers instead of "fork" to prevent # issues with Infiniband implementations that are not fork-safe multiprocessing_context = None if hasattr( mp, "_supports_context" ) and mp._supports_context and "forkserver" in mp.get_all_start_methods(): multiprocessing_context = "forkserver" # use batch_size=2 to load images and use RandCropByPosNegLabeld to generate 2 x 4 images for network training train_loader = DataLoader( train_ds, batch_size=2, shuffle=False, num_workers=2, pin_memory=True, sampler=train_sampler, multiprocessing_context=multiprocessing_context, ) # create UNet, DiceLoss and Adam optimizer device = torch.device(f"cuda:{hvd.local_rank()}") model = monai.networks.nets.UNet( dimensions=3, in_channels=1, out_channels=1, channels=(16, 32, 64, 128, 256), strides=(2, 2, 2, 2), num_res_units=2, ).to(device) loss_function = monai.losses.DiceLoss(sigmoid=True).to(device) optimizer = torch.optim.Adam(model.parameters(), 1e-3) # Horovod broadcasts parameters & optimizer state hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # Horovod wraps optimizer with DistributedOptimizer optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) # start a typical PyTorch training epoch_loss_values = list() for epoch in range(5): print("-" * 10) print(f"epoch {epoch + 1}/{5}") model.train() epoch_loss = 0 step = 0 train_sampler.set_epoch(epoch) for batch_data in train_loader: step += 1 inputs, labels = batch_data["img"].to( device), batch_data["seg"].to(device) optimizer.zero_grad() outputs = model(inputs) loss = loss_function(outputs, labels) loss.backward() optimizer.step() epoch_loss += loss.item() epoch_len = len(train_ds) // train_loader.batch_size print(f"{step}/{epoch_len}, train_loss: {loss.item():.4f}") epoch_loss /= step
epoch_loss_values.append(epoch_loss) print(f"epoch {epoch + 1} average loss: {epoch_loss:.4f}") print(f"train completed, epoch losses: {epoch_loss_values}") if hvd.rank() == 0: # all processes should see same parameters as they all start from same # random parameters and gradients are synchronized in backward passes, # therefore, saving it in one process is sufficient torch.save(model.state_dict(), "final_model.pth")
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-exp_dir")
    parser.add_argument("-dataPath", default='', type=str, help="path of data files")
    parser.add_argument("-train_config")
    parser.add_argument("-data_config")
    parser.add_argument("-lr", default=0.0001, type=float, help="override the LR in the config")
    parser.add_argument("-batch_size", default=32, type=int, help="override the batch size in the config")
    parser.add_argument("-data_loader_threads", default=0, type=int, help="number of workers for data loading")
    parser.add_argument("-max_grad_norm", default=5, type=float, help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size", default=200, type=float, help="process n hours of data per sweep (default: 200)")
    parser.add_argument("-num_epochs", default=1, type=int, help="number of training epochs (default: 1)")
    # note: type=bool is an argparse pitfall (bool("False") is True), so boolean
    # options are defined as store_true switches instead
    parser.add_argument("-global_mvn", action="store_true",
                        help="whether to apply global mean and variance normalization")
    parser.add_argument("-resume_from_model", type=str, help="the model from which you want to resume training")
    parser.add_argument("-dropout", type=float, help="set the dropout ratio")
    parser.add_argument("-anneal_lr_epoch", default=2, type=int,
                        help="start to anneal the learning rate from this epoch")
    parser.add_argument("-anneal_lr_ratio", default=0.5, type=float,
                        help="the ratio by which to anneal the learning rate")
    parser.add_argument("-print_freq", default=100, type=int, metavar='N', help="print frequency (default: 100)")
    parser.add_argument("-hvd", action="store_true", help="whether to use Horovod for training")
    args = parser.parse_args()

    with open(args.train_config) as f:
        config = yaml.safe_load(f)
    config["sweep_size"] = args.sweep_size

    with open(args.data_config) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]
    if 'dir_noise' in data:
        config["dir_noise_paths"] = [j for i, j in data['dir_noise'].items()]
    if 'rir' in data:
        config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(json.dumps(config, sort_keys=True, indent=4)))

    # initialize Horovod
    if args.hvd:
        import horovod.torch as hvd
        hvd.init()
        th.cuda.set_device(hvd.local_rank())
        print("Run experiments with world size {}".format(hvd.size()))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    trainset = SpeechDataset(config)
    train_dataloader = ChunkDataloader(trainset,
                                       batch_size=args.batch_size,
                                       distributed=args.hvd,
                                       num_workers=args.data_loader_threads)

    if args.global_mvn:
        transform = GlobalMeanVarianceNormalization()
        print("Estimating global mean and variance of feature vectors...")
        transform.learn_mean_and_variance_from_train_loader(
            trainset, trainset.stream_idx_for_transform, n_sample_to_use=2000)
        trainset.transform = transform
        print("Global mean and variance transform trained successfully!")
        with open(args.exp_dir + "/transform.pkl", 'wb') as f:
            pickle.dump(transform, f, pickle.HIGHEST_PROTOCOL)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2, model_config["label_size"])

    # start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    if args.hvd:
        # broadcast parameters and optimizer state from rank 0 to all other processes
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        # add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:
        assert os.path.isfile(args.resume_from_model), \
            "ERROR: model file {} does not exist!".format(args.resume_from_model)
        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}'".format(args.resume_from_model))

    model.train()
    for epoch in range(start_epoch, args.num_epochs):
        # anneal the learning rate
        if epoch > args.anneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.anneal_lr_ratio

        run_train_epoch(model, optimizer, criterion, train_dataloader, epoch, args)

        # save the model on rank 0 only
        if not args.hvd or hvd.rank() == 0:
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'epoch': epoch,
            }
            output_file = args.exp_dir + '/model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
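# run_train_epoch() is not part of this snippet, but the -max_grad_norm flag above
# implies gradient clipping inside it. A minimal sketch (the helper name and
# signature are assumptions) of combining clipping with hvd.DistributedOptimizer,
# following the documented Horovod pattern: synchronize() completes the gradient
# allreduce before clipping, and skip_synchronize() avoids repeating it in step().
def clipped_step(model, optimizer, loss, max_grad_norm, use_hvd):
    optimizer.zero_grad()
    loss.backward()
    if use_hvd:
        optimizer.synchronize()  # finish averaging gradients across workers
        th.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        with optimizer.skip_synchronize():
            optimizer.step()  # apply the already-synchronized gradients
    else:
        th.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()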
def fn():
    hvd.init()
    res = hvd.allgather(torch.tensor([hvd.rank()])).tolist()
    return res, hvd.rank()
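# fn() above is shaped for Horovod's programmatic launcher: each worker process
# runs the function and the launcher collects the per-rank return values. A
# hypothetical driver, assuming a Horovod version that exposes the horovod.run API:
import horovod

results = horovod.run(fn, np=2)
# with two workers, each entry is ([0, 1], rank): the allgathered ranks plus
# that worker's own rank
print(results)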
def main():
    hvd.init()
    seed = args.seed + hvd.rank()
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.set_device(hvd.local_rank())

    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])
    kwargs = {'num_workers': 4, 'pin_memory': True, 'batch_size': args.batch_size}
    trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
    train_sampler = data.distributed.DistributedSampler(
        trainset, num_replicas=hvd.size(), rank=hvd.rank())
    train_loader = data.DataLoader(trainset, sampler=train_sampler, **kwargs)
    testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)
    test_sampler = data.distributed.DistributedSampler(
        testset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = data.DataLoader(testset, sampler=test_sampler, **kwargs)

    device = torch.device('cuda:{}'.format(hvd.local_rank()))
    obs_dim = trainset[0][0].shape
    model = PixelSNAIL(obs_dim).to(device)

    optimizer = optim.Adam(model.parameters(), lr=args.lr)
    optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    if hvd.rank() == 0:
        total_parameters = sum(
            [np.prod(p.shape) for p in model.parameters() if p.requires_grad])
        print('Total Parameters {}'.format(total_parameters))

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    folder = './models'
    if not exists(folder):
        os.makedirs(folder)
    model_fname = join(folder, 'pixel_snail.pt')

    for epoch in range(args.epochs):
        MPI.COMM_WORLD.Barrier()
        train(model, optimizer, device, train_loader, epoch)
        test(model, device, test_loader, epoch)
        if hvd.rank() == 0:
            sample(model, device, epoch)
            torch.save(dict(model=model, optimizer=optimizer.state_dict()), model_fname)
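# test() above is not included in this snippet. Since test_sampler shards the test
# set, each rank evaluates only 1/hvd.size() of it, so a global metric has to be
# averaged across workers; a minimal, hypothetical helper for that (hvd.allreduce
# averages across ranks by default):
def average_metric(value, name):
    return hvd.allreduce(torch.tensor(value), name=name).item()

# e.g. inside test(): test_loss = average_metric(test_loss, 'avg_test_loss')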