def restore_model(self):
    if self._rank == 0:
        # Only the root rank restores the saved model from disk:
        state = trainer_eventID.restore_model(self)
    else:
        state = None

    if self.args.distributed_mode == "horovod":
        # Broadcast the global step:
        self._global_step = hvd.broadcast_object(self._global_step, root_rank=0)
        # Broadcast the state of the model:
        hvd.broadcast_parameters(self._net.state_dict(), root_rank=0)
        # Broadcast the optimizer state:
        hvd.broadcast_optimizer_state(self._opt, root_rank=0)
        # Horovod doesn't actually move the optimizer onto a GPU:
        if self.args.compute_mode == "GPU":
            for opt_state in self._opt.state.values():
                for k, v in opt_state.items():
                    if torch.is_tensor(v):
                        opt_state[k] = v.cuda()
        # Broadcast the LR schedule state:
        state_dict = hvd.broadcast_object(self.lr_scheduler.state_dict(), root_rank=0)
    elif self.args.distributed_mode == "DDP":
        if self.args.compute_mode == "GPU":
            self._net.cuda()
        self._net = torch.nn.parallel.DistributedDataParallel(self._net)
        self._global_step = MPI.COMM_WORLD.bcast(self._global_step, root=0)
        state_dict = MPI.COMM_WORLD.bcast(self.lr_scheduler.state_dict(), root=0)
def train_loop_per_worker(config):
    import torch
    import horovod.torch as hvd

    hvd.init()
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    mode = config["mode"]
    net = Net(mode).to(device)
    optimizer = torch.optim.SGD(
        net.parameters(),
        lr=config["lr"],
    )
    optimizer = hvd.DistributedOptimizer(optimizer)

    num_steps = 5
    print(hvd.size())
    np.random.seed(1 + hvd.rank())
    torch.manual_seed(1234)
    # To ensure consistent initialization across workers,
    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    start = time.time()
    x_max = config["x_max"]
    for step in range(1, num_steps + 1):
        features = torch.Tensor(np.random.rand(1) * 2 * x_max - x_max).to(device)
        if mode == "square":
            labels = sq(features)
        else:
            labels = qu(features)
        optimizer.zero_grad()
        outputs = net(features)
        loss = torch.nn.MSELoss()(outputs, labels)
        loss.backward()
        optimizer.step()
        time.sleep(0.1)
        train.report(loss=loss.item())
    total = time.time() - start
    print(f"Took {total:0.3f} s. Avg: {total / num_steps:0.3f} s.")
def train(args):
    train_loader, val_loader, train_sampler, _ = prepare_data(args)
    assert (cuda.is_available() and cuda.device_count() > 0)

    net = models.__dict__[args.arch]()
    net = net.cuda()

    optimizer = optim.SGD(net.parameters(),
                          lr=args.lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=net.named_parameters())
    lr_scheduler = optim.lr_scheduler.MultiStepLR(
        optimizer,
        milestones=[int(args.epoch * 0.5), int(args.epoch * 0.75)],
        gamma=0.1)

    hvd.broadcast_parameters(net.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    best_acc = 0
    checkpoint = {}
    for epochid in range(args.epoch):
        # Horovod: set epoch to sampler for shuffling.
        train_sampler.set_epoch(epochid)
        print("==> Training Epoch %d, Learning Rate %.4f" %
              (epochid, lr_scheduler.get_lr()[0]))
        train_epoch(net, train_loader, optimizer, args)

        print('==> Validating ')
        acc = validate(net, val_loader, args)
        lr_scheduler.step()

        if acc > best_acc:
            best_acc = acc
            checkpoint = net.state_dict()

    fname = args.arch + '_' + str(best_acc) + '.pth.tar'
    os.makedirs(args.outdir, exist_ok=True)
    fname = os.path.join(args.outdir, fname)
    if hvd.rank() == 0:
        torch.save(checkpoint, fname)
    print('Best Accuracy: ', best_acc)
def setup(self, trainer: "pl.Trainer") -> None: self.model_to_device() super().setup(trainer) self._exit_stack = ExitStack() self._exit_stack.__enter__() if not self.lightning_module.trainer.training: # no need to setup optimizers return def _unpack_lightning_optimizer(opt): return opt._optimizer if isinstance(opt, LightningOptimizer) else opt optimizers = self.optimizers optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers] # Horovod: scale the learning rate by the number of workers to account for # increased total batch size for optimizer in optimizers: for param_group in optimizer.param_groups: param_group["lr"] *= self.world_size # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR lr_schedulers = self.lightning_module.trainer.lr_schedulers for scheduler in lr_schedulers: scheduler = scheduler["scheduler"] if isinstance(scheduler, _LRScheduler): scheduler.base_lrs = [lr * self.world_size for lr in scheduler.base_lrs] # Horovod: broadcast parameters & optimizer state to ensure consistent initialization hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0) for optimizer in optimizers: hvd.broadcast_optimizer_state(optimizer, root_rank=0) self.optimizers = self._wrap_optimizers(optimizers) for optimizer in self.optimizers: # Synchronization will be performed explicitly following backward() self._exit_stack.enter_context(optimizer.skip_synchronize())
def __init__(self, wf=None, sampler=None, optimizer=None,
             scheduler=None, output=None, rank=0):
    """Distributed QMC solver

    Args:
        wf (qmctorch.WaveFunction, optional): wave function. Defaults to None.
        sampler (qmctorch.sampler, optional): Sampler. Defaults to None.
        optimizer (torch.optim, optional): optimizer. Defaults to None.
        scheduler (torch.optim, optional): scheduler. Defaults to None.
        output (str, optional): hdf5 filename. Defaults to None.
        rank (int, optional): rank of the process. Defaults to 0.
    """
    SolverOrbital.__init__(self, wf, sampler, optimizer,
                           scheduler, output, rank)

    hvd.broadcast_optimizer_state(self.opt, root_rank=0)
    self.opt = hvd.DistributedOptimizer(
        self.opt, named_parameters=self.wf.named_parameters())

    self.sampler.nwalkers //= hvd.size()
    self.sampler.walkers.nwalkers //= hvd.size()
def _broadcast_state(graph, optimizer, step=None, epoch=None, graph_ema=None):
    # broadcast parameters state.
    hvd.broadcast_parameters(graph.state_dict(), root_rank=0)
    if graph_ema is not None:
        hvd.broadcast_parameters(graph_ema.state_dict(), root_rank=0)

    # broadcast optimizer state.
    if optimizer is not None:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # broadcast resume_from_epoch from rank 0 (which will have
    # checkpoints) to other ranks.
    if step is not None:
        step = hvd.broadcast(torch.tensor(step), root_rank=0,
                             name='resume_from_step').item()
    if epoch is not None:
        epoch = hvd.broadcast(torch.tensor(epoch), root_rank=0,
                              name='resume_from_epoch').item()
    return step, epoch
def pre_dispatch(self):

    def _unpack_lightning_optimizer(opt):
        return opt._optimizer if isinstance(opt, LightningOptimizer) else opt

    optimizers = self.lightning_module.trainer.optimizers
    optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers]

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in optimizers:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= self.world_size

    # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
    lr_schedulers = self.lightning_module.trainer.lr_schedulers
    for scheduler in lr_schedulers:
        scheduler = scheduler["scheduler"]
        if isinstance(scheduler, _LRScheduler):
            scheduler.base_lrs = [lr * self.world_size for lr in scheduler.base_lrs]

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0)
    for optimizer in optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def _filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups
                          for p in group.get("params", [])])
        return [(name, p) for name, p in model.named_parameters() if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    optimizers = [
        hvd.DistributedOptimizer(
            optimizer,
            named_parameters=_filter_named_parameters(self.lightning_module, optimizer))
        for optimizer in optimizers
    ]

    self.lightning_module.trainer.accelerator.optimizers = optimizers
def get_model(args):
    model = models.RNNModel(args.model, args.ntokens, args.emsize, args.nhid,
                            args.nlayers, args.dropout, args.tied).to(args.device)

    # Horovod: scale learning rate by the number of GPUs.
    args.base_lr = args.base_lr * hvd.size()
    optimizer = optim.SGD(model.parameters(), lr=args.base_lr,
                          momentum=args.momentum, weight_decay=args.wd)

    if args.kfac_update_freq > 0:
        preconditioner = kfac.KFAC(
            model, lr=args.base_lr, stat_decay=args.stat_decay,
            damping=args.damping, kl_clip=args.kl_clip,
            TCov=args.kfac_cov_update_freq,
            TInv=args.kfac_update_freq,
            diag_blocks=args.diag_blocks,
            diag_warmup=args.diag_warmup)
    else:
        preconditioner = None

    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=hvd.Compression.none,
        op=hvd.Average)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    lrs = create_lr_schedule(hvd.size(), args.warmup_epochs, args.lr_decay, alpha=0.25)
    lr_schedules = [LambdaLR(optimizer, lrs)]
    if preconditioner is not None:
        lr_schedules.append(LambdaLR(preconditioner, lrs))

    criterion = nn.NLLLoss()

    return model, optimizer, preconditioner, lr_schedules, lrs, criterion
def pre_dispatch(self):
    if not self.lightning_module.trainer.training:
        # no need to setup optimizers
        return

    def _unpack_lightning_optimizer(opt):
        return opt._optimizer if isinstance(opt, LightningOptimizer) else opt

    optimizers = self.lightning_module.trainer.optimizers
    optimizers = [_unpack_lightning_optimizer(opt) for opt in optimizers]

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in optimizers:
        for param_group in optimizer.param_groups:
            param_group["lr"] *= self.world_size

    # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
    lr_schedulers = self.lightning_module.trainer.lr_schedulers
    for scheduler in lr_schedulers:
        scheduler = scheduler["scheduler"]
        if isinstance(scheduler, _LRScheduler):
            scheduler.base_lrs = [
                lr * self.world_size for lr in scheduler.base_lrs
            ]

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(self.lightning_module.state_dict(), root_rank=0)
    for optimizer in optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    self.lightning_module.trainer.accelerator.optimizers = self._wrap_optimizers(optimizers)
def _init_horovod_setting(self):
    """Init horovod setting."""
    self.is_chief = True

    # SR
    hvd.broadcast_parameters(self.model.netSR.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(self.model.optimizer_SR, root_rank=0)

    # G F
    hvd.broadcast_parameters(self.model.netG.state_dict(), root_rank=0)
    hvd.broadcast_parameters(self.model.netF.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(self.model.optimizer_G, root_rank=0)

    # D_X
    hvd.broadcast_parameters(self.model.netD_X.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(self.model.optimizer_D_X, root_rank=0)

    # D_Y
    hvd.broadcast_parameters(self.model.netD_Y.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(self.model.optimizer_D_Y, root_rank=0)

    if hvd.rank() != 0:
        self.is_chief = False
    else:
        self.is_chief = True
def train(args, train_dataset, model, tokenizer):
    """ Train the model """
    if args.local_rank in [-1, 0]:
        tb_writer = SummaryWriter()

    args.train_batch_size = args.per_gpu_train_batch_size * max(1, args.n_gpu)
    # num_replicas=hvd.size() is added in case Horovod is used
    train_sampler = RandomSampler(train_dataset) if hvd.size() == 0 else DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    train_dataloader = DataLoader(train_dataset,
                                  sampler=train_sampler,
                                  batch_size=args.train_batch_size)

    if args.max_steps > 0:
        t_total = args.max_steps
        args.num_train_epochs = args.max_steps // (
            len(train_dataloader) // args.gradient_accumulation_steps) + 1
        logger.info("max_steps > 0: derived num_train_epochs from max_steps, t_total = %d", t_total)
    else:
        t_total = len(train_dataloader) // (args.gradient_accumulation_steps *
                                            args.num_train_epochs)
        logger.info("len(train_dataloader) = %d", len(train_dataloader))
        logger.info("args.gradient_accumulation_steps = %d", args.gradient_accumulation_steps)
        logger.info("args.num_train_epochs = %d", args.num_train_epochs)
        logger.info("t_total = %d", t_total)

    # Prepare optimizer and schedule (linear warmup and decay)
    no_decay = ['bias', 'LayerNorm.weight']
    optimizer_grouped_parameters = [{
        'params': [p for n, p in model.named_parameters()
                   if not any(nd in n for nd in no_decay)],
        'weight_decay': args.weight_decay
    }, {
        'params': [p for n, p in model.named_parameters()
                   if any(nd in n for nd in no_decay)],
        'weight_decay': 0.0
    }]

    # Multiply the learning rate by hvd.size()
    optimizer = AdamW(optimizer_grouped_parameters,
                      lr=args.learning_rate * hvd.size(),
                      eps=args.adam_epsilon)
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())
    scheduler = WarmupLinearSchedule(optimizer,
                                     warmup_steps=args.warmup_steps,
                                     t_total=t_total)

    if args.fp16:
        try:
            from apex import amp
        except ImportError:
            raise ImportError(
                "Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
        model, optimizer = amp.initialize(model, optimizer,
                                          opt_level=args.fp16_opt_level)

    # multi-gpu training (should be after apex fp16 initialization)
    if args.n_gpu > 1:
        model = torch.nn.DataParallel(model)

    # Distributed training (should be after apex fp16 initialization)
    if args.local_rank != -1:
        model = torch.nn.parallel.DistributedDataParallel(
            model,
            device_ids=[args.local_rank],
            output_device=args.local_rank,
            find_unused_parameters=True)

    logger.info("hvd.rank() = %d", hvd.rank())

    # In case of using GPU, Horovod: (optional) compression algorithm.
    # compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none
    # In case of using GPU, Horovod: wrap optimizer with DistributedOptimizer.
    # optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Train!
    logger.info("***** Running training -- hvd.rank() = %d *****", hvd.rank())
    logger.info("  Num examples = %d", len(train_dataset))
    logger.info("  Num Epochs = %d", args.num_train_epochs)
    logger.info("  Instantaneous batch size per GPU = %d", args.per_gpu_train_batch_size)
    logger.info("  Total train batch size (w. parallel, distributed & accumulation) = %d",
                args.train_batch_size * args.gradient_accumulation_steps *
                (torch.distributed.get_world_size() if args.local_rank != -1 else 1))
    logger.info("  Gradient Accumulation steps = %d", args.gradient_accumulation_steps)
    logger.info("  Total optimization steps = %d", t_total)
    logger.info("  len(train_dataloader) = %d, hvd.rank() = %d",
                len(train_dataloader), hvd.rank())

    global_step = 0
    tr_loss, logging_loss = 0.0, 0.0
    model.zero_grad()
    train_iterator = trange(int(args.num_train_epochs), desc="Epoch",
                            disable=args.local_rank not in [-1, 0])
    set_seed(args)  # Added here for reproducibility (even between python 2 and 3)
    for _ in train_iterator:
        epoch_iterator = tqdm(train_dataloader, desc="Iteration",
                              disable=args.local_rank not in [-1, 0])
        for step, batch in enumerate(epoch_iterator):
            # if step % hvd.size() == 0:
            model.train()
            batch = tuple(t.to(args.device) for t in batch)
            inputs = {
                'input_ids': batch[0],
                'attention_mask': batch[1],
                'token_type_ids': batch[2] if args.model_type in ['bert', 'xlnet'] else None,  # XLM doesn't use segment_ids
                'labels': batch[3]
            }
            outputs = model(**inputs)
            loss = outputs[0]  # model outputs are always tuple in pytorch-transformers (see doc)

            if args.n_gpu > 1:
                loss = loss.mean()  # mean() to average on multi-gpu parallel training
            if args.gradient_accumulation_steps > 1:
                loss = loss / args.gradient_accumulation_steps

            if args.fp16:
                with amp.scale_loss(loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
                torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer),
                                               args.max_grad_norm)
            else:
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               args.max_grad_norm)

            tr_loss += loss.item()
            if (step + 1) % args.gradient_accumulation_steps == 0:
                scheduler.step()  # Update learning rate schedule
                optimizer.step()
                model.zero_grad()
                global_step += 1

                if args.local_rank in [-1, 0] and args.logging_steps > 0 and global_step % args.logging_steps == 0:
                    # Log metrics
                    if args.local_rank == -1 and args.evaluate_during_training:
                        # Only evaluate when single GPU otherwise metrics may not average well
                        results = evaluate(args, model, tokenizer)
                        for key, value in results.items():
                            tb_writer.add_scalar('eval_{}'.format(key), value, global_step)
                    tb_writer.add_scalar('lr', scheduler.get_lr()[0], global_step)
                    tb_writer.add_scalar('loss', (tr_loss - logging_loss) / args.logging_steps, global_step)
                    logging_loss = tr_loss

                if args.local_rank in [-1, 0] and args.save_steps > 0 and global_step % args.save_steps == 0:
                    # Save model checkpoint
                    if hvd.rank() == 0:
                        output_dir = os.path.join(args.output_dir,
                                                  'checkpoint-{}'.format(global_step))
                        if not os.path.exists(output_dir):
                            os.makedirs(output_dir)
                        model_to_save = model.module if hasattr(model, 'module') else model  # Take care of distributed/parallel training
                        model_to_save.save_pretrained(output_dir)
                        torch.save(args, os.path.join(output_dir, 'training_args.bin'))
                        logger.info("Saving model checkpoint to %s", output_dir)

            # adding the calculation for hvd multi-worker iteration modification
            if args.max_steps > 0 and global_step > args.max_steps:
                epoch_iterator.close()
                break
        if args.max_steps > 0 and global_step > args.max_steps:
            train_iterator.close()
            break

    if args.local_rank in [-1, 0]:
        tb_writer.close()

    return global_step, tr_loss / global_step
def train(serialized_model, optimizer_cls, model_opt_state_serialized,
          train_rows, val_rows, avg_row_size):
    from petastorm import make_batch_reader
    from petastorm.pytorch import DataLoader
    import torch
    import horovod.torch as hvd

    # Deserializing objects
    model_opt_state = torch.load(model_opt_state_serialized)
    model = deserialize(serialized_model)

    if loss_fns_pre_train:
        loss_fns = loss_fns_pre_train
    if loss_constructors:
        local_vars = locals()
        loss_fns = [loss_constructor(**local_vars)
                    for loss_constructor in loss_constructors]

    # Horovod: initialize library.
    hvd.init()

    if not user_shuffle_buffer_size:
        shuffle_buffer_size = \
            calculate_shuffle_buffer_size(hvd, avg_row_size, train_rows / hvd.size())
    else:
        shuffle_buffer_size = user_shuffle_buffer_size

    cuda_available = torch.cuda.is_available()
    if cuda_available:
        # Horovod: pin GPU to local rank.
        torch.cuda.set_device(hvd.local_rank())
        # Move model to GPU.
        model.cuda()

    # Optimizer object needs to be re-instantiated. Internally, it uses memory addresses of
    # objects as their identity and therefore it cannot be serialized and then
    # deserialized. The deserialized optimizer object stores the names of the parameters
    # with their old memory addresses but in reality those are different than the
    # reconstructed deserialized object and that creates a problem.
    # Learning rate is a required parameter in SGD optimizer. It will be overridden with
    # load_state_dict.
    optimizer = optimizer_cls(model.parameters(), lr=1)
    optimizer_state = model_opt_state['optimizer']

    if last_checkpoint_state is not None:
        model.load_state_dict(last_checkpoint_state['model'])
        optimizer.load_state_dict(last_checkpoint_state['optimizer'])
    else:
        # scale the learning rate with the number of horovod workers
        for i in range(len(optimizer_state['param_groups'])):
            optimizer_state['param_groups'][i]['lr'] = \
                optimizer_state['param_groups'][i]['lr'] * hvd.size()
        optimizer.load_state_dict(optimizer_state)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    for group in optimizer.param_groups:
        for p in group['params']:
            if id(p) not in optimizer.state_dict()['state']:
                p.grad = p.data.new(p.size()).zero_()
    optimizer.step()
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    dist_optimizer_args = dict(optimizer=optimizer,
                               named_parameters=model.named_parameters())
    if gradient_compression:
        # Pass the compression arg only if it is specified by the user.
        dist_optimizer_args['compression'] = gradient_compression
    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(**dist_optimizer_args)

    # This function takes the current optimizer and constructs a new optimizer with the
    # same state except with learning rate scaled down with the number of horovod workers.
    # This is important for the retraining of the model. User may retrain the model with a
    # different number of workers and we need the raw learning rate to adjust with the
    # new number of workers.
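    # The helper described in the comment above is not part of this excerpt. The
    # following is a minimal sketch, assuming the signature that save_checkpoint()
    # uses further below; the actual implementation may differ.
    def get_optimizer_with_unscaled_lr(hvd, current_optimizer, optimizer_cls, model):
        optimizer_state = current_optimizer.state_dict()
        # Divide the scaled-up learning rate back down by the number of workers.
        for group in optimizer_state['param_groups']:
            group['lr'] = group['lr'] / hvd.size()
        # lr is required by the constructor but is overridden by load_state_dict below.
        optimizer_with_unscaled_lr = optimizer_cls(model.parameters(), lr=1)
        optimizer_with_unscaled_lr.load_state_dict(optimizer_state)
        return optimizer_with_unscaled_lr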
    schema_fields = feature_columns + label_columns
    if sample_weight_col:
        schema_fields.append(sample_weight_col)

    if train_steps_per_epoch is None:
        steps_per_epoch = int(math.ceil(float(train_rows) / batch_size / hvd.size()))
    else:
        steps_per_epoch = train_steps_per_epoch

    with remote_store.get_local_output_dir() as run_output_dir:
        logs_dir = os.path.join(run_output_dir, remote_store.logs_subdir)
        log_writer = SummaryWriter(logs_dir) if hvd.rank() == 0 else None
        ckpt_file = os.path.join(run_output_dir, remote_store.checkpoint_filename)

        def save_checkpoint():
            model.cpu()
            optimizer_with_scaled_down_lr = \
                get_optimizer_with_unscaled_lr(hvd, optimizer, optimizer_cls, model)
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer_with_scaled_down_lr.state_dict(),
            }
            torch.save(state, ckpt_file)
            if cuda_available:
                model.cuda()

        # Petastorm: read data from the store with the correct shard for this rank
        # setting num_epochs=None will cause an infinite iterator
        # and enables ranks to perform training and validation with
        # unequal number of samples
        with make_batch_reader(remote_store.train_data_path,
                               num_epochs=None,
                               cur_shard=hvd.rank(),
                               shard_count=hvd.size(),
                               hdfs_driver=PETASTORM_HDFS_DRIVER,
                               schema_fields=schema_fields) as train_reader:
            with make_batch_reader(remote_store.val_data_path,
                                   num_epochs=None,
                                   cur_shard=hvd.rank(),
                                   shard_count=hvd.size(),
                                   hdfs_driver=PETASTORM_HDFS_DRIVER,
                                   schema_fields=schema_fields) \
                    if should_validate else empty_batch_reader() as val_reader:

                train_loader = DataLoader(train_reader,
                                          batch_size=batch_size,
                                          shuffling_queue_capacity=shuffle_buffer_size)
                train_loader_iter = iter(train_loader)

                def prepare_batch(row):
                    inputs = [
                        prepare_np_data(row[col].float(), col, metadata).reshape(shape)
                        for col, shape in zip(feature_columns, input_shapes)
                    ]
                    labels = [
                        prepare_np_data(row[col].float(), col, metadata)
                        for col in label_columns
                    ]
                    sample_weights = row.get(sample_weight_col, None)
                    if cuda_available:
                        inputs = [input.cuda() for input in inputs]
                        labels = [label.cuda() for label in labels]
                    return inputs, labels, sample_weights

                def transform_outputs(outputs, labels):
                    if type(outputs) != tuple and type(outputs) != list:
                        outputs = [outputs]
                    # reshape labels to match the output shape of the model
                    if hasattr(outputs[0], 'shape'):
                        labels = [
                            label.reshape(output.shape)
                            if output.shape.numel() == label.shape.numel() else label
                            for label, output in zip(labels, outputs)
                        ]
                    return outputs, labels

                def aggregate_metrics(stage, epoch, loss, metric_value_groups):
                    all_metric_groups_values = get_metric_avgs(metric_value_groups)
                    if remote_store.saving_runs:
                        write_metrics_summary(stage, epoch, loss,
                                              all_metric_groups_values, log_writer)
                    return {
                        loss.name: loss.avg.item(),
                        'all_metrics': all_metric_groups_values
                    }

                def loss_fn(outputs, labels, sample_weights):
                    loss = calculate_loss(outputs, labels, loss_weights, loss_fns,
                                          sample_weights)
                    return loss

                def print_metrics(batch_idx, loss, metric_value_groups, phase):
                    if user_verbose > 0 and hvd.rank() == 0 and \
                            batch_idx % METRIC_PRINT_FREQUENCY == 0:
                        print("epoch:\t{epoch}\tstep\t{batch_idx}:\t{metrics}".format(
                            epoch=epoch, batch_idx=batch_idx,
                            metrics=aggregate_metrics(phase, epoch, loss,
                                                      metric_value_groups)))

                def _train(epoch):
                    model.train()
                    train_loss = metric_cls('loss', hvd)
                    metric_value_groups = construct_metric_value_holders(
                        metric_cls, metric_fn_groups, label_columns, hvd)

                    # iterate on one epoch
                    for batch_idx in range(steps_per_epoch):
                        row = next(train_loader_iter)
                        inputs, labels, sample_weights = prepare_batch(row)
                        outputs, loss = train_minibatch(model, optimizer,
                                                        transform_outputs,
                                                        loss_fn, inputs,
                                                        labels, sample_weights)
                        update_metrics(metric_value_groups, outputs, labels)
                        train_loss.update(loss)
                        print_metrics(batch_idx, train_loss, metric_value_groups, 'train')

                    return aggregate_metrics('train', epoch, train_loss,
                                             metric_value_groups)

                if should_validate:
                    val_loader = DataLoader(val_reader, batch_size=batch_size)
                    val_loader_iter = iter(val_loader)
                    if validation_steps_per_epoch is None:
                        validation_steps = int(
                            math.ceil(float(val_rows) / batch_size / hvd.size()))
                    else:
                        validation_steps = validation_steps_per_epoch

                    def _validate(epoch):
                        model.eval()
                        val_loss = metric_cls('loss', hvd)
                        metric_value_groups = construct_metric_value_holders(
                            metric_cls, metric_fn_groups, label_columns, hvd)

                        # iterate on one epoch
                        for batch_idx in range(validation_steps):
                            row = next(val_loader_iter)
                            inputs, labels, sample_weights = prepare_batch(row)

                            outputs = model(*inputs)
                            outputs, labels = transform_outputs(outputs, labels)

                            loss = calculate_loss(outputs, labels, loss_weights,
                                                  loss_fns, sample_weights)
                            val_loss.update(loss)
                            update_metrics(metric_value_groups, outputs, labels)
                            print_metrics(batch_idx, val_loss, metric_value_groups, 'val')

                        return aggregate_metrics('val', epoch, val_loss,
                                                 metric_value_groups)

                history = []
                for epoch in range(epochs):
                    epoch_metrics = {
                        'epoch': epoch,
                        'train': _train(epoch)
                    }
                    if should_validate:
                        epoch_metrics['validation'] = _validate(epoch)
                    if user_verbose > 0:
                        print(epoch_metrics)
                    history.append(epoch_metrics)
                    if hvd.rank() == 0:
                        # Save model after every epoch
                        save_checkpoint()
                        if remote_store.saving_runs:
                            remote_store.sync(run_output_dir)

        if hvd.rank() == 0:
            best_checkpoint = torch.load(ckpt_file)
            serialized_checkpoint = io.BytesIO()
            torch.save(best_checkpoint, serialized_checkpoint)
            serialized_checkpoint.seek(0)
            return history, serialized_checkpoint
def test_broadcast_state(self):
    hvd.init()

    N, D_in, H, D_out = 64, 100, 10, 10
    x = torch.autograd.Variable(torch.randn(N, D_in), requires_grad=True)
    y = torch.autograd.Variable(torch.randn(N, D_out), requires_grad=False)

    def create_model(create_opt):
        model = torch.nn.Sequential(
            torch.nn.Linear(D_in, H),
            torch.nn.ReLU(),
            torch.nn.Linear(H, D_out),
        )
        optimizer = create_opt(model)
        optimizer = hvd.DistributedOptimizer(
            optimizer, named_parameters=model.named_parameters())
        return model, optimizer

    def get_model_param_values(model):
        params = sorted(model.state_dict().items())
        return [(k, v.clone()) for k, v in params]

    def get_optimizer_param_values(optimizer):
        results = []
        state_dict = optimizer.state_dict()
        for group in state_dict['param_groups']:
            for param_id in group['params']:
                params = sorted(state_dict['state'][param_id].items())
                for k, v in params:
                    results.append((k, v.clone() if torch.is_tensor(v) else v))
        return results

    opt_params = dict(lr=0.2, momentum=0.9, weight_decay=0.1, centered=True)

    def new_optimizer(cls):
        p = {
            k: v for k, v in opt_params.items()
            if k in inspect.getargspec(cls.__init__).args
        }
        return lambda m: cls(m.parameters(), **p)

    # L-BFGS is currently unsupported, as are sparse tensors, which are
    # required by SparseAdam optimizer
    optimizers = [
        (subclass.__name__, new_optimizer(subclass))
        for subclass in torch.optim.Optimizer.__subclasses__()
        if subclass.__module__.startswith('torch.optim') and
        subclass != torch.optim.LBFGS and
        subclass != torch.optim.SparseAdam
    ]
    optimizers.sort()

    for opt_name, create_opt in optimizers:
        model, optimizer = create_model(create_opt)
        y_pred = model(x)
        loss = F.mse_loss(y_pred, y, size_average=False)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        model_param_values = get_model_param_values(model)
        for name, model_param_value in model_param_values:
            hvd.broadcast_(model_param_value, root_rank=0)

        opt_param_values_updated = []
        opt_param_values = get_optimizer_param_values(optimizer)
        for name, opt_param_value in opt_param_values:
            is_tensor = torch.is_tensor(opt_param_value)
            if not is_tensor:
                t = type(opt_param_value)
                opt_param_value = torch.Tensor([opt_param_value])
            hvd.broadcast_(opt_param_value, root_rank=0)
            if not is_tensor:
                opt_param_value = t(opt_param_value.numpy()[0])
            opt_param_values_updated.append((name, opt_param_value))
        opt_param_values = opt_param_values_updated

        if hvd.rank() == 0:
            state = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            _, fname = tempfile.mkstemp('.pt')
            torch.save(state, fname)

        model, optimizer = create_model(create_opt)
        if hvd.rank() == 0:
            checkpoint = torch.load(fname)
            model.load_state_dict(checkpoint['model'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            os.remove(fname)

        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        model_param_value_after = get_model_param_values(model)
        for before, after in zip(model_param_values, model_param_value_after):
            name, model_param_value = before
            name_after, model_param_value_after = after
            self.assertEqual(name, name_after)
            self.assertEqual(type(model_param_value),
                             type(model_param_value_after))
            self.assertTrue(
                (model_param_value == model_param_value_after).all())

        hvd.broadcast_optimizer_state(optimizer, root_rank=0)
        self.assertEqual(len(optimizer.state_dict()['state'].values()), 4)

        opt_param_values_after = get_optimizer_param_values(optimizer)
        for before, after in zip(opt_param_values, opt_param_values_after):
            name, opt_param_value = before
            name_after, opt_param_value_after = after
            self.assertEqual(name, name_after)
            self.assertEqual(type(opt_param_value),
                             type(opt_param_value_after))
            if torch.is_tensor(opt_param_value):
                self.assertTrue(
                    (opt_param_value == opt_param_value_after).all())
            else:
                self.assertEqual(opt_param_value, opt_param_value_after)
def __init__(self, minigan_args):
    if (hvd.rank() == 0):
        print('Hello minigan init\n')

    self.minigan_args = minigan_args

    self.minigan_args.output_interimages_dir = \
        self.minigan_args.output_dir + "/inter_%ss" % (self.minigan_args.dataset)

    # Load datasets
    self.load_data()

    if (hvd.rank() == 0):
        print('\n---LOAD DATA DONE---\n')

    # 2D miniGAN
    if (self.minigan_args.dim_mode == 2):
        self.generator = Generator2d(minigan_args)
        self.discriminator = Discriminator2d(minigan_args)
    # 3D miniGAN
    elif (self.minigan_args.dim_mode == 3):
        self.generator = Generator3d(minigan_args)
        self.discriminator = Discriminator3d(minigan_args)
    else:
        raise ValueError('\'dim_mode\' must be {2} or {3}.')

    self.generator = self.generator.float()
    self.discriminator = self.discriminator.float()

    self.generator.apply(weight_init)
    self.discriminator.apply(weight_init)

    if (hvd.rank() == 0):
        print(self.generator)
        print(self.discriminator)

    # Metrics and Loss
    self.loss_fn = nn.BCELoss()

    self.real_label = 1.0 - self.minigan_args.soft_label
    self.fake_label = self.minigan_args.soft_label

    self.gen_optim = optim.Adam(
        self.generator.parameters(),
        lr=self.minigan_args.gen_lr,
        betas=(self.minigan_args.gen_beta1, .999))
    self.disc_optim = optim.Adam(
        self.discriminator.parameters(),
        lr=self.minigan_args.disc_lr,
        betas=(self.minigan_args.disc_beta1, .999))

    if self.minigan_args.cuda:
        self.generator.cuda()
        self.discriminator.cuda()

    hvd.broadcast_parameters(self.generator.state_dict(), root_rank=0)
    hvd.broadcast_parameters(self.discriminator.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(self.gen_optim, root_rank=0)
    hvd.broadcast_optimizer_state(self.disc_optim, root_rank=0)

    self.compression = \
        hvd.Compression.fp16 if self.minigan_args.fp16_allreduce else hvd.Compression.none

    self.gen_optim = hvd.DistributedOptimizer(
        self.gen_optim,
        named_parameters=self.generator.named_parameters(),
        compression=self.compression)
    self.disc_optim = hvd.DistributedOptimizer(
        self.disc_optim,
        named_parameters=self.discriminator.named_parameters(),
        compression=self.compression)

    # self.tb_disc.set_model(self.discriminator)
    # self.tb_comb_gan.set_model(self.combined_gan)

    if (self.minigan_args.profile):
        self.profile_layers(self.minigan_args.prof_steps, self.prof_images)

    if (hvd.rank() == 0):
        print('\n---NETWORK SETUP DONE---\n')
def main(_):
    FLAGS.torch_only = True

    melt.init()
    #fit = melt.get_fit()

    FLAGS.eval_batch_size = 512 * FLAGS.valid_multiplier
    FLAGS.eval_batch_size = 512

    model_name = FLAGS.model
    model = getattr(base, model_name)()
    model = model.cuda()

    loss_fn = nn.BCEWithLogitsLoss()

    td = text_dataset.Dataset()
    train_files = gezi.list_files('../input/train/*')
    train_ds = get_dataset(train_files, td)

    #kwargs = {'num_workers': 4, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
    #num_workers = int(16 / hvd.size())
    num_workers = 1  # with 1 it takes ~2 min to start; might just set to 0 to be safe
    num_workers = 0  # with 0 it is much slower than 1; startup alone takes over a minute
    # pin_memory has little effect: only a small speedup on a single GPU. With multiple GPUs,
    # num_workers is the main factor in resource usage, and the job may fail to start.
    # With multiple GPUs, pin_memory=False is actually faster.
    #kwargs = {'num_workers': num_workers, 'pin_memory': True, 'collate_fn': lele.DictPadCollate()}
    kwargs = {'num_workers': 1, 'pin_memory': False, 'collate_fn': lele.DictPadCollate()}

    train_sampler = train_ds
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_ds, num_replicas=hvd.size(), rank=hvd.rank())
    train_dl = DataLoader(train_ds, FLAGS.batch_size, sampler=train_sampler, **kwargs)

    valid_files = gezi.list_files('../input/valid/*')
    valid_ds = get_dataset(valid_files, td)

    # shuffle=False is supported from version 1.2
    valid_sampler = torch.utils.data.distributed.DistributedSampler(
        valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)
    # valid_sampler2 = torch.utils.data.distributed.DistributedSampler(
    #     valid_ds, num_replicas=hvd.size(), rank=hvd.rank(), shuffle=False)

    valid_dl = DataLoader(valid_ds, FLAGS.eval_batch_size, sampler=valid_sampler, **kwargs)
    #valid_dl2 = DataLoader(valid_ds, FLAGS.batch_size, sampler=valid_sampler2, **kwargs)

    optimizer = optim.Adamax(model.parameters(), lr=0.1)
    #optimizer = optim.SGD(model.parameters(), lr=0.1)

    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    for epoch in range(2):
        train(epoch, model, loss_fn, train_dl, optimizer)
        test(model, loss_fn, valid_dl)
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-config")
    parser.add_argument("-data", help="data yaml file")
    parser.add_argument("-data_path", default='', type=str, help="path of data files")
    parser.add_argument("-seed_model", help="the seed neural network model")
    parser.add_argument("-exp_dir", help="the directory to save the outputs")
    parser.add_argument("-transform",
                        help="feature transformation matrix or mvn statistics")
    parser.add_argument("-criterion", type=str, choices=["mmi", "mpfe", "smbr"],
                        help="set the sequence training criterion")
    parser.add_argument("-trans_model",
                        help="the HMM transition model, used for lattice generation")
    parser.add_argument("-prior_path",
                        help="the prior for decoder, usually named as final.occs in kaldi setup")
    parser.add_argument("-den_dir",
                        help="the decoding graph directory to find HCLG and words.txt files")
    parser.add_argument("-lr", type=float, help="set the learning rate")
    parser.add_argument("-ce_ratio", default=0.1, type=float,
                        help="the ratio for ce regularization")
    parser.add_argument("-momentum", default=0, type=float, help="set the momentum")
    parser.add_argument("-batch_size", default=32, type=int,
                        help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads", default=0, type=int,
                        help="number of workers for data loading")
    parser.add_argument("-max_grad_norm", default=5, type=float,
                        help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size", default=100, type=float,
                        help="process n hours of data per sweep (default:60)")
    parser.add_argument("-num_epochs", default=1, type=int,
                        help="number of training epochs (default:1)")
    parser.add_argument('-print_freq', default=10, type=int, metavar='N',
                        help='print frequency (default: 10)')
    parser.add_argument('-save_freq', default=1000, type=int, metavar='N',
                        help='save model frequency (default: 1000)')

    args = parser.parse_args()

    with open(args.config) as f:
        config = yaml.safe_load(f)

    config['data_path'] = args.data_path
    config["sweep_size"] = args.sweep_size

    print("pytorch version:{}".format(th.__version__))

    with open(args.data) as f:
        data = yaml.safe_load(f)
        config["source_paths"] = [j for i, j in data['clean_source'].items()]

    print("Experiment starts with config {}".format(
        json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    hvd.init()

    th.cuda.set_device(hvd.local_rank())

    print("Run experiments with world size {}".format(hvd.size()))

    dataset = SpeechDataset(config)
    transform = None
    if args.transform is not None and os.path.isfile(args.transform):
        with open(args.transform, 'rb') as f:
            transform = pickle.load(f)
            dataset.transform = transform

    train_dataloader = SeqDataloader(dataset,
                                     batch_size=args.batch_size,
                                     num_workers=args.data_loader_threads,
                                     distributed=True,
                                     test_only=False)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2, model_config["label_size"])

    model.cuda()

    # setup the optimizer
    optimizer = th.optim.SGD(model.parameters(), lr=args.lr, momentum=args.momentum)

    # Broadcast parameters and optimizer state from rank 0 to all other processes.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    # Add Horovod Distributed Optimizer
    optimizer = hvd.DistributedOptimizer(
        optimizer, named_parameters=model.named_parameters())

    if os.path.isfile(args.seed_model):
        checkpoint = th.load(args.seed_model)
        state_dict = checkpoint['model']
        from collections import OrderedDict
        new_state_dict = OrderedDict()
        for k, v in state_dict.items():
            name = k[7:]  # remove 'module.' of dataparallel
            new_state_dict[name] = v
        model.load_state_dict(new_state_dict)
        print("=> loaded checkpoint '{}' ".format(args.seed_model))
    else:
        sys.stderr.write('ERROR: The model file %s does not exist!\n' % (args.seed_model))
        sys.exit(0)

    HCLG = args.den_dir + "/HCLG.fst"
    words_txt = args.den_dir + "/words.txt"
    silence_phones = args.den_dir + "/phones/silence.csl"

    if not os.path.isfile(HCLG):
        sys.stderr.write('ERROR: The HCLG file %s does not exist!\n' % (HCLG))
        sys.exit(0)

    if not os.path.isfile(words_txt):
        sys.stderr.write('ERROR: The words.txt file %s does not exist!\n' % (words_txt))
        sys.exit(0)

    if not os.path.isfile(silence_phones):
        sys.stderr.write('ERROR: The silence phone file %s does not exist!\n' % (silence_phones))
        sys.exit(0)

    with open(silence_phones) as f:
        silence_ids = [int(i) for i in f.readline().strip().split(':')]
        f.close()

    if os.path.isfile(args.trans_model):
        trans_model = kaldi_hmm.TransitionModel()
        with kaldi_util.io.xopen(args.trans_model) as ki:
            trans_model.read(ki.stream(), ki.binary)
    else:
        sys.stderr.write('ERROR: The trans_model %s does not exist!\n' % (args.trans_model))
        sys.exit(0)

    # now we can setup the decoder
    decoder_opts = LatticeFasterDecoderOptions()
    decoder_opts.beam = config["decoder_config"]["beam"]
    decoder_opts.lattice_beam = config["decoder_config"]["lattice_beam"]
    decoder_opts.max_active = config["decoder_config"]["max_active"]
    acoustic_scale = config["decoder_config"]["acoustic_scale"]
    decoder_opts.determinize_lattice = False  # To produce raw state-level lattice instead of compact lattice
    asr_decoder = MappedLatticeFasterRecognizer.from_files(
        args.trans_model, HCLG, words_txt,
        acoustic_scale=acoustic_scale,
        decoder_opts=decoder_opts)

    prior = kaldi_util.io.read_matrix(args.prior_path).numpy()
    log_prior = th.tensor(np.log(prior[0] / np.sum(prior[0])), dtype=th.float)

    model.train()
    for epoch in range(args.num_epochs):
        run_train_epoch(model, optimizer, log_prior.cuda(), train_dataloader,
                        epoch, asr_decoder, trans_model, silence_ids, args)

        # save model
        if hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.se.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
def main(args):
    logfilename = 'convergence_cifar10_{}_kfac{}_gpu{}_bs{}_{}_lr{}_sr{}_wp{}.log'.format(
        args.model, args.kfac_update_freq, hvd.size(), args.batch_size,
        args.kfac_name, args.base_lr, args.sparse_ratio, args.warmup_epochs)
    if hvd.rank() == 0:
        wandb.init(project='kfac', entity='hkust-distributedml',
                   name=logfilename, config=args)

    logfile = './logs/' + logfilename
    #logfile = './logs/sparse_cifar10_{}_kfac{}_gpu{}_bs{}.log'.format(args.model, args.kfac_update_freq, hvd.size(), args.batch_size)
    #logfile = './logs/cifar10_{}_kfac{}_gpu{}_bs{}.log'.format(args.model, args.kfac_update_freq, hvd.size(), args.batch_size)
    hdlr = logging.FileHandler(logfile)
    hdlr.setFormatter(formatter)
    logger.addHandler(hdlr)
    logger.info(args)

    torch.manual_seed(args.seed)
    verbose = True if hvd.rank() == 0 else False
    args.verbose = 1 if hvd.rank() == 0 else 0

    if args.cuda:
        torch.cuda.set_device(hvd.local_rank())
        torch.cuda.manual_seed(args.seed)
    torch.backends.cudnn.benchmark = True

    args.log_dir = os.path.join(
        args.log_dir,
        "cifar10_{}_kfac{}_gpu_{}_{}".format(
            args.model, args.kfac_update_freq, hvd.size(),
            datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')))
    #os.makedirs(args.log_dir, exist_ok=True)
    #log_writer = SummaryWriter(args.log_dir) if verbose else None
    log_writer = None

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    kwargs = {'num_workers': 8, 'pin_memory': True} if args.cuda else {}
    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])
    transform_test = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
    ])

    download = True if hvd.local_rank() == 0 else False
    if not download:
        hvd.allreduce(torch.tensor(1), name="barrier")
    train_dataset = datasets.CIFAR10(root=args.dir, train=True,
                                     download=download, transform=transform_train)
    test_dataset = datasets.CIFAR10(root=args.dir, train=False,
                                    download=download, transform=transform_test)
    if download:
        hvd.allreduce(torch.tensor(1), name="barrier")

    # Horovod: use DistributedSampler to partition the training data.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    #train_loader = torch.utils.data.DataLoader(train_dataset,
    train_loader = MultiEpochsDataLoader(
        train_dataset,
        batch_size=args.batch_size * args.batches_per_allreduce,
        sampler=train_sampler, **kwargs)

    # Horovod: use DistributedSampler to partition the test data.
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        test_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    test_loader = torch.utils.data.DataLoader(test_dataset,
                                              batch_size=args.test_batch_size,
                                              sampler=test_sampler, **kwargs)

    if args.model.lower() == "resnet20":
        model = resnet.resnet20()
    elif args.model.lower() == "resnet32":
        model = resnet.resnet32()
    elif args.model.lower() == "resnet44":
        model = resnet.resnet44()
    elif args.model.lower() == "resnet56":
        model = resnet.resnet56()
    elif args.model.lower() == "resnet110":
        model = resnet.resnet110()

    if args.cuda:
        model.cuda()

    #if verbose:
    #    summary(model, (3, 32, 32))

    criterion = nn.CrossEntropyLoss()
    args.base_lr = args.base_lr * hvd.size()
    use_kfac = True if args.kfac_update_freq > 0 else False

    optimizer = optim.SGD(model.parameters(),
                          lr=args.base_lr,
                          momentum=args.momentum,
                          weight_decay=args.weight_decay)

    if use_kfac:
        KFAC = kfac.get_kfac_module(args.kfac_name)
        preconditioner = KFAC(
            model, lr=args.base_lr, factor_decay=args.stat_decay,
            damping=args.damping, kl_clip=args.kl_clip,
            fac_update_freq=args.kfac_cov_update_freq,
            kfac_update_freq=args.kfac_update_freq,
            diag_blocks=args.diag_blocks,
            diag_warmup=args.diag_warmup,
            distribute_layer_factors=args.distribute_layer_factors,
            sparse_ratio=args.sparse_ratio)
        kfac_param_scheduler = kfac.KFACParamScheduler(
            preconditioner,
            damping_alpha=args.damping_alpha,
            damping_schedule=args.damping_schedule,
            update_freq_alpha=args.kfac_update_freq_alpha,
            update_freq_schedule=args.kfac_update_freq_schedule)

    # KFAC guarantees grads are equal across ranks before opt.step() is called
    # so if we do not use kfac we need to wrap the optimizer with horovod
    compression = hvd.Compression.fp16 if args.fp16_allreduce else hvd.Compression.none

    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=compression,
        op=hvd.Average,
        backward_passes_per_step=args.batches_per_allreduce)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)

    lrs = create_lr_schedule(hvd.size(), args.warmup_epochs, args.lr_decay)
    lr_scheduler = [LambdaLR(optimizer, lrs)]
    if use_kfac:
        lr_scheduler.append(LambdaLR(preconditioner, lrs))

    def train(epoch):
        model.train()
        train_sampler.set_epoch(epoch)
        train_loss = Metric('train_loss')
        train_accuracy = Metric('train_accuracy')

        if STEP_FIRST:
            for scheduler in lr_scheduler:
                scheduler.step()
            if use_kfac:
                kfac_param_scheduler.step(epoch)

        # with tqdm(total=len(train_loader),
        #           desc='Epoch {:3d}/{:3d}'.format(epoch + 1, args.epochs),
        #           disable=not verbose) as t:
        display = 20
        avg_time = 0.0
        io_time = 0.0
        if True:
            for batch_idx, (data, target) in enumerate(train_loader):
                stime = time.time()
                if args.cuda:
                    data, target = data.cuda(non_blocking=True), target.cuda(
                        non_blocking=True)
                io_time += time.time() - stime

                optimizer.zero_grad()

                for i in range(0, len(data), args.batch_size):
                    data_batch = data[i:i + args.batch_size]
                    target_batch = target[i:i + args.batch_size]
                    output = model(data_batch)
                    loss = criterion(output, target_batch)
                    with torch.no_grad():
                        train_loss.update(loss)
                        train_accuracy.update(accuracy(output, target_batch))
                    loss.div_(math.ceil(float(len(data)) / args.batch_size))
                    loss.backward()

                optimizer.synchronize()
                if use_kfac:
                    preconditioner.step(epoch=epoch)
                with optimizer.skip_synchronize():
                    optimizer.step()

                #t.set_postfix_str("loss: {:.4f}, acc: {:.2f}%".format(
                #    train_loss.avg.item(), 100*train_accuracy.avg.item()))
                #t.update(1)
                avg_time += (time.time() - stime)

                if batch_idx > 0 and batch_idx % display == 0:
                    if args.verbose:
                        logger.info(
                            "[%d][%d] train loss: %.4f, acc: %.3f, time: %.3f [io: %.3f], speed: %.3f images/s"
                            % (epoch, batch_idx, train_loss.avg.item(),
                               100 * train_accuracy.avg.item(),
                               avg_time / display, io_time / display,
                               args.batch_size / (avg_time / display)))
                    avg_time = 0.0
                    io_time = 0.0
                    if hvd.rank() == 0:
                        wandb.log({"loss": loss, "epoch": epoch})

        if args.verbose:
            logger.info("[%d] epoch train loss: %.4f, acc: %.3f" %
                        (epoch, train_loss.avg.item(),
                         100 * train_accuracy.avg.item()))

        if not STEP_FIRST:
            for scheduler in lr_scheduler:
                scheduler.step()
            if use_kfac:
                kfac_param_scheduler.step(epoch)

        if log_writer:
            log_writer.add_scalar('train/loss', train_loss.avg, epoch)
            log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch)

    def test(epoch):
        model.eval()
        test_loss = Metric('val_loss')
        test_accuracy = Metric('val_accuracy')

        #with tqdm(total=len(test_loader),
        #          bar_format='{l_bar}{bar}|{postfix}',
        #          desc='             '.format(epoch + 1, args.epochs),
        #          disable=not verbose) as t:
        if True:
            with torch.no_grad():
                for i, (data, target) in enumerate(test_loader):
                    if args.cuda:
                        data, target = data.cuda(), target.cuda()
                    output = model(data)
                    test_loss.update(criterion(output, target))
                    test_accuracy.update(accuracy(output, target))

                if args.verbose:
                    logger.info("[%d][0] evaluation loss: %.4f, acc: %.3f" %
                                (epoch, test_loss.avg.item(),
                                 100 * test_accuracy.avg.item()))
                if hvd.rank() == 0:
                    wandb.log({
                        "val top-1 acc": test_accuracy.avg.item(),
                        "epoch": epoch
                    })

                #t.update(1)
                #if i + 1 == len(test_loader):
                #    t.set_postfix_str("\b\b test_loss: {:.4f}, test_acc: {:.2f}%".format(
                #        test_loss.avg.item(), 100*test_accuracy.avg.item()),
                #        refresh=False)

        if log_writer:
            log_writer.add_scalar('test/loss', test_loss.avg, epoch)
            log_writer.add_scalar('test/accuracy', test_accuracy.avg, epoch)

    start = time.time()

    for epoch in range(args.epochs):
        if args.verbose:
            logger.info("[%d] epoch train starts" % (epoch))
        train(epoch)
        test(epoch)

    if verbose:
        logger.info("Training time: %s",
                    str(datetime.timedelta(seconds=time.time() - start)))
def __init__(
    self,
    env,
    env_params: dict,
    log_dir: str,
    ac_kwargs: dict = {},
    seed: int = 0,
    steps_per_epoch: int = 4000,
    epochs: int = 50,
    gamma: float = 0.99,
    clip_ratio: float = 0.2,
    pi_lr: float = 3e-4,
    vf_lr: float = 1e-3,
    train_iters: int = 100,
    entropy_coeff: float = 1e-2,
    lam: float = 0.97,
    target_kl: float = 0.01,
    save_freq: int = 10,
    load_path=None,
    render_train: bool = False,
    wandb_id: Optional[str] = None,
    **kwargs,
):
    self.log_dir = log_dir
    self.render_dir = os.path.join(log_dir, "renders")
    self.ckpt_dir = os.path.join(log_dir, "checkpoints")
    if hvd.rank() == 0:
        os.makedirs(self.log_dir, exist_ok=True)
        os.makedirs(self.render_dir, exist_ok=True)
        os.makedirs(self.ckpt_dir, exist_ok=True)
    self.softlink = os.path.abspath(
        os.path.join(self.ckpt_dir, f"ckpt_latest.pth"))
    self.ac_params_file = os.path.join(log_dir, "ac_params.json")

    hparams = convert_json(locals())
    self.logger = EpochLogger(output_dir=self.log_dir, exp_name=wandb_id)

    if torch.cuda.is_available():
        # Horovod: pin GPU to local rank.
        dev_id = int(torch.cuda.device_count() * hvd.local_rank() / hvd.local_size())
        torch.cuda.set_device(dev_id)
        device = torch.device(f"cuda:{dev_id}")
        torch.cuda.manual_seed(seed)
    else:
        device = torch.device("cpu")

    # env_params.update({"device": device})
    self.env = env(**env_params)
    self.ac_params = {k: v for k, v in ac_kwargs.items()}
    self.ac_params.update({
        "observation_space": self.env.observation_space,
        "action_space": self.env.action_space,
        "nagents": self.env.nagents,
    })

    self.entropy_coeff = entropy_coeff
    self.entropy_coeff_decay = entropy_coeff / epochs

    # Horovod: limit # of CPU threads to be used per worker.
    torch.set_num_threads(1)

    torch.save(self.ac_params, self.ac_params_file)

    if os.path.isfile(self.softlink):
        self.logger.log("Restarting from latest checkpoint", color="red")
        load_path = self.softlink

    # Random seed
    seed += 10000 * hvd.rank()
    torch.manual_seed(seed)
    np.random.seed(seed)

    self.nagents = self.env.nagents
    self.ac = PPOLidarActorCritic(
        self.env.observation_space,
        self.env.action_space,
        nagents=self.nagents,
        centralized=True,
        **ac_kwargs,
    )

    self.device = device

    self.pi_lr = pi_lr
    self.vf_lr = vf_lr

    self.load_path = load_path
    if load_path is not None:
        self.load_model(load_path)
    else:
        self.pi_optimizer = Adam(trainable_parameters(self.ac.pi),
                                 lr=self.pi_lr, eps=1e-8)
        self.vf_optimizer = Adam(trainable_parameters(self.ac.v),
                                 lr=self.vf_lr, eps=1e-8)

    # Sync params across processes
    hvd.broadcast_parameters(self.ac.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(self.pi_optimizer, root_rank=0)
    hvd.broadcast_optimizer_state(self.vf_optimizer, root_rank=0)

    self.ac = self.ac.to(device)
    self.move_optimizer_to_device(self.pi_optimizer)
    self.move_optimizer_to_device(self.vf_optimizer)

    if hvd.rank() == 0:
        if wandb_id is None:
            eid = (log_dir.split("/")[-2] if load_path is None
                   else load_path.split("/")[-4])
        else:
            eid = wandb_id
        wandb.init(
            name=eid,
            id=eid,
            project="Social Driving",
            resume=load_path is not None,
        )
        wandb.watch_called = False

        if "self" in hparams:
            del hparams["self"]
        wandb.config.update(hparams, allow_val_change=True)

        wandb.watch(self.ac.pi, log="all")
        wandb.watch(self.ac.v, log="all")

    # Count variables
    var_counts = tuple(count_vars(module) for module in [self.ac.pi, self.ac.v])
    self.logger.log(
        "\nNumber of parameters: \t pi: %d, \t v: %d\n" % var_counts,
        color="green",
    )

    # Set up experience buffer
    self.steps_per_epoch = steps_per_epoch
    self.local_steps_per_epoch = int(steps_per_epoch / hvd.size())

    self.buf = CentralizedPPOBuffer(
        self.env.observation_space[0].shape,
        self.env.observation_space[1].shape,
        self.env.action_space.shape,
        self.local_steps_per_epoch,
        gamma,
        lam,
        self.env.nagents,
        device=self.device,
    )

    self.gamma = gamma
    self.clip_ratio = clip_ratio
    self.train_iters = train_iters
    self.target_kl = target_kl
    self.epochs = epochs
    self.save_freq = save_freq
def horovod_train(self, model):
    if torch.cuda.is_available() and self.on_gpu:
        # Horovod: pin GPU to local rank
        assert self.root_gpu == hvd.local_rank()
        torch.cuda.set_device(self.root_gpu)
        model.cuda(self.root_gpu)

    # Only show progress bar from the first worker
    self.progress_bar_refresh_rate = self.progress_bar_refresh_rate if hvd.rank() == 0 else 0

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    self.optimizers, self.lr_schedulers, self.optimizer_frequencies = self.init_optimizers(model)

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in self.optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= hvd.size()

    if self.use_amp:
        # An example
        model, optimizers = model.configure_apex(amp, model, self.optimizers, self.amp_level)
        self.optimizers = optimizers

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for optimizer in self.optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups
                          for p in group.get('params', [])])
        return [(name, p) for name, p in model.named_parameters() if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    self.optimizers = [
        hvd.DistributedOptimizer(optimizer,
                                 named_parameters=filter_named_parameters(model, optimizer))
        for optimizer in self.optimizers
    ]

    # Update logger rank info from Horovod to avoid race conditions from different ranks
    # creating directories / writing files in the same locations.
    self.proc_rank = hvd.rank()
    rank_zero_only.rank = self.proc_rank

    with ExitStack() as stack:
        for optimizer in self.optimizers:
            # Synchronization will be performed explicitly following backward()
            stack.enter_context(optimizer.skip_synchronize())

        self.run_pretrain_routine(model)
def init_training(self):
    model = self.elements["model"]
    start_epoch = self.params["start_epoch"]
    exist_model = self.params["exist_model"]
    model_dir = self.params["model_dir"]
    model_blueprint = self.params["model_blueprint"]
    suffix = self.params["suffix"]

    if start_epoch <= 0 and utils.is_main_training():
        model_creation = model.get_model_creation()
        utils.write_nnet_config(model_blueprint, model_creation,
                                "{0}/config/nnet.config".format(model_dir))

    ## Recover checkpoint | Transform learning | Initialize parameters
    if start_epoch > 0:
        # This train_stage is equal to number of completed epochs
        if utils.is_main_training():
            logger.info("Recover training from {0} epoch.".format(start_epoch))
        model.load_state_dict(
            torch.load('{0}/{1}.{2}'.format(model_dir, start_epoch, suffix),
                       map_location="cpu"))
    elif os.path.exists(exist_model):
        if utils.is_main_training():
            logger.info("Use {0} as the initial model to start transform-training."
                        .format(exist_model))
        model.load_transform_state_dict(torch.load(exist_model, map_location="cpu"))
    else:
        # Just use the raw initial model or initialize it again by some initial functions here
        pass  # Now, it means use the raw initial model

    if utils.use_horovod():
        import horovod.torch as hvd

        # Broadcast parameters from rank 0 to all other processes.
        hvd.broadcast_parameters(self.elements["model"].state_dict(), root_rank=0)

        # For optimizer wrapper such as lookahead.
        if getattr(self.elements["optimizer"], "optimizer", None) is not None:
            raise TypeError("Do not support using lookahead with horovod now.")
        else:
            # Broadcast optimizer state.
            hvd.broadcast_optimizer_state(self.elements["optimizer"], root_rank=0)
            self.elements["optimizer"] = hvd.DistributedOptimizer(
                self.elements["optimizer"],
                named_parameters=self.elements["model"].named_parameters())

    ## Select device
    model = self.select_device()

    # Original model is built in libs.nnet.framework.TopVirtualNnet, and it is not available after
    # being wrapped by DistributedDataParallel. So, to call functions of TopVirtualNnet conveniently,
    # self.elements["model_forward"] is set here to name the DistributedDataParallel wrapper.
    if isinstance(model, torch.nn.parallel.DistributedDataParallel):
        self.elements["model"] = model.module
        self.elements["model_forward"] = model
def train_and_eval(conf, val_ratio, val_fold, save_path, only_eval, reporter=None, metric='test'): writer = get_tb_writer() # region conf vars conf_dataset = conf['dataset'] dataroot = conf_dataset['dataroot'] horovod = conf['common']['horovod'] checkpoint_freq = conf['common']['checkpoint']['freq'] conf_loader = conf['autoaug']['loader'] conf_model = conf['autoaug']['model'] ds_name = conf_dataset['name'] aug = conf_loader['aug'] cutout = conf_loader['cutout'] batch_size = conf_loader['batch'] max_batches = conf_dataset['max_batches'] epochs = conf_loader['epochs'] conf_model = conf['autoaug']['model'] conf_opt = conf['autoaug']['optimizer'] conf_lr_sched = conf['autoaug']['lr_schedule'] n_workers = conf_loader['n_workers'] # endregion # initialize horovod # TODO: move to common init if horovod: import horovod.torch as hvd hvd.init() device = torch.device('cuda', hvd.local_rank()) torch.cuda.set_device(device) if not reporter: reporter = lambda **kwargs: 0 # get dataloaders with transformations and splits applied train_dl, valid_dl, test_dl = get_dataloaders(ds_name, batch_size, dataroot, aug, cutout, load_train=True, load_test=True, val_ratio=val_ratio, val_fold=val_fold, horovod=horovod, n_workers=n_workers, max_batches=max_batches) # create a model & an optimizer model = get_model(conf_model, num_class(ds_name), data_parallel=(not horovod)) # select loss function and optimizer lossfn = nn.CrossEntropyLoss() optimizer = ml_utils.create_optimizer(conf_opt, model.parameters()) # distributed optimizer if horovod is used is_master = True if horovod: optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) # issue : https://github.com/horovod/horovod/issues/1099 optimizer._requires_update = set() hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) if hvd.rank() != 0: is_master = False logger.debug('is_master=%s' % is_master) # select LR schedule scheduler = ml_utils.create_lr_scheduler(conf_lr_sched, epochs, optimizer, len(train_dl)) result = OrderedDict() epoch_start = 1 # if model available from previous checkpount then load it if save_path and os.path.exists(save_path): logger.info('%s checkpoint found. loading...' % save_path) data = torch.load(save_path) # when checkpointing we do add 'model' key so other cases are special cases if 'model' in data or 'state_dict' in data: key = 'model' if 'model' in data else 'state_dict' logger.info('checkpoint epoch@%d' % data['epoch']) # TODO: do we need change here? if not isinstance(model, DataParallel): # for non-dataparallel models, remove default 'module.' prefix model.load_state_dict({k.replace('module.', ''): \ v for k, v in data[key].items()}) else: # for dataparallel models, make sure 'module.' prefix exist model.load_state_dict({k if 'module.' in k \ else 'module.'+k: v for k, v in data[key].items()}) # load optimizer optimizer.load_state_dict(data['optimizer']) # restore epoch count if data['epoch'] < epochs: epoch_start = data['epoch'] else: # epochs finished, switch to eval mode only_eval = False else: model.load_state_dict({k: v for k, v in data.items()}) del data else: logger.info('model checkpoint does not exist at "%s". skip \ to pretrain weights...' 
% save_path) only_eval = False # we made attempt to load checkpt but as it does not exist, switch to train mode # if eval only then run model on train, test and val sets if only_eval: logger.info('evaluation only+') model.eval() rs = dict() # stores metrics for each set rs['train'] = run_epoch(conf, logger, model, train_dl, lossfn, None, split_type='train', epoch=0) if valid_dl: rs['valid'] = run_epoch(conf, logger, model, valid_dl, lossfn, None, split_type='valid', epoch=0) rs['test'] = run_epoch(conf, logger, model, test_dl, lossfn, None, split_type='test', epoch=0) for key, setname in itertools.product(['loss', 'top1', 'top5'], ['train', 'valid', 'test']): result['%s_%s' % (key, setname)] = rs[setname][key] result['epoch'] = 0 return result # train loop best_top1, best_valid_loss = 0, 10.0e10 max_epoch = epochs for epoch in range(epoch_start, max_epoch + 1): if horovod: trainsampler.set_epoch(epoch) # run train epoch and update the model model.train() rs = dict() rs['train'] = run_epoch(conf, logger, model, train_dl, lossfn, optimizer, split_type='train', epoch=epoch, verbose=is_master, scheduler=scheduler) if scheduler[0]: scheduler[0].step() model.eval() # check for nan loss if math.isnan(rs['train']['loss']): raise Exception('train loss is NaN.') # collect metrics on val and test set, checkpoint if epoch % checkpoint_freq == 0 or epoch == max_epoch: if valid_dl: rs['valid'] = run_epoch(conf, logger, model, valid_dl, lossfn, None, split_type='valid', epoch=epoch, verbose=is_master) rs['test'] = run_epoch(conf, logger, model, test_dl, lossfn, None, split_type='test', epoch=epoch, verbose=is_master) # TODO: is this good enough condition? if rs[metric]['loss'] < best_valid_loss or rs[metric][ 'top1'] > best_top1: best_top1 = rs[metric]['top1'] best_valid_loss = rs[metric]['loss'] for key, setname in itertools.product( ['loss', 'top1', 'top5'], ['train', 'valid', 'test']): result['%s_%s' % (key, setname)] = rs[setname][key] result['epoch'] = epoch writer.add_scalar('best_top1/valid', rs['valid']['top1'], epoch) writer.add_scalar('best_top1/test', rs['test']['top1'], epoch) reporter(loss_valid=rs['valid']['loss'], top1_valid=rs['valid']['top1'], loss_test=rs['test']['loss'], top1_test=rs['test']['top1']) # save checkpoint if is_master and save_path: logger.info('save model@%d to %s' % (epoch, save_path)) torch.save( { 'epoch': epoch, 'log': { 'train': rs['train'].get_dict(), 'valid': rs['valid'].get_dict(), 'test': rs['test'].get_dict(), }, 'optimizer': optimizer.state_dict(), 'model': model.state_dict() }, save_path) del model result['top1_test'] = best_top1 return result
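# The epoch loop above calls trainsampler.set_epoch(epoch), but trainsampler is
# never defined in this snippet. With Horovod the usual pattern is to keep a
# handle on the DistributedSampler used to build the train loader and re-seed
# it every epoch so shuffling differs per epoch but stays consistent across
# ranks. A minimal sketch with placeholder names:
import horovod.torch as hvd
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def make_train_loader(train_dataset, batch_size):
    sampler = DistributedSampler(train_dataset, num_replicas=hvd.size(), rank=hvd.rank())
    loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
    return loader, sampler

# Inside the epoch loop:
#   sampler.set_epoch(epoch)   # reshuffle for this epoch, identically on every rank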
def _train_rnn(dataset, index, hidden_size, n_layers, bidirectional, classifier, n_epochs_max, batch_size, n_workers, file_name, root_dir=ROOT_DIR, lr=0.001, betas=(0.9, 0.999), opt_level="O0", seed=42, log_interval=10): '''constructs and trains neural networks given the dataset instance and network structure inputs ------ dataset - instance of dataset class inherited from Dataset class data should contain keys "data" and "labels" index - dict with keys "train" and "val" whose values are list-like indices hidden_size - size of hidden state n_layers - number of recurrent layers bidirectional - if True, becomes a bidirectional LSTM classifier: boolean indicating whether it's a classifier or regressor. n_epochs_max - maximum number of epochs to run. can be terminated by KeyboardInterrupt batch_size - batch size for training n_workers - int indicating number of workers when creating DataLoader instance file_name - name of file that contains the empty meta data root_dir - root directory of the meta data file lr - float type learning rate betas - tuple of floats indicating betas arguments in Adam optimizer opt_level - optimization level seed - random seed log_interval - how many batches to wait before logging training status outputs ------- saves data into given file ''' #hvd.init() # initialize horovod torch.manual_seed(seed) # FIXME: necessary here? #torch.cuda.set_device(hvd.local_rank()) # pin GPU to local rank # limit # of CPU threads to be used per worker : FIXME: why do this? torch.set_num_threads(1) file_name, _ = os.path.splitext(file_name) file_path = os.path.join(root_dir, file_name + '.pt') if hvd.rank() == 0: assert(len(dataset) == (len(index['train']) + len(index['val']))),\ "Size mismatch between dataset and index" # Partition dataset among workers using DistributedSampler sampler = { phase: DistributedSampler(Subset(dataset, index[phase]), num_replicas=hvd.size(), rank=hvd.rank()) for phase in ['train', 'val'] } dataloader = { phase: DataLoader(dataset, sampler=sampler[phase], batch_size=batch_size, num_workers=n_workers, collate_fn=collate_fn, pin_memory=True) for phase in ['train', 'val'] } # FIXME: pin_memory? perhaps pin_memory means putting data on the gpu, n_data = {phase: len(index[phase]) for phase in ['train', 'val']} input_size = dataset[0]['data'].shape[-2] meta = torch.load(file_path) output_size = len(meta['labels_lut']) if classifier else 1 rnn = RNN(input_size, hidden_size=hidden_size, output_size=output_size, n_layers=n_layers, bidirectional=bidirectional).cuda() optimizer = optim.Adam(rnn.parameters(), lr=lr, betas=betas) # Add Horovod Distributed Optimizer optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=rnn.named_parameters()) # Broadcast parameters from rank 0 to all other processes. hvd.broadcast_parameters(rnn.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) # apex rnn, optimizer = amp.initialize(rnn, optimizer, opt_level=opt_level) criterion = nn.CrossEntropyLoss( reduction='sum') if classifier else nn.MSELoss(reduction='sum') criterion = criterion.cuda() metric = 'cross_entropy_mean' if classifier else 'rmse' time_start = time.time() for epoch in range(n_epochs_max): loss_sum = {} loss_metric = {} for phase in ['train', 'val']: rnn.train(phase == 'train') loss_sum[phase] = 0. for batch in dataloader[phase]: # permute s.t. 
shape is (data_len, n_data_total, n_channels * (n_scat_nodes)) batch_data = batch['data'].permute([2, 0, 1]).cuda() batch_labels = batch['labels'].cuda() input_lens = batch['input_lens'].cuda() output = rnn(batch_data, input_lens=input_lens) # for regression, output of rnn is shaped (batch_size, 1). drop dummy axis if classifier: batch_labels = batch_labels.type(torch.cuda.LongTensor) else: output = output[:, 0] loss = criterion(output, batch_labels) optimizer.zero_grad() if phase == 'train': with amp.scale_loss(loss, optimizer) as scaled_loss: scaled_loss.backward() optimizer.synchronize() with optimizer.skip_synchronize(): optimizer.step() loss_sum[phase] += loss.data.item() # classification: cross entropy mean, regression: RMSE loss per data point loss_metric[phase] = loss_sum[phase] / n_data[ phase] if classifier else np.sqrt(loss_sum[phase] / n_data[phase]) if epoch % log_interval == 0 and hvd.rank() == 0: time_curr = time.time() elapsed = time_curr - time_start loss_msg = ( "\t{} out of {} epochs, {}_train:{:.15f}, {}_val:{:.15f}, elapsed seconds:{:.2f}" .format(epoch, n_epochs_max, metric, loss_metric['train'], metric, loss_metric['val'], elapsed)) print(loss_msg) meta = torch.load(file_path) if classifier: meta['epoch'].append(epoch) meta['elapsed'].append(elapsed) #meta['weights'] = rnn.state_dict() for phase in ['train', 'val']: meta['loss'][phase].append(loss_metric[phase]) else: meta['epoch'].append(epoch) meta['elapsed'].append(elapsed) #meta['weights'] = rnn.state_dict() for phase in ['train', 'val']: meta['loss'][phase].append(loss_metric[phase]) torch.save(meta, file_path)
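# Hedged sketch of the Apex AMP + Horovod update used in the training loop
# above: optimizer.synchronize() forces the gradient allreduce inside the
# scale_loss context so unscaling sees the averaged gradients, and step() then
# runs with the wrapper's implicit synchronization disabled. `loss` and
# `optimizer` are assumed to come from an amp.initialize()-ed model and an
# hvd.DistributedOptimizer.
from apex import amp

def apex_horovod_step(loss, optimizer):
    optimizer.zero_grad()
    with amp.scale_loss(loss, optimizer) as scaled_loss:
        scaled_loss.backward()
        optimizer.synchronize()       # allreduce gradients explicitly
    with optimizer.skip_synchronize():
        optimizer.step()              # apply the update without a second allreduce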
def start_training(cfg): set_random_seed(cfg.seed) n_gpu = hvd.size() cfg.n_gpu = n_gpu device = torch.device("cuda", hvd.local_rank()) torch.cuda.set_device(hvd.local_rank()) if hvd.rank() != 0: LOGGER.disabled = True LOGGER.info("device: {} n_gpu: {}, rank: {}, " "16-bits training: {}".format(device, n_gpu, hvd.rank(), bool(cfg.fp16))) model = setup_model(cfg, device=device) model.train() optimizer = setup_e2e_optimizer(model, cfg) # Horovod: (optional) compression algorithm.compressin compression = hvd.Compression.none optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) model, optimizer = amp.initialize(model, optimizer, enabled=cfg.fp16, opt_level='O2', keep_batchnorm_fp32=True) # prepare data tokenizer = BertTokenizerFast.from_pretrained(cfg.tokenizer_dir) train_loader, val_loader = setup_dataloaders(cfg, tokenizer) eval_loader = mk_video_ret_eval_dataloader( anno_path=cfg.val_datasets[0].txt, lmdb_dir=cfg.val_datasets[0].img, cfg=cfg, tokenizer=tokenizer, ) # compute the number of steps and update cfg total_n_examples = len(train_loader.dataset) * cfg.max_n_example_per_group total_train_batch_size = int(n_gpu * cfg.train_batch_size * cfg.gradient_accumulation_steps * cfg.max_n_example_per_group) cfg.num_train_steps = int( math.ceil(1. * cfg.num_train_epochs * total_n_examples / total_train_batch_size)) cfg.valid_steps = int( math.ceil(1. * cfg.num_train_steps / cfg.num_valid / cfg.min_valid_steps)) * cfg.min_valid_steps actual_num_valid = int( math.floor(1. * cfg.num_train_steps / cfg.valid_steps)) + 1 # restore restorer = TrainingRestorer(cfg, model, optimizer) global_step = restorer.global_step TB_LOGGER.global_step = global_step if hvd.rank() == 0: LOGGER.info("Saving training meta...") save_training_meta(cfg) path = join(cfg.output_dir, 'log', "detectron2_model_cfg.yaml") with open(path, "w") as f: f.write(model.cnn.config_file) LOGGER.info("Saving training done...") TB_LOGGER.create(join(cfg.output_dir, 'log')) pbar = tqdm(total=cfg.num_train_steps) model_saver = ModelSaver(join(cfg.output_dir, "ckpt")) add_log_to_file(join(cfg.output_dir, "log", "log.txt")) else: LOGGER.disabled = True pbar = NoOp() model_saver = NoOp() restorer = NoOp() if global_step > 0: pbar.update(global_step) LOGGER.info(cfg) LOGGER.info("Starting training...") LOGGER.info(f"***** Running training with {n_gpu} GPUs *****") LOGGER.info( f" Single-GPU Non-Accumulated batch size = {cfg.train_batch_size}") LOGGER.info(f" max_n_example_per_group = {cfg.max_n_example_per_group}") LOGGER.info(f" Accumulate steps = {cfg.gradient_accumulation_steps}") LOGGER.info( f" Total batch size = #GPUs * Single-GPU batch size * " f"max_n_example_per_group * Accumulate steps [Image] = {total_train_batch_size}" ) LOGGER.info(f" Total #epochs = {cfg.num_train_epochs}") LOGGER.info(f" Total #steps = {cfg.num_train_steps}") LOGGER.info( f" Validate every {cfg.valid_steps} steps, in total {actual_num_valid} times" ) # quick hack for amp delay_unscale bug with optimizer.skip_synchronize(): optimizer.zero_grad() if global_step == 0: optimizer.step() debug_step = 3 running_loss = RunningMeter('train_loss') for step, batch in enumerate(InfiniteIterator(train_loader)): # forward pass del batch["caption_ids"] mini_batch = dict() for k, v in batch.items(): if k != "visual_inputs": mini_batch[k] = v pool_method = 
cfg.score_agg_func # could be 1, where only a single clip is used num_clips = cfg.train_n_clips num_frm = cfg.num_frm # (B, T=num_clips*num_frm, C, H, W) --> (B, num_clips, num_frm, C, H, W) bsz = batch["visual_inputs"].shape[0] new_visual_shape = (bsz, num_clips, num_frm) + batch["visual_inputs"].shape[2:] visual_inputs = batch["visual_inputs"].view(*new_visual_shape) logits = [] for clip_idx in range(num_clips): # (B, num_frm, C, H, W) mini_batch["visual_inputs"] = visual_inputs[:, clip_idx] mini_batch["n_examples_list"] = batch["n_examples_list"] outputs = forward_step(model, mini_batch, cfg) logits.append(outputs["logits"]) # the losses are cross entropy and mse, no need to * num_labels logits = torch.stack(logits) # (num_frm, B, 5) if pool_method == "mean": logits = logits.mean(0) # (B, 5) elif pool_method == "max": logits = logits.max(0)[0] # (B, 5) elif pool_method == "lse": logits = logits.permute( 1, 0, 2).contiguous() # (B, num_frm, 5), pooling will be done in CE else: raise ValueError( f"Invalid value for pool_method, " f"got {pool_method}, expect one of [`mean`, `max`, `lse`]") if pool_method == "lse": out = torch.logsumexp(logits.view(logits.shape[0], -1), dim=-1, keepdim=True) \ - torch.logsumexp(logits, dim=1) loss = torch.gather(out, -1, batch["labels"].view(-1, 1)) else: _, loss = model.transformer.calc_loss( logits, batch["labels"], sample_size=len(batch["n_examples_list"])) loss = loss.mean() running_loss(loss.item()) # backward pass delay_unscale = (step + 1) % cfg.gradient_accumulation_steps != 0 with amp.scale_loss(loss, optimizer, delay_unscale=delay_unscale) as scaled_loss: scaled_loss.backward() zero_none_grad(model) optimizer.synchronize() # optimizer if (step + 1) % cfg.gradient_accumulation_steps == 0: global_step += 1 # learning rate scheduling n_epoch = int(1. 
* total_train_batch_size * global_step / total_n_examples) # learning rate scheduling transformer lr_this_step_transformer = get_lr_sched( global_step, cfg.decay, cfg.learning_rate, cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio, decay_epochs=cfg.step_decay_epochs, multi_step_epoch=n_epoch) # learning rate scheduling cnn lr_this_step_cnn = get_lr_sched( global_step, cfg.cnn_lr_decay, cfg.cnn_learning_rate, cfg.num_train_steps, warmup_ratio=cfg.warmup_ratio, decay_epochs=cfg.cnn_step_decay_epochs, multi_step_epoch=n_epoch) # Hardcoded param group length assert len(optimizer.param_groups) == 8 for pg_n, param_group in enumerate(optimizer.param_groups): if pg_n in [0, 1]: param_group['lr'] = (cfg.transformer_lr_mul * lr_this_step_transformer) elif pg_n in [2, 3]: param_group['lr'] = lr_this_step_transformer elif pg_n in [4, 5]: param_group['lr'] = (cfg.cnn_lr_mul * lr_this_step_cnn) else: param_group['lr'] = lr_this_step_cnn TB_LOGGER.add_scalar("train/lr_transformer", lr_this_step_transformer, global_step) TB_LOGGER.add_scalar("train/lr_cnn", lr_this_step_cnn, global_step) TB_LOGGER.add_scalar('train/loss', running_loss.val, global_step) # update model params if cfg.grad_norm != -1: grad_norm = clip_grad_norm_(amp.master_params(optimizer), cfg.grad_norm) TB_LOGGER.add_scalar("train/grad_norm", grad_norm, global_step) TB_LOGGER.step() # Check if there is None grad none_grads = [ p[0] for p in model.named_parameters() if p[1].requires_grad and p[1].grad is None ] assert len(none_grads) == 0, f"{none_grads}" with optimizer.skip_synchronize(): optimizer.step() optimizer.zero_grad() restorer.step() pbar.update(1) # checkpoint if global_step % cfg.valid_steps == 0: LOGGER.info(f'Step {global_step}: start validation') validate(model, val_loader, eval_loader, cfg, global_step, eval_filepath=cfg.val_datasets[0].txt) model_saver.save(step=global_step, model=model) if global_step >= cfg.num_train_steps: break if cfg.debug and global_step >= debug_step: break if global_step % cfg.valid_steps != 0: LOGGER.info(f'Step {global_step}: start validation') validate(model, val_loader, eval_loader, cfg, global_step, eval_filepath=cfg.val_datasets[0].txt) model_saver.save(step=global_step, model=model)
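# The loop above accumulates gradients manually (delay_unscale plus explicit
# synchronize/skip_synchronize). Horovod also supports accumulation natively via
# backward_passes_per_step, sketched here with placeholder model/data names;
# this is an alternative pattern, not the original code's method.
import torch
import horovod.torch as hvd

def accumulate_example(model, batches, accumulation_steps=4):
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        backward_passes_per_step=accumulation_steps)  # allreduce fires only on the last micro-batch
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    optimizer.zero_grad()
    for i, (x, y) in enumerate(batches):
        loss = torch.nn.functional.mse_loss(model(x), y) / accumulation_steps
        loss.backward()
        if (i + 1) % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()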
def main(): hvd.init() torch.manual_seed(args.seed) # Horovod: print logs on the first worker. verbose = 1 if hvd.rank() == 0 else 0 # Horovod: write TensorBoard logs on first worker. log_writer = tensorboardX.SummaryWriter(args.log_dir) if hvd.rank() == 0 else None kwargs = {'num_workers': 0, 'pin_memory': True} print("======= START LOADING DATA =========") #train_n = len(os.listdir(args.train_dir)) train_n = 30 train_files = sorted(os.listdir(args.train_dir))[:train_n] #train_files = ["batch_train_{}.h5".format(i) for i in range(train_n)] #train_files = ["batch_train_0.h5", "batch_train_1.h5"] train_dataset = HDF5Dataset(args.train_dir, train_files) ''' train_loader = DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True, num_workers=2) ''' val_n = 10 val_files = sorted(os.listdir(args.val_dir))[:val_n] val_dataset = HDF5Dataset(args.val_dir, val_files) ''' val_loader = DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=True) ''' print("Training size") print(len(train_dataset)) print("Dev size") print("Test size") print(len(val_dataset)) # Horovod: use DistributedSampler to partition data among workers. Manually specify # `num_replicas=hvd.size()` and `rank=hvd.rank()`. train_sampler = torch.utils.data.distributed.DistributedSampler( train_dataset, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_dataset, batch_size=args.batch_size, sampler=train_sampler, **kwargs) val_sampler = torch.utils.data.distributed.DistributedSampler( val_dataset, num_replicas=hvd.size(), rank=hvd.rank()) val_loader = torch.utils.data.DataLoader( val_dataset, batch_size=args.val_batch_size, sampler=val_sampler, **kwargs) # Set up standard ResNet-50 model. model = models.resnet50() model.cuda() # Horovod: scale learning rate by the number of GPUs. # Gradient Accumulation: scale learning rate by batches_per_allreduce optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=args.momentum, weight_decay=args.wd) compression = hvd.Compression.fp16 # Horovod: wrap optimizer with DistributedOptimizer. optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters(), compression=compression, backward_passes_per_step=args.batches_per_allreduce) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(optimizer, root_rank=0) def train(epoch): model.train() train_sampler.set_epoch(epoch) train_loss = Metric('train_loss') train_accuracy = Metric('train_accuracy') with tqdm(total=len(train_loader), desc='Train Epoch #{}'.format(epoch + 1), disable=not verbose) as t: for batch_idx, (data, target) in enumerate(train_loader): # print(data, target) if args.cuda: data, target = data.cuda(), target.cuda() optimizer.zero_grad() # Split data into sub-batches of size batch_size for i in range(0, len(data), args.batch_size): data_batch = data[i:i + args.batch_size] target_batch = target[i:i + args.batch_size] output = model(data_batch) train_accuracy.update(accuracy(output, target_batch)) loss = F.cross_entropy(output, target_batch) train_loss.update(loss) # Average gradients among sub-batches loss.div_(math.ceil(float(len(data)) / args.batch_size)) loss.backward() # Gradient is applied across all ranks optimizer.step() t.set_postfix({'loss': train_loss.avg.item(), 'accuracy': 100. 
* train_accuracy.avg.item()}) t.update(1) if log_writer: log_writer.add_scalar('train/loss', train_loss.avg, epoch) log_writer.add_scalar('train/accuracy', train_accuracy.avg, epoch) def validate(epoch): model.eval() val_loss = Metric('val_loss') val_accuracy = Metric('val_accuracy') with tqdm(total=len(val_loader), desc='Validate Epoch #{}'.format(epoch + 1), disable=not verbose) as t: with torch.no_grad(): for data, target in val_loader: if args.cuda: data, target = data.cuda(), target.cuda() output = model(data) val_loss.update(F.cross_entropy(output, target)) val_accuracy.update(accuracy(output, target)) t.set_postfix({'loss': val_loss.avg.item(), 'accuracy': 100. * val_accuracy.avg.item()}) t.update(1) if log_writer: log_writer.add_scalar('val/loss', val_loss.avg, epoch) log_writer.add_scalar('val/accuracy', val_accuracy.avg, epoch) def accuracy(output, target): # get the index of the max log-probability pred = output.max(1, keepdim=True)[1] return pred.eq(target.view_as(pred)).cpu().float().mean() # Horovod: average metrics from distributed training. class Metric(object): def __init__(self, name): self.name = name self.sum = torch.tensor(0.) self.n = torch.tensor(0.) def update(self, val): self.sum += hvd.allreduce(val.detach().cpu(), name=self.name) self.n += 1 @property def avg(self): return self.sum / self.n for epoch in range(resume_from_epoch, args.epochs): train(epoch) validate(epoch)
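# The final loop above starts at resume_from_epoch, which is not defined in this
# snippet. In the stock Horovod ImageNet example the newest checkpoint is located
# on rank 0 and the epoch number is broadcast so every rank resumes at the same
# point; a self-contained sketch of that idea (checkpoint_format is a placeholder):
import os
import torch
import horovod.torch as hvd

def find_resume_epoch(num_epochs, checkpoint_format='checkpoint-{epoch}.pth.tar'):
    # Look for the newest checkpoint file ...
    resume_from_epoch = 0
    for try_epoch in range(num_epochs, 0, -1):
        if os.path.exists(checkpoint_format.format(epoch=try_epoch)):
            resume_from_epoch = try_epoch
            break
    # ... and broadcast the value found on rank 0 so all workers agree.
    return hvd.broadcast(torch.tensor(resume_from_epoch), root_rank=0,
                         name='resume_from_epoch').item()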
epochCurrent = checkpoint['epoch']
lossesG = checkpoint['lossesG']
lossesD = checkpoint['lossesD']
num_vid = checkpoint['num_vid']
i_batch_current = checkpoint['i_batch'] + 1

G.train()
E.train()
D.train()

# Horovod: broadcast parameters from rank 0 to all other processes.
hvd.broadcast_parameters(G.state_dict(), root_rank=0)
hvd.broadcast_parameters(E.state_dict(), root_rank=0)
hvd.broadcast_parameters(D.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizerG, root_rank=0)
hvd.broadcast_optimizer_state(optimizerE, root_rank=0)
hvd.broadcast_optimizer_state(optimizerD, root_rank=0)

optimizerG = hvd.DistributedOptimizer(optimizerG, named_parameters=G.named_parameters())
optimizerE = hvd.DistributedOptimizer(optimizerE, named_parameters=E.named_parameters())
optimizerD = hvd.DistributedOptimizer(optimizerD, named_parameters=D.named_parameters())

print("Start training")

# Training
batch_start = datetime.now()
for epoch in range(epochCurrent, num_epochs):
def setup(self, model):
    # call setup after the ddp process has connected
    self.trainer.call_setup_hook(model)

    if torch.cuda.is_available() and self.trainer.on_gpu:
        # Horovod: pin GPU to local rank
        assert self.trainer.root_gpu == hvd.local_rank()
        torch.cuda.set_device(self.trainer.root_gpu)
        model.cuda(self.trainer.root_gpu)

    # avoid duplicating progress bar
    if hvd.rank() != 0 and self.trainer.progress_bar_callback is not None:
        self.trainer.progress_bar_callback.disable()

    # CHOOSE OPTIMIZER
    # allow for lr schedulers as well
    optimizers, lr_schedulers, optimizer_frequencies = self.trainer.init_optimizers(model)
    self.trainer.optimizers = optimizers
    self.trainer.lr_schedulers = lr_schedulers
    self.trainer.optimizer_frequencies = optimizer_frequencies

    # Horovod: scale the learning rate by the number of workers to account for
    # increased total batch size
    for optimizer in self.trainer.optimizers:
        for param_group in optimizer.param_groups:
            param_group['lr'] *= hvd.size()

    # Horovod: adjust base LR used by schedulers to match scaled optimizer initial LR
    for scheduler in self.trainer.lr_schedulers:
        scheduler = scheduler['scheduler']
        if isinstance(scheduler, _LRScheduler):
            scheduler.base_lrs = [lr * hvd.size() for lr in scheduler.base_lrs]

    if self.trainer.amp_backend:
        model, optimizers = model.configure_apex(amp, model, self.trainer.optimizers, self.trainer.amp_level)
        self.trainer.optimizers = optimizers
        self.trainer.reinit_scheduler_properties(self.trainer.optimizers, self.trainer.lr_schedulers)

    # Horovod: broadcast parameters & optimizer state to ensure consistent initialization
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    for optimizer in self.trainer.optimizers:
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    def filter_named_parameters(model, optimizer):
        opt_params = set([p for group in optimizer.param_groups for p in group.get('params', [])])
        return [(name, p) for name, p in model.named_parameters() if p in opt_params]

    # Horovod: wrap optimizers to perform gradient aggregation via allreduce
    self.trainer.optimizers = [
        hvd.DistributedOptimizer(optimizer, named_parameters=filter_named_parameters(model, optimizer))
        for optimizer in self.trainer.optimizers
    ]

    # Update logger rank info from Horovod to avoid race conditions from different ranks
    # creating directories / writing files in the same locations.
    self.trainer.global_rank = hvd.rank()
    rank_zero_only.rank = self.trainer.global_rank

    self.trainer.model = model
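# The scheduler fix-up above, shown in isolation: once the optimizer's lr has
# been multiplied by hvd.size(), any _LRScheduler's recorded base_lrs must be
# scaled the same way, otherwise the next scheduler.step() recomputes lr from
# the unscaled base values. A minimal sketch:
import horovod.torch as hvd
from torch.optim.lr_scheduler import _LRScheduler

def scale_lrs_for_horovod(optimizer, scheduler=None):
    for param_group in optimizer.param_groups:
        param_group['lr'] *= hvd.size()
    if isinstance(scheduler, _LRScheduler):
        scheduler.base_lrs = [base_lr * hvd.size() for base_lr in scheduler.base_lrs]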
def main(): if hvd.rank() == 0: logger.info("Logger is set - training start") # set default gpu device id # torch.cuda.set_device(config.gpus[0]) # set seed np.random.seed(config.seed) torch.manual_seed(config.seed) # torch.cuda.manual_seed_all(config.seed) torch.backends.cudnn.benchmark = False # get data with meta info (train_X, train_y), (valid_X, valid_y) = load_data() in_dim = np.shape(train_X)[1] out_dim = np.shape(train_y)[1] train_X, train_y = (torch.tensor(train_X, dtype=torch.float), torch.tensor(train_y)) train_data = torch.utils.data.TensorDataset(train_X, train_y) valid_X, valid_y = (torch.tensor(valid_X, dtype=torch.float), torch.tensor(valid_y)) valid_data = torch.utils.data.TensorDataset(valid_X, valid_y) print("in_dim: ", in_dim) print("out_dim: ", out_dim) net_crit = nn.MSELoss().to(device) layers = 1 n_nodes = 4 model = SearchFCNNController(in_dim, out_dim, layers, net_crit, n_nodes=n_nodes, device_ids=config.gpus) model = model.to(device) # weights optimizer # By default, Adasum doesn't need scaling up learning rate. lr_scaler = hvd.size() # w_optim = torch.optim.SGD( # model.weights(), # config.w_lr * lr_scaler, # momentum=config.w_momentum, # weight_decay=config.w_weight_decay, # ) w_optim = torch.optim.Adagrad(model.weights(), config.w_lr * lr_scaler, weight_decay=config.w_weight_decay) # w_optim = torch.optim.RMSprop(model.weights()) # alphas optimizer alpha_lr = config.alpha_lr alpha_optim = torch.optim.Adam( model.alphas(), alpha_lr, betas=(0.5, 0.999), weight_decay=config.alpha_weight_decay, ) # split data to train/validation train_sampler = torch.utils.data.distributed.DistributedSampler( train_data, num_replicas=hvd.size(), rank=hvd.rank()) # valid_sampler = torch.utils.data.sampler.SubsetRandomSampler(indices_valid) valid_sampler = torch.utils.data.distributed.DistributedSampler( valid_data, num_replicas=hvd.size(), rank=hvd.rank()) train_loader = torch.utils.data.DataLoader( train_data, batch_size=config.batch_size, sampler=train_sampler, num_workers=config.workers, pin_memory=True, ) # vis. # dataiter = iter(train_loader) # images, labels = dataiter.next() # writer.add_graph(model, [images[0]]) # writer.close() valid_loader = torch.utils.data.DataLoader( valid_data, batch_size=config.batch_size, sampler=valid_sampler, num_workers=config.workers, pin_memory=True, ) lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR( w_optim, config.epochs, eta_min=config.w_lr_min) architect = Architect(model, config.w_momentum, config.w_weight_decay, allow_unused=False) # Horovod: broadcast parameters & optimizer state. hvd.broadcast_parameters(model.state_dict(), root_rank=0) hvd.broadcast_optimizer_state(w_optim, root_rank=0) # Horovod: (optional) compression algorithm. # compression = hvd.Compression.fp16 # Horovod: wrap optimizer with DistributedOptimizer. 
w_optim = hvd.DistributedOptimizer( w_optim, named_parameters=model.named_parameters(), # compression=compression, # op=hvd.Adasum, op=hvd.Average, ) # training loop best_top1 = None epochs = config.epochs for epoch in range(epochs): lr = lr_scheduler.get_lr()[0] if hvd.rank() == 0: model.print_alphas(logger) # training train( train_loader, valid_loader, model, architect, w_optim, alpha_optim, lr, epoch, train_sampler, ) lr_scheduler.step() # validation cur_step = (epoch + 1) * len(train_loader) top1 = validate(valid_loader, model, epoch, cur_step) top1 = metric_average(top1, name="avg_val_top1") if hvd.rank() == 0: # log # genotype genotype = model.genotype() logger.info("genotype = {}".format(genotype)) # genotype as a image plot_path = "." + os.path.join(config.plot_path, "EP{:02d}".format(epoch + 1)) caption = "Epoch {}".format(epoch + 1) plot(genotype.normal, plot_path + "-normal", caption) # save if best_top1 is None or best_top1 < top1: best_top1 = top1 best_genotype = genotype is_best = True else: is_best = False # utils.save_checkpoint(model, "." + config.path, is_best) print("") if hvd.rank() == 0: best_genotype = model.genotype() with open("." + config.path + "/best_genotype.txt", "w") as f: f.write(str(best_genotype)) logger.info("Final best TopR2@1 = {:.3f}".format(best_top1)) logger.info("Best Genotype = {}".format(best_genotype))
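# metric_average() is called in the loop above but not defined in this snippet.
# The conventional helper from the Horovod PyTorch examples averages a scalar
# across workers with an allreduce (hvd.allreduce averages by default):
import torch
import horovod.torch as hvd

def metric_average(val, name):
    tensor = torch.tensor(val, dtype=torch.float32)
    avg_tensor = hvd.allreduce(tensor, name=name)
    return avg_tensor.item()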
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("-exp_dir")
    parser.add_argument("-dataPath", default='', type=str, help="path of data files")
    parser.add_argument("-train_config")
    parser.add_argument("-data_config")
    parser.add_argument("-lr", default=0.0001, type=float, help="Override the LR in the config")
    parser.add_argument("-batch_size", default=32, type=int, help="Override the batch size in the config")
    parser.add_argument("-data_loader_threads", default=0, type=int, help="number of workers for data loading")
    parser.add_argument("-max_grad_norm", default=5, type=float, help="max_grad_norm for gradient clipping")
    parser.add_argument("-sweep_size", default=200, type=float, help="process n hours of data per sweep (default:200)")
    parser.add_argument("-num_epochs", default=1, type=int, help="number of training epochs (default:1)")
    parser.add_argument("-global_mvn", default=False, type=bool, help="if apply global mean and variance normalization")
    parser.add_argument("-resume_from_model", type=str, help="the model from which you want to resume training")
    parser.add_argument("-dropout", type=float, help="set the dropout ratio")
    parser.add_argument("-aneal_lr_epoch", default=2, type=int, help="start to anneal the learning rate from this epoch")
    parser.add_argument("-aneal_lr_ratio", default=0.5, type=float, help="the ratio by which to anneal the learning rate")
    parser.add_argument('-p', '--print-freq', default=100, type=int, metavar='N', help='print frequency (default: 100)')
    parser.add_argument('-hvd', default=False, type=bool, help="whether to use horovod for training")

    args = parser.parse_args()

    with open(args.train_config) as f:
        config = yaml.safe_load(f)
    config["sweep_size"] = args.sweep_size

    with open(args.data_config) as f:
        data = yaml.safe_load(f)
    config["source_paths"] = [j for i, j in data['clean_source'].items()]
    if 'dir_noise' in data:
        config["dir_noise_paths"] = [j for i, j in data['dir_noise'].items()]
    if 'rir' in data:
        config["rir_paths"] = [j for i, j in data['rir'].items()]
    config['data_path'] = args.dataPath

    print("Experiment starts with config {}".format(json.dumps(config, sort_keys=True, indent=4)))

    # Initialize Horovod
    if args.hvd:
        import horovod.torch as hvd
        hvd.init()
        th.cuda.set_device(hvd.local_rank())
        print("Run experiments with world size {}".format(hvd.size()))

    if not os.path.isdir(args.exp_dir):
        os.makedirs(args.exp_dir)

    trainset = SpeechDataset(config)
    train_dataloader = ChunkDataloader(trainset,
                                       batch_size=args.batch_size,
                                       distributed=args.hvd,  # shard the data when running with Horovod
                                       num_workers=args.data_loader_threads)

    if args.global_mvn:
        transform = GlobalMeanVarianceNormalization()
        print("Estimating global mean and variance of feature vectors...")
        transform.learn_mean_and_variance_from_train_loader(trainset,
                                                            trainset.stream_idx_for_transform,
                                                            n_sample_to_use=2000)
        trainset.transform = transform
        print("Global mean and variance transform trained successfully!")
        with open(args.exp_dir + "/transform.pkl", 'wb') as f:
            pickle.dump(transform, f, pickle.HIGHEST_PROTOCOL)

    print("Data loader set up successfully!")
    print("Number of minibatches: {}".format(len(train_dataloader)))

    # create model
    model_config = config["model_config"]
    lstm = LSTMStack(model_config["feat_dim"], model_config["hidden_size"],
                     model_config["num_layers"], model_config["dropout"], True)
    model = NnetAM(lstm, model_config["hidden_size"] * 2, model_config["label_size"])

    # Start training
    th.backends.cudnn.enabled = True
    if th.cuda.is_available():
        model.cuda()

    # optimizer
    optimizer = th.optim.Adam(model.parameters(), lr=args.lr, amsgrad=True)

    if args.hvd:
        # Broadcast parameters and optimizer state from rank 0 to all other processes.
        hvd.broadcast_parameters(model.state_dict(), root_rank=0)
        hvd.broadcast_optimizer_state(optimizer, root_rank=0)

        # Add Horovod Distributed Optimizer
        optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

    # criterion
    criterion = nn.CrossEntropyLoss(ignore_index=-100)

    start_epoch = 0
    if args.resume_from_model:
        assert os.path.isfile(args.resume_from_model), \
            "ERROR: model file {} does not exist!".format(args.resume_from_model)
        checkpoint = th.load(args.resume_from_model)
        state_dict = checkpoint['model']
        start_epoch = checkpoint['epoch']
        model.load_state_dict(state_dict)
        optimizer.load_state_dict(checkpoint['optimizer'])
        print("=> loaded checkpoint '{}' ".format(args.resume_from_model))

    model.train()
    for epoch in range(start_epoch, args.num_epochs):
        # anneal learning rate
        if epoch > args.aneal_lr_epoch:
            for param_group in optimizer.param_groups:
                param_group['lr'] *= args.aneal_lr_ratio

        run_train_epoch(model, optimizer, criterion, train_dataloader, epoch, args)

        # save model
        if not args.hvd or hvd.rank() == 0:
            checkpoint = {}
            checkpoint['model'] = model.state_dict()
            checkpoint['optimizer'] = optimizer.state_dict()
            checkpoint['epoch'] = epoch
            output_file = args.exp_dir + '/model.' + str(epoch) + '.tar'
            th.save(checkpoint, output_file)
def test_broadcast_state(self): hvd.init() N, D_in, H, D_out = 64, 100, 10, 10 x = torch.randn(N, D_in).requires_grad_() y = torch.randn(N, D_out).requires_grad_() def new_optimizer(cls, opt_params, model): p = { k: v for k, v in opt_params.items() if k in inspect.getargspec(cls.__init__).args } return cls(model.parameters(), **p) def create_model(opt_class, opt_params): model = torch.nn.Sequential( torch.nn.Linear(D_in, H), torch.nn.ReLU(), torch.nn.Linear(H, D_out), ) optimizer = new_optimizer(opt_class, opt_params, model) optimizer = hvd.DistributedOptimizer( optimizer, named_parameters=model.named_parameters()) return model, optimizer def get_model_param_values(model): params = sorted(model.state_dict().items()) return [(k, v.clone()) for k, v in params] def get_optimizer_param_values(optimizer): results = [] state_dict = optimizer.state_dict() for group in state_dict['param_groups']: for param_id in group['params']: if param_id not in state_dict['state']: continue params = sorted(state_dict['state'][param_id].items()) for k, v in params: results.append( (k, v.clone() if torch.is_tensor(v) else v)) return results # L-BFGS is currently unsupported, as are sparse tensors, which are # required by SparseAdam optimizer optimizers = [ (subclass.__name__, subclass) for subclass in torch.optim.Optimizer.__subclasses__() if subclass.__module__.startswith('torch.optim') and subclass != torch.optim.LBFGS and subclass != torch.optim.SparseAdam ] optimizers.sort() opt_params_list = [ dict(lr=0.2, momentum=0.9, weight_decay=0.1, centered=True), dict(lr=0.2) ] for (opt_name, opt_class), opt_params in itertools.product( optimizers, opt_params_list): model, optimizer = create_model(opt_class, opt_params) y_pred = model(x) loss = F.mse_loss(y_pred, y, size_average=False) optimizer.zero_grad() loss.backward() optimizer.step() model_param_values = get_model_param_values(model) for name, model_param_value in model_param_values: hvd.broadcast_(model_param_value, root_rank=0) opt_param_values_updated = [] opt_param_values = get_optimizer_param_values(optimizer) for name, opt_param_value in opt_param_values: is_tensor = torch.is_tensor(opt_param_value) if not is_tensor: t = type(opt_param_value) opt_param_value = torch.Tensor([opt_param_value]) hvd.broadcast_(opt_param_value, root_rank=0) if not is_tensor: opt_param_value = t(opt_param_value.cpu().numpy()[0]) opt_param_values_updated.append((name, opt_param_value)) opt_param_values = opt_param_values_updated if hvd.rank() == 0: state = { 'model': model.state_dict(), 'optimizer': optimizer.state_dict(), } _, fname = tempfile.mkstemp('.pt') torch.save(state, fname) model, optimizer = create_model(opt_class, opt_params) if hvd.rank() == 0: checkpoint = torch.load(fname) model.load_state_dict(checkpoint['model']) optimizer.load_state_dict(checkpoint['optimizer']) os.remove(fname) hvd.broadcast_parameters(model.state_dict(), root_rank=0) model_param_value_after = get_model_param_values(model) for before, after in zip(model_param_values, model_param_value_after): name, model_param_value = before name_after, model_param_value_after = after self.assertEqual(name, name_after) self.assertEqual(type(model_param_value), type(model_param_value_after)) self.assertTrue( (model_param_value == model_param_value_after).all()) hvd.broadcast_optimizer_state(optimizer, root_rank=0) expected_tensors = 4 if 'momentum' not in opt_params and opt_class == torch.optim.SGD: # SGD only maintains state when momentum is specified, otherwise # it does not populate the state dict, so it will 
contain no tensors. expected_tensors = 0 self.assertEqual(len(optimizer.state_dict()['state'].values()), expected_tensors) opt_param_values_after = get_optimizer_param_values(optimizer) for before, after in zip(opt_param_values, opt_param_values_after): name, opt_param_value = before name_after, opt_param_value_after = after self.assertEqual(name, name_after) self.assertEqual(type(opt_param_value), type(opt_param_value_after)) if torch.is_tensor(opt_param_value): self.assertTrue( (opt_param_value == opt_param_value_after).all()) else: self.assertEqual(opt_param_value, opt_param_value_after)
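# The test above exercises the usual checkpoint workflow for Horovod jobs:
# only rank 0 writes the file, and after (re)loading, parameters and optimizer
# state are broadcast so every rank resumes identically. A minimal sketch with
# a placeholder path:
import os
import torch
import horovod.torch as hvd

def save_checkpoint(model, optimizer, path="checkpoint.pt"):
    if hvd.rank() == 0:
        torch.save({"model": model.state_dict(), "optimizer": optimizer.state_dict()}, path)

def restore_checkpoint(model, optimizer, path="checkpoint.pt"):
    if hvd.rank() == 0 and os.path.exists(path):
        state = torch.load(path, map_location="cpu")
        model.load_state_dict(state["model"])
        optimizer.load_state_dict(state["optimizer"])
    # Every other rank picks up whatever rank 0 loaded.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)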
model = Model(
    embedding_table_shapes=EMBEDDING_TABLE_SHAPES_TUPLE,
    num_continuous=0,
    emb_dropout=0.0,
    layer_hidden_dims=[128, 128, 128],
    layer_dropout_rates=[0.0, 0.0, 0.0],
).cuda()

lr_scaler = hvd.size()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01 * lr_scaler)

hvd.broadcast_parameters(model.state_dict(), root_rank=0)
hvd.broadcast_optimizer_state(optimizer, root_rank=0)
optimizer = hvd.DistributedOptimizer(optimizer, named_parameters=model.named_parameters())

for epoch in range(args.epochs):
    start = time()
    print(f"Training epoch {epoch}")
    train_loss, y_pred, y = process_epoch(train_loader, model, train=True, optimizer=optimizer)
    hvd.join(gpu_to_use)
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    print(f"Epoch {epoch:02d}. Train loss: {train_loss:.4f}.")
    hvd.join(gpu_to_use)
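# hvd.join() above lets ranks that finish their epoch early keep participating
# in the collective ops of slower ranks, which matters when per-rank data shards
# contain different numbers of batches. A minimal sketch of the same idea;
# model, optimizer, my_batches, and device_id are placeholder names, not from
# the original snippet.
import torch.nn.functional as F
import horovod.torch as hvd

def run_uneven_epoch(model, optimizer, my_batches, device_id):
    for x, y in my_batches:          # each rank may iterate a different number of times
        optimizer.zero_grad()
        loss = F.mse_loss(model(x), y)
        loss.backward()
        optimizer.step()
    hvd.join(device_id)              # block until every rank has reached this point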
def main_worker(args):
    global best_acc1

    if hvd.rank() == 0:
        print("=> creating model '{}'".format(args.arch))
    model = models.__dict__[args.arch]()

    # Move model to GPU.
    model.cuda()

    # Use CrossEntropyLoss for multi-class classification.
    criterion = nn.CrossEntropyLoss().cuda()

    # Default hyperparameters based on the PyTorch ImageNet example.
    optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=0.9, weight_decay=1e-4)

    # Horovod: wrap optimizer with DistributedOptimizer.
    optimizer = hvd.DistributedOptimizer(
        optimizer,
        named_parameters=model.named_parameters(),
        compression=hvd.Compression.none)

    cudnn.benchmark = True

    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')

    # Default normalization based on the PyTorch ImageNet example.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])

    # Default transforms based on the PyTorch ImageNet example.
    train_dataset = datasets.ImageFolder(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ]))

    # The sampler defines the strategy to draw samples from the dataset. If specified, shuffle must be False.
    # Horovod: use DistributedSampler to partition data among workers. Manually specify
    # `num_replicas=hvd.size()` and `rank=hvd.rank()`.
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=hvd.size(), rank=hvd.rank())

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=args.bs,
        shuffle=(train_sampler is None),
        num_workers=args.num_loader_threads,
        pin_memory=True,
        sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        datasets.ImageFolder(
            valdir,
            transforms.Compose([
                transforms.Resize(256),
                transforms.CenterCrop(224),
                transforms.ToTensor(),
                normalize,
            ])),
        batch_size=args.bs,
        shuffle=False,
        num_workers=args.num_loader_threads,
        pin_memory=True)

    # Horovod: broadcast parameters & optimizer state.
    hvd.broadcast_parameters(model.state_dict(), root_rank=0)
    hvd.broadcast_optimizer_state(optimizer, root_rank=0)

    for epoch in range(0, args.epochs):
        train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, args)

        # Train the model.
        train(train_loader, model, criterion, optimizer, epoch, args)

        # Validate accuracy on the validation set.
        acc1 = validate(val_loader, model, criterion, args)

        # Remember best acc@1 and save the best model.
        is_best = acc1 > best_acc1
        best_acc1 = max(acc1, best_acc1)

        # Horovod: only save the model on rank 0.
        if is_best and (hvd.rank() == 0):
            state = {
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }
            torch.save(state, 'model_best.pth.tar')
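# adjust_learning_rate() is called above but not shown. The stock Horovod
# ImageNet example uses a warmup-then-step-decay schedule scaled by hvd.size();
# the sketch below follows that spirit and matches the call signature used
# above, but the exact schedule in the original code may differ, and
# warmup_epochs is an added placeholder parameter.
import horovod.torch as hvd

def adjust_learning_rate(optimizer, epoch, args, warmup_epochs=5):
    # Linear warmup from the single-GPU LR toward lr * hvd.size(), then step decay.
    if epoch < warmup_epochs:
        lr_adj = 1.0 / hvd.size() * (epoch * (hvd.size() - 1) / warmup_epochs + 1)
    elif epoch < 30:
        lr_adj = 1.0
    elif epoch < 60:
        lr_adj = 1e-1
    elif epoch < 80:
        lr_adj = 1e-2
    else:
        lr_adj = 1e-3
    for param_group in optimizer.param_groups:
        param_group['lr'] = args.lr * hvd.size() * lr_adj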