def train_REINFORCE(input_dir: str, workspace_dir: str, output_dir: str,
                    dataset: torch.utils.data.Dataset,
                    synthesizer: Synthesizer,
                    model: nn.Module,
                    optimizer: torch.optim.Optimizer,
                    loss: Callable[[Any], torch.Tensor],
                    evaluate: Optional[Callable[[], None]], metric: str,
                    reward: Callable[[Environment, Any], float],
                    collate: Callable[[List[Any]], Any],
                    batch_size: int,
                    n_rollout: int,
                    length: Length,
                    evaluation_interval: Optional[Length] = None,
                    snapshot_interval: Optional[Length] = None,
                    maximize: bool = True,
                    threshold: Optional[float] = None,
                    use_pretrained_model: bool = False,
                    use_pretrained_optimizer: bool = False,
                    n_dataloader_worker: int = 2,
                    device: torch.device = torch.device("cpu")) \
        -> None:
    os.makedirs(workspace_dir, exist_ok=True)

    logger.info("Prepare model")
    model.to(device)
    model.train()

    group = get_world_process_group(device)

    if hasattr(dataset, "__len__"):
        iter_per_epoch = len(dataset) // batch_size
    else:
        iter_per_epoch = 1
    evaluation_interval = evaluation_interval or Epoch(1)
    snapshot_interval = snapshot_interval or Epoch(1)
    n_iter = length.n_iter(iter_per_epoch)
    evaluation_interval_iter = evaluation_interval.n_iter(iter_per_epoch)
    snapshot_interval_iter = snapshot_interval.n_iter(iter_per_epoch)

    if use_pretrained_model:
        logger.info("Load pretrained model")
        pretrained_model = os.path.join(input_dir, "model.pt")
        state_dict = torch.load(pretrained_model,
                                map_location=torch.device("cpu"))
        model.load_state_dict(state_dict)
    if use_pretrained_optimizer:
        logger.info("Load pretrained optimizer")
        pretrained_optimizer = os.path.join(input_dir, "optimizer.pt")
        state_dict = torch.load(pretrained_optimizer,
                                map_location=torch.device("cpu"))
        optimizer.load_state_dict(state_dict)

    # Initialize extensions manager
    manager = \
        create_extensions_manager(
            n_iter, evaluation_interval_iter, snapshot_interval_iter,
            iter_per_epoch,
            model, optimizer, evaluate, metric, maximize, threshold,
            workspace_dir,
            report_metrics=["reward"])

    train_model = setup_distributed_training(model, loss, group)

    logger.info("Start training")
    try:
        while manager.iteration < n_iter:
            loader = create_dataloader(dataset, batch_size,
                                       n_dataloader_worker, lambda x: x)
            for samples in logger.iterable_block("iteration", loader, True):
                if manager.iteration >= n_iter:
                    break
                # Rollout
                rollouts = []
                train_model.train()
                with torch.no_grad():
                    for sample in logger.iterable_block("rollout", samples):
                        sample_inputs = sample.clone_without_supervision()
                        sample_inputs.to(device)
                        for rollout in logger.iterable_block(
                                "sample",
                                synthesizer(sample_inputs,
                                            n_required_output=n_rollout)):
                            if not rollout.is_finished:
                                continue
                            for _ in range(rollout.num):
                                output = sample.clone()
                                output["ground_truth"] = rollout.output
                                output.mark_as_supervision("ground_truth")
                                output["reward"] = \
                                    torch.tensor(reward(sample.clone(),
                                                        rollout.output))
                                rollouts.append(output)
                if len(rollouts) == 0:
                    logger.warning("No rollout")
                    continue
                if len(rollouts) != n_rollout:
                    logger.warning(
                        "#rollout is unexpected: "
                        f"expected={n_rollout} actual={len(rollouts)}")

                with manager.run_iteration():
                    model.train()
                    with logger.block("collate"):
                        batch2 = collate(rollouts)
                    with logger.block("to"):
                        batch2.to(device)
                    with logger.block("forward"):
                        train_model.train()
                        bloss = train_model(batch2)
                    with logger.block("backward"):
                        optimizer.zero_grad(set_to_none=True)
                        bloss.backward()
                    with logger.block("optimizer.step"):
                        optimizer.step()
                    ppe.reporting.report({"loss": bloss.item()})
                    ppe.reporting.report(
                        {"reward": batch2["reward"].float().mean().item()})
                logger.dump_elapsed_time_log()
                if device.type == "cuda":
                    ppe.reporting.report({
                        "gpu.max_memory_allocated":
                        torch.cuda.max_memory_allocated(device)
                    })
    except RuntimeError as e:  # noqa
        logger.critical(traceback.format_exc())
    save_results(workspace_dir, output_dir, model, optimizer)
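
# --- A hedged aside, not part of the original code: the loop above reduces
# REINFORCE to supervised training on self-sampled outputs, weighting each
# rollout by its reward. A minimal sketch of such a reward-weighted loss;
# `log_probs` and `rewards` are illustrative names, not from the source.
def reinforce_loss(log_probs: torch.Tensor, rewards: torch.Tensor) -> torch.Tensor:
    # log_probs: (N,) summed log-likelihood of each rollout's output
    # rewards:   (N,) scalar reward attached to each rollout
    advantage = rewards - rewards.mean()  # mean baseline reduces variance
    return -(advantage.detach() * log_probs).mean()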
def step_optimizer(
    self,
    optimizer: torch.optim.Optimizer,  # type: ignore
    clip_grads: Optional[Callable[[Iterator], None]] = None,
    auto_zero_grads: bool = True,
) -> None:
    """
    Perform a single optimization step.

    This function must be called once for each optimizer. However, the order
    of different optimizers' steps can be specified by calling this function
    in different orders. Also, gradient accumulation across iterations is
    performed by the Determined training loop by setting the experiment
    configuration field
    :ref:`optimizations.aggregation_frequency <config-aggregation-frequency>`.

    Here is a code example:

    .. code-block:: python

        def clip_grads(params):
            torch.nn.utils.clip_grad_norm_(params, 0.0001)

        self.context.step_optimizer(self.opt1, clip_grads)

    Arguments:
        optimizer(``torch.optim.Optimizer``): Which optimizer should be stepped.
        clip_grads(a function, optional): This function should have one argument
            for parameters in order to clip the gradients.
        auto_zero_grads(bool, optional): Automatically zero out gradients after
            stepping the optimizer. If false, you need to call
            ``optimizer.zero_grad()`` manually. Note that if
            :ref:`optimizations.aggregation_frequency <config-aggregation-frequency>`
            is greater than 1, ``auto_zero_grads`` must be true.
    """
    check.true(
        auto_zero_grads or self.hvd_config.aggregation_frequency == 1,
        "if optimizations.aggregation_frequency is larger than 1, "
        "you can only set auto_zero_grads to be true. ",
    )

    if self._should_communicate_and_update():
        # Communication needs to be synchronized so that it is completed
        # before we apply gradient clipping and `step()`.
        if self.hvd_config.use and not self._use_amp:
            optimizer.synchronize()

        parameters = (
            [p for group in optimizer.param_groups for p in group.get("params", [])]
            if not self._use_amp
            else apex.amp.master_params(optimizer)
        )

        if self.hvd_config.average_aggregated_gradients:
            self._average_gradients(
                parameters=parameters,
                divisor=self.hvd_config.aggregation_frequency
            )

        if clip_grads is not None:
            clip_grads(parameters)

        if self.hvd_config.use:
            with optimizer.skip_synchronize():
                optimizer.step()
        else:
            optimizer.step()

        if auto_zero_grads:
            optimizer.zero_grad()
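
# --- Hedged usage sketch for ``step_optimizer`` above, loosely following the
# Determined PyTorchTrial API. The trial method body, ``self.opt1``, and the
# clipping threshold of 1.0 are illustrative assumptions, not source code.
def train_batch(self, batch, epoch_idx, batch_idx):
    loss = self.model(batch)
    self.context.backward(loss)
    self.context.step_optimizer(
        self.opt1,
        clip_grads=lambda params: torch.nn.utils.clip_grad_norm_(params, 1.0),
    )
    return {"loss": loss}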
def train_epoch(
    model: Model,
    graph: Graph,
    optimizer: torch.optim.Optimizer,
    config: Config,
    train_trajectories: Trajectories,
    pairwise_node_features: torch.Tensor,
):
    """One epoch of training."""
    model.train()

    print_cum_loss = 0.0
    print_num_preds = 0
    print_time = time.time()
    print_every = len(
        train_trajectories) // config.batch_size // config.print_per_epoch

    trajectories_shuffle_indices = np.arange(len(train_trajectories))
    if config.shuffle_samples:
        np.random.shuffle(trajectories_shuffle_indices)

    for iteration, batch_start in enumerate(
            range(0,
                  len(trajectories_shuffle_indices) - config.batch_size + 1,
                  config.batch_size)):
        optimizer.zero_grad()
        loss = torch.tensor(0.0, device=config.device)

        for i in range(batch_start, batch_start + config.batch_size):
            trajectory_idx = trajectories_shuffle_indices[i]
            observations = train_trajectories[trajectory_idx]
            length = train_trajectories.lengths[trajectory_idx]

            number_steps = None
            if config.rw_edge_weight_see_number_step or config.rw_expected_steps:
                if config.use_shortest_path_distance:
                    number_steps = (train_trajectories.leg_shortest_lengths(
                        trajectory_idx).float() * 1.1).long()
                else:
                    number_steps = train_trajectories.leg_lengths(
                        trajectory_idx)

            observed, starts, targets = generate_masks(
                trajectory_length=observations.shape[0],
                number_observations=config.number_observations,
                predict=config.target_prediction,
                with_interpolation=config.with_interpolation,
                device=config.device,
            )

            diffusion_graph = (graph if not config.diffusion_self_loops else
                               graph.add_self_loops())

            predictions, potentials, rw_weights = model(
                observations,
                graph,
                diffusion_graph,
                observed=observed,
                starts=starts,
                targets=targets,
                pairwise_node_features=pairwise_node_features,
                number_steps=number_steps,
            )

            print_num_preds += starts.shape[0]

            # Per-trajectory loss, normalized by the number of predictions.
            trajectory_loss = (compute_loss(
                config.loss,
                train_trajectories,
                observations,
                predictions,
                starts,
                targets,
                rw_weights,
                trajectory_idx,
            ) / starts.shape[0])
            loss += trajectory_loss

        loss /= config.batch_size
        print_cum_loss += loss.item()
        loss.backward()
        optimizer.step()

        if (iteration + 1) % print_every == 0:
            print_loss = print_cum_loss / print_every
            print_loss /= print_num_preds
            pred_per_second = 1.0 * print_num_preds / (time.time() - print_time)

            print_cum_loss = 0.0
            print_num_preds = 0
            print_time = time.time()

            progress_percent = int(
                100.0 * ((iteration + 1) // print_every) / config.print_per_epoch)

            print(f"Progress {progress_percent}% | iter {iteration} | "
                  f"{pred_per_second:.1f} pred/s | loss {config.loss} {print_loss}")
def train_supervised(workspace_dir: str, output_dir: str,
                     dataset: torch.utils.data.Dataset,
                     model: nn.Module,
                     optimizer: torch.optim.Optimizer,
                     loss: Callable[[Any], torch.Tensor],
                     evaluate: Optional[Callable[[], None]], metric: str,
                     collate: Callable[[List[Any]], Any],
                     batch_size: int,
                     length: Length,
                     evaluation_interval: Optional[Length] = None,
                     snapshot_interval: Optional[Length] = None,
                     maximize: bool = True,
                     threshold: Optional[float] = None,
                     n_dataloader_worker: int = 1,
                     device: torch.device = torch.device("cpu")) \
        -> None:
    os.makedirs(workspace_dir, exist_ok=True)

    logger.info("Prepare model")
    model.to(device)
    model.train()

    group = get_world_process_group(device)
    global_batch_size = batch_size * distributed.size(group)

    if hasattr(dataset, "__len__"):
        iter_per_epoch = len(dataset) // global_batch_size
    else:
        iter_per_epoch = 1
    evaluation_interval = evaluation_interval or Epoch(1)
    snapshot_interval = snapshot_interval or Epoch(1)
    n_iter = length.n_iter(iter_per_epoch)
    evaluation_interval_iter = evaluation_interval.n_iter(iter_per_epoch)
    snapshot_interval_iter = snapshot_interval.n_iter(iter_per_epoch)

    # Initialize extensions manager
    manager = \
        create_extensions_manager(
            n_iter, evaluation_interval_iter, snapshot_interval_iter,
            iter_per_epoch,
            model, optimizer, evaluate, metric, maximize, threshold,
            workspace_dir)

    train_model = setup_distributed_training(model, loss, group)

    logger.info("Start training")
    try:
        while manager.iteration < n_iter:
            loader = create_dataloader(dataset, batch_size,
                                       n_dataloader_worker, collate)
            for batch in logger.iterable_block("iteration", loader, True):
                if manager.iteration >= n_iter:
                    break
                if len(batch.to_dict()) == 0:
                    logger.warning(f"Skip {manager.iteration} th batch")
                    continue
                with manager.run_iteration():
                    train_model.train()
                    with logger.block("to"):
                        batch.to(device=device)
                    with logger.block("forward"):
                        bloss = train_model(batch)
                    with logger.block("backward"):
                        optimizer.zero_grad(set_to_none=True)
                        bloss.backward()
                    with logger.block("optimizer.step"):
                        optimizer.step()
                    ppe.reporting.report({"loss": bloss.item()})
                logger.dump_elapsed_time_log()
                if device.type == "cuda":
                    ppe.reporting.report({
                        "gpu.max_memory_allocated":
                        torch.cuda.max_memory_allocated(device)
                    })
    except RuntimeError as e:  # noqa
        logger.critical(traceback.format_exc())
    save_results(workspace_dir, output_dir, model, optimizer)
def train(
    dataset: torch.utils.data.Dataset,
    autoencoder: torch.nn.Module,
    epochs: int,
    batch_size: int,
    optimizer: torch.optim.Optimizer,
    scheduler: Any = None,
    num_workers: Optional[int] = 0,
    validation: Optional[torch.utils.data.Dataset] = None,
    corruption: Optional[float] = None,
    cuda: bool = True,
    sampler: Optional[torch.utils.data.sampler.Sampler] = None,
    silent: bool = False,
    update_freq: Optional[int] = 1,
    update_callback: Optional[Callable[[int, float, float, float], None]] = None,
    epoch_callback: Optional[Callable[[int, torch.nn.Module], None]] = None
) -> None:
    """
    Function to train an autoencoder using the provided dataset. If the dataset
    consists of 2-tuples or lists of (feature, prediction), then the prediction
    is stripped away.

    :param dataset: training Dataset
    :param autoencoder: autoencoder to train
    :param epochs: number of training epochs
    :param batch_size: batch size for training
    :param optimizer: optimizer to use
    :param scheduler: scheduler to use, or None to disable, defaults to None
    :param num_workers: number of workers to use in DataLoader
    :param validation: instance of Dataset to use for validation, set to None
        to disable, defaults to None
    :param corruption: proportion of masking corruption to apply, set to None
        to disable, defaults to None
    :param cuda: whether CUDA is used, defaults to True
    :param sampler: sampler to use in the DataLoader, set to None to disable,
        defaults to None
    :param silent: set to True to prevent printing out summary statistics,
        defaults to False
    :param update_freq: frequency of epochs with which to run the validation
        update, set to None to disable, default 1
    :param update_callback: optional function called with
        (epoch, lr, loss, validation_loss) after each update
    :param epoch_callback: optional function of epoch and model
    :return: None
    """
    dataloader = DataLoader(dataset,
                            num_workers=num_workers,
                            batch_size=batch_size,
                            pin_memory=False,
                            sampler=sampler,
                            shuffle=True)
    if validation is not None:
        validation_loader = DataLoader(validation,
                                       num_workers=num_workers,
                                       batch_size=batch_size,
                                       pin_memory=False,
                                       sampler=None,
                                       shuffle=False)
    else:
        validation_loader = None
    loss_function = nn.MSELoss()
    autoencoder.train()
    validation_loss_value = -1
    loss_value = 0
    for epoch in range(epochs):
        if scheduler is not None:
            scheduler.step()
        data_iterator = tqdm(
            dataloader,
            leave=True,
            unit='batch',
            postfix={
                'epo': epoch,
                'lss': '%.6f' % 0.0,
                'vls': '%.6f' % -1,
            },
            disable=silent,
        )
        for index, batch in enumerate(data_iterator):
            # unpack the batch if it consists of a (feature, prediction) tuple or list
            if isinstance(batch, (tuple, list)) and len(batch) == 2:
                batch, _ = batch  # if we have a prediction label, strip it away
            elif isinstance(batch, (tuple, list)) and len(batch) == 1:
                batch = batch[0]
            if cuda:
                batch = batch.cuda(non_blocking=True)
            batch = batch.squeeze(1).view(batch.size(0), -1)
            # run the batch through the autoencoder and obtain the output
            if corruption is not None:
                output = autoencoder(F.dropout(batch, corruption))
            else:
                output = autoencoder(batch)
            loss = loss_function(output, batch)
            # accuracy = pretrain_accuracy(output, batch)
            loss_value = float(loss.item())
            optimizer.zero_grad()
            loss.backward()
            optimizer.step(closure=None)
            data_iterator.set_postfix(
                epo=epoch,
                lss='%.6f' % loss_value,
                vls='%.6f' % validation_loss_value,
            )
        if update_freq is not None and epoch % update_freq == 0:
            if validation_loader is not None:
                validation_output = predict(validation,
                                            autoencoder,
                                            batch_size,
                                            cuda=cuda,
                                            silent=True,
                                            encode=False,
                                            num_workers=num_workers)
                validation_inputs = []
                for val_batch in validation_loader:
                    # strip any prediction labels, as in the training loop above
                    if (isinstance(val_batch, (tuple, list))
                            and len(val_batch) in (1, 2)):
                        validation_inputs.append(val_batch[0])
                    else:
                        validation_inputs.append(val_batch)
                validation_actual = torch.cat(validation_inputs)
                if cuda:
                    validation_actual = validation_actual.cuda(non_blocking=True)
                    validation_output = validation_output.cuda(non_blocking=True)
                validation_loss = loss_function(validation_output,
                                                validation_actual)
                # validation_accuracy = pretrain_accuracy(validation_output, validation_actual)
                validation_loss_value = float(validation_loss.item())
                data_iterator.set_postfix(
                    epo=epoch,
                    lss='%.6f' % loss_value,
                    vls='%.6f' % validation_loss_value,
                )
                autoencoder.train()
            else:
                validation_loss_value = -1
                # validation_accuracy = -1
                data_iterator.set_postfix(
                    epo=epoch,
                    lss='%.6f' % loss_value,
                    vls='%.6f' % -1,
                )
            if update_callback is not None:
                update_callback(epoch, optimizer.param_groups[0]['lr'],
                                loss_value, validation_loss_value)
        if epoch_callback is not None:
            autoencoder.eval()
            epoch_callback(epoch, autoencoder)
            autoencoder.train()
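
# --- A minimal, hedged usage sketch for the autoencoder ``train`` function
# above. The TensorDataset, the tiny linear autoencoder, and all
# hyperparameters are illustrative assumptions, not part of the original code.
import torch
from torch import nn
from torch.utils.data import TensorDataset


class TinyAutoencoder(nn.Module):
    def __init__(self, dim: int = 64, hidden: int = 8):
        super().__init__()
        self.encoder = nn.Linear(dim, hidden)
        self.decoder = nn.Linear(hidden, dim)

    def forward(self, x):
        return self.decoder(torch.relu(self.encoder(x)))


features = torch.randn(256, 1, 64)  # squeezed and flattened inside train()
ae = TinyAutoencoder()
train(
    TensorDataset(features),
    ae,
    epochs=5,
    batch_size=32,
    optimizer=torch.optim.Adam(ae.parameters(), lr=1e-3),
    corruption=0.2,  # masks 20% of inputs, giving a denoising autoencoder
    cuda=False,
)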
def training(
    model: Module,
    data_iterator: Iterator,
    optimiser: torch.optim.Optimizer,
    scheduler,
    writer: ImageWriterMixin,
    interrupted_path: Path,
    *,
    num_updates: int = 2500000,
    early_stop_threshold: Number = 1e-9,
) -> Module:
    """
    :param model: model to train
    :param data_iterator: iterator yielding (rgb_imgs, ...) batches
    :param optimiser: optimiser to step
    :param scheduler: learning rate scheduler
    :param writer: writer for scalars and images
    :param interrupted_path: where to save the model on exit
    :param num_updates: maximum number of updates
    :param early_stop_threshold: stop once the loss falls below this value
    :return: the model loaded with the best weights found
    """
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = 1e10
    since = time.time()

    masker = StochasticMaskGenerator(4, 0.8)

    try:
        sess = tqdm(range(num_updates), leave=False, disable=False)
        for update_i in sess:
            for phase in [SplitEnum.training, SplitEnum.validation]:
                if phase == SplitEnum.training:
                    for param_group in optimiser.param_groups:
                        writer.scalar("lr", param_group["lr"], update_i)
                    model.train()
                else:
                    model.eval()

                rgb_imgs, *_ = next(data_iterator)

                optimiser.zero_grad()
                with torch.set_grad_enabled(phase == SplitEnum.training):
                    model_input = masker(rgb_imgs)
                    recon_pred, *_ = model(torch.clamp(model_input, 0.0, 1.0))
                    ret = criterion(recon_pred, rgb_imgs)

                    if phase == SplitEnum.training:
                        ret.backward()
                        optimiser.step()
                        scheduler.step()

                update_loss = ret.data.cpu().numpy()
                writer.scalar("loss/accum", update_loss, update_i)

                if phase == SplitEnum.validation and update_loss < best_loss:
                    best_loss = update_loss
                    best_model_wts = copy.deepcopy(model.state_dict())
                    _format = "NCHW"
                    writer.image("model_input", model_input, update_i,
                                 data_formats=_format)
                    writer.image("rgb_imgs", rgb_imgs, update_i,
                                 data_formats=_format)
                    writer.image("recon_pred", recon_pred, update_i,
                                 data_formats=_format)
                    sess.write(f"New best model at update {update_i}")

            sess.set_description_str(
                f"Update {update_i} - {phase} accum_loss:{update_loss:2f}")

            if update_loss < early_stop_threshold:
                break
    except KeyboardInterrupt:
        print("Interrupt")
    finally:
        model.load_state_dict(best_model_wts)  # load best model weights
        torch.save(model.state_dict(), interrupted_path)

    time_elapsed = time.time() - since
    print(f"{time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s")
    print(f"Best val loss: {best_loss}")

    return model
def attach(self, optimizer: torch.optim.Optimizer):
    r"""
    Attaches the privacy engine to the optimizer.

    Attaches the optimizer object to the ``PrivacyEngine``, and injects itself
    into the optimizer's step. To do so, it:

    1. Validates that the model does not have unsupported layers.

    2. Adds a pointer to this object (the ``PrivacyEngine``) inside the optimizer.

    3. Moves the optimizer's original ``step()`` function to ``original_step()``.

    4. Monkeypatches the optimizer's ``step()`` function to call ``step()`` on
       the query engine automatically whenever it would call ``step()`` for itself.

    Parameters
    ----------
    optimizer : torch.optim.Optimizer
        The optimizer to which the privacy engine will attach
    """
    self.validator.validate(self.module)
    norm_clipper = (
        # pyre-fixme[6]: Expected `float` for 1st param but got
        #  `Union[List[float], float]`.
        clipping.ConstantFlatClipper(self.max_grad_norm)
        if not isinstance(self.max_grad_norm, list)
        # pyre-fixme[6]: Expected `List[float]` for 1st param but got
        #  `Union[List[float], float]`.
        else clipping.ConstantPerLayerClipper(self.max_grad_norm))

    if self.misc_settings.get("experimental", False):
        norm_clipper = clipping._Dynamic_Clipper_(
            # pyre-fixme[6]: Expected `List[float]` for 1st param but got
            #  `List[Union[List[float], float]]`.
            [self.max_grad_norm],
            self.misc_settings.get("clip_per_layer", False),
            self.misc_settings.get("clipping_method",
                                   clipping.ClippingMethod.STATIC),
            self.misc_settings.get("ratio", 0.0),
        )

    self.clipper = PerSampleGradientClipper(self.module, norm_clipper,
                                            self.batch_first)

    def dp_step(self, closure=None):
        self.privacy_engine.step()
        self.original_step(closure)

    # Pyre doesn't like monkeypatching. But we'll do it anyway :)
    optimizer.privacy_engine = self  # pyre-ignore
    optimizer.original_step = optimizer.step  # pyre-ignore
    optimizer.step = types.MethodType(dp_step, optimizer)  # pyre-ignore

    def virtual_step(self):
        self.privacy_engine.virtual_step()

    # pyre-ignore
    optimizer.virtual_step = types.MethodType(virtual_step, optimizer)

    # create a cross reference for detaching
    self.optimizer = optimizer  # pyre-ignore
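
# --- The monkeypatch pattern used by ``attach`` above, shown in isolation.
# A hedged, self-contained sketch; the print call stands in for the privacy
# engine's per-step bookkeeping and is an illustrative assumption.
import types

import torch


def attach_pre_step_hook(optimizer: torch.optim.Optimizer) -> None:
    def hooked_step(self, closure=None):
        print("pre-step hook")  # privacy_engine.step() would run here
        return self.original_step(closure)

    optimizer.original_step = optimizer.step  # keep a handle on the original
    optimizer.step = types.MethodType(hooked_step, optimizer)  # patch in place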
def train(
    num_iters: int,
    loader: torch.utils.data.DataLoader,  # type: ignore
    model: Parser,
    optimizer: torch.optim.Optimizer,
    label_vocab: List[Label],
    cfg: DictConfig,
) -> Tuple[int, float, float]:
    "Train the model for one epoch"
    model.train()

    env = Environment(loader, model.encoder, cfg.subbatch_max_tokens)
    optimizer.zero_grad()
    state = env.reset()
    device, _ = get_device()
    loss = torch.tensor(0.0, device=device)

    # stats
    losses = [0.0]
    num_examples = 0
    num_correct_actions = 0
    num_total_actions = 0
    time_start = time()

    # Each batch is divided into multiple subbatches (for saving GPU memory).
    # Accumulate gradients calculated from subbatches and perform a single
    # optimization step for a batch (not subbatch).
    while True:
        actions, logits = model(state)  # action generation from partial trees

        if cfg.decoder == "graph":
            # for the graph-based decoder, actions: List[Action] are actions
            # at the current step for a subbatch
            gt_actions = env.gt_actions()
            loss += action_loss(logits, gt_actions, label_vocab, cfg.batch_size)
            correct, total = count_actions(actions, gt_actions)
            num_correct_actions += correct
            num_total_actions += total

            state, done = env.step(gt_actions)  # teacher forcing
            if done:  # a subbatch is finished
                num_examples += len(env.pred_trees)
            else:
                continue

        else:
            # for the sequence-based decoder, actions: List[List[Action]] are
            # action sequences for all steps
            assert cfg.decoder == "sequence"
            all_gt_actions = env.gt_action_seqs()
            loss = action_seqs_loss(logits, all_gt_actions, label_vocab,
                                    cfg.batch_size)
            correct, total = count_actions(actions, all_gt_actions)
            num_correct_actions += correct
            num_total_actions += total
            num_examples += len(all_gt_actions)  # a subbatch is finished

        losses[-1] += loss.item()
        loss.backward()  # type: ignore
        loss = 0  # type: ignore

        if num_examples % cfg.batch_size == 0:  # a full batch is finished
            if num_iters <= cfg.learning_rate_warmup_steps:
                adjust_lr(num_iters, optimizer, cfg)
            torch.nn.utils.clip_grad_norm_(model.parameters(),
                                           cfg.max_grad_norm)
            optimizer.step()
            optimizer.zero_grad()
            losses.append(0)
            num_iters += 1

            try:
                state = env.reset(force=True)  # load a new batch
            except EpochEnd:
                accuracy = 100 * num_correct_actions / num_total_actions
                return num_iters, accuracy, np.mean(losses)

            # log training stats
            if (num_examples / cfg.batch_size) % cfg.log_freq == 0:
                recent_loss = np.mean(losses[-cfg.log_freq:])
                running_accuracy = 100 * num_correct_actions / num_total_actions
                log.info(
                    "[%d] Loss: %.03f, Running accuracy: %.03f, Time: %.02f" % (
                        num_examples,
                        recent_loss,
                        running_accuracy,
                        time() - time_start,
                    ))
                time_start = time()
def train_single_epoch(train_loader: object,
                       model: torch.nn.Module,
                       criterion: object,
                       optimizer: torch.optim.Optimizer,
                       epoch: int,
                       clip_var: float,
                       total_steps: int,
                       print_freq: int,
                       writer: object,
                       thres_stds: tuple = (),
                       shape: list = None) -> int:
    batch_time = AverageMeter()
    losses = AverageMeter()
    top1 = AverageMeter()

    model.train()

    end = time.time()
    ema_loss, steps = 0, 0
    for i, (input_, target) in enumerate(train_loader):
        steps += 1
        total_steps += 1
        if torch.cuda.is_available():
            # `async=True` is a syntax error on Python >= 3.7; use non_blocking.
            target = target.cuda(non_blocking=True)
            input_ = input_.cuda()
        if shape is None:
            input_var = torch.autograd.Variable(input_)
        else:
            input_var = torch.autograd.Variable(input_.view(shape))
        target_var = torch.autograd.Variable(target)

        if isinstance(optimizer, VProp):
            # Calculate noisy loss
            optimizer.add_noise_to_parameters()
            output = model(input_var)
            loss = criterion(output, target_var, model)

            # Do an update
            optimizer.zero_grad()
            loss.backward()
            optimizer.remove_noise_from_parameters()
            optimizer.step()

            # Calculate clean loss to update metrics
            output = model(input_var)
            loss = criterion(output, target_var, model)
        else:
            optimizer.zero_grad()
            output = model(input_var)
            if isinstance(criterion, torch.nn.CrossEntropyLoss):
                loss = criterion(output, target_var)
            else:
                loss = criterion(output, target_var, model)
            loss.backward()
            if isinstance(optimizer, torch.optim.LBFGS):
                # Note: this closure returns the already-computed loss without
                # re-evaluating the model, so LBFGS cannot run its usual
                # multi-evaluation line search (see the sketch below).
                def closure():
                    return loss

                optimizer.step(closure)
            else:
                optimizer.step()

        prec1 = accuracy(output.data, target, topk=(1, ))[0]
        losses.update(loss.item(), input_.size(0))
        top1.update(prec1, input_.size(0))

        if clip_var:
            for k, layer in enumerate(model.layers):
                layer.constrain_parameters(thres_std=thres_stds[k])

        if isinstance(model, torch.nn.DataParallel):
            if model.module.beta_ema > 0.:
                model.module.update_ema()
        else:
            if model.beta_ema > 0.:
                model.update_ema()

        batch_time.update(time.time() - end)
        end = time.time()

        if i % print_freq == 0:
            print(f' Epoch: [{epoch}][{i}/{len(train_loader)}]\t' +
                  f'Time {batch_time.val:.3f} ({batch_time.avg:.3f})\t' +
                  f'Loss {losses.val:.4f} ({losses.avg:.4f})\t' +
                  f'Prec@1 {top1.val:.3f} ({top1.avg:.3f})')

    if writer is not None:
        writer.add_scalar('train/loss', losses.avg, epoch)
        writer.add_scalar('train/acc', top1.avg, epoch)
    return total_steps
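
# --- For reference (a hedged sketch, not from the source): torch.optim.LBFGS
# expects a closure that re-evaluates the model so it can run its internal
# line search. The factory below builds such a closure from the names used in
# the loop above.
def make_lbfgs_closure(optimizer, model, criterion, input_var, target_var):
    def closure():
        optimizer.zero_grad()
        loss = criterion(model(input_var), target_var)
        loss.backward()
        return loss

    return closure

# usage: optimizer.step(make_lbfgs_closure(optimizer, model, criterion,
#                                          input_var, target_var))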
def train_model(epoch: int, conf: Dict, opt, model: Model,
                optimizer: torch.optim.Optimizer,
                scheduler: LearningRateScheduler,
                train_batch: BatcherBase,
                valid_batch: BatcherBase,
                test_batch: BatcherBase,
                best_train: float, best_valid: float, test_ppl: float):
    model.train()

    total_loss, total_fwd_loss, total_bwd_loss, total_tag = 0., 0., 0., 0.
    step = 0
    start_time = time.time()

    improved = False
    warmup_step = conf['optimizer']['warmup_step']
    scheduler_name = conf['optimizer'].get('scheduler', 'cosine')
    add_sentence_boundary_ids = conf['token_embedder'].get(
        'add_sentence_boundary_ids', False)

    for word_inputs, char_inputs, lengths, texts, targets in train_batch.get():
        if conf['classifier']['name'].lower() in ('window_sampled_cnn_softmax',
                                                  'window_sampled_softmax'):
            negative_sample_targets = []
            vocab = train_batch.vocab_batch
            mapping = train_batch.vocab_batch.mapping
            for words in texts:
                negative_sample_targets.append(mapping.get(vocab.bos))
                negative_sample_targets.extend(
                    [mapping.get(word) for word in words])
                negative_sample_targets.append(mapping.get(vocab.eos))
            model.classify_layer.update_negative_samples(
                negative_sample_targets)

        model.zero_grad()
        forward_loss, backward_loss = model.forward(word_inputs, char_inputs,
                                                    lengths, targets)
        loss = 0.5 * (forward_loss + backward_loss)
        n_tags = lengths.sum().item()
        # Lengths count the sentence boundaries, but the loss doesn't.
        if add_sentence_boundary_ids:
            n_tags -= (lengths.size(0) * 2)
        total_fwd_loss += forward_loss.item()
        total_bwd_loss += backward_loss.item()
        total_loss += loss.item()
        total_tag += n_tags

        if conf['optimizer'].get('fp16', False):
            optimizer.backward(loss)
        else:
            loss.backward()

        if 'clip_grad' in conf['optimizer']:
            if conf['optimizer'].get('fp16', False):
                optimizer.clip_master_grads(conf['optimizer']['clip_grad'])
            else:
                torch.nn.utils.clip_grad_norm_(model.parameters(),
                                               conf['optimizer']['clip_grad'])

        optimizer.step()

        step += 1
        global_step = epoch * train_batch.num_batches() + step

        if scheduler_name in ['cosine', 'constant', 'dev_perf']:
            # linear warmup stage
            if global_step < warmup_step:
                curr_lr = conf['optimizer']['lr'] * global_step / warmup_step
                optimizer.param_groups[0]['lr'] = curr_lr
            if scheduler_name == 'cosine':
                scheduler.step_batch(global_step)
        elif scheduler_name == 'noam':
            scheduler.step_batch(global_step)

        if step % opt.report_steps == 0:
            train_ppl, train_fwd_ppl, train_bwd_ppl = [
                np.exp(loss / total_tag)
                for loss in (total_loss, total_fwd_loss, total_bwd_loss)
            ]
            log_str = "| epoch {:3d} | step {:>6d} | lr {:.3g} | " \
                      "ms/batch {:5.2f} | ppl {:.2f} ({:.2f} {:.2f}) |".format(
                          epoch, step, optimizer.param_groups[0]['lr'],
                          1000 * (time.time() - start_time) / opt.report_steps,
                          train_ppl, train_fwd_ppl, train_bwd_ppl)
            logger.info(log_str)
            start_time = time.time()

        if step % opt.eval_steps == 0 or step % train_batch.num_batches() == 0:
            train_ppl, train_fwd_ppl, train_bwd_ppl = [
                np.exp(loss / total_tag)
                for loss in (total_loss, total_fwd_loss, total_bwd_loss)
            ]
            log_str = "| epoch {:3d} | step {:>6d} | lr {:.3g} | " \
                      "ppl {:.2f} ({:.2f} {:.2f}) |".format(
                          epoch, step, optimizer.param_groups[0]['lr'],
                          train_ppl, train_fwd_ppl, train_bwd_ppl)

            if opt.always_save:
                model.save_model(opt.model, opt.save_classify_layer,
                                 global_step)

            if valid_batch is None:
                if scheduler:
                    scheduler.step(train_ppl, epoch)

                if train_ppl < best_train:
                    best_train = train_ppl
                    log_str += ' NEW |'
                    logger.info(log_str)
                    improved = True

                    if opt.always_save:
                        model.create_symbolic_link(opt.model,
                                                   opt.save_classify_layer,
                                                   global_step)
                    else:
                        model.save_model(opt.model, opt.save_classify_layer)
            else:
                logger.info(log_str)
                if train_ppl < best_train:
                    best_train = train_ppl

                valid_ppl, valid_fwd_ppl, valid_bwd_ppl = eval_model(
                    model, valid_batch)
                log_str = "| epoch {:3d} | step {:>6d} | lr {:.3g} | " \
                          "dev ppl {:.2f} ({:.2f} {:.2f}) |".format(
                              epoch, step, optimizer.param_groups[0]['lr'],
                              valid_ppl, valid_fwd_ppl, valid_bwd_ppl)

                if scheduler:
                    scheduler.step(valid_ppl, epoch)

                if valid_ppl < best_valid:
                    improved = True
                    if opt.always_save:
                        model.create_symbolic_link(opt.model,
                                                   opt.save_classify_layer,
                                                   global_step)
                    else:
                        model.save_model(opt.model, opt.save_classify_layer)
                    best_valid = valid_ppl
                    log_str += ' NEW |'
                    logger.info(log_str)

                    if test_batch is not None:
                        test_ppl, test_fwd_ppl, test_bwd_ppl = eval_model(
                            model, test_batch)
                        log_str = "| epoch {:3d} | step {:>6d} | lr {:.3g} | " \
                                  "test ppl {:.2f} ({:.2f} {:.2f}) |".format(
                                      epoch, step,
                                      optimizer.param_groups[0]['lr'],
                                      test_ppl, test_fwd_ppl, test_bwd_ppl)
                        logger.info(log_str)
                else:
                    logger.info(log_str)

    return best_train, best_valid, test_ppl, improved
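
# --- A hedged aside: the manual warmup above sets
# lr = base_lr * global_step / warmup_step. An equivalent sketch with
# torch.optim.lr_scheduler.LambdaLR (the returned factor multiplies the
# base lr on every step):
import torch


def make_linear_warmup(optimizer: torch.optim.Optimizer, warmup_step: int):
    return torch.optim.lr_scheduler.LambdaLR(
        optimizer,
        lr_lambda=lambda step: min(1.0, step / max(1, warmup_step)))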
def do_epoch(args: argparse.Namespace,
             train_loader: torch.utils.data.DataLoader,
             model: DDP,
             optimizer: torch.optim.Optimizer,
             scheduler: torch.optim.lr_scheduler._LRScheduler,
             epoch: int,
             callback: VisdomLogger,
             iter_per_epoch: int,
             log_iter: int) -> Tuple[torch.Tensor, torch.Tensor]:
    loss_meter = AverageMeter()
    train_losses = torch.zeros(log_iter).to(dist.get_rank())
    train_mIous = torch.zeros(log_iter).to(dist.get_rank())

    iterable_train_loader = iter(train_loader)

    if main_process(args):
        bar = tqdm(range(iter_per_epoch))
    else:
        bar = range(iter_per_epoch)

    for i in bar:
        model.train()
        current_iter = epoch * len(train_loader) + i + 1

        images, gt = next(iterable_train_loader)
        images = images.to(dist.get_rank(), non_blocking=True)
        gt = gt.to(dist.get_rank(), non_blocking=True)

        loss = compute_loss(args=args,
                            model=model,
                            images=images,
                            targets=gt.long(),
                            num_classes=args.num_classes_tr)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if args.scheduler == 'cosine':
            scheduler.step()

        if i % args.log_freq == 0:
            model.eval()
            logits = model(images)
            intersection, union, target = intersectionAndUnionGPU(
                logits.argmax(1), gt, args.num_classes_tr, 255)
            if args.distributed:
                dist.all_reduce(loss)
                dist.all_reduce(intersection)
                dist.all_reduce(union)
                dist.all_reduce(target)

            allAcc = (intersection.sum() / (target.sum() + 1e-10))  # scalar
            mAcc = (intersection / (target + 1e-10)).mean()
            mIoU = (intersection / (union + 1e-10)).mean()
            loss_meter.update(loss.item() / dist.get_world_size())

            if main_process(args):
                if callback is not None:
                    t = current_iter / len(train_loader)
                    callback.scalar('loss_train_batch', t, loss_meter.avg,
                                    title='Loss')
                    callback.scalars(['mIoU', 'mAcc', 'allAcc'], t,
                                     [mIoU, mAcc, allAcc],
                                     title='Training metrics')
                    for index, param_group in enumerate(optimizer.param_groups):
                        lr = param_group['lr']
                        callback.scalar('lr', t, lr, title='Learning rate')
                        break

            train_losses[int(i / args.log_freq)] = loss_meter.avg
            train_mIous[int(i / args.log_freq)] = mIoU

    if args.scheduler != 'cosine':
        scheduler.step()

    return train_mIous, train_losses
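
# --- ``intersectionAndUnionGPU`` is defined elsewhere; a hedged sketch of the
# usual computation behind such a helper: per-class intersection and union
# histograms, from which mIoU = mean(intersection / union). The helper name
# and the histogram approach are assumptions, not the original implementation.
import torch


def intersection_and_union(pred: torch.Tensor, gt: torch.Tensor,
                           num_classes: int, ignore_index: int = 255):
    mask = gt != ignore_index
    pred, gt = pred[mask], gt[mask]
    inter = pred[pred == gt]  # pixels where prediction matches ground truth

    def hist(t: torch.Tensor) -> torch.Tensor:
        return torch.histc(t.float(), bins=num_classes, min=0,
                           max=num_classes - 1)

    area_inter, area_pred, area_gt = hist(inter), hist(pred), hist(gt)
    return area_inter, area_pred + area_gt - area_inter, area_gt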
def train(model: torch.nn.Module,
          train_dl: DataLoader,
          optimizer: torch.optim.Optimizer,
          scheduler: LambdaLR,
          validation_evaluator: ClassificationEvaluator,
          n_epochs: int,
          device: AnyStr,
          log_interval: int = 1,
          patience: int = 10,
          neg_class_weight: float = None,
          model_dir: str = "local",
          split: str = '') -> float:
    best_loss = float('inf')
    patience_counter = 0
    best_f1 = 0.0
    weights_found = False

    loss_fn = torch.nn.CrossEntropyLoss(
        weight=torch.tensor([neg_class_weight, 1.]).to(device))

    # Main loop
    for ep in range(n_epochs):
        # Training loop
        for i, batch in enumerate(tqdm(train_dl)):
            model.train()
            optimizer.zero_grad()
            batch = tuple(t.to(device) for t in batch)
            input_ids = batch[0]
            masks = batch[1]
            labels = batch[2]
            weights = batch[3]

            (logits, ) = model(input_ids, attention_mask=masks)
            loss = loss_fn(logits.view(-1, 2), labels.view(-1))
            # loss = (loss * weights).sum()
            loss = (loss * weights).mean()

            loss.backward()
            optimizer.step()
            scheduler.step()

        gc.collect()

        # Inline evaluation
        (val_loss, acc, P, R, F1), _ = validation_evaluator.evaluate(model)

        # Saving the best model and early stopping
        if F1 > best_f1:
            weights_found = True
            best_model = model.state_dict()
            # best_loss = val_loss
            best_f1 = F1
            torch.save(model.state_dict(), f'{model_dir}/model_{split}.pth')
            patience_counter = 0
        else:
            patience_counter += 1
            # Stop training once we have lost patience
            if patience_counter == patience:
                break

    if not weights_found:
        print("No good weights found, saving weights from last epoch")
        # Save one just in case
        torch.save(model.state_dict(), f'{model_dir}/model_{split}.pth')

    gc.collect()
    return best_f1
def train_epoch(train_loader: torch.utils.data.DataLoader,
                base_model: torch.nn.Module,
                classification_layer: torch.nn.Module,
                forg_layer: torch.nn.Module,
                adv_models: List[torch.nn.Module],
                epoch: int,
                optimizer: torch.optim.Optimizer,
                lr_scheduler: torch.optim.lr_scheduler._LRScheduler,
                callback: Optional[VisdomLogger],
                device: torch.device,
                args: Any):
    """ Trains the network for one epoch

    Parameters
    ----------
    train_loader: torch.utils.data.DataLoader
        Iterable that loads the training set (x, y) tuples
    base_model: torch.nn.Module
        The model architecture that "extracts features" from signatures
    classification_layer: torch.nn.Module
        The classification layer (from features to predictions of which
        user wrote the signature)
    forg_layer: torch.nn.Module
        The forgery prediction layer (from features to predictions of
        whether the signature is a forgery). Only used when args.forg = True
    adv_models: List[torch.nn.Module]
        The models used to craft adversarial examples
    epoch: int
        The current epoch (used for reporting)
    optimizer: torch.optim.Optimizer
        The optimizer (already initialized)
    lr_scheduler: torch.optim.lr_scheduler._LRScheduler
        The learning rate scheduler
    callback: VisdomLogger (optional)
        A callback to report the training progress
    device: torch.device
        The device (CPU or GPU) to use for training
    args: Namespace
        Extra arguments used for training:
        args.forg: bool
            Whether forgeries are being used for training
        args.lamb: float
            The weight used for the forgery loss (training with forgeries only)

    Returns
    -------
    None
    """
    step = 0
    n_steps = len(train_loader)
    adv_model_idx = 0
    for batch in train_loader:
        x, y = batch[0], batch[1]
        x = torch.tensor(x, dtype=torch.float).to(device)
        y = torch.tensor(y, dtype=torch.long).to(device)
        yforg = torch.tensor(batch[2], dtype=torch.float).to(device)

        # Create adversarial example
        adv = create_adversarial(adv_models, adv_model_idx, x, y, args.eps)

        # Clean example
        features = base_model(x)

        if args.forg:
            # Eq (4) in https://arxiv.org/abs/1705.05787
            logits = classification_layer(features[yforg == 0])
            class_loss = F.cross_entropy(logits, y[yforg == 0])

            forg_logits = forg_layer(features).squeeze()
            forg_loss = F.binary_cross_entropy_with_logits(forg_logits, yforg)

            loss = (1 - args.lamb) * class_loss
            loss += args.lamb * forg_loss
        else:
            # Eq (1) in https://arxiv.org/abs/1705.05787
            logits = classification_layer(features)
            loss = class_loss = F.cross_entropy(logits, y)

        # Back propagation
        loss = args.alpha * loss
        optimizer.zero_grad()
        loss.backward()

        # adv example
        adv_features = base_model(adv)
        adv_logits = classification_layer(adv_features)
        adv_loss = F.cross_entropy(adv_logits, y)
        loss2 = (1 - args.alpha) * adv_loss
        loss2.backward()

        torch.nn.utils.clip_grad_value_(optimizer.param_groups[0]['params'], 10)

        # Update weights
        optimizer.step()

        # Logging
        if callback and step % 11 == 0:
            with torch.no_grad():
                pred_clean = logits.argmax(1)
                acc_clean = y[yforg == 0].eq(pred_clean).float().mean()

                pred_adv = adv_logits.argmax(1)
                acc_adv = y[yforg == 0].eq(pred_adv).float().mean()

            iteration = epoch + (step / n_steps)
            callback.scalars(
                ['closs_clean', 'closs_adv'], iteration,
                [class_loss.detach(), adv_loss.detach()])
            callback.scalar('closs_adv_{}'.format(adv_model_idx), iteration,
                            adv_loss.detach())
            callback.scalars(['acc_clean', 'acc_adv'],
                             epoch + (step / n_steps),
                             [acc_clean, acc_adv.detach()])

            if args.forg:
                forg_pred = forg_logits > 0
                forg_acc = yforg.long().eq(forg_pred.long()).float().mean()
                callback.scalar('forg_loss', iteration, forg_loss.detach())
                callback.scalar('forg_acc', iteration, forg_acc.detach())

        step += 1
        adv_model_idx = (adv_model_idx + 1) % len(adv_models)

    lr_scheduler.step()
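
# --- ``create_adversarial`` is defined elsewhere; a hedged FGSM-style sketch
# of what such a helper might do (a sign-of-gradient perturbation bounded by
# eps). The function name and its single-model signature are assumptions.
import torch
import torch.nn.functional as F


def fgsm_example(model: torch.nn.Module, x: torch.Tensor, y: torch.Tensor,
                 eps: float) -> torch.Tensor:
    x_adv = x.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(x_adv), y)
    grad, = torch.autograd.grad(loss, x_adv)
    return (x_adv + eps * grad.sign()).detach()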
def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module,
                    data_loader: Iterable, optimizer: torch.optim.Optimizer,
                    device: torch.device, epoch: int, max_norm: float = 0):
    model.train()
    criterion.train()
    metric_logger = utils.MetricLogger(delimiter="  ")
    metric_logger.add_meter(
        'lr', utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
    metric_logger.add_meter(
        'class_error', utils.SmoothedValue(window_size=1, fmt='{value:.2f}'))
    header = 'Epoch: [{}]'.format(epoch)
    print_freq = 10

    for samples, targets in metric_logger.log_every(data_loader, print_freq,
                                                    header):
        samples = samples.to(device)
        targets = [{k: v.to(device) for k, v in t.items()} for t in targets]

        outputs = model(samples)
        loss_dict = criterion(outputs, targets)
        weight_dict = criterion.weight_dict
        losses = sum(loss_dict[k] * weight_dict[k]
                     for k in loss_dict.keys() if k in weight_dict)

        # reduce losses over all GPUs for logging purposes
        loss_dict_reduced = utils.reduce_dict(loss_dict)
        loss_dict_reduced_unscaled = {
            f'{k}_unscaled': v
            for k, v in loss_dict_reduced.items()
        }
        loss_dict_reduced_scaled = {
            k: v * weight_dict[k]
            for k, v in loss_dict_reduced.items() if k in weight_dict
        }
        losses_reduced_scaled = sum(loss_dict_reduced_scaled.values())

        loss_value = losses_reduced_scaled.item()

        if not math.isfinite(loss_value):
            print("Loss is {}, stopping training".format(loss_value))
            print(loss_dict_reduced)
            sys.exit(1)

        optimizer.zero_grad()
        losses.backward()
        if max_norm > 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
        optimizer.step()

        metric_logger.update(loss=loss_value,
                             **loss_dict_reduced_scaled,
                             **loss_dict_reduced_unscaled)
        metric_logger.update(class_error=loss_dict_reduced['class_error'])
        metric_logger.update(lr=optimizer.param_groups[0]["lr"])

    # gather the stats from all processes
    metric_logger.synchronize_between_processes()
    print("Averaged stats:", metric_logger)
    return {k: meter.global_avg for k, meter in metric_logger.meters.items()}
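
# --- ``utils.reduce_dict`` averages each logged scalar across processes so
# every rank reports the same numbers; a hedged sketch of such a helper under
# torch.distributed. The function name is illustrative, not the original.
import torch
import torch.distributed as dist


def reduce_dict_sketch(input_dict, average: bool = True):
    if not dist.is_available() or not dist.is_initialized():
        return input_dict
    world_size = dist.get_world_size()
    if world_size < 2:
        return input_dict
    names = sorted(input_dict.keys())  # fixed order so all ranks agree
    values = torch.stack([input_dict[k] for k in names])
    dist.all_reduce(values)  # sum across ranks
    if average:
        values /= world_size
    return dict(zip(names, values))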
def train(loss_calculator: torch.nn.Module,
          data_loader,
          optimizer: torch.optim.Optimizer,
          epoch: int,
          report_interval: int = 1000,
          loss_types: Union[str, Sequence[str]] = 'mse',
          ref_channel: int = 0,
          loss_weight: Union[float, Sequence[float]] = 1.0,
          grad_clip: Optional[float] = None):
    assert check_argument_types()
    if isinstance(loss_types, str):
        loss_types = [loss_types]
    if isinstance(loss_weight, float):
        loss_weight = \
            [loss_weight] + [0 for _ in range(len(loss_types) - 1)]
    if len(loss_types) != len(loss_weight):
        raise RuntimeError(
            f'Mismatch: {len(loss_types)} != {len(loss_weight)}')

    reporter = Reporter()
    loss_calculator.train()

    miss_count = 0
    for ibatch, (_, xs, ts, ilens) in enumerate(data_loader):
        # xs: (B, C, T, F), ts: (B, C, T, F), ilens: (B,)
        optimizer.zero_grad()

        try:
            loss_dict = loss_calculator(xs, ts, ilens,
                                        loss_types=loss_types,
                                        ref_channel=ref_channel)
        except RuntimeError as e:
            # If inverse() failed in wpe
            if str(e).startswith('inverse_cuda: For batch'):
                global_logger.warning('Skipping this step. ' + str(e))
                miss_count += 1
                continue
            raise

        sloss = 0
        for iloss, (loss_type, loss) in enumerate(loss_dict.items()):
            # Averaging between each gpu devices
            loss = loss.mean()
            reporter[loss_type] = loss.item()
            if loss_weight[iloss] != 0:
                sloss += loss_weight[iloss] * loss
            else:
                sloss += loss
        sloss.backward()

        if grad_clip is not None:
            torch.nn.utils.clip_grad_norm_(loss_calculator.parameters(),
                                           grad_clip)

        if check_gradient(loss_calculator):
            optimizer.step()
        else:
            global_logger.warning('The gradient diverged. Skipping update.')

        if (ibatch + 1) % report_interval == 0:
            reporter.report(f'Train {epoch}epoch {ibatch + 1}: ',
                            nhistory=report_interval - miss_count)
            miss_count = 0
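
# --- ``check_gradient`` is defined elsewhere; a hedged sketch of a finiteness
# check that would justify skipping the update when gradients diverge. The
# function name is illustrative, not the original implementation.
import torch


def gradients_are_finite(module: torch.nn.Module) -> bool:
    return all(torch.isfinite(p.grad).all()
               for p in module.parameters() if p.grad is not None)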