def train(self):
    """Run a single training epoch of ``num_iterations`` batches.

    Lazily initializes the engine on the first call, advances the epoch
    counter, then for each iteration: fetch batch -> forward -> per-loss
    evaluation -> backward (through the AMP grad scaler when one is
    configured) -> optimizer step. Each phase is timed into ``time_dict``
    and per-iteration stats are pushed to the registered monitors.
    """
    if not self._state["initialized"]:
        self.init_train()
    self._state["initialized"] = True
    # advance epoch counter; epoch is 0-based after the first call's += 1
    self._state["epoch"] += 1
    epoch = self._state["epoch"]
    num_iterations = self._hyper_params["num_iterations"]
    # update engine_state so monitors can read progress bounds
    self._state["max_epoch"] = self._hyper_params["max_epoch"]
    self._state["max_iteration"] = num_iterations
    self._optimizer.modify_grad(epoch)
    pbar = tqdm(range(num_iterations))
    self._state["pbar"] = pbar
    self._state["print_str"] = ""
    time_dict = OrderedDict()
    for iteration, _ in enumerate(pbar):
        self._state["iteration"] = iteration
        with Timer(name="data", output_dict=time_dict):
            training_data = next(self._dataloader)
        # device transfer is intentionally outside the "data" timer
        training_data = move_data_to_device(training_data,
                                            self._state["devices"][0])
        schedule_info = self._optimizer.schedule(epoch, iteration)
        self._optimizer.zero_grad()
        # forward propagation (loss evaluation included in "fwd" timing)
        with Timer(name="fwd", output_dict=time_dict):
            predict_data = self._model(training_data)
            training_losses, extras = OrderedDict(), OrderedDict()
            # each loss module returns (loss_value, extra_info)
            for loss_name, loss in self._losses.items():
                training_losses[loss_name], extras[loss_name] = loss(
                    predict_data, training_data)
            total_loss = sum(training_losses.values())
        # backward propagation
        with Timer(name="bwd", output_dict=time_dict):
            if self._optimizer.grad_scaler is not None:
                # AMP path: scale the loss before backward
                self._optimizer.grad_scaler.scale(total_loss).backward()
            else:
                total_loss.backward()
        # NOTE(review): when the grad scaler is active, gradients are still
        # scaled at this point — presumably modify_grad / the optimizer
        # wrapper handles unscaling; confirm against the optimizer class.
        self._optimizer.modify_grad(epoch, iteration)
        with Timer(name="optim", output_dict=time_dict):
            self._optimizer.step()
        trainer_data = dict(
            schedule_info=schedule_info,
            training_losses=training_losses,
            extras=extras,
            time_dict=time_dict,
        )
        for monitor in self._monitors:
            monitor.update(trainer_data)
        # drop the batch reference before the next allocation
        del training_data
        # monitors may have refreshed print_str via engine state
        print_str = self._state["print_str"]
        pbar.set_description(print_str)
def train(self):
    """Run a single training epoch (distributed variant).

    Same loop as the single-process trainer, except there is no explicit
    ``modify_grad(epoch, iteration)`` after backward: gradient averaging
    is left to the DDP-wrapped model (see TODOs below), and the tqdm bar
    is explicitly deleted at the end so spawned workers can exit.
    """
    if not self._state["initialized"]:
        self.init_train()
    self._state["initialized"] = True
    # epoch counter +1
    self._state["epoch"] += 1
    epoch = self._state["epoch"]
    num_iterations = self._hyper_params["num_iterations"]
    # update engine_state so monitors can read progress bounds
    self._state["max_epoch"] = self._hyper_params["max_epoch"]
    self._state["max_iteration"] = num_iterations
    self._optimizer.modify_grad(epoch)
    # TODO: build stats gathering code and reorganize tqdm
    pbar = tqdm(range(num_iterations))
    # pbar = range(num_iterations)
    self._state["pbar"] = pbar
    self._state["print_str"] = ""
    time_dict = OrderedDict()
    for iteration, _ in enumerate(pbar):
        self._state["iteration"] = iteration
        with Timer(name="data", output_dict=time_dict):
            training_data = next(self._dataloader)
        # device transfer is intentionally outside the "data" timer
        training_data = move_data_to_device(training_data,
                                            self._state["devices"][0])
        schedule_info = self._optimizer.schedule(epoch, iteration)
        self._optimizer.zero_grad()
        # forward propagation (loss evaluation included in "fwd" timing)
        with Timer(name="fwd", output_dict=time_dict):
            predict_data = self._model(training_data)
            training_losses, extras = OrderedDict(), OrderedDict()
            # each loss module returns (loss_value, extra_info)
            for loss_name, loss in self._losses.items():
                training_losses[loss_name], extras[loss_name] = loss(
                    predict_data, training_data)
            total_loss = sum(training_losses.values())
        # backward propagation
        with Timer(name="bwd", output_dict=time_dict):
            total_loss.backward()
        # TODO: No need for average_gradients() when wrapped model with DDP?
        # TODO: need to register _optimizer.modify_grad as hook
        #   see https://discuss.pytorch.org/t/distributeddataparallel-modify-gradient-before-averaging/59291
        # self._optimizer.modify_grad(epoch, iteration)
        with Timer(name="optim", output_dict=time_dict):
            self._optimizer.step()
        trainer_data = dict(
            schedule_info=schedule_info,
            training_losses=training_losses,
            extras=extras,
            time_dict=time_dict,
        )
        for monitor in self._monitors:
            monitor.update(trainer_data)
        # drop the batch reference before the next allocation
        del training_data
        # monitors may have refreshed print_str via engine state
        print_str = self._state["print_str"]
        pbar.set_description(print_str)
    # must be freed explicitly, otherwise multiprocessing spawn gets stuck
    del pbar
def train(self):
    """Run a single training epoch with per-loss weighting.

    Every loss module yields a ``(loss_value, extra_info)`` pair; the
    total objective is the sum of loss values, each scaled by its
    module's ``weight`` hyper-parameter. Phase timings are gathered for
    the monitors and the tqdm bar echoes the engine's print string.
    """
    if not self._state["initialized"]:
        self.init_train()
    self._state["initialized"] = True
    # advance the epoch counter
    self._state["epoch"] += 1
    epoch = self._state["epoch"]
    num_iterations = self._hyper_params["num_iterations"]
    # mirror hyper-params into engine state for the monitors
    self._state["max_epoch"] = self._hyper_params["max_epoch"]
    self._state["max_iteration"] = num_iterations
    self._optimizer.modify_grad(epoch)
    pbar = tqdm(range(num_iterations))
    self._state["pbar"] = pbar
    self._state["print_str"] = ""
    timings = OrderedDict()
    for batch_idx, _ in enumerate(pbar):
        self._state["iteration"] = batch_idx
        with Timer(name="data", output_dict=timings):
            batch = next(self._dataloader)
        batch = move_data_to_device(batch, self._state["devices"][0])
        schedule_info = self._optimizer.schedule(epoch, batch_idx)
        self._optimizer.zero_grad()
        # forward propagation and loss evaluation
        with Timer(name="fwd", output_dict=timings):
            predictions = self._model(batch)
            # evaluate each loss module once: (value, extras) per name
            raw_outputs = OrderedDict(
                (name, self._losses[name](predictions, batch))
                for name in self._losses)
            training_losses = OrderedDict(
                (name, pair[0]) for name, pair in raw_outputs.items())
            extras = OrderedDict(
                (name, pair[1]) for name, pair in raw_outputs.items())
            # weighted sum of all loss terms
            total_loss = sum(
                training_losses[name] * self._losses[name].get_hps()["weight"]
                for name in self._losses)
        # backward propagation
        with Timer(name="bwd", output_dict=timings):
            total_loss.backward()
        self._optimizer.modify_grad(epoch, batch_idx)
        with Timer(name="optim", output_dict=timings):
            self._optimizer.step()
        trainer_data = dict(
            schedule_info=schedule_info,
            training_losses=training_losses,
            extras=extras,
            time_dict=timings,
        )
        for monitor in self._monitors:
            monitor.update(trainer_data)
        # release the batch before the next allocation
        del batch
        pbar.set_description(self._state["print_str"])
def train(self):
    """Run a single training epoch for the segmentation branch.

    Per iteration: run the (frozen, no-grad) tracker to obtain
    correlation features, feed them with the segmentation inputs through
    the model, back-propagate (optionally via the AMP grad scaler), and
    step the optimizer. Monitor updates and logging happen on rank 0
    only; an ETA is estimated from the last iteration's wall time.
    """
    if not self._state["initialized"]:
        self.init_train()
    self._state["initialized"] = True
    # advance the epoch counter
    self._state["epoch"] += 1
    epoch = self._state["epoch"]
    num_iterations = self._hyper_params["num_iterations"]
    # update engine_state
    self._state["max_iteration"] = num_iterations
    self._optimizer.modify_grad(epoch)
    self._state["print_str"] = ""
    time_dict = OrderedDict()
    for iteration in range(num_iterations):
        # wall-clock start, used below for the ETA estimate
        start_time = time.time()
        self._state["iteration"] = iteration
        with Timer(name="data", output_dict=time_dict):
            training_data = next(self._dataloader)
        training_data = move_data_to_device(training_data,
                                            self._state["devices"][0])
        schedule_info = self._optimizer.schedule(epoch, iteration)
        self._optimizer.zero_grad()
        with Timer(name="track_fwd", output_dict=time_dict):
            # tracker runs inference-only; no gradients flow into it
            with torch.no_grad():
                tracker_output = self.tracker(training_data, phase="train")
            # NOTE(review): presumably correlation features from the
            # tracker head — confirm key semantics against the tracker.
            corr_fea = tracker_output["corr_fea"].detach()
        # forward propagation (segmentation model)
        with Timer(name="segfwd", output_dict=time_dict):
            predict_data = self._model(training_data["seg_img"], corr_fea,
                                       training_data["filtered_global_img"])
            training_losses, extras = OrderedDict(), OrderedDict()
            # each loss module returns (loss_value, extra_info),
            # evaluated against the segmentation mask target
            for loss_name, loss in self._losses.items():
                training_losses[loss_name], extras[loss_name] = loss(
                    predict_data, training_data["seg_mask"])
            total_loss = sum(training_losses.values())
        # backward propagation
        with Timer(name="bwd", output_dict=time_dict):
            if self._optimizer.grad_scaler is not None:
                # AMP path: scale the loss before backward
                self._optimizer.grad_scaler.scale(total_loss).backward()
            else:
                total_loss.backward()
        with Timer(name="optim", output_dict=time_dict):
            self._optimizer.step()
        # remaining-time estimate: iterations left x last iteration's cost
        cost_time = (num_iterations - iteration) * (time.time() - start_time)
        # monitor updates and logging only on the main process
        if dist_utils.get_rank() == 0:
            trainer_data = dict(
                schedule_info=schedule_info,
                training_losses=training_losses,
                training_data=training_data,
                extras=extras,
                time_dict=time_dict,
                predict_data=predict_data,
                iter=iteration,
            )
            for monitor in self._monitors:
                monitor.update(trainer_data)
            print_str = "{}/{} epoch {} eta ({}h {}m {}s) bs: {} \n".format(
                iteration, num_iterations, epoch,
                int(cost_time // (3600)),
                int(cost_time % 3600 // 60),
                int(cost_time % 60),
                training_data["im_x"].size(0)) + self._state["print_str"]
            logger.info(print_str)
        # drop the batch reference before the next allocation
        del training_data