def do_face_train_triplet(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
    divs_nums,
):
    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    # NOTE: iou_types is referenced by the validation inference call below;
    # bbox-only evaluation is assumed here.
    iou_types = ("bbox",)
    dataset_names = cfg.DATASETS.TEST

    for iteration, (img_a, img_p, img_n, label_p, label_n) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        img_a_list, _ = divs_tensors(device=device, tensors=img_a, targets=None, divs_nums=divs_nums)
        img_p_list, label_p_list = divs_tensors(device=device, tensors=img_p, targets=label_p, divs_nums=divs_nums)
        img_n_list, label_n_list = divs_tensors(device=device, tensors=img_n, targets=label_n, divs_nums=divs_nums)

        # ======== Splitting the batch may affect the BatchNorm layers ========
        optimizer.zero_grad()
        for img_a, img_p, img_n, label_p, label_n in zip(
                img_a_list, img_p_list, img_n_list, label_p_list, label_n_list):
            loss_dict = model(tensors=[img_a, img_p, img_n],
                              targets=[label_p, label_n],
                              batch=iteration,
                              total_batch=None)
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(loss for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)

            # scale so the accumulated gradient matches a single full-batch update
            losses /= divs_nums
            with amp.scale_loss(losses, optimizer) as scaled_losses:
                scaled_losses.backward()

        optimizer.step()
        scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            if iteration > 40000:
                checkpointer.save_backbone("BACKBONE_{:07d}".format(iteration))

        # ========= periodic validation =========
        if data_loader_val is not None and test_period > 0 and iteration % test_period == 0:
            meters_val = MetricLogger(delimiter="  ")
            synchronize()
            _ = inference(  # The result can be used for additional logging, e.g. for TensorBoard
                model,
                # The method changes the segmentation mask format in a data loader,
                # so every time a new data loader is created:
                make_data_loader(cfg, is_train=False, is_distributed=(get_world_size() > 1), is_for_period=True),
                dataset_name="[Validation]",
                iou_types=iou_types,
                box_only=False if cfg.MODEL.RETINANET_ON else cfg.MODEL.RPN_ONLY,
                device=cfg.MODEL.DEVICE,
                expected_results=cfg.TEST.EXPECTED_RESULTS,
                expected_results_sigma_tol=cfg.TEST.EXPECTED_RESULTS_SIGMA_TOL,
                output_folder=None,
            )
            synchronize()
            model.train()
            with torch.no_grad():
                # Should be one image for each GPU:
                for iteration_val, (images_val, targets_val, _) in enumerate(tqdm(data_loader_val)):
                    images_val = images_val.to(device)
                    targets_val = [target.to(device) for target in targets_val]
                    loss_dict = model(images_val, targets_val)
                    losses = sum(loss for loss in loss_dict.values())
                    loss_dict_reduced = reduce_loss_dict(loss_dict)
                    losses_reduced = sum(loss for loss in loss_dict_reduced.values())
                    meters_val.update(loss=losses_reduced, **loss_dict_reduced)
            synchronize()
            logger.info(
                meters_val.delimiter.join([
                    "[Validation]: ",
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters_val),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration == max_iter:
            checkpointer.save("model_final", **arguments)
            checkpointer.save_backbone("model_final")

    total_training_time = time.time() - start_training_time
    total_time_str = str(datetime.timedelta(seconds=total_training_time))
    logger.info("Total training time: {} ({:.4f} s / it)".format(
        total_time_str, total_training_time / max_iter))
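# ---------------------------------------------------------------------------
# Illustration only, not part of the training pipeline: a minimal sketch of the
# batch-splitting / gradient-accumulation pattern that both training loops in
# this file rely on. It assumes `divs_tensors` is the project helper that chunks
# a batch into `divs_nums` sub-batches; the function and argument names below
# are hypothetical. With a stock DistributedDataParallel model, the custom
# `grad_sync` flag used in the next function roughly corresponds to `no_sync()`,
# which defers the gradient all-reduce until the final chunk.
# ---------------------------------------------------------------------------
def _gradient_accumulation_sketch(ddp_model, criterion, optimizer, images, labels, divs_nums):
    """Run one optimizer step over a batch split into `divs_nums` chunks.

    Each chunk's loss is scaled by 1 / divs_nums so the accumulated gradient
    matches a single full-batch backward pass (up to BatchNorm statistics,
    which only see the smaller per-chunk batches -- the caveat noted in the
    training loops).
    """
    import contextlib  # local import to keep this illustration self-contained

    optimizer.zero_grad()
    image_chunks = torch.chunk(images, divs_nums, dim=0)
    label_chunks = torch.chunk(labels, divs_nums, dim=0)
    for i, (img_chunk, lbl_chunk) in enumerate(zip(image_chunks, label_chunks)):
        last_chunk = i == len(image_chunks) - 1
        # Skip the gradient all-reduce on every chunk except the last one.
        sync_ctx = contextlib.nullcontext() if last_chunk else ddp_model.no_sync()
        with sync_ctx:
            loss = criterion(ddp_model(img_chunk), lbl_chunk) / divs_nums
            loss.backward()
    optimizer.step()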
def do_face_train_dk_dist_DIV_FC(
    cfg,
    model,
    data_loader,
    data_loader_val,
    optimizer,
    scheduler,
    checkpointer,
    device,
    checkpoint_period,
    test_period,
    arguments,
    divs_nums,
):
    # model, head = model
    optimizer, head_optimizer = optimizer
    scheduler, head_scheduler = scheduler
    checkpointer, head_checkpointer = checkpointer
    teacher = model[2]
    head = model[1]
    model = model[0]

    logger = logging.getLogger("maskrcnn_benchmark.trainer")
    logger.info("Start training")
    meters = MetricLogger(delimiter="  ")
    max_iter = len(data_loader)
    start_iter = arguments["iteration"]
    model.train()
    start_training_time = time.time()
    end = time.time()

    dataset_names = cfg.DATASETS.TEST
    teacher.eval()

    for iteration, (images, targets, _) in enumerate(data_loader, start_iter):
        data_time = time.time() - end
        iteration = iteration + 1
        arguments["iteration"] = iteration

        images_list, targets_list = divs_tensors(device=device, tensors=images, targets=targets, divs_nums=divs_nums)

        # ======== Splitting the batch may affect the BatchNorm layers ========
        optimizer.zero_grad()
        # defer gradient synchronization until the final sub-batch
        if len(images_list) > 1:
            grad_sync = False
        else:
            grad_sync = True
        for i, (images, targets) in enumerate(zip(images_list, targets_list)):
            with torch.no_grad():
                soft_target = teacher(inputs=images)  # grad_sync=False, grad_params=False
            # soft_target = [soft_target_.detach() for soft_target_ in soft_target]
            soft_target = [soft_target.to(GPU).detach() for GPU in head.module.GPUS]
            features = model(inputs=images, grad_sync=grad_sync)
            loss_dict = head(features,
                             targets=targets,
                             batch=iteration,
                             soft_target=soft_target,
                             total_batch=None,
                             grad_sync=grad_sync)  # param_sync=param_sync
            losses = sum(loss for loss in loss_dict.values())

            # reduce losses over all GPUs for logging purposes
            loss_dict_reduced = reduce_loss_dict(loss_dict)
            losses_reduced = sum(torch.mean(loss) for loss in loss_dict_reduced.values())
            meters.update(loss=losses_reduced, **loss_dict_reduced)

            losses /= divs_nums
            with amp.scale_loss(losses, optimizer) as scaled_losses:
                scaled_losses.backward()
            # enable gradient sync for the next (final) sub-batch
            if i == len(images_list) - 2:
                grad_sync = True

        optimizer.step()
        scheduler.step()
        # head_optimizer.step()
        # head_scheduler.step()

        batch_time = time.time() - end
        end = time.time()
        meters.update(time=batch_time, data=data_time)

        eta_seconds = meters.time.global_avg * (max_iter - iteration)
        eta_string = str(datetime.timedelta(seconds=int(eta_seconds)))

        if iteration % 20 == 0 or iteration == max_iter:
            logger.info(
                meters.delimiter.join([
                    "eta: {eta}",
                    "iter: {iter}",
                    "{meters}",
                    "lr: {lr:.6f}",
                    "max mem: {memory:.0f}",
                ]).format(
                    eta=eta_string,
                    iter=iteration,
                    meters=str(meters),
                    lr=optimizer.param_groups[0]["lr"],
                    memory=torch.cuda.max_memory_allocated() / 1024.0 / 1024.0,
                ))

        if iteration % checkpoint_period == 0:
            checkpointer.save("model_{:07d}".format(iteration), **arguments)
            checkpointer.save_backbone("BACKBONE_{:07d}".format(iteration))
            head_checkpointer.save("HEAD_{:07d}".format(iteration), **arguments)