if LOSS_NAME == 'Focal':
    LOSS = FocalLoss()
elif LOSS_NAME == 'Softmax':
    LOSS = nn.CrossEntropyLoss()
else:
    raise NotImplementedError
print("=" * 60)
print(LOSS)
print("{} Loss Generated".format(LOSS_NAME))
print("=" * 60)

if BACKBONE_NAME.find("IR") >= 0:
    # separate batch_norm parameters from the others; batch_norm parameters get
    # no weight decay, which improves generalization
    backbone_paras_only_bn, backbone_paras_wo_bn = separate_irse_bn_paras(BACKBONE)
    _, head_paras_wo_bn = separate_irse_bn_paras(HEAD)
else:
    backbone_paras_only_bn, backbone_paras_wo_bn = separate_resnet_bn_paras(BACKBONE)
    _, head_paras_wo_bn = separate_resnet_bn_paras(HEAD)

OPTIMIZER = optim.SGD([{'params': backbone_paras_wo_bn + head_paras_wo_bn, 'weight_decay': WEIGHT_DECAY},
                       {'params': backbone_paras_only_bn}],
                      lr=LR, momentum=MOMENTUM)
print("=" * 60)
print(OPTIMIZER)
print("Optimizer Generated")
print("=" * 60)

# optionally resume from a checkpoint
if BACKBONE_RESUME_ROOT and HEAD_RESUME_ROOT:
    print("=" * 60)
    if os.path.isfile(BACKBONE_RESUME_ROOT) and os.path.isfile(HEAD_RESUME_ROOT):
        print("Loading Backbone Checkpoint '{}'".format(BACKBONE_RESUME_ROOT))
        BACKBONE.load_state_dict(torch.load(BACKBONE_RESUME_ROOT))
        print("Loading Head Checkpoint '{}'".format(HEAD_RESUME_ROOT))
        HEAD.load_state_dict(torch.load(HEAD_RESUME_ROOT))
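# For reference, a minimal sketch of a FocalLoss consistent with how it is used
# above, assuming the common formulation on top of cross-entropy; the repo's own
# implementation (gamma default, reduction) may differ.
import torch
import torch.nn as nn

class FocalLoss(nn.Module):
    def __init__(self, gamma=2.0):
        super(FocalLoss, self).__init__()
        self.gamma = gamma
        self.ce = nn.CrossEntropyLoss(reduction='none')  # per-sample -log p_t

    def forward(self, logits, target):
        logp = self.ce(logits, target)                # per-sample cross-entropy
        p = torch.exp(-logp)                          # p_t, probability of the true class
        return ((1 - p) ** self.gamma * logp).mean()  # down-weight easy examples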
print("{} Head Generated".format(HEAD_NAME)) print("=" * 60) LOSS_DICT = {'Focal': FocalLoss(), 'Softmax': nn.CrossEntropyLoss()} LOSS = LOSS_DICT[LOSS_NAME] print("=" * 60) print(LOSS) print("{} Loss Generated".format(LOSS_NAME)) print("=" * 60) if BACKBONE_NAME.find("IR") >= 0: backbone_paras_only_bn, backbone_paras_wo_bn = separate_irse_bn_paras( BACKBONE) # do not do weight decay for batch_norm parameters _, head_paras_wo_bn = separate_irse_bn_paras(HEAD) else: backbone_paras_only_bn, backbone_paras_wo_bn = separate_resnet_bn_paras( BACKBONE) # do not do weight decay for batch_norm parameters _, head_paras_wo_bn = separate_resnet_bn_paras(HEAD) OPTIMIZER = optim.SGD([{ 'params': backbone_paras_wo_bn + head_paras_wo_bn, 'weight_decay': WEIGHT_DECAY }, { 'params': backbone_paras_only_bn }], lr=LR, momentum=MOMENTUM) print("=" * 60) print(OPTIMIZER) print("Optimizer Generated") print("=" * 60) if MULTI_GPU:
def main_worker(gpu, ngpus_per_node, cfg):
    cfg['GPU'] = gpu
    if gpu != 0:
        # suppress printing on all but the first process
        def print_pass(*args):
            pass
        builtins.print = print_pass
    cfg['RANK'] = cfg['RANK'] * ngpus_per_node + gpu
    dist.init_process_group(backend=cfg['DIST_BACKEND'],
                            init_method=cfg['DIST_URL'],
                            world_size=cfg['WORLD_SIZE'],
                            rank=cfg['RANK'])

    # Data loading code
    batch_size = int(cfg['BATCH_SIZE'] / ngpus_per_node)
    workers = int((cfg['NUM_WORKERS'] + ngpus_per_node - 1) / ngpus_per_node)
    DATA_ROOT = cfg['DATA_ROOT']  # the parent root where your train/val/test data are stored
    RECORD_DIR = cfg['RECORD_DIR']
    RGB_MEAN = cfg['RGB_MEAN']  # for normalizing inputs
    RGB_STD = cfg['RGB_STD']
    train_transform = transforms.Compose([
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean=RGB_MEAN, std=RGB_STD),
    ])
    dataset_train = FaceDataset(DATA_ROOT, RECORD_DIR, train_transform)
    train_sampler = torch.utils.data.distributed.DistributedSampler(dataset_train)
    train_loader = torch.utils.data.DataLoader(dataset_train,
                                               batch_size=batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=workers,
                                               pin_memory=True,
                                               sampler=train_sampler,
                                               drop_last=True)
    SAMPLE_NUMS = dataset_train.get_sample_num_of_each_class()
    NUM_CLASS = len(train_loader.dataset.classes)
    print("Number of Training Classes: {}".format(NUM_CLASS))

    #======= model & loss & optimizer =======#
    BACKBONE_DICT = {
        'ResNet_50': ResNet_50,
        'ResNet_101': ResNet_101,
        'ResNet_152': ResNet_152,
        'IR_50': IR_50,
        'IR_101': IR_101,
        'IR_152': IR_152,
        'IR_SE_50': IR_SE_50,
        'IR_SE_101': IR_SE_101,
        'IR_SE_152': IR_SE_152
    }
    BACKBONE_NAME = cfg['BACKBONE_NAME']
    INPUT_SIZE = cfg['INPUT_SIZE']
    assert INPUT_SIZE == [112, 112]
    backbone = BACKBONE_DICT[BACKBONE_NAME](INPUT_SIZE)
    print("=" * 60)
    print(backbone)
    print("{} Backbone Generated".format(BACKBONE_NAME))
    print("=" * 60)

    HEAD_DICT = {'ArcFace': ArcFace, 'CurricularFace': CurricularFace}
    HEAD_NAME = cfg['HEAD_NAME']
    EMBEDDING_SIZE = cfg['EMBEDDING_SIZE']  # feature dimension
    head = HEAD_DICT[HEAD_NAME](in_features=EMBEDDING_SIZE, out_features=NUM_CLASS)
    print("=" * 60)
    print(head)
    print("{} Head Generated".format(HEAD_NAME))
    print("=" * 60)

    #-------------------- optimizer -----------------------------
    if BACKBONE_NAME.find("IR") >= 0:
        # separate batch_norm parameters from the others; batch_norm parameters
        # get no weight decay, which improves generalization
        backbone_paras_only_bn, backbone_paras_wo_bn = separate_irse_bn_paras(backbone)
    else:
        backbone_paras_only_bn, backbone_paras_wo_bn = separate_resnet_bn_paras(backbone)
    LR = cfg['LR']  # initial LR
    WEIGHT_DECAY = cfg['WEIGHT_DECAY']
    MOMENTUM = cfg['MOMENTUM']
    optimizer = optim.SGD([{'params': backbone_paras_wo_bn + list(head.parameters()), 'weight_decay': WEIGHT_DECAY},
                           {'params': backbone_paras_only_bn}],
                          lr=LR, momentum=MOMENTUM)
    print("=" * 60)
    print(optimizer)
    print("Optimizer Generated")
    print("=" * 60)

    # loss
    LOSS_NAME = cfg['LOSS_NAME']
    LOSS_DICT = {'Softmax': nn.CrossEntropyLoss()}
    loss = LOSS_DICT[LOSS_NAME].cuda(gpu)
    print("=" * 60)
    print(loss)
    print("{} Loss Generated".format(LOSS_NAME))
    print("=" * 60)

    torch.cuda.set_device(cfg['GPU'])
    backbone.cuda(cfg['GPU'])
    head.cuda(cfg['GPU'])

    # optionally resume from a checkpoint
    BACKBONE_RESUME_ROOT = cfg['BACKBONE_RESUME_ROOT']  # the root to resume training from a saved checkpoint
    HEAD_RESUME_ROOT = cfg['HEAD_RESUME_ROOT']  # the root to resume training from a saved checkpoint
    if BACKBONE_RESUME_ROOT:
        print("=" * 60)
        # map checkpoint tensors onto this process's GPU; defined before both
        # file checks so the head branch can use it even when the backbone
        # checkpoint is missing
        loc = 'cuda:{}'.format(cfg['GPU'])
        if os.path.isfile(BACKBONE_RESUME_ROOT):
            print("Loading Backbone Checkpoint '{}'".format(BACKBONE_RESUME_ROOT))
            backbone.load_state_dict(torch.load(BACKBONE_RESUME_ROOT, map_location=loc))
        if os.path.isfile(HEAD_RESUME_ROOT):
            print("Loading Head Checkpoint '{}'".format(HEAD_RESUME_ROOT))
            checkpoint = torch.load(HEAD_RESUME_ROOT, map_location=loc)
            cfg['START_EPOCH'] = checkpoint['EPOCH']
            head.load_state_dict(checkpoint['HEAD'])
            optimizer.load_state_dict(checkpoint['OPTIMIZER'])
        else:
            print("No Checkpoint Found at '{}' and '{}'. Please Have a Check or Continue to Train from Scratch"
                  .format(BACKBONE_RESUME_ROOT, HEAD_RESUME_ROOT))
        print("=" * 60)

    backbone = torch.nn.parallel.DistributedDataParallel(backbone, device_ids=[cfg['GPU']])
    head = torch.nn.parallel.DistributedDataParallel(head, device_ids=[cfg['GPU']])

    # checkpoint and tensorboard dir
    MODEL_ROOT = cfg['MODEL_ROOT']  # the root to buffer your checkpoints
    LOG_ROOT = cfg['LOG_ROOT']  # the root to log your train/val status
    STAGES = cfg['STAGES']  # epoch stages at which to decay the learning rate
    if not os.path.exists(MODEL_ROOT):
        os.makedirs(MODEL_ROOT)
    if not os.path.exists(LOG_ROOT):
        os.makedirs(LOG_ROOT)
    writer = SummaryWriter(LOG_ROOT)  # writer for buffering intermediate results

    # train
    for epoch in range(cfg['START_EPOCH'], cfg['NUM_EPOCH']):
        train_sampler.set_epoch(epoch)
        adjust_learning_rate(optimizer, epoch, cfg)

        # train for one epoch
        train(train_loader, backbone, head, loss, optimizer, epoch, cfg, writer)

        print("=" * 60)
        print("Save Checkpoint...")
        if cfg['RANK'] % ngpus_per_node == 0:
            torch.save(
                backbone.module.state_dict(),
                os.path.join(MODEL_ROOT,
                             "Backbone_{}_Epoch_{}_Time_{}_checkpoint.pth".format(
                                 BACKBONE_NAME, epoch + 1, get_time())))
            save_dict = {
                'EPOCH': epoch + 1,
                'HEAD': head.module.state_dict(),
                'OPTIMIZER': optimizer.state_dict()
            }
            torch.save(
                save_dict,
                os.path.join(MODEL_ROOT,
                             "Head_{}_Epoch_{}_Time_{}_checkpoint.pth".format(
                                 HEAD_NAME, epoch + 1, get_time())))
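# main_worker has the (gpu, ngpus_per_node, cfg) signature that
# torch.multiprocessing.spawn calls with the process index as the first
# argument. A minimal launcher sketch, assuming one process per local GPU and
# that cfg['WORLD_SIZE'] initially holds the number of nodes and cfg['RANK']
# the node rank, matching the RANK/WORLD_SIZE arithmetic inside main_worker:
import torch
import torch.multiprocessing as mp

def main(cfg):
    ngpus_per_node = torch.cuda.device_count()
    cfg['WORLD_SIZE'] = ngpus_per_node * cfg['WORLD_SIZE']  # total process count
    mp.spawn(main_worker, nprocs=ngpus_per_node, args=(ngpus_per_node, cfg))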
print("=" * 60) LOSS_DICT = {'Focal': FocalLoss(), 'Softmax': nn.CrossEntropyLoss()} LOSS = LOSS_DICT[LOSS_NAME] print("=" * 60) print(LOSS) print("{} Loss Generated".format(LOSS_NAME)) print("=" * 60) if BACKBONE_NAME.find("IR") >= 0: backbone_paras_only_bn, backbone_paras_wo_bn = separate_irse_bn_paras( BACKBONE) # separate batch_norm parameters from others; do not do weight decay for batch_norm parameters to improve the generalizability _, head_paras_wo_bn = separate_irse_bn_paras(HEAD) else: backbone_paras_only_bn, backbone_paras_wo_bn = separate_resnet_bn_paras( BACKBONE) # separate batch_norm parameters from others; do not do weight decay for batch_norm parameters to improve the generalizability _, head_paras_wo_bn = separate_resnet_bn_paras(HEAD) OPTIMIZER = optim.SGD( [{'params': backbone_paras_wo_bn}, {'params': backbone_paras_only_bn}, {'params': head_paras_wo_bn}], lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY) print("=" * 60) print(OPTIMIZER) print("Optimizer Generated") print("=" * 60) # optionally resume from a checkpoint if BACKBONE_RESUME_ROOT and HEAD_RESUME_ROOT: print("=" * 60) if os.path.isfile(BACKBONE_RESUME_ROOT): print("Loading Backbone Checkpoint '{}'".format(BACKBONE_RESUME_ROOT)) BACKBONE.load_state_dict(torch.load(BACKBONE_RESUME_ROOT))