def __init__(self, num_classes, network='efficientdet-d0',
             D_bifpn=3, W_bifpn=88, is_training=True,
             threshold=0.01, iou_threshold=0.5,
             transform=transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                            std=[0.229, 0.224, 0.225]),
             device='cpu'):
    super(EfficientDet, self).__init__()
    self.device = device
    # EfficientNet backbone; MODEL_MAP translates the detector name to the
    # matching EfficientNet variant.
    self.backbone = EfficientNet.from_pretrained(MODEL_MAP[network]).to(device)
    self.is_training = is_training
    # BiFPN neck fuses the last five backbone feature maps.
    self.neck = BIFPN(in_channels=self.backbone.get_list_features()[-5:],
                      out_channels=W_bifpn,
                      stack=D_bifpn,
                      num_outs=5).to(device)
    self.bbox_head = RetinaHead(num_classes=num_classes,
                                in_channels=W_bifpn,
                                device=self.device).to(device)
    self.threshold = threshold
    self.iou_threshold = iou_threshold
    self.transform = transform
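# A construction sketch, not from the source: it assumes this __init__
# belongs to an nn.Module subclass named EfficientDet (implied by the
# super() call) and that MODEL_MAP contains an 'efficientdet-d0' entry
# mapping to an EfficientNet variant; the mapping itself lives elsewhere.
#
#     device = 'cuda' if torch.cuda.is_available() else 'cpu'
#     detector = EfficientDet(num_classes=20, network='efficientdet-d0',
#                             D_bifpn=3, W_bifpn=88, device=device)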
def __init__(self, compound_coef, load_weights=False):
    super(EfficientNet, self).__init__()
    model = EffNet.from_pretrained(f'efficientnet-b{compound_coef}', load_weights)
    # Strip the classification head so the model serves as a pure
    # feature-extraction backbone.
    del model._conv_head
    del model._bn1
    del model._avg_pooling
    del model._dropout
    del model._fc
    self.model = model
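# With the head deleted, the stock EfficientNet forward()/extract_features()
# would reference the removed _conv_head/_bn1 and fail, so this wrapper
# presumably walks the stem and blocks itself. A hedged sketch of such a
# forward (it would sit next to the __init__ above), assuming
# lukemelas/efficientnet_pytorch internals: _conv_stem, _bn0, _swish,
# _blocks, and _global_params.drop_connect_rate.
def forward(self, x):
    x = self.model._swish(self.model._bn0(self.model._conv_stem(x)))
    feature_maps = []
    for idx, block in enumerate(self.model._blocks):
        drop_connect_rate = self.model._global_params.drop_connect_rate
        if drop_connect_rate:
            # Scale drop-connect linearly with depth, as in the reference impl.
            drop_connect_rate *= float(idx) / len(self.model._blocks)
        x = block(x, drop_connect_rate=drop_connect_rate)
        feature_maps.append(x)
    return feature_maps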
def get_model(config, num_classes=1):
    model_name = config.MODEL.NAME
    if model_name.startswith('resnet'):
        model = globals().get(model_name)(pretrained=True)
        model.avgpool = nn.AdaptiveAvgPool2d(1)
        in_features = model.fc.in_features
        model.fc = nn.Linear(in_features, num_classes)
    elif model_name.startswith('efficient'):
        model = EfficientNet.from_pretrained(model_name, num_classes=num_classes)
    else:
        model = pretrainedmodels.__dict__[model_name](num_classes=1000, pretrained='imagenet')
        model.avg_pool = nn.AdaptiveAvgPool2d(1)
        in_features = model.last_linear.in_features
        model.last_linear = nn.Linear(in_features, num_classes)
    print('model name:', model_name)

    if model_name.startswith('efficient'):
        if config.MODEL.FC_TYPE == 1:
            model.fc_type = 1
            in_features = model.out_channels
            new_fc = nn.Sequential(
                nn.Linear(in_features, 256),
                nn.BatchNorm1d(256, eps=0.001, momentum=0.01,  # 0.010000000000000009 in the source; a float round-trip artifact
                               affine=True, track_running_stats=True),
                nn.ReLU(),
                nn.Dropout(0.5),
                nn.Linear(256, 1))
            model._fc = new_fc
            print('new fc added')
        elif config.MODEL.FC_TYPE == 2:
            model.fc_type = 2
            in_features = model.out_channels
            new_fc = nn.Sequential(
                nn.BatchNorm1d(in_features * 2, eps=0.001, momentum=0.01,
                               affine=True, track_running_stats=True),
                nn.Dropout(0.25),
                nn.Linear(in_features * 2, 512, bias=True),
                nn.ReLU(),
                nn.BatchNorm1d(512, eps=0.001, momentum=0.01,
                               affine=True, track_running_stats=True),
                nn.Dropout(0.5),
                nn.Linear(512, 1, bias=True))
            model._fc = new_fc
            print('gold fc added')

    if config.PARALLEL:
        model = nn.DataParallel(model)
    return model
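# Hedged usage sketch: `config` is assumed to be an attribute-style config
# (e.g. a yacs CfgNode) exposing MODEL.NAME, MODEL.FC_TYPE and PARALLEL.
# A minimal stand-in for quick experiments; note the 'resnet' branch also
# requires a torchvision `resnet34` to be importable in this module's globals.
from types import SimpleNamespace

_example_config = SimpleNamespace(
    MODEL=SimpleNamespace(NAME='resnet34', FC_TYPE=0),
    PARALLEL=False,
)
# model = get_model(_example_config, num_classes=1)  # resnet34 with a 1-way head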
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes.
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    #####################################################################
    if args.pretrained:
        if args.arch.startswith('efficientnet-b'):
            print('=> using pre-trained {}'.format(args.arch))
            model = EfficientNet.from_pretrained(args.arch, advprop=args.advprop)
        else:
            print("=> using pre-trained model '{}'".format(args.arch))
            model = models.__dict__[args.arch](pretrained=True)
    else:
        if args.arch.startswith('efficientnet-b'):
            print("=> creating model {}".format(args.arch))
            model = EfficientNet.from_name(args.arch)
        elif args.arch.startswith('Dense'):
            print("=> creating model {}".format(args.arch))
            model = DenseNet40()
        else:
            print("=> creating model '{}'".format(args.arch))
            model = models.__dict__[args.arch]()

    # create teacher model
    if args.kd:
        print('=> loading teacher model')
        if args.teacher_arch.startswith('efficientnet-b'):
            teacher = EfficientNet.from_pretrained(args.teacher_arch)
            teacher.eval()
            print('=> {} loaded'.format(args.teacher_arch))
        elif args.teacher_arch.startswith('resnext101_32'):
            teacher = torch.hub.load('facebookresearch/WSL-Images',
                                     '{}_wsl'.format(args.teacher_arch))
            teacher.eval()
            print('=> {} loaded'.format(args.teacher_arch))
        elif args.overhaul:
            teacher = resnet.resnet152(pretrained=True)
        else:
            teacher = models.__dict__[args.teacher_arch](pretrained=True)
            teacher.eval()
            print('=> {} loaded'.format(args.teacher_arch))
        if args.overhaul:
            print('=> using overhaul distillation')
            d_net = Distiller(teacher, model)

    if args.distributed:
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process, divide the batch size and
            # workers across the processes on this node.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs.
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()  # the source called .cuda() twice here; once suffices
        else:
            model = torch.nn.DataParallel(model).cuda()

    if args.kd:
        teacher = torch.nn.DataParallel(teacher).cuda()
        if args.overhaul:
            d_net = torch.nn.DataParallel(d_net).cuda()

    if args.pretrained:
        if args.arch.startswith('efficientnet-b'):
            loc = 'cuda:{}'.format(args.gpu)
            checkpoint = torch.load(args.pth_path, map_location=loc)
            model.load_state_dict(checkpoint['state_dict'])
    #####################################################################

    # define loss function (criterion), optimizer and scheduler
    #####################################################################
    if args.kd:
        criterion = kd_criterion
        if args.overhaul:
            criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    else:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    if args.overhaul:
        optimizer = torch.optim.SGD(
            list(model.parameters()) + list(d_net.module.Connectors.parameters()),
            args.lr, momentum=args.momentum, weight_decay=args.weight_decay)  # nesterov
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    # NOTE: the assignments below overwrite the optimizer/scheduler chosen
    # above; only the last pair (RMSprop + StepLRScheduler) takes effect.
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.999),
                                  eps=1e-08, weight_decay=args.weight_decay, amsgrad=False)
    scheduler = CosineAnnealingLR(optimizer,
                                  T_max=args.epochs * int(1281167 / args.batch_size),
                                  eta_min=0, last_epoch=-1)
    args.lr = 0.048
    args.bs = 384
    optimizer = torch.optim.RMSprop(model.parameters(), lr=args.lr, alpha=0.9, eps=.001,
                                    momentum=0.9, weight_decay=args.weight_decay)

    # Inlined timm-style parameter scheduler. It is kept nested here, as in
    # the source stream; it would normally live at module level.
    from typing import Dict, Any

    class Scheduler:
        """Parameter Scheduler Base Class

        A scheduler base class that can be used to schedule any optimizer
        parameter group. Unlike the builtin PyTorch schedulers, this is
        intended to be consistently called:

        * At the END of each epoch, before incrementing the epoch count,
          to calculate the next epoch's value.
        * At the END of each optimizer update, after incrementing the update
          count, to calculate the next update's value.

        The schedulers built on this should try to remain as stateless as
        possible (for simplicity). This family of schedulers avoids the
        confusion around the meaning of 'last_epoch' and -1 values for
        special behaviour. All epoch and update counts must be tracked in
        the training code and explicitly passed in to the schedulers on the
        corresponding step or step_update call.

        Based on ideas from:
        * https://github.com/pytorch/fairseq/tree/master/fairseq/optim/lr_scheduler
        * https://github.com/allenai/allennlp/tree/master/allennlp/training/learning_rate_schedulers
        """

        def __init__(self,
                     optimizer: torch.optim.Optimizer,
                     param_group_field: str,
                     noise_range_t=None,
                     noise_type='normal',
                     noise_pct=0.67,
                     noise_std=1.0,
                     noise_seed=None,
                     initialize: bool = True) -> None:
            self.optimizer = optimizer
            self.param_group_field = param_group_field
            self._initial_param_group_field = f"initial_{param_group_field}"
            if initialize:
                for i, group in enumerate(self.optimizer.param_groups):
                    if param_group_field not in group:
                        raise KeyError(f"{param_group_field} missing from param_groups[{i}]")
                    group.setdefault(self._initial_param_group_field, group[param_group_field])
            else:
                for i, group in enumerate(self.optimizer.param_groups):
                    if self._initial_param_group_field not in group:
                        raise KeyError(f"{self._initial_param_group_field} missing from param_groups[{i}]")
            self.base_values = [group[self._initial_param_group_field]
                                for group in self.optimizer.param_groups]
            self.metric = None  # any point to having this for all?
            self.noise_range_t = noise_range_t
            self.noise_pct = noise_pct
            self.noise_type = noise_type
            self.noise_std = noise_std
            self.noise_seed = noise_seed if noise_seed is not None else 42
            self.update_groups(self.base_values)

        def state_dict(self) -> Dict[str, Any]:
            return {key: value for key, value in self.__dict__.items() if key != 'optimizer'}

        def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
            self.__dict__.update(state_dict)

        def get_epoch_values(self, epoch: int):
            return None

        def get_update_values(self, num_updates: int):
            return None

        def step(self, epoch: int, metric: float = None) -> None:
            self.metric = metric
            values = self.get_epoch_values(epoch)
            if values is not None:
                values = self._add_noise(values, epoch)
                self.update_groups(values)

        def step_update(self, num_updates: int, metric: float = None):
            self.metric = metric
            values = self.get_update_values(num_updates)
            if values is not None:
                values = self._add_noise(values, num_updates)
                self.update_groups(values)

        def update_groups(self, values):
            if not isinstance(values, (list, tuple)):
                values = [values] * len(self.optimizer.param_groups)
            for param_group, value in zip(self.optimizer.param_groups, values):
                param_group[self.param_group_field] = value

        def _add_noise(self, lrs, t):
            if self.noise_range_t is not None:
                if isinstance(self.noise_range_t, (list, tuple)):
                    apply_noise = self.noise_range_t[0] <= t < self.noise_range_t[1]
                else:
                    apply_noise = t >= self.noise_range_t
                if apply_noise:
                    g = torch.Generator()
                    g.manual_seed(self.noise_seed + t)
                    if self.noise_type == 'normal':
                        while True:
                            # resample if noise out of percent limit;
                            # brute force, but shouldn't spin much
                            noise = torch.randn(1, generator=g).item()
                            if abs(noise) < self.noise_pct:
                                break
                    else:
                        noise = 2 * (torch.rand(1, generator=g).item() - 0.5) * self.noise_pct
                    lrs = [v + v * noise for v in lrs]
            return lrs

    class StepLRScheduler(Scheduler):
        """Decay the LR by decay_rate every decay_t, with optional linear warmup."""

        def __init__(self,
                     optimizer: torch.optim.Optimizer,
                     decay_t: float,
                     decay_rate: float = 1.,
                     warmup_t=0,
                     warmup_lr_init=0,
                     t_in_epochs=True,
                     noise_range_t=None,
                     noise_pct=0.67,
                     noise_std=1.0,
                     noise_seed=42,
                     initialize=True) -> None:
            super().__init__(optimizer, param_group_field="lr",
                             noise_range_t=noise_range_t, noise_pct=noise_pct,
                             noise_std=noise_std, noise_seed=noise_seed,
                             initialize=initialize)
            self.decay_t = decay_t
            self.decay_rate = decay_rate
            self.warmup_t = warmup_t
            self.warmup_lr_init = warmup_lr_init
            self.t_in_epochs = t_in_epochs
            if self.warmup_t:
                self.warmup_steps = [(v - warmup_lr_init) / self.warmup_t for v in self.base_values]
                super().update_groups(self.warmup_lr_init)
            else:
                self.warmup_steps = [1 for _ in self.base_values]

        def _get_lr(self, t):
            if t < self.warmup_t:
                lrs = [self.warmup_lr_init + t * s for s in self.warmup_steps]
            else:
                lrs = [v * (self.decay_rate ** (t // self.decay_t)) for v in self.base_values]
            return lrs

        def get_epoch_values(self, epoch: int):
            if self.t_in_epochs:
                return self._get_lr(epoch)
            return None

        def get_update_values(self, num_updates: int):
            if not self.t_in_epochs:
                return self._get_lr(num_updates)
            return None

    scheduler = StepLRScheduler(
        optimizer,
        decay_t=2.4,
        decay_rate=0.97,
        warmup_lr_init=1e-6,
        warmup_t=3,
        noise_range_t=None,
        noise_pct=getattr(args, 'lr_noise_pct', 0.67),
        noise_std=getattr(args, 'lr_noise_std', 1.),
        noise_seed=getattr(args, 'seed', 42),
    )
    # scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=args.gamma)
    # milestone = np.ceil(np.arange(0, 300, 2.4))
    # scheduler = MultiStepLR(optimizer, milestones=[30,60,90,120,150,180,210,240,270], gamma=0.1)
    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    #####################################################################
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    if args.advprop:
        normalize = transforms.Lambda(lambda img: img * 2.0 - 1.0)
    else:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

    train_dataset = ImageFolder_iid(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(p=0.5),
            ImageNetPolicy(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        ImageFolder_iid(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    #####################################################################

    if args.evaluate:
        validate(val_loader, model, criterion, args)

    # Start training
    #####################################################################
    best_acc1 = 0  # NOTE: resets any best_acc1 restored from a resumed checkpoint
    teacher_name = ''
    student_name = ''
    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        if args.kd:
            if args.overhaul:
                train_with_overhaul(train_loader, d_net, optimizer, criterion, epoch, args)
                acc1 = validate_overhaul(val_loader, model, criterion, epoch, args)
            else:
                train_kd(train_loader, teacher, model, criterion, optimizer, epoch, args)
                acc1 = validate_kd(val_loader, teacher, model, criterion, args)
                teacher_name = teacher.module.__class__.__name__
        else:
            student_name = model.module.__class__.__name__
            train(train_loader, model, criterion, optimizer, epoch, args)
            acc1 = validate(val_loader, model, criterion, args)

        # remember best acc@1 and save checkpoint
        # writer.add_scalars('acc1', acc1, epoch)
        is_best = acc1 > best_acc1
        if acc1 < 65:
            print(colored('not saving... accuracy smaller than 65', 'green'))
            is_best = False
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best, teacher_name=teacher_name, student_name=student_name,
                save_path=args.save_path, acc=acc1)
        scheduler.step(epoch)
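# Quick numeric check of the schedule configured above (warmup_t=3,
# warmup_lr_init=1e-6, base lr 0.048, decay_rate=0.97 every decay_t=2.4
# epochs). This reproduces only the _get_lr() arithmetic of the
# StepLRScheduler defined above, so it runs standalone; a fractional
# decay_t works because t // 2.4 floors to whole decay periods.
base_lr, warmup_t, warmup_lr_init, decay_t, decay_rate = 0.048, 3, 1e-6, 2.4, 0.97
warmup_step = (base_lr - warmup_lr_init) / warmup_t
for t in range(8):
    if t < warmup_t:
        lr = warmup_lr_init + t * warmup_step          # linear warmup
    else:
        lr = base_lr * decay_rate ** (t // decay_t)    # stepped exponential decay
    print(t, round(lr, 6))
# epochs 0-2 ramp up; epochs 3 and 4 share one decay period (3//2.4 == 4//2.4 == 1.0)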
def __init__(self, config):
    super(Segtran2d, self).__init__(config)
    self.config = config
    self.device = config.device
    self.trans_in_dim = config.trans_in_dim
    self.trans_out_dim = config.trans_out_dim
    self.num_translayers = config.num_translayers
    self.bb_feat_upsize = config.bb_feat_upsize
    self.G = config.G
    self.voxel_fusion = SegtranFusionEncoder(config, 'Fusion')
    self.backbone_type = config.backbone_type
    self.use_pretrained = config.use_pretrained
    self.pos_embed_every_layer = config.pos_embed_every_layer

    if self.backbone_type.startswith('resnet'):
        self.backbone = resnet.__dict__[self.backbone_type](
            pretrained=self.use_pretrained,
            do_pool1=not self.bb_feat_upsize)
        print("%s created" % self.backbone_type)
    elif self.backbone_type.startswith('resibn'):
        mat = re.search(r"resibn(\d+)", self.backbone_type)
        backbone_type = 'resnet{}_ibn_a'.format(mat.group(1))
        self.backbone = resnet_ibn.__dict__[backbone_type](
            pretrained=self.use_pretrained,
            do_pool1=not self.bb_feat_upsize)
        print("%s created" % backbone_type)
    elif self.backbone_type.startswith('eff'):
        backbone_type = self.backbone_type.replace("eff", "efficientnet")
        stem_stride = 1 if self.bb_feat_upsize else 2
        advprop = True
        if self.use_pretrained:
            self.backbone = EfficientNet.from_pretrained(
                backbone_type, advprop=advprop,
                ignore_missing_keys=True, stem_stride=stem_stride)
        else:
            self.backbone = EfficientNet.from_name(backbone_type, stem_stride=stem_stride)
        print("{} created (stem_stride={}, advprop={})".format(
            backbone_type, stem_stride, advprop))

    self.in_fpn_use_bn = config.in_fpn_use_bn
    self.in_fpn_layers = config.in_fpn_layers
    self.in_fpn_scheme = config.in_fpn_scheme

    # FPN output resolution is determined by the smallest layer number
    # (the lowest layer).
    pool_stride = 2 ** np.min(self.in_fpn_layers)
    if not self.bb_feat_upsize:
        pool_stride *= 2
    self.mask_pool = nn.AvgPool2d((pool_stride, pool_stride))

    self.bb_feat_dims = config.bb_feat_dims
    self.in_fpn23_conv = nn.Conv2d(self.bb_feat_dims[2], self.bb_feat_dims[3], 1)
    self.in_fpn34_conv = nn.Conv2d(self.bb_feat_dims[3], self.bb_feat_dims[4], 1)
    # Default in_fpn_layers: 34. last_in_fpn_layer_idx: 4.
    last_in_fpn_layer_idx = self.in_fpn_layers[-1]
    if self.bb_feat_dims[last_in_fpn_layer_idx] != self.trans_in_dim:
        self.in_fpn_bridgeconv = nn.Conv2d(
            self.bb_feat_dims[last_in_fpn_layer_idx], self.trans_in_dim, 1)
    else:
        self.in_fpn_bridgeconv = nn.Identity()

    # in_bn3b/in_gn3b normalizes the output of in_fpn23_conv (layer-2
    # features projected up), so the feature dim = dim of layer 3.
    # in_bn4b/in_gn4b normalizes the output of in_fpn34_conv (layer-3
    # features projected up), so the feature dim = dim of layer 4.
    if self.in_fpn_use_bn:
        self.in_bn3b = nn.BatchNorm2d(self.bb_feat_dims[3])
        self.in_bn4b = nn.BatchNorm2d(self.bb_feat_dims[4])
        self.in_fpn_norms = [None, None, None, self.in_bn3b, self.in_bn4b]
    else:
        self.in_gn3b = nn.GroupNorm(self.G, self.bb_feat_dims[3])
        self.in_gn4b = nn.GroupNorm(self.G, self.bb_feat_dims[4])
        self.in_fpn_norms = [None, None, None, self.in_gn3b, self.in_gn4b]
    self.in_fpn_convs = [None, None, self.in_fpn23_conv, self.in_fpn34_conv]

    self.num_classes = config.num_classes
    self.num_modalities = config.num_modalities
    if self.num_modalities > 0:
        self.mod_fuse_conv = nn.Conv2d(self.num_modalities, 1, 1)

    self.out_fpn_use_bn = config.out_fpn_use_bn
    self.out_fpn_layers = config.out_fpn_layers
    self.out_fpn_scheme = config.out_fpn_scheme
    self.out_fpn_do_dropout = config.out_fpn_do_dropout
    self.posttrans_use_bn = config.posttrans_use_bn

    if self.out_fpn_layers != self.in_fpn_layers:
        self.do_out_fpn = True
        self.out_fpn12_conv = nn.Conv2d(self.bb_feat_dims[1], self.bb_feat_dims[2], 1)
        self.out_fpn23_conv = nn.Conv2d(self.bb_feat_dims[2], self.bb_feat_dims[3], 1)
        self.out_fpn34_conv = nn.Conv2d(self.bb_feat_dims[3], self.bb_feat_dims[4], 1)
        # Default in_fpn_layers: 34, out_fpn_layers: 1234. last_out_fpn_layer_idx: 3.
        last_out_fpn_layer_idx = self.out_fpn_layers[-len(self.in_fpn_layers)]
        if self.bb_feat_dims[last_out_fpn_layer_idx] != self.trans_out_dim:
            self.out_fpn_bridgeconv = nn.Conv2d(
                self.bb_feat_dims[last_out_fpn_layer_idx], self.trans_out_dim, 1)
        else:
            self.out_fpn_bridgeconv = nn.Identity()

        # out_bn2b/out_gn2b normalizes the output of out_fpn12_conv (layer-1
        # features projected up), so the feature dim = dim of layer 2.
        # out_bn3b/out_gn3b normalizes the output of out_fpn23_conv (layer-2
        # features projected up), so the feature dim = dim of layer 3.
        if self.out_fpn_use_bn:
            self.out_bn2b = nn.BatchNorm2d(self.bb_feat_dims[2])
            self.out_bn3b = nn.BatchNorm2d(self.bb_feat_dims[3])
            self.out_bn4b = nn.BatchNorm2d(self.bb_feat_dims[4])
            self.out_fpn_norms = [None, None, self.out_bn2b, self.out_bn3b, self.out_bn4b]
        else:
            self.out_gn2b = nn.GroupNorm(self.G, self.bb_feat_dims[2])
            self.out_gn3b = nn.GroupNorm(self.G, self.bb_feat_dims[3])
            self.out_gn4b = nn.GroupNorm(self.G, self.bb_feat_dims[4])
            self.out_fpn_norms = [None, None, self.out_gn2b, self.out_gn3b, self.out_gn4b]
        self.out_fpn_convs = [None, self.out_fpn12_conv, self.out_fpn23_conv, self.out_fpn34_conv]
        self.out_conv = nn.Conv2d(self.trans_out_dim, self.num_classes, 1)
        self.out_fpn_dropout = nn.Dropout(config.hidden_dropout_prob)
    else:
        # out_fpn_layers == in_fpn_layers: no FPN needed at the output end;
        # output class scores directly.
        self.do_out_fpn = False
        if '2' in self.in_fpn_layers:
            # Output resolution is already 1/4 of the input; no upsampling needed.
            self.out_conv = nn.Conv2d(config.trans_out_dim, self.num_classes, 1)
        else:
            # Output resolution is 1/8 of the input; upsample by 2x.
            self.out_conv = nn.ConvTranspose2d(config.trans_out_dim, self.num_classes, 2, 2)

    self.apply(self.init_weights)
    # tie_qk() has to be executed after weight initialization.
    self.apply(self.tie_qk)
    self.apply(self.add_identity_bias)

    # Initialize mod_fuse_conv weights and bias so that all modalities
    # start with equal weights.
    if self.num_modalities > 0:
        self.mod_fuse_conv.weight.data.fill_(1 / self.num_modalities)
        self.mod_fuse_conv.bias.data.zero_()

    self.scales_printed = False
    self.translayer_dims = config.translayer_dims
    self.num_vis_layers = 1 + 2 * self.num_translayers
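# Worked example of the mask-pooling stride above, assuming in_fpn_layers
# is a list of ints (the default per the comments is [3, 4]): with
# bb_feat_upsize=False, pool_stride = 2**min([3, 4]) * 2 = 16, matching a
# backbone whose layer-3 feature map is 1/16 of the input resolution.
import numpy as np
import torch.nn as nn

in_fpn_layers = [3, 4]           # hypothetical config value
bb_feat_upsize = False
pool_stride = 2 ** int(np.min(in_fpn_layers))
if not bb_feat_upsize:
    pool_stride *= 2
mask_pool = nn.AvgPool2d((pool_stride, pool_stride))  # 16x16 average pooling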
def __init__(self, config):
    super(Segtran25d, self).__init__(config)
    self.config = config
    self.device = config.device
    self.orig_in_channels = config.orig_in_channels
    self.trans_in_dim = config.trans_in_dim
    self.trans_out_dim = config.trans_out_dim
    self.num_translayers = config.num_translayers
    self.bb_feat_upsize = config.bb_feat_upsize
    self.G = config.G
    self.voxel_fusion = SegtranFusionEncoder(config, 'Fusion')
    self.backbone_type = config.backbone_type
    self.use_pretrained = config.use_pretrained
    self.pos_embed_every_layer = config.pos_embed_every_layer

    if self.backbone_type.startswith('resnet'):
        self.backbone = resnet.__dict__[self.backbone_type](
            pretrained=self.use_pretrained,
            do_pool1=not self.bb_feat_upsize)
        print("%s created" % self.backbone_type)
    elif self.backbone_type.startswith('resibn'):
        mat = re.search(r"resibn(\d+)", self.backbone_type)
        backbone_type = 'resnet{}_ibn_a'.format(mat.group(1))
        self.backbone = resnet_ibn.__dict__[backbone_type](
            pretrained=self.use_pretrained,
            do_pool1=not self.bb_feat_upsize)
        print("%s created" % backbone_type)
    elif self.backbone_type.startswith('eff'):
        backbone_type = self.backbone_type.replace("eff", "efficientnet")
        stem_stride = 1 if self.bb_feat_upsize else 2
        advprop = True
        if self.use_pretrained:
            self.backbone = EfficientNet.from_pretrained(
                backbone_type, advprop=advprop,
                ignore_missing_keys=True, stem_stride=stem_stride)
        else:
            self.backbone = EfficientNet.from_name(backbone_type, stem_stride=stem_stride)
        print("{} created (stem_stride={}, advprop={})".format(
            backbone_type, stem_stride, advprop))

    self.inchan_to3_scheme = config.inchan_to3_scheme
    self.D_groupsize = config.D_groupsize
    self.eff_in_channels = self.orig_in_channels * self.D_groupsize
    self.D_pool_K = config.D_pool_K
    self.out_fpn_upsampleD_scheme = config.out_fpn_upsampleD_scheme
    self.input_scale = config.input_scale

    # For brats, eff_in_channels = 4 (4 modalities, D_groupsize = 1).
    if self.eff_in_channels != 3:
        if self.inchan_to3_scheme == 'avgto3':
            if self.eff_in_channels == 2:
                self.in_bridge_to3 = nn.Linear(2, 3, bias=False)
                in_avg_2to3_weight = torch.tensor([[1, 0], [0.5, 0.5], [0, 1]])
                self.in_bridge_to3.weight.data.copy_(in_avg_2to3_weight)
            elif self.eff_in_channels == 4:
                self.in_bridge_to3 = nn.Linear(4, 3, bias=False)
                in_avg_4to3_weight = torch.tensor([[1, 0, 0, 0],
                                                   [0, 0.5, 0.5, 0],
                                                   [0, 0, 0, 1]])
                self.in_bridge_to3.weight.data.copy_(in_avg_4to3_weight)
            else:
                raise NotImplementedError(
                    "'avgto3' is only for effective channels == 2 or 4, not {}"
                    .format(self.eff_in_channels))
            self.in_bridge_to3.weight.requires_grad = False
        elif self.eff_in_channels == 1 and self.inchan_to3_scheme == 'dup3':
            self.in_bridge_to3 = lambda x: x.expand(-1, 3, -1, -1, -1)
        elif self.inchan_to3_scheme == 'bridgeconv':
            self.in_bridge_to3 = nn.Conv3d(self.eff_in_channels, 3, 1)
        # stemconv is only applicable for efficientnet.
        elif self.eff_in_channels > 3 and self.inchan_to3_scheme == 'stemconv':
            if self.backbone_type.startswith('eff'):
                self.backbone._change_in_channels(4, keep_RGB_weight=True)
                self.in_bridge_to3 = nn.Identity()
            else:
                raise NotImplementedError(
                    "Changing stemconv channel number is not supported for {}"
                    .format(self.backbone_type))
        else:
            raise NotImplementedError(
                "Effective input channel size={}*{} is not supported for scheme '{}'"
                .format(self.orig_in_channels, self.D_groupsize, self.inchan_to3_scheme))

    self.in_fpn_use_bn = config.in_fpn_use_bn
    self.in_fpn_layers = config.in_fpn_layers
    self.in_fpn_scheme = config.in_fpn_scheme

    # FPN output resolution is determined by the smallest layer number
    # (the lowest layer).
    pool_stride = 2 ** np.min(self.in_fpn_layers)
    if not self.bb_feat_upsize:
        pool_stride *= 2
    self.mask_pool = nn.AvgPool2d((pool_stride, pool_stride))

    self.bb_feat_dims = config.bb_feat_dims
    self.in_fpn23_conv = nn.Conv2d(self.bb_feat_dims[2], self.bb_feat_dims[3], 1)
    self.in_fpn34_conv = nn.Conv2d(self.bb_feat_dims[3], self.bb_feat_dims[4], 1)
    # Default in_fpn_layers: 34. last_in_fpn_layer_idx: 4.
    last_in_fpn_layer_idx = self.in_fpn_layers[-1]
    if self.bb_feat_dims[last_in_fpn_layer_idx] != self.trans_in_dim:
        self.in_fpn_bridgeconv = nn.Conv2d(
            self.bb_feat_dims[last_in_fpn_layer_idx], self.trans_in_dim, 1)
    else:
        self.in_fpn_bridgeconv = nn.Identity()

    # in_bn3b/in_gn3b normalizes the output of in_fpn23_conv (layer-2
    # features projected up), so the feature dim = dim of layer 3.
    # in_bn4b/in_gn4b normalizes the output of in_fpn34_conv (layer-3
    # features projected up), so the feature dim = dim of layer 4.
    if self.in_fpn_use_bn:
        self.in_bn3b = nn.BatchNorm2d(self.bb_feat_dims[3])
        self.in_bn4b = nn.BatchNorm2d(self.bb_feat_dims[4])
        self.in_fpn_norms = [None, None, None, self.in_bn3b, self.in_bn4b]
    else:
        self.in_gn3b = nn.GroupNorm(self.G, self.bb_feat_dims[3])
        self.in_gn4b = nn.GroupNorm(self.G, self.bb_feat_dims[4])
        self.in_fpn_norms = [None, None, None, self.in_gn3b, self.in_gn4b]
    self.in_fpn_convs = [None, None, self.in_fpn23_conv, self.in_fpn34_conv]

    self.num_classes = config.num_classes
    self.out_fpn_use_bn = config.out_fpn_use_bn
    self.out_fpn_layers = config.out_fpn_layers
    self.out_fpn_scheme = config.out_fpn_scheme
    self.out_fpn_do_dropout = config.out_fpn_do_dropout

    if self.out_fpn_layers != self.in_fpn_layers:
        self.do_out_fpn = True
        self.out_fpn12_conv3d = nn.Conv3d(self.bb_feat_dims[1], self.bb_feat_dims[2], 1)
        self.out_fpn23_conv3d = nn.Conv3d(self.bb_feat_dims[2], self.bb_feat_dims[3], 1)
        self.out_fpn34_conv3d = nn.Conv3d(self.bb_feat_dims[3], self.bb_feat_dims[4], 1)
        last_out_fpn_layer = self.out_fpn_layers[-len(self.in_fpn_layers)]
        self.out_fpn_bridgeconv3d = nn.Conv3d(
            self.bb_feat_dims[last_out_fpn_layer], self.trans_out_dim, 1)
        if self.out_fpn_upsampleD_scheme == 'conv':
            self.out_feat_dim = self.trans_out_dim // self.D_pool_K
            self.out_fpn_upsampleD = nn.Conv3d(
                self.trans_out_dim, self.out_feat_dim * self.D_pool_K, 1)
        else:
            self.out_feat_dim = self.trans_out_dim

        # out_bn2b/out_gn2b normalizes the output of out_fpn12_conv3d (layer-1
        # features projected up), so the feature dim = dim of layer 2.
        # out_bn3b/out_gn3b normalizes the output of out_fpn23_conv3d (layer-2
        # features projected up), so the feature dim = dim of layer 3.
        if self.out_fpn_use_bn:
            self.out_bn2b = nn.BatchNorm3d(self.bb_feat_dims[2])
            self.out_bn3b = nn.BatchNorm3d(self.bb_feat_dims[3])
            self.out_bn4b = nn.BatchNorm3d(self.bb_feat_dims[4])
            self.out_fpn_norms = [None, None, self.out_bn2b, self.out_bn3b, self.out_bn4b]
        else:
            self.out_gn2b = nn.GroupNorm(self.G, self.bb_feat_dims[2])
            self.out_gn3b = nn.GroupNorm(self.G, self.bb_feat_dims[3])
            self.out_gn4b = nn.GroupNorm(self.G, self.bb_feat_dims[4])
            self.out_fpn_norms = [None, None, self.out_gn2b, self.out_gn3b, self.out_gn4b]
        self.out_fpn_convs = [None, self.out_fpn12_conv3d,
                              self.out_fpn23_conv3d, self.out_fpn34_conv3d]
        self.out_conv3d = nn.Conv3d(self.out_feat_dim, self.num_classes, 1)
        self.out_fpn_dropout = nn.Dropout(config.hidden_dropout_prob)
    else:
        # out_fpn_layers == in_fpn_layers: no FPN needed at the output end;
        # output class scores directly.
        self.do_out_fpn = False
        if '2' in self.in_fpn_layers:
            # Output resolution is already 1/4 of the input; no upsampling needed.
            self.out_conv3d = nn.Conv3d(config.trans_out_dim, self.num_classes, 1)
        else:
            # Output resolution is 1/8 of the input; upsample H and W by 2x.
            self.out_conv3d = nn.ConvTranspose3d(config.trans_out_dim, self.num_classes,
                                                 (2, 2, 1), (2, 2, 1))

    self.apply(self.init_weights)
    # tie_qk() has to be executed after weight initialization.
    self.apply(self.tie_qk)
    self.apply(self.add_identity_bias)
    self.scales_printed = False
    self.translayer_dims = config.translayer_dims
    self.num_vis_layers = 1 + 2 * self.num_translayers
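# Self-contained sketch of the 'avgto3' bridge above: a frozen nn.Linear
# averages the two middle channels of a 4-channel input down to 3
# "RGB-like" channels. nn.Linear acts on the last dim, so channels are
# moved last before applying it. A 4-D toy tensor is used for brevity;
# the 2.5-D model applies the same bridge to volumetric input.
import torch
import torch.nn as nn

bridge = nn.Linear(4, 3, bias=False)
bridge.weight.data.copy_(torch.tensor([[1, 0, 0, 0],
                                       [0, 0.5, 0.5, 0],
                                       [0, 0, 0, 1]], dtype=torch.float))
bridge.weight.requires_grad = False  # fixed averaging, not learned

x = torch.randn(2, 4, 8, 8)                             # (N, C=4, H, W)
y = bridge(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)   # (N, 3, H, W)
assert torch.allclose(y[:, 1], 0.5 * (x[:, 1] + x[:, 2]))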
def main_worker(gpu, ngpus_per_node, args):
    global best_acc1
    args.gpu = gpu
    if args.gpu is not None:
        print("Use GPU: {} for training".format(args.gpu))

    if args.distributed:
        if args.dist_url == "env://" and args.rank == -1:
            args.rank = int(os.environ["RANK"])
        if args.multiprocessing_distributed:
            # For multiprocessing distributed training, rank needs to be the
            # global rank among all the processes.
            args.rank = args.rank * ngpus_per_node + gpu
        dist.init_process_group(backend=args.dist_backend, init_method=args.dist_url,
                                world_size=args.world_size, rank=args.rank)

    # create model
    #####################################################################
    if args.pretrained:
        if args.arch.startswith('efficientnet-b'):
            print('=> using pre-trained {}'.format(args.arch))
            model = EfficientNet.from_pretrained(args.arch, advprop=args.advprop)
        else:
            print("=> using pre-trained model '{}'".format(args.arch))
            model = models.__dict__[args.arch](pretrained=True)
    else:
        if args.arch.startswith('efficientnet-b'):
            print("=> creating model {}".format(args.arch))
            model = EfficientNet.from_name(args.arch)
        elif args.arch.startswith('Dense'):
            print("=> creating model {}".format(args.arch))
            model = DenseNet40()
        else:
            print("=> creating model '{}'".format(args.arch))
            model = models.__dict__[args.arch]()

    # create teacher model
    if args.kd:
        print('=> loading teacher model')
        if args.teacher_arch.startswith('efficientnet-b'):
            teacher = EfficientNet.from_pretrained(args.teacher_arch)
            teacher.eval()
            print('=> {} loaded'.format(args.teacher_arch))
        elif args.teacher_arch.startswith('resnext101_32'):
            teacher = torch.hub.load('facebookresearch/WSL-Images',
                                     '{}_wsl'.format(args.teacher_arch))
            teacher.eval()
            print('=> {} loaded'.format(args.teacher_arch))
        elif args.overhaul:
            teacher = resnet.resnet152(pretrained=True)
        else:
            teacher = models.__dict__[args.teacher_arch](pretrained=True)
            teacher.eval()
            print('=> {} loaded'.format(args.teacher_arch))
        if args.overhaul:
            print('=> using overhaul distillation')
            d_net = Distiller(teacher, model)

    if args.distributed:
        if args.gpu is not None:
            torch.cuda.set_device(args.gpu)
            model.cuda(args.gpu)
            # When using a single GPU per process, divide the batch size and
            # workers across the processes on this node.
            args.batch_size = int(args.batch_size / ngpus_per_node)
            args.workers = int((args.workers + ngpus_per_node - 1) / ngpus_per_node)
            model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        else:
            model.cuda()
            model = torch.nn.parallel.DistributedDataParallel(model)
    elif args.gpu is not None:
        torch.cuda.set_device(args.gpu)
        model = model.cuda(args.gpu)
    else:
        # DataParallel will divide and allocate batch_size to all available GPUs.
        if args.arch.startswith('alexnet') or args.arch.startswith('vgg'):
            model.features = torch.nn.DataParallel(model.features)
            model.cuda()  # the source called .cuda() twice here; once suffices
        else:
            model = torch.nn.DataParallel(model).cuda()

    if args.kd:
        teacher = torch.nn.DataParallel(teacher).cuda()
        if args.overhaul:
            d_net = torch.nn.DataParallel(d_net).cuda()

    if args.pretrained:
        if args.arch.startswith('efficientnet-b'):
            loc = 'cuda:{}'.format(args.gpu)
            checkpoint = torch.load(args.pth_path, map_location=loc)
            model.load_state_dict(checkpoint['state_dict'])
    #####################################################################

    # define loss function (criterion), optimizer and scheduler
    #####################################################################
    if args.kd:
        criterion = kd_criterion
        if args.overhaul:
            criterion = nn.CrossEntropyLoss().cuda(args.gpu)
    else:
        criterion = nn.CrossEntropyLoss().cuda(args.gpu)

    if args.overhaul:
        optimizer = torch.optim.SGD(
            list(model.parameters()) + list(d_net.module.Connectors.parameters()),
            args.lr, momentum=args.momentum, weight_decay=args.weight_decay)  # nesterov
    else:
        optimizer = torch.optim.SGD(model.parameters(), args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)

    # NOTE: the assignments below overwrite the optimizer/scheduler chosen
    # above; only the last pair (RMSpropTF + StepLRScheduler) takes effect.
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr, betas=(0.9, 0.999),
                                  eps=1e-08, weight_decay=args.weight_decay, amsgrad=False)
    scheduler = CosineAnnealingLR(optimizer,
                                  T_max=args.epochs * int(1281167 / args.batch_size),
                                  eta_min=0, last_epoch=-1)
    args.lr = 0.048
    args.batch_size = 384
    parameters = add_weight_decay(model, 1e-5)
    optimizer = RMSpropTF(parameters, lr=0.048, alpha=0.9, eps=0.001,
                          momentum=args.momentum, weight_decay=1e-5)
    scheduler = StepLRScheduler(
        optimizer,
        decay_t=2.4,
        decay_rate=0.97,
        warmup_lr_init=1e-6,
        warmup_t=3,
        noise_range_t=None,
        noise_pct=getattr(args, 'lr_noise_pct', 0.67),
        noise_std=getattr(args, 'lr_noise_std', 1.),
        noise_seed=getattr(args, 'seed', 42),
    )
    # scheduler = MultiStepLR(optimizer, milestones=args.schedule, gamma=args.gamma)
    # milestone = np.ceil(np.arange(0, 300, 2.4))
    # scheduler = MultiStepLR(optimizer, milestones=[30,60,90,120,150,180,210,240,270], gamma=0.1)

    # optionally resume from a checkpoint
    if args.resume:
        if os.path.isfile(args.resume):
            print("=> loading checkpoint '{}'".format(args.resume))
            if args.gpu is None:
                checkpoint = torch.load(args.resume)
            else:
                # Map model to be loaded to specified single gpu.
                loc = 'cuda:{}'.format(args.gpu)
                checkpoint = torch.load(args.resume, map_location=loc)
            args.start_epoch = checkpoint['epoch']
            best_acc1 = checkpoint['best_acc1']
            if args.gpu is not None:
                # best_acc1 may be from a checkpoint from a different GPU
                best_acc1 = best_acc1.to(args.gpu)
            model.load_state_dict(checkpoint['state_dict'])
            optimizer.load_state_dict(checkpoint['optimizer'])
            print("=> loaded checkpoint '{}' (epoch {})".format(
                args.resume, checkpoint['epoch']))
        else:
            print("=> no checkpoint found at '{}'".format(args.resume))

    cudnn.benchmark = True

    # Data loading code
    #####################################################################
    traindir = os.path.join(args.data, 'train')
    valdir = os.path.join(args.data, 'val')
    if args.advprop:
        normalize = transforms.Lambda(lambda img: img * 2.0 - 1.0)
    else:
        normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                         std=[0.229, 0.224, 0.225])

    train_dataset = ImageFolder_iid(
        traindir,
        transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(p=0.5),
            ImageNetPolicy(),
            transforms.ToTensor(),
            normalize,
        ]))

    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset)
    else:
        train_sampler = None

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=args.batch_size, shuffle=(train_sampler is None),
        num_workers=args.workers, pin_memory=True, sampler=train_sampler)

    val_loader = torch.utils.data.DataLoader(
        ImageFolder_iid(valdir, transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            normalize,
        ])),
        batch_size=args.batch_size, shuffle=False,
        num_workers=args.workers, pin_memory=True)
    #####################################################################

    if args.evaluate:
        validate(val_loader, model, criterion, args)

    # Start training
    #####################################################################
    best_acc1 = 0  # NOTE: resets any best_acc1 restored from a resumed checkpoint
    teacher_name = ''
    student_name = ''
    ema = EMA(model, 0.9999)
    ema.register()

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)

        # train for one epoch
        if args.kd:
            if args.overhaul:
                train_with_overhaul(train_loader, d_net, optimizer, criterion, epoch, args)
                acc1 = validate_overhaul(val_loader, model, criterion, epoch, args)
            else:
                train_kd(train_loader, teacher, model, criterion, optimizer, epoch, args)
                acc1 = validate_kd(val_loader, teacher, model, criterion, args)
                teacher_name = teacher.module.__class__.__name__
        else:
            student_name = model.module.__class__.__name__
            train(train_loader, model, criterion, optimizer, epoch, args, ema)
            acc1 = validate(val_loader, model, criterion, args, ema)

        # remember best acc@1 and save checkpoint
        # writer.add_scalars('acc1', acc1, epoch)
        is_best = acc1 > best_acc1
        if acc1 < 65:
            print(colored('not saving... accuracy smaller than 65', 'green'))
            is_best = False
        best_acc1 = max(acc1, best_acc1)

        if not args.multiprocessing_distributed or (args.multiprocessing_distributed
                                                    and args.rank % ngpus_per_node == 0):
            save_checkpoint({
                'epoch': epoch + 1,
                'arch': args.arch,
                'state_dict': model.state_dict(),
                'best_acc1': best_acc1,
                'optimizer': optimizer.state_dict(),
            }, is_best, teacher_name=teacher_name, student_name=student_name,
                save_path=args.save_path, acc=acc1)
        scheduler.step(epoch)
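# The EMA helper used above is defined elsewhere in the repo; this is only
# a minimal sketch of the shadow-weight pattern its register() call (and the
# ema argument passed into train/validate) implies, not the repo's actual
# implementation. register() snapshots the parameters; update() would be
# called after each optimizer step to blend in the new weights.
import torch

class EMA:
    def __init__(self, model, decay):
        self.model = model
        self.decay = decay
        self.shadow = {}

    def register(self):
        # Snapshot all trainable parameters as the initial shadow weights.
        for name, p in self.model.named_parameters():
            if p.requires_grad:
                self.shadow[name] = p.data.clone()

    def update(self):
        # shadow = decay * shadow + (1 - decay) * current
        with torch.no_grad():
            for name, p in self.model.named_parameters():
                if p.requires_grad:
                    self.shadow[name].mul_(self.decay).add_(p.data, alpha=1 - self.decay)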
def __init__(
    self,
    backbone: str,
    n_output,          # sequence of four class counts, one per head (indexed 0..3)
    input_channels: int = 3,
    pretrained: bool = True,
    activation=None,
):
    """The aggregation model over different predefined architectures.

    Args:
        backbone: model architecture to use, one of (resnet18 | resnet34 |
            densenet121 | se_resnext50_32x4d | se_resnext101_32x4d |
            efficientnet-b0 .. efficientnet-b6)
        n_output: number of classes to predict for each of the four heads
        input_channels: number of channels of the input image
        pretrained: whether to use weights pretrained on ImageNet or
            random initialization
        activation: a callable applied at the very end
    """
    super(ClassificationModel, self).__init__()
    self.backbone = backbone

    if backbone == "se_resnext50_32x4d":
        self.encoder = pretrainedmodels.se_resnext50_32x4d(
            pretrained="imagenet" if pretrained else None)
    elif backbone == "se_resnext101_32x4d":
        self.encoder = pretrainedmodels.se_resnext101_32x4d(
            pretrained="imagenet" if pretrained else None)

    avgpool = nn.AdaptiveAvgPool2d(1)

    if backbone in ("se_resnext50_32x4d", "se_resnext101_32x4d"):
        if input_channels != 3:
            conv = nn.Conv2d(input_channels, 64, kernel_size=(7, 7),
                             stride=(2, 2), padding=(3, 3), bias=False)
            # Initialize the new stem by summing the pretrained RGB filters
            # and repeating them across the new input channels.
            conv.weight.data = (
                self.encoder.layer0.conv1.weight.data.sum(dim=1)
                .unsqueeze(1)
                .repeat_interleave(input_channels, dim=1)
            )
            self.encoder.layer0.conv1 = conv
        self.encoder.avg_pool = avgpool
        in_features = self.encoder.last_linear.in_features
        self.encoder.last_linear = nn.Identity()
    elif backbone.startswith("efficientnet"):
        self.encoder = EfficientNet.from_pretrained(backbone, advprop=True)
        if input_channels != 3:
            self.encoder._conv_stem = nn.Conv2d(
                input_channels,
                self.encoder._conv_stem.out_channels,
                kernel_size=(3, 3),
                stride=(2, 2),
                padding=(3, 3),
                bias=False,
            )
        self.encoder._avg_pooling = avgpool
        in_features = self.encoder._fc.in_features
        self.encoder._fc = nn.Identity()

    # Four independent classification heads over the shared encoder features.
    def make_head(n_out):
        return nn.Sequential(
            nn.Dropout(0.2),
            nn.Linear(in_features, 1024),
            nn.LeakyReLU(0.1),
            nn.BatchNorm1d(num_features=1024),
            nn.Linear(1024, n_out),
        )

    self.fc0 = make_head(n_output[0])
    self.fc1 = make_head(n_output[1])
    self.fc2 = make_head(n_output[2])
    self.fc3 = make_head(n_output[3])
    self.activation = activation
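# Only __init__ is shown above; a hedged forward sketch consistent with it
# (the encoder's own classifier was replaced with nn.Identity(), so calling
# the encoder yields pooled features that feed the four heads). The actual
# forward may differ.
def forward(self, x):
    feat = self.encoder(x)  # (N, in_features) pooled features
    outs = [self.fc0(feat), self.fc1(feat), self.fc2(feat), self.fc3(feat)]
    if self.activation is not None:
        outs = [self.activation(o) for o in outs]
    return outs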