# Data
print('==> Preparing data..')
data_tmp = imagenet.Data(args)
train_loader = data_tmp.trainLoader
val_loader = data_tmp.testLoader

# Architecture
if args.arch == 'resnet':
    origin_model = import_module(f'model.{args.arch}').resnet(args.cfg).to(device)
else:
    # raising a plain string is invalid in Python 3; raise an exception instance instead
    raise ValueError('arch not exist!')

# Calculate FLOPs of origin model
input = torch.randn(1, 3, 224, 224).to(device)
oriflops, oriparams = profile(origin_model, inputs=(input, ))

# Based on the trained class-wise mask, perform global voting to obtain pruned model
def build_resnet_pruned_model(origin_model):

    pruning_rate_now = 0
    channel_prune_rate = 0.9
    num_mask_cfg = {'resnet50': 48}

    while pruning_rate_now < args.pruning_rate:

        score = []
        index_cfg = []
        block_index_cfg = []
        layer_cfg = []
        block_cfg = []
        # stage1
        [[3, 16, 16, 0, 1]],
        # stage2
        [[3, 48, 24, 0, 2]],
        [[3, 72, 24, 0, 1]],
        # stage3
        [[5, 72, 40, 0.25, 2]],
        [[5, 120, 40, 0.25, 1]],
        # stage4
        [[3, 240, 80, 0, 2]],
        [[3, 200, 80, 0, 1],
         [3, 184, 80, 0, 1],
         [3, 184, 80, 0, 1],
         [3, 480, 112, 0.25, 1],
         [3, 672, 112, 0.25, 1]],
        # stage5
        [[5, 672, 160, 0.25, 2]],
        [[5, 960, 160, 0, 1],
         [5, 960, 160, 0.25, 1],
         [5, 960, 160, 0, 1],
         [5, 960, 160, 0.25, 1]]
    ]
    return GhostNet(cfgs, **kwargs)


if __name__ == '__main__':
    model = ghostnet()
    model.eval()
    print('[1] ', model)
    # input = torch.randn(1, 3, 320, 256)
    input = torch.randn(1, 3, 224, 224)
    y = model(input)
    macs, params = profile(model, inputs=(input, ))
    print('[3] flops=', macs, ', param=', params)
        x = self.dilated2_6(x)
        x = self.asymmetric2_7(x)
        x = self.dilated2_8(x)
        x_dsn = self.dsn(x)

        # Stage 4 - Decoder
        x = self.head(x, self.recurrence)
        x = F.interpolate(input=x, size=(h, w), mode='bilinear', align_corners=True)
        x_dsn = F.interpolate(input=x_dsn, size=(h, w), mode='bilinear', align_corners=True)
        return [x, x_dsn]


def get_eccnet(gpu_ids=1, ema=False, num_classes=1):
    net = ECCNet(num_classes=num_classes, recurrence=1)
    if ema:
        for param in net.parameters():
            param.detach_()
    return init_network(net, gpu_ids)


if __name__ == '__main__':
    model = ECCNet(num_classes=1)
    print(model)
    from thop import profile, clever_format
    input = torch.randn(1, 3, 1024, 1024)
    flops, params = profile(model, inputs=(input,), verbose=False)
    flops, params = clever_format([flops, params], "%.3f")
    print('flops: ', flops, "params: ", params)
                layers.append(block(self.cur_channel, c, s, t))
            else:
                layers.append(block(self.cur_channel, c, 1, t))
            self.cur_channel = c
        return nn.Sequential(*layers)

    def forward(self, x):
        x = self.conv1(x)
        x = self.dwconv1(x)
        x = self.layers(x)
        x = self.conv2(x)
        # x = self.output_layer(x)
        x = self.linear7(x)
        sig_x = x
        x = self.linear1(x)
        x = x.view(x.size(0), -1)
        return x, sig_x


if __name__ == "__main__":
    input = torch.Tensor(1, 3, 112, 112)
    model = MobileFace(use_cbam=True)
    flops, params = thop.profile(model, inputs=(input, ))
    flops, params = thop.clever_format([flops, params], "%.3f")
    print(flops, params)
    # model = model.eval()
    # out = model(input)
    # print(out.shape)
def main(logger, args):
    if not torch.cuda.is_available():
        raise Exception("need gpu to train network!")

    if args.seed is not None:
        random.seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True

    gpus = torch.cuda.device_count()
    logger.info(f'use {gpus} gpus')
    logger.info(f"args: {args}")

    cudnn.benchmark = True
    cudnn.enabled = True
    start_time = time.time()

    # dataset and dataloader
    logger.info('start loading data')
    train_loader = DataLoader(Config.train_dataset,
                              batch_size=args.batch_size,
                              shuffle=True,
                              num_workers=args.num_workers)
    val_loader = DataLoader(Config.val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers)
    logger.info('finish loading data')

    logger.info(f"creating model '{args.network}'")
    model = resnet50(**{
        "pretrained": args.pretrained,
        "num_classes": args.num_classes,
    })

    flops_input = torch.randn(1, 3, args.input_image_size,
                              args.input_image_size)
    flops, params = profile(model, inputs=(flops_input, ))
    flops, params = clever_format([flops, params], "%.3f")
    logger.info(f"model: '{args.network}', flops: {flops}, params: {params}")

    for name, param in model.named_parameters():
        logger.info(f"{name},{param.requires_grad}")

    model = model.cuda()
    criterion = nn.CrossEntropyLoss().cuda()
    optimizer = torch.optim.SGD(model.parameters(),
                                args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.milestones, gamma=0.1)

    if args.apex:
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')

    model = nn.DataParallel(model)

    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            raise Exception(
                f"{args.resume} is not a file, please check it again")
        logger.info('start only evaluating')
        logger.info(f"start resuming model from {args.evaluate}")
        checkpoint = torch.load(args.evaluate,
                                map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['model_state_dict'])
        acc1, acc5, throughput = validate(val_loader, model, args)
        logger.info(
            f"epoch {checkpoint['epoch']:0>3d}, top1 acc: {acc1:.2f}%, top5 acc: {acc5:.2f}%, throughput: {throughput:.2f}sample/s"
        )
        return

    start_epoch = 1
    # resume training
    if os.path.exists(args.resume):
        logger.info(f"start resuming model from {args.resume}")
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        start_epoch += checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        logger.info(
            f"finish resuming model from {args.resume}, epoch {checkpoint['epoch']}, "
            f"loss: {checkpoint['loss']:3f}, lr: {checkpoint['lr']:.6f}, "
            f"top1_acc: {checkpoint['acc1']}%")

    if not os.path.exists(args.checkpoints):
        os.makedirs(args.checkpoints)

    logger.info('start training')
    for epoch in range(start_epoch, args.epochs + 1):
        acc1, acc5, losses = train(train_loader, model, criterion, optimizer,
                                   scheduler, epoch, logger, args)
        logger.info(
            f"train: epoch {epoch:0>3d}, top1 acc: {acc1:.2f}%, top5 acc: {acc5:.2f}%, losses: {losses:.2f}"
        )

        acc1, acc5, throughput = validate(val_loader, model, args)
        logger.info(
            f"val: epoch {epoch:0>3d}, top1 acc: {acc1:.2f}%, top5 acc: {acc5:.2f}%, throughput: {throughput:.2f}sample/s"
        )

        # remember best prec@1 and save checkpoint
        torch.save(
            {
                'epoch': epoch,
                'acc1': acc1,
                'loss': losses,
                'lr': scheduler.get_lr()[0],
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'scheduler_state_dict': scheduler.state_dict(),
            }, os.path.join(args.checkpoints, 'latest.pth'))

        if epoch == args.epochs:
            torch.save(
                model.module.state_dict(),
                os.path.join(
                    args.checkpoints,
                    "{}-epoch{}-acc{}.pth".format(args.network, epoch, acc1)))

    training_time = (time.time() - start_time) / 3600
    logger.info(
        f"finish training, total training time: {training_time:.2f} hours")
    test_loss = test_model(model=model, test_loader=test_loader)
    missing_entity_test_loss = test_model(
        model=model, test_loader=missing_entity_test_loader)
    print('epoch:', epoch, 'test_loss:', test_loss)
    print('epoch:', epoch, 'missing_entity_test_loss:',
          missing_entity_test_loss)
    logger.log_scalar('test_loss', test_loss, step=train_step)
    logger.log_scalar('missing_entity_test_loss',
                      missing_entity_test_loss,
                      step=train_step)
    mylogger.log(seed=seed, tag='test_loss', value=test_loss)
    mylogger.log(seed=seed,
                 tag='missing_entity_test_loss',
                 value=missing_entity_test_loss)
    if train_step == 0:
        flops, params = profile(model=model, inputs=(data, ))
        print(flops)
        print(params)
    train_step += 1

model.eval()
for data in missing_entity_test_loader1:
    with torch.no_grad():
        out = model(data)
        print(out)
        print(data.x)
        print(data.y[:, :2])
        loss_vec = F.mse_loss(out * data.y[:, 2:],
                              data.y[:, :2] * data.y[:, 2:],
                              reduce=False)
        loss = torch.sum(torch.sum(loss_vec, dim=1)) / test_size
def get_profile(model, dataset, output):
    """ Params, """
    data = dataset.X[0].unsqueeze(0)
    macs, params = profile(model, (data, ), verbose=False)
    Print("Params(M): %.3f" % (params / 10**6), output)
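# For comparison, a minimal self-contained variant of the same idea without the
# repo-specific `dataset` / `Print` helpers. The model and sample shape below are
# placeholders chosen for illustration, not names from this codebase.
import torch
from torch import nn
from thop import clever_format, profile


def report_cost(model, sample):
    # sample: one example without the batch dimension, e.g. dataset.X[0]
    macs, params = profile(model, inputs=(sample.unsqueeze(0),), verbose=False)
    print("MACs: %s, Params: %s" % tuple(clever_format([macs, params], "%.3f")))


report_cost(nn.Linear(128, 10), torch.randn(128))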
def main():
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print("using {} device.".format(device))

    data_transform = {
        "train": transforms.Compose([
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        "val": transforms.Compose([
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ])
    }

    data_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))  # get data root path
    image_path = os.path.join("D:", "imagenet")  # flower data set path
    assert os.path.exists(image_path), "{} path does not exist.".format(image_path)
    train_dataset = datasets.ImageFolder(root=os.path.join(image_path, "train"),
                                         transform=data_transform["train"])
    train_num = len(train_dataset)

    # {'daisy':0, 'dandelion':1, 'roses':2, 'sunflower':3, 'tulips':4}
    flower_list = train_dataset.class_to_idx
    cla_dict = dict((val, key) for key, val in flower_list.items())
    # write dict into json file
    json_str = json.dumps(cla_dict, indent=4)
    with open('class_indices.json', 'w') as json_file:
        json_file.write(json_str)

    nw = min([os.cpu_count(), batch_size if batch_size > 1 else 0, 8])  # number of workers
    print('Using {} dataloader workers every process'.format(nw))

    train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True,
    )

    validate_dataset = datasets.ImageFolder(root=os.path.join(image_path, "val"),
                                            transform=data_transform["val"])
    val_num = len(validate_dataset)
    validate_loader = torch.utils.data.DataLoader(
        validate_dataset,
        batch_size=batch_size,
        shuffle=False,
    )

    print("using {} images for training, {} images for validation.".format(
        train_num, val_num))

    net = resnest50(num_classes=10)
    net_input = torch.randn(batch_size, 3, 224, 224)
    flops, params = profile(net, inputs=(net_input, ))
    print("network FLOPs (GFlops): {}, network params #P: {}".format(flops, params))

    # load pretrain weights
    # download url: https://download.pytorch.org/models/resnet34-333f7ec4.pth
    #assert os.path.exists(model_weight_path), "file {} does not exist.".format(model_weight_path)

    ##### load trained weights #####
    #model_weight_path = "./resNest50.pth"
    #missing_keys, unexpected_keys = net.load_state_dict(torch.load(model_weight_path), strict=False)
    #for param in net.parameters():
    #    param.requires_grad = False
    ##### load trained weights #####

    # change fc layer structure
    in_channel = net.fc.in_features
    net.fc = nn.Linear(in_channel, 10)
    net.to(device)

    loss_function = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=lr)

    best_acc = 0.0
    save_path = './resNest50.pth'
    list_loss = list()
    list_acc = list()
    for epoch in range(epoch_all):
        # train
        net.train()
        running_loss = 0.0
        for step, data in enumerate(train_loader, start=0):
            images, labels = data
            optimizer.zero_grad()
            logits = net(images.to(device))
            loss = loss_function(logits, labels.to(device))
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            # print train process
            rate = (step + 1) / len(train_loader)
            a = "*" * int(rate * 50)
            b = "." * int((1 - rate) * 50)
            print("\rtrain loss: {:^3.0f}%[{}->{}]{:.4f}".format(
                int(rate * 100), a, b, loss), end="")
        print()

        # validate
        net.eval()
        acc = 0.0  # accumulate accurate number / epoch
        with torch.no_grad():
            for val_data in validate_loader:
                val_images, val_labels = val_data
                outputs = net(val_images.to(device))  # eval model only have last output layer
                # loss = loss_function(outputs, test_labels)
                predict_y = torch.max(outputs, dim=1)[1]
                acc += (predict_y == val_labels.to(device)).sum().item()
            val_accurate = acc / val_num
            if val_accurate > best_acc:
                best_acc = val_accurate
                torch.save(net.state_dict(), save_path)
            print('[epoch %d] train_loss: %.3f  test_accuracy: %.3f' %
                  (epoch + 1, running_loss / step, val_accurate))
            list_loss.append(running_loss / step)
            list_acc.append(val_accurate)

    # save loss and acc
    file_loss_path = "./ResNest_loss.txt"
    with open(file_loss_path, "w") as file_object:
        json.dump(list_loss, file_object)
    file_acc_path = "./ResNest_acc.txt"
    with open(file_acc_path, "w") as file_object:
        json.dump(list_acc, file_object)

    print('Finished Training')
            ResDepSepBlock(channel*4, channel*8, kernel_size=3, stride=2),
            ResDepSepBlock(channel*8, channel*8, kernel_size=3, stride=1),
            ResDepSepBlock(channel*8, channel*8, kernel_size=3, stride=1),
            ResDepSepBlock(channel*8, channel*8, kernel_size=3, stride=2),
            ResDepSepBlock(channel*8, channel*8, kernel_size=3, stride=1),
            ResDepSepBlock(channel*8, channel*8, kernel_size=3, stride=1),
            nn.AdaptiveAvgPool1d(1),
        )
        self.out = nn.Sequential(
            nn.Linear(channel*8, channel*2),
            nn.ReLU6(inplace=False),
            nn.Linear(channel*2, 2),
            nn.Softmax(dim=-1)
        )

    def forward(self, x):
        x = self.net(x)
        x = torch.squeeze(x)
        return self.out(x), x


if __name__ == '__main__':
    from thop import profile

    model = OneD_CNN(600, 30)
    x1 = torch.randn(1, 1, 30, 600)
    x2 = torch.randn(2, 36, 30)
    x3 = torch.randn(2, 36, 30)
    # note: `inputs` must be a tuple of tensors; `(x1)` is just x1, so a trailing comma is needed
    flops, params = profile(model, inputs=(x1,))
    print(flops, params)
def test_resnet50_thop():
    model = AlexNet()
    input = torch.randn(1, 3, 224, 224)
    flops, params = profile(model, inputs=(input,))
    flops, params = clever_format([flops, params], "%.3f")
    print("flops: ", flops, "params: ", params)
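# Several of the models profiled in this collection contain layers thop has no
# built-in counting rule for. thop's `profile` accepts a `custom_ops` mapping for
# such cases (pattern as documented in thop's README); the module and the one-multiply-
# per-output counting rule below are illustrative assumptions, not part of any snippet here.
import torch
from torch import nn
from thop import profile


class SwishLayer(nn.Module):  # hypothetical custom module
    def forward(self, x):
        return x * torch.sigmoid(x)


def count_swish(m, x, y):
    # assumed rule: one multiply per output element
    m.total_ops += torch.DoubleTensor([y.numel()])


custom_model = nn.Sequential(nn.Conv2d(3, 8, 3), SwishLayer())
flops, params = profile(custom_model,
                        inputs=(torch.randn(1, 3, 224, 224),),
                        custom_ops={SwishLayer: count_swish})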
def constraint_cal(net, img_size=128):
    input = torch.randn(1, 3, img_size, img_size)
    macs, params = profile(net, inputs=(input,))
    return (macs, params)
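# A possible call site for constraint_cal, sketched with a torchvision model standing
# in for whatever network the surrounding code actually passes (an assumption, not
# part of the original snippet).
import torch
from torchvision.models import resnet18
from thop import profile  # constraint_cal above relies on profile being in scope

net = resnet18()
macs, params = constraint_cal(net, img_size=128)
print("GMACs: %.3f, params (M): %.3f" % (macs / 1e9, params / 1e6))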
        x = self.classifier(x)
        x = x.view(x.size()[0], -1)
        return x


if __name__ == '__main__':
    net = DSACNet()
    # stat(net, (3, 128, 128))
    # net = net.cuda()
    # summary(net, (3, 128, 128))
    input = torch.randn(1, 3, 128, 128)
    # output = net(input)
    # print("The net out: ", output)
    flops, params = profile(net, inputs=(input, ))
    print(flops, params)

    # # calculation method 1
    # flops, params = profile(net, inputs=(input, ))
    # print(flops, params)
    # # calculation method 2
    # flops, params = get_model_complexity_info(net, (3, 128, 128), as_strings=True, print_per_layer_stat=True)
    # print("|flops: %s |params: %s" % (flops, params))
    # # calculation method 3
    # stat(net, (3, 128, 128))
    # # calculation method 4
    # net = net.cuda()
# @File : test_net.py
import torch
from torch import nn
from torchkeras import summary
from thop import profile


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        y = self.relu(x)
        return y


if __name__ == '__main__':
    model = Net()
    print(model)
    print(summary(model, input_shape=(3, 20, 20)))
    print('number of params:', sum(param.numel() for param in model.parameters()))

    inputs = torch.randn(8, 3, 20, 20)
    flops, params = profile(model, (inputs,))
    print('flops:', flops, 'params:', params)
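# Worth noting for the snippet above: thop counts operations for the exact tensor it is
# given, so profiling with a batch of 8 reports roughly eight times the per-image MACs;
# a batch of 1 is the usual convention. A small follow-up sketch, reusing the Net class
# defined above (the fresh instance is an assumption to avoid re-registering thop buffers):
from thop import clever_format, profile

single = torch.randn(1, 3, 20, 20)  # batch size 1
model_single = Net()
flops_1, params_1 = profile(model_single, (single,), verbose=False)
print('per-image:', clever_format([flops_1, params_1], "%.3f"))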
def build_resnet_pruned_model(origin_model):

    pruning_rate_now = 0
    channel_prune_rate = 0.9
    num_mask_cfg = {'resnet50': 48}

    while pruning_rate_now < args.pruning_rate:

        score = []
        index_cfg = []
        block_index_cfg = []
        layer_cfg = []
        block_cfg = []
        final_mask = []
        pruned_state_dict = {}

        for i in range(num_mask_cfg[args.cfg]):
            mask = origin_model.state_dict()['mask.' + str(i)]
            score.append(torch.abs(torch.sum(mask, 0)))
            final_mask.append(torch.div(torch.sum(mask, 0), 2))

        all_score = torch.cat(score, 0)
        preserve_num = int(all_score.size(0) * channel_prune_rate)
        preserve_channel, _ = torch.topk(all_score, preserve_num)
        threshold = preserve_channel[preserve_num - 1]

        block_score = []

        # Based on the pruning threshold, the prune cfg of each layer is obtained
        for i, mini_score in enumerate(score):
            mask = torch.ge(mini_score, threshold)
            index = []
            for j, m in enumerate(mask):
                if m == True:
                    index.append(j)
            if len(index) < mask.size(0) * args.min_preserve:
                _, index = torch.topk(mini_score, int(mask.size(0) * args.min_preserve))
                index = index.cpu().numpy().tolist()
            if (i + 1) % 3 != 0:  # in block
                index_cfg.append(index)
                layer_cfg.append(len(index))
            else:  # out block
                block_score.append(mini_score)

        num_blocks = [3, 4, 6, 3]
        begin = 0
        for i in range(len(num_blocks)):
            block_cfg.append(int(block_score[begin].size(0) / 4))
            for j in range(begin, begin + num_blocks[i]):
                block_index_cfg.append(torch.arange(block_score[begin].size(0)))
            begin = begin + num_blocks[i]

        model = import_module(f'model.{args.arch}').resnet(args.cfg, block_cfg, layer_cfg).to(device)
        flops, params = profile(model, inputs=(input, ))
        pruning_rate_now = (oriflops - flops) / oriflops

        channel_prune_rate = channel_prune_rate - 0.01

    model_state_dict = origin_model.state_dict()

    current_block = 0
    block_index = torch.arange(64)

    model = import_module(f'model.{args.arch}').resnet(args.cfg, block_cfg, layer_cfg).to(device)
    pruned_state_dict = model.state_dict()

    for name, module in origin_model.named_modules():

        if isinstance(module, Bottleneck_class):

            # conv1 & bn1
            index_1 = torch.LongTensor(index_cfg[current_block * 2]).to(device)
            pruned_weight = torch.index_select(model_state_dict[name + '.conv1.weight'], 0, index_1).cpu()
            pruned_weight = direct_project(pruned_weight, block_index)

            pruned_state_dict[name + '.conv1.weight'] = pruned_weight

            mask = final_mask[current_block * 3][index_cfg[current_block * 2]]
            pruned_state_dict[name + '.bn1.weight'] = torch.mul(mask, model_state_dict[name + '.bn1.weight'][index_1]).cpu()
            pruned_state_dict[name + '.bn1.bias'] = torch.mul(mask, model_state_dict[name + '.bn1.bias'][index_1]).cpu()
            pruned_state_dict[name + '.bn1.running_var'] = model_state_dict[name + '.bn1.running_var'][index_1].cpu()
            pruned_state_dict[name + '.bn1.running_mean'] = model_state_dict[name + '.bn1.running_mean'][index_1].cpu()

            # conv2 & bn2
            index_2 = torch.LongTensor(index_cfg[current_block * 2 + 1]).to(device)
            pruned_weight = torch.index_select(model_state_dict[name + '.conv2.weight'], 0, index_2).cpu()
            pruned_weight = direct_project(pruned_weight, index_1)

            pruned_state_dict[name + '.conv2.weight'] = pruned_weight

            mask = final_mask[current_block * 3 + 1][index_cfg[current_block * 2 + 1]]
            pruned_state_dict[name + '.bn2.weight'] = torch.mul(mask, model_state_dict[name + '.bn2.weight'][index_2]).cpu()
            pruned_state_dict[name + '.bn2.bias'] = torch.mul(mask, model_state_dict[name + '.bn2.bias'][index_2]).cpu()
            pruned_state_dict[name + '.bn2.running_var'] = model_state_dict[name + '.bn2.running_var'][index_2].cpu()
            pruned_state_dict[name + '.bn2.running_mean'] = model_state_dict[name + '.bn2.running_mean'][index_2].cpu()

            block_index = torch.LongTensor(block_index_cfg[current_block]).to(device)
            mask = final_mask[current_block * 3 + 2][block_index_cfg[current_block]]

            # conv3 & bn3 & shortcut
            pruned_state_dict[name + '.conv3.weight'] = torch.index_select(model_state_dict[name + '.conv3.weight'], 0, block_index).cpu()
            pruned_state_dict[name + '.conv3.weight'] = direct_project(pruned_state_dict[name + '.conv3.weight'], index_2)

            pruned_state_dict[name + '.bn3.weight'] = model_state_dict[name + '.bn3.weight'].cpu()
            pruned_state_dict[name + '.bn3.bias'] = model_state_dict[name + '.bn3.bias'].cpu()
            pruned_state_dict[name + '.bn3.running_var'] = model_state_dict[name + '.bn3.running_var'][block_index].cpu()
            pruned_state_dict[name + '.bn3.running_mean'] = model_state_dict[name + '.bn3.running_mean'][block_index].cpu()

            current_block += 1

    pruned_state_dict['fc.weight'] = model_state_dict['fc.weight'].cpu()
    pruned_state_dict['fc.bias'] = model_state_dict['fc.bias'].cpu()

    pruned_state_dict['conv1.weight'] = model_state_dict['conv1.weight'].cpu()
    pruned_state_dict['bn1.weight'] = model_state_dict['bn1.weight'].cpu()
    pruned_state_dict['bn1.bias'] = model_state_dict['bn1.bias'].cpu()
    pruned_state_dict['bn1.running_var'] = model_state_dict['bn1.running_var'].cpu()
    pruned_state_dict['bn1.running_mean'] = model_state_dict['bn1.running_mean'].cpu()

    # load weight
    model = import_module(f'model.{args.arch}').resnet(args.cfg, block_cfg=block_cfg, layer_cfg=layer_cfg).to(device)
    model.load_state_dict(pruned_state_dict)

    return model, [layer_cfg, block_cfg], flops, params
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    np.random.seed(args.seed)
    torch.cuda.set_device(args.gpu)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.enabled = True
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    model = get_model(args.model_name)
    model.drop_path_prob = 0.
    macs, params = profile(model, inputs=(torch.randn(1, 3, 32, 32), ))
    macs, params = macs / 1000. / 1000., params / 1000. / 1000.
    logging.info("The parameter size is: {0}".format((params)))
    logging.info("The FLOPS is: {0}".format(macs))

    model = torch.nn.DataParallel(model)
    criterion = nn.CrossEntropyLoss()
    criterion = criterion.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    train_transform, valid_transform = utils._data_transforms_cifar10(args)
    train_data = dset.CIFAR10(root=args.data,
                              train=True,
                              download=True,
                              transform=train_transform)
    valid_data = dset.CIFAR10(root=args.data,
                              train=False,
                              download=True,
                              transform=valid_transform)

    train_queue = torch.utils.data.DataLoader(train_data,
                                              batch_size=args.batch_size,
                                              shuffle=True,
                                              pin_memory=True,
                                              num_workers=2)
    valid_queue = torch.utils.data.DataLoader(valid_data,
                                              batch_size=args.batch_size,
                                              shuffle=False,
                                              pin_memory=True,
                                              num_workers=2)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))
    best_acc = 0.
    for epoch in range(args.epochs):
        logging.info('epoch %d lr %e', epoch, scheduler.get_lr()[0])
        model.drop_path_prob = args.drop_path_prob * epoch / args.epochs

        train_acc, train_obj = train(train_queue, model, criterion, optimizer)
        logging.info('train_acc %f', train_acc)

        valid_acc, valid_obj = infer(valid_queue, model, criterion)
        logging.info('valid_acc %f', valid_acc)
        scheduler.step()

        if best_acc < valid_acc:
            best_acc = valid_acc
            logging.info("Current best Prec@1 = %f", best_acc)
            utils.save(model, os.path.join(args.save, 'best.pt'))

        utils.save(model, os.path.join(args.save, 'weights.pt'))
        self.inplanes = planes
        for _ in range(1, blocks):
            layers.append(AsterBlock(self.inplanes, planes))
        return nn.Sequential(*layers)

    def forward(self, x):
        x0 = self.layer0(x)
        x1 = self.layer1(x0)
        x2 = self.layer2(x1)
        x3 = self.layer3(x2)
        x4 = self.layer4(x3)
        x5 = self.layer5(x4)

        cnn_feat = x5.squeeze(2)  # [N, c, w]
        cnn_feat = cnn_feat.transpose(2, 1)
        if self.with_lstm:
            rnn_feat, _ = self.rnn(cnn_feat)
            return rnn_feat
        else:
            return cnn_feat


if __name__ == "__main__":
    net = ResNet_ASTER()
    net2 = Tiny_ResNet_ASTER()
    x = torch.randn(1, 3, 32, 100)
    from thop import profile
    flops, params = profile(net, inputs=(x, ))
    flops2, params2 = profile(net2, inputs=(x, ))
    print('Flops: %.2f G, params: %.2f M' % (flops / 1e9, params / 1e6))
    print('Flops: %.2f G, params: %.2f M' % (flops2 / 1e9, params2 / 1e6))
def main():
    if not torch.cuda.is_available():
        logging.info('No GPU device available')
        sys.exit(1)

    num_gpus = torch.cuda.device_count()
    args.gpu = args.local_rank % num_gpus
    torch.cuda.set_device(args.gpu)

    np.random.seed(args.seed)
    cudnn.benchmark = True
    cudnn.deterministic = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info("args = %s", args)
    logging.info("unparsed_args = %s", unparsed)

    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    args.world_size = torch.distributed.get_world_size()
    args.batch_size = args.batch_size // args.world_size

    genotype = eval("genotypes.%s" % args.arch)
    logging.info('---------Genotype---------')
    logging.info(genotype)
    logging.info('--------------------------')

    model = Network(args.init_channels, CLASSES, args.layers, args.auxiliary, genotype)
    model = model.cuda(args.gpu)
    model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)

    model_profile = Network(args.init_channels, CLASSES, args.layers, args.auxiliary, genotype)
    model_profile = model_profile.cuda(args.gpu)
    model_input_size_imagenet = (1, 3, 224, 224)
    model_profile.drop_path_prob = 0
    flops, _ = profile(model_profile, model_input_size_imagenet)
    logging.info("flops = %fMB, param size = %fMB", flops,
                 count_parameters_in_MB(model))

    criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    optimizer = torch.optim.SGD(model.parameters(),
                                args.learning_rate,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)

    # Prepare data
    total_iters = per_epoch_iters * args.epochs
    train_loader = get_train_dataloader(args.train_dir, args.batch_size,
                                        args.local_rank, total_iters)
    train_dataprovider = DataIterator(train_loader)
    val_loader = get_val_dataloader(args.test_dir)
    val_dataprovider = DataIterator(val_loader)

    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(
        optimizer, float(args.epochs))

    start_epoch = 0
    best_acc_top1 = 0
    best_acc_top5 = 0
    checkpoint_tar = os.path.join(args.save, 'checkpoint.pth.tar')
    if os.path.exists(checkpoint_tar):
        logging.info('loading checkpoint {} ..........'.format(checkpoint_tar))
        checkpoint = torch.load(
            checkpoint_tar,
            map_location={'cuda:0': 'cuda:{}'.format(args.local_rank)})
        start_epoch = checkpoint['epoch'] + 1
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint {} epoch = {}".format(
            checkpoint_tar, checkpoint['epoch']))

    # evaluation mode
    if args.eval:
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume)
            model.module.drop_path_prob = 0
            model.load_state_dict(checkpoint['state_dict'])
            valid_acc_top1, valid_acc_top5 = infer(val_dataprovider,
                                                   model.module, val_iters)
            print('valid_acc_top1: {}'.format(valid_acc_top1))
        exit(0)

    for epoch in range(start_epoch, args.epochs):
        if args.lr_scheduler == 'cosine':
            scheduler.step()
            current_lr = scheduler.get_lr()[0]
        elif args.lr_scheduler == 'linear':
            current_lr = adjust_lr(optimizer, epoch)
        else:
            logging.info('Wrong lr type, exit')
            sys.exit(1)

        logging.info('Epoch: %d lr %e', epoch, current_lr)
        if epoch < 5 and args.batch_size > 256:
            for param_group in optimizer.param_groups:
                param_group['lr'] = current_lr * (epoch + 1) / 5.0
            logging.info('Warming-up Epoch: %d, LR: %e', epoch,
                         current_lr * (epoch + 1) / 5.0)

        model.module.drop_path_prob = args.drop_path_prob * epoch / args.epochs
        epoch_start = time.time()
        train_acc, train_obj = train(train_dataprovider, model,
                                     criterion_smooth, optimizer,
                                     per_epoch_iters)

        writer.add_scalar('Train/Loss', train_obj, epoch)
        writer.add_scalar('Train/LR', current_lr, epoch)

        if args.local_rank == 0 and (epoch % 5 == 0 or args.epochs - epoch < 10):
            valid_acc_top1, valid_acc_top5 = infer(val_dataprovider,
                                                   model.module, val_iters)
            is_best = False
            if valid_acc_top5 > best_acc_top5:
                best_acc_top5 = valid_acc_top5
            if valid_acc_top1 > best_acc_top1:
                best_acc_top1 = valid_acc_top1
                is_best = True

            logging.info('Valid_acc_top1: %f', valid_acc_top1)
            logging.info('Valid_acc_top5: %f', valid_acc_top5)
            logging.info('best_acc_top1: %f', best_acc_top1)
            epoch_duration = time.time() - epoch_start
            logging.info('Epoch time: %ds.', epoch_duration)

            save_checkpoint_(
                {
                    'epoch': epoch,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'optimizer': optimizer.state_dict(),
                }, args.save)
def test_model(args):
    print(args)
    if args.use_gpu:
        # use one Graphics card to test
        os.environ["CUDA_VISIBLE_DEVICES"] = "0"
        if not torch.cuda.is_available():
            raise Exception("need gpu to test network!")
        torch.cuda.empty_cache()

    if args.seed is not None:
        random.seed(args.seed)
        if args.use_gpu:
            torch.cuda.manual_seed_all(args.seed)
            cudnn.deterministic = True

    if args.use_gpu:
        cudnn.benchmark = True
        cudnn.enabled = True

    scale = 256 / 224
    val_dataset = datasets.ImageFolder(
        os.path.join(ILSVRC2012_path, 'val'),
        transforms.Compose([
            transforms.Resize(int(args.input_image_size * scale)),
            transforms.CenterCrop(args.input_image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]))
    val_loader = DataLoader(val_dataset,
                            batch_size=args.batch_size,
                            shuffle=False,
                            num_workers=args.num_workers)

    if args.classifier == "darknet":
        model = _darknet(args.backbone, args.use_pretrained_model,
                         args.pretrained_model_path, args.num_classes)
    elif args.classifier == "efficientnet":
        model = _efficientnet(args.backbone, args.use_pretrained_model,
                              args.pretrained_model_path, args.num_classes)
    elif args.classifier == "regnet":
        model = _regnet(args.backbone, args.use_pretrained_model,
                        args.pretrained_model_path, args.num_classes)
    elif args.classifier == "resnet":
        model = _resnet(args.backbone, args.use_pretrained_model,
                        args.pretrained_model_path, args.num_classes)
    elif args.classifier == "vovnet":
        model = _vovnet(args.backbone, args.use_pretrained_model,
                        args.pretrained_model_path, args.num_classes)
    else:
        print("unsupport classification model!")
        return

    flops_input = torch.randn(1, 3, args.input_image_size,
                              args.input_image_size)
    flops, params = profile(model, inputs=(flops_input, ))
    flops, params = clever_format([flops, params], "%.3f")
    print(
        f"backbone:{args.backbone},classifier: '{args.classifier}', flops: {flops}, params: {params}"
    )

    if args.use_gpu:
        model = model.cuda()
        model = nn.DataParallel(model)

    print(f"start eval.")
    acc1, acc5, throughput = validate(val_loader, model, args)
    print(
        f"top1 acc: {acc1:.2f}%, top5 acc: {acc5:.2f}%, throughput: {throughput:.2f}sample/s"
    )
    print(f"eval done.")
    return
def main():
    args = parse_args()
    global local_rank
    local_rank = args.local_rank
    if local_rank == 0:
        global logger
        logger = get_logger(__name__, args.log)

    torch.cuda.empty_cache()

    if args.seed is not None:
        random.seed(args.seed)
        torch.manual_seed(args.seed)
        torch.cuda.manual_seed_all(args.seed)
        cudnn.deterministic = True

    torch.cuda.set_device(local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')
    global gpus_num
    gpus_num = torch.cuda.device_count()
    if local_rank == 0:
        logger.info(f'use {gpus_num} gpus')
        logger.info(f"args: {args}")

    cudnn.benchmark = True
    cudnn.enabled = True
    start_time = time.time()

    # dataset and dataloader
    if local_rank == 0:
        logger.info('start loading data')

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        Config.train_dataset, shuffle=True)
    train_loader = DataLoader(Config.train_dataset,
                              batch_size=args.per_node_batch_size,
                              shuffle=False,
                              num_workers=args.num_workers,
                              collate_fn=collater,
                              sampler=train_sampler)
    if local_rank == 0:
        logger.info('finish loading data')

    model = centernet.__dict__[args.network](**{
        "pretrained": args.pretrained,
        "num_classes": args.num_classes,
    })

    for name, param in model.named_parameters():
        if local_rank == 0:
            logger.info(f"{name},{param.requires_grad}")

    flops_input = torch.randn(1, 3, args.input_image_size,
                              args.input_image_size)
    flops, params = profile(model, inputs=(flops_input, ))
    flops, params = clever_format([flops, params], "%.3f")
    if local_rank == 0:
        logger.info(
            f"model: '{args.network}', flops: {flops}, params: {params}")

    criterion = CenterNetLoss().cuda()
    decoder = CenterNetDecoder(image_w=args.input_image_size,
                               image_h=args.input_image_size).cuda()

    model = model.cuda()
    optimizer = torch.optim.AdamW(model.parameters(), lr=args.lr)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=args.milestones, gamma=0.1)

    if args.sync_bn:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

    if args.apex:
        amp.register_float_function(torch, 'sigmoid')
        amp.register_float_function(torch, 'softmax')
        model, optimizer = amp.initialize(model, optimizer, opt_level='O1')
        model = apex.parallel.DistributedDataParallel(model,
                                                      delay_allreduce=True)
        if args.sync_bn:
            model = apex.parallel.convert_syncbn_model(model)
    else:
        model = nn.parallel.DistributedDataParallel(model,
                                                    device_ids=[local_rank],
                                                    output_device=local_rank)

    if args.evaluate:
        if not os.path.isfile(args.evaluate):
            if local_rank == 0:
                logger.exception(
                    '{} is not a file, please check it again'.format(
                        args.resume))
            sys.exit(-1)
        if local_rank == 0:
            logger.info('start only evaluating')
            logger.info(f"start resuming model from {args.evaluate}")
        checkpoint = torch.load(args.evaluate,
                                map_location=torch.device('cpu'))
        model.load_state_dict(checkpoint['model_state_dict'])
        if local_rank == 0:
            logger.info(f"start eval.")
            all_eval_result = validate(Config.val_dataset, model, decoder)
            logger.info(f"eval done.")
            if all_eval_result is not None:
                logger.info(
                    f"val: epoch: {checkpoint['epoch']:0>5d}, IoU=0.5:0.95,area=all,maxDets=100,mAP:{all_eval_result[0]:.3f}, IoU=0.5,area=all,maxDets=100,mAP:{all_eval_result[1]:.3f}, IoU=0.75,area=all,maxDets=100,mAP:{all_eval_result[2]:.3f}, IoU=0.5:0.95,area=small,maxDets=100,mAP:{all_eval_result[3]:.3f}, IoU=0.5:0.95,area=medium,maxDets=100,mAP:{all_eval_result[4]:.3f}, IoU=0.5:0.95,area=large,maxDets=100,mAP:{all_eval_result[5]:.3f}, IoU=0.5:0.95,area=all,maxDets=1,mAR:{all_eval_result[6]:.3f}, IoU=0.5:0.95,area=all,maxDets=10,mAR:{all_eval_result[7]:.3f}, IoU=0.5:0.95,area=all,maxDets=100,mAR:{all_eval_result[8]:.3f}, IoU=0.5:0.95,area=small,maxDets=100,mAR:{all_eval_result[9]:.3f}, IoU=0.5:0.95,area=medium,maxDets=100,mAR:{all_eval_result[10]:.3f}, IoU=0.5:0.95,area=large,maxDets=100,mAR:{all_eval_result[11]:.3f}"
                )

        return

    best_map = 0.0
    start_epoch = 1
    # resume training
    if os.path.exists(args.resume):
        if local_rank == 0:
            logger.info(f"start resuming model from {args.resume}")
        checkpoint = torch.load(args.resume, map_location=torch.device('cpu'))
        start_epoch += checkpoint['epoch']
        best_map = checkpoint['best_map']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        if local_rank == 0:
            logger.info(
                f"finish resuming model from {args.resume}, epoch {checkpoint['epoch']}, best_map: {checkpoint['best_map']}, "
                f"loss: {checkpoint['loss']:3f}, heatmap_loss: {checkpoint['heatmap_loss']:2f}, offset_loss: {checkpoint['offset_loss']:2f},wh_loss: {checkpoint['wh_loss']:2f}"
            )

    if local_rank == 0:
        if not os.path.exists(args.checkpoints):
            os.makedirs(args.checkpoints)

    if local_rank == 0:
        logger.info('start training')
    for epoch in range(start_epoch, args.epochs + 1):
        train_sampler.set_epoch(epoch)
        heatmap_losses, offset_losses, wh_losses, losses = train(
            train_loader, model, criterion, optimizer, scheduler, epoch, args)
        if local_rank == 0:
            logger.info(
                f"train: epoch {epoch:0>3d}, heatmap_loss: {heatmap_losses:.2f}, offset_loss: {offset_losses:.2f}, wh_loss: {wh_losses:.2f}, loss: {losses:.2f}"
            )

        if epoch % 5 == 0 or epoch == args.epochs:
            if local_rank == 0:
                logger.info(f"start eval.")
                all_eval_result = validate(Config.val_dataset, model, decoder)
                logger.info(f"eval done.")
                if all_eval_result is not None:
                    logger.info(
                        f"val: epoch: {epoch:0>5d}, IoU=0.5:0.95,area=all,maxDets=100,mAP:{all_eval_result[0]:.3f}, IoU=0.5,area=all,maxDets=100,mAP:{all_eval_result[1]:.3f}, IoU=0.75,area=all,maxDets=100,mAP:{all_eval_result[2]:.3f}, IoU=0.5:0.95,area=small,maxDets=100,mAP:{all_eval_result[3]:.3f}, IoU=0.5:0.95,area=medium,maxDets=100,mAP:{all_eval_result[4]:.3f}, IoU=0.5:0.95,area=large,maxDets=100,mAP:{all_eval_result[5]:.3f}, IoU=0.5:0.95,area=all,maxDets=1,mAR:{all_eval_result[6]:.3f}, IoU=0.5:0.95,area=all,maxDets=10,mAR:{all_eval_result[7]:.3f}, IoU=0.5:0.95,area=all,maxDets=100,mAR:{all_eval_result[8]:.3f}, IoU=0.5:0.95,area=small,maxDets=100,mAR:{all_eval_result[9]:.3f}, IoU=0.5:0.95,area=medium,maxDets=100,mAR:{all_eval_result[10]:.3f}, IoU=0.5:0.95,area=large,maxDets=100,mAR:{all_eval_result[11]:.3f}"
                    )
                    if all_eval_result[0] > best_map:
                        torch.save(model.module.state_dict(),
                                   os.path.join(args.checkpoints, "best.pth"))
                        best_map = all_eval_result[0]

        if local_rank == 0:
            torch.save(
                {
                    'epoch': epoch,
                    'best_map': best_map,
                    'heatmap_loss': heatmap_losses,
                    'offset_loss': offset_losses,
                    'wh_loss': wh_losses,
                    'loss': losses,
                    'model_state_dict': model.state_dict(),
                    'optimizer_state_dict': optimizer.state_dict(),
                    'scheduler_state_dict': scheduler.state_dict(),
                }, os.path.join(args.checkpoints, 'latest.pth'))

    if local_rank == 0:
        logger.info(f"finish training, best_map: {best_map:.3f}")
    training_time = (time.time() - start_time) / 3600
    if local_rank == 0:
        logger.info(
            f"finish training, total training time: {training_time:.2f} hours")
def measure_model(model, IMAGE_SIZE1, IMAGE_SIZE2):
    inputs = torch.randn(1, 3, IMAGE_SIZE1, IMAGE_SIZE2)
    flops, params = profile(model, (inputs,))
    return flops, params
                               shuffle=False, num_workers=16, pin_memory=True)
    test_data = utils.CIFAR10Pair(root='data', train=False, transform=utils.test_transform, download=True)
    test_loader = DataLoader(test_data, batch_size=batch_size, shuffle=False, num_workers=16, pin_memory=True)

    # model setup and optimizer config
    model = Model(feature_dim).cuda()
    flops, params = profile(model, inputs=(torch.randn(1, 3, 32, 32).cuda(), ))
    flops, params = clever_format([flops, params])
    print('# Model Params: {} FLOPs: {}'.format(params, flops))
    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-6)
    c = len(memory_data.classes)

    # training loop
    results = {'train_loss': [], 'test_acc@1': [], 'test_acc@5': []}
    save_name_pre = '{}_{}_{}_{}_{}'.format(feature_dim, temperature, k, batch_size, epochs)
    if not os.path.exists('results'):
        os.mkdir('results')
    best_acc = 0.0
    for epoch in range(1, epochs + 1):
        train_loss = train(model, train_loader, optimizer)
        results['train_loss'].append(train_loss)
import argparse  # needed for the ArgumentParser below

import torch
from gluoncv.torch.model_zoo import get_model
from gluoncv.torch.engine.config import get_cfg_defaults
from thop import profile, clever_format


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Compute FLOPs of a model.')
    parser.add_argument('--config-file', type=str, help='path to config file.')
    parser.add_argument('--num-frames', type=int, default=32,
                        help='temporal clip length.')
    parser.add_argument('--input-size', type=int, default=224,
                        help='size of the input image size. default is 224')
    args = parser.parse_args()

    cfg = get_cfg_defaults()
    cfg.merge_from_file(args.config_file)
    model = get_model(cfg)

    input_tensor = torch.autograd.Variable(
        torch.rand(1, 3, args.num_frames, args.input_size, args.input_size))
    macs, params = profile(model, inputs=(input_tensor, ))
    macs, params = clever_format([macs, params], "%.3f")
    print("FLOPs: ", macs, "; #params: ", params)
def main():
    if not torch.cuda.is_available():
        logging.info('no gpu device available')
        sys.exit(1)

    num_gpus = torch.cuda.device_count()
    np.random.seed(args.seed)
    args.gpu = args.local_rank % num_gpus
    torch.cuda.set_device(args.gpu)
    cudnn.benchmark = True
    cudnn.deterministic = True
    torch.manual_seed(args.seed)
    cudnn.enabled = True
    torch.cuda.manual_seed(args.seed)
    logging.info('gpu device = %d' % args.gpu)
    logging.info("args = %s", args)

    torch.distributed.init_process_group(backend='nccl', init_method='env://')
    args.world_size = torch.distributed.get_world_size()
    args.batch_size = args.batch_size // args.world_size

    # The network architeture coding
    rngs = [int(id) for id in args.model_id.split(' ')]
    model = Network(rngs)
    op_flops_dict = pickle.load(open(config.flops_lookup_table, 'rb'))

    profile(model, config.model_input_size_imagenet, rngs=rngs)
    flops = get_arch_flops(op_flops_dict, rngs, config.backbone_info,
                           config.blocks_keys)
    params = count_parameters_in_MB(model)

    model = model.cuda(args.gpu)
    model = apex.parallel.DistributedDataParallel(model, delay_allreduce=True)

    arch = model.module.architecture()
    logging.info('rngs:{}, arch:{}'.format(rngs, arch))
    logging.info("flops = %fMB, param size = %fMB", flops / 1e6, params)
    logging.info('batch_size:{}'.format(args.batch_size))

    criterion_smooth = CrossEntropyLabelSmooth(CLASSES, args.label_smooth)
    criterion_smooth = criterion_smooth.cuda()

    all_parameters = model.parameters()
    weight_parameters = []
    for pname, p in model.named_parameters():
        if p.ndimension() == 4 or 'classifier.0.weight' in pname or 'classifier.0.bias' in pname:
            weight_parameters.append(p)
    weight_parameters_id = list(map(id, weight_parameters))
    other_parameters = list(
        filter(lambda p: id(p) not in weight_parameters_id, all_parameters))

    optimizer = torch.optim.SGD(
        [{
            'params': other_parameters
        }, {
            'params': weight_parameters,
            'weight_decay': args.weight_decay
        }],
        args.learning_rate,
        momentum=args.momentum,
    )

    scheduler = torch.optim.lr_scheduler.LambdaLR(
        optimizer, lambda step: (1.0 - step / args.total_iters), last_epoch=-1)

    # Prepare data
    train_loader = get_train_dataloader(args.train_dir, args.batch_size,
                                        args.local_rank, args.total_iters)
    train_dataprovider = DataIterator(train_loader)
    val_loader = get_val_dataloader(args.test_dir)
    val_dataprovider = DataIterator(val_loader)

    start_iter = 0
    best_acc_top1 = 0
    checkpoint_tar = os.path.join(args.save, 'checkpoint.pth.tar')
    if os.path.exists(checkpoint_tar):
        logging.info('loading checkpoint {} ..........'.format(checkpoint_tar))
        checkpoint = torch.load(
            checkpoint_tar,
            map_location={'cuda:0': 'cuda:{}'.format(args.local_rank)})
        start_iter = checkpoint['iters']
        best_acc_top1 = checkpoint['best_acc_top1']
        model.load_state_dict(checkpoint['state_dict'])
        logging.info("loaded checkpoint {} iters = {}".format(
            checkpoint_tar, checkpoint['iters']))

    for iters in range(start_iter):
        scheduler.step()

    # evaluation mode
    if args.eval:
        if args.eval_resume is not None:
            checkpoint = torch.load(args.eval_resume)
            model.load_state_dict(checkpoint['state_dict'])
            valid_acc_top1, valid_acc_top5 = infer(val_dataprovider,
                                                   model.module, val_iters)
            print('valid_acc_top1: {}'.format(valid_acc_top1))
        exit(0)

    iters = start_iter
    while iters < args.total_iters:
        train_iters = 10000
        train_acc, train_obj, iters = train(iters, train_dataprovider, model,
                                            criterion_smooth, optimizer,
                                            train_iters, scheduler)
        writer.add_scalar('Train/Loss', train_obj, iters)
        writer.add_scalar('Train/LR', scheduler.get_lr()[0], iters)
        # torch.cuda.empty_cache()

        if args.local_rank == 0:
            valid_acc_top1, valid_acc_top5 = infer(val_dataprovider,
                                                   model.module, val_iters)
            is_best = False
            if valid_acc_top1 > best_acc_top1:
                best_acc_top1 = valid_acc_top1
                is_best = True

            logging.info(
                'valid_acc_top1: %f valid_acc_top5: %f best_acc_top1: %f',
                valid_acc_top1, valid_acc_top5, best_acc_top1)

            save_checkpoint_(
                {
                    'iters': iters,
                    'state_dict': model.state_dict(),
                    'best_acc_top1': best_acc_top1,
                    'optimizer': optimizer.state_dict(),
                }, args.save)
def main(pretrain=True):
    config.save = 'search-{}-{}'.format(config.save, time.strftime("%Y%m%d-%H%M%S"))
    create_exp_dir(config.save, scripts_to_save=glob.glob('*.py') + glob.glob('*.sh'))
    logger = SummaryWriter(config.save)

    log_format = '%(asctime)s %(message)s'
    logging.basicConfig(stream=sys.stdout, level=logging.INFO, format=log_format, datefmt='%m/%d %I:%M:%S %p')
    fh = logging.FileHandler(os.path.join(config.save, 'log.txt'))
    fh.setFormatter(logging.Formatter(log_format))
    logging.getLogger().addHandler(fh)

    assert type(pretrain) == bool or type(pretrain) == str
    update_arch = True
    if pretrain == True:
        update_arch = False
    logging.info("args = %s", str(config))

    # preparation ################
    torch.backends.cudnn.enabled = True
    torch.backends.cudnn.benchmark = True
    seed = config.seed
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)

    # config network and criterion ################
    min_kept = int(config.batch_size * config.image_height * config.image_width // (16 * config.gt_down_sampling ** 2))
    ohem_criterion = ProbOhemCrossEntropy2d(ignore_label=255, thresh=0.7, min_kept=min_kept, use_weight=False)

    # Model #######################################
    model = Network(config.num_classes, config.layers, ohem_criterion, Fch=config.Fch,
                    width_mult_list=config.width_mult_list, prun_modes=config.prun_modes,
                    stem_head_width=config.stem_head_width)
    flops, params = profile(model, inputs=(torch.randn(1, 3, 1024, 2048),), verbose=False)
    logging.info("params = %fMB, FLOPs = %fGB", params / 1e6, flops / 1e9)
    model = model.cuda()
    if type(pretrain) == str:
        partial = torch.load(pretrain + "/weights.pt", map_location='cuda:0')
        state = model.state_dict()
        pretrained_dict = {k: v for k, v in partial.items() if k in state and state[k].size() == partial[k].size()}
        state.update(pretrained_dict)
        model.load_state_dict(state)
    else:
        init_weight(model, nn.init.kaiming_normal_, nn.BatchNorm2d, config.bn_eps, config.bn_momentum, mode='fan_in', nonlinearity='relu')

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    architect = Architect(model, config)

    # Optimizer ###################################
    base_lr = config.lr
    parameters = []
    parameters += list(model.stem.parameters())
    parameters += list(model.cells.parameters())
    parameters += list(model.refine32.parameters())
    parameters += list(model.refine16.parameters())
    parameters += list(model.head0.parameters())
    parameters += list(model.head1.parameters())
    parameters += list(model.head2.parameters())
    parameters += list(model.head02.parameters())
    parameters += list(model.head12.parameters())
    optimizer = torch.optim.SGD(
        parameters,
        lr=base_lr,
        momentum=config.momentum,
        weight_decay=config.weight_decay)

    # lr policy ##############################
    lr_policy = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.978)

    # data loader ###########################
    data_setting = {'img_root': config.img_root_folder,
                    'gt_root': config.gt_root_folder,
                    'train_source': config.train_source,
                    'eval_source': config.eval_source,
                    'down_sampling': config.down_sampling}
    index_select = list(range(config.num_train_imgs))
    shuffle(index_select)  # shuffle to make sure balanced dataset split
    train_loader_model = get_train_loader(config, Cityscapes, portion=config.train_portion, index_select=index_select)
    train_loader_arch = get_train_loader(config, Cityscapes, portion=config.train_portion-1, index_select=index_select)

    evaluator = SegEvaluator(Cityscapes(data_setting, 'val', None), config.num_classes, config.image_mean,
                             config.image_std, model, config.eval_scale_array, config.eval_flip, 0, config=config,
                             verbose=False, save_path=None, show_image=False)

    if update_arch:
        for idx in range(len(config.latency_weight)):
            logger.add_scalar("arch/latency_weight%d"%idx, config.latency_weight[idx], 0)
            logging.info("arch_latency_weight%d = "%idx + str(config.latency_weight[idx]))

    tbar = tqdm(range(config.nepochs), ncols=80)
    valid_mIoU_history = []; FPSs_history = []; latency_supernet_history = []; latency_weight_history = []
    valid_names = ["8s", "16s", "32s", "8s_32s", "16s_32s"]
    arch_names = {0: "teacher", 1: "student"}
    for epoch in tbar:
        logging.info(pretrain)
        logging.info(config.save)
        logging.info("lr: " + str(optimizer.param_groups[0]['lr']))

        logging.info("update arch: " + str(update_arch))

        # training
        tbar.set_description("[Epoch %d/%d][train...]" % (epoch + 1, config.nepochs))
        train(pretrain, train_loader_model, train_loader_arch, model, architect, ohem_criterion, optimizer, lr_policy, logger, epoch, update_arch=update_arch)
        torch.cuda.empty_cache()
        lr_policy.step()

        # validation
        tbar.set_description("[Epoch %d/%d][validation...]" % (epoch + 1, config.nepochs))
        with torch.no_grad():
            if pretrain == True:
                model.prun_mode = "min"
                valid_mIoUs = infer(epoch, model, evaluator, logger, FPS=False)
                for i in range(5):
                    logger.add_scalar('mIoU/val_min_%s'%valid_names[i], valid_mIoUs[i], epoch)
                    logging.info("Epoch %d: valid_mIoU_min_%s %.3f"%(epoch, valid_names[i], valid_mIoUs[i]))
                if len(model._width_mult_list) > 1:
                    model.prun_mode = "max"
                    valid_mIoUs = infer(epoch, model, evaluator, logger, FPS=False)
                    for i in range(5):
                        logger.add_scalar('mIoU/val_max_%s'%valid_names[i], valid_mIoUs[i], epoch)
                        logging.info("Epoch %d: valid_mIoU_max_%s %.3f"%(epoch, valid_names[i], valid_mIoUs[i]))
                    model.prun_mode = "random"
                    valid_mIoUs = infer(epoch, model, evaluator, logger, FPS=False)
                    for i in range(5):
                        logger.add_scalar('mIoU/val_random_%s'%valid_names[i], valid_mIoUs[i], epoch)
                        logging.info("Epoch %d: valid_mIoU_random_%s %.3f"%(epoch, valid_names[i], valid_mIoUs[i]))
            else:
                valid_mIoUss = []; FPSs = []
                model.prun_mode = None
                for idx in range(len(model._arch_names)):
                    # arch_idx
                    model.arch_idx = idx
                    valid_mIoUs, fps0, fps1 = infer(epoch, model, evaluator, logger)
                    valid_mIoUss.append(valid_mIoUs)
                    FPSs.append([fps0, fps1])
                    for i in range(5):
                        # preds
                        logger.add_scalar('mIoU/val_%s_%s'%(arch_names[idx], valid_names[i]), valid_mIoUs[i], epoch)
                        logging.info("Epoch %d: valid_mIoU_%s_%s %.3f"%(epoch, arch_names[idx], valid_names[i], valid_mIoUs[i]))
                    if config.latency_weight[idx] > 0:
                        logger.add_scalar('Objective/val_%s_8s_32s'%arch_names[idx], objective_acc_lat(valid_mIoUs[3], 1000./fps0), epoch)
                        logging.info("Epoch %d: Objective_%s_8s_32s %.3f"%(epoch, arch_names[idx], objective_acc_lat(valid_mIoUs[3], 1000./fps0)))
                        logger.add_scalar('Objective/val_%s_16s_32s'%arch_names[idx], objective_acc_lat(valid_mIoUs[4], 1000./fps1), epoch)
                        logging.info("Epoch %d: Objective_%s_16s_32s %.3f"%(epoch, arch_names[idx], objective_acc_lat(valid_mIoUs[4], 1000./fps1)))
                valid_mIoU_history.append(valid_mIoUss)
                FPSs_history.append(FPSs)
                if update_arch:
                    latency_supernet_history.append(architect.latency_supernet)
                    latency_weight_history.append(architect.latency_weight)

        save(model, os.path.join(config.save, 'weights.pt'))
        if type(pretrain) == str:
            # contains arch_param names: {"alphas": alphas, "betas": betas, "gammas": gammas, "ratios": ratios}
            for idx, arch_name in enumerate(model._arch_names):
                state = {}
                for name in arch_name['alphas']:
                    state[name] = getattr(model, name)
                for name in arch_name['betas']:
                    state[name] = getattr(model, name)
                for name in arch_name['ratios']:
                    state[name] = getattr(model, name)
                state["mIoU02"] = valid_mIoUs[3]
                state["mIoU12"] = valid_mIoUs[4]
                if pretrain is not True:
                    state["latency02"] = 1000. / fps0
                    state["latency12"] = 1000. / fps1
                torch.save(state, os.path.join(config.save, "arch_%d_%d.pt"%(idx, epoch)))
                torch.save(state, os.path.join(config.save, "arch_%d.pt"%(idx)))

        if update_arch:
            for idx in range(len(config.latency_weight)):
                if config.latency_weight[idx] > 0:
                    if (int(FPSs[idx][0] >= config.FPS_max[idx]) + int(FPSs[idx][1] >= config.FPS_max[idx])) >= 1:
                        architect.latency_weight[idx] /= 2
                    elif (int(FPSs[idx][0] <= config.FPS_min[idx]) + int(FPSs[idx][1] <= config.FPS_min[idx])) > 0:
                        architect.latency_weight[idx] *= 2
                    logger.add_scalar("arch/latency_weight_%s"%arch_names[idx], architect.latency_weight[idx], epoch+1)
                    logging.info("arch_latency_weight_%s = "%arch_names[idx] + str(architect.latency_weight[idx]))
# print('Total params: %f M' % (sum(p.numel() for p in model.parameters()) / 1024. / 1024.0))
# print(len(list(model.modules())))

# model = mobilenetv2_sandglass()
# print('Total params: %f M' % (sum(p.numel() for p in model.parameters()) / 1024. / 1024.0))
# print(len(list(model.modules())))

# model = MobileNetV2_sandglass()
# print('Total params: %f M' % (sum(p.numel() for p in model.parameters()) / 1024. / 1024.0))
# print(len(list(model.modules())))

# model = InvertedResidual(32, 32, 1, 6)
# print('InvertedResidual params: %.f' % (sum(p.numel() for p in model.parameters())))
# print(len(list(model.modules())))
# print(model)

# model = Sandglass(192, 192, 1, 6)
# print('Sandglass params: %.f' % (sum(p.numel() for p in model.parameters())))
# print(len(list(model.modules())))
# # print(model)

# model = My_Sandglass(192, 192, 1, 6)
# print('Sandglass params: %.f' % (sum(p.numel() for p in model.parameters())))
# print(len(list(model.modules())))
# print(model)

# model.eval()
# # print(model)

# NOTE: one of the commented-out constructors above must be enabled so that `model`
# is defined before the profiling call below runs.
input = torch.randn(1, 3, 224, 224)
# y = model(input)
# # print(y.shape)
# print('Total params: %f M' % (sum(p.numel() for p in model.parameters()) / 1024. / 1024.0))

from thop import profile
flops, params = profile(model, inputs=[input])
print(flops)
print(params)
print('Total params: %f M' % (sum(p.numel() for p in model.parameters())))
def _flops(h, w, C_in, C_out, stride=1):
    layer = FactorizedReduce(C_in, C_out, stride, slimmable=False)
    flops, params = profile(layer, inputs=(torch.randn(1, C_in, h, w), ), verbose=False)
    return flops
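# A hypothetical way such a helper could be used: tabulating the op's FLOPs at a few
# feature-map sizes, as NAS-style lookup tables often do. The sizes and channel counts
# below are arbitrary assumptions, and _flops relies on the FactorizedReduce / profile
# imports from the snippet above.
flops_table = {}
for h, w in [(64, 128), (32, 64), (16, 32)]:
    flops_table[(h, w)] = _flops(h, w, C_in=64, C_out=128, stride=2)
print(flops_table)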
def count_flops(model, target_size):
    from thop import profile
    model_input = torch.randn(1, 3, target_size, target_size)
    flops, n_params = profile(model, inputs=(model_input, ), verbose=False)
    return flops, n_params
    write_voc_results_file(box_list, dataset)
    do_python_eval(output_dir)


if __name__ == '__main__':
    # load net
    num_classes = len(labelmap) + 1  # +1 for background
    net = build_ssd('test', 300, num_classes)  # initialize SSD

    # Load all of the weights into the network
    net.load_state_dict(torch.load(args.trained_model))

    # Get an estimated number of parameters and flops for the model
    if args.cuda == False:
        input = torch.randn(1, 3, 300, 300)
        flops, params = profile(net, inputs=(input, ), verbose=0)
        flops, params = clever_format([flops, params], "%.3f")
        print("\nFlops =", flops, "\nParams =", params, "\n")

    net.eval()
    print('Finished loading model!')
    # load data
    dataset = VOCDetection(args.voc_root, [('2007', set_type)],
                           BaseTransform(300, dataset_mean),
                           VOCAnnotationTransform())
    if args.cuda:
        net = net.cuda()
        cudnn.benchmark = True
    # evaluation
    test_net(args.save_folder, net,
def main_worker(gpu, opt, cfg):
    if opt.seed is not None:
        setup_seed(opt.seed)

    if gpu is not None:
        opt.gpu = gpu

    init_dist(opt)

    if not opt.log:
        logger.setLevel(50)
        null_writer = NullWriter()
        sys.stdout = null_writer

    logger.info('******************************')
    logger.info(opt)
    logger.info('******************************')
    logger.info(cfg)
    logger.info('******************************')

    opt.nThreads = int(opt.nThreads / num_gpu)

    # Model Initialize
    m = preset_model(cfg)
    if opt.params:
        from thop import clever_format, profile
        input = torch.randn(1, 3, 256, 256).cuda(opt.gpu)
        flops, params = profile(m.cuda(opt.gpu), inputs=(input, ))
        macs, params = clever_format([flops, params], "%.3f")
        logger.info(macs, params)

    m.cuda(opt.gpu)
    m = torch.nn.parallel.DistributedDataParallel(m, device_ids=[opt.gpu])

    criterion = builder.build_loss(cfg.LOSS).cuda(opt.gpu)
    optimizer = torch.optim.Adam(m.parameters(), lr=cfg.TRAIN.LR)

    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(
        optimizer, milestones=cfg.TRAIN.LR_STEP, gamma=cfg.TRAIN.LR_FACTOR)

    if opt.log:
        writer = SummaryWriter('.tensorboard/{}/{}-{}'.format(
            cfg.DATASET.DATASET, cfg.FILE_NAME, opt.exp_id))
    else:
        writer = None

    if cfg.DATASET.DATASET == 'mix_smpl':
        train_dataset = MixDataset(cfg=cfg, train=True)
    else:
        raise NotImplementedError

    heatmap_to_coord = get_func_heatmap_to_coord(cfg)

    train_sampler = torch.utils.data.distributed.DistributedSampler(
        train_dataset, num_replicas=opt.world_size, rank=opt.rank)

    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=cfg.TRAIN.BATCH_SIZE,
                                               shuffle=(train_sampler is None),
                                               num_workers=opt.nThreads,
                                               sampler=train_sampler,
                                               worker_init_fn=_init_fn)

    # gt val dataset
    gt_val_dataset_h36m = MixDataset(cfg=cfg, train=False)
    gt_val_dataset_3dpw = PW3D(cfg=cfg, ann_file='3DPW_test_new.json', train=False)

    opt.trainIters = 0
    best_err_h36m = 999
    best_err_3dpw = 999

    for i in range(cfg.TRAIN.BEGIN_EPOCH, cfg.TRAIN.END_EPOCH):
        opt.epoch = i
        train_sampler.set_epoch(i)

        current_lr = optimizer.state_dict()['param_groups'][0]['lr']

        logger.info(
            f'############# Starting Epoch {opt.epoch} | LR: {current_lr} #############'
        )

        # Training
        loss, acc17 = train(opt, train_loader, m, criterion, optimizer, writer)
        logger.epochInfo('Train', opt.epoch, loss, acc17)

        lr_scheduler.step()

        if (i + 1) % opt.snapshot == 0:
            if opt.log:
                # Save checkpoint
                torch.save(
                    m.module.state_dict(),
                    './exp/{}/{}-{}/model_{}.pth'.format(
                        cfg.DATASET.DATASET, cfg.FILE_NAME, opt.exp_id, opt.epoch))
            # Prediction Test
            with torch.no_grad():
                gt_tot_err_h36m = validate_gt(m, opt, cfg, gt_val_dataset_h36m, heatmap_to_coord)
                gt_tot_err_3dpw = validate_gt(m, opt, cfg, gt_val_dataset_3dpw, heatmap_to_coord)
                if opt.log:
                    if gt_tot_err_h36m <= best_err_h36m:
                        best_err_h36m = gt_tot_err_h36m
                        torch.save(
                            m.module.state_dict(),
                            './exp/{}/{}-{}/best_h36m_model.pth'.format(
                                cfg.DATASET.DATASET, cfg.FILE_NAME, opt.exp_id))
                    if gt_tot_err_3dpw <= best_err_3dpw:
                        best_err_3dpw = gt_tot_err_3dpw
                        torch.save(
                            m.module.state_dict(),
                            './exp/{}/{}-{}/best_3dpw_model.pth'.format(
                                cfg.DATASET.DATASET, cfg.FILE_NAME, opt.exp_id))

                    logger.info(
                        f'##### Epoch {opt.epoch} | h36m err: {gt_tot_err_h36m} / {best_err_h36m} | 3dpw err: {gt_tot_err_3dpw} / {best_err_3dpw} #####'
                    )

        torch.distributed.barrier()  # Sync

    torch.save(
        m.module.state_dict(),
        './exp/{}/{}-{}/final_DPG.pth'.format(cfg.DATASET.DATASET, cfg.FILE_NAME, opt.exp_id))
from unet_parts import double_conv, double_dsconv
from unet_model import UNet, UNet_dsc
from torchsummary import summary
import torch
from thop import profile

model = UNet(3, 2)
model_dsc = UNet_dsc(3, 2)
# summary(model.cuda(), (3, 512, 512))
# summary(model_dsc.cuda(), (3, 512, 512))

conv = double_conv(64, 128)
conv_dsc = double_dsconv(64, 128)
summary(conv.cuda(), (64, 512, 512))
summary(conv_dsc.cuda(), (64, 512, 512))

# torch.save(model.state_dict(), 'unet.pth')
# torch.save(model_dsc.state_dict(), 'unet_dsc.pth')

# note: recent versions of thop take the actual input tensors via `inputs=`;
# the `input_size=` keyword used originally belongs to an older thop API.
# The input lives on the GPU because the summary calls above moved the blocks there.
x = torch.randn(1, 64, 512, 512).cuda()
flops, params = profile(conv, inputs=(x,))
print(flops, params)
flops, params = profile(conv_dsc, inputs=(x,))
print(flops, params)