def classifier(args, clf, train, pretrained_dir="/"):
    """Set up the classifier.

    Arguments:
        args -- config namespace; provides num_classes, image_channels and dataset
                ("Cifar10" or "Cifar100")
        clf -- "resnet18", "resnet50", or "resnet101"
        train {bool} -- build a fresh model (True) or load pretrained weights (False)

    Keyword Arguments:
        pretrained_dir {str} -- pretrained weights path (default: {"/"})

    Returns:
        Model
    """
    num_classes = args.num_classes
    input_channels = args.image_channels
    map_location = (lambda s, _: s)
    checkpoint_dir = os.path.join(pretrained_dir, args.dataset, clf + '.pth')

    if clf.lower() == 'resnet18':
        net = resnet18(num_classes, input_channels)
    elif clf.lower() == 'resnet50':
        net = resnet50(num_classes, input_channels)
    elif clf.lower() == 'resnet101':
        net = resnet101(num_classes, input_channels)
    else:
        raise ValueError(
            "You can choose the model among [resnet18, resnet50, resnet101]")

    if not train:
        checkpoint = torch.load(checkpoint_dir, map_location=map_location)
        net.load_state_dict(checkpoint['model_state_dict'])

    return net
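# --- Hedged usage sketch for classifier() above (not part of the original repo). ---
# Assumes `args` exposes num_classes, image_channels and dataset, matching the
# attributes the function reads; the resnet18/50/101 builders come from the same module.
from types import SimpleNamespace

_args = SimpleNamespace(num_classes=10, image_channels=3, dataset="Cifar10")
_net = classifier(_args, clf="resnet18", train=True)       # fresh model for training
# _net = classifier(_args, clf="resnet18", train=False,
#                   pretrained_dir="./checkpoints")         # would load ./checkpoints/Cifar10/resnet18.pth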
def train_main(args):
    global loader_train, loader_val
    loader_train, loader_val, loader_test = load_data(train_bath_size=BATCH_SIZE, args=args,
                                                      RANDOM_SEED=RANDOM_SEED, val_batch_size=BATCH_SIZE)
    device = set_device()
    setup_seed(RANDOM_SEED)  # random seed
    # model = googleNet()
    model = resnet18()
    # model = load_model(model, args.pretrained_model_path, device=device)
    model = nn.DataParallel(model)  # multi-GPU
    criterion = nn.CrossEntropyLoss()
    params = net_lr(model, FC_LR, NET_LR)
    if OPTIMIZER == 'adam':
        optimizer = torch.optim.Adam(params, betas=(0.9, 0.999), weight_decay=0, eps=1e-08)
    else:
        optimizer = torch.optim.SGD(params, momentum=MOMENTUM, nesterov=True,
                                    weight_decay=WEIGHT_DECAY)
    print(model)

    start_epoch = 0
    if Load_model:
        start_epoch = 25
        filepath = 'load_model_path'
        model = load_model(model, filepath, device=device)
        model = model.to(device=device)
        optimizer = load_optimizer(optimizer, filepath, device=device)

    train(model, optimizer, criterion, device=device, epochs=EPOCH, start=start_epoch)
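# --- Hedged sketch of the net_lr helper assumed by train_main() above; the repo's ---
# --- actual implementation is not shown, this is only a plausible reconstruction. ---
# It builds optimizer parameter groups so the classifier head (fc) and the backbone
# can be trained with different learning rates (FC_LR vs NET_LR).
def net_lr(model, fc_lr, backbone_lr):
    fc_params = [p for n, p in model.named_parameters() if 'fc' in n]
    backbone_params = [p for n, p in model.named_parameters() if 'fc' not in n]
    return [{'params': backbone_params, 'lr': backbone_lr},
            {'params': fc_params, 'lr': fc_lr}]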
def model_init(model_name):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if model_name == 'retinanet':
        # weight_file_path = '/content/retinanet/resnet34-333f7ec4.pth'
        # weight_file_path = '/content/retinanet/CP_epoch5.pth'
        weight_file_path = '/content/retinanet/retinanet50_pretrained.pth'
        total_keys = len(list(torch.load(weight_file_path).keys()))

        # Create the model: infer the backbone depth from the number of keys in the state dict
        if total_keys >= 102 and total_keys < 182:
            retinanet = model.resnet18(num_classes=num_classes, pretrained=False)
        elif total_keys >= 182 and total_keys < 267:
            retinanet = model.resnet34(num_classes=num_classes, pretrained=False)
        elif total_keys >= 267 and total_keys < 522:
            retinanet = model.resnet50(num_classes=num_classes, pretrained=False)
        elif total_keys >= 522 and total_keys < 777:
            retinanet = model.resnet101(num_classes=num_classes, pretrained=False)
        elif total_keys >= 777:
            retinanet = model.resnet152(num_classes=num_classes, pretrained=False)
        else:
            raise ValueError('Unsupported model backbone, must be one of resnet18, resnet34, '
                             'resnet50, resnet101, resnet152')

        # Initialising the model with the loaded weights
        retinanet.load_state_dict(torch.load(weight_file_path, map_location=device), strict=False)

    print('model initialized..')
    return retinanet, device
def cnn(classes=5):
    model = resnet18(num_classes=classes)
    model.compile(loss="sparse_categorical_crossentropy",
                  optimizer=keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999),
                  metrics=['accuracy'])
    return model
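# --- Hedged usage sketch for cnn() above (illustrative only). ---
# The input shape expected by this Keras resnet18 builder is not shown here;
# 224x224 RGB images and integer labels are an assumption.
import numpy as np

x_dummy = np.random.rand(8, 224, 224, 3).astype("float32")
y_dummy = np.random.randint(0, 5, size=(8,))
clf_model = cnn(classes=5)
clf_model.fit(x_dummy, y_dummy, epochs=1, batch_size=4)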
def __init__(self, batch_size, lr, num_workers, data_root, **kwargs):
    super(LitDistModule, self).__init__()
    self.model = resnet18(with_fc=False)
    self.batch_size = batch_size
    self.lr = lr
    self.num_workers = num_workers
    self.data_root = data_root
def test_resnet18(test_loader):
    model = resnet18(input_channel=1, num_classes=10)
    print("ResNet-18:")
    print(model)
    for batch_idx, (data, target) in enumerate(test_loader):
        print("Input data shape: {}".format(data.shape))
        output = model(data)
        print("Output shape: {}".format(output.shape))
        break
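# --- Minimal smoke-test sketch for test_resnet18() above (illustrative only). ---
# Assumes the resnet18 builder accepts 1-channel inputs, as suggested by
# input_channel=1 in the function; the 32x32 tensor shape is a placeholder.
import torch
from torch.utils.data import DataLoader, TensorDataset

_dummy = TensorDataset(torch.randn(8, 1, 32, 32), torch.randint(0, 10, (8,)))
test_resnet18(DataLoader(_dummy, batch_size=4))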
def __init__(self, lr, batch_size, num_workers, **kwargs):
    super(BasePL, self).__init__()
    self.model = resnet18(with_fc=True)
    self.criterion = torch.nn.CrossEntropyLoss()
    self.train_accuracy = pl.metrics.Accuracy()
    self.batch_size = batch_size
    self.lr = lr
    self.num_workers = num_workers
    self.save_hyperparameters()
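# --- Hedged training sketch for the Lightning module above (illustrative only). ---
# Assumes the module defines its own train_dataloader() from the stored arguments;
# Trainer arguments are placeholders for an older PyTorch Lightning API
# (the pl.metrics usage above suggests a pre-1.3 release).
module = BasePL(lr=1e-3, batch_size=128, num_workers=4)
trainer = pl.Trainer(max_epochs=10, gpus=1)
trainer.fit(module)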
def main_dropout(data_args, train_args, model_args):
    train_loader, val_loader, test_loader = get_dataloaders(data_args, load_test=False)
    # train_loader, val_loader, test_loader = get_dataloaders_incr(data_args, load_test=False)
    # train_loader, val_loader = train_loader[0], val_loader[0]
    assert model_args.load_state_path, 'please specify a path to a pretrained model'
    state = torch.load(model_args.load_state_path)
    net = resnet18(num_classes=data_args.num_classes, seed=data_args.seed,
                   disable_bn_stats=model_args.disable_bn_stats)
    net.load_state_dict(state)
    net.cuda()
    drop_features(train_args, net, train_loader, val_loader, device=0)
def main_consolidate(data_args, train_args, model_args):
    assert model_args.load_state_path, 'please specify a path to a pretrained model'
    state = torch.load(model_args.load_state_path)
    net = resnet18(num_classes=data_args.num_classes, seed=data_args.seed,
                   disable_bn_stats=model_args.disable_bn_stats)
    if data_args.num_classes != state['fc.weight'].shape[0]:
        net.fc = nn.Linear(net.fc.in_features, state['fc.bias'].shape[0], bias=True)
    net.load_state_dict(state)
    net.cuda()
    if train_args.single_task:
        consolidate_single_task(data_args, train_args, net, device=0)
    else:
        consolidate_multi_task(data_args, train_args, net, device=0)
def load_model(self):
    self.checkpoint = torch.load(self.model_checkpoint_file_path,
                                 map_location=lambda storage, loc: storage)
    self.model_args = self.checkpoint['args']
    self.num_classes = None
    if self.model_args.model_type == 'food179':
        self.num_classes = 179
    elif self.model_args.model_type == 'nsfw':
        self.num_classes = 5
    else:
        raise NotImplementedError('Not Implemented!')

    if self.model_args.model_arc == 'resnet18':
        self.model = model.resnet18(num_classes=self.num_classes, zero_init_residual=True)
    elif self.model_args.model_arc == 'resnet34':
        self.model = model.resnet34(num_classes=self.num_classes, zero_init_residual=True)
    elif self.model_args.model_arc == 'resnet50':
        self.model = model.resnet50(num_classes=self.num_classes, zero_init_residual=True)
    elif self.model_args.model_arc == 'resnet101':
        self.model = model.resnet101(num_classes=self.num_classes, zero_init_residual=True)
    elif self.model_args.model_arc == 'resnet152':
        self.model = model.resnet152(num_classes=self.num_classes, zero_init_residual=True)
    elif self.model_args.model_arc == 'mobilenet':
        self.model = model.MobileNetV2(n_class=self.num_classes, input_size=256)
    else:
        raise NotImplementedError('Not Implemented!')

    self.model = nn.DataParallel(self.model)
    self.model.load_state_dict(self.checkpoint['model_state_dict'])
    self.model_epoch = self.checkpoint['epoch']
    self.model_test_acc = self.checkpoint['test_acc']
    self.model_best_acc = self.checkpoint['best_acc']
    self.model_test_acc_top5 = self.checkpoint['test_acc_top5']
    self.model_class_to_idx = self.checkpoint['class_to_idx']
    self.model_idx_to_class = {v: k for k, v in self.model_class_to_idx.items()}
    self.model_train_history_dict = self.checkpoint['train_history_dict']
    self.mean = self.checkpoint['NORM_MEAN']
    self.std = self.checkpoint['NORM_STD']
    self.model.eval()
    return
def predict_bagging(args):
    global loader_train, loader_val
    loader_train, loader_val, loader_test = load_data(train_bath_size=BATCH_SIZE, args=args,
                                                      RANDOM_SEED=RANDOM_SEED, val_batch_size=BATCH_SIZE)
    device = set_device()
    setup_seed(RANDOM_SEED)  # random seed
    model = resnet18()

    model_list = []
    for maindir, subdir, file_name_list in os.walk(args.bagging_root):
        for filename in file_name_list:
            model_path = os.path.join(maindir, filename)
            print(model_path)
            model_i = load_model(model, model_path, device=device)
            model_i = model_i.to(device=device)
            model_i.eval()
            model_list.append(copy.deepcopy(model_i))

    result = []
    for batch_idx, (x, label, idx) in enumerate(loader_test):
        x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
        x = Variable(x)
        oneHot_result_sum = torch.zeros(x.size(0), 10)
        for model_i in model_list:
            output = model_i(x)
            _, predicted = output.max(1)
            predicted_cpu = predicted.cpu()
            oneHot_predicted = torch.zeros(output.size(0),
                                           output.size(1)).scatter_(1, predicted_cpu.unsqueeze(1), 1)
            oneHot_result_sum += oneHot_predicted
        _, bagging_result = oneHot_result_sum.max(1)
        bagging_result = bagging_result.cpu()
        if len(idx.shape) == 1:
            idx = idx.unsqueeze(1)
        index_predicted = torch.cat([idx, bagging_result.unsqueeze(1)], dim=1)
        index_predicted = index_predicted.cpu().data.numpy()
        result.extend(index_predicted)

    headers = ['image_id', 'label']
    with open(args.predict_output_root, 'w', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)
def main(args=None):
    data_set = {
        x: guipang(cfg=cfg['dataset_guipang'], part=x) for x in ['train', 'val']
    }
    # data_set = {
    #     x: qiafan(cfg=cfg['dataset_qiafan'], part=x) for x in ['train', 'val']
    # }
    data_loader = {
        x: data.DataLoader(data_set[x], batch_size=cfg['batch_size'],
                           num_workers=4, shuffle=True, pin_memory=False)
        for x in ['train', 'val']
    }

    # Create the model; the number of classes comes from the training split
    num_classes = data_set['train'].num_classes()
    if cfg['depth'] == 18:
        retinanet = model.resnet18(num_classes=num_classes, pretrained=True)
    elif cfg['depth'] == 34:
        retinanet = model.resnet34(num_classes=num_classes, pretrained=True)
    elif cfg['depth'] == 50:
        retinanet = model.resnet50(num_classes=num_classes, pretrained=True)
    elif cfg['depth'] == 101:
        retinanet = model.resnet101(num_classes=num_classes, pretrained=True)
    elif cfg['depth'] == 152:
        retinanet = model.resnet152(num_classes=num_classes, pretrained=True)
    else:
        raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152')

    use_gpu = True
    if use_gpu:
        retinanet = retinanet.cuda()
    retinanet = torch.nn.DataParallel(retinanet).cuda()

    optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)
def get_nn_model(run_dir, ds):
    config.log.info('==> Building model...')
    if config.args.model not in config.torch_models:
        raise NotImplementedError
    net = None
    if config.args.model == config.Resnet18:
        net = model.resnet18(num_classes=ds.label_count)
    elif config.args.model == config.OverfitResnet:
        net = model.OverfitResNet18(num_classes=ds.label_count)
    elif config.args.model == config.MLP:
        net = model.create_mlp(len(ds.feature_columns), ds.label_count)
    elif config.args.model == config.LeNet:
        if config.args.dataset in config.ColoredMNIST_lst:
            net = model.LeNetMNIST(3, ds.label_count)  # used for colored MNIST
        elif config.args.dataset == config.Timit:
            net = model.LeNetTIMIT(1, ds.label_count)  # used for TIMIT
        elif config.args.dataset == config.Timit2Groups:
            # net = model.LeNetTIMIT(1, ds.label_count)  # used for TIMIT
            net = model.LeNetTIMIT2(1, ds.label_count)  # used for TIMIT
    net = net.to(config.args.torch_device)

    if config.args.task in [*config.augment_testing_lst, config.evaluate_fairness]:
        net_path = os.path.join(run_dir, config.args.save_model)
        config.log.info(f'==> Loading model from: {net_path}')
        if config.args.use_cuda:
            state = torch.load(net_path)
        else:
            state = torch.load(net_path, map_location=torch.device('cpu'))
        net.load_state_dict(state['net'])

    # support cuda
    if config.args.use_cuda:
        config.log.info('Using CUDA')
        config.log.info('Parallel training on {0} GPUs.'.format(torch.cuda.device_count()))
        net = torch.nn.DataParallel(net, device_ids=list(range(torch.cuda.device_count())))
        cudnn.benchmark = True
    return net
def main(args=None): from dataloader import JinNanDataset, Augmenter, UnNormalizer, Normalizer,Resizer from torch.utils.data import Dataset, DataLoader from torchvision import datasets, models, transforms import model import torch import argparse parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') parser.add_argument('--dataset',default='jingnan', help='Dataset type, must be one of csv or coco.') parser.add_argument('--threshold',help='treshold') parser.add_argument('--dataset_path', help='Path to file containing training and validation annotations (optional, see readme)') parser.add_argument('--model_path',help=('the model path')) parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50) parser = parser.parse_args(args) dataset_val=JinNanDataset(parser.dataset_path, set_name='val', transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) # Create the model if parser.depth == 18: retinanet = model.resnet18(num_classes=dataset_val.num_classes(), pretrained=True) elif parser.depth == 34: retinanet = model.resnet34(num_classes=dataset_val.num_classes(), pretrained=True) elif parser.depth == 50: retinanet = model.resnet50(num_classes=dataset_val.num_classes(), pretrained=True) elif parser.depth == 101: retinanet = model.resnet101(num_classes=dataset_val.num_classes(), pretrained=True) elif parser.depth == 152: retinanet = model.resnet152(num_classes=dataset_val.num_classes(), pretrained=True) else: raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152') retinanet=torch.load(parser.model_path) use_gpu = True if use_gpu: retinanet = retinanet.cuda() retinanet.eval() print('Evaluating dataset') evaluate_jinnan(dataset_val, retinanet)
def train(rank, nprocs, args):
    print(rank)
    torch.distributed.init_process_group(backend='nccl',
                                         init_method='tcp://127.0.0.1:23456',
                                         rank=rank, world_size=args.world_size)
    # seed for reproducibility
    torch.manual_seed(1)
    torch.cuda.manual_seed(1)
    torch.backends.cudnn.deterministic = True

    # create dataset.
    # train_loader, test_loader = partition_dataset(rank, args.world_size, args)
    train_loader, test_loader, train_sampler = create_dataloader('../data', args.world_size,
                                                                 args.batch_size)
    print("loading dataset succeeded!")

    # create model.
    model = resnet18()
    torch.cuda.set_device(rank)
    model.cuda(rank)
    cudnn.benchmark = True

    # define the optimizer.
    # optimizer = torch.optim.SGD(model.parameters(), lr=args.lr, momentum=0.9)
    optimizer = LocalSGD(model.parameters(), lr=args.lr, gmf=0, tau=args.tau,
                         size=args.world_size, momentum=0.9, nesterov=True, weight_decay=1e-4)

    # define the criterion and lr scheduler.
    criterion = nn.CrossEntropyLoss().cuda()
    for epoch in range(args.epoches):
        acc = train_one_epoch(model, optimizer, criterion, train_loader,
                              test_loader, epoch, rank)
        print(acc)
        break
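# --- Hedged launch sketch for the distributed train() above (illustrative only). ---
# One process per GPU via torch.multiprocessing; `args` is assumed to carry
# world_size, batch_size, lr, tau and epoches, matching the fields read in train().
import torch.multiprocessing as mp

def launch(args):
    mp.spawn(train, nprocs=args.world_size, args=(args.world_size, args))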
def build(self, depth=50, learning_rate=1e-5, ratios=[0.5, 1, 2], scales=[2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]): # Create the model if depth == 18: retinanet = model.resnet18(num_classes=self.dataset_train.num_classes(), ratios=ratios, scales=scales, weights_dir=self.weights_dir_path, pretrained=True) elif depth == 34: retinanet = model.resnet34(num_classes=self.dataset_train.num_classes(), ratios=ratios, scales=scales, weights_dir=self.weights_dir_path, pretrained=True) elif depth == 50: retinanet = model.resnet50(num_classes=self.dataset_train.num_classes(), ratios=ratios, scales=scales, weights_dir=self.weights_dir_path, pretrained=True) elif depth == 101: retinanet = model.resnet101(num_classes=self.dataset_train.num_classes(), ratios=ratios, scales=scales, weights_dir=self.weights_dir_path, pretrained=True) elif depth == 152: retinanet = model.resnet152(num_classes=self.dataset_train.num_classes(), ratios=ratios, scales=scales, weights_dir=self.weights_dir_path, pretrained=True) else: raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152') self.retinanet = retinanet.to(device=self.device) self.retinanet.training = True self.optimizer = optim.Adam(self.retinanet.parameters(), lr=learning_rate) self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=3, verbose=True) if self.checkpoint is not None: self.retinanet.load_state_dict(self.checkpoint['model']) self.optimizer.load_state_dict(self.checkpoint['optimizer']) self.scheduler.load_state_dict(self.checkpoint['scheduler']) # TODO: test this, is it done right? # TODO is it right to resume_read_trial optimizer and schedular like this??? self.ratios = ratios self.scales = scales self.depth = depth
def predict_main(args):
    global loader_train, loader_val
    loader_train, loader_val, loader_test = load_data(train_bath_size=BATCH_SIZE, args=args,
                                                      RANDOM_SEED=RANDOM_SEED, val_batch_size=BATCH_SIZE)
    device = set_device()
    setup_seed(RANDOM_SEED)  # random seed
    model = resnet18()
    model = load_model(model, args.load_model_path, device=device)
    model = model.to(device=device)
    test_epoch(model, nn.CrossEntropyLoss(), loader_val, device, 0, 1)
    model.eval()

    result = []
    for batch_idx, (x, label, idx) in enumerate(loader_test):
        x = x.to(device=device, dtype=dtype)  # move to device, e.g. GPU
        x = Variable(x)
        output = model(x)
        _, predicted = output.max(1)
        predicted = predicted.cpu()
        if len(idx.shape) == 1:
            idx = idx.unsqueeze(1)
        index_predicted = torch.cat([idx, predicted.unsqueeze(1)], dim=1)
        index_predicted = index_predicted.cpu().data.numpy()
        result.extend(index_predicted)

    headers = ['image_id', 'label']
    with open(args.predict_output_root, 'w', newline='') as f:
        f_csv = csv.writer(f)
        f_csv.writerow(headers)
        f_csv.writerows(result)
def set_models(self, dataset_train):
    # Create the model
    if self.depth == 18:
        retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True)
    elif self.depth == 34:
        retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True)
    elif self.depth == 50:
        retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True)
    elif self.depth == 101:
        retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True)
    elif self.depth == 152:
        retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True)
    else:
        raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152')

    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        # dim = 0 [30, xxx] -> [10, ...], [10, ...], [10, ...] on 3 GPUs
        retinanet = nn.DataParallel(retinanet)
    self.retinanet = retinanet.to(self.device)
    self.retinanet.training = True

    self.optimizer = optim.Adam(self.retinanet.parameters(), lr=self.lr)
    # This lr scheduler reduces the learning rate based on the model's validation loss
    self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, patience=3, verbose=True)
    self.loss_hist = collections.deque(maxlen=500)
def train(num_epoch, learning_rate, save_name, batch_size, early_stopping_epoch=10, cuda_flag=False):
    # basic hyper-parameters
    num_epoch = num_epoch
    lr_decay = 500
    lr = learning_rate
    batch_size = batch_size

    model = resnet18()
    if cuda_flag:
        print('using cuda...')
        model = model.cuda()
    else:
        print('using cpu...')

    # model parameter initialization
    def weight_init(m):
        # use isinstance to check which layer type m is
        if isinstance(m, nn.Conv2d):
            # n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            # m.weight.data.normal_(0, math.sqrt(2. / n))
            torch.nn.init.xavier_uniform_(m.weight)
            torch.nn.init.constant_(m.bias, 0.1)
        elif isinstance(m, nn.Linear):
            m.weight.data.normal_()  # fully-connected layer initialization
        elif isinstance(m, nn.BatchNorm2d):
            # weight and bias of m are Variables, so they can be learned and back-propagated
            m.weight.data.fill_(1)
            m.bias.data.zero_()

    model.apply(weight_init)

    # define the loss function
    loss_func = nn.CrossEntropyLoss()
    # define the optimizer
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1000, gamma=lr_decay)

    train_data_loader, valid_data_loader, test_data_loader, _ = my_data_loader(batch_size)

    train_loss_epoch = []
    valid_loss_epoch = []
    best_model = None
    import sys
    min_valid_loss = sys.maxsize

    for epoch in range(num_epoch):
        model.train()
        train_loss = 0
        train_pred_label = []
        train_true_label = []
        valid_pred_label = []
        valid_true_label = []
        # evaluate accuracy
        for x, y in train_data_loader:
            train_true_label.extend(y)
            if cuda_flag:
                x = x.cuda()
                y = y.cuda()
            optimizer.zero_grad()  # this line is crucial: without it both training and validation error become very large!!!
            pred_y = model(x)
            loss = loss_func(pred_y, y)
            if cuda_flag:
                pred_label = torch.argmax(pred_y.detach().cpu(), dim=1)
                train_pred_label.extend(pred_label)
                train_loss += loss.detach().cpu().numpy()
            else:
                train_loss += loss.detach().numpy()
            loss.backward()
            optimizer.step()
        train_accuracy = accuracy_score(train_true_label, train_pred_label)

        # valid performance
        model.eval()
        valid_loss = 0
        for valid_x, valid_y in valid_data_loader:
            valid_true_label.extend(valid_y)
            if cuda_flag:
                valid_x = valid_x.cuda()
                valid_y = valid_y.cuda()
            with torch.no_grad():  # this line is crucial: without it CUDA runs out of memory
                pred_valid_y = model(valid_x)
                cur_valid_loss = loss_func(pred_valid_y, valid_y)
                if cuda_flag:
                    pred_valid_label = torch.argmax(pred_valid_y.detach().cpu(), dim=1)
                    valid_pred_label.extend(pred_valid_label)
                    valid_loss += cur_valid_loss.detach().cpu().numpy()
                else:
                    valid_loss += cur_valid_loss.detach().numpy()
        valid_accuracy = accuracy_score(valid_true_label, valid_pred_label)

        print('===========epoch:{}, train_loss:{}, valid_loss:{}, train acc:{}, valid acc:{}==========='.format(
            epoch, train_loss, valid_loss, train_accuracy, valid_accuracy))
        train_loss_epoch.append(train_loss)
        valid_loss_epoch.append(valid_loss)

        ## only keep the model from the best epoch so far
        if valid_loss <= min_valid_loss:
            min_valid_loss = valid_loss  # track the best validation loss seen so far
            best_model = copy.deepcopy(model)
        if early_stopping(valid_loss_epoch, early_stopping_epoch):
            break
        # checkpoint
        # checkpoint every 10 epochs
        # changed to only save the best model seen before early stopping
        # if epoch % 10 == 0 and epoch != 0:
        #     checkpoint_tmp(model, save_name)

    min_loss_epoch = valid_loss_epoch.index(min(valid_loss_epoch))
    save_name += '-min_loss_epoch_{}'.format(min_loss_epoch)
    checkpoint_tmp(best_model, save_name)
def main(args=None): parser = argparse.ArgumentParser(description='Simple testing script for RetinaNet network.') parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.',default = "csv") parser.add_argument('--coco_path', help='Path to COCO directory') parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)',default="binary_class.csv") parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)') parser.add_argument('--csv_box_annot', help='Path to file containing predicted box annotations ') parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=18) parser.add_argument('--epochs', help='Number of epochs', type=int, default=500) parser.add_argument('--model', help='Path of .pt file with trained model',default = 'esposallescsv_retinanet_0.pt') parser.add_argument('--model_out', help='Path of .pt file with trained model to save',default = 'trained') parser.add_argument('--score_threshold', help='Score above which boxes are kept',default=0.15) parser.add_argument('--nms_threshold', help='Score above which boxes are kept',default=0.2) parser.add_argument('--max_epochs_no_improvement', help='Max epochs without improvement',default=100) parser.add_argument('--max_boxes', help='Max boxes to be fed to recognition',default=50) parser.add_argument('--seg_level', help='Line or word, to choose anchor aspect ratio',default='line') parser.add_argument('--htr_gt_box',help='Train recognition branch with box gt (for debugging)',default=False) parser = parser.parse_args(args) # Create the data loaders if parser.dataset == 'csv': if parser.csv_classes is None: raise ValueError('Must provide --csv_classes when training on COCO,') if parser.csv_val is None: dataset_val = None print('No validation annotations provided.') else: dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()])) if parser.csv_box_annot is not None: box_annot_data = CSVDataset(train_file=parser.csv_box_annot, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()])) else: box_annot_data = None else: raise ValueError('Dataset type not understood (must be csv or coco), exiting.') if dataset_val is not None: sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) dataloader_val = DataLoader(dataset_val, num_workers=0, collate_fn=collater, batch_sampler=sampler_val) if box_annot_data is not None: sampler_val = AspectRatioBasedSampler(box_annot_data, batch_size=1, drop_last=False) dataloader_box_annot = DataLoader(box_annot_data, num_workers=0, collate_fn=collater, batch_sampler=sampler_val) else: dataloader_box_annot = dataloader_val if not os.path.exists('trained_models'): os.mkdir('trained_models') # Create the model alphabet=dataset_val.alphabet if os.path.exists(parser.model): retinanet = torch.load(parser.model) else: if parser.depth == 18: retinanet = model.resnet18(num_classes=dataset_val.num_classes(), pretrained=True,max_boxes=int(parser.max_boxes),score_threshold=float(parser.score_threshold),seg_level=parser.seg_level,alphabet=alphabet) elif parser.depth == 34: retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 50: retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 101: retinanet = 
model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 152: retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True) else: raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152') use_gpu = True if use_gpu: retinanet = retinanet.cuda() retinanet = torch.nn.DataParallel(retinanet).cuda() #retinanet = torch.load('../Documents/TRAINED_MODELS/pytorch-retinanet/esposallescsv_retinanet_99.pt') #print "LOADED pretrained MODEL\n\n" optimizer = optim.Adam(retinanet.parameters(), lr=1e-4) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=4, verbose=True) loss_hist = collections.deque(maxlen=500) ctc = CTCLoss() retinanet.module.freeze_bn() best_cer = 1000 epochs_no_improvement=0 cers=[] retinanet.eval() retinanet.module.epochs_only_det = 0 #retinanet.module.htr_gt_box = False retinanet.training=False if parser.score_threshold is not None: retinanet.module.score_threshold = float(parser.score_threshold) '''if parser.dataset == 'csv' and parser.csv_val is not None: print('Evaluating dataset') ''' mAP = csv_eval.evaluate(dataset_val, retinanet,score_threshold=retinanet.module.score_threshold) aps = [] for k,v in mAP.items(): aps.append(v[0]) print ("VALID mAP:",np.mean(aps)) print("score th",retinanet.module.score_threshold) for idx,data in enumerate(dataloader_box_annot): print("Eval CER on validation set:",idx,"/",len(dataloader_box_annot),"\r") if box_annot_data: image_name = box_annot_data.image_names[idx].split('/')[-1].split('.')[-2] else: image_name = dataset_val.image_names[idx].split('/')[-1].split('.')[-2] #generate_pagexml(image_name,data,retinanet,parser.score_threshold,parser.nms_threshold,dataset_val) text_gt_path="/".join(dataset_val.image_names[idx].split('/')[:-1]) text_gt = os.path.join(text_gt_path,image_name+'.txt') f =open(text_gt,'r') text_gt_lines=f.readlines()[0] transcript_pred = get_transcript(image_name,data,retinanet,retinanet.module.score_threshold,float(parser.nms_threshold),dataset_val,alphabet) cers.append(float(editdistance.eval(transcript_pred,text_gt_lines))/len(text_gt_lines)) print("GT",text_gt_lines) print("PREDS SAMPLE:",transcript_pred) print("VALID CER:",np.mean(cers),"best CER",best_cer) print("GT",text_gt_lines) print("PREDS SAMPLE:",transcript_pred) print("VALID CER:",np.mean(cers),"best CER",best_cer)
def main(args=None): #def main(epoch): parser = argparse.ArgumentParser(description='Simple training script for training a RetinaNet network.') parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') parser.add_argument('--coco_path', help='Path to COCO directory') parser.add_argument('--csv_train', help='Path to file containing training annotations (see readme)') parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') parser.add_argument('--csv_val', help='Path to file containing validation annotations (optional, see readme)') parser.add_argument('--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50) parser.add_argument('--epochs', help='Number of epochs', type=int, default=100) #parser.add_argument('--resume', '-r', action='store_true', help='resume from checkpoint') parser.add_argument('--start-epoch', default=0, type=int, help='manual epoch number (useful on restarts)') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint (default: none)') parser = parser.parse_args(args) #args = parser.parse_args() #parser = parser.parse_args(epoch) # Create the data loaders if parser.dataset == 'coco': if parser.coco_path is None: raise ValueError('Must provide --coco_path when training on COCO,') dataset_train = CocoDataset(parser.coco_path, set_name='train2017', transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose([Normalizer(), Resizer()])) elif parser.dataset == 'csv': if parser.csv_train is None: raise ValueError('Must provide --csv_train when training on COCO,') if parser.csv_classes is None: raise ValueError('Must provide --csv_classes when training on COCO,') dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Augmenter(), Resizer()])) if parser.csv_val is None: dataset_val = None print('No validation annotations provided.') else: dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose([Normalizer(), Resizer()])) else: raise ValueError('Dataset type not understood (must be csv or coco), exiting.') sampler = AspectRatioBasedSampler(dataset_train, batch_size=4, drop_last=False) dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler) if dataset_val is not None: sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) dataloader_val = DataLoader(dataset_val, num_workers=3, collate_fn=collater, batch_sampler=sampler_val) # Create the model if parser.depth == 18: retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 34: retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 50: retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 101: retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 152: retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True) else: raise ValueError('Unsupported model depth, must be one of 18, 34, 50, 101, 152') use_gpu = True if use_gpu: retinanet = retinanet.cuda() #retinanet().load_state_dict(torch.load('/users/wenchi/ghwwc/Pytorch-retinanet-master/resnet50-19c8e357.pth')) #if True: 
#print('==> Resuming from checkpoint..') #checkpoint = torch.load('/users/wenchi/ghwwc/Pytorch-retinanet-master/coco_retinanet_2.pt') #retinanet().load_state_dict(checkpoint) #best_loss = checkpoint['loss'] #start_epoch = checkpoint['epoch'] retinanet = torch.nn.DataParallel(retinanet).cuda() retinanet.training = True #optimizer = optim.Adam(retinanet.parameters(), lr=1e-5) optimizer = optim.SGD(retinanet.parameters(), lr=1e-5) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) loss_hist = collections.deque(maxlen=500) retinanet.train() #retinanet.freeze_bn() #for train from a middle state retinanet.module.freeze_bn() #for train from the very beginning print('Num training images: {}'.format(len(dataset_train))) for epoch_num in range(parser.start_epoch, parser.epochs): if parser.resume: if os.path.isfile(parser.resume): print("=>loading checkpoint '{}'".format(parser.resume)) checkpoint = torch.load(parser.resume) print(parser.start_epoch) #parser.start_epoch = checkpoint['epoch'] #retinanet.load_state_dict(checkpoint['state_dict']) retinanet=checkpoint #retinanet.load_state_dict(checkpoint) print(retinanet) #optimizer.load_state_dict(checkpoint) print("=> loaded checkpoint '{}' (epoch {})".format(parser.resume, checkpoint)) else: print("=> no checkpoint found at '{}'".format(parser.resume)) retinanet.train() retinanet.freeze_bn() #retinanet.module.freeze_bn() if parser.dataset == 'coco': print('Evaluating dataset') coco_eval.evaluate_coco(dataset_val, retinanet) elif parser.dataset == 'csv' and parser.csv_val is not None: print('Evaluating dataset') mAP = csv_eval.evaluate(dataset_val, retinanet) epoch_loss = [] for iter_num, data in enumerate(dataloader_train): try: optimizer.zero_grad() classification_loss, regression_loss = retinanet([data['img'].cuda().float(), data['annot'].cuda()]) classification_loss = classification_loss.mean() regression_loss = regression_loss.mean() loss = classification_loss + regression_loss if bool(loss == 0): continue loss.backward() torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1) optimizer.step() loss_hist.append(float(loss)) epoch_loss.append(float(loss)) print('Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}'.format(epoch_num, iter_num, float(classification_loss), float(regression_loss), np.mean(loss_hist))) del classification_loss del regression_loss except Exception as e: print(e) continue if parser.dataset == 'coco': print('Evaluating dataset') coco_eval.evaluate_coco(dataset_val, retinanet) elif parser.dataset == 'csv' and parser.csv_val is not None: print('Evaluating dataset') mAP = csv_eval.evaluate(dataset_val, retinanet) scheduler.step(np.mean(epoch_loss)) #torch.save(retinanet.module, '{}_retinanet_101_{}.pt'.format(parser.dataset, epoch_num)) torch.save(retinanet, '{}_retinanet_dilation_experiment1_{}.pt'.format(parser.dataset, epoch_num)) name = '{}_retinanet_dilation_experiment1_{}.pt'.format(parser.dataset, epoch_num) parser.resume = '/users/wenchi/ghwwc/pytorch-retinanet-master_new/name' retinanet.eval() torch.save(retinanet, 'model_final_dilation_experiment1.pt'.format(epoch_num))
def main(args=None): parser = argparse.ArgumentParser( description='Simple training script for training a RetinaNet network.') parser.add_argument('--coco_path', help='Path to COCO directory', type=str, default='./data/coco') parser.add_argument( '--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50) parser.add_argument('--checkpoint', help='The path to the checkpoint.', type=str, default=None) parser.add_argument('--epochs', help='Number of epochs', type=int, default=100) parser.add_argument('--batch_size', help='Number of batch', type=int, default=16) parser.add_argument('--gpu_ids', help='Gpu parallel', type=str, default='1, 2') parser = parser.parse_args(args) # Create the data lodaders dataset_train = CocoDataset(parser.coco_path, set_name='train2017', transform=transforms.Compose( [Normalizer(), Augmenter(), Resizer()])) dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose( [Normalizer(), Resizer()])) sampler = AspectRatioBasedSampler(dataset_train, batch_size=4, drop_last=False) dataloader_train = DataLoader(dataset_train, num_workers=16, collate_fn=collater, batch_sampler=sampler) sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) dataloader_val = DataLoader(dataset_val, num_workers=3, collate_fn=collater, batch_sampler=sampler_val) # Create the model if parser.depth == 18: retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 34: retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 50: retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 101: retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 152: retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True) else: raise ValueError( 'Unsupported model depth, must be one of 18, 34, 50, 101, 152') use_gpu = True if use_gpu: retinanet = retinanet.cuda() gpu_ids = parser.gpu_ids.split(',') device = torch.device("cuda:" + gpu_ids[0]) torch.cuda.set_device(device) gpu_ids = list(map(int, gpu_ids)) retinanet = torch.nn.DataParallel(retinanet, device_ids=gpu_ids).to(device) if parser.checkpoint: pretrained = torch.load(parser.checkpoint).state_dict() retinanet.module.load_state_dict(pretrained) # add tensorboard to record train log retinanet.training = True writer = SummaryWriter('./log') # writer.add_graph(retinanet, input_to_model=[images, labels]) retinanet.training = True optimizer = optim.Adam(retinanet.parameters(), lr=1e-5) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) loss_hist = collections.deque(maxlen=500) retinanet.train() retinanet.module.freeze_bn() print('Num training images: {}'.format(len(dataset_train))) for epoch_num in range(parser.epochs): retinanet.train() retinanet.module.freeze_bn() epoch_loss = [] for iter_num, data in enumerate(dataloader_train): try: optimizer.zero_grad() classification_loss, regression_loss = retinanet( [data['img'].to(device), data['ann'].to(device)]) classification_loss = classification_loss.mean() regression_loss = regression_loss.mean() loss = classification_loss + regression_loss if bool(loss == 0): continue loss.backward() torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1) optimizer.step() loss_hist.append(float(loss)) writer.add_scalar('Loss/train', loss, iter_num) writer.add_scalar('Loss/reg_loss', 
regression_loss, iter_num) writer.add_scalar('Loss/cls_loss', classification_loss, iter_num) epoch_loss.append(float(loss)) if (iter_num + 1) % 1000 == 0: print('Save model') torch.save( retinanet.module, 'COCO_retinanet_epoch{}_iter{}.pt'.format( epoch_num, iter_num)) print( 'Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}' .format(epoch_num, iter_num, float(classification_loss), float(regression_loss), np.mean(loss_hist))) del classification_loss del regression_loss except Exception as e: print(e) continue print('Evaluating dataset') coco_eval.evaluate_coco(dataset_val, retinanet, writer) scheduler.step(np.mean(epoch_loss)) torch.save(retinanet.module, 'COCO_retinanet_{}.pt'.format(epoch_num)) retinanet.eval() torch.save(retinanet, 'model_final.pt'.format(epoch_num))
def main(): data_args, train_args, model_args = parse_args(IncrDataArgs, ExperimentArgs, AllModelArgs) if train_args.batch and not train_args.multihead: train_loader, val_loader, test_loader = get_dataloaders( data_args, load_test=False) else: train_loader, val_loader, test_loader = get_dataloaders_incr( data_args, load_test=False, multihead_batch=train_args.batch) state = None # load pretrained feature extractor if specified if model_args.load_state_path: state = torch.load(model_args.load_state_path) if model_args.arch == 'resnet18': net = resnet18(num_classes=data_args.num_classes, seed=data_args.seed, disable_bn_stats=model_args.disable_bn_stats) if state is not None: state['fc.weight'], state['fc.bias'] = net.fc.weight, net.fc.bias net.load_state_dict(state) elif model_args.arch == 'lrm_resnet18': net = load_lrm(state=state, num_classes=data_args.num_classes, seed=data_args.seed, disable_bn_stats=model_args.disable_bn_stats, n_blocks=model_args.n_blocks, block_size_alpha=model_args.block_size_alpha, route_by_task=model_args.route_by_task, fit_keys=train_args.fit_keys) # save state initialization if we will be reinitializing the model before each new exposure if train_args.exposure_reinit: torch.save( net.state_dict(), join(train_args.model_save_dir, append_to_file(train_args.model_save_path, 'init'))) net.cuda() if train_args.batch: if train_args.multihead: # trains model on batches of data across tasks while enforcing classification predictions to be within task train_batch_multihead(train_args, net, train_loader, val_loader, device=0) np.savez(join(train_args.acc_save_dir, train_args.incr_results_path), entropy=net.get_entropy(), class_div=net.get_class_routing_divergence()) else: train(train_args, net, train_loader, val_loader, device=0, multihead=False) else: train_incr(train_args, net, train_loader, val_loader, device=0)
def main(): normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) # Image Preprocessing train_transform = transforms.Compose([ transforms.RandomResizedCrop(224), transforms.RandomHorizontalFlip(), transforms.ToTensor(), normalize, ]) test_transform = transforms.Compose([ transforms.Resize(256), transforms.CenterCrop(224), transforms.ToTensor(), normalize, ]) num_epochs = args.epochs batch_size = args.batch_size train_dataset = datasets.folder.ImageFolder( root='/fast/users/a1675776/data/imagenet/train/', transform=train_transform) test_dataset = datasets.folder.ImageFolder( root='/fast/users/a1675776/data/imagenet/val/', transform=test_transform) train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True, num_workers=10, pin_memory=True) test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False, num_workers=10, pin_memory=True) num_train = train_dataset.__len__() n_train_batches = math.floor(num_train / batch_size) criterion = nn.CrossEntropyLoss().cuda() bitW = 32 bitA = 32 model = resnet18(bitW, bitA) model = utils.dataparallel(model, 3) print("Compilation complete, starting training...") test_record = [] train_record = [] learning_rate = args.learning_rate epoch = 0 step_idx = 0 best_top1 = 0 optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=args.weight_decay, momentum=args.momentum) for m in model.modules(): if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear) or isinstance( m, self_conv): c = float(m.weight.data[0].nelement()) torch.nn.init.xavier_uniform(m.weight) elif isinstance(m, nn.BatchNorm2d): m.weight.data = m.weight.data.zero_().add(1.0) while epoch < num_epochs: epoch = epoch + 1 # resume training if (args.resume_train) and (epoch == 1): checkpoint = torch.load(args.resume_dir) epoch = checkpoint['epoch'] learning_rate = checkpoint['learning_rate'] optimizer.load_state_dict(checkpoint['optimizer']) step_idx = checkpoint['step_idx'] model.load_state_dict(checkpoint['state_dict']) test_record = list(np.load(args.weights_dir + 'test_record.npy')) train_record = list(np.load(args.weights_dir + 'train_record.npy')) logging.info('epoch %d lr %e', epoch, learning_rate) # training train_acc_top1, train_acc_top5, train_obj = train( train_loader, model, criterion, optimizer) logging.info('train_acc %f', train_acc_top1) train_record.append([train_acc_top1, train_acc_top5]) np.save(args.weights_dir + 'train_record.npy', train_record) # test test_acc_top1, test_acc_top5, test_obj = infer(test_loader, model, criterion) is_best = test_acc_top1 > best_top1 if is_best: best_top1 = test_acc_top1 logging.info('test_acc %f', test_acc_top1) test_record.append([test_acc_top1, test_acc_top5]) np.save(args.weights_dir + 'test_record.npy', test_record) save_checkpoint( { 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'optimizer': optimizer.state_dict(), 'best_top1': best_top1, 'step_idx': step_idx, 'learning_rate': learning_rate, }, args, is_best) step_idx, learning_rate = utils.adjust_learning_rate( args, epoch, step_idx, learning_rate) for param_group in optimizer.param_groups: param_group['lr'] = learning_rate
def main(): # Training settings parser = argparse.ArgumentParser(description='PyTorch MNIST Example') # model cfg # parser.add_argument('--pretrained', action='store_true', default=False, # help='load pretrained model') parser.add_argument('--model-type', type=str, default="", help="type of the model.") parser.add_argument('--model-structure', type=int, default=0, metavar='N', help='model structure to be trained (default: 0)') parser.add_argument('--resume', default='', type=str, metavar='PATH', help='path to latest checkpoint, (default: None)') parser.add_argument('--e', '--evaluate', dest='evaluate', action='store_true', help='evaluate model on validation set') # dataset parser.add_argument('--dataset-root', type=str, default="../datasets", help="load dataset path.") parser.add_argument('--workers', default=0, type=int, metavar='N', help='number of data loading workers (default: 0)') parser.add_argument('--train-batch-size', type=int, default=128, metavar='N', help='input batch size for training (default: 128)') parser.add_argument('--test-batch-size', type=int, default=128, metavar='N', help='input batch size for testing (default: 128)') # train cfg parser.add_argument('--epochs', type=int, default=80, metavar='N', help='number of epochs to train (default: 10)') parser.add_argument('--start-epoch', default=0, type=int, metavar='N', help='manual epoch number (useful to restarts)') parser.add_argument('--lr', type=float, default=0.001, metavar='LR', help='learning rate (default: 0.001)') # optimizer parser.add_argument('--momentum', default=0.9, type=float, metavar='M', help='SGD momentum (default: 0.9)') parser.add_argument('--wd', default=5e-4, type=float, metavar='W', help='weight decay (default: 5e-4)') # scheduler parser.add_argument('--schedule', type=int, nargs='+', default=[150, 225], help='Decrease learning rate at these epochs.') parser.add_argument('--gamma', type=float, default=0.1, help='LR is multiplied by gamma on schedule.') parser.add_argument('--decreasing-lr', default='16,30,54', help='decreasing strategy') # device init cfg parser.add_argument('--no-cuda', action='store_true', default=False, help='disables CUDA training') parser.add_argument('--seed', type=int, default=1, metavar='S', help='random seed (default: 1)') # result output cfg parser.add_argument('--detail', action='store_true', default=False, help='show log in detial') parser.add_argument( '--log-interval', type=int, default=10, metavar='N', help='how many batches to wait before logging training status') parser.add_argument('--save-model', action='store_true', default=True, help='For Saving the current Model') parser.add_argument('--checkpoint-path', type=str, default="", help="save model path.") parser.add_argument('--crxb-size', type=int, default=64, help='corssbar size') parser.add_argument('--vdd', type=float, default=3.3, help='supply voltage') parser.add_argument('--gwire', type=float, default=0.0357, help='wire conductacne') parser.add_argument('--gload', type=float, default=0.25, help='load conductance') parser.add_argument('--gmax', type=float, default=0.000333, help='maximum cell conductance') parser.add_argument('--gmin', type=float, default=0.000000333, help='minimum cell conductance') parser.add_argument('--ir-drop', action='store_true', default=False, help='switch to turn on ir drop analysis') parser.add_argument('--scaler-dw', type=float, default=1, help='scaler to compress the conductance') parser.add_argument('--test', action='store_true', default=False, help='switch to turn inference mode') 
parser.add_argument('--enable_noise', action='store_true', default=False, help='switch to turn on noise analysis') parser.add_argument('--enable_SAF', action='store_true', default=False, help='switch to turn on SAF analysis') parser.add_argument('--enable_ec-SAF', action='store_true', default=False, help='switch to turn on SAF error correction') parser.add_argument('--freq', type=float, default=10e6, help='scaler to compress the conductance') parser.add_argument('--temp', type=float, default=300, help='scaler to compress the conductance') args = parser.parse_args() print("+++", args) # Train the network on the training data # Test the network on the test data use_cuda = not args.no_cuda and torch.cuda.is_available() torch.manual_seed(args.seed) device = torch.device("cuda" if use_cuda else "cpu") print(device) # net = alexnet(args.pretrained, args.resume, num_classes=10, structure=args.model_structure) # create model crxb_cfg = { 'crxb_size': args.crxb_size, 'gmax': args.gmax, 'gmin': args.gmin, 'gwire': args.gwire, 'gload': args.gload, 'vdd': args.vdd, 'ir_drop': args.ir_drop, 'device': device, 'freq': args.freq, 'temp': args.temp, 'enable_SAF': args.enable_SAF, 'enable_noise': args.enable_noise, 'enable_ec_SAF': args.enable_ec_SAF, 'quantize': 64 } if args.model_type == 'VGG16': net = model.VGG16() elif args.model_type == 'cifar10' or args.model_type == 'VGG8': net = model.cifar10(n_channel=128, physical=0, **crxb_cfg) elif args.model_type == 'alexnet': net = model.alexnet(num_classes=10, structure=args.model_structure) elif args.model_type == 'resnet18': net = model.resnet18() else: net = model.cifar10(n_channel=128, physical=True, **crxb_cfg) if torch.cuda.device_count() > 1: print("Let's use", torch.cuda.device_count(), "GPUs!") net = nn.DataParallel(net) net.to(device) # for param in net.parameters(): # param = nn.init.normal_(param) # config milestones = list(map(int, args.decreasing_lr.split(','))) print(milestones) # optimizer = optim.SGD(net.parameters(), lr=lr, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY) # not good enough 68% optimizer = optim.Adam(net.parameters(), lr=args.lr, weight_decay=args.wd) scheduler = lr_scheduler.MultiStepLR(optimizer, milestones=milestones, gamma=args.gamma) # optionlly resume from a checkpoint if args.resume: print("=> using pre-trained model '{}'".format(args.model_type)) else: print("=> creating model '{}'".format(args.model_type)) global best_prec1 # if args.resume: # if os.path.isfile(args.resume): # print("=> loading checkpoint '{}'".format(args.resume)) # checkpoint = torch.load(args.resume) # args.start_epoch = checkpoint['epoch'] # best_prec1 = checkpoint['best_prec1'] # net.load_state_dict(checkpoint['state_dict']) # optimizer.load_state_dict(checkpoint['optimizer']) # print("=> loaded checkpoint '{}' (epoch {})".format(args.resume, checkpoint['epoch'])) # else: # print("=> no checkpoint found at '{}'".format(args.resume)) # Data loading kwargs = { 'num_workers': args.workers, 'pin_memory': True } if use_cuda else {} trainloader, valloader, testloader = getcifar(args, 'pad', 4, True, 32, True, **kwargs) print(len(trainloader), len(valloader), len(testloader)) print('\r\n1!!!model_dict') model_dict = net.state_dict() print(model_dict.keys(), "\r\n2!!!model parameters") parm = {} for name, parameters in net.named_parameters(): print(name) print('\r\n3!!!pretrained_dict') checkpoint = torch.load(args.resume) # print(type(checkpoint),'\r\n!!!') # print(checkpoint.keys(),'\r\n!!!') pretrained_dict = checkpoint['state_dict'] 
print(pretrained_dict.keys(), '\r\n4!!!new_dict') import re new_dict = {} for k, v in pretrained_dict.items(): if k not in model_dict: bn_detect = re.match( r'module\.features\.(1|4|8|11|15|18|22)\.(running_mean|num_batches_tracked|running_var)', k) if bn_detect: k = 'module.features.{}.bn.{}'.format(bn_detect.group(1), bn_detect.group(2)) print(k) new_dict[k] = v else: pass else: new_dict[k] = v print(new_dict.keys(), '\r\n5!!!') print([k for k, v in new_dict.items() if k not in model_dict], '\r\n') print([k for k, v in model_dict.items() if k not in new_dict]) # print('net buffers') # print([n for n,v in net.named_buffers()], '\r\n !!!ideal_buffer') ideal_buffer = torch.load("../models/cifar10_crxb_ideal_VGG8_0_final.pth") # buffer_list = [k for k, v in ideal_buffer['state_dict'].items() if k not in new_dict] for k, v in ideal_buffer['state_dict'].items(): if k not in new_dict: new_dict[k] = v print("\r\ncheck:", new_dict.keys() == model_dict.keys()) model_dict.update(new_dict) # model_dict.update(ideal_buffer['state_dict']) net.load_state_dict(model_dict) # print('vvv') # print([k for k,v in ideal_buffer['state_dict'].items() if k not in model_dict]) # net.load_state_dict(ideal_buffer['state_dict']) test(args, net, device, testloader)
def main(args=None): parser = argparse.ArgumentParser( description='Simple training script for training a RetinaNet network.') parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.', default="csv") parser.add_argument('--coco_path', help='Path to COCO directory') parser.add_argument( '--csv_train', help='Path to file containing training annotations (see readme)') parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)', default="binary_class.csv") parser.add_argument( '--csv_val', help= 'Path to file containing validation annotations (optional, see readme)' ) parser.add_argument( '--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=18) parser.add_argument('--epochs', help='Number of epochs', type=int, default=500) parser.add_argument('--epochs_only_det', help='Number of epochs to train detection part', type=int, default=1) parser.add_argument('--max_epochs_no_improvement', help='Max epochs without improvement', type=int, default=100) parser.add_argument('--pretrained_model', help='Path of .pt file with pretrained model', default='esposallescsv_retinanet_0.pt') parser.add_argument('--model_out', help='Path of .pt file with trained model to save', default='trained') parser.add_argument('--score_threshold', help='Score above which boxes are kept', type=float, default=0.5) parser.add_argument('--nms_threshold', help='Score above which boxes are kept', type=float, default=0.2) parser.add_argument('--max_boxes', help='Max boxes to be fed to recognition', default=95) parser.add_argument('--seg_level', help='[line, word], to choose anchor aspect ratio', default='word') parser.add_argument( '--early_stop_crit', help='Early stop criterion, detection (map) or transcription (cer)', default='cer') parser.add_argument('--max_iters_epoch', help='Max steps per epoch (for debugging)', default=1000000) parser.add_argument('--train_htr', help='Train recognition or not', default='True') parser.add_argument('--train_det', help='Train detection or not', default='True') parser.add_argument( '--binary_classifier', help= 'Wether to use classification branch as binary or not, multiclass instead.', default='False') parser.add_argument( '--htr_gt_box', help='Train recognition branch with box gt (for debugging)', default='False') parser.add_argument( '--ner_branch', help='Train named entity recognition with separate branch', default='False') parser = parser.parse_args(args) if parser.dataset == 'csv': if parser.csv_train is None: raise ValueError('Must provide --csv_train') dataset_name = parser.csv_train.split("/")[-2] dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose( [Normalizer(), Augmenter(), Resizer()])) if parser.csv_val is None: dataset_val = None print('No validation annotations provided.') else: dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose( [Normalizer(), Resizer()])) else: raise ValueError( 'Dataset type not understood (must be csv or coco), exiting.') # Files for training log experiment_id = str(time.time()).split('.')[0] valid_cer_f = open('trained_models/' + parser.model_out + 'log.txt', 'w') for arg in vars(parser): if getattr(parser, arg) is not None: valid_cer_f.write( str(arg) + ' ' + str(getattr(parser, arg)) + '\n') current_commit = subprocess.check_output(['git', 'rev-parse', 'HEAD']) valid_cer_f.write(str(current_commit)) valid_cer_f.write( "epoch_num cer best cer mAP 
best mAP time\n") valid_cer_f.close() sampler = AspectRatioBasedSampler(dataset_train, batch_size=1, drop_last=False) dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler) if dataset_val is not None: sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) dataloader_val = DataLoader(dataset_val, num_workers=0, collate_fn=collater, batch_sampler=sampler_val) if not os.path.exists('trained_models'): os.mkdir('trained_models') # Create the model train_htr = parser.train_htr == 'True' htr_gt_box = parser.htr_gt_box == 'True' ner_branch = parser.ner_branch == 'True' binary_classifier = parser.binary_classifier == 'True' torch.backends.cudnn.benchmark = False alphabet = dataset_train.alphabet if os.path.exists(parser.pretrained_model): retinanet = torch.load(parser.pretrained_model) retinanet.classificationModel = ClassificationModel( num_features_in=256, num_anchors=retinanet.anchors.num_anchors, num_classes=dataset_train.num_classes()) if ner_branch: retinanet.nerModel = NERModel( feature_size=256, pool_h=retinanet.pool_h, n_classes=dataset_train.num_classes(), pool_w=retinanet.pool_w) else: if parser.depth == 18: retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True, max_boxes=int(parser.max_boxes), score_threshold=float( parser.score_threshold), seg_level=parser.seg_level, alphabet=alphabet, train_htr=train_htr, htr_gt_box=htr_gt_box, ner_branch=ner_branch, binary_classifier=binary_classifier) elif parser.depth == 34: retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True, max_boxes=int(parser.max_boxes), score_threshold=float( parser.score_threshold), seg_level=parser.seg_level, alphabet=alphabet, train_htr=train_htr, htr_gt_box=htr_gt_box) elif parser.depth == 50: retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 101: retinanet = model.resnet101( num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 152: retinanet = model.resnet152( num_classes=dataset_train.num_classes(), pretrained=True) else: raise ValueError( 'Unsupported model depth, must be one of 18, 34, 50, 101, 152') use_gpu = True train_htr = parser.train_htr == 'True' train_det = parser.train_det == 'True' retinanet.htr_gt_box = parser.htr_gt_box == 'True' retinanet.train_htr = train_htr retinanet.epochs_only_det = parser.epochs_only_det if use_gpu: retinanet = retinanet.cuda() retinanet = torch.nn.DataParallel(retinanet).cuda() retinanet.training = True optimizer = optim.Adam(retinanet.parameters(), lr=1e-4) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=50, verbose=True) loss_hist = collections.deque(maxlen=500) ctc = CTCLoss() retinanet.train() retinanet.module.freeze_bn() best_cer = 1000 best_map = 0 epochs_no_improvement = 0 verbose_each = 20 optimize_each = 1 objective = 100 best_objective = 10000 print(('Num training images: {}'.format(len(dataset_train)))) for epoch_num in range(parser.epochs): cers = [] retinanet.training = True retinanet.train() retinanet.module.freeze_bn() epoch_loss = [] for iter_num, data in enumerate(dataloader_train): if iter_num > int(parser.max_iters_epoch): break try: if iter_num % optimize_each == 0: optimizer.zero_grad() (classification_loss, regression_loss, ctc_loss, ner_loss) = retinanet([ data['img'].cuda().float(), data['annot'], ctc, epoch_num ]) classification_loss = classification_loss.mean() regression_loss = regression_loss.mean() if train_det: if 
train_htr: loss = ctc_loss + classification_loss + regression_loss + ner_loss else: loss = classification_loss + regression_loss + ner_loss elif train_htr: loss = ctc_loss else: continue if bool(loss == 0): continue loss.backward() torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1) if iter_num % verbose_each == 0: print(( 'Epoch: {} | Step: {} |Classification loss: {:1.5f} | Regression loss: {:1.5f} | CTC loss: {:1.5f} | NER loss: {:1.5f} | Running loss: {:1.5f} | Total loss: {:1.5f}\r' .format(epoch_num, iter_num, float(classification_loss), float(regression_loss), float(ctc_loss), float(ner_loss), np.mean(loss_hist), float(loss), "\r"))) optimizer.step() loss_hist.append(float(loss)) epoch_loss.append(float(loss)) torch.cuda.empty_cache() except Exception as e: print(e) continue if parser.dataset == 'csv' and parser.csv_val is not None and train_det: print('Evaluating dataset') mAP, text_mAP, current_cer = csv_eval.evaluate( dataset_val, retinanet, score_threshold=parser.score_threshold) #text_mAP,_ = csv_eval_binary_map.evaluate(dataset_val, retinanet,score_threshold=parser.score_threshold) objective = current_cer * (1 - mAP) retinanet.eval() retinanet.training = False retinanet.score_threshold = float(parser.score_threshold) '''for idx,data in enumerate(dataloader_val): if idx>int(parser.max_iters_epoch): break print("Eval CER on validation set:",idx,"/",len(dataset_val),"\r") image_name = dataset_val.image_names[idx].split('/')[-1].split('.')[-2] #generate_pagexml(image_name,data,retinanet,parser.score_threshold,parser.nms_threshold,dataset_val) text_gt =".".join(dataset_val.image_names[idx].split('.')[:-1])+'.txt' f =open(text_gt,'r') text_gt_lines=f.readlines()[0] transcript_pred = get_transcript(image_name,data,retinanet,float(parser.score_threshold),float(parser.nms_threshold),dataset_val,alphabet) cers.append(float(editdistance.eval(transcript_pred,text_gt_lines))/len(text_gt_lines))''' t = str(time.time()).split('.')[0] valid_cer_f.close() #print("GT",text_gt_lines) #print("PREDS SAMPLE:",transcript_pred) if parser.early_stop_crit == 'cer': if float(objective) < float( best_objective): #float(current_cer)<float(best_cer): best_cer = current_cer best_objective = objective epochs_no_improvement = 0 torch.save( retinanet.module, 'trained_models/' + parser.model_out + '{}_retinanet.pt'.format(parser.dataset)) else: epochs_no_improvement += 1 if mAP > best_map: best_map = mAP elif parser.early_stop_crit == 'map': if mAP > best_map: best_map = mAP epochs_no_improvement = 0 torch.save( retinanet.module, 'trained_models/' + parser.model_out + '{}_retinanet.pt'.format(parser.dataset)) else: epochs_no_improvement += 1 if float(current_cer) < float(best_cer): best_cer = current_cer if train_det: print(epoch_num, "mAP: ", mAP, " best mAP", best_map) if train_htr: print("VALID CER:", current_cer, "best CER", best_cer) print("Epochs no improvement:", epochs_no_improvement) valid_cer_f = open('trained_models/' + parser.model_out + 'log.txt', 'a') valid_cer_f.write( str(epoch_num) + " " + str(current_cer) + " " + str(best_cer) + ' ' + str(mAP) + ' ' + str(best_map) + ' ' + str(text_mAP) + '\n') if epochs_no_improvement > 3: for param_group in optimizer.param_groups: if param_group['lr'] > 10e-5: param_group['lr'] *= 0.1 if epochs_no_improvement >= parser.max_epochs_no_improvement: print("TRAINING FINISHED AT EPOCH", epoch_num, ".") sys.exit() scheduler.step(np.mean(epoch_loss)) torch.cuda.empty_cache() retinanet.eval()
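# The script above stops early on a combined objective, objective = cer * (1 - mAP),
# decays the learning rate by 10x after a few epochs without improvement, and aborts
# once max_epochs_no_improvement is reached. Below is a minimal, self-contained sketch
# of that bookkeeping for the 'cer' criterion only; the optimizer, learning rate and
# metric values are made up for illustration and are not taken from the script.
import torch

param = torch.nn.Parameter(torch.zeros(1))
optimizer = torch.optim.Adam([param], lr=1e-3)

best_objective = float('inf')
epochs_no_improvement = 0
max_epochs_no_improvement = 100

# (cer, mAP) pairs standing in for the csv_eval.evaluate results; values are fabricated.
fake_metrics = [(0.40, 0.50), (0.35, 0.55), (0.36, 0.54), (0.36, 0.53), (0.36, 0.53),
                (0.36, 0.52), (0.36, 0.52)]

for epoch_num, (cer, mean_ap) in enumerate(fake_metrics):
    objective = cer * (1.0 - mean_ap)          # lower is better
    if objective < best_objective:
        best_objective = objective
        epochs_no_improvement = 0
        # the real script saves retinanet.module to trained_models/ at this point
    else:
        epochs_no_improvement += 1

    if epochs_no_improvement > 3:
        # same 10x decay rule as above, applied directly to the param groups
        for param_group in optimizer.param_groups:
            if param_group['lr'] > 10e-5:
                param_group['lr'] *= 0.1

    if epochs_no_improvement >= max_epochs_no_improvement:
        print("stopping at epoch", epoch_num)
        break
    print("epoch {} | objective {:.4f} | lr {:.0e}".format(
        epoch_num, objective, optimizer.param_groups[0]['lr']))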
def main(args=None): parser = argparse.ArgumentParser( description="Simple training script for training a RetinaNet network.") parser.add_argument( "--dataset", help="Dataset type, must be one of csv or coco or ycb.") parser.add_argument("--path", help="Path to dataset directory") parser.add_argument( "--csv_train", help="Path to file containing training annotations (see readme)") parser.add_argument("--csv_classes", help="Path to file containing class list (see readme)") parser.add_argument("--csv_val", help="Path to file containing validation annotations " "(optional, see readme)") parser.add_argument( "--depth", help="Resnet depth, must be one of 18, 34, 50, 101, 152", type=int, default=50) parser.add_argument("--epochs", help="Number of epochs", type=int, default=100) parser.add_argument("--evaluate_every", default=20, type=int) parser.add_argument("--print_every", default=20, type=int) parser.add_argument('--distributed', action="store_true", help='Run model in distributed mode with DataParallel') parser = parser.parse_args(args) # Create the data loaders if parser.dataset == "coco": if parser.path is None: raise ValueError( "Must provide --path when training on non-CSV datasets") dataset_train = CocoDataset(parser.path, ann_file="instances_train2014.json", set_name="train2014", transform=transforms.Compose([ Normalizer(), Augmenter(), Resizer(min_side=512, max_side=512) ])) dataset_val = CocoDataset(parser.path, ann_file="instances_val2014.cars.json", set_name="val2014", transform=transforms.Compose( [Normalizer(), Resizer()])) elif parser.dataset == "ycb": dataset_train = YCBDataset(parser.path, "image_sets/train.txt", transform=transforms.Compose([ Normalizer(), Augmenter(), Resizer(min_side=512, max_side=512) ]), train=True) dataset_val = YCBDataset(parser.path, "image_sets/val.txt", transform=transforms.Compose( [Normalizer(), Resizer()]), train=False) elif parser.dataset == "csv": if parser.csv_train is None: raise ValueError("Must provide --csv_train when training on COCO,") if parser.csv_classes is None: raise ValueError( "Must provide --csv_classes when training on COCO,") dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose( [Normalizer(), Augmenter(), Resizer()])) if parser.csv_val is None: dataset_val = None print("No validation annotations provided.") else: dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose( [Normalizer(), Resizer()])) else: raise ValueError( "Dataset type not understood (must be csv or coco), exiting.") sampler = AspectRatioBasedSampler(dataset_train, batch_size=12, drop_last=False) dataloader_train = DataLoader(dataset_train, num_workers=8, collate_fn=collater, batch_sampler=sampler) if dataset_val is not None: sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) dataloader_val = DataLoader(dataset_val, num_workers=4, collate_fn=collater, batch_sampler=sampler_val) # Create the model if parser.depth == 18: retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 34: retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 50: retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 101: retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 152: retinanet = 
model.resnet152(num_classes=dataset_train.num_classes(),
                                    pretrained=True)
    else:
        raise ValueError(
            "Unsupported model depth, must be one of 18, 34, 50, 101, 152")

    print("CUDA available: {}".format(torch.cuda.is_available()))
    if torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
    retinanet = retinanet.to(device)

    if parser.distributed:
        retinanet = torch.nn.DataParallel(retinanet)

    optimizer = optim.Adam(retinanet.parameters(), lr=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,
                                                     patience=3,
                                                     verbose=True)
    loss_hist = collections.deque(maxlen=500)

    print("Num training images: {}".format(len(dataset_train)))
    best_mean_avg_prec = 0.0

    for epoch_num in range(parser.epochs):
        retinanet.train()
        # freeze_bn lives on the underlying model, not on the DataParallel wrapper
        if parser.distributed:
            retinanet.module.freeze_bn()
        else:
            retinanet.freeze_bn()
        epoch_loss = []

        for iter_num, data in enumerate(dataloader_train):
            try:
                optimizer.zero_grad()
                classification_loss, regression_loss = retinanet(
                    [data["img"].to(device).float(), data["annot"]])
                classification_loss = classification_loss.mean()
                regression_loss = regression_loss.mean()
                loss = classification_loss + regression_loss
                if bool(loss == 0):
                    continue
                loss.backward()
                torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)
                optimizer.step()
                loss_hist.append(float(loss.item()))
                epoch_loss.append(float(loss.item()))

                if iter_num % parser.print_every == 0:
                    print("Epoch: {} | Iteration: {}/{} | "
                          "Classification loss: {:1.5f} | "
                          "Regression loss: {:1.5f} | "
                          "Running loss: {:1.5f}".format(
                              epoch_num, iter_num, len(dataloader_train),
                              float(classification_loss),
                              float(regression_loss), np.mean(loss_hist)))

                del classification_loss
                del regression_loss
            except Exception as e:
                print(e)
                continue

        if ((epoch_num + 1) % parser.evaluate_every == 0) \
                or epoch_num + 1 == parser.epochs:
            mAP = 0.0
            if parser.dataset == "coco":
                print("Evaluating dataset")
                mAP = coco_eval.evaluate_coco(dataset_val, retinanet)
            else:
                print("Evaluating dataset")
                AP = eval.evaluate(dataset_val, retinanet)
                mAP = np.asarray([x[0] for x in AP.values()]).mean()
                print("Val set mAP: ", mAP)
            if mAP > best_mean_avg_prec:
                best_mean_avg_prec = mAP
                torch.save(
                    retinanet.state_dict(),
                    "{}_retinanet_best_mean_ap_{}.pt".format(
                        parser.dataset, epoch_num))

        scheduler.step(np.mean(epoch_loss))

    retinanet.eval()
    torch.save(retinanet.state_dict(), "retinanet_model_final.pt")
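# The script above checkpoints only state_dict() snapshots
# ("{dataset}_retinanet_best_mean_ap_{epoch}.pt" and "retinanet_model_final.pt"),
# so the network has to be rebuilt before the weights can be loaded back.
# A minimal reload sketch: the import path, the class count (21) and the file
# name are assumptions for illustration, not values taken from this script.
import torch

import model  # assumed to be the same model module used by the training script

retinanet = model.resnet50(num_classes=21, pretrained=False)
state_dict = torch.load("ycb_retinanet_best_mean_ap_19.pt", map_location="cpu")
# note: if the checkpoint was written while the model was wrapped in
# torch.nn.DataParallel, its keys carry a "module." prefix and must be stripped first
retinanet.load_state_dict(state_dict)
retinanet.eval()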
def main(): global args, best_sa args = parser.parse_args() print(args) torch.cuda.set_device(int(args.gpu)) os.makedirs(args.save_dir, exist_ok=True) if args.seed: setup_seed(args.seed) if args.task == 'rotation': print('train for rotation classification') class_number = 4 else: print('train for supervised classification') if args.dataset == 'cifar10': class_number = 10 elif args.dataset == 'fmnist': class_number = 10 elif args.dataset == 'cifar100': class_number = 100 else: print('error dataset') assert 0 # prepare dataset if args.dataset == 'cifar10': print('training on cifar10 dataset') model = resnet18(num_classes=class_number) model.normalize = NormalizeByChannelMeanStd( mean=[0.4914, 0.4822, 0.4465], std=[0.2470, 0.2435, 0.2616]) train_loader, val_loader, test_loader = cifar10_dataloaders(batch_size= args.batch_size, data_dir =args.data) elif args.dataset == 'cifar_10_10': print('training on cifar10 subset') model = resnet18(num_classes=class_number) model.normalize = NormalizeByChannelMeanStd( mean=[0.4914, 0.4822, 0.4465], std=[0.2470, 0.2435, 0.2616]) train_loader, val_loader, test_loader = cifar10_subset_dataloaders(batch_size= args.batch_size, data_dir =args.data) elif args.dataset == 'cifar100': model = resnet18(num_classes=class_number) model.normalize = NormalizeByChannelMeanStd( mean=[0.5071, 0.4865, 0.4409], std=[0.2673, 0.2564, 0.2762]) train_loader, val_loader, test_loader = cifar100_dataloaders(batch_size= args.batch_size, data_dir =args.data) elif args.dataset == 'fmnist': model = resnet18(num_classes=class_number) model.normalize = NormalizeByChannelMeanStd( mean=[0.2860], std=[0.3530]) model.conv1 = nn.Conv2d(1, 16, kernel_size=3, stride=1, padding=1, bias=False) train_loader, val_loader, test_loader = fashionmnist_dataloaders(batch_size= args.batch_size, data_dir =args.data) else: print('dataset not support') model.cuda() criterion = nn.CrossEntropyLoss() decreasing_lr = list(map(int, args.decreasing_lr.split(','))) if args.prune_type == 'lt': print( 'report lottery tickets setting') initalization = deepcopy(model.state_dict()) else: initalization = None optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=decreasing_lr, gamma=0.1) if args.resume: print('resume from checkpoint') checkpoint = torch.load(args.resume, map_location = torch.device('cuda:'+str(args.gpu))) best_sa = checkpoint['best_sa'] start_epoch = checkpoint['epoch'] all_result = checkpoint['result'] start_state = checkpoint['state'] if start_state>0: current_mask = extract_mask(checkpoint['state_dict']) prune_model_custom(model, current_mask) check_sparsity(model) model.load_state_dict(checkpoint['state_dict']) optimizer.load_state_dict(checkpoint['optimizer']) scheduler.load_state_dict(checkpoint['scheduler']) initalization = checkpoint['init_weight'] print('loading state:', start_state) print('loading from epoch: ',start_epoch, 'best_sa=', best_sa) else: all_result = {} all_result['train'] = [] all_result['test_ta'] = [] all_result['ta'] = [] start_epoch = 0 start_state = 0 print('######################################## Start Standard Training Iterative Pruning ########################################') print(model.normalize) for state in range(start_state, args.pruning_times): print('******************************************') print('pruning state', state) print('******************************************') for epoch in range(start_epoch, args.epochs): 
print(optimizer.state_dict()['param_groups'][0]['lr']) check_sparsity(model) acc = train(train_loader, model, criterion, optimizer, epoch) # evaluate on validation set tacc = validate(val_loader, model, criterion) # evaluate on test set test_tacc = validate(test_loader, model, criterion) scheduler.step() all_result['train'].append(acc) all_result['ta'].append(tacc) all_result['test_ta'].append(test_tacc) # remember best prec@1 and save checkpoint is_best_sa = tacc > best_sa best_sa = max(tacc, best_sa) save_checkpoint({ 'state': state, 'result': all_result, 'epoch': epoch + 1, 'state_dict': model.state_dict(), 'best_sa': best_sa, 'optimizer': optimizer.state_dict(), 'scheduler': scheduler.state_dict(), 'init_weight': initalization }, is_SA_best=is_best_sa, pruning=state, save_path=args.save_dir) plt.plot(all_result['train'], label='train_acc') plt.plot(all_result['ta'], label='val_acc') plt.plot(all_result['test_ta'], label='test_acc') plt.legend() plt.savefig(os.path.join(args.save_dir, str(state)+'net_train.png')) plt.close() #report result check_sparsity(model, True) print('report best SA={}'.format(best_sa)) all_result = {} all_result['train'] = [] all_result['test_ta'] = [] all_result['ta'] = [] best_sa = 0 start_epoch = 0 if args.prune_type == 'pt': print('report loading pretrained weight') initalization = torch.load(os.path.join(args.save_dir, '0model_SA_best.pth.tar'), map_location = torch.device('cuda:'+str(args.gpu)))['state_dict'] #pruning_model(model, args.rate) #current_mask = extract_mask(model.state_dict()) #remove_prune(model) #rewind weight to init pruning_model(model, args.rate) check_sparsity(model) current_mask = torch.load(os.path.join(args.mask_path, '{}checkpoint.pth.tar'.format(state+1)))['state_dict'] remove_prune(model) #rewind weight to init model.load_state_dict(initalization) prune_model_custom(model, current_mask) check_sparsity(model) optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=decreasing_lr, gamma=0.1)
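# The loop above relies on repo helpers (pruning_model, extract_mask, remove_prune,
# prune_model_custom) to run one round of "prune -> keep the mask -> rewind weights
# to initialization -> reapply the mask". A minimal sketch of the same cycle using
# the standard torch.nn.utils.prune API on a toy network; the 20% rate and the toy
# model are illustrative only and do not reproduce the repo helpers themselves.
from copy import deepcopy

import torch
import torch.nn as nn
import torch.nn.utils.prune as prune

net = nn.Sequential(nn.Conv2d(3, 16, 3, padding=1), nn.ReLU(),
                    nn.Conv2d(16, 16, 3, padding=1))
initialization = deepcopy(net.state_dict())   # weights to rewind to ("lt" setting)

prunable = [(m, "weight") for m in net.modules() if isinstance(m, nn.Conv2d)]

# 1) prune 20% of the smallest-magnitude weights across all conv layers
prune.global_unstructured(prunable, pruning_method=prune.L1Unstructured,
                          amount=0.2)

# 2) keep the binary masks produced by the pruning step
masks = {name: module.weight_mask.clone()
         for name, module in net.named_modules()
         if isinstance(module, nn.Conv2d)}

# 3) make the pruning permanent, then rewind to the initial weights
for module, pname in prunable:
    prune.remove(module, pname)
net.load_state_dict(initialization)

# 4) reapply the saved masks on top of the rewound weights
for name, module in net.named_modules():
    if name in masks:
        prune.custom_from_mask(module, name="weight", mask=masks[name])

# rough analogue of check_sparsity: fraction of weights zeroed by the masks
zeros = sum(int((m.weight == 0).sum()) for m, _ in prunable)
total = sum(m.weight.numel() for m, _ in prunable)
print("sparsity after rewinding: {:.1%}".format(zeros / total))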
def main(args=None): parser = argparse.ArgumentParser( description='Simple training script for training a RetinaNet network.') parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') parser.add_argument('--coco_path', help='Path to COCO directory') parser.add_argument( '--csv_train', help='Path to file containing training annotations (see readme)') parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') parser.add_argument( '--csv_val', help= 'Path to file containing validation annotations (optional, see readme)' ) parser.add_argument( '--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50) parser.add_argument('--epochs', help='Number of epochs', type=int, default=100) parser = parser.parse_args(args) # Create the data loaders if parser.dataset == 'coco': if parser.coco_path is None: raise ValueError('Must provide --coco_path when training on COCO,') dataset_train = CocoDataset(parser.coco_path, set_name='train2017', transform=transforms.Compose( [Normalizer(), Augmenter(), Resizer()])) dataset_val = CocoDataset(parser.coco_path, set_name='val2017', transform=transforms.Compose( [Normalizer(), Resizer()])) elif parser.dataset == 'csv': if parser.csv_train is None: raise ValueError('Must provide --csv_train when training on COCO,') if parser.csv_classes is None: raise ValueError( 'Must provide --csv_classes when training on COCO,') dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose( [Normalizer(), Augmenter(), Resizer()])) if parser.csv_val is None: dataset_val = None print('No validation annotations provided.') else: dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose( [Normalizer(), Resizer()])) else: raise ValueError( 'Dataset type not understood (must be csv or coco), exiting.') sampler = AspectRatioBasedSampler(dataset_train, batch_size=2, drop_last=False) dataloader_train = DataLoader(dataset_train, num_workers=3, collate_fn=collater, batch_sampler=sampler) if dataset_val is not None: sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=1, drop_last=False) dataloader_val = DataLoader(dataset_val, num_workers=3, collate_fn=collater, batch_sampler=sampler_val) # Create the model if parser.depth == 18: retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 34: retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 50: retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 101: retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 152: retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True) else: raise ValueError( 'Unsupported model depth, must be one of 18, 34, 50, 101, 152') use_gpu = True if use_gpu: retinanet = retinanet.cuda() retinanet = torch.nn.DataParallel(retinanet).cuda() retinanet.training = True optimizer = optim.Adam(retinanet.parameters(), lr=1e-5) scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) loss_hist = collections.deque(maxlen=500) retinanet.train() retinanet.module.freeze_bn() print('Num training images: {}'.format(len(dataset_train))) for epoch_num in range(parser.epochs): retinanet.train() retinanet.module.freeze_bn() epoch_loss = [] for iter_num, data in 
enumerate(dataloader_train):
            try:
                optimizer.zero_grad()
                classification_loss, regression_loss = retinanet(
                    [data['img'].cuda().float(), data['annot']])
                classification_loss = classification_loss.mean()
                regression_loss = regression_loss.mean()
                loss = classification_loss + regression_loss
                if bool(loss == 0):
                    continue
                loss.backward()
                torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.1)
                optimizer.step()
                loss_hist.append(float(loss))
                epoch_loss.append(float(loss))
                print(
                    'Epoch: {} | Iteration: {} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}'
                    .format(epoch_num, iter_num, float(classification_loss),
                            float(regression_loss), np.mean(loss_hist)))
                del classification_loss
                del regression_loss
            except Exception as e:
                print(e)
                continue

        if parser.dataset == 'coco':
            print('Evaluating dataset')
            coco_eval.evaluate_coco(dataset_val, retinanet)
        elif parser.dataset == 'csv' and parser.csv_val is not None:
            print('Evaluating dataset')
            mAP = csv_eval.evaluate(dataset_val, retinanet)

        scheduler.step(np.mean(epoch_loss))

        torch.save(
            retinanet.module,
            '{}_retinanet_dilation_{}.pt'.format(parser.dataset, epoch_num))

    retinanet.eval()
    torch.save(retinanet, 'model_final_dilation.pt')
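# The loop above keeps a 500-element deque of recent iteration losses for the
# "Running loss" printout and clips the gradient norm to 0.1 before every optimizer
# step. A self-contained toy version of that iteration pattern; the model, data and
# print interval are dummies chosen only to make the snippet runnable.
import collections

import numpy as np
import torch
import torch.nn as nn

net = nn.Linear(10, 1)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-5)
loss_hist = collections.deque(maxlen=500)   # running-loss window

for iter_num in range(100):
    x = torch.randn(4, 10)
    target = torch.randn(4, 1)

    optimizer.zero_grad()
    loss = nn.functional.mse_loss(net(x), target)
    if bool(loss == 0):      # same guard as above: skip degenerate batches
        continue
    loss.backward()
    torch.nn.utils.clip_grad_norm_(net.parameters(), 0.1)
    optimizer.step()
    loss_hist.append(float(loss))

    if iter_num % 20 == 0:
        print('Iteration: {} | Loss: {:1.5f} | Running loss: {:1.5f}'.format(
            iter_num, float(loss), np.mean(loss_hist)))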
def main(args=None): parser = argparse.ArgumentParser( description='Simple training script for training a RetinaNet network.') parser.add_argument('--dataset', help='Dataset type, must be one of csv or coco.') parser.add_argument('--coco_path', help='Path to COCO directory') parser.add_argument( '--csv_train', help='Path to file containing training annotations (see readme)') parser.add_argument('--csv_classes', help='Path to file containing class list (see readme)') parser.add_argument( '--csv_val', help= 'Path to file containing validation annotations (optional, see readme)' ) parser.add_argument( '--depth', help='Resnet depth, must be one of 18, 34, 50, 101, 152', type=int, default=50) parser.add_argument('--epochs', help='Number of epochs', type=int, default=100) parser.add_argument('--optimizer', help='[SGD | Adam]', type=str, default='SGD') parser.add_argument('--model', help='Path to model (.pt) file.') parser = parser.parse_args(args) # Create the data loaders print("\n[Phase 1]: Creating DataLoader for {} dataset".format( parser.dataset)) if parser.dataset == 'coco': if parser.coco_path is None: raise ValueError('Must provide --coco_path when training on COCO,') dataset_train = CocoDataset(parser.coco_path, set_name='train2014', transform=transforms.Compose( [Normalizer(), Augmenter(), Resizer()])) dataset_val = CocoDataset(parser.coco_path, set_name='val2014', transform=transforms.Compose( [Normalizer(), Resizer()])) elif parser.dataset == 'csv': if parser.csv_train is None: raise ValueError('Must provide --csv_train when training on COCO,') if parser.csv_classes is None: raise ValueError( 'Must provide --csv_classes when training on COCO,') dataset_train = CSVDataset(train_file=parser.csv_train, class_list=parser.csv_classes, transform=transforms.Compose( [Normalizer(), Augmenter(), Resizer()])) if parser.csv_val is None: dataset_val = None print('No validation annotations provided.') else: dataset_val = CSVDataset(train_file=parser.csv_val, class_list=parser.csv_classes, transform=transforms.Compose( [Normalizer(), Resizer()])) else: raise ValueError( 'Dataset type not understood (must be csv or coco), exiting.') sampler = AspectRatioBasedSampler(dataset_train, batch_size=8, drop_last=False) dataloader_train = DataLoader(dataset_train, num_workers=8, collate_fn=collater, batch_sampler=sampler) if dataset_val is not None: sampler_val = AspectRatioBasedSampler(dataset_val, batch_size=16, drop_last=False) dataloader_val = DataLoader(dataset_val, num_workers=8, collate_fn=collater, batch_sampler=sampler_val) # Create the model if parser.depth == 18: retinanet = model.resnet18(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 34: retinanet = model.resnet34(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 50: retinanet = model.resnet50(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 101: retinanet = model.resnet101(num_classes=dataset_train.num_classes(), pretrained=True) elif parser.depth == 152: retinanet = model.resnet152(num_classes=dataset_train.num_classes(), pretrained=True) else: raise ValueError( 'Unsupported model depth, must be one of 18, 34, 50, 101, 152') print('| Num training images: {}'.format(len(dataset_train))) print('| Num test images : {}'.format(len(dataset_val))) print("\n[Phase 2]: Preparing RetinaNet Detection Model...") use_gpu = torch.cuda.is_available() if use_gpu: device = torch.device('cuda') retinanet = retinanet.to(device) retinanet = 
torch.nn.DataParallel(retinanet, device_ids=range( torch.cuda.device_count())) print("| Using %d GPUs for Train/Validation!" % torch.cuda.device_count()) retinanet.training = True if parser.optimizer == 'Adam': optimizer = optim.Adam(retinanet.parameters(), lr=1e-5) # not mentioned print("| Adam Optimizer with Learning Rate = {}".format(1e-5)) elif parser.optimizer == 'SGD': optimizer = optim.SGD(retinanet.parameters(), lr=1e-2, momentum=0.9, weight_decay=1e-4) print("| SGD Optimizer with Learning Rate = {}".format(1e-2)) else: raise ValueError('Unsupported Optimizer, must be one of [SGD | Adam]') scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3, verbose=True) loss_hist = collections.deque(maxlen=500) retinanet.train() retinanet.module.freeze_bn( ) # Freeze the BN parameters to ImageNet configuration # Check if there is a 'checkpoints' path if not osp.exists('./checkpoints/'): os.makedirs('./checkpoints/') print("\n[Phase 3]: Training Model on {} dataset...".format( parser.dataset)) for epoch_num in range(parser.epochs): epoch_loss = [] for iter_num, data in enumerate(dataloader_train): try: optimizer.zero_grad() classification_loss, regression_loss = retinanet( [data['img'].to(device), data['annot']]) classification_loss = classification_loss.mean() regression_loss = regression_loss.mean() loss = classification_loss + regression_loss if bool(loss == 0): continue loss.backward() torch.nn.utils.clip_grad_norm_(retinanet.parameters(), 0.001) optimizer.step() loss_hist.append(float(loss)) epoch_loss.append(float(loss)) sys.stdout.write('\r') sys.stdout.write( '| Epoch: {} | Iteration: {}/{} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Running loss: {:1.5f}' .format(epoch_num + 1, iter_num + 1, len(dataloader_train), float(classification_loss), float(regression_loss), np.mean(loss_hist))) sys.stdout.flush() del classification_loss del regression_loss except Exception as e: print(e) continue print("\n| Saving current best model at epoch {}...".format(epoch_num + 1)) torch.save( retinanet.state_dict(), './checkpoints/{}_retinanet_{}.pt'.format(parser.dataset, epoch_num + 1)) if parser.dataset == 'coco': #print('Evaluating dataset') coco_eval.evaluate_coco(dataset_val, retinanet, device) elif parser.dataset == 'csv' and parser.csv_val is not None: #print('Evaluating dataset') mAP = csv_eval.evaluate(dataset_val, retinanet, device) scheduler.step(np.mean(epoch_loss)) retinanet.eval() torch.save(retinanet.state_dict(), './checkpoints/model_final.pt')
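# Both optimizer branches above drive the same ReduceLROnPlateau(patience=3)
# scheduler with the epoch-mean loss, so the learning rate is cut by 10x (the
# default factor) once the loss has stopped improving for more than three
# consecutive epochs. A small sketch with a dummy optimizer and a made-up loss
# curve; the numbers are illustrative only.
import torch
from torch import optim

param = torch.nn.Parameter(torch.zeros(1))
optimizer = optim.SGD([param], lr=1e-2, momentum=0.9, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=3)

# epoch-mean losses: improvement stalls after the third value
epoch_losses = [1.00, 0.80, 0.70, 0.70, 0.70, 0.70, 0.70, 0.70]

for epoch_num, mean_loss in enumerate(epoch_losses):
    scheduler.step(mean_loss)
    print("epoch {} | mean loss {:.2f} | lr {:.0e}".format(
        epoch_num, mean_loss, optimizer.param_groups[0]["lr"]))
# the lr stays at 1e-02 until the number of non-improving epochs exceeds the
# patience, at which point it drops to 1e-03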