def __init__(self, info):
    """Build an SSD detector from the ``info`` configuration dict.

    ``info`` is expected to carry (assumed from usage -- confirm against the
    caller): ``infer`` (bool, test-mode flag), ``metadatas`` (index 1 holds
    detection metadata with optional 'Categories'), ``params`` (mode /
    threshold overrides) and ``device``.
    """
    super(SSD, self).__init__()
    self.infer = info['infer']

    # Category names come from the detection metadata when present,
    # otherwise fall back to a single generic class.
    detection_metadata = info['metadatas'][1]
    if detection_metadata and 'Categories' in detection_metadata:
        self.categories = detection_metadata['Categories']
    else:
        self.categories = ['object']
    # +1 for the implicit background class.
    self.num_classes = len(self.categories) + 1
    lib.eprint('ssd: set num_classes={}'.format(self.num_classes))

    self.mode = info['params'].get('mode', 'mb2-ssd-lite')
    mb2_width_mult = info['params'].get('mb2_width_mult', 1.0)

    # adapt from train_ssd.py
    if self.mode == 'vgg16-ssd':
        create_net = create_vgg_ssd
        config = vgg_ssd_config
    elif self.mode == 'mb1-ssd':
        create_net = create_mobilenetv1_ssd
        config = mobilenetv1_ssd_config
    elif self.mode == 'mb1-ssd-lite':
        create_net = create_mobilenetv1_ssd_lite
        config = mobilenetv1_ssd_config
    elif self.mode == 'sq-ssd-lite':
        create_net = create_squeezenet_ssd_lite
        config = squeezenet_ssd_config
    elif self.mode == 'mb2-ssd-lite':
        create_net = lambda num, is_test: create_mobilenetv2_ssd_lite(
            num, width_mult=mb2_width_mult, is_test=is_test)
        config = mobilenetv1_ssd_config
    elif self.mode == 'mb3-large-ssd-lite':
        # BUG FIX: the lambda previously took only ``num`` but referenced
        # the undefined free name ``is_test``, and was later called with an
        # ``is_test`` keyword -- both mb3 modes crashed at construction.
        create_net = lambda num, is_test: create_mobilenetv3_large_ssd_lite(
            num, is_test=is_test)
        config = mobilenetv1_ssd_config
    elif self.mode == 'mb3-small-ssd-lite':
        create_net = lambda num, is_test: create_mobilenetv3_small_ssd_lite(
            num, is_test=is_test)
        config = mobilenetv1_ssd_config
    else:
        # Previously an unknown mode fell through and crashed later with
        # UnboundLocalError on ``config``; fail fast with a clear message.
        raise ValueError('ssd: unsupported mode {!r}'.format(self.mode))

    # NOTE: this mutates the (module-level) config object in place.
    config.iou_threshold = info['params'].get('iou_threshold',
                                              config.iou_threshold)
    self.prob_threshold = info['params'].get('confidence_threshold', 0.01)
    self.config = config

    self.model = create_net(self.num_classes, is_test=self.infer)
    self.criterion = MultiboxLoss(config.priors, iou_threshold=0.5,
                                  neg_pos_ratio=3, center_variance=0.1,
                                  size_variance=0.2, device=info['device'])
    self.match_prior = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)
    # Per-channel mean as a (1, 3, 1, 1) tensor for broadcasting over NCHW.
    self.image_mean = torch.tensor(
        self.config.image_mean,
        dtype=torch.float32).reshape(1, 3, 1, 1).to(info['device'])
def res_test(dataset, net, device):
    """Run one evaluation pass of ``net`` over ``dataset``.

    Returns ``(mean_loss, mean_regression_loss, mean_classification_loss)``
    averaged over the number of batches. Reads ``args.batch_size`` /
    ``args.num_workers`` and the module-level ``logger``.
    """
    config = mobilenetv1_ssd_config
    criterion = MultiboxLoss(config.priors,
                             iou_threshold=0.5,
                             neg_pos_ratio=3,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=device)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)
    test_transform = TestTransform(config.image_size, config.image_mean,
                                   config.image_std)
    val_dataset = SKUDataset(dataset,
                             transform=test_transform,
                             target_transform=target_transform,
                             mode='1')
    loader = DataLoader(val_dataset,
                        args.batch_size,
                        num_workers=args.num_workers,
                        shuffle=False)

    net.eval()
    total_loss = 0.0
    total_regression_loss = 0.0
    total_classification_loss = 0.0
    batch_count = 0
    for step, (images, boxes, labels) in enumerate(loader):
        images = images.to(device)
        boxes = boxes.to(device)
        labels = labels.to(device)
        batch_count += 1
        with torch.no_grad():
            confidence, locations = net(images)
            regression_loss, classification_loss = criterion(
                confidence, locations, labels, boxes)
            loss = regression_loss + classification_loss
        total_loss += loss.item()
        total_regression_loss += regression_loss.item()
        total_classification_loss += classification_loss.item()
        if step % 50 == 0:
            logger.info(f"Step: {step} in Test - loss : {loss}. ")

    return (total_loss / batch_count,
            total_regression_loss / batch_count,
            total_classification_loss / batch_count)
timer.start("Load Model") if args.resume: logging.info(f"Resume from the model {args.resume}") net.load_state_dict(torch.load(args.resume, map_location=lambda storage, loc: storage)) elif args.base_net: logging.info(f"Init from base net {args.base_net}") net.init_from_base_net(args.base_net) elif args.pretrained_ssd: logging.info(f"Init from pretrained ssd {args.pretrained_ssd}") net.init_from_pretrained_ssd(args.pretrained_ssd) logging.info(f'Took {timer.end("Load Model"):.2f} seconds to load the model.') net.to(DEVICE) criterion = MultiboxLoss(config.priors, iou_threshold=0.45, neg_pos_ratio=3, center_variance=0.1, size_variance=0.2, device=DEVICE) optimizer = torch.optim.SGD(params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay) logging.info(f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, " + f"Extra Layers learning rate: {extra_layers_lr}.") if args.scheduler == 'multi-step': logging.info("Uses MultiStepLR scheduler.") milestones = [int(v.strip()) for v in args.milestones.split(",")] scheduler = MultiStepLR(optimizer, milestones=milestones, gamma=0.1,) elif args.scheduler == 'cosine': logging.info("Uses CosineAnnealingLR scheduler.") scheduler = CosineAnnealingLR(optimizer, args.t_max,) elif args.scheduler == 'step': logging.info("Uses Step scheduler.")
def optim_and_model_initial(args, net, timer, config, DEVICE):
    """Prepare ``net`` for training.

    Builds the per-group parameter list (honoring the freeze flags), loads
    weights (resume / base net / pretrained ssd), wraps the net in
    ``nn.DataParallel`` on GPU, and constructs the loss, SGD optimizer and
    LR scheduler.

    ``args`` is the nested config dict with ``flow_control`` and
    ``Training_hyperparam`` sections (string "None" is the not-set sentinel
    for the per-group learning rates).

    Returns ``(net, criterion, optimizer, scheduler)``.
    """
    last_epoch = -1
    hyper = args['Training_hyperparam']
    flow = args['flow_control']

    base_net_lr = (hyper['base_net_lr']
                   if hyper['base_net_lr'] != "None" else hyper['lr'])
    extra_layers_lr = (hyper['extra_layers_lr']
                       if hyper['extra_layers_lr'] != "None" else hyper['lr'])

    if flow['freeze_base_net']:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        # (Dead ``params = itertools.chain(...)`` assignment removed: it was
        # immediately overwritten by the list below.)
        params = [{
            'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                      net.extras.parameters()),
            'lr': extra_layers_lr
        }, {
            'params': itertools.chain(net.regression_headers.parameters(),
                                      net.classification_headers.parameters())
        }]
    elif flow['freeze_net']:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [{
            'params': net.base_net.parameters(),
            'lr': base_net_lr
        }, {
            'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                      net.extras.parameters()),
            'lr': extra_layers_lr
        }, {
            'params': itertools.chain(net.regression_headers.parameters(),
                                      net.classification_headers.parameters())
        }]

    timer.start("Load Model")
    if flow['resume']:
        logging.info("Resume from the model {}".format(flow['resume']))
        net.load(flow['resume'])
    elif flow['base_net']:
        logging.info("Init from base net {}".format(flow['base_net']))
        net.init_from_base_net(flow['base_net'])
    elif flow['pretrained_ssd']:
        logging.info("Init from pretrained ssd {}".format(
            flow['pretrained_ssd']))
        net.init_from_pretrained_ssd(flow['pretrained_ssd'])
    logging.info('Took {:.2f} seconds to load the model.'.format(
        timer.end("Load Model")))

    # Train on all visible GPUs.
    net = nn.DataParallel(net).cuda()

    neg_pos_ratio = 3
    criterion = MultiboxLoss(config.priors,
                             iou_threshold=0.5,
                             neg_pos_ratio=neg_pos_ratio,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE)
    optimizer = torch.optim.SGD(
        params,
        lr=hyper['lr'],
        momentum=hyper['momentum'],
        weight_decay=hyper['weighted_decay'])
    logging.info("Learning rate: {}, Base net learning rate: {}, ".format(
        hyper['lr'], base_net_lr) +
        "Extra Layers learning rate: {}.".format(extra_layers_lr))

    if hyper['lr_scheduler'] == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        sched_cfg = hyper['lr_scheduler_param']['multi-step']
        milestones = [int(v.strip()) for v in sched_cfg['milestones'].split(",")]
        scheduler = MultiStepLR(optimizer,
                                milestones=milestones,
                                gamma=sched_cfg['gamma'],
                                last_epoch=last_epoch)
    elif hyper['lr_scheduler'] == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(
            optimizer,
            float(hyper['lr_scheduler_param']['cosine']['t_max']),
            last_epoch=last_epoch)
    else:
        logging.fatal("Unsupported Scheduler: {}.".format(
            hyper['lr_scheduler']))
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info("Start training from epoch {}.".format(last_epoch + 1))
    return net, criterion, optimizer, scheduler
def main(args):
    """Train an SSD detector end to end.

    Builds the network, train/validation datasets, loss, optimizer and LR
    scheduler from ``args``, then runs the train/validate loop, saving a
    checkpoint after each validation epoch.
    """
    DEVICE = torch.device(
        "cuda:0" if torch.cuda.is_available() and args.use_cuda else "cpu")
    if args.use_cuda and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logging.info("Use Cuda.")
    timer = Timer()
    logging.info(args)

    # Pick the network factory and its matching prior-box configuration.
    if args.net == 'vgg16-ssd':
        create_net = create_vgg_ssd
        config = vgg_ssd_config
    elif args.net == 'mb1-ssd':
        create_net = create_mobilenetv1_ssd
        config = mobilenetv1_ssd_config
    elif args.net == 'mb1-ssd-lite':
        create_net = create_mobilenetv1_ssd_lite
        config = mobilenetv1_ssd_config
    elif args.net == 'sq-ssd-lite':
        create_net = create_squeezenet_ssd_lite
        config = squeezenet_ssd_config
    elif args.net == 'mb2-ssd-lite':
        create_net = lambda num: create_mobilenetv2_ssd_lite(
            num, width_mult=args.mb2_width_mult)
        config = mobilenetv1_ssd_config
    else:
        logging.fatal("The net type is wrong.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    train_transform = TrainAugmentation(config.image_size, config.image_mean,
                                        config.image_std)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)
    test_transform = TestTransform(config.image_size, config.image_mean,
                                   config.image_std)

    logging.info("Prepare training datasets.")
    datasets = []
    for dataset_path in args.datasets:
        if args.dataset_type == 'voc':
            dataset = VOCDataset(dataset_path,
                                 transform=train_transform,
                                 target_transform=target_transform)
            label_file = os.path.join(args.checkpoint_folder,
                                      "voc-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            num_classes = len(dataset.class_names)
        elif args.dataset_type == 'open_images':
            dataset = OpenImagesDataset(dataset_path,
                                        transform=train_transform,
                                        target_transform=target_transform,
                                        dataset_type="train",
                                        balance_data=args.balance_data)
            label_file = os.path.join(args.checkpoint_folder,
                                      "open-images-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            logging.info(dataset)
            num_classes = len(dataset.class_names)
        elif args.dataset_type == 'coco':
            # NOTE(review): dataset paths are hard-coded and the label file
            # reuses the open-images name -- parameterize before reuse.
            dataset = CocoDetection(
                "/home/wenyen4desh/datasets/coco/train2017",
                "/home/wenyen4desh/datasets/coco/annotations/instances_train2017.json",
                transform=train_transform,
                target_transform=target_transform)
            label_file = os.path.join(args.checkpoint_folder,
                                      "open-images-model-labels.txt")
            store_labels(label_file, dataset.class_names)
            logging.info(dataset)
            num_classes = len(dataset.class_names)
        else:
            raise ValueError("Dataset type {} is not supported.".format(
                args.dataset_type))
        datasets.append(dataset)
    logging.info("Stored labels into file {}.".format(label_file))
    train_dataset = ConcatDataset(datasets)
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset,
                              args.batch_size,
                              num_workers=args.num_workers,
                              shuffle=True)

    logging.info("Prepare Validation datasets.")
    if args.dataset_type == "voc":
        val_dataset = VOCDataset(args.validation_dataset,
                                 transform=test_transform,
                                 target_transform=target_transform,
                                 is_test=True)
    elif args.dataset_type == 'open_images':
        # NOTE(review): this reuses the *last* training root from the loop
        # above; the "test" split is selected inside that same root.
        val_dataset = OpenImagesDataset(dataset_path,
                                        transform=test_transform,
                                        target_transform=target_transform,
                                        dataset_type="test")
        logging.info(val_dataset)
    elif args.dataset_type == "coco":
        val_dataset = CocoDetection(
            "/home/wenyen4desh/datasets/coco/val2017",
            "/home/wenyen4desh/datasets/coco/annotations/instances_val2017.json",
            transform=test_transform,
            target_transform=target_transform)
        logging.info(val_dataset)
    logging.info("validation dataset size: {}".format(len(val_dataset)))
    val_loader = DataLoader(val_dataset,
                            args.batch_size,
                            num_workers=args.num_workers,
                            shuffle=False)

    logging.info("Build network.")
    net = create_net(num_classes)

    # (Unused local ``min_loss`` removed.)
    last_epoch = -1
    base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr
    extra_layers_lr = (args.extra_layers_lr
                       if args.extra_layers_lr is not None else args.lr)
    if args.freeze_base_net:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        # (Dead ``params = itertools.chain(...)`` assignment removed: it was
        # immediately overwritten by the list below.)
        params = [{
            'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                      net.extras.parameters()),
            'lr': extra_layers_lr
        }, {
            'params': itertools.chain(net.regression_headers.parameters(),
                                      net.classification_headers.parameters())
        }]
    elif args.freeze_net:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [{
            'params': net.base_net.parameters(),
            'lr': base_net_lr
        }, {
            'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                      net.extras.parameters()),
            'lr': extra_layers_lr
        }, {
            'params': itertools.chain(net.regression_headers.parameters(),
                                      net.classification_headers.parameters())
        }]

    timer.start("Load Model")
    if args.resume:
        logging.info("Resume from the model {}".format(args.resume))
        net.load(args.resume)
    elif args.base_net:
        logging.info("Init from base net {}".format(args.base_net))
        net.init_from_base_net(args.base_net)
    elif args.pretrained_ssd:
        logging.info("Init from pretrained ssd {}".format(args.pretrained_ssd))
        net.init_from_pretrained_ssd(args.pretrained_ssd)
    logging.info('Took {:.2f} seconds to load the model.'.format(
        timer.end("Load Model")))
    net.to(DEVICE)

    criterion = MultiboxLoss(config.priors,
                             iou_threshold=0.5,
                             neg_pos_ratio=3,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE)
    optimizer = torch.optim.SGD(params,
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    logging.info("Learning rate: {}, Base net learning rate: {}, ".format(
        args.lr, base_net_lr) +
        "Extra Layers learning rate: {}.".format(extra_layers_lr))

    if args.scheduler == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in args.milestones.split(",")]
        scheduler = MultiStepLR(optimizer,
                                milestones=milestones,
                                gamma=0.1,
                                last_epoch=last_epoch)
    elif args.scheduler == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer, args.t_max,
                                      last_epoch=last_epoch)
    else:
        logging.fatal("Unsupported Scheduler: {}.".format(args.scheduler))
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info("Start training from epoch {}.".format(last_epoch + 1))
    for epoch in range(last_epoch + 1, args.num_epochs):
        # NOTE(review): pre-1.1 PyTorch ordering (scheduler stepped before
        # the optimizer steps inside train()); kept as-is to preserve the
        # original LR schedule.
        scheduler.step()
        train(train_loader, net, criterion, optimizer,
              device=DEVICE, debug_steps=args.debug_steps, epoch=epoch)

        if epoch % args.validation_epochs == 0 or epoch == args.num_epochs - 1:
            val_loss, val_regression_loss, val_classification_loss = test(
                val_loader, net, criterion, DEVICE)
            logging.info(
                "Epoch: {}, ".format(epoch) +
                "Validation Loss: {:.4f}, ".format(val_loss) +
                "Validation Regression Loss {:.4f}, ".format(
                    val_regression_loss) +
                "Validation Classification Loss: {:.4f}".format(
                    val_classification_loss))
            model_path = os.path.join(
                args.checkpoint_folder,
                "{}-Epoch-{}-Loss-{}.pth".format(args.net, epoch, val_loss))
            net.save(model_path)
            logging.info("Saved model {}".format(model_path))
timer.start("Load Model") if args.resume: logging.info(f"Resume from the model {args.resume}") net.load(args.resume) elif args.base_net: logging.info(f"Init from base net {args.base_net}") net.init_from_base_net(args.base_net) elif args.pretrained_ssd: logging.info(f"Init from pretrained ssd {args.pretrained_ssd}") net.init_from_pretrained_ssd(args.pretrained_ssd) logging.info( f'Took {timer.end("Load Model"):.2f} seconds to load the model.') criterion = MultiboxLoss(config.priors, neg_pos_ratio=3, center_variance=0.1, size_variance=0.2) if args.optimizer_type != "Adam": if args.scheduler == 'multi-step': logging.info("Uses MultiStepLR scheduler.") milestones = [int(v.strip()) for v in args.milestones.split(",")] scheduler = MultiStepDecay(args.lr, milestones=milestones, gamma=0.1, last_epoch=last_epoch) elif args.scheduler == 'cosine': logging.info("Uses CosineAnnealingLR scheduler.") scheduler = CosineAnnealingDecay(args.lr, args.t_max, last_epoch=last_epoch)
def setup_and_start_training(self):
    """Build the network/datasets/optimizer from ``self.system_dict`` and
    run the full train/validate/checkpoint loop.

    BUG FIX: ``net_name`` was previously bound only inside the validation
    branch, so checkpointing with validation disabled raised NameError; it
    is now bound once before the epoch loop.
    """
    logging.basicConfig(
        stream=sys.stdout,
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s')
    cfg = self.system_dict["params"]

    DEVICE = torch.device(
        "cuda:0" if torch.cuda.is_available() and cfg["use_cuda"] else "cpu")
    if cfg["use_cuda"] and torch.cuda.is_available():
        torch.backends.cudnn.benchmark = True
        logging.info("Using gpu.")
    else:
        logging.info("Using cpu.")

    timer = Timer()
    logging.info(self.system_dict)

    # Pick the network factory and its matching prior-box configuration.
    if cfg["net"] == 'vgg16-ssd':
        create_net = create_vgg_ssd
        config = vgg_ssd_config
    elif cfg["net"] == 'mb1-ssd':
        create_net = create_mobilenetv1_ssd
        config = mobilenetv1_ssd_config
    elif cfg["net"] == 'mb1-ssd-lite':
        create_net = create_mobilenetv1_ssd_lite
        config = mobilenetv1_ssd_config
    elif cfg["net"] == 'sq-ssd-lite':
        create_net = create_squeezenet_ssd_lite
        config = squeezenet_ssd_config
    elif cfg["net"] == 'mb2-ssd-lite':
        create_net = lambda num: create_mobilenetv2_ssd_lite(
            num, width_mult=cfg["mb2_width_mult"])
        config = mobilenetv1_ssd_config
    else:
        logging.fatal("The net type is wrong.")
        sys.exit(1)

    train_transform = TrainAugmentation(config.image_size, config.image_mean,
                                        config.image_std)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance, 0.5)
    test_transform = TestTransform(config.image_size, config.image_mean,
                                   config.image_std)

    logging.info("Prepare training datasets.")
    datasets = []
    # NOTE(review): the *training* dataset is built from the "val" img/label
    # dirs -- looks like it should read dataset["train"]; confirm upstream.
    dataset = VOCDataset(
        self.system_dict["dataset"]["val"]["img_dir"],
        self.system_dict["dataset"]["val"]["label_dir"],
        transform=train_transform,
        target_transform=target_transform,
        label_file=cfg["label_file"])
    label_file = cfg["label_file"]
    num_classes = len(dataset.class_names)
    datasets.append(dataset)
    logging.info(f"Stored labels into file {label_file}.")
    train_dataset = ConcatDataset(datasets)
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset,
                              cfg["batch_size"],
                              num_workers=cfg["num_workers"],
                              shuffle=True)

    if self.system_dict["dataset"]["val"]["status"]:
        val_dataset = VOCDataset(
            self.system_dict["dataset"]["val"]["img_dir"],
            self.system_dict["dataset"]["val"]["label_dir"],
            transform=test_transform,
            target_transform=target_transform,
            is_test=True,
            label_file=cfg["label_file"])
        logging.info("validation dataset size: {}".format(len(val_dataset)))
        val_loader = DataLoader(val_dataset,
                                cfg["batch_size"],
                                num_workers=cfg["num_workers"],
                                shuffle=False)

    logging.info("Build network.")
    net = create_net(num_classes)

    # (Unused local ``min_loss`` removed.)
    last_epoch = -1
    base_net_lr = (cfg["base_net_lr"]
                   if cfg["base_net_lr"] is not None else cfg["lr"])
    extra_layers_lr = (cfg["extra_layers_lr"]
                       if cfg["extra_layers_lr"] is not None else cfg["lr"])

    if cfg["freeze_base_net"]:
        logging.info("Freeze base net.")
        freeze_net_layers(net.base_net)
        # (Dead ``params = itertools.chain(...)`` assignment removed: it was
        # immediately overwritten by the list below.)
        params = [{
            'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                      net.extras.parameters()),
            'lr': extra_layers_lr
        }, {
            'params': itertools.chain(net.regression_headers.parameters(),
                                      net.classification_headers.parameters())
        }]
    elif cfg["freeze_net"]:
        freeze_net_layers(net.base_net)
        freeze_net_layers(net.source_layer_add_ons)
        freeze_net_layers(net.extras)
        params = itertools.chain(net.regression_headers.parameters(),
                                 net.classification_headers.parameters())
        logging.info("Freeze all the layers except prediction heads.")
    else:
        params = [{
            'params': net.base_net.parameters(),
            'lr': base_net_lr
        }, {
            'params': itertools.chain(net.source_layer_add_ons.parameters(),
                                      net.extras.parameters()),
            'lr': extra_layers_lr
        }, {
            'params': itertools.chain(net.regression_headers.parameters(),
                                      net.classification_headers.parameters())
        }]

    timer.start("Load Model")
    resume = cfg["resume"]
    base_net = cfg["base_net"]
    pretrained_ssd = cfg["pretrained_ssd"]
    if cfg["resume"]:
        logging.info(f"Resume from the model {resume}")
        net.load(cfg["resume"])
    elif cfg["base_net"]:
        logging.info(f"Init from base net {base_net}")
        net.init_from_base_net(cfg["base_net"])
    elif cfg["pretrained_ssd"]:
        logging.info(f"Init from pretrained ssd {pretrained_ssd}")
        net.init_from_pretrained_ssd(cfg["pretrained_ssd"])
    logging.info(
        f'Took {timer.end("Load Model"):.2f} seconds to load the model.')
    net.to(DEVICE)

    criterion = MultiboxLoss(config.priors,
                             iou_threshold=0.5,
                             neg_pos_ratio=3,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE)
    optimizer = torch.optim.SGD(params,
                                lr=cfg["lr"],
                                momentum=cfg["momentum"],
                                weight_decay=cfg["weight_decay"])
    lr = cfg["lr"]
    logging.info(
        f"Learning rate: {lr}, Base net learning rate: {base_net_lr}, " +
        f"Extra Layers learning rate: {extra_layers_lr}.")

    # Default milestones at 1/3 and 2/3 of the training run when unset.
    if not cfg["milestones"]:
        cfg["milestones"] = ""
        cfg["milestones"] += str(int(cfg["num_epochs"] / 3)) + ","
        cfg["milestones"] += str(int(2 * cfg["num_epochs"] / 3))

    if cfg["scheduler"] == 'multi-step':
        logging.info("Uses MultiStepLR scheduler.")
        milestones = [int(v.strip()) for v in cfg["milestones"].split(",")]
        scheduler = MultiStepLR(optimizer,
                                milestones=milestones,
                                gamma=0.1,
                                last_epoch=last_epoch)
    elif cfg["scheduler"] == 'cosine':
        logging.info("Uses CosineAnnealingLR scheduler.")
        scheduler = CosineAnnealingLR(optimizer, cfg["t_max"],
                                      last_epoch=last_epoch)

    # BUG FIX: bind once up front (see docstring).
    net_name = cfg["net"]

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, cfg["num_epochs"]):
        scheduler.step()
        self.base_train(train_loader, net, criterion, optimizer,
                        device=DEVICE,
                        debug_steps=cfg["debug_steps"],
                        epoch=epoch)

        if (self.system_dict["dataset"]["val"]["status"]
                and (epoch % cfg["validation_epochs"] == 0
                     or epoch == cfg["num_epochs"] - 1)):
            val_loss, val_regression_loss, val_classification_loss = \
                self.base_test(val_loader, net, criterion, DEVICE)
            logging.info(
                f"Epoch: {epoch}, " +
                f"Validation Loss: {val_loss:.4f}, " +
                f"Validation Regression Loss {val_regression_loss:.4f}, " +
                f"Validation Classification Loss: {val_classification_loss:.4f}"
            )
            model_path = os.path.join(
                cfg["checkpoint_folder"],
                f"{net_name}-Epoch-{epoch}-Loss-{val_loss}.pth")
            net.save(model_path)
            logging.info(f"Saved model {model_path}")

        if not self.system_dict["dataset"]["val"]["status"]:
            model_path = os.path.join(
                cfg["checkpoint_folder"],
                f"{net_name}-Epoch-{epoch}.pth")
            net.save(model_path)
            logging.info(f"Saved model {model_path}")
def main():
    """Distiller-style compression/training entry point.

    Parses args, sets up logging and devices, builds the model, criterion and
    optimizer, dispatches the various one-shot modes (summary, ONNX export,
    sensitivity, evaluation, thinnify, AMC/greedy), then runs the
    train/validate/checkpoint loop and finally launches the eval entry point
    on the best checkpoint.

    BUG FIX: ``raw_fullpath_best`` is initialized before the epoch loop so
    the final ``eval_params`` block cannot hit a NameError when every
    ``save_checkpoint`` attempt fails; the failure is now logged instead of
    silently swallowed.
    """
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))
    global msglogger

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir,
        args.verbose)

    # Log various details about the execution environment. It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(
        filter(None, [args.compress, args.qe_stats_file
                      ]),  # remove both None and empty strings
        msglogger.logdir,
        gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    if args.evaluate:
        args.deterministic = True
    if args.deterministic:
        distiller.set_deterministic(
            args.seed)  # For experiment reproducability
    else:
        if args.seed is not None:
            distiller.set_seed(args.seed)
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                raise ValueError(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    raise ValueError(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = distiller.apputils.classification_dataset_str_from_arch(
        args.arch)
    args.num_classes = distiller.apputils.classification_num_classes(
        args.dataset)

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model, config = create_model(args.pretrained, args.dataset, args.arch,
                                 parallel=not args.load_serialized,
                                 device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends. TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board. PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # TODO(barrh): args.deprecated_resume is deprecated since v0.3.1
    if args.deprecated_resume:
        msglogger.warning(
            'The "--resume" flag is deprecated. Please use "--resume-from=YOUR_PATH" instead.'
        )
        if not args.reset_optimizer:
            msglogger.warning(
                'If you wish to also reset the optimizer, call with: --reset-optimizer'
            )
            args.reset_optimizer = True
        args.resumed_checkpoint_path = args.deprecated_resume

    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model, args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info(
                '\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0'
            )

    # Define loss function (criterion)
    if "ssd" in args.arch:
        neg_pos_ratio = 3
        criterion = MultiboxLoss(config.priors,
                                 iou_threshold=0.5,
                                 neg_pos_ratio=neg_pos_ratio,
                                 center_variance=0.1,
                                 size_variance=0.2,
                                 device=args.device,
                                 reduction="sum",
                                 class_reduction=True,
                                 verbose=0)
    else:
        criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is None:
        if "ssd" in args.arch:
            base_net_lr = args.lr
            extra_layers_lr = args.lr
            params = [{
                'params': model.base_net.parameters(),
                'lr': base_net_lr
            }, {
                'params':
                itertools.chain(model.source_layer_add_ons.parameters(),
                                model.extras.parameters()),
                'lr': extra_layers_lr
            }, {
                'params':
                itertools.chain(model.regression_headers.parameters(),
                                model.classification_headers.parameters())
            }]
        else:
            params = model.parameters()
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        for summary in args.summary:
            distiller.model_summary(model, summary, args.dataset)
        return
    if args.export_onnx is not None:
        return distiller.export_img_classifier_to_onnx(
            model,
            os.path.join(msglogger.logdir, args.export_onnx),
            args.dataset,
            add_softmax=True,
            verbose=False)
    if args.qe_calibration:
        return acts_quant_stats_collection(model, criterion, pylogger, args)
    if args.activation_histograms:
        return acts_histogram_collection(model, criterion, pylogger, args)

    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch. The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = load_data(args, config=config)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model, compression_scheduler.zeros_mask_dict,
                                 args.arch, args.dataset, optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resumed_checkpoint_path.replace(
                                         ".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher, _ = create_model(args.kd_pretrained, args.dataset,
                                  args.kd_teacher,
                                  parallel=not args.load_serialized,
                                  device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_lean_checkpoint(teacher, args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        raw_teacher_model_path = msglogger.logdir + "/raw_teacher.pth.tar"
        if not os.path.exists(raw_teacher_model_path):
            teacher.save(raw_teacher_model_path)
            msglogger.info(Fore.CYAN + '\tRaw Teacher Model saved: {0}'.format(
                raw_teacher_model_path) + Style.RESET_ALL)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw,
            loss_type=args.kd_loss_type,
            focal_alpha=args.kd_focal_alpha,
            use_adaptive=args.kd_focal_adaptive,
            verbose=0)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)
        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if start_epoch >= ending_epoch:
        msglogger.error(
            'epoch count is too low, starting epoch is {} but total epochs set to {}'
            .format(start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    # BUG FIX: ensure the name is always bound for the final eval_params.
    raw_fullpath_best = None

    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(
                epoch, metrics=(vloss if (epoch != start_epoch) else 10**6))

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader, model, criterion, optimizer, epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger], args=args)
            distiller.log_weights_sparsity(model, epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch, "train", loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch, "valid", loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch,
                                        steps_completed=0, total_steps=1,
                                        log_freq=1, loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, top1, top5,
                                       epoch, args.num_best_scores)
        is_best = epoch == perf_scores_history[0].epoch
        checkpoint_extras = {
            'current_top1': top1,
            'best_top1': perf_scores_history[0].top1,
            'best_epoch': perf_scores_history[0].epoch
        }
        try:
            raw_fullpath_best = apputils.save_checkpoint(
                epoch, args.arch, model,
                optimizer=optimizer,
                scheduler=compression_scheduler,
                extras=checkpoint_extras,
                is_best=is_best,
                name=args.name,
                dir=msglogger.logdir)
        except Exception:
            # Keep the previous fullpath_best, but surface the failure
            # instead of silently swallowing it.
            msglogger.exception('save_checkpoint failed for epoch %d', epoch)
        mlflow.log_artifacts(msglogger.logdir)

    # Finally run results on the test set
    eval_params = {
        "model_type": args.arch,
        "model_path": raw_fullpath_best,
        "dataset_path": args.data,
        "label_path": "models/voc-model-labels.txt"
    }
    mlflow.projects.run(uri=".", entry_point="eval", use_conda=False,
                        parameters=eval_params)
def train_network(dataset_path, model_path, net_type):
    """Train a 'slim' or 'RFB' face-detection SSD on a VOC-style dataset.

    Args:
        dataset_path: root of the VOC-format dataset (used for both train
            and validation splits).
        model_path: directory where checkpoints and the label file are written.
        net_type: 'slim' (create_mb_tiny_fd) or 'RFB' (create_Mb_Tiny_RFB_fd);
            anything else aborts via sys.exit(1).

    Side effects: mutates the module-level ``args`` namespace, creates the
    checkpoint folder, and saves a ``.pth`` checkpoint every
    ``args.validation_epochs`` epochs (and on the last epoch).
    """
    # Record the run configuration on the shared args namespace.
    args.datasets = dataset_path
    args.validation_dataset = dataset_path
    args.checkpoint_folder = model_path
    args.log_dir = os.path.join(args.checkpoint_folder, 'log')
    args.net = net_type
    timer = Timer()
    logging.info(args)

    # Select the network factory; both nets share the same prior-box config.
    if args.net == 'slim':
        create_net = create_mb_tiny_fd
        config = fd_config
    elif args.net == 'RFB':
        create_net = create_Mb_Tiny_RFB_fd
        config = fd_config
    else:
        logging.fatal("The net type is wrong.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    train_transform = TrainAugmentation(config.image_size, config.image_mean,
                                        config.image_std)
    target_transform = MatchPrior(config.priors, config.center_variance,
                                  config.size_variance,
                                  args.overlap_threshold)
    test_transform = TestTransform(config.image_size, config.image_mean_test,
                                   config.image_std)

    if not os.path.exists(args.checkpoint_folder):
        os.makedirs(args.checkpoint_folder)

    logging.info("Prepare training datasets.")
    dataset = VOCDataset(dataset_path, transform=train_transform,
                         target_transform=target_transform)
    label_file = os.path.join(args.checkpoint_folder, "voc-model-labels.txt")
    store_labels(label_file, dataset.class_names)
    num_classes = len(dataset.class_names)
    print('num_classes: ', num_classes)
    logging.info(f"Stored labels into file {label_file}.")

    train_dataset = dataset
    logging.info("Train dataset size: {}".format(len(train_dataset)))
    train_loader = DataLoader(train_dataset, args.batch_size,
                              num_workers=args.num_workers, shuffle=True)

    logging.info("Prepare Validation datasets.")
    val_dataset = VOCDataset(args.validation_dataset, transform=test_transform,
                             target_transform=target_transform, is_test=True)
    logging.info("validation dataset size: {}".format(len(val_dataset)))
    val_loader = DataLoader(val_dataset, args.batch_size,
                            num_workers=args.num_workers, shuffle=False)

    logging.info("Build network.")
    net = create_net(num_classes)
    timer.start("Load Model")
    if args.resume:
        logging.info(f"Resume from the model {args.resume}")
        net.load(args.resume)
    logging.info(
        f'Took {timer.end("Load Model"):.2f} seconds to load the model.')

    # Multi-GPU: wrap in DataParallel when GPUs are present. Keep a handle to
    # the unwrapped module so parameter groups and save() work either way.
    # BUGFIX: the original read `net.module` unconditionally, which raised
    # AttributeError on CPU-only machines where DataParallel was never applied.
    if torch.cuda.device_count() >= 1:
        cuda_index_list = [int(v.strip()) for v in args.cuda_index.split(",")]
        net = nn.DataParallel(net, device_ids=cuda_index_list)
        logging.info("use gpu :{}".format(cuda_index_list))
        core_net = net.module
    else:
        core_net = net

    last_epoch = -1

    # Separate learning rates: backbone, extra/source layers, and heads
    # (heads fall back to the optimizer's default lr).
    base_net_lr = args.base_net_lr if args.base_net_lr is not None else args.lr
    extra_layers_lr = (args.extra_layers_lr
                       if args.extra_layers_lr is not None else args.lr)
    params = [{
        'params': core_net.base_net.parameters(),
        'lr': base_net_lr
    }, {
        'params': itertools.chain(core_net.source_layer_add_ons.parameters(),
                                  core_net.extras.parameters()),
        'lr': extra_layers_lr
    }, {
        'params': itertools.chain(core_net.regression_headers.parameters(),
                                  core_net.classification_headers.parameters())
    }]

    net.to(DEVICE)
    criterion = MultiboxLoss(config.priors,
                             iou_threshold=args.iou_threshold,
                             neg_pos_ratio=5,
                             center_variance=0.1,
                             size_variance=0.2,
                             device=DEVICE,
                             num_classes=num_classes,
                             loss_type=args.loss_type)

    if args.optimizer_type == "SGD":
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
    elif args.optimizer_type == "Adam":
        optimizer = torch.optim.Adam(params, lr=args.lr)
        logging.info("use Adam optimizer")
    else:
        # BUGFIX: the message previously interpolated args.scheduler, hiding
        # which optimizer value was actually rejected.
        logging.fatal(f"Unsupported optimizer: {args.optimizer_type}.")
        parser.print_help(sys.stderr)
        sys.exit(1)

    logging.info(
        f"Learning rate: {args.lr}, Base net learning rate: {base_net_lr}, " +
        f"Extra Layers learning rate: {extra_layers_lr}.")

    # Adam runs at a fixed lr; SGD uses either MultiStepLR or a manual
    # poly decay applied inside the epoch loop.
    if args.optimizer_type != "Adam":
        if args.scheduler == 'multi-step':
            logging.info("Uses MultiStepLR scheduler.")
            milestones = [int(v.strip()) for v in args.milestones.split(",")]
            scheduler = MultiStepLR(optimizer,
                                    milestones=milestones,
                                    gamma=0.1,
                                    last_epoch=last_epoch)
        elif args.scheduler == 'poly':
            logging.info("Uses PolyLR scheduler.")
        else:
            logging.fatal(f"Unsupported Scheduler: {args.scheduler}.")
            parser.print_help(sys.stderr)
            sys.exit(1)

    logging.info(f"Start training from epoch {last_epoch + 1}.")
    for epoch in range(last_epoch + 1, args.num_epochs):
        if args.optimizer_type != "Adam":
            if args.scheduler != "poly":
                if epoch != 0:
                    scheduler.step()
        train(train_loader, net, criterion, optimizer,
              device=DEVICE, debug_steps=args.debug_steps, epoch=epoch)
        if args.scheduler == "poly":
            adjust_learning_rate(optimizer, epoch)
        logging.info("epoch: {} lr rate :{}".format(
            epoch, optimizer.param_groups[0]['lr']))

        # Validate periodically and always on the final epoch.
        if (epoch % args.validation_epochs == 0
                or epoch == args.num_epochs - 1):
            logging.info("validation epoch: {} lr rate :{}".format(
                epoch, optimizer.param_groups[0]['lr']))
            val_loss, val_regression_loss, val_classification_loss = test(
                val_loader, net, criterion, DEVICE)
            logging.info(
                f"Epoch: {epoch}, " + f"Validation Loss: {val_loss:.4f}, " +
                f"Validation Regression Loss {val_regression_loss:.4f}, " +
                f"Validation Classification Loss: {val_classification_loss:.4f}"
            )
            # Renamed from `model_path` to avoid shadowing the parameter.
            checkpoint_path = os.path.join(
                args.checkpoint_folder,
                f"{args.net}-Epoch-{epoch}-Loss-{val_loss:.4f}.pth")
            core_net.save(checkpoint_path)
            logging.info(f"Saved model {checkpoint_path}")