# Imports assumed by the code in this section (stdlib plus torch/torchvision).
# Project-local names -- fc, cnv, CNV, LFC, MirrorMNIST, Logger,
# TrainingEpochMeters, EvalEpochMeters, accuracy, SqrHingeLoss, model_with_cfg,
# Exporter -- come from the surrounding package and are not imported here.
import os
import random
import time
from datetime import datetime

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import MultiStepLR
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import CIFAR10, MNIST


def __init__(self, args):
    if args.network == 'TFC-W1A1':
        model = fc()
        dataset = 'MNIST'
    else:
        model = cnv()
        dataset = 'CIFAR10'

    # Init arguments
    self.args = args
    self.output_dir_path = os.path.join(args.experiments, args.network)

    if self.args.resume:
        self.output_dir_path, _ = os.path.split(args.resume)
        self.output_dir_path, _ = os.path.split(self.output_dir_path)

    if not args.dry_run:
        self.checkpoints_dir_path = os.path.join(self.output_dir_path, 'checkpoints')
        if not args.resume:
            os.mkdir(self.output_dir_path)
            os.mkdir(self.checkpoints_dir_path)
    self.logger = Logger(self.output_dir_path, args.dry_run)

    # Randomness
    random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # Datasets
    transform_to_tensor = transforms.Compose([transforms.ToTensor()])
    self.num_classes = 10
    if dataset == 'CIFAR10':
        train_transforms_list = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()]
        transform_train = transforms.Compose(train_transforms_list)
        builder = CIFAR10
    elif dataset == 'MNIST':
        transform_train = transform_to_tensor
        builder = MirrorMNIST
    else:
        raise Exception("Dataset not supported: {}".format(dataset))

    train_set = builder(root=args.datadir, train=True, download=True, transform=transform_train)
    test_set = builder(root=args.datadir, train=False, download=True, transform=transform_to_tensor)
    self.train_loader = DataLoader(
        train_set, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    self.test_loader = DataLoader(
        test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    # Init starting values
    self.starting_epoch = 1
    self.best_val_acc = 0

    # Setup device
    if args.gpus is not None:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        self.device = 'cuda:' + str(args.gpus[0])
        torch.backends.cudnn.benchmark = True
    else:
        self.device = 'cpu'
    self.device = torch.device(self.device)

    # Resume checkpoint, if any
    if args.resume:
        print('Loading model checkpoint at: {}'.format(args.resume))
        package = torch.load(args.resume, map_location='cpu')
        model_state_dict = package['state_dict']
        model.load_state_dict(model_state_dict)
    if args.gpus is not None and len(args.gpus) == 1:
        model = model.to(device=self.device)
    if args.gpus is not None and len(args.gpus) > 1:
        model = nn.DataParallel(model, args.gpus)
    self.model = model

    # Loss function
    if args.loss == 'SqrHinge':
        self.criterion = SqrHingeLoss()
    else:
        self.criterion = nn.CrossEntropyLoss()
    self.criterion = self.criterion.to(device=self.device)

    # Init optimizer
    if args.optim == 'ADAM':
        self.optimizer = optim.Adam(
            self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optim == 'SGD':
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=self.args.lr,
            momentum=self.args.momentum,
            weight_decay=self.args.weight_decay)

    # Resume optimizer, if any
    if args.resume and not args.evaluate:
        self.logger.log.info("Loading optimizer checkpoint")
        if 'optim_dict' in package.keys():
            self.optimizer.load_state_dict(package['optim_dict'])
        if 'epoch' in package.keys():
            self.starting_epoch = package['epoch']
        if 'best_val_acc' in package.keys():
            self.best_val_acc = package['best_val_acc']

    # LR scheduler
    if args.scheduler == 'STEP':
        milestones = [int(i) for i in args.milestones.split(',')]
        self.scheduler = MultiStepLR(
            optimizer=self.optimizer, milestones=milestones, gamma=0.1)
    elif args.scheduler == 'FIXED':
        self.scheduler = None
    else:
        raise Exception("Unrecognized scheduler {}".format(self.args.scheduler))

    # Resume scheduler, if any
    if args.resume and not args.evaluate and self.scheduler is not None:
        self.scheduler.last_epoch = package['epoch'] - 1
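
# Usage sketch (hypothetical driver, not part of the original source): the
# __init__ above only reads attribute names off `args`, so any object with
# these fields works. An argparse parser mirroring those fields might look
# like the following; the defaults are illustrative assumptions, not the
# project's actual values.
import argparse

def make_parser():
    parser = argparse.ArgumentParser()
    parser.add_argument('--network', default='TFC-W1A1')  # 'TFC-W1A1' -> fc()/MNIST, anything else -> cnv()/CIFAR10
    parser.add_argument('--experiments', default='./experiments')
    parser.add_argument('--datadir', default='./data')
    parser.add_argument('--resume', default=None)         # path to a checkpoint .tar, or None
    parser.add_argument('--evaluate', action='store_true')
    parser.add_argument('--dry_run', action='store_true')
    parser.add_argument('--random_seed', type=int, default=1)
    parser.add_argument('--batch_size', type=int, default=100)
    parser.add_argument('--num_workers', type=int, default=4)
    parser.add_argument('--gpus', default=None)           # comma-separated string, e.g. '0,1'
    parser.add_argument('--loss', default='SqrHinge')     # anything else -> CrossEntropyLoss
    parser.add_argument('--optim', default='ADAM')        # or 'SGD'
    parser.add_argument('--lr', type=float, default=0.02)
    parser.add_argument('--weight_decay', type=float, default=0.0)
    parser.add_argument('--momentum', type=float, default=0.9)
    parser.add_argument('--scheduler', default='FIXED')   # 'STEP' also reads --milestones
    parser.add_argument('--milestones', default='100,200')
    return parser

# trainer = Trainer(make_parser().parse_args())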
class Trainer(object):

    def __init__(self, config):
        # Init arguments
        self.config = config
        if config.weight_bit_width is None and config.act_bit_width is None:
            prec_name = ""
        else:
            prec_name = "_{}W{}A".format(config.weight_bit_width, config.act_bit_width)
        experiment_name = '{}{}_{}'.format(
            config.network, prec_name, datetime.now().strftime('%Y%m%d_%H%M%S'))
        self.output_dir_path = os.path.join(config.experiments, experiment_name)

        if self.config.resume:
            self.output_dir_path, _ = os.path.split(config.resume)
            self.output_dir_path, _ = os.path.split(self.output_dir_path)

        if not config.dry_run:
            self.checkpoints_dir_path = os.path.join(self.output_dir_path, 'checkpoints')
            if not config.resume:
                os.mkdir(self.output_dir_path)
                os.mkdir(self.checkpoints_dir_path)
        self.logger = Logger(self.output_dir_path, config.dry_run)

        # Randomness
        random.seed(config.random_seed)
        torch.manual_seed(config.random_seed)
        torch.cuda.manual_seed_all(config.random_seed)

        # Datasets
        transform_to_tensor = transforms.Compose([transforms.ToTensor()])
        if config.dataset == 'CIFAR10':
            train_transforms_list = [
                transforms.RandomCrop(32, padding=4),
                transforms.RandomHorizontalFlip(),
                transforms.ToTensor()]
            transform_train = transforms.Compose(train_transforms_list)
            builder = CIFAR10
            self.num_classes = 10
            in_channels = 3
        elif config.dataset == 'MNIST':
            transform_train = transform_to_tensor
            builder = MNIST
            self.num_classes = 10
            in_channels = 1
        else:
            raise Exception("Dataset not supported: {}".format(config.dataset))

        train_set = builder(root=config.datadir, train=True, download=True, transform=transform_train)
        test_set = builder(root=config.datadir, train=False, download=True, transform=transform_to_tensor)
        self.train_loader = DataLoader(
            train_set, batch_size=config.batch_size, shuffle=True, num_workers=config.num_workers)
        self.test_loader = DataLoader(
            test_set, batch_size=config.batch_size, shuffle=False, num_workers=config.num_workers)

        # Init starting values
        self.starting_epoch = 1
        self.best_val_acc = 0

        # Setup device
        if config.gpus is not None:
            config.gpus = [int(i) for i in config.gpus.split(',')]
            self.device = 'cuda:' + str(config.gpus[0])
            torch.backends.cudnn.benchmark = True
        else:
            self.device = 'cpu'
        self.device = torch.device(self.device)

        # Setup model
        if config.network == 'CNV':
            model = CNV(
                weight_bit_width=config.weight_bit_width,
                act_bit_width=config.act_bit_width,
                in_bit_width=config.in_bit_width,
                num_classes=self.num_classes,
                in_ch=in_channels)
        elif config.network == 'LFC':
            model = LFC(
                weight_bit_width=config.weight_bit_width,
                act_bit_width=config.act_bit_width,
                in_bit_width=config.in_bit_width,
                num_classes=self.num_classes,
                in_ch=in_channels)
        else:
            raise Exception("Model not supported")
        model = model.to(device=self.device)
        if config.gpus is not None and len(config.gpus) > 1:
            model = nn.DataParallel(model, config.gpus)
        self.model = model

        # Loss function
        if config.loss == 'SqrHinge':
            self.criterion = SqrHingeLoss()
        else:
            self.criterion = nn.CrossEntropyLoss()
        self.criterion = self.criterion.to(device=self.device)

        # Resume model, if any
        if config.resume:
            print('Loading model checkpoint at: {}'.format(config.resume))
            package = torch.load(config.resume, map_location=self.device)
            model_state_dict = package['state_dict']
            self.model.load_state_dict(model_state_dict, strict=config.strict)

        # Init optimizer
        if config.optim == 'ADAM':
            self.optimizer = optim.Adam(
                self.model.parameters(), lr=config.lr, weight_decay=config.weight_decay)
        elif config.optim == 'SGD':
            self.optimizer = optim.SGD(
                self.model.parameters(),
                lr=self.config.lr,
                momentum=self.config.momentum,
                weight_decay=self.config.weight_decay)

        # Resume optimizer, if any
        if config.resume and not config.evaluate:
            self.logger.log.info("Loading optimizer checkpoint")
            if 'optim_dict' in package.keys():
                self.optimizer.load_state_dict(package['optim_dict'])
            if 'epoch' in package.keys():
                self.starting_epoch = package['epoch']
            if 'best_val_acc' in package.keys():
                self.best_val_acc = package['best_val_acc']

        # LR scheduler
        if config.scheduler == 'STEP':
            milestones = [int(i) for i in config.milestones.split(',')]
            self.scheduler = MultiStepLR(
                optimizer=self.optimizer, milestones=milestones, gamma=0.1)
        elif config.scheduler == 'FIXED':
            self.scheduler = None
        else:
            raise Exception("Unrecognized scheduler {}".format(self.config.scheduler))

        # Resume scheduler, if any
        if config.resume and not config.evaluate and self.scheduler is not None:
            self.scheduler.last_epoch = package['epoch'] - 1

    def checkpoint_best(self, epoch, name):
        best_path = os.path.join(self.checkpoints_dir_path, name)
        self.logger.info("Saving checkpoint model to {}".format(best_path))
        torch.save({
            'state_dict': self.model.state_dict(),
            'optim_dict': self.optimizer.state_dict(),
            'epoch': epoch + 1,
            'best_val_acc': self.best_val_acc,
            'config': self.config},
            best_path)

    def train_model(self):
        # Training starts
        if self.config.detect_nan:
            torch.autograd.set_detect_anomaly(True)

        for epoch in range(self.starting_epoch, self.config.epochs):

            # Set to training mode
            self.model.train()
            self.criterion.train()

            # Init metrics
            epoch_meters = TrainingEpochMeters()
            start_data_loading = time.time()

            for i, data in enumerate(self.train_loader):
                (input, target) = data
                input = input.to(self.device, non_blocking=True)
                target = target.to(self.device, non_blocking=True)

                # For hinge loss only: expand the target to a +/-1 one-hot encoding
                if isinstance(self.criterion, SqrHingeLoss):
                    target = target.unsqueeze(1)
                    target_onehot = torch.Tensor(target.size(0), self.num_classes).to(
                        self.device, non_blocking=True)
                    target_onehot.fill_(-1)
                    target_onehot.scatter_(1, target, 1)
                    target = target.squeeze()
                    target_var = target_onehot
                else:
                    target_var = target

                # Measure data loading time
                epoch_meters.data_time.update(time.time() - start_data_loading)

                # Training batch starts
                start_batch = time.time()
                output = self.model(input)
                loss = self.criterion(output, target_var)

                # Compute gradient and do SGD step
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                self.model.clip_weights(-1, 1)

                # Measure elapsed time
                epoch_meters.batch_time.update(time.time() - start_batch)

                if i % int(self.config.log_freq) == 0 or i == len(self.train_loader) - 1:
                    prec1, prec5 = accuracy(output.detach(), target, topk=(1, 5))
                    epoch_meters.losses.update(loss.item(), input.size(0))
                    epoch_meters.top1.update(prec1.item(), input.size(0))
                    epoch_meters.top5.update(prec5.item(), input.size(0))
                    self.logger.training_batch_cli_log(
                        epoch_meters, epoch, i, len(self.train_loader))

                # Training batch ends
                start_data_loading = time.time()

            # Set the learning rate
            if self.scheduler is not None:
                self.scheduler.step(epoch)
            else:
                # Halve the learning rate every 40 epochs
                if epoch % 40 == 0:
                    self.optimizer.param_groups[0]['lr'] *= 0.5

            # Perform eval
            with torch.no_grad():
                top1avg = self.eval_model(epoch)

            # Checkpoint
            if top1avg >= self.best_val_acc and not self.config.dry_run:
                self.best_val_acc = top1avg
                self.checkpoint_best(epoch, "best.tar")
            elif not self.config.dry_run:
                self.checkpoint_best(epoch, "checkpoint.tar")

        # Training ends
        if not self.config.dry_run:
            return os.path.join(self.checkpoints_dir_path, "best.tar")

    def eval_model(self, epoch=None):
        eval_meters = EvalEpochMeters()

        # Switch to evaluate mode
        self.model.eval()
        self.criterion.eval()

        for i, data in enumerate(self.test_loader):

            end = time.time()
            (input, target) = data

            input = input.to(self.device, non_blocking=True)
            target = target.to(self.device, non_blocking=True)

            # For hinge loss only: expand the target to a +/-1 one-hot encoding
            if isinstance(self.criterion, SqrHingeLoss):
                target = target.unsqueeze(1)
                target_onehot = torch.Tensor(target.size(0), self.num_classes).to(
                    self.device, non_blocking=True)
                target_onehot.fill_(-1)
                target_onehot.scatter_(1, target, 1)
                target = target.squeeze()
                target_var = target_onehot
            else:
                target_var = target

            # Compute output
            output = self.model(input)

            # Measure model elapsed time
            eval_meters.model_time.update(time.time() - end)
            end = time.time()

            # Compute loss
            loss = self.criterion(output, target_var)
            eval_meters.loss_time.update(time.time() - end)

            pred = output.data.argmax(1, keepdim=True)
            correct = pred.eq(target.data.view_as(pred)).sum()
            prec1 = 100. * correct.float() / input.size(0)

            _, prec5 = accuracy(output, target, topk=(1, 5))
            eval_meters.losses.update(loss.item(), input.size(0))
            eval_meters.top1.update(prec1.item(), input.size(0))
            eval_meters.top5.update(prec5.item(), input.size(0))

            # Eval batch ends
            self.logger.eval_batch_cli_log(eval_meters, i, len(self.test_loader))

        return eval_meters.top1.avg
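
# A minimal standalone sketch of the +/-1 one-hot target encoding used in the
# SqrHingeLoss branches above: each row is filled with -1, then the true-class
# column is set to +1 via scatter_. The tensor values here are illustrative.
import torch

targets = torch.tensor([2, 0, 1])            # class indices, shape (3,)
num_classes = 4
onehot = torch.full((targets.size(0), num_classes), -1.0)
onehot.scatter_(1, targets.unsqueeze(1), 1)  # set column targets[i] of row i to +1
# onehot is now:
# tensor([[-1., -1.,  1., -1.],
#         [ 1., -1., -1., -1.],
#         [-1.,  1., -1., -1.]])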
def __init__(self, args):
    self.input_shape = None
    model, cfg = model_with_cfg(args.network, args.pretrained)

    # Init arguments
    self.args = args
    self.weight_bit_width = cfg.getint('QUANT', 'WEIGHT_BIT_WIDTH')
    self.act_bit_width = cfg.getint('QUANT', 'ACT_BIT_WIDTH')
    self.input_bit_width = cfg.getint('QUANT', 'IN_BIT_WIDTH')
    self.in_channels = cfg.getint('MODEL', 'IN_CHANNELS')
    prec_name = "_{}W{}A".format(self.weight_bit_width, self.act_bit_width)
    experiment_name = '{}{}_{}'.format(
        args.network, prec_name, datetime.now().strftime('%Y%m%d_%H%M%S'))
    self.output_dir_path = os.path.join(args.experiments, experiment_name)

    if self.args.resume:
        self.output_dir_path, _ = os.path.split(args.resume)
        self.output_dir_path, _ = os.path.split(self.output_dir_path)

    if not args.dry_run:
        self.checkpoints_dir_path = os.path.join(self.output_dir_path, 'checkpoints')
        if not args.resume:
            os.mkdir(self.output_dir_path)
            os.mkdir(self.checkpoints_dir_path)
    self.logger = Logger(self.output_dir_path, args.dry_run)
    self.exporter = Exporter()

    # Randomness
    random.seed(args.random_seed)
    torch.manual_seed(args.random_seed)
    torch.cuda.manual_seed_all(args.random_seed)

    # Datasets
    transform_to_tensor = transforms.Compose([transforms.ToTensor()])
    dataset = cfg.get('MODEL', 'DATASET')
    self.num_classes = cfg.getint('MODEL', 'NUM_CLASSES')
    if dataset == 'CIFAR10':
        train_transforms_list = [
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor()]
        transform_train = transforms.Compose(train_transforms_list)
        builder = CIFAR10
    elif dataset == 'MNIST':
        transform_train = transform_to_tensor
        builder = MNIST
    else:
        raise Exception("Dataset not supported: {}".format(dataset))

    train_set = builder(root=args.datadir, train=True, download=True, transform=transform_train)
    test_set = builder(root=args.datadir, train=False, download=True, transform=transform_to_tensor)
    self.train_loader = DataLoader(
        train_set, batch_size=args.batch_size, shuffle=True, num_workers=args.num_workers)
    self.test_loader = DataLoader(
        test_set, batch_size=args.batch_size, shuffle=False, num_workers=args.num_workers)

    # Record the input shape (with batch size 1) from the first training batch
    imgs, _ = next(iter(self.train_loader))
    self.input_shape = list(imgs.shape)
    self.input_shape[0] = 1

    # Init starting values
    self.starting_epoch = 1
    self.best_val_acc = 0

    # Setup device
    if args.gpus is not None:
        args.gpus = [int(i) for i in args.gpus.split(',')]
        self.device = 'cuda:' + str(args.gpus[0])
        torch.backends.cudnn.benchmark = True
    else:
        self.device = 'cpu'
    self.device = torch.device(self.device)

    # Resume checkpoint, if any
    if args.resume:
        print('Loading model checkpoint at: {}'.format(args.resume))
        package = torch.load(args.resume, map_location='cpu')
        model_state_dict = package['state_dict']
        model.load_state_dict(model_state_dict, strict=False)
    if args.gpus is not None and len(args.gpus) == 1:
        model = model.to(device=self.device)
    if args.gpus is not None and len(args.gpus) > 1:
        model = nn.DataParallel(model, args.gpus)
    self.model = model

    # Loss function
    if args.loss == 'SqrHinge':
        self.criterion = SqrHingeLoss()
    else:
        self.criterion = nn.CrossEntropyLoss()
    self.criterion = self.criterion.to(device=self.device)

    # Init optimizer
    if args.optim == 'ADAM':
        self.optimizer = optim.Adam(
            self.model.parameters(), lr=args.lr, weight_decay=args.weight_decay)
    elif args.optim == 'SGD':
        self.optimizer = optim.SGD(
            self.model.parameters(),
            lr=self.args.lr,
            momentum=self.args.momentum,
            weight_decay=self.args.weight_decay)

    # Resume optimizer, if any
    if args.resume and not args.evaluate:
        self.logger.log.info("Loading optimizer checkpoint")
        if 'optim_dict' in package.keys():
            self.optimizer.load_state_dict(package['optim_dict'])
        if 'epoch' in package.keys():
            self.starting_epoch = package['epoch']
        if 'best_val_acc' in package.keys():
            self.best_val_acc = package['best_val_acc']

    # LR scheduler
    if args.scheduler == 'STEP':
        milestones = [int(i) for i in args.milestones.split(',')]
        self.scheduler = MultiStepLR(
            optimizer=self.optimizer, milestones=milestones, gamma=0.1)
    elif args.scheduler == 'FIXED':
        self.scheduler = None
    else:
        raise Exception("Unrecognized scheduler {}".format(self.args.scheduler))

    # Resume scheduler, if any
    if args.resume and not args.evaluate and self.scheduler is not None:
        self.scheduler.last_epoch = package['epoch'] - 1