def _init_logger(args, script_dir):
    """Create the output directory and configure the module-level logger.

    Args:
        args: Parsed command-line arguments. Reads ``output_dir``, ``name``,
            ``verbose``, ``compress`` and ``qe_stats_file``.
        script_dir: Directory containing ``logging.conf``. The directory two
            levels above it is passed to the environment logger as the git
            root.

    Returns:
        The ``logdir`` attribute of the configured logger.
    """
    global msglogger
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))

    # exist_ok=True replaces the former "check, then create" sequence, which
    # could fail if another process created the directory in between.
    os.makedirs(args.output_dir, exist_ok=True)

    msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'),
                                         args.name, args.output_dir, args.verbose)

    # Log various details about the execution environment. It is sometimes
    # useful to refer to past experiment executions.
    config_paths = filter(None, [args.compress, args.qe_stats_file])  # drop None and empty strings
    apputils.log_execution_env_state(config_paths, msglogger.logdir, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)
    return msglogger.logdir
def main():
    """Run the image-classifier compression sample application.

    Parses the command line, configures logging and the compute device,
    builds (or restores) the model and optimizer, and then either runs one of
    the one-shot modes (AMC, greedy search, summary, sensitivity analysis,
    evaluation, thinning) or the train / validate / checkpoint loop followed
    by a final pass over the test set.

    Returns:
        The value returned by the selected one-shot mode; ``None`` after
        thinning or after a complete training run.

    Raises:
        ValueError: If ``--gpus`` is not a comma-separated list of integers,
            if a requested GPU index is outside ``[0, device_count)``, or if
            the starting epoch is not below the requested number of epochs.
    """
    global msglogger
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    # exist_ok=True avoids the race between checking for the directory and
    # creating it.
    os.makedirs(args.output_dir, exist_ok=True)
    msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'),
                                         args.name, args.output_dir)

    # Log various details about the execution environment. It is sometimes
    # useful to refer to past experiment executions.
    apputils.log_execution_env_state(args.compress, msglogger.logdir, gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.evaluate:
        args.deterministic = True
    if args.deterministic:
        # Experiment reproducibility is sometimes important. Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        distiller.set_deterministic()  # Use a well-known seed, for repeatability of experiments
    else:
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run.
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError as err:
                raise ValueError(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only') from err
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                # Negative IDs are rejected too: torch.cuda.set_device() silently
                # ignores them, so they would otherwise slip through validation.
                if not 0 <= dev_id < available_gpus:
                    raise ValueError('ERROR: GPU device ID {0} requested, but only {1} devices available'
                                     .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset == 'cifar10' else 1000

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = create_model(args.pretrained, args.dataset, args.arch,
                         parallel=not args.load_serialized, device_ids=args.gpus)
    compression_scheduler = None

    # Create a couple of logging backends. TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board. PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s', args.earlyexit_thresholds)

    # TODO(barrh): args.deprecated_resume is deprecated since v0.3.1
    if args.deprecated_resume:
        msglogger.warning('The "--resume" flag is deprecated. Please use "--resume-from=YOUR_PATH" instead.')
        if not args.reset_optimizer:
            msglogger.warning('If you wish to also reset the optimizer, call with: --reset-optimizer')
            args.reset_optimizer = True
        args.resumed_checkpoint_path = args.deprecated_resume

    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model, args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info('\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0')

    # Define loss function (criterion)
    criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is None:
        optimizer = torch.optim.SGD(model.parameters(), lr=args.lr,
                                    momentum=args.momentum, weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer, pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    activations_collectors = create_activation_stats_collectors(model, *args.activation_stats)

    if args.qe_calibration:
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info('\tStats will be collected for {:.1%} of test dataset'.format(args.qe_calibration))
        msglogger.info('\tSetting constant seeds and converting model to serialized execution')
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        activations_collectors.update(create_quantization_stats_collector(model))
        # Calibration is run as an evaluation pass over a fraction of the test set.
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch. The default dataset is ImageNet, but if args.arch contains the
    # substring "cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_split, args.deterministic,
        args.effective_train_size, args.effective_valid_size, args.effective_test_size)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler), len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0], args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger, args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger, activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch-1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model, compression_scheduler.zeros_mask_dict, args.arch, args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0, args.arch, model, optimizer=None, scheduler=compression_scheduler,
                                 name="{}_thinned".format(args.resumed_checkpoint_path.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print("Note: your model may have collapsed to random inference, so you may want to fine-tune")
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained, args.dataset, args.kd_teacher, device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_lean_checkpoint(teacher, args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt, args.kd_student_wt, args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy, starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs, frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if start_epoch >= ending_epoch:
        msglogger.error(
            'epoch count is too low, starting epoch is {} but total epochs set to {}'.format(
                start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    # Validation loss handed to the scheduler at the start of each epoch. No
    # validation has run before the first epoch, so a large placeholder is used.
    vloss = 10**6
    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch, metrics=vloss)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader, model, criterion, optimizer, epoch, compression_scheduler,
                  loggers=[tflogger, pylogger], args=args)
            distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(epoch, "train", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion, [pylogger], args, epoch)
            distiller.log_activation_statsitics(epoch, "valid", loggers=[tflogger],
                                                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss),
                              ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats, None, epoch, steps_completed=0, total_steps=1, log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, top1, top5, epoch, args.num_best_scores)
        best_score = perf_scores_history[0]
        is_best = epoch == best_score.epoch
        checkpoint_extras = {'current_top1': top1,
                             'best_top1': best_score.top1,
                             'best_epoch': best_score.epoch}
        apputils.save_checkpoint(epoch, args.arch, model, optimizer=optimizer, scheduler=compression_scheduler,
                                 extras=checkpoint_extras, is_best=is_best, name=args.name,
                                 dir=msglogger.logdir)

    # Finally run results on the test set
    test(test_loader, model, criterion, [pylogger], activations_collectors, args=args)
torch.manual_seed(args.seed) cudnn.benchmark = False device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu") if args.resume: with open(args.resume, 'rb') as f: model = torch.load(f).to(device) # after load the rnn params are not a continuous chunk of memory # this makes them a continuous chunk, and will speed up forward pass model.rnn.flatten_parameters() else: model = LII_LSTM(args.model, args.input_size, args.nhid, args.nlayers, args.dropout).to(device) # Distiller loggers msglogger = apputils.config_pylogger('config/logging.conf', experiment_name=None, output_dir='logs') tflogger = TensorBoardLogger(msglogger.logdir) tflogger.log_gradients = True pylogger = PythonLogger(msglogger) def export_onnx(path, batch_size, seq_len): msglogger.info('The model is also exported in ONNX format at {}'. format(os.path.realpath(args.onnx_export))) model.eval() dummy_input = torch.LongTensor(seq_len * batch_size).zero_().view(-1, batch_size).to(device) hidden = model.init_hidden(batch_size) torch.onnx.export(model, (dummy_input, hidden), path) def draw_lang_model_to_file(model, png_fname): """Draw a language model graph to a PNG file.
def train(c, net, compression_scheduler=None):
    """Train ``net`` step by step, optionally driving a compression scheduler.

    Args:
        c: Experiment configuration object. This function reads, among others,
            ``train_batch``, ``eval_batch``, ``debug``, ``hebbian``,
            ``cutoffs``, ``n_vocab``, ``device``, ``train_chunk``,
            ``step_eval``, ``use_cache`` and ``main``, and calls its
            ``init_model``, ``log``, ``get``, ``setdefault`` and
            ``on_train_start`` / ``on_step_end`` / ``on_train_end`` hooks.
        net: Model to train. Called as ``net(inputs, labels)`` and expected to
            return a mapping containing at least ``'loss'``.
        compression_scheduler: Optional scheduler whose epoch and minibatch
            callbacks are invoked around each step. ``None`` disables
            compression callbacks.

    Returns:
        None. Exceptions raised inside the step loop are logged (via ``c.log``
        when ``c.main`` is truthy, otherwise printed) and not re-raised;
        ``c.on_train_end`` always runs.
    """
    import distiller.apputils as apputils
    from distiller.data_loggers import TensorBoardLogger, PythonLogger

    msglogger = apputils.config_pylogger('logging.conf', None)
    tflogger = TensorBoardLogger(msglogger.logdir)
    tflogger.log_gradients = True
    pylogger = PythonLogger(msglogger)
    c.setdefault(hebbian=False)

    emb_params = (count_params(net.embed)
                  + count_params(net.loss.projections)
                  + count_params(net.loss.clusters))
    opt = get_opt(c, net)
    net, opt, step = c.init_model(net, opt=opt, step='max', train=True)
    step_lr = scheduler(c, opt, step)
    data_tr = SampleIterator(c, c.train_batch, split='valid' if c.debug else 'train')
    iter_tr = iter(data_tr)
    data_val = SequentialIterator(c, c.eval_batch, split='valid')

    s = Namespace(net=net, opt=opt, step=step)
    c.on_train_start(s)

    c.log('Embedding has %s parameters' % emb_params)

    if c.get("steps_per_epoch"):
        steps_per_epoch = c.steps_per_epoch
    else:
        steps_per_epoch = len(data_tr.tokens) // data_tr.bs // c.train_chunk
    print("#### steps per epoch %d ####" % steps_per_epoch)

    if c.hebbian:
        # One counter tensor per vocabulary cluster delimited by c.cutoffs.
        counters = [torch.ones(end - start, dtype=torch.long, device=c.device)
                    for start, end in zip([0] + c.cutoffs, c.cutoffs + [c.n_vocab])]
        temp_counters = [torch.zeros_like(x) for x in counters]

    # s.results and s.step_max are not set above; presumably they are
    # populated by c.on_train_start(s) -- verify against the config class.
    # NOTE(review): the baseline uses .max() and is never updated inside the
    # loop -- confirm this is the intended "best" criterion.
    best_val_loss = np.inf
    if s.results is not None and 'val_loss' in s.results.columns:
        best_val_loss = s.results['val_loss'].dropna().max()

    try:
        while step < s.step_max:
            batch = step % steps_per_epoch
            epoch = step // steps_per_epoch
            if batch == 0:
                c.log("====> batch=%d, epoch=%d, step=%d" % (batch, epoch, step))
                if compression_scheduler:
                    compression_scheduler.on_epoch_begin(epoch)

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(epoch, minibatch_id=batch,
                                                         minibatches_per_epoch=steps_per_epoch)

            step_lr(step)

            x = to_torch(next(iter_tr), c.device).t()

            t_s = time()
            # Next-token prediction: targets are the inputs shifted by one position.
            inputs, labels = x[:-1], x[1:]
            preds = net(inputs, labels)
            loss = preds['loss']

            if compression_scheduler:
                # The returned value is intentionally ignored, as in the original code.
                compression_scheduler.before_backward_pass(epoch, minibatch_id=batch,
                                                           minibatches_per_epoch=steps_per_epoch,
                                                           loss=loss, return_loss_components=False)

            opt.zero_grad()
            if torch.isnan(loss):
                raise RuntimeError('Encountered nan loss during training')
            loss.backward()
            torch.nn.utils.clip_grad_norm_(net.parameters(), c.get('clip_grad', 0.5))
            opt.step()

            if c.hebbian:
                hebbian_weight_update(c, net, preds['hiddens'], counters, temp_counters)

            time_model = np.round(time() - t_s, 5)

            loss = from_torch(loss)
            # Perplexity is only reported once the loss is at most 5.
            perplexity = np.nan if loss > 5 else np.e ** loss
            step_result = pd.Series(dict(
                loss=loss,
                perplexity=perplexity,
                time=time_model
            )).add_prefix('train_')
            step_result['lr'] = next(iter(opt.param_groups))['lr']
            if c.use_cache:
                step_result['theta'] = preds['theta']
                step_result['lambda'] = preds['lambda'].item()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch, minibatch_id=batch,
                                                       minibatches_per_epoch=steps_per_epoch)
                # Close the epoch after its LAST minibatch. The previous check
                # (step % steps_per_epoch == 0) matched the first minibatch, so
                # on_epoch_end fired right after on_epoch_begin.
                if batch == steps_per_epoch - 1:
                    compression_scheduler.on_epoch_end(epoch)

            s.step = step = step + 1
            if step % c.step_eval == 0:
                distiller.log_weights_sparsity(net, epoch, loggers=[tflogger, pylogger])
                _, total = distiller.weights_sparsity_tbl_summary(net, return_total_sparsity=True)
                c.log("total sparsity: %.3lf" % total)

                # pd.concat replaces Series.append, which was removed in pandas 2.0.
                step_result = pd.concat([
                    step_result,
                    pd.Series(evaluate(c, data_val, net)).add_prefix('val_'),
                ])
                s.record_step = step_result['val_loss'] < best_val_loss
                clear_gpu_memory()
            s.step_result = step_result
            c.on_step_end(s)
    except Exception:
        # Best-effort: report the failure but still let on_train_end run.
        import traceback
        err = traceback.format_exc()
        if c.main:
            c.log(err)
        else:
            print(err)
    finally:
        c.on_train_end(s)
def main(args):
    """Train, evaluate or calibrate an object-detection model with compression.

    Sets up (optionally distributed) execution and logging, builds the data
    loaders, model, optimizer and LR scheduler, attaches a compression
    scheduler, and then runs one of: model summaries, quantization
    calibration, test-only evaluation, or the full per-epoch train /
    checkpoint / evaluate loop.

    Args:
        args: Parsed command-line arguments. Fields read here include
            ``device``, ``output_dir``, ``name``, ``verbose``, ``compress``,
            ``qe_stats_file``, ``dataset``, ``data_path``, ``distributed``,
            ``aspect_ratio_group_factor``, ``batch_size``, ``workers``,
            ``model``, ``pretrained``, ``summary``, ``gpu``, ``lr``,
            ``momentum``, ``weight_decay``, ``lr_steps``, ``lr_gamma``,
            ``qe_calibration``, ``resume``, ``test_only``,
            ``activation_stats``, ``start_epoch``, ``epochs``, ``print_freq``
            and ``masks_sparsity``.

    Returns:
        None.
    """
    utils.init_distributed_mode(args)
    print(args)

    device = torch.device(args.device)
    script_dir = os.path.dirname(__file__)

    # Every rank executes this line. exist_ok=True prevents a FileExistsError
    # when several processes try to create the directory at the same time.
    os.makedirs(args.output_dir, exist_ok=True)

    if utils.is_main_process():
        msglogger = apputils.config_pylogger(os.path.join(script_dir, 'logging.conf'),
                                             args.name, args.output_dir, args.verbose)

        # Log various details about the execution environment. It is sometimes
        # useful to refer to past experiment executions.
        apputils.log_execution_env_state(
            filter(None, [args.compress, args.qe_stats_file]),  # remove both None and empty strings
            msglogger.logdir)
        msglogger.debug("Distiller: %s", distiller.__version__)
    else:
        # Non-main ranks get the root logger with output disabled.
        msglogger = logging.getLogger()
        msglogger.disabled = True

    # Data loading code
    print("Loading data")
    dataset, num_classes = get_dataset(args.dataset, "train", get_transform(train=True), args.data_path)
    dataset_test, _ = get_dataset(args.dataset, "val", get_transform(train=False), args.data_path)

    print("Creating data loaders")
    if args.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(dataset)
        test_sampler = torch.utils.data.distributed.DistributedSampler(dataset_test)
    else:
        train_sampler = torch.utils.data.RandomSampler(dataset)
        test_sampler = torch.utils.data.SequentialSampler(dataset_test)

    if args.aspect_ratio_group_factor >= 0:
        group_ids = create_aspect_ratio_groups(dataset, k=args.aspect_ratio_group_factor)
        train_batch_sampler = GroupedBatchSampler(train_sampler, group_ids, args.batch_size)
    else:
        train_batch_sampler = torch.utils.data.BatchSampler(
            train_sampler, args.batch_size, drop_last=True)

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_sampler=train_batch_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    data_loader_test = torch.utils.data.DataLoader(
        dataset_test, batch_size=1,
        sampler=test_sampler, num_workers=args.workers,
        collate_fn=utils.collate_fn)

    print("Creating model")
    model = detection.__dict__[args.model](num_classes=num_classes,
                                           pretrained=args.pretrained)
    patch_fastrcnn(model)
    model.to(device)

    if args.summary:
        if utils.is_main_process():
            for summary in args.summary:
                distiller.model_summary(model, summary, args.dataset)
        return

    # Keep a handle on the unwrapped module for state-dict save/load.
    model_without_ddp = model
    if args.distributed:
        model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[args.gpu])
        model_without_ddp = model.module

    params = [p for p in model.parameters() if p.requires_grad]
    optimizer = torch.optim.SGD(
        params, lr=args.lr, momentum=args.momentum, weight_decay=args.weight_decay)
    lr_scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=args.lr_steps,
                                                        gamma=args.lr_gamma)

    compression_scheduler = None
    if utils.is_main_process():
        # Create a couple of logging backends. TensorBoardLogger writes log files in a format
        # that can be read by Google's Tensor Board. PythonLogger writes to the Python logger.
        # These are only created, and only used below, on the main process.
        tflogger = TensorBoardLogger(msglogger.logdir)
        pylogger = PythonLogger(msglogger)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer, args.compress,
                                                      compression_scheduler, None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.qe_calibration:
        def test_fn(model):
            return evaluate(model, data_loader_test, device=device)

        # `.transform` is skipped because it is a pre-processing unit.
        collect_quant_stats(model_without_ddp, test_fn, save_dir=args.output_dir,
                            modules_to_collect=['backbone', 'rpn', 'roi_heads'])
        return

    if args.resume:
        # NOTE(review): torch.load unpickles the file; only resume from
        # checkpoints you trust.
        checkpoint = torch.load(args.resume, map_location='cpu')
        model_without_ddp.load_state_dict(checkpoint['model'])
        optimizer.load_state_dict(checkpoint['optimizer'])
        lr_scheduler.load_state_dict(checkpoint['lr_scheduler'])
        if compression_scheduler and 'compression_scheduler' in checkpoint:
            compression_scheduler.load_state_dict(checkpoint['compression_scheduler'])

    if args.test_only:
        evaluate(model, data_loader_test, device=device)
        return

    activations_collectors = create_activation_stats_collectors(model, *args.activation_stats)

    print("Start training")
    start_time = time.time()

    for epoch in range(args.start_epoch, args.epochs):
        if args.distributed:
            train_sampler.set_epoch(epoch)
            dist.barrier()
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        with collectors_context(activations_collectors["train"]) as collectors:
            train_one_epoch(model, optimizer, data_loader, device, epoch, args.print_freq,
                            compression_scheduler)
            if utils.is_main_process():
                distiller.log_weights_sparsity(model, epoch, loggers=[tflogger, pylogger])
                distiller.log_activation_statsitics(epoch, "train", loggers=[tflogger],
                                                    collector=collectors["sparsity"])
            if args.masks_sparsity and utils.is_main_process():
                msglogger.info(distiller.masks_sparsity_tbl_summary(model, compression_scheduler))

        lr_scheduler.step()
        if args.output_dir:
            save_dict = {
                'model': model_without_ddp.state_dict(),
                'optimizer': optimizer.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
                'args': args}
            if compression_scheduler:
                save_dict['compression_scheduler'] = compression_scheduler.state_dict()
            utils.save_on_master(save_dict,
                                 os.path.join(args.output_dir, 'model_{}.pth'.format(epoch)))

        # evaluate after every epoch
        evaluate(model, data_loader_test, device=device)

    total_time = time.time() - start_time
    total_time_str = str(datetime.timedelta(seconds=int(total_time)))
    print('Training time {}'.format(total_time_str))
def train(args):
    """Train a German-to-English Transformer, optionally with compression.

    Builds the source/target fields and vocabularies, constructs the model,
    loss and iterators, and runs ``args.epoch`` epochs of training followed by
    validation. After every epoch the latest weights are saved; the weights
    with the best validation BLEU so far are saved separately.

    Args:
        args: Parsed command-line arguments. Fields read here: ``lower``,
            ``dataset``, ``max_length``, ``min_freq``, ``num_blocks``,
            ``hidden_dim``, ``ff_dim``, ``num_heads``, ``dropout``,
            ``load_model``, ``batch_size``, ``summary``, ``compress``,
            ``epoch``, ``save_to`` and ``exp_name``.

    Returns:
        None.

    Raises:
        SystemExit: With code 0 after printing the sparsity table when
            ``args.summary`` is set.
    """
    SRC = data.Field(tokenize=tokenize_de, pad_token=BLANK_WORD, lower=args.lower)
    TGT = data.Field(tokenize=tokenize_en, init_token=BOS_WORD, eos_token=EOS_WORD,
                     pad_token=BLANK_WORD, lower=args.lower)

    def within_max_length(example):
        """Keep only pairs whose source and target both fit in max_length."""
        fields = vars(example)
        return len(fields['src']) <= args.max_length and len(fields['trg']) <= args.max_length

    # Load IWSLT Data ---> German to English Translation
    corpus = datasets.IWSLT if args.dataset == 'IWSLT' else datasets.Multi30k
    # The test split is loaded but not used by this function.
    train_data, val_data, _test_data = corpus.splits(
        exts=('.de', '.en'), fields=(SRC, TGT), filter_pred=within_max_length)

    # Frequency of words in the vocabulary
    SRC.build_vocab(train_data.src, min_freq=args.min_freq)
    TGT.build_vocab(train_data.trg, min_freq=args.min_freq)

    print("Size of source vocabulary:", len(SRC.vocab))
    print("Size of target vocabulary:", len(TGT.vocab))

    pad_idx = TGT.vocab.stoi[BLANK_WORD]

    model = make_model(len(SRC.vocab), len(TGT.vocab), n=args.num_blocks, d_model=args.hidden_dim,
                       d_ff=args.ff_dim, h=args.num_heads, dropout=args.dropout)
    print("Model made with n:", args.num_blocks, "hidden_dim:", args.hidden_dim,
          "feed forward dim:", args.ff_dim, "heads:", args.num_heads, "dropout:", args.dropout)

    params = sum(np.prod(p.size()) for p in model.parameters() if p.requires_grad)
    print("Number of parameters: ", params)

    if args.load_model:
        print("Loading model from [%s]" % args.load_model)
        model.load_state_dict(torch.load(args.load_model))

    # UNCOMMENT WHEN RUNNING ON RESEARCH MACHINES - run on GPU
    # model.cuda()

    # Used by original authors, hurts perplexity but improves BLEU score
    criterion = LabelSmoothing(size=len(TGT.vocab), padding_idx=pad_idx, smoothing=0.1)

    # UNCOMMENT WHEN RUNNING ON RESEARCH MACHINES - run on GPU
    # criterion.cuda()

    def length_key(example):
        return (len(example.src), len(example.trg))

    train_iter = MyIterator(train_data, batch_size=args.batch_size, device=0, repeat=False,
                            sort_key=length_key, batch_size_fn=batch_size_fn, train=True)
    valid_iter = MyIterator(val_data, batch_size=args.batch_size, device=0, repeat=False,
                            sort_key=length_key, batch_size_fn=batch_size_fn, train=False,
                            sort=False)
    model_par = nn.DataParallel(model, device_ids=devices)

    # Use standard optimizer -- As used in the paper
    model_opt = get_std_opt(model)

    # PRUNING CODE
    if args.summary:
        df = distiller.weights_sparsity_tbl_summary(model, False)
        print(df)
        raise SystemExit(0)

    # The TensorBoard/Python loggers are not referenced again in this function;
    # they are still constructed because construction may have side effects.
    msglogger = apputils.config_pylogger('logging.conf', None)
    tflogger = TensorBoardLogger(msglogger.logdir)
    tflogger.log_gradients = True
    pylogger = PythonLogger(msglogger)

    # Must be bound even when no schedule is given: the epoch loop tests it
    # unconditionally, which previously raised UnboundLocalError.
    compression_scheduler = None
    if args.compress:
        compression_scheduler = distiller.config.file_config(
            model_par.module, None, args.compress)
        print(model_par.module)

    best_bleu = 0
    best_epoch = 0

    # NOTE(review): the divisor 60 is hard-coded; presumably a nominal batch
    # count/size for this setup -- confirm.
    steps_per_epoch = math.ceil(len(train_iter.data()) / 60)

    for epoch in range(args.epoch):
        print("=" * 80)
        print("Epoch ", epoch + 1)
        print("=" * 80)
        print("Training...")
        model_par.train()

        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # IF PRUNING
        run_epoch((rebatch(pad_idx, b) for b in train_iter), model_par,
                  MultiGPULossCompute(model.generator, criterion, devices=devices, opt=model_opt),
                  args, epoch, steps_per_epoch, compression_scheduler,
                  SRC, TGT, valid_iter, is_valid=False)

        print("Validation...")
        model_par.eval()

        # IF PRUNING
        loss = run_epoch((rebatch(pad_idx, b) for b in valid_iter), model_par,
                         MultiGPULossCompute(model.generator, criterion, devices=devices, opt=None),
                         args, epoch, steps_per_epoch, compression_scheduler,
                         SRC, TGT, valid_iter, is_valid=True)

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch)

        print('Validation loss:', loss)
        print('Validation perplexity: ', np.exp(loss))
        bleu_score = run_validation_bleu_score(model, SRC, TGT, valid_iter)

        if best_bleu < bleu_score:
            best_bleu = bleu_score
            model_file = args.save_to + args.exp_name + 'validation.bin'
            print('Saving model without optimizer [%s]' % model_file)
            torch.save(model_par.module.state_dict(), model_file)
            best_epoch = epoch

        model_file = args.save_to + args.exp_name + 'latest.bin'
        print('Saving latest model without optimizer [%s]' % model_file)
        torch.save(model_par.module.state_dict(), model_file)

    print('The best epoch was:', best_epoch)
args.nlayers, args.dropout, args.tied).to(device) criterion = nn.CrossEntropyLoss() optimizer = torch.optim.SGD(model.parameters(), args.lr, momentum=args.momentum, weight_decay=args.weight_decay) lr_scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=0, verbose=True, factor=0.5) # </editor-fold> # <editor-fold desc=">>> Loggers & Summary"> msglogger = apputils.config_pylogger('logging.conf', None) tflogger = TensorBoardLogger(msglogger.logdir) tflogger.log_gradients = True pylogger = PythonLogger(msglogger) if args.summary: which_summary = args.summary if which_summary == 'png': draw_lang_model_to_file(model, 'rnn.png', 'wikitext2') elif which_summary == 'percentile': percentile = 0.9 for name, param in model.state_dict().items(): if param.dim() < 2: # Skip biases continue bottomk, _ = torch.topk(param.abs().view(-1),
def main():
    """Run the image-classifier compression sample application.

    Parses the command line, configures logging and the compute device,
    builds the model (optionally resuming from a checkpoint) and then either
    dispatches to one of the utility modes (AMC, greedy search, summary,
    sensitivity analysis, evaluation, thinnify) or runs the
    train / validate loop followed by a final pass over the test set.

    Returns:
        Whatever the selected utility mode returns, or ``None`` after a
        training run or a thinnify run.
    """
    global msglogger
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))

    # Parse arguments
    args = parser.get_parser().parse_args()

    # exist_ok avoids the check-then-create race when several runs start
    # with the same output directory.
    os.makedirs(args.output_dir, exist_ok=True)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(args.compress,
                                     msglogger.logdir,
                                     gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    # One slot per tracked "best" score; kept sorted ascending by top1 below.
    best_epochs = [
        distiller.MutableNamedTuple({'epoch': 0, 'top1': 0, 'sparsity': 0})
        for _ in range(args.num_best_scores)
    ]

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1'
            )
            exit(1)
        # Use a well-known seed, for repeatability of experiments
        distiller.set_deterministic()
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                msglogger.error(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
                exit(1)
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    msglogger.error(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
                    exit(1)
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset == 'cifar10' else 1000

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = create_model(args.pretrained,
                         args.dataset,
                         args.arch,
                         parallel=not args.load_serialized,
                         device_ids=args.gpus)
    compression_scheduler = None
    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    if args.resume:
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
        model.to(args.device)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(args.device)
    optimizer = torch.optim.SGD(model.parameters(),
                                lr=args.lr,
                                momentum=args.momentum,
                                weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    if args.qe_calibration:
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info(
            '\tStats will be collected for {:.1%} of test dataset'.format(
                args.qe_calibration))
        msglogger.info(
            '\tSetting constant seeds and converting model to serialized execution'
        )
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        activations_collectors.update(
            create_quantization_stats_collector(model))
        # Calibration is performed through the evaluation path below.
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.workers, args.validation_split, args.deterministic,
        args.effective_train_size, args.effective_valid_size,
        args.effective_test_size)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress,
                                                      compression_scheduler)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        #zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resume is not None, "You must use --resume to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resume.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained,
                               args.dataset,
                               args.kd_teacher,
                               device_ids=args.gpus)
        if args.kd_resume:
            teacher, _, _ = apputils.load_checkpoint(teacher,
                                                     chkpt_file=args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        # Tag was previously misspelled 'Peformance/Validation/'.
        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint.
        # is_best is decided against the current maximum (last slot) before the
        # list is updated.
        is_best = top1 > best_epochs[-1].top1
        if top1 > best_epochs[0].top1:
            # Overwrite the weakest tracked score with the new one.
            best_epochs[0].epoch = epoch
            best_epochs[0].top1 = top1
            # Keep best_epochs sorted such that best_epochs[0] is the lowest top1 in the best_epochs list
            best_epochs = sorted(best_epochs, key=lambda score: score.top1)
        for score in reversed(best_epochs):
            if score.top1 > 0:
                msglogger.info('==> Best Top1: %.3f on Epoch: %d', score.top1,
                               score.epoch)
        apputils.save_checkpoint(epoch, args.arch, model, optimizer,
                                 compression_scheduler, best_epochs[-1].top1,
                                 is_best, args.name, msglogger.logdir)

    # Finally run results on the test set
    test(test_loader,
         model,
         criterion, [pylogger],
         activations_collectors,
         args=args)
def main():
    """Set up a classifier run and, if requested, summarize or evaluate it.

    Parses the command line, prepares logging and the compute device, builds
    the model (optionally restoring it from a checkpoint), creates the loss
    and optimizer, loads the datasets, and dispatches to the summary or
    evaluation mode.

    Returns:
        The result of ``summarize_model`` or ``evaluate_model`` when the
        matching flag is set; otherwise ``None``.
    """
    global msglogger
    here = os.path.dirname(__file__)
    repo_root = os.path.abspath(os.path.join(here, '..', '..'))

    # Command-line handling; training length defaults to 90 epochs.
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logging_conf = os.path.join(here, 'logging.conf')
    msglogger = apputils.config_pylogger(logging_conf, args.name,
                                         args.output_dir)

    # Record the execution environment next to the logs so that past
    # experiments can be referred back to.
    apputils.log_execution_env_state(args.compress,
                                     msglogger.logdir,
                                     gitroot=repo_root)
    msglogger.debug("Distiller: %s", distiller.__version__)

    # Bookkeeping values; not consumed on the paths visible in this function.
    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    # Evaluation runs are always made deterministic.
    if args.evaluate:
        args.deterministic = True
    if not args.deterministic:
        # CUDNN benchmark mode gives the best performance and is usually
        # "safe" for image classification, where input sizes are constant.
        # See: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True
    else:
        # Reproducibility matters; see
        # https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        distiller.set_deterministic()  # well-known seed for repeatability

    if not args.cpu and torch.cuda.is_available():
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = list(map(int, args.gpus.split(',')))
            except ValueError:
                raise ValueError(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
            gpu_count = torch.cuda.device_count()
            # First requested id that exceeds what the machine offers, if any.
            offending = next((gpu for gpu in args.gpus if gpu >= gpu_count),
                             None)
            if offending is not None:
                raise ValueError(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(offending, gpu_count))
            # The first listed device becomes the default one.
            torch.cuda.set_device(args.gpus[0])
    else:
        # CPU execution is flagged with a GPU index of -1.
        args.device = 'cpu'
        args.gpus = -1

    # The dataset (and hence the class count) is inferred from the model name.
    if 'cifar' in args.arch:
        args.dataset = 'cifar10'
    else:
        args.dataset = 'imagenet'
    if args.dataset == 'cifar10':
        args.num_classes = 10
    else:
        args.num_classes = 1000

    # Build the model, then optionally restore state from disk.
    model = create_model(args.pretrained,
                         args.dataset,
                         args.arch,
                         parallel=not args.load_serialized,
                         device_ids=args.gpus)
    compression_scheduler = None
    optimizer = None

    if args.resumed_checkpoint_path:
        restored = apputils.load_checkpoint(model,
                                            args.resumed_checkpoint_path,
                                            use_swa_model=args.use_swa_model,
                                            model_device=args.device)
        model, compression_scheduler, optimizer, start_epoch = restored
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model,
                                              args.load_model_path,
                                              model_device=args.device)

    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info(
                '\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0'
            )

    # Logging backends: TensorBoardLogger writes files readable by Tensor
    # Board, PythonLogger forwards to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # Loss function (criterion)
    criterion = nn.CrossEntropyLoss().to(args.device)

    # Only build a fresh optimizer when none was restored from a checkpoint.
    if optimizer is None:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    # Summary-report mode.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    if args.qe_calibration:
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info(
            '\tStats will be collected for {:.1%} of test dataset'.format(
                args.qe_calibration))
        msglogger.info(
            '\tSetting constant seeds and converting model to serialized execution'
        )
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        quant_collectors = create_quantization_stats_collector(model)
        activations_collectors.update(quant_collectors)
        # Calibration goes through the evaluation path below.
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets: the dataset is the one inferred from args.arch above
    # (ImageNet by default, cifar10 when the name mentions cifar).
    data_root = os.path.expanduser(args.data)
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, data_root, args.batch_size, args.workers,
        args.validation_split, args.deterministic, args.effective_train_size,
        args.effective_valid_size, args.effective_test_size)
    n_train = len(train_loader.sampler)
    n_valid = len(val_loader.sampler)
    n_test = len(test_loader.sampler)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   n_train, n_valid, n_test)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)
def main():
    """Train or evaluate the NeuMF recommendation model.

    Parses the command line, configures logging and the GPU, saves the run
    configuration, loads the rating data, and builds the model.  Depending on
    the flags it then collects quantization-calibration stats or activation
    histograms, runs a one-off evaluation (optionally post-training
    quantized), or runs the training loop with per-epoch validation and
    checkpointing.

    Returns:
        ``0`` from the stats-collection and evaluation modes; ``None`` after
        a training run.
    """
    global msglogger

    args = parse_args()

    # Distiller loggers
    msglogger = apputils.config_pylogger('logging.conf',
                                         args.name,
                                         output_dir=args.output_dir)
    tflogger = TensorBoardLogger(msglogger.logdir)
    # tflogger.log_gradients = True
    # pylogger = PythonLogger(msglogger)

    if args.seed is not None:
        msglogger.info("Using seed = {}".format(args.seed))
        torch.manual_seed(args.seed)
        np.random.seed(seed=args.seed)

    # Keep only the part after the first '.' of the string form of these two
    # options (presumably enum members such as 'Mode.NAME' -- TODO confirm).
    args.qe_mode = str(args.qe_mode).split('.')[1]
    args.qe_clip_acts = str(args.qe_clip_acts).split('.')[1]

    apputils.log_execution_env_state(sys.argv)

    if args.gpus is not None:
        try:
            args.gpus = [int(s) for s in args.gpus.split(',')]
        except ValueError:
            msglogger.error(
                'ERROR: Argument --gpus must be a comma-separated list of integers only'
            )
            exit(1)
        if len(args.gpus) > 1:
            msglogger.error('ERROR: Only single GPU supported for NCF')
            exit(1)
        available_gpus = torch.cuda.device_count()
        for dev_id in args.gpus:
            if dev_id >= available_gpus:
                msglogger.error(
                    'ERROR: GPU device ID {0} requested, but only {1} devices available'
                    .format(dev_id, available_gpus))
                exit(1)
        # Set default device in case the first one on the list != 0
        torch.cuda.set_device(args.gpus[0])

    # Save configuration to file
    config = dict(vars(args))
    # time.time() is the true epoch time.  The previous
    # datetime.utcnow().timestamp() treated the naive UTC value as local time
    # and was therefore off by the UTC offset on non-UTC hosts.
    config['timestamp'] = "{:.0f}".format(time.time())
    config['local_timestamp'] = str(datetime.now())
    run_dir = msglogger.logdir
    msglogger.info("Saving config and results to {}".format(run_dir))
    if not os.path.exists(run_dir) and run_dir != '':
        os.makedirs(run_dir)
    utils.save_config(config, run_dir)

    # Check that GPUs are actually available
    use_cuda = not args.no_cuda and torch.cuda.is_available()

    t1 = time.time()
    # Load Data
    training = not (args.eval or args.qe_calibration
                    or args.activation_histograms)
    msglogger.info('Loading data')
    if training:
        train_dataset = CFTrainDataset(
            os.path.join(args.data, TRAIN_RATINGS_FILENAME),
            args.negative_samples)
        train_dataloader = torch.utils.data.DataLoader(
            dataset=train_dataset,
            batch_size=args.batch_size,
            shuffle=True,
            num_workers=args.workers,
            pin_memory=True)
        nb_users, nb_items = train_dataset.nb_users, train_dataset.nb_items
    else:
        train_dataset = None
        train_dataloader = None
        # Hard-coded user/item counts used when the training set is not
        # loaded (presumably the MovieLens-20M sizes -- TODO confirm).
        nb_users, nb_items = (138493, 26744)

    test_ratings = load_test_ratings(
        os.path.join(args.data, TEST_RATINGS_FILENAME))  # noqa: E501
    test_negs = load_test_negs(os.path.join(args.data, TEST_NEG_FILENAME))

    msglogger.info(
        'Load data done [%.1f s]. #user=%d, #item=%d, #train=%s, #test=%d' %
        (time.time() - t1, nb_users, nb_items,
         str(train_dataset.mat.nnz) if training else 'N/A',
         len(test_ratings)))

    # Create model
    model = NeuMF(nb_users,
                  nb_items,
                  mf_dim=args.factors,
                  mf_reg=0.,
                  mlp_layer_sizes=args.layers,
                  mlp_layer_regs=[0. for _ in args.layers],
                  split_final=args.split_final)
    if use_cuda:
        model = model.cuda()
    msglogger.info(model)
    msglogger.info("{} parameters".format(utils.count_parameters(model)))

    # Save model text description
    with open(os.path.join(run_dir, 'model.txt'), 'w') as file:
        file.write(str(model))

    compression_scheduler = None
    start_epoch = 0
    optimizer = None
    if args.load:
        if training:
            model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
                model, args.load)
            if args.reset_optimizer:
                start_epoch = 0
                optimizer = None
        else:
            model = apputils.load_lean_checkpoint(model, args.load)

    # Add loss to graph
    criterion = nn.BCEWithLogitsLoss()
    if use_cuda:
        criterion = criterion.cuda()

    if training and optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=args.learning_rate)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.compress:
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress)
        # Re-transfer the model after the scheduler was configured, but only
        # when CUDA is in use; an unconditional .cuda() breaks CPU-only runs.
        if use_cuda:
            model.cuda()

    # Create files for tracking training
    valid_results_file = os.path.join(run_dir, 'valid_results.csv')

    if args.qe_calibration or args.activation_histograms:
        calib = {
            'portion': args.qe_calibration,
            'desc_str': 'quantization calibration stats',
            'collect_func': partial(distiller.data_loggers.collect_quant_stats,
                                    inplace_runtime_check=True,
                                    disable_inplace_attrs=True)
        }
        hists = {
            'portion': args.activation_histograms,
            'desc_str': 'activation histograms',
            'collect_func': partial(distiller.data_loggers.collect_histograms,
                                    activation_stats=None,
                                    nbins=2048,
                                    save_hist_imgs=True)
        }
        # Calibration takes precedence when both options are given.
        d = calib if args.qe_calibration else hists

        distiller.utils.assign_layer_fq_names(model)
        num_users = int(np.floor(len(test_ratings) * d['portion']))
        msglogger.info(
            "Generating {} based on {:.1%} of the test-set ({} users)".format(
                d['desc_str'], d['portion'], num_users))

        test_fn = partial(val_epoch,
                          ratings=test_ratings,
                          negs=test_negs,
                          K=args.topk,
                          use_cuda=use_cuda,
                          processes=args.processes,
                          num_users=num_users)
        d['collect_func'](model=model,
                          test_fn=test_fn,
                          save_dir=run_dir,
                          classes=None)

        return 0

    if args.eval:
        if args.quantize_eval and args.qe_calibration is None:
            # The model is prepared for quantization on the CPU and moved
            # back afterwards (only if CUDA is in use).
            model.cpu()
            quantizer = quantization.PostTrainLinearQuantizer.from_args(
                model, args)
            dummy_input = (torch.tensor([1]), torch.tensor([1]),
                           torch.tensor([True], dtype=torch.bool))
            quantizer.prepare_model(dummy_input)
            if use_cuda:
                model.cuda()

        # NOTE(review): the source formatting was collapsed; confirm this call
        # belongs at this level rather than inside the quantization branch.
        distiller.utils.assign_layer_fq_names(model)

        if args.eval_fp16:
            model = model.half()

        # Calculate initial Hit Ratio and NDCG
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                processes=args.processes)
        val_time = time.time() - begin
        hit_rate = np.mean(hits)
        msglogger.info(
            'Initial HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, val_time = {val_time:.2f}'
            .format(K=args.topk,
                    hit_rate=hit_rate,
                    ndcg=np.mean(ndcgs),
                    val_time=val_time))

        if args.quantize_eval:
            # Store the measured hit rate with the quantized checkpoint.
            # It was previously reset to 0 just before being saved.
            checkpoint_name = 'quantized'
            apputils.save_checkpoint(
                0,
                'NCF',
                model,
                optimizer=None,
                extras={'quantized_hr@10': hit_rate},
                name='_'.join([args.name, 'quantized'])
                if args.name else checkpoint_name,
                dir=msglogger.logdir)
        return 0

    total_samples = len(train_dataloader.sampler)
    steps_per_epoch = math.ceil(total_samples / args.batch_size)
    best_hit_rate = 0
    best_epoch = 0
    for epoch in range(start_epoch, args.epochs):
        msglogger.info('')
        model.train()
        losses = utils.AverageMeter()

        begin = time.time()

        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch, optimizer)

        loader = tqdm.tqdm(train_dataloader)
        for batch_index, (user, item, label) in enumerate(loader):
            # The deprecated torch.autograd.Variable wrappers were dropped;
            # the batch tensors are used directly.
            if use_cuda:
                # 'async' is a reserved word since Python 3.7; the keyword
                # argument is spelled non_blocking.
                user = user.cuda(non_blocking=True)
                item = item.cuda(non_blocking=True)
                label = label.cuda(non_blocking=True)

            if compression_scheduler:
                compression_scheduler.on_minibatch_begin(
                    epoch, batch_index, steps_per_epoch, optimizer)

            outputs = model(user, item, torch.tensor([False],
                                                     dtype=torch.bool))
            loss = criterion(outputs, label)

            if compression_scheduler:
                compression_scheduler.before_backward_pass(
                    epoch,
                    batch_index,
                    steps_per_epoch,
                    loss,
                    optimizer,
                    return_loss_components=False)

            losses.update(loss.item(), user.size(0))

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if compression_scheduler:
                compression_scheduler.on_minibatch_end(epoch, batch_index,
                                                       steps_per_epoch,
                                                       optimizer)

            # Show the running loss on the progress bar
            description = (
                'Epoch {} Loss {loss.val:.4f} ({loss.avg:.4f})'.format(
                    epoch, loss=losses))
            loader.set_description(description)

            steps_completed = batch_index + 1
            if steps_completed % args.log_freq == 0:
                stats_dict = OrderedDict()
                stats_dict['Loss'] = losses.avg
                stats = ('Performance/Training/', stats_dict)
                params = model.named_parameters(
                ) if args.log_params_histograms else None
                distiller.log_training_progress(stats, params, epoch,
                                                steps_completed,
                                                steps_per_epoch, args.log_freq,
                                                [tflogger])

                # NOTE(review): the source formatting was collapsed; confirm
                # this call is meant to be gated by the log-frequency check.
                tflogger.log_model_buffers(model,
                                           ['tracked_min', 'tracked_max'],
                                           'Quant/Train/Acts/TrackedMinMax',
                                           epoch, steps_completed,
                                           steps_per_epoch, args.log_freq)

        train_time = time.time() - begin
        begin = time.time()
        hits, ndcgs = val_epoch(model,
                                test_ratings,
                                test_negs,
                                args.topk,
                                use_cuda=use_cuda,
                                output=valid_results_file,
                                epoch=epoch,
                                processes=args.processes)
        val_time = time.time() - begin

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        hit_rate = np.mean(hits)
        mean_ndcgs = np.mean(ndcgs)

        stats_dict = OrderedDict()
        stats_dict['HR@{0}'.format(args.topk)] = hit_rate
        stats_dict['NDCG@{0}'.format(args.topk)] = mean_ndcgs
        stats = ('Performance/Validation/', stats_dict)
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        msglogger.info(
            'Epoch {epoch}: HR@{K} = {hit_rate:.4f}, NDCG@{K} = {ndcg:.4f}, AvgTrainLoss = {loss.avg:.4f}, '
            'train_time = {train_time:.2f}, val_time = {val_time:.2f}'.format(
                epoch=epoch,
                K=args.topk,
                hit_rate=hit_rate,
                ndcg=mean_ndcgs,
                loss=losses,
                train_time=train_time,
                val_time=val_time))

        is_best = False
        if hit_rate > best_hit_rate:
            best_hit_rate = hit_rate
            is_best = True
            best_epoch = epoch
        extras = {
            'current_hr@10': hit_rate,
            'best_hr@10': best_hit_rate,
            'best_epoch': best_epoch
        }
        apputils.save_checkpoint(epoch,
                                 'NCF',
                                 model,
                                 optimizer,
                                 compression_scheduler,
                                 extras,
                                 is_best,
                                 dir=run_dir)

        # Stop early once the requested hit-rate threshold is reached.
        if args.threshold is not None:
            if hit_rate >= args.threshold:
                msglogger.info("Hit threshold of {}".format(args.threshold))
                break
def main():
    """Run the compression sample application with mlflow-based evaluation.

    Parses the command line, configures logging, seeding and the compute
    device, builds the model (with an SSD-specific loss and parameter groups
    when the architecture name contains "ssd") and then either dispatches to
    one of the utility modes (AMC, greedy search, summaries, ONNX export,
    stats collection, sensitivity analysis, evaluation, thinnify) or runs the
    train / validate loop.  After training, the log directory is uploaded as
    mlflow artifacts and the "eval" mlflow entry point is launched on the
    last checkpoint path returned by ``save_checkpoint``.

    Returns:
        Whatever the selected utility mode returns, or ``None`` after a
        training run.

    Raises:
        ValueError: on a malformed or out-of-range ``--gpus`` argument, or
            when the starting epoch is not below the total epoch count.
        RuntimeError: when no checkpoint path was obtained during training,
            so the final evaluation has no model to load.
    """
    global msglogger
    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    # exist_ok avoids the check-then-create race when several runs start
    # with the same output directory.
    os.makedirs(args.output_dir, exist_ok=True)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir,
        args.verbose)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(
        filter(None, [args.compress, args.qe_stats_file
                      ]),  # remove both None and empty strings
        msglogger.logdir,
        gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    if args.evaluate:
        args.deterministic = True
    if args.deterministic:
        distiller.set_deterministic(
            args.seed)  # For experiment reproducibility
    else:
        if args.seed is not None:
            distiller.set_seed(args.seed)
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                raise ValueError(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    raise ValueError(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    args.dataset = distiller.apputils.classification_dataset_str_from_arch(
        args.arch)
    args.num_classes = distiller.apputils.classification_num_classes(
        args.dataset)

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model, config = create_model(args.pretrained,
                                 args.dataset,
                                 args.arch,
                                 parallel=not args.load_serialized,
                                 device_ids=args.gpus)
    compression_scheduler = None
    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # TODO(barrh): args.deprecated_resume is deprecated since v0.3.1
    if args.deprecated_resume:
        msglogger.warning(
            'The "--resume" flag is deprecated. Please use "--resume-from=YOUR_PATH" instead.'
        )
        if not args.reset_optimizer:
            msglogger.warning(
                'If you wish to also reset the optimizer, call with: --reset-optimizer'
            )
            args.reset_optimizer = True
        args.resumed_checkpoint_path = args.deprecated_resume

    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model,
                                              args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info(
                '\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0'
            )

    # Define loss function (criterion)
    if "ssd" in args.arch:
        neg_pos_ratio = 3
        criterion = MultiboxLoss(config.priors,
                                 iou_threshold=0.5,
                                 neg_pos_ratio=neg_pos_ratio,
                                 center_variance=0.1,
                                 size_variance=0.2,
                                 device=args.device,
                                 reduction="sum",
                                 class_reduction=True,
                                 verbose=0)
    else:
        criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is None:
        if "ssd" in args.arch:
            # Separate parameter groups for the SSD sub-networks; the first
            # two groups carry an explicit learning rate, the header group
            # falls back to the optimizer-level lr given below.
            base_net_lr = args.lr
            extra_layers_lr = args.lr
            params = [{
                'params': model.base_net.parameters(),
                'lr': base_net_lr
            }, {
                'params':
                itertools.chain(model.source_layer_add_ons.parameters(),
                                model.extras.parameters()),
                'lr':
                extra_layers_lr
            }, {
                'params':
                itertools.chain(model.regression_headers.parameters(),
                                model.classification_headers.parameters())
            }]
        else:
            params = model.parameters()
        optimizer = torch.optim.SGD(params,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        for summary in args.summary:
            distiller.model_summary(model, summary, args.dataset)
        return

    if args.export_onnx is not None:
        return distiller.export_img_classifier_to_onnx(model,
                                                       os.path.join(
                                                           msglogger.logdir,
                                                           args.export_onnx),
                                                       args.dataset,
                                                       add_softmax=True,
                                                       verbose=False)

    if args.qe_calibration:
        return acts_quant_stats_collection(model, criterion, pylogger, args)

    if args.activation_histograms:
        return acts_histogram_collection(model, criterion, pylogger, args)

    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = load_data(args, config=config)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        #zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resumed_checkpoint_path.replace(
                                         ".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher, _ = create_model(args.kd_pretrained,
                                  args.dataset,
                                  args.kd_teacher,
                                  parallel=not args.load_serialized,
                                  device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_lean_checkpoint(teacher, args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        # Keep a copy of the teacher in the log directory, written only once.
        raw_teacher_model_path = os.path.join(msglogger.logdir,
                                              "raw_teacher.pth.tar")
        if not os.path.exists(raw_teacher_model_path):
            teacher.save(raw_teacher_model_path)
            msglogger.info(Fore.CYAN + '\tRaw Teacher Model saved: {0}'.format(
                raw_teacher_model_path) + Style.RESET_ALL)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model,
            teacher,
            args.kd_temp,
            dlw,
            loss_type=args.kd_loss_type,
            focal_alpha=args.kd_focal_alpha,
            use_adaptive=args.kd_focal_adaptive,
            verbose=0)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if start_epoch >= ending_epoch:
        msglogger.error(
            'epoch count is too low, starting epoch is {} but total epochs set to {}'
            .format(start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    # Path of the most recent checkpoint reported by save_checkpoint.  It is
    # initialised here so that a failure on the very first save is detected
    # explicitly instead of surfacing later as an UnboundLocalError.
    raw_fullpath_best = None

    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            # No validation loss exists before the first epoch of this run,
            # so a large placeholder value is passed as the metric.
            compression_scheduler.on_epoch_begin(
                epoch, metrics=(vloss if (epoch != start_epoch) else 10**6))

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, top1, top5,
                                       epoch, args.num_best_scores)
        is_best = epoch == perf_scores_history[0].epoch
        checkpoint_extras = {
            'current_top1': top1,
            'best_top1': perf_scores_history[0].top1,
            'best_epoch': perf_scores_history[0].epoch
        }
        try:
            raw_fullpath_best = apputils.save_checkpoint(
                epoch,
                args.arch,
                model,
                optimizer=optimizer,
                scheduler=compression_scheduler,
                extras=checkpoint_extras,
                is_best=is_best,
                name=args.name,
                dir=msglogger.logdir)
        except Exception:
            # Best effort: keep the previous raw_fullpath_best, but leave a
            # trace of the failure instead of swallowing it silently.
            msglogger.debug(
                'save_checkpoint failed at epoch %d; keeping previous checkpoint path',
                epoch,
                exc_info=True)
        # NOTE(review): the source formatting was collapsed; confirm that the
        # artifact upload is meant to run once per epoch.
        mlflow.log_artifacts(msglogger.logdir)

    # Finally run results on the test set
    if raw_fullpath_best is None:
        msglogger.error(
            'No checkpoint path is available; cannot run the final evaluation')
        raise RuntimeError(
            'No checkpoint was saved during training. Nothing to evaluate.')
    eval_params = {
        "model_type": args.arch,
        "model_path": raw_fullpath_best,
        "dataset_path": args.data,
        "label_path": "models/voc-model-labels.txt"
    }
    mlflow.projects.run(uri=".",
                        entry_point="eval",
                        use_conda=False,
                        parameters=eval_params)
def main():
    """Run the image-classification compression sample from the command line.

    Parses the arguments, configures logging and the execution device(s),
    creates (or resumes) the model and optimizer, and then either dispatches
    to one of the auxiliary modes (AMC, greedy search, model summary,
    sensitivity analysis, evaluation, thinning) or runs the
    train / validate / checkpoint loop followed by a pass over the test set.

    Returns:
        The value returned by the selected auxiliary mode, or ``None`` after
        thinning or after a complete training run.

    Side effects:
        Rebinds the module-level ``msglogger``, creates ``args.output_dir``
        when it is missing, and exits the process with status 1 when the
        ``--deterministic`` / ``--loaders`` / ``--gpus`` arguments are invalid.
    """
    global msglogger

    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))

    # Parse arguments
    args = parser.get_parser().parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(args.compress,
                                     msglogger.logdir,
                                     gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    best_epochs = []

    if args.deterministic:
        if args.loaders is None:
            args.loaders = 1
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.loaders > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --loaders to 0 or 1'
            )
            sys.exit(1)
        # Use a well-known seed, for repeatability of experiments
        distiller.set_deterministic()
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.use_cpu or (args.gpus is None and not torch.cuda.is_available()) or (args.gpus == ''):
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                msglogger.error(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
                sys.exit(1)
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    msglogger.error(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
                    sys.exit(1)
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    if args.loaders is None:
        # At this point args.gpus is a list of device ids (when --gpus was
        # given), -1 (CPU run) or None (use every visible GPU).  Count the
        # devices: multiplying the default by the list itself produced a list
        # and made max() raise TypeError.
        if isinstance(args.gpus, (list, tuple)):
            active_gpus = len(args.gpus)
        elif args.gpus is None:
            active_gpus = torch.cuda.device_count()
        else:
            active_gpus = 0  # CPU run: max() below keeps the default count
        args.loaders = max(parser.DEFAULT_LOADERS_COUNT,
                           parser.DEFAULT_LOADERS_COUNT * active_gpus)
    msglogger.debug('Number of data loaders set to: {}'.format(args.loaders))

    # Infer the dataset from the model name
    args.dataset = 'cifar10' if 'cifar' in args.arch else 'imagenet'
    args.num_classes = 10 if args.dataset == 'cifar10' else 1000

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = create_model(args.pretrained,
                         args.dataset,
                         args.arch,
                         parallel=not args.load_serialized,
                         device_ids=args.gpus)
    compression_scheduler = None
    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    optimizer = None
    resumed_training_steps = None
    if args.resume or args.load_state_dict:
        if args.resume and not args.reset_optimizer:
            # initiate SGD with dummy lr
            optimizer = torch.optim.SGD(model.parameters(), lr=0.36787944117)
        model, compression_scheduler, optimizer, start_epoch, resumed_training_steps = apputils.load_checkpoint(
            model, args.resume or args.load_state_dict, optimizer=optimizer)
        model.to(args.device)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(args.device)

    if optimizer is not None:
        # optimizer was resumed from checkpoint
        # check if user has tried to set optimizer arguments
        # if so, ignore arguments with a warning.
        optimizer_group_args = [
            'lr', 'learning-rate', 'momentum', 'weight-decay', 'wd'
        ]
        user_optim_args = [
            x for x in optimizer_group_args for arg in sys.argv
            if arg.startswith('--' + x)
        ]
        if user_optim_args:
            msglogger.warning(
                '{} optimizer arguments are ignored.'.format(user_optim_args))
            msglogger.info(
                'setting optimizer arguments when optimizer is resumed '
                'from checkpoint is forbidden.')
    else:
        optimizer = torch.optim.SGD(model.parameters(),
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    if args.qe_calibration:
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info(
            '\tStats will be collected for {:.1%} of test dataset'.format(
                args.qe_calibration))
        msglogger.info(
            '\tSetting constant seeds and converting model to serialized execution'
        )
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        activations_collectors.update(
            create_quantization_stats_collector(model))
        # Calibration is carried out by the evaluation path below.
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    train_loader, val_loader, test_loader, _ = apputils.load_data(
        args.dataset, os.path.expanduser(args.data), args.batch_size,
        args.loaders, args.validation_split, args.deterministic,
        args.effective_train_size, args.effective_valid_size,
        args.effective_test_size)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    args.trainset_print_period = parser.getPrintPeriod(
        args, len(train_loader.sampler), args.batch_size)
    args.validset_print_period = parser.getPrintPeriod(
        args, len(val_loader.sampler), args.batch_size)
    args.testset_print_period = parser.getPrintPeriod(
        args, len(test_loader.sampler), args.batch_size)

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if (args.resume and not args.reset_optimizer) else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        # zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resume is not None, "You must use --resume to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resume.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained,
                               args.dataset,
                               args.kd_teacher,
                               device_ids=args.gpus)
        if args.kd_resume:
            teacher = apputils.load_checkpoint(teacher,
                                               chkpt_file=args.kd_resume)[0]
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(
            args.kd_policy, range(args.kd_start_epoch, args.epochs, 1))

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    if getattr(compression_scheduler, 'global_policy_end_epoch', None) is not None:
        if compression_scheduler.global_policy_end_epoch >= (start_epoch + args.epochs):
            msglogger.warning(
                'scheduler requires at least {} epochs, but only {} are sanctioned'
                .format(compression_scheduler.global_policy_end_epoch,
                        args.epochs))

    accumulated_training_steps = resumed_training_steps if resumed_training_steps is not None else 0
    for epoch in range(start_epoch, start_epoch + args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            try:
                train(train_loader,
                      model,
                      criterion,
                      optimizer,
                      epoch,
                      accumulated_training_steps,
                      compression_scheduler,
                      loggers=[tflogger, pylogger],
                      args=args)
            except RuntimeError as e:
                if 'cuda out of memory' in str(e).lower():
                    msglogger.error(
                        'CUDA memory failure has been detected.\n'
                        'Sometimes it helps to decrease batch size.\n'
                        'e.g. Add the following flag to your call: --batch-size={}'
                        .format(args.batch_size // 10))
                # Always propagate: the hint above is advisory only, and any
                # other RuntimeError must not be swallowed.
                raise
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))
        accumulated_training_steps += math.ceil(
            len(train_loader.sampler) / train_loader.batch_size)

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        tflogger.log_training_progress(stats, epoch, None)

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        if getattr(compression_scheduler, 'global_policy_end_epoch', None) is None or (
                compression_scheduler.global_policy_end_epoch <= epoch):
            # Update the list of top scores achieved since all policies have concluded
            if top1 > 0:
                best_epochs.append(
                    distiller.MutableNamedTuple({
                        'top1': top1,
                        'top5': top5,
                        'epoch': epoch
                    }))
            # Keep best_epochs sorted from best to worst
            # Sort by top1 first, secondary sort by top5, and so forth
            best_epochs.sort(key=operator.attrgetter('top1', 'top5', 'epoch'),
                             reverse=True)
            for score in best_epochs[:args.num_best_scores]:
                msglogger.info('==> Best Top1: %.3f Top5: %.3f on epoch: %d',
                               score.top1, score.top5, score.epoch)

        # Hand save_checkpoint a real boolean even when no score has been
        # recorded yet (the bare `and` expression used to yield an empty list).
        is_best = bool(best_epochs) and epoch == best_epochs[0].epoch
        apputils.save_checkpoint(epoch, args.arch, model, optimizer,
                                 compression_scheduler,
                                 best_epochs[0].top1 if best_epochs else None,
                                 is_best, args.name, msglogger.logdir,
                                 accumulated_training_steps)

    # Finally run results on the test set
    test(test_loader,
         model,
         criterion, [pylogger],
         activations_collectors,
         args=args)
def main():
    """Run the pose-estimation compression sample from the command line.

    Parses the arguments (defaulting to 200 epochs), configures logging,
    seeding and the execution device(s), builds the pose-estimation model,
    optionally restores it from a checkpoint, and then either dispatches to an
    auxiliary mode (AMC, greedy search, summaries, ONNX export, quantization
    calibration, activation histograms, sensitivity analysis, evaluation,
    thinning) or runs the train / validate / checkpoint loop followed by a
    pass over the test set.

    Returns:
        The value returned by the selected auxiliary mode, or ``None`` after
        printing summaries, thinning, or a complete training run.

    Raises:
        ValueError: on an invalid ``--gpus`` argument, when ``load_vgg19`` is
            set for an architecture other than ``vgg19``, or when the starting
            epoch is not below the requested number of epochs.
        KeyError: when ``args.arch`` has no entry in the loss-function table.
    """
    global msglogger

    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 200

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(
        filter(None, [args.compress, args.qe_stats_file]),  # remove both None and empty strings
        msglogger.logdir,
        gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    if args.evaluate:
        args.deterministic = True
    if args.deterministic:
        distiller.set_deterministic(args.seed)  # For experiment reproducibility
    else:
        if args.seed is not None:
            distiller.set_seed(args.seed)
        # Turn on CUDNN benchmark mode for best performance. This is usually "safe" for image
        # classification models, as the input sizes don't change during the run
        # See here: https://discuss.pytorch.org/t/what-does-torch-backends-cudnn-benchmark-do/5936/3
        cudnn.benchmark = True

    start_epoch = 0
    ending_epoch = args.epochs
    perf_scores_history = []

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                raise ValueError(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    raise ValueError(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # Infer the dataset from the model name
    # TODO
    args.dataset = 'coco'
    # args.num_classes = 21  # wc -l ~/data/VOC2012/voc-model-labels.txt

    if args.load_vgg19 and args.arch != 'vgg19':
        raise ValueError(
            '``load_vgg19`` should be set only when vgg19 is used')

    model = create_pose_estimation_model(args.pretrained,
                                         args.dataset,
                                         args.arch,
                                         load_vgg19=args.load_vgg19,
                                         parallel=not args.load_serialized,
                                         device_ids=args.gpus)
    compression_scheduler = None
    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # <editor-fold desc=">>> Load Model">

    # We can optionally resume from a checkpoint
    optimizer = None
    if args.resumed_checkpoint_path:
        model, compression_scheduler, optimizer, start_epoch = apputils.load_checkpoint(
            model, args.resumed_checkpoint_path, model_device=args.device)
    elif args.load_model_path:
        model = apputils.load_lean_checkpoint(model,
                                              args.load_model_path,
                                              model_device=args.device)
    if args.reset_optimizer:
        start_epoch = 0
        if optimizer is not None:
            optimizer = None
            msglogger.info(
                '\nreset_optimizer flag set: Overriding resumed optimizer and resetting epoch count to 0'
            )

    # </editor-fold>

    # Define loss function (criterion)
    # get_loss(saved_for_loss, heat_temp, heat_weight,vec_temp, vec_weight)
    criterion = {
        'shufflenetv2': shufflenetv2_get_loss,
        'vgg19': vgg19_get_loss,
        'hourglass': hourglass_get_loss,
    }[args.arch]

    if optimizer is None:
        # Only parameters that require gradients are handed to the optimizer.
        trainable_vars = [
            param for param in model.parameters() if param.requires_grad
        ]
        optimizer = torch.optim.SGD(trainable_vars,
                                    lr=args.lr,
                                    momentum=args.momentum,
                                    weight_decay=args.weight_decay)
        msglogger.info('Optimizer Type: %s', type(optimizer))
        msglogger.info('Optimizer Args: %s', optimizer.defaults)

    # TODO: load lr_scheduler
    lr_scheduler = ReduceLROnPlateau(optimizer,
                                     mode='min',
                                     factor=0.8,
                                     patience=5,
                                     verbose=True,
                                     threshold=0.0001,
                                     threshold_mode='rel',
                                     cooldown=3,
                                     min_lr=0,
                                     eps=1e-08)

    if args.AMC:
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        for summary in args.summary:
            distiller.model_summary(model, summary, args.dataset)
        return

    if args.export_onnx is not None:
        return distiller.export_img_classifier_to_onnx(
            model,
            os.path.join(msglogger.logdir, args.export_onnx),
            args.dataset,
            add_softmax=True,
            verbose=False)

    if args.qe_calibration:
        return acts_quant_stats_collection(model, criterion, pylogger, args)

    if args.activation_histograms:
        return acts_histogram_collection(model, criterion, pylogger, args)

    print('Building activations_collectors...')
    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    # Load the datasets: the dataset to load is inferred from the model name passed
    # in args.arch.  The default dataset is ImageNet, but if args.arch contains the
    # substring "_cifar", then cifar10 is used.
    # NOTE(review): the comment above looks inherited; this function hard-codes
    # args.dataset to 'coco' -- confirm what load_data(args) actually reads.
    print('Loading data...')
    train_loader, val_loader, test_loader, _ = load_data(args)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(
            model, optimizer, args.compress, compression_scheduler,
            (start_epoch - 1) if args.resumed_checkpoint_path else None)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        # zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resumed_checkpoint_path is not None, \
            "You must use --resume-from to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resumed_checkpoint_path.replace(
                                         ".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    if start_epoch >= ending_epoch:
        msglogger.error(
            'epoch count is too low, starting epoch is {} but total epochs set to {}'
            .format(start_epoch, ending_epoch))
        raise ValueError('Epochs parameter is too low. Nothing to do.')

    for epoch in range(start_epoch, ending_epoch):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            # The first epoch has no validation loss yet, so a large sentinel
            # is passed.  Later epochs pass the previous epoch's validation
            # loss, which this function binds to `loss` (the old code read an
            # undefined `total_loss` and failed on the second epoch).
            compression_scheduler.on_epoch_begin(
                epoch, metrics=(loss if (epoch != start_epoch) else 10**6))

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            loss = validate(val_loader, model, criterion, [pylogger], args,
                            epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        # Let the plateau scheduler react to the validation loss.
        lr_scheduler.step(loss)

        stats = ('Performance/Validation/', OrderedDict([('Loss', loss)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        update_training_scores_history(perf_scores_history, model, loss, epoch,
                                       args.num_best_scores)
        is_best = epoch == perf_scores_history[0].epoch
        checkpoint_extras = {
            'current_loss': loss,
            'best_loss': perf_scores_history[0].loss,
            'best_epoch': perf_scores_history[0].epoch
        }
        apputils.save_checkpoint(epoch,
                                 args.arch,
                                 model,
                                 optimizer=optimizer,
                                 scheduler=compression_scheduler,
                                 extras=checkpoint_extras,
                                 is_best=is_best,
                                 name=args.name,
                                 dir=msglogger.logdir)

    # Finally run results on the test set
    test(test_loader,
         model,
         criterion, [pylogger],
         activations_collectors,
         args=args)
def main():
    """Prepare logging, reproducibility settings and device selection.

    Parses the command line (defaulting to 90 epochs), makes sure the output
    directory exists, configures the module-level ``msglogger``, records the
    execution environment, applies the deterministic / cuDNN-benchmark
    setting, resolves the CPU or GPU device(s), and stores the dataset name,
    class count and early-exit bookkeeping fields on ``args``.

    Raises:
        ValueError: when ``--deterministic`` is combined with more than one
            worker, when ``--gpus`` is not a comma-separated list of integers,
            or when a requested GPU id is not available.
    """
    global msglogger

    here = os.path.dirname(__file__)
    repo_root = os.path.abspath(os.path.join(here, '..', '..'))

    # Parse arguments
    args = parser.get_parser().parse_args()
    if args.epochs is None:
        args.epochs = 90

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    logging_conf = os.path.join(here, 'logging.conf')
    msglogger = apputils.config_pylogger(logging_conf, args.name, args.output_dir)

    # Record details of the execution environment so that past experiment
    # runs can be looked up later.
    apputils.log_execution_env_state(args.compress, msglogger.logdir, gitroot=repo_root)
    msglogger.debug("Distiller: %s", distiller.__version__)

    # Initial bookkeeping values (not consumed within this function as written).
    start_epoch, ending_epoch, perf_scores_history = 0, args.epochs, []

    if not args.deterministic:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # implies that cudnn.benchmark should respect cudnn.deterministic, but
        # results were empirically not reproduced with benchmark set, so it is
        # only enabled outside deterministic mode.
        cudnn.benchmark = True
    else:
        # Reproducibility background:
        # https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        if args.workers > 1:
            raise ValueError('ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1')
        # Use a well-known seed, for repeatability of experiments
        distiller.set_deterministic()

    use_cuda = not args.cpu and torch.cuda.is_available()
    if not use_cuda:
        # A GPU index of -1 marks a CPU run.
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                requested = [int(token) for token in args.gpus.split(',')]
            except ValueError:
                raise ValueError('ERROR: Argument --gpus must be a comma-separated list of integers only')
            args.gpus = requested
            device_total = torch.cuda.device_count()
            # First requested id that is out of range, if any.
            out_of_range = next((gpu for gpu in requested if gpu >= device_total), None)
            if out_of_range is not None:
                raise ValueError('ERROR: GPU device ID {0} requested, but only {1} devices available'
                                 .format(out_of_range, device_total))
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(requested[0])

    # Infer the dataset (and its class count) from the model name
    if 'cifar' in args.arch:
        args.dataset, args.num_classes = 'cifar10', 10
    else:
        args.dataset, args.num_classes = 'imagenet', 1000

    thresholds = args.earlyexit_thresholds
    if thresholds:
        exit_count = len(thresholds) + 1
        args.num_exits = exit_count
        args.loss_exits = [0] * exit_count
        args.losses_exits = []
        args.exiterrors = []
def main():
    """Run the compression sample for the custom ``ResNet152`` model.

    Parses the arguments, configures logging and the execution device(s),
    instantiates ``ResNet152`` (optionally restoring a checkpoint), builds an
    Adam optimizer over ``model.model.fc`` and then either dispatches to an
    auxiliary mode (AMC, greedy search, summary, sensitivity analysis,
    evaluation, thinning) or runs the train / validate / checkpoint loop
    followed by a pass over the test set.

    Returns:
        The value returned by the selected auxiliary mode, or ``None`` after
        thinning or a complete training run.

    Side effects:
        Rebinds the module-level ``msglogger``, creates ``args.output_dir``
        when missing, and exits the process with status 1 on invalid
        ``--deterministic`` / ``--gpus`` arguments.
    """
    global msglogger

    script_dir = os.path.dirname(__file__)
    module_path = os.path.abspath(os.path.join(script_dir, '..', '..'))

    # Parse arguments
    args = parser.get_parser().parse_args()

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    msglogger = apputils.config_pylogger(
        os.path.join(script_dir, 'logging.conf'), args.name, args.output_dir)

    # Log various details about the execution environment.  It is sometimes useful
    # to refer to past experiment executions and this information may be useful.
    apputils.log_execution_env_state(args.compress,
                                     msglogger.logdir,
                                     gitroot=module_path)
    msglogger.debug("Distiller: %s", distiller.__version__)

    start_epoch = 0
    perf_scores_history = []

    if args.deterministic:
        # Experiment reproducibility is sometimes important.  Pete Warden expounded about this
        # in his blog: https://petewarden.com/2018/03/19/the-machine-learning-reproducibility-crisis/
        # In Pytorch, support for deterministic execution is still a bit clunky.
        if args.workers > 1:
            msglogger.error(
                'ERROR: Setting --deterministic requires setting --workers/-j to 0 or 1'
            )
            exit(1)  # abort with a non-zero (error) exit status
        # Use a well-known seed, for repeatability of experiments
        distiller.set_deterministic()
    else:
        # This issue: https://github.com/pytorch/pytorch/issues/3659
        # Implies that cudnn.benchmark should respect cudnn.deterministic, but empirically we see that
        # results are not re-produced when benchmark is set. So enabling only if deterministic mode disabled.
        cudnn.benchmark = True

    if args.cpu or not torch.cuda.is_available():
        # Set GPU index to -1 if using CPU
        args.device = 'cpu'
        args.gpus = -1
    else:
        args.device = 'cuda'
        if args.gpus is not None:
            try:
                args.gpus = [int(s) for s in args.gpus.split(',')]
            except ValueError:
                msglogger.error(
                    'ERROR: Argument --gpus must be a comma-separated list of integers only'
                )
                exit(1)
            available_gpus = torch.cuda.device_count()
            for dev_id in args.gpus:
                if dev_id >= available_gpus:
                    msglogger.error(
                        'ERROR: GPU device ID {0} requested, but only {1} devices available'
                        .format(dev_id, available_gpus))
                    exit(1)
            # Set default device in case the first one on the list != 0
            torch.cuda.set_device(args.gpus[0])

    # The dataset name is fixed rather than inferred from the model name.
    # NOTE(review): 'cousm' looks like a misspelling of 'custom' -- confirm
    # what the downstream helpers expect before changing it.
    args.dataset = 'cousm'

    if args.earlyexit_thresholds:
        args.num_exits = len(args.earlyexit_thresholds) + 1
        args.loss_exits = [0] * args.num_exits
        args.losses_exits = []
        args.exiterrors = []

    # Create the model
    model = ResNet152()
    # model = torch.nn.DataParallel(model, device_ids=args.gpus)  # multi-GPU data parallelism (disabled)
    model.to(args.device)
    compression_scheduler = None  # compression scheduler; created or loaded below
    # Create a couple of logging backends.  TensorBoardLogger writes log files in a format
    # that can be read by Google's Tensor Board.  PythonLogger writes to the Python logger.
    tflogger = TensorBoardLogger(msglogger.logdir)
    pylogger = PythonLogger(msglogger)

    # capture thresholds for early-exit training
    if args.earlyexit_thresholds:
        msglogger.info('=> using early-exit threshold values of %s',
                       args.earlyexit_thresholds)

    # We can optionally resume from a checkpoint
    if args.resume:
        # Load the trained model.
        # checkpoint = torch.load(args.resume)
        # model.load_state_dict(checkpoint['state_dict'])
        model, compression_scheduler, start_epoch = apputils.load_checkpoint(
            model, chkpt_file=args.resume)
        model.to(args.device)

    # Define loss function (criterion) and optimizer
    criterion = nn.CrossEntropyLoss().to(args.device)
    # optimizer = torch.optim.SGD(model.fc.parameters(), lr=args.lr,
    #                             momentum=args.momentum,
    #                             weight_decay=args.weight_decay)
    # Only the parameters under model.model.fc are handed to the optimizer.
    optimizer = torch.optim.Adam(model.model.fc.parameters(),
                                 lr=args.lr,
                                 weight_decay=args.weight_decay)
    msglogger.info('Optimizer Type: %s', type(optimizer))
    msglogger.info('Optimizer Args: %s', optimizer.defaults)

    if args.AMC:  # automated deep compression
        return automated_deep_compression(model, criterion, optimizer,
                                          pylogger, args)
    if args.greedy:  # greedy search
        return greedy(model, criterion, optimizer, pylogger, args)

    # This sample application can be invoked to produce various summary reports.
    if args.summary:
        return summarize_model(model, args.dataset, which_summary=args.summary)

    # Activation statistics collectors
    activations_collectors = create_activation_stats_collectors(
        model, *args.activation_stats)

    if args.qe_calibration:
        msglogger.info('Quantization calibration stats collection enabled:')
        msglogger.info(
            '\tStats will be collected for {:.1%} of test dataset'.format(
                args.qe_calibration))
        msglogger.info(
            '\tSetting constant seeds and converting model to serialized execution'
        )
        distiller.set_deterministic()
        model = distiller.make_non_parallel_copy(model)
        activations_collectors.update(
            create_quantization_stats_collector(model))  # quantization statistics collector
        args.evaluate = True
        args.effective_test_size = args.qe_calibration

    # Load the datasets.
    # NOTE(review): the dataset directory is hard-coded to a developer-local
    # path; consider taking it from the command line instead.
    train_loader, val_loader, test_loader, _ = get_data_loaders(
        datasets_fn, r'/home/tian/Desktop/image_yasuo', args.batch_size,
        args.workers, args.validation_split, args.deterministic,
        args.effective_train_size, args.effective_valid_size,
        args.effective_test_size)
    msglogger.info('Dataset sizes:\n\ttraining=%d\n\tvalidation=%d\n\ttest=%d',
                   len(train_loader.sampler), len(val_loader.sampler),
                   len(test_loader.sampler))

    # This sample application can be invoked to run a sensitivity analysis on
    # the model; per the original note the output is saved to CSV and PNG.
    if args.sensitivity is not None:
        sensitivities = np.arange(args.sensitivity_range[0],
                                  args.sensitivity_range[1],
                                  args.sensitivity_range[2])
        return sensitivity_analysis(model, criterion, test_loader, pylogger,
                                    args, sensitivities)

    if args.evaluate:
        return evaluate_model(model, criterion, test_loader, pylogger,
                              activations_collectors, args,
                              compression_scheduler)

    if args.compress:
        # The main use-case for this sample application is CNN compression. Compression
        # requires a compression schedule configuration file in YAML.
        compression_scheduler = distiller.file_config(model, optimizer,
                                                      args.compress,
                                                      compression_scheduler)
        # Model is re-transferred to GPU in case parameters were added (e.g. PACTQuantizer)
        model.to(args.device)
    elif compression_scheduler is None:
        compression_scheduler = distiller.CompressionScheduler(model)

    if args.thinnify:
        # zeros_mask_dict = distiller.create_model_masks_dict(model)
        assert args.resume is not None, "You must use --resume to provide a checkpoint file to thinnify"
        distiller.remove_filters(model,
                                 compression_scheduler.zeros_mask_dict,
                                 args.arch,
                                 args.dataset,
                                 optimizer=None)
        apputils.save_checkpoint(0,
                                 args.arch,
                                 model,
                                 optimizer=None,
                                 scheduler=compression_scheduler,
                                 name="{}_thinned".format(
                                     args.resume.replace(".pth.tar", "")),
                                 dir=msglogger.logdir)
        print(
            "Note: your model may have collapsed to random inference, so you may want to fine-tune"
        )
        return

    args.kd_policy = None  # knowledge distillation
    if args.kd_teacher:
        teacher = create_model(args.kd_pretrained,
                               args.dataset,
                               args.kd_teacher,
                               device_ids=args.gpus)
        if args.kd_resume:
            teacher, _, _ = apputils.load_checkpoint(teacher,
                                                     chkpt_file=args.kd_resume)
        dlw = distiller.DistillationLossWeights(args.kd_distill_wt,
                                                args.kd_student_wt,
                                                args.kd_teacher_wt)
        args.kd_policy = distiller.KnowledgeDistillationPolicy(
            model, teacher, args.kd_temp, dlw)
        compression_scheduler.add_policy(args.kd_policy,
                                         starting_epoch=args.kd_start_epoch,
                                         ending_epoch=args.epochs,
                                         frequency=1)

        msglogger.info('\nStudent-Teacher knowledge distillation enabled:')
        msglogger.info('\tTeacher Model: %s', args.kd_teacher)
        msglogger.info('\tTemperature: %s', args.kd_temp)
        msglogger.info('\tLoss Weights (distillation | student | teacher): %s',
                       ' | '.join(['{:.2f}'.format(val) for val in dlw]))
        msglogger.info('\tStarting from Epoch: %s', args.kd_start_epoch)

    # Manual learning-rate decay state used at the end of each epoch.
    lr = args.lr
    lr_decay = 0.5
    for epoch in range(start_epoch, args.epochs):
        # This is the main training loop.
        msglogger.info('\n')
        if compression_scheduler:
            compression_scheduler.on_epoch_begin(epoch)

        # Train for one epoch
        with collectors_context(activations_collectors["train"]) as collectors:
            train(train_loader,
                  model,
                  criterion,
                  optimizer,
                  epoch,
                  compression_scheduler,
                  loggers=[tflogger, pylogger],
                  args=args)
            distiller.log_weights_sparsity(model,
                                           epoch,
                                           loggers=[tflogger, pylogger])
            distiller.log_activation_statsitics(
                epoch,
                "train",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            if args.masks_sparsity:
                # Print the masks sparsity table at the end of each epoch.
                msglogger.info(
                    distiller.masks_sparsity_tbl_summary(
                        model, compression_scheduler))

        # evaluate on validation set
        with collectors_context(activations_collectors["valid"]) as collectors:
            top1, top5, vloss = validate(val_loader, model, criterion,
                                         [pylogger], args, epoch)
            distiller.log_activation_statsitics(
                epoch,
                "valid",
                loggers=[tflogger],
                collector=collectors["sparsity"])
            save_collectors_data(collectors, msglogger.logdir)

        # Tag spelling corrected from 'Peformance' so the series lands under
        # the same TensorBoard group name the other entry points use.
        stats = ('Performance/Validation/',
                 OrderedDict([('Loss', vloss), ('Top1', top1),
                              ('Top5', top5)]))
        distiller.log_training_progress(stats,
                                        None,
                                        epoch,
                                        steps_completed=0,
                                        total_steps=1,
                                        log_freq=1,
                                        loggers=[tflogger])

        if compression_scheduler:
            compression_scheduler.on_epoch_end(epoch, optimizer)

        # Update the list of top scores achieved so far, and save the checkpoint
        sparsity = distiller.model_sparsity(model)
        perf_scores_history.append(
            distiller.MutableNamedTuple({
                'sparsity': sparsity,
                'top1': top1,
                'top5': top5,
                'epoch': epoch
            }))
        # Keep perf_scores_history sorted from best to worst
        # Sort by sparsity as main sort key, then sort by top1, top5 and epoch
        perf_scores_history.sort(key=operator.attrgetter(
            'sparsity', 'top1', 'top5', 'epoch'),
                                 reverse=True)
        for score in perf_scores_history[:args.num_best_scores]:
            msglogger.info(
                '==> Best [Top1: %.3f Top5: %.3f Sparsity: %.2f on epoch: %d]',
                score.top1, score.top5, score.sparsity, score.epoch)

        is_best = epoch == perf_scores_history[0].epoch
        apputils.save_checkpoint(epoch, args.arch, model, optimizer,
                                 compression_scheduler,
                                 perf_scores_history[0].top1, is_best,
                                 args.name, msglogger.logdir)
        if not is_best:
            # Halve the learning rate whenever this epoch is not the
            # top-ranked one.  NOTE(review): the original note says "lower the
            # LR when the loss is greater than the previous loss", but the
            # ranking above sorts by sparsity first -- confirm the intent.
            lr = lr * lr_decay
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr

    # Finally run results on the test set
    test(test_loader,
         model,
         criterion, [pylogger],
         activations_collectors,
         args=args)