from functools import partial

import torch
import torchvision
from pytest import approx
from torchvision.datasets import CIFAR10, CIFAR100
from torchvision.models import mobilenet_v2, squeezenet1_1

# Project-local names used below (logger, custom_models, safe_thread_call, load_state,
# the distributed test helpers, etc.) are assumed to be importable from the surrounding repo.


def load_model(model, pretrained=True, num_classes=1000, model_params=None,
               weights_path: str = None) -> torch.nn.Module:
    """
    ** WARNING: This is implemented using torch.load functionality,
    which itself uses Python's pickling facilities that may be used to perform
    arbitrary code execution during unpickling. Only load the data you trust.
    """
    logger.info("Loading model: {}".format(model))
    if model_params is None:
        model_params = {}
    if model in torchvision.models.__dict__:
        load_model_fn = partial(torchvision.models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    elif model in custom_models.__dict__:
        load_model_fn = partial(custom_models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    else:
        raise Exception("Undefined model name")
    loaded_model = safe_thread_call(load_model_fn)
    if not pretrained and weights_path is not None:
        sd = torch.load(weights_path, map_location='cpu')
        load_state(loaded_model, sd, is_resume=False)
    return loaded_model
def load_model(model, pretrained=True, num_classes=1000, model_params=None,
               weights_path: str = None) -> torch.nn.Module:
    logger.info("Loading model: {}".format(model))
    if model_params is None:
        model_params = {}
    if model in torchvision.models.__dict__:
        load_model_fn = partial(torchvision.models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    elif model in custom_models.__dict__:
        load_model_fn = partial(custom_models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    else:
        raise Exception("Undefined model name")
    loaded_model = safe_thread_call(load_model_fn)
    if not pretrained and weights_path is not None:
        sd = torch.load(weights_path, map_location='cpu')
        load_state(loaded_model, sd, is_resume=False)
    return loaded_model
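# A minimal usage sketch for load_model above; 'resnet18', the class counts, and the
# checkpoint path are hypothetical placeholders, not values from the original code.
def _load_model_usage_example():
    # Pretrained ImageNet weights via torchvision:
    imagenet_model = load_model('resnet18', pretrained=True, num_classes=1000)
    # Random init, then weights from a local checkpoint; per the warning above,
    # torch.load unpickles the file, so only point this at checkpoints you trust.
    finetune_model = load_model('resnet18', pretrained=False, num_classes=10,
                                weights_path='/path/to/trusted_checkpoint.pth')
    return imagenet_model, finetune_model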
def create_cifar(config, dataset_config, is_train, transform):
    create_cifar_fn = None
    if dataset_config == 'cifar100':
        create_cifar_fn = partial(CIFAR100, config.dataset_dir, train=is_train,
                                  transform=transform)
    if dataset_config == 'cifar10':
        create_cifar_fn = partial(CIFAR10, config.dataset_dir, train=is_train,
                                  transform=transform)
    if create_cifar_fn:
        return safe_thread_call(partial(create_cifar_fn, download=True),
                                partial(create_cifar_fn, download=False))
    return None
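# Usage sketch for create_cifar above. The two partials handed to safe_thread_call
# implement a download-with-fallback pattern: the first callable downloads the dataset,
# the second opens it without downloading (see the safe_thread_call sketch further
# below). The config argument is assumed to expose dataset_dir, as read by the function.
def _create_cifar_usage_example(config, train_transform):
    train_set = create_cifar(config, 'cifar10', is_train=True, transform=train_transform)
    # Unrecognized dataset names fall through and yield None.
    assert create_cifar(config, 'svhn', is_train=True, transform=train_transform) is None
    return train_set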
def hawq_dumping_worker(gpu, ngpus_per_node, config, tmp_path):
    data_loader = distributed_init_test_default(gpu, ngpus_per_node, config)
    model = safe_thread_call(partial(mobilenet_v2, pretrained=True))
    model.eval()
    criterion = torch.nn.MSELoss().cuda(config.gpu)
    config = register_default_init_args(config, criterion, data_loader)
    quant_model, compression_ctrl = create_compressed_model_and_algo_for_test(model,
                                                                              config)
    quant_model = post_compression_test_distr_init(compression_ctrl, config,
                                                   ngpus_per_node, quant_model)

    # just to reproduce the same scale values without Dropout
    quant_model.eval()

    act_bitwidth_per_scope = get_bitwidth_per_scope(quant_model.module)
    out_file_path = get_path_to_bitwidth_dump(tmp_path, config.rank)
    torch.save(act_bitwidth_per_scope, str(out_file_path))
def load_model(model, pretrained=True, num_classes=1000, model_params=None):
    logger.info("Loading model: {}".format(model))
    if model_params is None:
        model_params = {}
    if model in torchvision.models.__dict__:
        load_model_fn = partial(torchvision.models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    elif model in custom_models.__dict__:
        load_model_fn = partial(custom_models.__dict__[model],
                                num_classes=num_classes, pretrained=pretrained,
                                **model_params)
    else:
        raise Exception("Undefined model name")
    return safe_thread_call(load_model_fn)
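# safe_thread_call is used by every snippet here but defined elsewhere in the repo.
# Below is a hedged sketch of one plausible contract, inferred from the call sites:
# in a distributed run, only rank 0 executes the main callable (e.g. the download=True
# partial), everyone else waits at a barrier and then runs the fallback (download=False).
# This is an illustration, not the actual implementation.
def safe_thread_call_sketch(main_call_fn, after_barrier_call_fn=None):
    result = None
    if torch.distributed.is_available() and torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
            result = main_call_fn()  # e.g. perform the one-time download
        torch.distributed.barrier()  # everyone waits until rank 0 is done
        if result is None:
            result = (after_barrier_call_fn or main_call_fn)()
    else:
        result = main_call_fn()
    return result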
def scale_signed_dumping_worker(gpu, ngpus_per_node, config, tmp_path):
    distributed_init_test_default(gpu, ngpus_per_node, config)
    data_loader = create_rank_dataloader(config, gpu)
    model = safe_thread_call(partial(squeezenet1_1, pretrained=True))

    config.register_extra_structs([QuantizationRangeInitArgs(data_loader)])
    quant_model, compression_ctrl = create_compressed_model_and_algo_for_test(model,
                                                                              config)
    compression_scheduler = compression_ctrl.scheduler

    quant_model = post_compression_test_distr_init(compression_ctrl, config,
                                                   ngpus_per_node, quant_model)

    criterion = torch.nn.MSELoss().cuda(config.gpu)
    optimizer = torch.optim.Adam(quant_model.parameters(), lr=0.01)

    torch.backends.cudnn.benchmark = True

    # just to reproduce the same scale values without Dropout
    quant_model.eval()

    # Every rank should see the same initialized (broadcast) scales.
    act_sum = 0
    for layer in get_all_modules_by_type(quant_model, "SymmetricQuantizer").values():
        act_sum += layer.scale.sum()
    ref_sum = 4447.291
    assert act_sum.item() == approx(ref_sum, 0.01), \
        'sum of scales is not expected {} vs {} rank {}'.format(act_sum.item(),
                                                                ref_sum, config.rank)

    out_file_path = get_path_after_broadcast(tmp_path, config.rank)
    save_params(quant_model, out_file_path)
    compression_scheduler.step()
    # Run a handful of training iterations, then dump the parameters again.
    for i, (input_, _) in enumerate(data_loader):
        if i > 5:
            break
        output = quant_model(input_)
        optimizer.zero_grad()
        dummy_target = torch.randn(1000).cuda(config.gpu, non_blocking=True)
        loss = criterion(output, dummy_target)
        compression_scheduler.step()
        loss.backward()
        optimizer.step()
    compression_scheduler.step()
    out_file_path = get_path_path_after_train_iters(tmp_path, config.rank)
    save_params(quant_model, out_file_path)
def hawq_dumping_worker(gpu, ngpus_per_node, config, tmp_path):
    # One process per GPU; the process index doubles as GPU id and rank.
    config.batch_size = 3
    config.workers = 3
    config.gpu = gpu
    config.ngpus_per_node = ngpus_per_node
    config.rank = gpu
    config.distributed = True

    torch.distributed.init_process_group(backend="nccl",
                                         init_method='tcp://127.0.0.1:8899',
                                         world_size=config.world_size,
                                         rank=config.rank)

    model = safe_thread_call(partial(mobilenet_v2, pretrained=True))
    model.eval()

    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(RankDatasetMock(input_sample_size[1:],
                                                              config.rank),
                                              batch_size=3,
                                              num_workers=1,
                                              shuffle=False)
    criterion = torch.nn.MSELoss().cuda(config.gpu)
    config = register_default_init_args(config, criterion, data_loader)

    quant_model, compression_algo = create_compressed_model_and_algo_for_test(model,
                                                                              config)

    torch.cuda.set_device(config.gpu)
    quant_model.cuda(config.gpu)
    # Split the global batch size and worker count across the processes.
    config.batch_size = int(config.batch_size / ngpus_per_node)
    config.workers = int(config.workers / ngpus_per_node)
    quant_model = torch.nn.parallel.DistributedDataParallel(quant_model,
                                                            device_ids=[config.gpu])

    compression_algo.distributed()

    # just to reproduce the same scale values without Dropout
    quant_model.eval()

    act_bitwidth_per_scope = get_bitwidth_per_scope(quant_model.module)
    out_file_path = get_path_to_bitwidth_dump(tmp_path, config.rank)
    torch.save(act_bitwidth_per_scope, str(out_file_path))
def scale_signed_dumping_worker(gpu, ngpus_per_node, config, tmp_path):
    # One process per GPU; the process index doubles as GPU id and rank.
    config.batch_size = 3
    config.workers = 3
    config.gpu = gpu
    config.ngpus_per_node = ngpus_per_node
    config.rank = gpu
    config.distributed = True

    torch.distributed.init_process_group(backend="nccl",
                                         init_method='tcp://127.0.0.1:8899',
                                         world_size=config.world_size,
                                         rank=config.rank)

    model = safe_thread_call(partial(squeezenet1_1_custom, pretrained=True))

    quant_model, compression_ctrl = create_compressed_model_and_algo_for_test(model,
                                                                              config)
    compression_ctrl.distributed()
    compression_scheduler = compression_ctrl.scheduler

    torch.cuda.set_device(config.gpu)
    quant_model.cuda(config.gpu)
    # Split the global batch size and worker count across the processes.
    config.batch_size = int(config.batch_size / ngpus_per_node)
    config.workers = int(config.workers / ngpus_per_node)
    quant_model = torch.nn.parallel.DistributedDataParallel(quant_model,
                                                            device_ids=[config.gpu])

    criterion = torch.nn.MSELoss().cuda(config.gpu)
    optimizer = torch.optim.Adam(quant_model.parameters(), lr=0.01)

    torch.backends.cudnn.benchmark = True

    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(RankDatasetMock(input_sample_size[1:],
                                                              config.rank),
                                              batch_size=3,
                                              num_workers=1,
                                              shuffle=False)

    # just to reproduce the same scale values without Dropout
    quant_model.eval()
    compression_ctrl.initialize(data_loader)

    # Every rank should see the same initialized (broadcast) scales.
    act_sum = 0
    for layer in get_all_modules_by_type(quant_model, "SymmetricQuantizer").values():
        act_sum += layer.scale
    ref_sum = 3467.322
    assert act_sum.item() == approx(ref_sum, 0.01), \
        'sum of scales is not expected {} vs {} rank {}'.format(act_sum.item(),
                                                                ref_sum, config.rank)

    out_file_path = get_path_after_broadcast(tmp_path, config.rank)
    save_params(quant_model, out_file_path)
    compression_scheduler.step()
    # Run a handful of training iterations, then dump the parameters again.
    for i, (input_, _) in enumerate(data_loader):
        if i > 5:
            break
        output = quant_model(input_)
        optimizer.zero_grad()
        dummy_target = torch.randn(1000).cuda(config.gpu, non_blocking=True)
        loss = criterion(output, dummy_target)
        compression_scheduler.step()
        loss.backward()
        optimizer.step()
    compression_scheduler.step()
    out_file_path = get_path_path_after_train_iters(tmp_path, config.rank)
    save_params(quant_model, out_file_path)
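# The *_worker functions above match torch.multiprocessing.spawn's calling convention:
# the spawned function receives the process index as its first argument, used here as
# both GPU id and rank. A launch sketch; the single-node world_size handling and the
# config object's attributes are assumptions, not code from the original tests.
import torch.multiprocessing as mp

def run_scale_signed_dumping_test(config, tmp_path):
    ngpus_per_node = torch.cuda.device_count()
    config.world_size = ngpus_per_node  # single node assumed
    mp.spawn(scale_signed_dumping_worker,
             nprocs=ngpus_per_node,
             args=(ngpus_per_node, config, tmp_path))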