def test_gnmt_quantization(_case_config):
    model = GNMT(vocab_size=32)
    model = replace_lstm(model)
    forward_fn_ = gnmt_forward_fn(seq_len=10, batch_size=3, vocab_size=32)

    config = get_basic_quantization_config(_case_config.quant_type, input_sample_size=(3, 10))
    config["compression"].update({
        "quantizable_subgraph_patterns": [["linear", "__add__"],
                                          ["sigmoid", "__mul__", "__add__"],
                                          ["__add__", "tanh", "__mul__"],
                                          ["sigmoid", "__mul__"]],
        "disable_function_quantization_hooks": True,
        "ignored_scopes": ["GNMT/ResidualRecurrentEncoder[encoder]/Embedding[embedder]",
                           "GNMT/ResidualRecurrentDecoder[decoder]/Embedding[embedder]"]})

    compressed_model = NNCFNetwork(model,
                                   input_infos=create_input_infos(config),
                                   dummy_forward_fn=forward_fn_,
                                   scopes_without_shape_matching=
                                   ['GNMT/ResidualRecurrentDecoder[decoder]/RecurrentAttention[att_rnn]/'
                                    'BahdanauAttention[attn]'])

    compression_algo_builder_list = create_compression_algorithm_builders(config)

    for builder in compression_algo_builder_list:
        compressed_model = builder.apply_to(compressed_model)
    _ = compressed_model.commit_compression_changes()
    check_model_graph(compressed_model, 'gnmt_variable.dot', _case_config.graph_dir)
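For context, the dummy forward function used above has to drive the GNMT model the same way the training pipeline would. Below is a rough illustration only (the real gnmt_forward_fn helper may differ); it assumes `import torch` and a GNMT forward signature of (encoder input, encoder lengths, decoder input), which is an assumption, not taken from this listing.
def example_gnmt_forward_fn(seq_len, batch_size, vocab_size):
    # Hypothetical sketch, not the actual gnmt_forward_fn helper.
    def forward_fn(model):
        device = next(model.parameters()).device
        # Integer token tensors shaped (seq_len, batch_size), assuming batch_first=False.
        src = torch.randint(0, vocab_size, (seq_len, batch_size), device=device)
        src_len = torch.full((batch_size,), seq_len, dtype=torch.int64, device=device)
        tgt = torch.randint(0, vocab_size, (seq_len, batch_size), device=device)
        return model(src, src_len, tgt)
    return forward_fn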
Example #2
def create_compressed_model(model: Module,
                            config: Config,
                            dummy_forward_fn: Callable[[Module], Any] = None):
    """dummy_forward_fn will be used instead of a *forward* function call to build
    the graph - useful when the original training pipeline has special formats of
    data loader output or has additional *forward* arguments other than input tensors.
    Otherwise, the *forward* call of the model will be made with a single Tensor with
    a shape and type specified in config."""

    if dummy_forward_fn is None:
        input_info_list = create_input_infos(config)
        graph_builder = GraphBuilder(
            custom_forward_fn=create_dummy_forward_fn(input_info_list))
    else:
        graph_builder = GraphBuilder(custom_forward_fn=dummy_forward_fn)

    if is_main_process():
        print(*get_all_modules(model).keys(), sep="\n")
        reset_context('create_model')
        graph = graph_builder.build_graph(model, 'create_model')
        graph.dump_graph(osp.join(config.log_dir, "original_graph.dot"))

    compression_algo = create_compression_algorithm(model, config,
                                                    dummy_forward_fn)

    compressed_model = compression_algo.model
    if is_main_process() and not isinstance(compression_algo,
                                            NoCompressionAlgorithm):
        context_name = 'create_compressed_graph'
        if isinstance(compressed_model, QuantizedNetwork):
            context_name = compressed_model.get_context_name()
        graph = graph_builder.build_graph(compression_algo.model, context_name)
        graph.dump_graph(osp.join(config.log_dir, "compressed_graph.dot"))

    return compression_algo, compressed_model
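A minimal usage sketch for the function above, for the case where the pipeline feeds the model something other than a single tensor; `net` and `make_mock_batch` are illustrative placeholders, not part of the API shown here.
def my_dummy_forward_fn(model):
    # Build inputs the same way the training pipeline does (hypothetical helper).
    batch = make_mock_batch()
    return model(**batch)

compression_algo, compressed_model = create_compressed_model(
    net, config, dummy_forward_fn=my_dummy_forward_fn)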
Example #3
def test_compressed_graph_models_hw(desc, hw_config_type):
    model = desc.model_builder()
    config = get_basic_quantization_config_with_hw_config_type(
        hw_config_type.value, input_sample_size=desc.input_sample_sizes)
    input_info_list = create_input_infos(config)
    hw_config_path = HWConfig.get_path_to_hw_config(hw_config_type)
    hw_config = HWConfig.from_json(hw_config_path)
    compressed_model = NNCFNetwork(model, input_infos=input_info_list)

    # pylint:disable=protected-access
    compression_algo_builder = create_compression_algorithm_builders(config)[0]
    potential_weights_modules =\
        compression_algo_builder.get_potential_quantized_modules(compressed_model)
    prop_graph_solver = QuantizerPropagationSolver(hw_config=hw_config)
    insertion_point_graph = compressed_model.get_insertion_point_graph()
    merged_ip_graph = insertion_point_graph.get_ip_graph_with_merged_hw_optimized_operations(
        hw_config)
    potential_activations_quantizers = prop_graph_solver.run_on_ip_graph(
        merged_ip_graph)
    sketch_graph = compressed_model.get_original_graph()

    potential_quantizer_graph = prepare_potential_quantizer_graph(
        sketch_graph, potential_activations_quantizers,
        potential_weights_modules)
    check_graph(potential_quantizer_graph,
                desc.dot_filename,
                _case_dir(hw_config_type.value),
                sort_dot_graph=False)
Example #4
def create_test_dataloaders(config, dataset_dir):
    input_info = create_input_infos(config)[0]
    image_size = input_info.shape[-1]
    batch_size = input_info.shape[0]
    normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))

    train_transforms = transforms.Compose([
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        normalize,
    ])

    dummy_config = type('dummy', (object, ), {'dataset_dir': dataset_dir})()
    train_dataset = create_cifar(dummy_config,
                                 dataset_config='cifar10',
                                 is_train=True,
                                 transform=train_transforms)

    # Do not set num_workers > 0 here - random hangs occur during pytest runs of this file
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=False,
                                               pin_memory=True,
                                               drop_last=True)
    return train_loader, train_dataset
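The shapes used above come straight from the config's input_info section (as consumed by create_input_infos), so shape[0] is the batch dimension and shape[-1] the spatial size. An illustrative config fragment:
# Illustrative NNCF config fragment; with these values image_size == 32 and batch_size == 64.
config_fragment = {
    "input_info": {
        "sample_size": [64, 3, 32, 32]  # [N, C, H, W]
    }
}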
Example #5
def get_testing_dataset(dataset_name, path_to_annotations, path_to_imgs,
                        config):
    # for VOC path_to_imgs = path_to_annotations = voc_root
    assert dataset_name in ['voc', 'coco']
    preprocessing = get_preprocessing(config)
    input_info_list = create_input_infos(config)
    image_size = input_info_list[0].shape[-1]
    transform = BaseTransform(image_size, preprocessing.mean,
                              preprocessing.std, preprocessing.normalize_coef)
    if dataset_name == 'voc':
        testing_dataset = VOCDetection(
            path_to_imgs, [('2012', 'val')],
            transform=transform,
            target_transform=VOCAnnotationTransform(keep_difficult=True),
            return_image_info=True,
            rgb=preprocessing.rgb)
    elif dataset_name == 'coco':
        testing_dataset = COCODataset(path_to_annotations,
                                      path_to_imgs,
                                      transform=transform,
                                      scale_bboxes=False,
                                      return_image_info=True,
                                      rgb=preprocessing.rgb)

    return testing_dataset
Example #6
def create_dataloader(config):
    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(OnesDatasetMock(input_sample_size[1:]),
                                              batch_size=1,
                                              num_workers=1,
                                              shuffle=False)
    return data_loader
Example #7
def create_mock_dataloader(config, num_samples=1):
    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(OnesDatasetMock(input_sample_size[1:], num_samples),
                                              batch_size=1,
                                              num_workers=0,  # Workaround
                                              shuffle=False, drop_last=True)
    return data_loader
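The OnesDatasetMock used by several examples here is not shown in this listing; a dataset mock filling the same role could look roughly like this (a sketch that assumes `import torch` and that each item is an all-ones input plus a dummy target, not the actual implementation):
class ExampleOnesDataset(torch.utils.data.Dataset):
    # Sketch of a mock dataset: a fixed number of identical samples of the requested shape.
    def __init__(self, sample_shape, num_samples=1):
        self.sample_shape = list(sample_shape)
        self.num_samples = num_samples

    def __len__(self):
        return self.num_samples

    def __getitem__(self, index):
        return torch.ones(self.sample_shape), torch.zeros(1)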
Example #8
def create_rank_dataloader(config, rank):
    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(
        RankDatasetMock(input_sample_size[1:], rank),
        batch_size=3,
        num_workers=0,  # workaround
        shuffle=False,
        drop_last=True)
    return data_loader
Example #9
def create_dataloader(wrap_dataloader, config, device):
    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(OnesDatasetMock(input_sample_size[1:]),
                                              batch_size=1,
                                              num_workers=1,
                                              shuffle=False)
    if wrap_dataloader:
        data_loader = InitializingDataLoader(data_loader=data_loader,
                                             device=device,
                                             kwargs={})
    return data_loader
Example #10
def test_staged_scheduler_with_hawq():
    config = get_squeezenet_quantization_config()
    config['compression'].update({
        'params': {
            "activations_quant_start_epoch": 1,
            "weights_quant_start_epoch": 2,
        },
        'initializer': {
            'range': {
                'num_init_samples': 1
            },
            'precision': {
                "type": "hawq",
                "num_data_points": 1,
                "iter_number": 1,
                "tolerance": 1
            }
        }
    })
    num_classes = 10
    model = squeezenet1_1(num_classes=num_classes, dropout=0)

    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = DataLoader(
        HawqDatasetMock(input_sample_size[1:], num_classes),
        batch_size=1,
        num_workers=0,  # Workaround for PyTorch MultiprocessingDataLoader issues
        shuffle=False)
    criterion = nn.CrossEntropyLoss().cuda()
    config = register_default_init_args(config, data_loader, criterion)

    model, algo = create_compressed_model_and_algo_for_test(model, config)
    scheduler = algo.scheduler

    for module in algo.all_quantizations.values():
        assert not module.is_enabled_quantization()

    scheduler.epoch_step()
    for module in algo.all_quantizations.values():
        assert not module.is_enabled_quantization()

    scheduler.epoch_step()
    for module in algo.all_quantizations.values():
        if module.is_weights:
            assert not module.is_enabled_quantization()
        else:
            assert module.is_enabled_quantization()

    scheduler.epoch_step()
    for module in algo.all_quantizations.values():
        assert module.is_enabled_quantization()
Example #11
def create_model(config):
    input_info_list = create_input_infos(config)
    image_size = input_info_list[0].shape[-1]
    ssd_net = build_ssd(config.model, config.ssd_params, image_size, config.num_classes, config)
    compression_algo, ssd_net = create_compressed_model(ssd_net, config)
    ssd_net = compression_algo.model
    weights = config.get('weights')
    if weights:
        sd = torch.load(weights, map_location='cpu')
        load_state(ssd_net, sd)
    ssd_net.train()
    model, _ = prepare_model_for_execution(ssd_net, config)
    return compression_algo, model
Example #12
def sr_dummy_forward_fn(model_, input_sample_sizes: Tuple[List[int], ...]):
    device = next(model_.parameters()).device
    config = {
        'input_info': [{
            "sample_size": sizes
        } for sizes in input_sample_sizes]
    }
    input_info_list = create_input_infos(config)
    tensor_list = [
        create_mock_tensor(info, device) for info in input_info_list
    ]
    args = (tuple(tensor_list), )
    args, _ = sr_wrap_inputs_fn(args, {})
    return model_(*args)
Example #13
def sr_dummy_forward_fn(model_, input_sample_sizes: Tuple[List[int], ...]):
    device = next(model_.parameters()).device
    config = {
        'input_info': [{
            "sample_size": sizes
        } for sizes in input_sample_sizes]
    }
    input_info_list = create_input_infos(config)
    tensor_list = [
        create_mock_tensor(info, device) for info in input_info_list
    ]
    for idx, tensor in enumerate(tensor_list):
        tensor_list[idx] = nncf_model_input(tensor)
    return model_(tuple(tensor_list))
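Example #12 above delegates input wrapping to an sr_wrap_inputs_fn helper that is not shown. Per the wrap_inputs_fn contract documented later in Example #25, its role is to mark every tensor inside the single tuple-of-tensors argument with nncf_model_input; a sketch along those lines (not the actual helper) could be:
def example_sr_wrap_inputs_fn(args, kwargs):
    # args[0] is the tuple of input tensors; wrap each one so NNCF traces it as a model input.
    wrapped = tuple(nncf_model_input(t) for t in args[0])
    return (wrapped,), kwargs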
Example #14
def main():
    model_bin, model_xml = get_ir_paths(args.model, args.bin)

    config = NNCFConfig.from_json(args.config)

    input_infos_list = create_input_infos(config)
    image_size = input_infos_list[0].shape[-1]

    size = int(image_size / 0.875)

    print('IE version: {}'.format(get_version()))

    # NOTE: importing torch after loading IE to plugin to avoid issue with built-in MKLDNN of PyTorch
    plugin = IEPlugin(device='CPU', plugin_dirs=args.cpu_plugin_dir)
    plugin.add_cpu_extension(
        os.path.join(args.cpu_plugin_dir, "libcpu_extension.so"))
    net = IENetwork(model=model_xml, weights=model_bin)
    exec_net = getExecNet(plugin, net)
    from torch.utils.data import DataLoader
    import torchvision.datasets as datasets
    import torchvision.transforms as transforms

    val_loader = DataLoader(datasets.ImageFolder(
        args.data,
        transforms.Compose([
            transforms.Resize(size),
            transforms.CenterCrop(image_size),
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
        ])),
                            batch_size=1,
                            shuffle=False,
                            num_workers=4,
                            pin_memory=True)
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    config['log_dir'] = args.output_dir

    infer_fn = partial(infer_ie_model, net=net)
    validate_general(val_loader, exec_net, infer_fn)

    validate_torch_model(os.path.join(args.output_dir, "PTH"),
                         config=config,
                         num_layers=args.num_layers,
                         dump=args.dump,
                         val_loader=val_loader,
                         cuda=args.cuda)
Example #15
File: main.py  Project: zbrnwpu/nncf
def create_model(config: SampleConfig, resuming_model_sd: dict = None):
    input_info_list = create_input_infos(config.nncf_config)
    image_size = input_info_list[0].shape[-1]
    ssd_net = build_ssd(config.model, config.ssd_params, image_size, config.num_classes, config)
    weights = config.get('weights')
    if weights:
        sd = torch.load(weights, map_location='cpu')
        load_state(ssd_net, sd)

    ssd_net.to(config.device)

    compression_ctrl, compressed_model = create_compressed_model(ssd_net, config.nncf_config, resuming_model_sd)
    compressed_model, _ = prepare_model_for_execution(compressed_model, config)

    compressed_model.train()
    return compression_ctrl, compressed_model
Example #16
def distributed_init_test_default(gpu, ngpus_per_node, config):
    config.batch_size = 3
    config.workers = 0  # workaround for the PyTorch multiprocessing DataLoader issue
    config.gpu = gpu
    config.ngpus_per_node = ngpus_per_node
    config.rank = gpu
    config.distributed = True

    torch.distributed.init_process_group(backend="nccl", init_method='tcp://127.0.0.1:8899',
                                         world_size=config.world_size, rank=config.rank)

    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(RankDatasetMock(input_sample_size[1:], config.rank),
                                              batch_size=3,
                                              num_workers=0, #  workaround
                                              shuffle=False)
    return data_loader
Example #17
def test_staged_scheduler_with_range_init():
    config = get_squeezenet_quantization_config()
    config['compression'].update({
        'params': {
            "activations_quant_start_epoch": 1,
            "weights_quant_start_epoch": 2,
        },
        'initializer': {
            'range': {
                'num_init_samples': 1
            }
        }
    })
    model = squeezenet1_1(num_classes=10, dropout=0)

    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = DataLoader(
        OnesDatasetMock(input_sample_size[1:]),
        batch_size=1,
        num_workers=0,  # Workaround for PyTorch MultiprocessingDataLoader issues
        shuffle=False)
    config.register_extra_structs([QuantizationRangeInitArgs(data_loader)])

    model, algo = create_compressed_model_and_algo_for_test(model, config)
    scheduler = algo.scheduler

    for module in algo.all_quantizations.values():
        assert not module.is_enabled_quantization()

    scheduler.epoch_step()
    for module in algo.all_quantizations.values():
        assert not module.is_enabled_quantization()

    scheduler.epoch_step()
    for module in algo.all_quantizations.values():
        if module.is_weights:
            assert not module.is_enabled_quantization()
        else:
            assert module.is_enabled_quantization()

    scheduler.epoch_step()
    for module in algo.all_quantizations.values():
        assert module.is_enabled_quantization()
Example #18
def create_datasets(config):
    dataset_config = config.dataset if config.dataset is not None else 'imagenet'
    dataset_config = dataset_config.lower()
    assert dataset_config in ['imagenet', 'cifar100',
                              'cifar10'], "Unknown dataset option"

    if dataset_config == 'imagenet':
        normalize = transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                         std=(0.229, 0.224, 0.225))
    elif dataset_config == 'cifar100':
        normalize = transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                                         std=(0.2023, 0.1994, 0.2010))
    elif dataset_config == 'cifar10':
        normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5),
                                         std=(0.5, 0.5, 0.5))

    input_info_list = create_input_infos(config)
    image_size = input_info_list[0].shape[-1]
    size = int(image_size / 0.875)
    val_transform = transforms.Compose([
        transforms.Resize(size),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        normalize,
    ])

    val_dataset = get_dataset(dataset_config,
                              config,
                              val_transform,
                              is_train=False)

    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    train_dataset = get_dataset(dataset_config,
                                config,
                                train_transforms,
                                is_train=True)

    return train_dataset, val_dataset
Example #19
def hawq_dumping_worker(gpu, ngpus_per_node, config, tmp_path):
    config.batch_size = 3
    config.workers = 3
    config.gpu = gpu
    config.ngpus_per_node = ngpus_per_node
    config.rank = gpu
    config.distributed = True

    torch.distributed.init_process_group(backend="nccl",
                                         init_method='tcp://127.0.0.1:8899',
                                         world_size=config.world_size,
                                         rank=config.rank)

    model = safe_thread_call(partial(mobilenet_v2, pretrained=True))
    model.eval()

    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(RankDatasetMock(
        input_sample_size[1:], config.rank),
                                              batch_size=3,
                                              num_workers=1,
                                              shuffle=False)
    criterion = torch.nn.MSELoss().cuda(config.gpu)
    config = register_default_init_args(config, criterion, data_loader)
    quant_model, compression_algo = create_compressed_model_and_algo_for_test(
        model, config)

    torch.cuda.set_device(config.gpu)
    quant_model.cuda(config.gpu)
    config.batch_size = int(config.batch_size / ngpus_per_node)
    config.workers = int(config.workers / ngpus_per_node)
    quant_model = torch.nn.parallel.DistributedDataParallel(
        quant_model, device_ids=[config.gpu])

    compression_algo.distributed()

    # just to reproduce the same scale values without Dropout
    quant_model.eval()

    act_bitwidth_per_scope = get_bitwidth_per_scope(quant_model.module)
    out_file_path = get_path_to_bitwidth_dump(tmp_path, config.rank)
    torch.save(act_bitwidth_per_scope, str(out_file_path))
Example #20
def create_nncf_model_and_algo_builder(model: NNCFNetwork, config: NNCFConfig,
                                       dummy_forward_fn: Callable[[Module], Any] = None,
                                       wrap_inputs_fn: Callable[[Tuple, Dict], Tuple[Tuple, Dict]] = None,
                                       resuming_state_dict: dict = None):
    assert isinstance(config, NNCFConfig)
    NNCFConfig.validate(config)
    input_info_list = create_input_infos(config)
    scopes_without_shape_matching = config.get('scopes_without_shape_matching', [])
    ignored_scopes = config.get('ignored_scopes')
    target_scopes = config.get('target_scopes')

    compressed_model = NNCFNetwork(model, input_infos=input_info_list,
                                   dummy_forward_fn=dummy_forward_fn,
                                   wrap_inputs_fn=wrap_inputs_fn,
                                   ignored_scopes=ignored_scopes,
                                   target_scopes=target_scopes,
                                   scopes_without_shape_matching=scopes_without_shape_matching)

    should_init = resuming_state_dict is None
    compression_algo_builder_list = create_compression_algorithm_builders(config, should_init=should_init)
    return compressed_model, compression_algo_builder_list
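The builders returned here are applied the same way as in Examples #1 and #22; a short sketch assembled from those examples:
compressed_model, builders = create_nncf_model_and_algo_builder(model, nncf_config)
for builder in builders:
    compressed_model = builder.apply_to(compressed_model)
compression_ctrl = compressed_model.commit_compression_changes()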
Example #21
def create_compression_algorithm(model, config, dummy_forward_fn=None):
    compression_config = config.get('compression', {})

    input_info_list = create_input_infos(config)

    if isinstance(compression_config, dict):
        return get_compression_algorithm(compression_config)(
            model,
            compression_config,
            input_infos=input_info_list,
            dummy_forward_fn=dummy_forward_fn)
    if isinstance(compression_config, list) and len(compression_config) == 1:
        return get_compression_algorithm(compression_config[0])(
            model,
            compression_config[0],
            input_infos=input_info_list,
            dummy_forward_fn=dummy_forward_fn)

    logger.info("Creating composite compression algorithm:")
    composite_compression_algorithm = CompositeCompressionAlgorithm(
        model,
        compression_config,
        input_infos=input_info_list,
        dummy_forward_fn=dummy_forward_fn)

    for algo_config in compression_config:
        compression_algorithm = get_compression_algorithm(algo_config)(
            composite_compression_algorithm.model,
            algo_config,
            input_infos=input_info_list,
            dummy_forward_fn=dummy_forward_fn)
        composite_compression_algorithm.add(compression_algorithm)

    from nncf.utils import check_for_quantization_before_sparsity
    check_for_quantization_before_sparsity(
        composite_compression_algorithm.child_algos)
    return composite_compression_algorithm
Example #22
def create_compressed_model(model: Module, config: NNCFConfig,
                            resuming_state_dict: dict = None,
                            dummy_forward_fn: Callable[[Module], Any] = None,
                            dump_graphs=True,) \
    -> Tuple[CompressionAlgorithmController, NNCFNetwork]:
    """
    The main function used to produce a model ready for compression fine-tuning from an original PyTorch
    model and a configuration object.
    :param model: The original model. Should have its parameters already loaded from a checkpoint or another
    source.
    :param config: A configuration object used to determine the exact compression modifications to be applied
    to the model
    :param resuming_state_dict: A PyTorch state dict object to load (strictly) into the compressed model after
    building.
    :param dummy_forward_fn: if supplied, will be used instead of a *forward* function call to build
    the internal graph representation via tracing. Specifying this is useful when the original training pipeline
    has special formats of data loader output or has additional *forward* arguments other than input tensors.
    Otherwise, the *forward* call of the model during graph tracing will be made with mock tensors according
    to the shape specified in the config object.
    :param dump_graphs: Whether to also dump the internal graph representations of the
    original and compressed models in .dot format into the log directory.
    :return: A controller for the compression algorithm (or algorithms, in which case the controller
    is an instance of CompositeCompressionController) and the model ready for compression parameter training wrapped
    as an object of NNCFNetwork."""

    if dump_graphs:
        if dummy_forward_fn is None:
            input_info_list = create_input_infos(config)
            graph_builder = GraphBuilder(
                custom_forward_fn=create_dummy_forward_fn(
                    input_info_list, with_input_tracing=True))
        else:
            graph_builder = GraphBuilder(custom_forward_fn=dummy_forward_fn)

        if is_main_process():
            graph = graph_builder.build_graph(model)
            graph.dump_graph(osp.join(config.get("log_dir", "."),
                                      "original_graph.dot"),
                             extended=True)

    if is_debug():
        set_debug_log_dir(config.get("log_dir", "."))

    input_info_list = create_input_infos(config)
    scopes_without_shape_matching = config.get('scopes_without_shape_matching',
                                               [])
    ignored_scopes = config.get('ignored_scopes')
    target_scopes = config.get('target_scopes')

    compressed_model = NNCFNetwork(
        model,
        input_infos=input_info_list,
        dummy_forward_fn=dummy_forward_fn,
        ignored_scopes=ignored_scopes,
        target_scopes=target_scopes,
        scopes_without_shape_matching=scopes_without_shape_matching)

    should_init = resuming_state_dict is None
    compression_algo_builder_list = create_compression_algorithm_builders(
        config, should_init=should_init)

    for builder in compression_algo_builder_list:
        compressed_model = builder.apply_to(compressed_model)
    compression_ctrl = compressed_model.commit_compression_changes()

    if dump_graphs and is_main_process() and compression_algo_builder_list:
        if dummy_forward_fn is None:
            compressed_graph_builder = GraphBuilder(
                custom_forward_fn=create_dummy_forward_fn(
                    input_info_list, with_input_tracing=False))
        else:
            compressed_graph_builder = GraphBuilder(
                custom_forward_fn=dummy_forward_fn)

        graph = compressed_graph_builder.build_graph(
            compressed_model, compressed_model.get_tracing_context())
        graph.dump_graph(osp.join(config.get("log_dir", "."),
                                  "compressed_graph.dot"),
                         extended=True)

    if resuming_state_dict is not None:
        load_state(compressed_model, resuming_state_dict, is_resume=True)

    return compression_ctrl, compressed_model
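A minimal call sequence for the function above when resuming from a checkpoint; the checkpoint layout (the 'state_dict' key) is illustrative only:
checkpoint = torch.load('checkpoint.pth', map_location='cpu')
compression_ctrl, compressed_model = create_compressed_model(
    model, nncf_config, resuming_state_dict=checkpoint['state_dict'])
# The state dict is loaded strictly (is_resume=True) after the compression changes are committed.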
Example #23
def scale_signed_dumping_worker(gpu, ngpus_per_node, config, tmp_path):
    config.batch_size = 3
    config.workers = 3
    config.gpu = gpu
    config.ngpus_per_node = ngpus_per_node
    config.rank = gpu
    config.distributed = True

    torch.distributed.init_process_group(backend="nccl",
                                         init_method='tcp://127.0.0.1:8899',
                                         world_size=config.world_size,
                                         rank=config.rank)

    model = safe_thread_call(partial(squeezenet1_1_custom, pretrained=True))

    compression_algo = create_compression_algorithm(model, config)
    compression_algo.distributed()
    model = compression_algo.model
    compression_scheduler = compression_algo.scheduler

    torch.cuda.set_device(config.gpu)
    model.cuda(config.gpu)
    config.batch_size = int(config.batch_size / ngpus_per_node)
    config.workers = int(config.workers / ngpus_per_node)
    model = torch.nn.parallel.DistributedDataParallel(model,
                                                      device_ids=[config.gpu])

    criterion = torch.nn.MSELoss().cuda(config.gpu)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

    torch.backends.cudnn.benchmark = True

    input_infos_list = create_input_infos(config)
    input_sample_size = input_infos_list[0].shape
    data_loader = torch.utils.data.DataLoader(RankDatasetMock(
        input_sample_size[1:], config.rank),
                                              batch_size=3,
                                              num_workers=1,
                                              shuffle=False)
    # just to reproduce the same scale values without Dropout
    model.eval()
    compression_algo.initialize(data_loader)

    act_sum = 0
    for layer in get_all_modules_by_type(model, "SymmetricQuantizer").values():
        act_sum += layer.scale
    ref_sum = 3467.322
    assert act_sum.item() == approx(ref_sum, 0.01), \
        'sum of scales is not expected {} vs {} rank {}'.format(act_sum.item(), ref_sum, config.rank)

    out_file_path = get_path_after_broadcast(tmp_path, config.rank)
    save_params(model, out_file_path)
    compression_scheduler.step()
    for i, (input_, _) in enumerate(data_loader):
        if i > 5:
            break
        output = model(input_)
        optimizer.zero_grad()
        dummy_target = torch.randn(1000).cuda(config.gpu, non_blocking=True)
        loss = criterion(output, dummy_target)
        compression_scheduler.step()
        loss.backward()
        optimizer.step()
        compression_scheduler.step()

    out_file_path = get_path_after_train_iters(tmp_path, config.rank)
    save_params(model, out_file_path)
Example #24
def create_dataloaders(config):
    dataset_config = config.dataset if config.dataset is not None else 'imagenet'
    dataset_config = dataset_config.lower()
    assert dataset_config in ['imagenet', 'cifar100',
                              'cifar10'], "Unknown dataset option"

    if dataset_config == 'imagenet':
        normalize = transforms.Normalize(mean=(0.485, 0.456, 0.406),
                                         std=(0.229, 0.224, 0.225))
    elif dataset_config == 'cifar100':
        normalize = transforms.Normalize(mean=(0.4914, 0.4822, 0.4465),
                                         std=(0.2023, 0.1994, 0.2010))
    elif dataset_config == 'cifar10':
        normalize = transforms.Normalize(mean=(0.5, 0.5, 0.5),
                                         std=(0.5, 0.5, 0.5))

    input_info_list = create_input_infos(config)
    image_size = input_info_list[0].shape[-1]
    size = int(image_size / 0.875)
    val_transform = transforms.Compose([
        transforms.Resize(size),
        transforms.CenterCrop(image_size),
        transforms.ToTensor(),
        normalize,
    ])

    val_dataset = get_dataset(dataset_config,
                              config,
                              val_transform,
                              is_train=False)

    pin_memory = config.execution_mode != ExecutionMode.CPU_ONLY

    # When using a single GPU per process and per
    # DistributedDataParallel, we need to divide the batch size
    # ourselves based on the total number of GPUs we have
    batch_size = int(config.batch_size)
    workers = int(config.workers)
    if config.execution_mode == ExecutionMode.MULTIPROCESSING_DISTRIBUTED:
        batch_size //= config.ngpus_per_node
        workers //= config.ngpus_per_node

    val_loader = torch.utils.data.DataLoader(val_dataset,
                                             batch_size=batch_size,
                                             shuffle=False,
                                             num_workers=workers,
                                             pin_memory=pin_memory)

    if config.mode.lower() == "test":
        return None, None, val_loader

    train_transforms = transforms.Compose([
        transforms.RandomResizedCrop(image_size),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        normalize,
    ])

    train_dataset = get_dataset(dataset_config,
                                config,
                                train_transforms,
                                is_train=True)

    if config.distributed:
        train_sampler = torch.utils.data.distributed.DistributedSampler(
            train_dataset)
    else:
        train_sampler = None
    train_loader = torch.utils.data.DataLoader(train_dataset,
                                               batch_size=batch_size,
                                               shuffle=(train_sampler is None),
                                               num_workers=workers,
                                               pin_memory=pin_memory,
                                               sampler=train_sampler)
    return train_loader, train_sampler, val_loader
Example #25
def create_compressed_model(model: Module, config: NNCFConfig,
                            resuming_state_dict: dict = None,
                            dummy_forward_fn: Callable[[Module], Any] = None,
                            wrap_inputs_fn: Callable[[Tuple, Dict], Tuple[Tuple, Dict]] = None,
                            dump_graphs=True,) \
    -> Tuple[CompressionAlgorithmController, NNCFNetwork]:
    """
    The main function used to produce a model ready for compression fine-tuning from an original PyTorch
    model and a configuration object.
    :param model: The original model. Should have its parameters already loaded from a checkpoint or another
    source.
    :param config: A configuration object used to determine the exact compression modifications to be applied
    to the model
    :param resuming_state_dict: A PyTorch state dict object to load (strictly) into the compressed model after
    building.
    :param dummy_forward_fn: if supplied, will be used instead of a *forward* function call to build
    the internal graph representation via tracing. Specifying this is useful when the original training pipeline
    has special formats of data loader output or has additional *forward* arguments other than input tensors.
    Otherwise, the *forward* call of the model during graph tracing will be made with mock tensors according
    to the shape specified in the config object.
    :param wrap_inputs_fn: if supplied, will be used on the module's input arguments during a regular, non-dummy
    forward call before passing the inputs to the underlying compressed model. This is required if the model's input
    tensors that are important for compression are not supplied as arguments to the model's forward call directly, but
    instead are located in a container (such as list), and the model receives the container as an argument.
    wrap_inputs_fn should take as input two arguments - the tuple of positional arguments to the underlying
    model's forward call, and a dict of keyword arguments to the same. The function should wrap each tensor among the
    supplied model's args and kwargs that is important for compression (e.g. quantization) with an nncf.nncf_model_input
    function, which is a no-operation function and marks the tensors as inputs to be traced by NNCF in the internal
    graph representation. The output is a tuple of (args, kwargs), where args and kwargs are the same as were
    supplied as input, but with each compression-relevant tensor wrapped as described above.
    :param dump_graphs: Whether to also dump the internal graph representations of the
    original and compressed models in .dot format into the log directory.
    :return: A controller for the compression algorithm (or algorithms, in which case the controller
    is an instance of CompositeCompressionController) and the model ready for compression parameter training wrapped
    as an object of NNCFNetwork."""

    # Compress the model that will be deployed for inference on the target device. There is no need to compress
    # parts of the model that are used only at the training stage (e.g. the AuxLogits of Inception-v3) or unused
    # modules with weights. As a consequence, there is no need to worry about spoiling BN statistics, as they are
    # disabled in eval mode.
    model.eval()

    if dump_graphs:
        if dummy_forward_fn is None:
            input_info_list = create_input_infos(config)
            graph_builder = GraphBuilder(
                custom_forward_fn=create_dummy_forward_fn(
                    input_info_list, with_input_tracing=True))
        else:
            graph_builder = GraphBuilder(custom_forward_fn=dummy_forward_fn)

        if is_main_process():
            graph = graph_builder.build_graph(model)
            graph.visualize_graph(
                osp.join(config.get("log_dir", "."), "original_graph.dot"))

    set_debug_log_dir(config.get("log_dir", "."))

    input_info_list = create_input_infos(config)
    scopes_without_shape_matching = config.get('scopes_without_shape_matching',
                                               [])
    ignored_scopes = config.get('ignored_scopes')
    target_scopes = config.get('target_scopes')

    compressed_model = NNCFNetwork(
        model,
        input_infos=input_info_list,
        dummy_forward_fn=dummy_forward_fn,
        wrap_inputs_fn=wrap_inputs_fn,
        ignored_scopes=ignored_scopes,
        target_scopes=target_scopes,
        scopes_without_shape_matching=scopes_without_shape_matching)

    should_init = resuming_state_dict is None
    compression_algo_builder_list = create_compression_algorithm_builders(
        config, should_init=should_init)

    for builder in compression_algo_builder_list:
        compressed_model = builder.apply_to(compressed_model)
    compression_ctrl = compressed_model.commit_compression_changes()

    try:
        if resuming_state_dict is not None:
            load_state(compressed_model, resuming_state_dict, is_resume=True)
    finally:
        if dump_graphs and is_main_process() and compression_algo_builder_list:
            if dummy_forward_fn is None:
                compressed_graph_builder = GraphBuilder(
                    custom_forward_fn=create_dummy_forward_fn(
                        input_info_list, with_input_tracing=False))
            else:
                compressed_graph_builder = GraphBuilder(
                    custom_forward_fn=dummy_forward_fn)

            graph = compressed_graph_builder.build_graph(
                compressed_model, compressed_model.get_tracing_context())
            graph.visualize_graph(
                osp.join(config.get("log_dir", "."), "compressed_graph.dot"))
    return compression_ctrl, compressed_model
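To make the wrap_inputs_fn contract concrete, here is a minimal sketch for a model whose forward takes a single list of tensors; all names other than create_compressed_model and nncf_model_input are illustrative:
def wrap_list_inputs(args, kwargs):
    # The model's only positional argument is a list of tensors; mark each one as an NNCF-traced input.
    wrapped_list = [nncf_model_input(t) for t in args[0]]
    return (wrapped_list,), kwargs

compression_ctrl, compressed_model = create_compressed_model(
    model, nncf_config, wrap_inputs_fn=wrap_list_inputs)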